From 13713d4a4bb043222d6f19e05ddf16c8d71076cb Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Aug 2025 13:42:00 +0000 Subject: [PATCH 01/10] reduces cli args redundancy --- src/lighteval/cli_args.py | 148 +++++++++++++ src/lighteval/main_accelerate.py | 122 ++++------- src/lighteval/main_baseline.py | 36 ++-- src/lighteval/main_custom.py | 98 +++------ src/lighteval/main_endpoint.py | 360 +++++++++---------------------- src/lighteval/main_nanotron.py | 26 +-- src/lighteval/main_sglang.py | 121 ++++------- src/lighteval/main_vllm.py | 121 ++++------- 8 files changed, 428 insertions(+), 604 deletions(-) create mode 100644 src/lighteval/cli_args.py diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py new file mode 100644 index 000000000..70697b906 --- /dev/null +++ b/src/lighteval/cli_args.py @@ -0,0 +1,148 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +Common CLI argument types for LightEval main files. +This module exports pre-defined argument types to reduce redundancy across main_*.py files. +""" + +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +# Help panel names for consistent organization +HELP_PANEL_NAME_1 = "Common Parameters" +HELP_PANEL_NAME_2 = "Logging Parameters" +HELP_PANEL_NAME_3 = "Debug Parameters" +HELP_PANEL_NAME_4 = "Modeling Parameters" + + +# Common Parameters (HELP_PANEL_NAME_1) +DatasetLoadingProcesses = Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) +] + +CustomTasks = Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) +] + +NumFewshotSeeds = Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) +] + +LoadResponsesFromDetailsDateId = Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) +] + +RemoveReasoningTags = Annotated[ + bool | None, + Option( + help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", + rich_help_panel=HELP_PANEL_NAME_1, + ), +] + +ReasoningTags = Annotated[ + str | None, + Option( + help="List of reasoning tags (provided as pairs) to remove from responses. 
Default is [('', '')].", + rich_help_panel=HELP_PANEL_NAME_1, + ), +] + + +# Logging Parameters (HELP_PANEL_NAME_2) +OutputDir = Annotated[str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)] + +ResultsPathTemplate = Annotated[ + str | None, + Option( + help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", + rich_help_panel=HELP_PANEL_NAME_2, + ), +] + +PushToHub = Annotated[bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)] + +PushToTensorboard = Annotated[bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)] + +PublicRun = Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) +] + +ResultsOrg = Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) +] + +SaveDetails = Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) +] + +Wandb = Annotated[ + bool, + Option( + help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", + rich_help_panel=HELP_PANEL_NAME_2, + ), +] + + +# Debug Parameters (HELP_PANEL_NAME_3) +MaxSamples = Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) +] + +JobId = Annotated[int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)] + + +# Common argument patterns +Tasks = Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")] + +ModelArgs = Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + ), +] + + +# Default values for common arguments +DEFAULT_VALUES = { + "dataset_loading_processes": 1, + "custom_tasks": None, + "num_fewshot_seeds": 1, + "load_responses_from_details_date_id": None, + "remove_reasoning_tags": True, + "reasoning_tags": None, + "output_dir": "results", + "results_path_template": None, + "push_to_hub": False, + "push_to_tensorboard": False, + "public_run": False, + "results_org": None, + "save_details": False, + "wandb": False, + "max_samples": None, + "job_id": 0, +} diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 1b3a3c6c8..ab9d74aa5 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -21,99 +21,65 @@ # SOFTWARE. 
import logging -from typing import Optional -from typer import Argument, Option +from typer import Option from typing_extensions import Annotated +from lighteval.cli_args import ( + DEFAULT_VALUES, + HELP_PANEL_NAME_4, + CustomTasks, + DatasetLoadingProcesses, + JobId, + LoadResponsesFromDetailsDateId, + MaxSamples, + ModelArgs, + NumFewshotSeeds, + OutputDir, + PublicRun, + PushToHub, + PushToTensorboard, + ReasoningTags, + RemoveReasoningTags, + ResultsOrg, + ResultsPathTemplate, + SaveDetails, + Tasks, + Wandb, +) -logger = logging.getLogger(__name__) -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +logger = logging.getLogger(__name__) def accelerate( # noqa C901 # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: ModelArgs, + tasks: Tasks, # === Common parameters === vision_model: Annotated[ bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = False, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ + "load_responses_from_details_date_id" + ], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using accelerate and transformers as backend. diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index b4195d116..035cad276 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -21,32 +21,22 @@ # SOFTWARE. 
-from typing import Optional - -from typer import Argument, Option -from typing_extensions import Annotated - - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + DEFAULT_VALUES, + CustomTasks, + DatasetLoadingProcesses, + MaxSamples, + OutputDir, + Tasks, +) def baseline( - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, + tasks: Tasks, + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], ): """ Compute baselines for given tasks. diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py index 6883e3667..d2152b585 100644 --- a/src/lighteval/main_custom.py +++ b/src/lighteval/main_custom.py @@ -19,87 +19,59 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import Optional + import typer -from typer import Argument, Option +from typer import Argument from typing_extensions import Annotated +from lighteval.cli_args import ( + DEFAULT_VALUES, + CustomTasks, + DatasetLoadingProcesses, + JobId, + MaxSamples, + NumFewshotSeeds, + OutputDir, + PublicRun, + PushToHub, + PushToTensorboard, + ReasoningTags, + RemoveReasoningTags, + ResultsOrg, + ResultsPathTemplate, + SaveDetails, + Tasks, +) from lighteval.models.custom.custom_model import CustomModelConfig app = typer.Typer() -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" - - @app.command(rich_help_panel="Evaluation Backends") def custom( # === general === model_name: Annotated[str, Argument(help="The model name to evaluate")], model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: Tasks, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate custom models (can be anything). diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index 7ce9aa996..08c9ee33c 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -19,20 +19,36 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from typing import Optional + import typer from typer import Argument, Option from typing_extensions import Annotated - -app = typer.Typer() +from lighteval.cli_args import ( + DEFAULT_VALUES, + HELP_PANEL_NAME_4, + CustomTasks, + DatasetLoadingProcesses, + JobId, + LoadResponsesFromDetailsDateId, + MaxSamples, + NumFewshotSeeds, + OutputDir, + PublicRun, + PushToHub, + PushToTensorboard, + ReasoningTags, + RemoveReasoningTags, + ResultsOrg, + ResultsPathTemplate, + SaveDetails, + Tasks, + Wandb, +) -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +app = typer.Typer() @app.command(rich_help_panel="Evaluation Backends") @@ -41,7 +57,7 @@ def inference_endpoint( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. 
(examples/model_configs/endpoint_model.yaml)") ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: Tasks, free_endpoint: Annotated[ bool, Option( @@ -50,72 +66,26 @@ def inference_endpoint( ), ] = False, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ + "load_responses_from_details_date_id" + ], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. 
see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using inference-endpoints as backend. @@ -177,74 +147,28 @@ def tgi( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. (examples/model_configs/tgi_model.yaml)") ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: Tasks, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ + "load_responses_from_details_date_id" + ], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using TGI as backend. @@ -313,74 +237,28 @@ def litellm( help="config file path for the litellm model, or a comma separated string of model args (model_name={},base_url={},provider={})" ), ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: Tasks, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. 
Default is [('<think>', '</think>')].",
-            rich_help_panel=HELP_PANEL_NAME_1,
-        ),
-    ] = None,
+    dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"],
+    custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"],
+    num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"],
+    load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[
+        "load_responses_from_details_date_id"
+    ],
+    remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"],
+    reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"],
     # === saving ===
-    output_dir: Annotated[
-        str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = "results",
-    results_path_template: Annotated[
-        str | None,
-        Option(
-            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
-            rich_help_panel=HELP_PANEL_NAME_2,
-        ),
-    ] = None,
-    push_to_hub: Annotated[
-        bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    push_to_tensorboard: Annotated[
-        bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    public_run: Annotated[
-        bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    results_org: Annotated[
-        Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = None,
-    save_details: Annotated[
-        bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    wandb: Annotated[
-        bool,
-        Option(
-            help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
-            rich_help_panel=HELP_PANEL_NAME_2,
-        ),
-    ] = False,
+    output_dir: OutputDir = DEFAULT_VALUES["output_dir"],
+    results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"],
+    push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"],
+    push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"],
+    public_run: PublicRun = DEFAULT_VALUES["public_run"],
+    results_org: ResultsOrg = DEFAULT_VALUES["results_org"],
+    save_details: SaveDetails = DEFAULT_VALUES["save_details"],
+    wandb: Wandb = DEFAULT_VALUES["wandb"],
     # === debug ===
-    max_samples: Annotated[
-        Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
-    ] = None,
-    job_id: Annotated[
-        int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3)
-    ] = 0,
+    max_samples: MaxSamples = DEFAULT_VALUES["max_samples"],
+    job_id: JobId = DEFAULT_VALUES["job_id"],
 ):
     """
     Evaluate models using LiteLLM as backend.
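
The hunks above and below apply the same mechanical rewrite to every backend
command: inline Annotated[...] declarations are replaced by the shared aliases
from lighteval.cli_args, and defaults are looked up in the shared DEFAULT_VALUES
dict. A minimal, self-contained sketch of the resulting pattern as of this
commit (the `demo` command and file name are illustrative only, not part of the
patch):

    import typer

    from lighteval.cli_args import DEFAULT_VALUES, CustomTasks, MaxSamples, OutputDir, Tasks

    app = typer.Typer()


    @app.command()
    def demo(
        tasks: Tasks,
        custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"],
        output_dir: OutputDir = DEFAULT_VALUES["output_dir"],
        max_samples: MaxSamples = DEFAULT_VALUES["max_samples"],
    ):
        """Shared aliases keep their help text and rich help panels."""
        print(tasks, custom_tasks, output_dir, max_samples)


    if __name__ == "__main__":
        app()

Saved as e.g. demo.py, `python demo.py --help` would group the options under
the same "Common Parameters", "Logging Parameters", and "Debug Parameters"
panels the real commands use, since the panel names travel with the aliases.
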
@@ -453,71 +331,25 @@ def inference_providers( help="config file path for the inference provider model, or a comma separated string of model args (model_name={},provider={},generation={temperature: 0.6})" ), ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: Tasks, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. 
Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using HuggingFace's inference providers as backend. diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 1ded89850..936220331 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -28,11 +28,11 @@ from typing_extensions import Annotated from yaml import SafeLoader - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + DEFAULT_VALUES, + ReasoningTags, + RemoveReasoningTags, +) SEED = 1234 @@ -43,20 +43,8 @@ def nanotron( str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], ): """ Evaluate models using nanotron as backend. diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index c458bcc01..89867fd84 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -19,94 +19,55 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import Optional -from typer import Argument, Option -from typing_extensions import Annotated - - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + DEFAULT_VALUES, + CustomTasks, + DatasetLoadingProcesses, + JobId, + LoadResponsesFromDetailsDateId, + MaxSamples, + ModelArgs, + NumFewshotSeeds, + OutputDir, + PublicRun, + PushToHub, + PushToTensorboard, + ReasoningTags, + RemoveReasoningTags, + ResultsOrg, + ResultsPathTemplate, + SaveDetails, + Tasks, + Wandb, +) def sglang( # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: ModelArgs, + tasks: Tasks, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ + "load_responses_from_details_date_id" + ], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using sglang as backend. diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index fe243c317..7f381f19b 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -19,97 +19,64 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. + from typing import Optional -from typer import Argument, Option +from typer import Option from typing_extensions import Annotated - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + DEFAULT_VALUES, + HELP_PANEL_NAME_4, + CustomTasks, + DatasetLoadingProcesses, + JobId, + LoadResponsesFromDetailsDateId, + MaxSamples, + ModelArgs, + NumFewshotSeeds, + OutputDir, + PublicRun, + PushToHub, + PushToTensorboard, + ReasoningTags, + RemoveReasoningTags, + ResultsOrg, + ResultsPathTemplate, + SaveDetails, + Tasks, + Wandb, +) def vllm( # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: ModelArgs, + tasks: Tasks, # === Common parameters === cot_prompt: Annotated[ Optional[str], Option(help="Use chain of thought prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = None, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], + custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], + num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ + "load_responses_from_details_date_id" + ], + remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], + reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. 
see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: OutputDir = DEFAULT_VALUES["output_dir"], + results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], + push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], + push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], + public_run: PublicRun = DEFAULT_VALUES["public_run"], + results_org: ResultsOrg = DEFAULT_VALUES["results_org"], + save_details: SaveDetails = DEFAULT_VALUES["save_details"], + wandb: Wandb = DEFAULT_VALUES["wandb"], # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + job_id: JobId = DEFAULT_VALUES["job_id"], ): """ Evaluate models using vllm as backend. From eeeb34a919af0b310cab8d72795587639c2714a7 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Aug 2025 13:44:07 +0000 Subject: [PATCH 02/10] fix typing --- src/lighteval/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py index 70697b906..1db2da141 100644 --- a/src/lighteval/cli_args.py +++ b/src/lighteval/cli_args.py @@ -56,7 +56,7 @@ ] RemoveReasoningTags = Annotated[ - bool | None, + bool, Option( help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", rich_help_panel=HELP_PANEL_NAME_1, From f4bf926fbff2a72022e5d247675ee39ab82d4536 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Aug 2025 13:54:08 +0000 Subject: [PATCH 03/10] reasoning tags do not need to default to None to then be attributed to actual default --- src/lighteval/cli_args.py | 12 +++++------- src/lighteval/pipeline.py | 4 +--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py index 1db2da141..03bd0cc37 100644 --- a/src/lighteval/cli_args.py +++ b/src/lighteval/cli_args.py @@ -43,9 +43,7 @@ int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) ] -CustomTasks = Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) -] +CustomTasks = Annotated[Optional[str], Option(help="Path to custom tasks file.", rich_help_panel=HELP_PANEL_NAME_1)] NumFewshotSeeds = Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) @@ -58,15 +56,15 @@ RemoveReasoningTags = Annotated[ bool, Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", + help="Remove reasoning tags from responses.", rich_help_panel=HELP_PANEL_NAME_1, ), ] ReasoningTags = Annotated[ - str | None, + str, Option( - help="List of reasoning tags (provided as pairs) to remove from responses. 
Default is [('', '')].", + help="List of reasoning tags (provided as pairs) to remove from responses.", rich_help_panel=HELP_PANEL_NAME_1, ), ] @@ -134,7 +132,7 @@ "num_fewshot_seeds": 1, "load_responses_from_details_date_id": None, "remove_reasoning_tags": True, - "reasoning_tags": None, + "reasoning_tags": "[('', '')]", "output_dir": "results", "results_path_template": None, "push_to_hub": False, diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index eb27de58a..0c89145d5 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -105,7 +105,7 @@ class PipelineParameters: max_samples: int | None = None cot_prompt: str | None = None remove_reasoning_tags: bool = True - reasoning_tags: str | list[tuple[str, str]] | None = None + reasoning_tags: str | list[tuple[str, str]] = [("", "")] load_responses_from_details_date_id: str | None = None bootstrap_iters: int = 1000 @@ -129,8 +129,6 @@ def __post_init__(self): # noqa C901 elif self.launcher_type == ParallelismManager.OPENAI: if not is_openai_available(): raise ImportError(NO_OPENAI_ERROR_MSG) - if self.reasoning_tags is None: - self.reasoning_tags = [("", "")] else: # Convert reasoning tags to list if needed if not isinstance(self.reasoning_tags, list): From 081bfa2273bd544e443b526db7edfed2cd8f5739 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Aug 2025 14:23:06 +0000 Subject: [PATCH 04/10] fix typing for dataclass --- src/lighteval/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 0c89145d5..c13fda1a0 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -105,7 +105,7 @@ class PipelineParameters: max_samples: int | None = None cot_prompt: str | None = None remove_reasoning_tags: bool = True - reasoning_tags: str | list[tuple[str, str]] = [("", "")] + reasoning_tags: str | list[tuple[str, str]] = "[('', '')]" load_responses_from_details_date_id: str | None = None bootstrap_iters: int = 1000 From 7939c9fcb4027fb9b80ce059df235bcf2c42c1a0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 21 Aug 2025 14:31:44 +0000 Subject: [PATCH 05/10] better docs for cli args --- src/lighteval/cli_args.py | 99 ++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py index 03bd0cc37..31aefbe0d 100644 --- a/src/lighteval/cli_args.py +++ b/src/lighteval/cli_args.py @@ -40,23 +40,41 @@ # Common Parameters (HELP_PANEL_NAME_1) DatasetLoadingProcesses = Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) + int, + Option( + help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), ] -CustomTasks = Annotated[Optional[str], Option(help="Path to custom tasks file.", rich_help_panel=HELP_PANEL_NAME_1)] +CustomTasks = Annotated[ + Optional[str], + Option( + help="Path to a Python file containing custom task definitions. The file should define a TASKS_TABLE with LightevalTaskConfig objects.", + rich_help_panel=HELP_PANEL_NAME_1, + ), +] NumFewshotSeeds = Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) + int, + Option( + help="Number of different random seeds to use for few-shot evaluation. 
Each seed will generate different few-shot examples, providing more robust evaluation.", + rich_help_panel=HELP_PANEL_NAME_1, + ), ] LoadResponsesFromDetailsDateId = Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + Optional[str], + Option( + help="Load previously generated model responses from a specific evaluation run instead of running the model. Use the timestamp/date_id from a previous run's details directory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), ] RemoveReasoningTags = Annotated[ bool, Option( - help="Remove reasoning tags from responses.", + help="Whether to remove reasoning tags from model responses before computing metrics.", rich_help_panel=HELP_PANEL_NAME_1, ), ] @@ -64,43 +82,73 @@ ReasoningTags = Annotated[ str, Option( - help="List of reasoning tags (provided as pairs) to remove from responses.", + help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.", rich_help_panel=HELP_PANEL_NAME_1, ), ] # Logging Parameters (HELP_PANEL_NAME_2) -OutputDir = Annotated[str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)] +OutputDir = Annotated[ + str, + Option( + help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).", + rich_help_panel=HELP_PANEL_NAME_2, + ), +] ResultsPathTemplate = Annotated[ str | None, Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", + help="Custom template for results file path. Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.", rich_help_panel=HELP_PANEL_NAME_2, ), ] -PushToHub = Annotated[bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)] +PushToHub = Annotated[ + bool, + Option( + help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), +] -PushToTensorboard = Annotated[bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)] +PushToTensorboard = Annotated[ + bool, + Option( + help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), +] PublicRun = Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) + bool, + Option( + help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.", + rich_help_panel=HELP_PANEL_NAME_2, + ), ] ResultsOrg = Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) + Optional[str], + Option( + help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.", + rich_help_panel=HELP_PANEL_NAME_2, + ), ] SaveDetails = Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) + bool, + Option( + help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. 
Useful for analysis and debugging.", + rich_help_panel=HELP_PANEL_NAME_2, + ), ] Wandb = Annotated[ bool, Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", + help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.", rich_help_panel=HELP_PANEL_NAME_2, ), ] @@ -108,19 +156,34 @@ # Debug Parameters (HELP_PANEL_NAME_3) MaxSamples = Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) + Optional[int], + Option( + help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.", + rich_help_panel=HELP_PANEL_NAME_3, + ), ] -JobId = Annotated[int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)] +JobId = Annotated[ + int, + Option( + help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.", + rich_help_panel=HELP_PANEL_NAME_3, + ), +] # Common argument patterns -Tasks = Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")] +Tasks = Annotated[ + str, + Argument( + help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks." + ), +] ModelArgs = Annotated[ str, Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files." ), ] From 35c6c6610a15b4a0f663d2f95bfd3419b8ea22ca Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Fri, 22 Aug 2025 07:56:16 +0000 Subject: [PATCH 06/10] fix reasoning tags parsing --- src/lighteval/pipeline.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index c13fda1a0..3a904ffac 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -129,24 +129,24 @@ def __post_init__(self): # noqa C901 elif self.launcher_type == ParallelismManager.OPENAI: if not is_openai_available(): raise ImportError(NO_OPENAI_ERROR_MSG) - else: - # Convert reasoning tags to list if needed - if not isinstance(self.reasoning_tags, list): - try: - self.reasoning_tags = ast.literal_eval(self.reasoning_tags) - except ValueError as e: - raise ValueError( - "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. " - f"Got {self.reasoning_tags} instead, which caused parsing error {e}." - ) - - # Make sure format is correct - if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags): + + # Convert reasoning tags to list if needed + if not isinstance(self.reasoning_tags, list): + try: + self.reasoning_tags = ast.literal_eval(self.reasoning_tags) + except ValueError as e: raise ValueError( "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. " - f"Got {self.reasoning_tags} instead." + f"Got {self.reasoning_tags} instead, which caused parsing error {e}." 
) + # Make sure format is correct + if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags): + raise ValueError( + "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. " + f"Got {self.reasoning_tags} instead." + ) + class Pipeline: def __init__( From c319160380ee7ba950902517d924c861513bff38 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 26 Aug 2025 09:35:29 +0000 Subject: [PATCH 07/10] update from suggestion --- src/lighteval/cli_args.py | 362 +++++++++++++++++-------------- src/lighteval/main_accelerate.py | 75 +++---- src/lighteval/main_baseline.py | 21 +- src/lighteval/main_custom.py | 61 +++--- src/lighteval/main_endpoint.py | 175 +++++++-------- src/lighteval/main_nanotron.py | 9 +- src/lighteval/main_sglang.py | 75 +++---- src/lighteval/main_tasks.py | 7 +- src/lighteval/main_vllm.py | 75 +++---- 9 files changed, 442 insertions(+), 418 deletions(-) diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py index 31aefbe0d..472941ad7 100644 --- a/src/lighteval/cli_args.py +++ b/src/lighteval/cli_args.py @@ -25,7 +25,8 @@ This module exports pre-defined argument types to reduce redundancy across main_*.py files. """ -from typing import Optional +from dataclasses import dataclass +from typing import Any, Optional from typer import Argument, Option from typing_extensions import Annotated @@ -38,172 +39,213 @@ HELP_PANEL_NAME_4 = "Modeling Parameters" +@dataclass +class Arg: + """Base class for CLI arguments with type and default value.""" + + type: Annotated + default: Any + + # Common Parameters (HELP_PANEL_NAME_1) -DatasetLoadingProcesses = Annotated[ - int, - Option( - help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] - -CustomTasks = Annotated[ - Optional[str], - Option( - help="Path to a Python file containing custom task definitions. The file should define a TASKS_TABLE with LightevalTaskConfig objects.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] - -NumFewshotSeeds = Annotated[ - int, - Option( - help="Number of different random seeds to use for few-shot evaluation. Each seed will generate different few-shot examples, providing more robust evaluation.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] - -LoadResponsesFromDetailsDateId = Annotated[ - Optional[str], - Option( - help="Load previously generated model responses from a specific evaluation run instead of running the model. Use the timestamp/date_id from a previous run's details directory.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] - -RemoveReasoningTags = Annotated[ - bool, - Option( - help="Whether to remove reasoning tags from model responses before computing metrics.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] - -ReasoningTags = Annotated[ - str, - Option( - help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.", - rich_help_panel=HELP_PANEL_NAME_1, - ), -] +dataset_loading_processes = Arg( + type=Annotated[ + int, + Option( + help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=1, +) + +custom_tasks = Arg( + type=Annotated[ + Optional[str], + Option( + help="Path to a Python file containing custom task definitions. 
The file should define a TASKS_TABLE with LightevalTaskConfig objects.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=None, +) + +num_fewshot_seeds = Arg( + type=Annotated[ + int, + Option( + help="Number of different random seeds to use for few-shot evaluation. Each seed will generate different few-shot examples, providing more robust evaluation.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=1, +) + +load_responses_from_details_date_id = Arg( + type=Annotated[ + Optional[str], + Option( + help="Load previously generated model responses from a specific evaluation run instead of running the model. Use the timestamp/date_id from a previous run's details directory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=None, +) + +remove_reasoning_tags = Arg( + type=Annotated[ + bool, + Option( + help="Whether to remove reasoning tags from model responses before computing metrics.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=True, +) + +reasoning_tags = Arg( + type=Annotated[ + str, + Option( + help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default="[('', '')]", +) # Logging Parameters (HELP_PANEL_NAME_2) -OutputDir = Annotated[ - str, - Option( - help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -ResultsPathTemplate = Annotated[ - str | None, - Option( - help="Custom template for results file path. Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -PushToHub = Annotated[ - bool, - Option( - help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -PushToTensorboard = Annotated[ - bool, - Option( - help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -PublicRun = Annotated[ - bool, - Option( - help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -ResultsOrg = Annotated[ - Optional[str], - Option( - help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -SaveDetails = Annotated[ - bool, - Option( - help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. Useful for analysis and debugging.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] - -Wandb = Annotated[ - bool, - Option( - help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.", - rich_help_panel=HELP_PANEL_NAME_2, - ), -] +output_dir = Arg( + type=Annotated[ + str, + Option( + help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default="results", +) + +results_path_template = Arg( + type=Annotated[ + str | None, + Option( + help="Custom template for results file path. 
Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=None, +) + +push_to_hub = Arg( + type=Annotated[ + bool, + Option( + help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +push_to_tensorboard = Arg( + type=Annotated[ + bool, + Option( + help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +public_run = Arg( + type=Annotated[ + bool, + Option( + help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +results_org = Arg( + type=Annotated[ + Optional[str], + Option( + help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=None, +) + +save_details = Arg( + type=Annotated[ + bool, + Option( + help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. Useful for analysis and debugging.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +wandb = Arg( + type=Annotated[ + bool, + Option( + help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) # Debug Parameters (HELP_PANEL_NAME_3) -MaxSamples = Annotated[ - Optional[int], - Option( - help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.", - rich_help_panel=HELP_PANEL_NAME_3, - ), -] - -JobId = Annotated[ - int, - Option( - help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.", - rich_help_panel=HELP_PANEL_NAME_3, - ), -] +max_samples = Arg( + type=Annotated[ + Optional[int], + Option( + help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.", + rich_help_panel=HELP_PANEL_NAME_3, + ), + ], + default=None, +) + +job_id = Arg( + type=Annotated[ + int, + Option( + help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.", + rich_help_panel=HELP_PANEL_NAME_3, + ), + ], + default=0, +) # Common argument patterns -Tasks = Annotated[ - str, - Argument( - help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks." - ), -] - -ModelArgs = Annotated[ - str, - Argument( - help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files." 
- ), -] - - -# Default values for common arguments -DEFAULT_VALUES = { - "dataset_loading_processes": 1, - "custom_tasks": None, - "num_fewshot_seeds": 1, - "load_responses_from_details_date_id": None, - "remove_reasoning_tags": True, - "reasoning_tags": "[('', '')]", - "output_dir": "results", - "results_path_template": None, - "push_to_hub": False, - "push_to_tensorboard": False, - "public_run": False, - "results_org": None, - "save_details": False, - "wandb": False, - "max_samples": None, - "job_id": 0, -} +tasks = Arg( + type=Annotated[ + str, + Argument( + help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks." + ), + ], + default=None, # Required argument, no default +) + +model_args = Arg( + type=Annotated[ + str, + Argument( + help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files." + ), + ], + default=None, # Required argument, no default +) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index ab9d74aa5..1e5726f86 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -26,26 +26,25 @@ from typing_extensions import Annotated from lighteval.cli_args import ( - DEFAULT_VALUES, HELP_PANEL_NAME_4, - CustomTasks, - DatasetLoadingProcesses, - JobId, - LoadResponsesFromDetailsDateId, - MaxSamples, - ModelArgs, - NumFewshotSeeds, - OutputDir, - PublicRun, - PushToHub, - PushToTensorboard, - ReasoningTags, - RemoveReasoningTags, - ResultsOrg, - ResultsPathTemplate, - SaveDetails, - Tasks, - Wandb, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, ) @@ -54,32 +53,30 @@ def accelerate( # noqa C901 # === general === - model_args: ModelArgs, - tasks: Tasks, + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === vision_model: Annotated[ bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = False, - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub 
= DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using accelerate and transformers as backend. diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index 035cad276..7d4d34248 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -22,21 +22,20 @@ from lighteval.cli_args import ( - DEFAULT_VALUES, - CustomTasks, - DatasetLoadingProcesses, - MaxSamples, - OutputDir, - Tasks, + custom_tasks, + dataset_loading_processes, + max_samples, + output_dir, + tasks, ) def baseline( - tasks: Tasks, - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], + tasks: tasks.type, + custom_tasks: custom_tasks.type = custom_tasks.default, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + output_dir: output_dir.type = output_dir.default, + max_samples: max_samples.type = max_samples.default, ): """ Compute baselines for given tasks. 
diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py index d2152b585..14507ae8d 100644 --- a/src/lighteval/main_custom.py +++ b/src/lighteval/main_custom.py @@ -26,22 +26,21 @@ from typing_extensions import Annotated from lighteval.cli_args import ( - DEFAULT_VALUES, - CustomTasks, - DatasetLoadingProcesses, - JobId, - MaxSamples, - NumFewshotSeeds, - OutputDir, - PublicRun, - PushToHub, - PushToTensorboard, - ReasoningTags, - RemoveReasoningTags, - ResultsOrg, - ResultsPathTemplate, - SaveDetails, - Tasks, + custom_tasks, + dataset_loading_processes, + job_id, + max_samples, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, ) from lighteval.models.custom.custom_model import CustomModelConfig @@ -54,24 +53,24 @@ def custom( # === general === model_name: Annotated[str, Argument(help="The model name to evaluate")], model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")], - tasks: Tasks, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate custom models (can be anything). 
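The reasoning-tag handling that PATCH 06 hoists out of the launcher-specific branch boils down to: accept a ready-made list or its string form, parse the string with ast.literal_eval, then validate that the result is a list of 2-tuples. A standalone sketch follows; parse_reasoning_tags is an illustration rather than lighteval API, and the empty-looking ('', '') pair shown throughout this series is assumed to be <think>/</think> whose angle-bracketed tags were lost to HTML rendering.

import ast


def parse_reasoning_tags(reasoning_tags):
    """Normalize a string such as "[('<think>', '</think>')]" into a list of tag pairs."""
    if not isinstance(reasoning_tags, list):
        try:
            reasoning_tags = ast.literal_eval(reasoning_tags)
        except (ValueError, SyntaxError) as e:
            # The hunk above catches ValueError only; ast.literal_eval can
            # also raise SyntaxError on unparseable input, so both are caught.
            raise ValueError(
                "reasoning_tags must be a list of pair tuples, e.g. "
                f"[('start_tag', 'end_tag'), ...]. Got {reasoning_tags!r} ({e})."
            )
    if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in reasoning_tags):
        raise ValueError(f"reasoning_tags must be a list of pair tuples, got {reasoning_tags!r}.")
    return reasoning_tags


print(parse_reasoning_tags("[('<think>', '</think>')]"))  # [('<think>', '</think>')]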
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index 08c9ee33c..8b2c7602f 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -26,25 +26,24 @@ from typing_extensions import Annotated from lighteval.cli_args import ( - DEFAULT_VALUES, HELP_PANEL_NAME_4, - CustomTasks, - DatasetLoadingProcesses, - JobId, - LoadResponsesFromDetailsDateId, - MaxSamples, - NumFewshotSeeds, - OutputDir, - PublicRun, - PushToHub, - PushToTensorboard, - ReasoningTags, - RemoveReasoningTags, - ResultsOrg, - ResultsPathTemplate, - SaveDetails, - Tasks, - Wandb, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, ) @@ -57,7 +56,7 @@ def inference_endpoint( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. (examples/model_configs/endpoint_model.yaml)") ], - tasks: Tasks, + tasks: tasks.type, free_endpoint: Annotated[ bool, Option( @@ -66,26 +65,24 @@ def inference_endpoint( ), ] = False, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using inference-endpoints as backend. 
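Looking back at PATCHes 03 and 04: the intermediate commit put a list literal directly on the PipelineParameters dataclass field, which dataclasses reject at class-creation time as a mutable default; the follow-up therefore stores the default as a string and leaves parsing to __post_init__. A minimal reproduction of the failure and the two usual fixes, assuming Python 3.10+ (the class names here are illustrative only):

from dataclasses import dataclass, field

try:

    @dataclass
    class Broken:
        # What PATCH 03 attempted: dataclasses raise ValueError here because
        # a shared list default would be mutated across instances.
        reasoning_tags: list[tuple[str, str]] = [("<think>", "</think>")]

except ValueError as e:
    print(e)  # mutable default <class 'list'> for field reasoning_tags is not allowed: use default_factory


@dataclass
class StringDefault:
    # PATCH 04's approach: keep an immutable string, parse it in __post_init__.
    reasoning_tags: str | list[tuple[str, str]] = "[('<think>', '</think>')]"


@dataclass
class FactoryDefault:
    # The standard alternative: build a fresh list per instance.
    reasoning_tags: list[tuple[str, str]] = field(default_factory=lambda: [("<think>", "</think>")])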
@@ -147,28 +144,26 @@ def tgi( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. (examples/model_configs/tgi_model.yaml)") ], - tasks: Tasks, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using TGI as backend. 
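Since the results_path_template help above names its three variables, its expansion is worth one concrete line. The str.format call below mirrors the documented behaviour but is not lifted from lighteval's source, and the org/model names are made up:

results_path_template = "{output_dir}/experiments/{org}_{model}"

path = results_path_template.format(
    output_dir="results",
    org="meta-llama",  # illustrative org and model names
    model="Llama-3.1-8B-Instruct",
)
print(path)  # results/experiments/meta-llama_Llama-3.1-8B-Instruct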
@@ -237,28 +232,26 @@ def litellm( help="config file path for the litellm model, or a comma separated string of model args (model_name={},base_url={},provider={})" ), ], - tasks: Tasks, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using LiteLLM as backend. 
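The model-args help strings in these hunks all describe the same key1=value1,key2=value2 convention. The sketch below shows how such a string maps onto a dict; parse_model_args is a hypothetical helper, not lighteval's internal parser, and a naive comma split like this would mis-handle nested values such as generation={temperature: 0.6}:

def parse_model_args(model_args: str) -> dict[str, str]:
    # A path to a YAML config is passed through untouched.
    if model_args.endswith((".yaml", ".yml")):
        return {"config_path": model_args}
    # Flat key=value pairs only; nested values would need a real parser.
    return dict(pair.split("=", 1) for pair in model_args.split(","))


print(parse_model_args("pretrained=model_name,device=cuda"))
# {'pretrained': 'model_name', 'device': 'cuda'}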
@@ -331,25 +324,25 @@ def inference_providers( help="config file path for the inference provider model, or a comma separated string of model args (model_name={},provider={},generation={temperature: 0.6})" ), ], - tasks: Tasks, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using HuggingFace's inference providers as backend. diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 936220331..06935e69c 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -29,9 +29,8 @@ from yaml import SafeLoader from lighteval.cli_args import ( - DEFAULT_VALUES, - ReasoningTags, - RemoveReasoningTags, + reasoning_tags, + remove_reasoning_tags, ) @@ -43,8 +42,8 @@ def nanotron( str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, ): """ Evaluate models using nanotron as backend. diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 89867fd84..135396263 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -21,53 +21,50 @@ # SOFTWARE. 
from lighteval.cli_args import ( - DEFAULT_VALUES, - CustomTasks, - DatasetLoadingProcesses, - JobId, - LoadResponsesFromDetailsDateId, - MaxSamples, - ModelArgs, - NumFewshotSeeds, - OutputDir, - PublicRun, - PushToHub, - PushToTensorboard, - ReasoningTags, - RemoveReasoningTags, - ResultsOrg, - ResultsPathTemplate, - SaveDetails, - Tasks, - Wandb, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, ) def sglang( # === general === - model_args: ModelArgs, - tasks: Tasks, + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using sglang as backend. diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 80255a45a..1a283b057 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -20,12 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import logging -from typing import Optional import typer from typer import Argument, Option from typing_extensions import Annotated +from lighteval.cli_args import custom_tasks + app = typer.Typer() @@ -33,7 +34,7 @@ @app.command() def inspect( tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")], - custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None, + custom_tasks: custom_tasks.type = custom_tasks.default, num_samples: Annotated[int, Option(help="Number of samples to display")] = 10, show_config: Annotated[bool, Option(help="Will display the full task config")] = False, ): @@ -65,7 +66,7 @@ def inspect( @app.command() -def list(custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None): +def list(custom_tasks: custom_tasks.type = custom_tasks.default): """ List all tasks """ diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 7f381f19b..45e40fd70 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -26,57 +26,54 @@ from typing_extensions import Annotated from lighteval.cli_args import ( - DEFAULT_VALUES, HELP_PANEL_NAME_4, - CustomTasks, - DatasetLoadingProcesses, - JobId, - LoadResponsesFromDetailsDateId, - MaxSamples, - ModelArgs, - NumFewshotSeeds, - OutputDir, - PublicRun, - PushToHub, - PushToTensorboard, - ReasoningTags, - RemoveReasoningTags, - ResultsOrg, - ResultsPathTemplate, - SaveDetails, - Tasks, - Wandb, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, ) def vllm( # === general === - model_args: ModelArgs, - tasks: Tasks, + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === cot_prompt: Annotated[ Optional[str], Option(help="Use chain of thought prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = None, - dataset_loading_processes: DatasetLoadingProcesses = DEFAULT_VALUES["dataset_loading_processes"], - custom_tasks: CustomTasks = DEFAULT_VALUES["custom_tasks"], - num_fewshot_seeds: NumFewshotSeeds = DEFAULT_VALUES["num_fewshot_seeds"], - load_responses_from_details_date_id: LoadResponsesFromDetailsDateId = DEFAULT_VALUES[ - "load_responses_from_details_date_id" - ], - remove_reasoning_tags: RemoveReasoningTags = DEFAULT_VALUES["remove_reasoning_tags"], - reasoning_tags: ReasoningTags = DEFAULT_VALUES["reasoning_tags"], + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: OutputDir = DEFAULT_VALUES["output_dir"], - results_path_template: ResultsPathTemplate = DEFAULT_VALUES["results_path_template"], - push_to_hub: PushToHub = DEFAULT_VALUES["push_to_hub"], - push_to_tensorboard: PushToTensorboard = DEFAULT_VALUES["push_to_tensorboard"], - public_run: PublicRun = DEFAULT_VALUES["public_run"], - results_org: ResultsOrg = 
DEFAULT_VALUES["results_org"], - save_details: SaveDetails = DEFAULT_VALUES["save_details"], - wandb: Wandb = DEFAULT_VALUES["wandb"], + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: MaxSamples = DEFAULT_VALUES["max_samples"], - job_id: JobId = DEFAULT_VALUES["job_id"], + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using vllm as backend. From 776219e94afcc3e000c677bd06ea0ad374d08ae3 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 26 Aug 2025 09:58:49 +0000 Subject: [PATCH 08/10] styling --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index e50483f1e..bd4dfbca9 100644 --- a/Makefile +++ b/Makefile @@ -2,10 +2,10 @@ style: - ruff format . - ruff check --fix . + uvx ruff format . + uvx ruff check --fix . quality: - ruff format --check . - ruff check . + uvx ruff format --check . + uvx ruff check . From e29ba876bbcf3ae2ad6d03b1879d6769fcfb8050 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 26 Aug 2025 10:01:26 +0000 Subject: [PATCH 09/10] styling --- src/lighteval/main_tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 4fc7d59e8..706dd1a06 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -67,9 +67,9 @@ def inspect( @app.command() def list( - custom_tasks: custom_tasks.type = custom_tasks.default + custom_tasks: custom_tasks.type = custom_tasks.default, suites: Annotated[ - Optional[str], + str | None, Option( help="Comma-separated list of suites to display (e.g., 'helm,harness'). Use 'all' for all suites. If not specified, shows core suites only." ), From 87a8b5e11bcc03f1da29002f8fb43e4cca336ba1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 26 Aug 2025 10:19:03 +0000 Subject: [PATCH 10/10] styling --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index bd4dfbca9..e50483f1e 100644 --- a/Makefile +++ b/Makefile @@ -2,10 +2,10 @@ style: - uvx ruff format . - uvx ruff check --fix . + ruff format . + ruff check --fix . quality: - uvx ruff format --check . - uvx ruff check . + ruff format --check . + ruff check .