diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py new file mode 100644 index 000000000..472941ad7 --- /dev/null +++ b/src/lighteval/cli_args.py @@ -0,0 +1,251 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +Common CLI argument types for LightEval main files. +This module exports pre-defined argument types to reduce redundancy across main_*.py files. +""" + +from dataclasses import dataclass +from typing import Any, Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +# Help panel names for consistent organization +HELP_PANEL_NAME_1 = "Common Parameters" +HELP_PANEL_NAME_2 = "Logging Parameters" +HELP_PANEL_NAME_3 = "Debug Parameters" +HELP_PANEL_NAME_4 = "Modeling Parameters" + + +@dataclass +class Arg: + """Base class for CLI arguments with type and default value.""" + + type: Annotated + default: Any + + +# Common Parameters (HELP_PANEL_NAME_1) +dataset_loading_processes = Arg( + type=Annotated[ + int, + Option( + help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=1, +) + +custom_tasks = Arg( + type=Annotated[ + Optional[str], + Option( + help="Path to a Python file containing custom task definitions. The file should define a TASKS_TABLE with LightevalTaskConfig objects.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=None, +) + +num_fewshot_seeds = Arg( + type=Annotated[ + int, + Option( + help="Number of different random seeds to use for few-shot evaluation. Each seed will generate different few-shot examples, providing more robust evaluation.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=1, +) + +load_responses_from_details_date_id = Arg( + type=Annotated[ + Optional[str], + Option( + help="Load previously generated model responses from a specific evaluation run instead of running the model. 
Use the timestamp/date_id from a previous run's details directory.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=None, +) + +remove_reasoning_tags = Arg( + type=Annotated[ + bool, + Option( + help="Whether to remove reasoning tags from model responses before computing metrics.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=True, +) + +reasoning_tags = Arg( + type=Annotated[ + str, + Option( + help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default="[('', '')]", +) + + +# Logging Parameters (HELP_PANEL_NAME_2) +output_dir = Arg( + type=Annotated[ + str, + Option( + help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default="results", +) + +results_path_template = Arg( + type=Annotated[ + str | None, + Option( + help="Custom template for results file path. Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=None, +) + +push_to_hub = Arg( + type=Annotated[ + bool, + Option( + help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +push_to_tensorboard = Arg( + type=Annotated[ + bool, + Option( + help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +public_run = Arg( + type=Annotated[ + bool, + Option( + help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +results_org = Arg( + type=Annotated[ + Optional[str], + Option( + help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=None, +) + +save_details = Arg( + type=Annotated[ + bool, + Option( + help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. Useful for analysis and debugging.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + +wandb = Arg( + type=Annotated[ + bool, + Option( + help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.", + rich_help_panel=HELP_PANEL_NAME_2, + ), + ], + default=False, +) + + +# Debug Parameters (HELP_PANEL_NAME_3) +max_samples = Arg( + type=Annotated[ + Optional[int], + Option( + help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.", + rich_help_panel=HELP_PANEL_NAME_3, + ), + ], + default=None, +) + +job_id = Arg( + type=Annotated[ + int, + Option( + help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.", + rich_help_panel=HELP_PANEL_NAME_3, + ), + ], + default=0, +) + + +# Common argument patterns +tasks = Arg( + type=Annotated[ + str, + Argument( + help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. 
Use 'lighteval tasks list' to see available tasks." + ), + ], + default=None, # Required argument, no default +) + +model_args = Arg( + type=Annotated[ + str, + Argument( + help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files." + ), + ], + default=None, # Required argument, no default +) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 1b3a3c6c8..1e5726f86 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -21,99 +21,62 @@ # SOFTWARE. import logging -from typing import Optional -from typer import Argument, Option +from typer import Option from typing_extensions import Annotated +from lighteval.cli_args import ( + HELP_PANEL_NAME_4, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, +) -logger = logging.getLogger(__name__) -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +logger = logging.getLogger(__name__) def accelerate( # noqa C901 # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === vision_model: Annotated[ bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = False, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (as pairs) to remove from responses. 
Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using accelerate and transformers as backend. diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index b4195d116..7d4d34248 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -21,32 +21,21 @@ # SOFTWARE. 
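For orientation, here is a minimal, self-contained sketch (not part of this diff) of the shared-`Arg` pattern that `cli_args.py` introduces: each `Arg` bundles a typer-ready `Annotated` type with its default value, and every backend command unpacks both in its signature. The `Arg` and `max_samples` names mirror the diff; the `demo` command itself is hypothetical.

```python
# Standalone sketch of the shared-argument pattern from cli_args.py.
from dataclasses import dataclass
from typing import Any, Optional

import typer
from typer import Option
from typing_extensions import Annotated


@dataclass
class Arg:
    type: Any  # holds an Annotated[...] alias carrying the typer Option metadata
    default: Any


max_samples = Arg(
    type=Annotated[Optional[int], Option(help="Maximum number of samples per task.")],
    default=None,
)

app = typer.Typer()


@app.command()
def demo(max_samples: max_samples.type = max_samples.default):
    # typer reads the Option metadata from the Annotated alias, so the
    # generated --max-samples flag behaves the same as a hand-written one.
    print(f"max_samples={max_samples}")


if __name__ == "__main__":
    app()
```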
-from typing import Optional - -from typer import Argument, Option -from typing_extensions import Annotated - - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + custom_tasks, + dataset_loading_processes, + max_samples, + output_dir, + tasks, +) def baseline( - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, + tasks: tasks.type, + custom_tasks: custom_tasks.type = custom_tasks.default, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + output_dir: output_dir.type = output_dir.default, + max_samples: max_samples.type = max_samples.default, ): """ Compute baselines for given tasks. diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py index 6883e3667..14507ae8d 100644 --- a/src/lighteval/main_custom.py +++ b/src/lighteval/main_custom.py @@ -19,87 +19,58 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from typing import Optional + import typer -from typer import Argument, Option +from typer import Argument from typing_extensions import Annotated +from lighteval.cli_args import ( + custom_tasks, + dataset_loading_processes, + job_id, + max_samples, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, +) from lighteval.models.custom.custom_model import CustomModelConfig app = typer.Typer() -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" - - @app.command(rich_help_panel="Evaluation Backends") def custom( # === general === model_name: Annotated[str, Argument(help="The model name to evaluate")], model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: 
Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate custom models (can be anything). diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index f824ca7ab..7d40f1661 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -19,20 +19,35 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
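The `custom` command keeps its bespoke positional arguments while pulling the shared ones in. A hypothetical standalone sketch of that mix, assuming `lighteval.cli_args` is importable: omitting the default keeps a shared argument required, while shared options fall back to their `Arg.default`.

```python
# Hypothetical command mixing a local Argument with the shared aliases.
import typer
from typer import Argument
from typing_extensions import Annotated

from lighteval.cli_args import output_dir, tasks

app = typer.Typer()


@app.command()
def custom_demo(
    model_name: Annotated[str, Argument(help="The model name to evaluate")],
    tasks: tasks.type,  # shared required argument: no default supplied
    output_dir: output_dir.type = output_dir.default,  # shared option with default
):
    print(model_name, tasks, output_dir)
```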
-from typing import Optional + import typer from typer import Argument, Option from typing_extensions import Annotated - -app = typer.Typer() +from lighteval.cli_args import ( + HELP_PANEL_NAME_4, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, +) -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +app = typer.Typer() @app.command(rich_help_panel="Evaluation Backends") @@ -41,7 +56,7 @@ def inference_endpoint( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. (examples/model_configs/endpoint_model.yaml)") ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: tasks.type, free_endpoint: Annotated[ bool, Option( @@ -50,72 +65,24 @@ def inference_endpoint( ), ] = False, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using inference-endpoints as backend. @@ -177,74 +144,26 @@ def tgi( model_config_path: Annotated[ str, Argument(help="Path to model config yaml file. (examples/model_configs/tgi_model.yaml)") ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. 
Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using TGI as backend. 
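Since the refactor is meant to be behavior-preserving, a quick smoke test (not included in the diff) can confirm that a rewritten command still exposes the shared options under their usual flag names:

```python
# Hypothetical smoke test: the shared aliases still generate the same flags.
import typer
from typer.testing import CliRunner

from lighteval.cli_args import dataset_loading_processes, output_dir

app = typer.Typer()


@app.command()
def demo(
    dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
    output_dir: output_dir.type = output_dir.default,
):
    pass


result = CliRunner().invoke(app, ["--help"])
assert "--dataset-loading-processes" in result.output
assert "--output-dir" in result.output
```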
@@ -306,74 +225,26 @@ def litellm( help="config file path for the litellm model, or a comma separated string of model args (model_name={},base_url={},provider={})" ), ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. 
see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using LiteLLM as backend. @@ -446,71 +317,25 @@ def inference_providers( help="config file path for the inference provider model, or a comma separated string of model args (model_name={},provider={},generation={temperature: 0.6})" ), ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. 
see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using HuggingFace's inference providers as backend. diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 1ded89850..06935e69c 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -28,11 +28,10 @@ from typing_extensions import Annotated from yaml import SafeLoader - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + reasoning_tags, + remove_reasoning_tags, +) SEED = 1234 @@ -43,20 +42,8 @@ def nanotron( str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, ): """ Evaluate models using nanotron as backend. diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index c458bcc01..135396263 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -19,94 +19,52 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
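For intuition about what the `reasoning-tags` option controls, here is a rough sketch of stripping tag pairs from a response. This is an illustration, not lighteval's exact implementation, and the `<think>` tags are just the customary example pair:

```python
import re


def strip_reasoning(text: str, tag_pairs: list[tuple[str, str]]) -> str:
    # Remove everything between each (start, end) tag pair, tags included.
    for start, end in tag_pairs:
        text = re.sub(re.escape(start) + r".*?" + re.escape(end), "", text, flags=re.DOTALL)
    return text


print(strip_reasoning("<think>chain of thought</think>The answer is 4.",
                      [("<think>", "</think>")]))
# -> "The answer is 4."
```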
-from typing import Optional -from typer import Argument, Option -from typing_extensions import Annotated - - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, +) def sglang( # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using sglang as backend. diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index d79b06f81..706dd1a06 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -20,12 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import logging -from typing import Optional import typer from typer import Argument, Option from typing_extensions import Annotated +from lighteval.cli_args import custom_tasks + app = typer.Typer() @@ -33,7 +34,7 @@ @app.command() def inspect( tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")], - custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None, + custom_tasks: custom_tasks.type = custom_tasks.default, num_samples: Annotated[int, Option(help="Number of samples to display")] = 10, show_config: Annotated[bool, Option(help="Will display the full task config")] = False, ): @@ -66,9 +67,9 @@ def inspect( @app.command() def list( - custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None, + custom_tasks: custom_tasks.type = custom_tasks.default, suites: Annotated[ - Optional[str], + str | None, Option( help="Comma-separated list of suites to display (e.g., 'helm,harness'). Use 'all' for all suites. If not specified, shows core suites only." 
), diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index fe243c317..45e40fd70 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -19,97 +19,61 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. + from typing import Optional -from typer import Argument, Option +from typer import Option from typing_extensions import Annotated - -HELP_PANEL_NAME_1 = "Common Parameters" -HELP_PANEL_NAME_2 = "Logging Parameters" -HELP_PANEL_NAME_3 = "Debug Parameters" -HELP_PANEL_NAME_4 = "Modeling Parameters" +from lighteval.cli_args import ( + HELP_PANEL_NAME_4, + custom_tasks, + dataset_loading_processes, + job_id, + load_responses_from_details_date_id, + max_samples, + model_args, + num_fewshot_seeds, + output_dir, + public_run, + push_to_hub, + push_to_tensorboard, + reasoning_tags, + remove_reasoning_tags, + results_org, + results_path_template, + save_details, + tasks, + wandb, +) def vllm( # === general === - model_args: Annotated[ - str, - Argument( - help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)" - ), - ], - tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + model_args: model_args.type, + tasks: tasks.type, # === Common parameters === cot_prompt: Annotated[ Optional[str], Option(help="Use chain of thought prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = None, - dataset_loading_processes: Annotated[ - int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - custom_tasks: Annotated[ - Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - num_fewshot_seeds: Annotated[ - int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) - ] = 1, - load_responses_from_details_date_id: Annotated[ - Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) - ] = None, - remove_reasoning_tags: Annotated[ - bool | None, - Option( - help="Remove reasoning tags from responses (true to remove, false to leave - true by default).", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = True, - reasoning_tags: Annotated[ - str | None, - Option( - help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].", - rich_help_panel=HELP_PANEL_NAME_1, - ), - ] = None, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, + num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, + remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, + reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === - output_dir: Annotated[ - str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = "results", - results_path_template: Annotated[ - str | None, - Option( - help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. 
for example a template can be `'{output_dir}/1234/{org}+{model}'`", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = None, - push_to_hub: Annotated[ - bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - push_to_tensorboard: Annotated[ - bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - public_run: Annotated[ - bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - results_org: Annotated[ - Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) - ] = None, - save_details: Annotated[ - bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) - ] = False, - wandb: Annotated[ - bool, - Option( - help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio", - rich_help_panel=HELP_PANEL_NAME_2, - ), - ] = False, + output_dir: output_dir.type = output_dir.default, + results_path_template: results_path_template.type = results_path_template.default, + push_to_hub: push_to_hub.type = push_to_hub.default, + push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default, + public_run: public_run.type = public_run.default, + results_org: results_org.type = results_org.default, + save_details: save_details.type = save_details.default, + wandb: wandb.type = wandb.default, # === debug === - max_samples: Annotated[ - Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) - ] = None, - job_id: Annotated[ - int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) - ] = 0, + max_samples: max_samples.type = max_samples.default, + job_id: job_id.type = job_id.default, ): """ Evaluate models using vllm as backend. diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 91c1b590d..4cf1dbee2 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -103,7 +103,7 @@ class PipelineParameters: max_samples: int | None = None cot_prompt: str | None = None remove_reasoning_tags: bool = True - reasoning_tags: str | list[tuple[str, str]] | None = None + reasoning_tags: str | list[tuple[str, str]] = "[('', '')]" load_responses_from_details_date_id: str | None = None bootstrap_iters: int = 1000 @@ -127,26 +127,24 @@ def __post_init__(self): # noqa C901 elif self.launcher_type == ParallelismManager.OPENAI: if not is_openai_available(): raise ImportError(NO_OPENAI_ERROR_MSG) - if self.reasoning_tags is None: - self.reasoning_tags = [("", "")] - else: - # Convert reasoning tags to list if needed - if not isinstance(self.reasoning_tags, list): - try: - self.reasoning_tags = ast.literal_eval(self.reasoning_tags) - except ValueError as e: - raise ValueError( - "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. " - f"Got {self.reasoning_tags} instead, which caused parsing error {e}." - ) - - # Make sure format is correct - if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags): + + # Convert reasoning tags to list if needed + if not isinstance(self.reasoning_tags, list): + try: + self.reasoning_tags = ast.literal_eval(self.reasoning_tags) + except ValueError as e: raise ValueError( "reasoning_tags must be a list of pair tuples, e.g. 
[('start_tag', 'end_tag'), ...]. " - f"Got {self.reasoning_tags} instead." + f"Got {self.reasoning_tags} instead, which caused parsing error {e}." ) + # Make sure format is correct + if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags): + raise ValueError( + "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. " + f"Got {self.reasoning_tags} instead." + ) + class Pipeline: def __init__(
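The consolidated `reasoning_tags` handling in `PipelineParameters.__post_init__` amounts to the following standalone sketch. Because the field now defaults to the string form rather than `None`, the parse-and-validate path always runs; the `<think>` pair in the usage line is illustrative.

```python
import ast


def parse_reasoning_tags(value: str | list) -> list[tuple[str, str]]:
    # Accept either an already-parsed list or its string form, then check
    # that every entry is a (start_tag, end_tag) pair, as __post_init__ does.
    if not isinstance(value, list):
        try:
            value = ast.literal_eval(value)
        except ValueError as e:
            raise ValueError(
                "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. "
                f"Got {value} instead, which caused parsing error {e}."
            )
    if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in value):
        raise ValueError(
            "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. "
            f"Got {value} instead."
        )
    return value


print(parse_reasoning_tags("[('<think>', '</think>')]"))  # [('<think>', '</think>')]
```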