diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py
new file mode 100644
index 000000000..472941ad7
--- /dev/null
+++ b/src/lighteval/cli_args.py
@@ -0,0 +1,251 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""
+Common CLI argument types for LightEval main files.
+This module exports pre-defined argument types to reduce redundancy across main_*.py files.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from typer import Argument, Option
+from typing_extensions import Annotated
+
+
+# Help panel names for consistent organization
+HELP_PANEL_NAME_1 = "Common Parameters"
+HELP_PANEL_NAME_2 = "Logging Parameters"
+HELP_PANEL_NAME_3 = "Debug Parameters"
+HELP_PANEL_NAME_4 = "Modeling Parameters"
+
+
+@dataclass
+class Arg:
+ """Base class for CLI arguments with type and default value."""
+
+ type: Annotated
+ default: Any
+
+
+# Common Parameters (HELP_PANEL_NAME_1)
+dataset_loading_processes = Arg(
+ type=Annotated[
+ int,
+ Option(
+ help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=1,
+)
+
+custom_tasks = Arg(
+ type=Annotated[
+ Optional[str],
+ Option(
+ help="Path to a Python file containing custom task definitions. The file should define a TASKS_TABLE with LightevalTaskConfig objects.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=None,
+)
+
+num_fewshot_seeds = Arg(
+ type=Annotated[
+ int,
+ Option(
+ help="Number of different random seeds to use for few-shot evaluation. Each seed will generate different few-shot examples, providing more robust evaluation.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=1,
+)
+
+load_responses_from_details_date_id = Arg(
+ type=Annotated[
+ Optional[str],
+ Option(
+ help="Load previously generated model responses from a specific evaluation run instead of running the model. Use the timestamp/date_id from a previous run's details directory.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=None,
+)
+
+remove_reasoning_tags = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to remove reasoning tags from model responses before computing metrics.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=True,
+)
+
+reasoning_tags = Arg(
+ type=Annotated[
+ str,
+ Option(
+ help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default="[('', '')]",
+)
+
+
+# Logging Parameters (HELP_PANEL_NAME_2)
+output_dir = Arg(
+ type=Annotated[
+ str,
+ Option(
+ help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default="results",
+)
+
+results_path_template = Arg(
+ type=Annotated[
+ str | None,
+ Option(
+ help="Custom template for results file path. Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=None,
+)
+
+push_to_hub = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=False,
+)
+
+push_to_tensorboard = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=False,
+)
+
+public_run = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=False,
+)
+
+results_org = Arg(
+ type=Annotated[
+ Optional[str],
+ Option(
+ help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=None,
+)
+
+save_details = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. Useful for analysis and debugging.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=False,
+)
+
+wandb = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.",
+ rich_help_panel=HELP_PANEL_NAME_2,
+ ),
+ ],
+ default=False,
+)
+
+
+# Debug Parameters (HELP_PANEL_NAME_3)
+max_samples = Arg(
+ type=Annotated[
+ Optional[int],
+ Option(
+ help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.",
+ rich_help_panel=HELP_PANEL_NAME_3,
+ ),
+ ],
+ default=None,
+)
+
+job_id = Arg(
+ type=Annotated[
+ int,
+ Option(
+ help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.",
+ rich_help_panel=HELP_PANEL_NAME_3,
+ ),
+ ],
+ default=0,
+)
+
+
+# Common argument patterns
+tasks = Arg(
+ type=Annotated[
+ str,
+ Argument(
+ help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+ ),
+ ],
+ default=None, # Required argument, no default
+)
+
+model_args = Arg(
+ type=Annotated[
+ str,
+ Argument(
+ help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files."
+ ),
+ ],
+ default=None, # Required argument, no default
+)
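A minimal usage sketch (not part of the patch) of how the shared `Arg` instances above are meant to be consumed by a typer command. The `demo` command is hypothetical; the pattern mirrors the `main_*.py` changes below and assumes the patched `lighteval.cli_args` is importable.

```python
import typer

from lighteval.cli_args import max_samples, output_dir, tasks

app = typer.Typer()


@app.command()
def demo(
    # `tasks.type` is the shared Annotated[...] alias; required, so no default.
    tasks: tasks.type,
    # Optional parameters reuse both the shared annotation and the shared default.
    output_dir: output_dir.type = output_dir.default,
    max_samples: max_samples.type = max_samples.default,
):
    typer.echo(f"tasks={tasks} output_dir={output_dir} max_samples={max_samples}")


if __name__ == "__main__":
    app()
```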
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index 1b3a3c6c8..1e5726f86 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -21,99 +21,62 @@
# SOFTWARE.
import logging
-from typing import Optional
-from typer import Argument, Option
+from typer import Option
from typing_extensions import Annotated
+from lighteval.cli_args import (
+ HELP_PANEL_NAME_4,
+ custom_tasks,
+ dataset_loading_processes,
+ job_id,
+ load_responses_from_details_date_id,
+ max_samples,
+ model_args,
+ num_fewshot_seeds,
+ output_dir,
+ public_run,
+ push_to_hub,
+ push_to_tensorboard,
+ reasoning_tags,
+ remove_reasoning_tags,
+ results_org,
+ results_path_template,
+ save_details,
+ tasks,
+ wandb,
+)
-logger = logging.getLogger(__name__)
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+logger = logging.getLogger(__name__)
def accelerate( # noqa C901
# === general ===
- model_args: Annotated[
- str,
- Argument(
- help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
- ),
- ],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ model_args: model_args.type,
+ tasks: tasks.type,
# === Common parameters ===
vision_model: Annotated[
bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = False,
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using accelerate and transformers as backend.
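One subtlety worth noting in the signatures above: a parameter such as `model_args: model_args.type = model_args.default` shadows the imported `Arg` instance with the local parameter name. This is safe because Python evaluates annotations and defaults once, at function definition time. A self-contained sketch with toy names (not the patch's code) illustrating why:

```python
from dataclasses import dataclass
from typing import Any


@dataclass
class Arg:
    type: Any
    default: Any


model_args = Arg(type=str, default="pretrained=gpt2")


def accelerate(model_args: model_args.type = model_args.default):
    # Inside the body, `model_args` is the call argument, not the module-level Arg:
    # the annotation and default were already resolved when `def` executed.
    return model_args


assert accelerate() == "pretrained=gpt2"
assert accelerate("pretrained=other") == "pretrained=other"
```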
diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py
index b4195d116..7d4d34248 100644
--- a/src/lighteval/main_baseline.py
+++ b/src/lighteval/main_baseline.py
@@ -21,32 +21,21 @@
# SOFTWARE.
-from typing import Optional
-
-from typer import Argument, Option
-from typing_extensions import Annotated
-
-
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+from lighteval.cli_args import (
+ custom_tasks,
+ dataset_loading_processes,
+ max_samples,
+ output_dir,
+ tasks,
+)
def baseline(
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
+ tasks: tasks.type,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ output_dir: output_dir.type = output_dir.default,
+ max_samples: max_samples.type = max_samples.default,
):
"""
Compute baselines for given tasks.
diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py
index 6883e3667..14507ae8d 100644
--- a/src/lighteval/main_custom.py
+++ b/src/lighteval/main_custom.py
@@ -19,87 +19,58 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-from typing import Optional
+
import typer
-from typer import Argument, Option
+from typer import Argument
from typing_extensions import Annotated
+from lighteval.cli_args import (
+ custom_tasks,
+ dataset_loading_processes,
+ job_id,
+ max_samples,
+ num_fewshot_seeds,
+ output_dir,
+ public_run,
+ push_to_hub,
+ push_to_tensorboard,
+ reasoning_tags,
+ remove_reasoning_tags,
+ results_org,
+ results_path_template,
+ save_details,
+ tasks,
+)
from lighteval.models.custom.custom_model import CustomModelConfig
app = typer.Typer()
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
-
-
@app.command(rich_help_panel="Evaluation Backends")
def custom(
# === general ===
model_name: Annotated[str, Argument(help="The model name to evaluate")],
model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ tasks: tasks.type,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate custom models (can be anything).
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py
index f824ca7ab..7d40f1661 100644
--- a/src/lighteval/main_endpoint.py
+++ b/src/lighteval/main_endpoint.py
@@ -19,20 +19,35 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-from typing import Optional
+
import typer
from typer import Argument, Option
from typing_extensions import Annotated
-
-app = typer.Typer()
+from lighteval.cli_args import (
+ HELP_PANEL_NAME_4,
+ custom_tasks,
+ dataset_loading_processes,
+ job_id,
+ load_responses_from_details_date_id,
+ max_samples,
+ num_fewshot_seeds,
+ output_dir,
+ public_run,
+ push_to_hub,
+ push_to_tensorboard,
+ reasoning_tags,
+ remove_reasoning_tags,
+ results_org,
+ results_path_template,
+ save_details,
+ tasks,
+ wandb,
+)
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+app = typer.Typer()
@app.command(rich_help_panel="Evaluation Backends")
@@ -41,7 +56,7 @@ def inference_endpoint(
model_config_path: Annotated[
str, Argument(help="Path to model config yaml file. (examples/model_configs/endpoint_model.yaml)")
],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ tasks: tasks.type,
free_endpoint: Annotated[
bool,
Option(
@@ -50,72 +65,24 @@ def inference_endpoint(
),
] = False,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using inference-endpoints as backend.
@@ -177,74 +144,26 @@ def tgi(
model_config_path: Annotated[
str, Argument(help="Path to model config yaml file. (examples/model_configs/tgi_model.yaml)")
],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ tasks: tasks.type,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using TGI as backend.
@@ -306,74 +225,26 @@ def litellm(
help="config file path for the litellm model, or a comma separated string of model args (model_name={},base_url={},provider={})"
),
],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ tasks: tasks.type,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using LiteLLM as backend.
@@ -446,71 +317,25 @@ def inference_providers(
help="config file path for the inference provider model, or a comma separated string of model args (model_name={},provider={},generation={temperature: 0.6})"
),
],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ tasks: tasks.type,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using HuggingFace's inference providers as backend.
diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
index 1ded89850..06935e69c 100644
--- a/src/lighteval/main_nanotron.py
+++ b/src/lighteval/main_nanotron.py
@@ -28,11 +28,10 @@
from typing_extensions import Annotated
from yaml import SafeLoader
-
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+from lighteval.cli_args import (
+ reasoning_tags,
+ remove_reasoning_tags,
+)
SEED = 1234
@@ -43,20 +42,8 @@ def nanotron(
str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.")
],
lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")],
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
):
"""
Evaluate models using nanotron as backend.
diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py
index c458bcc01..135396263 100644
--- a/src/lighteval/main_sglang.py
+++ b/src/lighteval/main_sglang.py
@@ -19,94 +19,52 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-from typing import Optional
-from typer import Argument, Option
-from typing_extensions import Annotated
-
-
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+from lighteval.cli_args import (
+ custom_tasks,
+ dataset_loading_processes,
+ job_id,
+ load_responses_from_details_date_id,
+ max_samples,
+ model_args,
+ num_fewshot_seeds,
+ output_dir,
+ public_run,
+ push_to_hub,
+ push_to_tensorboard,
+ reasoning_tags,
+ remove_reasoning_tags,
+ results_org,
+ results_path_template,
+ save_details,
+ tasks,
+ wandb,
+)
def sglang(
# === general ===
- model_args: Annotated[
- str,
- Argument(
- help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
- ),
- ],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ model_args: model_args.type,
+ tasks: tasks.type,
# === Common parameters ===
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using sglang as backend.
diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py
index d79b06f81..706dd1a06 100644
--- a/src/lighteval/main_tasks.py
+++ b/src/lighteval/main_tasks.py
@@ -20,12 +20,13 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
-from typing import Optional
import typer
from typer import Argument, Option
from typing_extensions import Annotated
+from lighteval.cli_args import custom_tasks
+
app = typer.Typer()
@@ -33,7 +34,7 @@
@app.command()
def inspect(
tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")],
- custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
num_samples: Annotated[int, Option(help="Number of samples to display")] = 10,
show_config: Annotated[bool, Option(help="Will display the full task config")] = False,
):
@@ -66,9 +67,9 @@ def inspect(
@app.command()
def list(
- custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
suites: Annotated[
- Optional[str],
+ str | None,
Option(
help="Comma-separated list of suites to display (e.g., 'helm,harness'). Use 'all' for all suites. If not specified, shows core suites only."
),
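As the `main_tasks.py` hunks show, the shared aliases also mix freely with options that stay declared inline. A hypothetical command sketching that mix (assumes the patched `lighteval.cli_args` and Python 3.10+ for `str | None`):

```python
import typer
from typer import Option
from typing_extensions import Annotated

from lighteval.cli_args import custom_tasks

app = typer.Typer()


@app.command()
def list_suites(
    # Shared alias: annotation and default come from cli_args.
    custom_tasks: custom_tasks.type = custom_tasks.default,
    # Command-specific option: declared inline, as in `list` above.
    suites: Annotated[str | None, Option(help="Comma-separated list of suites.")] = None,
):
    typer.echo(f"custom_tasks={custom_tasks} suites={suites}")
```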
diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py
index fe243c317..45e40fd70 100644
--- a/src/lighteval/main_vllm.py
+++ b/src/lighteval/main_vllm.py
@@ -19,97 +19,61 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+
from typing import Optional
-from typer import Argument, Option
+from typer import Option
from typing_extensions import Annotated
-
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+from lighteval.cli_args import (
+ HELP_PANEL_NAME_4,
+ custom_tasks,
+ dataset_loading_processes,
+ job_id,
+ load_responses_from_details_date_id,
+ max_samples,
+ model_args,
+ num_fewshot_seeds,
+ output_dir,
+ public_run,
+ push_to_hub,
+ push_to_tensorboard,
+ reasoning_tags,
+ remove_reasoning_tags,
+ results_org,
+ results_path_template,
+ save_details,
+ tasks,
+ wandb,
+)
def vllm(
# === general ===
- model_args: Annotated[
- str,
- Argument(
- help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
- ),
- ],
- tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+ model_args: model_args.type,
+ tasks: tasks.type,
# === Common parameters ===
cot_prompt: Annotated[
Optional[str], Option(help="Use chain of thought prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = None,
- dataset_loading_processes: Annotated[
- int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- custom_tasks: Annotated[
- Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- num_fewshot_seeds: Annotated[
- int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = 1,
- load_responses_from_details_date_id: Annotated[
- Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
- ] = None,
- remove_reasoning_tags: Annotated[
- bool | None,
- Option(
- help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = True,
- reasoning_tags: Annotated[
- str | None,
- Option(
- help="List of reasoning tags (provided as pairs) to remove from responses. Default is [('', '')].",
- rich_help_panel=HELP_PANEL_NAME_1,
- ),
- ] = None,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
+ num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+ remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+ reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
- output_dir: Annotated[
- str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = "results",
- results_path_template: Annotated[
- str | None,
- Option(
- help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = None,
- push_to_hub: Annotated[
- bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- push_to_tensorboard: Annotated[
- bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- public_run: Annotated[
- bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- results_org: Annotated[
- Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = None,
- save_details: Annotated[
- bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
- ] = False,
- wandb: Annotated[
- bool,
- Option(
- help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
- rich_help_panel=HELP_PANEL_NAME_2,
- ),
- ] = False,
+ output_dir: output_dir.type = output_dir.default,
+ results_path_template: results_path_template.type = results_path_template.default,
+ push_to_hub: push_to_hub.type = push_to_hub.default,
+ push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+ public_run: public_run.type = public_run.default,
+ results_org: results_org.type = results_org.default,
+ save_details: save_details.type = save_details.default,
+ wandb: wandb.type = wandb.default,
# === debug ===
- max_samples: Annotated[
- Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = None,
- job_id: Annotated[
- int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
- ] = 0,
+ max_samples: max_samples.type = max_samples.default,
+ job_id: job_id.type = job_id.default,
):
"""
Evaluate models using vllm as backend.
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 91c1b590d..4cf1dbee2 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -103,7 +103,7 @@ class PipelineParameters:
max_samples: int | None = None
cot_prompt: str | None = None
remove_reasoning_tags: bool = True
- reasoning_tags: str | list[tuple[str, str]] | None = None
+    reasoning_tags: str | list[tuple[str, str]] = "[('<think>', '</think>')]"
load_responses_from_details_date_id: str | None = None
bootstrap_iters: int = 1000
@@ -127,26 +127,24 @@ def __post_init__(self): # noqa C901
elif self.launcher_type == ParallelismManager.OPENAI:
if not is_openai_available():
raise ImportError(NO_OPENAI_ERROR_MSG)
- if self.reasoning_tags is None:
- self.reasoning_tags = [("", "")]
- else:
- # Convert reasoning tags to list if needed
- if not isinstance(self.reasoning_tags, list):
- try:
- self.reasoning_tags = ast.literal_eval(self.reasoning_tags)
- except ValueError as e:
- raise ValueError(
- "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. "
- f"Got {self.reasoning_tags} instead, which caused parsing error {e}."
- )
-
- # Make sure format is correct
- if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags):
+
+ # Convert reasoning tags to list if needed
+ if not isinstance(self.reasoning_tags, list):
+ try:
+ self.reasoning_tags = ast.literal_eval(self.reasoning_tags)
+ except ValueError as e:
raise ValueError(
"reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. "
- f"Got {self.reasoning_tags} instead."
+ f"Got {self.reasoning_tags} instead, which caused parsing error {e}."
)
+ # Make sure format is correct
+ if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in self.reasoning_tags):
+ raise ValueError(
+ "reasoning_tags must be a list of pair tuples, e.g. [('start_tag', 'end_tag'), ...]. "
+ f"Got {self.reasoning_tags} instead."
+ )
+
class Pipeline:
def __init__(
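A standalone sketch of the normalization that `__post_init__` now always applies, with the string default round-tripping through `ast.literal_eval`. One hedged observation: `ast.literal_eval` raises `SyntaxError` rather than `ValueError` on some malformed strings, so the sketch below catches both; whether the pipeline should do the same is left open here.

```python
import ast


def parse_reasoning_tags(value):
    """Normalize reasoning tags: accept a list, or parse a Python-literal string."""
    if not isinstance(value, list):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError) as e:
            raise ValueError(f"reasoning_tags could not be parsed: {e}")
    if not all(isinstance(tag, tuple) and len(tag) == 2 for tag in value):
        raise ValueError("reasoning_tags must be a list of (start, end) tuples")
    return value


print(parse_reasoning_tags("[('<think>', '</think>')]"))  # parsed from the CLI string
print(parse_reasoning_tags([("<a>", "</a>")]))  # already a list: validated as-is
```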