251 changes: 251 additions & 0 deletions src/lighteval/cli_args.py
@@ -0,0 +1,251 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Common CLI argument types for LightEval main files.
This module exports pre-defined argument types to reduce redundancy across main_*.py files.
"""

from dataclasses import dataclass
from typing import Any, Optional

from typer import Argument, Option
from typing_extensions import Annotated


# Help panel names for consistent organization
HELP_PANEL_NAME_1 = "Common Parameters"
HELP_PANEL_NAME_2 = "Logging Parameters"
HELP_PANEL_NAME_3 = "Debug Parameters"
HELP_PANEL_NAME_4 = "Modeling Parameters"
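# These names are passed to typer via rich_help_panel (see the Arg definitions
# below), so related options are grouped under a shared panel in --help output.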


@dataclass
class Arg:
"""Base class for CLI arguments with type and default value."""

    type: Any  # an Annotated[...] alias pairing the parameter type with its typer Option/Argument
    default: Any
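
# Usage sketch (illustrative only; `my_command` is a hypothetical name): a
# main_*.py entry point reuses a shared Arg by referencing its `.type` in the
# annotation and its `.default` as the parameter default, e.g.
#
#     def my_command(
#         dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
#     ):
#         ...
#
# main_accelerate.py below follows exactly this pattern.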


# Common Parameters (HELP_PANEL_NAME_1)
dataset_loading_processes = Arg(
type=Annotated[
int,
Option(
help="Number of parallel processes to use for loading datasets. Higher values can speed up dataset loading but use more memory.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default=1,
)

custom_tasks = Arg(
type=Annotated[
Optional[str],
Option(
help="Path to a Python file containing custom task definitions. The file should define a TASKS_TABLE with LightevalTaskConfig objects.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default=None,
)

num_fewshot_seeds = Arg(
type=Annotated[
int,
Option(
help="Number of different random seeds to use for few-shot evaluation. Each seed will generate different few-shot examples, providing more robust evaluation.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default=1,
)

load_responses_from_details_date_id = Arg(
type=Annotated[
Optional[str],
Option(
help="Load previously generated model responses from a specific evaluation run instead of running the model. Use the timestamp/date_id from a previous run's details directory.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default=None,
)

remove_reasoning_tags = Arg(
type=Annotated[
bool,
Option(
help="Whether to remove reasoning tags from model responses before computing metrics.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default=True,
)

reasoning_tags = Arg(
type=Annotated[
str,
Option(
help="List of reasoning tag pairs to remove from responses, formatted as a Python list of tuples.",
rich_help_panel=HELP_PANEL_NAME_1,
),
],
default="[('<think>', '</think>')]",
)


# Logging Parameters (HELP_PANEL_NAME_2)
output_dir = Arg(
type=Annotated[
str,
Option(
help="Directory where evaluation results and details will be saved. Supports fsspec-compliant paths (local, s3, hf hub, etc.).",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default="results",
)

results_path_template = Arg(
type=Annotated[
str | None,
Option(
help="Custom template for results file path. Available variables: {output_dir}, {org}, {model}. Example: '{output_dir}/experiments/{org}_{model}' creates results in a subdirectory.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=None,
)

push_to_hub = Arg(
type=Annotated[
bool,
Option(
help="Whether to push evaluation results and details to the Hugging Face Hub. Requires --results-org to be set.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=False,
)

push_to_tensorboard = Arg(
type=Annotated[
bool,
Option(
help="Whether to create and push TensorBoard logs to the Hugging Face Hub. Requires --results-org to be set.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=False,
)

public_run = Arg(
type=Annotated[
bool,
Option(
help="Whether to make the uploaded results and details public on the Hugging Face Hub. If False, datasets will be private.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=False,
)

results_org = Arg(
type=Annotated[
Optional[str],
Option(
help="Hugging Face organization where results will be pushed. Required when using --push-to-hub or --push-to-tensorboard.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=None,
)

save_details = Arg(
type=Annotated[
bool,
Option(
help="Whether to save detailed per-sample results including model inputs, outputs, and metrics. Useful for analysis and debugging.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=False,
)

wandb = Arg(
type=Annotated[
bool,
Option(
help="Whether to log results to Weights & Biases (wandb) or Trackio. Configure with environment variables: WANDB_PROJECT, WANDB_SPACE_ID, etc. See wandb docs for full configuration options.",
rich_help_panel=HELP_PANEL_NAME_2,
),
],
default=False,
)


# Debug Parameters (HELP_PANEL_NAME_3)
max_samples = Arg(
type=Annotated[
Optional[int],
Option(
help="Maximum number of samples to evaluate per task. Useful for quick testing or debugging. If None, evaluates on all available samples.",
rich_help_panel=HELP_PANEL_NAME_3,
),
],
default=None,
)

job_id = Arg(
type=Annotated[
int,
Option(
help="Optional job identifier for tracking and organizing multiple evaluation runs. Useful in cluster environments.",
rich_help_panel=HELP_PANEL_NAME_3,
),
],
default=0,
)


# Common argument patterns
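# Unlike the Option-based parameters above, these two use typer.Argument, i.e.
# required positional arguments; the default=None values below are placeholders
# on the Arg container, not real CLI defaults.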
tasks = Arg(
type=Annotated[
str,
Argument(
help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
),
],
default=None, # Required argument, no default
)

model_args = Arg(
type=Annotated[
str,
Argument(
help="Model configuration in key=value format (e.g., 'pretrained=model_name,device=cuda') or path to YAML config file. See examples/model_configs/ for template files."
),
],
default=None, # Required argument, no default
)
119 changes: 41 additions & 78 deletions src/lighteval/main_accelerate.py
@@ -21,99 +21,62 @@
 # SOFTWARE.
 
 import logging
-from typing import Optional
 
-from typer import Argument, Option
+from typer import Option
 from typing_extensions import Annotated
 
+from lighteval.cli_args import (
+    HELP_PANEL_NAME_4,
+    custom_tasks,
+    dataset_loading_processes,
+    job_id,
+    load_responses_from_details_date_id,
+    max_samples,
+    model_args,
+    num_fewshot_seeds,
+    output_dir,
+    public_run,
+    push_to_hub,
+    push_to_tensorboard,
+    reasoning_tags,
+    remove_reasoning_tags,
+    results_org,
+    results_path_template,
+    save_details,
+    tasks,
+    wandb,
+)
 
-logger = logging.getLogger(__name__)
-
-HELP_PANEL_NAME_1 = "Common Parameters"
-HELP_PANEL_NAME_2 = "Logging Parameters"
-HELP_PANEL_NAME_3 = "Debug Parameters"
-HELP_PANEL_NAME_4 = "Modeling Parameters"
+logger = logging.getLogger(__name__)
 
 
 def accelerate(  # noqa C901
     # === general ===
-    model_args: Annotated[
-        str,
-        Argument(
-            help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
-        ),
-    ],
-    tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
+    model_args: model_args.type,
+    tasks: tasks.type,
     # === Common parameters ===
     vision_model: Annotated[
         bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
     ] = False,
-    dataset_loading_processes: Annotated[
-        int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = 1,
-    custom_tasks: Annotated[
-        Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = None,
-    num_fewshot_seeds: Annotated[
-        int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = 1,
-    load_responses_from_details_date_id: Annotated[
-        Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
-    ] = None,
-    remove_reasoning_tags: Annotated[
-        bool | None,
-        Option(
-            help="Remove reasoning tags from responses (true to remove, false to leave - true by default).",
-            rich_help_panel=HELP_PANEL_NAME_1,
-        ),
-    ] = True,
-    reasoning_tags: Annotated[
-        str | None,
-        Option(
-            help="List of reasoning tags (as pairs) to remove from responses. Default is [('<think>', '</think>')].",
-            rich_help_panel=HELP_PANEL_NAME_1,
-        ),
-    ] = None,
+    dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+    custom_tasks: custom_tasks.type = custom_tasks.default,
+    num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+    load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
+    remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
+    reasoning_tags: reasoning_tags.type = reasoning_tags.default,
     # === saving ===
-    output_dir: Annotated[
-        str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = "results",
-    results_path_template: Annotated[
-        str | None,
-        Option(
-            help="Template path for where to save the results, you have access to 3 variables, `output_dir`, `org` and `model`. for example a template can be `'{output_dir}/1234/{org}+{model}'`",
-            rich_help_panel=HELP_PANEL_NAME_2,
-        ),
-    ] = None,
-    push_to_hub: Annotated[
-        bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    push_to_tensorboard: Annotated[
-        bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    public_run: Annotated[
-        bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    results_org: Annotated[
-        Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = None,
-    save_details: Annotated[
-        bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
-    ] = False,
-    wandb: Annotated[
-        bool,
-        Option(
-            help="Push results to wandb or trackio if available. We use env variable to configure trackio or wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/, https://github.com/gradio-app/trackio",
-            rich_help_panel=HELP_PANEL_NAME_2,
-        ),
-    ] = False,
+    output_dir: output_dir.type = output_dir.default,
+    results_path_template: results_path_template.type = results_path_template.default,
+    push_to_hub: push_to_hub.type = push_to_hub.default,
+    push_to_tensorboard: push_to_tensorboard.type = push_to_tensorboard.default,
+    public_run: public_run.type = public_run.default,
+    results_org: results_org.type = results_org.default,
+    save_details: save_details.type = save_details.default,
+    wandb: wandb.type = wandb.default,
    # === debug ===
-    max_samples: Annotated[
-        Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
-    ] = None,
-    job_id: Annotated[
-        int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3)
-    ] = 0,
+    max_samples: max_samples.type = max_samples.default,
+    job_id: job_id.type = job_id.default,
 ):
     """
     Evaluate models using accelerate and transformers as backend.