From d7c370566be314a3a4b54beedff572096fdd33e3 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 12 Nov 2025 16:50:36 +0100 Subject: [PATCH 1/7] add a task dump in registry for better documentation of tasks --- src/lighteval/main_tasks.py | 15 ++ .../tasks/multilingual/tasks/global_mmlu.py | 6 +- src/lighteval/tasks/registry.py | 162 +++++++++++++++++- 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 8286d13f1..fb71b1b77 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -19,6 +19,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import json import logging import typer @@ -92,3 +93,17 @@ def create(template: str, task_name: str, dataset_name: str): f.write(content) logger.info(f"Task created in custom_{task_name}_task.py") + + +@app.command() +def dump( + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, + custom_tasks: custom_tasks.type = custom_tasks.default, +): + """Dump all task names, metadata, and docstrings as JSON""" + from lighteval.tasks.registry import Registry + + registry = Registry(custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual) + modules_data = registry.get_tasks_dump() + + print(json.dumps(modules_data, indent=2, default=str)) diff --git a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py index e9fdd2b04..54d9e8a71 100644 --- a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py +++ b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py @@ -35,8 +35,6 @@ from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation from lighteval.tasks.templates.multichoice import get_mcq_prompt_function from lighteval.tasks.templates.utils.formulation import ( - CFFormulation, - 
HybridFormulation, MCFFormulation, ) from lighteval.utils.language import Language @@ -176,8 +174,6 @@ ] for formulation in [ MCFFormulation(), - CFFormulation(), - HybridFormulation(), ] - for sensitivity_label in ["ALL", "CA", "CS", "UNK"] + for sensitivity_label in ["ALL"] ] diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 324a38680..5ce5f1017 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -25,14 +25,16 @@ import copy import importlib import importlib.util +import inspect import logging import os import sys import time -from functools import lru_cache +from dataclasses import asdict +from functools import lru_cache, partial from itertools import groupby from pathlib import Path -from types import ModuleType +from types import FunctionType, ModuleType from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -149,6 +151,9 @@ def __init__( else: self.tasks_list = self._get_full_task_list_from_input_string(tasks) + self._load_multilingual = load_multilingual + self._custom_tasks = custom_tasks + self._task_registry = Registry.load_all_task_configs( custom_tasks=custom_tasks, load_multilingual=load_multilingual ) @@ -433,3 +438,156 @@ def print_all_tasks(self, suites: str | None = None): # Print summary total_tasks = len([t for t in tasks_names if t.split("|")[1]]) print(f"\nTotal tasks displayed: {total_tasks}") + + def get_tasks_dump(self) -> list[dict]: # noqa: C901 + """Get all task names, metadata, and docstrings as a Python object. 
+ + Returns: + list[dict]: List of dictionaries, each containing: + - module: Module name + - docstring: Parsed docstring as dict + - tasks: List of task configs for this module + """ + task_configs = self._task_registry + + TASKS_DIR = Path(__file__).parent / "tasks" + TASKS_DIR_MULTILINGUAL = Path(__file__).parent / "multilingual" / "tasks" + TASKS_DIR_SUBDIRS = Path(__file__).parent / "tasks" + + task_files = [f for f in TASKS_DIR.glob("*.py") if f.name != "__init__.py"] + task_files_multilingual = [f for f in TASKS_DIR_MULTILINGUAL.glob("*.py") if f.name != "__init__.py"] + task_subdirs = [d for d in TASKS_DIR_SUBDIRS.iterdir() if d.is_dir() and (d / "main.py").exists()] + + module_to_docstring = {} + + for task_file in task_files: + module_name = task_file.stem + module = importlib.import_module(f"lighteval.tasks.tasks.{module_name}") + docstring = (inspect.getdoc(module) or module.__doc__ or "").strip() + module_to_docstring[module] = docstring + + if self._load_multilingual: + for task_file in task_files_multilingual: + module_name = task_file.stem + module = importlib.import_module(f"lighteval.tasks.multilingual.tasks.{module_name}") + docstring = (inspect.getdoc(module) or module.__doc__ or "").strip() + module_to_docstring[module] = docstring + + for task_dir in task_subdirs: + module_name = task_dir.name + module = importlib.import_module(f"lighteval.tasks.tasks.{module_name}.main") + docstring = (inspect.getdoc(module) or module.__doc__ or "").strip() + module_to_docstring[module] = docstring + + if self._custom_tasks is not None: + custom_tasks_module = Registry.create_custom_tasks_module(self._custom_tasks) + docstring = (inspect.getdoc(custom_tasks_module) or custom_tasks_module.__doc__ or "").strip() + module_to_docstring[custom_tasks_module] = docstring + + config_to_module = {} + module_to_task_names = {} + for module, docstring in module_to_docstring.items(): + if hasattr(module, "TASKS_TABLE"): + task_names_in_module = [] + for config in 
getattr(module, "TASKS_TABLE"): + config_to_module[config.name] = module + if config.name in task_configs: + task_names_in_module.append(config.name) + if task_names_in_module: + module_to_task_names[module] = task_names_in_module + + def parse_docstring(docstring: str) -> dict: # noqa: C901 + """Parse a structured docstring into a JSON object. + + Expected format: + key: + value + + key2: + value2 + + Fields 'dataset', 'languages', and 'tags' are parsed as lists if comma-separated. + """ + if not docstring: + return {} + + parsed = {} + lines = docstring.split("\n") + current_key = None + current_value = [] + + list_fields = {"dataset", "languages", "tags"} + + for line in lines: + line = line.strip() + if not line: + if current_key and current_value: + value = "\n".join(current_value).strip() + if current_key in list_fields: + if "," in value: + parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] + else: + parsed[current_key] = [value] if value else [] + else: + parsed[current_key] = value + current_value = [] + continue + + if line.endswith(":"): + if current_key and current_value: + value = "\n".join(current_value).strip() + if current_key in list_fields: + if "," in value: + parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] + else: + parsed[current_key] = [value] if value else [] + else: + parsed[current_key] = value + current_key = line[:-1].strip() + current_value = [] + else: + if current_key: + current_value.append(line) + + if current_key and current_value: + value = "\n".join(current_value).strip() + if current_key in list_fields: + if "," in value: + parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] + else: + parsed[current_key] = [value] if value else [] + else: + parsed[current_key] = value + + return parsed + + def serialize_value(v): + if isinstance(v, (FunctionType, partial)): + func_name = getattr(v.func if isinstance(v, partial) else v, "__name__", 
str(v)) + return f"" + if callable(v) and not isinstance(v, type): + return f"" + if isinstance(v, type): + return f"" + if type(v) in (list, tuple): + return [serialize_value(item) for item in v] + if isinstance(v, dict): + return {k: serialize_value(val) for k, val in v.items()} + return v + + modules_data = [] + for module, task_names in module_to_task_names.items(): + docstring_raw = module_to_docstring.get(module, "") + docstring_parsed = parse_docstring(docstring_raw) + module_name = getattr(module, "__name__", str(module)) + + tasks_in_module = [] + for task_name in task_names: + config = task_configs[task_name] + config_dict = asdict(config) + config_dict = {k: serialize_value(v) for k, v in config_dict.items()} + tasks_in_module.append({"name": task_name, "config": config_dict}) + + modules_data.append({"module": module_name, "docstring": docstring_parsed, "tasks": tasks_in_module}) + + return modules_data From c4f655d1a0ea17ee68d9c22d92b55a449009093e Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 12 Nov 2025 16:57:21 +0100 Subject: [PATCH 2/7] Update src/lighteval/tasks/registry.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/lighteval/tasks/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 5ce5f1017..be8b8803d 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -452,11 +452,11 @@ def get_tasks_dump(self) -> list[dict]: # noqa: C901 TASKS_DIR = Path(__file__).parent / "tasks" TASKS_DIR_MULTILINGUAL = Path(__file__).parent / "multilingual" / "tasks" - TASKS_DIR_SUBDIRS = Path(__file__).parent / "tasks" + task_files = [f for f in TASKS_DIR.glob("*.py") if f.name != "__init__.py"] task_files_multilingual = [f for f in TASKS_DIR_MULTILINGUAL.glob("*.py") if f.name != "__init__.py"] - task_subdirs = [d for d in TASKS_DIR_SUBDIRS.iterdir() if 
d.is_dir() and (d / "main.py").exists()] + task_subdirs = [d for d in TASKS_DIR.iterdir() if d.is_dir() and (d / "main.py").exists()] module_to_docstring = {} From fa181746c38922fa46a3e563829d40ce89eec8d7 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 12 Nov 2025 16:57:35 +0100 Subject: [PATCH 3/7] Update src/lighteval/tasks/registry.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/lighteval/tasks/registry.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index be8b8803d..d5cb47b25 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -484,13 +484,11 @@ def get_tasks_dump(self) -> list[dict]: # noqa: C901 docstring = (inspect.getdoc(custom_tasks_module) or custom_tasks_module.__doc__ or "").strip() module_to_docstring[custom_tasks_module] = docstring - config_to_module = {} module_to_task_names = {} for module, docstring in module_to_docstring.items(): if hasattr(module, "TASKS_TABLE"): task_names_in_module = [] for config in getattr(module, "TASKS_TABLE"): - config_to_module[config.name] = module if config.name in task_configs: task_names_in_module.append(config.name) if task_names_in_module: From d9a005a8132576817ac6c93710398177ed4631a4 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 12 Nov 2025 16:57:52 +0100 Subject: [PATCH 4/7] Update src/lighteval/tasks/registry.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/lighteval/tasks/registry.py | 44 +++++++++++---------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index d5cb47b25..04d36c7c6 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -516,47 +516,33 @@ def parse_docstring(docstring: str) -> dict: # noqa: C901 
list_fields = {"dataset", "languages", "tags"} + def process_current_key_value(current_key, current_value, list_fields, parsed): + if current_key and current_value: + value = "\n".join(current_value).strip() + if current_key in list_fields: + if "," in value: + parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] + else: + parsed[current_key] = [value] if value else [] + else: + parsed[current_key] = value + for line in lines: line = line.strip() if not line: - if current_key and current_value: - value = "\n".join(current_value).strip() - if current_key in list_fields: - if "," in value: - parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] - else: - parsed[current_key] = [value] if value else [] - else: - parsed[current_key] = value - current_value = [] + process_current_key_value(current_key, current_value, list_fields, parsed) + current_value = [] continue if line.endswith(":"): - if current_key and current_value: - value = "\n".join(current_value).strip() - if current_key in list_fields: - if "," in value: - parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] - else: - parsed[current_key] = [value] if value else [] - else: - parsed[current_key] = value + process_current_key_value(current_key, current_value, list_fields, parsed) current_key = line[:-1].strip() current_value = [] else: if current_key: current_value.append(line) - if current_key and current_value: - value = "\n".join(current_value).strip() - if current_key in list_fields: - if "," in value: - parsed[current_key] = [item.strip() for item in value.split(",") if item.strip()] - else: - parsed[current_key] = [value] if value else [] - else: - parsed[current_key] = value - + process_current_key_value(current_key, current_value, list_fields, parsed) return parsed def serialize_value(v): From f4fb2878ec3a8f926d3b50ec33a3fb34be08a58a Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 12 Nov 2025 16:58:46 +0100 
Subject: [PATCH 5/7] remove stray blank line in get_tasks_dump --- src/lighteval/tasks/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 04d36c7c6..52d23f032 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -453,7 +453,6 @@ def get_tasks_dump(self) -> list[dict]: # noqa: C901 TASKS_DIR = Path(__file__).parent / "tasks" TASKS_DIR_MULTILINGUAL = Path(__file__).parent / "multilingual" / "tasks" - task_files = [f for f in TASKS_DIR.glob("*.py") if f.name != "__init__.py"] task_files_multilingual = [f for f in TASKS_DIR_MULTILINGUAL.glob("*.py") if f.name != "__init__.py"] task_subdirs = [d for d in TASKS_DIR.iterdir() if d.is_dir() and (d / "main.py").exists()] From 0a18cb86b49164918edc2b39b081e087e3e12491 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 12 Nov 2025 17:02:20 +0100 Subject: [PATCH 6/7] remove serialize_value helper and stringify task config values directly --- src/lighteval/tasks/registry.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 52d23f032..e7c4e9eb6 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -31,10 +31,10 @@ import sys import time from dataclasses import asdict -from functools import lru_cache, partial +from functools import lru_cache from itertools import groupby from pathlib import Path -from types import FunctionType, ModuleType +from types import ModuleType from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -544,20 +544,6 @@ def process_current_key_value(current_key, current_value, list_fields, parsed): process_current_key_value(current_key, current_value, list_fields, parsed) return parsed - def serialize_value(v): - if isinstance(v, (FunctionType, partial)): - func_name = getattr(v.func if isinstance(v, partial) else v, "__name__", str(v)) - return f"" - if callable(v) and not isinstance(v, type): - return f"" - if isinstance(v, 
type): - return f"" - if type(v) in (list, tuple): - return [serialize_value(item) for item in v] - if isinstance(v, dict): - return {k: serialize_value(val) for k, val in v.items()} - return v - modules_data = [] for module, task_names in module_to_task_names.items(): docstring_raw = module_to_docstring.get(module, "") @@ -568,7 +554,7 @@ def serialize_value(v): for task_name in task_names: config = task_configs[task_name] config_dict = asdict(config) - config_dict = {k: serialize_value(v) for k, v in config_dict.items()} + config_dict = {k: v.__str__() for k, v in config_dict.items()} tasks_in_module.append({"name": task_name, "config": config_dict}) modules_data.append({"module": module_name, "docstring": docstring_parsed, "tasks": tasks_in_module}) From c1366530e344c519c2ade73b475442d9deb7c82f Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 12 Nov 2025 17:09:52 +0100 Subject: [PATCH 7/7] fix aimo --- src/lighteval/tasks/tasks/aimo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/tasks/aimo.py b/src/lighteval/tasks/tasks/aimo.py index 0bd519dc9..fdfc5ff95 100644 --- a/src/lighteval/tasks/tasks/aimo.py +++ b/src/lighteval/tasks/tasks/aimo.py @@ -3,7 +3,7 @@ AIMO Progress Prize 1 dataset: -https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize +lighteval/aimo_progress_prize_1 abstract: Task to evaluate LLMs on the training set of the Kaggle AIMO competition: