From d8fe0d5f60a105fd62163ec601c12bd6e94ae145 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Fri, 31 Oct 2025 14:13:28 +0100
Subject: [PATCH 1/4] adds mmlu-pro

---
 src/lighteval/tasks/tasks/mmlu_pro.py | 77 +++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 src/lighteval/tasks/tasks/mmlu_pro.py

diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py
new file mode 100644
index 000000000..08c6da192
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmlu_pro.py
@@ -0,0 +1,77 @@
+"""
+name:
+MMLU Pro
+
+dataset:
+TIGER-Lab/MMLU-Pro
+
+abstract:
+
+languages:
+english
+
+tags:
+general-knowledge
+
+paper:
+
+"""
+from string import ascii_uppercase
+
+from lighteval.metrics.dynamic_metrics import (
+    LogLikelihoodAccMetric,
+)
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.requests import Doc
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+    CFFormulation,
+    HybridFormulation,
+    MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of the option letters listed below. Think step by step before answering.
+
+{question}
+
+{choices}
+
+Answer:""".strip()
+
+
+def mmlu_pro_prompt_function(line, task_name: str = None):
+    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(ascii_uppercase, line["options"])])
+
+    query = TEMPLATE.format(
+        question=line["question"],
+        choices=choices,
+    )
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=ascii_uppercase[: len(line["options"])],  # one letter per option, not per character of the joined string
+        gold_index=line["answer_index"],
+        instruction=query,
+    )
+
+
+mmlu_pro = LightevalTaskConfig(
+    name="mmlu_pro",
+    prompt_function=mmlu_pro_prompt_function,
+    suite=("lighteval",),
+    hf_repo="TIGER-Lab/MMLU-Pro",
+    hf_subset="default",
+    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
+    evaluation_splits=("test",),
+    few_shots_split="validation",
+    metrics=[Metrics.gpqa_instruct_metric],
+    )
+
+TASKS_TABLE = [mmlu_pro]

From 9c611aab9bf22a7429fbd4c41be9e2f71fd9caff Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Fri, 31 Oct 2025 14:15:02 +0100
Subject: [PATCH 2/4] adds mmlu-pro

---
 src/lighteval/tasks/tasks/mmlu_pro.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py
index 08c6da192..39a6d2e28 100644
--- a/src/lighteval/tasks/tasks/mmlu_pro.py
+++ b/src/lighteval/tasks/tasks/mmlu_pro.py
@@ -6,33 +6,25 @@
 TIGER-Lab/MMLU-Pro
 
 abstract:
+The MMLU-Pro dataset is a more robust and challenging massive multi-task
+understanding dataset, tailored to more rigorously benchmark large language
+models' capabilities. It contains 12K complex questions across various
+disciplines.
 
 languages:
 english
 
 tags:
-general-knowledge
+general-knowledge, knowledge, multiple-choice
 
 paper:
-
+https://arxiv.org/abs/2406.01574
 """
 from string import ascii_uppercase
 
-from lighteval.metrics.dynamic_metrics import (
-    LogLikelihoodAccMetric,
-)
 from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
-from lighteval.tasks.templates.utils.formulation import (
-    CFFormulation,
-    HybridFormulation,
-    MCFFormulation,
-)
-from lighteval.utils.language import Language
 
 
 TEMPLATE = """

From 1fc70091485226d6e84610de711003363139b915 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 4 Nov 2025 11:49:23 +0100
Subject: [PATCH 3/4] add mmlu-pro with inspectai

---
 src/lighteval/main_inspect.py         |  1 +
 src/lighteval/tasks/tasks/mmlu_pro.py | 32 ++++++++++++++++++++++----------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index e51869f41..0fdb67ead 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -473,5 +473,6 @@ def eval(
     task = "lighteval|ifeval|0"
     task = "lighteval|gpqa|0"
     task = "lighteval|ifbench_test|0"
+    task = "lighteval|mmlu_pro|0"
     model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
     eval(models=[model], tasks=task)

diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py
index 39a6d2e28..5a536ffed 100644
--- a/src/lighteval/tasks/tasks/mmlu_pro.py
+++ b/src/lighteval/tasks/tasks/mmlu_pro.py
@@ -20,8 +20,13 @@
 paper:
 https://arxiv.org/abs/2406.01574
 """
+
 from string import ascii_uppercase
 
+from inspect_ai.dataset import Sample
+from inspect_ai.scorer import choice
+from inspect_ai.solver import multiple_choice
+
 from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
@@ -54,16 +59,23 @@ def mmlu_pro_prompt_function(line, task_name: str = None):
     )
 
 
+def record_to_sample(record):
+    return Sample(input=record["question"], target=record["answer"], choices=record["options"])
+
+
 mmlu_pro = LightevalTaskConfig(
-    name="mmlu_pro",
-    prompt_function=mmlu_pro_prompt_function,
-    suite=("lighteval",),
-    hf_repo="TIGER-Lab/MMLU-Pro",
-    hf_subset="default",
-    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
-    evaluation_splits=("test",),
-    few_shots_split="validation",
-    metrics=[Metrics.gpqa_instruct_metric],
-    )
+    name="mmlu_pro",
+    prompt_function=mmlu_pro_prompt_function,
+    sample_fields=record_to_sample,
+    solver=[multiple_choice(cache=True)],
+    scorer=choice(),
+    suite=("lighteval",),
+    hf_repo="TIGER-Lab/MMLU-Pro",
+    hf_subset="default",
+    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
+    evaluation_splits=("test",),
+    few_shots_split="validation",
+    metrics=[Metrics.gpqa_instruct_metric],
+)
 
 TASKS_TABLE = [mmlu_pro]

From 91367b7bad276f51041cd2f244a3ba92f2b7c47a Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 4 Nov 2025 13:30:35 +0100
Subject: [PATCH 4/4] fix reasoning effort

---
 src/lighteval/main_inspect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index 0fdb67ead..be7958873 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -283,7 +283,7 @@ def eval(
         bool | None, Option(help="Cache prompt prefix.", rich_help_panel=HELP_PANEL_NAME_1)
     ] = None,
     reasoning_effort: Annotated[
-        int | None, Option(help="Value: `minimal`, `low`, `medium`, `high`", rich_help_panel=HELP_PANEL_NAME_1)
+        str | None, Option(help="Value: `minimal`, `low`, `medium`, `high`", rich_help_panel=HELP_PANEL_NAME_1)
     ] = None,
     reasoning_tokens: Annotated[
         int | None,
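Note: a quick way to smoke-test the new task is the same debug pattern this series adds at the bottom of main_inspect.py. A minimal sketch, assuming lighteval is installed from this branch and credentials for the inference provider are configured; the model id is simply the one used in the diff above, and any Inspect-compatible model should work:

    from lighteval.main_inspect import eval

    # "lighteval|mmlu_pro|0" follows the suite|task|num_fewshot convention
    # used by the other task strings in main_inspect.py.
    task = "lighteval|mmlu_pro|0"
    model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
    eval(models=[model], tasks=task)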