diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
index 8493d844d..5490e846b 100644
--- a/docs/source/contributing-to-multilingual-evaluations.mdx
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index 1652ff60e..919c8a1a1 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI. For example
 
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
index ba13b14fd..167cf12e0 100644
--- a/docs/source/saving-and-reading-results.mdx
+++ b/docs/source/saving-and-reading-results.mdx
@@ -247,9 +247,6 @@ The main results file contains several sections:
                 "Question="
             ],
             "num_samples": null,
-            "suite": [
-                "lighteval"
-            ],
             "original_num_docs": 1319,
             "effective_num_docs": 1,
             "must_remove_duplicate_docs": null,
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index c0e166116..adf0ae286 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
diff --git a/examples/nanotron/custom_task.py b/examples/nanotron/custom_task.py
index 55a318edd..ac55aa4dd 100644
--- a/examples/nanotron/custom_task.py
+++ b/examples/nanotron/custom_task.py
@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py
index a8123218f..2bd52af31 100644
--- a/src/lighteval/cli_args.py
+++ b/src/lighteval/cli_args.py
@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None,  # Required argument, no default
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 83723410f..73eb5309b 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}"  # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False):  # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,8 +171,11 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
@@ -179,8 +183,11 @@ def __str__(self, lite: bool = False):
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
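
Note (appended for review, not part of the patch): the quicktour hunk above drops the `{suite}|` prefix, so a task string now has the shape `{task}@{param}={value},...|{num_few_shot}`. A minimal Python sketch of that shape, where `mytask` and `max_samples` are hypothetical placeholders rather than real lighteval task names or parameters:

```python
# Hypothetical task string in the new, suite-less format.
spec = "mytask@max_samples=10|0"

# Split off the few-shot count, then the @-separated parameters.
body, fewshots = spec.rsplit("|", 1)
task, *params = body.split("@")
kwargs = dict(p.split("=", 1) for p in params)

print(task, kwargs, int(fewshots))  # mytask {'max_samples': '10'} 0
```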
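
Note (appended for review, not part of the patch): `functools.partial` objects carry no `__name__` attribute, so the old `metric_v.__name__` access would raise `AttributeError` for parametrized metrics. A standalone sketch of the rendering rule the last hunk introduces; `render_value` and `exact_match` are hypothetical names used only for illustration:

```python
import functools

def render_value(v):
    # Mirror the patched branches: partials are shown as "partial(<func>, ...)",
    # plain callables by name, everything else via repr().
    if isinstance(v, functools.partial):
        func_name = getattr(v.func, "__name__", str(v.func))
        return f"partial({func_name}, ...)"
    if callable(v):
        return getattr(v, "__name__", repr(v))
    return repr(v)

def exact_match(pred, gold, strip=False):  # hypothetical metric function
    return pred.strip() == gold.strip() if strip else pred == gold

print(render_value(exact_match))                                 # exact_match
print(render_value(functools.partial(exact_match, strip=True)))  # partial(exact_match, ...)
```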