diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py
index 62f1129f4..eccfd8678 100644
--- a/src/lighteval/main_tasks.py
+++ b/src/lighteval/main_tasks.py
@@ -46,7 +46,9 @@ def inspect(
 
     from lighteval.tasks.registry import Registry
 
-    registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True)
+    registry = Registry(
+        tasks=tasks, custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True
+    )
 
     # Loading task
     task_dict = registry.load_tasks()
@@ -54,7 +56,7 @@ def inspect(
         print("-" * 10, name, "-" * 10)
         if show_config:
             print("-" * 10, "CONFIG")
-            task.cfg.print()
+            task.config.print()
         for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]):
             if ix == 0:
                 print("-" * 10, "SAMPLES")
diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 54b7f9fc6..92c2c574a 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -47,7 +47,7 @@ class CorpusLevelComputation(ABC):
 
     @abstractmethod
-    def compute_corpus(self):
+    def compute_corpus(self, items):
         raise NotImplementedError
 
     def __str__(self):
diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/extended/ifbench/instructions.py
index 0c4f0a9a0..bdfd3b379 100644
--- a/src/lighteval/tasks/extended/ifbench/instructions.py
+++ b/src/lighteval/tasks/extended/ifbench/instructions.py
@@ -788,7 +788,7 @@ def check_following(self, value):
         """Checks if the response only includes words with prime length."""
         value = value.translate(str.maketrans("", "", string.punctuation))
         words = value.split()
-        primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97)
+        primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}
         for word in words:
             if len(word) not in primes:
                 return False
diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
index 44e05d0cc..b91eb4e00 100644
--- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py
+++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
@@ -32,16 +32,16 @@
 
 import numpy as np
 import requests
-from aenum import extend_enum
 from scipy.optimize import minimize
 
 import lighteval.tasks.default_prompts as prompt
-from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
+from lighteval.metrics.metrics import CorpusLevelMetricGrouping
 from lighteval.metrics.metrics_corpus import CorpusLevelComputation
 from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
 from lighteval.metrics.normalizations import gsm8k_normalizer
+from lighteval.models.model_output import ModelResponse
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import SamplingMethod
+from lighteval.tasks.requests import Doc, SamplingMethod
 
 
 # Utility functions
@@ -101,18 +101,18 @@ def download(self):
             with open(path_dld, "wb") as file:
                 file.write(response.content)
 
-    def compute(self, **args):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
         if self.task == "gsm8k":
             res = ExactMatches(
                 strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer
-            ).compute(**args)
+            ).compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)
         else:
-            res = LoglikelihoodAcc().compute(**args)
+            res = LoglikelihoodAcc().compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)
 
-    def compute_corpus(self, y_input):
-        if len(y_input) == self.num_samples and self.estimates is not None:
+    def compute_corpus(self, items):
+        if len(items) == self.num_samples and self.estimates is not None:
             return self.estimates[self.task]
 
         # We load the weights for the relevant examples
@@ -149,7 +149,7 @@ def compute_corpus(self, y_input):
         # Creating vector y and estimating theta
         y = np.zeros(N)
         for i, j in enumerate(seen_examples):
-            y[j] = y_input[i]
+            y[j] = items[i]
 
         # Getting estimates
         theta = fit_theta(y, seen_examples, A, B)
@@ -175,7 +175,7 @@ def compute_corpus(self, y_input):
             estimates[scenario]["pirt"] = IRTp
             estimates[scenario]["gpirt"] = IRTpp
 
-        self.num_samples = len(y_input)
+        self.num_samples = len(items)
         self.estimates = estimates
         return estimates[self.task]
 
@@ -238,6 +238,25 @@ def compute_corpus(self, y_input):
 #     },
 ]
 
+metrics = {}
+
+for task_param in task_params:
+    name = task_param["name"]
+    if name == "gsm8k":
+        category = SamplingMethod.GENERATIVE
+    else:
+        category = SamplingMethod.LOGPROBS
+
+    metrics[f"tinybench_metric_{name}"] = (
+        CorpusLevelMetricGrouping(
+            metric_name=TinyCorpusAggregator.METRICS,
+            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
+            sample_level_fn=TinyCorpusAggregator(name),
+            category=category,
+            corpus_level_fn=TinyCorpusAggregator(name),
+        ),
+    )
+
 TASKS_TABLE = []
 for task in task_params:
     name = task["name"]
@@ -256,28 +275,8 @@ def compute_corpus(self, y_input):
         evaluation_splits=task["evaluation_split"],
         few_shots_split=None,
         few_shots_select="random_sampling",
-        metrics=[f"tinybench_metric_{name}"],
+        metrics=metrics[f"tinybench_metric_{name}"],
         generation_size=generation_size,
         stop_sequence=stop_sequence,
     )
     TASKS_TABLE.append(task)
-
-# CUSTOM METRIC
-for task_param in task_params:
-    name = task_param["name"]
-    if name == "gsm8k":
-        category = SamplingMethod.GENERATIVE
-    else:
-        category = SamplingMethod.LOGPROBS
-
-    extend_enum(
-        Metrics,
-        f"tinybench_metric_{name}",
-        CorpusLevelMetricGrouping(
-            metric_name=TinyCorpusAggregator.METRICS,
-            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
-            sample_level_fn=TinyCorpusAggregator(name),
-            category=category,
-            corpus_level_fn=TinyCorpusAggregator(name),
-        ),
-    )
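
Note: below is a minimal sketch of the registration pattern this diff moves to,
with MyCorpusMetric as a hypothetical stand-in for TinyCorpusAggregator. Instead
of grafting new members onto the Metrics enum via aenum.extend_enum, the metric
object is built once and handed directly to LightevalTaskConfig(metrics=...).
Only names that appear in the diff above are assumed to exist; everything else
is illustrative.

    from lighteval.metrics.metrics import CorpusLevelMetricGrouping
    from lighteval.metrics.metrics_corpus import CorpusLevelComputation
    from lighteval.metrics.metrics_sample import SampleLevelComputation
    from lighteval.tasks.requests import SamplingMethod


    class MyCorpusMetric(SampleLevelComputation, CorpusLevelComputation):
        # Hypothetical metric, for illustration only.
        METRICS = ["my_metric"]

        def compute(self, doc, model_response, **kwargs):
            # Per-sample score, using the explicit (doc, model_response)
            # signature this diff introduces for TinyCorpusAggregator.compute.
            return dict.fromkeys(self.METRICS, 1.0)

        def compute_corpus(self, items):
            # `items` holds the per-sample results collected for the corpus,
            # matching the new CorpusLevelComputation.compute_corpus(self, items)
            # signature; here we simply average them.
            return dict.fromkeys(self.METRICS, sum(items) / max(len(items), 1))


    my_metric = CorpusLevelMetricGrouping(
        metric_name=MyCorpusMetric.METRICS,
        higher_is_better=dict.fromkeys(MyCorpusMetric.METRICS, True),
        sample_level_fn=MyCorpusMetric(),
        category=SamplingMethod.GENERATIVE,
        corpus_level_fn=MyCorpusMetric(),
    )

    # The object is then wired straight into the task config, e.g.:
    # LightevalTaskConfig(..., metrics=(my_metric,))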