Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/lighteval/main_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ def inspect(

from lighteval.tasks.registry import Registry

registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True)
registry = Registry(
tasks=tasks, custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True
)

# Loading task
task_dict = registry.load_tasks()
for name, task in task_dict.items():
print("-" * 10, name, "-" * 10)
if show_config:
print("-" * 10, "CONFIG")
task.cfg.print()
task.config.print()
for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]):
if ix == 0:
print("-" * 10, "SAMPLES")
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

class CorpusLevelComputation(ABC):
@abstractmethod
def compute_corpus(self):
def compute_corpus(self, items):
raise NotImplementedError

def __str__(self):
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/ifbench/instructions.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,7 @@ def check_following(self, value):
"""Checks if the response only includes words with prime length."""
value = value.translate(str.maketrans("", "", string.punctuation))
words = value.split()
primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97)
primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}
for word in words:
if len(word) not in primes:
return False
Expand Down
61 changes: 30 additions & 31 deletions src/lighteval/tasks/extended/tiny_benchmarks/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,16 @@

import numpy as np
import requests
from aenum import extend_enum
from scipy.optimize import minimize

import lighteval.tasks.default_prompts as prompt
from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
from lighteval.metrics.metrics import CorpusLevelMetricGrouping
from lighteval.metrics.metrics_corpus import CorpusLevelComputation
from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
from lighteval.metrics.normalizations import gsm8k_normalizer
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import SamplingMethod
from lighteval.tasks.requests import Doc, SamplingMethod


# Utility functions
Expand Down Expand Up @@ -101,18 +101,18 @@ def download(self):
with open(path_dld, "wb") as file:
file.write(response.content)

def compute(self, **args):
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
if self.task == "gsm8k":
res = ExactMatches(
strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer
).compute(**args)
).compute(doc, model_response, **kwargs)
return dict.fromkeys(self.METRICS, res)
else:
res = LoglikelihoodAcc().compute(**args)
res = LoglikelihoodAcc().compute(doc, model_response, **kwargs)
return dict.fromkeys(self.METRICS, res)

def compute_corpus(self, y_input):
if len(y_input) == self.num_samples and self.estimates is not None:
def compute_corpus(self, items):
if len(items) == self.num_samples and self.estimates is not None:
return self.estimates[self.task]

# We load the weights for the relevant examples
Expand Down Expand Up @@ -149,7 +149,7 @@ def compute_corpus(self, y_input):
# Creating vector y and estimating theta
y = np.zeros(N)
for i, j in enumerate(seen_examples):
y[j] = y_input[i]
y[j] = items[i]

# Getting estimates
theta = fit_theta(y, seen_examples, A, B)
Expand All @@ -175,7 +175,7 @@ def compute_corpus(self, y_input):
estimates[scenario]["pirt"] = IRTp
estimates[scenario]["gpirt"] = IRTpp

self.num_samples = len(y_input)
self.num_samples = len(items)
self.estimates = estimates

return estimates[self.task]
Expand Down Expand Up @@ -238,6 +238,25 @@ def compute_corpus(self, y_input):
# },
]

metrics = {}

for task_param in task_params:
name = task_param["name"]
if name == "gsm8k":
category = SamplingMethod.GENERATIVE
else:
category = SamplingMethod.LOGPROBS

metrics[f"tinybench_metric_{name}"] = (
CorpusLevelMetricGrouping(
metric_name=TinyCorpusAggregator.METRICS,
higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
sample_level_fn=TinyCorpusAggregator(name),
category=category,
corpus_level_fn=TinyCorpusAggregator(name),
),
)

TASKS_TABLE = []
for task in task_params:
name = task["name"]
Expand All @@ -256,28 +275,8 @@ def compute_corpus(self, y_input):
evaluation_splits=task["evaluation_split"],
few_shots_split=None,
few_shots_select="random_sampling",
metrics=[f"tinybench_metric_{name}"],
metrics=metrics[f"tinybench_metric_{name}"],
generation_size=generation_size,
stop_sequence=stop_sequence,
)
TASKS_TABLE.append(task)

# CUSTOM METRIC
for task_param in task_params:
name = task_param["name"]
if name == "gsm8k":
category = SamplingMethod.GENERATIVE
else:
category = SamplingMethod.LOGPROBS

extend_enum(
Metrics,
f"tinybench_metric_{name}",
CorpusLevelMetricGrouping(
metric_name=TinyCorpusAggregator.METRICS,
higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
sample_level_fn=TinyCorpusAggregator(name),
category=category,
corpus_level_fn=TinyCorpusAggregator(name),
),
)
Loading