docs/source/contributing-to-multilingual-evaluations.mdx (2 changes: 0 additions & 2 deletions)

@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
        name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
docs/source/quicktour.mdx (8 changes: 1 addition & 7 deletions)

@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI.
 For example
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
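With the `{suite}|` prefix gone, a task spec reduces to the task name, an optional `@`-separated parameter list, and the few-shot count. For illustration only (the task and parameter names below are hypothetical, not taken from this PR):

```txt
mytask|0
mytask|5
mytask@seed=42@max_len=256|0
```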
docs/source/saving-and-reading-results.mdx (3 changes: 0 additions & 3 deletions)

@@ -247,9 +247,6 @@ The main results file contains several sections:
         "Question="
       ],
       "num_samples": null,
-      "suite": [
-        "lighteval"
-      ],
       "original_num_docs": 1319,
       "effective_num_docs": 1,
       "must_remove_duplicate_docs": null,
examples/nanotron/custom_evaluation_tasks.py (8 changes: 0 additions & 8 deletions)

@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
examples/nanotron/custom_task.py (2 changes: 0 additions & 2 deletions)

@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
src/lighteval/cli_args.py (2 changes: 1 addition & 1 deletion)

@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None, # Required argument, no default
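Per the updated help string, each comma-separated entry is a task name with an optional `|fewshot` suffix. A sketch of accepted values (task names are illustrative, and combining per-task few-shot counts is an assumption, not confirmed by this PR):

```txt
"mytask"            # single task, default few-shot setting
"mytask|3"          # single task with 3 few-shot examples
"taska|0,taskb|5"   # two tasks in one run (assumed combinable)
```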
src/lighteval/tasks/lighteval_task.py (17 changes: 12 additions & 5 deletions)

@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}" # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False): # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,17 +171,23 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
                             repr_v = repr(metric_v)
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
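As a standalone illustration of the new repr logic, here is a minimal sketch; `_describe` is a hypothetical helper (the patch inlines this logic in `__str__` and additionally special-cases `Metric` value types):

```python
import functools

def _describe(value):
    # Check functools.partial before the generic callable branch:
    # partials are callable but have no __name__, so the default
    # repr would be an unreadable "functools.partial(<function ...>)".
    if isinstance(value, functools.partial):
        func_name = getattr(value.func, "__name__", str(value.func))
        return f"partial({func_name}, ...)"
    # Some callables (e.g. class instances defining __call__) also
    # lack __name__, hence getattr with a repr fallback.
    if callable(value):
        return getattr(value, "__name__", repr(value))
    return repr(value)

print(_describe(functools.partial(max, 0)))  # partial(max, ...)
print(_describe(len))                        # len
print(_describe(3.14))                       # 3.14
```

The ordering matters: `isinstance(p, Callable)` is true for partials, so the generic callable branch would otherwise shadow the partial case and fall back to an opaque repr.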