diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 9f8c9b3c4..ea5fc5f88 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -523,11 +523,13 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
                 else:
                     self.metric_aggregated[task_name][metric_name] = metric_result
 
-                if isinstance(metric_result, dict):
-                    stderr = None  # We skip stderr for some corpus metrics that return dicts
+                if isinstance(metric_result, dict) or bootstrap_iters == 0:
+                    stderr = (
+                        None  # We skip stderr for some corpus metrics that return dicts, or if bootstrap_iters is 0
+                    )
                 else:
                     aggregation = task.aggregation()[metric_name]
-                    stderr = get_stderr_function(aggregation=aggregation, number_experiments=1000)
+                    stderr = get_stderr_function(aggregation=aggregation, number_experiments=bootstrap_iters)
                 if stderr is not None and len(metric_values) > 1:
                     try:
                         self.metric_aggregated[task_name][f"{metric_name}_stderr"] = stderr(metric_values)
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 50e4ca3ef..c81a52f17 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -107,6 +107,7 @@ class PipelineParameters:
     system_prompt: str | None = None
     cot_prompt: str | None = None
     load_responses_from_details_date_id: str | None = None
+    bootstrap_iters: int = 1000
 
     def __post_init__(self):  # noqa C901
         if self.launcher_type == ParallelismManager.ACCELERATE:
@@ -293,7 +294,9 @@ def evaluate(self):
 
         if self.is_main_process():
            self.evaluation_tracker.general_config_logger.log_end_time()
-            self.evaluation_tracker.metrics_logger.aggregate(task_dict=self.task_dict, bootstrap_iters=1000)
+            self.evaluation_tracker.metrics_logger.aggregate(
+                task_dict=self.task_dict, bootstrap_iters=self.pipeline_parameters.bootstrap_iters
+            )
             self.evaluation_tracker.details_logger.aggregate()
 
     def _unpack(self, x):
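With this change, the number of bootstrap iterations used for the stderr estimate is configurable through `PipelineParameters` rather than hard-coded to 1000, and setting it to 0 skips stderr computation entirely. Below is a minimal, hypothetical usage sketch: the `bootstrap_iters` field and its semantics come from the diff, the import path and `ParallelismManager.ACCELERATE` follow lighteval's existing API, and everything else (tasks, tracker, model config) is a placeholder to fill in for a real run.

```python
from lighteval.pipeline import ParallelismManager, PipelineParameters

# bootstrap_iters now flows from PipelineParameters into
# MetricsLogger.aggregate(). A value of 0 is treated like the
# dict-returning corpus metrics: stderr is set to None and skipped.
params = PipelineParameters(
    launcher_type=ParallelismManager.ACCELERATE,
    bootstrap_iters=0,  # disable bootstrap stderr estimation
    # bootstrap_iters=1000  # default: unchanged behaviour
)

# Placeholder wiring for a real run (Pipeline is also exposed from
# lighteval.pipeline; tasks, tracker, and model config depend on your setup):
# pipeline = Pipeline(
#     tasks=...,
#     pipeline_parameters=params,
#     evaluation_tracker=...,
#     model_config=...,
# )
# pipeline.evaluate()
```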