From 17ea112b5517b81f8399a6ea5b825eea3465f69b Mon Sep 17 00:00:00 2001 From: Luca Rolshoven Date: Mon, 15 Sep 2025 11:59:25 +0200 Subject: [PATCH 1/3] Added `backend_options` parameter to llm judges. Currently only used for litellm backend but can be extended to other backends as well. Allows to specify whether to use caching or not, the number of concurrent requests, and whether the token output budget should be increased for reasoning models. --- src/lighteval/metrics/metrics_sample.py | 2 + src/lighteval/metrics/utils/llm_as_judge.py | 63 ++++++++++++++++++--- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 38a0e2f52..6935f9321 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -947,6 +947,7 @@ def __init__( url: str | None = None, hf_provider: str | None = None, max_tokens: int | None = None, + backend_options: dict | None = None, ) -> None: logger.debug(f"Initializing JudgeLLM with backend: {judge_backend}, model: {judge_model_name}") @@ -993,6 +994,7 @@ def __init__( url=url, hf_provider=hf_provider, max_tokens=max_tokens, + backend_options=backend_options, ) def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list: diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index dcf0a5a88..7b36a7e85 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -25,7 +25,8 @@ import logging import time from concurrent.futures import ThreadPoolExecutor -from typing import Callable, Literal, Optional +from dataclasses import dataclass +from typing import Callable, Dict, Literal, Optional from huggingface_hub import AsyncInferenceClient, InferenceTimeoutError from pydantic import BaseModel @@ -45,16 +46,41 @@ DEFAULT_FORMAT = {"type": "text"} +@dataclass +class LitellmBackendOptions: + """Options for the LiteLLM judge backend with default values. + + Attributes: + caching (bool): Whether to enable caching for the API responses. Defaults to True. + concurrent_requests (int): The maximum number of concurrent requests to the API. Defaults to 10. + increase_max_tokens_for_reasoning (bool): Whether to increase the max tokens for certain reasoning + models. Defaults to True. + """ + + caching: bool = True + concurrent_requests: int = 10 + + # Increases max_tokens depending on the model used, see implementation below + increase_max_tokens_for_reasoning: bool = True + + class JudgeLM: - """A class representing a judge for evaluating answers using either the OpenAI or Transformers library. + """A class representing a judge for evaluating answers using either the chosen backend. Args: model (str): The name of the model. templates (Callable): A function taking into account the question, options, answer, and gold and returning the judge prompt. process_judge_response (Callable): A function for processing the judge's response. - judge_backend (Literal["openai", "transformers", "tgi", "vllm"]): The backend for the judge. + judge_backend (Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"]): The backend for the judge. url (str | None): The URL for the OpenAI API. api_key (str | None): The API key for the OpenAI API (either OpenAI or HF key). + max_tokens (int): The maximum number of tokens to generate. 
+ response_format (BaseModel | None): The format of the response from the API, used for the OpenAI and TGI backend. If not set, + no structured outputs will be generated, just text. + hf_provider (Optional[Literal["black-forest-labs", "cerebras", "cohere", "fal-ai", "fireworks-ai", + "inference-providers", "hyperbolic", "nebius", "novita", "openai", "replicate", "sambanova", "together"]]): + The HuggingFace provider when using the inference-providers backend. + backend_options (Optional[Dict]): Options for the backend. Currently only supported for litellm. Attributes: model (str): The name of the model. @@ -66,7 +92,13 @@ class JudgeLM: process_judge_response (Callable): A function for processing the judge's response. url (str | None): The URL for the OpenAI API. api_key (str | None): The API key for the OpenAI API (either OpenAI or HF key). - backend (Literal["openai", "transformers", "tgi", "vllm"]): The backend for the judge + backend (Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"]): The backend for the judge. + max_tokens (int): The maximum number of tokens to generate. + response_format (BaseModel | dict): The format of the response from the API, used for the OpenAI and TGI backend. + hf_provider (Optional[Literal["black-forest-labs", "cerebras", "cohere", "fal-ai", "fireworks-ai", + "inference-providers", "hyperbolic", "nebius", "novita", "openai", "replicate", "sambanova", "together"]]): + The HuggingFace provider when using the inference-providers backend. + backend_options (Union[LitellmBackendOptions, Dict]): Options for the backend. Currently only supported for litellm. Methods: evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library. @@ -103,6 +135,7 @@ def __init__( "together", ] ] = None, + backend_options: Optional[Dict] = None, ): self.model = model self.template = templates @@ -122,6 +155,12 @@ def __init__( self.response_format = response_format if not None else DEFAULT_FORMAT + self.backend_options = backend_options if backend_options else {} + + # Override backend options dictionary with the corresponding dataclass to ensure all specified options are valid + if judge_backend == "litellm": + self.backend_options = LitellmBackendOptions(**self.backend_options) + # Validate that hf_provider is specified when using inference-providers backend if self.backend == "inference-providers" and self.hf_provider is None: raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'") @@ -286,12 +325,22 @@ def __call_vllm(self, prompt): def __call_litellm(self, prompts): import litellm + if self.backend_options.caching: + from litellm.caching.caching import Cache, LiteLLMCacheType + + litellm.cache = Cache(type=LiteLLMCacheType.DISK) + + # Automatically drop parameters that are not supported by the currently used inference API + litellm.drop_params = True + def __call_api(prompt): error_message = "ERROR: Failed to get response from the API." 
for _ in range(self.API_MAX_RETRY): try: - max_new_tokens = 512 - if "o1" in self.model or "o3" in self.model or "R1" in self.model: + max_new_tokens = self.max_tokens + + is_reasoning_model = "o1" in self.model or "o3" in self.model or "R1" in self.model + if is_reasoning_model and self.backend_options.increase_max_tokens_for_reasoning: max_new_tokens = min(max_new_tokens * 10, 32000) kwargs = { @@ -319,7 +368,7 @@ def __call_api(prompt): return error_message results = [] - with ThreadPoolExecutor(100) as executor: + with ThreadPoolExecutor(self.backend_options.concurrent_requests) as executor: for entry in tqdm(executor.map(__call_api, prompts), total=len(prompts)): results.append(entry) From 7392c18971a7c322077a34264b70df83b005ffd1 Mon Sep 17 00:00:00 2001 From: Luca Rolshoven Date: Mon, 15 Sep 2025 15:22:00 +0200 Subject: [PATCH 2/3] Implemented changes from code review --- src/lighteval/metrics/utils/llm_as_judge.py | 36 +++++---------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index 7b36a7e85..c2b38778d 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -37,7 +37,6 @@ from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available from lighteval.utils.utils import as_list - logging.getLogger("openai").setLevel(logging.ERROR) logging.getLogger("httpx").setLevel(logging.ERROR) logger = logging.getLogger(__name__) @@ -67,38 +66,19 @@ class LitellmBackendOptions: class JudgeLM: """A class representing a judge for evaluating answers using either the chosen backend. - Args: + Attributes: model (str): The name of the model. templates (Callable): A function taking into account the question, options, answer, and gold and returning the judge prompt. process_judge_response (Callable): A function for processing the judge's response. judge_backend (Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"]): The backend for the judge. url (str | None): The URL for the OpenAI API. api_key (str | None): The API key for the OpenAI API (either OpenAI or HF key). - max_tokens (int): The maximum number of tokens to generate. - response_format (BaseModel | None): The format of the response from the API, used for the OpenAI and TGI backend. If not set, - no structured outputs will be generated, just text. - hf_provider (Optional[Literal["black-forest-labs", "cerebras", "cohere", "fal-ai", "fireworks-ai", - "inference-providers", "hyperbolic", "nebius", "novita", "openai", "replicate", "sambanova", "together"]]): - The HuggingFace provider when using the inference-providers backend. - backend_options (Optional[Dict]): Options for the backend. Currently only supported for litellm. - - Attributes: - model (str): The name of the model. - template (Callable): A function taking into account the question, options, answer, and gold and returning the judge prompt. - API_MAX_RETRY (int): The maximum number of retries for the API. - API_RETRY_SLEEP (int): The time to sleep between retries. - client (OpenAI | None): The OpenAI client. - pipe (LLM | AutoModel | None): The Transformers or vllm pipeline. - process_judge_response (Callable): A function for processing the judge's response. - url (str | None): The URL for the OpenAI API. - api_key (str | None): The API key for the OpenAI API (either OpenAI or HF key). 
- backend (Literal["litellm", "openai", "transformers", "tgi", "vllm", "inference-providers"]): The backend for the judge. - max_tokens (int): The maximum number of tokens to generate. - response_format (BaseModel | dict): The format of the response from the API, used for the OpenAI and TGI backend. - hf_provider (Optional[Literal["black-forest-labs", "cerebras", "cohere", "fal-ai", "fireworks-ai", - "inference-providers", "hyperbolic", "nebius", "novita", "openai", "replicate", "sambanova", "together"]]): + max_tokens (int): The maximum number of tokens to generate. Defaults to 512. + response_format (BaseModel | None): The format of the response from the API, used for the OpenAI and TGI backend. + hf_provider (Literal["black-forest-labs", "cerebras", "cohere", "fal-ai", "fireworks-ai", + "inference-providers", "hyperbolic", "nebius", "novita", "openai", "replicate", "sambanova", "together"] | None): The HuggingFace provider when using the inference-providers backend. - backend_options (Union[LitellmBackendOptions, Dict]): Options for the backend. Currently only supported for litellm. + backend_options (dict | None): Options for the backend. Currently only supported for litellm. Methods: evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library. @@ -135,7 +115,7 @@ def __init__( "together", ] ] = None, - backend_options: Optional[Dict] = None, + backend_options: dict | None = None, ): self.model = model self.template = templates @@ -155,7 +135,7 @@ def __init__( self.response_format = response_format if not None else DEFAULT_FORMAT - self.backend_options = backend_options if backend_options else {} + self.backend_options = backend_options or {} # Override backend options dictionary with the corresponding dataclass to ensure all specified options are valid if judge_backend == "litellm": From 8bf62ae7bdcbdc923e7fd5da6dd502eeca4e98cf Mon Sep 17 00:00:00 2001 From: Luca Rolshoven Date: Mon, 15 Sep 2025 15:23:52 +0200 Subject: [PATCH 3/3] Ran pre-commit hooks --- src/lighteval/metrics/utils/llm_as_judge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index c2b38778d..22da4b3e3 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -26,7 +26,7 @@ import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import Callable, Dict, Literal, Optional +from typing import Callable, Literal, Optional from huggingface_hub import AsyncInferenceClient, InferenceTimeoutError from pydantic import BaseModel @@ -37,6 +37,7 @@ from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available from lighteval.utils.utils import as_list + logging.getLogger("openai").setLevel(logging.ERROR) logging.getLogger("httpx").setLevel(logging.ERROR) logger = logging.getLogger(__name__)
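
Reviewer note (not part of the patch): a minimal sketch of the validation behaviour this series introduces. Inside `JudgeLM.__init__`, the user-supplied `backend_options` dict is expanded into the `LitellmBackendOptions` dataclass, so unknown keys fail fast with a `TypeError` while omitted keys keep their defaults. The snippet below only restates the dataclass from the patch; the example dicts are illustrative.

    from dataclasses import dataclass


    @dataclass
    class LitellmBackendOptions:
        caching: bool = True
        concurrent_requests: int = 10
        increase_max_tokens_for_reasoning: bool = True


    # Valid options: unspecified fields keep their defaults.
    opts = LitellmBackendOptions(**{"caching": False, "concurrent_requests": 20})
    print(opts.increase_max_tokens_for_reasoning)  # True

    # A misspelled key is rejected immediately instead of being silently ignored.
    try:
        LitellmBackendOptions(**{"concurent_requests": 20})
    except TypeError as err:
        print(f"Rejected invalid backend option: {err}")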
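
Reviewer note (not part of the patch): how the three options are consumed in `__call_litellm`, condensed into a standalone sketch. The retry loop, response parsing, and the full `kwargs` construction of the actual method are omitted, and `run_judge`/`call_api` are placeholder names; `litellm.drop_params`, the disk-cache import, and the per-call `caching` argument are used as in the patch, assuming a recent litellm release.

    from concurrent.futures import ThreadPoolExecutor


    def run_judge(prompts, model, max_tokens, backend_options):
        import litellm

        if backend_options.caching:
            # Cache judge responses on disk so re-running an evaluation does not
            # re-bill identical prompts.
            from litellm.caching.caching import Cache, LiteLLMCacheType

            litellm.cache = Cache(type=LiteLLMCacheType.DISK)

        # Drop request parameters the selected inference API does not support.
        litellm.drop_params = True

        def call_api(prompt):
            max_new_tokens = max_tokens
            # Reasoning models spend part of the budget on hidden reasoning tokens,
            # so the output budget is raised unless the user opts out.
            is_reasoning_model = "o1" in model or "o3" in model or "R1" in model
            if is_reasoning_model and backend_options.increase_max_tokens_for_reasoning:
                max_new_tokens = min(max_new_tokens * 10, 32000)
            return litellm.completion(
                model=model,
                messages=prompt,
                max_tokens=max_new_tokens,
                caching=backend_options.caching,
            )

        # The pool size is now configurable instead of the previously hard-coded 100.
        with ThreadPoolExecutor(backend_options.concurrent_requests) as executor:
            return list(executor.map(call_api, prompts))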