8 changes: 2 additions & 6 deletions docs/source/evaluating-a-custom-model.mdx
@@ -16,16 +16,13 @@ Here's a basic example:
from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached

class MyCustomModel(LightevalModel):
    def __init__(self, config):
        super().__init__(config)
        # Initialize your model here...

-        # Enable caching (recommended)
-        self._cache = SampleCache(config)

    @cached(SamplingMethod.GENERATIVE)
    def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
        # Implement generation logic
@@ -168,15 +165,14 @@ To enable caching in your custom model:
### Step 1: Import Caching Components
```python
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
```
### Step 2: Initialize Cache in Constructor
```python
def __init__(self, config):
    super().__init__(config)
    # Your initialization code...
-    self._cache = SampleCache(config)
```
3. Add cache decorators to your prediction methods:
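For reference, a custom model after this change only needs the `cached` decorator; it no longer builds a `SampleCache` itself. A minimal sketch under that assumption (the `ModelResponse` fields and the generation body are illustrative placeholders, not lighteval's documented example):

```python
from typing import List

from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.cache_management import cached


class MyCustomModel(LightevalModel):
    def __init__(self, config):
        super().__init__(config)
        # Initialize your model here; no SampleCache construction is needed anymore.

    @cached(SamplingMethod.GENERATIVE)
    def greedy_until(self, docs: List[Doc]) -> List[ModelResponse]:
        # Illustrative generation logic: return one response per input doc.
        return [ModelResponse(text=["generated text"]) for _ in docs]
```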
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -65,8 +65,8 @@ dependencies = [
"GitPython>=3.1.41", # for logging
"datasets>=4.0.0",
"pydantic",
"numpy>=2", # pinned to avoid incompatibilities
"hf-xet>=1.1.8", # pinned to avoid failing test suite
"numpy>=2", # pinned to avoid incompatibilities
"hf-xet>=1.1.8", # pinned to avoid failing test suite
# Prettiness
"typer>=0.20.0",
"termcolor==2.3.0",
@@ -87,6 +87,7 @@ dependencies = [
"httpx>=0.27.2",
"latex2sympy2_extended==1.0.6",
"langcodes",
"diskcache>=5.6.3",
]

[project.optional-dependencies]
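The new `diskcache` dependency suggests the sample cache is now persisted on disk via `diskcache`. A small, self-contained illustration of the memoization pattern that library provides (not lighteval's actual cache code; the directory and key layout are made up):

```python
from diskcache import Cache


def expensive_prediction(doc_id: str) -> str:
    # Stand-in for a real model call.
    return f"prediction for {doc_id}"


# Entries written here survive across processes and runs.
cache = Cache("./.sample_cache")

key = ("greedy_until", "doc-42")
if key not in cache:
    cache[key] = expensive_prediction("doc-42")
print(cache[key])
```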
2 changes: 2 additions & 0 deletions src/lighteval/logging/evaluation_tracker.py
@@ -72,6 +72,8 @@ def default(self, o):  # noqa : C901
                return o.__dict__
            except Exception:
                return str(o)
+        if hasattr(o, "model_dump"):  # is pydantic BaseModel
+            return o.model_dump()
        if callable(o):
            if hasattr(o, "__name__"):
                return o.__name__
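For context, the hunk above makes the tracker's JSON encoder serialize pydantic models through `model_dump()`. A standalone sketch of that pattern (the encoder and model class names here are invented for illustration, not lighteval classes):

```python
import json

from pydantic import BaseModel


class SketchEncoder(json.JSONEncoder):
    def default(self, o):
        if hasattr(o, "model_dump"):  # pydantic v2 BaseModel
            return o.model_dump()
        if callable(o):
            return getattr(o, "__name__", str(o))
        return super().default(o)


class MetricConfig(BaseModel):  # illustrative model
    name: str
    higher_is_better: bool = True


print(json.dumps({"metric": MetricConfig(name="acc")}, cls=SketchEncoder))
# {"metric": {"name": "acc", "higher_is_better": true}}
```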
5 changes: 1 addition & 4 deletions src/lighteval/models/dummy/dummy_model.py
@@ -29,7 +29,7 @@
from lighteval.models.abstract_model import LightevalModel, ModelConfig
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached


class DummyModelConfig(ModelConfig):
@@ -70,9 +70,6 @@ def __init__(
        self._random = random.Random(self.config.seed)
        self._tokenizer = None

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    @property
    def tokenizer(self):
        if not self._tokenizer:
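Since every backend stops constructing `SampleCache(config)` in `__init__`, the `cached` decorator presumably provisions the cache itself on first use. A purely illustrative sketch of that lazy-initialization idea, not the actual lighteval implementation (the cache location, key scheme, and `doc.query` usage are assumptions):

```python
import functools

from diskcache import Cache


def cached(sampling_method):
    """Illustrative decorator: memoize per-doc model responses in a lazily created disk cache."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(self, docs):
            # Create the cache on first use instead of in every model's __init__.
            if getattr(self, "_cache", None) is None:
                self._cache = Cache("./.cache/samples")  # assumed location
            keys = [(str(sampling_method), doc.query) for doc in docs]
            missing = [doc for doc, key in zip(docs, keys) if key not in self._cache]
            if missing:
                # Only run the wrapped prediction method for uncached docs.
                for doc, response in zip(missing, fn(self, missing)):
                    self._cache[(str(sampling_method), doc.query)] = response
            return [self._cache[key] for key in keys]

        return wrapper

    return decorator
```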
5 changes: 1 addition & 4 deletions src/lighteval/models/endpoints/endpoint_model.py
@@ -49,7 +49,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached


logger = logging.getLogger(__name__)
@@ -268,9 +268,6 @@ def __init__(self, config: Union[InferenceEndpointModelConfig, ServerlessEndpoin
        self.generation_parameters = config.generation_parameters
        self.generation_config = self.generation_parameters.to_tgi_ie_dict()

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    def _create_endpoint(  # noqa: C901
        self, config: InferenceEndpointModelConfig | ServerlessEndpointModelConfig
    ) -> Tuple[Union[InferenceEndpoint | None], AsyncInferenceClient, InferenceClient]:  # noqa: C901
5 changes: 1 addition & 4 deletions src/lighteval/models/endpoints/inference_providers_model.py
@@ -36,7 +36,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached


logger = logging.getLogger(__name__)
@@ -134,9 +134,6 @@ def __init__(self, config: InferenceProvidersModelConfig) -> None:
            use_chat_template=True, tokenizer=self.tokenizer, system_prompt=config.system_prompt
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    async def __call_api(self, prompt: List[dict], num_samples: int) -> Optional[ChatCompletionOutput]:
        """Make API call with exponential backoff retry logic.

5 changes: 1 addition & 4 deletions src/lighteval/models/endpoints/litellm_model.py
@@ -33,7 +33,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import is_package_available, requires


@@ -162,9 +162,6 @@ def __init__(self, config: LiteLLMModelConfig) -> None:
            use_chat_template=True, tokenizer=self.tokenizer, system_prompt=config.system_prompt
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    def _prepare_stop_sequence(self, stop_sequence):
        """Prepare and validate stop sequence."""
        if self.provider == "anthropic":
4 changes: 0 additions & 4 deletions src/lighteval/models/endpoints/tgi_model.py
@@ -31,7 +31,6 @@
from lighteval.models.abstract_model import ModelConfig
from lighteval.models.endpoints.endpoint_model import InferenceEndpointModel
from lighteval.tasks.prompt_manager import PromptManager
-from lighteval.utils.cache_management import SampleCache
from lighteval.utils.imports import Extra, is_package_available, requires


@@ -130,9 +129,6 @@ def __init__(self, config: TGIModelConfig) -> None:
            use_chat_template=True, tokenizer=self.tokenizer, system_prompt=config.system_prompt
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    @requires(Extra.TGI)
    def _async_process_request(
        self,
5 changes: 1 addition & 4 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -50,7 +50,7 @@
    Doc,
    SamplingMethod,
)
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import is_package_available
from lighteval.utils.parallelism import find_executable_batch_size
from lighteval.utils.utils import as_list
@@ -304,9 +304,6 @@ def __init__(
        self.pairwise_tokenization = nanotron_config.lighteval_config.tasks.pairwise_tokenization
        self.batch_size = nanotron_config.lighteval_config.batch_size

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(nanotron_config)

    @property
    def tokenizer(self):
        return self._tokenizer
5 changes: 1 addition & 4 deletions src/lighteval/models/sglang/sglang_model.py
@@ -34,7 +34,7 @@
from lighteval.models.utils import _simplify_name, uses_chat_template
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import is_package_available, requires


@@ -163,9 +163,6 @@ def __init__(
        self.pairwise_tokenization = config.pairwise_tokenization
        self.prompt_manager = PromptManager(self.use_chat_template, self.tokenizer, config.system_prompt)

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    @property
    def tokenizer(self):
        return self._tokenizer
8 changes: 1 addition & 7 deletions src/lighteval/models/transformers/transformers_model.py
@@ -53,7 +53,7 @@
from lighteval.models.utils import _get_dtype, _get_model_sha, _simplify_name, uses_chat_template
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import (
is_package_available,
)
@@ -237,9 +237,6 @@ def __init__(
            use_chat_template=self.use_chat_template, tokenizer=self.tokenizer, system_prompt=config.system_prompt
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    def cleanup(self):
        """Clean up operations if needed, such as closing an endpoint."""
        del self.model
@@ -301,9 +298,6 @@ def from_model(
            system_prompt=config.system_prompt if config else None,
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config) if config else None

        return self

    @property
5 changes: 1 addition & 4 deletions src/lighteval/models/transformers/vlm_transformers_model.py
@@ -45,7 +45,7 @@
from lighteval.models.utils import _get_dtype, _get_model_sha, _simplify_name
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import (
is_package_available,
)
@@ -177,9 +177,6 @@ def __init__(
            use_chat_template=True, tokenizer=self.tokenizer, system_prompt=config.system_prompt
        )

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    @property
    def tokenizer(self):
        return self.processor.tokenizer
5 changes: 1 addition & 4 deletions src/lighteval/models/vllm/vllm_model.py
@@ -37,7 +37,7 @@
from lighteval.models.utils import _simplify_name, uses_chat_template
from lighteval.tasks.prompt_manager import PromptManager
from lighteval.tasks.requests import Doc, SamplingMethod
-from lighteval.utils.cache_management import SampleCache, cached
+from lighteval.utils.cache_management import cached
from lighteval.utils.imports import is_package_available, requires


@@ -216,9 +216,6 @@ def __init__(

        self.prompt_manager = PromptManager(self.use_chat_template, self.tokenizer, config.system_prompt)

-        # Initialize cache for tokenization and predictions
-        self._cache = SampleCache(config)

    @property
    def tokenizer(self):
        return self._tokenizer
9 changes: 4 additions & 5 deletions src/lighteval/pipeline.py
@@ -144,8 +144,6 @@ def __init__(
        self.model_config = model_config
        self.accelerator, self.parallel_context = self._init_parallelism_manager()
        self.model = self._init_model(model_config, model)
-        # Must occur after model and task init
-        self.model._cache._init_registry(self.registry)
        # Must occur after model init
        self._init_accelerator_seeds()

@@ -308,6 +306,8 @@ async def _run_model_async(self):
                model_outputs = await self.model.loglikelihood(docs)
            outputs[sampling_method] = model_outputs

+        self.model.cleanup()

Review comment (Member): nice catch

        return outputs

    def _run_model_sync(self):
@@ -327,6 +327,8 @@ def _run_model_sync(self):
                model_outputs = self.model.loglikelihood_rolling(docs)
            outputs[sampling_method] = model_outputs

+        self.model.cleanup()
+
        return outputs

    def _run_model(self):
@@ -339,9 +341,6 @@ def _run_model(self):
        else:
            outputs = self._run_model_sync()

-        # Cleaning up the model before running metrics
-        self.model.cleanup()

        return outputs

    def _post_process_outputs(self, sampling_method_responses: dict[str, list[ModelResponse]]):
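The net effect of the pipeline changes: the `_cache._init_registry` hookup is gone from `__init__`, and `cleanup()` is now called inside each execution path right after inference rather than once in `_run_model()`. A simplified sketch of the resulting flow (not the actual `Pipeline` code; the dispatch on sampling method is condensed to a single model call):

```python
def _run_model_sync(model, docs_by_sampling_method):
    outputs = {}
    for sampling_method, docs in docs_by_sampling_method.items():
        outputs[sampling_method] = model.greedy_until(docs)
    # Release model resources as soon as inference finishes, before metrics run.
    model.cleanup()
    return outputs


async def _run_model_async(model, docs_by_sampling_method):
    outputs = {}
    for sampling_method, docs in docs_by_sampling_method.items():
        outputs[sampling_method] = await model.greedy_until(docs)
    model.cleanup()  # cleanup now lives inside the branch, right after inference
    return outputs
```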