From e985799cbcb2a4f3788688f2fb454d1c461a65ff Mon Sep 17 00:00:00 2001 From: SulRash Date: Sun, 27 Apr 2025 01:04:22 +0300 Subject: [PATCH 1/2] Added support for quantization in vllm backend --- src/lighteval/models/vllm/vllm_model.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 2bdf07529..4f76f1b9f 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -81,6 +81,8 @@ class VLLMModelConfig(ModelConfig): pipeline_parallel_size: PositiveInt = 1 # how many GPUs to use for pipeline parallelism gpu_memory_utilization: NonNegativeFloat = 0.9 # lower this if you are running out of memory max_model_length: PositiveInt | None = None # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough + quantization: str | None = None + load_format: str | None = None swap_space: PositiveInt = 4 # CPU swap space size (GiB) per GPU. 
seed: PositiveInt = 1234 trust_remote_code: bool = False @@ -176,6 +178,12 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: "max_num_seqs": int(config.max_num_seqs), "max_num_batched_tokens": int(config.max_num_batched_tokens), } + + if config.quantization is not None: + self.model_args["quantization"] = config.quantization + if config.load_format is not None: + self.model_args["load_format"] = config.load_format + if config.data_parallel_size > 1: self.model_args["distributed_executor_backend"] = "ray" self._batch_size = "auto" From d6ebe58f84e858c14f9cafc9c1bf4465e1d96008 Mon Sep 17 00:00:00 2001 From: SulRash Date: Mon, 28 Apr 2025 14:41:35 +0300 Subject: [PATCH 2/2] Fixed style issues --- src/lighteval/models/vllm/vllm_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 4f76f1b9f..7fca5e161 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -178,7 +178,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: "max_num_seqs": int(config.max_num_seqs), "max_num_batched_tokens": int(config.max_num_batched_tokens), } - + if config.quantization is not None: self.model_args["quantization"] = config.quantization if config.load_format is not None: