From 9b56b621aae9f23166c07599229a4a74464de21c Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 20:55:37 -0500
Subject: [PATCH 1/4] fix bug of lm_head and dispatch model

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py  | 23 +++++++++++++++++++++--
 auto_round/compressors/utils.py |  3 +++
 auto_round/envs.py              |  1 +
 auto_round/eval/eval_cli.py     |  1 +
 test/test_cuda/test_gguf.py     |  2 +-
 5 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index c098320f9..2cb7baa8e 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -1994,6 +1994,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             devices = parse_available_devices(self.device_map)
             max_memory = get_max_memory()
             new_max_memory = {}
+            if "cpu" not in devices:
+                devices.append("cpu")
             for device in devices:
                 if ":" in device:
                     device = int(device.split(":")[-1])
@@ -2005,8 +2007,25 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             device_map = infer_auto_device_map(
                 self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
             )
-
-            self.model = dispatch_model(self.model, device_map=device_map)
+            if len(devices) > 1 and "cpu" in device_map:
+                logger.warning(
+                    "Not enough memory cause the CPU to be used, which may severely impact speed."
+                    " Please consider using more cards."
+                )
+
+            try:
+                self.model = dispatch_model(self.model, device_map=device_map)
+            except ValueError as e:
+                if "offload_dir" in e.__str__():
+                    logger.warning(
+                        f"Due to insufficient resources, disk is used to store the model."
+                        f" `offload_dir={envs.AR_WORK_SPACE}`"
+                    )
+                    self.model = dispatch_model(
+                        self.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE
+                    )
+                else:
+                    raise
         else:
             self.model = self.model.to(self.device)
 
diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py
index cf8d90098..2514c9f34 100644
--- a/auto_round/compressors/utils.py
+++ b/auto_round/compressors/utils.py
@@ -380,6 +380,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
     if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
         tie_word_embeddings = model.config.tie_word_embeddings
 
+    if lm_head_name in layer_config:
+        quant_lm_head = True
+
     if quant_lm_head and tie_word_embeddings and not gguf_name:
         quant_lm_head = False
         logger.warning(
diff --git a/auto_round/envs.py b/auto_round/envs.py
index 1a1f51de0..5ff90d170 100644
--- a/auto_round/envs.py
+++ b/auto_round/envs.py
@@ -25,6 +25,7 @@
     "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
     "AR_ENABLE_COMPILE_PACKING": lambda: os.getenv("AR_ENABLE_COMPILE_PACKING", "0").lower() in ("1", "true", "yes"),
     "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"],
+    "AR_WORK_SPACE": lambda: os.getenv("AR_WORK_SPACE", "ar_work_space").lower(),
 }
 
 
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7235e9e4c..0c96b4bd8 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -166,6 +166,7 @@ def eval(args):
             if file.endswith(".gguf"):
                 is_gguf_file = True
                 gguf_file = file
+                model = args.model
     eval_model_dtype = get_model_dtype(args.eval_model_dtype)
     if is_gguf_file:
         import torch
diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py
index 299c45668..312e561cf 100644
--- a/test/test_cuda/test_gguf.py
+++ b/test/test_cuda/test_gguf.py
@@ -220,7 +220,7 @@ def test_vlm_gguf(self):
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
         file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 6962, delta=5.0)
+        self.assertAlmostEqual(file_size, 6568, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
         self.assertAlmostEqual(file_size, 1599, delta=5.0)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
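For context on the dispatch change above: accelerate's dispatch_model() raises a ValueError mentioning `offload_dir` when infer_auto_device_map() has assigned part of the model to "disk", and the new try/except keys off exactly that message. A minimal standalone sketch of the same fallback pattern follows; the model name is illustrative only, and the os.getenv lookup stands in for auto_round's envs.AR_WORK_SPACE.

# Sketch of the dispatch fallback from PATCH 1/4, outside auto_round.
# Assumes accelerate and transformers are installed; "facebook/opt-125m"
# is an illustrative stand-in for the model being quantized.
import os

from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_max_memory
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
device_map = infer_auto_device_map(model, max_memory=get_max_memory())

if "cpu" in device_map.values():
    # Mirrors the new warning: some modules spilled over into system RAM.
    print("warning: part of the model is placed on CPU, expect slowdowns")

try:
    model = dispatch_model(model, device_map=device_map)
except ValueError as e:
    # accelerate refuses to dispatch a map containing "disk" entries
    # unless an offload directory is supplied; retry with a scratch dir,
    # as the patch does with envs.AR_WORK_SPACE.
    if "offload_dir" in str(e):
        model = dispatch_model(
            model,
            device_map=device_map,
            offload_dir=os.getenv("AR_WORK_SPACE", "ar_work_space"),
        )
    else:
        raise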
From 4cdfd75b33b7a616c55d40853cdf4a1c70bf822d Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 20:59:44 -0500
Subject: [PATCH 2/4] update

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 2cb7baa8e..9abebe7a9 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2007,7 +2007,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             device_map = infer_auto_device_map(
                 self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
             )
-            if len(devices) > 1 and "cpu" in device_map:
+            if len(devices) > 1 and "cpu" in device_map.values():
                 logger.warning(
                     "Not enough memory cause the CPU to be used, which may severely impact speed."
                     " Please consider using more cards."

From f62cf38d31320df31f14e1ecbbc3afa604572d4d Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 21:07:15 -0500
Subject: [PATCH 3/4] update

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 9abebe7a9..2634769a1 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2009,7 +2009,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             )
             if len(devices) > 1 and "cpu" in device_map.values():
                 logger.warning(
-                    "Not enough memory cause the CPU to be used, which may severely impact speed."
+                    "Not enough vram cause the ram to be used, which may severely impact speed."
                     " Please consider using more cards."
                 )
 

From 8e372f1e703ba716eb688fb1b24a8d01ee202a87 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 23:07:42 -0500
Subject: [PATCH 4/4] add ut

Signed-off-by: n1ck-guo
---
 test/test_cpu/test_autoround.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py
index dd188e6ad..87b9e5a96 100644
--- a/test/test_cpu/test_autoround.py
+++ b/test/test_cpu/test_autoround.py
@@ -731,6 +731,13 @@ def test_quant_lm_head(self):
         assert "lm_head" in model.config.quantization_config.extra_config
         assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
 
+        layer_config = {"lm_head": {"bits": 4}}
+        ar = AutoRound(model_name, quant_lm_head=False, iters=0, disable_opt_rtn=True, layer_config=layer_config)
+        ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
+        assert "lm_head" in model.config.quantization_config.extra_config
+        assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
+
     def test_quant_lm_head_layer_config(self):
         model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B"
         layer_config = {"lm_head": {"bits": 4}}
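A note on the one-line fix in PATCH 2/4: infer_auto_device_map() returns a dict mapping module names to devices, so `"cpu" in device_map` tests the keys (module names) and is effectively always False; checking values() is what actually detects CPU placement. A tiny illustration with a hand-written map (the module names are made up):

# Shape of the dict infer_auto_device_map() returns: module name -> device,
# where a device is a GPU index, "cpu", or "disk".
device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": "cpu"}

print("cpu" in device_map)           # False: no module is named "cpu"
print("cpu" in device_map.values())  # True: a module was placed on CPU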