From 9b56b621aae9f23166c07599229a4a74464de21c Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 20:55:37 -0500
Subject: [PATCH 1/4] fix bug of lm_head and dispatch model

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py  | 23 +++++++++++++++++++++--
 auto_round/compressors/utils.py |  3 +++
 auto_round/envs.py              |  1 +
 auto_round/eval/eval_cli.py     |  1 +
 test/test_cuda/test_gguf.py     |  2 +-
 5 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index c098320f9..2cb7baa8e 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -1994,6 +1994,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             devices = parse_available_devices(self.device_map)
             max_memory = get_max_memory()
             new_max_memory = {}
+            if "cpu" not in devices:
+                devices.append("cpu")
             for device in devices:
                 if ":" in device:
                     device = int(device.split(":")[-1])
@@ -2005,8 +2007,25 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             device_map = infer_auto_device_map(
                 self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
             )
-
-            self.model = dispatch_model(self.model, device_map=device_map)
+            if len(devices) > 1 and "cpu" in device_map:
+                logger.warning(
+                    "Not enough memory cause the CPU to be used, which may severely impact speed."
+                    " Please consider using more cards."
+                )
+
+            try:
+                self.model = dispatch_model(self.model, device_map=device_map)
+            except ValueError as e:
+                if "offload_dir" in e.__str__():
+                    logger.warning(
+                        f"Due to insufficient resources, disk is used to store the model."
+                        f" `offload_dir={envs.AR_WORK_SPACE}`"
+                    )
+                    self.model = dispatch_model(
+                        self.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE
+                    )
+                else:
+                    raise
         else:
             self.model = self.model.to(self.device)
 
diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py
index cf8d90098..2514c9f34 100644
--- a/auto_round/compressors/utils.py
+++ b/auto_round/compressors/utils.py
@@ -380,6 +380,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
     if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
         tie_word_embeddings = model.config.tie_word_embeddings
 
+    if lm_head_name in layer_config:
+        quant_lm_head = True
+
     if quant_lm_head and tie_word_embeddings and not gguf_name:
         quant_lm_head = False
         logger.warning(
diff --git a/auto_round/envs.py b/auto_round/envs.py
index 1a1f51de0..5ff90d170 100644
--- a/auto_round/envs.py
+++ b/auto_round/envs.py
@@ -25,6 +25,7 @@
     "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
     "AR_ENABLE_COMPILE_PACKING": lambda: os.getenv("AR_ENABLE_COMPILE_PACKING", "0").lower() in ("1", "true", "yes"),
     "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"],
+    "AR_WORK_SPACE": lambda: os.getenv("AR_WORK_SPACE", "ar_work_space").lower(),
 }
 
 
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7235e9e4c..0c96b4bd8 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -166,6 +166,7 @@ def eval(args):
             if file.endswith(".gguf"):
                 is_gguf_file = True
                 gguf_file = file
+                model = args.model
     eval_model_dtype = get_model_dtype(args.eval_model_dtype)
     if is_gguf_file:
         import torch
diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py
index 299c45668..312e561cf 100644
--- a/test/test_cuda/test_gguf.py
+++ b/test/test_cuda/test_gguf.py
@@ -220,7 +220,7 @@ def test_vlm_gguf(self):
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
         file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 6962, delta=5.0)
+        self.assertAlmostEqual(file_size, 6568, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
         self.assertAlmostEqual(file_size, 1599, delta=5.0)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
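For context on the dispatch change above: accelerate's dispatch_model() raises a ValueError mentioning `offload_dir` when infer_auto_device_map() has assigned part of the model to "disk", and the new try/except keys off exactly that message. A minimal standalone sketch of the same fallback pattern follows; the model name is illustrative only, and the os.getenv lookup stands in for auto_round's envs.AR_WORK_SPACE.

# Sketch of the dispatch fallback from PATCH 1/4, outside auto_round.
# Assumes accelerate and transformers are installed; "facebook/opt-125m"
# is an illustrative stand-in for the model being quantized.
import os

from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_max_memory
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
device_map = infer_auto_device_map(model, max_memory=get_max_memory())

if "cpu" in device_map.values():
    # Mirrors the new warning: some modules spilled over into system RAM.
    print("warning: part of the model is placed on CPU, expect slowdowns")

try:
    model = dispatch_model(model, device_map=device_map)
except ValueError as e:
    # accelerate refuses to dispatch a map containing "disk" entries
    # unless an offload directory is supplied; retry with a scratch dir,
    # as the patch does with envs.AR_WORK_SPACE.
    if "offload_dir" in str(e):
        model = dispatch_model(
            model,
            device_map=device_map,
            offload_dir=os.getenv("AR_WORK_SPACE", "ar_work_space"),
        )
    else:
        raise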
From 4cdfd75b33b7a616c55d40853cdf4a1c70bf822d Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 20:59:44 -0500
Subject: [PATCH 2/4] update

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 2cb7baa8e..9abebe7a9 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2007,7 +2007,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             device_map = infer_auto_device_map(
                 self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
             )
-            if len(devices) > 1 and "cpu" in device_map:
+            if len(devices) > 1 and "cpu" in device_map.values():
                 logger.warning(
                     "Not enough memory cause the CPU to be used, which may severely impact speed."
                     " Please consider using more cards."

From f62cf38d31320df31f14e1ecbbc3afa604572d4d Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 21:07:15 -0500
Subject: [PATCH 3/4] update

Signed-off-by: n1ck-guo
---
 auto_round/compressors/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 9abebe7a9..2634769a1 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2009,7 +2009,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
             )
             if len(devices) > 1 and "cpu" in device_map.values():
                 logger.warning(
-                    "Not enough memory cause the CPU to be used, which may severely impact speed."
+                    "Not enough vram cause the ram to be used, which may severely impact speed."
                     " Please consider using more cards."
                 )
 

From 8e372f1e703ba716eb688fb1b24a8d01ee202a87 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Wed, 12 Nov 2025 23:07:42 -0500
Subject: [PATCH 4/4] add ut

Signed-off-by: n1ck-guo
---
 test/test_cpu/test_autoround.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py
index dd188e6ad..87b9e5a96 100644
--- a/test/test_cpu/test_autoround.py
+++ b/test/test_cpu/test_autoround.py
@@ -731,6 +731,13 @@ def test_quant_lm_head(self):
         assert "lm_head" in model.config.quantization_config.extra_config
         assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
 
+        layer_config = {"lm_head": {"bits": 4}}
+        ar = AutoRound(model_name, quant_lm_head=False, iters=0, disable_opt_rtn=True, layer_config=layer_config)
+        ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
+        assert "lm_head" in model.config.quantization_config.extra_config
+        assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
+
     def test_quant_lm_head_layer_config(self):
         model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B"
         layer_config = {"lm_head": {"bits": 4}}
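A note on the one-line fix in PATCH 2/4: infer_auto_device_map() returns a dict mapping module names to devices, so `"cpu" in device_map` tests the keys (module names) and is effectively always False; checking values() is what actually detects CPU placement. A tiny illustration with a hand-written map (the module names are made up):

# Shape of the dict infer_auto_device_map() returns: module name -> device,
# where a device is a GPU index, "cpu", or "disk".
device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": "cpu"}

print("cpu" in device_map)           # False: no module is named "cpu"
print("cpu" in device_map.values())  # True: a module was placed on CPU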