23 changes: 21 additions & 2 deletions auto_round/compressors/base.py
@@ -1994,6 +1994,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
devices = parse_available_devices(self.device_map)
max_memory = get_max_memory()
new_max_memory = {}
if "cpu" not in devices:
devices.append("cpu")
for device in devices:
if ":" in device:
device = int(device.split(":")[-1])
@@ -2005,8 +2007,25 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
device_map = infer_auto_device_map(
self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
)

self.model = dispatch_model(self.model, device_map=device_map)
if len(devices) > 1 and "cpu" in device_map.values():
logger.warning(
"Not enough vram cause the ram to be used, which may severely impact speed."
" Please consider using more cards."
)

try:
self.model = dispatch_model(self.model, device_map=device_map)
except ValueError as e:
if "offload_dir" in e.__str__():
logger.warning(
f"Due to insufficient resources, disk is used to store the model."
f" `offload_dir={envs.AR_WORK_SPACE}`"
)
self.model = dispatch_model(
self.model, device_map=device_map, offload_dir=envs.AR_WORK_SPACE
)
else:
raise
else:
self.model = self.model.to(self.device)

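For reference, the fallback added above follows the standard accelerate dispatch pattern: build a device map from the available memory, try to dispatch, and only retry with an on-disk offload directory when accelerate reports that one is required. A minimal, self-contained sketch of that pattern, with an assumed model name and offload directory that are not taken from this PR:

import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_max_memory
from transformers import AutoModelForCausalLM

# Illustrative model; the real code dispatches self.model with per-device memory caps.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.float16)
max_memory = get_max_memory()  # available memory per device, e.g. {0: ..., "cpu": ...}
device_map = infer_auto_device_map(model, max_memory=max_memory)

try:
    model = dispatch_model(model, device_map=device_map)
except ValueError as e:
    if "offload_dir" in str(e):
        # Some weights were mapped to disk; retry with an offload directory.
        model = dispatch_model(model, device_map=device_map, offload_dir="ar_work_space")
    else:
        raise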
3 changes: 3 additions & 0 deletions auto_round/compressors/utils.py
@@ -380,6 +380,9 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
tie_word_embeddings = model.config.tie_word_embeddings

if lm_head_name in layer_config:
quant_lm_head = True

if quant_lm_head and tie_word_embeddings and not gguf_name:
quant_lm_head = False
logger.warning(
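In effect, an explicit lm_head entry in layer_config now turns lm_head quantization on, after which the existing tied-embedding check can still turn it back off. A condensed sketch of that decision order; the helper name is hypothetical, and the real logic is inline in auto_round/compressors/utils.py:

def resolve_quant_lm_head(quant_lm_head, layer_config, lm_head_name, tie_word_embeddings, gguf_name):
    # A per-layer entry for lm_head forces quantization of the head.
    if lm_head_name in layer_config:
        quant_lm_head = True
    # Tied word embeddings are left unquantized unless exporting to GGUF.
    if quant_lm_head and tie_word_embeddings and not gguf_name:
        quant_lm_head = False
    return quant_lm_head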
1 change: 1 addition & 0 deletions auto_round/envs.py
@@ -25,6 +25,7 @@
"AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
"AR_ENABLE_COMPILE_PACKING": lambda: os.getenv("AR_ENABLE_COMPILE_PACKING", "0").lower() in ("1", "true", "yes"),
"AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"],
"AR_WORK_SPACE": lambda: os.getenv("AR_WORK_SPACE", "ar_work_space").lower(),
}


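The new AR_WORK_SPACE entry follows the same lazy-lookup pattern as the existing variables and defaults to "ar_work_space"; note that the value is lowercased. A small usage sketch, assuming the setting is read through the envs module as in the base.py change above (the path is illustrative):

import os

# Must be set before auto_round reads it; the value is lowercased by envs.py.
os.environ["AR_WORK_SPACE"] = "/tmp/ar_offload"

from auto_round import envs
print(envs.AR_WORK_SPACE)  # -> "/tmp/ar_offload"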
1 change: 1 addition & 0 deletions auto_round/eval/eval_cli.py
@@ -166,6 +166,7 @@ def eval(args):
if file.endswith(".gguf"):
is_gguf_file = True
gguf_file = file
model = args.model
eval_model_dtype = get_model_dtype(args.eval_model_dtype)
if is_gguf_file:
import torch
7 changes: 7 additions & 0 deletions test/test_cpu/test_autoround.py
@@ -731,6 +731,13 @@ def test_quant_lm_head(self):
assert "lm_head" in model.config.quantization_config.extra_config
assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4

layer_config = {"lm_head": {"bits": 4}}
ar = AutoRound(model_name, quant_lm_head=False, iters=0, disable_opt_rtn=True, layer_config=layer_config)
ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
assert "lm_head" in model.config.quantization_config.extra_config
assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4

def test_quant_lm_head_layer_config(self):
model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B"
layer_config = {"lm_head": {"bits": 4}}
2 changes: 1 addition & 1 deletion test/test_cuda/test_gguf.py
@@ -220,7 +220,7 @@ def test_vlm_gguf(self):
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
self.assertAlmostEqual(file_size, 6962, delta=5.0)
self.assertAlmostEqual(file_size, 6568, delta=5.0)
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
self.assertAlmostEqual(file_size, 1599, delta=5.0)
shutil.rmtree(quantized_model_path, ignore_errors=True)