intel · wenhuach21 · Jun 17, 2025 · Jun 16, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
@@ -461,13 +461,24 @@ def _check_compatibility(self):
                     has_besides_gguf = True
             if has_gguf and has_besides_gguf:
                 raise ValueError("gguf format is not compatible with other formats, please choose only one of them")
-            if  has_gguf and self.iters!=0:
+            if has_gguf and self.iters != 0:
                 logger.warning(
                     "We recommend setting `iters=0` when exporting to GGUF format,"
                     " as we have optimized the RTN method for this case."
                     " We are likely to release new algorithms for certain configurations in the future."
                 )
 
+        # Leave layers with weight dims not divisible by 32 at 16 bits for auto_round/auto_gptq/auto_awq
+        if self.data_type == "int" and hasattr(self, "formats") and (
+                "auto_round" in self.formats or "auto_gptq" in self.formats or "auto_awq" in self.formats):
+            for n, m in self.model.named_modules():
+                if isinstance(m, self.supported_types) :
+                    if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                        self.layer_config[n] = {"bits": 16}
+                        logger.info(
+                            f"{n} will not be quantized due to its shape not being divisible by 32,"
+                            " resulting in an exporting issue to autogptq")
+
         if self.seqlen is not None and hasattr(self.model, "config") and \
                 hasattr(self.model.config, "max_position_embeddings"):
             if self.model.config.max_position_embeddings < self.seqlen:
@@ -1319,7 +1330,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
                 logger.info("switch to cpu to cache block inputs")
                 if (self.has_qlayer_outside_block or
                         self.__class__.__name__ == "AutoRoundMLLM"):
-                    logger.warning("We strongly recommend using more GPUs."
+                    logger.warning("We strongly recommend using more GPUs in calibration."
                                    " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy.")
                 self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
                 clear_memory()

diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py
@@ -157,8 +157,6 @@
 GGUF_CONFIG["gguf:q5_0"]["mostly"]= "gguf:q5_0"
 GGUF_CONFIG["gguf:q5_1"] = GGUF_INNER_CONFIG["gguf:q5_1"]
 GGUF_CONFIG["gguf:q5_1"]["mostly"] = "gguf:q5_1"
-GGUF_CONFIG["gguf:q2_k"] = GGUF_INNER_CONFIG["gguf:q2_k"]
-GGUF_CONFIG["gguf:q2_k"]["mostly"] = "gguf:q2_k"
 GGUF_CONFIG["gguf:q2_k_s"] = GGUF_INNER_CONFIG["gguf:q2_k"]
 GGUF_CONFIG["gguf:q2_k_s"]["mostly"]= "gguf:q2_k"
 # GGUF_CONFIG["gguf:q3_k"] = GGUF_INNER_CONFIG["gguf:q3_k"]
@@ -175,8 +173,8 @@
 GGUF_CONFIG["gguf:q4_k_s"]["mostly"]= "gguf:q4_k"
 GGUF_CONFIG["gguf:q4_k_m"] = GGUF_INNER_CONFIG["gguf:q4_k"]
 GGUF_CONFIG["gguf:q4_k_m"]["mostly"] = "gguf:q4_k"
-GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"]
-GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k"
+# GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"]
+# GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k"
 GGUF_CONFIG["gguf:q5_k_s"] = GGUF_INNER_CONFIG["gguf:q5_k"]
 GGUF_CONFIG["gguf:q5_k_s"]["mostly"] = "gguf:q5_k"
 GGUF_CONFIG["gguf:q5_k_m"] = GGUF_INNER_CONFIG["gguf:q5_k"]

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
@@ -403,14 +403,6 @@ def tune(args):
         round = AutoRoundAdam
 
     layer_config = {}
-    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                layer_config[n] = {"bits": 16}
-                logger.info(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
-
     not_quantize_layer_names = get_fp_layer_names(model, args.fp_layers)
     for name in not_quantize_layer_names:
         layer_config[name] = {"bits": 16}

diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
@@ -369,13 +369,6 @@ def tune(args):
                     ##TODO gptq, awq could support some mixed precision config
                     logger.warning(f"mixed precision exporting does not support {format} currently")
 
-    for n, m in model.named_modules():
-        if isinstance(m, (torch.nn.Linear, transformers.pytorch_utils.Conv1D)):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                layer_config[n] = {"bits": 32}
-                logger.info(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
     lm_head_layer_name = "lm_head"
     for n, _ in model.named_modules():
         lm_head_layer_name = n

diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py
@@ -239,3 +239,4 @@ def test_gguf_baseline(self):
 
 if __name__ == "__main__":
     unittest.main()
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -239,3 +239,4 @@ def test_gguf_baseline(self):

		if __name__ == "__main__":
		unittest.main()