From 0ff613d913e8e7c0f5970700ea4ee8120b337884 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 01:08:18 +0800
Subject: [PATCH 1/6] mllm: pass args.data_type through in tune()

---
 auto_round/script/mllm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index 621d974d1..c498fbb6a 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -454,7 +454,8 @@ def tune(args):
         to_quant_block_names=args.to_quant_block_names,
         enable_torch_compile=enable_torch_compile,
         device_map=args.device_map,
-        model_kwargs=model_kwargs
+        model_kwargs=model_kwargs,
+        data_type=args.data_type,
         )
     model, _ = autoround.quantize()
 

From 7c15691c6535a0fe57dc11cd455ccde99d2f976f Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 10:11:42 +0800
Subject: [PATCH 2/6] gguf: use q6_k instead of q6_k_s for lm_head

---
 auto_round/export/export_to_gguf/config.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py
index d227cfa9d..536f3f6c4 100644
--- a/auto_round/export/export_to_gguf/config.py
+++ b/auto_round/export/export_to_gguf/config.py
@@ -21,7 +21,7 @@
     "sym": True,
     "data_type": "int",
     "embedding": "gguf:q4_0",
-    "lm_head": "gguf:q6_k_s",
+    "lm_head": "gguf:q6_k",
     "super_bits": None,
     "super_group_size": None,
 }
@@ -33,7 +33,7 @@
     "sym": False,
     "data_type": "int_asym_float_zp",
     "embedding": "gguf:q4_1",
-    "lm_head": "gguf:q6_k_s",
+    "lm_head": "gguf:q6_k",
     "super_bits": None,
     "super_group_size": None,
 }
@@ -45,7 +45,7 @@
     "sym": True,
     "data_type": "int",
     "embedding": "gguf:q5_0",
-    "lm_head": "gguf:q6_k_s",
+    "lm_head": "gguf:q6_k",
     "super_bits": None,
     "super_group_size": None,
 }
@@ -57,7 +57,7 @@
     "sym": False,
     "data_type": "int_asym_float_zp",
     "embedding": "gguf:q5_1",
-    "lm_head": "gguf:q6_k_s",
+    "lm_head": "gguf:q6_k",
     "super_bits": None,
     "super_group_size": None,
 }

From ea466a7d0b48db5a0bbe626091c9c7646fe6dfce Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 10:51:25 +0800
Subject: [PATCH 3/6] autoround: use GGUF_INNER_CONFIG in _dq_check

---
 auto_round/autoround.py           | 5 ++---
 test/test_cpu/test_gguf_format.py | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 0ceef40c1..cc439ebd3 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -383,9 +383,8 @@ def _set_device_for_matching_module(self, name, device):
 
     def _dq_check(self):
         """Reset the default value of super_bits and super_group_size"""
-        from auto_round.export.export_to_gguf.config import GGUF_CONFIG
         if self.data_type.endswith("_dq"):
-            gguf_config = GGUF_CONFIG[f"gguf:q{self.bits}_k_s"]
+            gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"]
             self.super_bits = gguf_config["super_bits"] if self.super_bits is None else self.super_bits
             self.super_group_size = gguf_config["super_group_size"] \
                 if self.super_group_size is None else self.super_group_size
@@ -466,7 +465,7 @@ def _check_compatibility(self):
                 logger.warning(
                     "We recommend setting `iters=0` when exporting to GGUF format,"
                     " as we have optimized the RTN method for this case."
-                    " We will release new algorithms for certain configurations in the future."
+                    " We are likely to release new algorithms for certain configurations in the future."
                 )
 
         if self.seqlen is not None and hasattr(self.model, "config") and \
diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py
index dbeb8a3bf..4d9c5164a 100644
--- a/test/test_cpu/test_gguf_format.py
+++ b/test/test_cpu/test_gguf_format.py
@@ -185,7 +185,7 @@ def test_q6_k(self):
             super_bits=8
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s")
+        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k")
         gguf_file = os.listdir("saved")[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"

From 73f9f4e289cebf2e9372bcc3ed090d76bd3ddb47 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 10:55:18 +0800
Subject: [PATCH 4/6] trigger unit tests (blank-line change only)

---
 test/test_cpu/test_gguf_format.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py
index 4d9c5164a..b58d3acc2 100644
--- a/test/test_cpu/test_gguf_format.py
+++ b/test/test_cpu/test_gguf_format.py
@@ -239,3 +239,4 @@ def test_gguf_baseline(self):
 
 if __name__ == "__main__":
     unittest.main()
+

From d97203fd83b9436683325eccb9f6a5145b2af20d Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 16:21:19 +0800
Subject: [PATCH 5/6] move divisible-by-32 check to _check_compatibility

---
 auto_round/autoround.py   | 15 +++++++++++++--
 auto_round/script/llm.py  |  8 --------
 auto_round/script/mllm.py |  7 -------
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index cc439ebd3..b78be16ea 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -461,13 +461,24 @@ def _check_compatibility(self):
                     has_besides_gguf = True
             if has_gguf and has_besides_gguf:
                 raise ValueError("gguf format is not compatible with other formats, please choose only one of them")
-            if  has_gguf and self.iters!=0:
+            if has_gguf and self.iters != 0:
                 logger.warning(
                     "We recommend setting `iters=0` when exporting to GGUF format,"
                     " as we have optimized the RTN method for this case."
                     " We are likely to release new algorithms for certain configurations in the future."
                 )
 
+        ##check group_size 32 for auto_round
+        if self.data_type == "int" and hasattr(self, "formats") and (
+                "auto_round" in self.formats or "auto_gptq" in self.formats or "auto_awq" in self.formats):
+            for n, m in self.model.named_modules():
+                if isinstance(m, self.supported_types) :
+                    if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                        self.layer_config[n] = {"bits": 16}
+                        logger.info(
+                            f"{n} will not be quantized due to its shape not being divisible by 32,"
+                            " resulting in an exporting issue to autogptq")
+
         if self.seqlen is not None and hasattr(self.model, "config") and \
                 hasattr(self.model.config, "max_position_embeddings"):
             if self.model.config.max_position_embeddings < self.seqlen:
@@ -1319,7 +1330,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
                 logger.info("switch to cpu to cache block inputs")
                 if (self.has_qlayer_outside_block or
                         self.__class__.__name__ == "AutoRoundMLLM"):
-                    logger.warning("We strongly recommend using more GPUs."
+                    logger.warning("We strongly recommend using more GPUs in calibration."
                                    " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy.")
                 self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
                 clear_memory()
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index f7fc3af4e..7499b1630 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -403,14 +403,6 @@ def tune(args):
         round = AutoRoundAdam
 
     layer_config = {}
-    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                layer_config[n] = {"bits": 16}
-                logger.info(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
-
     not_quantize_layer_names = get_fp_layer_names(model, args.fp_layers)
     for name in not_quantize_layer_names:
         layer_config[name] = {"bits": 16}
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index c498fbb6a..5e3fdea8f 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -369,13 +369,6 @@ def tune(args):
                     ##TODO gptq, awq could support some mixed precision config
                     logger.warning(f"mixed precision exporting does not support {format} currently")
 
-    for n, m in model.named_modules():
-        if isinstance(m, (torch.nn.Linear, transformers.pytorch_utils.Conv1D)):
-            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
-                layer_config[n] = {"bits": 32}
-                logger.info(
-                    f"{n} will not be quantized due to its shape not being divisible by 32,"
-                    " resulting in an exporting issue to autogptq")
     lm_head_layer_name = "lm_head"
     for n, _ in model.named_modules():
         lm_head_layer_name = n

From cd4524623036df172cf9a33e4763915ef12ee425 Mon Sep 17 00:00:00 2001
From: Wenhua Cheng <wenhua.cheng@intel.com>
Date: Tue, 17 Jun 2025 16:53:38 +0800
Subject: [PATCH 6/6] gguf: drop q2_k and disable q5_k GGUF_CONFIG entries

---
 auto_round/export/export_to_gguf/config.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py
index 536f3f6c4..484c5f033 100644
--- a/auto_round/export/export_to_gguf/config.py
+++ b/auto_round/export/export_to_gguf/config.py
@@ -157,8 +157,6 @@
 GGUF_CONFIG["gguf:q5_0"]["mostly"]= "gguf:q5_0"
 GGUF_CONFIG["gguf:q5_1"] = GGUF_INNER_CONFIG["gguf:q5_1"]
 GGUF_CONFIG["gguf:q5_1"]["mostly"] = "gguf:q5_1"
-GGUF_CONFIG["gguf:q2_k"] = GGUF_INNER_CONFIG["gguf:q2_k"]
-GGUF_CONFIG["gguf:q2_k"]["mostly"] = "gguf:q2_k"
 GGUF_CONFIG["gguf:q2_k_s"] = GGUF_INNER_CONFIG["gguf:q2_k"]
 GGUF_CONFIG["gguf:q2_k_s"]["mostly"]= "gguf:q2_k"
 # GGUF_CONFIG["gguf:q3_k"] = GGUF_INNER_CONFIG["gguf:q3_k"]
@@ -175,8 +173,8 @@
 GGUF_CONFIG["gguf:q4_k_s"]["mostly"]= "gguf:q4_k"
 GGUF_CONFIG["gguf:q4_k_m"] = GGUF_INNER_CONFIG["gguf:q4_k"]
 GGUF_CONFIG["gguf:q4_k_m"]["mostly"] = "gguf:q4_k"
-GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"]
-GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k"
+# GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"]
+# GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k"
 GGUF_CONFIG["gguf:q5_k_s"] = GGUF_INNER_CONFIG["gguf:q5_k"]
 GGUF_CONFIG["gguf:q5_k_s"]["mostly"] = "gguf:q5_k"
 GGUF_CONFIG["gguf:q5_k_m"] = GGUF_INNER_CONFIG["gguf:q5_k"]