From 0ff613d913e8e7c0f5970700ea4ee8120b337884 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 01:08:18 +0800 Subject: [PATCH 1/6] fix --- auto_round/script/mllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 621d974d1..c498fbb6a 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -454,7 +454,8 @@ def tune(args): to_quant_block_names=args.to_quant_block_names, enable_torch_compile=enable_torch_compile, device_map=args.device_map, - model_kwargs=model_kwargs + model_kwargs=model_kwargs, + data_type=args.data_type, ) model, _ = autoround.quantize() From 7c15691c6535a0fe57dc11cd455ccde99d2f976f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 10:11:42 +0800 Subject: [PATCH 2/6] fix --- auto_round/export/export_to_gguf/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py index d227cfa9d..536f3f6c4 100644 --- a/auto_round/export/export_to_gguf/config.py +++ b/auto_round/export/export_to_gguf/config.py @@ -21,7 +21,7 @@ "sym": True, "data_type": "int", "embedding": "gguf:q4_0", - "lm_head": "gguf:q6_k_s", + "lm_head": "gguf:q6_k", "super_bits": None, "super_group_size": None, } @@ -33,7 +33,7 @@ "sym": False, "data_type": "int_asym_float_zp", "embedding": "gguf:q4_1", - "lm_head": "gguf:q6_k_s", + "lm_head": "gguf:q6_k", "super_bits": None, "super_group_size": None, } @@ -45,7 +45,7 @@ "sym": True, "data_type": "int", "embedding": "gguf:q5_0", - "lm_head": "gguf:q6_k_s", + "lm_head": "gguf:q6_k", "super_bits": None, "super_group_size": None, } @@ -57,7 +57,7 @@ "sym": False, "data_type": "int_asym_float_zp", "embedding": "gguf:q5_1", - "lm_head": "gguf:q6_k_s", + "lm_head": "gguf:q6_k", "super_bits": None, "super_group_size": None, } From ea466a7d0b48db5a0bbe626091c9c7646fe6dfce Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 10:51:25 +0800 Subject: [PATCH 3/6] fix --- auto_round/autoround.py | 5 ++--- test/test_cpu/test_gguf_format.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 0ceef40c1..cc439ebd3 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -383,9 +383,8 @@ def _set_device_for_matching_module(self, name, device): def _dq_check(self): """Reset the default value of super_bits and super_group_size""" - from auto_round.export.export_to_gguf.config import GGUF_CONFIG if self.data_type.endswith("_dq"): - gguf_config = GGUF_CONFIG[f"gguf:q{self.bits}_k_s"] + gguf_config = GGUF_INNER_CONFIG[f"gguf:q{self.bits}_k"] self.super_bits = gguf_config["super_bits"] if self.super_bits is None else self.super_bits self.super_group_size = gguf_config["super_group_size"] \ if self.super_group_size is None else self.super_group_size @@ -466,7 +465,7 @@ def _check_compatibility(self): logger.warning( "We recommend setting `iters=0` when exporting to GGUF format," " as we have optimized the RTN method for this case." - " We will release new algorithms for certain configurations in the future." + " We are likely to release new algorithms for certain configurations in the future." ) if self.seqlen is not None and hasattr(self.model, "config") and \ diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index dbeb8a3bf..4d9c5164a 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -185,7 +185,7 @@ def test_q6_k(self): super_bits=8 ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k") gguf_file = os.listdir("saved")[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," From 73f9f4e289cebf2e9372bcc3ed090d76bd3ddb47 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 10:55:18 +0800 Subject: [PATCH 4/6] trigger ut --- test/test_cpu/test_gguf_format.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 4d9c5164a..b58d3acc2 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -239,3 +239,4 @@ def test_gguf_baseline(self): if __name__ == "__main__": unittest.main() + From d97203fd83b9436683325eccb9f6a5145b2af20d Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 16:21:19 +0800 Subject: [PATCH 5/6] fix --- auto_round/autoround.py | 15 +++++++++++++-- auto_round/script/llm.py | 8 -------- auto_round/script/mllm.py | 7 ------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index cc439ebd3..b78be16ea 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -461,13 +461,24 @@ def _check_compatibility(self): has_besides_gguf = True if has_gguf and has_besides_gguf: raise ValueError("gguf format is not compatible with other formats, please choose only one of them") - if has_gguf and self.iters!=0: + if has_gguf and self.iters != 0: logger.warning( "We recommend setting `iters=0` when exporting to GGUF format," " as we have optimized the RTN method for this case." " We are likely to release new algorithms for certain configurations in the future." ) + ##check group_size 32 for auto_round + if self.data_type == "int" and hasattr(self, "formats") and ( + "auto_round" in self.formats or "auto_gptq" in self.formats or "auto_awq" in self.formats): + for n, m in self.model.named_modules(): + if isinstance(m, self.supported_types) : + if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: + self.layer_config[n] = {"bits": 16} + logger.info( + f"{n} will not be quantized due to its shape not being divisible by 32," + " resulting in an exporting issue to autogptq") + if self.seqlen is not None and hasattr(self.model, "config") and \ hasattr(self.model.config, "max_position_embeddings"): if self.model.config.max_position_embeddings < self.seqlen: @@ -1319,7 +1330,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l logger.info("switch to cpu to cache block inputs") if (self.has_qlayer_outside_block or self.__class__.__name__ == "AutoRoundMLLM"): - logger.warning("We strongly recommend using more GPUs." + logger.warning("We strongly recommend using more GPUs in calibration." " Otherwise, some layers may fall back to `rtn` mode, which can affect accuracy.") self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage) clear_memory() diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index f7fc3af4e..7499b1630 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -403,14 +403,6 @@ def tune(args): round = AutoRoundAdam layer_config = {} - for n, m in model.named_modules(): - if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D): - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - layer_config[n] = {"bits": 16} - logger.info( - f"{n} will not be quantized due to its shape not being divisible by 32," - " resulting in an exporting issue to autogptq") - not_quantize_layer_names = get_fp_layer_names(model, args.fp_layers) for name in not_quantize_layer_names: layer_config[name] = {"bits": 16} diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index c498fbb6a..5e3fdea8f 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -369,13 +369,6 @@ def tune(args): ##TODO gptq, awq could support some mixed precision config logger.warning(f"mixed precision exporting does not support {format} currently") - for n, m in model.named_modules(): - if isinstance(m, (torch.nn.Linear, transformers.pytorch_utils.Conv1D)): - if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0: - layer_config[n] = {"bits": 32} - logger.info( - f"{n} will not be quantized due to its shape not being divisible by 32," - " resulting in an exporting issue to autogptq") lm_head_layer_name = "lm_head" for n, _ in model.named_modules(): lm_head_layer_name = n From cd4524623036df172cf9a33e4763915ef12ee425 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 17 Jun 2025 16:53:38 +0800 Subject: [PATCH 6/6] fix --- auto_round/export/export_to_gguf/config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py index 536f3f6c4..484c5f033 100644 --- a/auto_round/export/export_to_gguf/config.py +++ b/auto_round/export/export_to_gguf/config.py @@ -157,8 +157,6 @@ GGUF_CONFIG["gguf:q5_0"]["mostly"]= "gguf:q5_0" GGUF_CONFIG["gguf:q5_1"] = GGUF_INNER_CONFIG["gguf:q5_1"] GGUF_CONFIG["gguf:q5_1"]["mostly"] = "gguf:q5_1" -GGUF_CONFIG["gguf:q2_k"] = GGUF_INNER_CONFIG["gguf:q2_k"] -GGUF_CONFIG["gguf:q2_k"]["mostly"] = "gguf:q2_k" GGUF_CONFIG["gguf:q2_k_s"] = GGUF_INNER_CONFIG["gguf:q2_k"] GGUF_CONFIG["gguf:q2_k_s"]["mostly"]= "gguf:q2_k" # GGUF_CONFIG["gguf:q3_k"] = GGUF_INNER_CONFIG["gguf:q3_k"] @@ -175,8 +173,8 @@ GGUF_CONFIG["gguf:q4_k_s"]["mostly"]= "gguf:q4_k" GGUF_CONFIG["gguf:q4_k_m"] = GGUF_INNER_CONFIG["gguf:q4_k"] GGUF_CONFIG["gguf:q4_k_m"]["mostly"] = "gguf:q4_k" -GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"] -GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k" +# GGUF_CONFIG["gguf:q5_k"] = GGUF_INNER_CONFIG["gguf:q5_k"] +# GGUF_CONFIG["gguf:q5_k"]["mostly"]= "gguf:q5_k" GGUF_CONFIG["gguf:q5_k_s"] = GGUF_INNER_CONFIG["gguf:q5_k"] GGUF_CONFIG["gguf:q5_k_s"]["mostly"] = "gguf:q5_k" GGUF_CONFIG["gguf:q5_k_m"] = GGUF_INNER_CONFIG["gguf:q5_k"]