From 422dc16cf8ad5a95cc2b5fa54a9182b78081ac9b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 15 Oct 2025 00:59:33 -0400 Subject: [PATCH 01/12] gguf weight type align with original, output.weight, token_embed Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 6 +++++- auto_round/utils.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 8ceaefc00..333cd3cda 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1333,7 +1333,11 @@ def _check_need_to_quantize_lm_head_embedding(self) -> bool: tie_word_embeddings: bool = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) for name, module in self.model.named_modules(): if isinstance(module, torch.nn.Embedding): - key: str = "lm_head" if tie_word_embeddings else "embedding" + if tie_word_embeddings: + config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format]["lm_head"]] + self._apply_config_to_layer("lm_head", config, False) + # key: str = "lm_head" if tie_word_embeddings else "embedding" + key: str = "embedding" config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format][key]] self._apply_config_to_layer(name, config, True) diff --git a/auto_round/utils.py b/auto_round/utils.py index 90f161df7..27ae896e2 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1999,7 +1999,7 @@ def _set_config(config, target_config): elif new_type != "gguf:q8_0": new_type = "gguf:q6_k" elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings: - pass + new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] elif isinstance(layer, torch.nn.Embedding): if "embedding" in GGUF_CONFIG[target_gguf_format]: new_type = GGUF_CONFIG[target_gguf_format]["embedding"] From b422c26d184fd415c4d2d57f0fc5c4694c7070a6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 15 Oct 2025 03:37:50 -0400 Subject: [PATCH 02/12] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 18 ++++------- auto_round/utils.py | 27 +++++++++++++++- test/test_cpu/test_gguf_format.py | 54 +++++++++++++++++++++++++++---- 3 files changed, 81 insertions(+), 18 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 333cd3cda..9ce6ff473 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -85,6 +85,7 @@ is_hpex_available, is_mx_fp, is_nv_fp, + is_separate_lm_head, is_standard_fp, is_static_wfp8afp8, is_wfp8afp8, @@ -1331,24 +1332,19 @@ def _check_need_to_quantize_lm_head_embedding(self) -> bool: target_format = self.scheme.lower() tie_word_embeddings: bool = getattr(getattr(self.model, "config", None), "tie_word_embeddings", True) + tie_word_embeddings &= not is_separate_lm_head(self.model) + lm_head_name: str = get_lm_head_name(self.model) + check_fixed_by_user = ( + self.layer_config[lm_head_name].get("fixed_by_user", False) if lm_head_name in self.layer_config else False + ) for name, module in self.model.named_modules(): if isinstance(module, torch.nn.Embedding): - if tie_word_embeddings: - config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format]["lm_head"]] - self._apply_config_to_layer("lm_head", config, False) - # key: str = "lm_head" if tie_word_embeddings else "embedding" - key: str = "embedding" + key: str = "lm_head" if tie_word_embeddings else "embedding" config: dict[str, Any] = GGUF_INNER_CONFIG[GGUF_CONFIG[target_format][key]] self._apply_config_to_layer(name, config, True) if not 
tie_word_embeddings: - lm_head_name: str = get_lm_head_name(self.model) config: dict[str, Any] = GGUF_CONFIG[GGUF_CONFIG[target_format]["lm_head"]] - check_fixed_by_user = ( - self.layer_config[lm_head_name].get("fixed_by_user", False) - if lm_head_name in self.layer_config - else None - ) self._apply_config_to_layer(lm_head_name, config, check_fixed_by_user=check_fixed_by_user) return True diff --git a/auto_round/utils.py b/auto_round/utils.py index 27ae896e2..43c42faac 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1999,7 +1999,8 @@ def _set_config(config, target_config): elif new_type != "gguf:q8_0": new_type = "gguf:q6_k" elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings: - new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] + # new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] + continue elif isinstance(layer, torch.nn.Embedding): if "embedding" in GGUF_CONFIG[target_gguf_format]: new_type = GGUF_CONFIG[target_gguf_format]["embedding"] @@ -2838,3 +2839,27 @@ def is_diffusion_model(model_or_path: Union[str, object]): return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) else: return False + + +def is_separate_lm_head(model: torch.nn.Module) -> bool: + dir_path = model.name_or_path + if not os.path.isdir(dir_path): + dir_path = download_hf_model(dir_path) + lm_head_name: str = get_lm_head_name(model) + lm_head_name += ".weight" + + if "model.safetensors.index.json" in os.listdir(dir_path): + with open(os.path.join(dir_path, "model.safetensors.index.json")) as f: + index_mapping = json.load(f) + if lm_head_name in index_mapping["weight_map"]: + return True + else: + return False + else: + from safetensors import safe_open + + f = safe_open(os.path.join(dir_path, "model.safetensors"), framework="pt") + if lm_head_name in f.keys(): + return True + else: + return False diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index d71920b39..0e76bf464 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -24,8 +24,6 @@ class TestGGUF(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - self.model_name = "Qwen/Qwen2.5-0.5B-Instruct" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -55,8 +53,7 @@ def test_basic_usage(self): def test_q4_0(self): bits, group_size, sym = 4, 32, True autoround = AutoRound( - self.model, - self.tokenizer, + self.model_name, bits=bits, group_size=group_size, sym=sym, @@ -103,8 +100,7 @@ def test_q4_0(self): def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, - self.tokenizer, + self.model_name, # bits=bits, # group_size=group_size, # sym=sym, @@ -336,6 +332,52 @@ def test_vlm_gguf(self): self.assertAlmostEqual(file_size, 892, delta=1.0) shutil.rmtree("./saved", ignore_errors=True) + def test_qtype_setting(self): + # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M + # Qwen3-0.6B output q6_k, token_embed q4_0 448M + # Qwen3-8B output q6_k, token_embed q4_0 4.5G + # Llama-3.2-1B-Instruct o output, token_embed q6_k 736M + from auto_round.export.export_to_gguf.config import ModelType + from auto_round.utils import get_layer_config_by_gguf_format + + model_name = 
"/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = "/models/Qwen2.5-0.5B-Instruct" + ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) + ar.formats = ["gguf:q4_0"] + ar._set_layerwise_config(ar.layer_config) + ar.layer_config, _ = get_layer_config_by_gguf_format( + ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ) + self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) + self.assertTrue(ar.layer_config["lm_head"]["bits"] == 16) + + model_name = "Qwen/Qwen3-0.6B" + model_name = "/models/Qwen3-0.6B" + ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) + ar.formats = ["gguf:q4_0"] + ar._set_layerwise_config(ar.layer_config) + ar.layer_config, _ = get_layer_config_by_gguf_format( + ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ) + self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) + self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) + + layer_config = { + "model.embed_tokens": {"bits": 6, "super_bits": 8}, + "lm_head": {"bits": 4}, + } + ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0, layer_config=layer_config) + ar.formats = ["gguf:q4_0"] + ar._set_layerwise_config(ar.layer_config) + ar.layer_config, _ = get_layer_config_by_gguf_format( + ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ) + self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) + self.assertTrue( + ar.layer_config["model.embed_tokens"]["bits"] == 6 + and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 + ) + if __name__ == "__main__": unittest.main() From 115f1fd9454645558cc17f8405f7b7d61dd789fa Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 15 Oct 2025 20:34:40 -0400 Subject: [PATCH 03/12] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_gguf_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 0e76bf464..a81d01575 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -341,7 +341,6 @@ def test_qtype_setting(self): from auto_round.utils import get_layer_config_by_gguf_format model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - model_name = "/models/Qwen2.5-0.5B-Instruct" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar._set_layerwise_config(ar.layer_config) From 74a50c118d761a134290965df3a5a6b0f7d2d809 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 15 Oct 2025 21:34:34 -0400 Subject: [PATCH 04/12] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_gguf_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index a81d01575..a4ce0c7b4 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -351,7 +351,6 @@ def test_qtype_setting(self): self.assertTrue(ar.layer_config["lm_head"]["bits"] == 16) model_name = "Qwen/Qwen3-0.6B" - model_name = "/models/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar._set_layerwise_config(ar.layer_config) From a9c3c65ff14aa391445d4fda7067d42bc4245c61 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 16 Oct 2025 20:58:45 -0400 Subject: [PATCH 05/12] fix merge and clean base __init__ Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 178 +++++++++++++++++---------------- auto_round/utils.py | 3 +- 2 files changed, 95 insertions(+), 
86 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6df271396..ca2b0387d 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -85,7 +85,6 @@ is_hpex_available, is_mx_fp, is_nv_fp, - is_separate_lm_head, is_standard_fp, is_static_wfp8afp8, is_wfp8afp8, @@ -209,35 +208,7 @@ def __init__( ... } """ - if isinstance(scheme, AutoScheme): - if len(scheme.options) <= 0: - raise ValueError("options of AutoScheme must not be empty") - options = [] - for option in scheme.options: - new_option = self._parse_and_set_scheme(option, kwargs) - options.append(new_option) - scheme.options = options - for opt in options: - if isinstance(opt, str) and opt == "BF16": - continue - if isinstance(opt, QuantizationScheme): - if opt.bits >= 16 and (opt.act_bits is None or opt.act_bits >= 16): - continue - self.scheme = opt # Choose the first one that not 16 bits - break - - # apply scheme to set default bits - self._parse_and_set_scheme(self.scheme, kwargs) - - self.is_auto_scheme = True - - else: - self.scheme = self._parse_and_set_scheme(scheme, kwargs) - self.is_auto_scheme = False - - scheme_keys = [f.name for f in fields(QuantizationScheme)] - for key in scheme_keys: - kwargs.pop(key, None) + self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs) gguf_scheme_name = get_gguf_scheme(self.scheme) # GGUF uses fp32 scale dtype as default @@ -503,65 +474,102 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: def _parse_and_set_scheme(self, scheme: Union[str, dict, QuantizationScheme], kwargs) -> QuantizationScheme: """Parse and set the quantization scheme.""" - res = "" - if isinstance(scheme, QuantizationScheme): - scheme = asdict(scheme) - elif isinstance(scheme, dict): - scheme = scheme - elif isinstance(scheme, str): - res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different - scheme = scheme.upper() - scheme = asdict(preset_name_to_scheme(scheme)) - scheme_keys = [f.name for f in fields(QuantizationScheme)] - for key in scheme_keys: - if key in kwargs and kwargs[key] is not None: - setattr(self, key, kwargs[key]) - else: - setattr(self, key, scheme.get(key, None)) - # kwargs.pop(key, None) - if self.act_dynamic is None: - self.act_dynamic = True - - tmp_bits = infer_bits_by_data_type(self.data_type) - if tmp_bits is not None and tmp_bits < 16 and tmp_bits != self.bits: - logger.warning(f"'data_type' do not match the specified 'bits' setting. 
Resetting 'bits' to {tmp_bits}.") - self.bits = tmp_bits - if tmp_bits is not None and tmp_bits < 16: - for supported_dtype in SUPPORTED_DTYPES: # to easily handle dtype mx_fp4 and layer_config={xxx:{bits:8}} - if self.data_type.startswith(supported_dtype): - if supported_dtype + str(tmp_bits) == self.data_type: # could not replace FP8_e4m3 - self.data_type = supported_dtype - break - self.act_group_size = self.act_group_size if self.act_group_size is not None else self.group_size - self.act_bits = self.act_bits if self.act_bits is not None else 16 - self.act_sym = self.act_sym if self.act_sym is not None else self.sym + def _parse_and_set(scheme, kwargs): + res = "" + if isinstance(scheme, QuantizationScheme): + scheme = asdict(scheme) + elif isinstance(scheme, dict): + scheme = scheme + elif isinstance(scheme, str): + res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different + scheme = scheme.upper() + scheme = asdict(preset_name_to_scheme(scheme)) + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in scheme_keys: + if key in kwargs and kwargs[key] is not None: + setattr(self, key, kwargs[key]) + else: + setattr(self, key, scheme.get(key, None)) + # kwargs.pop(key, None) + if self.act_dynamic is None: + self.act_dynamic = True - if self.act_data_type is None: - if self.data_type in SUPPORTED_DTYPES and self.act_bits < 16: - self.act_data_type = self.data_type - logger.info(f"activation adopts {self.data_type}") + tmp_bits = infer_bits_by_data_type(self.data_type) + if tmp_bits is not None and tmp_bits < 16 and tmp_bits != self.bits: + logger.warning( + f"'data_type' do not match the specified 'bits' setting. Resetting 'bits' to {tmp_bits}." + ) + self.bits = tmp_bits + if tmp_bits is not None and tmp_bits < 16: + for ( + supported_dtype + ) in SUPPORTED_DTYPES: # to easily handle dtype mx_fp4 and layer_config={xxx:{bits:8}} + if self.data_type.startswith(supported_dtype): + if supported_dtype + str(tmp_bits) == self.data_type: # could not replace FP8_e4m3 + self.data_type = supported_dtype + break + + self.act_group_size = self.act_group_size if self.act_group_size is not None else self.group_size + self.act_bits = self.act_bits if self.act_bits is not None else 16 + self.act_sym = self.act_sym if self.act_sym is not None else self.sym + + if self.act_data_type is None: + if self.data_type in SUPPORTED_DTYPES and self.act_bits < 16: + self.act_data_type = self.data_type + logger.info(f"activation adopts {self.data_type}") + else: + self.act_data_type = "float" + tmp_act_bits = infer_bits_by_data_type(self.act_data_type) + if tmp_act_bits is not None and tmp_act_bits < 16 and tmp_act_bits != self.act_bits: + self.act_bits = tmp_act_bits + logger.warning( + f"`act_data_type` do not" + f" match the specified 'act_bits' setting. Resetting 'act_bits' to {tmp_act_bits}." 
+ ) + if tmp_act_bits is not None and tmp_act_bits < 16: + for ( + supported_dtype + ) in SUPPORTED_DTYPES: # To easily handle dtype mx_fp4 and layer_config={xxx:{bits:8}} + if self.act_data_type.startswith(supported_dtype): + if supported_dtype + str(tmp_act_bits) == self.act_data_type: # Could not replace FP8_e4m3 + self.act_data_type = supported_dtype + break + for key in scheme_keys: + scheme[key] = getattr(self, key) + if res and QuantizationScheme.from_dict(scheme) == preset_name_to_scheme(res): + return res else: - self.act_data_type = "float" - tmp_act_bits = infer_bits_by_data_type(self.act_data_type) - if tmp_act_bits is not None and tmp_act_bits < 16 and tmp_act_bits != self.act_bits: - self.act_bits = tmp_act_bits - logger.warning( - f"`act_data_type` do not" - f" match the specified 'act_bits' setting. Resetting 'act_bits' to {tmp_act_bits}." - ) - if tmp_act_bits is not None and tmp_act_bits < 16: - for supported_dtype in SUPPORTED_DTYPES: # To easily handle dtype mx_fp4 and layer_config={xxx:{bits:8}} - if self.act_data_type.startswith(supported_dtype): - if supported_dtype + str(tmp_act_bits) == self.act_data_type: # Could not replace FP8_e4m3 - self.act_data_type = supported_dtype - break - for key in scheme_keys: - scheme[key] = getattr(self, key) - if res and QuantizationScheme.from_dict(scheme) == preset_name_to_scheme(res): - return res + return QuantizationScheme.from_dict(scheme) + + if isinstance(scheme, AutoScheme): + if len(scheme.options) <= 0: + raise ValueError("options of AutoScheme must not be empty") + options = [] + for option in scheme.options: + new_option = _parse_and_set(option, kwargs) + options.append(new_option) + scheme.options = options + for opt in options: + if isinstance(opt, str) and opt == "BF16": + continue + if isinstance(opt, QuantizationScheme): + if opt.bits >= 16 and (opt.act_bits is None or opt.act_bits >= 16): + continue + self.scheme = opt # Choose the first one that not 16 bits + break + # apply scheme to set default bits + scheme = _parse_and_set(self.scheme, kwargs) + is_auto_scheme = True else: - return QuantizationScheme.from_dict(scheme) + scheme = _parse_and_set(scheme, kwargs) + is_auto_scheme = False + + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in scheme_keys: + kwargs.pop(key, None) + + return scheme, is_auto_scheme def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: """Sets the torch compile configuration for the tuning.""" diff --git a/auto_round/utils.py b/auto_round/utils.py index b2a12832b..3d500a2f6 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2947,7 +2947,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): tie_word_embeddings = model.config.tie_word_embeddings - if quant_lm_head and tie_word_embeddings: + if quant_lm_head and tie_word_embeddings and not gguf_name: quant_lm_head = False logger.warning( "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been supported currently" @@ -2987,6 +2987,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str return layer_config, has_qlayer_outside_block # embed + lm_head defaults for gguf + tie_word_embeddings &= is_separate_lm_head(model) if lm_head_name not in layer_config and not tie_word_embeddings: cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} 
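
Note on the PATCH 05 refactor above: _parse_and_set_scheme() now wraps its old body in a local _parse_and_set() helper so an AutoScheme can run it once per option and then adopt the first option that actually quantizes something (weight bits < 16, or activation bits < 16). Below is a minimal standalone sketch of that selection rule only; Option and pick_default are illustrative stand-ins, not auto_round APIs, and the "keep the first option" fallback is a simplification.

    from dataclasses import dataclass
    from typing import Optional, Union

    @dataclass
    class Option:                    # stand-in for QuantizationScheme
        bits: int
        act_bits: Optional[int] = None

    def pick_default(options: list[Union[str, Option]]) -> Union[str, Option]:
        """Return the first option that actually quantizes; otherwise keep the first."""
        for opt in options:
            if isinstance(opt, str) and opt == "BF16":
                continue             # pure 16-bit placeholder, skip
            if isinstance(opt, Option) and opt.bits >= 16 and (opt.act_bits is None or opt.act_bits >= 16):
                continue             # weights and activations both >= 16 bit, skip
            return opt
        return options[0]

    print(pick_default(["BF16", Option(bits=4)]))   # Option(bits=4, act_bits=None)
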
From 903c296135e5248a4b1153d00467d91cdd48fe34 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 20 Oct 2025 21:49:40 -0400 Subject: [PATCH 06/12] fix Signed-off-by: n1ck-guo --- auto_round/calib_dataset.py | 1 - auto_round/utils.py | 6 ++-- test/test_cpu/test_gguf_format.py | 57 +++++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index 4825177b6..419a60cf9 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -654,7 +654,6 @@ def get_dataloader( Returns: DataLoader: The DataLoader for the calibrated dataset. """ - dataset_names = dataset_name.split(",") def filter_func(example): diff --git a/auto_round/utils.py b/auto_round/utils.py index 3d500a2f6..9d9535aed 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1994,9 +1994,9 @@ def _set_config(config, target_config): config_tmp.pop(key, None) matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched if not matched_scheme: - if config.get("super_group_size", None) is not None: + if config.get("super_group_size", None) is not None or config.get("super_bits", None) is not None: new_type = new_type[:bits_index] + str(config["bits"]) + "_k" - if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG: + elif new_type not in GGUF_INNER_CONFIG: prefix_idx = 0 if config.get("sym", True) else 1 new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}" if new_type not in GGUF_INNER_CONFIG: @@ -2987,7 +2987,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str return layer_config, has_qlayer_outside_block # embed + lm_head defaults for gguf - tie_word_embeddings &= is_separate_lm_head(model) + tie_word_embeddings &= not is_separate_lm_head(model) if lm_head_name not in layer_config and not tie_word_embeddings: cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index a4ce0c7b4..c491a1fdd 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -12,6 +12,7 @@ class LLMDataLoader: + def __init__(self): self.batch_size = 1 @@ -21,9 +22,11 @@ def __iter__(self): class TestGGUF(unittest.TestCase): + @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + self.model_name = "/models/Qwen2.5-0.5B-Instruct" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -338,24 +341,47 @@ def test_qtype_setting(self): # Qwen3-8B output q6_k, token_embed q4_0 4.5G # Llama-3.2-1B-Instruct o output, token_embed q6_k 736M from auto_round.export.export_to_gguf.config import ModelType - from auto_round.utils import get_layer_config_by_gguf_format + from auto_round.utils import get_layer_config_by_gguf_format, set_layer_config model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = "/models/Qwen2.5-0.5B-Instruct" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] - ar._set_layerwise_config(ar.layer_config) - ar.layer_config, _ = get_layer_config_by_gguf_format( - ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ar.layer_config, _ = set_layer_config( + ar.model, + ar.layer_config, + ar.scheme, + ar.scale_dtype, + 
ar.supported_types, + ar.inner_supported_types, + ar.quant_block_list, + ar.fp_layers, + ar.quant_lm_head, + enable_gguf_official_mixed=True, + is_mllm=ar.mllm, ) + # ar.layer_config, _ = get_layer_config_by_gguf_format( + # ar.layer_config, ar.formats[0], ar.model, model_type=ModelType.TEXT) + print(ar.layer_config["model.embed_tokens"]["bits"]) self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 16) + # self.assertTrue(ar.layer_config["lm_head"]["bits"] == 16) + self.assertTrue("lm_head" not in ar.layer_config) model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] - ar._set_layerwise_config(ar.layer_config) - ar.layer_config, _ = get_layer_config_by_gguf_format( - ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ar.layer_config, _ = set_layer_config( + ar.model, + ar.layer_config, + ar.scheme, + ar.scale_dtype, + ar.supported_types, + ar.inner_supported_types, + ar.quant_block_list, + ar.fp_layers, + ar.quant_lm_head, + enable_gguf_official_mixed=True, + is_mllm=ar.mllm, ) self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) @@ -366,9 +392,18 @@ def test_qtype_setting(self): } ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0, layer_config=layer_config) ar.formats = ["gguf:q4_0"] - ar._set_layerwise_config(ar.layer_config) - ar.layer_config, _ = get_layer_config_by_gguf_format( - ar.layer_config, ar.formats, ar.model, model_type=ModelType.TEXT + ar.layer_config, _ = set_layer_config( + ar.model, + ar.layer_config, + ar.scheme, + ar.scale_dtype, + ar.supported_types, + ar.inner_supported_types, + ar.quant_block_list, + ar.fp_layers, + ar.quant_lm_head, + enable_gguf_official_mixed=True, + is_mllm=ar.mllm, ) self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) self.assertTrue( From e5967c1053f3d30aa833629133ab86e6e6b49056 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 21 Oct 2025 21:33:27 -0400 Subject: [PATCH 07/12] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_gguf_format.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c491a1fdd..c3981ab5d 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -26,7 +26,6 @@ class TestGGUF(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - self.model_name = "/models/Qwen2.5-0.5B-Instruct" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -344,7 +343,6 @@ def test_qtype_setting(self): from auto_round.utils import get_layer_config_by_gguf_format, set_layer_config model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - model_name = "/models/Qwen2.5-0.5B-Instruct" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _ = set_layer_config( From 17ca354c00ebafd7176408ffed75a7f1d6ff3f7b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 21 Oct 2025 22:59:25 -0400 Subject: [PATCH 08/12] fix ut Signed-off-by: n1ck-guo --- auto_round/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index 9d9535aed..cb44a43c8 100644 --- a/auto_round/utils.py +++ 
b/auto_round/utils.py @@ -1996,7 +1996,7 @@ def _set_config(config, target_config): if not matched_scheme: if config.get("super_group_size", None) is not None or config.get("super_bits", None) is not None: new_type = new_type[:bits_index] + str(config["bits"]) + "_k" - elif new_type not in GGUF_INNER_CONFIG: + if new_type not in GGUF_INNER_CONFIG: prefix_idx = 0 if config.get("sym", True) else 1 new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}" if new_type not in GGUF_INNER_CONFIG: From 24c00c72118fa02aeb8259d676189d10038e1663 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Oct 2025 03:00:30 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index dafe67bd6..c97457a51 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -3084,7 +3084,8 @@ def is_separate_lm_head(model: torch.nn.Module) -> bool: return True else: return False - + + def to_standard_regex(pattern: str) -> str: """ Convert a user-specified string into a standardized regex for layer matching. From fd74c462198364d12322f434abc930cad1dc1f05 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 24 Oct 2025 02:36:39 -0400 Subject: [PATCH 10/12] fix Signed-off-by: n1ck-guo --- test/test_cpu/test_gguf_format.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c3981ab5d..7505db913 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -345,7 +345,7 @@ def test_qtype_setting(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] - ar.layer_config, _ = set_layer_config( + ar.layer_config, _, _ = set_layer_config( ar.model, ar.layer_config, ar.scheme, @@ -358,17 +358,13 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - # ar.layer_config, _ = get_layer_config_by_gguf_format( - # ar.layer_config, ar.formats[0], ar.model, model_type=ModelType.TEXT) - print(ar.layer_config["model.embed_tokens"]["bits"]) self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - # self.assertTrue(ar.layer_config["lm_head"]["bits"] == 16) self.assertTrue("lm_head" not in ar.layer_config) model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] - ar.layer_config, _ = set_layer_config( + ar.layer_config, _, _ = set_layer_config( ar.model, ar.layer_config, ar.scheme, @@ -390,7 +386,7 @@ def test_qtype_setting(self): } ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0, layer_config=layer_config) ar.formats = ["gguf:q4_0"] - ar.layer_config, _ = set_layer_config( + ar.layer_config, _, _ = set_layer_config( ar.model, ar.layer_config, ar.scheme, From e7b4503f290137a0ac719e9a5bf82f12579c6900 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 27 Oct 2025 09:58:37 +0800 Subject: [PATCH 11/12] Update base.py --- auto_round/compressors/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6ec6ae03b..c890d3fbe 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -484,7 
+484,10 @@ def _parse_and_set(scheme, kwargs): elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): - res = scheme # gguf:q4_k_s and gguf_q4_k_m has the same dict scheme, but the result is different + # We’d better keep the string scheme instead of the dict config, + # since GGUF uses different mixed-bit strategies for q4_k_s and q4_k_m + # even though they share the same scheme dict. + res = scheme scheme = scheme.upper() scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] From adcf1f6843cc86d6a714967922e09841e8cf4e21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 01:58:57 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c890d3fbe..1d259862e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -484,8 +484,8 @@ def _parse_and_set(scheme, kwargs): elif isinstance(scheme, dict): scheme = scheme elif isinstance(scheme, str): - # We’d better keep the string scheme instead of the dict config, - # since GGUF uses different mixed-bit strategies for q4_k_s and q4_k_m + # We’d better keep the string scheme instead of the dict config, + # since GGUF uses different mixed-bit strategies for q4_k_s and q4_k_m # even though they share the same scheme dict. res = scheme scheme = scheme.upper()
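
Taken together, the series keys the GGUF lm_head/embedding qtype choice on whether the checkpoint really ties its word embeddings: the config flag tie_word_embeddings only counts when no separate lm_head.weight tensor is stored (PATCH 02's is_separate_lm_head(), with its inverted use corrected in PATCH 06). The sketch below restates that check outside auto_round as a minimal example; the function name, the hard-coded "lm_head.weight" key, and the two-file checkpoint layout handling are illustrative assumptions, not the library's API.

    import json
    import os

    from safetensors import safe_open

    def lm_head_is_tied(checkpoint_dir: str, config_tie_word_embeddings: bool) -> bool:
        """Treat lm_head as tied only if the config says so AND the checkpoint
        does not store lm_head.weight as its own tensor."""
        index_path = os.path.join(checkpoint_dir, "model.safetensors.index.json")
        if os.path.isfile(index_path):
            # sharded checkpoint: look the key up in the weight map
            with open(index_path) as f:
                stored_separately = "lm_head.weight" in json.load(f)["weight_map"]
        else:
            # single-file checkpoint: inspect the tensor names directly
            with safe_open(os.path.join(checkpoint_dir, "model.safetensors"), framework="pt") as f:
                stored_separately = "lm_head.weight" in f.keys()
        return config_tie_word_embeddings and not stored_separately
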