From 8c735ba92456c2f5828278132fd1a3c61e1161ef Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 17 Oct 2025 15:54:44 +0800 Subject: [PATCH 1/7] Loosen the packing restrictions for mxfp&nvfp, enable Qwen1.5-MoE-A2.7B quantize Signed-off-by: Zhang, Weiwei1 --- .../export/export_to_autoround/qlinear_fp.py | 3 +- auto_round/inference/backend.py | 7 +- test/test_cpu/test_export.py | 269 -------------- test/test_cpu/test_mxfp_nvfp.py | 338 ++++++++++++++++++ test/test_cuda/test_export.py | 110 ------ test/test_cuda/test_mxfp_nvfp.py | 173 +++++++++ 6 files changed, 516 insertions(+), 384 deletions(-) create mode 100644 test/test_cpu/test_mxfp_nvfp.py create mode 100644 test/test_cuda/test_mxfp_nvfp.py diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 34cbb6f94..3299440a0 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -72,8 +72,6 @@ def __init__( super().__init__() if bits not in [4, 8]: raise NotImplementedError("Only 4,8 bits are supported.") - if infeatures % 32 != 0 or outfeatures % 32 != 0: - raise NotImplementedError("in_feature and out_feature must be divisible by 32.") self.is_mx = is_mx_fp(data_type) self.is_nv = is_nv_fp(data_type) if self.is_mx and group_size != 32: @@ -236,3 +234,4 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) + diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index aa74f37e1..a49c0ac4c 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -239,7 +239,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_32], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -259,7 +259,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_32], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -280,7 +280,7 @@ def fp8_static_scheme_checker( act_data_type=["nv_fp4_with_static_gs"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_16], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -1025,3 +1025,4 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) + diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 180fd8f2f..ea484316b 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = 
AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - - shutil.rmtree("./saved", ignore_errors=True) - - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=0, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - - def test_mxfp8_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP8" 
- autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight") - and tmp_layer.weight.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP8 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 - ), f"Invalid MXFP8 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty - assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty - assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_save_quantized(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "self_attn": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - compressed_model, _ = autoround.quantize() - assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "q_proj": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - "experts.*2": {"bits": 16, "act_bits": 16}, - "experts.*5": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py new file mode 100644 index 000000000..dd084add1 --- /dev/null +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -0,0 +1,338 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer + +from auto_round import AutoRound + + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + 
+ +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRoundFP(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "self_attn": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + compressed_model, _ = autoround.quantize() + assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "q_proj": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + "experts.*2": {"bits": 16, "act_bits": 16}, + "experts.*5": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + + + def test_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {} + fp_layers_str = "k_proj" + from auto_round.utils import get_fp_layer_names + + not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 
quantization configuration: {quantization_config}" + + shutil.rmtree("./saved", ignore_errors=True) + + + def test_rtn_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "MXFP4" + layer_config = {} + fp_layers_str = "k_proj" + from auto_round.utils import get_fp_layer_names + + not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_mxfp8_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "MXFP8" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight") + and tmp_layer.weight.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP8 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 + ), f"Invalid MXFP8 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + assert ( + 0.15 < folder_size_gb < 0.2 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_llmcompressor_format(self): + model_name = 
"/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + assert ( + 0.1 < folder_size_gb < 0.15 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_autoround_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_autoround_save_quantized(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_qwen_moe_quant_infer(self): + model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = 
AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() + diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 0ab05134f..d6d6c1f93 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -322,116 +322,6 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" - scheme = "mxfp4" - ar = AutoRound( - model=model_name, - iters=2, - seqlen=2, - scheme=scheme, - dataset=self.llm_dataloader, - ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") - tmp_layer = compressed_model.model.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 2048 - ), "Illegal MXFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree(self.save_dir, ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - scheme = "nvfp4" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid 
NVFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - # from vllm import LLM, SamplingParams - # prompts = [ - # "The capital of France is", - # "The future of AI is", - # ] - ## Create a sampling params object. - # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) - # QUANTIZATION = "compressed-tensors" - # llm = LLM(model=quantized_model_path, - # # quantization=QUANTIZATION, - # trust_remote_code=True, - # tensor_parallel_size=1, - # enforce_eager=True, - # gpu_memory_utilization=0.7, - # ) - # outputs = llm.generate(prompts, sampling_params) - # # Print the outputs. - # for output in outputs: - # prompt = output.prompt - # generated_text = output.outputs[0].text - # if "France" in prompt: - # assert "Paris" in generated_text - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py new file mode 100644 index 000000000..7f0d9c82b --- /dev/null +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -0,0 +1,173 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "facebook/opt-125m" + self.save_dir = "./saved" + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_fp8input_mxfp4_llmcompressor_format(self): + model_name = "/models/Qwen3-0.6B-FP8" + scheme = "mxfp4" + ar = AutoRound( + model=model_name, + iters=2, + seqlen=2, + scheme=scheme, + dataset=self.llm_dataloader, + ) + compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + tmp_layer = compressed_model.model.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 2048 + ), "Illegal MXFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and 
quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + scheme = "nvfp4" + autoround = AutoRound( + self.model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + # from vllm import LLM, SamplingParams + # prompts = [ + # "The capital of France is", + # "The future of AI is", + # ] + ## Create a sampling params object. + # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) + # QUANTIZATION = "compressed-tensors" + # llm = LLM(model=quantized_model_path, + # # quantization=QUANTIZATION, + # trust_remote_code=True, + # tensor_parallel_size=1, + # enforce_eager=True, + # gpu_memory_utilization=0.7, + # ) + # outputs = llm.generate(prompts, sampling_params) + # # Print the outputs. 
+ # for output in outputs: + # prompt = output.prompt + # generated_text = output.outputs[0].text + # if "France" in prompt: + # assert "Paris" in generated_text + + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + + def test_qwen_moe_quant_infer(self): + model_name = "/models/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() + From 51e0a5f418f7f4689a0c0708b70e37a73772882a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 07:58:53 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../export/export_to_autoround/qlinear_fp.py | 1 - auto_round/inference/backend.py | 1 - test/test_cpu/test_mxfp_nvfp.py | 16 ++++++---------- test/test_cuda/test_mxfp_nvfp.py | 6 ++---- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 3299440a0..c2ceaf9fa 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -234,4 +234,3 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) - diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a49c0ac4c..4d67c4c15 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -1025,4 +1025,3 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) - diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index dd084add1..21bd1d36b 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class 
TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,7 +45,6 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -86,7 +85,6 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -134,10 +132,10 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP4" layer_config = {} fp_layers_str = "k_proj" @@ -180,10 +178,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP8" autoround = AutoRound( model_name, @@ -217,10 +215,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -254,7 +252,6 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -280,10 +277,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -306,7 +303,6 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -327,6 +323,7 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 
0.60) @@ -335,4 +332,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 7f0d9c82b..48dd27d9b 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -11,6 +11,7 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum + class LLMDataLoader: def __init__(self): self.batch_size = 1 @@ -32,7 +33,6 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): model_name = "/models/Qwen3-0.6B-FP8" scheme = "mxfp4" @@ -110,7 +110,6 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" @@ -141,7 +140,6 @@ def test_nvfp4_moe_actmax_ar(self): quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): model_name = "/models/Qwen1.5-MoE-A2.7B" layer_config = { @@ -162,6 +160,7 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) @@ -170,4 +169,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - From b3a0037e9c71ed897c348a462b3a006b0fcb27fa Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 17 Oct 2025 16:40:18 +0800 Subject: [PATCH 3/7] fix UT Signed-off-by: Zhang, Weiwei1 --- test/test_cpu/test_mxfp_nvfp.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 21bd1d36b..7465ef45a 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,6 +45,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -85,18 +86,13 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - 
layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( model_name, scheme=scheme, @@ -132,18 +128,12 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) + def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( model_name, scheme=scheme, @@ -178,10 +168,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) + def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "MXFP8" autoround = AutoRound( model_name, @@ -215,10 +205,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "NVFP4" autoround = AutoRound( model_name, @@ -252,6 +242,7 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -277,10 +268,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "NVFP4" autoround = AutoRound( model_name, @@ -303,6 +294,7 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) + def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -323,7 +315,6 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) @@ -332,3 +323,4 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() + From 894775b3f1b8b28734c0f59d4d09283c9b25e0e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:40:48 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for 
more information, see https://pre-commit.ci --- test/test_cpu/test_mxfp_nvfp.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 7465ef45a..4fcd25135 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,7 +45,6 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -86,7 +85,6 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -128,10 +126,10 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP4" layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( @@ -168,10 +166,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP8" autoround = AutoRound( model_name, @@ -205,10 +203,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -242,7 +240,6 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -268,10 +265,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -294,7 +291,6 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -315,6 +311,7 @@ def 
test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) @@ -323,4 +320,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - From fe1e1988ab26709abbf3c787d4cb6d6ec2073281 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 20 Oct 2025 22:25:21 +0800 Subject: [PATCH 5/7] refine mxfp&nvfp layer checker Signed-off-by: Zhang, Weiwei1 --- .../export/export_to_autoround/qlinear_fp.py | 21 ++++++++++++++----- auto_round/inference/backend.py | 18 ++++++++++++---- auto_round/utils.py | 9 ++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index c2ceaf9fa..61df26483 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -38,7 +38,7 @@ from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad -from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp +from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp # from auto_round.utils import get_weight_compress_dtype logger = getLogger(__name__) @@ -74,10 +74,20 @@ def __init__( raise NotImplementedError("Only 4,8 bits are supported.") self.is_mx = is_mx_fp(data_type) self.is_nv = is_nv_fp(data_type) - if self.is_mx and group_size != 32: - raise NotImplementedError("Only group_size 32 are supported for mxfp.") - if self.is_nv and group_size not in [16, 32]: - raise NotImplementedError("Only group_size 16 are supported for nvfp.") + if self.is_mx: + if group_size != 32: + raise NotImplementedError(f"Only group_size 32 are supported for {BackendDataType.MX_FP} data type.") + if infeatures % group_size != 0: + raise NotImplementedError( + f"in_feature must be divisible by {group_size} for {BackendDataType.MX_FP} data type." + ) + if self.is_nv: + if group_size % 16 != 0: + raise NotImplementedError(f"Only group_size 16 are supported for {BackendDataType.NV_FP} data type.") + if infeatures % group_size != 0: + raise NotImplementedError( + f"in_feature must be divisible by {group_size} for {BackendDataType.NV_FP} data type." 
+ ) self.infeatures = infeatures self.outfeatures = outfeatures self.bits = bits @@ -234,3 +244,4 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) + diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4d67c4c15..5482cf788 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -127,12 +127,19 @@ def feature_multiply_checker_group_size( ) +def in_feature_checker_group_size(in_feature, out_feature, config): + group_size = config["group_size"] + return in_feature % group_size == 0 + + feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32) feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16) in_output_feature_multiply_checker_32 = functools.partial( feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32 ) - +in_feature_multiply_checker_32 = functools.partial( + feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None +) exllamav2_feature_checker = functools.partial( feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32 ) @@ -141,6 +148,8 @@ def feature_multiply_checker_group_size( feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) +mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size) + def fp8_static_scheme_checker( in_feature: int, @@ -239,7 +248,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -259,7 +268,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -280,7 +289,7 @@ def fp8_static_scheme_checker( act_data_type=["nv_fp4_with_static_gs"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -1025,3 +1034,4 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) + diff --git a/auto_round/utils.py b/auto_round/utils.py index 8c8c9acc5..6aa38f547 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2963,6 +2963,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str layer_config.setdefault(n, copy.deepcopy(default_dict)) layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + # enforce shape divisibility for mxfp/nvfp + if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[1] % default_dict["group_size"]: + layer_config.setdefault(n, copy.deepcopy(default_dict)) + layer_config[n].update( + {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by {default_dict['group_size']}).") # 9. 
block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): From ca9672c47b585c8a3b288f27a1466f76db594bfe Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 20 Oct 2025 22:40:11 +0800 Subject: [PATCH 6/7] fix pylint Signed-off-by: Zhang, Weiwei1 --- auto_round/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index 6aa38f547..70d7f013b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2970,8 +2970,11 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if m.weight.shape[1] % default_dict["group_size"]: layer_config.setdefault(n, copy.deepcopy(default_dict)) layer_config[n].update( - {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}) - logger.warning_once(f"{n} skipped quantization (shape not divisible by {default_dict['group_size']}).") + {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True} + ) + logger.warning_once( + f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})." + ) # 9. block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): @@ -3051,3 +3054,4 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool: return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) else: return False + From f3cc522c7dfae8bb2d9e0dec8d3a6b3e6a5a7987 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:28:22 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_autoround/qlinear_fp.py | 1 - auto_round/inference/backend.py | 1 - auto_round/utils.py | 1 - 3 files changed, 3 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 61df26483..f7979e269 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -244,4 +244,3 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) - diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 5482cf788..4d26eb143 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -1034,4 +1034,3 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) - diff --git a/auto_round/utils.py b/auto_round/utils.py index 70d7f013b..6742011fa 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -3054,4 +3054,3 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool: return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) else: return False -
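
---

Note (not part of the patch series): the net effect of the loosened restriction above reduces to a single divisibility rule, now enforced by `in_feature_checker_group_size` in `backend.py` and by the 16-bit fallback added to `utils.py`. The sketch below is a minimal, standalone illustration of that rule; the function name `is_packable` and the example shapes are made up for illustration and do not exist in auto-round.

```python
# Illustrative-only sketch of the relaxed packing eligibility from this series:
# packing now requires only that in_features be divisible by the quantization
# group size (32 for MXFP4/MXFP8, a multiple of 16 for NVFP4); out_features no
# longer needs to be a multiple of 32.


def is_packable(in_features: int, out_features: int, group_size: int) -> bool:
    """Mirror of in_feature_checker_group_size: only in_features matters."""
    return in_features % group_size == 0


if __name__ == "__main__":
    # A gate/routing-style layer with a small, non-multiple-of-32 output dim
    # was rejected by the old "% 32 on both dimensions" rule but passes now.
    print(is_packable(in_features=2048, out_features=60, group_size=32))   # True
    # A layer whose input dim is ragged w.r.t. the group size still cannot be
    # packed; the utils.py change keeps such layers in 16-bit instead.
    print(is_packable(in_features=100, out_features=256, group_size=32))   # False
```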