From 8c735ba92456c2f5828278132fd1a3c61e1161ef Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 17 Oct 2025 15:54:44 +0800 Subject: [PATCH 1/7] Loosen the packing restrictions for mxfp&nvfp, enable Qwen1.5-MoE-A2.7B quantize Signed-off-by: Zhang, Weiwei1 --- .../export/export_to_autoround/qlinear_fp.py | 3 +- auto_round/inference/backend.py | 7 +- test/test_cpu/test_export.py | 269 -------------- test/test_cpu/test_mxfp_nvfp.py | 338 ++++++++++++++++++ test/test_cuda/test_export.py | 110 ------ test/test_cuda/test_mxfp_nvfp.py | 173 +++++++++ 6 files changed, 516 insertions(+), 384 deletions(-) create mode 100644 test/test_cpu/test_mxfp_nvfp.py create mode 100644 test/test_cuda/test_mxfp_nvfp.py diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 34cbb6f94..3299440a0 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -72,8 +72,6 @@ def __init__( super().__init__() if bits not in [4, 8]: raise NotImplementedError("Only 4,8 bits are supported.") - if infeatures % 32 != 0 or outfeatures % 32 != 0: - raise NotImplementedError("in_feature and out_feature must be divisible by 32.") self.is_mx = is_mx_fp(data_type) self.is_nv = is_nv_fp(data_type) if self.is_mx and group_size != 32: @@ -236,3 +234,4 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) + diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index aa74f37e1..a49c0ac4c 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -239,7 +239,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_32], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -259,7 +259,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_32], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -280,7 +280,7 @@ def fp8_static_scheme_checker( act_data_type=["nv_fp4_with_static_gs"], act_dynamic=[True], priority=0, - checkers=[feature_multiply_checker_16], + checkers=[], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -1025,3 +1025,4 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) + diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 180fd8f2f..ea484316b 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = 
AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - - shutil.rmtree("./saved", ignore_errors=True) - - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=0, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - - def test_mxfp8_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP8" 
- autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight") - and tmp_layer.weight.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP8 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 - ), f"Invalid MXFP8 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty - assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty - assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_save_quantized(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "self_attn": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - compressed_model, _ = autoround.quantize() - assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "q_proj": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - "experts.*2": {"bits": 16, "act_bits": 16}, - "experts.*5": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py new file mode 100644 index 000000000..dd084add1 --- /dev/null +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -0,0 +1,338 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer + +from auto_round import AutoRound + + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + 
+ +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRoundFP(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "self_attn": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + compressed_model, _ = autoround.quantize() + assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "q_proj": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + "experts.*2": {"bits": 16, "act_bits": 16}, + "experts.*5": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + + + def test_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {} + fp_layers_str = "k_proj" + from auto_round.utils import get_fp_layer_names + + not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 
quantization configuration: {quantization_config}" + + shutil.rmtree("./saved", ignore_errors=True) + + + def test_rtn_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "MXFP4" + layer_config = {} + fp_layers_str = "k_proj" + from auto_round.utils import get_fp_layer_names + + not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) + for name in not_quantize_layer_names: + layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_mxfp8_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "MXFP8" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight") + and tmp_layer.weight.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP8 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 + ), f"Invalid MXFP8 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + assert ( + 0.15 < folder_size_gb < 0.2 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_llmcompressor_format(self): + model_name = 
"/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + assert ( + 0.1 < folder_size_gb < 0.15 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_autoround_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_nvfp4_autoround_save_quantized(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + + def test_qwen_moe_quant_infer(self): + model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = 
AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() + diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 0ab05134f..d6d6c1f93 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -322,116 +322,6 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" - scheme = "mxfp4" - ar = AutoRound( - model=model_name, - iters=2, - seqlen=2, - scheme=scheme, - dataset=self.llm_dataloader, - ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") - tmp_layer = compressed_model.model.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 2048 - ), "Illegal MXFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree(self.save_dir, ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - scheme = "nvfp4" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid 
NVFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - # from vllm import LLM, SamplingParams - # prompts = [ - # "The capital of France is", - # "The future of AI is", - # ] - ## Create a sampling params object. - # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) - # QUANTIZATION = "compressed-tensors" - # llm = LLM(model=quantized_model_path, - # # quantization=QUANTIZATION, - # trust_remote_code=True, - # tensor_parallel_size=1, - # enforce_eager=True, - # gpu_memory_utilization=0.7, - # ) - # outputs = llm.generate(prompts, sampling_params) - # # Print the outputs. - # for output in outputs: - # prompt = output.prompt - # generated_text = output.outputs[0].text - # if "France" in prompt: - # assert "Paris" in generated_text - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py new file mode 100644 index 000000000..7f0d9c82b --- /dev/null +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -0,0 +1,173 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "facebook/opt-125m" + self.save_dir = "./saved" + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + + def test_fp8input_mxfp4_llmcompressor_format(self): + model_name = "/models/Qwen3-0.6B-FP8" + scheme = "mxfp4" + ar = AutoRound( + model=model_name, + iters=2, + seqlen=2, + scheme=scheme, + dataset=self.llm_dataloader, + ) + compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + tmp_layer = compressed_model.model.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 2048 + ), "Illegal MXFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and 
quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + scheme = "nvfp4" + autoround = AutoRound( + self.model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + # from vllm import LLM, SamplingParams + # prompts = [ + # "The capital of France is", + # "The future of AI is", + # ] + ## Create a sampling params object. + # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) + # QUANTIZATION = "compressed-tensors" + # llm = LLM(model=quantized_model_path, + # # quantization=QUANTIZATION, + # trust_remote_code=True, + # tensor_parallel_size=1, + # enforce_eager=True, + # gpu_memory_utilization=0.7, + # ) + # outputs = llm.generate(prompts, sampling_params) + # # Print the outputs. 
+ # for output in outputs: + # prompt = output.prompt + # generated_text = output.outputs[0].text + # if "France" in prompt: + # assert "Paris" in generated_text + + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + + def test_qwen_moe_quant_infer(self): + model_name = "/models/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() + From 51e0a5f418f7f4689a0c0708b70e37a73772882a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 07:58:53 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../export/export_to_autoround/qlinear_fp.py | 1 - auto_round/inference/backend.py | 1 - test/test_cpu/test_mxfp_nvfp.py | 16 ++++++---------- test/test_cuda/test_mxfp_nvfp.py | 6 ++---- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 3299440a0..c2ceaf9fa 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -234,4 +234,3 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) - diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a49c0ac4c..4d67c4c15 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -1025,4 +1025,3 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) - diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index dd084add1..21bd1d36b 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class 
TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,7 +45,6 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -86,7 +85,6 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -134,10 +132,10 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP4" layer_config = {} fp_layers_str = "k_proj" @@ -180,10 +178,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP8" autoround = AutoRound( model_name, @@ -217,10 +215,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -254,7 +252,6 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -280,10 +277,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -306,7 +303,6 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -327,6 +323,7 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 
0.60) @@ -335,4 +332,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 7f0d9c82b..48dd27d9b 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -11,6 +11,7 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum + class LLMDataLoader: def __init__(self): self.batch_size = 1 @@ -32,7 +33,6 @@ def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): model_name = "/models/Qwen3-0.6B-FP8" scheme = "mxfp4" @@ -110,7 +110,6 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" @@ -141,7 +140,6 @@ def test_nvfp4_moe_actmax_ar(self): quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): model_name = "/models/Qwen1.5-MoE-A2.7B" layer_config = { @@ -162,6 +160,7 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) @@ -170,4 +169,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - From b3a0037e9c71ed897c348a462b3a006b0fcb27fa Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 17 Oct 2025 16:40:18 +0800 Subject: [PATCH 3/7] fix UT Signed-off-by: Zhang, Weiwei1 --- test/test_cpu/test_mxfp_nvfp.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 21bd1d36b..7465ef45a 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,6 +45,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) + def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -85,18 +86,13 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - 
layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( model_name, scheme=scheme, @@ -132,18 +128,12 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) + def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( model_name, scheme=scheme, @@ -178,10 +168,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) + def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "MXFP8" autoround = AutoRound( model_name, @@ -215,10 +205,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "NVFP4" autoround = AutoRound( model_name, @@ -252,6 +242,7 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -277,10 +268,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) + def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig - scheme = "NVFP4" autoround = AutoRound( model_name, @@ -303,6 +294,7 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) + def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -323,7 +315,6 @@ def test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) @@ -332,3 +323,4 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() + From 894775b3f1b8b28734c0f59d4d09283c9b25e0e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:40:48 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for 
more information, see https://pre-commit.ci --- test/test_cpu/test_mxfp_nvfp.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 7465ef45a..4fcd25135 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -35,7 +35,7 @@ def __iter__(self): class TestAutoRoundFP(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" #/tf_dataset/auto_round/models/ + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -45,7 +45,6 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" @@ -86,7 +85,6 @@ def test_nvfp4_moe_actmax_ar(self): ) autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - def test_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -128,10 +126,10 @@ def test_mxfp4_llmcompressor_format(self): shutil.rmtree("./saved", ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP4" layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} autoround = AutoRound( @@ -168,10 +166,10 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree("./saved", ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "MXFP8" autoround = AutoRound( model_name, @@ -205,10 +203,10 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -242,7 +240,6 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_format(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig @@ -268,10 +265,10 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from transformers import AutoConfig + scheme = "NVFP4" autoround = AutoRound( model_name, @@ -294,7 +291,6 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree("./saved", ignore_errors=True) - def test_qwen_moe_quant_infer(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { @@ -315,6 +311,7 @@ def 
test_qwen_moe_quant_infer(self): model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) @@ -323,4 +320,3 @@ def test_qwen_moe_quant_infer(self): if __name__ == "__main__": unittest.main() - From fe1e1988ab26709abbf3c787d4cb6d6ec2073281 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 20 Oct 2025 22:25:21 +0800 Subject: [PATCH 5/7] refine mxfp&nvfp layer checker Signed-off-by: Zhang, Weiwei1 --- .../export/export_to_autoround/qlinear_fp.py | 21 ++++++++++++++----- auto_round/inference/backend.py | 18 ++++++++++++---- auto_round/utils.py | 9 ++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index c2ceaf9fa..61df26483 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -38,7 +38,7 @@ from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad -from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp +from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp # from auto_round.utils import get_weight_compress_dtype logger = getLogger(__name__) @@ -74,10 +74,20 @@ def __init__( raise NotImplementedError("Only 4,8 bits are supported.") self.is_mx = is_mx_fp(data_type) self.is_nv = is_nv_fp(data_type) - if self.is_mx and group_size != 32: - raise NotImplementedError("Only group_size 32 are supported for mxfp.") - if self.is_nv and group_size not in [16, 32]: - raise NotImplementedError("Only group_size 16 are supported for nvfp.") + if self.is_mx: + if group_size != 32: + raise NotImplementedError(f"Only group_size 32 are supported for {BackendDataType.MX_FP} data type.") + if infeatures % group_size != 0: + raise NotImplementedError( + f"in_feature must be divisible by {group_size} for {BackendDataType.MX_FP} data type." + ) + if self.is_nv: + if group_size % 16 != 0: + raise NotImplementedError(f"Only group_size 16 are supported for {BackendDataType.NV_FP} data type.") + if infeatures % group_size != 0: + raise NotImplementedError( + f"in_feature must be divisible by {group_size} for {BackendDataType.NV_FP} data type." 
+ ) self.infeatures = infeatures self.outfeatures = outfeatures self.bits = bits @@ -234,3 +244,4 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) + diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4d67c4c15..5482cf788 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -127,12 +127,19 @@ def feature_multiply_checker_group_size( ) +def in_feature_checker_group_size(in_feature, out_feature, config): + group_size = config["group_size"] + return in_feature % group_size == 0 + + feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32) feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16) in_output_feature_multiply_checker_32 = functools.partial( feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32 ) - +in_feature_multiply_checker_32 = functools.partial( + feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None +) exllamav2_feature_checker = functools.partial( feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32 ) @@ -141,6 +148,8 @@ def feature_multiply_checker_group_size( feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) +mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size) + def fp8_static_scheme_checker( in_feature: int, @@ -239,7 +248,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -259,7 +268,7 @@ def fp8_static_scheme_checker( act_data_type=["mx_fp_rceil"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -280,7 +289,7 @@ def fp8_static_scheme_checker( act_data_type=["nv_fp4_with_static_gs"], act_dynamic=[True], priority=0, - checkers=[], + checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.7.0"], ) @@ -1025,3 +1034,4 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) + diff --git a/auto_round/utils.py b/auto_round/utils.py index 8c8c9acc5..6aa38f547 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2963,6 +2963,15 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str layer_config.setdefault(n, copy.deepcopy(default_dict)) layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") + # enforce shape divisibility for mxfp/nvfp + if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name: + for n, m in model.named_modules(): + if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: + if m.weight.shape[1] % default_dict["group_size"]: + layer_config.setdefault(n, copy.deepcopy(default_dict)) + layer_config[n].update( + {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}) + logger.warning_once(f"{n} skipped quantization (shape not divisible by {default_dict['group_size']}).") # 9. 
block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): From ca9672c47b585c8a3b288f27a1466f76db594bfe Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 20 Oct 2025 22:40:11 +0800 Subject: [PATCH 6/7] fix pylint Signed-off-by: Zhang, Weiwei1 --- auto_round/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index 6aa38f547..70d7f013b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2970,8 +2970,11 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if m.weight.shape[1] % default_dict["group_size"]: layer_config.setdefault(n, copy.deepcopy(default_dict)) layer_config[n].update( - {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}) - logger.warning_once(f"{n} skipped quantization (shape not divisible by {default_dict['group_size']}).") + {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True} + ) + logger.warning_once( + f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})." + ) # 9. block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): @@ -3051,3 +3054,4 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool: return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) else: return False + From f3cc522c7dfae8bb2d9e0dec8d3a6b3e6a5a7987 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:28:22 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_autoround/qlinear_fp.py | 1 - auto_round/inference/backend.py | 1 - auto_round/utils.py | 1 - 3 files changed, 3 deletions(-) diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 61df26483..f7979e269 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -244,4 +244,3 @@ def _pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor: packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) return packed.reshape(m, n // 2) - diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 5482cf788..4d26eb143 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -1034,4 +1034,3 @@ def build_pip_commands(gptq_req, other_reqs): log(joined_cmds) if logger_level == "error": exit(-1) - diff --git a/auto_round/utils.py b/auto_round/utils.py index 70d7f013b..6742011fa 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -3054,4 +3054,3 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool: return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) else: return False -
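
---

Note (not part of the patch series): the net effect of the loosened restriction above reduces to a single divisibility rule, now enforced by `in_feature_checker_group_size` in `backend.py` and by the 16-bit fallback added to `utils.py`. The sketch below is a minimal, standalone illustration of that rule; the function name `is_packable` and the example shapes are made up for illustration and do not exist in auto-round.

```python
# Illustrative-only sketch of the relaxed packing eligibility from this series:
# packing now requires only that in_features be divisible by the quantization
# group size (32 for MXFP4/MXFP8, a multiple of 16 for NVFP4); out_features no
# longer needs to be a multiple of 32.


def is_packable(in_features: int, out_features: int, group_size: int) -> bool:
    """Mirror of in_feature_checker_group_size: only in_features matters."""
    return in_features % group_size == 0


if __name__ == "__main__":
    # A gate/routing-style layer with a small, non-multiple-of-32 output dim
    # was rejected by the old "% 32 on both dimensions" rule but passes now.
    print(is_packable(in_features=2048, out_features=60, group_size=32))   # True
    # A layer whose input dim is ragged w.r.t. the group size still cannot be
    # packed; the utils.py change keeps such layers in 16-bit instead.
    print(is_packable(in_features=100, out_features=256, group_size=32))   # False
```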