diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py
index 34cbb6f94..f7979e269 100644
--- a/auto_round/export/export_to_autoround/qlinear_fp.py
+++ b/auto_round/export/export_to_autoround/qlinear_fp.py
@@ -38,7 +38,7 @@
 from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL
 from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
-from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp
+from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp
 
 # from auto_round.utils import get_weight_compress_dtype
 logger = getLogger(__name__)
@@ -72,14 +72,22 @@ def __init__(
         super().__init__()
         if bits not in [4, 8]:
             raise NotImplementedError("Only 4,8 bits are supported.")
-        if infeatures % 32 != 0 or outfeatures % 32 != 0:
-            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
         self.is_mx = is_mx_fp(data_type)
         self.is_nv = is_nv_fp(data_type)
-        if self.is_mx and group_size != 32:
-            raise NotImplementedError("Only group_size 32 are supported for mxfp.")
-        if self.is_nv and group_size not in [16, 32]:
-            raise NotImplementedError("Only group_size 16 are supported for nvfp.")
+        if self.is_mx:
+            if group_size != 32:
+                raise NotImplementedError(f"Only group_size 32 is supported for the {BackendDataType.MX_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for the {BackendDataType.MX_FP} data type."
+                )
+        if self.is_nv:
+            if group_size % 16 != 0:
+                raise NotImplementedError(f"group_size must be divisible by 16 for the {BackendDataType.NV_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for the {BackendDataType.NV_FP} data type."
+                )
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
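The guards above replace the blanket divisible-by-32 requirement with group-size-aware checks: MXFP requires group_size 32, NVFP requires a group_size that is a multiple of 16, and in both cases in_feature must be divisible by the group size. A minimal standalone sketch of that validation logic follows; validate_fp_shapes is a hypothetical helper (not part of auto_round), and the startswith() checks only approximate is_mx_fp/is_nv_fp.

# Hypothetical helper mirroring the QuantLinear.__init__ checks above; illustrative only.
def validate_fp_shapes(infeatures: int, group_size: int, data_type: str) -> None:
    is_mx = data_type.startswith("mx_fp")  # rough stand-in for is_mx_fp()
    is_nv = data_type.startswith("nv_fp")  # rough stand-in for is_nv_fp()
    if is_mx and group_size != 32:
        raise NotImplementedError("Only group_size 32 is supported for the mx_fp data type.")
    if is_nv and group_size % 16 != 0:
        raise NotImplementedError("group_size must be divisible by 16 for the nv_fp data type.")
    if (is_mx or is_nv) and infeatures % group_size != 0:
        raise NotImplementedError(f"in_feature must be divisible by {group_size}.")

validate_fp_shapes(768, 16, "nv_fp4")    # passes: 768 % 16 == 0
validate_fp_shapes(768, 32, "mx_fp4")    # passes: 768 % 32 == 0
# validate_fp_shapes(760, 32, "mx_fp4")  # would raise: 760 % 32 != 0
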
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index aa74f37e1..4d26eb143 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -127,12 +127,19 @@ def feature_multiply_checker_group_size(
     )
 
 
+def in_feature_checker_group_size(in_feature, out_feature, config):
+    group_size = config["group_size"]
+    return in_feature % group_size == 0
+
+
 feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32)
 feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16)
 in_output_feature_multiply_checker_32 = functools.partial(
     feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
 )
-
+in_feature_multiply_checker_32 = functools.partial(
+    feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None
+)
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
@@ -141,6 +148,8 @@ def feature_multiply_checker_group_size(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
 )
 
+mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)
+
 
 def fp8_static_scheme_checker(
     in_feature: int,
@@ -239,7 +248,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -259,7 +268,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -280,7 +289,7 @@ def fp8_static_scheme_checker(
     act_data_type=["nv_fp4_with_static_gs"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_16],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 8c8c9acc5..6742011fa 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -2963,6 +2963,18 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
                 layer_config.setdefault(n, copy.deepcopy(default_dict))
                 layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
                 logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
+    # enforce shape divisibility for mxfp/nvfp
+    if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
+        for n, m in model.named_modules():
+            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
+                if m.weight.shape[1] % default_dict["group_size"]:
+                    layer_config.setdefault(n, copy.deepcopy(default_dict))
+                    layer_config[n].update(
+                        {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
+                    )
+                    logger.warning_once(
+                        f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
+                    )
     # 9.
block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 180fd8f2f..ea484316b 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - - shutil.rmtree("./saved", ignore_errors=True) - - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=0, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - 
hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - - def test_mxfp8_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP8" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight") - and tmp_layer.weight.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP8 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 - ), f"Invalid MXFP8 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty - assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = 
AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty - assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_save_quantized(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "self_attn": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - compressed_model, _ = autoround.quantize() - assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "q_proj": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - "experts.*2": {"bits": 16, 
"act_bits": 16}, - "experts.*5": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py new file mode 100644 index 000000000..4fcd25135 --- /dev/null +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -0,0 +1,322 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer + +from auto_round import AutoRound + + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRoundFP(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ + self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "self_attn": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + compressed_model, _ = autoround.quantize() + assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "q_proj": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + "experts.*2": {"bits": 16, "act_bits": 16}, + "experts.*5": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + + def test_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, 
format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + + shutil.rmtree("./saved", ignore_errors=True) + + def test_rtn_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + + def test_mxfp8_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP8" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight") + and tmp_layer.weight.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP8 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + 
).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 + ), f"Invalid MXFP8 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + assert ( + 0.15 < folder_size_gb < 0.2 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + assert ( + 0.1 < folder_size_gb < 0.15 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_autoround_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_autoround_save_quantized(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized(output_dir=quantized_model_path, 
format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + def test_qwen_moe_quant_infer(self): + model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 0ab05134f..d6d6c1f93 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -322,116 +322,6 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" - scheme = "mxfp4" - ar = AutoRound( - model=model_name, - iters=2, - seqlen=2, - scheme=scheme, - dataset=self.llm_dataloader, - ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") - tmp_layer = compressed_model.model.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 2048 - ), "Illegal MXFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree(self.save_dir, ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - scheme = "nvfp4" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - 
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - # from vllm import LLM, SamplingParams - # prompts = [ - # "The capital of France is", - # "The future of AI is", - # ] - ## Create a sampling params object. - # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) - # QUANTIZATION = "compressed-tensors" - # llm = LLM(model=quantized_model_path, - # # quantization=QUANTIZATION, - # trust_remote_code=True, - # tensor_parallel_size=1, - # enforce_eager=True, - # gpu_memory_utilization=0.7, - # ) - # outputs = llm.generate(prompts, sampling_params) - # # Print the outputs. - # for output in outputs: - # prompt = output.prompt - # generated_text = output.outputs[0].text - # if "France" in prompt: - # assert "Paris" in generated_text - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py new file mode 100644 index 000000000..48dd27d9b --- /dev/null +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -0,0 +1,171 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "facebook/opt-125m" + self.save_dir = "./saved" + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_fp8input_mxfp4_llmcompressor_format(self): + model_name = "/models/Qwen3-0.6B-FP8" + scheme = "mxfp4" + ar = AutoRound( + model=model_name, + iters=2, + seqlen=2, + 
scheme=scheme, + dataset=self.llm_dataloader, + ) + compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + tmp_layer = compressed_model.model.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 2048 + ), "Illegal MXFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + scheme = "nvfp4" + autoround = AutoRound( + self.model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + # from vllm import LLM, SamplingParams + # prompts = [ + # "The capital of France is", + # "The future of AI is", + # ] + ## Create a sampling params object. + # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) + # QUANTIZATION = "compressed-tensors" + # llm = LLM(model=quantized_model_path, + # # quantization=QUANTIZATION, + # trust_remote_code=True, + # tensor_parallel_size=1, + # enforce_eager=True, + # gpu_memory_utilization=0.7, + # ) + # outputs = llm.generate(prompts, sampling_params) + # # Print the outputs. 
+ # for output in outputs: + # prompt = output.prompt + # generated_text = output.outputs[0].text + # if "France" in prompt: + # assert "Paris" in generated_text + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_qwen_moe_quant_infer(self): + model_name = "/models/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main()
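
The new tests above exercise the user-facing effect of these changes: layers whose in_features is not divisible by the scheme's group size are silently kept in 16-bit instead of failing at export time. A minimal usage sketch under the same API the tests use follows; the model id, output directory, and calibration settings are illustrative assumptions, not values taken from the test suite.

import torch
from auto_round import AutoRound


class ToyDataLoader:
    # Mirrors the LLMDataLoader used in the tests: a couple of dummy token batches.
    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


# RTN-style pass (iters=0) with an NVFP4 scheme, exported in llm_compressor format.
# Layers with in_features not divisible by the group size fall back to 16-bit per the
# utils.py change above; everything else is packed as in the MXFP/NVFP tests.
autoround = AutoRound(
    "facebook/opt-125m",   # assumed model id; any causal LM path works the same way
    scheme="NVFP4",
    iters=0,
    seqlen=2,
    dataset=ToyDataLoader(),
)
autoround.quantize_and_save(output_dir="./saved_nvfp4", format="llm_compressor")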