diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 0bc5b35b3..aa74f37e1 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -107,11 +107,6 @@ class BackendInfo:
     "act_dynamic",
 ]
 
-MX_TENSOR_DATA_TYPES = [
-    "mx_fp",
-    "mx_fp_rceil",
-]
-
 
 def feature_multiply_checker(in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None):
     if out_feature_multiplier is None:
@@ -235,13 +230,13 @@ def fp8_static_scheme_checker(
     packing_format=LLM_COMPRESSOR_FORMAT,
     sym=[True],
     compute_dtype=["float32", "float16", "bfloat16"],
-    data_type=MX_TENSOR_DATA_TYPES,
+    data_type=["mx_fp", "mx_fp_rceil"],
     group_size=[32],
     bits=[8],
     act_bits=[8],
     act_group_size=[32],
     act_sym=[True],
-    act_data_type=MX_TENSOR_DATA_TYPES,
+    act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
     checkers=[feature_multiply_checker_32],
@@ -255,13 +250,13 @@ def fp8_static_scheme_checker(
     packing_format=LLM_COMPRESSOR_FORMAT,
     sym=[True],
     compute_dtype=["float32", "float16", "bfloat16"],
-    data_type=MX_TENSOR_DATA_TYPES,
+    data_type=["mx_fp"],
     group_size=[32],
     bits=[4],
     act_bits=[4],
     act_group_size=[32],
     act_sym=[True],
-    act_data_type=MX_TENSOR_DATA_TYPES,
+    act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
     checkers=[feature_multiply_checker_32],
diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py
index d67329c2d..637c84146 100644
--- a/auto_round/testing_utils.py
+++ b/auto_round/testing_utils.py
@@ -268,11 +268,3 @@ def decorator(test_func: Callable) -> Callable:
         return unittest.skipUnless(require_package_version(package, version_spec, on_fail="skip"), reason)(test_func)
 
     return decorator
-
-
-def has_module(model: torch.nn.Module, target_module_type: torch.nn.Module) -> bool:
-    """Check if the model contains a specific module type."""
-    for _, module in model.named_modules():
-        if isinstance(module, target_module_type):
-            return True
-    return False
diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py
deleted file mode 100644
index 374ccf5ce..000000000
--- a/test/test_cpu/test_mxfp_save_load.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import shutil
-import tempfile
-
-import pytest
-import torch
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
-
-from auto_round import AutoRound
-from auto_round import schemes as ar_schemes
-from auto_round.experimental import qmodules as ar_qmodules
-from auto_round.export.export_to_autoround import AutoRoundFormat
-from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp
-from auto_round.inference.backend import MX_TENSOR_DATA_TYPES
-from auto_round.testing_utils import has_module
-
-testing_scheme_name_lst = [
-    AutoRoundFormat.MXFP8.value,
-    AutoRoundFormat.MXFP4.value,
-]
-QMODULE_MAPPING = {
-    AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear,
-    AutoRoundFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear,
-}
-SCHEMES_MAPPING = {
-    AutoRoundFormat.MXFP8.value: ar_schemes.MXFP8,
-    AutoRoundFormat.MXFP4.value: ar_schemes.MXFP4,
-}
-
-
-@pytest.mark.parametrize("scheme_name", testing_scheme_name_lst)
-@pytest.mark.parametrize("weight_data_type", MX_TENSOR_DATA_TYPES)
-@pytest.mark.parametrize("act_data_type", MX_TENSOR_DATA_TYPES)
-@torch.inference_mode()
-def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type):
-    # Use a temporary directory for saving the quantized model
-    with tempfile.TemporaryDirectory() as temp_dir:
-        model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct"
-        config = AutoConfig.from_pretrained(model_name)
-        config.num_hidden_layers = 2  # Use a smaller model for testing
-
-        # Load the tokenizer and model
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model = Qwen2ForCausalLM(config)
-        scheme = SCHEMES_MAPPING[scheme_name]
-        scheme.data_type = weight_data_type
-        scheme.act_data_type = act_data_type
-        # Initialize AutoRound for quantization
-        autoround = AutoRound(
-            model,
-            tokenizer,
-            scheme=scheme,
-            iters=0,
-            nsamples=2,
-        )
-
-        # Quantize and save the model to the temporary directory
-        quantized_model_path = f"{temp_dir}/tmp_autoround"
-        autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
-
-        # Perform inference with the quantized model
-        model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path,
-            torch_dtype="auto",
-        )
-        model.eval()
-        assert has_module(
-            model, QMODULE_MAPPING[scheme_name]
-        ), f"Expected {QMODULE_MAPPING[scheme_name].__name__} in the model."
diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py
index 0dc43b093..d15cde5be 100644
--- a/test/test_cuda/test_mxfp_and_nvfp_quant.py
+++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py
@@ -10,7 +10,6 @@
 from auto_round.experimental import qmodules as ar_qmodules
 from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp
-from auto_round.testing_utils import has_module
 
 testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value]
 QMODULE_MAPPING = {
@@ -20,6 +19,14 @@
 }
 
 
+def has_module(model: torch.nn.Module, target_module_type: torch.nn.Module) -> bool:
+    """Check if the model contains a specific module type."""
+    for _, module in model.named_modules():
+        if isinstance(module, target_module_type):
+            return True
+    return False
+
+
 @pytest.mark.parametrize("scheme", testing_schemes)
 @torch.inference_mode()
 def test_e2e_quant_and_infer(scheme):