diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py
index 34cbb6f94..f7979e269 100644
--- a/auto_round/export/export_to_autoround/qlinear_fp.py
+++ b/auto_round/export/export_to_autoround/qlinear_fp.py
@@ -38,7 +38,7 @@
 from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL
 from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
-from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp
+from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp
 
 # from auto_round.utils import get_weight_compress_dtype
 logger = getLogger(__name__)
@@ -72,14 +72,22 @@ def __init__(
         super().__init__()
         if bits not in [4, 8]:
             raise NotImplementedError("Only 4,8 bits are supported.")
-        if infeatures % 32 != 0 or outfeatures % 32 != 0:
-            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
         self.is_mx = is_mx_fp(data_type)
         self.is_nv = is_nv_fp(data_type)
-        if self.is_mx and group_size != 32:
-            raise NotImplementedError("Only group_size 32 are supported for mxfp.")
-        if self.is_nv and group_size not in [16, 32]:
-            raise NotImplementedError("Only group_size 16 are supported for nvfp.")
+        if self.is_mx:
+            if group_size != 32:
+                raise NotImplementedError(f"Only group_size 32 is supported for the {BackendDataType.MX_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for the {BackendDataType.MX_FP} data type."
+                )
+        if self.is_nv:
+            if group_size % 16 != 0:
+                raise NotImplementedError(f"group_size must be divisible by 16 for the {BackendDataType.NV_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for the {BackendDataType.NV_FP} data type."
+                )
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
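The guards above replace the blanket divisible-by-32 requirement with group-size-aware checks: MXFP requires group_size 32, NVFP requires a group_size that is a multiple of 16, and in both cases in_feature must be divisible by the group size. A minimal standalone sketch of that validation logic follows; validate_fp_shapes is a hypothetical helper (not part of auto_round), and the startswith() checks only approximate is_mx_fp/is_nv_fp.

# Hypothetical helper mirroring the QuantLinear.__init__ checks above; illustrative only.
def validate_fp_shapes(infeatures: int, group_size: int, data_type: str) -> None:
    is_mx = data_type.startswith("mx_fp")  # rough stand-in for is_mx_fp()
    is_nv = data_type.startswith("nv_fp")  # rough stand-in for is_nv_fp()
    if is_mx and group_size != 32:
        raise NotImplementedError("Only group_size 32 is supported for the mx_fp data type.")
    if is_nv and group_size % 16 != 0:
        raise NotImplementedError("group_size must be divisible by 16 for the nv_fp data type.")
    if (is_mx or is_nv) and infeatures % group_size != 0:
        raise NotImplementedError(f"in_feature must be divisible by {group_size}.")

validate_fp_shapes(768, 16, "nv_fp4")    # passes: 768 % 16 == 0
validate_fp_shapes(768, 32, "mx_fp4")    # passes: 768 % 32 == 0
# validate_fp_shapes(760, 32, "mx_fp4")  # would raise: 760 % 32 != 0
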
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index aa74f37e1..4d26eb143 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -127,12 +127,19 @@ def feature_multiply_checker_group_size(
     )
 
 
+def in_feature_checker_group_size(in_feature, out_feature, config):
+    group_size = config["group_size"]
+    return in_feature % group_size == 0
+
+
 feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32)
 feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16)
 in_output_feature_multiply_checker_32 = functools.partial(
     feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
 )
-
+in_feature_multiply_checker_32 = functools.partial(
+    feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None
+)
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
@@ -141,6 +148,8 @@ def feature_multiply_checker_group_size(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
 )
 
+mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)
+
 
 def fp8_static_scheme_checker(
     in_feature: int,
@@ -239,7 +248,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -259,7 +268,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -280,7 +289,7 @@ def fp8_static_scheme_checker(
     act_data_type=["nv_fp4_with_static_gs"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_16],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 8c8c9acc5..6742011fa 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -2963,6 +2963,18 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
                 layer_config.setdefault(n, copy.deepcopy(default_dict))
                 layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
                 logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
+    # enforce shape divisibility for mxfp/nvfp
+    if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
+        for n, m in model.named_modules():
+            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
+                if m.weight.shape[1] % default_dict["group_size"]:
+                    layer_config.setdefault(n, copy.deepcopy(default_dict))
+                    layer_config[n].update(
+                        {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
+                    )
+                    logger.warning_once(
+                        f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
+                    )
     # 9.
block layers: mark as in_blocks=True for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 180fd8f2f..ea484316b 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - - shutil.rmtree("./saved", ignore_errors=True) - - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP4" - layer_config = {} - fp_layers_str = "k_proj" - from auto_round.utils import get_fp_layer_names - - not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str) - for name in not_quantize_layer_names: - layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"} - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=0, - seqlen=2, - layer_config=layer_config, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized( - output_dir=quantized_model_path, inplace=True, format="llm_compressor" - ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj - assert ( - 
hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP4 packing name or data_type or shape" - assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers - skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - - def test_mxfp8_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "MXFP8" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight") - and tmp_layer.weight.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal MXFP8 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 - ), f"Invalid MXFP8 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty - assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = 
AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty - assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_format(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_autoround_save_quantized(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - from transformers import AutoConfig - - scheme = "NVFP4" - autoround = AutoRound( - model, - self.tokenizer, - scheme="NVFP4", - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - shutil.rmtree("./saved", ignore_errors=True) - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "self_attn": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - compressed_model, _ = autoround.quantize() - assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" - layer_config = { - "q_proj": {"bits": 16, "act_bits": 16}, - "mlp.shared_experts": {"bits": 16, "act_bits": 16}, - "experts.*2": {"bits": 16, 
"act_bits": 16}, - "experts.*5": {"bits": 16, "act_bits": 16}, - } - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - layer_config=layer_config, - ) - autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py new file mode 100644 index 000000000..4fcd25135 --- /dev/null +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -0,0 +1,322 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer + +from auto_round import AutoRound + + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRoundFP(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ + self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "self_attn": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + compressed_model, _ = autoround.quantize() + assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + layer_config = { + "q_proj": {"bits": 16, "act_bits": 16}, + "mlp.shared_experts": {"bits": 16, "act_bits": 16}, + "experts.*2": {"bits": 16, "act_bits": 16}, + "experts.*5": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + + def test_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, 
format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + + shutil.rmtree("./saved", ignore_errors=True) + + def test_rtn_mxfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP4" + layer_config = {"k_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}} + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized( + output_dir=quantized_model_path, inplace=True, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP4 packing name or data_type or shape" + assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers + skip_layer, "weight_packed" + ), "Illegal MXFP4 quantization for fp_layers" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + + def test_mxfp8_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "MXFP8" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight") + and tmp_layer.weight.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal MXFP8 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + 
).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 + ), f"Invalid MXFP8 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + assert ( + 0.15 < folder_size_gb < 0.2 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + folder_size_gb = _get_folder_size(quantized_model_path) + # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + assert ( + 0.1 < folder_size_gb < 0.15 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_autoround_format(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + def test_nvfp4_autoround_save_quantized(self): + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + from transformers import AutoConfig + + scheme = "NVFP4" + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + autoround.quantize() + compressed_model = autoround.save_quantized(output_dir=quantized_model_path, 
format="auto_round") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + shutil.rmtree("./saved", ignore_errors=True) + + def test_qwen_moe_quant_infer(self): + model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 0ab05134f..d6d6c1f93 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -322,116 +322,6 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" - scheme = "mxfp4" - ar = AutoRound( - model=model_name, - iters=2, - seqlen=2, - scheme=scheme, - dataset=self.llm_dataloader, - ) - compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") - tmp_layer = compressed_model.model.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_packed") - and tmp_layer.weight_scale.dtype is torch.uint8 - and tmp_layer.weight_scale.shape[0] == 2048 - ), "Illegal MXFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config - assert ( - quantization_config["format"] == "float-quantized" - and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True - and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 - ), f"Invalid MXFP4 quantization configuration: {quantization_config}" - shutil.rmtree(self.save_dir, ignore_errors=True) - - def test_nvfp4_llmcompressor_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - scheme = "nvfp4" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - 
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - assert ( - hasattr(tmp_layer, "weight_scale") - and hasattr(tmp_layer, "weight_global_scale") - and hasattr(tmp_layer, "input_global_scale") - and tmp_layer.weight_packed.dtype is torch.uint8 - and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn - and tmp_layer.weight_scale.shape[0] == 768 - ), "Illegal NVFP4 packing name or data_type or shape" - quantization_config = AutoConfig.from_pretrained( - quantized_model_path, trust_remote_code=True - ).quantization_config - assert ( - quantization_config["format"] == "nvfp4-pack-quantized" - and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 - ), f"Invalid NVFP4 quantization configuration: {quantization_config}" - shutil.rmtree("./saved", ignore_errors=True) - # from vllm import LLM, SamplingParams - # prompts = [ - # "The capital of France is", - # "The future of AI is", - # ] - ## Create a sampling params object. - # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) - # QUANTIZATION = "compressed-tensors" - # llm = LLM(model=quantized_model_path, - # # quantization=QUANTIZATION, - # trust_remote_code=True, - # tensor_parallel_size=1, - # enforce_eager=True, - # gpu_memory_utilization=0.7, - # ) - # outputs = llm.generate(prompts, sampling_params) - # # Print the outputs. - # for output in outputs: - # prompt = output.prompt - # generated_text = output.outputs[0].text - # if "France" in prompt: - # assert "Paris" in generated_text - - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=0, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" - scheme = "nvfp4" - autoround = AutoRound( - model_name, - scheme=scheme, - iters=1, - seqlen=2, - nsamples=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = self.save_dir - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - if __name__ == "__main__": unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py new file mode 100644 index 000000000..48dd27d9b --- /dev/null +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -0,0 +1,171 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") +import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound +from auto_round.testing_utils import require_awq, require_optimum + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.model_name = "facebook/opt-125m" + self.save_dir = "./saved" + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_fp8input_mxfp4_llmcompressor_format(self): + model_name = "/models/Qwen3-0.6B-FP8" + scheme = "mxfp4" + ar = AutoRound( + model=model_name, + iters=2, + seqlen=2, + 
scheme=scheme, + dataset=self.llm_dataloader, + ) + compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + tmp_layer = compressed_model.model.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_packed") + and tmp_layer.weight_scale.dtype is torch.uint8 + and tmp_layer.weight_scale.shape[0] == 2048 + ), "Illegal MXFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained(self.save_dir, trust_remote_code=True).quantization_config + assert ( + quantization_config["format"] == "float-quantized" + and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True + and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4 + ), f"Invalid MXFP4 quantization configuration: {quantization_config}" + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_nvfp4_llmcompressor_format(self): + scheme = "nvfp4" + autoround = AutoRound( + self.model_name, + scheme=scheme, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + assert ( + hasattr(tmp_layer, "weight_scale") + and hasattr(tmp_layer, "weight_global_scale") + and hasattr(tmp_layer, "input_global_scale") + and tmp_layer.weight_packed.dtype is torch.uint8 + and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn + and tmp_layer.weight_scale.shape[0] == 768 + ), "Illegal NVFP4 packing name or data_type or shape" + quantization_config = AutoConfig.from_pretrained( + quantized_model_path, trust_remote_code=True + ).quantization_config + assert ( + quantization_config["format"] == "nvfp4-pack-quantized" + and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 + ), f"Invalid NVFP4 quantization configuration: {quantization_config}" + shutil.rmtree("./saved", ignore_errors=True) + # from vllm import LLM, SamplingParams + # prompts = [ + # "The capital of France is", + # "The future of AI is", + # ] + ## Create a sampling params object. + # sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) + # QUANTIZATION = "compressed-tensors" + # llm = LLM(model=quantized_model_path, + # # quantization=QUANTIZATION, + # trust_remote_code=True, + # tensor_parallel_size=1, + # enforce_eager=True, + # gpu_memory_utilization=0.7, + # ) + # outputs = llm.generate(prompts, sampling_params) + # # Print the outputs. 
+ # for output in outputs: + # prompt = output.prompt + # generated_text = output.outputs[0].text + # if "France" in prompt: + # assert "Paris" in generated_text + + def test_nvfp4_moe_actmax_rtn(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_nvfp4_moe_actmax_ar(self): + model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + def test_qwen_moe_quant_infer(self): + model_name = "/models/Qwen1.5-MoE-A2.7B" + layer_config = { + "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + } + scheme = "nvfp4" + autoround = AutoRound( + model_name, + scheme=scheme, + iters=1, + seqlen=2, + nsamples=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + from auto_round.eval.evaluation import simple_evaluate_user_model + + result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") + print(result["results"]["piqa"]["acc,none"]) + self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main()
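
The new tests above exercise the user-facing effect of these changes: layers whose in_features is not divisible by the scheme's group size are silently kept in 16-bit instead of failing at export time. A minimal usage sketch under the same API the tests use follows; the model id, output directory, and calibration settings are illustrative assumptions, not values taken from the test suite.

import torch
from auto_round import AutoRound


class ToyDataLoader:
    # Mirrors the LLMDataLoader used in the tests: a couple of dummy token batches.
    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


# RTN-style pass (iters=0) with an NVFP4 scheme, exported in llm_compressor format.
# Layers with in_features not divisible by the group size fall back to 16-bit per the
# utils.py change above; everything else is packed as in the MXFP/NVFP tests.
autoround = AutoRound(
    "facebook/opt-125m",   # assumed model id; any causal LM path works the same way
    scheme="NVFP4",
    iters=0,
    seqlen=2,
    dataset=ToyDataLoader(),
)
autoround.quantize_and_save(output_dir="./saved_nvfp4", format="llm_compressor")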