From 719e5abb36cab9ae1d6c184e66c45ae62fca7276 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 9 Oct 2025 15:31:54 +0800 Subject: [PATCH 1/6] fp8 exporting bugfix Signed-off-by: Zhang, Weiwei1 --- auto_round/export/export_to_autoround/export_to_fp8.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 7f069cb60..1f6cdbc65 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -109,11 +109,9 @@ def pack_layer(layer_name, model, data_type, device=None): torch_dtype = torch.float8_e5m2 info = torch.finfo(torch_dtype) if zp is not None: - q_weight = ( - weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp.to(packing_device) - if isinstance(zp, torch.Tensor) - else zp - ) + if isinstance(zp, torch.Tensor): + zp = zp.to(packing_device) + q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp else: q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) @@ -235,3 +233,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + From 045d3a2a477ce9248671a58e034d086d8913cdcf Mon Sep 17 00:00:00 2001 From: Weiwei Date: Wed, 5 Nov 2025 14:49:15 +0800 Subject: [PATCH 2/6] Update mxnv_acc.md --- docs/mxnv_acc.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/mxnv_acc.md b/docs/mxnv_acc.md index cb764fa58..24ee865af 100644 --- a/docs/mxnv_acc.md +++ b/docs/mxnv_acc.md @@ -3,13 +3,13 @@ Average accuracy of hellaswag,lambada_openai,mmlu,piqa,winogrande. We evaluated using a fake model since we currently have no access to devices for running the real models. However, we have verified that in most cases the fake model closely matches the real model. 
| mxfp4 g32 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | -|-------------------|----------------------|--------------------|---------|-----------| -| RTN | 0.62124 | 0.65502 | 0.71674 | 0.69006 | -| AutoRound | 0.66862 | 0.67588 | 0.72472 | 0.72106 | -| AutoRound+alg_ext | 0.6732 | 0.68094 | 0.72252 | 0.72012 | +|:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| +| RTN | 0.6212 | 0.6550 | 0.7167 | 0.6901 | +| AutoRound | 0.6686 | 0.6758 | 0.7247 | 0.7211 | +| AutoRound+alg_ext | 0.6732 | 0.6809 | 0.7225 | 0.7201 | | nvfp4 g16 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | -|-------------------|----------------------|--------------------|---------|-----------| -| RTN | 0.68756 | 0.6906 | 0.72962 | 0.71636 | -| AutoRound | 0.69184 | 0.69728 | 0.73058 | 0.73062 | -| AutoRound+alg_ext | 0.69648 | 0.6989 | 0.7318 | 0.72948 | \ No newline at end of file +|:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| +| RTN | 0.6876 | 0.6906 | 0.7296 | 0.7164 | +| AutoRound | 0.6918 | 0.6973 | 0.7306 | 0.7306 | +| AutoRound+alg_ext | 0.6965 | 0.6989 | 0.7318 | 0.7295 | From c2daa79099713a7fcdd45f8a97f3fd5e7170c711 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 6 Nov 2025 16:52:46 +0800 Subject: [PATCH 3/6] refine exllama backend cuda UT Signed-off-by: Zhang, Weiwei1 --- test/test_cuda/test_exllamav2_backend.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 5c12e0557..38905e9bd 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -12,7 +12,7 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel +from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut class LLMDataLoader: @@ -24,7 +24,7 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundMarlinBackend(unittest.TestCase): +class TestAutoRoundexllamaBackend(unittest.TestCase): @classmethod def setUpClass(self): @@ -99,6 +99,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -130,6 +131,7 @@ def test_gptq_exllamav2_4bits_sym(self): shutil.rmtree(self.save_folder, ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") @@ -166,3 +168,4 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() + From 018bb0fe86ae72a22b00212b20ca71eea6a93b1a Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 6 Nov 2025 17:01:33 +0800 Subject: [PATCH 4/6] refine md tables Signed-off-by: Zhang, Weiwei1 --- docs/alg_202508.md | 10 +++++----- docs/auto_scheme_acc.md | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/alg_202508.md 
b/docs/alg_202508.md index 086cd5cf6..58069d870 100644 --- a/docs/alg_202508.md +++ b/docs/alg_202508.md @@ -4,11 +4,11 @@ in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. | Qwen3-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | -|-------------------|--------|---------------|-----------|--------|----------------|--------|---------|----------------|------------| -| AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.263 | 0.3072 | 0.6314 | +|:-------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| +| AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.2630 | 0.3072 | 0.6314 | | AutoRound+alg_ext | 0.4787 | 0.4275 | 0.4516 | 0.5944 | 0.5181 | 0.5773 | 0.2807 | 0.3305 | 0.6496 | | Llama3.1-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | -|-------------------|--------|---------------|-----------|--------|----------------|--------|---------|----------------|------------| -| AutoRound | 0.382 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | -| AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | \ No newline at end of file +|:-------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| +| AutoRound | 0.3820 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | +| AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | diff --git a/docs/auto_scheme_acc.md b/docs/auto_scheme_acc.md index cdf481d69..522a5f607 100644 --- a/docs/auto_scheme_acc.md +++ b/docs/auto_scheme_acc.md @@ -13,7 +13,7 @@ For mxfp experiment, we use fake model while for weight only model we use real m ### Table 1 MXFP4/8 mixed accuracy. | Average bits | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | Qwen3-32B | -|------------------|----------------|----------------|----------------|----------------| +|:------------------|:----------------:|:----------------:|:----------------:|:----------------:| | **BF16** | 0.7076 (100%) | 0.7075 (100%) | 0.6764 (100%) | 0.7321 (100%) | | **Pure 4-bit** | 0.6626 (93.6%) | 0.6550 (92.6%) | 0.6316 (93.4%) | 0.6901 (94.3%) | | **Ours 4.5-bit** | 0.6808 (96.2%) | 0.6776 (95.8%) | 0.6550 (96.8%) | 0.7176 (98.0%) | @@ -27,7 +27,7 @@ performance advantages. ### Table 2 Comparison with other recipes at an average of 5 bits of mxfp datatype | Avg. bits = 5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|-------------------:|-------------------:|-------------------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 8-bit** | 0.6671 (94.3%) | 0.6616 (93.5%) | 0.6410 (94.8%) | | **Head layers 8-bit** | 0.6657 (94.1%) | 0.6686 (94.5%) | 0.6356 (94.0%) | | **Ours** | **0.6857 (96.9%)** | **0.6823 (96.4%)** | **0.6594 (97.5%)** | @@ -35,7 +35,7 @@ performance advantages. ### Table 3 Comparison with other recipes at an average of 4.5 bits of mxfp datatype | Avg. 
bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|-------------------:|-------------------:|-------------------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 8-bit** | 0.6614 (93.5%) | 0.6535 (92.4%) | 0.6373 (94.2%) | | **Head layers 8-bit** | 0.6568 (92.8%) | 0.6642 (93.9%) | 0.6305 (93.2%) | | **Ours** | **0.6808 (96.2%)** | **0.6776 (95.5%)** | **0.6550 (95.8%)** | @@ -44,7 +44,7 @@ performance advantages. ### Table4 Comparison with other recipes at an average of 3 bits of W2G128 and W4G128 | Avg. bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|--------------:|-------------:|---------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 4-bit** | 0.6058 | 0.3798 | 0.4536 | | **Head layers 4-bit** | 0.3198 | 0.3270 | 0.3196 | -| **Ours** | 0.6148 | 0.4058 | 0.4862 | \ No newline at end of file +| **Ours** | 0.6148 | 0.4058 | 0.4862 | From a63077118a4e7c71d5ce4b01130d7ec1a9f0431d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:02:13 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_autoround/export_to_fp8.py | 1 - test/test_cuda/test_exllamav2_backend.py | 1 - 2 files changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 88f6d750c..8b8a618e2 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -228,4 +228,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 38905e9bd..c489b37b2 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -168,4 +168,3 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() - From 008e4809d5a524ccd62ed604bb9ff846d87ac7ba Mon Sep 17 00:00:00 2001 From: Weiwei Date: Thu, 6 Nov 2025 17:06:17 +0800 Subject: [PATCH 6/6] revert typo --- test/test_cuda/test_exllamav2_backend.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..5c12e0557 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -12,7 +12,7 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut +from auto_round.testing_utils import require_autogptq, require_gptqmodel class LLMDataLoader: @@ -24,7 +24,7 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundexllamaBackend(unittest.TestCase): +class TestAutoRoundMarlinBackend(unittest.TestCase): @classmethod def setUpClass(self): @@ -99,7 +99,6 @@ def test_gptqmodel_exllmav2_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_autogptq - @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, 
torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -131,7 +130,6 @@ def test_gptq_exllamav2_4bits_sym(self): shutil.rmtree(self.save_folder, ignore_errors=True) @require_autogptq - @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!")
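
The functional change in PATCH 1/6 is the zero-point handling inside pack_layer. Below is a minimal standalone Python sketch of that quantization step, assuming hypothetical shapes; the scale computation and the final clamp/cast are illustrative additions, not taken from the patch, and only the zp branch mirrors the patched control flow, which moves a tensor zero point to the packing device before adding it.

import torch

# Illustrative setup (not from the patch): per-output-channel FP8 quantization.
packing_device = "cpu"
torch_dtype = torch.float8_e4m3fn
info = torch.finfo(torch_dtype)

weight = torch.randn(8, 16)                    # [out_features, in_features]
scale = weight.abs().amax(dim=1) / info.max    # hypothetical per-channel scale
zp = torch.zeros(8, 1)                         # may be a Tensor, a float, or None

# The patched branch: only a Tensor zero point needs to be moved to the device;
# a plain float zero point is usable as-is.
if zp is not None:
    if isinstance(zp, torch.Tensor):
        zp = zp.to(packing_device)
    q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp
else:
    q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)

# Illustrative finish: clamp to the FP8 representable range and cast.
q_weight = torch.clamp(q_weight, info.min, info.max).to(torch_dtype)
print(q_weight.dtype, q_weight.shape)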