From 719e5abb36cab9ae1d6c184e66c45ae62fca7276 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 9 Oct 2025 15:31:54 +0800 Subject: [PATCH 1/6] fp8 exporting bugfix Signed-off-by: Zhang, Weiwei1 --- auto_round/export/export_to_autoround/export_to_fp8.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 7f069cb60..1f6cdbc65 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -109,11 +109,9 @@ def pack_layer(layer_name, model, data_type, device=None): torch_dtype = torch.float8_e5m2 info = torch.finfo(torch_dtype) if zp is not None: - q_weight = ( - weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp.to(packing_device) - if isinstance(zp, torch.Tensor) - else zp - ) + if isinstance(zp, torch.Tensor): + zp = zp.to(packing_device) + q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp else: q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) @@ -235,3 +233,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + From 045d3a2a477ce9248671a58e034d086d8913cdcf Mon Sep 17 00:00:00 2001 From: Weiwei Date: Wed, 5 Nov 2025 14:49:15 +0800 Subject: [PATCH 2/6] Update mxnv_acc.md --- docs/mxnv_acc.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/mxnv_acc.md b/docs/mxnv_acc.md index cb764fa58..24ee865af 100644 --- a/docs/mxnv_acc.md +++ b/docs/mxnv_acc.md @@ -3,13 +3,13 @@ Average accuracy of hellaswag,lambada_openai,mmlu,piqa,winogrande. We evaluated using a fake model since we currently have no access to devices for running the real models. However, we have verified that in most cases the fake model closely matches the real model. 
| mxfp4 g32 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | -|-------------------|----------------------|--------------------|---------|-----------| -| RTN | 0.62124 | 0.65502 | 0.71674 | 0.69006 | -| AutoRound | 0.66862 | 0.67588 | 0.72472 | 0.72106 | -| AutoRound+alg_ext | 0.6732 | 0.68094 | 0.72252 | 0.72012 | +|:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| +| RTN | 0.6212 | 0.6550 | 0.7167 | 0.6901 | +| AutoRound | 0.6686 | 0.6758 | 0.7247 | 0.7211 | +| AutoRound+alg_ext | 0.6732 | 0.6809 | 0.7225 | 0.7201 | | nvfp4 g16 | llama3.1-8B-Instruct | Qwen2-7.5-Instruct | Phi4 | Qwen3-32B | -|-------------------|----------------------|--------------------|---------|-----------| -| RTN | 0.68756 | 0.6906 | 0.72962 | 0.71636 | -| AutoRound | 0.69184 | 0.69728 | 0.73058 | 0.73062 | -| AutoRound+alg_ext | 0.69648 | 0.6989 | 0.7318 | 0.72948 | \ No newline at end of file +|:-------------------|:----------------------:|:--------------------:|:---------:|:-----------:| +| RTN | 0.6876 | 0.6906 | 0.7296 | 0.7164 | +| AutoRound | 0.6918 | 0.6973 | 0.7306 | 0.7306 | +| AutoRound+alg_ext | 0.6965 | 0.6989 | 0.7318 | 0.7295 | From c2daa79099713a7fcdd45f8a97f3fd5e7170c711 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 6 Nov 2025 16:52:46 +0800 Subject: [PATCH 3/6] refine exllama backend cuda UT Signed-off-by: Zhang, Weiwei1 --- test/test_cuda/test_exllamav2_backend.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 5c12e0557..38905e9bd 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -12,7 +12,7 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel +from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut class LLMDataLoader: @@ -24,7 +24,7 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundMarlinBackend(unittest.TestCase): +class TestAutoRoundexllamaBackend(unittest.TestCase): @classmethod def setUpClass(self): @@ -99,6 +99,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -130,6 +131,7 @@ def test_gptq_exllamav2_4bits_sym(self): shutil.rmtree(self.save_folder, ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") @@ -166,3 +168,4 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() + From 018bb0fe86ae72a22b00212b20ca71eea6a93b1a Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 6 Nov 2025 17:01:33 +0800 Subject: [PATCH 4/6] refine md tables Signed-off-by: Zhang, Weiwei1 --- docs/alg_202508.md | 10 +++++----- docs/auto_scheme_acc.md | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/alg_202508.md 
b/docs/alg_202508.md index 086cd5cf6..58069d870 100644 --- a/docs/alg_202508.md +++ b/docs/alg_202508.md @@ -4,11 +4,11 @@ in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval. | Qwen3-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | -|-------------------|--------|---------------|-----------|--------|----------------|--------|---------|----------------|------------| -| AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.263 | 0.3072 | 0.6314 | +|:-------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| +| AutoRound | 0.4373 | 0.4019 | 0.4437 | 0.4215 | 0.4826 | 0.5474 | 0.2630 | 0.3072 | 0.6314 | | AutoRound+alg_ext | 0.4787 | 0.4275 | 0.4516 | 0.5944 | 0.5181 | 0.5773 | 0.2807 | 0.3305 | 0.6496 | | Llama3.1-8B W2G64 | Avg. | arc_challenge | hellaswag | gsm8k | lambada_openai | mmlu | mmlupro | truthfulqa_mc1 | winogrande | -|-------------------|--------|---------------|-----------|--------|----------------|--------|---------|----------------|------------| -| AutoRound | 0.382 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | -| AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | \ No newline at end of file +|:-------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:| +| AutoRound | 0.3820 | 0.3635 | 0.4562 | 0.1622 | 0.5069 | 0.4411 | 0.1661 | 0.3207 | 0.6393 | +| AutoRound+alg_ext | 0.4166 | 0.3712 | 0.4729 | 0.2039 | 0.5946 | 0.4981 | 0.2163 | 0.3011 | 0.6748 | diff --git a/docs/auto_scheme_acc.md b/docs/auto_scheme_acc.md index cdf481d69..522a5f607 100644 --- a/docs/auto_scheme_acc.md +++ b/docs/auto_scheme_acc.md @@ -13,7 +13,7 @@ For mxfp experiment, we use fake model while for weight only model we use real m ### Table 1 MXFP4/8 mixed accuracy. | Average bits | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | Qwen3-32B | -|------------------|----------------|----------------|----------------|----------------| +|:------------------|:----------------:|:----------------:|:----------------:|:----------------:| | **BF16** | 0.7076 (100%) | 0.7075 (100%) | 0.6764 (100%) | 0.7321 (100%) | | **Pure 4-bit** | 0.6626 (93.6%) | 0.6550 (92.6%) | 0.6316 (93.4%) | 0.6901 (94.3%) | | **Ours 4.5-bit** | 0.6808 (96.2%) | 0.6776 (95.8%) | 0.6550 (96.8%) | 0.7176 (98.0%) | @@ -27,7 +27,7 @@ performance advantages. ### Table 2 Comparison with other recipes at an average of 5 bits of mxfp datatype | Avg. bits = 5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|-------------------:|-------------------:|-------------------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 8-bit** | 0.6671 (94.3%) | 0.6616 (93.5%) | 0.6410 (94.8%) | | **Head layers 8-bit** | 0.6657 (94.1%) | 0.6686 (94.5%) | 0.6356 (94.0%) | | **Ours** | **0.6857 (96.9%)** | **0.6823 (96.4%)** | **0.6594 (97.5%)** | @@ -35,7 +35,7 @@ performance advantages. ### Table 3 Comparison with other recipes at an average of 4.5 bits of mxfp datatype | Avg. 
bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|-------------------:|-------------------:|-------------------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 8-bit** | 0.6614 (93.5%) | 0.6535 (92.4%) | 0.6373 (94.2%) | | **Head layers 8-bit** | 0.6568 (92.8%) | 0.6642 (93.9%) | 0.6305 (93.2%) | | **Ours** | **0.6808 (96.2%)** | **0.6776 (95.5%)** | **0.6550 (95.8%)** | @@ -44,7 +44,7 @@ performance advantages. ### Table4 Comparison with other recipes at an average of 3 bits of W2G128 and W4G128 | Avg. bits = 4.5 | Llama3.1-8B-I | Qwen2.5-7B-I | Qwen3-8B | -|-----------------------|--------------:|-------------:|---------:| +|:------------------|:----------------:|:----------------:|:----------------:| | **Tail layers 4-bit** | 0.6058 | 0.3798 | 0.4536 | | **Head layers 4-bit** | 0.3198 | 0.3270 | 0.3196 | -| **Ours** | 0.6148 | 0.4058 | 0.4862 | \ No newline at end of file +| **Ours** | 0.6148 | 0.4058 | 0.4862 | From a63077118a4e7c71d5ce4b01130d7ec1a9f0431d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:02:13 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_autoround/export_to_fp8.py | 1 - test/test_cuda/test_exllamav2_backend.py | 1 - 2 files changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 88f6d750c..8b8a618e2 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -228,4 +228,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 38905e9bd..c489b37b2 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -168,4 +168,3 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() - From 008e4809d5a524ccd62ed604bb9ff846d87ac7ba Mon Sep 17 00:00:00 2001 From: Weiwei Date: Thu, 6 Nov 2025 17:06:17 +0800 Subject: [PATCH 6/6] revert typo --- test/test_cuda/test_exllamav2_backend.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..5c12e0557 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -12,7 +12,7 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut +from auto_round.testing_utils import require_autogptq, require_gptqmodel class LLMDataLoader: @@ -24,7 +24,7 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundexllamaBackend(unittest.TestCase): +class TestAutoRoundMarlinBackend(unittest.TestCase): @classmethod def setUpClass(self): @@ -99,7 +99,6 @@ def test_gptqmodel_exllmav2_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_autogptq - @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, 
torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -131,7 +130,6 @@ def test_gptq_exllamav2_4bits_sym(self): shutil.rmtree(self.save_folder, ignore_errors=True) @require_autogptq - @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!")
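
The functional change in PATCH 1/6 is the zero-point handling inside pack_layer. Below is a minimal standalone Python sketch of that quantization step, assuming hypothetical shapes; the scale computation and the final clamp/cast are illustrative additions, not taken from the patch, and only the zp branch mirrors the patched control flow, which moves a tensor zero point to the packing device before adding it.

import torch

# Illustrative setup (not from the patch): per-output-channel FP8 quantization.
packing_device = "cpu"
torch_dtype = torch.float8_e4m3fn
info = torch.finfo(torch_dtype)

weight = torch.randn(8, 16)                    # [out_features, in_features]
scale = weight.abs().amax(dim=1) / info.max    # hypothetical per-channel scale
zp = torch.zeros(8, 1)                         # may be a Tensor, a float, or None

# The patched branch: only a Tensor zero point needs to be moved to the device;
# a plain float zero point is usable as-is.
if zp is not None:
    if isinstance(zp, torch.Tensor):
        zp = zp.to(packing_device)
    q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp
else:
    q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)

# Illustrative finish: clamp to the FP8 representable range and cast.
q_weight = torch.clamp(q_weight, info.min, info.max).to(torch_dtype)
print(q_weight.dtype, q_weight.shape)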