Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified auto_round/alg_ext.abi3.so
Binary file not shown.
6 changes: 4 additions & 2 deletions auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,11 +698,13 @@ def _check_compatibility(self) -> None:
has_besides_gguf = True
if has_gguf and has_besides_gguf:
raise ValueError("Gguf format is not compatible with other formats, please choose only one of them")
if has_gguf and self.iters != 0 and self.bits != 3:
if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext:
logger.warning(
"`iters=0` is recommended when exporting to GGUF format except for bits 3,"
" as we have optimized the RTN method for this case."
" We are likely to release new algorithm for certain configurations in the future."
" Or add enable_alg_ext to use the new algorithm,"
" refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md"
" to check the acc."
)

if (
Expand Down
16 changes: 16 additions & 0 deletions docs/gguf_alg_ext_acc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and
removed `@use_kernel_forward_from_hub("RMSNorm")`
in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40)
to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval.

*Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, `truthfulqa_mc1`, `openbookqa`, `boolq`, `arc_easy`, `arc_challenge` and `mmlu`.*

|method|scheme|Llama-3.1-8B|Qwen2.5-7B-Instruct|Qwen3-8B|Qwen3-30B-A3B-Instruct-2507|
|:-----|:-----|:-----------|:------------------|:-------|:--------------------------|
|**BF16** | - |0.6295(100%)|0.6571(100%) |0.6322(100%)|0.6746(100%) |
| **original** | q2_k_s | 0.5535(87.92%)| 0.6266(95.35%)|0.5901(93.35%)|0.6386(94.66%)|
| **enable_alg_ext** |q2_k_s|0.5740(91.18%)|0.6349(96.62%)|0.5962(94.31%)|0.6460(95.77%)|
| **original** | q3_k_s | 0.6040(95.95%)|0.6382(97.12%)|0.6128(96.94%)|0.6598(97.82%)|
| **enable_alg_ext** |q3_k_s|0.6081(96.59%)|0.6503(98.97%)|0.6252(98.89%)|0.6622(98.17%)|
| **original** | q4_k_s | 0.6228(98.94%)|0.6560(99.83%)|0.6303(99.70%)|0.6762(100.24%)|
| **enable_alg_ext** |q4_k_s|0.6239(99.11%)|0.6605(100.51%)|0.6320(99.98%)|0.6777(100.46%)|
6 changes: 5 additions & 1 deletion test/test_cpu/test_autoround.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,8 +716,12 @@ def test_alg_ext(self):
ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True)
ar.quantize()

model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B"
ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True)
ar.quantize()

def test_alg_ext_import(self):
from auto_round.alg_ext import quantize_block_ext
from auto_round.alg_ext import dq_quantize_block_ext, quantize_block_ext

def test_invalid_layer_config(self):
with self.assertRaises(ValueError):
Expand Down