diff --git a/auto_round/alg_ext.abi3.so b/auto_round/alg_ext.abi3.so
index b89989d35..4b3f3bca3 100755
Binary files a/auto_round/alg_ext.abi3.so and b/auto_round/alg_ext.abi3.so differ
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 8f398e7a1..71899d7e0 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -698,11 +698,13 @@ def _check_compatibility(self) -> None:
                 has_besides_gguf = True
         if has_gguf and has_besides_gguf:
             raise ValueError("Gguf format is not compatible with other formats, please choose only one of them")
-        if has_gguf and self.iters != 0 and self.bits != 3:
+        if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext:
             logger.warning(
                 "`iters=0` is recommended when exporting to GGUF format except for bits 3,"
                 " as we have optimized the RTN method for this case."
-                " We are likely to release new algorithm for certain configurations in the future."
+                " Alternatively, set `enable_alg_ext=True` to use the new algorithm; refer to"
+                " https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md"
+                " for accuracy results."
             )
 
         if (
diff --git a/docs/gguf_alg_ext_acc.md b/docs/gguf_alg_ext_acc.md
new file mode 100644
index 000000000..8b874ae25
--- /dev/null
+++ b/docs/gguf_alg_ext_acc.md
@@ -0,0 +1,16 @@
+We use **lm-eval** for evaluation. For LLaMA, we enabled `add_bos_token` and
+removed `@use_kernel_forward_from_hub("RMSNorm")`
+in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L52C1-L52C40)
+to stabilize accuracy during evaluation. All other settings follow the default configurations of AutoRound and lm-eval.
+
+*Average accuracy across `lambada_openai`, `hellaswag`, `piqa`, `winogrande`, `truthfulqa_mc1`, `openbookqa`, `boolq`, `arc_easy`, `arc_challenge` and `mmlu`.*
+
+| method | scheme | Llama-3.1-8B | Qwen2.5-7B-Instruct | Qwen3-8B | Qwen3-30B-A3B-Instruct-2507 |
+|:-------|:-------|:-------------|:--------------------|:---------|:----------------------------|
+| **BF16** | - | 0.6295 (100%) | 0.6571 (100%) | 0.6322 (100%) | 0.6746 (100%) |
+| **original** | q2_k_s | 0.5535 (87.92%) | 0.6266 (95.35%) | 0.5901 (93.35%) | 0.6386 (94.66%) |
+| **enable_alg_ext** | q2_k_s | 0.5740 (91.18%) | 0.6349 (96.62%) | 0.5962 (94.31%) | 0.6460 (95.77%) |
+| **original** | q3_k_s | 0.6040 (95.95%) | 0.6382 (97.12%) | 0.6128 (96.94%) | 0.6598 (97.82%) |
+| **enable_alg_ext** | q3_k_s | 0.6081 (96.59%) | 0.6503 (98.97%) | 0.6252 (98.89%) | 0.6622 (98.17%) |
+| **original** | q4_k_s | 0.6228 (98.94%) | 0.6560 (99.83%) | 0.6303 (99.70%) | 0.6762 (100.24%) |
+| **enable_alg_ext** | q4_k_s | 0.6239 (99.11%) | 0.6605 (100.51%) | 0.6320 (99.98%) | 0.6777 (100.46%) |
\ No newline at end of file
diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py
index 924e68922..adadaa837 100644
--- a/test/test_cpu/test_autoround.py
+++ b/test/test_cpu/test_autoround.py
@@ -716,8 +716,12 @@ def test_alg_ext(self):
         ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True)
         ar.quantize()
 
+        model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B"
+        ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True)
+        ar.quantize()
+
     def test_alg_ext_import(self):
-        from auto_round.alg_ext import quantize_block_ext
+        from auto_round.alg_ext import dq_quantize_block_ext, quantize_block_ext
 
     def test_invalid_layer_config(self):
         with self.assertRaises(ValueError):
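
Below is a minimal usage sketch (not part of the patch) showing how the new `enable_alg_ext` flag combines with a GGUF scheme and non-zero `iters`. The model name, iteration count, and output directory are placeholders; the calls mirror the public `AutoRound` API exercised in the test above.

```python
# Hypothetical usage sketch: quantize with the new alg_ext path and export to GGUF.
# Model name, iters, and output directory are placeholders, not values from the patch.
from auto_round import AutoRound

ar = AutoRound(
    "Qwen/Qwen3-0.6B",     # placeholder; any supported model
    scheme="gguf:q2_k_s",  # low-bit GGUF scheme, where alg_ext gains are largest
    iters=200,             # non-zero iters takes the tuning path the warning refers to
    enable_alg_ext=True,   # use the new algorithm instead of the optimized RTN default
)
ar.quantize_and_save("./Qwen3-0.6B-q2_k_s", format="gguf:q2_k_s")
```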
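
For the accuracy table in `docs/gguf_alg_ext_acc.md`, a reproduction sketch via lm-eval's Python API might look as follows. `simple_evaluate` and the `add_bos_token` model arg are lm-eval features; the checkpoint path is a placeholder, since the doc does not specify how the quantized model is loaded for scoring.

```python
# Sketch of the evaluation described in docs/gguf_alg_ext_acc.md, using lm-eval's
# Python API. The checkpoint path is a placeholder; add_bos_token=True is the
# LLaMA-specific setting mentioned in the doc.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=./Qwen3-0.6B-q2_k_s,add_bos_token=True",
    tasks=[
        "lambada_openai", "hellaswag", "piqa", "winogrande", "truthfulqa_mc1",
        "openbookqa", "boolq", "arc_easy", "arc_challenge", "mmlu",
    ],
)
# Per-task accuracies; the doc reports their average relative to BF16.
for task, metrics in results["results"].items():
    print(task, metrics.get("acc,none"))
```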