intel · wenhuach21 · Mar 14, 2025 · Mar 13, 2025
diff --git a/auto_round/data_type/w4fp8.py b/auto_round/data_type/w4fp8.py
@@ -240,4 +240,5 @@ def progressive_quant_fp8_int4(tensor, bits=4, group_size=-1, v=0, min_scale=1.0
                                                                           q_scale_thresh=q_scale_thresh)
     qdq_tensor = qdq_int4_tensor * bf16_to_fp8_scale
 
-    return qdq_tensor, {"scale": scale_fp8_to_int4, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
+    bf16_to_int4_scale = scale_fp8_to_int4 * bf16_to_fp8_scale
+    return qdq_tensor, {"scale": bf16_to_int4_scale, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
diff --git a/auto_round/utils.py b/auto_round/utils.py
@@ -984,16 +984,6 @@ def torch_version_at_least(version_string):
 # 2. Lazy Mode (By default)
 
 
-def _check_hpu_compile_mode():
-    assert (
-            os.getenv("PT_HPU_LAZY_MODE") == "0"
-    ), "Please set `PT_HPU_LAZY_MODE=0` to use HPU compile mode"
-    # Note: this is a temporary solution, will be removed in the future
-    assert (
-            os.getenv("PT_ENABLE_INT64_SUPPORT") == "1"
-    ), "Please set `PT_ENABLE_INT64_SUPPORT=1` to use HPU compile mode"
-
-
 def is_hpu_lazy_mode():
     return os.getenv("PT_HPU_LAZY_MODE") != "0"
 
@@ -1004,7 +994,6 @@ def _use_hpu_compile_mode():
 
 def compile_func_on_hpu(func):
     if _use_hpu_compile_mode():
-        _check_hpu_compile_mode()
         return torch.compile(func, backend="hpu_backend")
     return func
 

diff --git a/test/test_auto_round_hpu_only.py b/test/test_auto_round_hpu_only.py
@@ -46,34 +46,34 @@ def test_import():
         WeightOnlyLinear, save_quantized_as_itrex)
 
 
-# @pytest.mark.parametrize(
-#     "data_type",
-#     ["fp8_to_int_sym"],
-# )
-# def test_w4a8(data_type):
-#     from auto_round import AutoRound
-#     from transformers import AutoModelForCausalLM, AutoTokenizer
-#
-#     model_name = "facebook/opt-125m"
-#     model = AutoModelForCausalLM.from_pretrained(
-#         model_name,
-#         torch_dtype="auto",
-#         attn_implementation="eager",
-#         trust_remote_code=True,
-#     )
-#     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-#
-#     autoround = AutoRound(
-#         model,
-#         tokenizer,
-#         bits=4,
-#         group_size=128,
-#         iters=2,
-#         seqlen=2,
-#         data_type=data_type,
-#         act_data_type="fp8_gaudi3_sym",
-#         act_bits=8,
-#         act_dynamic=False,
-#     )
-#     q_model, qconfig = autoround.quantize()
-#     assert q_model is not None, f"Expected q_model to be not None"
+@pytest.mark.parametrize("data_type", ["fp8_to_int_sym"])
+def test_w4a8(data_type):
+    """Smoke-test W4A8 quantization: ``quantize()`` must return a model."""
+    from auto_round import AutoRound
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    model_name = "facebook/opt-125m"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        attn_implementation="eager",
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    # 4-bit weights using the parametrized data type, 8-bit non-dynamic
+    # "fp8_sym" activations; iters=2 / seqlen=2 keep the run short.
+    autoround = AutoRound(
+        model,
+        tokenizer,
+        bits=4,
+        group_size=128,
+        iters=2,
+        seqlen=2,
+        data_type=data_type,
+        act_data_type="fp8_sym",
+        act_bits=8,
+        act_dynamic=False,
+    )
+    q_model, _ = autoround.quantize()
+    assert q_model is not None, "Expected q_model to be not None"