From 6a67ba63129c0d533ad46c2c1617b5c5490e05cc Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Thu, 13 Mar 2025 13:09:32 +0200
Subject: [PATCH] save bf16 to int4 scale and add HPU ut back

Signed-off-by: Yi Liu
---
 auto_round/data_type/w4fp8.py    |  3 +-
 auto_round/utils.py              | 11 ------
 test/test_auto_round_hpu_only.py | 62 ++++++++++++++++----------------
 3 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/auto_round/data_type/w4fp8.py b/auto_round/data_type/w4fp8.py
index 9e65bed44..9b5123759 100644
--- a/auto_round/data_type/w4fp8.py
+++ b/auto_round/data_type/w4fp8.py
@@ -240,4 +240,5 @@ def progressive_quant_fp8_int4(tensor, bits=4, group_size=-1, v=0, min_scale=1.0
                                                       q_scale_thresh=q_scale_thresh)
     qdq_tensor = qdq_int4_tensor * bf16_to_fp8_scale
 
-    return qdq_tensor, {"scale": scale_fp8_to_int4, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
+    bf16_to_int4_scale = scale_fp8_to_int4 * bf16_to_fp8_scale
+    return qdq_tensor, {"scale": bf16_to_int4_scale, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
\ No newline at end of file
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 45de7573e..e1f92e18c 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -984,16 +984,6 @@ def torch_version_at_least(version_string):
 # 2. Lazy Mode (By default)
 
 
-def _check_hpu_compile_mode():
-    assert (
-        os.getenv("PT_HPU_LAZY_MODE") == "0"
-    ), "Please set `PT_HPU_LAZY_MODE=0` to use HPU compile mode"
-    # Note: this is a temporary solution, will be removed in the future
-    assert (
-        os.getenv("PT_ENABLE_INT64_SUPPORT") == "1"
-    ), "Please set `PT_ENABLE_INT64_SUPPORT=1` to use HPU compile mode"
-
-
 def is_hpu_lazy_mode():
     return os.getenv("PT_HPU_LAZY_MODE") != "0"
 
@@ -1004,7 +994,6 @@ def _use_hpu_compile_mode():
 
 def compile_func_on_hpu(func):
     if _use_hpu_compile_mode():
-        _check_hpu_compile_mode()
         return torch.compile(func, backend="hpu_backend")
     return func
 
diff --git a/test/test_auto_round_hpu_only.py b/test/test_auto_round_hpu_only.py
index ad75fd9ad..006e44f63 100644
--- a/test/test_auto_round_hpu_only.py
+++ b/test/test_auto_round_hpu_only.py
@@ -46,34 +46,34 @@ def test_import():
         WeightOnlyLinear, save_quantized_as_itrex)
 
 
-# @pytest.mark.parametrize(
-#     "data_type",
-#     ["fp8_to_int_sym"],
-# )
-# def test_w4a8(data_type):
-#     from auto_round import AutoRound
-#     from transformers import AutoModelForCausalLM, AutoTokenizer
-#
-#     model_name = "facebook/opt-125m"
-#     model = AutoModelForCausalLM.from_pretrained(
-#         model_name,
-#         torch_dtype="auto",
-#         attn_implementation="eager",
-#         trust_remote_code=True,
-#     )
-#     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-#
-#     autoround = AutoRound(
-#         model,
-#         tokenizer,
-#         bits=4,
-#         group_size=128,
-#         iters=2,
-#         seqlen=2,
-#         data_type=data_type,
-#         act_data_type="fp8_gaudi3_sym",
-#         act_bits=8,
-#         act_dynamic=False,
-#     )
-#     q_model, qconfig = autoround.quantize()
-#     assert q_model is not None, f"Expected q_model to be not None"
+@pytest.mark.parametrize(
+    "data_type",
+    ["fp8_to_int_sym"],
+)
+def test_w4a8(data_type):
+    from auto_round import AutoRound
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    model_name = "facebook/opt-125m"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        attn_implementation="eager",
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    autoround = AutoRound(
+        model,
+        tokenizer,
+        bits=4,
+        group_size=128,
+        iters=2,
+        seqlen=2,
+        data_type=data_type,
+        act_data_type="fp8_sym",
+        act_bits=8,
+        act_dynamic=False,
+    )
+    q_model, qconfig = autoround.quantize()
+    assert q_model is not None, f"Expected q_model to be not None"