Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion auto_round/data_type/w4fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,4 +240,5 @@ def progressive_quant_fp8_int4(tensor, bits=4, group_size=-1, v=0, min_scale=1.0
q_scale_thresh=q_scale_thresh)
qdq_tensor = qdq_int4_tensor * bf16_to_fp8_scale

return qdq_tensor, {"scale": scale_fp8_to_int4, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
bf16_to_int4_scale = scale_fp8_to_int4 * bf16_to_fp8_scale
return qdq_tensor, {"scale": bf16_to_int4_scale, "bf16_to_fp8_scale": bf16_to_fp8_scale}, zp_fp8_to_int4
11 changes: 0 additions & 11 deletions auto_round/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,16 +984,6 @@ def torch_version_at_least(version_string):
# 2. Lazy Mode (By default)


def _check_hpu_compile_mode():
    """Verify the environment variables required for HPU compile mode.

    Raises:
        AssertionError: If ``PT_HPU_LAZY_MODE`` is not ``"0"`` or
            ``PT_ENABLE_INT64_SUPPORT`` is not ``"1"``.
    """
    # Raise explicitly instead of using ``assert`` so the checks still run
    # when Python is started with ``-O``. AssertionError is kept so the
    # exception type seen by existing callers does not change.
    if os.getenv("PT_HPU_LAZY_MODE") != "0":
        raise AssertionError(
            "Please set `PT_HPU_LAZY_MODE=0` to use HPU compile mode"
        )
    # Note: this is a temporary solution, will be removed in the future
    if os.getenv("PT_ENABLE_INT64_SUPPORT") != "1":
        raise AssertionError(
            "Please set `PT_ENABLE_INT64_SUPPORT=1` to use HPU compile mode"
        )


def is_hpu_lazy_mode():
    """Report whether HPU lazy mode is in effect.

    Returns:
        bool: ``False`` only when the ``PT_HPU_LAZY_MODE`` environment
        variable is exactly ``"0"``; ``True`` for any other value or when
        the variable is unset.
    """
    lazy_mode_setting = os.environ.get("PT_HPU_LAZY_MODE")
    return lazy_mode_setting != "0"

Expand All @@ -1004,7 +994,6 @@ def _use_hpu_compile_mode():

def compile_func_on_hpu(func):
    """Compile ``func`` for the HPU backend when HPU compile mode is in use.

    Args:
        func: The callable to (optionally) compile.

    Returns:
        ``torch.compile(func, backend="hpu_backend")`` when
        ``_use_hpu_compile_mode()`` is truthy; otherwise ``func`` unchanged.
    """
    # Fast path: outside HPU compile mode the callable is handed back as-is.
    if not _use_hpu_compile_mode():
        return func

    # Validate the environment before handing the callable to torch.compile.
    _check_hpu_compile_mode()
    return torch.compile(func, backend="hpu_backend")

Expand Down
62 changes: 31 additions & 31 deletions test/test_auto_round_hpu_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,34 +46,34 @@ def test_import():
WeightOnlyLinear, save_quantized_as_itrex)


# @pytest.mark.parametrize(
# "data_type",
# ["fp8_to_int_sym"],
# )
# def test_w4a8(data_type):
# from auto_round import AutoRound
# from transformers import AutoModelForCausalLM, AutoTokenizer
#
# model_name = "facebook/opt-125m"
# model = AutoModelForCausalLM.from_pretrained(
# model_name,
# torch_dtype="auto",
# attn_implementation="eager",
# trust_remote_code=True,
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#
# autoround = AutoRound(
# model,
# tokenizer,
# bits=4,
# group_size=128,
# iters=2,
# seqlen=2,
# data_type=data_type,
# act_data_type="fp8_gaudi3_sym",
# act_bits=8,
# act_dynamic=False,
# )
# q_model, qconfig = autoround.quantize()
# assert q_model is not None, f"Expected q_model to be not None"
@pytest.mark.parametrize(
    "data_type",
    ["fp8_to_int_sym"],
)
def test_w4a8(data_type):
    """Smoke-test AutoRound quantization with 4-bit weights and 8-bit activations.

    Loads ``facebook/opt-125m``, runs ``AutoRound.quantize()`` with the
    parametrized weight ``data_type`` and ``fp8_sym`` activations
    (``act_dynamic=False``), and checks that a quantized model is returned.
    """
    from auto_round import AutoRound
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        attn_implementation="eager",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # iters/seqlen are very small here -- presumably to keep the run short.
    autoround = AutoRound(
        model,
        tokenizer,
        bits=4,
        group_size=128,
        iters=2,
        seqlen=2,
        data_type=data_type,
        act_data_type="fp8_sym",
        act_bits=8,
        act_dynamic=False,
    )
    # Only the quantized model is checked; the second return value is unused.
    q_model, _ = autoround.quantize()
    assert q_model is not None, "Expected q_model to be not None"