22 changes: 11 additions & 11 deletions auto_round/export/export_to_autogptq/export.py
@@ -258,17 +258,17 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll

     all_to_quantized = True
     modules_in_block_to_quantize = []
-    if not dynamic:  # Only uniform precision
-        for block_names in all_blocks:
-            first_block = get_module(model, block_names[0])
-            for n, m in first_block.named_modules():
-                if m.tmp_name not in layer_config:
-                    continue
-                if not check_to_quantized(layer_config[m.tmp_name]):
-                    all_to_quantized = False
-                else:
-                    modules_in_block_to_quantize.append(n)
-        modules_in_block_to_quantize = [modules_in_block_to_quantize]
+    # for backward compatibility
+    for block_names in all_blocks:
+        first_block = get_module(model, block_names[0])
+        for n, m in first_block.named_modules():
+            if m.tmp_name not in layer_config:
+                continue
+            if not check_to_quantized(layer_config[m.tmp_name]):
+                all_to_quantized = False
+            else:
+                modules_in_block_to_quantize.append(n)
+    modules_in_block_to_quantize = [modules_in_block_to_quantize]

     if all_to_quantized:
         modules_in_block_to_quantize = None
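For reference, a standalone sketch of the collection logic above, using a made-up layer_config, made-up module names, and a simplified stand-in for check_to_quantized (the real code walks first_block.named_modules() and reads each module's tmp_name):

# All names and configs below are hypothetical, for illustration only.
layer_config = {
    "model.layers.0.self_attn.q_proj": {"bits": 4},   # quantized
    "model.layers.0.mlp.gate": {"bits": 16},          # kept in full precision
}

def check_to_quantized(cfg):
    # Simplified stand-in for auto_round's helper: treat fewer than 16 bits as "quantize".
    return cfg.get("bits", 16) < 16

all_to_quantized = True
modules_in_block_to_quantize = []
for tmp_name, local_name in [
    ("model.layers.0.self_attn.q_proj", "self_attn.q_proj"),
    ("model.layers.0.mlp.gate", "mlp.gate"),
]:
    if not check_to_quantized(layer_config[tmp_name]):
        all_to_quantized = False
    else:
        modules_in_block_to_quantize.append(local_name)

modules_in_block_to_quantize = [modules_in_block_to_quantize]  # nested list, as in the diff
if all_to_quantized:
    modules_in_block_to_quantize = None  # uniform precision: no per-module list is needed

print(all_to_quantized, modules_in_block_to_quantize)
# False [['self_attn.q_proj']]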
21 changes: 19 additions & 2 deletions auto_round/inference/backend.py
@@ -132,6 +132,20 @@ def feature_multiply_checker_group_size(
 )


+def feature_compatible_multiply_checker(
+    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
+):
+    group_size = config["group_size"]
+    if out_feature_multiplier is None:
+        out_feature_multiplier = in_feature_multiplier
+    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
+    return (
+        in_feature % in_feature_multiplier == 0
+        and out_feature % out_feature_multiplier == 0
+        and (in_feature % group_size == 0 or compatible_flag)
+    )
+
+
 def in_feature_checker_group_size(in_feature, out_feature, config):
     group_size = config["group_size"]
     return in_feature % group_size == 0
@@ -148,6 +162,9 @@ def in_feature_checker_group_size(in_feature, out_feature, config):
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
+compatible_exllamav2_feature_checker = functools.partial(
+    feature_compatible_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
+)

 gptqmodel_marlin_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
@@ -185,9 +202,9 @@ def fp8_static_scheme_checker(
     act_bits=WOQ_DEFAULT_ACT_BITS,
     # 16, 384,768 accuracy issue
     group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
-    checkers=[exllamav2_feature_checker],
+    checkers=[compatible_exllamav2_feature_checker],
     alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
-    requirements=["auto-gptq>=0.7.1"],
+    requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
 )

 BackendInfos["auto_gptq:tritonv2"] = BackendInfo(
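To see what the relaxed check admits, here is a self-contained sketch that reuses the new function body from the hunk above; the sample layer shapes and group size are made up for illustration:

import functools

def feature_compatible_multiply_checker(
    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
):
    group_size = config["group_size"]
    if out_feature_multiplier is None:
        out_feature_multiplier = in_feature_multiplier
    # Layers whose in_feature is smaller than group_size still qualify, provided
    # the total weight count is divisible by group_size.
    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
    return (
        in_feature % in_feature_multiplier == 0
        and out_feature % out_feature_multiplier == 0
        and (in_feature % group_size == 0 or compatible_flag)
    )

checker = functools.partial(
    feature_compatible_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
)

print(checker(256, 4096, {"group_size": 128}))  # True: 256 is a multiple of 128
print(checker(64, 4096, {"group_size": 128}))   # True: 64 < 128 and 64 * 4096 is divisible by 128
print(checker(160, 4096, {"group_size": 128}))  # False: 160 is neither a multiple of 128 nor below it

The 64-input case is the kind of layer that a strict multiple-of-group_size check would reject, which is presumably why the auto_gptq:exllamav2 backend entry now points at this checker.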
4 changes: 3 additions & 1 deletion auto_round/special_model_handler.py
@@ -85,7 +85,9 @@ def _handle_moe_model(model, formats=None):
             parent = model.get_submodule(parent)
             setattr(parent, child, new_module)

-        logger.warning("Llama4 experts are converted, the quantized model can not run on transformers.")
+        logger.warning(
+            f"{model.config.model_type} experts are converted, the quantized model can not run on transformers."
+        )
     return model
