diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index a69cb29b4..c5cf926ae 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -258,17 +258,17 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
 
     all_to_quantized = True
     modules_in_block_to_quantize = []
-    if not dynamic:  # Only uniform precision
-        for block_names in all_blocks:
-            first_block = get_module(model, block_names[0])
-            for n, m in first_block.named_modules():
-                if m.tmp_name not in layer_config:
-                    continue
-                if not check_to_quantized(layer_config[m.tmp_name]):
-                    all_to_quantized = False
-                else:
-                    modules_in_block_to_quantize.append(n)
-        modules_in_block_to_quantize = [modules_in_block_to_quantize]
+    # for backward compatibility
+    for block_names in all_blocks:
+        first_block = get_module(model, block_names[0])
+        for n, m in first_block.named_modules():
+            if m.tmp_name not in layer_config:
+                continue
+            if not check_to_quantized(layer_config[m.tmp_name]):
+                all_to_quantized = False
+            else:
+                modules_in_block_to_quantize.append(n)
+    modules_in_block_to_quantize = [modules_in_block_to_quantize]
 
     if all_to_quantized:
         modules_in_block_to_quantize = None
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index ce389502a..74a9a5c62 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -132,6 +132,20 @@ def feature_multiply_checker_group_size(
     )
 
 
+def feature_compatible_multiply_checker(
+    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
+):
+    group_size = config["group_size"]
+    if out_feature_multiplier is None:
+        out_feature_multiplier = in_feature_multiplier
+    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
+    return (
+        in_feature % in_feature_multiplier == 0
+        and out_feature % out_feature_multiplier == 0
+        and (in_feature % group_size == 0 or compatible_flag)
+    )
+
+
 def in_feature_checker_group_size(in_feature, out_feature, config):
     group_size = config["group_size"]
     return in_feature % group_size == 0
@@ -148,6 +162,9 @@ def in_feature_checker_group_size(in_feature, out_feature, config):
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
+compatible_exllamav2_feature_checker = functools.partial(
+    feature_compatible_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
+)
 gptqmodel_marlin_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
 )
@@ -185,9 +202,9 @@ def fp8_static_scheme_checker(
     act_bits=WOQ_DEFAULT_ACT_BITS,
     # 16, 384,768 accuracy issue
     group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
-    checkers=[exllamav2_feature_checker],
+    checkers=[compatible_exllamav2_feature_checker],
     alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
-    requirements=["auto-gptq>=0.7.1"],
+    requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
 )
 
 BackendInfos["auto_gptq:tritonv2"] = BackendInfo(
diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py
index de5f8b2a4..d0fa8c962 100644
--- a/auto_round/special_model_handler.py
+++ b/auto_round/special_model_handler.py
@@ -85,7 +85,9 @@ def _handle_moe_model(model, formats=None):
 
                 parent = model.get_submodule(parent)
                 setattr(parent, child, new_module)
-        logger.warning("Llama4 experts are converted, the quantized model can not run on transformers.")
+        logger.warning(
+            f"{model.config.model_type} experts are converted, the quantized model can not run on transformers."
+        )
 
     return model
 
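
Note (illustration only, not part of the patch): a minimal standalone sketch of how the new feature_compatible_multiply_checker relaxes the previous exllamav2 check. The strict rule required in_feature % group_size == 0; the compatible path also accepts layers whose in_feature is smaller than group_size as long as in_feature * out_feature still divides evenly by group_size. The plain dict used for `config` below is a simplified stand-in for the real backend config object.

import functools


def feature_compatible_multiply_checker(
    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
):
    # Same logic as the function added in the patch above: pass when either the
    # strict group-size rule holds or the "compatible" fallback applies.
    group_size = config["group_size"]
    if out_feature_multiplier is None:
        out_feature_multiplier = in_feature_multiplier
    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
    return (
        in_feature % in_feature_multiplier == 0
        and out_feature % out_feature_multiplier == 0
        and (in_feature % group_size == 0 or compatible_flag)
    )


# Mirrors compatible_exllamav2_feature_checker from the patch.
checker = functools.partial(
    feature_compatible_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
)

config = {"group_size": 128}  # simplified stand-in for the backend config
print(checker(64, 256, config))   # True: 64 < 128 and (64 * 256) % 128 == 0
print(checker(256, 512, config))  # True: strict rule, 256 % 128 == 0
print(checker(64, 250, config))   # False: out_feature is not a multiple of 32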