diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py
index 5aabe3969..cf8d90098 100644
--- a/auto_round/compressors/utils.py
+++ b/auto_round/compressors/utils.py
@@ -323,9 +323,20 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
             cfg.setdefault(key, copy.deepcopy(default_dict.get(key)))
 
     # 5. collect supported modules
+    embedding_types = (torch.nn.Embedding,)
     gguf_name = get_gguf_scheme(default_scheme)
-    if gguf_name and torch.nn.Embedding not in supported_types:
-        supported_types = (*supported_types, torch.nn.Embedding)
+    if gguf_name:
+        if torch.nn.Embedding not in supported_types:
+            supported_types = (*supported_types, torch.nn.Embedding)
+
+        # handle embedding modules whose type() is not torch.nn.Embedding,
+        # e.g. transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding
+        model_module_name = model.__class__.__module__
+        module_cls = sys.modules[model_module_name]
+        for name in module_cls.__dict__:
+            if name.endswith("Embedding") and not name.endswith("RotaryEmbedding"):
+                embedding_types = (*embedding_types, getattr(module_cls, name))
+        supported_types = (*supported_types, *embedding_types)
 
     all_supported_layer_names, embedding_layer_names = [], []
     all_module_names = []
@@ -338,7 +349,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
         if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types:
             continue
         all_supported_layer_names.append(n)
-        if isinstance(m, torch.nn.Embedding):
+        if isinstance(m, embedding_types) or m.__class__.__name__.endswith("Embedding"):
             embedding_layer_names.append(n)
 
     # 6. expand regex configs
@@ -650,7 +661,7 @@ def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model
 
     import gguf  # pylint: disable=E0401
 
-    from auto_round.utils.common import LazyImport
+    from auto_round.utils.common import MM_KEYS, LazyImport
     from auto_round.utils.model import get_lm_head_name, get_module
 
     # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture
@@ -660,24 +671,41 @@ def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model
         hparams=model.config.to_dict(), model_type=model_type
     )
     try:
-        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type)
+        if model_type != ModelType.TEXT:
+            model_class_vision = convert_hf_to_gguf.ModelBase.from_model_architecture(
+                model_architecture, model_type=model_type
+            )
+        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(
+            model_architecture, model_type=ModelType.TEXT
+        )
+
     except NotImplementedError:
         return layer_config, {}
 
     n_layer = None
-    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]:
-        sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config"
+    if model_type != ModelType.TEXT:
+        n_layer_vision = None
+    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]:
         if hasattr(model.config, name):
             n_layer = getattr(model.config, name)
-            break
-        if hasattr(model.config, sub_attr):
-            if hasattr(getattr(model.config, sub_attr), name):
-                n_layer = getattr(getattr(model.config, sub_attr), name)
+        if model_type != ModelType.TEXT:
+            if n_layer is not None and hasattr(model.config, "text_config"):
+                if hasattr(getattr(model.config, "text_config"), name):
+                    n_layer = getattr(getattr(model.config, "text_config"), name)
+            for config_name in ["vision_config", "vision_encoder"]:
+                if hasattr(model.config, config_name):
+                    if hasattr(getattr(model.config, config_name), name):
+                        n_layer_vision = getattr(getattr(model.config, config_name), name)
+                        break
+            if n_layer and n_layer_vision:
                 break
+
     if n_layer is None:
         return layer_config, {}
 
     tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer)
+    if model_type != ModelType.TEXT:
+        tensor_map_vision = gguf.get_tensor_name_map(model_class_vision.model_arch, n_layer_vision)
 
     def _set_config(config, target_config):
         for k, v in target_config.items():
@@ -733,7 +761,17 @@ def _set_config(config, target_config):
                 re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1)
             )
 
-        gguf_name = tensor_map.get_name(layer_name)
+        if model_type != ModelType.TEXT and any([key in layer_name for key in MM_KEYS]):
+            gguf_name = tensor_map_vision.get_name(layer_name)
+            if gguf_name is None:
+                for key in MM_KEYS:
+                    gguf_name = tensor_map_vision.get_name(layer_name.replace(f".{key}", ""))
+                    if gguf_name is not None:
+                        break
+        else:
+            gguf_name = tensor_map.get_name(layer_name)
+            if gguf_name is None:
+                gguf_name = tensor_map.get_name(layer_name.replace(".language_model", ""))
         bits_index = 6
         if config.get("fixed_by_user", False):
             if "bits" not in config:
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 009b6458d..7235e9e4c 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -229,7 +229,6 @@ def eval_task_by_task(
     import traceback
 
     from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0611
-    from lm_eval.models.hf_vlms import HFMultimodalLM
    from lm_eval.models.huggingface import HFLM
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -269,6 +268,8 @@ def eval_task_by_task(
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
             batch_size = 16
+        from lm_eval.models.hf_vlms import HFMultimodalLM
+
         hflm = HFMultimodalLM(
             pretrained=model,
             tokenizer=tokenizer,
@@ -333,7 +334,10 @@ def eval_task_by_task(
                res_all = res
            else:
                for key in res_keys:
-                    res_all[key].update(res[key])
+                    if key not in res_all:
+                        continue
+                    else:
+                        res_all[key].update(res[key])
 
     print(make_table(res_all))
     print("total eval time:", time.time() - st)
diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py
index 00a0fdca0..515357e2d 100644
--- a/auto_round/eval/evaluation.py
+++ b/auto_round/eval/evaluation.py
@@ -21,7 +21,6 @@
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-from lm_eval.models.hf_vlms import HFMultimodalLM
 from lm_eval.models.huggingface import HFLM
 
 
@@ -37,6 +36,8 @@ def simple_evaluate_user_model(
     **kwargs
 ):
     if mllm:
+        from lm_eval.models.hf_vlms import HFMultimodalLM
+
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
             batch_size = 16
diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py
index 8633a2a50..890a93880 100644
--- a/auto_round/export/export_to_gguf/export.py
+++ b/auto_round/export/export_to_gguf/export.py
@@ -133,8 +133,8 @@ def pack_gguf_layer(
 ):
     """Export the model to gguf format."""
     global gguf_model_instance_global
-    if output_dir is not None and os.path.exists(output_dir):
-        logger.warning_once(f"{output_dir} already exists, this may cause model conflict")
+    # if output_dir is not None and os.path.exists(output_dir):
+    #     logger.warning_once(f"{output_dir} already exists, this may cause model conflict")
     if "gguf_model_instance_global" not in globals():
         config = model.config
 
diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index 9d4e4c98a..b1a5e18a6 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -124,6 +124,24 @@ def __getitem__(self, key):
 
     SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce)
 
+MM_KEYS = [
+    "multi_modal_projector",
+    "vision_tower",
+    "multimodal_projector",
+    "thinker",
+    "visual",
+    "audio",
+    "talker",
+    "token2wav",
+    "vision_model",
+    "audio_tower",
+    "vision_encoder",
+    "vision_language_adapter",
+    "patch_merger",
+    "pre_mm_projector_norm",
+    "vision",
+]
+
 
 def is_debug_mode():
     """Checks if the Python interpreter is running in debug mode.
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index deb6f2122..849f98744 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -193,6 +193,8 @@ def detect_device_count():
     """
     if torch.cuda.is_available():
         return torch.cuda.device_count()
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():
+        return torch.xpu.device_count()
     else:
         try:
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401
@@ -1144,11 +1146,13 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map):
     device_list = parse_available_devices(device_map)
     gpu_devices = []
     for device in device_list:
+        if device.startswith("hpu") and len(device_list) > 1:
+            logger.warning_once("Auto-scheme does not support multiple HPUs.")
         if device.startswith("cpu") or device.startswith("hpu"):
             continue
         gpu_devices.append(device)
     num_devices = len(gpu_devices)
-    if num_devices < 1:
+    if num_devices <= 1:
         return
 
     for block_names in block_name_list:
@@ -1272,7 +1276,16 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None
         device_map = device_map.strip()
         if device_map.lower() == "cpu":
             return ["cpu"]
-
+        if device_map.lower() == "auto":
+            device_count = detect_device_count()
+            if "cuda" in device_types:
+                return [f"cuda:{i}" for i in range(device_count)]
+            elif "xpu" in device_types:
+                return [f"xpu:{i}" for i in range(device_count)]
+            elif "hpu" in device_types:
+                return [f"hpu:{i}" for i in range(device_count)]
+            else:
+                return ["cpu"]
         # Split by commas
         parts = [x.strip() for x in device_map.split(",") if x.strip()]
         parsed = []
@@ -1283,7 +1296,7 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None
                 parsed.append(f"{device_type}:{p}" if device_type != "cpu" else "cpu")
             else:
                 parsed.append(p)
-        return parsed
+        return list(dict.fromkeys(parsed))
 
     if isinstance(device_map, dict):
         # Extract all devices recursively from dict values
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index 5f9e1941b..6188ea16a 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -497,23 +497,8 @@ def is_pure_text_model(model):
 
 
 def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = None):
-    MM_KEYS = [
-        "multi_modal_projector",
-        "vision_tower",
-        "multimodal_projector",
-        "thinker",
-        "visual",
-        "audio",
-        "talker",
-        "token2wav",
-        "vision_model",
-        "audio_tower",
-        "vision_encoder",
-        "vision_language_adapter",
-        "patch_merger",
-        "pre_mm_projector_norm",
-        "vision",
-    ]
+    from auto_round.utils.common import MM_KEYS
+
     model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path
     if not os.path.isdir(model_path):
         model_path = download_or_get_path(model_path, platform=platform)
diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py
index d0d29a441..c2d165639 100644
--- a/test/test_cpu/test_scheme.py
+++ b/test/test_cpu/test_scheme.py
@@ -118,6 +118,17 @@ def test_scheme_in_layer_config(self):
             if n == "model.decoder.layers.4.self_attn.k_proj":
                 self.assertEqual(m.group_size, 64)
 
+    def test_parse_available_devices(self):
+        from auto_round.utils.device import parse_available_devices
+
+        device_list = parse_available_devices("auto")
+        self.assertTrue(len(device_list) == 1 and "cpu" in device_list)
+        device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu")
+        self.assertTrue(len(device_list) == 3)
+        self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"])
+        device_list = parse_available_devices("0,1")
+        self.assertTrue(len(device_list) == 1 and "cpu" in device_list)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py
index fe4388667..aaad1cf6a 100644
--- a/test/test_cuda/test_gguf.py
+++ b/test/test_cuda/test_gguf.py
@@ -88,7 +88,7 @@ def test_q2_k_export(self):
         quantized_model_path = "./saved"
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s")
 
-        gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -123,7 +123,7 @@ def test_q4_0(self):
         quantized_model_path = "./saved"
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
 
-        gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -144,7 +144,7 @@ def test_q4_1(self):
         quantized_model_path = "./saved"
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1")
 
-        gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -198,15 +198,13 @@ def test_vlm_gguf(self):
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
-        file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-7.6B-Q4_0.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 4226, delta=1.0)
+        file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2
+        self.assertAlmostEqual(file_size, 4226, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 2578, delta=1.0)
+        self.assertAlmostEqual(file_size, 2580, delta=5.0)
         shutil.rmtree("./saved", ignore_errors=True)
 
         model_name = "/models/gemma-3-12b-it"
-        from auto_round import AutoRoundMLLM
-        from auto_round.utils import mllm_load_model
 
         model, processor, tokenizer, image_processor = mllm_load_model(model_name)
         autoround = AutoRoundMLLM(
@@ -216,15 +214,15 @@ def test_vlm_gguf(self):
             image_processor=image_processor,
             device="auto",
             nsamples=32,
-            iters=1,
+            iters=0,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
-        file_size = os.path.getsize("./saved/gemma-3-12b-it-12B-Q4_K_M.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 6568, delta=1.0)
+        file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
+        self.assertAlmostEqual(file_size, 6962, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 1599, delta=1.0)
+        self.assertAlmostEqual(file_size, 1599, delta=5.0)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     # @require_gguf
diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py
index eee7c055a..d06c48ff5 100644
--- a/test/test_cuda/test_vlms.py
+++ b/test/test_cuda/test_vlms.py
@@ -140,7 +140,6 @@ def test_mllm_detect(self):
             "/models/Phi-3.5-vision-instruct",
             "/models/Qwen2-VL-2B-Instruct",
             "/models/SmolVLM-256M-Instruct",
-            "/models/Llama-4-Maverick-17B-128E-Instruct",
             "/models/Mistral-Small-3.1-24B-Instruct-2503",
             "/models/InternVL3-1B",
             "/models/pixtral-12b",
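
A minimal, self-contained sketch of the name-fallback pattern used in `get_layer_config_by_gguf_format` above: try the raw layer name against the GGUF tensor-name map first, then retry with each `MM_KEYS` segment stripped (the patch applies the same shape with `tensor_map_vision.get_name()` for multimodal layers and a `.language_model` fallback for text layers). `MM_KEYS_SUBSET`, `TOY_TENSOR_MAP`, and `lookup_gguf_name` are hypothetical stand-ins for illustration only, not auto_round or gguf APIs.

```python
# Hedged sketch: illustrates the strip-and-retry lookup, not the real gguf tensor map.
MM_KEYS_SUBSET = ["visual", "vision_tower", "multi_modal_projector"]

# Pretend tensor-name map: layer names the (toy) GGUF mapping already knows.
TOY_TENSOR_MAP = {
    "model.blocks.0.attn.qkv": "v.blk.0.attn_qkv",
    "model.merger.mlp.0": "mm.0",
}


def lookup_gguf_name(layer_name):
    """Return a GGUF tensor name, retrying with multimodal segments removed."""
    gguf_name = TOY_TENSOR_MAP.get(layer_name)
    if gguf_name is not None:
        return gguf_name
    for key in MM_KEYS_SUBSET:
        gguf_name = TOY_TENSOR_MAP.get(layer_name.replace(f".{key}", ""))
        if gguf_name is not None:
            return gguf_name
    return None


if __name__ == "__main__":
    # The prefixed name resolves only after ".visual" is stripped.
    assert lookup_gguf_name("model.visual.blocks.0.attn.qkv") == "v.blk.0.attn_qkv"
    assert lookup_gguf_name("model.unknown.layer") is None
```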