From 192f44ea060997041496f8e62bc473eb78e117c1 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 24 Oct 2025 04:54:57 -0400 Subject: [PATCH 1/7] refact utils Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 29 +- auto_round/compressors/mllm/compressor.py | 2 +- auto_round/export/export_to_gguf/convert.py | 4 +- auto_round/utils.py | 3150 ------------------- auto_round/utils/__init__.py | 21 + auto_round/utils/constants.py | 82 + auto_round/utils/device_utils.py | 359 +++ auto_round/utils/dtype_utils.py | 146 + auto_round/utils/memory_utils.py | 182 ++ auto_round/utils/misc_utils.py | 226 ++ auto_round/utils/model_utils.py | 1104 +++++++ auto_round/utils/quantization_utils.py | 1206 +++++++ 12 files changed, 3344 insertions(+), 3167 deletions(-) delete mode 100644 auto_round/utils.py create mode 100644 auto_round/utils/__init__.py create mode 100644 auto_round/utils/constants.py create mode 100644 auto_round/utils/device_utils.py create mode 100644 auto_round/utils/dtype_utils.py create mode 100644 auto_round/utils/memory_utils.py create mode 100644 auto_round/utils/misc_utils.py create mode 100644 auto_round/utils/model_utils.py create mode 100644 auto_round/utils/quantization_utils.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 1ab233953..36804be15 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -47,9 +47,6 @@ SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, CpuInfo, - _gguf_args_check, - _is_fp8_linear, - _is_fp8_model, block_forward, check_and_mark_fp8_model, check_is_cpu, @@ -78,10 +75,13 @@ get_max_vram, get_module, get_shared_keys, + gguf_args_check, htcore, infer_bits_by_data_type, init_cache, is_debug_mode, + is_fp8_linear, + is_fp8_model, is_hpex_available, is_mx_fp, is_nv_fp, @@ -879,9 +879,9 @@ def remove_duplicates(lst): ) formats[i] = gguf_format_name.lower() - _gguf_args_check(self, formats, model_type=ModelType.TEXT) + gguf_args_check(self, formats, model_type=ModelType.TEXT) if self.mllm: - _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) + gguf_args_check(self, formats, model_type=ModelType.MMPROJ) for f in formats: if f.startswith("gguf"): @@ -1340,7 +1340,7 @@ def _quantize_layer_via_rtn(self, name: str) -> None: """ m = get_module(self.model, name) - if _is_fp8_linear(m): + if is_fp8_linear(m): m = convert_fp8_layer_to_linear(m, self.amp_dtype) set_module(self.model, name, m) # @@ -1500,7 +1500,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: cnt = 1 cnt += 1 # Convert remaining fp8 - if _is_fp8_model(self.model): + if is_fp8_model(self.model): convert_fp8_model_to_16b_model(self.model, self.amp_dtype) self.quantized = True return self.model, self.layer_config @@ -1568,7 +1568,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) pbar.set_description(f"Quantizing {block_name}") block = get_module(self.model, block_name) block = block.to(self.device) - if _is_fp8_model(self.model): + if is_fp8_model(self.model): convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) if self.device_map == "auto": @@ -1765,9 +1765,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self._quantize_layers(layer_names, all_inputs) ##TODO pack layer immediately - if _is_fp8_model(self.model): + if is_fp8_model(self.model): for n, m in self.model.named_modules(): - if _is_fp8_linear(m): + if is_fp8_linear(m): new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to("cpu") set_module(self.model, n, new_layer) @@ 
-1816,7 +1816,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: layer = get_module(self.model, layer_name) layer = layer.to(self.device) - if _is_fp8_model(self.model): + if is_fp8_model(self.model): new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype).to(self.device) set_module(self.model, layer_name, new_layer) layer = new_layer @@ -2034,7 +2034,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l Raises: Exception: If caching on GPU fails, switches to CPU and caches there. """ - if _is_fp8_model(self.model): + if is_fp8_model(self.model): layer_names = [] if layer_names is None: layer_names = [] @@ -2446,6 +2446,7 @@ def _quantize_layer( logger.info(dump_info) def _register_act_max_hook(self, model): + def get_act_max_hook(module, input, output): if isinstance(input, (tuple, list)): input = input[0] @@ -2544,9 +2545,9 @@ def _quantize_block( Returns: Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) """ - if _is_fp8_model(self.model): + if is_fp8_model(self.model): for n, m in block.named_modules(): - if _is_fp8_linear(m): + if is_fp8_linear(m): new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device) set_module(block, n, new_layer) diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 6fcb50c2c..32c594d3f 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -33,7 +33,6 @@ _handle_special_model, ) from auto_round.utils import ( - _is_fp8_model, check_to_quantized, clear_memory, detect_device, @@ -41,6 +40,7 @@ find_matching_blocks, get_block_names, get_max_vram, + is_fp8_model, mllm_load_model, mv_module_from_gpu, to_device, diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 91f206667..37667dd3b 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, _get_packing_device, _is_fp8_model, clean_module_parameter, get_module, logger +from auto_round.utils import LazyImport, _get_packing_device, clean_module_parameter, get_module, is_fp8_model, logger gguf = LazyImport("gguf") @@ -145,7 +145,7 @@ def get_tensors(cls) -> Iterator[tuple[str, Tensor]]: yield name, tensor def is_extra_tensor(tensor_name): - if _is_fp8_model(cls.model) and "scale" in tensor_name.split(".")[-1]: + if is_fp8_model(cls.model) and "scale" in tensor_name.split(".")[-1]: return False if tensor_name not in cls.model.tensor_name_list: return True diff --git a/auto_round/utils.py b/auto_round/utils.py deleted file mode 100644 index 84ecadd76..000000000 --- a/auto_round/utils.py +++ /dev/null @@ -1,3150 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collections.abc -import copy -import gc -import importlib -import json -import os -import re -import sys -from collections import UserDict -from dataclasses import asdict, fields -from enum import Enum -from functools import lru_cache -from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, Union - -import cpuinfo -import torch -import transformers -from accelerate.utils import get_balanced_memory -from packaging import version -from torch.amp import autocast - -from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType -from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme - -SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") - -deepspeed_exists = False -if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed - deepspeed_exists = True - - -class SupportedFormats: - - def __init__(self): - self._support_format = ( - "auto_round", - "auto_gptq", - "auto_awq", - "auto_round:auto_gptq", - "auto_round:gptqmodel", - "auto_round:auto_awq", - "auto_round:llm_compressor", - "itrex", - "itrex_xpu", - "fake", - "llm_compressor", - ) - self._gguf_format = tuple(sorted(GGUF_CONFIG.keys())) - self._support_list = self._support_format + self._gguf_format - - def __contains__(self, key): - return True if key in self._support_list else False - - def __str__(self): - # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s")) - return "(%s)" % ", ".join(self._support_list) - - def __getitem__(self, key): - return self._support_list[key] - - -SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp") -SUPPORTED_FORMATS = SupportedFormats() -SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) - -# Changed to str as it relies on triton or others lib to load this -INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) -# transformers.integrations.finegrained_fp8.FP8Linear -if deepspeed_exists: - from deepspeed.module_inject import LinearAllreduce, LinearLayer - - SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce) - - -def infer_bits_by_data_type(data_type: str): - """Infer bits by data_type - - Args: - data_type (str): data_type - - Returns: - int: bits inferred by data_type, None means cannot infer correct bits by data_type - """ - if data_type is None: - return 16 - for supported_dtype in SUPPORTED_DTYPES: - if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): - ##first check the following two bits - suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2] - if str.isdigit(suc_2str): - return int(suc_2str) - if str.isdigit(data_type[len(supported_dtype)]): - return int(data_type[len(supported_dtype)]) - return None - - -class LazyImport(object): - """Lazy import python module till use.""" - - def __init__(self, module_name): - """Init LazyImport object. - - Args: - module_name (string): The name of module imported later - """ - self.module_name = module_name - self.module = None - - def __getattr__(self, name): - """Get the attributes of the module by name.""" - try: - self.module = importlib.import_module(self.module_name) - mod = getattr(self.module, name) - except: - spec = importlib.util.find_spec(str(self.module_name + "." 
 + name))
-            mod = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(mod)
-        return mod
-
-    def __call__(self, *args, **kwargs):
-        """Call the function in that module."""
-        function_name = self.module_name.split(".")[-1]
-        module_name = self.module_name.split(f".{function_name}")[0]
-        self.module = importlib.import_module(module_name)
-        function = getattr(self.module, function_name)
-        return function(*args, **kwargs)
-
-
-auto_gptq = LazyImport("auto_gptq")
-htcore = LazyImport("habana_frameworks.torch.core")
-
-
-################ Check available sys.module to decide behavior #################
-def is_package_available(package_name: str) -> bool:
-    """Check if the package exists in the environment without importing.
-
-    Args:
-        package_name (str): package name
-    """
-    from importlib.util import find_spec
-
-    package_spec = find_spec(package_name)
-    return package_spec is not None
-
-
-## check hpex
-if is_package_available("habana_frameworks"):
-    _hpex_available = True
-    import habana_frameworks.torch.hpex  # pylint: disable=E0401
-else:
-    _hpex_available = False
-
-
-@torch._dynamo.disable()
-@lru_cache(None)
-def is_hpex_available():
-    return _hpex_available
-
-
-def get_module(module, key):
-    """Get module from model by key name.
-
-    Args:
-        module (torch.nn.Module): original model
-        key (str): module name to be replaced
-    """
-    name_list = key.split(".")
-    for name in name_list:
-        module = getattr(module, name, None)
-    return module
-
-
-def set_module(model, key, new_module):
-    """Set new module into model by key name.
-
-    Args:
-        model (torch.nn.Module): original model
-        key (str): module name to be replaced
-        new_module (torch.nn.Module): new module to be inserted
-    """
-    module = model
-    name_list = key.split(".")
-    for name in name_list[:-1]:
-        if hasattr(module, name):
-            module = getattr(module, name)
-    setattr(module, name_list[-1], new_module)
-
-
-def get_scale_shape(weight, group_size):
-    """Computes the shape of the scale tensor for quantization based on the weight tensor and group size.
-
-    Args:
-        weight (torch.Tensor): The weight tensor of the layer.
-        group_size (int): The size of the groups for quantization.
-
-    Returns:
-        The shape of the scale tensor to be used for quantization.
-    """
-    if group_size == 0:
-        return 1
-    elif group_size == -1 or weight.shape[1] < group_size:
-        shape = weight.shape[0]
-    else:
-        shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size)
-
-    return shape
-
-
-def unsupported_meta_device(model):
-    """Checks whether the model has parameters on a meta device that auto_round cannot handle.
-
-    Args:
-        model: The model to be checked.
-
-    Returns:
-        bool: True if the model holds (partially) meta-device parameters and is unsupported, False otherwise.
-    """
-    target_device = None
-    for param in model.parameters():
-        if target_device is None:
-            target_device = param.device
-        if param.device != target_device:
-            if param.device.type == "meta" or target_device.type == "meta":
-                return True
-    if target_device.type == "meta":
-        if hasattr(model, "path"):
-            return False
-        else:
-            return True
-    return False
-
-
-def to_device(input, device=torch.device("cpu")):
-    """Moves input data to the specified device.
-
-    Args:
-        input: The input data to be moved.
-        device: The target device.
-
-    Returns:
-        The input data on the specified device.
- """ - if input is None: - return None - if isinstance(input, torch.Tensor): - return input.to(device) - if isinstance(input, dict) or isinstance(input, UserDict): - for inp in input.keys(): - input[inp] = to_device(input[inp], device) - - elif isinstance(input, list) or isinstance(input, tuple): - if len(input) == 0: - return input - input_res = [] - for inp in input: - input_res.append(to_device(inp, device)) - if isinstance(input, tuple): - input_res = tuple(input_res) - input = input_res - - return input - - -def mv_module_from_gpu(module, low_cpu_mem_usage=False): - """Moves module from gpu to cpu or meta if low_cpu_mem_usage is true. - - Args: - module: The module to be moved. - low_cpu_mem_usage: Whether to use low CPU memory. If true, move module to meta. - - Returns: - The module on the specified device. - """ - if hasattr(module, "device"): - target_device = "meta" if low_cpu_mem_usage else "cpu" - if module.device.type == target_device: - return module - else: - return module.to(target_device) - else: - if low_cpu_mem_usage: - return module.to("meta") - else: - return module.to("cpu") - - -def to_dtype(input, dtype=torch.float32): - """Moves input data to the specified data type. - - Args: - input: The input data to be moved. - dtype: The target data type. - - Returns: - The input data on the specified data type. - """ - if input is None: - return None - if isinstance(input, torch.Tensor): - return input.to(dtype) - if isinstance(input, dict) or isinstance(input, UserDict): - for inp in input.keys(): - input[inp] = to_dtype(input[inp], dtype) - - elif isinstance(input, list) or isinstance(input, tuple): - if len(input) == 0: - return input - input_res = [] - for inp in input: - input_res.append(to_dtype(inp, dtype)) - if isinstance(input, tuple): - input_res = tuple(input_res) - input = input_res - - return input - - -def check_is_cpu(device): - """Check if the device is a CPU. - - Args: - device: The device to be checked. - - Returns: - bool: True if the device is a CPU, False otherwise. - """ - return device == torch.device("cpu") or device == "cpu" - - -def get_common_prefix(paths): - # Split each path into components and find the common prefix - split_paths = [path.split(".") for path in paths] - common_prefix = split_paths[0] - for path in split_paths[1:]: - common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] - return ".".join(common_prefix) - - -def extract_block_names_to_str(quant_block_list): - if not isinstance(quant_block_list, (list, tuple)): - return None - # Extract common prefix for each list - prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] - # Join prefixes into a single string - return ",".join(prefixes) - - -def find_matching_blocks(model, all_blocks, to_quant_block_names): - """ - Find and return matching blocks in the model based on to_quant_block_names. - - Args: - model: The model (not used in this specific function but kept for completeness). - all_blocks: List of lists, where each inner list contains full block names in the model. - to_quant_block_names: Comma-separated string of target block names to match. - - Returns: - target_blocks: List of lists containing full paths of matching blocks in the model. 
- """ - if not to_quant_block_names: - return all_blocks - to_quant_block_list = to_quant_block_names - if isinstance(to_quant_block_names, list) or isinstance(to_quant_block_names, tuple): - return to_quant_block_names - if isinstance(to_quant_block_names, str): - to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")] - target_blocks = [] - for block_list in all_blocks: - matched_sublist = [] - for name in to_quant_block_list: - matches = [block for block in block_list if re.search(name, block)] - if matches: - matched_sublist.extend(matches) - if matched_sublist: - target_blocks.append(matched_sublist) - if not target_blocks: - raise ValueError( - "No block names matched. Please check the input for to_quant_block_name," - "or set to_quant_block_name to None to automatically match quantizable blocks." - ) - return target_blocks - - -def get_block_names(model, quant_vision=False): - """Get the block names for transformers-like networks. - - Args: - model: The model. - - Returns: - block_names: A list whose elements are list of block's layer names - """ - from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK - - def _search_block(name, module): - if hasattr(type(module), "__name__") and "ModuleList" in type(module).__name__: - return [(name, module)] - target_modules = [] - for n, m in module.named_children(): - if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - target_modules.append((".".join(filter(None, (name, n))), m)) - else: - target_modules.extend(_search_block(".".join(filter(None, (name, n))), m)) - return target_modules - - def _get_llm_block_names(model): - block_names = [] - target_modules = _search_block("", model) - - for i, target_m in enumerate(target_modules): - block_names.append([]) - for n, m in target_m[1].named_children(): - block_names[i].append(target_m[0] + "." + n) - return block_names - - def _get_vlm_block_names(model, quant_vision=False): - if ( - hasattr(model, "config") - and hasattr(model.config, "model_type") - and model.config.model_type in SPECIAL_MULTIMODAL_BLOCK.keys() - ): - return SPECIAL_MULTIMODAL_BLOCK.get(model.config.model_type)(model, quant_vision=quant_vision) - block_names = [] - target_modules = [] - vision_blocks_tuple = ("vision", "visual", "image", "img") - last_block_name = "" - for n, m in model.named_modules(): - if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - if quant_vision or all(key not in n.lower() for key in (vision_blocks_tuple)): - if last_block_name and last_block_name in n: - continue - target_modules.append((n, m)) - last_block_name = n - for i, target_m in enumerate(target_modules): - block_names.append([]) - for n, m in target_m[1].named_children(): - block_names[i].append(target_m[0] + "." + n) - return block_names - - if quant_vision or not is_pure_text_model(model): - return _get_vlm_block_names(model, quant_vision=quant_vision) - else: - return _get_llm_block_names(model) - - -def collect_best_params(block): - params = {} - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - params[n] = {} - for key in m.params.keys(): - params[n][key] = copy.deepcopy(m.params[key].data) - return params - - -def block_forward( - block: torch.nn.Module, - input_ids: torch.Tensor, - input_others: dict, - amp: bool = False, - amp_dtype: torch.dtype = torch.float16, - device: torch.device = torch.device("cpu"), - output_return_id: int = 0, -) -> Union[torch.Tensor, dict]: - """Performs a forward pass through a block with the given inputs. 
-
-    Args:
-        block: The block to perform the forward pass on.
-        input_ids: The input IDs.
-        input_others: A dictionary containing other input data.
-        amp: A boolean indicating whether to use automatic mixed precision.
-        amp_dtype: The data type for automatic mixed precision.
-        device: The target device.
-        output_return_id: if the output has more than one tensor, return the tensor at the specified index.
-
-    Returns:
-        output: The output of the forward pass.
-    """
-    if input_ids.device != device:
-        input_ids = to_device(input_ids, device)
-        input_others = to_device(input_others, device)
-    input_tuple = input_others.pop("positional_inputs", None)
-    if "alibi" in input_others.keys() and input_others["alibi"] is not None:
-        alibi = input_others["alibi"]
-        input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3])
-    if amp:
-        with autocast(device_type=device.split(":")[0], dtype=amp_dtype):  # pragma: no cover
-            output = block(input_ids, *input_tuple, **input_others)
-    else:
-        output = block(input_ids, *input_tuple, **input_others)
-    if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)):
-        output = output[output_return_id]
-    return output
-
-
-def check_to_quantized(config):
-    """Checks if the configuration is valid for quantization.
-
-    Args:
-        config (dict or object): The configuration to check. It can be either a
-            dictionary with a 'bits' key or an object with a 'bits' attribute.
-
-    Returns:
-        bool: True if the configuration is valid for quantization
-            (weight bits <= 8 or activation bits <= 8), False otherwise.
-    """
-    if isinstance(config, (dict, QuantizationScheme)):
-        bits = int(config.get("bits", 16))
-        act_bits = int(config.get("act_bits", 16))
-    elif hasattr(config, "orig_layer"):
-        bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16
-        act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16
-    else:
-        bits = int(config.bits) if hasattr(config, "bits") else 16
-        act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16
-
-    return bits <= 8 or act_bits <= 8
-
-
-def detect_device_count():
-    """Detects the number of available computation devices.
-
-    This function checks if CUDA is available. If it is, it returns the count
-    of available CUDA devices. If not, it attempts to import the Habana
-    device framework to return the count of Habana devices. If the import
-    fails or no devices are found, it returns 0.
-
-    Returns:
-        int: The number of available devices (CUDA or Habana).
-    """
-    if torch.cuda.is_available():
-        return torch.cuda.device_count()
-    else:
-        try:
-            import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401
-
-            return hthpu.device_count()
-        except ImportError:
-            return 0
-
-
-def detect_device(device: Union[str, int, torch.device] = None) -> str:
-    """Detects the appropriate computation device.
-
-    This function determines the device to use for computations. It can take
-    a specific device index or default to 'auto'. The function checks for
-    available devices in the following order: CUDA, Habana (HPU), XPU, and finally CPU.
-
-    Args:
-        device (str, int, or torch.device, optional): The desired device.
-            If 'auto' or None, the function will determine the best device
-            automatically.
-
-    Returns:
-        str: The device to use for computations, formatted as a string.
- """ - - def is_valid_digit(s): - try: - num = int(s) - return 0 <= num - except: - return False - - dev_idx = None - if is_valid_digit(device): - dev_idx = int(device) - device = "auto" - if isinstance(device, str) and "," in device: # device is "0,1,2" - device_list = [int(dev) for dev in device.split(",") if dev.isdigit()] - dev_idx = device_list[0] if device_list else None - device = "auto" - if device is None or device == "auto": - if torch.cuda.is_available(): - device = torch.device("cuda") - # logger.info("Using GPU device") - elif is_hpex_available(): # pragma: no cover - device = torch.device("hpu") - # logger.info("Using HPU device") - elif torch.xpu.is_available(): # pragma: no cover - device = torch.device("xpu") - # Use CPU as a fallback - else: - device = torch.device("cpu") - # logger.info("Using CPU device") - if dev_idx is not None and str(device) != "cpu": - device = str(device) + f":{dev_idx}" - return str(device) - elif isinstance(device, torch.device): - device = str(device) - elif isinstance(device, str): ## for cuda:0 - if device == "tp": # pragma: no cover - # should not specify card, e.g., cuda:0 - if torch.cuda.is_available(): - device = "cuda" - elif is_hpex_available(): - device = "hpu" - else: - device = "cpu" - else: - device = device - return device - - -class CpuInfo(object): - """Get CPU Info.""" - - def __init__(self): - """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" - self._bf16 = False - info = cpuinfo.get_cpu_info() - if "arch" in info and "X86" in info["arch"]: - cpuid = cpuinfo.CPUID() - max_extension_support = cpuid.get_max_extension_support() - if max_extension_support >= 7: - eax = cpuid._run_asm( - b"\xb9\x01\x00\x00\x00", # mov ecx, 1 - b"\xb8\x07\x00\x00\x00" b"\x0f\xa2" b"\xc3", # mov eax, 7 # cpuid # ret - ) - self._bf16 = bool(eax & (1 << 5)) - - @property - def bf16(self): - """Get whether it is bf16.""" - return self._bf16 - - -def is_local_path(path): - """Checks if a given path exists locally. - - Args: - path (str): The path to check. - - Returns: - bool: True if the path exists locally, False otherwise. - """ - format_list = ( - "json", - "txt", - ) - flag = None - for x in format_list: - flag = True if x in path else flag - return flag and os.path.exists(path) - - -def convert_dtype_str2torch(str_dtype): - """Converts a string dtype to its corresponding PyTorch dtype. - - Args: - str_dtype (str): The string representation of the dtype. - - Returns: - torch.dtype: The PyTorch dtype. - - Raises: - ValueError: If the input str_dtype is unsupported. - """ - if isinstance(str_dtype, torch.dtype) or str_dtype is None: - return str_dtype - if str_dtype == "int8": - return torch.int8 - elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": - return torch.float - elif str_dtype == "fp16" or str_dtype == "float16": - return torch.float16 - elif str_dtype == "bf16" or str_dtype == "bfloat16": - return torch.bfloat16 - else: - raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") - - -def convert_dtype_torch2str(dtype): - """Converts a PyTorch dtype to its corresponding string representation. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input dtype is unsupported. 
- """ - if isinstance(dtype, str) or dtype is None: - return dtype - if dtype == torch.int8: - return "int8" - elif dtype == torch.float: - return "fp32" - elif dtype == torch.float16: - return "fp16" - elif dtype == torch.bfloat16: - return "bf16" - elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: - return dtype - else: - raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.") - - -def convert_dtype_torch2str_hf(dtype): - """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input str_dtype is unsupported. - """ - if dtype is None: - return dtype - if isinstance(dtype, str): - if "float" not in dtype and "int" not in dtype: - dtype = convert_dtype_str2torch(dtype) - else: - return dtype - str_dtype = str(dtype) - if "." not in str_dtype: - raise ValueError(f"Unsupported pytorch dtype '{dtype}' for conversion to huggingface str dtype") - str_dtype = str_dtype.split(".")[1] - return str_dtype - - -def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): - """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. - - Args: - device (str): The device type ('cuda' for GPU or 'hpu' for HPU). - inputs (torch.Tensor): Input tensor. - weight (torch.Tensor): Weight tensor. - org_seqlen (int): Original sequence length. - org_bs (int): Original batch size. - - Returns: - tuple: A tuple containing availability status (bool), modified sequence length (int), - and modified batch size (int). - """ - weight_memory = weight.numel() * weight.element_size() - if "cuda" in device: - current_gpu_index = torch.cuda.current_device() - total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory - used_memory = torch.cuda.memory_allocated(current_gpu_index) - free_space = total_memory - used_memory - elif "hpu" in device: # pragma: no cover - current_hpu_index = torch.hpu.current_device() - free_space = torch.hpu.memory_reserved(current_hpu_index) - else: - return True, org_seqlen, org_bs - - free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage - seqlen = org_seqlen - bs = org_bs - in_feature = weight.shape[1] - out_feature = weight.shape[0] - while seqlen >= 128: - input_size = bs * seqlen * in_feature - output_size = bs * seqlen * out_feature - input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) - if input_output_memory < free_space: - return True, seqlen, bs - seqlen = seqlen // 2 - bs = 1 - - return False, seqlen, bs - - -def get_layer_names_in_block( - model: torch.nn.Module, - supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), - quant_block_list: list = None, - class_names: tuple = None, -) -> list[str]: - """Retrieves the names of layers within each block of the model. - - Returns: - list: A list of strings, where each string is the name of a layer - within a block of the model. 
- """ - if class_names is None: - class_names = [] - for n, m in model.named_modules(): - if type(m) in supported_types or (class_names is not None and m.__class__.__name__ in class_names): - m.bk_tmp_name = n - layers_in_block = [] - if bool(quant_block_list): - all_blocks = quant_block_list - else: - all_blocks = get_block_names(model) - for block_names in all_blocks: - for block_name in block_names: - block = get_module(model, block_name) - for n, m in block.named_modules(): - if hasattr(m, "bk_tmp_name"): - layers_in_block.append(m.bk_tmp_name) - delattr(m, "bk_tmp_name") - return layers_in_block - - -def is_autoround_exllamav2_available(): - """Checks if the AutoRound ExLlamaV2 kernels are available. - - Returns: - bool: - True if the AutoRound ExLlamaV2 kernels are available, False otherwise. - """ - res = True - try: - from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix - except ImportError as e: - res = False - return res - - -def get_library_version(library_name): - from packaging.version import Version - - python_version = Version(sys.version.split()[0]) - if python_version < Version("3.8"): - import warnings - - warnings.filterwarnings("ignore", category=DeprecationWarning) - import pkg_resources # pylint: disable=E0401 - - try: - version = pkg_resources.get_distribution(library_name).version - return version - except pkg_resources.DistributionNotFound: - return f"{library_name} is not installed" - else: - import importlib.metadata # pylint: disable=E0401 - - try: - version = importlib.metadata.version(library_name) - return version - except importlib.metadata.PackageNotFoundError: - return f"{library_name} is not installed" - - -def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): - """ - Configures and returns a QuantLinear class based on the specified backend and parameters. - - Args: - backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin", - "exllama", and "cuda". - bits (int, optional): The number of bits for quantization. Default is 4. - group_size (int, optional): The group size for quantization. Default is 128. - sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False. - - Returns: - class: The dynamically imported QuantLinear class configured according to the specified parameters. 
- """ - use_triton = True - if bits not in [2, 4, 8]: - use_triton = False - disable_exllamav2 = True - disable_exllamav1 = False - disable_marlin = True - use_qigen = False - if "qigen" in backend: - use_triton = False - use_qigen = True - elif "triton" in backend: - use_triton = True - elif "marlin" in backend and sym: - use_triton = False - disable_marlin = False - elif "exllama" in backend: ##need v1 code to export - use_triton = True ##same with triton - disable_marlin = True - elif "cuda" in backend: - use_triton = False - disable_marlin = True - disable_exllamav2 = True - disable_exllamav1 = True - if use_triton: - from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear - - return QuantLinear - try: - import auto_gptq # pylint: disable=E0401 - except: - logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}") - exit() - - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401 - - version = get_library_version("auto_gptq") - from packaging.version import Version - - if Version(version) < Version("0.7.2"): - QuantLinear = dynamically_import_QuantLinear( - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - disable_marlin=disable_marlin, - ) - else: - QuantLinear = dynamically_import_QuantLinear( # pylint: disable=E1123 - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - use_marlin=not disable_marlin, - ) - return QuantLinear - - -def _clear_memory_for_cpu_and_cuda(tensor=None): - if isinstance(tensor, list): - for i in range(len(tensor)): - tensor[i] = None - if tensor is not None: - del tensor - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - if torch.xpu.is_available(): - torch.xpu.empty_cache() - - -@torch._dynamo.disable() -def clear_memory(tensor=None): - if is_hpex_available(): - # hpu does not have empty_cache - return - else: - _clear_memory_for_cpu_and_cuda(tensor) - - -def compare_versions(v1, v2): - return version.parse(v1) >= version.parse(v2) - - -def torch_version_at_least(version_string): - return compare_versions(torch.__version__, version_string) - - -TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99") -TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0") -TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0") -TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0") - - -# Note on HPU usage: -# There are two modes available for enabling auto-round on HPU: -# 1. Compile Mode -# 1) Use PyTorch version ≥ 2.4 (Intel® Gaudi® v1.18 or later) -# 2) Set `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1` -# The compile mode can speed up quantization process but still in experimental stage. -# 2. 
Lazy Mode (By default) - - -def is_hpu_lazy_mode(): - return os.getenv("PT_HPU_LAZY_MODE") != "0" - - -def _use_hpu_compile_mode(): - return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode() - - -def compile_func_on_hpu(func): - if _use_hpu_compile_mode(): - return torch.compile(func, backend="hpu_backend") - return func - - -def compile_func_on_cuda_or_cpu(func): - return torch.compile(func) - - -def compile_func( - fun: Union[torch.nn.Module, Callable], device: Union[str, torch.device, int] -) -> Union[torch.nn.Module, Callable]: - """Compile function on the specified device.""" - if "hpu" in str(device): - return compile_func_on_hpu(fun) ## use auto by default - else: - return compile_func_on_cuda_or_cpu(fun) - - -def is_numba_available(): # pragma: no cover - """Check if Numba is available.""" - try: - import numba - - return True - except ImportError: - return False - - -def _is_tbb_installed(): # pragma: no cover - import importlib.metadata - - try: - importlib.metadata.version("tbb") - return True - except importlib.metadata.PackageNotFoundError: - return False - - -def _is_tbb_configured(): # pragma: no cover - try: - from numba.np.ufunc.parallel import _check_tbb_version_compatible - - # check if TBB is present and compatible - _check_tbb_version_compatible() - - return True - except ImportError as e: - logger.warning_once(f"TBB not available: {e}") - return False - - -def is_tbb_available(): # pragma: no cover - """Check if TBB is available.""" - if not _is_tbb_installed(): - logger.warning_once("TBB is not installed, please install it with `pip install tbb`.") - return False - if not _is_tbb_configured(): - logger.warning_once( - ( - "TBB is installed but not configured correctly. \n" - "Please add the TBB library path to `LD_LIBRARY_PATH`, " - "for example: `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/`." - ) - ) - return False - return True - - -def can_pack_with_numba(): # pragma: no cover - """Check if Numba and TBB are available for packing. - - To pack tensor with Numba, both Numba and TBB are required, and TBB should be configured correctly. - """ - if not is_numba_available(): - logger.warning_once("Numba is not installed, please install it with `pip install numba`.") - return False - if not is_tbb_available(): - return False - return True - - -def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): - """Identifies and returns layers in the model to exclude from quantization. - - This function processes a comma-separated list of fully precision (FP) layers, - matches them to the names of layers in the model, and returns a list of such - layers to exclude from quantization. - - Args: - model (torch.nn.Module): The model whose layers will be inspected. - fp_layers (str): A comma-separated string of layer names to be excluded - from quantization. Whitespace is ignored in this string. - - Returns: - list: A list of layer names that match the specified FP layers or are - subcomponents of those layers. - """ - if not fp_layers: - return [] - fp_layers = fp_layers.replace(" ", "").split(",") - all_layer_names = [] - for n, m in model.named_modules(): - if type(m) in SUPPORTED_LAYER_TYPES: - all_layer_names.append(n) - not_to_quantized_layers = [] - - for fp_layer in fp_layers: - if fp_layer == "": - continue - if fp_layer in all_layer_names: - not_to_quantized_layers.append(fp_layer) - continue - if fp_layer[-1].isdigit(): - fp_layer = fp_layer + "." 
##tricky setting - for name in all_layer_names: - if fp_layer in name: - not_to_quantized_layers.append(name) - logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}") - return not_to_quantized_layers - - -def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None): - """Checks if a model is compatible with the AutoAWQ GEMM kernel. - - Args: - model: The model object to evaluate, typically a PyTorch model. - bits (int): The number of bits for quantization (must be 4 for compatibility). - group_size (int): The group size for quantization. - sym (bool): Whether symmetric quantization is used (not utilized in the current function logic). - layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each - configuration can specify a custom number of bits for the layer. - - Returns: - tuple: A tuple containing: - - bool: `True` if the model is compatible, `False` otherwise. - - str: An error message describing why the model is incompatible, or an empty string if compatible. - """ - if bits != 4: - return False, "AutoAWQ GEMM kernel only supports 4 bits" - for n, m in model.named_modules(): - if type(m) == transformers.pytorch_utils.Conv1D: - return False, "AutoAWQ GEMM kernel does not support conv1d" - - layer_names = get_layer_names_in_block(model) - for layer_name in layer_names: - if ( - layer_configs is not None - and layer_name in layer_configs.keys() - and layer_configs[layer_name].get("bits", bits) > 8 - ): - continue - - layer = get_module(model, layer_name) - if layer.in_features % group_size != 0: - return False, f"Layer {layer_name} in_features is not multiple of group_size {group_size}" - if layer.out_features % (32 // bits) != 0: - return False, f"Layer {layer_name} out_features is not multiple of 32 // bits" - - return True, "" - - -def get_device_and_parallelism(device: Union[str, torch.device, int]) -> Tuple[str, bool]: - if isinstance(device, str): - devices = device.replace(" ", "").split(",") - elif isinstance(device, int): - devices = [str(device)] - else: - devices = [device] - if all(s.isdigit() for s in devices) and len(devices) > 1 and torch.cuda.is_available(): - device = "cuda" - parallelism = True - elif all(s.isdigit() for s in devices) and len(devices) > 1 and torch.xpu.is_available(): - device = "xpu" - parallelism = False - # pragma: no cover - elif device == "auto": - device = detect_device(device) - parallelism = True - else: - device = detect_device(device) - parallelism = False - return device, parallelism - - -def set_cuda_visible_devices(device): - devices = device.replace(" ", "").split(",") - if all(s.isdigit() for s in devices): - if "CUDA_VISIBLE_DEVICES" in os.environ: - current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] - current_visible_devices = current_visible_devices.split(",") - indices = [int(device) for device in devices] - try: - pick_device = [current_visible_devices[i] for i in indices] - except: - raise ValueError( - "Invalid '--device' value: It must be smaller than the number of available devices." - " For example, with CUDA_VISIBLE_DEVICES=4,5, " - "--device 0,1 is valid, but --device 4,5 is not supported." - ) - visible_devices = ",".join(pick_device) - os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices - else: - os.environ["CUDA_VISIBLE_DEVICES"] = device - - -def is_debug_mode(): - """Checks if the Python interpreter is running in debug mode. - - Returns: - bool: True if debugging is enabled, False otherwise. 
-    """
-    return sys.gettrace() is not None or sys.flags.debug == 1
-
-
-def get_layer_features(layer):
-    """Extracts input and output feature dimensions for supported layers."""
-    if type(layer) == torch.nn.Linear:
-        return layer.in_features, layer.out_features
-    elif type(layer) == transformers.pytorch_utils.Conv1D:  # TODO: Verify correctness
-        return layer.weight.shape[0], layer.weight.shape[1]
-    elif isinstance(layer, torch.nn.Embedding):
-        return layer.num_embeddings, layer.embedding_dim
-    elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce):
-        return layer.weight.shape[1], layer.weight.shape[0]  # (input_dim, output_dim)
-    elif "FP8Linear" in layer.__class__.__name__:
-        return layer.in_features, layer.out_features
-    return None, None  # Unsupported layer type
-
-
-def get_gguf_architecture(dir_model, model_type=ModelType.TEXT):
-    from auto_round.export.export_to_gguf.convert_hf_to_gguf import (
-        ModelBase,
-        get_model_architecture,
-    )
-
-    is_mistral_format = False
-    if isinstance(dir_model, str):
-        dir_model = Path(dir_model)
-
-    hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
-    if isinstance(hparams, dict):
-        tmp_model_type = hparams["model_type"]
-    else:
-        tmp_model_type = hparams.model_type
-    if "mistral" == tmp_model_type:
-        is_mistral_format = True
-        hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
-    if not is_mistral_format:
-        model_class = get_model_architecture(hparams, model_type)
-    elif model_type == ModelType.MMPROJ:
-        assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
-        model_class = "PixtralModel"
-    else:
-        model_class = "MistralModel"
-    return model_class
-
-
-def _gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT):
-    import argparse
-
-    from auto_round.export.export_to_gguf.convert import download_convert_file
-    from auto_round.utils import logger
-
-    formats = sorted(formats, key=lambda x: len(x))
-    export_gguf = False
-    for f in formats:
-        if f.startswith("gguf"):
-            export_gguf = True
-
-        if f.startswith("gguf") and f not in GGUF_CONFIG:
-            logger.error(f"{f} is not supported, please check.")
-
-    redownload = False
-    if export_gguf:
-        try:
-            from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
-                ModelBase,
-                ModelType,
-                get_model_architecture,
-            )
-
-            if isinstance(args_or_ar.model, str):
-                model_path = args_or_ar.model
-            else:
-                model_path = args_or_ar.model.name_or_path
-            if not os.path.isdir(model_path):
-                model_path = download_hf_model(model_path)
-            model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
-            if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
-                logger.warning(
-                    f"Current version of gguf export does not support {model_architecture},"
-                    " will re-download the dependency file."
-                )
-                redownload = True
-        except ModuleNotFoundError as e:
-            if "convert_hf_to_gguf" in str(e):
-                logger.warning("GGUF export dependency file is not found, downloading it from GitHub.")
-                redownload = True
-        except AttributeError as e:
-            raise ImportError(
-                "Please use the latest gguf-py, you can use the following command to install it:\n"
-                "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
-            )
-        download_convert_file(redownload)
-
-        try:
-            from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
-                ModelBase,
-                ModelType,
-            )
-        except ImportError as e:
-            raise ImportError(
-                "Please use the latest gguf-py, you can use the following command to install it:\n"
-                "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
-            )
-        if isinstance(args_or_ar.model, str):
-            model_path = args_or_ar.model
-        else:
-            model_path = args_or_ar.model.name_or_path
-        if not os.path.isdir(model_path):
-            model_path = download_hf_model(model_path)
-        model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
-        if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
-            logger.error(f"Model {model_architecture} is not supported for gguf format export.")
-            sys.exit(1)
-
-    pattern = re.compile(r"q\d_k")
-    pre_dq_format = ""
-    unsupported_list, reset_list = [], []
-    for format in GGUF_CONFIG:
-        if format in formats:
-            if format == "q6_k_s":
-                logger.warning("Please note that q6_k_s is q6_k.")
-
-            if re.search(pattern, format):
-                if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format:
-                    logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.")
-                    sys.exit(-1)
-                else:
-                    pre_dq_format = format
-
-            unsupported_list, reset_list = [], []
-            gguf_config = GGUF_CONFIG[format]
-            for k, v in gguf_config.items():
-                if not hasattr(args_or_ar, k):
-                    continue
-                if k == "data_type":
-                    if re.search(r"q\d_1", format) and len(formats) > 1:
-                        v = "int"
-                if k == "sym" and isinstance(args_or_ar, argparse.Namespace):
-                    k = "asym"
-                    v = not v
-                if getattr(args_or_ar, k) != v:
-                    unsupported_list.append(f"{k}={getattr(args_or_ar, k)}")
-                    reset_list.append(f"{k}={v}")
-                    setattr(args_or_ar, k, v)
-            if len(unsupported_list) > 0:
-                logger.info(
-                    f"format {format} does not support {', '.join(unsupported_list)},"
-                    f" reset to {', '.join(reset_list)}."
-                )
-    return args_or_ar
-
-
-def _to_model_dtype(model, model_dtype):
-    if model_dtype is not None:
-        try:
-            if (model_dtype == "float16" or model_dtype == "fp16") and model.dtype != torch.float16:
-                model = model.to(torch.float16)
-            elif (
-                model_dtype == "bfloat16" or model_dtype == "bfp16" or model_dtype == "bf16"
-            ) and model.dtype != torch.bfloat16:
-                model = model.to(torch.bfloat16)
-            elif (model_dtype == "float32" or model_dtype == "fp32") and model.dtype != torch.float32:
-                model = model.to(torch.float32)
-        except:
-            logger.error("please use more devices to fit the model, or just use one device")
-            exit()
-    return model
-
-
-def set_fake_cuda_device_capability(func=None):
-    if func is not None:
-        torch.cuda.get_device_capability = func
-        return func
-
-    def fake_cuda():
-        return 100, 1
-
-    orig_func = torch.cuda.get_device_capability
-    torch.cuda.get_device_capability = fake_cuda
-    return orig_func
-
-
-def _is_fp8_model(model: torch.nn.Module) -> bool:
-    if not hasattr(model, "is_fp8"):
-        return False
-    else:
-        return model.is_fp8
-
-
-def _is_fp8_linear(module: torch.nn.Module) -> bool:
-    if hasattr(module, "is_fp8_linear"):
-        return module.is_fp8_linear
-    if not (type(module) == torch.nn.Linear or module.__class__.__name__ == "FP8Linear"):
-        return False
-    if module.weight is None:
-        return False
-    if str(module.weight.dtype).startswith("torch.float8"):
-        return True
-    else:
-        return False
-
-
-def check_and_mark_fp8_model(model: torch.nn.Module) -> bool:
-    if _is_fp8_model(model):
-        return True
-    for n, m in model.named_modules():
-        if _is_fp8_linear(m):
-            m.is_fp8_linear = True
-            if not hasattr(model, "is_fp8"):
-                model.is_fp8 = True
-    if hasattr(model, "is_fp8"):
-        return True
-    return False
-
-
-def llm_load_model(
-    pretrained_model_name_or_path,
-    trust_remote_code=True,
-    model_dtype=None,
-    device="cpu",
-    low_cpu_mem_mode=0,
-    low_cpu_mem_tmp_dir=None,
-    **kwargs,
-):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
-
-    device_str, use_auto_mapping = get_device_and_parallelism(device)
-    torch_dtype = "auto"
-    if device_str is not None and "hpu" in device_str:
-        torch_dtype = torch.bfloat16
-
-    is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower()))
-    low_cpu_mem_usage = False
-
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
-
-    model_cls = AutoModel if is_glm else AutoModelForCausalLM
-    if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code:
-        logger.warning("trust_remote_code is enabled by default, please ensure its correctness.")
-
-    if low_cpu_mem_tmp_dir is None:
-        low_cpu_mem_tmp_dir = "low_cpu_mem_tmp"
-    if low_cpu_mem_mode == 2:
-        from auto_round.low_cpu_mem.utils import load_model_with_hooks
-
-        model = load_model_with_hooks(
-            pretrained_model_name_or_path,
-            model_cls,
-            device=device,
-            clean_weight=True,
-            saved_path=low_cpu_mem_tmp_dir,
-            torch_dtype=torch_dtype,
-            trust_remote_code=trust_remote_code,
-        )
-    elif low_cpu_mem_mode == 1:
-        from auto_round.low_cpu_mem.utils import load_empty_model
-
-        low_cpu_mem_usage = True
-        model = load_empty_model(
-            pretrained_model_name_or_path,
-            model_cls,
-            device=device,
-            saved_path=low_cpu_mem_tmp_dir,
-            torch_dtype=torch_dtype,
-            trust_remote_code=trust_remote_code,
-        )
-    else:
-        if _use_hpu_compile_mode():
-            model = model_cls.from_pretrained(
-                pretrained_model_name_or_path,
-                torch_dtype=torch_dtype,
-                attn_implementation="eager",
-                trust_remote_code=trust_remote_code,
-                
device_map="auto" if use_auto_mapping else None, - ) - else: - try: - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - device_map="auto" if use_auto_mapping else None, - ) - except ValueError as e: - if "FP8 quantized" in str(e): - orig_func = set_fake_cuda_device_capability() - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - device_map="auto" if use_auto_mapping else None, - ) - torch.cuda.get_device_capability = orig_func - logger.warning("the support for fp8 model as input is experimental, please use with caution.") - else: - raise - - except OSError as e: - logger.warning( - f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry." - ) - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=False, - device_map="auto" if use_auto_mapping else None, - ) - - model = model.eval() - check_and_mark_fp8_model(model) - model = _to_model_dtype(model, model_dtype) - - return model, tokenizer, low_cpu_mem_usage - - -def mllm_load_model( - pretrained_model_name_or_path, - device="cpu", - torch_dtype="auto", - use_auto_mapping=True, - trust_remote_code=True, - model_dtype=None, - **kwargs, -): - import transformers - from huggingface_hub import HfApi, HfFileSystem, hf_hub_download - from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer - - device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" - if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 - if os.path.isdir(pretrained_model_name_or_path): - config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) - else: - from huggingface_hub import hf_hub_download, list_repo_files - - file_list = list_repo_files(pretrained_model_name_or_path) - if "config.json" in file_list: - # Load plain JSON - config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") - with open(config_path, "r", encoding="utf-8") as f: - config = json.load(f) - elif "config.json.gz" in file_list: - # Load gzipped JSON - import gzip - - config_path = hf_hub_download(pretrained_model_name_or_path, "config.json.gz") - with gzip.open(config_path, "rt", encoding="utf-8") as f: - config = json.load(f) - else: - raise FileNotFoundError(f"No config.json or config.json.gz found for {pretrained_model_name_or_path}") - - if "model_type" in config: - model_type = config["model_type"] - else: - model_type = None - - processor, image_processor = None, None - if "deepseek_vl_v2" == model_type: - from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor # pylint: disable=E0401 - - processor = DeepseekVLV2Processor.from_pretrained(pretrained_model_name_or_path) - tokenizer = processor.tokenizer - model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - else: - architectures = config["architectures"][0] - if architectures == "LlavaLlamaForCausalLM": - from llava.model.builder import load_pretrained_model # pylint: disable=E0401 - - tokenizer, model, image_processor, _ = load_pretrained_model( - pretrained_model_name_or_path, - model_base=None, - model_name=pretrained_model_name_or_path, - 
torch_dtype=torch_dtype, - ) - else: - if architectures.endswith("Model") and hasattr( - transformers, n := architectures.replace("Model", "ForConditionalGeneration") - ): - cls = getattr(transformers, n) - elif hasattr(transformers, architectures): - cls = getattr(transformers, architectures) - else: - cls = AutoModelForCausalLM - try: - model = cls.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - except ValueError as e: - if "FP8 quantized" in str(e): - orig_func = set_fake_cuda_device_capability() - model = cls.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - torch.cuda.get_device_capability = orig_func - logger.warning("the support for fp8 model as input is experimental, please use with caution.") - else: - raise - - if "Mistral-Small-3.2" in pretrained_model_name_or_path: - from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pylint: disable=E0401 - - if os.path.isdir(pretrained_model_name_or_path): - tokenizer = MistralTokenizer.from_file(os.path.join(pretrained_model_name_or_path, "tekken.json")) - else: - tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path) - else: - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code - ) - processor = AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code - ) - try: - from transformers import AutoImageProcessor - - image_processor = AutoImageProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code - ) - except Exception as e: - pass - - model = model.eval() - check_and_mark_fp8_model(model) - model = _to_model_dtype(model, model_dtype) - - return model, processor, tokenizer, image_processor - - -def diffusion_load_model( - pretrained_model_name_or_path: str, - device: Union[str, torch.device] = "cpu", - torch_dtype: Union[str, torch.dtype] = "auto", - use_auto_mapping: bool = False, - trust_remote_code: bool = True, - model_dtype: str = None, - **kwargs, -): - device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" - if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 - - pipelines = LazyImport("diffusers.pipelines") - - pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( - pretrained_model_name_or_path, torch_dtype=torch_dtype - ) - pipe = _to_model_dtype(pipe, model_dtype) - model = pipe.transformer - return pipe, model.to(device) - - -def is_pure_text_model(model): - """verify on: phi-3.5, Mistral-Small-3.1, gemma-3, qwen2-vl,""" - if hasattr(model, "config") and hasattr(model.config, "vision_config"): - return False - if hasattr(model.__class__, "main_input_name") and model.__class__.main_input_name != "input_ids": - return False - for module in model.modules(): - if hasattr(module.__class__, "main_input_name") and module.__class__.main_input_name != "input_ids": - return False - if "vision" in str(module.__class__).lower(): - return False - if "image" in str(module.__class__).lower(): - return False - if "img" in str(module.__class__).lower(): - return False - return True - - -def reset_params(inputs): - """ - Resets specific input parameters to avoid saving the key-value cache during fine-tuning. 
- - Args: - inputs (dict): Dictionary of model inputs. - - Modifies: - inputs (dict): Sets "use_cache" to False if the key is present. - """ - if "use_cache" in inputs.keys(): # Not storing kv cache - inputs["use_cache"] = False - - -def check_skippable_keywords(key): - """ - Prints a reminder if a key is not stored during quantization fine-tuning. - """ - skippable_cache_keys = ("past_key_value",) - for cache_key in skippable_cache_keys: - if cache_key not in key: - return True - return False - - -def init_cache(positional_inputs, inputs): - """ - Initializes special model inputs by adding positional inputs if missing. - - Args: - positional_inputs (list): List of positional inputs to add to inputs. - inputs (dict): Dictionary of model inputs. - - Modifies: - inputs (dict): Adds "positional_inputs" key if not present. - """ - if "positional_inputs" not in inputs: # for chatglm Series - inputs["positional_inputs"] = [] - for idx, item in enumerate(positional_inputs): - inputs["positional_inputs"] = to_device(positional_inputs) - - -def get_shared_keys(model): - """ - Retrieves shared keys from the model's state dictionary. - - Args: - model (torch.nn.Module): The model to retrieve shared keys from. - - Returns: - tuple: tuple of shared keys. - """ - from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS - - shared_keys = SHARED_CACHE_KEYS - shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) - return shared_keys - - -def get_model_dtype(model_dtype, default="auto"): - if model_dtype is None or model_dtype == "auto": - model_dtype = default - elif model_dtype in ["bf16", "bfloat16"]: - model_dtype = "bfloat16" - elif model_dtype in ["f16", "float16", "fp16"]: - model_dtype = "float16" - elif model_dtype in ["f32", "float32", "fp32"]: - model_dtype = "float32" - else: - logger.warning(f"Unable to identify model_dtype {model_dtype}, reset to default model_dtype {default}") - model_dtype = default - return model_dtype - - -def str2bool(v): - import argparse - - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def filter_quantization_config(quantization_config): - default_dict = { - "amp": True, - "batch_size": 8, - "data_type": int, - "dataset": "NeelNanda/pile-10k", - "enable_minmax_tuning": True, - "enable_norm_bias_tuning": False, - "enable_quanted_input": True, - "gradient_accumulate_steps": 1, - "iters": 200, - "low_gpu_mem_usage": False, - "nsamples": 128, - "scale_dtype": "torch.float16", - "seqlen": 2048, - } - iters = quantization_config.get("iters", 200) - - default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3 - default_dict["minmax_lr"] = default_dict["lr"] - - for key in default_dict: - if key in quantization_config and default_dict[key] == quantization_config[key]: - quantization_config.pop(key) - for k in list(quantization_config.keys()): - if quantization_config[k] is None: - quantization_config.pop(k) - - if quantization_config.get("act_bits", 16) >= 16: - quantization_config.pop("act_bits", None) - quantization_config.pop("act_data_type", None) - quantization_config.pop("act_dynamic", None) - quantization_config.pop("act_sym", None) - quantization_config.pop("act_group_size", None) - - -def check_start_with_block_name(name: str, block_name_to_quantize: list): - """ - Checks if the given layer name starts with any of the block names to be 
quantized.
-
-    Args:
-        name (str): The name of the layer.
-        block_name_to_quantize (list): A list of block names to check against.
-
-    Returns:
-        bool: True if the layer name starts with any of the block names, False otherwise.
-    """
-    for block_name in block_name_to_quantize:
-        if name.startswith(block_name):
-            return True
-    return False
-
-
-def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None):
-    """
-    Check whether the input sequence length is within the limits defined
-    by the tokenizer and the model configuration.
-
-    Args:
-        input_seqlen (int): The length of the input sequence.
-        tokenizer: Optional, a HuggingFace tokenizer object.
-        model: Optional, a HuggingFace model object.
-
-    Raises:
-        ValueError: If the input length is not valid.
-    """
-    if model is not None and hasattr(model, "config"):
-        model_config = model.config
-        if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings:
-            raise ValueError(
-                f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings("
-                f"{model_config.max_position_embeddings}). Please lower '--seqlen'."
-            )
-    if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length:
-        raise ValueError(
-            f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). "
-            "Please consider lowering '--seqlen' or increasing tokenizer.model_max_length."
-        )
-
-
-def _use_more_bits(i_layer: int, n_layer: int):
-    return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2)
-
-
-def _get_digital_in_layer_name(layer_name):
-    pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)")
-    res = re.search(pattern, layer_name)
-    if res:
-        return int(res[2])
-    else:
-        return None
-
-
-def _search_gguf_type(gguf_type):
-    if gguf_type in GGUF_INNER_CONFIG:
-        return gguf_type
-    pattern = re.compile("gguf:q([0-9]{1,})_[01k]")
-    bits = re.search(pattern, gguf_type)
-    if not bits:
-        raise KeyError(f"{gguf_type} is not a correct gguf type, please check")
-
-    for suffix in ["_k", "_0", "_1"]:
-        if gguf_type.endswith(suffix):
-            continue
-        if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG:
-            return tmp_type
-    return None
-
-
-def _gguf_type_fallback(gguf_type: str) -> str:
-    gguf_type = gguf_type.lower()
-    if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"):
-        gguf_type = "gguf:q5_0"
-    elif gguf_type == "gguf:q5_k":
-        gguf_type = "gguf:q5_0"
-    elif gguf_type == "gguf:q6_k":
-        gguf_type = "gguf:q8_0"
-    return gguf_type
-
-
-##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129
-def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT):
-    # # TODO: support for other format later
-    # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None)
-
-    import gguf  # pylint: disable=E0401
-
-    # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture
-    convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf")
-
-    model_architecture = convert_hf_to_gguf.get_model_architecture(
-        hparams=model.config.to_dict(), model_type=model_type
-    )
-    try:
-        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type)
-    except NotImplementedError:
-        return layer_config, {}
-
-    n_layer = None
-    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]:
-        sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config"
-        if hasattr(model.config, name):
-            n_layer = getattr(model.config, name)
-            break
-        if hasattr(model.config, sub_attr):
-            if hasattr(getattr(model.config, sub_attr), name):
-                n_layer = getattr(getattr(model.config, sub_attr), name)
-                break
-    if n_layer is None:
-        return layer_config, {}
-
-    tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer)
-
-    def _set_config(config, target_config):
-        for k, v in target_config.items():
-            if isinstance(config, dict):
-                config[k] = v
-            else:
-                setattr(config, k, v)
-        return config
-
-    gguf_format_config = {}
-    lm_head_name = get_lm_head_name(model)
-    inner_gguf_format = GGUF_CONFIG[target_gguf_format]["mostly"]
-    # ggml_type = getattr(gguf.GGMLQuantizationType,inner_gguf_format.split(":")[-1].upper())
-    block_size = GGML_QUANT_SIZES[inner_gguf_format.split(":")[-1].lower()][0]
-    tie_word_embeddings = True
-    if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
-        tie_word_embeddings = model.config.tie_word_embeddings
-
-    n_gqa = 1
-    if (
-        hasattr(model, "config")
-        and hasattr(model.config, "num_attention_heads")
-        and hasattr(model.config, "num_key_value_heads")
-    ):
-        n_gqa = model.config.num_attention_heads // model.config.num_key_value_heads
-    n_expert = 0
-    for name in ["num_experts", "num_local_experts", "n_routed_experts"]:
-        if hasattr(model.config, name):
-            n_expert = getattr(model.config, name)
-
-    i_attention_wv = 0
-    i_ffn_down = 0
-    layer_config_copy = copy.deepcopy(layer_config)
-    target_bits = None
-    if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit():
-        target_bits = int(inner_gguf_format[6])
-
-    for layer_name, config in layer_config_copy.items():
-        if not check_to_quantized(config):
-            continue
-        new_type = GGUF_CONFIG[target_gguf_format]["mostly"]
-        layer = get_module(model, layer_name)
-        if type(layer) == transformers.pytorch_utils.Conv1D:
-            input_features = layer.weight.shape[0]
-        else:
-            input_features = layer.weight.shape[-1]
-        i_layer = _get_digital_in_layer_name(layer_name)
-
-        if lm_head_name is not None and layer_name == lm_head_name:
-            target_bits = int(re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["lm_head"]).group(1))
-        if isinstance(layer, torch.nn.Embedding):
-            target_bits = int(
-                re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1)
-            )
-
-        gguf_name = tensor_map.get_name(layer_name)
-        bits_index = 6
-        if config.get("fixed_by_user", False):
-            if "bits" not in config:
-                logger.warning(
-                    f"Setting layer_config requires providing bits, {layer_name} has no bits,"
-                    f" using bits={target_bits} instead."
-                )
-                new_type = new_type[:bits_index] + str(target_bits) + new_type[bits_index + 1 :]
-            else:
-                config_tmp = config.copy()
-                scheme_keys = [f.name for f in fields(QuantizationScheme)]
-                for key in config.keys():
-                    if key not in scheme_keys:
-                        config_tmp.pop(key, None)
-                matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp))  # check matched
-                if not matched_scheme:
-                    if config.get("super_group_size", None) is not None:
-                        new_type = new_type[:bits_index] + str(config["bits"]) + "_k"
-                    if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG:
-                        prefix_idx = 0 if config.get("sym", True) else 1
-                        new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}"
-                        if new_type not in GGUF_INNER_CONFIG:
-                            new_type = new_type[:bits_index] + str(config["bits"]) + f"_{1-prefix_idx}"
-                        if new_type not in GGUF_INNER_CONFIG:
-                            raise ValueError(
-                                f"the setting in layer_config {layer_name} "
-                                f"could not match any supported gguf format, please check."
-                            )
-                    else:
-                        logger.warning_once(
-                            f"the setting in layer_config {layer_name} "
-                            f"could not match any supported gguf format, reset to {new_type}"
-                        )
-                new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
-                new_type = _search_gguf_type(new_type)
-                if new_type is None:
-                    raise ValueError(f"invalid bit setting for {layer_name}")
-        elif target_bits is not None and "bits" in config and config["bits"] != target_bits:
-            new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
-            new_type = _search_gguf_type(new_type)
-            if new_type is None:
-                raise ValueError(f"invalid bit setting for {layer_name}")
-        elif lm_head_name is not None and layer_name == lm_head_name and not tie_word_embeddings:
-            if gguf.MODEL_ARCH.FALCON == model_class.model_arch or input_features % block_size != 0:
-                new_type = "gguf:q8_0"
-            elif "lm_head" in GGUF_CONFIG[target_gguf_format]:
-                new_type = GGUF_CONFIG[target_gguf_format]["lm_head"]
-            elif new_type != "gguf:q8_0":
-                new_type = "gguf:q6_k"
-        elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings:
-            pass
-        elif isinstance(layer, torch.nn.Embedding):
-            if "embedding" in GGUF_CONFIG[target_gguf_format]:
-                new_type = GGUF_CONFIG[target_gguf_format]["embedding"]
-        elif gguf_name is None:
-            pass
-        # attn_v
-        elif "attn_v" in gguf_name:
-            if target_gguf_format == "gguf:q2_k":
-                new_type = "gguf:q4_k" if n_gqa >= 4 else "gguf:q3_k"
-            elif target_gguf_format == "gguf:q2_k_s" and n_gqa >= 4:
-                new_type = "gguf:q4_k"
-            elif target_gguf_format == "gguf:q3_k_m":
-                new_type = "gguf:q5_k" if i_attention_wv < 2 else "gguf:q4_k"
-            elif target_gguf_format == "gguf:q3_k_l":
-                new_type = "gguf:q5_k"
-            elif (target_gguf_format == "gguf:q4_k_m" or target_gguf_format == "gguf:q5_k_m") and _use_more_bits(
-                i_layer, n_layer
-            ):
-                new_type = "gguf:q6_k"
-            elif target_gguf_format == "gguf:q4_k_s" and i_attention_wv < 4:
-                new_type = "gguf:q5_k"
-            ## TODO: check which models are grouped into LLM_TYPE_70B
-            # if (qs.model.type == LLM_TYPE_70B) {
-            #     // In the 70B model we have 8 heads sharing the same attn_v weights.
- # As a result, the attn_v.weight tensor is - # // 8x smaller compared to attn_q.weight.Hence, we can get a nice boost in quantization accuracy with - # // nearly negligible increase in model size by quantizing this tensor with more bits: - # if - # (new_type == GGML_TYPE_Q3_K | | new_type == GGML_TYPE_Q4_K) - # new_type = GGML_TYPE_Q5_K; - # } - if n_expert == 8: - new_type = "gguf:q8_k" - i_attention_wv += 1 - - elif "attn_k" in gguf_name: - if n_expert == 8: - new_type = "gguf:q8_0" - # ffn_down - elif "ffn_down" in gguf_name: - if target_gguf_format == "gguf:q2_k": - new_type = "gguf:q3_k" - elif target_gguf_format == "gguf:q2_k_s": - if i_layer < n_layer / 8: - new_type = "gguf:q4_k" - elif target_gguf_format == "gguf:q3_k_m": - if i_layer < n_layer / 16: - new_type = "gguf:q5_k" - elif gguf.MODEL_ARCH.FALCON == model_class.model_arch or _use_more_bits(i_layer, n_layer): - new_type = "gguf:q4_k" - else: - new_type = "gguf:q3_k" - elif target_gguf_format == "gguf:q3_k_l": - if gguf.MODEL_ARCH.FALCON == model_class.model_arch: - new_type = "gguf:q4_k" - else: - new_type = "gguf:q5_k" - elif target_gguf_format == "gguf:q4_k_m": - if gguf.MODEL_ARCH.FALCON == model_class.model_arch: - if i_layer < n_layer // 16: - new_type = "gguf:q6_k" - elif _use_more_bits(i_layer, n_layer): - new_type = "gguf:q5_k" - else: - new_type = "gguf:q4_k" - else: - if _use_more_bits(i_layer, n_layer): - new_type = "gguf:q6_k" - elif target_gguf_format == "gguf:q5_k_m" and _use_more_bits(i_layer, n_layer): - new_type = "gguf:q6_k" - elif ( - target_gguf_format == "gguf:q4_k_s" - and model_class.model_arch != gguf.MODEL_ARCH.FALCON - and i_layer < n_layer / 8 - ): - new_type = "gguf:q5_k" - elif (target_gguf_format == "gguf:q4_0" or target_gguf_format == "gguf:q5_0") and i_layer < n_layer / 8: - if target_gguf_format == "gguf:q4_0": - new_type = "gguf:q4_1" - else: - new_type = "gguf:q5_1" - i_ffn_down += 1 - - # attn_output - elif "attn_output" in gguf_name: - if gguf.MODEL_ARCH.FALCON != model_class.model_arch: - if n_expert == 8: - if target_gguf_format in ( - "gguf:q2_k", - "gguf:q3_k_s", - "gguf:q3_k_m", - "gguf:q4_k_s", - "gguf:q4_k_m", - "gguf:q5_k", - ): - new_type = "gguf:q5_k" - elif target_gguf_format == "gguf:q2_k": - new_type = "gguf:q3_k" - elif target_gguf_format == "gguf:q3_k_m": - new_type = "gguf:q4_k" - elif target_gguf_format == "gguf:q3_k_l": - new_type = "gguf:q5_k" - else: - if target_gguf_format == "gguf:q3_k_l": - new_type = "gguf:q4_k" - # attn_qkv - elif "attn_qkv" in gguf_name: - if target_gguf_format in ("gguf:q3_k_m", "gguf:q3_k_l"): - new_type = "gguf:q4_k" - elif target_gguf_format == "gguf:q4_k_m": - new_type = "gguf:q5_k" - elif target_gguf_format == "gguf:q5_k_m": - new_type = "gguf:q5_k" - new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] - if input_features % new_block_size != 0: - new_type = _gguf_type_fallback(new_type) - new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] - if input_features % new_block_size != 0: - new_type = "gguf:bf16" - logger.warning( - f"fallback {layer_name} to {new_type}, " - f"because input_features({input_features}) % block_size({block_size}) != 0" - ) - # for deepseek v2 - if layer_name.endswith("kv_b_proj") and new_type.endswith("_k") and "Deepseek" in model.config.architectures[0]: - fallback = False - - # calc if need fallback - qk_nope_head_dim = model.config.qk_nope_head_dim - kv_b_shape = get_module(model, layer_name).weight.shape - - if ( - qk_nope_head_dim < QK_K - or qk_nope_head_dim % QK_K != 0 - 
or kv_b_shape[-1] < QK_K
-            or kv_b_shape[-1] % QK_K != 0
-        ):
-            fallback = True
-        if fallback:
-            tmp_type = _gguf_type_fallback(new_type)
-            logger.warning_once(
-                f"self_attn.kv_b_proj does not support the use of {new_type}, replacing it with {tmp_type}"
-            )
-            new_type = tmp_type
-
-        target_config = GGUF_INNER_CONFIG[new_type]
-
-        _set_config(layer_config[layer_name], target_config)
-        _set_config(layer, target_config)
-        gguf_format_config[layer_name] = new_type
-
-    return layer_config, gguf_format_config
-
-
-def get_lm_head_name(model):
-    block_names = get_block_names(model, True)
-    last_name = None
-    for n, m in model.named_modules():
-        if any(m.children()):
-            continue
-        last_name = n
-    for l in block_names:
-        if last_name in l:
-            last_name = None
-            break
-    return last_name
-
-
-def get_gguf_qtype_by_layer_config(layer_config):
-    import gguf  # pylint: disable=E0401
-
-    if layer_config["bits"] >= 16:
-        return None
-    bits = layer_config["bits"]
-    super_bits = layer_config.get("super_bits", None)
-    sym = layer_config["sym"]
-    group_size = layer_config.get("group_size", None)
-    super_group_size = layer_config.get("super_group_size", None)
-    if bits == 2 and super_bits == 4 and not sym and group_size == 16 and super_group_size == 16:
-        return gguf.GGMLQuantizationType.Q2_K
-    if bits == 3 and super_bits == 6 and sym and group_size == 16 and super_group_size == 16:
-        return gguf.GGMLQuantizationType.Q3_K
-    if bits == 4:
-        if super_bits is not None and super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
-            return gguf.GGMLQuantizationType.Q4_K
-        if super_bits is None and sym and group_size == 32:
-            return gguf.GGMLQuantizationType.Q4_0
-        if super_bits is None and not sym and group_size == 32:
-            return gguf.GGMLQuantizationType.Q4_1
-    if bits == 5:
-        if super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
-            return gguf.GGMLQuantizationType.Q5_K
-        if super_bits is None and sym and group_size == 32:
-            return gguf.GGMLQuantizationType.Q5_0
-        if super_bits is None and not sym and group_size == 32:
-            return gguf.GGMLQuantizationType.Q5_1
-    if bits == 6 and super_bits == 8 and group_size == 16 and super_group_size == 16:
-        return gguf.GGMLQuantizationType.Q6_K
-    if bits == 8 and sym and group_size == 32:
-        return gguf.GGMLQuantizationType.Q8_0
-    raise ValueError("Unknown layer config")
-
-
-def flatten_list(nested_list):
-    flattened = []
-    for item in nested_list:
-        if isinstance(item, (list, tuple)):
-            flattened.extend(flatten_list(item))
-        else:
-            flattened.append(item)
-    return flattened
-
-
-def clean_module_parameter(submodule, parameter):
-    if submodule is None:
-        return
-    is_buffer = parameter in submodule._buffers
-    with torch.no_grad():
-        if is_buffer:
-            submodule._buffers[parameter] = None
-        else:
-            submodule._parameters[parameter] = None
-
-
-def get_reciprocal(tensor):
-    if tensor.dtype == torch.float16:
-        tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5)
-    else:
-        tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
-    return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))
-
-
-def check_need_act_calibration(
-    is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16
-) -> bool:
-    if act_bits is None or act_bits > 8:
-        return False
-    # None is dynamic
-    if is_act_dynamic is not None and not is_act_dynamic:
-        return True
-    if act_data_type is not None and "static" in act_data_type:
-        return True
-    return False
-
-
-def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, int, int]:
-    """Pads a matrix to make its dimensions multiples of block_size."""
-    M, N = weight.shape[-2:]
-    block_size_m, block_size_n = block_size
-    pad_M = (block_size_m - M % block_size_m) % block_size_m
-    pad_N = (block_size_n - N % block_size_n) % block_size_n
-
-    if pad_M == 0 and pad_N == 0:
-        return weight, M, N  # No padding needed
-    padded_weight = torch.nn.functional.pad(weight, (0, pad_N, 0, pad_M), mode="constant", value=0)
-    return padded_weight, M, N  # Return original dimensions for unpadding
-
-
-def unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor:
-    """Removes padding from the matrix to restore its original shape."""
-    if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N):
-        return weight
-    if keep_first_dim:
-        return weight[:, :original_M, :original_N]
-    else:
-        return weight[:original_M, :original_N]
-
-
-def pad_block_fp8_weight_naive(
-    weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list
-) -> Tuple[torch.Tensor, int, int]:
-    assert len(block_size) == 2
-
-    block_size_m, block_size_n = block_size
-    weight_scale_m, weight_scale_n = weight_scale.shape[-2:]
-
-    weight, orig_M, orig_N = pad_weight(weight, block_size)
-    M, N = weight.shape[-2:]
-
-    assert weight_scale_m == M // block_size_m
-    assert weight_scale_n == N // block_size_n
-
-    return weight, orig_M, orig_N
-
-
-def dequant_block_fp8_weight(weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list) -> torch.Tensor:
-    dtype = torch.bfloat16
-    if weight_scale is None:
-        return weight
-    assert len(block_size) == 2
-
-    weight, orig_M, orig_N = pad_block_fp8_weight_naive(weight, weight_scale, block_size)
-
-    weight_shape_len = len(weight.shape)
-
-    block_size_m, block_size_n = block_size
-
-    # mul scale
-    if weight_shape_len == 2:
-        weight_scale_m, weight_scale_n = weight_scale.shape
-        weight_scale = weight_scale.view(weight_scale_m, 1, weight_scale_n, 1)
-        weight = weight.view(weight_scale_m, block_size_m, weight_scale_n, block_size_n)
-        dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
-        dequant_weight = dequant_weight.view(weight_scale_m * block_size_m, weight_scale_n * block_size_n)
-        keep_first_dim = False
-    elif weight_shape_len == 3:
-        fd, weight_scale_m, weight_scale_n = weight_scale.shape
-        weight_scale = weight_scale.view(fd, weight_scale_m, 1, weight_scale_n, 1)
-        weight = weight.view(fd, weight_scale_m, block_size_m, weight_scale_n, block_size_n)
-        dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
-        dequant_weight = dequant_weight.view(fd, weight_scale_m * block_size_m, weight_scale_n * block_size_n)
-        keep_first_dim = True
-    else:
-        raise ValueError("Only 2D or 3D weight shapes are supported")
-
-    dequant_weight = unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim)
-
-    return dequant_weight
-
-
-def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16):
-    """Convert an FP8 linear layer to an equivalent 16-bit torch.nn.Linear layer."""
-    new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype)
-    if layer.bias is not None:
-        new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype))
-    scheme_keys = (f.name for f in fields(QuantizationScheme))
-    keys = tuple(scheme_keys) + ("tmp_name", "scale_dtype")
-    for key in keys:
-        setattr(new_layer, key, getattr(layer, key, None))
-
-    if layer.__class__.__name__ == "CompressedLinear":
-        dq_weight = layer.compressor.decompress_module(layer)
-    else:
-        weight_scale = layer.weight_scale if 
hasattr(layer, "weight_scale") else layer.weight_scale_inv - dq_weight = dequant_block_fp8_weight(layer.weight, weight_scale, layer.block_size) - new_layer.weight.data.copy_(dq_weight.to(dtype=dtype)) - return new_layer - - -def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): - """ - Convert a model with FP8 quantized layers to a model with 16-bit linear layers. - This is useful for compatibility with other frameworks or for further processing. - """ - cnt = 0 - for n, m in model.named_modules(): - if m.__class__.__name__ == "FP8Linear": - new_module = convert_fp8_layer_to_linear(m, dtype=dtype) - set_module(model, n, new_module) - cnt += 1 - if cnt % 10 == 0: # Tricky setting - clear_memory() - return model - - -def out_of_vram(error_msg): - error_msg = str(error_msg) - # CUDA - if "CUDA out of memory" in error_msg: - return True - # gaudi - if "MODULE:PT_DEVMEM" in error_msg: - return True - # XPU - if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg: - return True - # ROCM - if "HIP out of memory. Tried to allocate" in error_msg: - return True - return False - - -def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): - """Download hugging face model from hf hub.""" - from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE - from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name - - if cache_dir is None: - cache_dir = HUGGINGFACE_HUB_CACHE - if revision is None: - revision = DEFAULT_REVISION - if repo_type is None: - repo_type = "model" - storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) - commit_hash = None - if REGEX_COMMIT_HASH.match(revision): - commit_hash = revision - else: - ref_path = os.path.join(storage_folder, "refs", revision) - if os.path.exists(ref_path): - with open(ref_path) as f: - commit_hash = f.read() - if storage_folder and commit_hash: - pointer_path = os.path.join(storage_folder, "snapshots", commit_hash) - if os.path.isdir(pointer_path): - return pointer_path - else: # pragma: no cover - from huggingface_hub import snapshot_download - - model_path = snapshot_download(repo_id) - return model_path - - -def is_moe(module: torch.nn.Module) -> bool: - """Returns whether the module is an MOE layer.""" - return any( - key in type(module).__name__.lower() - for key in [ - "MixtralSparseMoeBlock".lower(), - "ArcticMoE".lower(), - "DbrxFFN".lower(), - "MoELayer".lower(), - "PhimoeSparseMoeBlock".lower(), - "DeepseekMoE".lower(), - "DeepseekV2MoE".lower(), - "DeepseekV3MoE".lower(), - "Qwen2MoeSparseMoeBlock".lower(), - "Qwen3MoeSparseMoeBlock".lower(), - ] - ) - - -# please refer to https://github.com/NVIDIA/TensorRT-Model-Optimizer -# /blob/4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/layer_utils.py#L976 -def get_expert_linear_names(module: torch.nn.Module) -> list[str]: - """Get the list of linear names for the experts.""" - - def module_match_name_list(module, name_list): - """Check if the module name matches any of the names in the list. - - e.g. 
module_match_name_list(QuantQwen3MoeSparseMoeBlock, ['Qwen3MoeSparseMoeBlock']) -> True
-
-        """
-        return any(name.lower() in type(module).__name__.lower() for name in name_list)
-
-    if module_match_name_list(
-        module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"]
-    ):
-        return ["gate_proj", "down_proj", "up_proj"]
-    elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]):
-        return ["linear_fc1", "linear_fc2"]
-    elif module_match_name_list(module, ["DBRXMoeSparseMoeBlock"]):
-        return ["w1_linear", "w2_linear", "v1_linear"]
-    else:
-        # assuming w1, w2, w3 by default
-        return ["w1", "w2", "w3"]
-
-
-def get_nested_attr(module, attr_name: str):
-    """Recursively get nested attribute (e.g., 'orig_layer.act_max')."""
-    attrs = attr_name.split(".")
-    for attr in attrs:
-        if not hasattr(module, attr):
-            return None
-        module = getattr(module, attr)
-    return module
-
-
-def set_nested_attr(module, attr_name: str, value):
-    """Recursively set nested attribute (e.g., 'orig_layer.act_max' = value)."""
-    attrs = attr_name.split(".")
-    for attr in attrs[:-1]:
-        if not hasattr(module, attr):
-            return None  # No need to set act_max for fp layers
-        module = getattr(module, attr)
-    setattr(module, attrs[-1], value)
-
-
-def set_amax_for_uncalibrated_experts(
-    experts: list[torch.nn.Module], set_amax_value: float | None = None, attr_name="act_max"
-):
-    """Set amax of uncalibrated experts to a given value or the max of existing amax value from other experts.
-
-    Args:
-        experts: a list of experts
-        set_amax_value: set amax value to the given value.
-            If None, set amax value to the max of existing amax value from other experts.
-
-    Returns:
-        uncalibrated_experts: a list of uncalibrated experts
-    """
-    uncalibrated_experts = []
-    # get the max amax value from all experts
-    if set_amax_value is None:
-        amax_values = [
-            get_nested_attr(module, attr_name) for module in experts if get_nested_attr(module, attr_name) is not None
-        ]
-        if len(amax_values) == 0:
-            return uncalibrated_experts
-        # Flatten all tensors to 1D before concatenation
-        flat_values = [t.reshape(-1) for t in amax_values]
-        all_values = torch.cat(flat_values)
-        set_amax_value = torch.max(all_values)
-
-    for module in experts:
-        if get_nested_attr(module, attr_name) is None:
-            logger.warning_once(
-                "Missing amax value of expert layers. "
-                "This typically occurs in MoE models when certain experts are not activated during calibration. "
-                "Consider increasing your calibration dataset size to ensure all experts are exercised."
-            )
-            # Use float32 dtype explicitly to ensure we create a floating point tensor
-            if not isinstance(set_amax_value, torch.Tensor):
-                set_amax_value = torch.tensor(set_amax_value, dtype=torch.float32)
-            set_nested_attr(module, attr_name, set_amax_value)
-            # uncalibrated_experts.append(module)
-
-
-# Please refer to: https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/
-# 4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/unified_export_hf.py#L195-L207
-def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_name="act_max"):
-    if layer_name is not None:
-        parts = layer_name.split(".")
-        if "experts" not in parts:
-            raise ValueError(f"layer_name '{layer_name}' does not contain an 'experts' component")
-        idx = parts.index("experts")
-        moe_name = ".".join(parts[:idx])
-        model = get_module(model, moe_name)
-    # Handle input quantizers of experts that are not calibrated
-    for name, sub_module in model.named_modules():
-        if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
-            continue
-        expert_linear_names = get_expert_linear_names(sub_module)
-        for linear_name in expert_linear_names:
-            if isinstance(sub_module.experts, collections.abc.Iterable):
-                # For other MoE models (like Mixtral) with iterable experts
-                try:
-                    set_amax_for_uncalibrated_experts(
-                        [getattr(expert, linear_name, None) for expert in sub_module.experts], attr_name=attr_name
-                    )
-                except AttributeError as e:
-                    # Provide more helpful debugging information
-                    expert_types = list(set(type(expert).__name__ for expert in sub_module.experts))
-                    raise AttributeError(
-                        f"Failed to access attribute '{linear_name}' on experts. "
-                        f"MoE module type: {type(sub_module).__name__}, "
-                        f"Expert types: {expert_types}, "
-                        f"Expected linear names: {expert_linear_names}. "
-                        f"This suggests the get_expert_linear_names function may need "
-                        f"to be updated for this model architecture. "
-                        f"Original error: {e}"
-                    ) from e
-            else:
-                # Unsupported MoE model structure
-                raise NotImplementedError(
-                    f"MoE model with experts type '{type(sub_module.experts).__name__}' is not supported in export. "
-                    f"Please file an issue or add support for this model architecture."
-                )
-
-
-class BackendDataType(str, Enum):
-    STANDARD_FP = "fp"
-    MX_FP = "mx_fp"
-    NV_FP = "nv_fp"
-
-
-def is_standard_fp(backend):
-    backend = backend.lower()
-    return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend)
-
-
-def is_mx_fp(backend):
-    backend = backend.lower()
-    return BackendDataType.MX_FP in backend
-
-
-def is_nv_fp(backend):
-    backend = backend.lower()
-    return BackendDataType.NV_FP in backend
-
-
-def _is_weight_fp8_activation_static_fp8(
-    bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool
-) -> bool:
-    return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic
-
-
-def is_wfp8afp8(ar):
-    if (
-        ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8))
-        and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8))
-        and is_standard_fp(ar.act_data_type)
-        and is_standard_fp(ar.data_type)
-    ):
-        return True
-    else:
-        return False
-
-
-def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool:
-    if isinstance(ar_or_format, str):
-        return "fp8_static" in ar_or_format
-    if ar_or_format.act_dynamic:
-        return False
-    if is_wfp8afp8(ar_or_format):
-        return True
-    return False
-
-
-def bytes_to_gigabytes(bytes) -> float:
-    """
-    Converts bytes to gigabytes.
-
-    Args:
-        bytes (int): The number of bytes.
-
-    Returns:
-        float: The equivalent number of gigabytes.
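-
-    Example:
-        >>> bytes_to_gigabytes(2 * 1024**3)
-        2.0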
- """ - return bytes / 1024 / 1024 / 1024 - - -def get_device_memory(i: int = 0) -> int: - """ - Gets the available memory on the specified device. - - Args: - i (int, optional): Device index. Defaults to 0. - - Returns: - int: Available memory in gigabytes. - """ - if torch.cuda.is_available(): - total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) - elif torch.xpu.is_available(): - raise RuntimeError("XPU does not support device_map='auto' currently.") - else: - raise RuntimeError("No supported device found (CUDA or XPU).") - return total_memory - - -def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: - """ - Calculates the memory consumption of a specific block in the model. - - Args: - block (torch.nn.Module): The block of the model to analyze. - input_ids (list[torch.Tensor]): A list of input tensors for the block. - - Returns: - tuple: A tuple containing the following: - - block_memory (float): The memory consumption (in GB) of the block's linear layers. - - input_output_memory (float): The memory consumption (in GB) for input and output - tensors of the block. - """ - # Calculate all block parameters memory - total_param_mem = 0 - for name, module in block.named_modules(): - if check_to_quantized(module): - param_size = module.weight.nbytes - total_param_mem += param_size - block_memory = total_param_mem / 1024**3 # Convert to GB - - # Assuming bfloat16 or float32, input and output - input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - - return block_memory, input_output_memory - - -def get_max_vram(ratio: float = 0.9) -> dict: - max_memory = {} - if torch.cuda.is_available(): # NVIDIA CUDA - num_devices = torch.cuda.device_count() - for i in range(num_devices): - total_mem = torch.cuda.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - elif torch.xpu.is_available(): # TODO need verification - num_devices = torch.xpu.device_count() - for i in range(num_devices): - total_mem = torch.xpu.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - - else: - raise RuntimeError("No CUDA or XPU devices found.") - return max_memory - - -def _get_packing_device(device: str | torch.device | None = "auto") -> torch.device: - """ - Selects the packing device. - - "auto": choose best available (CUDA > XPU > CPU). - - str: parsed by torch.device (e.g., "cuda:2", "cpu"). - - torch.device: returned as-is. - - None: treated as "auto". - - Args: - device: Target device spec ("auto", "cuda:0", "xpu:0", "cpu", or torch.device). - - Returns: - torch.device: The resolved device. 
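-
-    Example (output depends on the available hardware):
-        >>> _get_packing_device("auto")  # doctest: +SKIP
-        device(type='cuda', index=0)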
- """ - if device is None or (isinstance(device, str) and device.lower() == "auto"): - if torch.cuda.is_available(): - return torch.device("cuda:0") - if hasattr(torch, "xpu") and torch.xpu.is_available(): - return torch.device("xpu:0") - return torch.device("cpu") - - if isinstance(device, torch.device): - return device - - if isinstance(device, str): - try: - return torch.device(device) - except Exception as e: - raise ValueError(f"Invalid device string: {device}") from e - - raise TypeError(f"Unsupported device type: {type(device)} ({device})") - - -# Adapted from https://github.com/vllm-project/llm-compressor/blob/ -# 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 -def copy_python_files_from_model_cache(model, save_path: str): - config = model.config - cache_path = None - if hasattr(config, "_name_or_path"): - import os - import shutil - - from huggingface_hub import hf_hub_download - from transformers import TRANSFORMERS_CACHE - from transformers.utils import http_user_agent - - cache_path = config._name_or_path - if not os.path.exists(cache_path): - user_agent = http_user_agent() - config_file_path = hf_hub_download( - repo_id=cache_path, - filename="config.json", - cache_dir=TRANSFORMERS_CACHE, - force_download=False, - user_agent=user_agent, - ) - cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) - - for file in os.listdir(cache_path): - full_file_name = os.path.join(cache_path, file) - if file.endswith(".py") and os.path.isfile(full_file_name): - logger.debug(f"Transferring {full_file_name} to {save_path}") - shutil.copy(full_file_name, save_path) - - -def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): - MM_KEYS = [ - "multi_modal_projector", - "vision_tower", - "multimodal_projector", - "thinker", - "visual", - "audio", - "talker", - "token2wav", - "vision_model", - "audio_tower", - "vision_encoder", - "vision_language_adapter", - "patch_merger", - "pre_mm_projector_norm", - "vision", - ] - - model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path - if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) - - if isinstance(model_path, str): - if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): - return True - if os.path.exists(os.path.join(model_path, "processor_config.json")): - return True - if os.path.exists(os.path.join(model_path, "config.json")): - with open(os.path.join(model_path, "config.json")) as f: - config = json.load(f) - for key in config.keys(): - if any([k in key for k in MM_KEYS]): - return True - - if isinstance(model_or_path, torch.nn.Module): - for name, module in model_or_path.named_modules(): - if any([k in name for k in MM_KEYS]): - return True - - return False - - -def set_layer_config( - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: Union[str, "QuantizationScheme"], - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, -) -> tuple[dict, bool, dict]: - """ - Normalize, validate, and expand layer-specific quantization configs. 
- Returns (final_layer_config, has_quant_layer_outside_block) - """ - - from auto_round.schemes import get_gguf_scheme - - # ---- helpers ------------------------------------------------- - def dispatch_layer_config(layer_config: dict[str, dict]) -> None: - """Assign scheme values as attributes to matched modules.""" - for layer_name, scheme in layer_config.items(): - module = get_module(model, layer_name) - for attr, value in scheme.items(): - setattr(module, attr, value) - - def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: - """Convert config entry into dict and validate keys.""" - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif isinstance(item, dict): - invalid = set(item) - set(scheme_keys + ("fixed_by_user", "scale_dtype")) - if invalid: - raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" - ) - config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." - ) - # Clean up - config = {k: v for k, v in config.items() if v is not None} - config["fixed_by_user"] = True - return config - - # ---- main logic ---------------------------------------------- - scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - layer_config = copy.deepcopy(layer_config) or {} - - # 1. fp_layers -> force 16 - for name in get_fp_layer_names(model, fp_layers): - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - "fixed_by_user": True, - } - - # 2. normalize - layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - - # 3. infer missing bits - for cfg in layer_config.values(): - if "data_type" in cfg and "bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = b - if "act_data_type" in cfg and "act_bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = b - - # 4. fill defaults - if isinstance(default_scheme, str): - default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) - else: - default_dict = asdict(default_scheme) - default_dict["scale_dtype"] = default_scale_dtype - for cfg in layer_config.values(): - for key in scheme_keys: - cfg.setdefault(key, copy.deepcopy(default_dict.get(key))) - - # 5. collect supported modules - gguf_name = get_gguf_scheme(default_scheme) - if gguf_name and torch.nn.Embedding not in supported_types: - supported_types = (*supported_types, torch.nn.Embedding) - - all_supported_layer_names, embedding_layer_names = [], [] - all_module_names = [] - for n, m in model.named_modules(): - all_module_names.append(n) - # cleanup stale attributes - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: - continue - all_supported_layer_names.append(n) - if isinstance(m, torch.nn.Embedding): - embedding_layer_names.append(n) - - # 6. 
expand regex configs
-    regex_config = {}
-    for name in list(layer_config.keys()):
-        if name in all_supported_layer_names:
-            continue
-        if name in all_module_names:
-            m = get_module(model, name)
-            if len(list(m.children())) == 0 and type(m) not in supported_types:
-                layer_config.pop(name)
-                logger.warning(f"{name} is not supported in the current scheme, ignoring its setting in `layer_config`")
-                continue
-
-        regex = re.compile(name)
-        matched = [ln for ln in all_supported_layer_names if regex.search(ln)]
-        if not matched:
-            raise ValueError(f"Invalid '{name}' in layer_config, no match found.")
-        val = layer_config.pop(name)
-        regex_config[name] = val  # keep regex config
-        for match in matched:
-            layer_config[match] = val
-    # regex_config = None if len(regex_config)==0 else regex_config
-
-    # 7. lm_head
-    lm_head_name = get_lm_head_name(model)
-    tie_word_embeddings = False
-    if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
-        tie_word_embeddings = model.config.tie_word_embeddings
-
-    if quant_lm_head and tie_word_embeddings:
-        quant_lm_head = False
-        logger.warning(
-            "reset `quant_lm_head` to False, as quantizing lm_head with tied weights is not currently supported"
-        )
-
-    if lm_head_name not in layer_config and quant_lm_head:
-        layer_config[lm_head_name] = copy.deepcopy(default_dict)
-
-    # 8. enforce shape divisibility for int weight-only
-    if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name:
-        for n, m in model.named_modules():
-            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
-                if m.weight.shape[0] % 32 or m.weight.shape[1] % 32:
-                    layer_config.setdefault(n, copy.deepcopy(default_dict))
-                    layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
-                    logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
-    # enforce shape divisibility for mxfp/nvfp
-    if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
-        for n, m in model.named_modules():
-            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
-                if m.weight.shape[1] % default_dict["group_size"]:
-                    layer_config.setdefault(n, copy.deepcopy(default_dict))
-                    layer_config[n].update(
-                        {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
-                    )
-                    logger.warning_once(
-                        f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
-                    )
-
-    # 9. block layers: mark as in_blocks=True
-    for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types):
-        if name not in layer_config:
-            layer_config[name] = copy.deepcopy(default_dict)
-            layer_config[name]["fixed_by_user"] = False
-        layer_config[name]["in_blocks"] = True
-
-    # ---- restore: ensure missing in_blocks are set to False and compute flag ----
-    has_qlayer_outside_block = False
-    for cfg in layer_config.values():
-        if "in_blocks" not in cfg:
-            cfg["in_blocks"] = False
-        # mark layer outside block
-        if not cfg["in_blocks"] and check_to_quantized(cfg):
-            has_qlayer_outside_block = True
-
-    # 10. 
GGUF handling - if not gguf_name: - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block, regex_config - - # embed + lm_head defaults for gguf - if lm_head_name not in layer_config and not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[lm_head_name] = cfg - has_qlayer_outside_block = True - for emd_name in embedding_layer_names: - if emd_name in layer_config: - continue - if not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] - else: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[emd_name] = cfg - - if enable_gguf_official_mixed: - model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT - layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) - - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block, regex_config - - -def check_diffusers_installed(): # pragma: no cover - try: - import diffusers # noqa: F401 - - return True - except ImportError: - logger.error("Please install diffusers via 'pip install diffusers'" " to run diffusion model") - exit(-1) - - -def is_diffusion_model(model_or_path: Union[str, object]) -> bool: - if isinstance(model_or_path, str): - index_file = None - if not os.path.isdir(model_or_path): - try: - from huggingface_hub import hf_hub_download - - index_file = hf_hub_download(model_or_path, "model_index.json") - check_diffusers_installed() - except Exception as e: - print(e) - index_file = None - - elif os.path.exists(os.path.join(model_or_path, "model_index.json")): - check_diffusers_installed() - index_file = os.path.join(model_or_path, "model_index.json") - return index_file is not None - elif not isinstance(model_or_path, torch.nn.Module): - check_diffusers_installed() - pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") - return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) - else: - return False - - -def to_standard_regex(pattern: str) -> str: - """ - Convert a user-specified string into a standardized regex for layer matching. - - Rules: - - If the pattern already contains regex tokens ('.*', '^', '$', etc.), - keep them as-is. - - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. - - Always ensure the returned regex is valid (compilable by re). 
- - Examples: - >>> to_standard_regex("model.embed_tokens") - '.*model\\.embed_tokens.*' - >>> to_standard_regex("mlp.gate") - '.*mlp\\.gate.*' - >>> to_standard_regex("mlp.gate$") - '.*mlp\\.gate$' - >>> to_standard_regex("mlp.*gate") - '.*mlp.*gate.*' - """ - # Heuristic: if pattern contains regex meta characters, assume partial regex - meta_chars = {".*", "^", "$", "|", "(", ")", "[", "]", "?", "+"} - has_regex = any(tok in pattern for tok in meta_chars) - if not has_regex: - # Escape literal dots, etc., and wrap with .* for substring matching - pattern = re.escape(pattern) - regex = f".*{pattern}.*" - else: - # Only escape bare dots that are not already part of regex constructs - # Avoid double escaping .* sequences - tmp = [] - i = 0 - while i < len(pattern): - if pattern[i] == ".": - if i + 1 < len(pattern) and pattern[i + 1] == "*": - tmp.append(".*") # keep regex token - i += 2 - continue - else: - tmp.append("\\.") # escape bare dot - else: - tmp.append(pattern[i]) - i += 1 - regex = "".join(tmp) - # If no anchors are provided, allow substring matching - if not regex.startswith("^") and not regex.startswith(".*"): - regex = ".*" + regex - if not regex.endswith("$") and not regex.endswith(".*"): - regex = regex + ".*" - # Validate regex - try: - re.compile(regex) - except re.error as e: - raise ValueError(f"Invalid regex generated from pattern '{pattern}': {e}") - return regex - - -def matches_any_regex(layer_name: str, regex_config: Dict[str, dict]) -> bool: - """ - Check whether `layer_name` matches any regex pattern key in `regex_config`. - Args: - layer_name (str): The layer name to test. - regex_config (Dict[str, dict]): A mapping of regex patterns to configs. - Returns: - bool: True if any pattern matches `layer_name`, otherwise False. - """ - if not regex_config: - return False - - for pattern in regex_config: - # Strip dynamic prefixes (e.g., "+:" or "-:") - raw_pattern = pattern[2:] if pattern.startswith(("+:", "-:")) else pattern - - try: - if re.search(raw_pattern, layer_name): - return True - except re.error as e: - logger.warning("Skipping invalid regex pattern %r: %s", pattern, e) - continue - - return False - - -def json_serialize(obj: Any): - """Convert non-JSON-serializable objects into JSON-friendly formats.""" - if isinstance(obj, torch.dtype): - return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" - raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/auto_round/utils/__init__.py b/auto_round/utils/__init__.py new file mode 100644 index 000000000..8b9366d63 --- /dev/null +++ b/auto_round/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
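+
+# Aggregate the split utility modules below so that existing call sites can
+# keep importing helpers directly from `auto_round.utils` after this refactor.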
+ +from auto_round.utils.constants import * +from auto_round.utils.device_utils import * +from auto_round.utils.dtype_utils import * +from auto_round.utils.memory_utils import * +from auto_round.utils.misc_utils import * +from auto_round.utils.model_utils import * +from auto_round.utils.quantization_utils import * diff --git a/auto_round/utils/constants.py b/auto_round/utils/constants.py new file mode 100644 index 000000000..ef962bdcd --- /dev/null +++ b/auto_round/utils/constants.py @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib + +import torch +import transformers +from packaging import version + +from auto_round.export.export_to_gguf.config import GGUF_CONFIG + + +def compare_versions(v1, v2): + return version.parse(v1) >= version.parse(v2) + + +def torch_version_at_least(version_string): + return compare_versions(torch.__version__, version_string) + + +TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99") +TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0") +TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0") +TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0") + + +class SupportedFormats: + + def __init__(self): + self._support_format = ( + "auto_round", + "auto_gptq", + "auto_awq", + "auto_round:auto_gptq", + "auto_round:gptqmodel", + "auto_round:auto_awq", + "auto_round:llm_compressor", + "itrex", + "itrex_xpu", + "fake", + "llm_compressor", + ) + self._gguf_format = tuple(sorted(GGUF_CONFIG.keys())) + self._support_list = self._support_format + self._gguf_format + + def __contains__(self, key): + return True if key in self._support_list else False + + def __str__(self): + # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s")) + return "(%s)" % ", ".join(self._support_list) + + def __getitem__(self, key): + return self._support_list[key] + + +SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") + +deepspeed_exists = False +if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed + deepspeed_exists = True + +SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp") +SUPPORTED_FORMATS = SupportedFormats() +SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) +# Changed to str as it relies on triton or others lib to load this +INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) +# transformers.integrations.finegrained_fp8.FP8Linear +if deepspeed_exists: + from deepspeed.module_inject import LinearAllreduce, LinearLayer + + SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce) diff --git a/auto_round/utils/device_utils.py b/auto_round/utils/device_utils.py new file mode 100644 index 000000000..260f04097 --- /dev/null +++ b/auto_round/utils/device_utils.py @@ -0,0 +1,359 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from functools import lru_cache +from typing import Any, Callable, Dict, List, Tuple, Union + +import cpuinfo +import torch + +from auto_round.logger import logger + +# Note on HPU usage: +# There are two modes available for enabling auto-round on HPU: +# 1. Compile Mode +# 1) Use PyTorch version ≥ 2.4 (Intel® Gaudi® v1.18 or later) +# 2) Set `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1` +# The compile mode can speed up quantization process but still in experimental stage. +# 2. Lazy Mode (By default) + + +################ Check available sys.module to decide behavior ################# +def is_package_available(package_name: str) -> bool: + """Check if the package exists in the environment without importing. + + Args: + package_name (str): package name + """ + from importlib.util import find_spec + + package_spec = find_spec(package_name) + return package_spec is not None + + +def is_hpu_lazy_mode(): + return os.getenv("PT_HPU_LAZY_MODE") != "0" + + +def _use_hpu_compile_mode(): + from auto_round.utils.constants import TORCH_VERSION_AT_LEAST_2_4 + + return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode() + + +def compile_func_on_hpu(func): + if _use_hpu_compile_mode(): + return torch.compile(func, backend="hpu_backend") + return func + + +def compile_func_on_cuda_or_cpu(func): + return torch.compile(func) + + +def compile_func( + fun: Union[torch.nn.Module, Callable], device: Union[str, torch.device, int] +) -> Union[torch.nn.Module, Callable]: + """Compile function on the specified device.""" + if "hpu" in str(device): + return compile_func_on_hpu(fun) ## use auto by default + else: + return compile_func_on_cuda_or_cpu(fun) + + +def is_numba_available(): # pragma: no cover + """Check if Numba is available.""" + try: + import numba + + return True + except ImportError: + return False + + +def _is_tbb_installed(): # pragma: no cover + import importlib.metadata + + try: + importlib.metadata.version("tbb") + return True + except importlib.metadata.PackageNotFoundError: + return False + + +def _is_tbb_configured(): # pragma: no cover + try: + from numba.np.ufunc.parallel import _check_tbb_version_compatible + + # check if TBB is present and compatible + _check_tbb_version_compatible() + + return True + except ImportError as e: + logger.warning_once(f"TBB not available: {e}") + return False + + +def is_tbb_available(): # pragma: no cover + """Check if TBB is available.""" + if not _is_tbb_installed(): + logger.warning_once("TBB is not installed, please install it with `pip install tbb`.") + return False + if not _is_tbb_configured(): + logger.warning_once( + ( + "TBB is installed but not configured correctly. \n" + "Please add the TBB library path to `LD_LIBRARY_PATH`, " + "for example: `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/`." + ) + ) + return False + return True + + +def can_pack_with_numba(): # pragma: no cover + """Check if Numba and TBB are available for packing. + + To pack tensor with Numba, both Numba and TBB are required, and TBB should be configured correctly. 
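+
+    Returns:
+        bool: True only when Numba is installed and TBB is both installed and
+            configured correctly; False otherwise.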
+ """ + if not is_numba_available(): + logger.warning_once("Numba is not installed, please install it with `pip install numba`.") + return False + if not is_tbb_available(): + return False + return True + + +## check hpex +if is_package_available("habana_frameworks"): + _hpex_available = True + import habana_frameworks.torch.hpex # pylint: disable=E0401 +else: + _hpex_available = False + + +@torch._dynamo.disable() +@lru_cache(None) +def is_hpex_available(): + return _hpex_available + + +def check_is_cpu(device): + """Check if the device is a CPU. + + Args: + device: The device to be checked. + + Returns: + bool: True if the device is a CPU, False otherwise. + """ + return device == torch.device("cpu") or device == "cpu" + + +def detect_device_count(): + """Detects the number of available computation devices. + + This function checks if CUDA is available. If it is, it returns the count + of available CUDA devices. If not, it attempts to import the Habana + device framework to return the count of Habana devices. If the import + fails or no devices are found, it returns 0. + + Returns: + int: The number of available devices (CUDA or Habana). + """ + if torch.cuda.is_available(): + return torch.cuda.device_count() + else: + try: + import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401 + + return hthpu.device_count() + except ImportError: + return 0 + + +def detect_device(device: Union[str, int, torch.device] = None) -> str: + """Detects the appropriate computation device. + + This function determines the device to use for computations. It can take + a specific device index or default to 'auto'. The function checks for + available devices in the following order: CUDA, Habana, and finally CPU. + + Args: + device (str, int, or torch.device, optional): The desired device. + If 'auto' or None, the function will determine the best device + automatically. + + Returns: + str: The device to use for computations, formatted as a string. 
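+
+    Examples (illustrative; the result depends on the available hardware):
+        >>> detect_device("auto")  # e.g. 'cuda' on a CUDA machine, otherwise 'hpu'/'xpu'/'cpu'
+        >>> detect_device(1)  # e.g. 'cuda:1'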
+ """ + + def is_valid_digit(s): + try: + num = int(s) + return 0 <= num + except: + return False + + dev_idx = None + if is_valid_digit(device): + dev_idx = int(device) + device = "auto" + if isinstance(device, str) and "," in device: # device is "0,1,2" + device_list = [int(dev) for dev in device.split(",") if dev.isdigit()] + dev_idx = device_list[0] if device_list else None + device = "auto" + if device is None or device == "auto": + if torch.cuda.is_available(): + device = torch.device("cuda") + # logger.info("Using GPU device") + elif is_hpex_available(): # pragma: no cover + device = torch.device("hpu") + # logger.info("Using HPU device") + elif torch.xpu.is_available(): # pragma: no cover + device = torch.device("xpu") + # Use CPU as a fallback + else: + device = torch.device("cpu") + # logger.info("Using CPU device") + if dev_idx is not None and str(device) != "cpu": + device = str(device) + f":{dev_idx}" + return str(device) + elif isinstance(device, torch.device): + device = str(device) + elif isinstance(device, str): ## for cuda:0 + if device == "tp": # pragma: no cover + # should not specify card, e.g., cuda:0 + if torch.cuda.is_available(): + device = "cuda" + elif is_hpex_available(): + device = "hpu" + else: + device = "cpu" + else: + device = device + return device + + +def get_device_and_parallelism(device: Union[str, torch.device, int]) -> Tuple[str, bool]: + if isinstance(device, str): + devices = device.replace(" ", "").split(",") + elif isinstance(device, int): + devices = [str(device)] + else: + devices = [device] + if all(s.isdigit() for s in devices) and len(devices) > 1 and torch.cuda.is_available(): + device = "cuda" + parallelism = True + elif all(s.isdigit() for s in devices) and len(devices) > 1 and torch.xpu.is_available(): + device = "xpu" + parallelism = False + # pragma: no cover + elif device == "auto": + device = detect_device(device) + parallelism = True + else: + device = detect_device(device) + parallelism = False + return device, parallelism + + +def set_cuda_visible_devices(device): + devices = device.replace(" ", "").split(",") + if all(s.isdigit() for s in devices): + if "CUDA_VISIBLE_DEVICES" in os.environ: + current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] + current_visible_devices = current_visible_devices.split(",") + indices = [int(device) for device in devices] + try: + pick_device = [current_visible_devices[i] for i in indices] + except: + raise ValueError( + "Invalid '--device' value: It must be smaller than the number of available devices." + " For example, with CUDA_VISIBLE_DEVICES=4,5, " + "--device 0,1 is valid, but --device 4,5 is not supported." + ) + visible_devices = ",".join(pick_device) + os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices + else: + os.environ["CUDA_VISIBLE_DEVICES"] = device + + +def set_fake_cuda_device_capability(func=None): + if func is not None: + torch.cuda.get_device_capability = func + return func + + def fake_cuda(): + return 100, 1 + + orig_func = torch.cuda.get_device_capability + torch.cuda.get_device_capability = fake_cuda + return orig_func + + +def _get_packing_device(device: str | torch.device | None = "auto") -> torch.device: + """ + Selects the packing device. + - "auto": choose best available (CUDA > XPU > CPU). + - str: parsed by torch.device (e.g., "cuda:2", "cpu"). + - torch.device: returned as-is. + - None: treated as "auto". + + Args: + device: Target device spec ("auto", "cuda:0", "xpu:0", "cpu", or torch.device). + + Returns: + torch.device: The resolved device. 
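+
+    Example (illustrative; "auto" resolution depends on the available hardware):
+        >>> _get_packing_device("cpu")
+        device(type='cpu')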
+ """ + if device is None or (isinstance(device, str) and device.lower() == "auto"): + if torch.cuda.is_available(): + return torch.device("cuda:0") + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return torch.device("xpu:0") + return torch.device("cpu") + + if isinstance(device, torch.device): + return device + + if isinstance(device, str): + try: + return torch.device(device) + except Exception as e: + raise ValueError(f"Invalid device string: {device}") from e + + raise TypeError(f"Unsupported device type: {type(device)} ({device})") + + +class CpuInfo(object): + """Get CPU Info.""" + + def __init__(self): + """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" + self._bf16 = False + info = cpuinfo.get_cpu_info() + if "arch" in info and "X86" in info["arch"]: + cpuid = cpuinfo.CPUID() + max_extension_support = cpuid.get_max_extension_support() + if max_extension_support >= 7: + eax = cpuid._run_asm( + b"\xb9\x01\x00\x00\x00", # mov ecx, 1 + b"\xb8\x07\x00\x00\x00" b"\x0f\xa2" b"\xc3", # mov eax, 7 # cpuid # ret + ) + self._bf16 = bool(eax & (1 << 5)) + + @property + def bf16(self): + """Get whether it is bf16.""" + return self._bf16 diff --git a/auto_round/utils/dtype_utils.py b/auto_round/utils/dtype_utils.py new file mode 100644 index 000000000..91ed869c3 --- /dev/null +++ b/auto_round/utils/dtype_utils.py @@ -0,0 +1,146 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from enum import Enum +from typing import Any, Callable, Dict, List, Tuple, Union + +import torch + + +def convert_dtype_str2torch(str_dtype): + """Converts a string dtype to its corresponding PyTorch dtype. + + Args: + str_dtype (str): The string representation of the dtype. + + Returns: + torch.dtype: The PyTorch dtype. + + Raises: + ValueError: If the input str_dtype is unsupported. + """ + if isinstance(str_dtype, torch.dtype) or str_dtype is None: + return str_dtype + if str_dtype == "int8": + return torch.int8 + elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": + return torch.float + elif str_dtype == "fp16" or str_dtype == "float16": + return torch.float16 + elif str_dtype == "bf16" or str_dtype == "bfloat16": + return torch.bfloat16 + else: + raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") + + +def convert_dtype_torch2str(dtype): + """Converts a PyTorch dtype to its corresponding string representation. + + Args: + dtype: PyTorch dtype or str. The dtype to convert. + + Returns: + str: The string representation of the dtype. + + Raises: + ValueError: If the input dtype is unsupported. 
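+
+    Example:
+        >>> convert_dtype_torch2str(torch.bfloat16)
+        'bf16'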
+    """
+    if isinstance(dtype, str) or dtype is None:
+        return dtype
+    if dtype == torch.int8:
+        return "int8"
+    elif dtype == torch.float:
+        return "fp32"
+    elif dtype == torch.float16:
+        return "fp16"
+    elif dtype == torch.bfloat16:
+        return "bf16"
+    else:
+        raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.")
+
+
+def convert_dtype_torch2str_hf(dtype):
+    """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'.
+
+    Args:
+        dtype: PyTorch dtype or str. The dtype to convert.
+
+    Returns:
+        str: The string representation of the dtype.
+
+    Raises:
+        ValueError: If the input dtype is unsupported.
+    """
+    if dtype is None:
+        return dtype
+    if isinstance(dtype, str):
+        if "float" not in dtype and "int" not in dtype:
+            dtype = convert_dtype_str2torch(dtype)
+        else:
+            return dtype
+    str_dtype = str(dtype)
+    if "." not in str_dtype:
+        raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to huggingface str dtype")
+    str_dtype = str_dtype.split(".")[1]
+    return str_dtype
+
+
+class BackendDataType(str, Enum):
+    STANDARD_FP = "fp"
+    MX_FP = "mx_fp"
+    NV_FP = "nv_fp"
+
+
+def is_standard_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend)
+
+
+def is_mx_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.MX_FP in backend
+
+
+def is_nv_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.NV_FP in backend
+
+
+def _is_weight_fp8_activation_static_fp8(
+    bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool
+) -> bool:
+    return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic
+
+
+def is_wfp8afp8(ar):
+    return (
+        ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8))
+        and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8))
+        and is_standard_fp(ar.act_data_type)
+        and is_standard_fp(ar.data_type)
+    )
+
+
+def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool:
+    if isinstance(ar_or_format, str):
+        return "fp8_static" in ar_or_format
+    if ar_or_format.act_dynamic:
+        return False
+    if is_wfp8afp8(ar_or_format):
+        return True
+    return False
diff --git a/auto_round/utils/memory_utils.py b/auto_round/utils/memory_utils.py
new file mode 100644
index 000000000..7ecc31893
--- /dev/null
+++ b/auto_round/utils/memory_utils.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+
+import torch
+
+
+def bytes_to_gigabytes(bytes) -> float:
+    """
+    Converts bytes to gigabytes.
+
+    Args:
+        bytes (int): The number of bytes.
+
+    Returns:
+        float: The equivalent number of gigabytes.
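+
+    Example:
+        >>> bytes_to_gigabytes(2 * 1024**3)
+        2.0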
+ """ + return bytes / 1024 / 1024 / 1024 + + +def _clear_memory_for_cpu_and_cuda(tensor=None): + if isinstance(tensor, list): + for i in range(len(tensor)): + tensor[i] = None + if tensor is not None: + del tensor + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if torch.xpu.is_available(): + torch.xpu.empty_cache() + + +@torch._dynamo.disable() +def clear_memory(tensor=None): + from auto_round.utils.device_utils import is_hpex_available + + if is_hpex_available(): + # hpu does not have empty_cache + return + else: + _clear_memory_for_cpu_and_cuda(tensor) + + +def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): + """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. + + Args: + device (str): The device type ('cuda' for GPU or 'hpu' for HPU). + inputs (torch.Tensor): Input tensor. + weight (torch.Tensor): Weight tensor. + org_seqlen (int): Original sequence length. + org_bs (int): Original batch size. + + Returns: + tuple: A tuple containing availability status (bool), modified sequence length (int), + and modified batch size (int). + """ + weight_memory = weight.numel() * weight.element_size() + if "cuda" in device: + current_gpu_index = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory + used_memory = torch.cuda.memory_allocated(current_gpu_index) + free_space = total_memory - used_memory + elif "hpu" in device: # pragma: no cover + current_hpu_index = torch.hpu.current_device() + free_space = torch.hpu.memory_reserved(current_hpu_index) + else: + return True, org_seqlen, org_bs + + free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage + seqlen = org_seqlen + bs = org_bs + in_feature = weight.shape[1] + out_feature = weight.shape[0] + while seqlen >= 128: + input_size = bs * seqlen * in_feature + output_size = bs * seqlen * out_feature + input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) + if input_output_memory < free_space: + return True, seqlen, bs + seqlen = seqlen // 2 + bs = 1 + + return False, seqlen, bs + + +def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: + """ + Calculates the memory consumption of a specific block in the model. + + Args: + block (torch.nn.Module): The block of the model to analyze. + input_ids (list[torch.Tensor]): A list of input tensors for the block. + + Returns: + tuple: A tuple containing the following: + - block_memory (float): The memory consumption (in GB) of the block's linear layers. + - input_output_memory (float): The memory consumption (in GB) for input and output + tensors of the block. 
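+
+    Example (illustrative; ``block`` and ``cached_inputs`` are assumed to come
+    from the caller's calibration step):
+        >>> block_mem_gb, io_mem_gb = estimate_tuning_block_mem(block, cached_inputs)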
+ """ + # Calculate all block parameters memory + from auto_round.utils.quantization_utils import check_to_quantized + + total_param_mem = 0 + for name, module in block.named_modules(): + if check_to_quantized(module): + param_size = module.weight.nbytes + total_param_mem += param_size + block_memory = total_param_mem / 1024**3 # Convert to GB + + # Assuming bfloat16 or float32, input and output + input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + return block_memory, input_output_memory + + +def out_of_vram(error_msg): + error_msg = str(error_msg) + # CUDA + if "CUDA out of memory" in error_msg: + return True + # gaudi + if "MODULE:PT_DEVMEM" in error_msg: + return True + # XPU + if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg: + return True + # ROCM + if "HIP out of memory. Tried to allocate" in error_msg: + return True + return False + + +def get_max_vram(ratio: float = 0.9) -> dict: + max_memory = {} + if torch.cuda.is_available(): # NVIDIA CUDA + num_devices = torch.cuda.device_count() + for i in range(num_devices): + total_mem = torch.cuda.get_device_properties(i).total_memory + max_mem_gb = int(total_mem / 1024**3 * ratio) + max_memory[i] = f"{max_mem_gb}GiB" + elif torch.xpu.is_available(): # TODO need verification + num_devices = torch.xpu.device_count() + for i in range(num_devices): + total_mem = torch.xpu.get_device_properties(i).total_memory + max_mem_gb = int(total_mem / 1024**3 * ratio) + max_memory[i] = f"{max_mem_gb}GiB" + + else: + raise RuntimeError("No CUDA or XPU devices found.") + return max_memory + + +def get_device_memory(i: int = 0) -> int: + """ + Gets the available memory on the specified device. + + Args: + i (int, optional): Device index. Defaults to 0. + + Returns: + int: Available memory in gigabytes. + """ + if torch.cuda.is_available(): + total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) + elif torch.xpu.is_available(): + raise RuntimeError("XPU does not support device_map='auto' currently.") + else: + raise RuntimeError("No supported device found (CUDA or XPU).") + return total_memory diff --git a/auto_round/utils/misc_utils.py b/auto_round/utils/misc_utils.py new file mode 100644 index 000000000..ab46653b7 --- /dev/null +++ b/auto_round/utils/misc_utils.py @@ -0,0 +1,226 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import importlib +import os +import re +import sys +from typing import Any, Callable, Dict, List, Tuple, Union + +import torch + +from auto_round.logger import logger + + +class LazyImport(object): + """Lazy import python module till use.""" + + def __init__(self, module_name): + """Init LazyImport object. 
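+
+        The actual import is deferred until the first attribute access or call,
+        e.g. ``auto_gptq = LazyImport("auto_gptq")`` does not import auto_gptq immediately.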
+ + Args: + module_name (string): The name of module imported later + """ + self.module_name = module_name + self.module = None + + def __getattr__(self, name): + """Get the attributes of the module by name.""" + try: + self.module = importlib.import_module(self.module_name) + mod = getattr(self.module, name) + except: + spec = importlib.util.find_spec(str(self.module_name + "." + name)) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + def __call__(self, *args, **kwargs): + """Call the function in that module.""" + function_name = self.module_name.split(".")[-1] + module_name = self.module_name.split(f".{function_name}")[0] + self.module = importlib.import_module(module_name) + function = getattr(self.module, function_name) + return function(*args, **kwargs) + + +auto_gptq = LazyImport("auto_gptq") +htcore = LazyImport("habana_frameworks.torch.core") + + +def is_debug_mode(): + """Checks if the Python interpreter is running in debug mode. + + Returns: + bool: True if debugging is enabled, False otherwise. + """ + return sys.gettrace() is not None or sys.flags.debug == 1 + + +def is_local_path(path): + """Checks if a given path exists locally. + + Args: + path (str): The path to check. + + Returns: + bool: True if the path exists locally, False otherwise. + """ + format_list = ( + "json", + "txt", + ) + flag = None + for x in format_list: + flag = True if x in path else flag + return flag and os.path.exists(path) + + +def get_library_version(library_name): + from packaging.version import Version + + python_version = Version(sys.version.split()[0]) + if python_version < Version("3.8"): + import warnings + + warnings.filterwarnings("ignore", category=DeprecationWarning) + import pkg_resources # pylint: disable=E0401 + + try: + version = pkg_resources.get_distribution(library_name).version + return version + except pkg_resources.DistributionNotFound: + return f"{library_name} is not installed" + else: + import importlib.metadata # pylint: disable=E0401 + + try: + version = importlib.metadata.version(library_name) + return version + except importlib.metadata.PackageNotFoundError: + return f"{library_name} is not installed" + + +def str2bool(v): + import argparse + + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def flatten_list(nested_list): + flattened = [] + for item in nested_list: + if isinstance(item, (list, tuple)): + flattened.extend(flatten_list(item)) + else: + flattened.append(item) + return flattened + + +def to_standard_regex(pattern: str) -> str: + """ + Convert a user-specified string into a standardized regex for layer matching. + + Rules: + - If the pattern already contains regex tokens ('.*', '^', '$', etc.), + keep them as-is. + - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. + - Always ensure the returned regex is valid (compilable by re). 
+ + Examples: + >>> to_standard_regex("model.embed_tokens") + '.*model\\.embed_tokens.*' + >>> to_standard_regex("mlp.gate") + '.*mlp\\.gate.*' + >>> to_standard_regex("mlp.gate$") + '.*mlp\\.gate$' + >>> to_standard_regex("mlp.*gate") + '.*mlp.*gate.*' + """ + # Heuristic: if pattern contains regex meta characters, assume partial regex + meta_chars = {".*", "^", "$", "|", "(", ")", "[", "]", "?", "+"} + has_regex = any(tok in pattern for tok in meta_chars) + if not has_regex: + # Escape literal dots, etc., and wrap with .* for substring matching + pattern = re.escape(pattern) + regex = f".*{pattern}.*" + else: + # Only escape bare dots that are not already part of regex constructs + # Avoid double escaping .* sequences + tmp = [] + i = 0 + while i < len(pattern): + if pattern[i] == ".": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + tmp.append(".*") # keep regex token + i += 2 + continue + else: + tmp.append("\\.") # escape bare dot + else: + tmp.append(pattern[i]) + i += 1 + regex = "".join(tmp) + # If no anchors are provided, allow substring matching + if not regex.startswith("^") and not regex.startswith(".*"): + regex = ".*" + regex + if not regex.endswith("$") and not regex.endswith(".*"): + regex = regex + ".*" + # Validate regex + try: + re.compile(regex) + except re.error as e: + raise ValueError(f"Invalid regex generated from pattern '{pattern}': {e}") + return regex + + +def matches_any_regex(layer_name: str, regex_config: Dict[str, dict]) -> bool: + """ + Check whether `layer_name` matches any regex pattern key in `regex_config`. + Args: + layer_name (str): The layer name to test. + regex_config (Dict[str, dict]): A mapping of regex patterns to configs. + Returns: + bool: True if any pattern matches `layer_name`, otherwise False. + """ + if not regex_config: + return False + + for pattern in regex_config: + # Strip dynamic prefixes (e.g., "+:" or "-:") + raw_pattern = pattern[2:] if pattern.startswith(("+:", "-:")) else pattern + + try: + if re.search(raw_pattern, layer_name): + return True + except re.error as e: + logger.warning("Skipping invalid regex pattern %r: %s", pattern, e) + continue + + return False + + +def json_serialize(obj: Any): + """Convert non-JSON-serializable objects into JSON-friendly formats.""" + if isinstance(obj, torch.dtype): + return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/auto_round/utils/model_utils.py b/auto_round/utils/model_utils.py new file mode 100644 index 000000000..1b833ca4b --- /dev/null +++ b/auto_round/utils/model_utils.py @@ -0,0 +1,1104 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import collections +import json +import os +import re +from collections import UserDict +from dataclasses import asdict, fields +from pathlib import Path +from typing import Any, Callable, Dict, List, Tuple, Union + +import torch +import transformers + +from auto_round.export.export_to_gguf.config import ModelType +from auto_round.logger import logger + + +def clean_module_parameter(submodule, parameter): + if submodule is None: + return + is_buffer = parameter in submodule._buffers + with torch.no_grad(): + if is_buffer: + submodule._buffers[parameter] = None + else: + submodule._parameters[parameter] = None + + +def check_and_mark_fp8_model(model: torch.nn.Module) -> bool: + if is_fp8_model(model): + return True + for n, m in model.named_modules(): + if is_fp8_linear(m): + m.is_fp8_linear = True + if not hasattr(model, "is_fp8"): + model.is_fp8 = True + if hasattr(model, "is_fp8"): + return True + return False + + +def check_diffusers_installed(): # pragma: no cover + try: + import diffusers # noqa: F401 + + return True + except ImportError: + logger.error("Please install diffusers via 'pip install diffusers'" " to run diffusion model") + exit(-1) + + +def check_start_with_block_name(name: str, block_name_to_quantize: list): + """ + Checks if the given layer name starts with any of the block names to be quantized. + + Args: + name (str): The name of the layer. + block_name_to_quantize (list): A list of block names to check against. + + Returns: + bool: True if the layer name starts with any of the block names, False otherwise. + """ + for block_name in block_name_to_quantize: + if name.startswith(block_name): + return True + return False + + +def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): + """Download hugging face model from hf hub.""" + from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE + from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name + + if cache_dir is None: + cache_dir = HUGGINGFACE_HUB_CACHE + if revision is None: + revision = DEFAULT_REVISION + if repo_type is None: + repo_type = "model" + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision + else: + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.exists(ref_path): + with open(ref_path) as f: + commit_hash = f.read() + if storage_folder and commit_hash: + pointer_path = os.path.join(storage_folder, "snapshots", commit_hash) + if os.path.isdir(pointer_path): + return pointer_path + else: # pragma: no cover + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id) + return model_path + + +def llm_load_model( + pretrained_model_name_or_path, + trust_remote_code=True, + model_dtype=None, + device="cpu", + low_cpu_mem_mode=0, + low_cpu_mem_tmp_dir=None, + **kwargs, +): + from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + + from auto_round.utils.device_utils import ( + _use_hpu_compile_mode, + get_device_and_parallelism, + set_fake_cuda_device_capability, + ) + + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + + is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower())) + low_cpu_mem_usage = False + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, 
trust_remote_code=trust_remote_code) + + model_cls = AutoModel if is_glm else AutoModelForCausalLM + if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code: + logger.warning("trust_remote_code is enabled by default, please ensure its correctness.") + + if low_cpu_mem_tmp_dir is None: + low_cpu_mem_tmp_dir = "low_cpu_mem_tmp" + if low_cpu_mem_mode == 2: + from auto_round.low_cpu_mem.utils import load_model_with_hooks + + model = load_model_with_hooks( + pretrained_model_name_or_path, + model_cls, + device=device, + clean_weight=True, + saved_path=low_cpu_mem_tmp_dir, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + ) + elif low_cpu_mem_mode == 1: + from auto_round.low_cpu_mem.utils import load_empty_model + + low_cpu_mem_usage = True + model = load_empty_model( + pretrained_model_name_or_path, + model_cls, + device=device, + saved_path=low_cpu_mem_tmp_dir, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + ) + else: + if _use_hpu_compile_mode(): + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + attn_implementation="eager", + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) + else: + try: + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) + except ValueError as e: + if "FP8 quantized" in str(e): + orig_func = set_fake_cuda_device_capability() + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) + torch.cuda.get_device_capability = orig_func + logger.warning("the support for fp8 model as input is experimental, please use with caution.") + else: + raise + + except OSError as e: + logger.warning( + f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry." 
+ ) + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=False, + device_map="auto" if use_auto_mapping else None, + ) + + model = model.eval() + check_and_mark_fp8_model(model) + model = _to_model_dtype(model, model_dtype) + + return model, tokenizer, low_cpu_mem_usage + + +def mllm_load_model( + pretrained_model_name_or_path, + device="cpu", + torch_dtype="auto", + use_auto_mapping=True, + trust_remote_code=True, + model_dtype=None, + **kwargs, +): + import transformers + from huggingface_hub import HfApi, HfFileSystem, hf_hub_download + from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + from auto_round.utils.device_utils import get_device_and_parallelism, set_fake_cuda_device_capability + + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + if os.path.isdir(pretrained_model_name_or_path): + config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) + else: + from huggingface_hub import hf_hub_download, list_repo_files + + file_list = list_repo_files(pretrained_model_name_or_path) + if "config.json" in file_list: + # Load plain JSON + config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + elif "config.json.gz" in file_list: + # Load gzipped JSON + import gzip + + config_path = hf_hub_download(pretrained_model_name_or_path, "config.json.gz") + with gzip.open(config_path, "rt", encoding="utf-8") as f: + config = json.load(f) + else: + raise FileNotFoundError(f"No config.json or config.json.gz found for {pretrained_model_name_or_path}") + + if "model_type" in config: + model_type = config["model_type"] + else: + model_type = None + + processor, image_processor = None, None + if "deepseek_vl_v2" == model_type: + from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor # pylint: disable=E0401 + + processor = DeepseekVLV2Processor.from_pretrained(pretrained_model_name_or_path) + tokenizer = processor.tokenizer + model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None, + ) + else: + architectures = config["architectures"][0] + if architectures == "LlavaLlamaForCausalLM": + from llava.model.builder import load_pretrained_model # pylint: disable=E0401 + + tokenizer, model, image_processor, _ = load_pretrained_model( + pretrained_model_name_or_path, + model_base=None, + model_name=pretrained_model_name_or_path, + torch_dtype=torch_dtype, + ) + else: + if architectures.endswith("Model") and hasattr( + transformers, n := architectures.replace("Model", "ForConditionalGeneration") + ): + cls = getattr(transformers, n) + elif hasattr(transformers, architectures): + cls = getattr(transformers, architectures) + else: + cls = AutoModelForCausalLM + try: + model = cls.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None, + ) + except ValueError as e: + if "FP8 quantized" in str(e): + orig_func = set_fake_cuda_device_capability() + model = cls.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + 
device_map="auto" if use_auto_mapping else None, + ) + torch.cuda.get_device_capability = orig_func + logger.warning("the support for fp8 model as input is experimental, please use with caution.") + else: + raise + + if "Mistral-Small-3.2" in pretrained_model_name_or_path: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pylint: disable=E0401 + + if os.path.isdir(pretrained_model_name_or_path): + tokenizer = MistralTokenizer.from_file(os.path.join(pretrained_model_name_or_path, "tekken.json")) + else: + tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path) + else: + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + processor = AutoProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + try: + from transformers import AutoImageProcessor + + image_processor = AutoImageProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + except Exception as e: + pass + + model = model.eval() + check_and_mark_fp8_model(model) + model = _to_model_dtype(model, model_dtype) + + return model, processor, tokenizer, image_processor + + +def diffusion_load_model( + pretrained_model_name_or_path: str, + device: Union[str, torch.device] = "cpu", + torch_dtype: Union[str, torch.dtype] = "auto", + use_auto_mapping: bool = False, + trust_remote_code: bool = True, + model_dtype: str = None, + **kwargs, +): + from auto_round.utils.device_utils import get_device_and_parallelism + from auto_round.utils.misc_utils import LazyImport + + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + + pipelines = LazyImport("diffusers.pipelines") + + pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( + pretrained_model_name_or_path, torch_dtype=torch_dtype + ) + pipe = _to_model_dtype(pipe, model_dtype) + model = pipe.transformer + return pipe, model.to(device) + + +def is_pure_text_model(model): + """verify on: phi-3.5, Mistral-Small-3.1, gemma-3, qwen2-vl,""" + if hasattr(model, "config") and hasattr(model.config, "vision_config"): + return False + if hasattr(model.__class__, "main_input_name") and model.__class__.main_input_name != "input_ids": + return False + for module in model.modules(): + if hasattr(module.__class__, "main_input_name") and module.__class__.main_input_name != "input_ids": + return False + if "vision" in str(module.__class__).lower(): + return False + if "image" in str(module.__class__).lower(): + return False + if "img" in str(module.__class__).lower(): + return False + return True + + +def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): + MM_KEYS = [ + "multi_modal_projector", + "vision_tower", + "multimodal_projector", + "thinker", + "visual", + "audio", + "talker", + "token2wav", + "vision_model", + "audio_tower", + "vision_encoder", + "vision_language_adapter", + "patch_merger", + "pre_mm_projector_norm", + "vision", + ] + + model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path + if not os.path.isdir(model_path): + model_path = download_hf_model(model_path) + + if isinstance(model_path, str): + if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): + return True + if os.path.exists(os.path.join(model_path, "processor_config.json")): + return True + if os.path.exists(os.path.join(model_path, 
"config.json")): + with open(os.path.join(model_path, "config.json")) as f: + config = json.load(f) + for key in config.keys(): + if any([k in key for k in MM_KEYS]): + return True + + if isinstance(model_or_path, torch.nn.Module): + for name, module in model_or_path.named_modules(): + if any([k in name for k in MM_KEYS]): + return True + + return False + + +def is_diffusion_model(model_or_path: Union[str, object]) -> bool: + from auto_round.utils.misc_utils import LazyImport + + if isinstance(model_or_path, str): + index_file = None + if not os.path.isdir(model_or_path): + try: + from huggingface_hub import hf_hub_download + + index_file = hf_hub_download(model_or_path, "model_index.json") + check_diffusers_installed() + except Exception as e: + print(e) + index_file = None + + elif os.path.exists(os.path.join(model_or_path, "model_index.json")): + check_diffusers_installed() + index_file = os.path.join(model_or_path, "model_index.json") + return index_file is not None + elif not isinstance(model_or_path, torch.nn.Module): + check_diffusers_installed() + pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) + else: + return False + + +def is_moe(module: torch.nn.Module) -> bool: + """Returns whether the module is an MOE layer.""" + return any( + key in type(module).__name__.lower() + for key in [ + "MixtralSparseMoeBlock".lower(), + "ArcticMoE".lower(), + "DbrxFFN".lower(), + "MoELayer".lower(), + "PhimoeSparseMoeBlock".lower(), + "DeepseekMoE".lower(), + "DeepseekV2MoE".lower(), + "DeepseekV3MoE".lower(), + "Qwen2MoeSparseMoeBlock".lower(), + "Qwen3MoeSparseMoeBlock".lower(), + ] + ) + + +def is_fp8_model(model: torch.nn.Module) -> bool: + if not hasattr(model, "is_fp8"): + return False + else: + return model.is_fp8 + + +def is_fp8_linear(module: torch.nn.Module) -> bool: + if hasattr(module, "is_fp8_linear"): + return module.is_fp8_linear + if not (type(module) == torch.nn.Linear or module.__class__.__name__ == "FP8Linear"): + return False + if module.weight is None: + return False + if str(module.weight.dtype).startswith("torch.float8"): + return True + else: + return False + + +def get_block_names(model, quant_vision=False): + """Get the block names for transformers-like networks. + + Args: + model: The model. + + Returns: + block_names: A list whose elements are list of block's layer names + """ + from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + + def _search_block(name, module): + if hasattr(type(module), "__name__") and "ModuleList" in type(module).__name__: + return [(name, module)] + target_modules = [] + for n, m in module.named_children(): + if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: + target_modules.append((".".join(filter(None, (name, n))), m)) + else: + target_modules.extend(_search_block(".".join(filter(None, (name, n))), m)) + return target_modules + + def _get_llm_block_names(model): + block_names = [] + target_modules = _search_block("", model) + + for i, target_m in enumerate(target_modules): + block_names.append([]) + for n, m in target_m[1].named_children(): + block_names[i].append(target_m[0] + "." 
+ n) + return block_names + + def _get_vlm_block_names(model, quant_vision=False): + if ( + hasattr(model, "config") + and hasattr(model.config, "model_type") + and model.config.model_type in SPECIAL_MULTIMODAL_BLOCK.keys() + ): + return SPECIAL_MULTIMODAL_BLOCK.get(model.config.model_type)(model, quant_vision=quant_vision) + block_names = [] + target_modules = [] + vision_blocks_tuple = ("vision", "visual", "image", "img") + last_block_name = "" + for n, m in model.named_modules(): + if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: + if quant_vision or all(key not in n.lower() for key in (vision_blocks_tuple)): + if last_block_name and last_block_name in n: + continue + target_modules.append((n, m)) + last_block_name = n + for i, target_m in enumerate(target_modules): + block_names.append([]) + for n, m in target_m[1].named_children(): + block_names[i].append(target_m[0] + "." + n) + return block_names + + if quant_vision or not is_pure_text_model(model): + return _get_vlm_block_names(model, quant_vision=quant_vision) + else: + return _get_llm_block_names(model) + + +def get_lm_head_name(model): + block_names = get_block_names(model, True) + last_name = None + for n, m in model.named_modules(): + if any(m.children()): + continue + last_name = n + for l in block_names: + if last_name in l: + last_name = None + break + return last_name + + +# please refer to https://github.com/NVIDIA/TensorRT-Model-Optimizer +# /blob/4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/layer_utils.py#L976 +def get_expert_linear_names(module: torch.nn.Module) -> list[str]: + """Get the list of linear names for the experts.""" + + def module_match_name_list(module, name_list): + """Check if the module name matches any of the names in the list. + + e.g. 
module_match_name_list(QuantQwen3MoeSparseMoeBlock, ['Qwen3MoeSparseMoeBlock']) -> True + + """ + return any(name.lower() in type(module).__name__.lower() for name in name_list) + + if module_match_name_list( + module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"] + ): + return ["gate_proj", "down_proj", "up_proj"] + elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]): + return ["linear_fc1", "linear_fc2"] + elif module_match_name_list(module, ["DBRXMoeSparseMoeBlock"]): + return ["w1_linear", "w2_linear", "v1_linear"] + else: + # assuming w1, w2, w3 by default + return ["w1", "w2", "w3"] + + +def get_model_dtype(model_dtype, default="auto"): + if model_dtype is None or model_dtype == "auto": + model_dtype = default + elif model_dtype in ["bf16", "bfloat16"]: + model_dtype = "bfloat16" + elif model_dtype in ["f16", "float16", "fp16"]: + model_dtype = "float16" + elif model_dtype in ["f32", "float32", "fp32"]: + model_dtype = "float32" + else: + logger.warning(f"Unable to identify model_dtype {model_dtype}, reset to default model_dtype {default}") + model_dtype = default + return model_dtype + + +def get_nested_attr(module, attr_name: str): + """Recursively get nested attribute (e.g., 'orig_layer.act_max').""" + attrs = attr_name.split(".") + for attr in attrs: + if not hasattr(module, attr): + return None + module = getattr(module, attr) + return module + + +def get_gguf_architecture(dir_model, model_type=ModelType.TEXT): + from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( + ModelBase, + get_model_architecture, + ) + + is_mistral_format = False + if isinstance(dir_model, str): + dir_model = Path(dir_model) + + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if isinstance(hparams, dict): + tmp_model_type = hparams["model_type"] + else: + tmp_model_type = hparams.model_type + if "mistral" == tmp_model_type: + is_mistral_format = True + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if not is_mistral_format: + model_class = get_model_architecture(hparams, model_type) + elif model_type == ModelType.MMPROJ: + assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" + model_class = "PixtralModel" + else: + model_class = "MistralModel" + return model_class + + +def get_layer_names_in_block( + model: torch.nn.Module, + supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), + quant_block_list: list = None, + class_names: tuple = None, +) -> list[str]: + """Retrieves the names of layers within each block of the model. + + Returns: + list: A list of strings, where each string is the name of a layer + within a block of the model. 
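+
+    Example (illustrative; the exact names depend on the model architecture):
+        >>> get_layer_names_in_block(model)
+        ['model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj', ...]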
+ """ + if class_names is None: + class_names = [] + for n, m in model.named_modules(): + if type(m) in supported_types or (class_names is not None and m.__class__.__name__ in class_names): + m.bk_tmp_name = n + layers_in_block = [] + if bool(quant_block_list): + all_blocks = quant_block_list + else: + all_blocks = get_block_names(model) + for block_names in all_blocks: + for block_name in block_names: + block = get_module(model, block_name) + for n, m in block.named_modules(): + if hasattr(m, "bk_tmp_name"): + layers_in_block.append(m.bk_tmp_name) + delattr(m, "bk_tmp_name") + return layers_in_block + + +def set_nested_attr(module, attr_name: str, value): + """Recursively set nested attribute (e.g., 'orig_layer.act_max' = value).""" + attrs = attr_name.split(".") + for attr in attrs[:-1]: + if not hasattr(module, attr): + return None # No need to set act_max for fp layers + module = getattr(module, attr) + setattr(module, attrs[-1], value) + + +def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, int, int]: + """Pads a matrix to make its dimensions multiples of block_size.""" + M, N = weight.shape[-2:] + block_size_m, block_size_n = block_size + pad_M = (block_size_m - M % block_size_m) % block_size_m + pad_N = (block_size_n - N % block_size_n) % block_size_n + + if pad_M == 0 and pad_N == 0: + return weight, M, N # No padding needed + padded_weight = torch.nn.functional.pad(weight, (0, pad_N, 0, pad_M), mode="constant", value=0) + return padded_weight, M, N # Return original dimensions for unpadding + + +def unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor: + """Removes padding from the matrix to restore its original shape.""" + if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N): + return weight + if keep_first_dim: + return weight[:, :original_M, :original_N] + else: + return weight[:original_M, :original_N] + + +def pad_block_fp8_weight_naive( + weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list +) -> Tuple[torch.Tensor, int, int]: + assert len(block_size) == 2 + + block_size_m, block_size_n = block_size + weight_scale_m, weight_scale_n = weight_scale.shape[-2:] + + weight, orig_M, orig_N = pad_weight(weight, block_size) + M, N = weight.shape[-2:] + + assert weight_scale_m == M // block_size_m + assert weight_scale_n == N // block_size_n + + return weight, orig_M, orig_N + + +def dequant_block_fp8_weight(weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list) -> torch.Tensor: + dtype = torch.bfloat16 + if weight_scale is None: + return weight + assert len(block_size) == 2 + + weight, orig_M, orig_N = pad_block_fp8_weight_naive(weight, weight_scale, block_size) + + weight_shape_len = len(weight.shape) + + block_size_m, block_size_n = block_size + + # mul scale + if weight_shape_len == 2: + weight_scale_m, weight_scale_n = weight_scale.shape + weight_scale = weight_scale.view(weight_scale_m, 1, weight_scale_n, 1) + weight = weight.view(weight_scale_m, block_size_m, weight_scale_n, block_size_n) + dequant_weight = weight.to(dtype) * weight_scale.to(dtype) + dequant_weight = dequant_weight.view(weight_scale_m * block_size_m, weight_scale_n * block_size_n) + keep_first_dim = False + elif weight_shape_len == 3: + fd, weight_scale_m, weight_scale_n = weight_scale.shape + weight_scale = weight_scale.view(fd, weight_scale_m, 1, weight_scale_n, 1) + weight = weight.view(fd, weight_scale_m, block_size_m, weight_scale_n, block_size_n) + dequant_weight = 
weight.to(dtype) * weight_scale.to(dtype)
+        dequant_weight = dequant_weight.view(fd, weight_scale_m * block_size_m, weight_scale_n * block_size_n)
+        keep_first_dim = True
+    else:
+        raise ValueError("Only 2D and 3D weight shapes are supported")
+
+    dequant_weight = unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim)
+
+    return dequant_weight
+
+
+def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16):
+    """Converts an FP8 linear layer into an equivalent 16-bit torch.nn.Linear layer."""
+    from auto_round.schemes import QuantizationScheme
+
+    new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype)
+    if layer.bias is not None:
+        new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype))
+    scheme_keys = (f.name for f in fields(QuantizationScheme))
+    keys = tuple(scheme_keys) + ("tmp_name", "scale_dtype")
+    for key in keys:
+        setattr(new_layer, key, getattr(layer, key, None))
+
+    if layer.__class__.__name__ == "CompressedLinear":
+        dq_weight = layer.compressor.decompress_module(layer)
+    else:
+        weight_scale = layer.weight_scale if hasattr(layer, "weight_scale") else layer.weight_scale_inv
+        dq_weight = dequant_block_fp8_weight(layer.weight, weight_scale, layer.block_size)
+    new_layer.weight.data.copy_(dq_weight.to(dtype=dtype))
+    return new_layer
+
+
+def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16):
+    """
+    Convert a model with FP8 quantized layers to a model with 16-bit linear layers.
+    This is useful for compatibility with other frameworks or for further processing.
+    """
+    from auto_round.utils.memory_utils import clear_memory
+
+    cnt = 0
+    for n, m in model.named_modules():
+        if m.__class__.__name__ == "FP8Linear":
+            new_module = convert_fp8_layer_to_linear(m, dtype=dtype)
+            set_module(model, n, new_module)
+            cnt += 1
+            if cnt % 10 == 0:  # heuristic: periodically free cached memory to avoid OOM
+                clear_memory()
+    return model
+
+
+def get_shared_keys(model):
+    """
+    Retrieves the shared cache keys for the model.
+
+    Args:
+        model (torch.nn.Module): The model to retrieve shared keys from.
+
+    Returns:
+        tuple: tuple of shared keys.
+    """
+    from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS
+    from auto_round.utils.constants import SHARED_CACHE_KEYS
+
+    shared_keys = SHARED_CACHE_KEYS
+    shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ())
+    return shared_keys
+
+
+def _to_model_dtype(model, model_dtype):
+    if model_dtype is not None:
+        try:
+            if (model_dtype == "float16" or model_dtype == "fp16") and model.dtype != torch.float16:
+                model = model.to(torch.float16)
+            elif (
+                model_dtype == "bfloat16" or model_dtype == "bfp16" or model_dtype == "bf16"
+            ) and model.dtype != torch.bfloat16:
+                model = model.to(torch.bfloat16)
+            elif (model_dtype == "float32" or model_dtype == "fp32") and model.dtype != torch.float32:
+                model = model.to(torch.float32)
+        except Exception:
+            logger.error("failed to convert the model dtype, please use more devices to fit the model or just use one device")
+            exit()
+    return model
+
+
+def get_module(module, key):
+    """Get module from model by key name.
+
+    Args:
+        module (torch.nn.Module): original model
+        key (str): name of the module to fetch
+    """
+    name_list = key.split(".")
+    for name in name_list:
+        module = getattr(module, name, None)
+    return module
+
+
+def set_module(model, key, new_module):
+    """Set new module into model by key name.
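+
+    For example (the path here is illustrative), ``set_module(model, "model.layers.0.mlp", new_module)``
+    replaces the submodule at that dotted path.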
+ + Args: + model (torch.nn.Module): original model + key (str): module name to be replaced + new_module (torch.nn.Module): new module to be inserted + """ + module = model + name_list = key.split(".") + for name in name_list[:-1]: + if hasattr(module, name): + module = getattr(module, name) + setattr(module, name_list[-1], new_module) + + +def _get_digital_in_layer_name(layer_name): + pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") + res = re.search(pattern, layer_name) + if res: + return int(res[2]) + else: + return None + + +def unsupported_meta_device(model): + """Checks if the model is a valid model for auto_round. + + Args: + model: The model to be checked. + + Returns: + bool: True if the model is valid, False otherwise. + """ + target_device = None + for param in model.parameters(): + if target_device is None: + target_device = param.device + if param.device != target_device: + if param.device.type == "meta" or target_device.type == "meta": + return True + if target_device.type == "meta": + if hasattr(model, "path"): + return False + else: + return True + return False + + +def to_device(input, device=torch.device("cpu")): + """Moves input data to the specified device. + + Args: + input: The input data to be moved. + device: The target device. + + Returns: + The input data on the specified device. + """ + if input is None: + return None + if isinstance(input, torch.Tensor): + return input.to(device) + if isinstance(input, dict) or isinstance(input, UserDict): + for inp in input.keys(): + input[inp] = to_device(input[inp], device) + + elif isinstance(input, list) or isinstance(input, tuple): + if len(input) == 0: + return input + input_res = [] + for inp in input: + input_res.append(to_device(inp, device)) + if isinstance(input, tuple): + input_res = tuple(input_res) + input = input_res + + return input + + +def mv_module_from_gpu(module, low_cpu_mem_usage=False): + """Moves module from gpu to cpu or meta if low_cpu_mem_usage is true. + + Args: + module: The module to be moved. + low_cpu_mem_usage: Whether to use low CPU memory. If true, move module to meta. + + Returns: + The module on the specified device. + """ + if hasattr(module, "device"): + target_device = "meta" if low_cpu_mem_usage else "cpu" + if module.device.type == target_device: + return module + else: + return module.to(target_device) + else: + if low_cpu_mem_usage: + return module.to("meta") + else: + return module.to("cpu") + + +def to_dtype(input, dtype=torch.float32): + """Moves input data to the specified data type. + + Args: + input: The input data to be moved. + dtype: The target data type. + + Returns: + The input data on the specified data type. + """ + if input is None: + return None + if isinstance(input, torch.Tensor): + return input.to(dtype) + if isinstance(input, dict) or isinstance(input, UserDict): + for inp in input.keys(): + input[inp] = to_dtype(input[inp], dtype) + + elif isinstance(input, list) or isinstance(input, tuple): + if len(input) == 0: + return input + input_res = [] + for inp in input: + input_res.append(to_dtype(inp, dtype)) + if isinstance(input, tuple): + input_res = tuple(input_res) + input = input_res + + return input + + +def set_amax_for_uncalibrated_experts( + experts: torch.nn.Module, set_amax_value: float | None = None, attr_name="act_max" +): + """Set amax of uncalibrated experts to a given value or the max of existing amax value from other experts. + + Args: + experts: a list of experts + set_amax_value: set amax value to the given value. 
+ If None, set amax value to the max of existing amax value from other experts. + + Returns: + uncalibrated_experts: a list of uncalibrated experts + """ + uncalibrated_experts = [] + # get the max amax value from all experts + if set_amax_value is None: + amax_values = [ + get_nested_attr(module, attr_name) for module in experts if get_nested_attr(module, attr_name) is not None + ] + if len(amax_values) == 0: + return uncalibrated_experts + # Flatten all tensors to 1D before concatenation + flat_values = [t.reshape(-1) for t in amax_values] + all_values = torch.cat(flat_values) + set_amax_value = torch.max(all_values) + + for module in experts: + if get_nested_attr(module, attr_name) is None: + logger.warning_once( + "Missing amax value of expert layers." + "This typically occurs in MoE models when certain experts are not activated during calibration. " + "Consider increasing your calibration dataset size to ensure all experts are exercised." + ) + # Use float32 dtype explicitly to ensure we create a floating point tensor + if not isinstance(set_amax_value, torch.Tensor): + set_amax_value = torch.tensor(set_amax_value, dtype=torch.float32) + set_nested_attr(module, attr_name, set_amax_value) + # uncalibrated_experts.append(module) + + +# Please refer to: https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/ +# 4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/unified_export_hf.py#L195-L207 +def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_name="act_max"): + if layer_name is not None: + parts = layer_name.split(".") + if "experts" not in parts: + raise ValueError + idx = parts.index("experts") + moe_name = ".".join(parts[:idx]) + model = get_module(model, moe_name) + # Handle input quantizers of experts that are not calibrated + for name, sub_module in model.named_modules(): + if not (is_moe(sub_module) and hasattr(sub_module, "experts")): + continue + expert_linear_names = get_expert_linear_names(sub_module) + for linear_name in expert_linear_names: + if isinstance(sub_module.experts, collections.abc.Iterable): + # For other MoE models (like Mixtral) with iterable experts + try: + set_amax_for_uncalibrated_experts( + [getattr(expert, linear_name, None) for expert in sub_module.experts], attr_name=attr_name + ) + except AttributeError as e: + # Provide more helpful debugging information + expert_types = list(set(type(expert).__name__ for expert in sub_module.experts)) + raise AttributeError( + f"Failed to access attribute '{linear_name}' on experts. " + f"MoE module type: {type(sub_module).__name__}, " + f"Expert types: {expert_types}, " + f"Expected linear names: {expert_linear_names}. " + f"This suggests the get_expert_linear_names function may need " + f"to be updated for this model architecture. " + f"Original error: {e}" + ) from e + else: + # Unsupported MoE model structure + raise NotImplementedError( + f"MoE model with experts type '{type(sub_module.experts).__name__}' is not supported in export." + f"Please file an issue or add support for this model architecture." 
+                )
+
+
+# Adapted from https://github.com/vllm-project/llm-compressor/blob/
+# 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144
+def copy_python_files_from_model_cache(model, save_path: str):
+    config = model.config
+    cache_path = None
+    if hasattr(config, "_name_or_path"):
+        import os
+        import shutil
+
+        from huggingface_hub import hf_hub_download
+        from transformers import TRANSFORMERS_CACHE
+        from transformers.utils import http_user_agent
+
+        cache_path = config._name_or_path
+        if not os.path.exists(cache_path):
+            user_agent = http_user_agent()
+            config_file_path = hf_hub_download(
+                repo_id=cache_path,
+                filename="config.json",
+                cache_dir=TRANSFORMERS_CACHE,
+                force_download=False,
+                user_agent=user_agent,
+            )
+            cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
+
+        for file in os.listdir(cache_path):
+            full_file_name = os.path.join(cache_path, file)
+            if file.endswith(".py") and os.path.isfile(full_file_name):
+                logger.debug(f"Transferring {full_file_name} to {save_path}")
+                shutil.copy(full_file_name, save_path)
diff --git a/auto_round/utils/quantization_utils.py b/auto_round/utils/quantization_utils.py
new file mode 100644
index 000000000..bf4906c18
--- /dev/null
+++ b/auto_round/utils/quantization_utils.py
@@ -0,0 +1,1206 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import os
+import re
+import sys
+from dataclasses import asdict, fields
+from typing import Any, Callable, Dict, List, Tuple, Union
+
+import torch
+import transformers
+from torch.amp import autocast
+
+from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType
+from auto_round.logger import logger
+from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme
+
+
+def block_forward(
+    block: torch.nn.Module,
+    input_ids: torch.Tensor,
+    input_others: dict,
+    amp: bool = False,
+    amp_dtype: torch.dtype = torch.float16,
+    device: Union[str, torch.device] = torch.device("cpu"),
+    output_return_id: int = 0,
+) -> Union[torch.Tensor, dict]:
+    """Performs a forward pass through a block with the given inputs.
+
+    Args:
+        block: The block to perform the forward pass on.
+        input_ids: The input IDs.
+        input_others: A dictionary containing other input data.
+        amp: A boolean indicating whether to use automatic mixed precision.
+        amp_dtype: The data type for automatic mixed precision.
+        device: The target device (string or torch.device).
+        output_return_id: if the output contains more than one tensor, return the tensor at this index.
+
+    Returns:
+        output: The output of the forward pass.
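+
+    Example (illustrative; assumes a captured calibration batch and a CUDA device):
+        out = block_forward(block, input_ids, input_others, amp=True, amp_dtype=torch.bfloat16, device="cuda")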
+    """
+    from auto_round.utils.model_utils import to_device
+
+    if input_ids.device != device:
+        input_ids = to_device(input_ids, device)
+        input_others = to_device(input_others, device)
+    input_tuple = input_others.pop("positional_inputs", None)
+    if "alibi" in input_others.keys() and input_others["alibi"] is not None:
+        alibi = input_others["alibi"]
+        input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3])
+    if amp:
+        with autocast(device_type=str(device).split(":")[0], dtype=amp_dtype):  # pragma: no cover
+            output = block(input_ids, *input_tuple, **input_others)
+    else:
+        output = block(input_ids, *input_tuple, **input_others)
+    if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)):
+        output = output[output_return_id]
+    return output
+
+
+def collect_best_params(block):
+    params = {}
+    for n, m in block.named_modules():
+        if hasattr(m, "orig_layer"):
+            params[n] = {}
+            for key in m.params.keys():
+                params[n][key] = copy.deepcopy(m.params[key].data)
+    return params
+
+
+def infer_bits_by_data_type(data_type: str):
+    """Infer bits by data_type
+
+    Args:
+        data_type (str): data_type
+
+    Returns:
+        int: bits inferred by data_type, None means cannot infer correct bits by data_type
+    """
+    from auto_round.utils.constants import SUPPORTED_DTYPES
+
+    if data_type is None:
+        return 16
+    for supported_dtype in SUPPORTED_DTYPES:
+        if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype):
+            ## first check whether the next two characters form a two-digit bit width
+            suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2]
+            if str.isdigit(suc_2str):
+                return int(suc_2str)
+            if str.isdigit(data_type[len(supported_dtype)]):
+                return int(data_type[len(supported_dtype)])
+    return None
+
+
+def check_to_quantized(config):
+    """Checks if the configuration is valid for quantization.
+
+    Args:
+        config (dict or object): The configuration to check. It can be either a
+            dictionary with a 'bits' key or an object with a 'bits' attribute.
+
+    Returns:
+        bool: True if the configuration calls for quantization (weight bits <= 8
+            or activation bits <= 8), False otherwise.
+    """
+
+    if isinstance(config, (dict, QuantizationScheme)):
+        bits = int(config.get("bits", 16))
+        act_bits = int(config.get("act_bits", 16))
+    elif hasattr(config, "orig_layer"):
+        bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16
+        act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16
+    else:
+        bits = int(config.bits) if hasattr(config, "bits") else 16
+        act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16
+
+    return bits <= 8 or act_bits <= 8
+
+
+def set_layer_config(
+    model: torch.nn.Module,
+    layer_config: dict[str, Union[str, dict, "QuantizationScheme"]],
+    default_scheme: Union[str, "QuantizationScheme"],
+    default_scale_dtype: torch.dtype | str,
+    supported_types: tuple,
+    inner_supported_types: tuple,
+    quant_block_list=None,
+    fp_layers: str = "",
+    quant_lm_head: bool = False,
+    enable_gguf_official_mixed: bool = True,
+    is_mllm: bool = False,
+) -> tuple[dict, bool, dict]:
+    """
+    Normalize, validate, and expand layer-specific quantization configs.
+    Returns (final_layer_config, has_quant_layer_outside_block, regex_config).
+    """
+
+    from auto_round.schemes import get_gguf_scheme
+    from auto_round.utils.dtype_utils import is_mx_fp, is_nv_fp
+    from auto_round.utils.model_utils import get_layer_names_in_block, get_lm_head_name, get_module
+
+    # ---- helpers -------------------------------------------------
+    def dispatch_layer_config(layer_config: dict[str, dict]) -> None:
+        """Assign scheme values as attributes to matched modules."""
+        for layer_name, scheme in layer_config.items():
+            module = get_module(model, layer_name)
+            for attr, value in scheme.items():
+                setattr(module, attr, value)
+
+    def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict:
+        """Convert config entry into dict and validate keys."""
+        if isinstance(item, str):
+            config = asdict(preset_name_to_scheme(item.upper()))
+        elif isinstance(item, QuantizationScheme):
+            config = asdict(item)
+        elif isinstance(item, dict):
+            invalid = set(item) - set(scheme_keys + ("fixed_by_user", "scale_dtype"))
+            if invalid:
+                raise ValueError(
+                    f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}"
+                )
+            config = dict(item)
+        else:
+            raise TypeError(
+                f"Unsupported type for layer_config[{layer_name}]: {type(item)}. "
+                f"Expected str, dict, or QuantizationScheme."
+            )
+        # Clean up
+        config = {k: v for k, v in config.items() if v is not None}
+        config["fixed_by_user"] = True
+        return config
+
+    # ---- main logic ----------------------------------------------
+    scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",)
+    layer_config = copy.deepcopy(layer_config) or {}
+
+    # 1. fp_layers -> force 16
+    for name in get_fp_layer_names(model, fp_layers):
+        layer_config[name] = {
+            "bits": 16,
+            "act_bits": 16,
+            "data_type": "float",
+            "act_data_type": "float",
+            "fixed_by_user": True,
+        }
+
+    # 2. normalize
+    layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()}
+
+    # 3. infer missing bits
+    for cfg in layer_config.values():
+        if "data_type" in cfg and "bits" not in cfg:
+            if (b := infer_bits_by_data_type(cfg["data_type"])) is not None:
+                cfg["bits"] = b
+        if "act_data_type" in cfg and "act_bits" not in cfg:
+            if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None:
+                cfg["act_bits"] = b
+
+    # 4. fill defaults
+    if isinstance(default_scheme, str):
+        default_dict = asdict(preset_name_to_scheme(default_scheme.upper()))
+    else:
+        default_dict = asdict(default_scheme)
+    default_dict["scale_dtype"] = default_scale_dtype
+    for cfg in layer_config.values():
+        for key in scheme_keys:
+            cfg.setdefault(key, copy.deepcopy(default_dict.get(key)))
+
+    # 5. collect supported modules
+    gguf_name = get_gguf_scheme(default_scheme)
+    if gguf_name and torch.nn.Embedding not in supported_types:
+        supported_types = (*supported_types, torch.nn.Embedding)
+
+    all_supported_layer_names, embedding_layer_names = [], []
+    all_module_names = []
+    for n, m in model.named_modules():
+        all_module_names.append(n)
+        # cleanup stale attributes
+        for key in scheme_keys:
+            if hasattr(m, key):
+                delattr(m, key)
+        if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types:
+            continue
+        all_supported_layer_names.append(n)
+        if isinstance(m, torch.nn.Embedding):
+            embedding_layer_names.append(n)
+
+    # 6. expand regex configs
+    regex_config = {}
+    for name in list(layer_config.keys()):
+        if name in all_supported_layer_names:
+            continue
+        if name in all_module_names:
+            m = get_module(model, name)
+            if len(list(m.children())) == 0 and type(m) not in supported_types:
+                layer_config.pop(name)
+                logger.warning(f"{name} is not supported in the current scheme, ignoring its setting in `layer_config`")
+                continue
+
+        regex = re.compile(name)
+        matched = [ln for ln in all_supported_layer_names if regex.search(ln)]
+        if not matched:
+            raise ValueError(f"Invalid '{name}' in layer_config, no match found.")
+        val = layer_config.pop(name)
+        regex_config[name] = val  # keep regex config
+        for match in matched:
+            layer_config[match] = val
+    # regex_config = None if len(regex_config)==0 else regex_config
+
+    # 7. lm_head
+    lm_head_name = get_lm_head_name(model)
+    tie_word_embeddings = False
+    if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
+        tie_word_embeddings = model.config.tie_word_embeddings
+
+    if quant_lm_head and tie_word_embeddings:
+        quant_lm_head = False
+        logger.warning(
+            "reset `quant_lm_head` to False, as quantizing lm_head with tied weights is not currently supported"
+        )
+
+    if lm_head_name not in layer_config and quant_lm_head:
+        layer_config[lm_head_name] = copy.deepcopy(default_dict)
+
+    # 8. enforce shape divisibility for int weight-only
+    if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name:
+        for n, m in model.named_modules():
+            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
+                if m.weight.shape[0] % 32 or m.weight.shape[1] % 32:
+                    layer_config.setdefault(n, copy.deepcopy(default_dict))
+                    layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
+                    logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
+    # enforce shape divisibility for mxfp/nvfp
+    if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
+        for n, m in model.named_modules():
+            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
+                if m.weight.shape[1] % default_dict["group_size"]:
+                    layer_config.setdefault(n, copy.deepcopy(default_dict))
+                    layer_config[n].update(
+                        {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
+                    )
+                    logger.warning_once(
+                        f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
+                    )
+
+    # 9. block layers: mark as in_blocks=True
+    for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types):
+        if name not in layer_config:
+            layer_config[name] = copy.deepcopy(default_dict)
+            layer_config[name]["fixed_by_user"] = False
+        layer_config[name]["in_blocks"] = True
+
+    # ---- restore: ensure missing in_blocks are set to False and compute flag ----
+    has_qlayer_outside_block = False
+    for cfg in layer_config.values():
+        if "in_blocks" not in cfg:
+            cfg["in_blocks"] = False
+        # mark layer outside block
+        if not cfg["in_blocks"] and check_to_quantized(cfg):
+            has_qlayer_outside_block = True
+
+    # 10. GGUF handling
+    if not gguf_name:
+        dispatch_layer_config(layer_config)
+        return layer_config, has_qlayer_outside_block, regex_config
+
+    # embed + lm_head defaults for gguf
+    if lm_head_name not in layer_config and not tie_word_embeddings:
+        cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]]
+        cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype}
+        layer_config[lm_head_name] = cfg
+        has_qlayer_outside_block = True
+    for emd_name in embedding_layer_names:
+        if emd_name in layer_config:
+            continue
+        if not tie_word_embeddings:
+            cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]]
+        else:
+            cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]]
+        cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype}
+        layer_config[emd_name] = cfg
+
+    if enable_gguf_official_mixed:
+        model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT
+        layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type)
+
+    dispatch_layer_config(layer_config)
+    return layer_config, has_qlayer_outside_block, regex_config
+
+
+def get_gguf_qtype_by_layer_config(layer_config):
+    import gguf  # pylint: disable=E0401
+
+    if layer_config["bits"] >= 16:
+        return None
+    bits = layer_config["bits"]
+    super_bits = layer_config.get("super_bits", None)
+    sym = layer_config["sym"]
+    group_size = layer_config.get("group_size", None)
+    super_group_size = layer_config.get("super_group_size", None)
+    if bits == 2 and super_bits == 4 and not sym and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q2_K
+    if bits == 3 and super_bits == 6 and sym and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q3_K
+    if bits == 4:
+        if super_bits is not None and super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
+            return gguf.GGMLQuantizationType.Q4_K
+        if super_bits is None and sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q4_0
+        if super_bits is None and not sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q4_1
+    if bits == 5:
+        if super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
+            return gguf.GGMLQuantizationType.Q5_K
+        if super_bits is None and sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q5_0
+        if super_bits is None and not sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q5_1
+    if bits == 6 and super_bits == 8 and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q6_K
+    if bits == 8 and sym and group_size == 32:
+        return gguf.GGMLQuantizationType.Q8_0
+    raise ValueError("Unknown layer config")
+
+
+##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129
+def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT):
+    # # TODO: support for other format later
+    # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None)
+    import gguf  # pylint: disable=E0401
+
+    from auto_round.utils.misc_utils import LazyImport
+    from auto_round.utils.model_utils import _get_digital_in_layer_name, get_lm_head_name, get_module
+
+    # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture
+    convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf")
+
+    model_architecture = convert_hf_to_gguf.get_model_architecture(
+        hparams=model.config.to_dict(), model_type=model_type
+    )
+    try:
+        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type)
+    except NotImplementedError:
+        return layer_config, {}
+
+    n_layer = None
+    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]:
+        sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config"
+        if hasattr(model.config, name):
+            n_layer = getattr(model.config, name)
+            break
+        if hasattr(model.config, sub_attr):
+            if hasattr(getattr(model.config, sub_attr), name):
+                n_layer = getattr(getattr(model.config, sub_attr), name)
+                break
+    if n_layer is None:
+        return layer_config, {}
+
+    tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer)
+
+    def _set_config(config, target_config):
+        for k, v in target_config.items():
+            if isinstance(config, dict):
+                config[k] = v
+            else:
+                setattr(config, k, v)
+        return config
+
+    gguf_format_config = {}
+    lm_head_name = get_lm_head_name(model)
+    inner_gguf_format = GGUF_CONFIG[target_gguf_format]["mostly"]
+    # ggml_type = getattr(gguf.GGMLQuantizationType, inner_gguf_format.split(":")[-1].upper())
+    block_size = GGML_QUANT_SIZES[inner_gguf_format.split(":")[-1].lower()][0]
+    tie_word_embeddings = True
+    if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
+        tie_word_embeddings = model.config.tie_word_embeddings
+
+    n_gqa = 1
+    if (
+        hasattr(model, "config")
+        and hasattr(model.config, "num_attention_heads")
+        and hasattr(model.config, "num_key_value_heads")
+    ):
+        n_gqa = model.config.num_attention_heads // model.config.num_key_value_heads
+    n_expert = 0
+    for name in ["num_experts", "num_local_experts", "n_routed_experts"]:
+        if hasattr(model.config, name):
+            n_expert = getattr(model.config, name)
+
+    i_attention_wv = 0
+    i_ffn_down = 0
+    layer_config_copy = copy.deepcopy(layer_config)
+    target_bits = None
+    if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit():
+        target_bits = int(inner_gguf_format[6])
+
+    for layer_name, config in layer_config_copy.items():
+        if not check_to_quantized(config):
+            continue
+        new_type = GGUF_CONFIG[target_gguf_format]["mostly"]
+        layer = get_module(model, layer_name)
+        if type(layer) == transformers.pytorch_utils.Conv1D:
+            input_features = layer.weight.shape[0]
+        else:
+            input_features = layer.weight.shape[-1]
+        i_layer = _get_digital_in_layer_name(layer_name)
+
+        if lm_head_name is not None and layer_name == lm_head_name:
+            target_bits = int(re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["lm_head"]).group(1))
+        if isinstance(layer, torch.nn.Embedding):
+            target_bits = int(
+                re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1)
+            )
+
+        gguf_name = tensor_map.get_name(layer_name)
+        bits_index = 6
+        if config.get("fixed_by_user", False):
+            if "bits" not in config:
+                logger.warning(
+                    f"Setting layer_config requires providing bits, {layer_name} has no bits,"
+                    f" using bits={target_bits} instead."
+                )
+                new_type = new_type[:bits_index] + str(target_bits) + new_type[bits_index + 1 :]
+            else:
+                config_tmp = config.copy()
+                scheme_keys = [f.name for f in fields(QuantizationScheme)]
+                for key in config.keys():
+                    if key not in scheme_keys:
+                        config_tmp.pop(key, None)
+                matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp))  # check matched
+                if not matched_scheme:
+                    if config.get("super_group_size", None) is not None:
+                        new_type = new_type[:bits_index] + str(config["bits"]) + "_k"
+                    if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG:
+                        prefix_idx = 0 if config.get("sym", True) else 1
+                        new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}"
+                        if new_type not in GGUF_INNER_CONFIG:
+                            new_type = new_type[:bits_index] + str(config["bits"]) + f"_{1-prefix_idx}"
+                        if new_type not in GGUF_INNER_CONFIG:
+                            raise ValueError(
+                                f"the setting in layer_config {layer_name} "
+                                f"could not match any supported gguf format, please check."
+                            )
+                    else:
+                        logger.warning_once(
+                            f"the setting in layer_config {layer_name} "
+                            f"could not match any supported gguf format, reset to {new_type}"
+                        )
+                    new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
+                    new_type = _search_gguf_type(new_type)
+                    if new_type is None:
+                        raise ValueError(f"invalid bit setting for {layer_name}")
+        elif target_bits is not None and "bits" in config and config["bits"] != target_bits:
+            new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
+            new_type = _search_gguf_type(new_type)
+            if new_type is None:
+                raise ValueError(f"invalid bit setting for {layer_name}")
+        elif lm_head_name is not None and layer_name == lm_head_name and not tie_word_embeddings:
+            if gguf.MODEL_ARCH.FALCON == model_class.model_arch or input_features % block_size != 0:
+                new_type = "gguf:q8_0"
+            elif "lm_head" in GGUF_CONFIG[target_gguf_format]:
+                new_type = GGUF_CONFIG[target_gguf_format]["lm_head"]
+            elif new_type != "gguf:q8_0":
+                new_type = "gguf:q6_k"
+        elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings:
+            pass
+        elif isinstance(layer, torch.nn.Embedding):
+            if "embedding" in GGUF_CONFIG[target_gguf_format]:
+                new_type = GGUF_CONFIG[target_gguf_format]["embedding"]
+        elif gguf_name is None:
+            pass
+        # attn_v
+        elif "attn_v" in gguf_name:
+            if target_gguf_format == "gguf:q2_k":
+                new_type = "gguf:q4_k" if n_gqa >= 4 else "gguf:q3_k"
+            elif target_gguf_format == "gguf:q2_k_s" and n_gqa >= 4:
+                new_type = "gguf:q4_k"
+            elif target_gguf_format == "gguf:q3_k_m":
+                new_type = "gguf:q5_k" if i_attention_wv < 2 else "gguf:q4_k"
+            elif target_gguf_format == "gguf:q3_k_l":
+                new_type = "gguf:q5_k"
+            elif (target_gguf_format == "gguf:q4_k_m" or target_gguf_format == "gguf:q5_k_m") and _use_more_bits(
+                i_layer, n_layer
+            ):
+                new_type = "gguf:q6_k"
+            elif target_gguf_format == "gguf:q4_k_s" and i_attention_wv < 4:
+                new_type = "gguf:q5_k"
+            ## TODO: check which models are grouped into LLM_TYPE_70B
+            # if (qs.model.type == LLM_TYPE_70B) {
+            #     // In the 70B model we have 8 heads sharing the same attn_v weights.
+ # As a result, the attn_v.weight tensor is + # // 8x smaller compared to attn_q.weight.Hence, we can get a nice boost in quantization accuracy with + # // nearly negligible increase in model size by quantizing this tensor with more bits: + # if + # (new_type == GGML_TYPE_Q3_K | | new_type == GGML_TYPE_Q4_K) + # new_type = GGML_TYPE_Q5_K; + # } + if n_expert == 8: + new_type = "gguf:q8_k" + i_attention_wv += 1 + + elif "attn_k" in gguf_name: + if n_expert == 8: + new_type = "gguf:q8_0" + # ffn_down + elif "ffn_down" in gguf_name: + if target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q2_k_s": + if i_layer < n_layer / 8: + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_m": + if i_layer < n_layer / 16: + new_type = "gguf:q5_k" + elif gguf.MODEL_ARCH.FALCON == model_class.model_arch or _use_more_bits(i_layer, n_layer): + new_type = "gguf:q4_k" + else: + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_l": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + new_type = "gguf:q4_k" + else: + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q4_k_m": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + if i_layer < n_layer // 16: + new_type = "gguf:q6_k" + elif _use_more_bits(i_layer, n_layer): + new_type = "gguf:q5_k" + else: + new_type = "gguf:q4_k" + else: + if _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif target_gguf_format == "gguf:q5_k_m" and _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif ( + target_gguf_format == "gguf:q4_k_s" + and model_class.model_arch != gguf.MODEL_ARCH.FALCON + and i_layer < n_layer / 8 + ): + new_type = "gguf:q5_k" + elif (target_gguf_format == "gguf:q4_0" or target_gguf_format == "gguf:q5_0") and i_layer < n_layer / 8: + if target_gguf_format == "gguf:q4_0": + new_type = "gguf:q4_1" + else: + new_type = "gguf:q5_1" + i_ffn_down += 1 + + # attn_output + elif "attn_output" in gguf_name: + if gguf.MODEL_ARCH.FALCON != model_class.model_arch: + if n_expert == 8: + if target_gguf_format in ( + "gguf:q2_k", + "gguf:q3_k_s", + "gguf:q3_k_m", + "gguf:q4_k_s", + "gguf:q4_k_m", + "gguf:q5_k", + ): + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_m": + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q5_k" + else: + if target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q4_k" + # attn_qkv + elif "attn_qkv" in gguf_name: + if target_gguf_format in ("gguf:q3_k_m", "gguf:q3_k_l"): + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q4_k_m": + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q5_k_m": + new_type = "gguf:q5_k" + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = _gguf_type_fallback(new_type) + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = "gguf:bf16" + logger.warning( + f"fallback {layer_name} to {new_type}, " + f"because input_features({input_features}) % block_size({block_size}) != 0" + ) + # for deepseek v2 + if layer_name.endswith("kv_b_proj") and new_type.endswith("_k") and "Deepseek" in model.config.architectures[0]: + fallback = False + + # calc if need fallback + qk_nope_head_dim = model.config.qk_nope_head_dim + kv_b_shape = get_module(model, layer_name).weight.shape + + if ( + qk_nope_head_dim < QK_K + or qk_nope_head_dim % QK_K != 0 + 
or kv_b_shape[-1] < QK_K
+                or kv_b_shape[-1] % QK_K != 0
+            ):
+                fallback = True
+            if fallback:
+                tmp_type = _gguf_type_fallback(new_type)
+                logger.warning_once(
+                    f"self_attn.kv_b_proj does not support the use of {new_type}, replace it with {tmp_type}"
+                )
+                new_type = tmp_type
+
+        target_config = GGUF_INNER_CONFIG[new_type]
+
+        _set_config(layer_config[layer_name], target_config)
+        _set_config(layer, target_config)
+        gguf_format_config[layer_name] = new_type
+
+    return layer_config, gguf_format_config
+
+
+def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None):
+    """Checks if a model is compatible with the AutoAWQ GEMM kernel.
+
+    Args:
+        model: The model object to evaluate, typically a PyTorch model.
+        bits (int): The number of bits for quantization (must be 4 for compatibility).
+        group_size (int): The group size for quantization.
+        sym (bool): Whether symmetric quantization is used (not utilized in the current function logic).
+        layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each
+            configuration can specify a custom number of bits for the layer.
+
+    Returns:
+        tuple: A tuple containing:
+            - bool: `True` if the model is compatible, `False` otherwise.
+            - str: An error message describing why the model is incompatible, or an empty string if compatible.
+    """
+    from auto_round.utils.model_utils import get_layer_names_in_block, get_module
+
+    if bits != 4:
+        return False, "AutoAWQ GEMM kernel only supports 4 bits"
+    for n, m in model.named_modules():
+        if type(m) == transformers.pytorch_utils.Conv1D:
+            return False, "AutoAWQ GEMM kernel does not support conv1d"
+
+    layer_names = get_layer_names_in_block(model)
+    for layer_name in layer_names:
+        if (
+            layer_configs is not None
+            and layer_name in layer_configs.keys()
+            and layer_configs[layer_name].get("bits", bits) > 8
+        ):
+            continue
+
+        layer = get_module(model, layer_name)
+        if layer.in_features % group_size != 0:
+            return False, f"Layer {layer_name} in_features is not a multiple of group_size {group_size}"
+        if layer.out_features % (32 // bits) != 0:
+            return False, f"Layer {layer_name} out_features is not a multiple of 32 // bits"
+
+    return True, ""
+
+
+def check_need_act_calibration(
+    is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16
+) -> bool:
+    if act_bits is None or act_bits > 8:
+        return False
+    # None is dynamic
+    if is_act_dynamic is not None and not is_act_dynamic:
+        return True
+    if act_data_type is not None and "static" in act_data_type:
+        return True
+    return False
+
+
+def is_autoround_exllamav2_available():
+    """Checks if the AutoRound ExLlamaV2 kernels are available.
+
+    Returns:
+        bool:
+            True if the AutoRound ExLlamaV2 kernels are available, False otherwise.
+    """
+    res = True
+    try:
+        from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix
+    except ImportError as e:
+        res = False
+    return res
+
+
+def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
+    """
+    Configures and returns a QuantLinear class based on the specified backend and parameters.
+
+    Args:
+        backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin",
+            "exllama", and "cuda".
+        bits (int, optional): The number of bits for quantization. Default is 4.
+        group_size (int, optional): The group size for quantization. Default is 128.
+        sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False.
+
+    Returns:
+        class: The dynamically imported QuantLinear class configured according to the specified parameters.
+    """
+    use_triton = True
+    if bits not in [2, 4, 8]:
+        use_triton = False
+    disable_exllamav2 = True
+    disable_exllamav1 = False
+    disable_marlin = True
+    use_qigen = False
+    if "qigen" in backend:
+        use_triton = False
+        use_qigen = True
+    elif "triton" in backend:
+        use_triton = True
+    elif "marlin" in backend and sym:
+        use_triton = False
+        disable_marlin = False
+    elif "exllama" in backend:  ## need v1 code to export
+        use_triton = True  ## same with triton
+        disable_marlin = True
+    elif "cuda" in backend:
+        use_triton = False
+        disable_marlin = True
+        disable_exllamav2 = True
+        disable_exllamav1 = True
+    if use_triton:
+        from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear
+
+        return QuantLinear
+    try:
+        import auto_gptq  # pylint: disable=E0401
+    except ImportError:
+        logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}")
+        exit()
+
+    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear  # pylint: disable=E0401
+
+    from auto_round.utils.misc_utils import get_library_version
+
+    version = get_library_version("auto_gptq")
+    from packaging.version import Version
+
+    if Version(version) < Version("0.7.2"):
+        QuantLinear = dynamically_import_QuantLinear(
+            use_triton=use_triton,
+            desc_act=False,
+            group_size=group_size,
+            bits=bits,
+            disable_exllama=disable_exllamav1,
+            disable_exllamav2=disable_exllamav2,
+            use_qigen=use_qigen,
+            disable_marlin=disable_marlin,
+        )
+    else:
+        QuantLinear = dynamically_import_QuantLinear(  # pylint: disable=E1123
+            use_triton=use_triton,
+            desc_act=False,
+            group_size=group_size,
+            bits=bits,
+            disable_exllama=disable_exllamav1,
+            disable_exllamav2=disable_exllamav2,
+            use_qigen=use_qigen,
+            use_marlin=not disable_marlin,
+        )
+    return QuantLinear
+
+
+def get_reciprocal(tensor):
+    if tensor.dtype == torch.float16:
+        tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5)
+    else:
+        tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
+    return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))
+
+
+def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None):
+    """
+    Check whether the input sequence length is within the limits defined
+    by the tokenizer and the model configuration.
+
+    Args:
+        input_seqlen (int): The length of the input sequence.
+        tokenizer: Optional, a HuggingFace tokenizer object.
+        model: Optional, a HuggingFace model object.
+
+    Raises:
+        ValueError: if the input sequence length is not valid.
+    """
+    if model is not None and hasattr(model, "config"):
+        model_config = model.config
+        if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings:
+            raise ValueError(
+                f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings("
+                f"{model_config.max_position_embeddings}). Please lower '--seqlen'"
+            )
+    if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length:
+        raise ValueError(
+            f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). "
+            "Please consider lowering '--seqlen' or increasing tokenizer.model_max_length."
+        )
+
+
+def filter_quantization_config(quantization_config):
+    default_dict = {
+        "amp": True,
+        "batch_size": 8,
+        "data_type": "int",
+        "dataset": "NeelNanda/pile-10k",
+        "enable_minmax_tuning": True,
+        "enable_norm_bias_tuning": False,
+        "enable_quanted_input": True,
+        "gradient_accumulate_steps": 1,
+        "iters": 200,
+        "low_gpu_mem_usage": False,
+        "nsamples": 128,
+        "scale_dtype": "torch.float16",
+        "seqlen": 2048,
+    }
+    iters = quantization_config.get("iters", 200)
+
+    default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3
+    default_dict["minmax_lr"] = default_dict["lr"]
+
+    for key in default_dict:
+        if key in quantization_config and default_dict[key] == quantization_config[key]:
+            quantization_config.pop(key)
+    for k in list(quantization_config.keys()):
+        if quantization_config[k] is None:
+            quantization_config.pop(k)
+
+    if quantization_config.get("act_bits", 16) >= 16:
+        quantization_config.pop("act_bits", None)
+        quantization_config.pop("act_data_type", None)
+        quantization_config.pop("act_dynamic", None)
+        quantization_config.pop("act_sym", None)
+        quantization_config.pop("act_group_size", None)
+
+
+def get_fp_layer_names(model: torch.nn.Module, fp_layers: str):
+    """Identifies and returns layers in the model to exclude from quantization.
+
+    This function processes a comma-separated list of full-precision (FP) layers,
+    matches them to the names of layers in the model, and returns a list of such
+    layers to exclude from quantization.
+
+    Args:
+        model (torch.nn.Module): The model whose layers will be inspected.
+        fp_layers (str): A comma-separated string of layer names to be excluded
+            from quantization. Whitespace is ignored in this string.
+
+    Returns:
+        list: A list of layer names that match the specified FP layers or are
+            subcomponents of those layers.
+    """
+    from auto_round.utils.constants import SUPPORTED_LAYER_TYPES
+
+    if not fp_layers:
+        return []
+    fp_layers = fp_layers.replace(" ", "").split(",")
+    all_layer_names = []
+    for n, m in model.named_modules():
+        if type(m) in SUPPORTED_LAYER_TYPES:
+            all_layer_names.append(n)
+    not_to_quantized_layers = []
+
+    for fp_layer in fp_layers:
+        if fp_layer == "":
+            continue
+        if fp_layer in all_layer_names:
+            not_to_quantized_layers.append(fp_layer)
+            continue
+        if fp_layer[-1].isdigit():
+            fp_layer = fp_layer + "."  ## tricky setting: append "." so e.g. "layers.1" does not also match "layers.10"
+        for name in all_layer_names:
+            if fp_layer in name:
+                not_to_quantized_layers.append(name)
+    logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}")
+    return not_to_quantized_layers
+
+
+def _use_more_bits(i_layer: int, n_layer: int):
+    return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2)
+
+
+def _search_gguf_type(gguf_type):
+    if gguf_type in GGUF_INNER_CONFIG:
+        return gguf_type
+    pattern = re.compile("gguf:q([0-9]{1,})_[01k]")
+    bits = re.search(pattern, gguf_type)
+    if not bits:
+        raise KeyError(f"{gguf_type} is not a correct gguf type, please check")
+
+    for suffix in ["_k", "_0", "_1"]:
+        if gguf_type.endswith(suffix):
+            continue
+        if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG:
+            return tmp_type
+    return None
+
+
+def _gguf_type_fallback(gguf_type: str) -> str:
+    gguf_type = gguf_type.lower()
+    if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"):
+        gguf_type = "gguf:q5_0"
+    elif gguf_type == "gguf:q5_k":
+        gguf_type = "gguf:q5_0"
+    elif gguf_type == "gguf:q6_k":
+        gguf_type = "gguf:q8_0"
+    return gguf_type
+
+
+def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT):
+    import argparse
+
+    from auto_round.export.export_to_gguf.convert import download_convert_file
+    from auto_round.logger import logger
+    from auto_round.utils.model_utils import download_hf_model, get_gguf_architecture
+
+    formats = sorted(formats, key=lambda x: len(x))
+    export_gguf = False
+    for f in formats:
+        if f.startswith("gguf"):
+            export_gguf = True
+
+        if f.startswith("gguf") and f not in GGUF_CONFIG:
+            logger.error(f"{f} is not supported, please check.")
+
+    redownload = False
+    if export_gguf:
+        try:
+            from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
+                ModelBase,
+                ModelType,
+                get_model_architecture,
+            )
+
+            if isinstance(args_or_ar.model, str):
+                model_path = args_or_ar.model
+            else:
+                model_path = args_or_ar.model.name_or_path
+            if not os.path.isdir(model_path):
+                model_path = download_hf_model(model_path)
+            model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
+            if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
+                logger.warning(
+                    f"The current version of the gguf export does not support {model_architecture};"
+                    " the dependency file will be re-downloaded."
+                )
+                redownload = True
+        except ModuleNotFoundError as e:
+            if "convert_hf_to_gguf" in str(e):
+                logger.warning("GGUF export dependency file was not found, downloading it from GitHub.")
+                redownload = True
+        except AttributeError as e:
+            raise ImportError(
+                "Please use the latest gguf-py; you can use the following command to install it:\n"
+                "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
+            ) from e
+        download_convert_file(redownload)
+
+    try:
+        from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
+            ModelBase,
+            ModelType,
+        )
+    except ImportError as e:
+        raise ImportError(
+            "Please use the latest gguf-py; you can use the following command to install it:\n"
+            "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
+ ) + if isinstance(args_or_ar.model, str): + model_path = args_or_ar.model + else: + model_path = args_or_ar.model.name_or_path + if not os.path.isdir(model_path): + model_path = download_hf_model(model_path) + model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) + if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: + logger.error(f"Model {model_architecture} is not supported to export gguf format.") + sys.exit(1) + + pattern = re.compile(r"q\d_k") + pre_dq_format = "" + unsupported_list, reset_list = [], [] + for format in GGUF_CONFIG: + if format in formats: + if format == "q6_k_s": + logger.warning("Please note that q6_k_s is q6_k.") + + if re.search(pattern, format): + if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format: + logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.") + sys.exit(-1) + else: + pre_dq_format = format + + unsupported_list, reset_list = [], [] + gguf_config = GGUF_CONFIG[format] + for k, v in gguf_config.items(): + if not hasattr(args_or_ar, k): + continue + if k == "data_type": + if re.search(r"q\d_1", format) and len(formats) > 1: + v = "int" + if k == "sym" and isinstance(args_or_ar, argparse.Namespace): + k = "asym" + v = not v + if getattr(args_or_ar, k) != v: + unsupported_list.append(f"{k}={getattr(args_or_ar, k)}") + reset_list.append(f"{k}={v}") + setattr(args_or_ar, k, v) + if len(unsupported_list) > 0: + logger.info( + f"format {format} does not support for {', '.join(unsupported_list)}," + f" reset to {', '.join(reset_list)}." + ) + # Removed obsolete commented-out block for improved readability and maintainability. + return args_or_ar + + +def get_layer_features(layer): + """Extracts input and output feature dimensions for supported layers.""" + from auto_round.utils.constants import LinearAllreduce, LinearLayer, deepspeed_exists + + if type(layer) == torch.nn.Linear: + return layer.in_features, layer.out_features + elif type(layer) == transformers.pytorch_utils.Conv1D: # TODO: Verify correctness + return layer.weight.shape[0], layer.weight.shape[1] + elif isinstance(layer, torch.nn.Embedding): + return layer.num_embeddings, layer.embedding_dim + elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce): + return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim) + elif "FP8Linear" in layer.__class__.__name__: + return layer.in_features, layer.out_features + return None, None # Unsupported layer type + + +def get_common_prefix(paths): + # Split each path into components and find the common prefix + split_paths = [path.split(".") for path in paths] + common_prefix = split_paths[0] + for path in split_paths[1:]: + common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] + return ".".join(common_prefix) + + +def extract_block_names_to_str(quant_block_list): + if not isinstance(quant_block_list, (list, tuple)): + return None + # Extract common prefix for each list + prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] + # Join prefixes into a single string + return ",".join(prefixes) + + +def find_matching_blocks(model, all_blocks, to_quant_block_names): + """ + Find and return matching blocks in the model based on to_quant_block_names. + + Args: + model: The model (not used in this specific function but kept for completeness). + all_blocks: List of lists, where each inner list contains full block names in the model. 
+        to_quant_block_names: Comma-separated string of target block names to match.
+
+    Returns:
+        target_blocks: List of lists containing full paths of matching blocks in the model.
+    """
+    if not to_quant_block_names:
+        return all_blocks
+    to_quant_block_list = to_quant_block_names
+    if isinstance(to_quant_block_names, list) or isinstance(to_quant_block_names, tuple):
+        return to_quant_block_names
+    if isinstance(to_quant_block_names, str):
+        to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")]
+    target_blocks = []
+    for block_list in all_blocks:
+        matched_sublist = []
+        for name in to_quant_block_list:
+            matches = [block for block in block_list if re.search(name, block)]
+            if matches:
+                matched_sublist.extend(matches)
+        if matched_sublist:
+            target_blocks.append(matched_sublist)
+    if not target_blocks:
+        raise ValueError(
+            "No block names matched. Please check the input for to_quant_block_name, "
+            "or set to_quant_block_name to None to automatically match quantizable blocks."
+        )
+    return target_blocks
+
+
+def get_scale_shape(weight, group_size):
+    """Computes the shape of the scale tensor for quantization based on the weight tensor and group size.
+
+    Args:
+        weight (torch.Tensor): The weight tensor of the layer.
+        group_size (int): The size of the groups for quantization.
+
+    Returns:
+        The shape of the scale tensor to be used for quantization.
+    """
+    if group_size == 0:
+        return 1
+    elif group_size == -1 or weight.shape[1] < group_size:
+        shape = weight.shape[0]
+    else:
+        shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size)
+
+    return shape
+
+
+def init_cache(positional_inputs, inputs):
+    """
+    Initializes special model inputs by adding positional inputs if missing.
+
+    Args:
+        positional_inputs (list): List of positional inputs to add to inputs.
+        inputs (dict): Dictionary of model inputs.
+
+    Modifies:
+        inputs (dict): Adds "positional_inputs" key if not present.
+    """
+    from auto_round.utils.model_utils import to_device
+
+    if "positional_inputs" not in inputs:  # for chatglm Series
+        inputs["positional_inputs"] = []
+    if len(positional_inputs) > 0:
+        inputs["positional_inputs"] = to_device(positional_inputs)
+
+
+def reset_params(inputs):
+    """
+    Resets specific input parameters to avoid saving the key-value cache during fine-tuning.
+
+    Args:
+        inputs (dict): Dictionary of model inputs.
+
+    Modifies:
+        inputs (dict): Sets "use_cache" to False if the key is present.
+    """
+    if "use_cache" in inputs.keys():  # Not storing kv cache
+        inputs["use_cache"] = False
+
+
+def check_skippable_keywords(key):
+    """
+    Returns True if the cached input key should be stored, and False if it can be
+    skipped (e.g., key-value cache entries) during quantization fine-tuning.
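+
+    Example (illustrative):
+        check_skippable_keywords("hidden_states")   # -> True, the key is stored
+        check_skippable_keywords("past_key_value")  # -> False, the key is skipped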
+ """ + skippable_cache_keys = ("past_key_value",) + for cache_key in skippable_cache_keys: + if cache_key not in key: + return True + return False From 2e67d022da4ebab251665c2cb58d66d45cfbbf65 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 26 Oct 2025 20:08:59 -0400 Subject: [PATCH 2/7] fix Signed-off-by: n1ck-guo --- auto_round/auto_scheme/gen_auto_scheme.py | 4 ++-- auto_round/export/export_to_autoround/export_to_fp8.py | 4 ++-- .../export/export_to_autoround/export_to_nvfp_mxfp.py | 2 +- auto_round/export/export_to_autoround/qlinear_fp.py | 5 ++--- .../export/export_to_autoround/qlinear_triton_act.py | 4 ++-- auto_round/export/export_to_awq/utils.py | 6 ++++-- auto_round/export/export_to_gguf/convert.py | 6 +++--- .../export/export_to_llmcompressor/export_to_static_fp.py | 4 ++-- auto_round/utils/device_utils.py | 2 +- auto_round/utils/quantization_utils.py | 6 +++--- auto_round_extension/torch/qlinear_torch.py | 6 +++--- auto_round_extension/torch/qlinear_torch_zp.py | 7 +++---- auto_round_extension/triton/qlinear_tritonv2.py | 5 +++-- 13 files changed, 31 insertions(+), 30 deletions(-) diff --git a/auto_round/auto_scheme/gen_auto_scheme.py b/auto_round/auto_scheme/gen_auto_scheme.py index 9ed0b21fa..acf57b963 100644 --- a/auto_round/auto_scheme/gen_auto_scheme.py +++ b/auto_round/auto_scheme/gen_auto_scheme.py @@ -22,7 +22,7 @@ from auto_round.auto_scheme.utils import compute_avg_bits_for_scheme from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG from auto_round.logger import logger -from auto_round.utils import _gguf_type_fallback, get_layer_features, get_module +from auto_round.utils import get_layer_features, get_module, gguf_type_fallback class GenScheme: @@ -128,7 +128,7 @@ def fallback_gguf_layer_config(self, layer_config: dict[str, dict]) -> dict[str, new_type = f"gguf:q{bits}_" + f"{1 - prefix_idx}" if new_type not in GGUF_INNER_CONFIG: current_type = f"gguf:q{bits}_k" - new_type = _gguf_type_fallback(current_type) + new_type = gguf_type_fallback(current_type) # Apply fallback configuration target_config = GGUF_INNER_CONFIG[new_type] diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 261f1dbbc..6c0f91410 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -30,12 +30,12 @@ from auto_round.schemes import QuantizationScheme from auto_round.utils import ( SUPPORTED_LAYER_TYPES, - _get_packing_device, check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, filter_quantization_config, get_module, + get_packing_device, set_module, ) @@ -89,7 +89,7 @@ def pack_layer(layer_name, model, data_type, device=None): Returns: None: The function modifies the model in place. 
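+
+    Example (illustrative; the layer name is hypothetical and varies by model):
+        pack_layer("model.layers.0.self_attn.q_proj", model, "fp8")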
""" - packing_device = _get_packing_device(device) + packing_device = get_packing_device(device) layer = get_module(model, layer_name) if hasattr(layer, "orig_layer"): layer = layer.orig_layer diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index e344b51b3..fc553b7f2 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -31,12 +31,12 @@ from auto_round.schemes import QuantizationScheme from auto_round.utils import ( SUPPORTED_LAYER_TYPES, - _get_packing_device, check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, filter_quantization_config, get_module, + get_packing_device, is_mx_fp, is_nv_fp, set_amax_for_all_moe_layers, diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index f7979e269..1e6846777 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,7 +37,7 @@ from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad -from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp +from auto_round.utils import BackendDataType, get_packing_device, is_mx_fp, is_nv_fp # from auto_round.utils import get_weight_compress_dtype logger = getLogger(__name__) @@ -139,7 +138,7 @@ def post_init(self): pass def pack(self, linear, scales, zeros=None, g_idx=None, global_scale=None, input_global_scale=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) if getattr(linear, "bias", None) is not None: self.bias = linear.bias.detach().to(torch.float16) diff --git a/auto_round/export/export_to_autoround/qlinear_triton_act.py b/auto_round/export/export_to_autoround/qlinear_triton_act.py index 16e5c6a97..7d5f9dee7 100644 --- a/auto_round/export/export_to_autoround/qlinear_triton_act.py +++ b/auto_round/export/export_to_autoround/qlinear_triton_act.py @@ -41,7 +41,7 @@ import torch.nn as nn import transformers -from auto_round.utils import _get_packing_device +from auto_round.utils import get_packing_device logger = getLogger(__name__) @@ -119,7 +119,7 @@ def post_init(self): pass def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() self.act_scales.data.copy_(act_scales.squeeze().clone()) diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index bb0cf3921..0052ec9b1 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -39,7 +39,7 @@ import torch.nn as nn from torch.autograd import Function -from auto_round.utils import _get_packing_device +from auto_round.utils import get_packing_device def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): @@ -102,6 +102,7 @@ def dequantize_gemm(qweight, qzeros, scales, 
bits, group_size): class WQLinearMMFunction(Function): + @staticmethod # ctx is the first argument to forward def forward( @@ -136,6 +137,7 @@ def forward( class WQLinear_GEMM(nn.Module): + def __init__(self, w_bit, group_size, in_features, out_features, bias, dev, training=False): super().__init__() @@ -193,7 +195,7 @@ def __init__(self, w_bit, group_size, in_features, out_features, bias, dev, trai @classmethod def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None, zeros=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) awq_linear = cls( w_bit, group_size, diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 37667dd3b..39cc4d01b 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, _get_packing_device, clean_module_parameter, get_module, is_fp8_model, logger +from auto_round.utils import LazyImport, clean_module_parameter, get_module, get_packing_device, is_fp8_model, logger gguf = LazyImport("gguf") @@ -179,7 +179,7 @@ def is_extra_tensor(tensor_name): def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin=None, d_wmin=None, imatrix=None): - device = _get_packing_device() + device = get_packing_device() data_torch = data_torch.to(torch.float32) scale = scale.to(torch.float32) if isinstance(scale, torch.Tensor) else scale zp = zp.to(torch.float32) if isinstance(zp, torch.Tensor) else zp @@ -204,7 +204,7 @@ def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin= def _quant_data(cls, data_torch, data_qtype, name, modify_name, bid): suffix = ".weight" - device = _get_packing_device() + device = get_packing_device() if suffix in name: layer_name = name[: -len(suffix)] module = get_module(cls.model, layer_name) diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py index 18931ffa2..fedf00ad6 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py @@ -29,12 +29,12 @@ from auto_round.export.utils import save_model from auto_round.utils import ( SUPPORTED_LAYER_TYPES, - _get_packing_device, check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, filter_quantization_config, get_module, + get_packing_device, logger, set_module, ) @@ -57,7 +57,7 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device: Returns: None: The function modifies the model in place. """ - packing_device = _get_packing_device(device) + packing_device = get_packing_device(device) layer = get_module(model, layer_name) if hasattr(layer, "orig_layer"): layer = layer.orig_layer diff --git a/auto_round/utils/device_utils.py b/auto_round/utils/device_utils.py index 260f04097..ce2e46133 100644 --- a/auto_round/utils/device_utils.py +++ b/auto_round/utils/device_utils.py @@ -303,7 +303,7 @@ def fake_cuda(): return orig_func -def _get_packing_device(device: str | torch.device | None = "auto") -> torch.device: +def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: """ Selects the packing device. - "auto": choose best available (CUDA > XPU > CPU). 
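+
+    Example (illustrative):
+        get_packing_device("auto")  # -> a CUDA device such as torch.device("cuda:0") when available
+        get_packing_device("cpu")   # -> torch.device("cpu")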
diff --git a/auto_round/utils/quantization_utils.py b/auto_round/utils/quantization_utils.py index bf4906c18..ba7b7a0d6 100644 --- a/auto_round/utils/quantization_utils.py +++ b/auto_round/utils/quantization_utils.py @@ -630,7 +630,7 @@ def _set_config(config, target_config): new_type = "gguf:q5_k" new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] if input_features % new_block_size != 0: - new_type = _gguf_type_fallback(new_type) + new_type = gguf_type_fallback(new_type) new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] if input_features % new_block_size != 0: new_type = "gguf:bf16" @@ -654,7 +654,7 @@ def _set_config(config, target_config): ): fallback = True if fallback: - tmp_type = _gguf_type_fallback(new_type) + tmp_type = gguf_type_fallback(new_type) logger.warning_once( f"self_attn.kv_b_proj does not support the use of {new_type}, replace it with {tmp_type}" ) @@ -951,7 +951,7 @@ def _search_gguf_type(gguf_type): return None -def _gguf_type_fallback(gguf_type: str) -> str: +def gguf_type_fallback(gguf_type: str) -> str: gguf_type = gguf_type.lower() if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): gguf_type = "gguf:q5_0" diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py index 4b4a14ecf..d0ff28b09 100644 --- a/auto_round_extension/torch/qlinear_torch.py +++ b/auto_round_extension/torch/qlinear_torch.py @@ -20,7 +20,7 @@ import torch.nn as nn import transformers -from auto_round.utils import _get_packing_device +from auto_round.utils import get_packing_device logger = getLogger(__name__) @@ -90,7 +90,7 @@ def post_init(self): # @torch.compile() ## cpu side has bug def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() if linear.bias is not None: self.bias = linear.bias.clone().half() @@ -149,7 +149,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None): # @torch.compile() def pack_3bits(self, linear, scales, zeros, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() if linear.bias is not None: self.bias = linear.bias.clone().half() diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py index bf045240b..3c595cda5 100644 --- a/auto_round_extension/torch/qlinear_torch_zp.py +++ b/auto_round_extension/torch/qlinear_torch_zp.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import math from logging import getLogger @@ -21,7 +20,7 @@ import torch.nn as nn import transformers -from auto_round.utils import _get_packing_device +from auto_round.utils import get_packing_device logger = getLogger(__name__) @@ -91,7 +90,7 @@ def post_init(self): # @torch.compile() def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() if linear.bias is not None: self.bias = linear.bias.clone().half() @@ -149,7 +148,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None): self.qzeros = qzeros.cpu() def pack_3bits(self, linear, scales, zeros, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() if linear.bias is not None: self.bias = linear.bias.clone().half() diff --git a/auto_round_extension/triton/qlinear_tritonv2.py b/auto_round_extension/triton/qlinear_tritonv2.py index 5d2cb54a1..2fcb87f92 100644 --- a/auto_round_extension/triton/qlinear_tritonv2.py +++ b/auto_round_extension/triton/qlinear_tritonv2.py @@ -20,7 +20,7 @@ import torch.nn as nn import transformers -from auto_round.utils import _get_packing_device +from auto_round.utils import get_packing_device from auto_round_extension.triton.triton_utils.mixin import TritonModuleMixin logger = getLogger(__name__) @@ -39,6 +39,7 @@ def error_raiser_triton(*args, **kwargs): ) class FakeTriton: + def __getattr__(self, name): raise ImportError( f"Trying to use the triton backend, but could not import triton dependencies with the following error: {triton_import_exception}" @@ -104,7 +105,7 @@ def post_init(self): pass def pack(self, linear, scales, zeros, g_idx=None, device=None): - device = _get_packing_device(device) + device = get_packing_device(device) scales_t = scales.t().contiguous() if linear.bias is not None: self.bias = linear.bias.clone().half() From 838f76e2c01462145e55681c7e4f1f271f7b1fe6 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 26 Oct 2025 22:37:17 -0400 Subject: [PATCH 3/7] update and clean Signed-off-by: n1ck-guo --- auto_round/auto_scheme/gen_auto_scheme.py | 3 +- auto_round/compressors/base.py | 30 +- .../compressors/diffusion/compressor.py | 2 +- .../utils.py} | 778 +++++++----------- .../export/export_to_autogptq/export.py | 4 +- .../export/export_to_autoround/export.py | 8 +- .../export_to_autoround/export_to_fp8.py | 3 +- .../export_to_nvfp_mxfp.py | 3 +- auto_round/export/export_to_awq/export.py | 3 +- auto_round/export/export_to_gguf/convert.py | 13 +- .../export/export_to_llmcompressor/export.py | 5 +- .../export_to_llmcompressor/export_to_fp.py | 3 +- .../export_to_static_fp.py | 1 - auto_round/export/utils.py | 115 +++ auto_round/utils/__init__.py | 9 +- auto_round/utils/{misc_utils.py => common.py} | 8 + .../utils/{device_utils.py => device.py} | 182 ++++ auto_round/utils/dtype_utils.py | 146 ---- auto_round/utils/memory_utils.py | 182 ---- auto_round/utils/{model_utils.py => model.py} | 250 +++++- auto_round/wrapper.py | 21 +- test/test_cuda/test_main_func.py | 4 +- 22 files changed, 871 insertions(+), 902 deletions(-) rename auto_round/{utils/quantization_utils.py => compressors/utils.py} (77%) rename auto_round/utils/{misc_utils.py => common.py} (96%) rename auto_round/utils/{device_utils.py => device.py} (65%) delete mode 100644 auto_round/utils/dtype_utils.py delete mode 100644 auto_round/utils/memory_utils.py rename auto_round/utils/{model_utils.py => 
model.py} (82%) diff --git a/auto_round/auto_scheme/gen_auto_scheme.py b/auto_round/auto_scheme/gen_auto_scheme.py index acf57b963..ca0abdfe0 100644 --- a/auto_round/auto_scheme/gen_auto_scheme.py +++ b/auto_round/auto_scheme/gen_auto_scheme.py @@ -20,9 +20,10 @@ from auto_round import AutoScheme from auto_round.auto_scheme import AUTO_SCHEME_METHODS from auto_round.auto_scheme.utils import compute_avg_bits_for_scheme +from auto_round.compressors.utils import gguf_type_fallback from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG from auto_round.logger import logger -from auto_round.utils import get_layer_features, get_module, gguf_type_fallback +from auto_round.utils import get_layer_features, get_module class GenScheme: diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 36804be15..827ea85c0 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -31,6 +31,21 @@ from tqdm import tqdm from transformers import set_seed +from auto_round.compressors.utils import ( + block_forward, + check_need_act_calibration, + check_skippable_keywords, + collect_best_params, + get_fp_layer_names, + get_layer_config_by_gguf_format, + get_shared_keys, + gguf_args_check, + infer_bits_by_data_type, + init_cache, + is_standard_fp, + reset_params, + set_layer_config, +) from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.export.export_to_autoround import AutoRoundFormat @@ -47,15 +62,11 @@ SUPPORTED_LAYER_TYPES, TORCH_VERSION_AT_LEAST_2_6, CpuInfo, - block_forward, check_and_mark_fp8_model, check_is_cpu, - check_need_act_calibration, check_seqlen_compatible, - check_skippable_keywords, check_to_quantized, clear_memory, - collect_best_params, compile_func, convert_dtype_str2torch, convert_fp8_layer_to_linear, @@ -67,32 +78,23 @@ flatten_list, get_block_names, get_device_memory, - get_fp_layer_names, - get_layer_config_by_gguf_format, get_layer_features, get_layer_names_in_block, get_lm_head_name, get_max_vram, get_module, - get_shared_keys, - gguf_args_check, htcore, - infer_bits_by_data_type, - init_cache, is_debug_mode, is_fp8_linear, is_fp8_model, is_hpex_available, is_mx_fp, is_nv_fp, - is_standard_fp, is_static_wfp8afp8, is_wfp8afp8, llm_load_model, mv_module_from_gpu, - reset_params, set_amax_for_all_moe_layers, - set_layer_config, set_module, to_device, to_dtype, @@ -956,7 +958,7 @@ def remove_duplicates(lst): "Please change format to fake or auto_round etc." 
) elif "auto_awq" in format: - from auto_round.utils import check_awq_gemm_compatibility + from auto_round.compressors.utils import check_awq_gemm_compatibility awq_supported, info = check_awq_gemm_compatibility( self.model, self.bits, self.group_size, self.sym, self.layer_config diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 5441d00b5..be9fa7f6a 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -21,12 +21,12 @@ from auto_round.compressors.base import BaseCompressor from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.compressors.utils import block_forward from auto_round.logger import logger from auto_round.low_cpu_mem.utils import get_layers_before_block from auto_round.schemes import QuantizationScheme from auto_round.utils import ( LazyImport, - block_forward, clear_memory, diffusion_load_model, extract_block_names_to_str, diff --git a/auto_round/utils/quantization_utils.py b/auto_round/compressors/utils.py similarity index 77% rename from auto_round/utils/quantization_utils.py rename to auto_round/compressors/utils.py index ba7b7a0d6..ff889f8c5 100644 --- a/auto_round/utils/quantization_utils.py +++ b/auto_round/compressors/utils.py @@ -16,6 +16,7 @@ import re import sys from dataclasses import asdict, fields +from enum import Enum from typing import Any, Callable, Dict, List, Tuple, Union import torch @@ -25,6 +26,56 @@ from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.logger import logger from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme +from auto_round.utils import check_to_quantized, is_fp8_linear, is_fp8_model + + +class BackendDataType(str, Enum): + STANDARD_FP = "fp" + MX_FP = "mx_fp" + NV_FP = "nv_fp" + + +def is_standard_fp(backend): + backend = backend.lower() + return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend) + + +def is_mx_fp(backend): + backend = backend.lower() + return BackendDataType.MX_FP in backend + + +def is_nv_fp(backend): + backend = backend.lower() + return BackendDataType.NV_FP in backend + + +def _is_weight_fp8_activation_static_fp8( + bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool +) -> bool: + return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic + + +def is_wfp8afp8(ar): + if ( + ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) + and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)) + and is_standard_fp(ar.act_data_type) + and is_standard_fp(ar.data_type) + ): + return True + else: + return False + + +def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool: + if isinstance(ar_or_format, str): + return "fp8_static" in ar_or_format + if ar_or_format.act_dynamic: + return False + if is_wfp8afp8(ar_or_format): + return True + return False def block_forward( @@ -50,7 +101,7 @@ def block_forward( Returns: output: The output of the forward pass. 
""" - from auto_round.utils.model_utils import to_device + from auto_round.utils.model import to_device if input_ids.device != device: input_ids = to_device(input_ids, device) @@ -69,6 +120,85 @@ def block_forward( return output +def check_and_mark_fp8_model(model: torch.nn.Module) -> bool: + if is_fp8_model(model): + return True + for n, m in model.named_modules(): + if is_fp8_linear(m): + m.is_fp8_linear = True + if not hasattr(model, "is_fp8"): + model.is_fp8 = True + if hasattr(model, "is_fp8"): + return True + return False + + +def check_skippable_keywords(key): + """ + Prints a reminder if a key is not stored during quantization fine-tuning. + """ + skippable_cache_keys = ("past_key_value",) + for cache_key in skippable_cache_keys: + if cache_key not in key: + return True + return False + + +def check_need_act_calibration( + is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16 +) -> bool: + if act_bits is None or act_bits > 8: + return False + # None is dynamic + if is_act_dynamic is not None and not is_act_dynamic: + return True + if act_data_type is not None and "static" in act_data_type: + return True + return False + + +def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None): + """Checks if a model is compatible with the AutoAWQ GEMM kernel. + + Args: + model: The model object to evaluate, typically a PyTorch model. + bits (int): The number of bits for quantization (must be 4 for compatibility). + group_size (int): The group size for quantization. + sym (bool): Whether symmetric quantization is used (not utilized in the current function logic). + layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each + configuration can specify a custom number of bits for the layer. + + Returns: + tuple: A tuple containing: + - bool: `True` if the model is compatible, `False` otherwise. + - str: An error message describing why the model is incompatible, or an empty string if compatible. + """ + from auto_round.utils.model import get_layer_names_in_block, get_module + + if bits != 4: + return False, "AutoAWQ GEMM kernel only supports 4 bits" + for n, m in model.named_modules(): + if type(m) == transformers.pytorch_utils.Conv1D: + return False, "AutoAWQ GEMM kernel does not support conv1d" + + layer_names = get_layer_names_in_block(model) + for layer_name in layer_names: + if ( + layer_configs is not None + and layer_name in layer_configs.keys() + and layer_configs[layer_name].get("bits", bits) > 8 + ): + continue + + layer = get_module(model, layer_name) + if layer.in_features % group_size != 0: + return False, f"Layer {layer_name} in_features is not multiple of group_size {group_size}" + if layer.out_features % (32 // bits) != 0: + return False, f"Layer {layer_name} out_features is not multiple of 32 // bits" + + return True, "" + + def collect_best_params(block): params = {} for n, m in block.named_modules(): @@ -103,31 +233,6 @@ def infer_bits_by_data_type(data_type: str): return None -def check_to_quantized(config): - """Checks if the configuration is valid for quantization. - - Args: - config (dict or object): The configuration to check. It can be either a - dictionary with a 'bits' key or an object with a 'bits' attribute. - - Returns: - bool: True if the configuration is valid for quantization (bits <= 8), - False otherwise. 
-    """
-
-    if isinstance(config, (dict, QuantizationScheme)):
-        bits = int(config.get("bits", 16))
-        act_bits = int(config.get("act_bits", 16))
-    elif hasattr(config, "orig_layer"):
-        bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16
-        act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16
-    else:
-        bits = int(config.bits) if hasattr(config, "bits") else 16
-        act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16
-
-    return bits <= 8 or act_bits <= 8
-
-
 def set_layer_config(
     model: torch.nn.Module,
     layer_config: dict[str, Union[str, dict, "QuantizationScheme"]],
@@ -147,8 +252,8 @@ def set_layer_config(
     """
 
     from auto_round.schemes import get_gguf_scheme
-    from auto_round.utils.dtype_utils import is_mx_fp, is_nv_fp
-    from auto_round.utils.model_utils import get_layer_names_in_block, get_lm_head_name, get_module
+    # is_mx_fp and is_nv_fp are defined at module level in this file, so no import is needed
+    from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module
 
     # ---- helpers -------------------------------------------------
     def dispatch_layer_config(layer_config: dict[str, dict]) -> None:
@@ -339,6 +444,146 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
     return layer_config, has_qlayer_outside_block, regex_config
 
 
+def _use_more_bits(i_layer: int, n_layer: int):
+    return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2)
+
+
+def _search_gguf_type(gguf_type):
+    if gguf_type in GGUF_INNER_CONFIG:
+        return gguf_type
+    pattern = re.compile("gguf:q([0-9]{1,})_[01k]")
+    bits = re.search(pattern, gguf_type)
+    if not bits:
+        raise KeyError(f"{gguf_type} is not a correct gguf type, please check")
+
+    for suffix in ["_k", "_0", "_1"]:
+        if gguf_type.endswith(suffix):
+            continue
+        if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG:
+            return tmp_type
+    return None
+
+
+def gguf_type_fallback(gguf_type: str) -> str:
+    gguf_type = gguf_type.lower()
+    if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"):
+        gguf_type = "gguf:q5_0"
+    elif gguf_type == "gguf:q5_k":
+        gguf_type = "gguf:q5_0"
+    elif gguf_type == "gguf:q6_k":
+        gguf_type = "gguf:q8_0"
+    return gguf_type
+
+
+def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT):
+    import argparse
+
+    from auto_round.export.export_to_gguf.convert import download_convert_file
+    from auto_round.logger import logger
+    from auto_round.utils.model import download_hf_model, get_gguf_architecture
+
+    formats = sorted(formats, key=lambda x: len(x))
+    export_gguf = False
+    for f in formats:
+        if f.startswith("gguf"):
+            export_gguf = True
+
+        if f.startswith("gguf") and f not in GGUF_CONFIG:
+            logger.error(f"{f} is not supported, please check.")
+
+    redownload = False
+    if export_gguf:
+        try:
+            from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
+                ModelBase,
+                ModelType,
+                get_model_architecture,
+            )
+
+            if isinstance(args_or_ar.model, str):
+                model_path = args_or_ar.model
+            else:
+                model_path = args_or_ar.model.name_or_path
+            if not os.path.isdir(model_path):
+                model_path = download_hf_model(model_path)
+            model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
+            if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
+                logger.warning(
+                    f"Current version of gguf export does not support {model_architecture},"
+                    " will re-download dependency file."
+                )
+                redownload = True
+        except ModuleNotFoundError as e:
+            if "convert_hf_to_gguf" in str(e):
+                logger.warning("GGUF export dependency file not found, downloading it from GitHub.")
+                redownload = True
+        except AttributeError as e:
+            raise ImportError(
+                "Please use the latest gguf-py, you can use the following command to install it:\n"
+                "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
+            )
+    download_convert_file(redownload)
+
+    try:
+        from auto_round.export.export_to_gguf.convert_hf_to_gguf import (  # pylint: disable=E0401
+            ModelBase,
+            ModelType,
+        )
+    except ImportError as e:
+        raise ImportError(
+            "Please use the latest gguf-py, you can use the following command to install it:\n"
+            "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
+        )
+    if isinstance(args_or_ar.model, str):
+        model_path = args_or_ar.model
+    else:
+        model_path = args_or_ar.model.name_or_path
+    if not os.path.isdir(model_path):
+        model_path = download_hf_model(model_path)
+    model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
+    if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
+        logger.error(f"Model {model_architecture} is not supported for gguf export.")
+        sys.exit(1)
+
+    pattern = re.compile(r"q\d_k")
+    pre_dq_format = ""
+    unsupported_list, reset_list = [], []
+    for format in GGUF_CONFIG:
+        if format in formats:
+            if format == "q6_k_s":
+                logger.warning("Please note that q6_k_s is q6_k.")
+
+            if re.search(pattern, format):
+                if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format:
+                    logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.")
+                    sys.exit(-1)
+                else:
+                    pre_dq_format = format
+
+            unsupported_list, reset_list = [], []
+            gguf_config = GGUF_CONFIG[format]
+            for k, v in gguf_config.items():
+                if not hasattr(args_or_ar, k):
+                    continue
+                if k == "data_type":
+                    if re.search(r"q\d_1", format) and len(formats) > 1:
+                        v = "int"
+                if k == "sym" and isinstance(args_or_ar, argparse.Namespace):
+                    k = "asym"
+                    v = not v
+                if getattr(args_or_ar, k) != v:
+                    unsupported_list.append(f"{k}={getattr(args_or_ar, k)}")
+                    reset_list.append(f"{k}={v}")
+                    setattr(args_or_ar, k, v)
+            if len(unsupported_list) > 0:
+                logger.info(
+                    f"format {format} does not support {', '.join(unsupported_list)},"
+                    f" reset to {', '.join(reset_list)}."
+                )
+    # Removed obsolete commented-out block for improved readability and maintainability.
+ return args_or_ar + + def get_gguf_qtype_by_layer_config(layer_config): import gguf # pylint: disable=E0401 @@ -374,14 +619,23 @@ def get_gguf_qtype_by_layer_config(layer_config): raise ValueError("Unknown layer config") +def _get_digital_in_layer_name(layer_name): + pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") + res = re.search(pattern, layer_name) + if res: + return int(res[2]) + else: + return None + + ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): # # TODO: support for other format later # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) import gguf # pylint: disable=E0401 - from auto_round.utils.misc_utils import LazyImport - from auto_round.utils.model_utils import _get_digital_in_layer_name, get_lm_head_name, get_module + from auto_round.utils.common import LazyImport + from auto_round.utils.model import get_lm_head_name, get_module # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf") @@ -669,226 +923,6 @@ def _set_config(config, target_config): return layer_config, gguf_format_config -def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None): - """Checks if a model is compatible with the AutoAWQ GEMM kernel. - - Args: - model: The model object to evaluate, typically a PyTorch model. - bits (int): The number of bits for quantization (must be 4 for compatibility). - group_size (int): The group size for quantization. - sym (bool): Whether symmetric quantization is used (not utilized in the current function logic). - layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each - configuration can specify a custom number of bits for the layer. - - Returns: - tuple: A tuple containing: - - bool: `True` if the model is compatible, `False` otherwise. - - str: An error message describing why the model is incompatible, or an empty string if compatible. - """ - from auto_round.utils.model_utils import get_layer_names_in_block, get_module - - if bits != 4: - return False, "AutoAWQ GEMM kernel only supports 4 bits" - for n, m in model.named_modules(): - if type(m) == transformers.pytorch_utils.Conv1D: - return False, "AutoAWQ GEMM kernel does not support conv1d" - - layer_names = get_layer_names_in_block(model) - for layer_name in layer_names: - if ( - layer_configs is not None - and layer_name in layer_configs.keys() - and layer_configs[layer_name].get("bits", bits) > 8 - ): - continue - - layer = get_module(model, layer_name) - if layer.in_features % group_size != 0: - return False, f"Layer {layer_name} in_features is not multiple of group_size {group_size}" - if layer.out_features % (32 // bits) != 0: - return False, f"Layer {layer_name} out_features is not multiple of 32 // bits" - - return True, "" - - -def check_need_act_calibration( - is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16 -) -> bool: - if act_bits is None or act_bits > 8: - return False - # None is dynamic - if is_act_dynamic is not None and not is_act_dynamic: - return True - if act_data_type is not None and "static" in act_data_type: - return True - return False - - -def is_autoround_exllamav2_available(): - """Checks if the AutoRound ExLlamaV2 kernels are available. 
- - Returns: - bool: - True if the AutoRound ExLlamaV2 kernels are available, False otherwise. - """ - res = True - try: - from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix - except ImportError as e: - res = False - return res - - -def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): - """ - Configures and returns a QuantLinear class based on the specified backend and parameters. - - Args: - backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin", - "exllama", and "cuda". - bits (int, optional): The number of bits for quantization. Default is 4. - group_size (int, optional): The group size for quantization. Default is 128. - sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False. - - Returns: - class: The dynamically imported QuantLinear class configured according to the specified parameters. - """ - use_triton = True - if bits not in [2, 4, 8]: - use_triton = False - disable_exllamav2 = True - disable_exllamav1 = False - disable_marlin = True - use_qigen = False - if "qigen" in backend: - use_triton = False - use_qigen = True - elif "triton" in backend: - use_triton = True - elif "marlin" in backend and sym: - use_triton = False - disable_marlin = False - elif "exllama" in backend: ##need v1 code to export - use_triton = True ##same with triton - disable_marlin = True - elif "cuda" in backend: - use_triton = False - disable_marlin = True - disable_exllamav2 = True - disable_exllamav1 = True - if use_triton: - from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear - - return QuantLinear - try: - import auto_gptq # pylint: disable=E0401 - except: - logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}") - exit() - - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401 - - from auto_round.utils.misc_utils import get_library_version - - version = get_library_version("auto_gptq") - from packaging.version import Version - - if Version(version) < Version("0.7.2"): - QuantLinear = dynamically_import_QuantLinear( - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - disable_marlin=disable_marlin, - ) - else: - QuantLinear = dynamically_import_QuantLinear( # pylint: disable=E1123 - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - use_marlin=not disable_marlin, - ) - return QuantLinear - - -def get_reciprocal(tensor): - if torch.dtype is torch.float16: - tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5) - else: - tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor) - return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor)) - - -def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None): - """ - Check whether the input sequence length is within the limits defined - by the tokenizer and the model configuration. - - Args: - input_seqlen (int): The length of the input sequence. - tokenizer: Optional, a HuggingFace tokenizer object. - model: Optional, a HuggingFace model object. - - Returns: - ValueError: if the input length is not valid, riase Error. 
- """ - if model is not None and hasattr(model, "config"): - model_config = model.config - if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings: - raise ValueError( - f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings(" - f"{model_config.max_position_embeddings}). Please lowering '--seqlen'" - ) - if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length: - raise ValueError( - f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). " - "Please oncider Consider lowering the '--seqlen' or increasing tokenizer.model_max_length." - ) - - -def filter_quantization_config(quantization_config): - default_dict = { - "amp": True, - "batch_size": 8, - "data_type": int, - "dataset": "NeelNanda/pile-10k", - "enable_minmax_tuning": True, - "enable_norm_bias_tuning": False, - "enable_quanted_input": True, - "gradient_accumulate_steps": 1, - "iters": 200, - "low_gpu_mem_usage": False, - "nsamples": 128, - "scale_dtype": "torch.float16", - "seqlen": 2048, - } - iters = quantization_config.get("iters", 200) - - default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3 - default_dict["minmax_lr"] = default_dict["lr"] - - for key in default_dict: - if key in quantization_config and default_dict[key] == quantization_config[key]: - quantization_config.pop(key) - for k in list(quantization_config.keys()): - if quantization_config[k] is None: - quantization_config.pop(k) - - if quantization_config.get("act_bits", 16) >= 16: - quantization_config.pop("act_bits", None) - quantization_config.pop("act_data_type", None) - quantization_config.pop("act_dynamic", None) - quantization_config.pop("act_sym", None) - quantization_config.pop("act_group_size", None) - - def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): """Identifies and returns layers in the model to exclude from quantization. 
@@ -931,235 +965,22 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): return not_to_quantized_layers -def _use_more_bits(i_layer: int, n_layer: int): - return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2) - - -def _search_gguf_type(gguf_type): - if gguf_type in GGUF_INNER_CONFIG: - return gguf_type - pattern = re.compile("gguf:q([0-9]{1,})_[01k]") - bits = re.search(pattern, gguf_type) - if not bits: - raise KeyError(f"{gguf_type} is not a correct gguf type, please check") - - for suffix in ["_k", "_0", "_1"]: - if gguf_type.endswith(suffix): - continue - if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG: - return tmp_type - return None - - -def gguf_type_fallback(gguf_type: str) -> str: - gguf_type = gguf_type.lower() - if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): - gguf_type = "gguf:q5_0" - elif gguf_type == "gguf:q5_k": - gguf_type = "gguf:q5_0" - elif gguf_type == "gguf:q6_k": - gguf_type = "gguf:q8_0" - return gguf_type - - -def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT): - import argparse - - from auto_round.export.export_to_gguf.convert import download_convert_file - from auto_round.logger import logger - from auto_round.utils.model_utils import download_hf_model, get_gguf_architecture - - formats = sorted(formats, key=lambda x: len(x)) - export_gguf = False - for f in formats: - if f.startswith("gguf"): - export_gguf = True - - if f.startswith("gguf") and f not in GGUF_CONFIG: - logger.error(f"{f} is not supported, please check.") - - redownload = False - if export_gguf: - try: - from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401 - ModelBase, - ModelType, - get_model_architecture, - ) - - if isinstance(args_or_ar.model, str): - model_path = args_or_ar.model - else: - model_path = args_or_ar.model.name_or_path - if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) - model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) - if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: - logger.warning( - f"Current version of gguf export does not support for {model_architecture}," - " will re-download dependency file." - ) - redownload = True - except ModuleNotFoundError as e: - if "convert_hf_to_gguf" in str(e): - logger.warning("GGUF export dependency file is not found, download from github.") - redownload = True - except AttributeError as e: - raise ImportError( - "Please use the latest gguf-py, you can use the following command to install it:\n" - "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ." - ) - download_convert_file(redownload) - - try: - from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401 - ModelBase, - ModelType, - ) - except ImportError as e: - raise ImportError( - "Please use the latest gguf-py, you can use the following command to install it:\n" - "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ." 
- ) - if isinstance(args_or_ar.model, str): - model_path = args_or_ar.model - else: - model_path = args_or_ar.model.name_or_path - if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) - model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) - if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: - logger.error(f"Model {model_architecture} is not supported to export gguf format.") - sys.exit(1) - - pattern = re.compile(r"q\d_k") - pre_dq_format = "" - unsupported_list, reset_list = [], [] - for format in GGUF_CONFIG: - if format in formats: - if format == "q6_k_s": - logger.warning("Please note that q6_k_s is q6_k.") - - if re.search(pattern, format): - if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format: - logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.") - sys.exit(-1) - else: - pre_dq_format = format - - unsupported_list, reset_list = [], [] - gguf_config = GGUF_CONFIG[format] - for k, v in gguf_config.items(): - if not hasattr(args_or_ar, k): - continue - if k == "data_type": - if re.search(r"q\d_1", format) and len(formats) > 1: - v = "int" - if k == "sym" and isinstance(args_or_ar, argparse.Namespace): - k = "asym" - v = not v - if getattr(args_or_ar, k) != v: - unsupported_list.append(f"{k}={getattr(args_or_ar, k)}") - reset_list.append(f"{k}={v}") - setattr(args_or_ar, k, v) - if len(unsupported_list) > 0: - logger.info( - f"format {format} does not support for {', '.join(unsupported_list)}," - f" reset to {', '.join(reset_list)}." - ) - # Removed obsolete commented-out block for improved readability and maintainability. - return args_or_ar - - -def get_layer_features(layer): - """Extracts input and output feature dimensions for supported layers.""" - from auto_round.utils.constants import LinearAllreduce, LinearLayer, deepspeed_exists - - if type(layer) == torch.nn.Linear: - return layer.in_features, layer.out_features - elif type(layer) == transformers.pytorch_utils.Conv1D: # TODO: Verify correctness - return layer.weight.shape[0], layer.weight.shape[1] - elif isinstance(layer, torch.nn.Embedding): - return layer.num_embeddings, layer.embedding_dim - elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce): - return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim) - elif "FP8Linear" in layer.__class__.__name__: - return layer.in_features, layer.out_features - return None, None # Unsupported layer type - - -def get_common_prefix(paths): - # Split each path into components and find the common prefix - split_paths = [path.split(".") for path in paths] - common_prefix = split_paths[0] - for path in split_paths[1:]: - common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] - return ".".join(common_prefix) - - -def extract_block_names_to_str(quant_block_list): - if not isinstance(quant_block_list, (list, tuple)): - return None - # Extract common prefix for each list - prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] - # Join prefixes into a single string - return ",".join(prefixes) - - -def find_matching_blocks(model, all_blocks, to_quant_block_names): +def get_shared_keys(model): """ - Find and return matching blocks in the model based on to_quant_block_names. + Retrieves shared keys from the model's state dictionary. Args: - model: The model (not used in this specific function but kept for completeness). 
- all_blocks: List of lists, where each inner list contains full block names in the model. - to_quant_block_names: Comma-separated string of target block names to match. + model (torch.nn.Module): The model to retrieve shared keys from. Returns: - target_blocks: List of lists containing full paths of matching blocks in the model. + tuple: tuple of shared keys. """ - if not to_quant_block_names: - return all_blocks - to_quant_block_list = to_quant_block_names - if isinstance(to_quant_block_names, list) or isinstance(to_quant_block_names, tuple): - return to_quant_block_names - if isinstance(to_quant_block_names, str): - to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")] - target_blocks = [] - for block_list in all_blocks: - matched_sublist = [] - for name in to_quant_block_list: - matches = [block for block in block_list if re.search(name, block)] - if matches: - matched_sublist.extend(matches) - if matched_sublist: - target_blocks.append(matched_sublist) - if not target_blocks: - raise ValueError( - "No block names matched. Please check the input for to_quant_block_name," - "or set to_quant_block_name to None to automatically match quantizable blocks." - ) - return target_blocks + from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS + from auto_round.utils.constants import SHARED_CACHE_KEYS - -def get_scale_shape(weight, group_size): - """Computes the shape of the scale tensor for quantization based on the weight tensor and group size. - - Args: - weight (torch.Tensor): The weight tensor of the layer. - group_size (int): The size of the groups for quantization. - - Returns: - The shape of the scale tensor to be used for quantization. - """ - if group_size == 0: - return 1 - elif group_size == -1 or weight.shape[1] < group_size: - shape = weight.shape[0] - else: - shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size) - - return shape + shared_keys = SHARED_CACHE_KEYS + shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) + return shared_keys def init_cache(positional_inputs, inputs): @@ -1173,7 +994,7 @@ def init_cache(positional_inputs, inputs): Modifies: inputs (dict): Adds "positional_inputs" key if not present. """ - from auto_round.utils.model_utils import to_device + from auto_round.utils.model import to_device if "positional_inputs" not in inputs: # for chatglm Series inputs["positional_inputs"] = [] @@ -1193,14 +1014,3 @@ def reset_params(inputs): """ if "use_cache" in inputs.keys(): # Not storing kv cache inputs["use_cache"] = False - - -def check_skippable_keywords(key): - """ - Prints a reminder if a key is not stored during quantization fine-tuning. 
- """ - skippable_cache_keys = ("past_key_value",) - for cache_key in skippable_cache_keys: - if cache_key not in key: - return True - return False diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 5c795bac9..a69cb29b4 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -49,7 +49,7 @@ from tqdm import tqdm import auto_round.export.export_to_autogptq.qlinear_triton -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, get_autogptq_packing_qlinear, save_model from auto_round.schemes import QuantizationScheme GPTQ_REQUIRED_CONFIG_KEYS = ( @@ -64,8 +64,6 @@ check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, - filter_quantization_config, - get_autogptq_packing_qlinear, get_block_names, get_module, json_serialize, diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 679257762..d587efa2d 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -27,8 +27,9 @@ import transformers from tqdm import tqdm +from auto_round.compressors.utils import is_mx_fp, is_nv_fp, is_standard_fp from auto_round.export.export_to_autoround.utils import check_neq_config -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, get_autogptq_packing_qlinear, save_model from auto_round.logger import logger from auto_round.schemes import QuantizationScheme from auto_round.utils import ( @@ -37,12 +38,7 @@ check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, - filter_quantization_config, - get_autogptq_packing_qlinear, get_module, - is_mx_fp, - is_nv_fp, - is_standard_fp, set_module, to_standard_regex, ) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 6c0f91410..8b8a618e2 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -25,7 +25,7 @@ from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad from auto_round.export.export_to_autoround.utils import check_neq_config -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, save_model from auto_round.logger import logger from auto_round.schemes import QuantizationScheme from auto_round.utils import ( @@ -33,7 +33,6 @@ check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, - filter_quantization_config, get_module, get_packing_device, set_module, diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index fc553b7f2..ba49db53a 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -26,7 +26,7 @@ from tqdm import tqdm from auto_round.export.export_to_autoround.utils import check_neq_config -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, save_model from auto_round.logger import logger from auto_round.schemes import QuantizationScheme from auto_round.utils import ( @@ -34,7 +34,6 @@ check_start_with_block_name, check_to_quantized, 
copy_python_files_from_model_cache, - filter_quantization_config, get_module, get_packing_device, is_mx_fp, diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 2cf256c67..2785a0e55 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -31,14 +31,13 @@ from tqdm import tqdm from auto_round.export.export_to_awq.utils import WQLinear_GEMM -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, save_model from auto_round.logger import logger from auto_round.utils import ( SUPPORTED_LAYER_TYPES, check_to_quantized, copy_python_files_from_model_cache, extract_block_names_to_str, - filter_quantization_config, get_block_names, get_module, set_module, diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 39cc4d01b..a03921624 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, clean_module_parameter, get_module, get_packing_device, is_fp8_model, logger +from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger gguf = LazyImport("gguf") @@ -58,6 +58,17 @@ from torch import Tensor +def clean_module_parameter(submodule, parameter): + if submodule is None: + return + is_buffer = parameter in submodule._buffers + with torch.no_grad(): + if is_buffer: + submodule._buffers[parameter] = None + else: + submodule._parameters[parameter] = None + + def download_convert_file(redownload=False): CONVERT_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/convert_hf_to_gguf.py" FILE_NAME = "convert_hf_to_gguf.py" diff --git a/auto_round/export/export_to_llmcompressor/export.py b/auto_round/export/export_to_llmcompressor/export.py index 938411b3a..41cd9b71c 100644 --- a/auto_round/export/export_to_llmcompressor/export.py +++ b/auto_round/export/export_to_llmcompressor/export.py @@ -17,6 +17,7 @@ import torch +from auto_round.compressors.utils import is_mx_fp, is_nv_fp, is_standard_fp, is_static_wfp8afp8 from auto_round.export.export_to_llmcompressor.config import quantization_config from auto_round.export.export_to_llmcompressor.export_to_fp import save_quantized_as_fp from auto_round.export.export_to_llmcompressor.export_to_static_fp import save_quantized_as_static_fp @@ -25,10 +26,6 @@ copy_python_files_from_model_cache, detect_device, get_module, - is_mx_fp, - is_nv_fp, - is_standard_fp, - is_static_wfp8afp8, set_module, ) from auto_round.wrapper import WrapperWALayer diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index f5c273856..67aedd7e9 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -26,14 +26,13 @@ from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear from auto_round.export.export_to_llmcompressor.utils import generate_ignore_regex_list -from auto_round.export.utils import save_model +from auto_round.export.utils import filter_quantization_config, save_model from auto_round.logger import logger from auto_round.utils import ( SUPPORTED_LAYER_TYPES, check_start_with_block_name, check_to_quantized, 
copy_python_files_from_model_cache, - filter_quantization_config, get_block_names, get_module, is_mx_fp, diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py index fedf00ad6..1b0b48b35 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py @@ -32,7 +32,6 @@ check_start_with_block_name, check_to_quantized, copy_python_files_from_model_cache, - filter_quantization_config, get_module, get_packing_device, logger, diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 716d0afd3..78e2f43e6 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -70,3 +70,118 @@ def save_model( copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + + +def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): + """ + Configures and returns a QuantLinear class based on the specified backend and parameters. + + Args: + backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin", + "exllama", and "cuda". + bits (int, optional): The number of bits for quantization. Default is 4. + group_size (int, optional): The group size for quantization. Default is 128. + sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False. + + Returns: + class: The dynamically imported QuantLinear class configured according to the specified parameters. + """ + use_triton = True + if bits not in [2, 4, 8]: + use_triton = False + disable_exllamav2 = True + disable_exllamav1 = False + disable_marlin = True + use_qigen = False + if "qigen" in backend: + use_triton = False + use_qigen = True + elif "triton" in backend: + use_triton = True + elif "marlin" in backend and sym: + use_triton = False + disable_marlin = False + elif "exllama" in backend: ##need v1 code to export + use_triton = True ##same with triton + disable_marlin = True + elif "cuda" in backend: + use_triton = False + disable_marlin = True + disable_exllamav2 = True + disable_exllamav1 = True + if use_triton: + from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear + + return QuantLinear + try: + import auto_gptq # pylint: disable=E0401 + except: + logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}") + exit() + + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401 + + from auto_round.utils.common import get_library_version + + version = get_library_version("auto_gptq") + from packaging.version import Version + + if Version(version) < Version("0.7.2"): + QuantLinear = dynamically_import_QuantLinear( + use_triton=use_triton, + desc_act=False, + group_size=group_size, + bits=bits, + disable_exllama=disable_exllamav1, + disable_exllamav2=disable_exllamav2, + use_qigen=use_qigen, + disable_marlin=disable_marlin, + ) + else: + QuantLinear = dynamically_import_QuantLinear( # pylint: disable=E1123 + use_triton=use_triton, + desc_act=False, + group_size=group_size, + bits=bits, + disable_exllama=disable_exllamav1, + disable_exllamav2=disable_exllamav2, + use_qigen=use_qigen, + use_marlin=not disable_marlin, + ) + return QuantLinear + + +def filter_quantization_config(quantization_config): + default_dict = { + "amp": True, + "batch_size": 8, + "data_type": 
"int",
+        "dataset": "NeelNanda/pile-10k",
+        "enable_minmax_tuning": True,
+        "enable_norm_bias_tuning": False,
+        "enable_quanted_input": True,
+        "gradient_accumulate_steps": 1,
+        "iters": 200,
+        "low_gpu_mem_usage": False,
+        "nsamples": 128,
+        "scale_dtype": "torch.float16",
+        "seqlen": 2048,
+    }
+    iters = quantization_config.get("iters", 200)
+
+    default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3
+    default_dict["minmax_lr"] = default_dict["lr"]
+
+    for key in default_dict:
+        if key in quantization_config and default_dict[key] == quantization_config[key]:
+            quantization_config.pop(key)
+    for k in list(quantization_config.keys()):
+        if quantization_config[k] is None:
+            quantization_config.pop(k)
+
+    if quantization_config.get("act_bits", 16) >= 16:
+        quantization_config.pop("act_bits", None)
+        quantization_config.pop("act_data_type", None)
+        quantization_config.pop("act_dynamic", None)
+        quantization_config.pop("act_sym", None)
+        quantization_config.pop("act_group_size", None)
diff --git a/auto_round/utils/__init__.py b/auto_round/utils/__init__.py
index 8b9366d63..3ffb5359a 100644
--- a/auto_round/utils/__init__.py
+++ b/auto_round/utils/__init__.py
@@ -13,9 +13,6 @@
 # limitations under the License.
 
 from auto_round.utils.constants import *
-from auto_round.utils.device_utils import *
-from auto_round.utils.dtype_utils import *
-from auto_round.utils.memory_utils import *
-from auto_round.utils.misc_utils import *
-from auto_round.utils.model_utils import *
-from auto_round.utils.quantization_utils import *
+from auto_round.utils.device import *
+from auto_round.utils.common import *
+from auto_round.utils.model import *
diff --git a/auto_round/utils/misc_utils.py b/auto_round/utils/common.py
similarity index 96%
rename from auto_round/utils/misc_utils.py
rename to auto_round/utils/common.py
index ab46653b7..8c28980be 100644
--- a/auto_round/utils/misc_utils.py
+++ b/auto_round/utils/common.py
@@ -224,3 +224,11 @@ def json_serialize(obj: Any):
     if isinstance(obj, torch.dtype):
         return str(obj).split(".")[-1]  # e.g., torch.float16 -> "float16"
     raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+
+def get_reciprocal(tensor):
+    if tensor.dtype == torch.float16:
+        tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5)
+    else:
+        tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
+    return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))
diff --git a/auto_round/utils/device_utils.py b/auto_round/utils/device.py
similarity index 65%
rename from auto_round/utils/device_utils.py
rename to auto_round/utils/device.py
index ce2e46133..4aeb66f3e 100644
--- a/auto_round/utils/device_utils.py
+++ b/auto_round/utils/device.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import gc
 import os
 from functools import lru_cache
 from typing import Any, Callable, Dict, List, Tuple, Union
@@ -42,6 +43,21 @@ def is_package_available(package_name: str) -> bool:
     return package_spec is not None
 
 
+def is_autoround_exllamav2_available():
+    """Checks if the AutoRound ExLlamaV2 kernels are available.
+
+    Returns:
+        bool:
+            True if the AutoRound ExLlamaV2 kernels are available, False otherwise.
+    
+ """ + res = True + try: + from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix + except ImportError as e: + res = False + return res + + def is_hpu_lazy_mode(): return os.getenv("PT_HPU_LAZY_MODE") != "0" @@ -357,3 +373,169 @@ def __init__(self): def bf16(self): """Get whether it is bf16.""" return self._bf16 + + +def bytes_to_gigabytes(bytes) -> int: + """ + Converts bytes to gigabytes. + + Args: + bytes (int): The number of bytes. + + Returns: + int: The equivalent number of gigabytes. + """ + return bytes / 1024 / 1024 / 1024 + + +def _clear_memory_for_cpu_and_cuda(tensor=None): + if isinstance(tensor, list): + for i in range(len(tensor)): + tensor[i] = None + if tensor is not None: + del tensor + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if torch.xpu.is_available(): + torch.xpu.empty_cache() + + +@torch._dynamo.disable() +def clear_memory(tensor=None): + from auto_round.utils.device import is_hpex_available + + if is_hpex_available(): + # hpu does not have empty_cache + return + else: + _clear_memory_for_cpu_and_cuda(tensor) + + +def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): + """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. + + Args: + device (str): The device type ('cuda' for GPU or 'hpu' for HPU). + inputs (torch.Tensor): Input tensor. + weight (torch.Tensor): Weight tensor. + org_seqlen (int): Original sequence length. + org_bs (int): Original batch size. + + Returns: + tuple: A tuple containing availability status (bool), modified sequence length (int), + and modified batch size (int). + """ + weight_memory = weight.numel() * weight.element_size() + if "cuda" in device: + current_gpu_index = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory + used_memory = torch.cuda.memory_allocated(current_gpu_index) + free_space = total_memory - used_memory + elif "hpu" in device: # pragma: no cover + current_hpu_index = torch.hpu.current_device() + free_space = torch.hpu.memory_reserved(current_hpu_index) + else: + return True, org_seqlen, org_bs + + free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage + seqlen = org_seqlen + bs = org_bs + in_feature = weight.shape[1] + out_feature = weight.shape[0] + while seqlen >= 128: + input_size = bs * seqlen * in_feature + output_size = bs * seqlen * out_feature + input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) + if input_output_memory < free_space: + return True, seqlen, bs + seqlen = seqlen // 2 + bs = 1 + + return False, seqlen, bs + + +def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: + """ + Calculates the memory consumption of a specific block in the model. + + Args: + block (torch.nn.Module): The block of the model to analyze. + input_ids (list[torch.Tensor]): A list of input tensors for the block. + + Returns: + tuple: A tuple containing the following: + - block_memory (float): The memory consumption (in GB) of the block's linear layers. + - input_output_memory (float): The memory consumption (in GB) for input and output + tensors of the block. 
+    """
+    # Calculate the memory of all block parameters
+    from auto_round.utils import check_to_quantized
+
+    total_param_mem = 0
+    for name, module in block.named_modules():
+        if check_to_quantized(module):
+            param_size = module.weight.nbytes
+            total_param_mem += param_size
+    block_memory = total_param_mem / 1024**3  # Convert to GB
+
+    # Assuming bfloat16 or float32, input and output
+    input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3
+
+    return block_memory, input_output_memory
+
+
+def out_of_vram(error_msg):
+    error_msg = str(error_msg)
+    # CUDA
+    if "CUDA out of memory" in error_msg:
+        return True
+    # gaudi
+    if "MODULE:PT_DEVMEM" in error_msg:
+        return True
+    # XPU
+    if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg:
+        return True
+    # ROCM
+    if "HIP out of memory. Tried to allocate" in error_msg:
+        return True
+    return False
+
+
+def get_max_vram(ratio: float = 0.9) -> dict:
+    max_memory = {}
+    if torch.cuda.is_available():  # NVIDIA CUDA
+        num_devices = torch.cuda.device_count()
+        for i in range(num_devices):
+            total_mem = torch.cuda.get_device_properties(i).total_memory
+            max_mem_gb = int(total_mem / 1024**3 * ratio)
+            max_memory[i] = f"{max_mem_gb}GiB"
+    elif torch.xpu.is_available():  # TODO need verification
+        num_devices = torch.xpu.device_count()
+        for i in range(num_devices):
+            total_mem = torch.xpu.get_device_properties(i).total_memory
+            max_mem_gb = int(total_mem / 1024**3 * ratio)
+            max_memory[i] = f"{max_mem_gb}GiB"
+
+    else:
+        raise RuntimeError("No CUDA or XPU devices found.")
+    return max_memory
+
+
+def get_device_memory(i: int = 0) -> int:
+    """
+    Gets the total memory of the specified device.
+
+    Args:
+        i (int, optional): Device index. Defaults to 0.
+
+    Returns:
+        int: Total device memory in gigabytes.
+    """
+    if torch.cuda.is_available():
+        total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory)
+    elif torch.xpu.is_available():
+        raise RuntimeError("XPU does not support device_map='auto' currently.")
+    else:
+        raise RuntimeError("No supported device found (CUDA or XPU).")
+    return total_memory
diff --git a/auto_round/utils/dtype_utils.py b/auto_round/utils/dtype_utils.py
deleted file mode 100644
index 91ed869c3..000000000
--- a/auto_round/utils/dtype_utils.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) 2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from enum import Enum
-from typing import Any, Callable, Dict, List, Tuple, Union
-
-import torch
-
-
-def convert_dtype_str2torch(str_dtype):
-    """Converts a string dtype to its corresponding PyTorch dtype.
-
-    Args:
-        str_dtype (str): The string representation of the dtype.
-
-    Returns:
-        torch.dtype: The PyTorch dtype.
-
-    Raises:
-        ValueError: If the input str_dtype is unsupported.
- """ - if isinstance(str_dtype, torch.dtype) or str_dtype is None: - return str_dtype - if str_dtype == "int8": - return torch.int8 - elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": - return torch.float - elif str_dtype == "fp16" or str_dtype == "float16": - return torch.float16 - elif str_dtype == "bf16" or str_dtype == "bfloat16": - return torch.bfloat16 - else: - raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") - - -def convert_dtype_torch2str(dtype): - """Converts a PyTorch dtype to its corresponding string representation. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input dtype is unsupported. - """ - if isinstance(dtype, str) or dtype is None: - return dtype - if dtype == torch.int8: - return "int8" - elif dtype == torch.float: - return "fp32" - elif dtype == torch.float16: - return "fp16" - elif dtype == torch.bfloat16: - return "bf16" - elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: - return dtype - else: - raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.") - - -def convert_dtype_torch2str_hf(dtype): - """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input str_dtype is unsupported. - """ - if dtype is None: - return dtype - if isinstance(dtype, str): - if "float" not in dtype and "int" not in dtype: - dtype = convert_dtype_str2torch(dtype) - else: - return dtype - str_dtype = str(dtype) - if "." 
not in str_dtype: - raise ValueError(f"Unsupported pytorch dtype '{dtype}' for conversion to huggingface str dtype") - str_dtype = str_dtype.split(".")[1] - return str_dtype - - -class BackendDataType(str, Enum): - STANDARD_FP = "fp" - MX_FP = "mx_fp" - NV_FP = "nv_fp" - - -def is_standard_fp(backend): - backend = backend.lower() - return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend) - - -def is_mx_fp(backend): - backend = backend.lower() - return BackendDataType.MX_FP in backend - - -def is_nv_fp(backend): - backend = backend.lower() - return BackendDataType.NV_FP in backend - - -def _is_weight_fp8_activation_static_fp8( - bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool -) -> bool: - return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic - - -def is_wfp8afp8(ar): - if ( - ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) - and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)) - and is_standard_fp(ar.act_data_type) - and is_standard_fp(ar.data_type) - ): - return True - else: - return False - - -def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool: - if isinstance(ar_or_format, str): - return "fp8_static" in ar_or_format - if ar_or_format.act_dynamic: - return False - if is_wfp8afp8(ar_or_format): - return True - return False diff --git a/auto_round/utils/memory_utils.py b/auto_round/utils/memory_utils.py deleted file mode 100644 index 7ecc31893..000000000 --- a/auto_round/utils/memory_utils.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import gc - -import torch - - -def bytes_to_gigabytes(bytes) -> int: - """ - Converts bytes to gigabytes. - - Args: - bytes (int): The number of bytes. - - Returns: - int: The equivalent number of gigabytes. - """ - return bytes / 1024 / 1024 / 1024 - - -def _clear_memory_for_cpu_and_cuda(tensor=None): - if isinstance(tensor, list): - for i in range(len(tensor)): - tensor[i] = None - if tensor is not None: - del tensor - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - if torch.xpu.is_available(): - torch.xpu.empty_cache() - - -@torch._dynamo.disable() -def clear_memory(tensor=None): - from auto_round.utils.device_utils import is_hpex_available - - if is_hpex_available(): - # hpu does not have empty_cache - return - else: - _clear_memory_for_cpu_and_cuda(tensor) - - -def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): - """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. - - Args: - device (str): The device type ('cuda' for GPU or 'hpu' for HPU). - inputs (torch.Tensor): Input tensor. - weight (torch.Tensor): Weight tensor. - org_seqlen (int): Original sequence length. - org_bs (int): Original batch size. 
- - Returns: - tuple: A tuple containing availability status (bool), modified sequence length (int), - and modified batch size (int). - """ - weight_memory = weight.numel() * weight.element_size() - if "cuda" in device: - current_gpu_index = torch.cuda.current_device() - total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory - used_memory = torch.cuda.memory_allocated(current_gpu_index) - free_space = total_memory - used_memory - elif "hpu" in device: # pragma: no cover - current_hpu_index = torch.hpu.current_device() - free_space = torch.hpu.memory_reserved(current_hpu_index) - else: - return True, org_seqlen, org_bs - - free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage - seqlen = org_seqlen - bs = org_bs - in_feature = weight.shape[1] - out_feature = weight.shape[0] - while seqlen >= 128: - input_size = bs * seqlen * in_feature - output_size = bs * seqlen * out_feature - input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) - if input_output_memory < free_space: - return True, seqlen, bs - seqlen = seqlen // 2 - bs = 1 - - return False, seqlen, bs - - -def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: - """ - Calculates the memory consumption of a specific block in the model. - - Args: - block (torch.nn.Module): The block of the model to analyze. - input_ids (list[torch.Tensor]): A list of input tensors for the block. - - Returns: - tuple: A tuple containing the following: - - block_memory (float): The memory consumption (in GB) of the block's linear layers. - - input_output_memory (float): The memory consumption (in GB) for input and output - tensors of the block. - """ - # Calculate all block parameters memory - from auto_round.utils.quantization_utils import check_to_quantized - - total_param_mem = 0 - for name, module in block.named_modules(): - if check_to_quantized(module): - param_size = module.weight.nbytes - total_param_mem += param_size - block_memory = total_param_mem / 1024**3 # Convert to GB - - # Assuming bfloat16 or float32, input and output - input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - - return block_memory, input_output_memory - - -def out_of_vram(error_msg): - error_msg = str(error_msg) - # CUDA - if "CUDA out of memory" in error_msg: - return True - # gaudi - if "MODULE:PT_DEVMEM" in error_msg: - return True - # XPU - if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg: - return True - # ROCM - if "HIP out of memory. Tried to allocate" in error_msg: - return True - return False - - -def get_max_vram(ratio: float = 0.9) -> dict: - max_memory = {} - if torch.cuda.is_available(): # NVIDIA CUDA - num_devices = torch.cuda.device_count() - for i in range(num_devices): - total_mem = torch.cuda.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - elif torch.xpu.is_available(): # TODO need verification - num_devices = torch.xpu.device_count() - for i in range(num_devices): - total_mem = torch.xpu.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - - else: - raise RuntimeError("No CUDA or XPU devices found.") - return max_memory - - -def get_device_memory(i: int = 0) -> int: - """ - Gets the available memory on the specified device. - - Args: - i (int, optional): Device index. Defaults to 0. - - Returns: - int: Available memory in gigabytes. 
- """ - if torch.cuda.is_available(): - total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) - elif torch.xpu.is_available(): - raise RuntimeError("XPU does not support device_map='auto' currently.") - else: - raise RuntimeError("No supported device found (CUDA or XPU).") - return total_memory diff --git a/auto_round/utils/model_utils.py b/auto_round/utils/model.py similarity index 82% rename from auto_round/utils/model_utils.py rename to auto_round/utils/model.py index 1b833ca4b..ac34c80ea 100644 --- a/auto_round/utils/model_utils.py +++ b/auto_round/utils/model.py @@ -25,17 +25,87 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme -def clean_module_parameter(submodule, parameter): - if submodule is None: - return - is_buffer = parameter in submodule._buffers - with torch.no_grad(): - if is_buffer: - submodule._buffers[parameter] = None +def convert_dtype_str2torch(str_dtype): + """Converts a string dtype to its corresponding PyTorch dtype. + + Args: + str_dtype (str): The string representation of the dtype. + + Returns: + torch.dtype: The PyTorch dtype. + + Raises: + ValueError: If the input str_dtype is unsupported. + """ + if isinstance(str_dtype, torch.dtype) or str_dtype is None: + return str_dtype + if str_dtype == "int8": + return torch.int8 + elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": + return torch.float + elif str_dtype == "fp16" or str_dtype == "float16": + return torch.float16 + elif str_dtype == "bf16" or str_dtype == "bfloat16": + return torch.bfloat16 + else: + raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") + + +def convert_dtype_torch2str(dtype): + """Converts a PyTorch dtype to its corresponding string representation. + + Args: + dtype: PyTorch dtype or str. The dtype to convert. + + Returns: + str: The string representation of the dtype. + + Raises: + ValueError: If the input dtype is unsupported. + """ + if isinstance(dtype, str) or dtype is None: + return dtype + if dtype == torch.int8: + return "int8" + elif dtype == torch.float: + return "fp32" + elif dtype == torch.float16: + return "fp16" + elif dtype == torch.bfloat16: + return "bf16" + elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: + return dtype + else: + raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.") + + +def convert_dtype_torch2str_hf(dtype): + """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'. + + Args: + dtype: PyTorch dtype or str. The dtype to convert. + + Returns: + str: The string representation of the dtype. + + Raises: + ValueError: If the input str_dtype is unsupported. + """ + if dtype is None: + return dtype + if isinstance(dtype, str): + if "float" not in dtype and "int" not in dtype: + dtype = convert_dtype_str2torch(dtype) else: - submodule._parameters[parameter] = None + return dtype + str_dtype = str(dtype) + if "." 
not in str_dtype: + raise ValueError(f"Unsupported pytorch dtype '{dtype}' for conversion to huggingface str dtype") + str_dtype = str_dtype.split(".")[1] + return str_dtype def check_and_mark_fp8_model(model: torch.nn.Module) -> bool: @@ -120,7 +190,7 @@ def llm_load_model( ): from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer - from auto_round.utils.device_utils import ( + from auto_round.utils.device import ( _use_hpu_compile_mode, get_device_and_parallelism, set_fake_cuda_device_capability, @@ -228,7 +298,7 @@ def mllm_load_model( from huggingface_hub import HfApi, HfFileSystem, hf_hub_download from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer - from auto_round.utils.device_utils import get_device_and_parallelism, set_fake_cuda_device_capability + from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability device_str, use_auto_mapping = get_device_and_parallelism(device) torch_dtype = "auto" @@ -352,8 +422,8 @@ def diffusion_load_model( model_dtype: str = None, **kwargs, ): - from auto_round.utils.device_utils import get_device_and_parallelism - from auto_round.utils.misc_utils import LazyImport + from auto_round.utils.common import LazyImport + from auto_round.utils.device import get_device_and_parallelism device_str, use_auto_mapping = get_device_and_parallelism(device) torch_dtype = "auto" @@ -432,7 +502,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): def is_diffusion_model(model_or_path: Union[str, object]) -> bool: - from auto_round.utils.misc_utils import LazyImport + from auto_round.utils.common import LazyImport if isinstance(model_or_path, str): index_file = None @@ -694,7 +764,7 @@ def set_nested_attr(module, attr_name: str, value): setattr(module, attrs[-1], value) -def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, int, int]: +def _pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, int, int]: """Pads a matrix to make its dimensions multiples of block_size.""" M, N = weight.shape[-2:] block_size_m, block_size_n = block_size @@ -707,7 +777,7 @@ def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, in return padded_weight, M, N # Return original dimensions for unpadding -def unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor: +def _unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor: """Removes padding from the matrix to restore its original shape.""" if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N): return weight @@ -725,7 +795,7 @@ def pad_block_fp8_weight_naive( block_size_m, block_size_n = block_size weight_scale_m, weight_scale_n = weight_scale.shape[-2:] - weight, orig_M, orig_N = pad_weight(weight, block_size) + weight, orig_M, orig_N = _pad_weight(weight, block_size) M, N = weight.shape[-2:] assert weight_scale_m == M // block_size_m @@ -764,11 +834,63 @@ def dequant_block_fp8_weight(weight: torch.Tensor, weight_scale: torch.Tensor, b else: raise ValueError("Only support original weight shape is either 2 or 3") - dequant_weight = unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim) + dequant_weight = _unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim) return dequant_weight +def check_to_quantized(config): + """Checks if the configuration is valid for quantization. 
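+
+    Accepts a dict or QuantizationScheme, a wrapped layer carrying an
+    'orig_layer' attribute, or any object exposing 'bits'/'act_bits'.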
+
+    Args:
+        config (dict or object): The configuration to check. It can be either a
+            dictionary with a 'bits' key or an object with a 'bits' attribute.
+
+    Returns:
+        bool: True if the configuration calls for quantization, i.e. its weight
+            bits or activation bits are 8 or fewer, False otherwise.
+    """
+
+    if isinstance(config, (dict, QuantizationScheme)):
+        bits = int(config.get("bits", 16))
+        act_bits = int(config.get("act_bits", 16))
+    elif hasattr(config, "orig_layer"):
+        bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16
+        act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16
+    else:
+        bits = int(config.bits) if hasattr(config, "bits") else 16
+        act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16
+
+    return bits <= 8 or act_bits <= 8
+
+
+def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None):
+    """
+    Check whether the input sequence length is within the limits defined
+    by the tokenizer and the model configuration.
+
+    Args:
+        input_seqlen (int): The length of the input sequence.
+        tokenizer: Optional, a HuggingFace tokenizer object.
+        model: Optional, a HuggingFace model object.
+
+    Raises:
+        ValueError: If the input sequence length exceeds either limit.
+    """
+    if model is not None and hasattr(model, "config"):
+        model_config = model.config
+        if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings:
+            raise ValueError(
+                f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings("
+                f"{model_config.max_position_embeddings}). Please lower '--seqlen'."
+            )
+    if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length:
+        raise ValueError(
+            f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). "
+            "Please consider lowering '--seqlen' or increasing tokenizer.model_max_length."
+        )
+
+
 def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16):
     """ """
     from auto_round.schemes import QuantizationScheme
@@ -808,24 +930,6 @@ def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16):
     return model
 
 
-def get_shared_keys(model):
-    """
-    Retrieves shared keys from the model's state dictionary.
-
-    Args:
-        model (torch.nn.Module): The model to retrieve shared keys from.
-
-    Returns:
-        tuple: tuple of shared keys.
- """ - from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS - from auto_round.utils.constants import SHARED_CACHE_KEYS - - shared_keys = SHARED_CACHE_KEYS - shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) - return shared_keys - - def _to_model_dtype(model, model_dtype): if model_dtype is not None: try: @@ -872,13 +976,30 @@ def set_module(model, key, new_module): setattr(module, name_list[-1], new_module) -def _get_digital_in_layer_name(layer_name): - pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") - res = re.search(pattern, layer_name) - if res: - return int(res[2]) - else: - return None +def get_layer_features(layer): + """Extracts input and output feature dimensions for supported layers.""" + from auto_round.utils.constants import LinearAllreduce, LinearLayer, deepspeed_exists + + if type(layer) == torch.nn.Linear: + return layer.in_features, layer.out_features + elif type(layer) == transformers.pytorch_utils.Conv1D: # TODO: Verify correctness + return layer.weight.shape[0], layer.weight.shape[1] + elif isinstance(layer, torch.nn.Embedding): + return layer.num_embeddings, layer.embedding_dim + elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce): + return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim) + elif "FP8Linear" in layer.__class__.__name__: + return layer.in_features, layer.out_features + return None, None # Unsupported layer type + + +def get_common_prefix(paths): + # Split each path into components and find the common prefix + split_paths = [path.split(".") for path in paths] + common_prefix = split_paths[0] + for path in split_paths[1:]: + common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] + return ".".join(common_prefix) def unsupported_meta_device(model): @@ -1102,3 +1223,48 @@ def copy_python_files_from_model_cache(model, save_path: str): if file.endswith(".py") and os.path.isfile(full_file_name): logger.debug(f"Transferring {full_file_name} to {save_path}") shutil.copy(full_file_name, save_path) + + +def extract_block_names_to_str(quant_block_list): + if not isinstance(quant_block_list, (list, tuple)): + return None + # Extract common prefix for each list + prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] + # Join prefixes into a single string + return ",".join(prefixes) + + +def find_matching_blocks(model, all_blocks, to_quant_block_names): + """ + Find and return matching blocks in the model based on to_quant_block_names. + + Args: + model: The model (not used in this specific function but kept for completeness). + all_blocks: List of lists, where each inner list contains full block names in the model. + to_quant_block_names: Comma-separated string of target block names to match. + + Returns: + target_blocks: List of lists containing full paths of matching blocks in the model. 
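+
+    Raises:
+        ValueError: If none of the blocks match an entry in to_quant_block_names.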
+    """
+    if not to_quant_block_names:
+        return all_blocks
+    to_quant_block_list = to_quant_block_names
+    if isinstance(to_quant_block_names, (list, tuple)):
+        return to_quant_block_names
+    if isinstance(to_quant_block_names, str):
+        to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")]
+    target_blocks = []
+    for block_list in all_blocks:
+        matched_sublist = []
+        for name in to_quant_block_list:
+            matches = [block for block in block_list if re.search(name, block)]
+            if matches:
+                matched_sublist.extend(matches)
+        if matched_sublist:
+            target_blocks.append(matched_sublist)
+    if not target_blocks:
+        raise ValueError(
+            "No block names matched. Please check the input for to_quant_block_names, "
+            "or set to_quant_block_names to None to automatically match quantizable blocks."
+        )
+    return target_blocks
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 6a6227023..d433e9910 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -24,7 +24,6 @@
     check_to_quantized,
     compile_func,
     deepspeed_exists,
-    get_scale_shape,
     is_mx_fp,
     is_nv_fp,
     set_module,
@@ -35,6 +34,26 @@
     from deepspeed.module_inject import LinearAllreduce, LinearLayer
 
 
+def get_scale_shape(weight, group_size):
+    """Computes the shape of the scale tensor for quantization based on the weight tensor and group size.
+
+    Args:
+        weight (torch.Tensor): The weight tensor of the layer.
+        group_size (int): The size of the groups for quantization.
+
+    Returns:
+        The shape of the scale tensor to be used for quantization.
+    """
+    if group_size == 0:
+        return 1
+    elif group_size == -1 or weight.shape[1] < group_size:
+        shape = weight.shape[0]
+    else:
+        shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size)
+
+    return shape
+
+
 def reshape_and_pad_tensor(v, group_size=-1):
     """Reshapes the tensor based on the group size.
diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 7347118bc..20888486e 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -89,7 +89,7 @@ def test_fp_layers(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) - from auto_round.utils import get_fp_layer_names + from auto_round.compressors.utils import get_fp_layer_names layer_names = get_fp_layer_names(model, "model.decoder.layers.0,model.decoder.layers.1") layer_configs = {} @@ -114,7 +114,7 @@ def test_fp_layers_awq(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) - from auto_round.utils import get_fp_layer_names + from auto_round.compressors.utils import get_fp_layer_names layer_names = get_fp_layer_names(model, "model.decoder.layers.0,model.decoder.layers.1") layer_configs = {} From da05bcc1ec608d85ff0e3da9515150e4ea8f7838 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 26 Oct 2025 22:42:12 -0400 Subject: [PATCH 4/7] reduce file Signed-off-by: n1ck-guo --- auto_round/compressors/utils.py | 6 +-- auto_round/utils/__init__.py | 1 - auto_round/utils/common.py | 65 ++++++++++++++++++++++++++ auto_round/utils/constants.py | 82 --------------------------------- auto_round/utils/device.py | 2 +- auto_round/utils/model.py | 2 +- 6 files changed, 70 insertions(+), 88 deletions(-) delete mode 100644 auto_round/utils/constants.py diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index ff889f8c5..78c48b8ec 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -218,7 +218,7 @@ def infer_bits_by_data_type(data_type: str): Returns: int: bits inferred by data_type, None means cannot infer correct bits by data_type """ - from auto_round.utils.constants import SUPPORTED_DTYPES + from auto_round.utils import SUPPORTED_DTYPES if data_type is None: return 16 @@ -939,7 +939,7 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): list: A list of layer names that match the specified FP layers or are subcomponents of those layers. """ - from auto_round.utils.constants import SUPPORTED_LAYER_TYPES + from auto_round.utils import SUPPORTED_LAYER_TYPES if not fp_layers: return [] @@ -976,7 +976,7 @@ def get_shared_keys(model): tuple: tuple of shared keys. """ from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS - from auto_round.utils.constants import SHARED_CACHE_KEYS + from auto_round.utils import SHARED_CACHE_KEYS shared_keys = SHARED_CACHE_KEYS shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) diff --git a/auto_round/utils/__init__.py b/auto_round/utils/__init__.py index 3ffb5359a..898946ad3 100644 --- a/auto_round/utils/__init__.py +++ b/auto_round/utils/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from auto_round.utils.constants import *
 from auto_round.utils.device import *
 from auto_round.utils.common import *
 from auto_round.utils.model import *
diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index 8c28980be..dea3f8c81 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -20,10 +20,27 @@
 from typing import Any, Callable, Dict, List, Tuple, Union
 
 import torch
+import transformers
+from packaging import version
 
+from auto_round.export.export_to_gguf.config import GGUF_CONFIG
 from auto_round.logger import logger
 
 
+def compare_versions(v1, v2):
+    """Return True if version v1 is greater than or equal to v2."""
+    return version.parse(v1) >= version.parse(v2)
+
+
+def torch_version_at_least(version_string):
+    return compare_versions(torch.__version__, version_string)
+
+
+TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99")
+TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0")
+TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0")
+TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0")
+
+
 class LazyImport(object):
     """Lazy import python module till use."""
 
@@ -60,6 +77,54 @@ def __call__(self, *args, **kwargs):
 htcore = LazyImport("habana_frameworks.torch.core")
 
 
+class SupportedFormats:
+
+    def __init__(self):
+        self._support_format = (
+            "auto_round",
+            "auto_gptq",
+            "auto_awq",
+            "auto_round:auto_gptq",
+            "auto_round:gptqmodel",
+            "auto_round:auto_awq",
+            "auto_round:llm_compressor",
+            "itrex",
+            "itrex_xpu",
+            "fake",
+            "llm_compressor",
+        )
+        self._gguf_format = tuple(sorted(GGUF_CONFIG.keys()))
+        self._support_list = self._support_format + self._gguf_format
+
+    def __contains__(self, key):
+        return key in self._support_list
+
+    def __str__(self):
+        # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s"))
+        return "(%s)" % ", ".join(self._support_list)
+
+    def __getitem__(self, key):
+        return self._support_list[key]
+
+
+SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings")
+
+deepspeed_exists = False
+if importlib.util.find_spec("deepspeed"):  # check if deepspeed is installed
+    deepspeed_exists = True
+
+SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp")
+SUPPORTED_FORMATS = SupportedFormats()
+SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D)
+# Stored as strings, as loading these types relies on triton or other libs
+INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",)
+# transformers.integrations.finegrained_fp8.FP8Linear
+if deepspeed_exists:
+    from deepspeed.module_inject import LinearAllreduce, LinearLayer
+
+    SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce)
+
+
 def is_debug_mode():
     """Checks if the Python interpreter is running in debug mode.
diff --git a/auto_round/utils/constants.py b/auto_round/utils/constants.py
deleted file mode 100644
index ef962bdcd..000000000
--- a/auto_round/utils/constants.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib - -import torch -import transformers -from packaging import version - -from auto_round.export.export_to_gguf.config import GGUF_CONFIG - - -def compare_versions(v1, v2): - return version.parse(v1) >= version.parse(v2) - - -def torch_version_at_least(version_string): - return compare_versions(torch.__version__, version_string) - - -TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99") -TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0") -TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0") -TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0") - - -class SupportedFormats: - - def __init__(self): - self._support_format = ( - "auto_round", - "auto_gptq", - "auto_awq", - "auto_round:auto_gptq", - "auto_round:gptqmodel", - "auto_round:auto_awq", - "auto_round:llm_compressor", - "itrex", - "itrex_xpu", - "fake", - "llm_compressor", - ) - self._gguf_format = tuple(sorted(GGUF_CONFIG.keys())) - self._support_list = self._support_format + self._gguf_format - - def __contains__(self, key): - return True if key in self._support_list else False - - def __str__(self): - # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s")) - return "(%s)" % ", ".join(self._support_list) - - def __getitem__(self, key): - return self._support_list[key] - - -SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") - -deepspeed_exists = False -if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed - deepspeed_exists = True - -SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp") -SUPPORTED_FORMATS = SupportedFormats() -SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) -# Changed to str as it relies on triton or others lib to load this -INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) -# transformers.integrations.finegrained_fp8.FP8Linear -if deepspeed_exists: - from deepspeed.module_inject import LinearAllreduce, LinearLayer - - SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 4aeb66f3e..f475e2157 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -63,7 +63,7 @@ def is_hpu_lazy_mode(): def _use_hpu_compile_mode(): - from auto_round.utils.constants import TORCH_VERSION_AT_LEAST_2_4 + from auto_round.utils.common import TORCH_VERSION_AT_LEAST_2_4 return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode() diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index ac34c80ea..7345c11eb 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -978,7 +978,7 @@ def set_module(model, key, new_module): def get_layer_features(layer): """Extracts input and output feature dimensions for supported layers.""" - from auto_round.utils.constants import LinearAllreduce, LinearLayer, deepspeed_exists + from auto_round.utils.common import LinearAllreduce, LinearLayer, deepspeed_exists if type(layer) == torch.nn.Linear: return layer.in_features, layer.out_features From 50071b689ca440f756ce7b25b7eeb0b0cbf1decd Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 02:47:59 -0400 Subject: [PATCH 5/7] fix merge Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 8 +- auto_round/compressors/utils.py | 1 - .../export_to_nvfp_mxfp.py | 3 +- .../export/export_to_autoround/qlinear_fp.py | 3 +- .../export_to_llmcompressor/export_to_fp.py | 3 +- auto_round/utils.py | 3140 ----------------- auto_round/utils/device.py 
| 2 +- auto_round/utils/model.py | 2 +- auto_round/wrapper.py | 6 +- 9 files changed, 12 insertions(+), 3156 deletions(-) delete mode 100644 auto_round/utils.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index bf5d1a576..9608e6ee4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -42,7 +42,11 @@ gguf_args_check, infer_bits_by_data_type, init_cache, + is_mx_fp, + is_nv_fp, is_standard_fp, + is_static_wfp8afp8, + is_wfp8afp8, reset_params, set_layer_config, ) @@ -87,10 +91,6 @@ is_fp8_linear, is_fp8_model, is_hpex_available, - is_mx_fp, - is_nv_fp, - is_static_wfp8afp8, - is_wfp8afp8, llm_load_model, mv_module_from_gpu, set_amax_for_all_moe_layers, diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 78c48b8ec..ed56c72fa 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -252,7 +252,6 @@ def set_layer_config( """ from auto_round.schemes import get_gguf_scheme - from auto_round.utils.check_utils import is_mx_fp, is_nv_fp from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module # ---- helpers ------------------------------------------------- diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index ba49db53a..819b9d99c 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -25,6 +25,7 @@ import transformers from tqdm import tqdm +from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.export.export_to_autoround.utils import check_neq_config from auto_round.export.utils import filter_quantization_config, save_model from auto_round.logger import logger @@ -36,8 +37,6 @@ copy_python_files_from_model_cache, get_module, get_packing_device, - is_mx_fp, - is_nv_fp, set_amax_for_all_moe_layers, set_module, to_standard_regex, diff --git a/auto_round/export/export_to_autoround/qlinear_fp.py b/auto_round/export/export_to_autoround/qlinear_fp.py index 1e6846777..5f8f2f6f4 100644 --- a/auto_round/export/export_to_autoround/qlinear_fp.py +++ b/auto_round/export/export_to_autoround/qlinear_fp.py @@ -34,10 +34,11 @@ import torch.nn as nn import transformers +from auto_round.compressors.utils import BackendDataType, is_mx_fp, is_nv_fp from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad -from auto_round.utils import BackendDataType, get_packing_device, is_mx_fp, is_nv_fp +from auto_round.utils import get_packing_device # from auto_round.utils import get_weight_compress_dtype logger = getLogger(__name__) diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 67aedd7e9..31766af21 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -24,6 +24,7 @@ import transformers from tqdm import tqdm +from auto_round.compressors.utils import is_mx_fp, is_nv_fp from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear from auto_round.export.export_to_llmcompressor.utils import generate_ignore_regex_list from auto_round.export.utils import filter_quantization_config, save_model @@ -35,8 +36,6 @@ 
copy_python_files_from_model_cache, get_block_names, get_module, - is_mx_fp, - is_nv_fp, set_amax_for_all_moe_layers, set_module, ) diff --git a/auto_round/utils.py b/auto_round/utils.py deleted file mode 100644 index afb7b2940..000000000 --- a/auto_round/utils.py +++ /dev/null @@ -1,3140 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections.abc -import copy -import gc -import importlib -import json -import os -import re -import sys -from collections import UserDict -from dataclasses import asdict, fields -from enum import Enum -from functools import lru_cache -from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, Union - -import cpuinfo -import torch -import transformers -from accelerate.utils import get_balanced_memory -from packaging import version -from torch.amp import autocast - -from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType -from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme, get_gguf_scheme, preset_name_to_scheme - -SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") - -deepspeed_exists = False -if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed - deepspeed_exists = True - - -class SupportedFormats: - - def __init__(self): - self._support_format = ( - "auto_round", - "auto_gptq", - "auto_awq", - "auto_round:auto_gptq", - "auto_round:gptqmodel", - "auto_round:auto_awq", - "auto_round:llm_compressor", - "itrex", - "itrex_xpu", - "fake", - "llm_compressor", - ) - self._gguf_format = tuple(sorted(GGUF_CONFIG.keys())) - self._support_list = self._support_format + self._gguf_format - - def __contains__(self, key): - return True if key in self._support_list else False - - def __str__(self): - # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s")) - return "(%s)" % ", ".join(self._support_list) - - def __getitem__(self, key): - return self._support_list[key] - - -SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp") -SUPPORTED_FORMATS = SupportedFormats() -SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) - -# Changed to str as it relies on triton or others lib to load this -INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) -# transformers.integrations.finegrained_fp8.FP8Linear -if deepspeed_exists: - from deepspeed.module_inject import LinearAllreduce, LinearLayer - - SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce) - - -def infer_bits_by_data_type(data_type: str): - """Infer bits by data_type - - Args: - data_type (str): data_type - - Returns: - int: bits inferred by data_type, None means cannot infer correct bits by data_type - """ - if data_type is None: - return 16 - for supported_dtype in SUPPORTED_DTYPES: - if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): - ##first check the following two bits - suc_2str = 
data_type[len(supported_dtype) : len(supported_dtype) + 2] - if str.isdigit(suc_2str): - return int(suc_2str) - if str.isdigit(data_type[len(supported_dtype)]): - return int(data_type[len(supported_dtype)]) - return None - - -class LazyImport(object): - """Lazy import python module till use.""" - - def __init__(self, module_name): - """Init LazyImport object. - - Args: - module_name (string): The name of module imported later - """ - self.module_name = module_name - self.module = None - - def __getattr__(self, name): - """Get the attributes of the module by name.""" - try: - self.module = importlib.import_module(self.module_name) - mod = getattr(self.module, name) - except: - spec = importlib.util.find_spec(str(self.module_name + "." + name)) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod - - def __call__(self, *args, **kwargs): - """Call the function in that module.""" - function_name = self.module_name.split(".")[-1] - module_name = self.module_name.split(f".{function_name}")[0] - self.module = importlib.import_module(module_name) - function = getattr(self.module, function_name) - return function(*args, **kwargs) - - -auto_gptq = LazyImport("auto_gptq") -htcore = LazyImport("habana_frameworks.torch.core") - - -################ Check available sys.module to decide behavior ################# -def is_package_available(package_name: str) -> bool: - """Check if the package exists in the environment without importing. - - Args: - package_name (str): package name - """ - from importlib.util import find_spec - - package_spec = find_spec(package_name) - return package_spec is not None - - -## check hpex -if is_package_available("habana_frameworks"): - _hpex_available = True - import habana_frameworks.torch.hpex # pylint: disable=E0401 -else: - _hpex_available = False - - -@torch._dynamo.disable() -@lru_cache(None) -def is_hpex_available(): - return _hpex_available - - -def get_module(module, key): - """Get module from model by key name. - - Args: - module (torch.nn.Module): original model - key (str): module name to be replaced - """ - name_list = key.split(".") - for name in name_list: - module = getattr(module, name, None) - return module - - -def set_module(model, key, new_module): - """Set new module into model by key name. - - Args: - model (torch.nn.Module): original model - key (str): module name to be replaced - new_module (torch.nn.Module): new module to be inserted - """ - module = model - name_list = key.split(".") - for name in name_list[:-1]: - if hasattr(module, name): - module = getattr(module, name) - setattr(module, name_list[-1], new_module) - - -def get_scale_shape(weight, group_size): - """Computes the shape of the scale tensor for quantization based on the weight tensor and group size. - - Args: - weight (torch.Tensor): The weight tensor of the layer. - group_size (int): The size of the groups for quantization. - - Returns: - The shape of the scale tensor to be used for quantization. - """ - if group_size == 0: - return 1 - elif group_size == -1 or weight.shape[1] < group_size: - shape = weight.shape[0] - else: - shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size) - - return shape - - -def unsupported_meta_device(model): - """Checks if the model is a valid model for auto_round. - - Args: - model: The model to be checked. - - Returns: - bool: True if the model is valid, False otherwise. 
- """ - target_device = None - for param in model.parameters(): - if target_device is None: - target_device = param.device - if param.device != target_device: - if param.device.type == "meta" or target_device.type == "meta": - return True - if target_device.type == "meta": - if hasattr(model, "path"): - return False - else: - return True - return False - - -def to_device(input, device=torch.device("cpu")): - """Moves input data to the specified device. - - Args: - input: The input data to be moved. - device: The target device. - - Returns: - The input data on the specified device. - """ - if input is None: - return None - if isinstance(input, torch.Tensor): - return input.to(device) - if isinstance(input, dict) or isinstance(input, UserDict): - for inp in input.keys(): - input[inp] = to_device(input[inp], device) - - elif isinstance(input, list) or isinstance(input, tuple): - if len(input) == 0: - return input - input_res = [] - for inp in input: - input_res.append(to_device(inp, device)) - if isinstance(input, tuple): - input_res = tuple(input_res) - input = input_res - - return input - - -def mv_module_from_gpu(module): - """Moves module from gpu to cpu. - - Args: - module: The module to be moved. - - Returns: - The module on the specified device. - """ - if hasattr(module, "device"): - target_device = "cpu" - if module.device.type == target_device: - return module - else: - return module.to(target_device) - else: - return module.to("cpu") - - -def to_dtype(input, dtype=torch.float32): - """Moves input data to the specified data type. - - Args: - input: The input data to be moved. - dtype: The target data type. - - Returns: - The input data on the specified data type. - """ - if input is None: - return None - if isinstance(input, torch.Tensor): - return input.to(dtype) - if isinstance(input, dict) or isinstance(input, UserDict): - for inp in input.keys(): - input[inp] = to_dtype(input[inp], dtype) - - elif isinstance(input, list) or isinstance(input, tuple): - if len(input) == 0: - return input - input_res = [] - for inp in input: - input_res.append(to_dtype(inp, dtype)) - if isinstance(input, tuple): - input_res = tuple(input_res) - input = input_res - - return input - - -def check_is_cpu(device): - """Check if the device is a CPU. - - Args: - device: The device to be checked. - - Returns: - bool: True if the device is a CPU, False otherwise. - """ - return device == torch.device("cpu") or device == "cpu" - - -def get_common_prefix(paths): - # Split each path into components and find the common prefix - split_paths = [path.split(".") for path in paths] - common_prefix = split_paths[0] - for path in split_paths[1:]: - common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] - return ".".join(common_prefix) - - -def extract_block_names_to_str(quant_block_list): - if not isinstance(quant_block_list, (list, tuple)): - return None - # Extract common prefix for each list - prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] - # Join prefixes into a single string - return ",".join(prefixes) - - -def find_matching_blocks(model, all_blocks, to_quant_block_names): - """ - Find and return matching blocks in the model based on to_quant_block_names. - - Args: - model: The model (not used in this specific function but kept for completeness). - all_blocks: List of lists, where each inner list contains full block names in the model. - to_quant_block_names: Comma-separated string of target block names to match. 
- - Returns: - target_blocks: List of lists containing full paths of matching blocks in the model. - """ - if not to_quant_block_names: - return all_blocks - to_quant_block_list = to_quant_block_names - if isinstance(to_quant_block_names, list) or isinstance(to_quant_block_names, tuple): - return to_quant_block_names - if isinstance(to_quant_block_names, str): - to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")] - target_blocks = [] - for block_list in all_blocks: - matched_sublist = [] - for name in to_quant_block_list: - matches = [block for block in block_list if re.search(name, block)] - if matches: - matched_sublist.extend(matches) - if matched_sublist: - target_blocks.append(matched_sublist) - if not target_blocks: - raise ValueError( - "No block names matched. Please check the input for to_quant_block_name," - "or set to_quant_block_name to None to automatically match quantizable blocks." - ) - return target_blocks - - -def get_block_names(model, quant_vision=False): - """Get the block names for transformers-like networks. - - Args: - model: The model. - - Returns: - block_names: A list whose elements are list of block's layer names - """ - from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK - - def _search_block(name, module): - if hasattr(type(module), "__name__") and "ModuleList" in type(module).__name__: - return [(name, module)] - target_modules = [] - for n, m in module.named_children(): - if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - target_modules.append((".".join(filter(None, (name, n))), m)) - else: - target_modules.extend(_search_block(".".join(filter(None, (name, n))), m)) - return target_modules - - def _get_llm_block_names(model): - block_names = [] - target_modules = _search_block("", model) - - for i, target_m in enumerate(target_modules): - block_names.append([]) - for n, m in target_m[1].named_children(): - block_names[i].append(target_m[0] + "." + n) - return block_names - - def _get_vlm_block_names(model, quant_vision=False): - if ( - hasattr(model, "config") - and hasattr(model.config, "model_type") - and model.config.model_type in SPECIAL_MULTIMODAL_BLOCK.keys() - ): - return SPECIAL_MULTIMODAL_BLOCK.get(model.config.model_type)(model, quant_vision=quant_vision) - block_names = [] - target_modules = [] - vision_blocks_tuple = ("vision", "visual", "image", "img") - last_block_name = "" - for n, m in model.named_modules(): - if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - if quant_vision or all(key not in n.lower() for key in (vision_blocks_tuple)): - if last_block_name and last_block_name in n: - continue - target_modules.append((n, m)) - last_block_name = n - for i, target_m in enumerate(target_modules): - block_names.append([]) - for n, m in target_m[1].named_children(): - block_names[i].append(target_m[0] + "." 
+ n) - return block_names - - if quant_vision or not is_pure_text_model(model): - return _get_vlm_block_names(model, quant_vision=quant_vision) - else: - return _get_llm_block_names(model) - - -def collect_best_params(block): - params = {} - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - params[n] = {} - for key in m.params.keys(): - params[n][key] = copy.deepcopy(m.params[key].data) - return params - - -def block_forward( - block: torch.nn.Module, - input_ids: torch.Tensor, - input_others: dict, - amp: bool = False, - amp_dtype: torch.dtype = torch.float16, - device: torch.device = torch.device("cpu"), - output_return_id: int = 0, -) -> Union[torch.Tensor, dict]: - """Performs a forward pass through a block with the given inputs. - - Args: - block: The block to perform the forward pass on. - input_ids: The input IDs. - input_others: A dictionary containing other input data. - amp: A boolean indicating whether to use automatic mixed precision. - amp_dtype: The data type for automatic mixed precision. - device: The target device. - output_return_id: if the output has more than one tenor, return the specified idx tensor. - - Returns: - output: The output of the forward pass. - """ - if input_ids.device != device: - input_ids = to_device(input_ids, device) - input_others = to_device(input_others, device) - input_tuple = input_others.pop("positional_inputs", None) - if "alibi" in input_others.keys() and input_others["alibi"] is not None: - alibi = input_others["alibi"] - input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3]) - if amp: - with autocast(device_type=device.split(":")[0], dtype=amp_dtype): # pragma: no cover - output = block(input_ids, *input_tuple, **input_others) - else: - output = block(input_ids, *input_tuple, **input_others) - if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)): - output = output[output_return_id] - return output - - -def check_to_quantized(config): - """Checks if the configuration is valid for quantization. - - Args: - config (dict or object): The configuration to check. It can be either a - dictionary with a 'bits' key or an object with a 'bits' attribute. - - Returns: - bool: True if the configuration is valid for quantization (bits <= 8), - False otherwise. - """ - if isinstance(config, (dict, QuantizationScheme)): - bits = int(config.get("bits", 16)) - act_bits = int(config.get("act_bits", 16)) - elif hasattr(config, "orig_layer"): - bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16 - act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16 - else: - bits = int(config.bits) if hasattr(config, "bits") else 16 - act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16 - - return bits <= 8 or act_bits <= 8 - - -def detect_device_count(): - """Detects the number of available computation devices. - - This function checks if CUDA is available. If it is, it returns the count - of available CUDA devices. If not, it attempts to import the Habana - device framework to return the count of Habana devices. If the import - fails or no devices are found, it returns 0. - - Returns: - int: The number of available devices (CUDA or Habana). 
- """ - if torch.cuda.is_available(): - return torch.cuda.device_count() - else: - try: - import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401 - - return hthpu.device_count() - except ImportError: - return 0 - - -def detect_device(device: Union[str, int, torch.device] = None) -> str: - """Detects the appropriate computation device. - - This function determines the device to use for computations. It can take - a specific device index or default to 'auto'. The function checks for - available devices in the following order: CUDA, Habana, and finally CPU. - - Args: - device (str, int, or torch.device, optional): The desired device. - If 'auto' or None, the function will determine the best device - automatically. - - Returns: - str: The device to use for computations, formatted as a string. - """ - - def is_valid_digit(s): - try: - num = int(s) - return 0 <= num - except: - return False - - dev_idx = None - if is_valid_digit(device): - dev_idx = int(device) - device = "auto" - if isinstance(device, str) and "," in device: # device is "0,1,2" - device_list = [int(dev) for dev in device.split(",") if dev.isdigit()] - dev_idx = device_list[0] if device_list else None - device = "auto" - if device is None or device == "auto": - if torch.cuda.is_available(): - device = torch.device("cuda") - # logger.info("Using GPU device") - elif is_hpex_available(): # pragma: no cover - device = torch.device("hpu") - # logger.info("Using HPU device") - elif torch.xpu.is_available(): # pragma: no cover - device = torch.device("xpu") - # Use CPU as a fallback - else: - device = torch.device("cpu") - # logger.info("Using CPU device") - if dev_idx is not None and str(device) != "cpu": - device = str(device) + f":{dev_idx}" - return str(device) - elif isinstance(device, torch.device): - device = str(device) - elif isinstance(device, str): ## for cuda:0 - if device == "tp": # pragma: no cover - # should not specify card, e.g., cuda:0 - if torch.cuda.is_available(): - device = "cuda" - elif is_hpex_available(): - device = "hpu" - else: - device = "cpu" - else: - device = device - return device - - -class CpuInfo(object): - """Get CPU Info.""" - - def __init__(self): - """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" - self._bf16 = False - info = cpuinfo.get_cpu_info() - if "arch" in info and "X86" in info["arch"]: - cpuid = cpuinfo.CPUID() - max_extension_support = cpuid.get_max_extension_support() - if max_extension_support >= 7: - eax = cpuid._run_asm( - b"\xb9\x01\x00\x00\x00", # mov ecx, 1 - b"\xb8\x07\x00\x00\x00" b"\x0f\xa2" b"\xc3", # mov eax, 7 # cpuid # ret - ) - self._bf16 = bool(eax & (1 << 5)) - - @property - def bf16(self): - """Get whether it is bf16.""" - return self._bf16 - - -def is_local_path(path): - """Checks if a given path exists locally. - - Args: - path (str): The path to check. - - Returns: - bool: True if the path exists locally, False otherwise. - """ - format_list = ( - "json", - "txt", - ) - flag = None - for x in format_list: - flag = True if x in path else flag - return flag and os.path.exists(path) - - -def convert_dtype_str2torch(str_dtype): - """Converts a string dtype to its corresponding PyTorch dtype. - - Args: - str_dtype (str): The string representation of the dtype. - - Returns: - torch.dtype: The PyTorch dtype. - - Raises: - ValueError: If the input str_dtype is unsupported. 
- """ - if isinstance(str_dtype, torch.dtype) or str_dtype is None: - return str_dtype - if str_dtype == "int8": - return torch.int8 - elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": - return torch.float - elif str_dtype == "fp16" or str_dtype == "float16": - return torch.float16 - elif str_dtype == "bf16" or str_dtype == "bfloat16": - return torch.bfloat16 - else: - raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") - - -def convert_dtype_torch2str(dtype): - """Converts a PyTorch dtype to its corresponding string representation. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input dtype is unsupported. - """ - if isinstance(dtype, str) or dtype is None: - return dtype - if dtype == torch.int8: - return "int8" - elif dtype == torch.float: - return "fp32" - elif dtype == torch.float16: - return "fp16" - elif dtype == torch.bfloat16: - return "bf16" - elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: - return dtype - else: - raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.") - - -def convert_dtype_torch2str_hf(dtype): - """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'. - - Args: - dtype: PyTorch dtype or str. The dtype to convert. - - Returns: - str: The string representation of the dtype. - - Raises: - ValueError: If the input str_dtype is unsupported. - """ - if dtype is None: - return dtype - if isinstance(dtype, str): - if "float" not in dtype and "int" not in dtype: - dtype = convert_dtype_str2torch(dtype) - else: - return dtype - str_dtype = str(dtype) - if "." not in str_dtype: - raise ValueError(f"Unsupported pytorch dtype '{dtype}' for conversion to huggingface str dtype") - str_dtype = str_dtype.split(".")[1] - return str_dtype - - -def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): - """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. - - Args: - device (str): The device type ('cuda' for GPU or 'hpu' for HPU). - inputs (torch.Tensor): Input tensor. - weight (torch.Tensor): Weight tensor. - org_seqlen (int): Original sequence length. - org_bs (int): Original batch size. - - Returns: - tuple: A tuple containing availability status (bool), modified sequence length (int), - and modified batch size (int). 
- """ - weight_memory = weight.numel() * weight.element_size() - if "cuda" in device: - current_gpu_index = torch.cuda.current_device() - total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory - used_memory = torch.cuda.memory_allocated(current_gpu_index) - free_space = total_memory - used_memory - elif "hpu" in device: # pragma: no cover - current_hpu_index = torch.hpu.current_device() - free_space = torch.hpu.memory_reserved(current_hpu_index) - else: - return True, org_seqlen, org_bs - - free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage - seqlen = org_seqlen - bs = org_bs - in_feature = weight.shape[1] - out_feature = weight.shape[0] - while seqlen >= 128: - input_size = bs * seqlen * in_feature - output_size = bs * seqlen * out_feature - input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) - if input_output_memory < free_space: - return True, seqlen, bs - seqlen = seqlen // 2 - bs = 1 - - return False, seqlen, bs - - -def get_layer_names_in_block( - model: torch.nn.Module, - supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), - quant_block_list: list = None, - class_names: tuple = None, -) -> list[str]: - """Retrieves the names of layers within each block of the model. - - Returns: - list: A list of strings, where each string is the name of a layer - within a block of the model. - """ - if class_names is None: - class_names = [] - for n, m in model.named_modules(): - if type(m) in supported_types or (class_names is not None and m.__class__.__name__ in class_names): - m.bk_tmp_name = n - layers_in_block = [] - if bool(quant_block_list): - all_blocks = quant_block_list - else: - all_blocks = get_block_names(model) - for block_names in all_blocks: - for block_name in block_names: - block = get_module(model, block_name) - for n, m in block.named_modules(): - if hasattr(m, "bk_tmp_name"): - layers_in_block.append(m.bk_tmp_name) - delattr(m, "bk_tmp_name") - return layers_in_block - - -def is_autoround_exllamav2_available(): - """Checks if the AutoRound ExLlamaV2 kernels are available. - - Returns: - bool: - True if the AutoRound ExLlamaV2 kernels are available, False otherwise. - """ - res = True - try: - from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix - except ImportError as e: - res = False - return res - - -def get_library_version(library_name): - from packaging.version import Version - - python_version = Version(sys.version.split()[0]) - if python_version < Version("3.8"): - import warnings - - warnings.filterwarnings("ignore", category=DeprecationWarning) - import pkg_resources # pylint: disable=E0401 - - try: - version = pkg_resources.get_distribution(library_name).version - return version - except pkg_resources.DistributionNotFound: - return f"{library_name} is not installed" - else: - import importlib.metadata # pylint: disable=E0401 - - try: - version = importlib.metadata.version(library_name) - return version - except importlib.metadata.PackageNotFoundError: - return f"{library_name} is not installed" - - -def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): - """ - Configures and returns a QuantLinear class based on the specified backend and parameters. - - Args: - backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin", - "exllama", and "cuda". - bits (int, optional): The number of bits for quantization. Default is 4. 
- group_size (int, optional): The group size for quantization. Default is 128. - sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False. - - Returns: - class: The dynamically imported QuantLinear class configured according to the specified parameters. - """ - use_triton = True - if bits not in [2, 4, 8]: - use_triton = False - disable_exllamav2 = True - disable_exllamav1 = False - disable_marlin = True - use_qigen = False - if "qigen" in backend: - use_triton = False - use_qigen = True - elif "triton" in backend: - use_triton = True - elif "marlin" in backend and sym: - use_triton = False - disable_marlin = False - elif "exllama" in backend: ##need v1 code to export - use_triton = True ##same with triton - disable_marlin = True - elif "cuda" in backend: - use_triton = False - disable_marlin = True - disable_exllamav2 = True - disable_exllamav1 = True - if use_triton: - from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear - - return QuantLinear - try: - import auto_gptq # pylint: disable=E0401 - except: - logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}") - exit() - - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401 - - version = get_library_version("auto_gptq") - from packaging.version import Version - - if Version(version) < Version("0.7.2"): - QuantLinear = dynamically_import_QuantLinear( - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - disable_marlin=disable_marlin, - ) - else: - QuantLinear = dynamically_import_QuantLinear( # pylint: disable=E1123 - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - use_marlin=not disable_marlin, - ) - return QuantLinear - - -def _clear_memory_for_cpu_and_cuda(tensor=None): - if isinstance(tensor, list): - for i in range(len(tensor)): - tensor[i] = None - if tensor is not None: - del tensor - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - if torch.xpu.is_available(): - torch.xpu.empty_cache() - - -@torch._dynamo.disable() -def clear_memory(tensor=None): - if is_hpex_available(): - # hpu does not have empty_cache - return - else: - _clear_memory_for_cpu_and_cuda(tensor) - - -def compare_versions(v1, v2): - return version.parse(v1) >= version.parse(v2) - - -def torch_version_at_least(version_string): - return compare_versions(torch.__version__, version_string) - - -TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99") -TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0") -TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0") -TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0") - - -# Note on HPU usage: -# There are two modes available for enabling auto-round on HPU: -# 1. Compile Mode -# 1) Use PyTorch version ≥ 2.4 (Intel® Gaudi® v1.18 or later) -# 2) Set `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1` -# The compile mode can speed up quantization process but still in experimental stage. -# 2. 
Lazy Mode (By default) - - -def is_hpu_lazy_mode(): - return os.getenv("PT_HPU_LAZY_MODE") != "0" - - -def _use_hpu_compile_mode(): - return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode() - - -def compile_func_on_hpu(func): - if _use_hpu_compile_mode(): - return torch.compile(func, backend="hpu_backend") - return func - - -def compile_func_on_cuda_or_cpu(func): - return torch.compile(func) - - -def compile_func( - fun: Union[torch.nn.Module, Callable], device: Union[str, torch.device, int] -) -> Union[torch.nn.Module, Callable]: - """Compile function on the specified device.""" - if "hpu" in str(device): - return compile_func_on_hpu(fun) ## use auto by default - else: - return compile_func_on_cuda_or_cpu(fun) - - -def is_numba_available(): # pragma: no cover - """Check if Numba is available.""" - try: - import numba - - return True - except ImportError: - return False - - -def _is_tbb_installed(): # pragma: no cover - import importlib.metadata - - try: - importlib.metadata.version("tbb") - return True - except importlib.metadata.PackageNotFoundError: - return False - - -def _is_tbb_configured(): # pragma: no cover - try: - from numba.np.ufunc.parallel import _check_tbb_version_compatible - - # check if TBB is present and compatible - _check_tbb_version_compatible() - - return True - except ImportError as e: - logger.warning_once(f"TBB not available: {e}") - return False - - -def is_tbb_available(): # pragma: no cover - """Check if TBB is available.""" - if not _is_tbb_installed(): - logger.warning_once("TBB is not installed, please install it with `pip install tbb`.") - return False - if not _is_tbb_configured(): - logger.warning_once( - ( - "TBB is installed but not configured correctly. \n" - "Please add the TBB library path to `LD_LIBRARY_PATH`, " - "for example: `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/`." - ) - ) - return False - return True - - -def can_pack_with_numba(): # pragma: no cover - """Check if Numba and TBB are available for packing. - - To pack tensor with Numba, both Numba and TBB are required, and TBB should be configured correctly. - """ - if not is_numba_available(): - logger.warning_once("Numba is not installed, please install it with `pip install numba`.") - return False - if not is_tbb_available(): - return False - return True - - -def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): - """Identifies and returns layers in the model to exclude from quantization. - - This function processes a comma-separated list of fully precision (FP) layers, - matches them to the names of layers in the model, and returns a list of such - layers to exclude from quantization. - - Args: - model (torch.nn.Module): The model whose layers will be inspected. - fp_layers (str): A comma-separated string of layer names to be excluded - from quantization. Whitespace is ignored in this string. - - Returns: - list: A list of layer names that match the specified FP layers or are - subcomponents of those layers. - """ - if not fp_layers: - return [] - fp_layers = fp_layers.replace(" ", "").split(",") - all_layer_names = [] - for n, m in model.named_modules(): - if type(m) in SUPPORTED_LAYER_TYPES: - all_layer_names.append(n) - not_to_quantized_layers = [] - - for fp_layer in fp_layers: - if fp_layer == "": - continue - if fp_layer in all_layer_names: - not_to_quantized_layers.append(fp_layer) - continue - if fp_layer[-1].isdigit(): - fp_layer = fp_layer + "." 
## append '.' after a trailing digit so that e.g. 'layer.1' does not also match 'layer.11'
- for name in all_layer_names:
- if fp_layer in name:
- not_to_quantized_layers.append(name)
- logger.trace(f"not_to_quantized_layers: {not_to_quantized_layers}")
- return not_to_quantized_layers
-
-
-def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None):
- """Checks if a model is compatible with the AutoAWQ GEMM kernel.
-
- Args:
- model: The model object to evaluate, typically a PyTorch model.
- bits (int): The number of bits for quantization (must be 4 for compatibility).
- group_size (int): The group size for quantization.
- sym (bool): Whether symmetric quantization is used (not utilized in the current function logic).
- layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each
- configuration can specify a custom number of bits for the layer.
-
- Returns:
- tuple: A tuple containing:
- - bool: `True` if the model is compatible, `False` otherwise.
- - str: An error message describing why the model is incompatible, or an empty string if compatible.
- """
- if bits != 4:
- return False, "AutoAWQ GEMM kernel only supports 4 bits"
- for n, m in model.named_modules():
- if type(m) == transformers.pytorch_utils.Conv1D:
- return False, "AutoAWQ GEMM kernel does not support conv1d"
-
- layer_names = get_layer_names_in_block(model)
- for layer_name in layer_names:
- if (
- layer_configs is not None
- and layer_name in layer_configs.keys()
- and layer_configs[layer_name].get("bits", bits) > 8
- ):
- continue
-
- layer = get_module(model, layer_name)
- if layer.in_features % group_size != 0:
- return False, f"Layer {layer_name} in_features is not a multiple of group_size {group_size}"
- if layer.out_features % (32 // bits) != 0:
- return False, f"Layer {layer_name} out_features is not a multiple of 32 // bits"
-
- return True, ""
-
-
-def get_device_and_parallelism(device: Union[str, torch.device, int]) -> Tuple[str, bool]:
- if isinstance(device, str):
- devices = device.replace(" ", "").split(",")
- elif isinstance(device, int):
- devices = [str(device)]
- else:
- devices = [device]
- if all(s.isdigit() for s in devices) and len(devices) > 1 and torch.cuda.is_available():
- device = "cuda"
- parallelism = True
- elif all(s.isdigit() for s in devices) and len(devices) > 1 and torch.xpu.is_available():
- device = "xpu"
- parallelism = False
- elif device == "auto": # pragma: no cover
- device = detect_device(device)
- parallelism = True
- else:
- device = detect_device(device)
- parallelism = False
- return device, parallelism
-
-
-def set_cuda_visible_devices(device):
- devices = device.replace(" ", "").split(",")
- if all(s.isdigit() for s in devices):
- if "CUDA_VISIBLE_DEVICES" in os.environ:
- current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
- current_visible_devices = current_visible_devices.split(",")
- indices = [int(device) for device in devices]
- try:
- pick_device = [current_visible_devices[i] for i in indices]
- except IndexError:
- raise ValueError(
- "Invalid '--device' value: It must be smaller than the number of available devices."
- " For example, with CUDA_VISIBLE_DEVICES=4,5, "
- "--device 0,1 is valid, but --device 4,5 is not supported."
- )
- visible_devices = ",".join(pick_device)
- os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
- else:
- os.environ["CUDA_VISIBLE_DEVICES"] = device
-
-
-def is_debug_mode():
- """Checks if the Python interpreter is running in debug mode.
-
- Returns:
- bool: True if debugging is enabled, False otherwise.
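-
- Example (illustrative; returns False in a plain interpreter session,
- True when running under a debugger such as pdb):
- >>> is_debug_mode()
- False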
- """ - return sys.gettrace() is not None or sys.flags.debug == 1 - - -def get_layer_features(layer): - """Extracts input and output feature dimensions for supported layers.""" - if type(layer) == torch.nn.Linear: - return layer.in_features, layer.out_features - elif type(layer) == transformers.pytorch_utils.Conv1D: # TODO: Verify correctness - return layer.weight.shape[0], layer.weight.shape[1] - elif isinstance(layer, torch.nn.Embedding): - return layer.num_embeddings, layer.embedding_dim - elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce): - return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim) - elif "FP8Linear" in layer.__class__.__name__: - return layer.in_features, layer.out_features - return None, None # Unsupported layer type - - -def get_gguf_architecture(dir_model, model_type=ModelType.TEXT): - from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( - ModelBase, - get_model_architecture, - ) - - is_mistral_format = False - if isinstance(dir_model, str): - dir_model = Path(dir_model) - - hparams = ModelBase.load_hparams(dir_model, is_mistral_format) - if isinstance(hparams, dict): - tmp_model_type = hparams["model_type"] - else: - tmp_model_type = hparams.model_type - if "mistral" == tmp_model_type: - is_mistral_format = True - hparams = ModelBase.load_hparams(dir_model, is_mistral_format) - if not is_mistral_format: - model_class = get_model_architecture(hparams, model_type) - elif model_type == ModelType.MMPROJ: - assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" - model_class = "PixtralModel" - else: - model_class = "MistralModel" - return model_class - - -def _gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT): - import argparse - - from auto_round.export.export_to_gguf.convert import download_convert_file - from auto_round.utils import logger - - formats = sorted(formats, key=lambda x: len(x)) - export_gguf = False - for f in formats: - if f.startswith("gguf"): - export_gguf = True - - if f.startswith("gguf") and f not in GGUF_CONFIG: - logger.error(f"{f} is not supported, please check.") - - redownload = False - if export_gguf: - try: - from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401 - ModelBase, - ModelType, - get_model_architecture, - ) - - if isinstance(args_or_ar.model, str): - model_path = args_or_ar.model - else: - model_path = args_or_ar.model.name_or_path - if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) - model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) - if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: - logger.warning( - f"Current version of gguf export does not support for {model_architecture}," - " will re-download dependency file." - ) - redownload = True - except ModuleNotFoundError as e: - if "convert_hf_to_gguf" in str(e): - logger.warning("GGUF export dependency file is not found, download from github.") - redownload = True - except AttributeError as e: - raise ImportError( - "Please use the latest gguf-py, you can use the following command to install it:\n" - "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ." 
- ) from e
- download_convert_file(redownload)
-
- try:
- from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401
- ModelBase,
- ModelType,
- )
- except ImportError as e:
- raise ImportError(
- "Please use the latest gguf-py, you can use the following command to install it:\n"
- "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ."
- ) from e
- if isinstance(args_or_ar.model, str):
- model_path = args_or_ar.model
- else:
- model_path = args_or_ar.model.name_or_path
- if not os.path.isdir(model_path):
- model_path = download_hf_model(model_path)
- model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
- if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
- logger.error(f"Model {model_architecture} is not supported for gguf export.")
- sys.exit(1)
-
- pattern = re.compile(r"q\d_k")
- pre_dq_format = ""
- unsupported_list, reset_list = [], []
- for format in GGUF_CONFIG:
- if format in formats:
- if format == "q6_k_s":
- logger.warning("Please note that q6_k_s is q6_k.")
-
- if re.search(pattern, format):
- if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format:
- logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.")
- sys.exit(-1)
- else:
- pre_dq_format = format
-
- unsupported_list, reset_list = [], []
- gguf_config = GGUF_CONFIG[format]
- for k, v in gguf_config.items():
- if not hasattr(args_or_ar, k):
- continue
- if k == "data_type":
- if re.search(r"q\d_1", format) and len(formats) > 1:
- v = "int"
- if k == "sym" and isinstance(args_or_ar, argparse.Namespace):
- k = "asym"
- v = not v
- if getattr(args_or_ar, k) != v:
- unsupported_list.append(f"{k}={getattr(args_or_ar, k)}")
- reset_list.append(f"{k}={v}")
- setattr(args_or_ar, k, v)
- if len(unsupported_list) > 0:
- logger.info(
- f"format {format} does not support {', '.join(unsupported_list)};"
- f" resetting to {', '.join(reset_list)}."
- )
- return args_or_ar
-
-
-def _to_model_dtype(model, model_dtype):
- if model_dtype is not None:
- try:
- if (model_dtype == "float16" or model_dtype == "fp16") and model.dtype != torch.float16:
- model = model.to(torch.float16)
- elif (
- model_dtype == "bfloat16" or model_dtype == "bfp16" or model_dtype == "bf16"
- ) and model.dtype != torch.bfloat16:
- model = model.to(torch.bfloat16)
- elif (model_dtype == "float32" or model_dtype == "fp32") and model.dtype != torch.float32:
- model = model.to(torch.float32)
- except Exception:
- logger.error("failed to convert the model dtype, please use more devices to fit the model or just use one device")
- exit()
- return model
-
-
-def set_fake_cuda_device_capability(func=None):
- if func is not None:
- torch.cuda.get_device_capability = func
- return func
-
- def fake_cuda():
- return 100, 1
-
- orig_func = torch.cuda.get_device_capability
- torch.cuda.get_device_capability = fake_cuda
- return orig_func
-
-
-def _is_fp8_model(model: torch.nn.Module) -> bool:
- if not hasattr(model, "is_fp8"):
- return False
- else:
- return model.is_fp8
-
-
-def _is_fp8_linear(module: torch.nn.Module) -> bool:
- if hasattr(module, "is_fp8_linear"):
- return module.is_fp8_linear
- if not (type(module) == torch.nn.Linear or module.__class__.__name__ == "FP8Linear"):
- return False
- if module.weight is None:
- return False
- if str(module.weight.dtype).startswith("torch.float8"):
- return True
- else:
- return False
-
-
-def check_and_mark_fp8_model(model: torch.nn.Module) -> bool:
- if _is_fp8_model(model):
- return True
- for n, m in model.named_modules():
- if _is_fp8_linear(m):
- m.is_fp8_linear = True
- if not hasattr(model, "is_fp8"):
- model.is_fp8 = True
- if hasattr(model, "is_fp8"):
- return True
- return False
-
-
-def llm_load_model(
- pretrained_model_name_or_path,
- trust_remote_code=True,
- model_dtype=None,
- device="cpu",
- **kwargs,
-):
- from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
-
- device_str, use_auto_mapping = get_device_and_parallelism(device)
- torch_dtype = "auto"
- if device_str is not None and "hpu" in device_str:
- torch_dtype = torch.bfloat16
-
- is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower()))
-
- tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
-
- model_cls = AutoModel if is_glm else AutoModelForCausalLM
- if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code:
- logger.warning("trust_remote_code is enabled by default, please ensure its correctness.")
-
- if _use_hpu_compile_mode():
- model = model_cls.from_pretrained(
- pretrained_model_name_or_path,
- torch_dtype=torch_dtype,
- attn_implementation="eager",
- trust_remote_code=trust_remote_code,
- device_map="auto" if use_auto_mapping else None,
- )
- else:
- try:
- model = model_cls.from_pretrained(
- pretrained_model_name_or_path,
- torch_dtype=torch_dtype,
- trust_remote_code=trust_remote_code,
- device_map="auto" if use_auto_mapping else None,
- )
- except ValueError as e:
- if "FP8 quantized" in str(e):
- orig_func = set_fake_cuda_device_capability()
- model = model_cls.from_pretrained(
- pretrained_model_name_or_path,
- torch_dtype=torch_dtype,
- trust_remote_code=trust_remote_code,
- device_map="auto" if use_auto_mapping else None,
- )
- torch.cuda.get_device_capability = orig_func
- logger.warning("support for fp8 models as input is experimental, please use with caution.")
- else:
- raise
-
- except OSError as e:
- logger.warning(f"failed to load {pretrained_model_name_or_path}, 
set trust_remote_code to False and retry.") - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=False, - device_map="auto" if use_auto_mapping else None, - ) - - model = model.eval() - check_and_mark_fp8_model(model) - model = _to_model_dtype(model, model_dtype) - - return model, tokenizer - - -def mllm_load_model( - pretrained_model_name_or_path, - device="cpu", - torch_dtype="auto", - use_auto_mapping=True, - trust_remote_code=True, - model_dtype=None, - **kwargs, -): - import transformers - from huggingface_hub import HfApi, HfFileSystem, hf_hub_download - from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer - - device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" - if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 - if os.path.isdir(pretrained_model_name_or_path): - config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) - else: - from huggingface_hub import hf_hub_download, list_repo_files - - file_list = list_repo_files(pretrained_model_name_or_path) - if "config.json" in file_list: - # Load plain JSON - config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") - with open(config_path, "r", encoding="utf-8") as f: - config = json.load(f) - elif "config.json.gz" in file_list: - # Load gzipped JSON - import gzip - - config_path = hf_hub_download(pretrained_model_name_or_path, "config.json.gz") - with gzip.open(config_path, "rt", encoding="utf-8") as f: - config = json.load(f) - else: - raise FileNotFoundError(f"No config.json or config.json.gz found for {pretrained_model_name_or_path}") - - if "model_type" in config: - model_type = config["model_type"] - else: - model_type = None - - processor, image_processor = None, None - if "deepseek_vl_v2" == model_type: - from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor # pylint: disable=E0401 - - processor = DeepseekVLV2Processor.from_pretrained(pretrained_model_name_or_path) - tokenizer = processor.tokenizer - model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - else: - architectures = config["architectures"][0] - if architectures == "LlavaLlamaForCausalLM": - from llava.model.builder import load_pretrained_model # pylint: disable=E0401 - - tokenizer, model, image_processor, _ = load_pretrained_model( - pretrained_model_name_or_path, - model_base=None, - model_name=pretrained_model_name_or_path, - torch_dtype=torch_dtype, - ) - else: - if architectures.endswith("Model") and hasattr( - transformers, n := architectures.replace("Model", "ForConditionalGeneration") - ): - cls = getattr(transformers, n) - elif hasattr(transformers, architectures): - cls = getattr(transformers, architectures) - else: - cls = AutoModelForCausalLM - try: - model = cls.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - except ValueError as e: - if "FP8 quantized" in str(e): - orig_func = set_fake_cuda_device_capability() - model = cls.from_pretrained( - pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None, - ) - torch.cuda.get_device_capability = 
orig_func
- logger.warning("support for fp8 models as input is experimental, please use with caution.")
- else:
- raise
-
- if "Mistral-Small-3.2" in pretrained_model_name_or_path:
- from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pylint: disable=E0401
-
- if os.path.isdir(pretrained_model_name_or_path):
- tokenizer = MistralTokenizer.from_file(os.path.join(pretrained_model_name_or_path, "tekken.json"))
- else:
- tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path)
- else:
- tokenizer = AutoTokenizer.from_pretrained(
- pretrained_model_name_or_path, trust_remote_code=trust_remote_code
- )
- processor = AutoProcessor.from_pretrained(
- pretrained_model_name_or_path, trust_remote_code=trust_remote_code
- )
- try:
- from transformers import AutoImageProcessor
-
- image_processor = AutoImageProcessor.from_pretrained(
- pretrained_model_name_or_path, trust_remote_code=trust_remote_code
- )
- except Exception:
- pass # the image processor is optional
-
- model = model.eval()
- check_and_mark_fp8_model(model)
- model = _to_model_dtype(model, model_dtype)
-
- return model, processor, tokenizer, image_processor
-
-
-def diffusion_load_model(
- pretrained_model_name_or_path: str,
- device: Union[str, torch.device] = "cpu",
- torch_dtype: Union[str, torch.dtype] = "auto",
- use_auto_mapping: bool = False,
- trust_remote_code: bool = True,
- model_dtype: str = None,
- **kwargs,
-):
- device_str, use_auto_mapping = get_device_and_parallelism(device)
- torch_dtype = "auto"
- if device_str is not None and "hpu" in device_str:
- torch_dtype = torch.bfloat16
-
- pipelines = LazyImport("diffusers.pipelines")
-
- pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(
- pretrained_model_name_or_path, torch_dtype=torch_dtype
- )
- pipe = _to_model_dtype(pipe, model_dtype)
- model = pipe.transformer
- return pipe, model.to(device)
-
-
-def is_pure_text_model(model):
- """Heuristically checks whether a model is text-only.
-
- Verified on phi-3.5, Mistral-Small-3.1, gemma-3 and qwen2-vl.
- """
- if hasattr(model, "config") and hasattr(model.config, "vision_config"):
- return False
- if hasattr(model.__class__, "main_input_name") and model.__class__.main_input_name != "input_ids":
- return False
- for module in model.modules():
- if hasattr(module.__class__, "main_input_name") and module.__class__.main_input_name != "input_ids":
- return False
- if "vision" in str(module.__class__).lower():
- return False
- if "image" in str(module.__class__).lower():
- return False
- if "img" in str(module.__class__).lower():
- return False
- return True
-
-
-def reset_params(inputs):
- """
- Resets specific input parameters to avoid saving the key-value cache during fine-tuning.
-
- Args:
- inputs (dict): Dictionary of model inputs.
-
- Modifies:
- inputs (dict): Sets "use_cache" to False if the key is present.
- """
- if "use_cache" in inputs.keys(): # Not storing kv cache
- inputs["use_cache"] = False
-
-
-def check_skippable_keywords(key):
- """
- Returns True if the key should be stored during quantization fine-tuning,
- i.e. it does not contain any skippable cache keywords such as "past_key_value".
- """
- skippable_cache_keys = ("past_key_value",)
- for cache_key in skippable_cache_keys:
- if cache_key not in key:
- return True
- return False
-
-
-def init_cache(positional_inputs, inputs):
- """
- Initializes special model inputs by adding positional inputs if missing.
-
- Args:
- positional_inputs (list): List of positional inputs to add to inputs.
- inputs (dict): Dictionary of model inputs.
-
- Modifies:
- inputs (dict): Adds "positional_inputs" key if not present.
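-
- Example (illustrative; `input_ids` and `position_ids` stand for tensors
- captured from the model's forward call):
- >>> inputs = {"input_ids": input_ids}
- >>> init_cache([position_ids], inputs)
- >>> "positional_inputs" in inputs
- True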
- """ - if "positional_inputs" not in inputs: # for chatglm Series - inputs["positional_inputs"] = [] - for idx, item in enumerate(positional_inputs): - inputs["positional_inputs"] = to_device(positional_inputs) - - -def get_shared_keys(model): - """ - Retrieves shared keys from the model's state dictionary. - - Args: - model (torch.nn.Module): The model to retrieve shared keys from. - - Returns: - tuple: tuple of shared keys. - """ - from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS - - shared_keys = SHARED_CACHE_KEYS - shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) - return shared_keys - - -def get_model_dtype(model_dtype, default="auto"): - if model_dtype is None or model_dtype == "auto": - model_dtype = default - elif model_dtype in ["bf16", "bfloat16"]: - model_dtype = "bfloat16" - elif model_dtype in ["f16", "float16", "fp16"]: - model_dtype = "float16" - elif model_dtype in ["f32", "float32", "fp32"]: - model_dtype = "float32" - else: - logger.warning(f"Unable to identify model_dtype {model_dtype}, reset to default model_dtype {default}") - model_dtype = default - return model_dtype - - -def str2bool(v): - import argparse - - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def filter_quantization_config(quantization_config): - default_dict = { - "amp": True, - "batch_size": 8, - "data_type": int, - "dataset": "NeelNanda/pile-10k", - "enable_minmax_tuning": True, - "enable_norm_bias_tuning": False, - "enable_quanted_input": True, - "gradient_accumulate_steps": 1, - "iters": 200, - "low_gpu_mem_usage": False, - "nsamples": 128, - "scale_dtype": "torch.float16", - "seqlen": 2048, - } - iters = quantization_config.get("iters", 200) - - default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3 - default_dict["minmax_lr"] = default_dict["lr"] - - for key in default_dict: - if key in quantization_config and default_dict[key] == quantization_config[key]: - quantization_config.pop(key) - for k in list(quantization_config.keys()): - if quantization_config[k] is None: - quantization_config.pop(k) - - if quantization_config.get("act_bits", 16) >= 16: - quantization_config.pop("act_bits", None) - quantization_config.pop("act_data_type", None) - quantization_config.pop("act_dynamic", None) - quantization_config.pop("act_sym", None) - quantization_config.pop("act_group_size", None) - - -def check_start_with_block_name(name: str, block_name_to_quantize: list): - """ - Checks if the given layer name starts with any of the block names to be quantized. - - Args: - name (str): The name of the layer. - block_name_to_quantize (list): A list of block names to check against. - - Returns: - bool: True if the layer name starts with any of the block names, False otherwise. - """ - for block_name in block_name_to_quantize: - if name.startswith(block_name): - return True - return False - - -def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None): - """ - Check whether the input sequence length is within the limits defined - by the tokenizer and the model configuration. - - Args: - input_seqlen (int): The length of the input sequence. - tokenizer: Optional, a HuggingFace tokenizer object. - model: Optional, a HuggingFace model object. - - Returns: - ValueError: if the input length is not valid, riase Error. 
- """ - if model is not None and hasattr(model, "config"): - model_config = model.config - if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings: - raise ValueError( - f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings(" - f"{model_config.max_position_embeddings}). Please lowering '--seqlen'" - ) - if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length: - raise ValueError( - f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). " - "Please oncider Consider lowering the '--seqlen' or increasing tokenizer.model_max_length." - ) - - -def _use_more_bits(i_layer: int, n_layer: int): - return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2) - - -def _get_digital_in_layer_name(layer_name): - pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") - res = re.search(pattern, layer_name) - if res: - return int(res[2]) - else: - return None - - -def _search_gguf_type(gguf_type): - if gguf_type in GGUF_INNER_CONFIG: - return gguf_type - pattern = re.compile("gguf:q([0-9]{1,})_[01k]") - bits = re.search(pattern, gguf_type) - if not bits: - raise KeyError(f"{gguf_type} is not a correct gguf type, please check") - - for suffix in ["_k", "_0", "_1"]: - if gguf_type.endswith(suffix): - continue - if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG: - return tmp_type - return None - - -def _gguf_type_fallback(gguf_type: str) -> str: - gguf_type = gguf_type.lower() - if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): - gguf_type = "gguf:q5_0" - elif gguf_type == "gguf:q5_k": - gguf_type = "gguf:q5_0" - elif gguf_type == "gguf:q6_k": - gguf_type = "gguf:q8_0" - return gguf_type - - -##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 -def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): - # # TODO: support for other format later - # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) - - import gguf # pylint: disable=E0401 - - # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture - convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf") - - model_architecture = convert_hf_to_gguf.get_model_architecture( - hparams=model.config.to_dict(), model_type=model_type - ) - try: - model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type) - except NotImplementedError: - return layer_config, {} - - n_layer = None - for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]: - sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config" - if hasattr(model.config, name): - n_layer = getattr(model.config, name) - break - if hasattr(model.config, sub_attr): - if hasattr(getattr(model.config, sub_attr), name): - n_layer = getattr(getattr(model.config, sub_attr), name) - break - if n_layer is None: - return layer_config, {} - - tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer) - - def _set_config(config, target_config): - for k, v in target_config.items(): - if isinstance(config, dict): - config[k] = v - else: - setattr(config, k, v) - return config - - gguf_format_config = {} - lm_head_name = get_lm_head_name(model) - inner_gguf_format = GGUF_CONFIG[target_gguf_format]["mostly"] - # 
- block_size = GGML_QUANT_SIZES[inner_gguf_format.split(":")[-1].lower()][0]
- tie_word_embeddings = True
- if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"):
- tie_word_embeddings = model.config.tie_word_embeddings
-
- n_gqa = 1
- if (
- hasattr(model, "config")
- and hasattr(model.config, "num_attention_heads")
- and hasattr(model.config, "num_key_value_heads")
- ):
- n_gqa = model.config.num_attention_heads // model.config.num_key_value_heads
- n_expert = 0
- for name in ["num_experts", "num_local_experts", "n_routed_experts"]:
- if hasattr(model.config, name):
- n_expert = getattr(model.config, name)
-
- i_attention_wv = 0
- i_ffn_down = 0
- layer_config_copy = copy.deepcopy(layer_config)
- target_bits = None
- if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit():
- target_bits = int(inner_gguf_format[6])
-
- for layer_name, config in layer_config_copy.items():
- if not check_to_quantized(config):
- continue
- new_type = GGUF_CONFIG[target_gguf_format]["mostly"]
- layer = get_module(model, layer_name)
- if type(layer) == transformers.pytorch_utils.Conv1D:
- input_features = layer.weight.shape[0]
- else:
- input_features = layer.weight.shape[-1]
- i_layer = _get_digital_in_layer_name(layer_name)
-
- if lm_head_name is not None and layer_name == lm_head_name:
- target_bits = int(re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["lm_head"]).group(1))
- if isinstance(layer, torch.nn.Embedding):
- target_bits = int(
- re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1)
- )
-
- gguf_name = tensor_map.get_name(layer_name)
- bits_index = 6
- if config.get("fixed_by_user", False):
- if "bits" not in config:
- logger.warning(
- f"Setting layer_config requires providing bits, {layer_name} has no bits,"
- f" using bits={target_bits} instead."
- )
- new_type = new_type[:bits_index] + str(target_bits) + new_type[bits_index + 1 :]
- else:
- config_tmp = config.copy()
- scheme_keys = [f.name for f in fields(QuantizationScheme)]
- for key in config.keys():
- if key not in scheme_keys:
- config_tmp.pop(key, None)
- matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched
- if not matched_scheme:
- if config.get("super_group_size", None) is not None or config.get("super_bits", None) is not None:
- new_type = new_type[:bits_index] + str(config["bits"]) + "_k"
- if new_type not in GGUF_INNER_CONFIG:
- prefix_idx = 0 if config.get("sym", True) else 1
- new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}"
- if new_type not in GGUF_INNER_CONFIG:
- new_type = new_type[:bits_index] + str(config["bits"]) + f"_{1-prefix_idx}"
- if new_type not in GGUF_INNER_CONFIG:
- raise ValueError(
- f"The settings in layer_config for {layer_name} "
- f"do not match any supported gguf format, please double-check."
- )
- else:
- logger.warning_once(
- f"The settings in layer_config for {layer_name} "
- f"do not match any supported gguf format, resetting to {new_type}"
- )
- new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
- new_type = _search_gguf_type(new_type)
- if new_type is None:
- raise ValueError(f"invalid bit setting for {layer_name}")
- elif target_bits is not None and "bits" in config and config["bits"] != target_bits:
- new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
- new_type = _search_gguf_type(new_type)
- if new_type is None:
- raise ValueError(f"invalid bit setting for {layer_name}")
- elif lm_head_name is not None and layer_name == lm_head_name and not tie_word_embeddings:
- if gguf.MODEL_ARCH.FALCON == model_class.model_arch or input_features % block_size != 0:
- new_type = "gguf:q8_0"
- elif "lm_head" in GGUF_CONFIG[target_gguf_format]:
- new_type = GGUF_CONFIG[target_gguf_format]["lm_head"]
- elif new_type != "gguf:q8_0":
- new_type = "gguf:q6_k"
- elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings:
- # new_type = GGUF_CONFIG[target_gguf_format]["lm_head"]
- continue
- elif isinstance(layer, torch.nn.Embedding):
- if "embedding" in GGUF_CONFIG[target_gguf_format]:
- new_type = GGUF_CONFIG[target_gguf_format]["embedding"]
- elif gguf_name is None:
- pass
- # attn_v
- elif "attn_v" in gguf_name:
- if target_gguf_format == "gguf:q2_k":
- new_type = "gguf:q4_k" if n_gqa >= 4 else "gguf:q3_k"
- elif target_gguf_format == "gguf:q2_k_s" and n_gqa >= 4:
- new_type = "gguf:q4_k"
- elif target_gguf_format == "gguf:q3_k_m":
- new_type = "gguf:q5_k" if i_attention_wv < 2 else "gguf:q4_k"
- elif target_gguf_format == "gguf:q3_k_l":
- new_type = "gguf:q5_k"
- elif (target_gguf_format == "gguf:q4_k_m" or target_gguf_format == "gguf:q5_k_m") and _use_more_bits(
- i_layer, n_layer
- ):
- new_type = "gguf:q6_k"
- elif target_gguf_format == "gguf:q4_k_s" and i_attention_wv < 4:
- new_type = "gguf:q5_k"
- ##TODO: check which models are grouped into LLM_TYPE_70B
- # Reference (llama.cpp, src/llama-quant.cpp):
- # if (qs.model.type == LLM_TYPE_70B) {
- #     // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the
- #     // attn_v.weight tensor is 8x smaller compared to attn_q.weight. Hence, we can get a nice
- #     // boost in quantization accuracy with nearly negligible increase in model size by
- #     // quantizing this tensor with more bits.
- #     if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
- # }
- if n_expert == 8:
- new_type = "gguf:q8_k"
- i_attention_wv += 1
-
- elif "attn_k" in gguf_name:
- if n_expert == 8:
- new_type = "gguf:q8_0"
- # ffn_down
- elif "ffn_down" in gguf_name:
- if target_gguf_format == "gguf:q2_k":
- new_type = "gguf:q3_k"
- elif target_gguf_format == "gguf:q2_k_s":
- if i_layer < n_layer / 8:
- new_type = "gguf:q4_k"
- elif target_gguf_format == "gguf:q3_k_m":
- if i_layer < n_layer / 16:
- new_type = "gguf:q5_k"
- elif gguf.MODEL_ARCH.FALCON == model_class.model_arch or _use_more_bits(i_layer, n_layer):
- new_type = "gguf:q4_k"
- else:
- new_type = "gguf:q3_k"
- elif target_gguf_format == "gguf:q3_k_l":
- if gguf.MODEL_ARCH.FALCON == model_class.model_arch:
- new_type = "gguf:q4_k"
- else:
- new_type = "gguf:q5_k"
- elif target_gguf_format == "gguf:q4_k_m":
- if gguf.MODEL_ARCH.FALCON == model_class.model_arch:
- if i_layer < n_layer // 16:
- new_type = "gguf:q6_k"
- elif _use_more_bits(i_layer, n_layer):
- new_type = "gguf:q5_k"
- else:
- new_type = "gguf:q4_k"
- else:
- if _use_more_bits(i_layer, n_layer):
- new_type = "gguf:q6_k"
- elif target_gguf_format == "gguf:q5_k_m" and _use_more_bits(i_layer, n_layer):
- new_type = "gguf:q6_k"
- elif (
- target_gguf_format == "gguf:q4_k_s"
- and model_class.model_arch != gguf.MODEL_ARCH.FALCON
- and i_layer < n_layer / 8
- ):
- new_type = "gguf:q5_k"
- elif (target_gguf_format == "gguf:q4_0" or target_gguf_format == "gguf:q5_0") and i_layer < n_layer / 8:
- if target_gguf_format == "gguf:q4_0":
- new_type = "gguf:q4_1"
- else:
- new_type = "gguf:q5_1"
- i_ffn_down += 1
-
- # attn_output
- elif "attn_output" in gguf_name:
- if gguf.MODEL_ARCH.FALCON != model_class.model_arch:
- if n_expert == 8:
- if target_gguf_format in (
- "gguf:q2_k",
- "gguf:q3_k_s",
- "gguf:q3_k_m",
- "gguf:q4_k_s",
- "gguf:q4_k_m",
- "gguf:q5_k",
- ):
- new_type = "gguf:q5_k"
- elif target_gguf_format == "gguf:q2_k":
- new_type = "gguf:q3_k"
- elif target_gguf_format == "gguf:q3_k_m":
- new_type = "gguf:q4_k"
- elif target_gguf_format == "gguf:q3_k_l":
- new_type = "gguf:q5_k"
- else:
- if target_gguf_format == "gguf:q3_k_l":
- new_type = "gguf:q4_k"
- # attn_qkv
- elif "attn_qkv" in gguf_name:
- if target_gguf_format in ("gguf:q3_k_m", "gguf:q3_k_l"):
- new_type = "gguf:q4_k"
- elif target_gguf_format == "gguf:q4_k_m":
- new_type = "gguf:q5_k"
- elif target_gguf_format == "gguf:q5_k_m":
- new_type = "gguf:q5_k"
- new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0]
- if input_features % new_block_size != 0:
- new_type = _gguf_type_fallback(new_type)
- new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0]
- if input_features % new_block_size != 0:
- new_type = "gguf:bf16"
- logger.warning(
- f"falling back {layer_name} to {new_type}, "
- f"because input_features({input_features}) % block_size({block_size}) != 0"
- )
- # for deepseek v2
- if layer_name.endswith("kv_b_proj") and new_type.endswith("_k") and "Deepseek" in model.config.architectures[0]:
- fallback = False
-
- # check whether a fallback is needed
- qk_nope_head_dim = model.config.qk_nope_head_dim
- kv_b_shape = get_module(model, layer_name).weight.shape
-
- if (
- qk_nope_head_dim < QK_K
- or qk_nope_head_dim % QK_K != 0
- or kv_b_shape[-1] < QK_K
- or kv_b_shape[-1] % QK_K != 0
- ):
- fallback = True
- if fallback:
- tmp_type = _gguf_type_fallback(new_type)
- logger.warning_once(
- f"self_attn.kv_b_proj does not support the use of {new_type}, replacing it with {tmp_type}"
- )
- new_type = tmp_type
-
- target_config = GGUF_INNER_CONFIG[new_type]
-
- _set_config(layer_config[layer_name], target_config)
- _set_config(layer, target_config)
- gguf_format_config[layer_name] = new_type
-
- return layer_config, gguf_format_config
-
-
-def get_lm_head_name(model):
- block_names = get_block_names(model, True)
- last_name = None
- for n, m in model.named_modules():
- if any(m.children()):
- continue
- last_name = n
- for l in block_names:
- if last_name in l:
- last_name = None
- break
- return last_name
-
-
-def get_gguf_qtype_by_layer_config(layer_config):
- import gguf # pylint: disable=E0401
-
- if layer_config["bits"] >= 16:
- return None
- bits = layer_config["bits"]
- super_bits = layer_config.get("super_bits", None)
- sym = layer_config["sym"]
- group_size = layer_config.get("group_size", None)
- super_group_size = layer_config.get("super_group_size", None)
- if bits == 2 and super_bits == 4 and not sym and group_size == 16 and super_group_size == 16:
- return gguf.GGMLQuantizationType.Q2_K
- if bits == 3 and super_bits == 6 and sym and group_size == 16 and super_group_size == 16:
- return gguf.GGMLQuantizationType.Q3_K
- if bits == 4:
- if super_bits is not None and super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
- return gguf.GGMLQuantizationType.Q4_K
- if super_bits is None and sym and group_size == 32:
- return gguf.GGMLQuantizationType.Q4_0
- if super_bits is None and not sym and group_size == 32:
- return gguf.GGMLQuantizationType.Q4_1
- if bits == 5:
- if super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
- return gguf.GGMLQuantizationType.Q5_K
- if super_bits is None and sym and group_size == 32:
- return gguf.GGMLQuantizationType.Q5_0
- if super_bits is None and not sym and group_size == 32:
- return gguf.GGMLQuantizationType.Q5_1
- if bits == 6 and super_bits == 8 and group_size == 16 and super_group_size == 16:
- return gguf.GGMLQuantizationType.Q6_K
- if bits == 8 and sym and group_size == 32:
- return gguf.GGMLQuantizationType.Q8_0
- raise ValueError("Unknown layer config")
-
-
-def flatten_list(nested_list):
- flattened = []
- for item in nested_list:
- if isinstance(item, (list, tuple)):
- flattened.extend(flatten_list(item))
- else:
- flattened.append(item)
- return flattened
-
-
-def clean_module_parameter(submodule, parameter):
- if submodule is None:
- return
- is_buffer = parameter in submodule._buffers
- with torch.no_grad():
- if is_buffer:
- submodule._buffers[parameter] = None
- else:
- submodule._parameters[parameter] = None
-
-
-def get_reciprocal(tensor):
- if tensor.dtype == torch.float16:
- tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5)
- else:
- tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
- return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))
-
-
-def check_need_act_calibration(
- is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16
-) -> bool:
- if act_bits is None or act_bits > 8:
- return False
- # None is dynamic
- if is_act_dynamic is not None and not is_act_dynamic:
- return True
- if act_data_type is not None and "static" in act_data_type:
- return True
- return False
-
-
-def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor, int, int]:
- """Pads a matrix so that its dimensions are multiples of block_size."""
- M, N = weight.shape[-2:]
- block_size_m, block_size_n = block_size
- pad_M = (block_size_m - M % block_size_m) % block_size_m
- pad_N = (block_size_n - N % block_size_n) % block_size_n
-
- if pad_M == 0 and pad_N == 0:
- return weight, M, N # No padding needed
- padded_weight = torch.nn.functional.pad(weight, (0, pad_N, 0, pad_M), mode="constant", value=0)
- return padded_weight, M, N # Return original dimensions for unpadding
-
-
-def unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor:
- """Removes padding from the matrix to restore its original shape."""
- if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N):
- return weight
- if keep_first_dim:
- return weight[:, :original_M, :original_N]
- else:
- return weight[:original_M, :original_N]
-
-
-def pad_block_fp8_weight_naive(
- weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list
-) -> Tuple[torch.Tensor, int, int]:
- assert len(block_size) == 2
-
- block_size_m, block_size_n = block_size
- weight_scale_m, weight_scale_n = weight_scale.shape[-2:]
-
- weight, orig_M, orig_N = pad_weight(weight, block_size)
- M, N = weight.shape[-2:]
-
- assert weight_scale_m == M // block_size_m
- assert weight_scale_n == N // block_size_n
-
- return weight, orig_M, orig_N
-
-
-def dequant_block_fp8_weight(weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list) -> torch.Tensor:
- dtype = torch.bfloat16
- if weight_scale is None:
- return weight
- assert len(block_size) == 2
-
- weight, orig_M, orig_N = pad_block_fp8_weight_naive(weight, weight_scale, block_size)
-
- weight_shape_len = len(weight.shape)
-
- block_size_m, block_size_n = block_size
-
- # multiply the per-block scales into the weight
- if weight_shape_len == 2:
- weight_scale_m, weight_scale_n = weight_scale.shape
- weight_scale = weight_scale.view(weight_scale_m, 1, weight_scale_n, 1)
- weight = weight.view(weight_scale_m, block_size_m, weight_scale_n, block_size_n)
- dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
- dequant_weight = dequant_weight.view(weight_scale_m * block_size_m, weight_scale_n * block_size_n)
- keep_first_dim = False
- elif weight_shape_len == 3:
- fd, weight_scale_m, weight_scale_n = weight_scale.shape
- weight_scale = weight_scale.view(fd, weight_scale_m, 1, weight_scale_n, 1)
- weight = weight.view(fd, weight_scale_m, block_size_m, weight_scale_n, block_size_n)
- dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
- dequant_weight = dequant_weight.view(fd, weight_scale_m * block_size_m, weight_scale_n * block_size_n)
- keep_first_dim = True
- else:
- raise ValueError("Only 2D or 3D original weight shapes are supported")
-
- dequant_weight = unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim)
-
- return dequant_weight
-
-
-def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16):
- """Converts an FP8 linear layer into a plain torch.nn.Linear with dequantized weights in `dtype`."""
- new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype)
- if layer.bias is not None:
- new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype))
- scheme_keys = (f.name for f in fields(QuantizationScheme))
- keys = tuple(scheme_keys) + ("tmp_name", "scale_dtype")
- for key in keys:
- setattr(new_layer, key, getattr(layer, key, None))
-
- if layer.__class__.__name__ == "CompressedLinear":
- dq_weight = layer.compressor.decompress_module(layer)
- else:
- weight_scale = layer.weight_scale if 
hasattr(layer, "weight_scale") else layer.weight_scale_inv - dq_weight = dequant_block_fp8_weight(layer.weight, weight_scale, layer.block_size) - new_layer.weight.data.copy_(dq_weight.to(dtype=dtype)) - return new_layer - - -def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): - """ - Convert a model with FP8 quantized layers to a model with 16-bit linear layers. - This is useful for compatibility with other frameworks or for further processing. - """ - cnt = 0 - for n, m in model.named_modules(): - if m.__class__.__name__ == "FP8Linear": - new_module = convert_fp8_layer_to_linear(m, dtype=dtype) - set_module(model, n, new_module) - cnt += 1 - if cnt % 10 == 0: # Tricky setting - clear_memory() - return model - - -def out_of_vram(error_msg): - error_msg = str(error_msg) - # CUDA - if "CUDA out of memory" in error_msg: - return True - # gaudi - if "MODULE:PT_DEVMEM" in error_msg: - return True - # XPU - if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg: - return True - # ROCM - if "HIP out of memory. Tried to allocate" in error_msg: - return True - return False - - -def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): - """Download hugging face model from hf hub.""" - from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE - from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name - - if cache_dir is None: - cache_dir = HUGGINGFACE_HUB_CACHE - if revision is None: - revision = DEFAULT_REVISION - if repo_type is None: - repo_type = "model" - storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) - commit_hash = None - if REGEX_COMMIT_HASH.match(revision): - commit_hash = revision - else: - ref_path = os.path.join(storage_folder, "refs", revision) - if os.path.exists(ref_path): - with open(ref_path) as f: - commit_hash = f.read() - if storage_folder and commit_hash: - pointer_path = os.path.join(storage_folder, "snapshots", commit_hash) - if os.path.isdir(pointer_path): - return pointer_path - else: # pragma: no cover - from huggingface_hub import snapshot_download - - model_path = snapshot_download(repo_id) - return model_path - - -def is_moe(module: torch.nn.Module) -> bool: - """Returns whether the module is an MOE layer.""" - return any( - key in type(module).__name__.lower() - for key in [ - "MixtralSparseMoeBlock".lower(), - "ArcticMoE".lower(), - "DbrxFFN".lower(), - "MoELayer".lower(), - "PhimoeSparseMoeBlock".lower(), - "DeepseekMoE".lower(), - "DeepseekV2MoE".lower(), - "DeepseekV3MoE".lower(), - "Qwen2MoeSparseMoeBlock".lower(), - "Qwen3MoeSparseMoeBlock".lower(), - ] - ) - - -# please refer to https://github.com/NVIDIA/TensorRT-Model-Optimizer -# /blob/4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/layer_utils.py#L976 -def get_expert_linear_names(module: torch.nn.Module) -> list[str]: - """Get the list of linear names for the experts.""" - - def module_match_name_list(module, name_list): - """Check if the module name matches any of the names in the list. - - e.g. 
module_match_name_list(QuantQwen3MoeSparseMoeBlock, ['Qwen3MoeSparseMoeBlock']) -> True - - """ - return any(name.lower() in type(module).__name__.lower() for name in name_list) - - if module_match_name_list( - module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"] - ): - return ["gate_proj", "down_proj", "up_proj"] - elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]): - return ["linear_fc1", "linear_fc2"] - elif module_match_name_list(module, ["DBRXMoeSparseMoeBlock"]): - return ["w1_linear", "w2_linear", "v1_linear"] - else: - # assuming w1, w2, w3 by default - return ["w1", "w2", "w3"] - - -def get_nested_attr(module, attr_name: str): - """Recursively get nested attribute (e.g., 'orig_layer.act_max').""" - attrs = attr_name.split(".") - for attr in attrs: - if not hasattr(module, attr): - return None - module = getattr(module, attr) - return module - - -def set_nested_attr(module, attr_name: str, value): - """Recursively set nested attribute (e.g., 'orig_layer.act_max' = value).""" - attrs = attr_name.split(".") - for attr in attrs[:-1]: - if not hasattr(module, attr): - return None # No need to set act_max for fp layers - module = getattr(module, attr) - setattr(module, attrs[-1], value) - - -def set_amax_for_uncalibrated_experts( - experts: torch.nn.Module, set_amax_value: float | None = None, attr_name="act_max" -): - """Set amax of uncalibrated experts to a given value or the max of existing amax value from other experts. - - Args: - experts: a list of experts - set_amax_value: set amax value to the given value. - If None, set amax value to the max of existing amax value from other experts. - - Returns: - uncalibrated_experts: a list of uncalibrated experts - """ - uncalibrated_experts = [] - # get the max amax value from all experts - if set_amax_value is None: - amax_values = [ - get_nested_attr(module, attr_name) for module in experts if get_nested_attr(module, attr_name) is not None - ] - if len(amax_values) == 0: - return uncalibrated_experts - # Flatten all tensors to 1D before concatenation - flat_values = [t.reshape(-1) for t in amax_values] - all_values = torch.cat(flat_values) - set_amax_value = torch.max(all_values) - - for module in experts: - if get_nested_attr(module, attr_name) is None: - logger.warning_once( - "Missing amax value of expert layers." - "This typically occurs in MoE models when certain experts are not activated during calibration. " - "Consider increasing your calibration dataset size to ensure all experts are exercised." 
- ) - # Use float32 dtype explicitly to ensure we create a floating point tensor - if not isinstance(set_amax_value, torch.Tensor): - set_amax_value = torch.tensor(set_amax_value, dtype=torch.float32) - set_nested_attr(module, attr_name, set_amax_value) - # uncalibrated_experts.append(module) - - -# Please refer to: https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/ -# 4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/unified_export_hf.py#L195-L207 -def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_name="act_max"): - if layer_name is not None: - parts = layer_name.split(".") - if "experts" not in parts: - raise ValueError - idx = parts.index("experts") - moe_name = ".".join(parts[:idx]) - model = get_module(model, moe_name) - # Handle input quantizers of experts that are not calibrated - for name, sub_module in model.named_modules(): - if not (is_moe(sub_module) and hasattr(sub_module, "experts")): - continue - expert_linear_names = get_expert_linear_names(sub_module) - for linear_name in expert_linear_names: - if isinstance(sub_module.experts, collections.abc.Iterable): - # For other MoE models (like Mixtral) with iterable experts - try: - set_amax_for_uncalibrated_experts( - [getattr(expert, linear_name, None) for expert in sub_module.experts], attr_name=attr_name - ) - except AttributeError as e: - # Provide more helpful debugging information - expert_types = list(set(type(expert).__name__ for expert in sub_module.experts)) - raise AttributeError( - f"Failed to access attribute '{linear_name}' on experts. " - f"MoE module type: {type(sub_module).__name__}, " - f"Expert types: {expert_types}, " - f"Expected linear names: {expert_linear_names}. " - f"This suggests the get_expert_linear_names function may need " - f"to be updated for this model architecture. " - f"Original error: {e}" - ) from e - else: - # Unsupported MoE model structure - raise NotImplementedError( - f"MoE model with experts type '{type(sub_module.experts).__name__}' is not supported in export." - f"Please file an issue or add support for this model architecture." - ) - - -class BackendDataType(str, Enum): - STANDARD_FP = "fp" - MX_FP = "mx_fp" - NV_FP = "nv_fp" - - -def is_standard_fp(backend): - backend = backend.lower() - return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend) - - -def is_mx_fp(backend): - backend = backend.lower() - return BackendDataType.MX_FP in backend - - -def is_nv_fp(backend): - backend = backend.lower() - return BackendDataType.NV_FP in backend - - -def _is_weight_fp8_activation_static_fp8( - bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool -) -> bool: - return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic - - -def is_wfp8afp8(ar): - if ( - ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) - and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)) - and is_standard_fp(ar.act_data_type) - and is_standard_fp(ar.data_type) - ): - return True - else: - return False - - -def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool: - if isinstance(ar_or_format, str): - return "fp8_static" in ar_or_format - if ar_or_format.act_dynamic: - return False - if is_wfp8afp8(ar_or_format): - return True - return False - - -def bytes_to_gigabytes(bytes) -> int: - """ - Converts bytes to gigabytes. - - Args: - bytes (int): The number of bytes. - - Returns: - int: The equivalent number of gigabytes. 
- """ - return bytes / 1024 / 1024 / 1024 - - -def get_device_memory(i: int = 0) -> int: - """ - Gets the available memory on the specified device. - - Args: - i (int, optional): Device index. Defaults to 0. - - Returns: - int: Available memory in gigabytes. - """ - if torch.cuda.is_available(): - total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) - elif torch.xpu.is_available(): - raise RuntimeError("XPU does not support device_map='auto' currently.") - else: - raise RuntimeError("No supported device found (CUDA or XPU).") - return total_memory - - -def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: - """ - Calculates the memory consumption of a specific block in the model. - - Args: - block (torch.nn.Module): The block of the model to analyze. - input_ids (list[torch.Tensor]): A list of input tensors for the block. - - Returns: - tuple: A tuple containing the following: - - block_memory (float): The memory consumption (in GB) of the block's linear layers. - - input_output_memory (float): The memory consumption (in GB) for input and output - tensors of the block. - """ - # Calculate all block parameters memory - total_param_mem = 0 - for name, module in block.named_modules(): - if check_to_quantized(module): - param_size = module.weight.nbytes - total_param_mem += param_size - block_memory = total_param_mem / 1024**3 # Convert to GB - - # Assuming bfloat16 or float32, input and output - input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - - return block_memory, input_output_memory - - -def get_max_vram(ratio: float = 0.9) -> dict: - max_memory = {} - if torch.cuda.is_available(): # NVIDIA CUDA - num_devices = torch.cuda.device_count() - for i in range(num_devices): - total_mem = torch.cuda.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - elif torch.xpu.is_available(): # TODO need verification - num_devices = torch.xpu.device_count() - for i in range(num_devices): - total_mem = torch.xpu.get_device_properties(i).total_memory - max_mem_gb = int(total_mem / 1024**3 * ratio) - max_memory[i] = f"{max_mem_gb}GiB" - - else: - raise RuntimeError("No CUDA or XPU devices found.") - return max_memory - - -def _get_packing_device(device: str | torch.device | None = "auto") -> torch.device: - """ - Selects the packing device. - - "auto": choose best available (CUDA > XPU > CPU). - - str: parsed by torch.device (e.g., "cuda:2", "cpu"). - - torch.device: returned as-is. - - None: treated as "auto". - - Args: - device: Target device spec ("auto", "cuda:0", "xpu:0", "cpu", or torch.device). - - Returns: - torch.device: The resolved device. 
- """ - if device is None or (isinstance(device, str) and device.lower() == "auto"): - if torch.cuda.is_available(): - return torch.device("cuda:0") - if hasattr(torch, "xpu") and torch.xpu.is_available(): - return torch.device("xpu:0") - return torch.device("cpu") - - if isinstance(device, torch.device): - return device - - if isinstance(device, str): - try: - return torch.device(device) - except Exception as e: - raise ValueError(f"Invalid device string: {device}") from e - - raise TypeError(f"Unsupported device type: {type(device)} ({device})") - - -# Adapted from https://github.com/vllm-project/llm-compressor/blob/ -# 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 -def copy_python_files_from_model_cache(model, save_path: str): - config = model.config - cache_path = None - if hasattr(config, "_name_or_path"): - import os - import shutil - - from huggingface_hub import hf_hub_download - from transformers import TRANSFORMERS_CACHE - from transformers.utils import http_user_agent - - cache_path = config._name_or_path - if not os.path.exists(cache_path): - user_agent = http_user_agent() - config_file_path = hf_hub_download( - repo_id=cache_path, - filename="config.json", - cache_dir=TRANSFORMERS_CACHE, - force_download=False, - user_agent=user_agent, - ) - cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) - - for file in os.listdir(cache_path): - full_file_name = os.path.join(cache_path, file) - if file.endswith(".py") and os.path.isfile(full_file_name): - logger.debug(f"Transferring {full_file_name} to {save_path}") - shutil.copy(full_file_name, save_path) - - -def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): - MM_KEYS = [ - "multi_modal_projector", - "vision_tower", - "multimodal_projector", - "thinker", - "visual", - "audio", - "talker", - "token2wav", - "vision_model", - "audio_tower", - "vision_encoder", - "vision_language_adapter", - "patch_merger", - "pre_mm_projector_norm", - "vision", - ] - - model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path - if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) - - if isinstance(model_path, str): - if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): - return True - if os.path.exists(os.path.join(model_path, "processor_config.json")): - return True - if os.path.exists(os.path.join(model_path, "config.json")): - with open(os.path.join(model_path, "config.json")) as f: - config = json.load(f) - for key in config.keys(): - if any([k in key for k in MM_KEYS]): - return True - - if isinstance(model_or_path, torch.nn.Module): - for name, module in model_or_path.named_modules(): - if any([k in name for k in MM_KEYS]): - return True - - return False - - -def set_layer_config( - model: torch.nn.Module, - layer_config: dict[str, Union[str, dict, "QuantizationScheme"]], - default_scheme: Union[str, "QuantizationScheme"], - default_scale_dtype: torch.dtype | str, - supported_types: tuple, - inner_supported_types: tuple, - quant_block_list=None, - fp_layers: str = "", - quant_lm_head: bool = False, - enable_gguf_official_mixed: bool = True, - is_mllm: bool = False, -) -> tuple[dict, bool, dict]: - """ - Normalize, validate, and expand layer-specific quantization configs. 
- Returns (final_layer_config, has_quant_layer_outside_block) - """ - - from auto_round.schemes import get_gguf_scheme - - # ---- helpers ------------------------------------------------- - def dispatch_layer_config(layer_config: dict[str, dict]) -> None: - """Assign scheme values as attributes to matched modules.""" - for layer_name, scheme in layer_config.items(): - module = get_module(model, layer_name) - for attr, value in scheme.items(): - setattr(module, attr, value) - - def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str) -> dict: - """Convert config entry into dict and validate keys.""" - if isinstance(item, str): - config = asdict(preset_name_to_scheme(item.upper())) - elif isinstance(item, QuantizationScheme): - config = asdict(item) - elif isinstance(item, dict): - invalid = set(item) - set(scheme_keys + ("fixed_by_user", "scale_dtype")) - if invalid: - raise ValueError( - f"Invalid keys {invalid} in layer_config for '{layer_name}'. " f"Allowed keys: {scheme_keys}" - ) - config = dict(item) - else: - raise TypeError( - f"Unsupported type for layer_config[{layer_name}]: {type(item)}. " - f"Expected str, dict, or QuantizationScheme." - ) - # Clean up - config = {k: v for k, v in config.items() if v is not None} - config["fixed_by_user"] = True - return config - - # ---- main logic ---------------------------------------------- - scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) - layer_config = copy.deepcopy(layer_config) or {} - - # 1. fp_layers -> force 16 - for name in get_fp_layer_names(model, fp_layers): - layer_config[name] = { - "bits": 16, - "act_bits": 16, - "data_type": "float", - "act_data_type": "float", - "fixed_by_user": True, - } - - # 2. normalize - layer_config = {k: normalize_item(v, k) for k, v in layer_config.items()} - - # 3. infer missing bits - for cfg in layer_config.values(): - if "data_type" in cfg and "bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["data_type"])) is not None: - cfg["bits"] = b - if "act_data_type" in cfg and "act_bits" not in cfg: - if (b := infer_bits_by_data_type(cfg["act_data_type"])) is not None: - cfg["act_bits"] = b - - # 4. fill defaults - if isinstance(default_scheme, str): - default_dict = asdict(preset_name_to_scheme(default_scheme.upper())) - else: - default_dict = asdict(default_scheme) - default_dict["scale_dtype"] = default_scale_dtype - for cfg in layer_config.values(): - for key in scheme_keys: - cfg.setdefault(key, copy.deepcopy(default_dict.get(key))) - - # 5. collect supported modules - gguf_name = get_gguf_scheme(default_scheme) - if gguf_name and torch.nn.Embedding not in supported_types: - supported_types = (*supported_types, torch.nn.Embedding) - - all_supported_layer_names, embedding_layer_names = [], [] - all_module_names = [] - for n, m in model.named_modules(): - all_module_names.append(n) - # cleanup stale attributes - for key in scheme_keys: - if hasattr(m, key): - delattr(m, key) - if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types: - continue - all_supported_layer_names.append(n) - if isinstance(m, torch.nn.Embedding): - embedding_layer_names.append(n) - - # 6. 
expand regex configs - regex_config = {} - for name in list(layer_config.keys()): - if name in all_supported_layer_names: - continue - if name in all_module_names: - m = get_module(model, name) - if len(list(m.children())) == 0 and type(m) not in supported_types: - layer_config.pop(name) - logger.warning(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") - continue - - regex = re.compile(name) - matched = [ln for ln in all_supported_layer_names if regex.search(ln)] - if not matched: - raise ValueError(f"Invalid '{name}' in layer_config, no match found.") - val = layer_config.pop(name) - regex_config[name] = val # keep regex config - for match in matched: - layer_config[match] = val - # regex_config = None if len(regex_config)==0 else regex_config - - # 7. lm_head - lm_head_name = get_lm_head_name(model) - tie_word_embeddings = False - if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): - tie_word_embeddings = model.config.tie_word_embeddings - - if quant_lm_head and tie_word_embeddings and not gguf_name: - quant_lm_head = False - logger.warning( - "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been supported currently" - ) - - if lm_head_name not in layer_config and quant_lm_head: - layer_config[lm_head_name] = copy.deepcopy(default_dict) - - # 8. enforce shape divisibility for int weight-only - if default_dict["data_type"] == "int" and default_dict["act_bits"] >= 16 and not gguf_name: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[0] % 32 or m.weight.shape[1] % 32: - layer_config.setdefault(n, copy.deepcopy(default_dict)) - layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True}) - logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).") - # enforce shape divisibility for mxfp/nvfp - if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name: - for n, m in model.named_modules(): - if type(m) in supported_types or m.__class__.__name__ in inner_supported_types: - if m.weight.shape[1] % default_dict["group_size"]: - layer_config.setdefault(n, copy.deepcopy(default_dict)) - layer_config[n].update( - {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True} - ) - logger.warning_once( - f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})." - ) - - # 9. block layers: mark as in_blocks=True - for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types): - if name not in layer_config: - layer_config[name] = copy.deepcopy(default_dict) - layer_config[name]["fixed_by_user"] = False - layer_config[name]["in_blocks"] = True - - # ---- restore: ensure missing in_blocks are set to False and compute flag ---- - has_qlayer_outside_block = False - for cfg in layer_config.values(): - if "in_blocks" not in cfg: - cfg["in_blocks"] = False - # mark layer outside block - if not cfg["in_blocks"] and check_to_quantized(cfg): - has_qlayer_outside_block = True - - # 10. 
GGUF handling - if not gguf_name: - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block, regex_config - - # embed + lm_head defaults for gguf - tie_word_embeddings &= not is_separate_lm_head(model) - if lm_head_name not in layer_config and not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[lm_head_name] = cfg - has_qlayer_outside_block = True - for emd_name in embedding_layer_names: - if emd_name in layer_config: - continue - if not tie_word_embeddings: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["embedding"]] - else: - cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] - cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} - layer_config[emd_name] = cfg - - if enable_gguf_official_mixed: - model_type = ModelType.MMPROJ if is_mllm else ModelType.TEXT - layer_config, _ = get_layer_config_by_gguf_format(layer_config, gguf_name.lower(), model, model_type) - - dispatch_layer_config(layer_config) - return layer_config, has_qlayer_outside_block, regex_config - - -def check_diffusers_installed(): # pragma: no cover - try: - import diffusers # noqa: F401 - - return True - except ImportError: - logger.error("Please install diffusers via 'pip install diffusers'" " to run diffusion model") - exit(-1) - - -def is_diffusion_model(model_or_path: Union[str, object]) -> bool: - if isinstance(model_or_path, str): - index_file = None - if not os.path.isdir(model_or_path): - try: - from huggingface_hub import hf_hub_download - - index_file = hf_hub_download(model_or_path, "model_index.json") - check_diffusers_installed() - except Exception as e: - print(e) - index_file = None - - elif os.path.exists(os.path.join(model_or_path, "model_index.json")): - check_diffusers_installed() - index_file = os.path.join(model_or_path, "model_index.json") - return index_file is not None - elif not isinstance(model_or_path, torch.nn.Module): - check_diffusers_installed() - pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") - return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) - else: - return False - - -def is_separate_lm_head(model: torch.nn.Module) -> bool: - dir_path = model.name_or_path - if not os.path.isdir(dir_path): - dir_path = download_hf_model(dir_path) - lm_head_name: str = get_lm_head_name(model) - lm_head_name += ".weight" - - if "model.safetensors.index.json" in os.listdir(dir_path): - with open(os.path.join(dir_path, "model.safetensors.index.json")) as f: - index_mapping = json.load(f) - if lm_head_name in index_mapping["weight_map"]: - return True - else: - return False - else: - from safetensors import safe_open - - f = safe_open(os.path.join(dir_path, "model.safetensors"), framework="pt") - if lm_head_name in f.keys(): - return True - else: - return False - - -def to_standard_regex(pattern: str) -> str: - """ - Convert a user-specified string into a standardized regex for layer matching. - - Rules: - - If the pattern already contains regex tokens ('.*', '^', '$', etc.), - keep them as-is. - - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. - - Always ensure the returned regex is valid (compilable by re). 
- - Examples: - >>> to_standard_regex("model.embed_tokens") - '.*model\\.embed_tokens.*' - >>> to_standard_regex("mlp.gate") - '.*mlp\\.gate.*' - >>> to_standard_regex("mlp.gate$") - '.*mlp\\.gate$' - >>> to_standard_regex("mlp.*gate") - '.*mlp.*gate.*' - """ - # Heuristic: if pattern contains regex meta characters, assume partial regex - meta_chars = {".*", "^", "$", "|", "(", ")", "[", "]", "?", "+"} - has_regex = any(tok in pattern for tok in meta_chars) - if not has_regex: - # Escape literal dots, etc., and wrap with .* for substring matching - pattern = re.escape(pattern) - regex = f".*{pattern}.*" - else: - # Only escape bare dots that are not already part of regex constructs - # Avoid double escaping .* sequences - tmp = [] - i = 0 - while i < len(pattern): - if pattern[i] == ".": - if i + 1 < len(pattern) and pattern[i + 1] == "*": - tmp.append(".*") # keep regex token - i += 2 - continue - else: - tmp.append("\\.") # escape bare dot - else: - tmp.append(pattern[i]) - i += 1 - regex = "".join(tmp) - # If no anchors are provided, allow substring matching - if not regex.startswith("^") and not regex.startswith(".*"): - regex = ".*" + regex - if not regex.endswith("$") and not regex.endswith(".*"): - regex = regex + ".*" - # Validate regex - try: - re.compile(regex) - except re.error as e: - raise ValueError(f"Invalid regex generated from pattern '{pattern}': {e}") - return regex - - -def matches_any_regex(layer_name: str, regex_config: Dict[str, dict]) -> bool: - """ - Check whether `layer_name` matches any regex pattern key in `regex_config`. - Args: - layer_name (str): The layer name to test. - regex_config (Dict[str, dict]): A mapping of regex patterns to configs. - Returns: - bool: True if any pattern matches `layer_name`, otherwise False. - """ - if not regex_config: - return False - - for pattern in regex_config: - # Strip dynamic prefixes (e.g., "+:" or "-:") - raw_pattern = pattern[2:] if pattern.startswith(("+:", "-:")) else pattern - - try: - if re.search(raw_pattern, layer_name): - return True - except re.error as e: - logger.warning("Skipping invalid regex pattern %r: %s", pattern, e) - continue - - return False - - -def json_serialize(obj: Any): - """Convert non-JSON-serializable objects into JSON-friendly formats.""" - if isinstance(obj, torch.dtype): - return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" - raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index f475e2157..53abc57aa 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -470,7 +470,7 @@ def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tens tensors of the block. """ # Calculate all block parameters memory - from auto_round.utils.quantization_utils import check_to_quantized + from auto_round.utils.model import check_to_quantized total_param_mem = 0 for name, module in block.named_modules(): diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f672e670f..2f53ef4af 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -885,7 +885,7 @@ def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): Convert a model with FP8 quantized layers to a model with 16-bit linear layers. This is useful for compatibility with other frameworks or for further processing. 
""" - from auto_round.utils.memory_utils import clear_memory + from auto_round.utils.device import clear_memory cnt = 0 for n, m in model.named_modules(): diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index d433e9910..ea7e2ec5b 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -16,16 +16,14 @@ import transformers from torch.functional import F +from auto_round.compressors.utils import is_nv_fp from auto_round.data_type import get_quant_func from auto_round.logger import logger - -from .utils import ( +from auto_round.utils import ( SUPPORTED_LAYER_TYPES, check_to_quantized, compile_func, deepspeed_exists, - is_mx_fp, - is_nv_fp, set_module, ) From a913a9305f2180fdb05c842c1a02c3430d62e906 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 03:32:27 -0400 Subject: [PATCH 6/7] fix ut Signed-off-by: n1ck-guo --- test/test_cpu/test_gguf_format.py | 2 +- test/test_cpu/test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 7505db913..c366e264a 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -339,8 +339,8 @@ def test_qtype_setting(self): # Qwen3-0.6B output q6_k, token_embed q4_0 448M # Qwen3-8B output q6_k, token_embed q4_0 4.5G # Llama-3.2-1B-Instruct o output, token_embed q6_k 736M + from auto_round.compressors import get_layer_config_by_gguf_format, set_layer_config from auto_round.export.export_to_gguf.config import ModelType - from auto_round.utils import get_layer_config_by_gguf_format, set_layer_config model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index 846af0036..e70a4b7b4 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -2,7 +2,7 @@ from unittest.mock import patch sys.path.insert(0, "../..") -import auto_round.utils as auto_round_utils +import auto_round.utils.device as auto_round_utils class TestPackingWithNumba: From 16188c67c5918d6cb0da7cfe7f41cfcc4cb85cef Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 04:40:18 -0400 Subject: [PATCH 7/7] fix Signed-off-by: n1ck-guo --- auto_round/compressors/utils.py | 28 +++++++++++++++++++++------- auto_round/utils/model.py | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index ed56c72fa..6eb43e056 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -252,7 +252,7 @@ def set_layer_config( """ from auto_round.schemes import get_gguf_scheme - from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module + from auto_round.utils.model import get_layer_names_in_block, get_lm_head_name, get_module, is_separate_lm_head # ---- helpers ------------------------------------------------- def dispatch_layer_config(layer_config: dict[str, dict]) -> None: @@ -368,7 +368,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): tie_word_embeddings = model.config.tie_word_embeddings - if quant_lm_head and tie_word_embeddings: + if quant_lm_head and tie_word_embeddings and not gguf_name: quant_lm_head = False logger.warning( "reset `quant_lm_head` to false as quantizing " "lm_head with tied weights has not been 
supported currently" @@ -420,6 +420,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str return layer_config, has_qlayer_outside_block, regex_config # embed + lm_head defaults for gguf + tie_word_embeddings &= not is_separate_lm_head(model) if lm_head_name not in layer_config and not tie_word_embeddings: cfg = GGUF_INNER_CONFIG[GGUF_CONFIG[gguf_name.lower()]["lm_head"]] cfg = {**cfg, "fixed_by_user": False, "scale_dtype": default_scale_dtype} @@ -627,10 +628,22 @@ def _get_digital_in_layer_name(layer_name): return None +def _gguf_type_fallback(gguf_type: str) -> str: + gguf_type = gguf_type.lower() + if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q5_k": + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q6_k": + gguf_type = "gguf:q8_0" + return gguf_type + + ##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model, model_type=ModelType.TEXT): # # TODO: support for other format later # target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) + import gguf # pylint: disable=E0401 from auto_round.utils.common import LazyImport @@ -733,9 +746,9 @@ def _set_config(config, target_config): config_tmp.pop(key, None) matched_scheme = get_gguf_scheme(QuantizationScheme.from_dict(config_tmp)) # check matched if not matched_scheme: - if config.get("super_group_size", None) is not None: + if config.get("super_group_size", None) is not None or config.get("super_bits", None) is not None: new_type = new_type[:bits_index] + str(config["bits"]) + "_k" - if config.get("super_group_size", None) is None or new_type not in GGUF_INNER_CONFIG: + if new_type not in GGUF_INNER_CONFIG: prefix_idx = 0 if config.get("sym", True) else 1 new_type = new_type[:bits_index] + str(config["bits"]) + f"_{prefix_idx}" if new_type not in GGUF_INNER_CONFIG: @@ -767,7 +780,8 @@ def _set_config(config, target_config): elif new_type != "gguf:q8_0": new_type = "gguf:q6_k" elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings: - pass + # new_type = GGUF_CONFIG[target_gguf_format]["lm_head"] + continue elif isinstance(layer, torch.nn.Embedding): if "embedding" in GGUF_CONFIG[target_gguf_format]: new_type = GGUF_CONFIG[target_gguf_format]["embedding"] @@ -883,7 +897,7 @@ def _set_config(config, target_config): new_type = "gguf:q5_k" new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] if input_features % new_block_size != 0: - new_type = gguf_type_fallback(new_type) + new_type = _gguf_type_fallback(new_type) new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] if input_features % new_block_size != 0: new_type = "gguf:bf16" @@ -907,7 +921,7 @@ def _set_config(config, target_config): ): fallback = True if fallback: - tmp_type = gguf_type_fallback(new_type) + tmp_type = _gguf_type_fallback(new_type) logger.warning_once( f"self_attn.kv_b_proj does not support the use of {new_type}, replace it with {tmp_type}" ) diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 2f53ef4af..63cb24a9f 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1232,3 +1232,27 @@ def find_matching_blocks(model, all_blocks, to_quant_block_names): "or set to_quant_block_name to None to automatically match quantizable blocks." 
)
     return target_blocks
+
+
+def is_separate_lm_head(model: torch.nn.Module) -> bool:
+    """Return True when lm_head.weight is stored as its own tensor (i.e. not tied to the embedding)."""
+    dir_path = model.name_or_path
+    if not os.path.isdir(dir_path):
+        dir_path = download_hf_model(dir_path)
+    lm_head_name: str = get_lm_head_name(model)
+    lm_head_name += ".weight"
+
+    if "model.safetensors.index.json" in os.listdir(dir_path):
+        with open(os.path.join(dir_path, "model.safetensors.index.json")) as f:
+            index_mapping = json.load(f)
+        if lm_head_name in index_mapping["weight_map"]:
+            return True
+        else:
+            return False
+    else:
+        from safetensors import safe_open
+        with safe_open(os.path.join(dir_path, "model.safetensors"), framework="pt") as f:
+            if lm_head_name in f.keys():
+                return True
+            else:
+                return False
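
A usage sketch for the `is_separate_lm_head` helper added above (the checkpoint id is only an example; any model whose weights are available locally or on the Hub resolves the same way):

    from transformers import AutoModelForCausalLM

    from auto_round.utils.model import is_separate_lm_head  # location as of this series

    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    # True only when "lm_head.weight" is materialized in the checkpoint,
    # i.e. the head is not tied to the input embedding table.
    print(is_separate_lm_head(model))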
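
The fallback ladder encoded in `_gguf_type_fallback` (added in the compressors/utils.py hunk above) can be exercised directly; a minimal sketch — the function is module-private, so the import is for illustration only:

    from auto_round.compressors.utils import _gguf_type_fallback

    assert _gguf_type_fallback("gguf:q2_k") == "gguf:q5_0"
    assert _gguf_type_fallback("gguf:q5_k") == "gguf:q5_0"
    assert _gguf_type_fallback("gguf:q6_k") == "gguf:q8_0"
    assert _gguf_type_fallback("gguf:bf16") == "gguf:bf16"  # non-k types pass through unchanged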
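
Taken together, the series removes the monolithic `auto_round/utils.py` and republishes its helpers from the `auto_round.utils` package under public names. Downstream callers migrate roughly as follows; this is a sketch based on the import hunks in the series, assuming `auto_round/utils/__init__.py` re-exports these names (its body is not shown in the diffs):

    # before: private helpers imported from the monolithic module
    #   from auto_round.utils import _gguf_args_check, _is_fp8_linear, _is_fp8_model

    # after: the same helpers, now public and re-exported by the package
    from auto_round.utils import gguf_args_check, is_fp8_linear, is_fp8_model

    # submodules can also be addressed directly, as the updated tests do
    import auto_round.utils.device as auto_round_utils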