diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile old mode 100644 new mode 100755 index 08bc3c45b952d..47fcd11fd766d --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -45,6 +45,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt # Add aqlm for quantization testing RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 +# Add hqq for quantization testing +RUN python3 -m pip install --no-cache-dir hqq + # Add autoawq for quantization testing # >=v0.2.3 needed for compatibility with torch 2.2.1 RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md old mode 100644 new mode 100755 index 91de5fc8a33ce..f1e2acdcfe480 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -52,3 +52,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide. ## HfQuantizer [[autodoc]] quantizers.base.HfQuantizer + +## HqqConfig + +[[autodoc]] HqqConfig diff --git a/docs/source/en/quantization.md b/docs/source/en/quantization.md old mode 100644 new mode 100755 index 8a3650a843904..ae4f44f6b800b --- a/docs/source/en/quantization.md +++ b/docs/source/en/quantization.md @@ -745,3 +745,53 @@ The speed and throughput of fused and unfused modules were also tested with the
generate throughput/batch size
+
+## HQQ
+Half-Quadratic Quantization (HQQ) implements on-the-fly quantization via fast, robust optimization. It doesn't require calibration data and can be used to quantize any model.
+Please refer to the [official package](https://github.com/mobiusml/hqq/) for more details.
+
+For installation, we recommend you use the following approach to get the latest version and build its corresponding CUDA kernels:
+```bash
+pip install hqq
+```
+
+To quantize a model, you need to create an [`HqqConfig`]. There are two ways of doing it:
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
+
+# Method 1: all linear layers will use the same quantization config
+quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0)  # axis=0 is used by default
+```
+
+```python
+# Method 2: each linear layer with the same tag will use a dedicated quantization config
+q4_config = {'nbits': 4, 'group_size': 64, 'quant_zero': False, 'quant_scale': False}
+q3_config = {'nbits': 3, 'group_size': 32, 'quant_zero': False, 'quant_scale': False}
+quant_config = HqqConfig(dynamic_config={
+    'self_attn.q_proj': q4_config,
+    'self_attn.k_proj': q4_config,
+    'self_attn.v_proj': q4_config,
+    'self_attn.o_proj': q4_config,
+
+    'mlp.gate_proj': q3_config,
+    'mlp.up_proj': q3_config,
+    'mlp.down_proj': q3_config,
+})
+```
+
+The second approach is especially interesting for quantizing Mixture-of-Experts (MoE) models, because the experts are less affected by lower quantization settings.
+
+Then you simply quantize the model as follows:
+```python
+import torch
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,  # the model you want to quantize
+    torch_dtype=torch.float16,
+    device_map="cuda",
+    quantization_config=quant_config,
+)
+```
+
+### Optimized Runtime
+HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older GPUs and PEFT/QLoRA training.
+For faster inference, HQQ supports 4-bit fused kernels (TorchAO and Marlin), reaching up to 200 tokens/sec on a single 4090.
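+
+You can also select the backend explicitly through the `hqq` package itself. The `HQQBackend.PYTORCH` call below mirrors the backend selection used in this PR's tests; the set of available backends depends on your installed `hqq` version:
+```python
+from hqq.core.quantize import HQQBackend, HQQLinear
+
+# Select the dequantization backend used by all HQQLinear layers
+# (pure PyTorch here; see the hqq README for the optimized options).
+HQQLinear.set_backend(HQQBackend.PYTORCH)
+```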
+For more details on how to use the backends, please refer to https://github.com/mobiusml/hqq/?tab=readme-ov-file#backend \ No newline at end of file diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md old mode 100644 new mode 100755 diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py old mode 100644 new mode 100755 index 53a087468e66a..12f1821df32f9 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1133,6 +1133,7 @@ "BitsAndBytesConfig", "EetqConfig", "GPTQConfig", + "HqqConfig", "QuantoConfig", ], } @@ -6099,6 +6100,7 @@ BitsAndBytesConfig, EetqConfig, GPTQConfig, + HqqConfig, QuantoConfig, ) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py old mode 100644 new mode 100755 index 72fdf3e1bbb99..69fb0e3259b1d --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -43,6 +43,7 @@ "unset_hf_deepspeed_config", ], "eetq": ["replace_with_eetq_linear"], + "hqq": ["prepare_for_hqq_linear"], "integration_utils": [ "INTEGRATION_TO_CALLBACK", "AzureMLCallback", @@ -113,6 +114,7 @@ unset_hf_deepspeed_config, ) from .eetq import replace_with_eetq_linear + from .hqq import prepare_for_hqq_linear from .integration_utils import ( INTEGRATION_TO_CALLBACK, AzureMLCallback, diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py new file mode 100755 index 0000000000000..10a6d06a3f9f0 --- /dev/null +++ b/src/transformers/integrations/hqq.py @@ -0,0 +1,121 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"HQQ (Half-Quadratic Quantization) integration file" + +from ..utils import is_hqq_available, is_torch_available, logging + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +# Name all modules inside the model +def autoname_modules(model): + for name, module in model.named_modules(): + module.name = name + + +# Get the linear_tag from a modul name. 
For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
+def name_to_linear_tag(name):
+    return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])
+
+
+# Get all linear tags available
+def get_linear_tags(model):
+    if is_hqq_available():
+        from hqq.core.quantize import HQQLinear
+
+    linear_tags = set()
+    for name, module in model.named_modules():
+        if isinstance(module, (torch.nn.Linear, HQQLinear)):
+            linear_tags.add(name_to_linear_tag(name))
+    return list(linear_tags)
+
+
+def _prepare_for_hqq_linear(model, patch_params, has_been_replaced, current_key_name=None):
+    for name, module in model.named_children():
+        if current_key_name is None:
+            current_key_name = []
+        current_key_name.append(name)
+
+        if isinstance(module, torch.nn.Linear):
+            # Get linear tag
+            linear_tag = name_to_linear_tag(module.name)
+
+            # We put the module quant_config into the nn.Linear layer so we can access it later in quantizer_hqq.create_quantized_param()
+            if linear_tag in patch_params:
+                if patch_params[linear_tag] is not None:
+                    model._modules[name].quant_config = patch_params[linear_tag]
+                    # Store the module class in case we need to transpose the weight later
+                    model._modules[name].source_cls = type(module)
+                    # Force requires grad to False to avoid unexpected errors
+                    model._modules[name].requires_grad_(False)
+
+            has_been_replaced = True
+
+        if len(list(module.children())) > 0:
+            _, has_been_replaced = _prepare_for_hqq_linear(
+                module,
+                patch_params=patch_params,
+                has_been_replaced=has_been_replaced,
+            )
+        # Remove the last key for recursion
+        current_key_name.pop(-1)
+
+    return model, has_been_replaced
+
+
+def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_convert=None, has_been_replaced=False):
+    """
+    Prepares nn.Linear layers for HQQ quantization.
+    Since each layer type can have separate quantization parameters, we need to do the following:
+    1- Tag each module with its name via autoname_modules()
+    2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
+    3- Map quantization parameters as a dictionary linear_tag -> quant_params, as HQQLinear expects it; this is referred to as patch_params
+    """
+
+    modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert
+
+    # Add name to module
+    autoname_modules(model)
+
+    # Get linear tags.
This allows us to use different quant params to different layer types + linear_tags = get_linear_tags(model) + + # Convert quantization_config to layer-wise config + skip_modules = quantization_config.skip_modules + quant_config = quantization_config.to_dict() + linear_tags = list(set(linear_tags) - set(skip_modules) - set(modules_to_not_convert)) + + if any(key in linear_tags for key in quant_config.keys()): + # If the user doesn't specify a key from get_linear_tags, the layer is not quantized via (key, None) + patch_params = {key: None for key in linear_tags} + patch_params.update(quant_config) + else: + # Same quant_config for all layers + patch_params = {k: quant_config for k in linear_tags} + + model, has_been_replaced = _prepare_for_hqq_linear( + model, patch_params=patch_params, has_been_replaced=has_been_replaced + ) + + # We store quantization config as linear_tag -> hqq quant config + model.config.quantization_config = patch_params + + if not has_been_replaced: + logger.warning("No linear modules were found in your model for quantization.") + + return model diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py old mode 100644 new mode 100755 diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py old mode 100644 new mode 100755 index 4b20b32aa694d..59b6bf8075205 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2659,6 +2659,8 @@ def get_memory_footprint(self, return_buffers=True): @wraps(torch.nn.Module.cuda) def cuda(self, *args, **kwargs): + if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ: + raise ValueError("`.cuda` is not supported for HQQ-quantized models.") # Checks if the model has been loaded in 8-bit if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: raise ValueError( @@ -2670,6 +2672,8 @@ def cuda(self, *args, **kwargs): @wraps(torch.nn.Module.to) def to(self, *args, **kwargs): + if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ: + raise ValueError("`.to` is not supported for HQQ-quantized models.") # Checks if the model has been loaded in 8-bit if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: raise ValueError( @@ -3739,6 +3743,13 @@ def from_pretrained( } if "skip_keys" in inspect.signature(dispatch_model).parameters: device_map_kwargs["skip_keys"] = model._skip_keys_device_placement + # For HQQ method we force-set the hooks for single GPU envs + if ( + "force_hooks" in inspect.signature(dispatch_model).parameters + and hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ + ): + device_map_kwargs["force_hooks"] = True if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled(): dispatch_model(model, **device_map_kwargs) diff --git a/src/transformers/quantizers/__init__.py b/src/transformers/quantizers/__init__.py old mode 100644 new mode 100755 diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py old mode 100644 new mode 100755 index cc58cd7af69ff..2c65afa77e282 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -21,6 +21,7 @@ BitsAndBytesConfig, EetqConfig, GPTQConfig, + HqqConfig, QuantizationConfigMixin, QuantizationMethod, QuantoConfig, @@ -31,6 +32,7 @@ from .quantizer_bnb_8bit import Bnb8BitHfQuantizer from .quantizer_eetq import EetqHfQuantizer from .quantizer_gptq import GptqHfQuantizer +from 
.quantizer_hqq import HqqHfQuantizer from .quantizer_quanto import QuantoHfQuantizer @@ -42,6 +44,7 @@ "aqlm": AqlmHfQuantizer, "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, + "hqq": HqqHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -52,6 +55,7 @@ "gptq": GPTQConfig, "aqlm": AqlmConfig, "quanto": QuantoConfig, + "hqq": HqqConfig, } diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py new file mode 100755 index 0000000000000..dd58c2c1bc5a2 --- /dev/null +++ b/src/transformers/quantizers/quantizer_hqq.py @@ -0,0 +1,200 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Any, Dict, List + +from ..integrations import prepare_for_hqq_linear +from ..utils import is_accelerate_available, is_hqq_available, is_torch_available, logging +from .base import HfQuantizer +from .quantizers_utils import get_module_from_name + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + + +if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +# Finds the parent of a node module named "name" +def find_parent(model, name): + module_tree = name.split(".")[:-1] + parent = model + for m in module_tree: + parent = parent._modules[m] + return parent + + +class HqqHfQuantizer(HfQuantizer): + """ + HQQ quantizer base HF class. + nn.Linear modules are first tagged with quant_config in _process_model_before_weight_loading(). + The actual quantization and offloading to the GPU is done in check_quantized_param(). + """ + + use_keep_in_fp32_modules = False + requires_parameters_quantization = True + requires_calibration = False + required_packages = ["hqq"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.torch_dtype = None + self.using_multi_gpu = False + + def validate_environment(self, *args, **kwargs): + if not (is_hqq_available()): + raise ImportError( + "HQQ is not available. Please follow the instructions to install it: `https://github.com/mobiusml/hqq/`" + ) + + if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): + raise ValueError( + "Converting weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + + if not torch.cuda.is_available(): + raise RuntimeError("No GPU found. 
A GPU is needed for quantization.")
+
+        if self.torch_dtype is None:
+            if "torch_dtype" in kwargs:
+                self.torch_dtype = kwargs["torch_dtype"]
+            else:
+                self.torch_dtype = torch.float32
+                logger.info("Setting torch_dtype to torch.float32 as the default value since it was not specified.")
+
+        device_map = kwargs.get("device_map", None)
+        if isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to use an HQQ model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )
+            else:
+                self.using_multi_gpu = len(set(device_map.values())) > 1
+
+    def check_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        state_dict: Dict[str, Any],
+        **kwargs,
+    ) -> bool:
+        module, tensor_name = get_module_from_name(model, param_name)
+
+        return isinstance(module, torch.nn.Linear)
+
+    def create_quantized_param(
+        self,
+        model: "PreTrainedModel",
+        param_value: "torch.Tensor",
+        param_name: str,
+        target_device: "torch.device",
+        state_dict: Dict[str, Any],
+        unexpected_keys: List[str],
+    ):
+        """
+        Each nn.Linear layer is processed here.
+        We first check if the corresponding module state_dict already contains HQQ-quantized parameters.
+        If not, we create a temp linear layer with the module state_dict params and use it for quantization.
+        """
+
+        if is_hqq_available():
+            from hqq.core.quantize import HQQLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+
+        layer_name = param_name.replace(".weight", "").replace(".bias", "")
+        parent_module = find_parent(model, layer_name)
+        node = layer_name.split(".")[-1]
+
+        # Step 0: set module state_dict
+        module_state_dict = {key.split(".")[-1]: state_dict[key] for key in state_dict if layer_name in key}
+
+        # Step 1: populate module with weight/bias from module state dict
+        for key in module_state_dict:
+            setattr(module, key, torch.nn.Parameter(module_state_dict[key]))
+
+        # Step 2: Replace module with either HQQLinear or move it to device. We do this via setattr on the parent
+        # as doing it on the module directly doesn't work.
+
+        if hasattr(module, "quant_config"):
+            hqq_layer = HQQLinear(
+                module,
+                module.quant_config,
+                compute_dtype=self.torch_dtype,
+                device=target_device,
+                del_orig=True,
+            )
+
+            if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor):
+                hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias)
+
+            if self.using_multi_gpu:
+                hqq_layer = self._patch_layer_for_multigpu(hqq_layer)
+
+            setattr(parent_module, node, hqq_layer)
+
+        else:
+            module = module.to(dtype=self.torch_dtype, device=target_device)
+            setattr(parent_module, node, module)
+
+        torch.cuda.empty_cache()
+
+    # Remove the accelerate hook and use a simpler forward pass.
Otherwise, this breaks with multi-gpu + def _patch_layer_for_multigpu(self, hqq_layer): + hqq_layer = remove_hook_from_module(hqq_layer) + + def forward_with_device(self, x): + out = torch.matmul(x.to(self.device), self.dequantize().t()) + if self.bias is not None: + out += self.bias + return out + + hqq_layer.forward = lambda x: forward_with_device(hqq_layer, x) + return hqq_layer + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = None, + **kwargs, + ): + keep_in_fp32_modules = keep_in_fp32_modules if keep_in_fp32_modules is not None else [] + + # Add the corresponding quant_config to each valid module. This allows us to do the actual nn.Linear -> HQQLinear conversion in create_quantized_param(). + # prepare_for_hqq_linear() also sets the right quantization config inside the model (model.config.quantization_config) and the layers (hqq_layer.quant_config) + model = prepare_for_hqq_linear(model, quantization_config=self.quantization_config) + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + model.is_hqq_quantized = True + model.is_hqq_serializable = self.is_serializable + return model + + @property + def is_serializable(self): + return False + + @property + def is_trainable(self) -> bool: + return False diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py old mode 100644 new mode 100755 index e4ff991ed75c7..2bfa5638df922 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -129,6 +129,7 @@ is_ftfy_available, is_g2p_en_available, is_galore_torch_available, + is_hqq_available, is_in_notebook, is_ipex_available, is_jieba_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py old mode 100644 new mode 100755 index c65d4122b787d..158896347a7a6 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -170,6 +170,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _torchdistx_available = _is_package_available("torchdistx") _torchvision_available = _is_package_available("torchvision") _mlx_available = _is_package_available("mlx") +_hqq_available = _is_package_available("hqq") _torch_version = "N/A" @@ -292,6 +293,10 @@ def is_torch_available(): return _torch_available +def is_hqq_available(): + return _hqq_available + + def get_torch_version(): return _torch_version diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py old mode 100644 new mode 100755 index 8374ddef81d58..f9e503cf862f1 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -24,7 +24,7 @@ from packaging import version -from ..utils import is_auto_awq_available, is_torch_available, logging +from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, logging if is_torch_available(): @@ -41,6 +41,7 @@ class QuantizationMethod(str, Enum): AQLM = "aqlm" QUANTO = "quanto" EETQ = "eetq" + HQQ = "hqq" class AWQLinearVersion(str, Enum): @@ -180,6 +181,115 @@ def update(self, **kwargs): return unused_kwargs +@dataclass +class HqqConfig(QuantizationConfigMixin): + """ + This is wrapper around hqq's BaseQuantizeConfig. + + Args: + nbits (`int`, *optional*, defaults to 4): + Number of bits. Supported values are (8, 4, 3, 2, 1). + group_size (`int`, *optional*, defaults to 64): + Group-size value. 
Supported values are any value that evenly divides weight.shape[axis].
+        quant_zero (`bool`, *optional*, defaults to `True`):
+            Quantize the zero-point if set to `True`.
+        quant_scale (`bool`, *optional*, defaults to `False`):
+            Quantize the scaling if set to `True`.
+        offload_meta (`bool`, *optional*, defaults to `False`):
+            Offload the meta-data to the CPU if set to `True`.
+        view_as_float (`bool`, *optional*, defaults to `False`):
+            View the quantized weight as float (used in distributed training) if set to `True`.
+        axis (`int`, *optional*, defaults to 0):
+            Axis along which grouping is performed. Supported values are 0 or 1.
+        dynamic_config (`dict`, *optional*):
+            Parameters for dynamic configuration. The key is the name tag of the layer and the value is a quantization config.
+            If set, each layer specified by its name tag will use its dedicated quantization configuration.
+        skip_modules (`List[str]`, *optional*, defaults to `['lm_head']`):
+            List of `nn.Linear` layers to skip.
+        kwargs (`Dict[str, Any]`, *optional*):
+            Additional parameters from which to initialize the configuration object.
+    """
+
+    def __init__(
+        self,
+        nbits: int = 4,
+        group_size: int = 64,
+        quant_zero: bool = True,
+        quant_scale: bool = False,
+        offload_meta: bool = False,
+        view_as_float: bool = False,
+        axis: int = 0,
+        dynamic_config: Optional[dict] = None,
+        skip_modules: List[str] = ["lm_head"],
+        **kwargs,
+    ):
+        if is_hqq_available():
+            from hqq.core.quantize import BaseQuantizeConfig as HQQBaseQuantizeConfig
+
+        if axis not in [0, 1]:
+            raise ValueError("Invalid axis value. Only 0 and 1 are allowed.")
+
+        if dynamic_config is not None:
+            self.quant_config = {}
+            for key in dynamic_config:
+                self.quant_config[key] = HQQBaseQuantizeConfig(**dynamic_config[key])
+        else:
+            self.quant_config = HQQBaseQuantizeConfig(
+                **{
+                    "nbits": nbits,
+                    "group_size": group_size,
+                    "quant_zero": quant_zero,
+                    "quant_scale": quant_scale,
+                    "offload_meta": offload_meta,
+                    "view_as_float": view_as_float,
+                    "axis": axis,
+                }
+            )
+
+        self.quant_method = QuantizationMethod.HQQ
+        self.skip_modules = skip_modules
+
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
+        """
+        pass
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        return self.quant_config
+
+    def __repr__(self):
+        config_dict = self.to_dict()
+        return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"
+
+    def to_diff_dict(self) -> Dict[str, Any]:
+        """
+        Removes all attributes from config which correspond to the default config attributes for better readability and
+        serializes to a Python dictionary.
+ Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = HqqConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + + @dataclass class BitsAndBytesConfig(QuantizationConfigMixin): """ diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py new file mode 100755 index 0000000000000..e4e01f8649638 --- /dev/null +++ b/tests/quantization/hqq/test_hqq.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch_gpu, + require_torch_multi_gpu, + slow, + torch_device, +) +from transformers.utils import is_hqq_available, is_torch_available + + +if is_torch_available(): + import torch + +if is_hqq_available(): + from hqq.core.quantize import HQQBackend, HQQLinear + + +class HQQLLMRunner: + def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir): + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=compute_dtype, + device_map=device, + quantization_config=quant_config, + low_cpu_mem_usage=True, + cache_dir=cache_dir, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir) + self.device = self.model.device + HQQLinear.set_backend(HQQBackend.PYTORCH) + + +def cleanup(): + torch.cuda.empty_cache() + gc.collect() + + +def check_hqqlayer(test_module, hqq_layer, batch_size=1, context_size=1024): + # Test HQQ layer + W_dequant = hqq_layer.dequantize() # Reconstructed weights + inputs = ( + torch.randn( + (batch_size, context_size, hqq_layer.meta["shape"][1]), + device=hqq_layer.device, + dtype=hqq_layer.compute_dtype, + ) + / 10.0 + ) + with torch.no_grad(): + outputs = hqq_layer(inputs) + test_module.assertEqual(outputs.shape[-1], W_dequant.shape[0]) + test_module.assertEqual(outputs.dtype, hqq_layer.compute_dtype) + del W_dequant, inputs, outputs + cleanup() + + +def check_forward(test_module, model, batch_size=1, context_size=1024): + # Test forward pass + with torch.no_grad(): + out = model(torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)).logits + test_module.assertEqual(out.shape[0], batch_size) + test_module.assertEqual(out.shape[1], context_size) + cleanup() + + +MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + + +@require_torch_gpu +class HqqConfigTest(unittest.TestCase): + def test_to_dict(self): + """ + Makes sure the config format is properly set + """ + quantization_config = HqqConfig() + hqq_orig_config = quantization_config.to_dict() + + for 
key in hqq_orig_config: + self.assertEqual(quantization_config.quant_config[key], hqq_orig_config[key]) + + +@slow +@require_torch_gpu +@require_accelerate +class HQQTest(unittest.TestCase): + def tearDown(self): + cleanup() + + def test_fp16_quantized_model(self): + """ + Simple LLM model testing fp16 + """ + quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) + + hqq_runner = HQQLLMRunner( + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device + ) + + check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) + check_forward(self, hqq_runner.model) + + def test_bfp16_quantized_model_with_offloading(self): + """ + Simple LLM model testing bfp16 with meta-data offloading + """ + q4_config = {"nbits": 4, "group_size": 64, "quant_zero": False, "quant_scale": False} + q3_config = {"nbits": 3, "group_size": 32, "quant_zero": False, "quant_scale": False, "offload_meta": True} + quant_config = HqqConfig( + dynamic_config={ + "self_attn.q_proj": q4_config, + "self_attn.k_proj": q4_config, + "self_attn.v_proj": q4_config, + "self_attn.o_proj": q4_config, + "mlp.gate_proj": q3_config, + "mlp.up_proj": q3_config, + "mlp.down_proj": q3_config, + } + ) + + hqq_runner = HQQLLMRunner( + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.bfloat16, device=torch_device + ) + + check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) + check_forward(self, hqq_runner.model) + + +@slow +@require_torch_gpu +@require_torch_multi_gpu +@require_accelerate +class HQQTestMultiGPU(unittest.TestCase): + def tearDown(self): + cleanup() + + def test_fp16_quantized_model_multipgpu(self): + """ + Simple LLM model testing fp16 with multi-gpu + """ + + quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) + + hqq_runner = HQQLLMRunner( + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device="auto" + ) + + check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) + check_forward(self, hqq_runner.model)