diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml
index 11dbc5fe31..c68add9578 100644
--- a/.github/workflows/test_offline.yaml
+++ b/.github/workflows/test_offline.yaml
@@ -27,7 +27,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: "3.10"

       - name: Install dependencies
         run: |
diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index b4bebde138..f77647ecea 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -101,6 +101,14 @@ Optional arguments:
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
                         quantization.
+  --group-size-fallback {error,ignore,adjust}
+                        Specifies how to handle operations that do not support the given group size. Possible values are:
+                        `error`: raise an error if the given group size is not supported by a node; this is the default
+                        behavior;
+                        `ignore`: skip nodes that cannot be compressed with the given group size;
+                        `adjust`: adjust the group size to the maximum supported value for each problematic node; if
+                        there is no valid value greater than or equal to 32, the node is quantized to the backup
+                        precision, which is int8_asym by default.
   --backup-precision {none,int8_sym,int8_asym}
                         Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight
                         formats. If not provided, backup precision is int8_asym. 'none' stands for original floating-
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index eb0e624249..e43e828287 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -141,6 +141,20 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--group-size-fallback",
+        type=str,
+        choices=["error", "ignore", "adjust"],
+        default=None,
+        help=(
+            "Specifies how to handle operations that do not support the given group size. Possible values are: "
+            "`error`: raise an error if the given group size is not supported by a node; this is the default behavior; "
+            "`ignore`: skip nodes that cannot be compressed with the given group size; "
+            "`adjust`: adjust the group size to the maximum supported value for each problematic node; if there is no "
+            "valid value greater than or equal to 32, the node is quantized to the backup precision, which is "
+            "int8_asym by default."
+        ),
+    )
     optional_group.add_argument(
         "--backup-precision",
         type=str,
@@ -595,6 +609,7 @@ def prepare_wc_config(args, default_configs):
         "dtype": args.weight_format,
         "backup_precision": args.backup_precision,
         "statistics_path": args.quantization_statistics_path,
+        "group_size_fallback": args.group_size_fallback,
     }


diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index 4255b9d505..502c40c4f4 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -18,6 +18,7 @@
 from ..utils.import_utils import (
     is_diffusers_available,
     is_nncf_available,
+    is_nncf_version,
     is_sentence_transformers_available,
 )
 from .utils import (
@@ -47,6 +48,12 @@
     logging.disable(logging.INFO)
     import nncf

+    if is_nncf_version("<", "2.19"):
+        raise ImportError(
+            "NNCF version 2.19 or higher is required to use NNCF-based quantization. "
+            f"Please upgrade your NNCF installation. "
+            f"The current version of NNCF is {nncf.__version__}."
+        )
+
     logging.disable(logging.NOTSET)

     # Suppress version mismatch logging
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 18cb677f51..65f1b770fd 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -27,7 +27,7 @@

 from optimum.configuration_utils import BaseConfig

-from ..utils.import_utils import is_nncf_available, is_nncf_version
+from ..utils.import_utils import is_nncf_available
 from .utils import (
     PREDEFINED_CAUSAL_LANGUAGE_DATASETS,
     PREDEFINED_LANGUAGE_DATASETS,
@@ -350,32 +350,22 @@ class OVQuantizationMethod(str, Enum):
         "sym": False,
         "group_size": -1,
     },
+    "inceptionai/jais-13b": {
+        "bits": 4,
+        "sym": False,
+        "group_size": 128,
+        "ratio": 1.0,
+        "group_size_fallback": "adjust",
+    },
+    "HuggingFaceTB/SmolVLM2-256M-Video-Instruct": {
+        "bits": 4,
+        "sym": False,
+        "group_size": 128,
+        "ratio": 1.0,
+        "group_size_fallback": "adjust",
+    },
 }

-if is_nncf_available():
-    # TODO: Remove after update to NNCF 2.19 because `group_size_fallback` argument will be added to OVWeightQuantizationConfig
-    _DEFAULT_4BIT_WQ_CONFIGS.update(
-        {
-            "inceptionai/jais-13b": {
-                "bits": 4,
-                "sym": False,
-                "group_size": 128,
-                "ratio": 1.0,
-                "advanced_parameters": nncf.AdvancedCompressionParameters(
-                    group_size_fallback_mode=nncf.GroupSizeFallbackMode.ADJUST,
-                ),
-            },
-            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct": {
-                "bits": 4,
-                "sym": False,
-                "group_size": 128,
-                "ratio": 1.0,
-                "advanced_parameters": nncf.AdvancedCompressionParameters(
-                    group_size_fallback_mode=nncf.GroupSizeFallbackMode.ADJUST,
-                ),
-            },
-        }
-    )

 # Add configs for model id aliases
 # The list below contains pairs of model ids: config for the second model id will be copied from the first model id.
@@ -726,6 +716,13 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             multiple times on the same model and dataset to avoid recomputing statistics. Please note that the
             statistics depend on the dataset, so if you change the dataset, you should also change the statistics
             path to avoid confusion.
+        group_size_fallback (`str`, *optional*):
+            Defines the behavior when the specified group size is not compatible with the weight shape. Possible values:
+            - "error": raises an error if the group size is not compatible with the weight shape (default);
+            - "ignore": skips quantization for the layers where the group size is not compatible with the weight shape;
+            - "adjust": automatically adjusts the group size to the maximum compatible value for each weight tensor;
+              if there is no valid value greater than or equal to 32, the node is quantized to the backup precision,
+              which is int8_asym by default.
         kwargs: Additional parameters for nncf.compress_weights() call.
     """

@@ -750,6 +747,7 @@ def __init__(
         lora_correction: bool = None,
         backup_precision: Optional[str] = None,
         statistics_path: Optional[str] = None,
+        group_size_fallback: Optional[str] = None,
         **kwargs,
     ):
         weight_format = kwargs.pop("weight_format", None)
@@ -781,6 +779,7 @@
         self.backup_precision = backup_precision
         self.dtype = dtype
         self.statistics_path = statistics_path
+        self.group_size_fallback = group_size_fallback
         self.post_init()

     def post_init(self):
@@ -830,9 +829,6 @@
                 "quantization algorithm is selected and compression ratio is 1.0."
             )

-        if self.dataset is None and self.quant_method == OVQuantizationMethod.AWQ and is_nncf_version("<", "2.17.0"):
-            raise ValueError("Data-free AWQ is available starting form NNCF 2.17. Please update nncf package.")
-
         if self.dtype in ["int4", "int8"]:
             bits = 4 if self.dtype == "int4" else 8
             if self.bits is not None and self.bits != bits:
@@ -914,6 +910,13 @@
         if self.gptq and self.lora_correction:
             raise ValueError("The GPTQ and LoRA Correction algorithms can't be applied simultaneously")

+        valid_group_size_fallback_values = [e.value for e in nncf.GroupSizeFallbackMode]
+        if self.group_size_fallback not in valid_group_size_fallback_values + [None]:
+            raise ValueError(
+                f"`group_size_fallback` must be one of the following: {valid_group_size_fallback_values}, "
+                f"but found: {self.group_size_fallback}"
+            )
+
     def to_nncf_dict(self) -> Dict[str, Any]:
         """
         Returns a dictionary with the variables that are ready to use for nncf.quantize() call.
@@ -923,8 +926,6 @@
         mode = self.dtype if self.dtype else signed_bitness[self.bits]
         if mode in signed_bitness.values():
             mode += "_sym" if self.sym else "_asym"
-        if mode == "mxfp4":
-            mode = "e2m1" if is_nncf_version("<=", "2.18") else "mxfp4"
         if mode == "cb4":
             mode = "cb4_f8e4m3"
         mode = nncf.CompressWeightsMode(mode)
@@ -933,9 +934,14 @@
         sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None
         backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None
         kwargs = self.kwargs.copy()
-        if self.statistics_path:
+        if self.statistics_path or self.group_size_fallback:
             advanced_parameters = kwargs.get("advanced_parameters", nncf.AdvancedCompressionParameters())
-            advanced_parameters = dataclasses.replace(advanced_parameters, statistics_path=self.statistics_path)
+            if self.statistics_path:
+                advanced_parameters = dataclasses.replace(advanced_parameters, statistics_path=self.statistics_path)
+            if self.group_size_fallback:
+                advanced_parameters = dataclasses.replace(
+                    advanced_parameters, group_size_fallback_mode=nncf.GroupSizeFallbackMode(self.group_size_fallback)
+                )
             kwargs["advanced_parameters"] = advanced_parameters
         result = {
             "mode": mode,
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 55ceb2aaa2..06597b482c 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -41,14 +41,12 @@
 from transformers.utils import is_accelerate_available

 from optimum.quantization_base import OptimumQuantizer
-from optimum.utils.logging import warn_once

 from ..utils.import_utils import (
     DATASETS_IMPORT_ERROR,
     _nncf_version,
     is_datasets_available,
     is_diffusers_available,
-    is_nncf_version,
     is_sentence_transformers_available,
 )
 from .configuration import (
@@ -771,12 +769,6 @@ def _prepare_visual_causal_lm_calibration_data(
                 and input_dict["pixel_values"].dim() == 4
                 and input_dict["pixel_values"].shape[0] > 1
             ):
-                if is_nncf_version("<=", "2.18"):
-                    # TODO (Nikita): Remove once NNCF 2.19 is released.
-                    warn_once(
-                        logger,
-                        "If you are facing RAM OOM issues, please update to the latest NNCF develop version.",
-                    )
                 batch_size = input_dict["pixel_values"].shape[0]
                 for i in range(batch_size):
                     single_batch_input_dict = {}
diff --git a/setup.py b/setup.py
index 2c2add705f..0a7b6220f9 100644
--- a/setup.py
+++ b/setup.py
@@ -64,8 +64,8 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]

 EXTRAS_REQUIRE = {
-    "nncf": ["nncf>=2.18.0"],
-    "openvino": ["nncf>=2.18.0", "openvino>=2025.1.0", "openvino-tokenizers>=2025.1.0"],
+    "nncf": ["nncf>=2.19.0"],
+    "openvino": ["nncf>=2.19.0", "openvino>=2025.1.0", "openvino-tokenizers>=2025.1.0"],
     "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"],
     "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"],
     "diffusers": ["diffusers"],
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 65031b1b77..a34837095e 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -63,7 +63,6 @@
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS, TemporaryDirectory
 from optimum.intel.utils.import_utils import (
     compare_versions,
-    is_nncf_version,
     is_openvino_tokenizers_available,
     is_openvino_version,
     is_tokenizers_version,
@@ -466,11 +465,11 @@ class OVCLIExportTestCase(unittest.TestCase):
             "--dataset coco --num-samples 1",
             {
                 "vision_encoder": 75,
-                "prompt_encoder_mask_decoder": 61 if is_nncf_version("<=", "2.18") else 60,
+                "prompt_encoder_mask_decoder": 60,
             },
             {
                 "vision_encoder": {"int8": 75},
-                "prompt_encoder_mask_decoder": {"int8": 50 if is_nncf_version("<=", "2.18") else 49},
+                "prompt_encoder_mask_decoder": {"int8": 49},
             },
         ),
         (
@@ -528,6 +527,12 @@ class OVCLIExportTestCase(unittest.TestCase):
             "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
             {"model": {"int4": 16}},
         ),
+        (
+            "text-generation-with-past",
+            "gpt2",
+            "int4 --sym --group-size-fallback adjust",
+            {"model": {"int8": 4, "int4": 20}},
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 576444e9b8..9faec6ac1e 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -21,11 +21,9 @@
 import unittest
 from collections import defaultdict
 from collections.abc import Iterable
-from enum import Enum
 from functools import partial
 from typing import Union, Type

-import openvino as ov
 import pytest
 import numpy as np
 import torch
@@ -33,7 +31,6 @@
 from parameterized import parameterized
 import nncf
 from transformers import (
-    AutoModelForQuestionAnswering,
     AutoTokenizer,
     AutoProcessor,
     AutoConfig,
@@ -85,7 +82,7 @@
 from copy import deepcopy

 from optimum.intel.openvino.quantization import InferRequestWrapper, OVCalibrationDatasetBuilder
-from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version, is_nncf_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from utils_tests import (
     MODEL_NAMES,
     get_num_quantized_nodes,
@@ -362,11 +359,11 @@ class OVQuantizerTest(unittest.TestCase):
             OVQuantizationConfig(bits=8, dataset="coco", num_samples=1),
             {
                 "vision_encoder": 75,
-                "prompt_encoder_mask_decoder": 61 if is_nncf_version("<=", "2.18") else 60,
+                "prompt_encoder_mask_decoder": 60,
             },
             {
                 "vision_encoder": {"int8": 75},
-                "prompt_encoder_mask_decoder": {"int8": 50 if is_nncf_version("<=", "2.18") else 49},
+                "prompt_encoder_mask_decoder": {"int8": 49},
             },
         ),
         (
@@ -735,7 +732,7 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(bits=4, dataset="coco", num_samples=1, group_size=2),
             {
                 "vision_encoder": {"int8": 56, "int4": 94},
-                "prompt_encoder_mask_decoder": {"int8": 6, "int4": 94 if is_nncf_version("<=", "2.18") else 92},
+                "prompt_encoder_mask_decoder": {"int8": 6, "int4": 92},
             },
         ),
         (
@@ -938,6 +935,35 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(bits=4, sym=False, group_size=32, ratio=1.0),
             {"model": {"int8": 2, "int4": 14}},
         ),
+        (
+            OVModelForCausalLM,
+            "gpt2",
+            False,
+            dict(bits=4, sym=True, group_size_fallback="adjust"),
+            {"model": {"int8": 4, "int4": 20}},
+        ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            False,
+            dict(
+                bits=4,
+                sym=True,
+                group_size_fallback="adjust",
+            ),
+            {"model": {"int8": 28, "int4": 2}},
+        ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            False,
+            dict(
+                bits=4,
+                sym=True,
+                group_size_fallback="ignore",
+            ),
+            {"model": {"int8": 4}},
+        ),
     ]

 # filter models type depending on min max transformers version
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index a43223ebc3..954657e805 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -22,7 +22,7 @@
 import torch

 from optimum.exporters.tasks import TasksManager
-from optimum.intel.utils.import_utils import is_nncf_version, is_openvino_version, is_transformers_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version

 SEED = 42
@@ -319,7 +319,7 @@
         "transformer": 58,
         "vae_decoder": 28,
         "vae_encoder": 28,
-        "text_encoder": 16 if is_nncf_version(">", "2.17") else 18,
+        "text_encoder": 16,
     },
     "ltx-video": {
         "transformer": 34,
@@ -329,7 +329,7 @@
     },
     "sam": {
         "vision_encoder": 102 if is_openvino_version("<", "2025.2.0") else 150,
-        "prompt_encoder_mask_decoder": 100 if is_nncf_version("<=", "2.18") else 98,
+        "prompt_encoder_mask_decoder": 98,
     },
     "speecht5": {
         "encoder": 28,
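
Usage sketch (not part of the patch): a minimal example of how the `group_size_fallback` option added to `OVWeightQuantizationConfig` above would be exercised. It assumes an optimum-intel build that includes this patch together with nncf>=2.19; the model id and output directory are illustrative only.

```python
# Sketch: 4-bit weight compression with the new group-size fallback handling.
# Assumes optimum-intel with this patch and nncf>=2.19 installed.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=True,
    group_size=128,
    # "adjust" shrinks the group size per incompatible node; nodes with no
    # valid group size >= 32 fall back to the backup precision (int8_asym
    # by default). Other accepted values: "error" (default) and "ignore".
    group_size_fallback="adjust",
)
model = OVModelForCausalLM.from_pretrained(
    "gpt2", export=True, quantization_config=quantization_config
)
model.save_pretrained("gpt2_int4_ov")
```

Based on the CLI tests above, the equivalent command-line invocation should be along the lines of `optimum-cli export openvino --model gpt2 --weight-format int4 --sym --group-size-fallback adjust gpt2_int4_ov`.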