Introduce OVQuantizationConfig for nncf.quantize() parameters #638

Merged
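For reviewers, a minimal usage sketch of the API this PR introduces. It follows the `OVQuantizationConfig` and `OVConfig` signatures added in `optimum/intel/openvino/configuration.py` below; the dataset name and sample count are illustrative assumptions, not values taken from this diff.

```python
# Illustrative sketch only (not part of this diff): constructing the new
# OVQuantizationConfig, whose attributes are forwarded to nncf.quantize(),
# and wrapping it in OVConfig via the updated quantization_config parameter.
import nncf
from nncf.quantization.advanced_parameters import OverflowFix

from optimum.intel import OVConfig, OVQuantizationConfig

quantization_config = OVQuantizationConfig(
    dataset="wikitext2",                     # assumed calibration dataset name
    subset_size=300,                         # number of calibration samples
    preset=nncf.QuantizationPreset.MIXED,
    model_type=nncf.ModelType.TRANSFORMER,
    fast_bias_correction=True,
    overflow_fix=OverflowFix.DISABLE,
)

ov_config = OVConfig(quantization_config=quantization_config)
```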
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
@@ -123,6 +123,7 @@
"OVModelForSpeechSeq2Seq",
"OVModelForSequenceClassification",
"OVModelForTokenClassification",
"OVQuantizationConfig",
"OVWeightQuantizationConfig",
"OVConfig",
]
@@ -241,6 +242,7 @@
OVModelForSequenceClassification,
OVModelForSpeechSeq2Seq,
OVModelForTokenClassification,
OVQuantizationConfig,
OVWeightQuantizationConfig,
)

2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
@@ -43,7 +43,7 @@
from .trainer import OVTrainer


from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig
from .modeling import (
OVModelForAudioClassification,
OVModelForAudioFrameClassification,
197 changes: 91 additions & 106 deletions optimum/intel/openvino/configuration.py
@@ -13,70 +13,19 @@
# limitations under the License.

from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Union

import datasets
import nncf
import torch
from nncf.quantization.advanced_parameters import OverflowFix
from transformers import PretrainedConfig
from transformers.utils.quantization_config import QuantizationConfigMixin
from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod

from optimum.configuration_utils import BaseConfig


DEFAULT_QUANTIZATION_CONFIG = {
"algorithm": "quantization",
"preset": "mixed",
"overflow_fix": "disable",
"initializer": {
"range": {"num_init_samples": 300, "type": "mean_min_max"},
"batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
},
"scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}},
"ignored_scopes": [
"{re}.*Embedding.*",
"{re}.*add___.*",
"{re}.*layer_norm_.*",
"{re}.*matmul_1",
"{re}.*__truediv__.*",
],
}

INT8_WEIGHT_COMPRESSION_CONFIG = {
"algorithm": "quantization",
"weights": {
"mode": "symmetric",
"bits": 8,
"target_scopes": [
"{re}.*Embedding.*",
"{re}.*matmul_.*",
"{re}.*addmm_.*",
"{re}.*baddmm_.*",
"{re}.*linear_.*",
],
"ignored_scopes": [
"{re}.*conv_*",
],
},
"activations": {
"ignored_scopes": [
"{re}.*add___.*",
"{re}.*__radd___.*",
"{re}.*layer_norm_.*",
"{re}.*__truediv__.*",
"{re}.*__mul___.*",
"{re}.*__rmul___.*",
"{re}.*tanh_.*",
"{re}.*pow_.*",
"{re}.*matmul_.*",
"{re}.*addmm_.*",
"{re}.*baddmm_.*",
"{re}.*linear_.*",
"{re}.*conv_.*",
],
},
"overflow_fix": "disable",
}


_DEFAULT_4BIT_CONFIGS = {
"databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
"EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
@@ -100,31 +49,55 @@
}


@dataclass
class OVQuantizationConfigBase(QuantizationConfigMixin):
def __init__(
self,
dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None,
ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None,
subset_size: Optional[int] = None,
):
self.dataset = dataset
self.ignored_scope = ignored_scope
self.subset_size = subset_size

def post_init(self):
if self.dataset is not None and isinstance(self.dataset, str):
llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]
stable_diffusion_datasets = [
"conceptual_captions",
"laion/220k-GPT4Vision-captions-from-LIVIS",
"laion/filtered-wit",
]
if self.dataset not in llm_datasets + stable_diffusion_datasets:
raise ValueError(
f"""You have entered a string value for dataset. You can only choose between
{llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
)


class OVConfig(BaseConfig):
CONFIG_NAME = "openvino_config.json"
FULL_CONFIGURATION_FILE = "openvino_config.json"

def __init__(
self,
compression: Union[List[Dict], Dict, None] = None,
input_info: Optional[List] = None,
save_onnx_model: bool = False,
quantization_config: Optional[Union[QuantizationConfigMixin, Dict]] = None,
quantization_config: Optional[Union[Dict, OVQuantizationConfigBase]] = None,
dtype: Optional[str] = None,
**kwargs,
):
super().__init__()
self.compression = compression
self.input_info = input_info
self.save_onnx_model = save_onnx_model
self._enable_standard_onnx_export_option()
self.optimum_version = kwargs.pop("optimum_version", None)
self.quantization_config = quantization_config or {}
self.quantization_config = quantization_config
self.compression = None # A backward-compatability field for training-time compression parameters

if isinstance(quantization_config, QuantizationConfigMixin):
bits = self.quantization_config.bits
else:
bits = self.quantization_config.get("bits", None)
bits = (
self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None
)
self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype

def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
Expand All @@ -137,28 +110,21 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False):
for name, value in model_inputs.items()
]

def save_pretrained(self, *args, **kwargs):
super().save_pretrained(*args, **kwargs)
def to_dict(self) -> Dict[str, Any]:
# Parent to_dict() implementation does not support quantization_config being None
if self.quantization_config is None:
self.quantization_config = OVQuantizationConfigBase()
result = super().to_dict()
del result["quantization_config"]
return result


def _enable_standard_onnx_export_option(self):
# This method depends on self.save_onnx_model.
# save_onnx_model is defaulted to false so that the final model output is
# in OpenVINO IR to realize performance benefit in OpenVINO runtime.
# True value of save_onnx_model will save a model in onnx format.
if (
isinstance(self.compression, dict)
and "algorithm" in self.compression
and self.compression["algorithm"] == "quantization"
):
self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model
elif isinstance(self.compression, list):
for i, algo_config in enumerate(self.compression):
if algo_config["algorithm"] == "quantization":
self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model
Comment on lines -143 to -157 (Collaborator Author):
Moved this logic directly to trainer.py

class OVQuantizationMethod(str, Enum):
DEFAULT = "default"


@dataclass
class OVWeightQuantizationConfig(QuantizationConfigMixin):
class OVWeightQuantizationConfig(OVQuantizationConfigBase):
"""
This is a wrapper class about all possible attributes and features that you can play with a model that has been
loaded using `optimum-intel` api for quantization with NNCF.
Expand All @@ -168,7 +134,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
bits (`int`, defaults to 8):
The number of bits to quantize to.
sym (`bool`, defaults to `False`):
Whether to use symetric quantization.
Whether to use symmetric quantization.
tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
The tokenizer used to process the dataset. You can pass either:
- A custom tokenizer object.
Expand All @@ -187,64 +153,52 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
group_size (`int`, *optional*):
The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
all_layers (`bool`, *optional*):
Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion.
Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision.
sensitivity_metric (`str`, *optional*):
The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receives a higher precision.
ignored_scope (`dict`, *optional*):
An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization.
num_samples (`int`, *optional*):
subset_size (`int`, *optional*):
The maximum number of samples composing the calibration dataset.

"""

def __init__(
self,
dataset: Optional[Union[str, List[str], nncf.Dataset, datasets.Dataset]] = None,
bits: int = 8,
ignored_scope: Optional[Union[dict, nncf.IgnoredScope]] = None,
sym: bool = False,
tokenizer: Optional[Any] = None,
dataset: Optional[Union[str, List[str]]] = None,
ratio: float = 1.0,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
**kwargs,
subset_size: Optional[int] = None,
quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT,
):
super().__init__(dataset, ignored_scope, subset_size)
self.bits = bits
self.sym = sym
self.tokenizer = tokenizer
self.dataset = dataset
self.group_size = group_size or (-1 if bits == 8 else 128)
self.ratio = ratio
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.ignored_scope = ignored_scope
self.num_samples = num_samples
self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release
self.subset_size = subset_size
self.quant_method = quant_method
self.post_init()

def post_init(self):
r"""
Safety checker that arguments are correct
"""
super().post_init()
if self.ratio is not None and not (0 <= self.ratio <= 1):
raise ValueError("`ratio` must between 0 and 1.")
if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
raise ValueError("`group_size` must be greater than 0 or equal to -1")
if self.dataset is not None and isinstance(self.dataset, str):
llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]
stable_diffusion_datasets = [
"conceptual_captions",
"laion/220k-GPT4Vision-captions-from-LIVIS",
"laion/filtered-wit",
]
if self.dataset not in llm_datasets + stable_diffusion_datasets:
raise ValueError(
f"""You have entered a string value for dataset. You can only choose between
{llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
)

if self.bits not in [4, 8]:
raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")
@@ -260,5 +214,36 @@ def post_init(self):
)


@dataclass
class OVQuantizationConfig(OVQuantizationConfigBase):
def __init__(
self,
dataset: Union[str, List[str], nncf.Dataset, datasets.Dataset],
ignored_scope: Optional[nncf.IgnoredScope] = None,
subset_size: Optional[int] = 300,
preset: nncf.QuantizationPreset = nncf.QuantizationPreset.MIXED,
model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER,
fast_bias_correction: bool = True,
overflow_fix: OverflowFix = OverflowFix.DISABLE,
):
super().__init__(dataset, ignored_scope, subset_size)
self.preset = preset
self.model_type = model_type
self.fast_bias_correction = fast_bias_correction
self.overflow_fix = overflow_fix
self.post_init()

def post_init(self):
"""
Safety checker that arguments are correct
"""
super().post_init()
# if self.dataset is None:
# raise ValueError(
# "`dataset` is needed to compute the activations range during the calibration step and was not provided."
# " In case you only want to apply quantization on the weights, please set `weights_only=True`."
# )


def _check_default_4bit_configs(config: PretrainedConfig):
return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
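As a companion sketch, the reworked `OVWeightQuantizationConfig` keeps its role for weight-only compression but renames `num_samples` to `subset_size` and takes an explicit `quant_method`; the concrete values below are assumptions for illustration only.

```python
# Illustrative sketch only: 4-bit weight-only compression settings with the
# reworked OVWeightQuantizationConfig. subset_size replaces the former
# num_samples argument (see the modeling_decoder.py and modeling_diffusion.py
# hunks below).
from optimum.intel import OVConfig, OVWeightQuantizationConfig

weight_config = OVWeightQuantizationConfig(
    bits=4,
    sym=False,
    group_size=128,        # -1 would switch to per-column quantization
    ratio=0.8,             # share of weights compressed to 4-bit, the rest stay 8-bit
    dataset="wikitext2",   # assumed dataset for data-aware compression
    subset_size=128,       # renamed from num_samples in this PR
)

# OVConfig now derives dtype ("int4" here) from the weight config's bits attribute.
ov_config = OVConfig(quantization_config=weight_config)
```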
2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_decoder.py
@@ -616,7 +616,7 @@ def _from_pretrained(
# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
nsamples = quantization_config.subset_size if quantization_config.subset_size else 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
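For context, a hedged sketch of the call path this hunk touches: loading a causal LM with a data-aware 4-bit config, where the calibration sample count is now read from `subset_size` (falling back to 128 when unset). The model id and loading flags are assumptions, not part of this diff.

```python
# Illustrative sketch only: data-aware 4-bit weight compression at load time.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6b",   # assumed model id (listed in _DEFAULT_4BIT_CONFIGS above)
    export=True,
    quantization_config=OVWeightQuantizationConfig(
        bits=4,
        dataset="wikitext2",
        subset_size=128,     # was num_samples before this PR
    ),
)
```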
2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_diffusion.py
@@ -321,7 +321,7 @@ def _from_pretrained(
if not isinstance(sd_model, supported_pipelines):
raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")

nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
nsamples = quantization_config.subset_size if quantization_config.subset_size else 200
unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples)

from .quantization import _hybrid_quantization
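Similarly, a hedged sketch for the diffusion path above, where hybrid quantization now reads the calibration subset size from `subset_size` (defaulting to 200 when unset). The pipeline class, model id, and flags are assumptions for illustration.

```python
# Illustrative sketch only: hybrid quantization of a diffusion pipeline with the
# renamed subset_size field.
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

pipeline = OVStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",   # assumed model id
    export=True,
    quantization_config=OVWeightQuantizationConfig(
        bits=8,
        dataset="conceptual_captions",  # one of the accepted diffusion datasets
        subset_size=200,
    ),
)
```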