Introduce OVQuantizationConfig for nncf.quantize() parameters (#638)
* Introduce OVQuantizationConfig for nncf.quantize() parameters

* Ignored scope tweaks

* Added **kwargs to quantization call. Added config serialization test.

* Ignored scope changes. Tests pass.

* Added documentation

* Linters

* Linters

* Tweak ignored scope serialization

* Added deprecation errors, tweak docs

* Addressed minor comments

* Make quantization config contain only serializable properties.

* Small tweaks

* Address comments

* Fix ruff

* Fix ruff 2
nikita-savelyevv authored Apr 15, 2024
1 parent 0540b12 commit ff5d185
Showing 9 changed files with 654 additions and 244 deletions.
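
To make the new config concrete, here is a minimal usage sketch, assuming the OVQuantizer workflow this config plugs into; the exact quantize()/get_calibration_dataset() call shapes and the GLUE/SST-2 dataset choice are illustrative assumptions, not part of this commit:

# Hedged sketch: static quantization driven by the new OVQuantizationConfig,
# which carries the parameters previously passed straight to nncf.quantize().
from transformers import AutoTokenizer
from optimum.intel import OVConfig, OVModelForSequenceClassification, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantizer = OVQuantizer.from_pretrained(model)

def preprocess_fn(examples):
    # Tokenize raw text into model inputs for calibration.
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=preprocess_fn,
    num_samples=100,
    dataset_split="train",
)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=OVQuantizationConfig()),
    calibration_dataset=calibration_dataset,
    save_directory="ov_model_int8",
)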
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
@@ -124,6 +124,7 @@
"OVModelForVision2Seq",
"OVModelForSequenceClassification",
"OVModelForTokenClassification",
"OVQuantizationConfig",
"OVWeightQuantizationConfig",
"OVConfig",
]
@@ -243,6 +244,7 @@
         OVModelForSpeechSeq2Seq,
         OVModelForTokenClassification,
         OVModelForVision2Seq,
+        OVQuantizationConfig,
         OVWeightQuantizationConfig,
     )

2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
@@ -43,7 +43,7 @@
 from .trainer import OVTrainer


-from .configuration import OVConfig, OVWeightQuantizationConfig
+from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig
 from .modeling import (
     OVModelForAudioClassification,
     OVModelForAudioFrameClassification,
321 changes: 222 additions & 99 deletions optimum/intel/openvino/configuration.py

Large diffs are not rendered by default.
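
Since the configuration.py diff is not rendered here, a hedged sketch of what the serialization changes imply, assuming OVConfig keeps its transformers-style save_pretrained/from_pretrained round trip (the directory name and reload behavior are assumptions):

# Hedged sketch: the quantization config now contains only serializable
# properties (per the commit log above), so an OVConfig carrying it can be
# written to and read back from disk.
from optimum.intel import OVConfig, OVQuantizationConfig

ov_config = OVConfig(quantization_config=OVQuantizationConfig())
ov_config.save_pretrained("ov_config_dir")  # serializes the config to JSON
reloaded = OVConfig.from_pretrained("ov_config_dir")  # restores it, quantization config included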

12 changes: 10 additions & 2 deletions optimum/intel/openvino/modeling_base.py
@@ -100,13 +100,21 @@ def __init__(
         self._openvino_config = OVConfig(quantization_config=quantization_config)

     @staticmethod
-    def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None):
+    def load_model(
+        file_name: Union[str, Path],
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        calibration_dataset: Optional = None,
+    ):
         """
         Loads the model.
         Arguments:
             file_name (`str` or `Path`):
                 The path of the model ONNX or XML file.
             quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*):
                 Quantization config to apply after model is loaded.
+            calibration_dataset (`nncf.Dataset`, *optional*):
+                Optional nncf.Dataset to feed to model weight compression when quantization config is provided.
         """

         def fix_op_names_duplicates(model: openvino.runtime.Model):
@@ -135,7 +143,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model):

             from optimum.intel.openvino.quantization import _weight_only_quantization

-            model = _weight_only_quantization(model, quantization_config)
+            model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset)

         return model
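
For context, the extended signature in use, as a minimal sketch assuming the base class name OVBaseModel and placeholder calibration data:

# Hedged sketch: passing a pre-built nncf.Dataset into load_model so weight
# compression can be data-aware. The sample inputs and identity transform
# below are placeholders, not part of the diff.
import nncf
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino.modeling_base import OVBaseModel

samples = [{"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}]  # placeholder calibration samples
calibration_dataset = nncf.Dataset(samples, lambda x: x)  # identity transform, for illustration

ov_model = OVBaseModel.load_model(
    "model.xml",  # placeholder path to an OpenVINO IR file
    quantization_config=OVWeightQuantizationConfig(bits=4),
    calibration_dataset=calibration_dataset,
)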

15 changes: 10 additions & 5 deletions optimum/intel/openvino/modeling_decoder.py
@@ -572,7 +572,7 @@ def _from_pretrained(
         from_onnx: bool = False,
         local_files_only: bool = False,
         load_in_8bit: bool = False,
-        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
         model_path = Path(model_id)
@@ -596,7 +596,12 @@
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)

         load_in_4bit = quantization_config.bits == 4 if quantization_config else False
-        model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config)
+        calibration_dataset = kwargs.get("calibration_dataset", None)
+        model = cls.load_model(
+            model_cache_path,
+            quantization_config=None if load_in_4bit else quantization_config,
+            calibration_dataset=calibration_dataset,
+        )

         model_type = config.model_type.replace("_", "-")
         if model_type == "bloom":
@@ -632,7 +637,7 @@
f"For the given model, we recommend the following `quantization_config` : {default_config}"
)

if isinstance(quantization_config.dataset, str):
if calibration_dataset is None and isinstance(quantization_config.dataset, str):
tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id)

from optimum.gptq.data import get_dataset, prepare_dataset
@@ -644,9 +649,9 @@
             dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
             dataset = prepare_dataset(dataset)
             quantization_config = copy.deepcopy(quantization_config)
-            quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
+            calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))

-        _weight_only_quantization(model, quantization_config)
+        _weight_only_quantization(model, quantization_config, calibration_dataset)

         return causal_model
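
The decoder-side effect of these changes, sketched for a causal LM (the model id is a placeholder, and forwarding of the calibration_dataset kwarg through from_pretrained is assumed):

# Hedged sketch: 4-bit data-aware weight compression. With a string dataset
# name, the code above builds the nncf.Dataset internally via optimum.gptq's
# get_dataset/prepare_dataset; alternatively, a ready nncf.Dataset can be
# supplied via the calibration_dataset kwarg introduced in this diff.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
    "gpt2",  # placeholder model id
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4, dataset="wikitext2"),
)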
