Introduce OVQuantizationConfig for nncf.quantize() parameters #638

Merged
Changes from 8 commits
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
@@ -124,6 +124,7 @@
"OVModelForVision2Seq",
"OVModelForSequenceClassification",
"OVModelForTokenClassification",
"OVQuantizationConfig",
"OVWeightQuantizationConfig",
"OVConfig",
]
@@ -243,6 +244,7 @@
OVModelForSpeechSeq2Seq,
OVModelForTokenClassification,
OVModelForVision2Seq,
OVQuantizationConfig,
OVWeightQuantizationConfig,
)

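With these exports in place, the new class is importable from the package root alongside the existing weight-only config. A minimal sketch of the resulting import surface (the dataset value is illustrative):

    from optimum.intel import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig

    # OVQuantizationConfig carries the nncf.quantize() parameters for full static
    # quantization; OVWeightQuantizationConfig remains the weight-only path.
    ov_config = OVConfig(quantization_config=OVQuantizationConfig(dataset="wikitext"))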
2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
@@ -43,7 +43,7 @@
from .trainer import OVTrainer


from .configuration import OVConfig, OVWeightQuantizationConfig
from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig
from .modeling import (
OVModelForAudioClassification,
OVModelForAudioFrameClassification,
309 changes: 209 additions & 100 deletions optimum/intel/openvino/configuration.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_decoder.py
@@ -640,7 +640,7 @@ def _from_pretrained(
# from optimum.gptq.utils import get_seqlen

# seqlen = get_seqlen(causal_model)
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
nsamples = quantization_config.subset_size if quantization_config.subset_size else 128
dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
dataset = prepare_dataset(dataset)
quantization_config = copy.deepcopy(quantization_config)
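On the user side, the num_samples → subset_size rename shows up when configuring data-aware weight compression. A hedged sketch (the checkpoint name is an assumption; 128 matches the fallback above):

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    quantization_config = OVWeightQuantizationConfig(
        bits=4,
        dataset="ptb",    # calibration data for data-aware compression
        subset_size=128,  # renamed from `num_samples`; 128 is also the fallback above
    )
    # The checkpoint is illustrative; any supported decoder model works.
    model = OVModelForCausalLM.from_pretrained("gpt2", export=True, quantization_config=quantization_config)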
2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_diffusion.py
@@ -321,7 +321,7 @@ def _from_pretrained(
if not isinstance(sd_model, supported_pipelines):
raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")

nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
nsamples = quantization_config.subset_size if quantization_config.subset_size else 200
unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples)

from .quantization import _hybrid_quantization
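The diffusion path applies the same rename, with a 200-sample fallback before hybrid quantization. A sketch mirroring the tests further below (the pipeline checkpoint is an assumption):

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    quantization_config = OVWeightQuantizationConfig(
        bits=8,
        dataset="conceptual_captions",  # supplying a dataset selects hybrid quantization
        subset_size=200,                # renamed from `num_samples`; matches the fallback above
    )
    pipeline = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # illustrative checkpoint
        export=True,
        quantization_config=quantization_config,
    )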
197 changes: 92 additions & 105 deletions optimum/intel/openvino/quantization.py

Large diffs are not rendered by default.

31 changes: 30 additions & 1 deletion optimum/intel/openvino/trainer.py
@@ -89,7 +89,7 @@

from ..utils.constant import _TASK_ALIASES
from ..utils.import_utils import is_transformers_version
from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig
from .configuration import OVConfig
from .quantization import OVDataLoader
from .training_args import OVTrainingArguments
from .utils import (
@@ -136,6 +136,25 @@
NNCF_LOG_FILE_NAME = "nncf_output.log"


DEFAULT_QUANTIZATION_CONFIG = {
"algorithm": "quantization",
"preset": "mixed",
"overflow_fix": "disable",
"initializer": {
"range": {"num_init_samples": 300, "type": "mean_min_max"},
"batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
},
"scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}},
"ignored_scopes": [
"{re}.*Embedding.*",
"{re}.*add___.*",
"{re}.*layer_norm_.*",
"{re}.*matmul_1",
"{re}.*__truediv__.*",
],
}


def _onnx_export_nncf_model(model: NNCFNetwork, config: OnnxConfig, output: Union[str, io.BytesIO], opset: int = None):
# TODO: remove it when fix controller.strip(copy=True) behavior
signature = inspect.signature(model.forward)
@@ -228,6 +247,16 @@ def __init__(
if self.ov_config is not None:
if self.ov_config.compression is None:
self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG
if (
isinstance(self.ov_config.compression, dict)
and "algorithm" in self.ov_config.compression
and self.ov_config.compression["algorithm"] == "quantization"
):
self.ov_config.compression["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model
elif isinstance(self.ov_config.compression, list):
for i, algo_config in enumerate(self.ov_config.compression):
if algo_config["algorithm"] == "quantization":
self.ov_config.compression[i]["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model

if self.args.do_train:
self._set_task()
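Since DEFAULT_QUANTIZATION_CONFIG has moved from the configuration module into the trainer, OVTrainer now fills it in whenever an OVConfig arrives without a compression section. A minimal sketch of that behavior, mirroring the __init__ check above:

    from optimum.intel import OVConfig
    from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG

    ov_config = OVConfig()  # user supplies no compression section
    if ov_config.compression is None:
        # OVTrainer.__init__ applies the JSON-style NNCF config defined above
        ov_config.compression = DEFAULT_QUANTIZATION_CONFIG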
155 changes: 144 additions & 11 deletions tests/openvino/test_quantization.py
@@ -18,12 +18,15 @@
import tempfile
import unittest
from collections import defaultdict
from enum import Enum
from functools import partial
from typing import List

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from nncf.quantization.advanced_parameters import OverflowFix
from parameterized import parameterized
import openvino.runtime as ov
import nncf
@@ -37,6 +40,7 @@
TrainingArguments,
default_data_collator,
)
from transformers.utils.quantization_config import QuantizationMethod

from optimum.intel import (
OVConfig,
@@ -55,8 +59,10 @@
OVStableDiffusionXLPipeline,
OVQuantizer,
OVTrainer,
OVQuantizationConfig,
OVWeightQuantizationConfig,
)
from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase

from optimum.intel.openvino.quantization import InferRequestWrapper
from optimum.intel.utils.import_utils import is_openvino_version
@@ -98,7 +104,9 @@ def preprocess_function(examples, tokenizer):
num_samples=10,
dataset_split="train",
)
quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, file_name=file_name)
quantization_config = OVQuantizationConfig(dataset=calibration_dataset)
ov_config = OVConfig(quantization_config=quantization_config)
quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config, file_name=file_name)
model = model_cls.from_pretrained(tmp_dir, file_name=file_name)

# TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm)
@@ -110,6 +118,10 @@ def preprocess_function(examples, tokenizer):
outputs = model(**tokens)
self.assertTrue("logits" in outputs)

# Verify that the configuration is correctly saved and loaded
loaded_config = OVConfig.from_pretrained(tmp_dir)
self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config)

Review comment from the PR author:
Brought back after #630

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8):
task = model_cls.export_feature
@@ -134,7 +146,9 @@ def preprocess_function(examples, tokenizer):
num_samples=10,
dataset_split="train",
)
quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset)
quantization_config = OVQuantizationConfig(dataset=calibration_dataset)
ov_config = OVConfig(quantization_config=quantization_config)
quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config)

model = model_cls.from_pretrained(tmp_dir)

@@ -210,7 +224,7 @@ class OVWeightCompressionTest(unittest.TestCase):
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
dataset="ptb",
awq=True,
quant_method=QuantizationMethod.AWQ,
),
14,
),
@@ -251,7 +265,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i
tokenizer.pad_token = tokenizer.eos_token

quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
quantizer.quantize(save_directory=tmp_dir, weights_only=True)
quantizer.quantize(save_directory=tmp_dir)
model = model_cls.from_pretrained(tmp_dir)

_, num_int8, _ = get_num_quantized_nodes(model)
@@ -261,6 +275,15 @@
outputs = model(**tokens)
self.assertTrue("logits" in outputs)

# Verify that the configuration is correctly saved and loaded
loaded_config = OVConfig.from_pretrained(tmp_dir)
original_config_as_dict = OVWeightQuantizationConfig(bits=8, sym=True).to_dict()
for k in original_config_as_dict.keys():
v = original_config_as_dict[k]
if isinstance(v, Enum):
original_config_as_dict[k] = v.value
self.assertEqual(original_config_as_dict, loaded_config.quantization_config)

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
task = model_cls.export_feature
@@ -272,7 +295,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
tokenizer.pad_token = tokenizer.eos_token

quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
quantizer.quantize(save_directory=tmp_dir, weights_only=True)
quantizer.quantize(save_directory=tmp_dir)
model = model_cls.from_pretrained(tmp_dir)

_, num_int8, _ = get_num_quantized_nodes(model)
@@ -297,7 +320,6 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8))
quantizer.quantize(
save_directory=tmp_dir,
weights_only=True,
ov_config=ov_config,
)
model = model_cls.from_pretrained(tmp_dir)
@@ -322,7 +344,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp
tokenizer.pad_token = tokenizer.eos_token

quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
quantizer.quantize(save_directory=tmp_dir, weights_only=True)
quantizer.quantize(save_directory=tmp_dir)
model = model_cls.from_pretrained(tmp_dir)

_, num_int8, _ = get_num_quantized_nodes(model)
@@ -354,7 +376,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
model_id = MODEL_NAMES[model_type]
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=2)
with tempfile.TemporaryDirectory() as tmp_dir:
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)

@@ -376,7 +398,7 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset(
model = model_cls.from_pretrained(
model_id,
export=True,
quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3),
quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, subset_size=3),
)
num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
@@ -412,6 +434,7 @@ def test_ovmodel_4bit_auto_compression_with_config(
self, model_cls, model_id, quantization_config, expected_ov_int4
):
with tempfile.TemporaryDirectory() as tmp_dir:
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
@@ -571,7 +594,9 @@ def preprocess_function(examples, tokenizer):
num_samples=10,
dataset_split="test",
)
quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset)
quantization_config = OVQuantizationConfig(dataset=calibration_dataset)
ov_config = OVConfig(quantization_config=quantization_config)
quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config)

# Test that inference on quantized model works
model = OVModelForQuestionAnswering.from_pretrained(tmp_dir)
@@ -604,7 +629,9 @@ def preprocess_function(examples, tokenizer):
num_samples=10,
dataset_split="test",
)
quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset)
quantization_config = OVQuantizationConfig(dataset=calibration_dataset)
ov_config = OVConfig(quantization_config=quantization_config)
quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config)

# Test that inference on quantized model works
model = OVModelForQuestionAnswering.from_pretrained(tmp_dir)
@@ -666,6 +693,112 @@ def compute_metrics(p):
self.assertTrue("logits" in outputs)


class OVQuantizationConfigTest(unittest.TestCase):
QUANTIZATION_CONFIGS = (
(
None,
[],
),
(OVWeightQuantizationConfig(), []),
(
OVWeightQuantizationConfig(
bits=8,
sym=True,
),
[],
),
(
{
"bits": 8,
"sym": True,
},
[],
),
(
OVWeightQuantizationConfig(
dataset="wikitext",
bits=4,
ignored_scope={"names": ["op_name"]},
sym=False,
tokenizer="dbmdz/bert-base-german-cased",
ratio=1.0,
group_size=128,
all_layers=True,
sensitivity_metric="mean_activation_magnitude",
subset_size=100,
quant_method=OVQuantizationMethod.DEFAULT,
),
["ignored_scope"],
),
(OVWeightQuantizationConfig(dataset=["wikitext", "c4"]), []),
(OVWeightQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]),
(OVWeightQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]),
(
OVWeightQuantizationConfig(tokenizer=AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")),
["tokenizer"],
),
(OVWeightQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])), ["ignored_scope"]),
(OVQuantizationConfig(dataset="wikitext"), []),
({"dataset": "wikitext"}, []),
(
OVQuantizationConfig(
dataset="wikitext",
ignored_scope={"names": ["op_name"]},
subset_size=100,
preset=nncf.QuantizationPreset.MIXED,
model_type=nncf.ModelType.TRANSFORMER,
fast_bias_correction=True,
overflow_fix=OverflowFix.DISABLE,
),
["ignored_scope"],
),
(OVQuantizationConfig(dataset=["wikitext", "c4"]), []),
(OVQuantizationConfig(dataset=load_dataset("wikitext", "wikitext-2-raw-v1", split="test")), ["dataset"]),
(OVQuantizationConfig(dataset=nncf.Dataset([np.zeros((1, 10))])), ["dataset"]),
(
OVQuantizationConfig(dataset=["wikitext", "c4"], ignored_scope=nncf.IgnoredScope(names=["op_name"])),
["ignored_scope"],
),
)

@parameterized.expand(QUANTIZATION_CONFIGS)
def test_config_serialization(
self, quantization_config: OVQuantizationConfigBase, non_equal_property_names: List[str]
):
def str_to_enum(enum_cls, value):
for k, v in enum_cls.__members__.items():
if getattr(enum_cls, k).value == value:
return v
raise ValueError(f"Could not convert string {value} to enum value of type {enum_cls}")

ov_config = OVConfig(quantization_config=quantization_config)
with tempfile.TemporaryDirectory() as tmp_dir:
ov_config.save_pretrained(tmp_dir)
loaded_ov_config = OVConfig.from_pretrained(tmp_dir)

if quantization_config is None:
self.assertEqual(loaded_ov_config.quantization_config, None)
return
for key, value in loaded_ov_config.quantization_config.items():
initial_value = (
quantization_config[key]
if isinstance(quantization_config, dict)
else getattr(ov_config.quantization_config, key)
)
if key == "preset" or key == "overflow_fix":
# TODO: remove once NNCF is updated to 2.10
if getattr(quantization_config, key) is not None:
self.assertTrue(isinstance(value, str))
if key == "preset":
value = str_to_enum(nncf.QuantizationPreset, value)
else:
value = str_to_enum(OverflowFix, value)
if key in non_equal_property_names:
self.assertNotEqual(value, initial_value)
else:
self.assertEqual(value, initial_value)

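For reference, the round trip these cases exercise reduces to the following sketch; note that in this version the loaded quantization_config comes back as a plain dict rather than a config object:

    import tempfile

    from optimum.intel import OVConfig, OVWeightQuantizationConfig

    ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8, sym=True))
    with tempfile.TemporaryDirectory() as tmp_dir:
        ov_config.save_pretrained(tmp_dir)
        loaded = OVConfig.from_pretrained(tmp_dir)
        assert loaded.quantization_config["bits"] == 8  # plain dict after loading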

class InferRequestWrapperTest(unittest.TestCase):
MODEL_ID = ("openai/whisper-tiny.en",)
APPLY_CACHING = (False, True)
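Taken together, the PR replaces the old calibration_dataset/weights_only arguments to quantize() with a single ov_config object. An end-to-end sketch assembled from the tests above (model and dataset names are illustrative):

    from functools import partial

    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    from optimum.intel import OVConfig, OVQuantizationConfig, OVQuantizer

    model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples, tokenizer):
        return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

    quantizer = OVQuantizer.from_pretrained(model, task="text-classification")
    calibration_dataset = quantizer.get_calibration_dataset(
        "glue",
        dataset_config_name="sst2",
        preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
        num_samples=10,
        dataset_split="train",
    )
    ov_config = OVConfig(quantization_config=OVQuantizationConfig(dataset=calibration_dataset))
    quantizer.quantize(save_directory="quantized_model", ov_config=ov_config)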