In [1]:
from optimum.onnxruntime import ORTModelForImageClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoFeatureExtractor, pipeline
from pathlib import Path
from evaluate import evaluator
from datasets import load_dataset
import os
from time import perf_counter
import numpy as np

# Input

In [2]:
# Model directory under the user's home folder, kept as a plain string.
model_id = str(
    Path.home().joinpath(
        'PycharmProjects/data_toolkit/computer_vision/vit_transformers/dog_cat/vit_dog_cat'
    )
)
# Relative output directory for the ONNX files.
onnx_path = Path("onnx")

# 1. Convert to ONNX

In [3]:
# Load the vanilla Transformers checkpoint at `model_id` and convert it to ONNX,
# together with its feature extractor.
# NOTE(review): newer Optimum releases appear to use `export=True` in place of
# `from_transformers=True` - confirm against the installed version.
model = ORTModelForImageClassification.from_pretrained(model_id, from_transformers=True)
preprocessor = AutoFeatureExtractor.from_pretrained(model_id)

# Save the ONNX checkpoint and the feature extractor (not a tokenizer) to `onnx_path`.
model.save_pretrained(onnx_path)
preprocessor.save_pretrained(onnx_path)

# Baseline (non-quantized) image-classification pipeline.
vanilla_clf = pipeline("image-classification", model=model, feature_extractor=preprocessor)

  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


# 2. Quantizer

In [4]:
# Create the ORTQuantizer from the non-quantized ONNX model and define a dynamic
# (is_static=False), per-tensor (per_channel=False) AVX512-VNNI quantization config.
dynamic_quantizer = ORTQuantizer.from_pretrained(model)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# Apply the quantization configuration; output is written to `onnx_path`.
model_quantized_path = dynamic_quantizer.quantize(save_dir=onnx_path, quantization_config=dqconfig)

# Load the quantized model under its own names. Rebinding `model` / `preprocessor`
# here would make this cell non-idempotent: on a re-run the quantizer above would be
# built from the already-quantized model instead of the original export.
q8_model = ORTModelForImageClassification.from_pretrained(onnx_path, file_name="model_quantized.onnx")
q8_preprocessor = AutoFeatureExtractor.from_pretrained(onnx_path)
q8_clf = pipeline("image-classification", model=q8_model, feature_extractor=q8_preprocessor)

# 3. Testing model performance

## 3.1 Model Size

In [5]:
# Print the on-disk size of the original and the quantized ONNX file.
for onnx_file in ('model.onnx', 'model_quantized.onnx'):
    n_bytes = (onnx_path / onnx_file).stat().st_size
    size_mb = n_bytes / (1024 * 1024)
    print(f"{onnx_file} file size: {size_mb:.2f} MB")

model.onnx file size: 327.55 MB
model_quantized.onnx file size: 84.76 MB


## 3.2 Performance

In [6]:
# Fixed seed so both random splits - and therefore the evaluated subset and the
# reported accuracy - are reproducible across kernel restarts.
SPLIT_SEED = 42

path_train = Path.home() / 'Desktop/dogs-vs-cats/train'
dataset_train = load_dataset("imagefolder", data_dir=str(path_train), split='train')

# Hold out 20% of the images, then split that hold-out in half; only its 'test'
# half is used for the evaluation below.
splits = dataset_train.train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset_test_valid = splits['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
test_data = dataset_test_valid['test']

results = evaluator("image-classification").compute(
    model_or_pipeline=q8_clf,
    data=test_data,
    metric="accuracy",
    input_column="image",
    label_column="label",
    # Read the mapping from the pipeline being evaluated rather than from whatever
    # the global `model` happens to be bound to.
    label_mapping=q8_clf.model.config.label2id,
    strategy="simple",
)

print(f"Quantized model: {results['accuracy']*100:.2f}%")

Resolving data files:   0%|          | 0/25000 [00:00<?, ?it/s]



Quantized model: 99.88%


In [7]:
def measure_latency(pipe, image=None, n_warmup=10, n_runs=200):
    """Time repeated forward passes of a pipeline's model on a single image.

    The image is preprocessed once, outside the timed region, so only the
    ``pipe.model(**inputs)`` call is measured.

    Args:
        pipe: Object exposing ``feature_extractor`` and ``model`` callables, such
            as the image-classification pipelines built in this notebook.
        image: Image fed to every forward pass. Defaults to
            ``test_data[0]['image']`` from the notebook namespace.
        n_warmup: Number of untimed forward passes executed before measuring.
        n_runs: Number of timed forward passes (must be at least 1).

    Returns:
        A ``(summary, time_p95_ms)`` tuple: a human-readable summary string and
        the 95th-percentile latency in milliseconds.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    if image is None:
        # Backward-compatible default: first image of the notebook's test split.
        image = test_data[0]['image']

    # prepare data
    inputs = pipe.feature_extractor(images=image, return_tensors="pt")

    # warm up
    for _ in range(n_warmup):
        pipe.model(**inputs)

    # timed run
    latencies = []
    for _ in range(n_runs):
        start_time = perf_counter()
        pipe.model(**inputs)
        latencies.append(perf_counter() - start_time)

    # compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)

    # "+/-" replaces the former "+\-", whose "\-" is an invalid escape sequence
    # (SyntaxWarning on recent Python) and printed a stray backslash.
    summary = (
        f"P95 latency (ms) - {time_p95_ms}; "
        f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f};"
    )
    return summary, time_p95_ms


# Benchmark the vanilla pipeline first, then the quantized one; each result is
# the (summary string, P95 latency) pair returned by measure_latency.
vanilla_model, quantized_model = (
    measure_latency(clf) for clf in (vanilla_clf, q8_clf)
)

print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
# The improvement factor is the ratio of the two P95 latencies.
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")

Vanilla model: P95 latency (ms) - 45.59195999900111; Average latency (ms) - 42.15 +\- 2.35;
Quantized model: P95 latency (ms) - 25.620775002607843; Average latency (ms) - 24.47 +\- 1.46;
Improvement through quantization: 1.78x
