In [None]:
!pip install transformers
!python -m pip install git+https://github.com/huggingface/optimum.git
!pip install onnxruntime
!pip install onnx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

# Benchmarking

## Feature Extraction

### Base Model





In [None]:
from transformers import AutoTokenizer, pipeline

model_id = "microsoft/codebert-base"
task = "feature-extraction"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Baseline: vanilla transformers pipeline for the feature-extraction task.
# No question-answering options (e.g. handle_impossible_answer) are passed
# here: they only apply to the question-answering task.
feature_extractor = pipeline(task, model=model_id, tokenizer=tokenizer)


In [None]:
%%time
# Time one feature-extraction call on the baseline transformers pipeline.
# NOTE(review): %%time measures a single execution of the cell, so the number
# is noisy; %%timeit would average over repeated runs.
prediction = feature_extractor("What's my name?")

CPU times: user 107 ms, sys: 435 µs, total: 108 ms
Wall time: 113 ms


### Optimized Model

In [None]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction

model_id = "microsoft/codebert-base"
onnx_path = Path("onnx")
task = "feature-extraction"

# Load the vanilla transformers checkpoint and convert it to ONNX while loading.
# Note: this cell only exports to ONNX; no ORTOptimizer/ORTQuantizer pass is run.
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save the ONNX checkpoint and the tokenizer side by side in `onnx_path`.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# Wrap the ONNX Runtime model in a regular transformers pipeline.
# No question-answering options (e.g. handle_impossible_answer) are passed
# here: they only apply to the question-answering task.
optimum_feature_extractor = pipeline(task, model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

In [None]:
%%time
# Time one feature-extraction call on the ONNX Runtime-backed pipeline.
# NOTE(review): %%time measures a single execution of the cell, so the number
# is noisy; %%timeit would average over repeated runs.
prediction = optimum_feature_extractor("What's my name?")

CPU times: user 89.7 ms, sys: 288 µs, total: 90 ms
Wall time: 91.6 ms


## Question Answering

### Base Model

In [None]:
from transformers import AutoTokenizer, pipeline

task = "question-answering"
model_id = "deepset/roberta-base-squad2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Baseline QA pipeline on vanilla transformers. The checkpoint is a SQuAD v2
# model, so handle_impossible_answer is switched on for the pipeline.
qa_model = pipeline(
    task,
    model=model_id,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)

In [None]:
%%time
# Time one question-answering call on the baseline transformers pipeline and
# show the answer. The print is part of the timed cell.
# NOTE(review): %%time measures a single execution of the cell, so the number
# is noisy; %%timeit would average over repeated runs.
prediction = qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

{'score': 0.6795030236244202, 'start': 11, 'end': 16, 'answer': 'Humza'}
CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 109 ms


### Optimized Model

In [None]:
from pathlib import Path

from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline

task = "question-answering"
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx_qa")

# Export the transformers checkpoint to ONNX while loading it, then fetch the
# matching tokenizer.
model = ORTModelForQuestionAnswering.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the ONNX checkpoint and tokenizer into `onnx_path`. The tokenizer save
# stays the last expression so the cell displays the written file paths.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

('onnx_qa/tokenizer_config.json',
 'onnx_qa/special_tokens_map.json',
 'onnx_qa/vocab.json',
 'onnx_qa/merges.txt',
 'onnx_qa/added_tokens.json',
 'onnx_qa/tokenizer.json')

In [None]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Optimization level 99 enables all optimizations.
optimization_config = OptimizationConfig(optimization_level=99)

# Build the optimizer for the QA checkpoint/task defined earlier.
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)

# Run the optimization: read model.onnx and write model-optimized.onnx next to
# it. The call is the cell's last expression, so its return value is displayed.
optimizer.export(
    optimization_config=optimization_config,
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
)

PosixPath('onnx_qa/model-optimized.onnx')

In [None]:
from optimum.onnxruntime import ORTModelForQuestionAnswering

# Load the graph-optimized ONNX model (model-optimized.onnx) from `onnx_path`.
opt_model = ORTModelForQuestionAnswering.from_pretrained(
    onnx_path,
    file_name="model-optimized.onnx",
)

# Wrap the optimized model in a transformers QA pipeline for benchmarking.
optimum_qa_model = pipeline(
    task,
    model=opt_model,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)

In [None]:
%%time
# Time one question-answering call on the optimized ONNX pipeline and show the
# answer. The print is part of the timed cell.
# NOTE(review): %%time measures a single execution of the cell, so the number
# is noisy; %%timeit would average over repeated runs.
prediction = optimum_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

{'score': 0.6795043349266052, 'start': 11, 'end': 16, 'answer': 'Humza'}
CPU times: user 75.4 ms, sys: 600 µs, total: 76 ms
Wall time: 76.2 ms


### Quantized Model

In [None]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Quantization settings: the avx512_vnni preset with is_static=False and
# per_channel=True.
qconfig = AutoQuantizationConfig.avx512_vnni(per_channel=True, is_static=False)

# Build the quantizer for the QA checkpoint/task defined earlier.
quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)

# Quantize the already-optimized graph and write model-quantized.onnx. The call
# is the cell's last expression, so its return value is displayed.
quantizer.export(
    quantization_config=qconfig,
    onnx_model_path=onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
)

PosixPath('onnx_qa/model-quantized.onnx')

In [None]:
# Load the quantized ONNX model (model-quantized.onnx) from `onnx_path`.
qt_model = ORTModelForQuestionAnswering.from_pretrained(
    onnx_path,
    file_name="model-quantized.onnx",
)

# Wrap the quantized model in a transformers QA pipeline for benchmarking.
quantized_qa_model = pipeline(
    task,
    model=qt_model,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)

In [None]:
%%time
# Time one question-answering call on the quantized ONNX pipeline and show the
# answer. The print is part of the timed cell.
# NOTE(review): %%time measures a single execution of the cell, so the number
# is noisy; %%timeit would average over repeated runs.
prediction = quantized_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

{'score': 0.7797815799713135, 'start': 11, 'end': 16, 'answer': 'Humza'}
CPU times: user 52.8 ms, sys: 2.83 ms, total: 55.6 ms
Wall time: 56.8 ms
