In [1]:
!pip install transformers
!python -m pip install git+https://github.com/huggingface/optimum.git
!pip install onnxruntime
!pip install onnx

Collecting transformers
  Using cached transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 2.2 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2022.7.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (765 kB)
[K     |████████████████████████████████| 765 kB 82 kB/s eta 0:00:011
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (661 kB)
[K     |████████████████████████████████| 661 kB 693 kB/s eta 0:00:01
Collecting filelock
  Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 769 kB/s ta 0:00:011
[?25hC

# Benchmarking

## Feature Extraction

### Base Model Load





In [2]:
from transformers import AutoTokenizer, pipeline

model_id = "microsoft/codebert-base"
task = "feature-extraction"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2 
feature_extractor = pipeline(task, model=model_id, tokenizer=tokenizer, handle_impossible_answer=True)


  from .autonotebook import tqdm as notebook_tqdm
Downloading tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 13.1kB/s]
Downloading config.json: 100%|██████████| 498/498 [00:00<00:00, 282kB/s]
Downloading vocab.json: 100%|██████████| 878k/878k [00:14<00:00, 62.7kB/s] 
Downloading merges.txt: 100%|██████████| 446k/446k [00:04<00:00, 111kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 79.2kB/s]
Downloading pytorch_model.bin: 100%|██████████| 476M/476M [09:19<00:00, 892kB/s]    


In [3]:
%%time
prediction = feature_extractor("What's my name?")

CPU times: user 408 ms, sys: 51.8 ms, total: 460 ms
Wall time: 476 ms


### Optimized Model

In [5]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction

model_id = "microsoft/codebert-base"
onnx_path = Path("onnx")
task = "feature-extraction"

# load vanilla transformers and convert to onnx
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2
optimum_feature_extractor = pipeline(task, model=model, tokenizer=tokenizer, handle_impossible_answer=True)

Downloading: 100%|██████████| 498/498 [00:00<00:00, 320kB/s]


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 482 kB/s eta 0:00:01
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py39-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 7.0 MB/s eta 0:00:01
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 7.4 MB/s eta 0:00:01
Collecting xxhash
  Downloading xxhash-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (211 kB)
[K     |████████████████████████████████| 211 kB 7.1 MB/s eta 0:00:01
[?25hCollecting pyarrow>=6.0.0
  Downloading pyarrow-9.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 295 kB/s eta 0:00:01
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-c

In [7]:
from pathlib import Path

from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

model_id = "microsoft/codebert-base"
onnx_path = Path("onnx")

task = "feature-extraction"
# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

KeyError: "Unknown task: feature-extraction. Possible values are [<class 'transformers.models.auto.modeling_auto.AutoModel'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMaskedLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSeq2SeqLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForTokenClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMultipleChoice'>, <class 'transformers.models.auto.modeling_auto.AutoModelForQuestionAnswering'>, <class 'transformers.models.auto.modeling_auto.AutoModelForImageClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMaskedImageModeling'>]"

In [5]:
%%time
prediction = optimum_feature_extractor("What's my name?")

CPU times: user 131 ms, sys: 2.26 ms, total: 133 ms
Wall time: 120 ms


## Question Answering

### Base Model

In [6]:
from transformers import AutoTokenizer, pipeline

model_id = "deepset/roberta-base-squad2"
task = "question-answering"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2 
qa_model = pipeline(task, model=model_id, tokenizer=tokenizer, handle_impossible_answer=True)

Downloading tokenizer_config.json: 100%|██████████| 79.0/79.0 [00:00<00:00, 33.7kB/s]
Downloading config.json: 100%|██████████| 571/571 [00:00<00:00, 268kB/s]
Downloading vocab.json: 100%|██████████| 878k/878k [00:05<00:00, 166kB/s]  
Downloading merges.txt: 100%|██████████| 446k/446k [00:10<00:00, 42.4kB/s] 
Downloading special_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 293kB/s]
Downloading pytorch_model.bin: 100%|██████████| 473M/473M [06:57<00:00, 1.19MB/s] 


In [7]:
%%time
prediction = qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

{'score': 0.6795036196708679, 'start': 11, 'end': 16, 'answer': 'Humza'}
CPU times: user 2.66 s, sys: 0 ns, total: 2.66 s
Wall time: 1.83 s


### Optimized Model

In [8]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx_qa")
task = "question-answering"

# load vanilla transformers and convert to onnx
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading: 100%|██████████| 571/571 [00:00<00:00, 32.5kB/s]


('onnx_qa/tokenizer_config.json',
 'onnx_qa/special_tokens_map.json',
 'onnx_qa/vocab.json',
 'onnx_qa/merges.txt',
 'onnx_qa/added_tokens.json',
 'onnx_qa/tokenizer.json')

In [9]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

RuntimeError: Failed to import optimum.onnxruntime.optimization because of the following error (look up to see its traceback):
No module named 'datasets'

In [None]:
from optimum.onnxruntime import ORTModelForQuestionAnswering

# load quantized model
opt_model = ORTModelForQuestionAnswering.from_pretrained(onnx_path, file_name="model-optimized.onnx")

# test the quantized model with using transformers pipeline
optimum_qa_model = pipeline(task, model=opt_model, tokenizer=tokenizer, handle_impossible_answer=True)

In [None]:
%%time
prediction = optimum_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

### Quantized Model

In [None]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)

# apply the quantization configuration to the model
quantizer.export(
    onnx_model_path=onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    quantization_config=qconfig,
)

In [None]:
# load quantized model
qt_model = ORTModelForQuestionAnswering.from_pretrained(onnx_path, file_name="model-quantized.onnx")

# test the quantized model with using transformers pipeline
quantized_qa_model = pipeline(task, model=qt_model, tokenizer=tokenizer, handle_impossible_answer=True)

In [None]:
%%time
prediction = quantized_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)