In [1]:
!pip install transformers
!python -m pip install git+https://github.com/huggingface/optimum.git
!pip install onnxruntime
!pip install onnx

Collecting transformers
  Using cached transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 2.2 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2022.7.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (765 kB)
[K     |████████████████████████████████| 765 kB 82 kB/s eta 0:00:011
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (661 kB)
[K     |████████████████████████████████| 661 kB 693 kB/s eta 0:00:01
Collecting filelock
  Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 769 kB/s ta 0:00:011
[?25hC

# Benchmarking

## Feature Extraction

### Base Model Load





In [8]:
from transformers import AutoTokenizer, pipeline

model_id = "microsoft/codebert-base"
task = "feature-extraction"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2 
feature_extractor = pipeline(task, model=model_id, tokenizer=tokenizer, handle_impossible_answer=True)


In [19]:
%%time
prediction = feature_extractor(["What's my name?"])

CPU times: user 142 ms, sys: 0 ns, total: 142 ms
Wall time: 74.6 ms


[-0.1326238065958023,
 0.38220053911209106,
 0.03749421238899231,
 -0.02008284255862236,
 0.10234289616346359,
 -0.16812247037887573,
 -0.08768516778945923,
 0.04570560157299042,
 0.04813437908887863,
 -0.11880657821893692,
 0.1357186734676361,
 0.4343635141849518,
 -0.14596402645111084,
 -0.04268614202737808,
 0.20766493678092957,
 0.03773442283272743,
 0.07840951532125473,
 0.1304922252893448,
 0.081199049949646,
 0.07399129867553711,
 -0.17183847725391388,
 -0.14073914289474487,
 0.23667697608470917,
 0.0006463321624323726,
 0.42707276344299316,
 0.04345940798521042,
 0.3349607586860657,
 0.2065361589193344,
 0.05771685391664505,
 0.4824593961238861,
 -0.08865801244974136,
 -0.024739718064665794,
 1.4476032257080078,
 -0.14148235321044922,
 0.22061589360237122,
 -0.03208843618631363,
 -0.02742939628660679,
 0.2644815742969513,
 -0.09928885847330093,
 0.04172520712018013,
 -0.4197045564651489,
 -0.04511818662285805,
 -0.9629356861114502,
 0.13015048205852509,
 0.3906705677509308,
 -0

### Optimized Model

In [5]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction

model_id = "microsoft/codebert-base"
onnx_path = Path("onnx")
task = "feature-extraction"

# load vanilla transformers and convert to onnx
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2
optimum_feature_extractor = pipeline(task, model=model, tokenizer=tokenizer, handle_impossible_answer=True)

Downloading: 100%|██████████| 498/498 [00:00<00:00, 320kB/s]


In [24]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction

tokenizer = AutoTokenizer.from_pretrained("optimum/all-MiniLM-L6-v2")
model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2")
onnx_extractor = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

text = "My name is Philipp and I live in Germany."
pred = onnx_extractor(text)

Downloading: 100%|██████████| 632/632 [00:00<00:00, 235kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [01:34<00:00, 957kB/s] 


In [27]:
encoded_input = tokenizer("My name is Philipp and I live in Germany.", padding=True, truncation=True, return_tensors='pt')
pred = model(**encoded_input)
sentence_embeddings = sentence_embeddings = mean_pooling(pred, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-1.7761e-02,  5.6114e-04,  3.5539e-02, -6.2987e-03, -3.5684e-02,
         -2.4026e-03,  1.7687e-02, -1.1022e-02,  3.4272e-02, -3.0947e-02,
          1.8242e-02, -1.4128e-01, -4.1095e-02, -5.7751e-02, -6.9208e-03,
         -5.6833e-02,  4.8729e-03,  6.4637e-02, -5.7160e-03, -4.4907e-02,
         -7.1940e-02, -7.0136e-02,  3.7376e-02, -3.9940e-02,  1.1398e-02,
          3.8127e-03,  2.6768e-02,  4.8390e-02, -4.0072e-03, -3.2384e-02,
          6.3038e-02,  5.5875e-03,  2.7129e-02,  3.2292e-02,  4.5151e-02,
         -1.4375e-02, -9.7354e-02, -9.1670e-03, -6.2466e-02,  5.1693e-02,
          1.8237e-02, -3.6210e-02,  2.6908e-02,  1.2771e-02, -8.0993e-03,
          1.6693e-02, -2.5291e-02,  1.3691e-01,  4.0347e-02,  7.6774e-02,
         -7.7586e-02, -1.8439e-02,  2.8717e-02,  4.4411e-02,  1.7344e-03,
          2.8095e-02,  4.6564e-03,  3.8210e-02, -4.1572e-03,  4.4834e-02,
         -7.0689e-02,  2.8311e-02, -1.0524e-01,  7.0439e-02, -7.3086e-02,
          2.3546e

In [30]:
print(sentence_embeddings.shape)

torch.Size([1, 384])


In [33]:
import json

data = json.load(open("/home/humza/Downloads/projects/genml/data/faiss/training_data.json", "r"))

In [35]:
np.array(data).shape

(196, 768)

In [22]:
np.array(pred).shape

(1, 12, 384)

In [23]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Downloading: 100%|██████████| 350/350 [00:00<00:00, 147kB/s]
Downloading: 100%|██████████| 226k/226k [00:05<00:00, 41.1kB/s] 
Downloading: 100%|██████████| 455k/455k [00:07<00:00, 63.1kB/s] 
Downloading: 100%|██████████| 112/112 [00:00<00:00, 57.1kB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 628kB/s]
Downloading: 100%|██████████| 86.7M/86.7M [01:43<00:00, 882kB/s] 


Sentence embeddings:
tensor([[ 6.7657e-02,  6.3496e-02,  4.8713e-02,  7.9305e-02,  3.7448e-02,
          2.6528e-03,  3.9375e-02, -7.0985e-03,  5.9361e-02,  3.1537e-02,
          6.0098e-02, -5.2905e-02,  4.0607e-02, -2.5931e-02,  2.9843e-02,
          1.1269e-03,  7.3515e-02, -5.0382e-02, -1.2239e-01,  2.3703e-02,
          2.9727e-02,  4.2477e-02,  2.5634e-02,  1.9952e-03, -5.6919e-02,
         -2.7160e-02, -3.2904e-02,  6.6025e-02,  1.1901e-01, -4.5879e-02,
         -7.2621e-02, -3.2584e-02,  5.2341e-02,  4.5055e-02,  8.2531e-03,
          3.6702e-02, -1.3942e-02,  6.5392e-02, -2.6427e-02,  2.0641e-04,
         -1.3664e-02, -3.6281e-02, -1.9504e-02, -2.8974e-02,  3.9427e-02,
         -8.8409e-02,  2.6243e-03,  1.3671e-02,  4.8306e-02, -3.1157e-02,
         -1.1733e-01, -5.1169e-02, -8.8529e-02, -2.1896e-02,  1.4299e-02,
          4.4417e-02, -1.3482e-02,  7.4339e-02,  2.6638e-02, -1.9876e-02,
          1.7919e-02, -1.0605e-02, -9.0426e-02,  2.1327e-02,  1.4120e-01,
         -6.4717e

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 482 kB/s eta 0:00:01
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py39-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 7.0 MB/s eta 0:00:01
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 7.4 MB/s eta 0:00:01
Collecting xxhash
  Downloading xxhash-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (211 kB)
[K     |████████████████████████████████| 211 kB 7.1 MB/s eta 0:00:01
[?25hCollecting pyarrow>=6.0.0
  Downloading pyarrow-9.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 295 kB/s eta 0:00:01
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-c

In [7]:
from pathlib import Path

from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

model_id = "microsoft/codebert-base"
onnx_path = Path("onnx")

task = "feature-extraction"
# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

KeyError: "Unknown task: feature-extraction. Possible values are [<class 'transformers.models.auto.modeling_auto.AutoModel'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMaskedLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSeq2SeqLM'>, <class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForTokenClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMultipleChoice'>, <class 'transformers.models.auto.modeling_auto.AutoModelForQuestionAnswering'>, <class 'transformers.models.auto.modeling_auto.AutoModelForImageClassification'>, <class 'transformers.models.auto.modeling_auto.AutoModelForMaskedImageModeling'>]"

In [5]:
%%time
prediction = optimum_feature_extractor("What's my name?")

CPU times: user 131 ms, sys: 2.26 ms, total: 133 ms
Wall time: 120 ms


## Question Answering

### Base Model

In [6]:
from transformers import AutoTokenizer, pipeline

model_id = "deepset/roberta-base-squad2"
task = "question-answering"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# test the model with using transformers pipeline, with handle_impossible_answer for squad_v2 
qa_model = pipeline(task, model=model_id, tokenizer=tokenizer, handle_impossible_answer=True)

Downloading tokenizer_config.json: 100%|██████████| 79.0/79.0 [00:00<00:00, 33.7kB/s]
Downloading config.json: 100%|██████████| 571/571 [00:00<00:00, 268kB/s]
Downloading vocab.json: 100%|██████████| 878k/878k [00:05<00:00, 166kB/s]  
Downloading merges.txt: 100%|██████████| 446k/446k [00:10<00:00, 42.4kB/s] 
Downloading special_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 293kB/s]
Downloading pytorch_model.bin: 100%|██████████| 473M/473M [06:57<00:00, 1.19MB/s] 


In [7]:
%%time
prediction = qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

{'score': 0.6795036196708679, 'start': 11, 'end': 16, 'answer': 'Humza'}
CPU times: user 2.66 s, sys: 0 ns, total: 2.66 s
Wall time: 1.83 s


### Optimized Model

In [8]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx_qa")
task = "question-answering"

# load vanilla transformers and convert to onnx
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading: 100%|██████████| 571/571 [00:00<00:00, 32.5kB/s]


('onnx_qa/tokenizer_config.json',
 'onnx_qa/special_tokens_map.json',
 'onnx_qa/vocab.json',
 'onnx_qa/merges.txt',
 'onnx_qa/added_tokens.json',
 'onnx_qa/tokenizer.json')

In [9]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

RuntimeError: Failed to import optimum.onnxruntime.optimization because of the following error (look up to see its traceback):
No module named 'datasets'

In [None]:
from optimum.onnxruntime import ORTModelForQuestionAnswering

# load quantized model
opt_model = ORTModelForQuestionAnswering.from_pretrained(onnx_path, file_name="model-optimized.onnx")

# test the quantized model with using transformers pipeline
optimum_qa_model = pipeline(task, model=opt_model, tokenizer=tokenizer, handle_impossible_answer=True)

In [None]:
%%time
prediction = optimum_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)

### Quantized Model

In [None]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)

# apply the quantization configuration to the model
quantizer.export(
    onnx_model_path=onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    quantization_config=qconfig,
)

In [None]:
# load quantized model
qt_model = ORTModelForQuestionAnswering.from_pretrained(onnx_path, file_name="model-quantized.onnx")

# test the quantized model with using transformers pipeline
quantized_qa_model = pipeline(task, model=qt_model, tokenizer=tokenizer, handle_impossible_answer=True)

In [None]:
%%time
prediction = quantized_qa_model(question="what is my name?", context="My name is Humza and I'm 5 years old")
print(prediction)