Source - [Tianlei Wu](https://github.com/onnx/onnx/issues/3278#issuecomment-781948998)

### Installing dependencies

In [1]:
# Install / upgrade the packages this notebook uses: Hugging Face Transformers
# (plus sentencepiece), ONNX Runtime and the ONNX Runtime tools package.
# NOTE(review): versions are unpinned; the recorded output below shows
# transformers 4.3.2 and sentencepiece 0.1.95 - consider pinning for reproducibility.
!pip install --upgrade transformers sentencepiece
!pip install --upgrade onnxruntime
!pip install --upgrade onnxruntime-tools

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 5.3MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 17.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 28.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)

### Using model from Transformers

In [33]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

##your model name
source_name = "ktrapeznikov/albert-xlarge-v2-squad-v2"

model_name = source_name.split("/")[-1]
dir = "transformers-model"

!rm -rf {dir}
!mkdir {dir}

tokenizer = AutoTokenizer.from_pretrained(source_name)
model = AutoModelForQuestionAnswering.from_pretrained(source_name)
model.eval()

AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=2048, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=2048, out_features=2048, bias=True)
                (key): Linear(in_features=2048, out_features=2048, bias=True)
                (value): Linear(in_features=2048, out_features=

#### Exporting model to ONNX

In [34]:
# One example (question, context) pair; its encoding is the sample input handed to the exporter.
question = "what is google specialization"
text = "Google LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware."
encoding = tokenizer.encode_plus(question, text)

# Be careful with these names: the inference cell feeds the ONNX Runtime session
# using exactly these keys.
input_names = ['input_ids', 'attention_mask', 'token_type_ids']

# Wrap each token list in an outer list so every tensor has a leading batch dimension of 1.
input_ids, attention_mask, token_type_ids = (
    torch.tensor([encoding[name]]) for name in input_names
)

# Batch size and sequence length are declared dynamic on every input;
# only the batch dimension is declared dynamic on the output.
dynamic_axes = {name: {0: 'batch', 1: 'sequence'} for name in input_names}
dynamic_axes['qa_outputs'] = {0: 'batch'}

torch.onnx.export(
    model,
    (input_ids, attention_mask, token_type_ids),  # same order as input_names
    f"{dir}/{model_name}.onnx",
    input_names=input_names,
    output_names=['qa_outputs'],
    opset_version=12,
    do_constant_folding=True,
    # NOTE(review): presumably required because the export is too large for a
    # single protobuf file (the export directory is 4.6G) - confirm.
    use_external_data_format=True,
    dynamic_axes=dynamic_axes,
)

  position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors


In [35]:
# Save the whole PyTorch model to disk so its size can be compared with the ONNX export.
torch.save(model,"model.pth")

The model has shared weights, but the ONNX export writes them out as separate duplicate copies, so the exported model is far larger than the PyTorch checkpoint.

In [36]:
# On-disk size of the PyTorch checkpoint vs. the ONNX export directory.
!du -sh model.pth
!du -sh {dir}

209M	model.pth
4.6G	transformers-model


### Removing shared weights

In [37]:
def has_same_value(val_one,val_two):
  """Return True when two ONNX initializers (TensorProto) hold the same tensor.

  Two tensors are considered equal only if their element type, their shape and
  their payload all match. Comparing ``raw_data`` alone is not enough:

  * identical bytes can encode different tensors when dtype or dims differ;
  * a tensor whose payload lives in a typed field (``float_data``,
    ``int64_data``, ...) has empty ``raw_data``, so every such pair would
    compare equal and unrelated weights would be merged.

  Args:
    val_one: first initializer.
    val_two: second initializer.

  Returns:
    bool: True if both initializers can safely share one copy, else False.
  """
  if val_one.data_type != val_two.data_type:
    return False
  if list(val_one.dims) != list(val_two.dims):
    return False
  if val_one.raw_data != val_two.raw_data:
    return False
  # raw_data matches (possibly both empty): the typed payload fields must match too.
  typed_fields = ("float_data", "int32_data", "int64_data",
                  "double_data", "uint64_data", "string_data")
  return all(
    list(getattr(val_one, field, ())) == list(getattr(val_two, field, ()))
    for field in typed_fields
  )

In [38]:
from onnxruntime.transformers.onnx_model import OnnxModel
import onnx

# Location of the graph written by the export cell.
path = f"{dir}/{model_name}.onnx"

# NOTE(review): this rebinds `model`, which held the PyTorch model until now.
model = onnx.load(path)

# Wrapper whose graph-editing helpers are used by the de-duplication cell.
onnx_model = OnnxModel(model)

In [39]:
# De-duplicate initializers: point every node at the first copy of each repeated
# weight, then save the slimmed graph as a single file.
output_path = f"{model_name}.onnx"

initializers = model.graph.initializer
count = len(initializers)

# same[j] == i means initializer j duplicates the earlier initializer i;
# -1 means no earlier duplicate was found.
same = [-1] * count
for i in range(count - 1):
  if same[i] >= 0:
    # Already marked as a duplicate of an earlier initializer.
    continue
  for j in range(i + 1, count):
    if has_same_value(initializers[i], initializers[j]):
      same[j] = i

# Redirect every use of a duplicate to the initializer it duplicates.
for i in range(count):
  if same[i] >= 0:
    onnx_model.replace_input_of_all_nodes(initializers[i].name, initializers[same[i]].name)

# NOTE(review): presumably this drops the now-unreferenced initializers
# (the saved file shrinks to 209M) - confirm against OnnxModel.update_graph.
onnx_model.update_graph()

onnx_model.save_model_to_file(output_path)

In [40]:
# On-disk size of the de-duplicated ONNX model.
!du -sh {output_path}

209M	albert-xlarge-v2-squad-v2.onnx


### Inference using the converted model

In [41]:
import numpy as np
from onnxruntime import InferenceSession, SessionOptions, get_all_providers

def sigmoid(x):
  """Numerically stable logistic function ``1 / (1 + exp(-x))``.

  The direct formula overflows in ``exp(-x)`` for large negative ``x`` and
  emits a RuntimeWarning. Here only ``exp(-|x|)`` is evaluated, which lies in
  (0, 1], and the algebraically equivalent branch is selected per element.

  Args:
    x: a scalar or NumPy array.

  Returns:
    The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
    otherwise an array of the same shape.
  """
  z = np.exp(-np.abs(x))
  # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both equal sigmoid(x).
  result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
  # np.where yields a 0-d array for scalar input; [()] unwraps it to a scalar
  # and leaves real arrays unchanged.
  return result[()]

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
  """Open an ONNX Runtime session for ``model_path`` on a single execution provider.

  Args:
    model_path: path of the ONNX model file to load.
    provider: execution provider name, e.g. "CPUExecutionProvider".

  Returns:
    An InferenceSession restricted to ``provider`` with the intra-op thread
    count set to 1.

  Raises:
    AssertionError: if ``provider`` is not listed by ``get_all_providers()``.
  """
  known_providers = get_all_providers()
  assert provider in known_providers, f"provider {provider} not found, {known_providers}"

  # A few properties that might have an impact on performance (provided by MS)
  session_options = SessionOptions()
  session_options.intra_op_num_threads = 1

  return InferenceSession(model_path, session_options, providers=[provider])

In [42]:
# CPU-backed session over the de-duplicated ONNX model.
cpu_model = create_model_for_provider(
    model_path=output_path,
    provider="CPUExecutionProvider",
)

In [44]:
question = "what is google's specialization"
text = "Google LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, a search engine, cloud computing, software, and hardware."

# NumPy tensors, so they can be passed to the ONNX Runtime session as-is.
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="np")

# Feed the session by the input names chosen at export time.
feed = {name: inputs[name] for name in ("attention_mask", "input_ids", "token_type_ids")}
answer_start_scores, answer_end_scores = cpu_model.run(None, feed)

# argmax runs over the flattened scores; with the single example encoded here
# that index is the token position inside inputs["input_ids"][0].
start_index = np.argmax(answer_start_scores)
end_index = np.argmax(answer_end_scores)

# Slice the answer span (end inclusive) and turn the ids back into text.
ans_tokens = inputs["input_ids"][0][start_index : end_index + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)

print("\nQuestion : ", question)
print("\nAnswer : ", answer_tokens_to_string)


Question :  what is google's specialization

Answer :  internet-related services and products
