https://huggingface.co/docs/transformers/serialization#exporting-a-model-with-transformersonnx

In [1]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
session = InferenceSession("./model.onnx")

# ONNX Runtime expects NumPy arrays as input
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="pt")

In [2]:
for output in session.get_outputs():
    print(output.name)

start_logits
end_logits


In [3]:
for input in session.get_inputs():
    print(input.name)

input_ids
attention_mask


In [4]:
inputs

{'input_ids': tensor([[  101,  2478,  4487, 16643, 23373,  2007,  2006, 26807,  2448,  7292,
           999,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
outputs = session.run(output_names=["start_logits","end_logits"], input_feed=inputs)
outputs

TypeError: run(): incompatible function arguments. The following argument types are supported:
    1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]

Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7efec266f4b0>, ['start_logits', 'end_logits'], {'input_ids': tensor([[  101,  2478,  4487, 16643, 23373,  2007,  2006, 26807,  2448,  7292,
           999,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, None

In [59]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

## Exporting a 🤗 Transformers model to ONNX with optimum.onnxruntime

Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmatically like so:

In [9]:
! pip install optimum[exporters]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting optimum[exporters]
  Obtaining dependency information for optimum[exporters] from https://files.pythonhosted.org/packages/f1/7f/ae62875dfa4f0ebfd0f8a7e2a0436b3c98530b812fbbcc120c8ccd33405f/optimum-1.12.0-py3-none-any.whl.metadata
  Downloading optimum-1.12.0-py3-none-any.whl.metadata (16 kB)
Collecting timm (from optimum[exporters])
  Obtaining dependency information for timm from https://files.pythonhosted.org/packages/35/57/84a71b5b540152fb5f5ec7a34f4fcd7fafa4e21053098146c831874e29e5/timm-0.9.6-py3-none-any.whl.metadata
  Downloading timm-0.9.6-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m

In [11]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased-distilled-squad"
save_directory = "onnx/"

# Load a model from transformers and export it to ONNX
ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Save the onnx model and tokenizer
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)lve/main/config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Framework not specified. Using pt to export to ONNX.


Downloading model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-distilled-squad and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Using framework PyTorch: 2.0.1
  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

In [14]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
session = InferenceSession("onnx/model.onnx")
# ONNX Runtime expects NumPy arrays as input
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
outputs = session.run(output_names=["logits"], input_feed=dict(inputs))

In [16]:
outputs

[array([[-0.05862346, -0.04041991]], dtype=float32)]

In [13]:
[output.name for output in session.get_outputs()]

['logits']