Convert the model to ONNX format for deployment

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fine-tuned classifier weights, read from the local "./output" directory.
model = AutoModelForSequenceClassification.from_pretrained("./output")

# Tokenizer of the base checkpoint.
# NOTE(review): the tokenizer comes from the hub while the weights come from
# "./output" -- presumably the fine-tuning run used this same tokenizer; confirm.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [3]:
# Put the model in evaluation (inference) mode so that training-only
# behaviour, such as the Dropout layers listed in the module tree below,
# is disabled before the model is exported.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Example input used to trace the model during export.
dummy_input = tokenizer("This movie is good.", return_tensors="pt")

torch.onnx.export(
    model,
    # Example inputs, passed positionally in the same order as `input_names`.
    (dummy_input["input_ids"], dummy_input["attention_mask"], dummy_input["token_type_ids"]),
    "../deploy/movie-review-sentiment-analysis.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["logits"],
    # The exporter log for this cell warns that it traces at opset 18 and then
    # down-converts to the requested version; keep 17 only if the deployment
    # runtime needs it, otherwise consider opset_version >= 18.
    opset_version=17,
    # Mark both the batch axis (0) and the sequence axis (1) of every input as
    # dynamic so the exported graph is not frozen to the shape of `dummy_input`.
    # Previously only axis 1 was declared on the inputs, which left the batch
    # dimension fixed at 1 in the exported graph.
    # NOTE(review): the example batch has size 1 -- after exporting, confirm that
    # the printed graph inputs show a symbolic batch dimension rather than a
    # literal 1; if not, trace with a batch of two or more sentences.
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
        "token_type_ids": {0: "batch", 1: "sequence"},
        "logits": {0: "batch"},
    },
    verbose=True,
)

  torch.onnx.export(
W1125 19:55:13.975000 19048 Lib\site-packages\torch\onnx\_internal\exporter\_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 17 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


[torch.onnx] Obtain model graph for `BertForSequenceClassification([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `BertForSequenceClassification([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...


The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 17).


[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 17 of general pattern rewrite rules.


ONNXProgram(
    model=
        <
            ir_version=10,
            opset_imports={'': 17},
            producer_name='pytorch',
            producer_version='2.9.0+cu130',
            domain=None,
            model_version=None,
        >
        graph(
            name=main_graph,
            inputs=(
                %"input_ids"<INT64,[1,s41]>,
                %"attention_mask"<INT64,[1,s41]>,
                %"token_type_ids"<INT64,[1,s41]>
            ),
            outputs=(
                %"logits"<FLOAT,[1,2]>
            ),
            initializers=(
                %"bert.embeddings.LayerNorm.weight"<FLOAT,[768]>{TorchTensor(...)},
                %"bert.embeddings.LayerNorm.bias"<FLOAT,[768]>{TorchTensor(...)},
                %"bert.encoder.layer.0.attention.self.query.bias"<FLOAT,[768]>{TorchTensor(...)},
                %"bert.encoder.layer.0.attention.self.key.bias"<FLOAT,[768]>{TorchTensor(...)},
                %"bert.encoder.layer.0.attention.self.value.bias"<FL