## ONNX

<img width="758" alt="image" src="https://user-images.githubusercontent.com/57203764/197336149-82d68617-8fff-4216-b457-6f828a861bd7.png">

학습된 model을 ONNX 모델로 만들어준 후, inference 할 때 사용자가 원하는 framework를 사용할 수 있도록 만들어주는 과정을 model packaging 이라고 한다.

## ONNX packaging

PyTorch 버전과 PyTorch-lightning 버전이 있다. (나는 PyTorch 버전을 더 많이 사용하지 않을까 싶다!)

필요한 것은 
- Name of the onnx model
- Input sample
- Input names (초기 input 뿐만 아니라 각 layer의 input들까지 이름을 지정해 줄 수 있다. layer input의 개수보다 적을 시에는 남는 것은 자동으로 이름 붙여진다.)
- Output names (output의 이름)
- Dynamic axes (각 input들에서 변하도록 설계된 axes를 표시해 주는 것. 일반적으로는 batch인 0번 axis 혹은 RNN의 sequence length 정도가 되겠다.)


In [1]:
# !python -m pip install pytorch_lightning torch transformers datasets sklearn torchmetrics wandb matplotlib
import torch
from torch import nn
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback, ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers.wandb import WandbLogger
import torchmetrics
from torchmetrics import ConfusionMatrix
from sklearn.metrics import confusion_matrix, accuracy_score

from datasets import load_dataset
import wandb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DataModule(pl.LightningDataModule):
    def __init__(self, model_name="google/bert_uncased_L-2_H-128_A-2", batch_size=32):
        super().__init__()

        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(model_name) # Transformer (BERT) model

    def prepare_data(self):
        cola_dataset = load_dataset("glue", "cola")
        self.train_data = cola_dataset["train"]
        self.val_data = cola_dataset["validation"]

    def tokenize_data(self, example):
        # processing the data
        return self.tokenizer(
            example["sentence"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )

    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            self.train_data = self.train_data.map(self.tokenize_data, batched=True)
            self.train_data.set_format(
                type="torch", columns=["input_ids", "attention_mask", "label"]
            )

            self.val_data = self.val_data.map(self.tokenize_data, batched=True)
            self.val_data.set_format(
                type="torch", columns=["input_ids", "attention_mask", "label"],
                output_all_columns=True,
            )

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_data, batch_size=self.batch_size, shuffle=True
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_data, batch_size=self.batch_size, shuffle=False
        )

In [3]:
class ColaModel(pl.LightningModule):
    def __init__(self, model_name="google/bert_uncased_L-2_H-128_A-2", lr=1e-2):
        super(ColaModel, self).__init__()
        self.save_hyperparameters()

        self.bert = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.bert.config.hidden_size, 2)
        self.num_classes = 2

        self.train_accuracy_metric = torchmetrics.Accuracy()
        self.val_accuracy_metric = torchmetrics.Accuracy()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        h_cls = outputs.last_hidden_state[:, 0]
        logits = self.W(h_cls)
        return logits

    def training_step(self, batch, batch_idx):
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        loss = F.cross_entropy(logits, batch["label"])

        preds = torch.argmax(logits, 1)
        train_acc = self.train_accuracy_metric(preds, batch["label"])
        self.log("train/loss", loss.detach().cpu(), prog_bar=True, on_epoch=True)
        self.log("train/accuracy", train_acc.detach().cpu(), prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        loss = F.cross_entropy(logits, batch["label"])
        _, preds = torch.max(logits, dim=1)
        val_acc = self.val_accuracy_metric(preds, batch["label"])
        self.log("val/loss", loss.detach().cpu(), prog_bar=True, on_epoch=True)
        self.log("val/acc", val_acc.detach().cpu(), prog_bar=True, on_epoch=True)
        return {"labels": batch["label"], "logits": logits}

    def validation_epoch_end(self, outputs):
        labels = torch.cat([x["labels"] for x in outputs])
        logits = torch.cat([x["logits"] for x in outputs])
        preds = torch.argmax(logits, 1)

        labels = labels.detach().cpu().numpy()
        preds = preds.detach().cpu().numpy()
        data = confusion_matrix(labels, preds)
        df_cm = pd.DataFrame(data, columns=np.unique(labels), index=np.unique(labels))
        df_cm.index.name = "Actual"
        df_cm.columns.name = "Predicted"
        plt.figure(figsize=(7, 4))
        plot = sns.heatmap(
            df_cm, cmap="Blues", annot=True, annot_kws={"size": 16}
        )  # font size
        self.logger.experiment.log({"Confusion Matrix": wandb.Image(plot)})

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

In [7]:
model_path = f"models/best-checkpoint.ckpt"
cola_model = ColaModel.load_from_checkpoint(model_path)
data_model = DataModule()
data_model.prepare_data()
data_model.setup()

input_batch = next(iter(data_model.train_dataloader()))
input_sample = {
    "input_ids": input_batch["input_ids"][0].unsqueeze(0),
    "attention_mask": input_batch["attention_mask"][0].unsqueeze(0),
}

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset glue (/home/jaekyungcho/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70

In [12]:
# When using PyTorch
torch.onnx.export(
    cola_model,  # model being run
    (
        input_sample["input_ids"],
        input_sample["attention_mask"],
    ),  # model input (or a tuple for multiple inputs)
    "models/model.onnx",  # where to save the model
    export_params=True,
    opset_version=10,
    verbose=True,
    input_names=["input_ids", "attention_mask"],  # the model's input names
    output_names=["output"],  # the model's output names
    dynamic_axes={            # variable length axes
        "input_ids": {0: "batch_size"},
        "attention_mask": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
)

# When using PyTorch-lightning
cola_model.to_onnx(
  "models/model.onnx",             # where to save the model
  input_sample,             # input samples with atleast batch size as 1
  export_params=True,
  opset_version=10,
  input_names = ["input_ids", "attention_mask"],    # Input names
  output_names = ['output'],  # Output names
  dynamic_axes={            # variable length axes
        "input_ids": {0: "batch_size"},
        "attention_mask": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
)

Exported graph: graph(%input_ids : Long(*, 256, strides=[256, 1], requires_grad=0, device=cpu),
      %attention_mask : Long(*, 256, strides=[256, 1], requires_grad=0, device=cpu),
      %bert.embeddings.position_ids : Long(1, 512, strides=[512, 1], requires_grad=0, device=cpu),
      %bert.embeddings.word_embeddings.weight : Float(30522, 128, strides=[128, 1], requires_grad=1, device=cpu),
      %bert.embeddings.position_embeddings.weight : Float(512, 128, strides=[128, 1], requires_grad=1, device=cpu),
      %bert.embeddings.token_type_embeddings.weight : Float(2, 128, strides=[128, 1], requires_grad=1, device=cpu),
      %bert.embeddings.LayerNorm.weight : Float(128, strides=[1], requires_grad=1, device=cpu),
      %bert.embeddings.LayerNorm.bias : Float(128, strides=[1], requires_grad=1, device=cpu),
      %bert.encoder.layer.0.attention.self.query.bias : Float(128, strides=[1], requires_grad=1, device=cpu),
      %bert.encoder.layer.0.attention.self.key.bias : Float(128, strides=[

## ONNX runtime

ONNX runtime은 ONNX 모델의 inference engine 이다. Cuda 버전에 맞는 버전을 찾아 설치를 해주자. ([Cuda-ONNXruntime version table](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements:~:text=Install%20ORT.-,Requirements,-Please%20reference%20table))
```bash
python -m pip install onnxruntime-gpu==<version>
```
ONNX runtime은 서로 다른 OS와 HW(accelerator)에서 간편히 동작할 수 있도록 만들어졌다. 
가능한 모든 HW와 현재 플랫폼에서 사용 가능한 HW는 아래와 같이 확인 가능하다. 안타깝게도 연구실 서버는 Cuda 9.1이라서 onnxruntime-gpu를 지원하지는 않는 것 같다...ㅠㅠ
```python
from onnxruntime import  get_all_providers, get_available_providers
print(get_all_providers())
print(get_available_providers())
```



In [27]:
# !python -m pip install onnxruntime, onnxruntime-gpu
from onnxruntime import  get_all_providers, get_available_providers
print(get_all_providers())
print(get_available_providers())

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'OpenVINOExecutionProvider', 'DnnlExecutionProvider', 'NupharExecutionProvider', 'TvmExecutionProvider', 'VitisAIExecutionProvider', 'NnapiExecutionProvider', 'CoreMLExecutionProvider', 'ArmNNExecutionProvider', 'ACLExecutionProvider', 'DmlExecutionProvider', 'RknpuExecutionProvider', 'XnnpackExecutionProvider', 'CPUExecutionProvider']
['CPUExecutionProvider']


onnxruntime을 이용해 모델 inference를 해본다.


In [34]:
import onnxruntime as ort
onnx_model_path = 'models/model.onnx'
ort_session = ort.InferenceSession(onnx_model_path)
ort_inputs = {
    "input_ids": input_sample["input_ids"].numpy(),
    "attention_mask": input_sample["attention_mask"].numpy(),
}
output_name = None
ort_output = ort_session.run(output_name, ort_inputs)

[array([[-0.3588007,  0.3415024]], dtype=float32)]


## Inference time test

일반적으로 2~3배 이상 속도 차이가 발생한다. 근데 구글에선 아무도 왜 빠른지에 의문을 품지는 않았다...(그냥 최적화를 잘 한걸지도...??) 사실 Pytorch는 꽤나 불필요한 연산들이 많이 진행될 것이다. gradient도 계산한 것을 전부 저장하고 있고 그러니까. 그런거 전부 날리면 3배 정도는 빠르게 할 수 있다는 것일테다.

In [37]:
from time import time

ort_time = time()
ort_output = ort_session.run(output_name, ort_inputs)
print("ONNX inference time:", time()-ort_time, "sec")

pt_time = time()
with torch.no_grad():
    pt_output = cola_model(**input_sample)
print("PyTorch inference time:", time()-pt_time, "sec")

ONNX inference time: 0.004585742950439453 sec
PyTorch inference time: 0.015786170959472656 sec
