In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [None]:
!pip install onnx
!pip install onnxruntime
!pip install onnxruntime-gpu
!pip install torch-tensorrt

In [91]:
import os
from typing import Dict, List
import torch
import onnx
import numpy as np
import onnxruntime

class BenchmarkModelTensorRT():
    
    def __init__(self, trt_model_path):
        
        self._model = torch.jit.load(trt_model_path)
    
    def forward(self, inputs):
        
        return self._model(inputs)
    
class BenchmarkModelTensorONNX():
    
    def __init__(self, onnx_path, device: str):
        
        self._session = onnxruntime.InferenceSession(
            onnx_path,
            providers=['CUDAExecutionProvider'] if device == 'cuda' else ['CPUExecutionProvider'],
            verbose=True,
        )
        
        self._input_names = [x.name for x in self._session.get_inputs()]
        
        self._output_name = self._session.get_outputs()[0].name
        
        print(self._output_name)
        
    def forward(self, inputs):
        
        inputs = {k: v.cpu().numpy().astype(np.int64) for k, v in inputs.items() if k in self._input_names}
        
        return self._session.run([self._output_name], inputs)[0]
        
class BenchmarkModel(torch.nn.Module):
    
    def __init__(self, model, model_path: str):
        
        super().__init__()
        
        self._model = model
        self._model_path = model_path 
        
        os.makedirs(model_path, exist_ok=True)
        
    def forward(self, *args, **kwargs):
        
        return self._model.forward(*args, **kwargs)
    
    def export_to_onnx(
        self,
        dummy_input: Dict[str, torch.Tensor],
        dynamic_axis: Dict,
        device: str,
    ):
        
        self._model.eval()
        
        onnx_path = os.path.join(self._model_path, "onnx")

        torch.onnx.export(
            self._model,
            dummy_input,
            onnx_path,
            output_names=["output"],
            input_names=list(dummy_input.keys()),
            opset_version=12,
            dynamic_axes=dynamic_axis,
        )
        
        return BenchmarkModelTensorONNX(onnx_path, device=device)

    def convert_to_tensorrt(
        self,
        dummy_input,
        min_shape: List[int],
        max_shape: List[int],
        opt_shape: List[int],
        fp16_mode=False,
    ):
        
        self._model.eval()
        
        rt_model = trt.ts.convert(
            self._model,
            inputs=[dummy_input],
            fp16_mode=fp16_mode,
            min_shape=min_shape,
            opt_shape=opt_shape,
            max_shape=max_shape
        )
        
        trt_model_path = os.path.join(self._model_path, "tensorrt")
        
        torch.jit.save(rt_model, trt_model_path)
        
        return BenchmarkModelTensorRT(trt_model_path)
    
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class TextClassification(BenchmarkModel):
    
    def __init__(
        self,
        model_name: str = "nateraw/bert-base-uncased-imdb",
        **kwargs
    ):
        
        self._tokenizer = AutoTokenizer.from_pretrained(
           model_name
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            **kwargs
        )
        
        super().__init__(model, model_name.replace("/", "-"))
        
    def export_to_onnx(self, device: str):
        
        dummy_input = {
            "input_ids": torch.Tensor([[1, 2, 3]]).long(),
            "attention_mask": torch.Tensor([[0, 1, 0]]).long()
        }
        
        dynamic_axes = {
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
            'output': {0: 'batch_size'}
        }
        
        return super().export_to_onnx(
            dummy_input,
            dynamic_axes,
            device=device,
        )
            
    def forward_text(self, text: str):
        
        tokens = self._tokenizer(text, return_tensors="pt")
        
        return self.forward(**tokens)

In [92]:
model = TextClassification()

In [93]:
onnx_model = model.export_to_onnx("cuda")

output


2023-04-20 18:42:19.278158690 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-04-20 18:42:19.278174619 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [101]:
N = 1000


In [102]:
import time
s = time.time()

model.eval().cuda()

for i in range(N):
    tokens = {k: v.cuda() for k,v in model._tokenizer("This was such a good movie", return_tensors="pt").items()}
    inputs = model.forward(**tokens)
    
e = time.time()
e - s

7.181607484817505

In [103]:
import time
s = time.time()

for i in range(N):
    tokens = model._tokenizer("This was such a good movie", return_tensors="pt")
    inputs = onnx_model.forward(tokens)
    
e = time.time()
e - s

2.97001314163208

In [None]:
model._tokenizer("This was such a good movie", return_tensors="pt")["input_ids"].shape

In [15]:
model.forward_text("This was such a good movie")

SequenceClassifierOutput(loss=None, logits=tensor([[-2.2428, -1.3491,  3.8788]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
!pip install torch-tensorrt