In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [2]:
!pip install onnx
!pip install onnxruntime
!pip install onnxruntime-gpu
!pip install tensorrt
!pip install pycuda

Collecting onnx
  Downloading onnx-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting protobuf<4,>=3.20.2
  Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: protobuf, onnx
Successfully installed onnx-1.13.1 protobuf-3.20.3
[0mCollecting onnxruntime
  Downloading onnxruntime-1.14.1-cp310-cp310-manylinux_2_27_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting flatbuffers
  Downloading flatbuffers-23.3.3-py2.py3-none-any.whl (26 kB)
Collecting coloredlogs
  Downloading coloredlogs-15.

In [2]:
import onnx_tensorrt.backend as backend

In [12]:
import os
from typing import Dict, List
import torch
import onnx
import numpy as np
import onnxruntime
# import torch_tensorrt as trt

class BenchmarkModelTracedJIT():
    
    def __init__(self, model_path, device: str):
        
        self._model = torch.jit.load(model_path).to(device)
    
    def forward(self, *inputs):
        
        return self._model(*inputs)

class BenchmarkModelTensorRT():
    
    def __init__(self, trt_model_path):
        
        self._model = torch.jit.load(trt_model_path)
    
    def forward(self, inputs):
        
        return self._model(inputs)
    
class BenchmarkModelTensorONNX():
    
    def __init__(self, onnx_path, device: str, tensorrt: bool = False):
        
        self._onnx_path = onnx_path
        
        self._session = onnxruntime.InferenceSession(
            onnx_path,
            providers=['CUDAExecutionProvider'] if device == 'cuda' else ['CPUExecutionProvider'],
            verbose=True,
        )
        
        self._input_names = [x.name for x in self._session.get_inputs()]
        
        self._output_name = self._session.get_outputs()[0].name
        
    def forward(self, inputs):
        
        inputs = {k: v.cpu().numpy().astype(np.int64) for k, v in inputs.items() if k in self._input_names}
        
        return self._session.run([self._output_name], inputs)[0]
        
class BenchmarkModel(torch.nn.Module):
    
    def __init__(self, model, model_path: str):
        
        super().__init__()
        
        self._model = model
        self._model_path = model_path 
        
        os.makedirs(model_path, exist_ok=True)
        
    def forward(self, *args, **kwargs):
        
        return self._model.forward(*args, **kwargs)
    
    def export_to_onnx(
        self,
        dummy_input: Dict[str, torch.Tensor],
        dynamic_axis: Dict,
        device: str,
    ):
        
        self._model.eval()
        
        onnx_path = os.path.join(self._model_path, "onnx")

        torch.onnx.export(
            self._model,
            dummy_input,
            onnx_path,
            output_names=["output"],
            input_names=list(dummy_input.keys()),
            opset_version=12,
            dynamic_axes=dynamic_axis,
        )
        
        return BenchmarkModelTensorONNX(onnx_path, device=device)
    
    def export_to_traced_jit(
        self,
        *args,
        device: str = "cpu",
    ):
        
        module = torch.jit.trace(self._model, args)
        
        path = os.path.join(self._model_path, "traced")
        
        torch.jit.save(module, path)
        
        return BenchmarkModelTracedJIT(path, device)

    def export_to_tensorrt(
        self,
        dummy_input,
        min_shape: List[int],
        max_shape: List[int],
        opt_shape: List[int],
        fp16_mode=False,
    ):
        
        self._model.eval()

        traced = torch.jit.trace(self._model, dummy_input)
        
        inputs = [
            trt.Input(
                min_shape=min_shape,
                opt_shape=opt_shape,
                max_shape=max_shape,
                dtype=torch.half if fp16_mode else torch.float,
            )
            for min_shape, opt_shape, max_shape in zip(min_shapes, opt_shapes, max_shapes)
        ]
        
        trt_ts_module = trt.compile(traced,
            # If the inputs to the module are plain Tensors, specify them via the `inputs` argument:
            inputs = [dummy_input, # Provide example tensor for input shape or...
                trt.Input( # Specify input object with shape and dtype
                    min_shape=min_shape,
                    opt_shape=opt_shape,
                    max_shape=max_shape,
                    # For static size shape=[1, 3, 224, 224]
                    dtype=torch.half) # Datatype of input tensor. Allowed options torch.(float|half|int8|int32|bool)
            ],

            # For inputs containing tuples or lists of tensors, use the `input_signature` argument:
            # Below, we have an input consisting of a Tuple of two Tensors (Tuple[Tensor, Tensor])
            # input_signature = ( (torch_tensorrt.Input(shape=[1, 3, 224, 224], dtype=torch.half),
            #                      torch_tensorrt.Input(shape=[1, 3, 224, 224], dtype=torch.half)), ),

            enabled_precisions = {torch.half}, # Run with FP16
        )

        trt_model_path = os.path.join(self._model_path, "tensorrt")
        
        torch.jit.save(rt_model, trt_model_path)
        
        return BenchmarkModelTensorRT(trt_model_path)
    
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class TextClassification(BenchmarkModel):
    
    def __init__(
        self,
        model_name: str = "nateraw/bert-base-uncased-imdb",
        **kwargs
    ):
        
        self._tokenizer = AutoTokenizer.from_pretrained(
           model_name
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            **kwargs
        )
        
        super().__init__(model, model_name.replace("/", "-"))
        
    def export_to_onnx(self, device: str,  dynamic_axis: bool = True):
        
        dummy_input = {
            "input_ids": torch.Tensor([[1, 2, 3]]).long(),
            "attention_mask": torch.Tensor([[0, 1, 0]]).long()
        }
        
        if dynamic_axis:
            dynamic_axis = {
                'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
                'output': {0: 'batch_size'}
            }
        else:
            dynamic_axis = None
            
        print(dynamic_axis)
        
        return super().export_to_onnx(
            dummy_input,
            dynamic_axis,
            device=device,
        )
    
    def export_to_tensorrt(
        self,
        max_batch_size: int = 100,
        fp16_mode: bool = False,
    ):
        
        dummy_input = [
            torch.Tensor([[1, 2, 3]]).long(), # input_ids
            torch.Tensor([[0, 1, 0]]).long()  # attention_mask
        ]
        
        min_shape = [
            [1, 1], [1, 1]
        ]
        
        max_shape = [
            [max_batch_size, self._tokenizer.model_max_length],
            [max_batch_size, self._tokenizer.model_max_length]
        ]
        
        opt_shape = [
            [1, int(self._tokenizer.model_max_length / 2)],
            [1, int(self._tokenizer.model_max_length / 2)]
        ]
        
        return super().export_to_tensorrt(
            dummy_input=dummy_input,
            min_shape=min_shape,
            opt_shape=opt_shape,
            max_shape=max_shape,
            fp16_mode=fp16_mode,
        )
    
    def export_to_traced_jit(self, device: str):
        
        dummy_input = {
            "input_ids": torch.Tensor([[1, 2, 3]]).long(),
            "attention_mask": torch.Tensor([[0, 1, 0]]).long()
        }
        
        return super().export_to_traced_jit(
            torch.Tensor([[1, 2, 3]]).long(),
            torch.Tensor([[0, 1, 0]]).long(),
            device=device,
        )
            
    def forward_text(self, text: str):
        
        tokens = self._tokenizer(text, return_tensors="pt")
        
        return self.forward(**tokens)

In [13]:
model = TextClassification(torchscript=True)

In [240]:
rt_model = model.export_to_tensorrt()

KeyError: "Input specs should be either torch_tensorrt.Input or torch.Tensor, found types: [<class 'list'>, <class 'torch_tensorrt._Input.Input'>]"

In [6]:
import onnx
import tensorrt

def onnx_to_tensorrt(onnx_model_path, trt_model_path, max_batch_size=1, fp16_mode=False):
    # Create a logger to display TensorRT conversion messages
    TRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)

    # Load the ONNX model
    onnx_model = onnx.load(onnx_model_path)

    # Check the ONNX model for any issues
    onnx.checker.check_model(onnx_model)

    # Create a TensorRT builder, network, and parser
    builder = tensorrt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = tensorrt.OnnxParser(network, TRT_LOGGER)

    # Parse the ONNX model and populate the TensorRT network
    if not parser.parse(onnx_model.SerializeToString()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))

    # Set the builder configurations
    # builder.max_workspace_size = 1 << 30
    builder.max_batch_size = max_batch_size

    if fp16_mode and builder.platform_has_fast_fp16:
        builder.fp16_mode = True

    # Build the TensorRT engine
    engine = builder.build_engine(network)

    # Save the TensorRT engine to a file
    with open(trt_model_path, "wb") as f:
        f.write(engine.serialize())

    return trt_model_path


In [38]:
import tensorrt as trt
import common
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file, batch_size):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(batch_size)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    config.max_workspace_size = 1 << 30
    # Load the Onnx model and parse it in order to populate the TensorRT network.
    with open(model_file, "rb") as model:
        if not parser.parse(model.read()):
            print("ERROR: Failed to parse the ONNX file.")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    return builder.build_serialized_network(network, config)

In [34]:
onnx_model = model.export_to_onnx("cuda")

verbose: False, log level: Level.ERROR



2023-04-23 13:46:29.245837833 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-04-23 13:46:29.245883912 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [35]:
onnx_model._onnx_path

'nateraw-bert-base-uncased-imdb/onnx'

In [45]:
import onnx
import onnx_tensorrt.backend as backend
import numpy as np

onnx_model = onnx.load("nateraw-bert-base-uncased-imdb/onnx")
engine = backend.prepare(onnx_model, device='CUDA:0', verbose=True)



Running torch_jit...[04/23/2023-13:53:06] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading

[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::BatchedNMSDynamic_TRT version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::BatchedNMS_TRT version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::BatchTilePlugin_TRT version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::Clip_TRT version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::CoordConvAC version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::CropAndResizeDynamic version 1
[04/23/2023-13:53:06] [TRT] [V] Plugin creator already registered - ::CropAndResize version 1
[04/23/2023-13:53

In [None]:
input_data = np.random.random(size=(32, 3, 224, 224)).astype(np.float32)

In [24]:
tokens = {k: v.numpy() for k,v in model._tokenizer("This was such a good movie", return_tensors="pt").items()}
del tokens["token_type_ids"]

In [44]:
engine.network.num_inputs

2

In [31]:
list(tokens.values())

[array([[ 101, 2023, 2001, 2107, 1037, 2204, 3185,  102]]),
 array([[1, 1, 1, 1, 1, 1, 1, 1]])]

In [38]:
output_data = engine.run(list(tokens.values()))[0]
print(output_data)
print(output_data.shape)

[04/23/2023-13:47:00] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


ValueError: Not enough inputs. Expected 4, got 2.

In [7]:
onnx_model = model.export_to_onnx("cuda")

verbose: False, log level: Level.ERROR



2023-04-23 13:01:18.608276608 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-04-23 13:01:18.608299188 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [39]:
engine = build_engine_onnx(
    onnx_model._onnx_path,
    batch_size=1,
)

  config.max_workspace_size = 1 << 30


[04/23/2023-13:20:10] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[04/23/2023-13:20:10] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[04/23/2023-13:20:10] [TRT] [E] 4: [network.cpp::validate::3047] Error Code 4: Internal Error (Network has dynamic or shape inputs, but no optimization profile has been defined.)


In [60]:
onnx_model._onnx_path

'nateraw-bert-base-uncased-imdb/onnx'

In [8]:
onnx_to_tensorrt(
    onnx_model._onnx_path,
    "tensor_rt",
)

[04/23/2023-13:01:34] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[04/23/2023-13:01:34] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.


  builder.max_batch_size = max_batch_size


TypeError: build_engine(): incompatible function arguments. The following argument types are supported:
    1. (self: tensorrt.tensorrt.Builder, network: tensorrt.tensorrt.INetworkDefinition, config: tensorrt.tensorrt.IBuilderConfig) -> tensorrt.tensorrt.ICudaEngine

Invoked with: <tensorrt.tensorrt.Builder object at 0x7ff052706b70>, <tensorrt.tensorrt.INetworkDefinition object at 0x7ff0599ff330>

'nateraw-bert-base-uncased-imdb/onnx'

512

2023-04-20 19:11:58.276919577 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-04-20 19:11:58.276936137 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [173]:
onnx_tradced = model.export_to_traced_jit("cuda")

In [174]:
import time
s = time.time()

model.eval().cuda()

for i in range(N):
    tokens = {k: v.cuda() for k,v in model._tokenizer("This was such a good movie", return_tensors="pt").items()}
    inputs = onnx_tradced.forward(tokens["input_ids"], tokens["attention_mask"])
    
e = time.time()
e - s

4.80880069732666

In [101]:
N = 1000


In [175]:
import time
s = time.time()

model.eval().cuda()

for i in range(N):
    tokens = {k: v.cuda() for k,v in model._tokenizer("This was such a good movie", return_tensors="pt").items()}
    inputs = model.forward(**tokens)
    
e = time.time()
e - s

6.914243698120117

In [177]:
import time
s = time.time()

for i in range(N):
    tokens = model._tokenizer("This was such a good movie", return_tensors="pt")
    inputs = onnx_model.forward(tokens)
    
e = time.time()
e - s

2.9544553756713867

In [None]:
model._tokenizer("This was such a good movie", return_tensors="pt")["input_ids"].shape

In [15]:
model.forward_text("This was such a good movie")

SequenceClassifierOutput(loss=None, logits=tensor([[-2.2428, -1.3491,  3.8788]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [43]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting appdirs>=1.4.0
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (pyproject.toml

In [38]:
# SPDX-License-Identifier: Apache-2.0

import tensorrt as trt
import pycuda.driver
import pycuda.gpuarray
import pycuda.autoinit
import numpy as np
from six import string_types

class Binding(object):
    def __init__(self, engine, idx_or_name):
        if isinstance(idx_or_name, string_types):
            self.name = idx_or_name
            self.index  = engine.get_binding_index(self.name)
            if self.index == -1:
                raise IndexError("Binding name not found: %s" % self.name)
        else:
            self.index = idx_or_name
            self.name  = engine.get_binding_name(self.index)
            if self.name is None:
                raise IndexError("Binding index out of range: %i" % self.index)
        self.is_input = engine.binding_is_input(self.index)


        dtype = engine.get_binding_dtype(self.index)
        dtype_map = {trt.DataType.FLOAT: np.float32,
                        trt.DataType.HALF:  np.float16,
                        trt.DataType.INT8:  np.int8,
                        trt.DataType.BOOL: np.bool}
        if hasattr(trt.DataType, 'INT32'):
            dtype_map[trt.DataType.INT32] = np.int32

        self.dtype = dtype_map[dtype]
        shape = engine.get_binding_shape(self.index)

        self.shape = tuple(shape)
        # Must allocate a buffer of size 1 for empty inputs / outputs
        if 0 in self.shape:
            self.empty = True
            # Save original shape to reshape output binding when execution is done
            self.empty_shape = self.shape
            self.shape = tuple([1])
        else:
            self.empty = False
        self._host_buf   = None
        self._device_buf = None
    @property
    def host_buffer(self):
        if self._host_buf is None:
            self._host_buf = pycuda.driver.pagelocked_empty(self.shape, self.dtype)
        return self._host_buf
    @property
    def device_buffer(self):
        if self._device_buf is None:
            self._device_buf = pycuda.gpuarray.empty(self.shape, self.dtype)
        return self._device_buf
    def get_async(self, stream):
        src = self.device_buffer
        dst = self.host_buffer
        src.get_async(stream, dst)
        return dst

def squeeze_hw(x):
    if x.shape[-2:] == (1, 1):
        x = x.reshape(x.shape[:-2])
    elif x.shape[-1] == 1:
        x = x.reshape(x.shape[:-1])
    return x

def check_input_validity(input_idx, input_array, input_binding):
    # Check shape
    trt_shape = tuple(input_binding.shape)
    onnx_shape    = tuple(input_array.shape)

    if onnx_shape != trt_shape:
        if not (trt_shape == (1,) and onnx_shape == ()) :
            raise ValueError("Wrong shape for input %i. Expected %s, got %s." %
                            (input_idx, trt_shape, onnx_shape))

    # Check dtype
    if input_array.dtype != input_binding.dtype:
        #TRT does not support INT64, need to convert to INT32
        if input_array.dtype == np.int64 and input_binding.dtype == np.int32:
            casted_input_array = np.array(input_array, copy=True, dtype=np.int32)
            if np.equal(input_array, casted_input_array).all():
                input_array = casted_input_array
            else:
                raise TypeError("Wrong dtype for input %i. Expected %s, got %s. Cannot safely cast." %
                            (input_idx, input_binding.dtype, input_array.dtype))
        else:
            raise TypeError("Wrong dtype for input %i. Expected %s, got %s." %
                            (input_idx, input_binding.dtype, input_array.dtype))
    return input_array


class Engine(object):
    def __init__(self, trt_engine):
        self.engine = trt_engine
        nbinding = self.engine.num_bindings

        bindings = [Binding(self.engine, i)
                    for i in range(nbinding)]
        self.binding_addrs = [b.device_buffer.ptr for b in bindings]
        self.inputs  = [b for b in bindings if     b.is_input]
        self.outputs = [b for b in bindings if not b.is_input]

        for binding in self.inputs + self.outputs:
            _ = binding.device_buffer # Force buffer allocation
        for binding in self.outputs:
            _ = binding.host_buffer   # Force buffer allocation
        self.context = self.engine.create_execution_context()
        self.stream = pycuda.driver.Stream()

    def __del__(self):
        if self.engine is not None:
            del self.engine

    def run(self, inputs):
        # len(inputs) > len(self.inputs) with Shape operator, input is never used
        # len(inputs) == len(self.inputs) for other operators
        if len(inputs) < len(self.inputs):
            raise ValueError("Not enough inputs. Expected %i, got %i." %
                             (len(self.inputs), len(inputs)))
        if isinstance(inputs, dict):
            inputs = [inputs[b.name] for b in self.inputs]


        for i, (input_array, input_binding) in enumerate(zip(inputs, self.inputs)):
            input_array = check_input_validity(i, input_array, input_binding)
            input_binding_array = input_binding.device_buffer
            input_binding_array.set_async(input_array, self.stream)

        self.context.execute_async_v2(
            self.binding_addrs, self.stream.handle)

        results = [output.get_async(self.stream)
                   for output in self.outputs]

        # For any empty bindings, update the result shape to the expected empty shape
        for i, (output_array, output_binding) in enumerate(zip(results, self.outputs)):
            if output_binding.empty:
                results[i] = np.empty(shape=output_binding.empty_shape, dtype=output_binding.dtype)

        self.stream.synchronize()
        return results

    def run_no_dma(self, batch_size):
        self.context.execute_async(
            batch_size, self.binding_addrs, self.stream.handle
        )

In [65]:
import pycuda.driver as cuda

In [61]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class TensorRTBackendRep():
    
    def __init__(
        self,
        model,
        device,
        max_workspace_size=None,
        serialize_engine=False,
        verbose=False,
        **kwargs
    ):
        
        self._logger = TRT_LOGGER
        
        self.builder = trt.Builder(self._logger)
        
        self.config = self.builder.create_builder_config()
        
        self.network = self.builder.create_network(
            flags=1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH),
            
        )
        self.parser = trt.OnnxParser(self.network, self._logger)
        
        self.serialize_engine = serialize_engine
        
        self.verbose = verbose
        
        if self.verbose:
            print(f'\nRunning {model.graph.name}...')
            TRT_LOGGER.min_severity = trt.Logger.VERBOSE

        model_str = model.SerializeToString()
        
        if not self.parser.parse(model_str):
            error = self.parser.get_error(0)
            msg = "While parsing node number %i:\n" % error.node()
            msg += ("%s:%i In function %s:\n[%i] %s" %
                    (error.file(), error.line(), error.func(),
                     error.code(), error.desc()))
            raise RuntimeError(msg)
            
        self.config.max_workspace_size = (
            max_workspace_size if max_workspace_size is not None else (1 << 28)
        )

        self.num_inputs = self.network.num_inputs
        
        if self.verbose:
            for layer in self.network:
                print(layer)

            print(f'Output shape: {self.network[-1].get_output(0).shape}')
        
        self._output_shapes = {}
        self._output_dtype = {}
        
        for output in model.graph.output:
            dims = output.type.tensor_type.shape.dim
            output_shape = tuple([dim.dim_value for dim in dims])
            self._output_shapes[output.name] = output_shape
            self._output_dtype[output.name] = output.type.tensor_type.elem_type

    def build_engine(
        self,
        min_shapes,
        max_shapes,
        opt_shapes,
        names
    ):
        """
        Builds a TensorRT engine with a builder config.
        :param inputs: inputs to the model; if not None, this means we are building the engine at run time,
                       because we need to register optimization profiles for some inputs
        :type inputs: List of np.ndarray
        """
        
        opt_profile = self.builder.create_optimization_profile()
        
        profile = self.builder.create_optimization_profile()
        
        for i, (min_shape, max_shape, opt_shape, name) in enumerate(zip(min_shapes, max_shapes, opt_shapes, names)):
            print(i, (min_shape, max_shape, opt_shape))
            profile.set_shape(
                name,
                min=min_shape,
                opt=opt_shape,
                max=max_shape,
            )
        
        self.config.add_optimization_profile(profile)

        trt_engine = self.builder.build_engine(self.network, self.config)

        if trt_engine is None:
            raise RuntimeError("Failed to build TensorRT engine from network")
        if self.serialize_engine:
            trt_engine = self._serialize_deserialize(trt_engine)
        self.engine = Engine(trt_engine)
        
        return self.engine

In [71]:
path = "nateraw-bert-base-uncased-imdb/onnx"
onnx_model = onnx.load("nateraw-bert-base-uncased-imdb/onnx")

In [49]:
onnx_model = model.export_to_onnx("cuda", dynamic_axis=True)

{'input_ids': {0: 'batch_size', 1: 'sequence_length'}, 'attention_mask': {0: 'batch_size', 1: 'sequence_length'}, 'output': {0: 'batch_size'}}
verbose: False, log level: Level.ERROR



2023-04-23 18:23:38.256377186 [W:onnxruntime:, session_state.cc:1136 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-04-23 18:23:38.256395355 [W:onnxruntime:, session_state.cc:1138 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [50]:
import onnx
onnx_model = onnx.load("nateraw-bert-base-uncased-imdb/onnx")

In [73]:
onnx_model

In [62]:
dd = TensorRTBackendRep(
    model=onnx_model,
    device='CUDA:0',
    verbose=True,
)


Running torch_jit...
[04/23/2023-18:27:47] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::BatchedNMSDynamic_TRT version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::BatchedNMS_TRT version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::BatchTilePlugin_TRT version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::Clip_TRT version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::CoordConvAC version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::CropAndResizeDynamic version 1
[04/23/2023-18:27:47] [TRT] [V] Plugin creator already registered - ::CropAndResize version 1
[04/23/2023-18:27

<tensorrt.tensorrt.ILayer object at 0x7fc400b24cb0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b168f0>
<tensorrt.tensorrt.ILayer object at 0x7fc40d8097f0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b24af0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b275f0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b26130>
<tensorrt.tensorrt.ILayer object at 0x7fc400b27670>
<tensorrt.tensorrt.ILayer object at 0x7fc400b59df0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b5b0f0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b5a630>
<tensorrt.tensorrt.ILayer object at 0x7fc400b59bb0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b59d70>
<tensorrt.tensorrt.ILayer object at 0x7fc400b83070>
<tensorrt.tensorrt.ILayer object at 0x7fc400b839f0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b832b0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b83130>
<tensorrt.tensorrt.ILayer object at 0x7fc400b831b0>
<tensorrt.tensorrt.ILayer object at 0x7fc400b83030>
<tensorrt.tensorrt.ILayer object at 0x7fc400b83270>
<tensorrt.te

  self.config.max_workspace_size = (


In [63]:
engine = dd.build_engine(
    min_shapes=[[1, 1], [1, 1]],
    opt_shapes=[[1, 100], [1, 100]],
    max_shapes=[[1, 512], [1, 512]],
    names=["input_ids", "attention_mask"]
)


0 ([1, 1], [1, 512], [1, 100])
1 ([1, 1], [1, 512], [1, 100])
[04/23/2023-18:28:03] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[04/23/2023-18:28:03] [TRT] [V] Original: 1252 layers
[04/23/2023-18:28:03] [TRT] [V] After dead-layer removal: 1252 layers
[04/23/2023-18:28:03] [TRT] [V] Graph construction completed in 0.0176978 seconds.
[04/23/2023-18:28:03] [TRT] [V] Running: ConstShuffleFusion on /bert/Constant_8_output_0
[04/23/2023-18:28:03] [TRT] [V] ConstShuffleFusion: Fusing /bert/Constant_8_output_0 with (Unnamed Layer* 60) [Shuffle]
[04/23/2023-18:28:03] [TRT] [V] Running: ConstShuffleFusion on /bert/Constant_9_output_0
[04/23/2023-18:28:03] [TRT] [V] ConstShuffleFusion: Fusing /bert/Constant_9_output_0 with (Unnamed Layer* 63) [Shuffle]
[04/23/2023-18:28:03] [TRT] [V] Running: ConstShuffleFusion on /bert/embeddings/LayerNorm/Constant_output_0
[04/23/2023-18:28:03] [TRT] [V] ConstShuffleFusion: Fusing /bert/embeddings/LayerNorm/Constant_

  trt_engine = self.builder.build_engine(self.network, self.config)


[04/23/2023-18:28:15] [TRT] [V]  (foreignNode) Set user's cuda kernel library
[04/23/2023-18:28:16] [TRT] [V] Tactic: 0x0000000000000000 Time: 9.16889
[04/23/2023-18:28:16] [TRT] [V] {ForeignNode[bert.embeddings.token_type_embeddings.weight...(Unnamed Layer* 1980) [ElementWise]]} (Myelin[0x80000023]) profiling completed in 12.6479 seconds. Fastest Tactic: 0x0000000000000000 Time: 9.16889
[04/23/2023-18:28:16] [TRT] [V] >>>>>>>>>>>>>>> Chose Runner Type: Myelin Tactic: 0x0000000000000000
[04/23/2023-18:28:16] [TRT] [V] Formats and tactics selection completed in 12.6629 seconds.
[04/23/2023-18:28:16] [TRT] [V] After reformat layers: 1 layers
[04/23/2023-18:28:16] [TRT] [V] Total number of blocks in pre-optimized block assignment: 1
[04/23/2023-18:28:16] [TRT] [I] Detected 2 inputs and 1 output network tensors.
[04/23/2023-18:28:16] [TRT] [V] Layer: {ForeignNode[bert.embeddings.token_type_embeddings.weight...(Unnamed Layer* 1980) [ElementWise]]} Host Persistent: 32 Device Persistent: 0 Sc

  self.name  = engine.get_binding_name(self.index)
  self.is_input = engine.binding_is_input(self.index)
  dtype = engine.get_binding_dtype(self.index)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trt.DataType.BOOL: np.bool}
  shape = engine.get_binding_shape(self.index)


OverflowError: can't convert negative value to unsigned int

In [68]:
engine = engine.engine

In [78]:
def volume(shape):
    vol = 1
    for dim in shape:
        vol *= dim
    return vol

2

<DataType.FLOAT: 0>

In [84]:
output_itemsize = np.dtype(trt.nptype(engine.get_tensor_dtype("output"))).itemsize

In [None]:
context = engine.create_execution_context()

In [90]:
def send_to_cuda(data):
    
    cuda_data = {}
    
    for k, v in data.items():
        cuda_data[k] = cuda.mem_alloc(v.nbytes)
        cuda.memcpy_htod(cuda_data[k], v.ravel())
    
    
    output_itemsize = np.dtype(trt.nptype(engine.get_tensor_dtype("output"))).itemsize
    
    d_output = cuda.mem_alloc(volume(engine.get_tensor_shape("output")) * output_itemsize)
    
    return cuda_data, d_output, bindings

In [91]:
cuda_data, d_output = send_to_cuda(tokens)

In [None]:
bindings = [int(d_input_1), int(d_input_2), int(d_output)]

In [None]:
# Set the input shapes for the context
context.set_binding_shape(0, (1, *input_shape_1))
context.set_binding_shape(1, (1, *input_shape_2))


In [None]:



# Execute the inference
context.execute_v2(bindings=bindings)

In [48]:
engine.run(
    [
        tokens["input_ids"],
        tokens["attention_mask"]
    ]
)

ValueError: Wrong shape for input 0. Expected (1, 3), got (1, 8).

In [46]:
tokens = {k: v.numpy() for k,v in model._tokenizer("This was such a good movie", return_tensors="pt").items()}
del tokens["token_type_ids"]

{'input_ids': array([[ 101, 2023, 2001, 2107, 1037, 2204, 3185,  102]]),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [54]:
build_engine_onnx(
    "nateraw-bert-base-uncased-imdb/onnx",
    1
)

  config.max_workspace_size = 1 << 30


[04/23/2023-18:24:02] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[04/23/2023-18:24:03] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[04/23/2023-18:24:03] [TRT] [E] 4: [network.cpp::validate::3047] Error Code 4: Internal Error (Network has dynamic or shape inputs, but no optimization profile has been defined.)


In [53]:
import tensorrt as trt
import common
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file, batch_size):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(batch_size)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    config.max_workspace_size = 1 << 30
    # Load the Onnx model and parse it in order to populate the TensorRT network.
    with open(model_file, "rb") as model:
        if not parser.parse(model.read()):
            print("ERROR: Failed to parse the ONNX file.")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    return builder.build_serialized_network(network, config)