In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
%load_ext autoreload
%autoreload 2

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace T5 Inference with TensorRT

T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding  different tasks as text directives in the input stream. This enables a single model to be trained supervised on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.

1. [Download HuggingFace T5 model](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)

## Prerequisite

Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [2]:
# %%capture
# !pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [3]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch
import tensorrt as trt

# huggingface
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
    WhisperProcessor, 
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperConfig
)

<a id="1"></a>

## 1. Download HuggingFace T5 model and Whisper model

First, we download the original HuggingFace PyTorch T5 model from HuggingFace model hubs, together with its associated tokernizer.

The T5 variants that are suported by TensorRT 8 are:  t5-small (60M), t5-base (220M), t5-large (770M), t5-3b(3B), t5-11b(11B)

In [4]:
T5_VARIANT = 't5-small' # choices: t5-small | t5-base | t5-large | t5-3b | t5-11b

t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT).to('cuda')
tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)
t5_config = T5Config.from_pretrained(T5_VARIANT, use_cache = False)

In [5]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt").to('cuda')


In [6]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(T5_VARIANT)
!mkdir -p $pytorch_model_dir

t5_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/t5-small/pytorch


In [7]:
import torch
from datasets import load_dataset

Whisper_VARIANT = "openai/whisper-tiny"    # choices: openai/whisper-tiny | openai/whisper-base | openai/whisper-small | openai/whisper-medium | openai/whisper-large-v2

processor = WhisperProcessor.from_pretrained(Whisper_VARIANT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(Whisper_VARIANT)


In [8]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(Whisper_VARIANT)
!mkdir -p $pytorch_model_dir

whisper_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/openai/whisper-tiny/pytorch


### Inference with PyTorch model

Next, we will carry out inference with the PyTorch model.

#### Single example inference

In [9]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt")
num_beams = 1

In [10]:
# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    t5_model = t5_model.cpu()
    inputs = inputs.to('cpu')
else:
    t5_model = t5_model.cuda()
    inputs = inputs.to('cuda:0')
input_ids = inputs.input_ids
    

In [11]:
# inference on a single example
t5_model.eval()
with torch.no_grad():
    outputs = t5_model(**inputs, labels=inputs["input_ids"])

logits = outputs.logits

In [12]:
# Generate sequence for an input
outputs = t5_model.generate(input_ids, num_beams=num_beams)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

2023-07-24 17:34:32.801629: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Das ist gut.


In [13]:
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

audio_inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
input_features = audio_inputs.input_features

# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    whisper_model = whisper_model.cpu()
    input_features = input_features.to('cpu')
else:
    whisper_model = whisper_model.cuda()
    input_features = input_features.to('cuda:0')   

Found cached dataset librispeech_asr_dummy (/home/nvadmin/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [14]:
with torch.no_grad():
    generated_ids = whisper_model.generate(inputs=input_features)

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
transcription
# ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'



' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'

In [15]:
input_features.shape

torch.Size([1, 80, 3000])

In [16]:
whisper_model.model.encoder(input_features=input_features)

BaseModelOutput(last_hidden_state=tensor([[[ 0.0810,  0.0036, -0.0460,  ..., -0.0463,  0.1107, -0.0297],
         [-0.8691,  0.2916,  0.8943,  ...,  1.1164,  0.0542,  0.1625],
         [ 0.0308,  2.2723,  1.5943,  ...,  0.2215, -0.8278,  0.2897],
         ...,
         [ 0.7709, -1.6775,  0.2770,  ..., -0.0620, -0.4735,  0.5232],
         [-0.1289, -0.4646,  0.1080,  ...,  0.6388,  0.0286,  0.2890],
         [ 0.1964, -0.0994, -1.4564,  ...,  0.1363, -0.5059, -0.1779]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

#### Model inference benchmark: encoder and decoder stacks

For benchmarking purposes, we will employ a helper functions `encoder_inference` and `decoder_inference` which execute the inference repeatedly for the T5 encoder and decoder stacks separately, and measure end to end execution time. Let's take note of this execution time for comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [17]:
from T5.measurements import decoder_inference, encoder_inference, full_inference
from T5.export import T5EncoderTorchFile, T5DecoderTorchFile, T5EncoderTRTEngine, T5DecoderTRTEngine

from Whisper.measurements import decoder_inference as w_decoder_inference, encoder_inference as w_encoder_inference, full_inference as w_full_inference
from Whisper.export import WhisperEncoderTorchFile, WhisperDecoderTorchFile, WhisperEncoderTRTEngine, WhisperDecoderTRTEngine

from NNDF.networks import TimingProfile
from NNDF.torch_utils import expand_inputs_for_beam_search

t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
t5_torch_decoder = T5DecoderTorchFile.TorchModule(
    t5_model.decoder, t5_model.lm_head, t5_model.config
)

In [18]:
whisper_torch_encoder = WhisperEncoderTorchFile.TorchModule(whisper_model.model.encoder)
whisper_torch_decoder = WhisperDecoderTorchFile.TorchModule(
    whisper_model.model.decoder, whisper_model.proj_out, whisper_model.config
)

In [19]:
generated_ids = whisper_model.generate(inputs=audio_inputs.input_features.to('cuda'))

In [20]:
%%time
input_ids = inputs.input_ids

encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time

CPU times: user 55.9 ms, sys: 3.73 ms, total: 59.7 ms
Wall time: 59.5 ms


0.004739571002573939

In [21]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 96.3 ms, sys: 0 ns, total: 96.3 ms
Wall time: 96 ms


0.007701014998019673

In [22]:
%%time
input_features = audio_inputs.input_features.to('cuda')

encoder_last_hidden_state, encoder_e2e_median_time = w_encoder_inference(
    whisper_torch_encoder, input_features, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time

CPU times: user 35.4 ms, sys: 0 ns, total: 35.4 ms
Wall time: 35 ms


0.0025588759999664035

In [23]:
%%time
_, decoder_e2e_median_time = w_decoder_inference(
    whisper_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 67.4 ms, sys: 0 ns, total: 67.4 ms
Wall time: 67 ms


0.003989939999883063

#### Full model inference and benchmark

Next, we will try the T5 model for the task of translation from English to German.

For benchmarking purposes, we will employ a helper function `full_inference` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. 

In [24]:
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata
from Whisper.WhisperModelConfig import WhisperModelTRTConfig, WhisperMetadata

In [25]:
%%time

decoder_output, _ = full_inference(
    t5_torch_encoder,
    t5_torch_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    num_beams=num_beams,
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],
)


CPU times: user 37.6 ms, sys: 0 ns, total: 37.6 ms
Wall time: 37.3 ms


In [26]:
"Let us decode the model's output back into text."
# De-tokenize output to raw text
print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))

Das ist gut.


In [27]:
Whisper_VARIANT

'openai/whisper-tiny'

In [28]:
%%time

decoder_output, _ = w_full_inference(
    whisper_torch_encoder,
    whisper_torch_decoder,
    input_features,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    num_beams=num_beams,
    max_length=WhisperModelTRTConfig.MAX_SEQUENCE_LENGTH[Whisper_VARIANT],
)

CPU times: user 1.17 s, sys: 0 ns, total: 1.17 s
Wall time: 1.17 s


In [29]:
max_length = WhisperModelTRTConfig.MAX_SEQUENCE_LENGTH[Whisper_VARIANT]
min_length=0
num_beams=1
batch_size=1
use_cuda=True
early_stopping=True
use_cache=False

whisper_torch_decoder.to('cuda')
whisper_torch_encoder.to('cuda')
input_features.to('cuda')

tensor([[[ 1.1933e-01, -9.4576e-02, -1.0978e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [ 4.9347e-04, -8.9271e-02, -6.7290e-02,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-1.5326e-01, -2.0804e-01, -2.2227e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         ...,
         [-8.0603e-01, -8.0603e-01, -7.9997e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -7.7211e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -8.0603e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01]]], device='cuda:0')

In [30]:
whisper_torch_decoder.generate(
    input_features,
    max_length=max_length,
    min_length=min_length,
    num_beams=num_beams,
    early_stopping=early_stopping,
    eos_token_id=whisper_torch_decoder.config.eos_token_id,
    pad_token_id=whisper_torch_decoder.config.pad_token_id,
    use_cache=use_cache,
    )


tensor([[50258, 50259, 50359, 50363,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,   293,   321,   366,  5404,
           281,  2928,   702, 14943,    13, 50257]], device='cuda:0')

In [31]:
whisper_torch_decoder.generate

<bound method GenerationMixin.generate of TorchModule(
  (decoder): WhisperDecoder(
    (embed_tokens): Embedding(51865, 384, padding_idx=50257)
    (embed_positions): WhisperPositionalEmbedding(448, 384)
    (layers): ModuleList(
      (0): WhisperDecoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
          (out_proj): Linear(in_features=384, out_features=384, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): WhisperAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
   

<a id="2"></a>

## 2. Convert to ONNX

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

The steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained image segmentation PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU. 

For the T5 model, we will convert the encoder and decoder seperately.

In [32]:
# helpers
from NNDF.networks import NetworkMetadata, Precision

In [33]:
onnx_model_path = './models/{}/ONNX'.format(T5_VARIANT)

t5_metadata=NetworkMetadata(variant=T5_VARIANT, precision=Precision(fp16=True), other=T5Metadata(kv_cache=False))

encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")
!mkdir -p $encoder_onnx_model_path
!mkdir -p $decoder_onnx_model_path

t5_encoder_onnx_model_fpath = T5_VARIANT + "-encoder.onnx"
t5_decoder_onnx_model_fpath = T5_VARIANT + "-decoder-with-lm-head.onnx"

t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), t5_metadata)
t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), t5_metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(encoder_onnx_model_path, t5_encoder_onnx_model_fpath), force_overwrite=False
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(decoder_onnx_model_path, t5_decoder_onnx_model_fpath), force_overwrite=False
)

In [34]:
Whisper_VARIANT.split('/')[1]

'whisper-tiny'

In [35]:
# graph = gs.import_onnx(onnx.load(onnx_input_fpath))
# folder = os.path.split(onnx_input_fpath)[0]
# for op in config:
#     if op == OnnxProcessOperation.CLAMP_WEIGHTS:
#         graph = clamp_weights_onnx_to_fp16_bounds(graph, **kwargs)
#     elif op == OnnxProcessOperation.MOVE_CAST_OP:
#         graph = move_t5_cast_op(graph)


In [36]:
onnx_model_path = './models/{}/ONNX'.format(Whisper_VARIANT)

wh_metadata=NetworkMetadata(variant=Whisper_VARIANT, precision=Precision(fp16=True), other=WhisperMetadata(kv_cache=False))

encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")


!mkdir -p $encoder_onnx_model_path
!mkdir -p $decoder_onnx_model_path

wh_encoder_onnx_model_fpath = Whisper_VARIANT.split('/')[1] + "-encoder.onnx"
wh_decoder_onnx_model_fpath = Whisper_VARIANT.split('/')[1] + "-decoder-with-lm-head.onnx"



In [37]:
whisper_encoder = WhisperEncoderTorchFile(whisper_model.to('cpu'), wh_metadata)
whisper_decoder = WhisperDecoderTorchFile(whisper_model.to('cpu'), wh_metadata)


In [38]:
onnx_whisper_encoder = whisper_encoder.as_onnx_model(
    os.path.join(encoder_onnx_model_path, wh_encoder_onnx_model_fpath), force_overwrite=False
)
onnx_whisper_decoder = whisper_decoder.as_onnx_model(
    os.path.join(decoder_onnx_model_path, wh_decoder_onnx_model_fpath), force_overwrite=False
)

<a id="3"></a>

## 3. Convert to TensorRT

Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.

Since the models contains dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

In [39]:
from T5.export import T5DecoderONNXFile, T5EncoderONNXFile
from Whisper.export import WhisperDecoderONNXFile, WhisperEncoderONNXFile
from polygraphy.backend.trt import Profile
from tensorrt import PreviewFeature

In [40]:
tensorrt_model_path = './models/{}/tensorrt'.format(T5_VARIANT)
!mkdir -p tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, max_sequence_length // 2),
    max=(batch_size * num_beams, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size * num_beams, 1, max_sequence_length),
    opt=(batch_size * num_beams, max_sequence_length // 2, max_sequence_length),
    max=(batch_size * num_beams, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)


Profile().add('input_ids', min=(1, 1), opt=(1, 256), max=(1, 512))

In [41]:
encoder_profile

Profile().add('input_ids', min=(1, 1), opt=(1, 256), max=(1, 512))

In [42]:
disable_preview_dynamic_shapes = False
engine_tag = f"bs{batch_size}"

if num_beams > 1:
    engine_tag += "-beam{}".format(num_beams)

preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noFasterDynamicShapes"
else:
    preview_features += [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]

encoder_engine_name = os.path.join(tensorrt_model_path, t5_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
decoder_engine_name = os.path.join(tensorrt_model_path, t5_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(encoder_engine_name):
    t5_trt_encoder_engine = T5EncoderONNXFile(os.path.join(encoder_onnx_model_path, t5_encoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    t5_trt_encoder_engine = T5EncoderTRTEngine(encoder_engine_name, t5_metadata)

if not os.path.exists(decoder_engine_name):
    t5_trt_decoder_engine = T5DecoderONNXFile(os.path.join(decoder_onnx_model_path, t5_decoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    t5_trt_decoder_engine = T5DecoderTRTEngine(decoder_engine_name, t5_metadata)

In [43]:
tensorrt_model_path = './models/{}/tensorrt'.format(Whisper_VARIANT)
!mkdir -p tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = WhisperModelTRTConfig.MAX_SEQUENCE_LENGTH[Whisper_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, max_sequence_length // 2),
    max=(batch_size * num_beams, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size * num_beams, 1, max_sequence_length),
    opt=(batch_size * num_beams, max_sequence_length // 2, max_sequence_length),
    max=(batch_size * num_beams, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_features",
    min=(batch_size, 80, 3000),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)


Profile().add('input_features', min=(1, 80, 3000), opt=(1, 224), max=(1, 448))

In [44]:
input_ids.shape
input_features.shape

torch.Size([1, 80, 3000])

In [45]:
disable_preview_dynamic_shapes = False
engine_tag = f"bs{batch_size}"

if num_beams > 1:
    engine_tag += "-beam{}".format(num_beams)

preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noFasterDynamicShapes"
else:
    preview_features += [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]

encoder_engine_name = os.path.join(tensorrt_model_path, wh_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
decoder_engine_name = os.path.join(tensorrt_model_path, wh_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(encoder_engine_name):
    wh_trt_encoder_engine = WhisperEncoderONNXFile(os.path.join(encoder_onnx_model_path, wh_encoder_onnx_model_fpath), wh_metadata).as_trt_engine(
        encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    wh_trt_encoder_engine = WhisperEncoderTRTEngine(encoder_engine_name, wh_metadata)

if not os.path.exists(decoder_engine_name):
    wh_trt_decoder_engine = WhisperDecoderONNXFile(os.path.join(decoder_onnx_model_path, wh_decoder_onnx_model_fpath), wh_metadata).as_trt_engine(
        decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    wh_trt_decoder_engine = WhisperDecoderTRTEngine(decoder_engine_name, wh_metadata)

[38;5;11m[W] onnx2trt_utils.cpp:377: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.[0m
[38;5;11m[W]     Input tensor: input_ids (dtype=DataType.FLOAT, shape=(-1, -1, 3000)) | No shapes provided; Will use shape: [1, 1, 3000] for min/opt/max in profile.[0m
[38;5;11m[W]     This will cause the tensor to have a static shape. If this is incorrect, please set the range of shapes for this input tensor.[0m


[38;5;9m[!]     Invalid inputs were provided to the optimization profile: {'input_features'}
        Note: Inputs available in the TensorRT network are: {'input_ids'}[0m


PolygraphyException: Invalid inputs were provided to the optimization profile: {'input_features'}
Note: Inputs available in the TensorRT network are: {'input_ids'}

In [None]:
WhisperEncoderONNXFile(os.path.join(encoder_onnx_model_path, wh_encoder_onnx_model_fpath), wh_metadata)

In [None]:
wh_trt_encoder_engine = WhisperEncoderONNXFile(os.path.join(encoder_onnx_model_path, wh_encoder_onnx_model_fpath), wh_metadata).as_trt_engine(
    encoder_engine_name,
    profiles=[encoder_profile],
    preview_features=preview_features)

In [None]:

encoder_engine_name = os.path.join(tensorrt_model_path, t5_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
decoder_engine_name = os.path.join(tensorrt_model_path, t5_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(encoder_engine_name):
    t5_trt_encoder_engine = T5EncoderONNXFile(os.path.join(encoder_onnx_model_path, t5_encoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    t5_trt_encoder_engine = T5EncoderTRTEngine(encoder_engine_name, t5_metadata)

if not os.path.exists(decoder_engine_name):
    t5_trt_decoder_engine = T5DecoderONNXFile(os.path.join(decoder_onnx_model_path, t5_decoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    t5_trt_decoder_engine = T5DecoderTRTEngine(decoder_engine_name, t5_metadata)

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the T5 model, ready for us to carry out inference. 

#### Single example inference
The T5 model with TensorRT backend can now be employed in place of the original HuggingFace T5 model.


In [None]:
# Initialize TensorRT engines
from T5.trt import T5TRTEncoder, T5TRTDecoder

t5_trt_encoder = T5TRTEncoder(
                t5_trt_encoder_engine, t5_metadata, t5_config
            )
t5_trt_decoder = T5TRTDecoder(
                t5_trt_decoder_engine, t5_metadata, t5_config, num_beams=num_beams
            )

In [None]:
# Inference on a single sample
encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
outputs = t5_trt_decoder(
    expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state)

In [None]:
# Generate sequence for an input
max_length = 64

decoder_input_ids = torch.full(
    (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32
).to("cuda:0")

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)

#### TRT engine inference benchmark: encoder and decoder stacks
First, we will bechmark the encoder and decoder stacks as before.

In [None]:
%%time
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time


In [None]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_trt_decoder, expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

### Full model inference benchmark

Next, we will try the full TensorRT T5 engine for the task of translation. As before, note the time difference.

In [None]:
%%time
decoder_output, full_e2e_median_runtime = full_inference(
    t5_trt_encoder,
    t5_trt_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[t5_metadata.variant],
    num_beams=num_beams,
    use_cuda=True,
)

print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))
full_e2e_median_runtime


You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in upto ~10x performance improvement (from 0.0802s to 0.0082s for the T5-small variant).

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch T5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace T5 model while providing significant speed up. 

If you are interested in further details of the conversion process, check out [T5/trt.py](../T5/trt.py)