In [2]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace T5 Inference with TensorRT

T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding  different tasks as text directives in the input stream. This enables a single model to be trained supervised on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.

1. [Download HuggingFace T5 model](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)

## Prerequisite

Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [3]:
# %%capture
# !pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [4]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch
import tensorrt as trt

# huggingface
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
)

<a id="1"></a>

## 1. Download HuggingFace T5 model and Whisper model

First, we download the original HuggingFace PyTorch T5 model from HuggingFace model hubs, together with its associated tokernizer.

The T5 variants that are suported by TensorRT 8 are:  t5-small (60M), t5-base (220M), t5-large (770M), t5-3b(3B), t5-11b(11B)

In [5]:
T5_VARIANT = 't5-small' # choices: t5-small | t5-base | t5-large | t5-3b | t5-11b

t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT).to('cuda')
tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)
config = T5Config.from_pretrained(T5_VARIANT, use_cache = False)

In [6]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt").to('cuda')


In [7]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(T5_VARIANT)
!mkdir -p $pytorch_model_dir

t5_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/t5-small/pytorch


In [8]:
import torch
from datasets import load_dataset

Whisper_VARIANT = "openai/whisper-tiny"    # choices: openai/whisper-tiny | openai/whisper-base | openai/whisper-small | openai/whisper-medium | openai/whisper-large-v2

processor = WhisperProcessor.from_pretrained(Whisper_VARIANT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(Whisper_VARIANT)


NameError: name 'WhisperProcessor' is not defined

In [None]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(Whisper_VARIANT)
!mkdir -p $pytorch_model_dir

whisper_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

### Inference with PyTorch model

Next, we will carry out inference with the PyTorch model.

#### Single example inference

In [None]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt")
num_beams = 1

In [None]:
# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    t5_model = t5_model.cpu()
    inputs = inputs.to('cpu')
else:
    t5_model = t5_model.cuda()
    inputs = inputs.to('cuda:0')
input_ids = inputs.input_ids
    

In [None]:
# inference on a single example
t5_model.eval()
with torch.no_grad():
    outputs = t5_model(**inputs, labels=inputs["input_ids"])

logits = outputs.logits

In [None]:
# Generate sequence for an input
outputs = t5_model.generate(input_ids, num_beams=num_beams)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Model inference benchmark: encoder and decoder stacks

For benchmarking purposes, we will employ a helper functions `encoder_inference` and `decoder_inference` which execute the inference repeatedly for the T5 encoder and decoder stacks separately, and measure end to end execution time. Let's take note of this execution time for comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [9]:
from T5.measurements import decoder_inference, encoder_inference, full_inference
from T5.export import T5EncoderTorchFile, T5DecoderTorchFile, T5EncoderTRTEngine, T5DecoderTRTEngine

from NNDF.networks import TimingProfile
from NNDF.torch_utils import expand_inputs_for_beam_search

t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
t5_torch_decoder = T5DecoderTorchFile.TorchModule(
    t5_model.decoder, t5_model.lm_head, t5_model.config
)

In [10]:
%%time
input_ids = inputs.input_ids

encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time

CPU times: user 649 ms, sys: 163 ms, total: 813 ms
Wall time: 812 ms


0.004635270917788148

In [11]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 95.7 ms, sys: 802 µs, total: 96.5 ms
Wall time: 95.9 ms


0.00764678418636322

#### Full model inference and benchmark

Next, we will try the T5 model for the task of translation from English to German.

For benchmarking purposes, we will employ a helper function `full_inference` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. 

In [12]:
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata

In [13]:
num_beams=1
model_kwargs = {
    "encoder_outputs": t5_model.get_encoder()(inputs.input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
}

In [14]:
%%time

decoder_output, _ = full_inference(
    t5_torch_encoder,
    t5_torch_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    num_beams=num_beams,
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],
)


CPU times: user 41.4 ms, sys: 0 ns, total: 41.4 ms
Wall time: 41 ms


In [15]:
model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}


In [16]:
model_kwargs

{'encoder_outputs': BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[ 0.0141,  0.1282,  0.0321,  ..., -0.0082,  0.0340,  0.0936],
          [ 0.0616, -0.0724, -0.0481,  ...,  0.0811, -0.1882,  0.0876],
          [ 0.0102, -0.0185,  0.0025,  ...,  0.0020, -0.0040, -0.0408],
          ...,
          [-0.1562, -0.0032, -0.1109,  ..., -0.0964, -0.0977, -0.1620],
          [-0.0242, -0.1908, -0.0640,  ..., -0.0090, -0.2134, -0.1214],
          [ 0.0859, -0.0072, -0.0470,  ..., -0.0105, -0.0869,  0.0403]]],
        device='cuda:0', grad_fn=<MulBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)}

Let us decode the model's output back into text.

In [17]:
# De-tokenize output to raw text
print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))

2023-08-07 17:36:37.653808: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Das ist gut.


<a id="2"></a>

## 2. Convert to ONNX

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

The steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained image segmentation PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU. 

For the T5 model, we will convert the encoder and decoder seperately.

In [18]:
# helpers
from NNDF.networks import NetworkMetadata, Precision

In [19]:
onnx_model_path = './models/{}/ONNX'.format(T5_VARIANT)

metadata=NetworkMetadata(variant=T5_VARIANT, precision=Precision(fp16=True), other=T5Metadata(kv_cache=False))

encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")
!mkdir -p $encoder_onnx_model_path
!mkdir -p $decoder_onnx_model_path

encoder_onnx_model_fpath = T5_VARIANT + "-encoder.onnx"
decoder_onnx_model_fpath = T5_VARIANT + "-decoder-with-lm-head.onnx"

t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), metadata)
t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(encoder_onnx_model_path, encoder_onnx_model_fpath), force_overwrite=False
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(decoder_onnx_model_path, decoder_onnx_model_fpath), force_overwrite=False
)

In [20]:

encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")
!mkdir -p $encoder_onnx_model_path
!mkdir -p $decoder_onnx_model_path

encoder_onnx_model_fpath = T5_VARIANT + "-encoder.onnx"
decoder_onnx_model_fpath = T5_VARIANT + "-decoder-with-lm-head.onnx"

t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), metadata)
t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(encoder_onnx_model_path, encoder_onnx_model_fpath), force_overwrite=False
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(decoder_onnx_model_path, decoder_onnx_model_fpath), force_overwrite=False
)

<a id="3"></a>

## 3. Convert to TensorRT

Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.

Since the models contains dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

In [21]:
from T5.export import T5DecoderONNXFile, T5EncoderONNXFile
from polygraphy.backend.trt import Profile
from tensorrt import PreviewFeature

In [22]:
tensorrt_model_path = './models/{}/tensorrt'.format(T5_VARIANT)
!mkdir -p tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, max_sequence_length // 2),
    max=(batch_size * num_beams, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size * num_beams, 1, max_sequence_length),
    opt=(batch_size * num_beams, max_sequence_length // 2, max_sequence_length),
    max=(batch_size * num_beams, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)


Profile().add('input_ids', min=(1, 1), opt=(1, 256), max=(1, 512))

In [23]:
disable_preview_dynamic_shapes = False
engine_tag = f"bs{batch_size}"

if num_beams > 1:
    engine_tag += "-beam{}".format(num_beams)

preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noFasterDynamicShapes"
else:
    preview_features += [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]

encoder_engine_name = os.path.join(tensorrt_model_path, encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
decoder_engine_name = os.path.join(tensorrt_model_path, decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(encoder_engine_name):
    t5_trt_encoder_engine = T5EncoderONNXFile(os.path.join(encoder_onnx_model_path, encoder_onnx_model_fpath), metadata).as_trt_engine(
        encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    t5_trt_encoder_engine = T5EncoderTRTEngine(encoder_engine_name, metadata)

if not os.path.exists(decoder_engine_name):
    t5_trt_decoder_engine = T5DecoderONNXFile(os.path.join(decoder_onnx_model_path, decoder_onnx_model_fpath), metadata).as_trt_engine(
        decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    t5_trt_decoder_engine = T5DecoderTRTEngine(decoder_engine_name, metadata)

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the T5 model, ready for us to carry out inference. 

#### Single example inference
The T5 model with TensorRT backend can now be employed in place of the original HuggingFace T5 model.


In [24]:
# Initialize TensorRT engines
from T5.trt import T5TRTEncoder, T5TRTDecoder

t5_trt_encoder = T5TRTEncoder(
                t5_trt_encoder_engine, metadata, config
            )
t5_trt_decoder = T5TRTDecoder(
                t5_trt_decoder_engine, metadata, config, num_beams=num_beams
            )

In [25]:
# Inference on a single sample
encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
outputs = t5_trt_decoder(
    expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state)

In [26]:
# Generate sequence for an input
max_length = 64

decoder_input_ids = torch.full(
    (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32
).to("cuda:0")

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)

#### TRT engine inference benchmark: encoder and decoder stacks
First, we will bechmark the encoder and decoder stacks as before.

In [27]:
%%time
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time


CPU times: user 12.6 ms, sys: 504 µs, total: 13.1 ms
Wall time: 12.6 ms


0.0009394341614097357

In [28]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_trt_decoder, expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 19.9 ms, sys: 12 µs, total: 19.9 ms
Wall time: 19.4 ms


0.0015115379355847836

### Full model inference benchmark

Next, we will try the full TensorRT T5 engine for the task of translation. As before, note the time difference.

In [29]:
%%time
decoder_output, full_e2e_median_runtime = full_inference(
    t5_trt_encoder,
    t5_trt_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[metadata.variant],
    num_beams=num_beams,
    use_cuda=True,
)

print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))
full_e2e_median_runtime


Das ist gut.
CPU times: user 30.1 ms, sys: 0 ns, total: 30.1 ms
Wall time: 29.6 ms


You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in upto ~10x performance improvement (from 0.0802s to 0.0082s for the T5-small variant).

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch T5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace T5 model while providing significant speed up. 

If you are interested in further details of the conversion process, check out [T5/trt.py](../T5/trt.py)