In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
## Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
%load_ext autoreload
%autoreload 2

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace T5 Inference with TensorRT

T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding  different tasks as text directives in the input stream. This enables a single model to be trained supervised on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.

1. [Download HuggingFace T5 model](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)

## Prerequisite

Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [2]:
# %%capture
# !pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [3]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch
import tensorrt as trt

# huggingface
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
    WhisperProcessor, 
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperConfig
)

<a id="1"></a>

## 1. Download HuggingFace T5 model and Whisper model

First, we download the original HuggingFace PyTorch T5 model from HuggingFace model hubs, together with its associated tokernizer.

The T5 variants that are suported by TensorRT 8 are:  t5-small (60M), t5-base (220M), t5-large (770M), t5-3b(3B), t5-11b(11B)

In [6]:
T5_VARIANT = 't5-small' # choices: t5-small | t5-base | t5-large | t5-3b | t5-11b

t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT).to('cuda')
tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)
t5_config = T5Config.from_pretrained(T5_VARIANT, use_cache = False)

In [7]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt").to('cuda')

In [8]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(T5_VARIANT)
!mkdir -p $pytorch_model_dir

t5_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/t5-small/pytorch


In [9]:
import torch
from datasets import load_dataset

Whisper_VARIANT = "openai/whisper-tiny"    # choices: openai/whisper-tiny | openai/whisper-base | openai/whisper-small | openai/whisper-medium | openai/whisper-large-v2

processor = WhisperProcessor.from_pretrained(Whisper_VARIANT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(Whisper_VARIANT)
wh_config = WhisperConfig.from_pretrained(Whisper_VARIANT, use_cache = False)

In [10]:
wh_config.max_source_positions

1500

In [11]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(Whisper_VARIANT)
!mkdir -p $pytorch_model_dir

whisper_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/openai/whisper-tiny/pytorch


### Inference with PyTorch model

Next, we will carry out inference with the PyTorch model.

#### Single example inference

In [12]:
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt")
num_beams = 1

In [13]:
# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "LD_LIBRARY_PATH" in os.environ and "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    t5_model = t5_model.cpu()
    inputs = inputs.to('cpu')
else:
    t5_model = t5_model.cuda()
    inputs = inputs.to('cuda:0')
input_ids = inputs.input_ids
    

In [14]:
# inference on a single example
t5_model.eval()
with torch.no_grad():
    outputs = t5_model(**inputs, labels=inputs["input_ids"])

logits = outputs.logits

In [15]:
# Generate sequence for an input
outputs = t5_model.generate(input_ids, num_beams=num_beams)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

2023-08-03 18:04:43.488813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Das ist gut.


In [16]:
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

audio_inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
input_features = audio_inputs.input_features

# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "LD_LIBRARY_PATH" in os.environ and "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    whisper_model = whisper_model.cpu()
    input_features = input_features.to('cpu')
else:
    whisper_model = whisper_model.cuda()
    input_features = input_features.to('cuda:0')   

Found cached dataset librispeech_asr_dummy (/home/nvadmin/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [17]:
with torch.no_grad():
    generated_ids = whisper_model.generate(inputs=input_features)

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
transcription
# ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'



' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'

In [18]:
input_features.shape

torch.Size([1, 80, 3000])

In [19]:
whisper_model.model.encoder(input_features=input_features)[0].shape

torch.Size([1, 1500, 384])

#### Model inference benchmark: encoder and decoder stacks

For benchmarking purposes, we will employ a helper functions `encoder_inference` and `decoder_inference` which execute the inference repeatedly for the T5 encoder and decoder stacks separately, and measure end to end execution time. Let's take note of this execution time for comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [20]:
from T5.measurements import decoder_inference, encoder_inference, full_inference
from T5.export import T5EncoderTorchFile, T5DecoderTorchFile, T5EncoderTRTEngine, T5DecoderTRTEngine

from Whisper.measurements import decoder_inference as w_decoder_inference, encoder_inference as w_encoder_inference, full_inference as w_full_inference, full_inference_greedy, full_inference_beam
from Whisper.export import WhisperEncoderTorchFile, WhisperDecoderTorchFile, WhisperEncoderTRTEngine, WhisperDecoderTRTEngine

from NNDF.networks import TimingProfile
from NNDF.torch_utils import expand_inputs_for_beam_search

t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
t5_torch_decoder = T5DecoderTorchFile.TorchModule(
    t5_model.decoder, t5_model.lm_head, t5_model.config
)

In [21]:
whisper_torch_encoder = WhisperEncoderTorchFile.TorchModule(whisper_model.model.encoder)
whisper_torch_decoder = WhisperDecoderTorchFile.TorchModule(
    whisper_model.model.decoder, whisper_model.proj_out, whisper_model.config
)

In [22]:
generated_ids = whisper_model.generate(inputs=audio_inputs.input_features.to('cuda'))

In [23]:
%%time
input_ids = inputs.input_ids

encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time

CPU times: user 59.5 ms, sys: 0 ns, total: 59.5 ms
Wall time: 59.2 ms


0.00469854602124542

In [24]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 94.2 ms, sys: 0 ns, total: 94.2 ms
Wall time: 93.9 ms


0.00756344199180603

In [25]:
%%time
input_features = audio_inputs.input_features.to('cuda')

encoder_last_hidden_state, encoder_e2e_median_time = w_encoder_inference(
    whisper_torch_encoder, input_features, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time

CPU times: user 34.4 ms, sys: 0 ns, total: 34.4 ms
Wall time: 33.9 ms


0.0025924149667844176

In [26]:
encoder_last_hidden_state.shape

torch.Size([1, 1500, 384])

In [27]:
%%time
_, decoder_e2e_median_time = w_decoder_inference(
    whisper_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 49.6 ms, sys: 1.34 ms, total: 51 ms
Wall time: 50.7 ms


0.003986836993135512

#### Full model inference and benchmark

Next, we will try the T5 model for the task of translation from English to German.

For benchmarking purposes, we will employ a helper function `full_inference` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. 

In [28]:
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata
from Whisper.WhisperModelConfig import WhisperModelTRTConfig, WhisperMetadata

In [29]:
import transformers

In [30]:
transformers.__version__

'4.23.0'

In [31]:
%%time

decoder_output, _ = full_inference(
    t5_torch_encoder,
    t5_torch_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    num_beams=num_beams,
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],
)

CPU times: user 36.8 ms, sys: 531 µs, total: 37.3 ms
Wall time: 37.1 ms


In [32]:
"Let us decode the model's output back into text."
# De-tokenize output to raw text
print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))

Das ist gut.


In [33]:
min_output_len =0 
max_output_len = whisper_model.config.max_length

In [34]:
from NNDF.general_utils import measure_python_inference_code
timing_profile = TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=[50,99])

def percentile_print(timing):
    return ', '.join(['p{} {:.2f}ms'.format(timing_profile.percentile[i], p*1000) for i,p in enumerate(timing)])
whisper_model = WhisperForConditionalGeneration.from_pretrained(Whisper_VARIANT).cuda()

# encoder-decoder inference 
with torch.no_grad():
    output_ids = whisper_model.generate(input_features, max_length=max_output_len, min_length=min_output_len, num_beams=num_beams, use_cache=False)    
    outputs = processor.tokenizer.decode(output_ids[-1,:], skip_special_tokens=True)    
outputs_hf = outputs

# timing
# FP32
whisper_model.float()
hf_nonkv_time = measure_python_inference_code(lambda: whisper_model.generate(input_features, max_length=max_output_len, min_length=min_output_len, num_beams=num_beams, use_cache=False), timing_profile)
hf_kv_time = measure_python_inference_code(lambda: whisper_model.generate(input_features, max_length=max_output_len, min_length=min_output_len, num_beams=num_beams, use_cache=True), timing_profile)

# FP16, cuda 11.4 has cublas error that will fail in both cpu or cpu model for BART
# if not cuda_114_mode:
whisper_model= whisper_model.half()
hf_nonkv_time_fp16 = measure_python_inference_code(lambda: whisper_model.generate(input_features.half(), max_length=max_output_len, min_length=min_output_len, num_beams=num_beams, use_cache=False), timing_profile)
hf_kv_time_fp16 = measure_python_inference_code(lambda: whisper_model.generate(input_features.half(), max_length=max_output_len, min_length=min_output_len, num_beams=num_beams, use_cache=True), timing_profile)

In [35]:
# FP32
HF_KV=True
timing_profile = TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
whisper_model.float()
whisper_torch_encoder = WhisperEncoderTorchFile.TorchModule(whisper_model.get_encoder())
whisper_torch_decoder = WhisperDecoderTorchFile.TorchModule(whisper_model.get_decoder(), whisper_model.proj_out, whisper_model.config)

with torch.no_grad():

    encoder_last_hidden_state, encoder_pytorch_time = w_encoder_inference(whisper_torch_encoder, input_features, timing_profile)
    _, decoder_pytorch_time = w_decoder_inference(whisper_torch_decoder, expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state, timing_profile, use_cache=HF_KV)
    if num_beams == 1:
        output_ids, full_pytorch_time = full_inference_greedy(whisper_torch_encoder,whisper_torch_decoder,input_features,tokenizer,timing_profile,max_length=max_output_len, min_length=min_output_len, use_cache=HF_KV)
    else:
        output_ids, full_pytorch_time = full_inference_beam(whisper_torch_encoder,whisper_torch_decoder,input_features,tokenizer,timing_profile,num_beams=num_beams,max_length=max_output_len, min_length=min_output_len, use_cache=HF_KV)
    outputs = tokenizer.decode(output_ids[0], skip_special_tokens=True)    

outputs_pytorch = outputs

# # FP16
# if not cuda_114_mode:
whisper_model.half()
input_features= input_features.half()
whisper_torch_encoder_fp16 = WhisperEncoderTorchFile.TorchModule(whisper_model.get_encoder())
whisper_torch_decoder_fp16 = WhisperDecoderTorchFile.TorchModule(whisper_model.get_decoder(), whisper_model.proj_out, whisper_model.config)

with torch.no_grad():

    encoder_last_hidden_state, encoder_pytorch_time_fp16 = w_encoder_inference(whisper_torch_encoder_fp16, input_features, timing_profile)
    _, decoder_pytorch_time_fp16 = w_decoder_inference(whisper_torch_decoder_fp16, expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state, timing_profile, use_cache=HF_KV)
    if num_beams == 1:
        output_ids_fp16, full_pytorch_time_fp16 = full_inference_greedy(whisper_torch_encoder_fp16,whisper_torch_decoder_fp16,input_features,tokenizer,timing_profile,max_length=max_output_len, min_length=min_output_len, use_cache=HF_KV)
    else:
        output_ids_fp16, full_pytorch_time_fp16 = full_inference_beam(whisper_torch_encoder_fp16,whisper_torch_decoder_fp16,input_features,tokenizer,timing_profile,num_beams=num_beams,max_length=max_output_len, min_length=min_output_len, use_cache=HF_KV)
    outputs_fp16 = tokenizer.decode(output_ids_fp16[0], skip_special_tokens=True)    

outputs_pytorch_fp16 = outputs_fp16

Using bos_token, but it is not set yet.
Using bos_token, but it is not set yet.


In [36]:
encoder_last_hidden_state.shape

torch.Size([1, 1500, 384])

In [37]:
# print
print(f'PyTorch FP32 Output identical to HF results? {outputs_pytorch == outputs_hf}')
print(f'PyTorch FP16 Output identical to HF results? {outputs_pytorch_fp16 == outputs_hf}')
print('\n')      
print(f'Device: {torch.cuda.get_device_name()}')
print(f"Precision: FP32, Number of Beams: {num_beams}")
print(f"Encoder time: {encoder_pytorch_time}")
print(f"Decoder time: {decoder_pytorch_time}")
print(f"Full E2E time: {full_pytorch_time}")
print(f"Precision: FP16, Number of Beams: {num_beams}")
print(f"Encoder time: {encoder_pytorch_time_fp16}")
print(f"Decoder time: {decoder_pytorch_time_fp16}")
print(f"Full E2E time: {full_pytorch_time_fp16}")

PyTorch FP32 Output identical to HF results? False
PyTorch FP16 Output identical to HF results? False


Device: NVIDIA A100-SXM4-80GB
Precision: FP32, Number of Beams: 1
Encoder time: 0.0021469868952408433
Decoder time: 0.003236535005271435
Full E2E time: 0.08386038697790354
Precision: FP16, Number of Beams: 1
Encoder time: 0.003080081893131137
Decoder time: 0.0033120280131697655
Full E2E time: 0.08407521108165383


In [38]:
output_ids_fp16, full_pytorch_time_fp16 = full_inference_greedy(whisper_torch_encoder_fp16,whisper_torch_decoder_fp16,input_features,tokenizer,timing_profile,max_length=max_output_len, min_length=min_output_len, use_cache=HF_KV)

Using bos_token, but it is not set yet.


In [39]:
processor.tokenizer.batch_decode(output_ids_fp16)

['"!<|startoftranscript|>.<|translate|><|notimestamps|> Mr. Kilder is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']

In [40]:
whisper_model.float()


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0): WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (final_lay

<a id="2"></a>

## 2. Convert to ONNX

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

The steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained image segmentation PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU. 

For the T5 model, we will convert the encoder and decoder seperately.

In [41]:
# helpers
from NNDF.networks import NetworkMetadata, Precision

In [42]:
onnx_model_path = './models/{}/ONNX'.format(T5_VARIANT)

t5_metadata=NetworkMetadata(variant=T5_VARIANT, precision=Precision(fp16=True), other=T5Metadata(kv_cache=False))

t5_encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
t5_decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")
!mkdir -p $t5_encoder_onnx_model_path
!mkdir -p $t5_decoder_onnx_model_path

t5_encoder_onnx_model_fpath = T5_VARIANT + "-encoder.onnx"
t5_decoder_onnx_model_fpath = T5_VARIANT + "-decoder-with-lm-head.onnx"

t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), t5_metadata)
t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), t5_metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(t5_encoder_onnx_model_path, t5_encoder_onnx_model_fpath), force_overwrite=True
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(t5_decoder_onnx_model_path, t5_decoder_onnx_model_fpath), force_overwrite=True
)

  if causal_mask.shape[1] < attention_mask.shape[1]:


In [43]:
onnx_model_path = './models/{}/ONNX'.format(Whisper_VARIANT)

wh_metadata=NetworkMetadata(variant=Whisper_VARIANT, precision=Precision(fp16=True), other=WhisperMetadata(kv_cache=False))

wh_encoder_onnx_model_path = os.path.join(onnx_model_path, "encoder")
wh_decoder_onnx_model_path = os.path.join(onnx_model_path, "decoder")


!mkdir -p $wh_encoder_onnx_model_path
!mkdir -p $wh_decoder_onnx_model_path

wh_encoder_onnx_model_fpath = Whisper_VARIANT.split('/')[1] + "-encoder.onnx"
wh_decoder_onnx_model_fpath = Whisper_VARIANT.split('/')[1] + "-decoder-with-lm-head.onnx"



In [44]:
whisper_encoder = WhisperEncoderTorchFile(whisper_model.to('cpu'), wh_metadata)
whisper_decoder = WhisperDecoderTorchFile(whisper_model.to('cpu'), wh_metadata)

onnx_whisper_encoder = whisper_encoder.as_onnx_model(
    os.path.join(wh_encoder_onnx_model_path, wh_encoder_onnx_model_fpath), force_overwrite=True
)
onnx_whisper_decoder = whisper_decoder.as_onnx_model(
    os.path.join(wh_decoder_onnx_model_path, wh_decoder_onnx_model_fpath), force_overwrite=True
)

  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
  if input_shape[-1] > 1:
  mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):


In [45]:
# from NNDF.tensorrt_utils import OnnxProcessOperation, process_onnx, move_t5_cast_op
# output_fpath =  os.path.join(wh_encoder_onnx_model_path, wh_encoder_onnx_model_fpath)
# #output_fpath = os.path.join(t5_encoder_onnx_model_path, t5_encoder_onnx_model_fpath)
# config = [OnnxProcessOperation.MOVE_CAST_OP2, OnnxProcessOperation.CLAMP_WEIGHTS]

# import onnx_graphsurgeon as gs
# import onnx
# import numpy as np
# graph = gs.import_onnx(onnx.load(output_fpath))
# folder = os.path.split(output_fpath)[0]
# for op in config:
#     if op == OnnxProcessOperation.CLAMP_WEIGHTS:
#         graph = clamp_weights_onnx_to_fp16_bounds(graph, **kwargs)
#     elif op == OnnxProcessOperation.MOVE_CAST_OP2:
#         graph = move_t5_cast_op(graph)


In [46]:
# output_fpath =  os.path.join(wh_encoder_onnx_model_path, wh_encoder_onnx_model_fpath)
# converter= ModelFileConverter
# force_overwrite= False

In [47]:
# converter.torch_to_onnx(
#     output_fpath, self.load_model(), self.network_metadata
# )

In [48]:
from Whisper.export import WhisperEncoderConverter
from NNDF.models import ONNXModelFile

from NNDF.networks import NetworkMetadata, Precision, Dims
from NNDF.models import ModelFileConverter
from Whisper.export import WhisperDecoderONNXFile, WhisperEncoderONNXFile


network_metadata_cp_dct = wh_metadata._asdict()
del network_metadata_cp_dct["precision"]
network_metadata = NetworkMetadata(
    **network_metadata_cp_dct, precision=Precision(fp16=False)
)
ModelFileConverter(
    WhisperEncoderTorchFile, WhisperEncoderONNXFile, WhisperEncoderTRTEngine
)
suconverter = ModelFileConverter(WhisperEncoderTorchFile, WhisperEncoderONNXFile, WhisperEncoderTRTEngine)
# suconverter.onnx_to_trt(
#     output_fpath, fpath, network_metadata, profiles, preview_features
# )

In [49]:
model = whisper_model.to('cpu')

In [50]:
# output_fpath =  os.path.join(wh_encoder_onnx_model_path, wh_encoder_onnx_model_fpath)

In [51]:
# device = model.device
# input_features = torch.ones(1, 80, 1500).to(device)
# simplified_encoder = WhisperEncoderTorchFile.TorchModule(model.model.encoder)
# inputs = WhisperModelTRTConfig.get_input_dims(network_metadata)["encoder"]
# outputs = WhisperModelTRTConfig.get_output_dims(network_metadata)["encoder"]

# # Exports to ONNX
# opt_args = {}

# version_major = int((torch.__version__).split(".")[0])
# version_minor = int((torch.__version__).split(".")[1])
# if version_major < 1 or (version_major == 1 and version_minor < 11):
#     opt_args["use_external_data_format"] = True
# torch.onnx.export(
#     simplified_encoder,
#     input_features,
#     output_fpath,
#     do_constant_folding=True,
#     opset_version=13,
#     input_names=inputs.get_names(),
#     output_names=outputs.get_names(),
#     dynamic_axes={
#         **inputs.get_torch_dynamic_axis_encoding(),
#         **outputs.get_torch_dynamic_axis_encoding(),
#     },
#     training=torch.onnx.TrainingMode.EVAL,
#     **opt_args,
# )

# if network_metadata.precision.fp16:
#     process_onnx(
#         [OnnxProcessOperation.MOVE_CAST_OP2, OnnxProcessOperation.CLAMP_WEIGHTS],
#         output_fpath,
#         output_fpath,
#     )


In [52]:

# input_ids = torch.tensor([[42] * 10])
# input_features = torch.ones(1, 80, 3000)
# # Exporting the decoder requires a basic instance of the encoder
# # Create one temporarily
# simplified_encoder = WhisperEncoderTorchFile.TorchModule(model.get_encoder())
# # Exports to ONNX
# decoder_with_lm_head = WhisperDecoderTorchFile.TorchModule(
#     model.get_decoder(), model.proj_out, model.config
# )

# inputs = WhisperModelTRTConfig.get_input_dims(network_metadata)["decoder"]
# outputs = WhisperModelTRTConfig.get_output_dims(network_metadata)["decoder"]

# # Exports to ONNX
# opt_args = {}

# version_major = int((torch.__version__).split(".")[0])
# version_minor = int((torch.__version__).split(".")[1])
# if version_major < 1 or (version_major == 1 and version_minor < 11):
#     opt_args["use_external_data_format"] = True

# encoder_hidden_states = simplified_encoder(input_features)
# decoder_output = decoder_with_lm_head(
#     input_ids[:, :-1], encoder_hidden_states
# )  # decoder output at t-1 step (logits, past_key_values from 0 to t-1)
# past_key_values = decoder_output[1]

# decoder_root, decoder_fullname = os.path.split(output_fpath)
# # Split kv and non kv onnx into separate folders to avoid weight overlap

# non_kv_root = os.path.join(decoder_root, "non-kv")
# kv_root = os.path.join(decoder_root, "kv")
# decoder_name, decoder_ext = os.path.splitext(decoder_fullname)
# non_kv_fpath = os.path.join(
#     non_kv_root, decoder_name + "-non-kv" + decoder_ext
# )
# kv_fpath = os.path.join(kv_root, decoder_fullname)

# # This code allows for huggingface compatible torch class to use onnx exporter (change just before onnx.export)
# old_forward = decoder_with_lm_head.forward

# def _export_forward(input_ids, encoder_hidden_states, past_key_values):
#     result = old_forward(
#         input_ids, encoder_hidden_states, past_key_values=past_key_values
#     )
#     return (result[0], result[1])

# decoder_with_lm_head.forward = _export_forward

# torch.onnx.export(
#     decoder_with_lm_head,
#     (input_ids[:, -1:], encoder_hidden_states, past_key_values),
#     # (1) input_ids should be the t token (last one) while past_key_values is 0 to t-1 caches
#     # (2) since past_key_values is kwargs, ideally use "(input_ids[:,-1:], encoder_hidden_states, {"past_key_values": past_key_values})",
#     # but onnx.export seems to unable to take kwargs properly (although PyTorch 1.11 claims it supports already).
#     # Therefore, we need to wrap inside _export_forward() and make past_key_values indeed a kwargs
#     kv_fpath,
#     export_params=True,
#     opset_version=12,
#     input_names=inputs.get_names(),
#     output_names=outputs.get_names(),
#     dynamic_axes={
#         **inputs.get_torch_dynamic_axis_encoding(),
#         **outputs.get_torch_dynamic_axis_encoding(),
#     },
#     training=torch.onnx.TrainingMode.EVAL,
#     **opt_args,
# )


In [53]:

    # # dual-engine approach: also export non-kv onnx model. Note that this is different from the original "non-kv" model. This one traces the `use_cache` path and have present_key_values output
    # def _export_forward(input_ids, encoder_hidden_states, use_cache):
    #     result = old_forward(
    #         input_ids, encoder_hidden_states, use_cache=use_cache
    #     )
    #     return (result[0], result[1])

    # decoder_with_lm_head.forward = _export_forward

    # # inputs are same as non-kv model
    # # outputs are same as kv model
    # dict_inputs = inputs.get_dims()
    # dict_inputs_non_kv = OrderedDict(
    #     {k: dict_inputs[k] for k in ["input_ids", "encoder_hidden_states"]}
    # )
    # inputs_non_kv = Dims(dict_inputs_non_kv)

    # torch.onnx.export(
    #     decoder_with_lm_head,
    #     (input_ids[:, -1:], encoder_hidden_states, True),
    #     non_kv_fpath,
    #     export_params=True,
    #     opset_version=12,
    #     input_names=inputs_non_kv.get_names(),
    #     output_names=outputs.get_names(),
    #     dynamic_axes={
    #         **inputs_non_kv.get_torch_dynamic_axis_encoding(),
    #         **outputs.get_torch_dynamic_axis_encoding(),
    #     },
    #     training=torch.onnx.TrainingMode.EVAL,
    #     **opt_args,
    # )

In [54]:
from NNDF.networks import NetworkMetadata, Precision
TRT_KV = False

wh_onnx_model_path = './models/{}/onnx'.format(Whisper_VARIANT)
!mkdir -p $wh_onnx_model_path

# FP32
whisper_model.float()
metadata = NetworkMetadata(variant=Whisper_VARIANT, precision=Precision(fp16=False), other=WhisperMetadata(kv_cache=TRT_KV))
trt_config = WhisperModelTRTConfig()
metadata_string = trt_config.get_metadata_string(metadata)

wh_encoder_onnx_model_fpath = metadata_string + "-encoder.onnx"
wh_decoder_onnx_model_fpath = metadata_string + "-decoder-with-lm-head.onnx"

# for onnx conversion, ensure model is on CPU and FP32 precision in this step
whisper_torchfile_encoder = WhisperEncoderTorchFile(whisper_model.to('cpu'), metadata)
whisper_torchfile_decoder = WhisperDecoderTorchFile(whisper_model.to('cpu'), metadata)

onnx_whisper_encoder = whisper_torchfile_encoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath), force_overwrite=False)
onnx_whisper_decoder = whisper_torchfile_decoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath), force_overwrite=False)

# FP16
metadata_fp16 = NetworkMetadata(variant=Whisper_VARIANT, precision=Precision(fp16=True), other=WhisperMetadata(kv_cache=TRT_KV))
trt_config_fp16 = WhisperModelTRTConfig()
metadata_string_fp16 = trt_config.get_metadata_string(metadata_fp16)

wh_encoder_onnx_model_fpath_fp16 = metadata_string_fp16 + "-encoder.onnx"
wh_decoder_onnx_model_fpath_fp16 = metadata_string_fp16 + "-decoder-with-lm-head.onnx"

# for onnx conversion, ensure model is on CPU and FP32 precision in this step
whisper_torchfile_encoder = WhisperEncoderTorchFile(whisper_model.to('cpu'), metadata)
whisper_torchfile_decoder = WhisperDecoderTorchFile(whisper_model.to('cpu'), metadata)

onnx_whisper_encoder_fp16 = whisper_torchfile_encoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath_fp16), force_overwrite=False)
onnx_whisper_decoder_fp16 = whisper_torchfile_decoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath_fp16), force_overwrite=False)

<a id="3"></a>

## 3. Convert to TensorRT

Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.

Since the models contains dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

In [55]:
from T5.export import T5DecoderONNXFile, T5EncoderONNXFile
from Whisper.export import WhisperDecoderONNXFile, WhisperEncoderONNXFile
from polygraphy.backend.trt import Profile
from tensorrt import PreviewFeature

In [56]:
t5_tensorrt_model_path = './models/{}/tensorrt'.format(T5_VARIANT)
!mkdir -p t5_tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, max_sequence_length // 2),
    max=(batch_size * num_beams, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size * num_beams, 1, max_sequence_length),
    opt=(batch_size * num_beams, max_sequence_length // 2, max_sequence_length),
    max=(batch_size * num_beams, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)


Profile().add('input_ids', min=(1, 1), opt=(1, 256), max=(1, 512))

In [57]:
disable_preview_dynamic_shapes = False
engine_tag = f"bs{batch_size}"

if num_beams > 1:
    engine_tag += "-beam{}".format(num_beams)

preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noFasterDynamicShapes"
else:
    preview_features += [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]

t5_encoder_engine_name = os.path.join(t5_tensorrt_model_path, t5_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
t5_decoder_engine_name = os.path.join(t5_tensorrt_model_path, t5_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(t5_encoder_engine_name):
    t5_trt_encoder_engine = T5EncoderONNXFile(os.path.join(t5_encoder_onnx_model_path, t5_encoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        t5_encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    t5_trt_encoder_engine = T5EncoderTRTEngine(t5_encoder_engine_name, t5_metadata)

if not os.path.exists(t5_decoder_engine_name):
    t5_trt_decoder_engine = T5DecoderONNXFile(os.path.join(t5_decoder_onnx_model_path, t5_decoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        t5_decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    t5_trt_decoder_engine = T5DecoderTRTEngine(t5_decoder_engine_name, t5_metadata)

In [58]:
wh_tensorrt_model_path = './models/{}/tensorrt'.format(Whisper_VARIANT)
!mkdir -p wh_tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = WhisperModelTRTConfig.MAX_SEQUENCE_LENGTH[Whisper_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, max_sequence_length // 2),
    max=(batch_size * num_beams, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size * num_beams, 1, max_sequence_length),
    opt=(batch_size * num_beams, max_sequence_length // 2, max_sequence_length),
    max=(batch_size * num_beams, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_features",
    min=(batch_size, 80, 1500),
    opt=(batch_size, 80, 3000),
    max=(batch_size, 80, 3000)
)


Profile().add('input_features', min=(1, 80, 1500), opt=(1, 80, 3000), max=(1, 80, 3000))

In [59]:
engine_tag = f"bs{batch_size}"

if num_beams > 1:
    engine_tag += "-beam{}".format(num_beams)

preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noPreviewFasterDynamicShapes"
else:
    preview_features.append(PreviewFeature.FASTER_DYNAMIC_SHAPES_0805)

# FP32
wh_encoder_engine_name = os.path.join(wh_tensorrt_model_path, wh_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
wh_decoder_engine_name = os.path.join(wh_tensorrt_model_path, wh_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(wh_encoder_engine_name):
    whisper_trt_encoder_engine = WhisperEncoderONNXFile(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath), metadata).as_trt_engine(
        wh_encoder_engine_name, 
        profiles=[encoder_profile], 
        preview_features=preview_features
    )
else:
    whisper_trt_encoder_engine = WhisperEncoderTRTEngine(wh_encoder_engine_name, metadata)
    
if not os.path.exists(wh_decoder_engine_name):
    whisper_trt_decoder_engine = WhisperDecoderONNXFile(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath), metadata).as_trt_engine(
        wh_decoder_engine_name, 
        profiles=[decoder_profile], 
        preview_features=preview_features
    )
else:
    whisper_trt_decoder_engine = WhisperDecoderTRTEngine(wh_decoder_engine_name, metadata)


In [60]:
# FP16
wh_encoder_engine_name_fp16 = os.path.join(wh_tensorrt_model_path, wh_encoder_onnx_model_fpath_fp16) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
wh_decoder_engine_name_fp16 = os.path.join(wh_tensorrt_model_path, wh_decoder_onnx_model_fpath_fp16) + f"-{engine_tag}.engine"

if not os.path.exists(wh_encoder_engine_name_fp16):
    whisper_trt_encoder_engine_fp16 = WhisperEncoderONNXFile(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath_fp16), metadata_fp16).as_trt_engine(
        wh_encoder_engine_name_fp16, 
        profiles=[encoder_profile], 
        preview_features=preview_features
    )
else:
    whisper_trt_encoder_engine_fp16 = WhisperEncoderTRTEngine(wh_encoder_engine_name_fp16, metadata_fp16)
    
if not os.path.exists(wh_decoder_engine_name_fp16):
    whisper_trt_decoder_engine_fp16 = WhisperDecoderONNXFile(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath_fp16), metadata_fp16).as_trt_engine(
        wh_decoder_engine_name_fp16, 
        profiles=[decoder_profile], 
        preview_features=preview_features
    )
else:
    whisper_trt_decoder_engine_fp16 = WhisperDecoderTRTEngine(wh_decoder_engine_name_fp16, metadata_fp16)

In [61]:
# encoder_profiles = [
#             Profile().add(
#                 "input_features",
#                 min=(batch_size, 1),
#                 opt=(batch_size, opt_input_seq_len),
#                 max=(batch_size, max_input_length),
#             )
#         ]

In [62]:
from Whisper.export import WhisperEncoderConverter, WhisperDecoderConverter
from NNDF.models import ONNXModelFile

from NNDF.networks import NetworkMetadata, Precision, Dims
from NNDF.models import ModelFileConverter

print(wh_encoder_engine_name)
print(wh_decoder_engine_name)

model = os.path.join(wh_encoder_onnx_model_path, wh_encoder_onnx_model_fpath) 
# encoder convert to tensorrt
onmf = ONNXModelFile(model, WhisperEncoderConverter, wh_metadata)

output_fpath = wh_encoder_engine_name
profiles=[encoder_profile]
preview_features=preview_features
converter = onmf.default_converter
fpath = model

network_metadata_cp_dct = wh_metadata._asdict()
del network_metadata_cp_dct["precision"]
network_metadata = NetworkMetadata(
    **network_metadata_cp_dct, precision=Precision(fp16=False)
)

./models/openai/whisper-tiny/tensorrt/Whisper-tiny-encoder.onnx-bs1.engine
./models/openai/whisper-tiny/tensorrt/Whisper-tiny-decoder-with-lm-head.onnx-bs1.engine


In [65]:
WhisperEncoderTRTEngine
onnx_class = WhisperEncoderTorchFile
torch_class = WhisperEncoderONNXFile
trt_engine_class = WhisperEncoderTRTEngine

from polygraphy.backend.trt import CreateConfig
from tensorrt import PreviewFeature, MemoryPoolType

# polygraphy
from polygraphy.backend.trt import (
    network_from_onnx_path,
    engine_from_network,
    save_engine,
    Profile,
)

In [66]:
result = trt_engine_class(output_fpath, network_metadata)

trt_inference_config = CreateConfig(
    tf32=True,
    fp16=network_metadata.precision.fp16,
    memory_pool_limits = {MemoryPoolType.WORKSPACE: result.max_trt_workspace * 1024 * 1024},
    profiles=profiles,
    precision_constraints=("obey" if result.use_obey_precision_constraints() else None),
    preview_features=preview_features
)
    

In [77]:
from polygraphy.backend.trt import util as trt_util

In [92]:
network_definition = result.get_network_definition(network_from_onnx_path(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath)))
#network_definition[1].get_input(0).name='input_features'
trt_engine = engine_from_network(
    network_definition, config=trt_inference_config
)
save_engine(trt_engine, output_fpath)

[I]     Configuring with profiles: [Profile().add('input_features', min=(1, 80, 3000), opt=(1, 80, 3000), max=(1, 80, 3000))]
[38;5;11m[W] It looks like some layers in the network have compute precision set, but precision constraints were not enabled. 
    Precision constraints must be set to 'prefer' or 'obey' for layer compute precision to take effect. 
    Note: Layers and their requested precisions were: {'encoder/layers.0/self_attn_layer_norm/ReduceMean': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/Pow': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/ReduceMean_1': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/Add': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/Sqrt': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/Div': 'FLOAT', 'encoder/layers.0/self_attn_layer_norm/Mul': 'FLOAT', 'encoder/layers.0/final_layer_norm/ReduceMean': 'FLOAT', 'encoder/layers.0/final_layer_norm/Pow': 'FLOAT', 'encoder/layers.0/final_layer_norm/ReduceMean_1': 'FLOAT', 'encoder/layers.0/final_l

<tensorrt.tensorrt.ICudaEngine at 0x7fca695021f0>

In [93]:
# decoder tensorrt
model = os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath) 
# encoder convert to tensorrt
onmf = ONNXModelFile(model, WhisperDecoderConverter, wh_metadata)

output_fpath = wh_decoder_engine_name
profiles=[decoder_profile]
preview_features=preview_features
converter = onmf.default_converter
fpath = model

In [95]:

= trt_engine_class(output_fpath, network_metadata)

trt_inference_config = CreateConfig(
    tf32=True,
    fp16=network_metadata.precision.fp16,
    memory_pool_limits = {MemoryPoolType.WORKSPACE: result.max_trt_workspace * 1024 * 1024},
    profiles=profiles,
    precision_constraints=("obey" if result.use_obey_precision_constraints() else None),
    preview_features=preview_features
)
    

In [96]:
network_definition = result.get_network_definition(network_from_onnx_path(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath)))
#network_definition[1].get_input(0).name='input_features'
trt_engine = engine_from_network(
    network_definition, config=trt_inference_config
)
save_engine(trt_engine, output_fpath)

[I]     Configuring with profiles: [Profile().add('input_ids', min=(1, 1), opt=(1, 192), max=(1, 384)).add('encoder_hidden_states', min=(1, 1, 384), opt=(1, 192, 384), max=(1, 384, 384))]
[38;5;11m[W] It looks like some layers in the network have compute precision set, but precision constraints were not enabled. 
    Precision constraints must be set to 'prefer' or 'obey' for layer compute precision to take effect. 
    Note: Layers and their requested precisions were: {'/decoder/Cast_2': 'FLOAT', '/decoder/Cast_3': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/ReduceMean': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/Pow': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/ReduceMean_1': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/Add': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/Sqrt': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/Div': 'FLOAT', '/decoder/layers.0/self_attn_layer_norm/Mul': 'FLOAT', '/decoder/layers.0/encoder_attn_layer_norm/ReduceMean': 'FLOAT', '

<tensorrt.tensorrt.ICudaEngine at 0x7fca6950ae30>

In [63]:
print(wh_encoder_onnx_model_fpath)
print(wh_decoder_onnx_model_fpath)
print(onnx_whisper_encoder)
print(onnx_whisper_decoder)
#onnx_whisper_encoder = whisper_torchfile_encoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_encoder_onnx_model_fpath), force_overwrite=False)
#onnx_whisper_decoder = whisper_torchfile_decoder.as_onnx_model(os.path.join(wh_onnx_model_path, wh_decoder_onnx_model_fpath), force_overwrite=False)

Whisper-tiny-encoder.onnx
Whisper-tiny-decoder-with-lm-head.onnx
<Whisper.export.WhisperEncoderTorchFile object at 0x7f17bc6ac8b0>
<Whisper.export.WhisperDecoderTorchFile object at 0x7f17bc54c040>


In [64]:
encoder_engine_name = os.path.join(t5_tensorrt_model_path, t5_encoder_onnx_model_fpath) + f"-{engine_tag}.engine".replace(f"-beam{num_beams}", "") # encoder engine not affected by beam search
decoder_engine_name = os.path.join(t5_tensorrt_model_path, t5_decoder_onnx_model_fpath) + f"-{engine_tag}.engine"

if not os.path.exists(encoder_engine_name):
    t5_trt_encoder_engine = T5EncoderONNXFile(os.path.join(encoder_onnx_model_path, t5_encoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        encoder_engine_name,
        profiles=[encoder_profile],
        preview_features=preview_features)
else:
    t5_trt_encoder_engine = T5EncoderTRTEngine(encoder_engine_name, t5_metadata)

if not os.path.exists(decoder_engine_name):
    t5_trt_decoder_engine = T5DecoderONNXFile(os.path.join(decoder_onnx_model_path, t5_decoder_onnx_model_fpath), t5_metadata).as_trt_engine(
        decoder_engine_name,
        profiles=[decoder_profile],
        preview_features=preview_features)
else:
    t5_trt_decoder_engine = T5DecoderTRTEngine(decoder_engine_name, t5_metadata)

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the T5 model, ready for us to carry out inference. 

#### Single example inference
The T5 model with TensorRT backend can now be employed in place of the original HuggingFace T5 model.


In [65]:
# Initialize TensorRT engines
from T5.trt import T5TRTEncoder, T5TRTDecoder

t5_trt_encoder = T5TRTEncoder(
                t5_trt_encoder_engine, t5_metadata, t5_config
            )
t5_trt_decoder = T5TRTDecoder(
                t5_trt_decoder_engine, t5_metadata, t5_config, num_beams=num_beams
            )

In [66]:
# Inference on a single sample
encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
outputs = t5_trt_decoder(
    expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state)

In [67]:
# Generate sequence for an input
max_length = 64

decoder_input_ids = torch.full(
    (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32
).to("cuda:0")

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)

#### TRT engine inference benchmark: encoder and decoder stacks
First, we will bechmark the encoder and decoder stacks as before.

In [68]:
%%time
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time


CPU times: user 9.86 ms, sys: 3.14 ms, total: 13 ms
Wall time: 12.6 ms


0.0009066950296983123

In [69]:
%%time
_, decoder_e2e_median_time = decoder_inference(
    t5_trt_decoder, expand_inputs_for_beam_search(input_ids, num_beams) if num_beams > 1 else input_ids, 
    expand_inputs_for_beam_search(encoder_last_hidden_state, num_beams) if num_beams > 1 else encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
decoder_e2e_median_time

CPU times: user 19.3 ms, sys: 0 ns, total: 19.3 ms
Wall time: 19 ms


0.0014868349535390735

### Full model inference benchmark

Next, we will try the full TensorRT T5 engine for the task of translation. As before, note the time difference.

In [70]:
%%time
decoder_output, full_e2e_median_runtime = full_inference(
    t5_trt_encoder,
    t5_trt_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[t5_metadata.variant],
    num_beams=num_beams,
    use_cuda=True,
)

print(tokenizer.decode(decoder_output[0], skip_special_tokens=True))
full_e2e_median_runtime


Das ist gut.
CPU times: user 13.3 ms, sys: 0 ns, total: 13.3 ms
Wall time: 13.1 ms


You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in upto ~10x performance improvement (from 0.0802s to 0.0082s for the T5-small variant).

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch T5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace T5 model while providing significant speed up. 

If you are interested in further details of the conversion process, check out [T5/trt.py](../T5/trt.py)

# Whisper Tensorrt 

In [71]:
from transformers import AutoConfig
from Whisper.trt import WhisperTRTEncoder, WhisperTRTDecoder, TRTHFRunner

In [72]:
# Initialize TensorRT engines
trt_config = AutoConfig.from_pretrained(Whisper_VARIANT, use_cache = metadata.other.kv_cache)

# FP32
whisper_trt_encoder = WhisperTRTEncoder(whisper_trt_encoder_engine, metadata, trt_config, batch_size=batch_size)
whisper_trt_decoder = WhisperTRTDecoder(whisper_trt_decoder_engine, metadata, trt_config, batch_size=batch_size, num_beams=num_beams)

# FP16
whisper_trt_encoder_fp16 = WhisperTRTEncoder(whisper_trt_encoder_engine_fp16, metadata_fp16, trt_config, batch_size=batch_size)
whisper_trt_decoder_fp16 = WhisperTRTDecoder(whisper_trt_decoder_engine_fp16, metadata_fp16, trt_config, batch_size=batch_size, num_beams=num_beams)

In [73]:
wte = TRTHFRunner(trt_engine_file, network_metadata, hf_config, batch_size=batch_size)


NameError: name 'trt_engine_file' is not defined

In [None]:
encoder_last_hidden_states = whisper_trt_encoder(input_features=input_features)


In [111]:
whisper_trt_encoder.get_optimization_profile(1,1)

In [100]:
import tensorrt as trt

In [102]:
trt_logger = trt.Logger()
trt_runtime = trt.Runtime(trt_logger)

In [106]:
with open(whisper_trt_encoder_engine.fpath, "rb") as f:
    trt_logger = trt.Logger()
    trt_runtime = trt.Runtime(trt_logger)
    trt_engine = trt_runtime.deserialize_cuda_engine(f.read())
    trt_context = trt_engine.create_execution_context()


In [109]:
trt_engine.num_optimization_profiles

1

In [108]:
trt_context.active_optimization_profile = 0

  trt_context.active_optimization_profile = 0


In [98]:
whisper_trt_encoder_engine.create_execution_context()

AttributeError: 'WhisperEncoderTRTEngine' object has no attribute 'create_execution_context'

NameError: name 'tensorrt' is not defined

In [87]:
%%time
encoder_last_hidden_state, encoder_e2e_median_time = w_encoder_inference(
    whisper_trt_encoder, input_features, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
)
encoder_e2e_median_time


TypeError: (): incompatible function arguments. The following argument types are supported:
    1. (arg0: tensorrt.tensorrt.IExecutionContext, arg1: int) -> None

Invoked with: <tensorrt.tensorrt.IExecutionContext object at 0x7f7898bc7d70>, None

### End-to-End TensorRT Inference

In [78]:
from transformers.generation_logits_process import (
    NoRepeatNGramLogitsProcessor,
    MinLengthLogitsProcessor,
    ForcedBOSTokenLogitsProcessor,
    ForcedEOSTokenLogitsProcessor,
    LogitsProcessorList,
)
from transformers.generation_stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)
from transformers.generation_beam_search import (
    BeamSearchScorer,
)

stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_output_len)])
no_repeat_ngram_size = WhisperModelTRTConfig.NO_REPEAT_NGRAM_SIZE
min_length = WhisperModelTRTConfig.MIN_OUTPUT_LENGTH[Whisper_VARIANT]
logits_processor = LogitsProcessorList([
    NoRepeatNGramLogitsProcessor(no_repeat_ngram_size), 
    MinLengthLogitsProcessor(min_length, tokenizer.convert_tokens_to_ids(tokenizer.eos_token)),
    ForcedBOSTokenLogitsProcessor(tokenizer.convert_tokens_to_ids(tokenizer.bos_token)),
    ForcedEOSTokenLogitsProcessor(max_output_len, tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
]) # by checking HuggingFace's generate() implementation carefully, the default logits processor for BART has no_repeat_ngram_size = 3 and forced_eos_token_id = 2. In this way we can ensure identical results with raw HuggingFace

decoder_initial_input = torch.full(
    (batch_size, 1), tokenizer.convert_tokens_to_ids(tokenizer.eos_token), dtype=torch.int32
).to('cuda')

if num_beams > 1:
    decoder_initial_input = expand_inputs_for_beam_search(decoder_initial_input, expand_size=num_beams)
    
# FP32
def e2e_trt():
    with torch.no_grad():
        encoder_last_hidden_states = whisper_trt_encoder(input_features=input_features)
        
        if num_beams > 1:
            # prepare input for beam search
            encoder_last_hidden_states = expand_inputs_for_beam_search(encoder_last_hidden_states, expand_size=num_beams)

            # beam scorer must be reset before each beam search run, otherwise beam search will be skipped due to scorer cache
            beam_scorer = BeamSearchScorer(
                batch_size=batch_size,
                num_beams=num_beams,
                device="cuda",
                do_early_stopping=True,
            )
        
        bart_trt_decoder.set_encoder_hidden_states_for_inference_cycle(encoder_last_hidden_states)
        
        if num_beams == 1:
            decoder_output = whisper_trt_decoder.greedy_search(
                input_ids=decoder_initial_input,
                encoder_hidden_states=encoder_last_hidden_states,
                stopping_criteria=stopping_criteria,
                logits_processor=logits_processor,
                use_cache=metadata.other.kv_cache,
                use_cuda=True
            )
        else:
            decoder_output = whisper_trt_decoder.beam_search(
                input_ids=decoder_initial_input,
                beam_scorer=beam_scorer,
                encoder_hidden_states=encoder_last_hidden_states,
                stopping_criteria=stopping_criteria,
                logits_processor=logits_processor,
                use_cache=metadata.other.kv_cache,
                use_cuda=True
            )
    return decoder_output

output_ids = e2e_trt()
outputs_trt = tokenizer.decode(output_ids[0], skip_special_tokens=True)
trt_time = measure_python_inference_code(e2e_trt, timing_profile)

Using bos_token, but it is not set yet.


NameError: name 'bart_trt_decoder' is not defined

In [None]:
len(input_ids.int().flatten().contiguous().cuda())

In [75]:
input_features.shape

torch.Size([1, 80, 3000])

In [63]:
len(input_features.int().flatten().contiguous().cuda())

240000

In [None]:

# FP16
def e2e_trt_fp16():
    with torch.no_grad():
        encoder_last_hidden_states = whisper_trt_encoder_fp16(input_ids=input_ids)
        
        if num_beams > 1:
            # prepare input for beam search
            encoder_last_hidden_states = expand_inputs_for_beam_search(encoder_last_hidden_states, expand_size=num_beams)
            
            # beam scorer must be reset before each beam search run, otherwise beam search will be skipped due to scorer cache
            beam_scorer = BeamSearchScorer(
                batch_size=batch_size,
                num_beams=num_beams,
                device="cuda",
                do_early_stopping=True,
            )
        
        whisper_trt_decoder_fp16.set_encoder_hidden_states_for_inference_cycle(encoder_last_hidden_states)
        
        if num_beams == 1:
            decoder_output = whisper_trt_decoder_fp16.greedy_search(
                input_ids=decoder_initial_input,
                encoder_hidden_states=encoder_last_hidden_states,
                stopping_criteria=stopping_criteria,
                logits_processor=logits_processor,
                use_cache=metadata.other.kv_cache,
                use_cuda=True
            )
        else:
            decoder_output = whisper_trt_decoder_fp16.beam_search(
                input_ids=decoder_initial_input,
                beam_scorer=beam_scorer,
                encoder_hidden_states=encoder_last_hidden_states,
                stopping_criteria=stopping_criteria,
                logits_processor=logits_processor,
                use_cache=metadata.other.kv_cache,
                use_cuda=True
            )
    return decoder_output

output_ids_fp16 = e2e_trt_fp16()
outputs_trt_fp16 = tokenizer.decode(output_ids_fp16[0], skip_special_tokens=True)
trt_time_fp16 = measure_python_inference_code(e2e_trt_fp16, timing_profile)

In [74]:
input_features.shape[2]

3000

In [68]:
# print results and timing statistics
print(f'Device: {torch.cuda.get_device_name()}')
print(f"Using engine: {metadata_string + '-' + engine_tag}")   
print(f'Output identical to HF results? {outputs_trt == outputs_hf}')
print(f"Precision: FP32")
print(f'TRT time: {percentile_print(trt_time)}')
print()
print(f"Using engine: {metadata_string_fp16 + '-' + engine_tag}")   
print(f'Output identical to HF results? {outputs_trt_fp16 == outputs_hf}')
print(f"Precision: FP16")
print(f'TRT time: {percentile_print(trt_time_fp16)}')

Device: NVIDIA A100-SXM4-80GB
Using engine: Whisper-tiny-bs1


NameError: name 'outputs_trt' is not defined