In [1]:
# !pip install -r ../requirements.txt

In [2]:
# !cat ../requirements.txt

In [3]:
import sys
sys.path.insert(0, "/home/ec2-user/working/lib/python/")

In [4]:
import os
import sys
import torch 
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

# huggingface
# from transformers import (
#     GPT2LMHeadModel,
#     GPT2Tokenizer,
#     GPT2Config,
# )

In [6]:
import transformers
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import GPT2Tokenizer
from transformers import LlamaTokenizer

In [7]:
transformers.__version__

'4.33.2'

In [8]:
model_name = "llama-xs"
max_length = 20

xs_config = LlamaConfig(
    hidden_size=4*4,
    intermediate_size=128,
    num_hidden_layers=4,
    num_attention_heads=4,
    num_key_value_heads=None,
    hidden_act='silu',
    max_position_embeddings=128,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=False,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
)
xs_config

LlamaConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 16,
  "initializer_range": 0.02,
  "intermediate_size": 128,
  "max_position_embeddings": 128,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.33.2",
  "use_cache": false,
  "vocab_size": 32000
}

In [9]:
model = LlamaForCausalLM(xs_config)

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [11]:
prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=30)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Hey, are you conscious? Can you talk to me?� bowl waited 1024secut pitch !=345 mph Censusispers climbAround Cel principavor African reproduction'

In [12]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(model_name)
!mkdir -p $pytorch_model_dir

model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/llama-xs/pytorch


In [13]:
pytorch_model_dir

'./models/llama-xs/pytorch'

### Inference with PyTorch model

In [14]:
# carry out inference with a single sample
input_str = "Hello, my dog is a dog"
inputs = tokenizer(input_str, return_tensors="pt")
input_ids = inputs.input_ids

In [15]:
input_ids, input_ids.shape

(tensor([[15496,    11,   616,  3290,   318,   257,  3290]]),
 torch.Size([1, 7]))

In [16]:
# WAR: Using an ugly representation because cuda 11.4 does not support GPU models due to cublas errors
if "cuda-11.4" in os.environ["LD_LIBRARY_PATH"]:
    print("Using cpu")
    model = model.cpu()
    input_ids = input_ids.cpu()
    inputs = inputs.to('cpu')
else:
    print("Using gpu")
    model = model.cuda()
    input_ids = input_ids.cuda()
    inputs = inputs.to('cuda:0')

Using gpu


#### Single example inference

In [17]:
model.eval()
with torch.no_grad():
    outputs = model(**inputs, labels=inputs['input_ids'], use_cache = False)

logits = outputs.logits

In [18]:
logits, logits.shape

(tensor([[[-0.0661,  0.1066,  0.0253,  ..., -0.0424, -0.0400,  0.0622],
          [ 0.0667,  0.0418,  0.1152,  ..., -0.0344,  0.1413,  0.0101],
          [-0.0968, -0.0149,  0.0534,  ..., -0.0528,  0.1104, -0.0157],
          ...,
          [-0.0481, -0.0632, -0.0397,  ..., -0.0672,  0.1063, -0.0620],
          [-0.0099, -0.0088, -0.0292,  ...,  0.0577, -0.0259, -0.0685],
          [-0.0105, -0.0950, -0.2092,  ..., -0.0007, -0.1921, -0.0915]]],
        device='cuda:0'),
 torch.Size([1, 7, 32000]))

<a id="2"></a>

## 2. Convert to ONNX format

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format: ONNX.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

At a high level, the steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained image segmentation PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU with the TensorRT engine. 

In [19]:
from NNDF.networks import NetworkMetadata, Precision
from GPT2.GPT2ModelConfig import GPT2Metadata
metadata = NetworkMetadata(variant=model_name, precision=Precision(fp16=True), other=GPT2Metadata(kv_cache=False))
metadata

NetworkMetadata(variant='llama-xs', precision=Precision(fp16=True), other=GPT2Metadata(kv_cache=False))

In [20]:
from torch.nn import Module
from transformers.generation_utils import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast
class TorchModule(Module, GenerationMixin):
    """
    A simplied definition of Llama.
    """

    def __init__(self, llama_model, lm_head, config):
        super().__init__()
        self.llama_model = llama_model
        self.lm_head = lm_head
        self.config = config
        self.device = torch.device('cuda') # WAR to avoid beam search in framework
        self.main_input_name = "input_ids" # For better HuggingFace version compatibility

    # def prepare_inputs_for_generation(self, input_ids, past = None, use_cache=None, **kwargs):
    #     # Todo (@pchadha): add position_ids, token_type_ids support
    #     # cut decoder_input_ids if past is used
    #     if past is not None:
    #         input_ids = input_ids[:, -1:]

    #     return {
    #         "input_ids": input_ids,
    #         "use_cache": use_cache,
    #         "past_key_values": past
    #     }

    def forward(self, input_ids, **kwargs):
        outputs = self.llama_model(input_ids, **kwargs)
        hidden_states = outputs[0]
        lm_logits = self.lm_head(hidden_states)

        return CausalLMOutputWithPast(
            logits=lm_logits, 
            past_key_values=outputs.past_key_values
        )

    # def _reorder_cache(self, past, beam_idx):
    #     """
    #     This function is used to re-order the :obj:`past_key_values` cache if
    #     :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
    #     called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
    #     """
    #     return tuple(
    #         tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
    #         for layer_past in past
    #     )

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)



In [21]:
input_names = ('input_ids',)
output_names = ('logits',)
input_dynamic_axis = {'input_ids': {0: 'batch', 1: 'sequence'}}
output_dynamic_axis = {'logits': {0: 'batch', 1: 'sequence'}}

opt_args = {}

output_fpath = ('./models/{}/ONNX/{}.onnx'.format(model_name, model_name))
output_fpath_sim = ('./models/{}/ONNX/{}_sim.onnx'.format(model_name, model_name))
Path(output_fpath).parent.mkdir(exist_ok=True, parents=True)

In [22]:
llama_model = TorchModule(model.model, model.lm_head, xs_config)

In [23]:
old_forward = llama_model.forward
def _export_forward(input_ids, **kwargs):
    kwargs["use_cache"] = False
    result = old_forward(input_ids, **kwargs)
    return result[0]


llama_model.forward = _export_forward

torch.onnx.export(
    llama_model,
    input_ids,
    output_fpath,
    opset_version=13,
    do_constant_folding=True,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes={
        **input_dynamic_axis,
        **output_dynamic_axis,
    },
    training=torch.onnx.TrainingMode.EVAL,
    **opt_args
)

  if input_shape[-1] > 1:
  if seq_len > self.max_seq_len_cached:
  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):


In [24]:
output_fpath

'./models/llama-xs/ONNX/llama-xs.onnx'

In [25]:
# !onnxsim $output_fpath $output_fpath_sim

In [26]:
!ls -lh $pytorch_model_dir

total 4.1M
-rw-r--r-- 1 root root  575 Sep 20 20:51 config.json
-rw-r--r-- 1 root root  133 Sep 20 20:51 generation_config.json
-rw-r--r-- 1 root root 4.1M Sep 20 20:51 pytorch_model.bin


In [27]:
!ls -lh $output_fpath

-rw-r--r-- 1 root root 4.2M Sep 20 20:51 ./models/llama-xs/ONNX/llama-xs.onnx


In [28]:
Path(output_fpath).stat().st_size / 1e6

4.342756

Let's take a look at the onnx file and investigate its input and output. You should see that "input_ids" as the input, and "logits" as the output.

In [29]:
onnx_path = output_fpath
onnx_path_sim = output_fpath_sim

In [30]:
import onnx

In [31]:
onnx_model = onnx.load(onnx_path)

In [32]:
onnx_model.graph.input

[name: "input_ids"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
    }
  }
}
]

In [33]:
onnx_model.graph.output

[name: "logits"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_param: "sequence"
      }
      dim {
        dim_value: 32000
      }
    }
  }
}
]

<a id="3"></a>

## 3. Convert to TensorRT engine

Now we are ready to parse the ONNX model and convert it to an optimized TensorRT model.

Since the model contains dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

Note: As TensorRT carries out many optimization, this conversion process for the larger model might take a while.

In [34]:
from polygraphy.backend.trt import Profile
from tensorrt import PreviewFeature
from GPT2.export import GPT2ONNXFile, GPT2TRTEngine

In [35]:
!mkdir -p ./models/$model_name/trt-engine
trt_engine_folder = './models/{}/trt-engine'.format(model_name)

# Create optimization profile for dynamic shape input. Can modify batch_size / max_sequence_length to build engines for different shapes
batch_size = 1
disable_preview_dynamic_shapes = False # preview_dynamic_shapes optimizes the trt engine building time
# We can either use input length as the optimal length, or use max_length // 2. 
# In T5 or BART, input_length is better, but in GPT-2, max_length // 2 is better because we need to generate max_length number of tokens

use_input_length = False
opt_length = input_id.shape[1] if use_input_length else max_length // 2 
# Create different engine tags for different configurations
engine_tag = f"bs{batch_size}"
preview_features = [PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-noPreviewFasterDynamicShapes"
else:
    preview_features += [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]

profiles = [Profile().add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, opt_length), # Optimized based on the inputs. 
    max=(batch_size, max_length),
)]

In [36]:
profiles

[Profile().add('input_ids', min=(1, 1), opt=(1, 10), max=(1, 20))]

In [37]:
preview_features

[<PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805: 1>,
 <PreviewFeature.FASTER_DYNAMIC_SHAPES_0805: 0>]

In [38]:
engine_path = os.path.join(trt_engine_folder, f"{model_name}-{engine_tag}.engine")

if os.path.exists(engine_path):
    os.remove(engine_path)

if not os.path.exists(engine_path):
    gpt2_engine = GPT2ONNXFile(onnx_path, metadata).as_trt_engine(output_fpath=engine_path, profiles=profiles, preview_features=preview_features)
else:
    gpt2_engine = GPT2TRTEngine(engine_path, metadata)

[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored
[V] Loaded Module: tensorrt | Version: 8.6.1 | Path: ['/usr/local/lib/python3.10/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +1, GPU +0, now: CPU 2075, GPU 213 (MiB)
[X] Trying to load shared library libnvinfer_builder_resource.so.8.6.1
[X] Loaded shared library libnvinfer_builder_resource.so.8.6.1
[V] [MemUsageChange] Init builder kernel library: CPU +889, GPU +172, now: CPU 3041, GPU 385 (MiB)
[X] CUDA lazy loading is enabled.
[V] ----------------------------------------------------------------
[V] Input filename:   ./models/llama-xs/ONNX/llama-xs.onnx
[V] ONNX IR version:  0.0.7
[V] Opset version:    13
[V] Producer name:    pytorch
[V] Producer version: 1.13.1
[V] Domain:           
[V] Model version:    0
[V] Doc string:       
[V] ----------------------------------------------------------------
[X] Register

In [39]:
gpt2_engine

<GPT2.export.GPT2TRTEngine at 0x7fbf109a5630>

In [40]:
Path(engine_path).stat().st_size / 1e6

3.2077

In [41]:
1/0

ZeroDivisionError: division by zero

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the GPT-2 model, ready for us to carry out inference. 

The GPT-2 model with TensorRT backend can now be employed in place of the original HuggingFace GPT-2 model.

#### Single batch inference


In [None]:
from GPT2.trt import GPT2TRTDecoder
config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache = False)

In [None]:
gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)

In [None]:
# Benchmarking TensorRT performance on single batch
_, decoder_e2e_median_time = gpt2_inference(
            gpt2_trt, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
        )
decoder_e2e_median_time

In [None]:
with torch.no_grad():
    outputs = gpt2_trt(input_ids=input_ids)
logits = outputs.logits

In [None]:
logits, logits.shape

#### Open-end text generation
Let's generate the same task again. Since GPT-2 is an open-ended model, a small turbulent in the model might have a very different result. Since we have done some format changes and input/output restriction while exporting the model, you might see a different result compared to raw HuggingFace model.  

In [None]:
%%timeit
sample_output = gpt2_trt.generate(input_ids.cuda(), max_length=max_length)

# de-tokenize model output to raw text
tokenizer.decode(sample_output[0], skip_special_tokens=True)

In [None]:
# get complete decoder inference result and its timing profile
_, full_e2e_median_runtime = full_inference(
    gpt2_trt, input_ids.cuda(), tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length
)
full_e2e_median_runtime

You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in about ~5x performance improvement for the GPT-2 model (from an average of 0.704s to 0.134s).

Now you have known how to convert a model to onnx, build TRT engine and optimize it. As you might have recalled, using kv cache and beam search are two important ways to improve the performance of the decoder models. We have recently added thse support to our HuggingFace demo. 

<a id="4"></a>

## 4. Advanced Topic: KV Cache

As you have seen above, we put `use_cache = False` in some code blocks. This is because in the simplified model, we only take `input_ids` as input and `logits` as output. `input_ids` is growing as the sequence goes longer. In reality, we sometimes cache the self-attentions for each layer and reuse them in the later computations. This allows us to only take the last generated `input_ids`. This is a trade-off between space and time. When the model is small or the sequence is small, the D2D data copy time usually outweights the performance improvement of the model. However, performance improvements have been found in larger models with larger sequence length like 512. 

In [None]:
use_cache = True
kv_config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache = use_cache)

#### Raw HuggingFace

The model that we download from `GPT2LMHeadModel.from_pretrained` is dynamic in its inputs. It can take both kv and non-kv configurations. Changing `use_cache` will do it. You can see that changing this configuration, the output is changed. 

In [None]:
# get complete decoder inference result and its timing profile
_, full_e2e_median_runtime = full_inference(
    model, input_ids, tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length, use_cache = use_cache
)
full_e2e_median_runtime

In [None]:
sample_output = model.generate(input_ids, max_length=max_length, use_cache = use_cache)

# de-tokenize model output to raw text
tokenizer.decode(sample_output[0], skip_special_tokens=True)

#### TensorRT

For the 1st decoding step, we take `input_ids` and generate both `logits` and the kv cache. In other steps, we take the new `input_ids` with `past` kv-cache and the outputs are `logits` and the updated `present` kv-cache. Taking dynamic number of inputs for trt is not currently supported in our demo, so we need to output 2 onnx files and build 2 engines.

In [None]:
kv_metadata = NetworkMetadata(variant=GPT2_VARIANT, precision=Precision(fp16=True), other=GPT2Metadata(kv_cache=use_cache))
kv_gpt2 = GPT2TorchFile(model.to('cpu'), kv_metadata)

In [None]:
kv_onnx_path = ('./models/{}/ONNX/{}-kv_cache.onnx'.format(GPT2_VARIANT, GPT2_VARIANT))
kv_gpt2.as_onnx_model(kv_onnx_path, force_overwrite=False)

In [None]:
kv_onnx_model = onnx.load(kv_onnx_path)

We could see that the kv model has #inputs = #outputs = num_layers * 2 + 1

In [None]:
len(kv_onnx_model.graph.input), len(kv_onnx_model.graph.output)

The next blocks will set up the profile and build the engine. The only difference is that we now have the profile for kv cache

In [None]:
batch_size = 1
disable_preview_dynamic_shapes = False

engine_tag = "bs{}".format(batch_size)

preview_features = [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-disableFasterDynamicShapes"
    preview_features = []

use_input_length = False
num_heads = kv_config.n_head
embedding_size_per_head = kv_config.n_embd // num_heads
num_layers = kv_config.n_layer

max_sequence_length = max_length
max_output_length = max_length
if not use_input_length:
    opt_input_seq_len = max_sequence_length // 2
else:
    opt_input_seq_len = input_ids.shape[1]

opt_output_seq_len = max_output_length // 2

# context phase uses the provided input_ids to generate hidden states and self attention kv cache
# It is only used in the 1st decoder run.
dec_profiles_context = Profile().add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, opt_output_seq_len),
    max=(batch_size, max_output_length),
)
self_attention_profile_context = {
    "min": (batch_size, num_heads, 0, embedding_size_per_head),
    "opt": (batch_size, num_heads, 0, embedding_size_per_head),
    "max": (batch_size, num_heads, 0, embedding_size_per_head),
}

# generation phase uses previous self attention kv cache with the last input_ids token to generate the next hidden states and self attention kv cache
# This optimization profile is used after the 1st decoder run.
dec_profiles_generation = Profile().add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, 1),
    max=(batch_size, 1),
)

self_attention_profile_generation = {
    "min": (batch_size, num_heads, 1, embedding_size_per_head),
    "opt": (batch_size, num_heads, opt_output_seq_len - 1, embedding_size_per_head),
    "max": (batch_size, num_heads, max_output_length - 1, embedding_size_per_head),
}

for i in range(num_layers):
    dec_profiles_context = dec_profiles_context.add(
        f"past_key_values.{i}.decoder.key",
        **self_attention_profile_context
    ).add(
        f"past_key_values.{i}.decoder.value",
        **self_attention_profile_context
    )

    dec_profiles_generation = dec_profiles_generation.add(
        f"past_key_values.{i}.decoder.key",
        **self_attention_profile_generation
    ).add(
        f"past_key_values.{i}.decoder.value",
        **self_attention_profile_generation
    )

# TensorRT accepts multiple optimization engines for the same model.
# Profile 1 is only used in the first decoder iterations.
decoder_profiles = [dec_profiles_generation, dec_profiles_context]

In [None]:
kv_engine_path = os.path.join(trt_engine_folder, f"{GPT2_VARIANT}-kv_cache_{engine_tag}.engine")

# Set up the trt engine with both kv input/output augmented
if not os.path.exists(kv_engine_path):
    kv_gpt2_engine = GPT2ONNXFile(kv_onnx_path, kv_metadata).as_trt_engine(kv_engine_path,profiles=decoder_profiles, preview_features=preview_features)
else:
    kv_gpt2_engine = GPT2TRTEngine(kv_engine_path, kv_metadata)

    
kv_gpt2_trt = GPT2TRTDecoder(
    kv_gpt2_engine, kv_metadata, kv_config, batch_size=batch_size
)

Since we have 2 profiles, benchmarking single-run runtime does not make sense. We instead use `full_inference` to measure the time for the entire inference cycle.

In [None]:
# get complete decoder inference result and its timing profile
_, full_e2e_median_runtime = full_inference(
    kv_gpt2_trt, input_ids.cuda(), tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length, use_cache = use_cache
)
full_e2e_median_runtime

In [None]:
kv_gpt2_trt.reset()
kv_sample_output = kv_gpt2_trt.generate(input_ids.cuda(), max_length=max_length)
tokenizer.decode(kv_sample_output[0], skip_special_tokens=True)

In this short example, kv cache performance does not improve the performance, and may even be slightly worse than non kv cache mode. However, when we have larger input sequences for the model, it will be better.

<a id="5"></a>

## 5. Advanced Topic: Beam Search

Beam search is a way to increase the model quality. It looks for the top `num_beams` number of possible words and pick the one that conditions the best to the current position. Similarly, the original HuggingFace PyTorch model supports beam search natively, while we need to build separate trt engine for different `num_beams`.

In [None]:
beam_config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache = False)
beam_metadata = NetworkMetadata(variant=GPT2_VARIANT, precision=Precision(fp16=True), other=GPT2Metadata(kv_cache=False))
num_beams = 3

#### HuggingFace

In [None]:
# get complete decoder inference result and its timing profile
_, full_e2e_median_runtime = full_inference(
    model, input_ids, tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length, num_beams = num_beams
)
full_e2e_median_runtime

In [None]:
sample_output = model.generate(input_ids, max_length=max_length, num_beams = num_beams)

# de-tokenize model output to raw text
tokenizer.decode(sample_output[0], skip_special_tokens=True)

You could see that the output is very different from the original one. If you change `num_beams`, the result will also change significantly.

#### TensorRT
It uses the same onnx file as the original configuration, but the engine set up is differently, because it expands the inputs by `num_beams` for the first dimension of inputs.

In [None]:
# Create optimization profile for dynamic shape input. Can modify batch_size / max_sequence_length to build engines for different shapes
batch_size = 1
disable_preview_dynamic_shapes = False # preview_dynamic_shapes optimizes the trt engine building time
# We can either use input length as the optimal length, or use max_length // 2. 
# In T5 or BART, input_length is better, but in GPT-2, max_length // 2 is better because we need to generate max_length number of tokens

use_input_length = False
opt_length = input_id.shape[1] if use_input_length else max_length // 2 
# Create different engine tags for different configurations
engine_tag = f"bs{batch_size}-beam{num_beams}"

preview_features = [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
if disable_preview_dynamic_shapes:
    engine_tag += "-disableFasterDynamicShapes"
    preview_features = []
    

beam_profiles = [Profile().add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, opt_length), # Optimized based on the inputs. 
    max=(batch_size * num_beams, max_length),
)]

In [None]:
beam_profiles

In [None]:
beam_engine_path = os.path.join(trt_engine_folder, f"{GPT2_VARIANT}-{engine_tag}.engine")
if not os.path.exists(beam_engine_path):
    beam_gpt2_engine = GPT2ONNXFile(onnx_path, beam_metadata).as_trt_engine(output_fpath=beam_engine_path, profiles=beam_profiles, preview_features=preview_features)
else:
    beam_gpt2_engine = GPT2TRTEngine(beam_engine_path, beam_metadata)

In [None]:
beam_gpt2_trt = GPT2TRTDecoder(beam_gpt2_engine, beam_metadata, beam_config, num_beams = num_beams)

In [None]:
# get complete decoder inference result and its timing profile
_, full_e2e_median_runtime = full_inference(
    beam_gpt2_trt, input_ids.cuda(), tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length, num_beams=num_beams
)
full_e2e_median_runtime

In [None]:
beam_sample_output = beam_gpt2_trt.generate(input_ids.cuda(), max_length=max_length, num_beams=num_beams)
tokenizer.decode(beam_sample_output[0], skip_special_tokens=True)

We could see that because of larger batch size, beam search will take slightly longer, but for most sequences, it will generate more meaningful outputs.

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch GPT-2 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace GPT-2 model while providing significant speed up. 

If you are interested in further details of the conversion process, check out [GPT2/trt.py](../GPT2/trt.py)