In [19]:
import numpy as np
import torch
import json
from pathlib import Path
import os
import sys
ROOT_DIR = os.path.abspath("../")
from pathlib import Path
sys.path.append(ROOT_DIR)
from NNDF.networks import NetworkMetadata, Precision
from NNDF.torch_utils import expand_inputs_for_beam_search
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata
from T5.trt import T5TRTEncoder, T5TRTDecoder
from T5.export import T5EncoderTRTEngine, T5DecoderTRTEngine
from torch.utils.dlpack import from_dlpack, to_dlpack

# from HuggingFace transformers
from transformers.generation_logits_process import (
    MinLengthLogitsProcessor,
    LogitsProcessorList,
)
from transformers.generation_stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)
from transformers.generation_beam_search import (
    BeamSearchScorer,
)
from transformers import AutoTokenizer, AutoConfig

In [28]:
# Generation settings shared by the cells below.

# Checkpoint name; also used to locate the engine files on disk.
T5_VARIANT = "t5-small"

# Batch and beam configuration.
batch_size = 1
num_beams = 2

# Length limits and stopping behaviour for decoding.
min_length = 30
max_length = 200
early_stopping = False

# Keep the TRT KV cache off for now: its performance work is still in progress
# and it does not yet beat the non-KV-cache version.
use_cache = False

In [29]:

# Load the prebuilt TensorRT engines from disk and prepare the decoding helpers.
metadata = NetworkMetadata(
    variant=T5_VARIANT,
    precision=Precision(fp16=True),
    other=T5Metadata(kv_cache=use_cache),
)

# Engine files are looked up under models/<variant>/tensorrt, named after the
# ONNX file with an ".engine" suffix appended.
encoder_onnx_model_fpath = f"{T5_VARIANT}-encoder.onnx"
decoder_onnx_model_fpath = f"{T5_VARIANT}-decoder-with-lm-head.onnx"
tensorrt_model_path = f"models/{T5_VARIANT}/tensorrt"

# Start from the pretrained config, then override the cache flag and layer count.
trt_config = AutoConfig.from_pretrained(T5_VARIANT)
trt_config.use_cache = metadata.other.kv_cache
trt_config.num_layers = T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT]

t5_trt_encoder_engine = T5EncoderTRTEngine(
    os.path.join(tensorrt_model_path, f"{encoder_onnx_model_fpath}.engine"),
    metadata,
)
t5_trt_decoder_engine = T5DecoderTRTEngine(
    os.path.join(tensorrt_model_path, f"{decoder_onnx_model_fpath}.engine"),
    metadata,
)

t5_trt_encoder = T5TRTEncoder(
    t5_trt_encoder_engine,
    metadata,
    trt_config,
    batch_size=batch_size,
)
t5_trt_decoder = T5TRTDecoder(
    t5_trt_decoder_engine,
    metadata,
    trt_config,
    num_beams=num_beams,
    batch_size=batch_size,
)

tokenizer = AutoTokenizer.from_pretrained(T5_VARIANT)

# The minimum-length processor is only installed when early stopping is off.
logits_processor = LogitsProcessorList(
    []
    if early_stopping
    else [
        MinLengthLogitsProcessor(
            min_length, tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
        )
    ]
)
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length)])
pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

[11/09/2022-04:08:38] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
[11/09/2022-04:08:38] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


In [30]:
# Sample prompts for the summarization demo. Each entry carries the
# "summarize: " prefix; a later cell tokenizes only texts4[3].
texts4 = [
    "summarize: United States involvement in the Vietnam War began shortly after the end of World War II, first in an extremely limited capacity and escalated over a period of 20 years, peaking in April 1969 with 543,000 American combat troops stationed in Vietnam.[1] By the conclusion of the United States's involvement, over 3.1 million Americans had been stationed in Vietnam. This involvement, along with hippie culture, played a key role in sparking the Civil Rights Movement in the United States and wide ranging changes in popular culture.",
    "summarize: Abraham Lincoln (/ˈlɪŋkən/ LINK-ən; February 12, 1809 – April 15, 1865) was an American lawyer and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865. Lincoln led the nation through the American Civil War and succeeded in preserving the Union, abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.",
    "summarize: Elizabeth II (Elizabeth Alexandra Mary; 21 April 1926 – 8 September 2022) was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. She was queen regnant of 32 sovereign states during her lifetime, 15 of them at the time of her death. Her reign of 70 years and 214 days was the longest of any British monarch and the longest verified reign of any female monarch in history. ",
    "summarize: Obama was born in Honolulu, Hawaii. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004."
]

In [60]:
# Tokenize a single prompt (as a batch of one) and move its token ids to the GPU.
tokenized_text = tokenizer(
    [texts4[3]],
    padding=True,
    return_tensors="pt",
)
input_ids = tokenized_text["input_ids"].cuda()

In [61]:
# Display the shape of the tokenized input tensor.
input_ids.shape

torch.Size([1, 92])

In [62]:
# Run one full encoder + decoder pass on the TensorRT engines without autograd.
with torch.no_grad():
    # Encode once and register the encoder output with the decoder before any
    # beam expansion takes place.
    encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
    t5_trt_decoder.set_encoder_hidden_states_for_inference_cycle(encoder_last_hidden_state)

    # Each sequence starts decoding from a single pad token.
    decoder_input_ids = torch.full(
        (batch_size, 1), pad_token_id, dtype=torch.int32, device="cuda"
    )

    # For beam search, expand both decoder inputs by the beam count.
    if num_beams > 1:
        encoder_last_hidden_state = expand_inputs_for_beam_search(
            encoder_last_hidden_state, expand_size=num_beams
        )
        decoder_input_ids = expand_inputs_for_beam_search(
            decoder_input_ids, expand_size=num_beams
        )

    # Arguments common to both search strategies.
    search_kwargs = dict(
        input_ids=decoder_input_ids,
        encoder_hidden_states=encoder_last_hidden_state,
        stopping_criteria=stopping_criteria,
        logits_processor=logits_processor,
    )

    if num_beams == 1:
        decoder_output = t5_trt_decoder.greedy_search(**search_kwargs)
    else:
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device="cuda",
            do_early_stopping=early_stopping,
        )
        decoder_output = t5_trt_decoder.beam_search(
            beam_scorer=beam_scorer,
            use_cache=metadata.other.kv_cache,
            **search_kwargs,
        )

In [63]:
# Turn the generated token ids back into text, dropping special tokens.
outputs = tokenizer.batch_decode(
    decoder_output,
    skip_special_tokens=True,
)

In [64]:
# Display the decoded TensorRT result.
outputs

['Obama was born in Honolulu, Hawaii. he was the first black president of the Harvard Law Review. he was a civil rights attorney and an academic.']

### Hugging Face

In [65]:
# Load the reference PyTorch model for the same checkpoint (T5_VARIANT) so its
# output can be compared with the TensorRT result above.
from transformers import T5ForConditionalGeneration
hf_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)

In [66]:
# Put the model in evaluation mode, then move it to the GPU.
hf_model.eval()
hf_model = hf_model.cuda()

In [67]:
# Generate with the same decoding settings as the TensorRT run (taken from the
# settings cell) instead of hard-coded copies, so the two outputs are comparable.
# In particular, early_stopping now follows the shared setting rather than being
# forced to True.
# NOTE(review): the TensorRT setup omits the min-length processor when
# early_stopping is True, whereas min_length is always passed here — confirm
# whether that case should also be mirrored.
hf_decoder_output = hf_model.generate(
    input_ids,
    max_length=max_length,
    min_length=min_length,
    num_beams=num_beams,
    early_stopping=early_stopping,
)

In [68]:
# Turn the Hugging Face token ids back into text, dropping special tokens.
hf_outputs = tokenizer.batch_decode(
    hf_decoder_output,
    skip_special_tokens=True,
)

In [69]:
# Display the decoded Hugging Face result.
hf_outputs

['Obama was born in Honolulu, Hawaii. he was the first black president of the Harvard Law Review. he was a civil rights attorney and an academic.']