## Install HuggingFace Transformers

In [1]:
!pip install --quiet transformers[sentencepiece]

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.


## Imports

In [2]:
import os
import sys
# Put the parent directory on the import path (presumably so that packages
# checked out next to this notebook's folder can be imported later).
ROOT_DIR = os.path.abspath("../")
# Guard the append so that re-running this cell does not accumulate
# duplicate entries in sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# disable the tokenizers parallelism warning in the notebook
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import configparser
import numpy as np
import torch
import os
import math
from transformers import PreTrainedTokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer 
from FasterTransformer.examples.pytorch.t5.utils.ft_encoder import FTT5EncoderWeight, FTT5Encoder
from FasterTransformer.examples.pytorch.t5.utils.ft_decoding import FTT5DecodingWeight, FTT5Decoding, FTT5

  from .autonotebook import tqdm as notebook_tqdm


## HuggingFace T5 Model

In [5]:
# Hub model id or local checkpoint path used for both the model and the tokenizer.
model_name_or_path = "t5-small"

# Load the HF model and switch it to inference mode; nn.Module.eval() returns
# the module itself, so it can be chained onto the load.
t5_model = T5ForConditionalGeneration.from_pretrained(model_name_or_path).eval()
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

Downloading (…)lve/main/config.json: 100%|███████████████████████████████████████████████████████| 1.21k/1.21k [00:00<00:00, 316kB/s]
Downloading (…)"pytorch_model.bin";: 100%|████████████████████████████████████████████████████████| 242M/242M [00:15<00:00, 16.0MB/s]
Downloading (…)neration_config.json: 100%|██████████████████████████████████████████████████████████| 147/147 [00:00<00:00, 47.5kB/s]
Downloading (…)ve/main/spiece.model: 100%|████████████████████████████████████████████████████████| 792k/792k [00:00<00:00, 2.60MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Define FT T5 Model

In [11]:
import json

In [16]:
# Hand-written T5 configuration as JSON text (d_model=1024, 24 layers, 128 heads).
# NOTE(review): in the cells visible here this text is only parsed and displayed;
# the FT modules further down are built from t5_model's own config instead.
string = """
{
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 65536,
  "d_kv": 128,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 128,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "vocab_size": 32128
}
"""

In [18]:
# Parse the JSON text held in `string` into a plain Python dict.
config = json.loads(string)

In [19]:
# Display the parsed dict to confirm the JSON was read as expected.
config

{'architectures': ['T5WithLMHeadModel'],
 'd_ff': 65536,
 'd_kv': 128,
 'd_model': 1024,
 'decoder_start_token_id': 0,
 'dropout_rate': 0.1,
 'eos_token_id': 1,
 'initializer_factor': 1.0,
 'is_encoder_decoder': True,
 'layer_norm_epsilon': 1e-06,
 'model_type': 't5',
 'n_positions': 512,
 'num_heads': 128,
 'num_layers': 24,
 'output_past': True,
 'pad_token_id': 0,
 'relative_attention_num_buckets': 32,
 'task_specific_params': {'summarization': {'early_stopping': True,
   'length_penalty': 2.0,
   'max_length': 200,
   'min_length': 30,
   'no_repeat_ngram_size': 3,
   'num_beams': 4,
   'prefix': 'summarize: '},
  'translation_en_to_de': {'early_stopping': True,
   'max_length': 300,
   'num_beams': 4,
   'prefix': 'translate English to German: '},
  'translation_en_to_fr': {'early_stopping': True,
   'max_length': 300,
   'num_beams': 4,
   'prefix': 'translate English to French: '},
  'translation_en_to_ro': {'early_stopping': True,
   'max_length': 300,
   'num_beams': 4,
   'pr

In [9]:
# Inspect the HF encoder configuration that the FT encoder is built from.
# Read it straight from the model: the `encoder_config` name is only assigned
# in a later cell, so referencing it here would raise NameError on a fresh
# Restart & Run All.
t5_model.encoder.config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": false,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pref

In [8]:
# Read the hyper-parameters from the loaded HF model so they follow whichever
# checkpoint `model_name_or_path` points at.
encoder_config = t5_model.encoder.config
decoder_config = t5_model.decoder.config
activation_type = encoder_config.feed_forward_proj
tie_word_embeddings = decoder_config.tie_word_embeddings

# single-gpu so set TP=1, PP=1
tensor_para_size = 1
pipeline_para_size = 1
t5_with_bias = False
# Gated feed-forward variants are spelled "gated-<activation>" in
# feed_forward_proj (e.g. "gated-gelu"). Deriving the flag keeps it False for
# t5-small ("relu") while handling gated checkpoints without a manual edit.
use_gated_activation = activation_type.startswith("gated-")
position_embedding_type = 0
weight_data_type = np.float32
# Attention scaling factor handed to the FT modules, computed from the per-head
# size d_kv.
# NOTE(review): presumably this offsets FT's built-in attention scaling, which
# T5 does not apply -- confirm against the FasterTransformer T5 guide.
q_scaling = 1.0 / (math.sqrt(encoder_config.d_kv))

In [7]:
# Keyword options shared by the encoder and decoding weight containers.
_weight_options = dict(
    t5_with_bias=t5_with_bias,
    use_gated_activation=use_gated_activation,
    position_embedding_type=position_embedding_type,
    weight_data_type=weight_data_type,
)

# Create one weight container per half of the model ...
ft_encoder_weight = FTT5EncoderWeight(
    encoder_config, tensor_para_size, pipeline_para_size, **_weight_options
)
ft_decoding_weight = FTT5DecodingWeight(
    decoder_config, tensor_para_size, pipeline_para_size, **_weight_options
)

# ... then populate both from the HuggingFace model loaded earlier.
ft_encoder_weight.load_from_model(t5_model)
ft_decoding_weight.load_from_model(t5_model)



In [8]:
# Precision the FT weights are cast to; must be one of "fp32", "fp16", "bf16".
inference_data_type = "fp16"

if inference_data_type == "fp32":
    ft_encoder_weight.to_float()
    ft_decoding_weight.to_float()
elif inference_data_type == "fp16":
    ft_encoder_weight.to_half()
    ft_decoding_weight.to_half()
elif inference_data_type == "bf16":
    ft_encoder_weight.to_bfloat16()
    ft_decoding_weight.to_bfloat16()
else:
    # Fail loudly on a typo instead of silently leaving the weights uncast.
    raise ValueError(
        f"Unsupported inference_data_type {inference_data_type!r}; "
        "expected one of 'fp32', 'fp16', 'bf16'."
    )

In [9]:
# Build the FasterTransformer encoder and decoding modules from the converted
# weights. Most arguments below are positional, so their order must match the
# FTT5Encoder / FTT5Decoding signatures (not visible in this notebook).
remove_padding = False
# Take the relative-attention distance from the HF config when it exposes one
# (t5-small reports 128); configs lacking the attribute fall back to the
# previously hard-coded 128.
max_distance = getattr(encoder_config, "relative_attention_max_distance", 128)
sparse = False
# NOTE(review): this path is relative to the notebook's working directory,
# whereas ROOT_DIR (the parent directory) is what was added to sys.path --
# confirm the compiled library really lives under ./FasterTransformer.
lib_path = './FasterTransformer/build/lib/libth_transformer.so'

ft_encoder = FTT5Encoder(ft_encoder_weight.w,
                         lib_path,
                         encoder_config.num_heads,
                         encoder_config.d_kv,
                         encoder_config.d_ff,
                         encoder_config.d_model,
                         remove_padding,
                         encoder_config.num_layers,
                         encoder_config.relative_attention_num_buckets,
                         0, # num_experts
                         [], # moe_layer_index
                         max_distance,
                         sparse,
                         q_scaling,
                         tensor_para_size,
                         pipeline_para_size,
                         t5_with_bias,
                         position_embedding_type,
                         activation_type=activation_type)

# NOTE(review): d_model is passed twice (encoder's, then decoder's); presumably
# the first is the size of the encoder output the decoder attends to -- confirm
# against the FTT5Decoding signature.
ft_decoding = FTT5Decoding(ft_decoding_weight.w,
                           lib_path,
                           decoder_config.num_heads,
                           decoder_config.d_kv,
                           decoder_config.d_ff,
                           encoder_config.d_model,
                           decoder_config.d_model,
                           decoder_config.num_layers,
                           decoder_config.decoder_start_token_id,
                           decoder_config.eos_token_id,
                           decoder_config.vocab_size,
                           q_scaling,
                           decoder_config.relative_attention_num_buckets,
                           0, # num_experts
                           [], # moe_layer_index,
                           max_distance,
                           tensor_para_size=tensor_para_size,
                           pipeline_para_size=pipeline_para_size,
                           t5_with_bias=t5_with_bias,
                           position_embedding_type=position_embedding_type,
                           activation_type=activation_type,
                           tie_word_embeddings=tie_word_embeddings)



In [None]:
# Combine the FT encoder and decoding modules into the single callable that the
# "FT Output" section invokes.
ft_t5 = FTT5(ft_encoder, ft_decoding)

## Define Inputs

In [10]:
# Two prompts with different task prefixes, tokenized together as one batch.
INPUTS = [
    "translate English to French: Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems",
    "translate English to German: Giant sequoia trees are the largest trees by volume in the world",
]

# padding=True pads the shorter prompt so both rows fit in one PyTorch tensor.
inputs = tokenizer(INPUTS, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"]
batch_size = len(INPUTS)

In [11]:
# Check the shape of the padded batch: (number of prompts, padded token length).
input_ids.shape

torch.Size([2, 22])

## Set Generation Settings

In [12]:
# Maximum output length, shared by both back-ends: passed to HF generate() as
# max_length and to the FT model as max_seq_len.
max_seq_len = 64

## HF Output

In [13]:
# Reference generation with the HuggingFace model.
# The batch is padded, so the attention mask produced by the tokenizer is passed
# explicitly instead of relying on generate() to infer it from the pad tokens.
output = t5_model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    max_length=max_seq_len,
)
# Turn the generated ids back into text, dropping pad/eos markers.
hf_tokens = tokenizer.batch_decode(output, skip_special_tokens=True)
hf_tokens

["L'intelligence artificielle est la simulation des processus de l'intelligence humaine par des machines, en particulier des systèmes informatiques.",
 'Riesensequoien sind die größten Baumarten weltweit']

## FT Output

In [15]:
# Decoding settings forwarded to the FT model call in the next cell.
beam_search_diversity_rate = 0.0
# beam width (1 = a single hypothesis per input)
num_beams = 1
# top-k / top-p sampling parameters (both left at zero here)
topk = 0
topp = 0.0

# Example: prevent the model from generating "Chef".
# NOTE(review): `to_word_list_format` is not imported in the cells visible here;
# import it before enabling this snippet.
# bad_words_text = np.array([["Chef"]] * len(INPUTS), dtype=object)
# bad_words_list = to_word_list_format(bad_words_text, tokenizer)
# bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous()
bad_words_list = None

# Example: stop generation as soon as the model generates "Chef"
# (needs the same `to_word_list_format` helper as above).
# stop_words_text = np.array([["Chef"]] * len(INPUTS), dtype=object)
# stop_words_list = to_word_list_format(stop_words_text, tokenizer)
# stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous()
stop_words_list = None

# Neutral values for the remaining settings passed to the FT call.
repetition_penalty = 1.0
temperature = 1.0
len_penalty = 0.0

In [16]:
# Decoding settings, grouped so the call below stays short.
_ft_generation_kwargs = dict(
    beam_size=num_beams,
    max_seq_len=max_seq_len,
    top_k=topk,
    top_p=topp,
    beam_search_diversity_rate=beam_search_diversity_rate,
    is_return_output_log_probs=False,
    is_return_cum_log_probs=False,
    repetition_penalty=repetition_penalty,
    temperature=temperature,
    len_penalty=len_penalty,
    bad_words_list=bad_words_list,
    stop_words_list=stop_words_list,
)

# Run FT generation on the tokenized batch. Returned values:
#   ft_output_ids      -- shape [batch_size, beam_width, max_output_seq_len]
#   ft_sequence_length -- shape [batch_size, beam_width]
ft_output_ids, ft_sequence_length = ft_t5(
    input_token=inputs,
    inputs_embeds=None,
    **_ft_generation_kwargs,
)

In [17]:
# For every sample keep only beam 0 (the top sequence out of beam-width many)
# and trim it to the length reported for that beam.
ft_outputs = [
    list(ft_output_ids[sample, 0, :][:ft_sequence_length[sample, 0]])
    for sample in range(batch_size)
]
# Decode the trimmed id lists back to text, dropping special tokens.
ft_tokens = tokenizer.batch_decode(ft_outputs, skip_special_tokens=True)

ft_tokens

["L'intelligence artificielle est la simulation des processus de l'intelligence humaine par des machines, en particulier des systèmes informatiques.",
 'Riesensequoien sind die größten Baumarten weltweit']