### Install HuggingFace Transformers

In [1]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers[sentencepiece]
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 7.6 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[K     |████████████████████████████████| 190 kB 127.6 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 112.4 MB/s eta 0:00:01
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 121.1 MB/s eta 0:00:01
Installing collected packages: filelock, tokenizers, huggingface-hub

## Imports

In [2]:
import os
import sys
# Silence the tokenizers parallelism warning inside the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Put the parent directory on the import path so packages that live next to
# this notebook's folder can be imported.
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

In [4]:
import configparser
import numpy as np
import torch
import os
import math
from transformers import PreTrainedTokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer 
from FasterTransformer.examples.pytorch.t5.utils.ft_encoder import FTT5EncoderWeight, FTT5Encoder
from FasterTransformer.examples.pytorch.t5.utils.ft_decoding import FTT5DecodingWeight, FTT5Decoding, FTT5

## Set HuggingFace T5 Model

In [5]:
# Hugging Face model name or local checkpoint path to load.
model_name_or_path = 't5-base'

# Load the reference model and switch it to evaluation mode in one step
# (nn.Module.eval() returns the module itself), then load the matching tokenizer.
t5_model = T5ForConditionalGeneration.from_pretrained(model_name_or_path).eval()
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

Downloading (…)lve/main/config.json: 100%|███████████████████████████████████████████████████████| 1.21k/1.21k [00:00<00:00, 290kB/s]
Downloading (…)"pytorch_model.bin";: 100%|█████████████████████████████████████████████████████████| 892M/892M [00:04<00:00, 204MB/s]
Downloading (…)neration_config.json: 100%|██████████████████████████████████████████████████████████| 147/147 [00:00<00:00, 46.5kB/s]
Downloading (…)ve/main/spiece.model: 100%|████████████████████████████████████████████████████████| 792k/792k [00:00<00:00, 2.56MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Set Up the FasterTransformer (FT) T5 Model

In [12]:
# Take the encoder and decoder configs from the Hugging Face model.
encoder_config = t5_model.encoder.config
decoder_config = t5_model.decoder.config

# Add the expert-related fields that are read from these configs when the FT
# encoder/decoding objects are constructed; this setup uses no experts.
# The dict is rebuilt on each iteration so every config gets its own empty list.
for _cfg in (encoder_config, decoder_config):
    _cfg.update({"num_experts": 0, "moe_layer_index": []})

# Values taken straight from the model configs.
activation_type = encoder_config.feed_forward_proj
tie_word_embeddings = decoder_config.tie_word_embeddings

# Single GPU: tensor-parallel (TP) and pipeline-parallel (PP) sizes are both 1.
tensor_para_size = pipeline_para_size = 1

# Fixed options passed to the FT weight and model constructors.
t5_with_bias = False
use_gated_activation = False
position_embedding_type = 0
weight_data_type = np.float32

# Query scaling factor: reciprocal of the square root of the per-head key/value size.
q_scaling = 1.0 / math.sqrt(encoder_config.d_kv)

In [13]:
# Keyword options shared by the encoder and decoding weight containers.
weight_kwargs = dict(
    t5_with_bias=t5_with_bias,
    use_gated_activation=use_gated_activation,
    position_embedding_type=position_embedding_type,
    weight_data_type=weight_data_type,
)

# Build one weight container per half of the model, each from its own config.
ft_encoder_weight = FTT5EncoderWeight(
    encoder_config, tensor_para_size, pipeline_para_size, **weight_kwargs
)
ft_decoding_weight = FTT5DecodingWeight(
    decoder_config, tensor_para_size, pipeline_para_size, **weight_kwargs
)

# Populate both containers from the Hugging Face model (encoder first, then decoding).
for _weights in (ft_encoder_weight, ft_decoding_weight):
    _weights.load_from_model(t5_model)



In [14]:
# When FP16 is requested, call to_half() on both FT weight containers.
use_fp16 = True
if use_fp16:
    for _weights in (ft_encoder_weight, ft_decoding_weight):
        _weights.to_half()

In [17]:
# Runtime options for the FT modules.
remove_padding = False
max_distance = 128  # NOTE(review): presumably the relative-attention max distance; confirm against FT
sparse = False

# Compiled FasterTransformer PyTorch extension, resolved relative to the working directory.
lib_path = './FasterTransformer/build/lib/libth_transformer.so'

# FT encoder. Most arguments are positional, so their order must not change.
ft_encoder = FTT5Encoder(
    ft_encoder_weight.w,
    lib_path,
    encoder_config.num_heads,
    encoder_config.d_kv,
    encoder_config.d_ff,
    encoder_config.d_model,
    remove_padding,
    encoder_config.num_layers,
    encoder_config.relative_attention_num_buckets,
    encoder_config.num_experts,
    encoder_config.moe_layer_index,
    max_distance,
    sparse,
    q_scaling,
    tensor_para_size,
    pipeline_para_size,
    t5_with_bias,
    position_embedding_type,
    activation_type=activation_type,
)

# FT decoding. It receives both the encoder's and the decoder's d_model, in that order.
ft_decoding = FTT5Decoding(
    ft_decoding_weight.w,
    lib_path,
    decoder_config.num_heads,
    decoder_config.d_kv,
    decoder_config.d_ff,
    encoder_config.d_model,
    decoder_config.d_model,
    decoder_config.num_layers,
    decoder_config.decoder_start_token_id,
    decoder_config.eos_token_id,
    decoder_config.vocab_size,
    q_scaling,
    decoder_config.relative_attention_num_buckets,
    decoder_config.num_experts,
    decoder_config.moe_layer_index,
    max_distance,
    tensor_para_size=tensor_para_size,
    pipeline_para_size=pipeline_para_size,
    t5_with_bias=t5_with_bias,
    position_embedding_type=position_embedding_type,
    activation_type=activation_type,
    tie_word_embeddings=tie_word_embeddings,
)

# Wrap the two halves in a single callable model.
ft_t5 = FTT5(ft_encoder, ft_decoding)



In [47]:
# A single translation prompt (one batch element). Add more strings to this
# list to run a larger batch; batch_size follows the list length.
INPUTS = [
    "translate English to French: Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems"
]
batch_size = len(INPUTS)

# padding=True pads every prompt to the longest one in the batch;
# return_tensors="pt" returns PyTorch tensors.
inputs = tokenizer(INPUTS, padding=True, return_tensors="pt")
input_ids = inputs['input_ids']

In [48]:
# Display the shape of the tokenized prompt batch.
input_ids.shape

torch.Size([1, 21])

## Set beam width, max length, and other settings

In [49]:
# Maximum output length, used for both the HF and the FT generation calls.
max_seq_len = 60

# Beam search: number of beams and the diversity rate passed to FT.
num_beams = 2
beam_search_diversity_rate = 0.0

# Top-k / top-p sampling parameters passed to FT.
topk, topp = 0, 0.0

## HF Output

In [50]:
# Reference result: generate with the Hugging Face model using the same
# length limit and beam count as the FT run, then decode to text.
output = t5_model.generate(
    input_ids,
    max_length=max_seq_len,
    num_beams=num_beams,
)
hf_tokens = tokenizer.batch_decode(output, skip_special_tokens=True)
hf_tokens

["L'intelligence artificielle est la simulation des processus de l'intelligence humaine par des machines, en particulier des systèmes informatiques."]

## FT Output

In [51]:
# Penalty and temperature settings passed to the FT model.
repetition_penalty = 1.0
temperature = 1.0
len_penalty = 0.0

# Optional word lists. Both are disabled here.
# NOTE: the examples below call to_word_list_format, which is not imported
# in this notebook; import it before enabling either example.

# Example: prevent the model from generating "Chef"
# bad_words_text = np.array([["Chef"]] * len(INPUTS), dtype=object)
# bad_words_list = to_word_list_format(bad_words_text, tokenizer)
# bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous()
bad_words_list = None

# Example: stop generation once the model generates "Chef"
# stop_words_text = np.array([["Chef"]] * len(INPUTS), dtype=object)
# stop_words_list = to_word_list_format(stop_words_text, tokenizer)
# stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous()
stop_words_list = None

In [52]:
# Run FT generation on the tokenized batch. Per the original author's notes:
#   ft_output_ids      -> [batch_size, beam_width, max_output_seq_len]
#   ft_sequence_length -> [batch_size, beam_width]
ft_output_ids, ft_sequence_length = ft_t5(
    input_token=inputs,
    inputs_embeds=None,
    beam_size=num_beams,
    max_seq_len=max_seq_len,
    top_k=topk,
    top_p=topp,
    beam_search_diversity_rate=beam_search_diversity_rate,
    is_return_output_log_probs=False,
    is_return_cum_log_probs=False,
    repetition_penalty=repetition_penalty,
    temperature=temperature,
    len_penalty=len_penalty,
    bad_words_list=bad_words_list,
    stop_words_list=stop_words_list,
)

In [53]:
# For every batch element keep only the first beam (index 0) and trim it to
# the sequence length FT reported for that beam.
ft_outputs = [
    list(ft_output_ids[row, 0, :ft_sequence_length[row, 0]])
    for row in range(batch_size)
]
ft_tokens = tokenizer.batch_decode(ft_outputs, skip_special_tokens=True)

ft_tokens

["L'intelligence artificielle est la simulation des processus de l'intelligence humaine par des machines, en particulier des systèmes informatiques."]