In [1]:
# Install tf-transformers from github

# T5 + Squad + Text Generation

This tutorial contains code to fine-tune a T5 model on SQuAD as a text-generation (auto-regressive) task.

In this notebook:
- Load the data + create ```tf.data.Dataset``` using TFWriter
- Load and fine-tune T5
- Train using ```tf.keras.Model.fit``` and ```Custom Trainer``` 
- Minimize LM loss
- Evaluate EM/F1 score
- In production using faster ```tf.SavedModel``` + no architecture code

In [5]:
import datasets
import json
import os
import glob
import time

from tf_transformers.models import T5Model
from transformers import T5Tokenizer
from tf_transformers.data.squad_utils_sp import (
    read_squad_examples, evaluate_v1)
from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.losses import cross_entropy_loss_label_smoothing
from tf_transformers.core import optimization, SimpleTrainer
from absl import logging
logging.set_verbosity("INFO")

### Load Tokenizer

In [6]:
# Load the pretrained 't5-small' tokenizer; it is reused by every parsing and decoding cell below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')


### Parse Train Data

In [5]:
# Location of the SQuAD v1.1 training JSON file.
input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/train-v1.1.json'
is_training = True

# 1. Read the examples and report how long the parsing took.
start_time = time.time()
train_examples = read_squad_examples(
    input_file=input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
)
end_time = time.time()
print(f'Time taken {end_time - start_time}')

# Truncation limits applied to the tokenized question, passage and answer.
max_passage_length = 384
max_question_length = 64
max_answer_length = 40

def parse_train():
    """Yield one feature dict per entry of ``train_examples``.

    The encoder input is the tokenized ``'question: ...'`` text (cut to
    ``max_question_length`` tokens) followed by the tokenized
    ``'context: ...'`` passage (cut to ``max_passage_length - 1`` tokens)
    and a closing EOS token.  The target sequence is
    ``<pad> + answer tokens + </s>``; it is shifted by one position to
    obtain the decoder inputs and the labels.

    Yields:
        dict with keys ``encoder_input_ids``, ``encoder_input_mask``,
        ``decoder_input_ids``, ``labels`` and ``labels_mask``; every value
        is a list of ints.
    """
    for example in train_examples:
        question_tokens = tokenizer.tokenize('question: ' + example['question_text'])
        question_tokens = question_tokens[:max_question_length]

        # One slot of the passage budget is reserved for the closing </s>.
        passage_tokens = tokenizer.tokenize('context: ' + example['paragraph_text'])
        passage_tokens = passage_tokens[:max_passage_length - 1] + [tokenizer.eos_token]

        encoder_ids = tokenizer.convert_tokens_to_ids(question_tokens + passage_tokens)
        encoder_mask = [1] * len(encoder_ids)

        # Two slots of the answer budget are reserved for <pad> and </s>.
        answer_tokens = tokenizer.tokenize(example['orig_answer_text'])[:max_answer_length - 2]
        target_ids = tokenizer.convert_tokens_to_ids(
            [tokenizer.pad_token] + answer_tokens + [tokenizer.eos_token]
        )
        labels = target_ids[1:]  # everything except the first token

        # Decoder doesnt need input_mask because by default decoder has causal mask mode
        yield {
            'encoder_input_ids': encoder_ids,
            'encoder_input_mask': encoder_mask,
            'decoder_input_ids': target_ids[:-1],  # everything except the last token
            'labels': labels,
            'labels_mask': [1] * len(labels),
        }
        
# Let's write the examples to TFRecords using TFWriter.
# (TFProcessor is the alternative for smaller data.)

# Every feature is stored as a variable-length list of ints.
schema = {
    feature: ("var_len", "int")
    for feature in (
        "encoder_input_ids",
        "encoder_input_mask",
        "decoder_input_ids",
        "labels",
        "labels_mask",
    )
}

tfrecord_train_dir = '../OFFICIAL_TFRECORDS/squad_as_generation/t5/train'
tfrecord_filename = 'squad'
tfwriter = TFWriter(
    schema=schema,
    file_name=tfrecord_filename,
    model_dir=tfrecord_train_dir,
    tag='train',
    overwrite=True,
)
# parse_train() is called here, so the writer receives the generator object itself.
tfwriter.process(parse_fn=parse_train())


Time taken 0.8381352424621582


### Read TFRecords using TFReader

In [44]:
# Read Data

# Open the schema through a context manager so the file handle is closed
# right after parsing instead of being left to the garbage collector.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema,
                     tfrecord_files=all_files)

# x_keys become the model inputs, y_keys the targets handed to the loss.
x_keys = ['encoder_input_ids', 'encoder_input_mask', 'decoder_input_ids']
y_keys = ['labels', 'labels_mask']
batch_size = 16
# NOTE(review): `keys` only lists the x_keys although y_keys are requested as
# well -- confirm that TFReader still parses the label features in this setup.
train_dataset = tf_reader.read_record(auto_batch=True,
                                      keys=x_keys,
                                      batch_size=batch_size,
                                      x_keys=x_keys,
                                      y_keys=y_keys,
                                      shuffle=True,
                                      drop_remainder=True)

In [None]:
# Sanity check: print one (inputs, labels) batch produced by the reader.
for (batch_inputs, batch_labels) in train_dataset.take(1):
    print(batch_inputs, batch_labels)

### Load t5 Model

In [5]:
import tensorflow as tf

# T5Model returns three objects, bound here as model_layer, model and config.
model_layer, model, config = T5Model(model_name='t5-small')
# Load the weights from a local checkpoint directory.
model.load_checkpoint("/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/t5-small/")

INFO:absl:Overwride mask_mode with user_defined
INFO:absl:Initialized Variables
INFO:absl:Overwride mask_mode with causal
INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched
INFO:absl:Encoder loaded succesfully from /mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/roberta-base/
INFO:absl:Warm started decoder 197/202 variables
INFO:absl:Inputs -->
INFO:absl:encoder_input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:encoder_input_mask ---> Tensor("input_mask:0", shape=(None, None), dtype=int32)
INFO:absl:encoder_input_type_ids ---> Tensor("input_type_ids:0", shape=(None, None), dtype=int32)
INFO:absl:decoder_input_ids ---> Tensor("decoder_input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:decoder_input_type_ids ---> Tensor("decoder_input_type_ids:0", shape=(None, None), dtype=int32)
INFO:absl:Initialized Variables
INFO:absl:Inputs -->
INFO:absl:encoder_input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:

### Define Loss + Label Smoothing

Loss function is simple.
* labels: 2D (batch_size x sequence_length)
* logits: 3D (batch_size x sequence_length x vocab_size)
* label_weights: 2D (batch_size x sequence_length) # we don't want every token in the sequence to contribute to the loss, so we mask some of them out and exclude them from the loss calculation

In [53]:

def lm_loss(y_true_dict, y_pred_dict, smoothing=0.1):
    """Label-smoothed cross-entropy on the token logits.

    Args:
        y_true_dict: batch targets; must provide ``'labels'`` and
            ``'labels_mask'``.
        y_pred_dict: model outputs; must provide ``'token_logits'``.
        smoothing: label-smoothing factor forwarded to the loss helper.

    Returns:
        Whatever ``cross_entropy_loss_label_smoothing`` returns for the
        given labels, logits and label weights.
    """
    target_ids = y_true_dict['labels']
    token_logits = y_pred_dict['token_logits']
    # The mask is passed as per-position weights for the loss.
    target_weights = y_true_dict['labels_mask']
    return cross_entropy_loss_label_smoothing(
        labels=target_ids,
        logits=token_logits,
        smoothing=smoothing,
        label_weights=target_weights,
    )


### Define Optimizer

* learning_rate is the key

**PRO TIP**: These models are very sensitive to optimizer, especially learning rates. So, make sure you play around to find a good combination

In [57]:
# AdamWeightDecay from tf_transformers' optimization module, created with learning_rate=1e-4.
optimizer = optimization.AdamWeightDecay(learning_rate=0.0001)

INFO:absl:using Adamw optimizer


### Train Using Keras :-)

- ```compile2``` lets you pass the model outputs as well as the batch dataset outputs directly into the loss function, without any further complexity.

Note: For ```compile2```, loss_fn must be None, and custom_loss_fn must be active. Metrics are not supported for the time being.

In [60]:
# Keras Fit

# Map the 'token_logits' key to the custom loss function.
keras_loss_fn = {'token_logits': lm_loss
                }
# compile2 gets no standard Keras loss (loss=None); the custom loss dict is used instead.
model.compile2(optimizer=optimizer, 
                            loss=None, 
                            custom_loss=keras_loss_fn, 
                            run_eagerly=False)
# Short demonstration run: 2 epochs of 5 steps each.
history = model.fit(train_dataset, epochs=2, steps_per_epoch=5)

Epoch 1/2
















Epoch 2/2


### Train using SimpleTrainer (part of tf-transformers)

In [None]:
EPOCHS = 4
train_data_size = 87000
# Number of whole batches that fit in one pass over the training data.
steps_per_epoch = train_data_size // batch_size

# Custom training loop provided by tf-transformers.
history2 = SimpleTrainer(
    model=model,
    optimizer=optimizer,
    loss_fn=lm_loss,
    # This is important: the dataset is repeated for one epoch more than is trained.
    dataset=train_dataset.repeat(EPOCHS + 1),
    epochs=EPOCHS,
    num_train_examples=train_data_size,
    batch_size=batch_size,
    steps_per_call=100,
)



### Save Models 

You can save models as checkpoints using the ```.save_checkpoint``` method, which is part of all ```LegacyModels```

In [7]:
# Directory that receives the fine-tuned checkpoint; it is reused by the reload and export cells.
model_save_dir = "../OFFICIAL_MODELS/squad_as_generation/t5_small_label_smoothing"
# overwrite=True is passed so an existing checkpoint at this location can be replaced.
model.save_checkpoint(model_save_dir, overwrite=True)

### Load the model for Text Generation (Auto-Regressive)

1. For any model to use for auto-regressive tasks we have to provide **"pipeline_mode='auto-regressive'"**

tf-transformers will handle everything for you internally


In [None]:
# Rebuild the model for inference: is_training=False (disables dropout, per the
# original note) and pipeline_mode='auto-regressive' for step-by-step decoding.

import tensorflow as tf

# This rebinds model_layer, model and config, replacing the training-mode objects.
model_layer, model, config = T5Model(model_name='t5-small', is_training=False, pipeline_mode='auto-regressive')
# Restore the fine-tuned weights saved to model_save_dir.
model.load_checkpoint(model_save_dir)


### Save the model as serialized version

This is very important, because the serialized model is significantly faster.
tf-transformers provides **save_as_serialize_module**

In [None]:
# Export the model under <model_save_dir>/saved_model ...
model.save_as_serialize_module("{}/saved_model".format(model_save_dir), overwrite=True)
# ... and load it back through the generic SavedModel loader.
loaded = tf.saved_model.load("{}/saved_model".format(model_save_dir))

### Parse validation data

We use ```TFProcessor``` to create validation data, because dev data is small

In [None]:
# Location of the SQuAD v1.1 dev JSON file.
dev_input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/dev-v1.1.json'
is_training = False

# Read the dev examples and report how long the parsing took.
start_time = time.time()
dev_examples = read_squad_examples(
    input_file=dev_input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
)
end_time = time.time()
print(f'Time taken {end_time - start_time}')

def parse_dev():
    """Yield encoder-only features for every entry of ``dev_examples``.

    The encoder input is the tokenized ``'question: ...'`` text (cut to
    ``max_question_length`` tokens) followed by the tokenized
    ``'context: ...'`` passage (cut to ``max_passage_length - 1`` tokens)
    and a closing EOS token.

    Yields:
        dict with keys ``encoder_input_ids`` and ``encoder_input_mask``;
        both values are lists of ints of equal length.
    """
    for example in dev_examples:
        question_tokens = tokenizer.tokenize('question: ' + example['question_text'])
        question_tokens = question_tokens[:max_question_length]

        # One slot of the passage budget is reserved for the closing </s>.
        passage_tokens = tokenizer.tokenize('context: ' + example['paragraph_text'])
        passage_tokens = passage_tokens[:max_passage_length - 1] + [tokenizer.eos_token]

        encoder_ids = tokenizer.convert_tokens_to_ids(question_tokens + passage_tokens)

        yield {
            'encoder_input_ids': encoder_ids,
            'encoder_input_mask': [1] * len(encoder_ids),
        }
        
# Build the dev dataset from the parse_dev generator with TFProcessor,
# then batch it in groups of 32.
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(parse_fn=parse_dev())
dev_dataset = tf_processor.auto_batch(dev_dataset, batch_size=32)

In [None]:
def split_by_id(predicted_ids, eos_id):
    """Cut every predicted id sequence at its first ``eos_id``.

    Args:
        predicted_ids: iterable of id sequences, one per example.
        eos_id: id that marks the end of a generated sequence.

    Returns:
        A list with one list per example holding the ids that precede the
        first ``eos_id``.  When a sequence contains no ``eos_id`` it is kept
        in full.
    """
    all_ids = []
    for per_example_id in predicted_ids:
        ids = list(per_example_id)
        try:
            end = ids.index(eos_id)
        except ValueError:
            # No EOS was generated: keep every token.  Falling back to -1
            # here would silently drop the last predicted token.
            end = len(ids)
        all_ids.append(ids[:end])
    return all_ids

### Text-Generation for dev dataset

1. For **EncoderDecoder** models like Roberta2Roberta, Bert2GPT, T5, BART use **TextDecoderSeq2Seq**
2. For **Encoder** only models like GPT2, BERT, Roberta use **TextDecoder**

In [None]:
from tf_transformers.text import TextDecoderSeq2Seq

# Decode with the serialized SavedModel. The Keras model can be used as well:
# pass model=model instead of model=loaded.
decoder = TextDecoderSeq2Seq(model=loaded,
                             decoder_start_token_id=0,
                             )

start_time = time.time()
predicted_answers = []
for batch_inputs in dev_dataset:
    model_outputs = decoder.decode(batch_inputs,
                                   mode='greedy',
                                   max_iterations=40,
                                   eos_id=tokenizer.eos_token_id)

    # Take index 0 along axis 1 of predicted_ids for every example in the batch.
    predicted_ids = model_outputs['predicted_ids'][:, 0, :].numpy().tolist()
    # Drop everything from the first EOS onwards before decoding to text.
    predicted_ids_sliced = split_by_id(predicted_ids, tokenizer.eos_token_id)
    predicted_text = [tokenizer.decode(p_ids, skip_special_tokens=True) for p_ids in predicted_ids_sliced]

    predicted_answers.extend(predicted_text)

end_time = time.time()
print("Time taken is {}".format(end_time-start_time))

# Use a context manager so the dev file handle is closed after parsing.
with open(dev_input_file_path) as dev_file:
    squad_dev_data = json.load(dev_file)['data']
# Predictions are in the same order as dev_examples, so pair them by position.
qas_id_answer  = {item['qas_id']: predicted_answers[i] for(i, item) in enumerate(dev_examples)}
eval_results = evaluate_v1(squad_dev_data, qas_id_answer)
print(eval_results)

# Result recorded in the original notebook:
# {'exact_match': 75.38315988647115, 'f1': 83.89726910155875}

### In Production
1. Let's see how we can deploy this model in production

In [None]:
import tensorflow as tf
from tf_transformers.text import TextDecoderSeq2Seq
from tf_transformers.data import pad_dataset

# 1. Load the SavedModel exported earlier under <model_save_dir>/saved_model
loaded = tf.saved_model.load("{}/saved_model".format(model_save_dir))

# 2. Create a decoder object around the loaded model
decoder = TextDecoderSeq2Seq(model=loaded, 
                            decoder_start_token_id=0 # The decoder always expects a start_token_id
                            )

# 3. Convert text to inputs

# The tokenizer fn converts text -> model inputs.
# Make sure it returns a dict mapping each key to a list of lists.
# pad_dataset is a decorator which will automatically take care of padding.

# If you want to write your own function, feel free: the model just expects inputs in a specified format.
@pad_dataset
def tokenizer_fn(contexts, questions):
    """Convert (question, passage) text pairs into encoder inputs.

    NOTE: the parameter names are swapped relative to how the values are
    consumed.  The FIRST positional argument is treated as the list of
    questions and the SECOND as the list of passages, which matches the
    positional call ``tokenizer_fn(questions, contexts)``.  The signature is
    kept as-is so that call keeps working.

    Args:
        contexts: list of question strings (see the note above).
        questions: list of passage strings (see the note above).

    Returns:
        dict with ``encoder_input_ids`` and ``encoder_input_mask``; each
        value is a list holding one list of ints per input pair.
    """
    input_ids = []
    input_mask = []
    for (question, context) in zip(contexts, questions):
        question_input_ids = tokenizer.tokenize('question: ' + question)[: max_question_length]
        # -1 leaves room for the closing </s>
        passage_input_ids = tokenizer.tokenize('context: ' + context)[: max_passage_length - 1] + [tokenizer.eos_token]

        example_ids = tokenizer.convert_tokens_to_ids(question_input_ids + passage_input_ids)
        # Append per example: assigning to input_ids / input_mask here would
        # keep only the last pair and return a flat list instead of a batch.
        input_ids.append(example_ids)
        input_mask.append([1] * len(example_ids))

    result = {}
    result['encoder_input_ids'] = input_ids
    result['encoder_input_mask'] = input_mask

    return result

# Other questions to try against the same context:
#   'What was prominent in Kerala?'
#   'How many districts are there in Kerala'
questions = ['When was Kerala formed?']

contexts = ['''Kerala (English: /ˈkɛrələ/; Malayalam: [ke:ɾɐɭɐm] About this soundlisten (help·info)) is a
state on the southwestern Malabar Coast of India. It was formed on 1 November 1956, 
following the passage of the States Reorganisation Act, by combining Malayalam-speaking regions of 
the erstwhile states of Travancore-Cochin and Madras. 
Spread over 38,863 km2 (15,005 sq mi), Kerala is the twenty-first largest Indian state by area. 
It is bordered by Karnataka to the north and northeast, Tamil Nadu to the east and south, and the Lakshadweep Sea[14] to the west. With 33,387,677 inhabitants as per the 2011 Census, Kerala is the thirteenth-largest Indian state by population. It is divided into 14 districts with the capital being Thiruvananthapuram. Malayalam is the most widely spoken language and is also the official language of the state.[15]

The Chera Dynasty was the first prominent kingdom based in Kerala. The Ay kingdom in the deep south and the Ezhimala kingdom in the north formed the other kingdoms in the early years of the Common Era (CE). The region had been a prominent spice exporter since 3000 BCE. The region's prominence in trade was noted in the works of Pliny as well as the Periplus around 100 CE. In the 15th century, the spice trade attracted Portuguese traders to Kerala, and paved the way for European colonisation of India. At the time of Indian independence movement in the early 20th century, there were two major princely states in Kerala-Travancore State and the Kingdom of Cochin. They united to form the state of Thiru-Kochi in 1949. The Malabar region, in the northern part of Kerala, had been a part of the Madras province of British India, which later became a part of the Madras State post-independence. After the States Reorganisation Act, 1956, the modern-day state of Kerala was formed by merging the Malabar district of Madras State (excluding Gudalur taluk of Nilgiris district, Lakshadweep Islands, Topslip, the Attappadi Forest east of Anakatti), the state of Thiru-Kochi (excluding four southern taluks of Kanyakumari district, Shenkottai and Tenkasi taluks), and the taluk of Kasaragod (now Kasaragod District) in South Canara (Tulunad) which was a part of Madras State.''']



# 5. Choose the type of decoding
# Positional order matters here: questions first, then contexts.
batch_inputs = tokenizer_fn(questions, contexts)
# Greedy decoding for up to 40 iterations, with the tokenizer's EOS id as eos_id.
model_outputs = decoder.decode(batch_inputs, 
               mode='greedy', 
               max_iterations=40, 
               eos_id=tokenizer.eos_token_id)

# Squeeze out axis 1 of predicted_ids, then decode the ids to text without special tokens.
output_answer = tokenizer.batch_decode(tf.squeeze(model_outputs['predicted_ids'], 1), skip_special_tokens=True)
