In [16]:
# Install tf-transformers from github

In [None]:


import json
import tensorflow as tf
import time
import glob
import collections

from tf_transformers.utils.tokenization import BasicTokenizer, ROBERTA_SPECIAL_PEICE
from tf_transformers.utils import fast_sp_alignment
from tf_transformers.data.squad_utils_sp import (
    read_squad_examples,
    post_clean_train_squad,
    example_to_features_using_fast_sp_alignment_train,
    example_to_features_using_fast_sp_alignment_test, 
    _get_best_indexes, evaluate_v1
)
from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.models import RobertaModel
from tf_transformers.core import optimization, SimpleTrainer
from tf_transformers.tasks import Span_Selection_Model

from transformers import RobertaTokenizer
from absl import logging
logging.set_verbosity("INFO")

from tf_transformers.pipeline.span_extraction_pipeline import Span_Extraction_Pipeline

### Load Tokenizer

In [5]:
# Pretrained RoBERTa tokenizer; used later to map tokens to ids and for its sep_token.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Case-preserving basic tokenizer; handed to post_clean_train_squad for text clean-up.
basic_tokenizer = BasicTokenizer(do_lower_case=False)

### Convert train data to Features

* using Fast Sentence Piece Alignment, we convert text to features (text -> list of sub words)

In [3]:
input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/train-v1.1.json'
is_training = True

# 1. Read the raw SQuAD v1.1 examples and report how long the read took.
start_time = time.time()
train_examples = read_squad_examples(
    input_file=input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
)
end_time = time.time()
print('Time taken {}'.format(end_time-start_time))

# 2. Post-process (clean the text to avoid some unwanted unicode characters).
#    The helper returns the cleaned examples together with the ones it could not process.
train_examples_processed, failed_examples = post_clean_train_squad(
    train_examples,
    basic_tokenizer,
    is_training=is_training,
)

# 3. Convert question, context and answer into features made of sub-word tokens
#    (token strings, not vocabulary ids; ids are assigned later in parse_train).
feature_generator = example_to_features_using_fast_sp_alignment_train(
    tokenizer,
    train_examples_processed,
    max_seq_length=384,
    max_query_length=64,
    doc_stride=128,
    SPECIAL_PIECE=ROBERTA_SPECIAL_PEICE,
)

# Drain the generator so the features can be iterated more than once.
all_features = list(feature_generator)
end_time = time.time()
# start_time is not reset above, so this figure is cumulative (read + clean + convert).
print("time taken {} seconds".format(end_time-start_time))

INFO:absl:Time taken 0.06583905220031738


Time taken 0.7573883533477783
time taken 1.051743984222412 seconds


### Convert features to TFRecords using TFWriter

In [7]:
# Convert tokens to id and add type_ids
# input_mask etc
# This is user specific/ tokenizer specific
# Eg: Roberta has input_type_ids = 0, BERT has input_type_ids = [0, 1]

def parse_train():
    """Yield one training record per entry of the notebook-level ``all_features``.

    Each record is a fresh dict with the keys ``input_ids``, ``input_type_ids``,
    ``input_mask``, ``start_position`` and ``end_position``. Tokens are mapped
    to ids with the notebook-level ``tokenizer``; type ids are all zeros and the
    mask is all ones, both with the same length as ``input_ids``.

    A new dict is built on every iteration. Reusing a single dict across yields
    would make every previously yielded record alias the latest one as soon as
    the consumer buffers more than one item.
    """
    for f in all_features:
        input_ids = tokenizer.convert_tokens_to_ids(f['input_ids'])
        input_type_ids = tf.zeros_like(input_ids).numpy().tolist()
        input_mask = tf.ones_like(input_ids).numpy().tolist()
        yield {
            'input_ids': input_ids,
            'input_type_ids': input_type_ids,
            'input_mask': input_mask,
            'start_position': f['start_position'],
            'end_position': f['end_position'],
        }
        

# Write the training features to TFRecords with TFWriter.
# (TFProcessor is the alternative for smaller data; it is used for the dev set.)

# Every field is declared as a variable-length list of ints.
schema = {
    field_name: ("var_len", "int")
    for field_name in (
        'input_ids',
        'input_type_ids',
        'input_mask',
        'start_position',
        'end_position',
    )
}

tfrecord_train_dir = '../OFFICIAL_TFRECORDS/squad/train'
tfrecord_filename = 'squad'
tfwriter = TFWriter(
    schema=schema,
    file_name=tfrecord_filename,
    model_dir=tfrecord_train_dir,
    tag='train',
    overwrite=True,
)
# parse_train() is called here, so TFWriter receives the generator object itself.
tfwriter.process(parse_fn=parse_train())

INFO:absl:Total individual observations/examples written is 100
INFO:absl:All writer objects closed


### Read TFRecords using TFReader

In [11]:
# Read Data


# Load the schema stored next to the TFRecords (presumably written by TFWriter,
# TODO confirm). The context manager closes the file handle deterministically
# instead of leaving it to the garbage collector.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema,
                    tfrecord_files=all_files)

# Model inputs vs. labels; the dataset yields (inputs, labels) pairs (see the
# `for (batch_inputs, batch_labels) in train_dataset` cell further down).
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['start_position', 'end_position']
batch_size = 16
# NOTE(review): `keys` is given only the x_keys although y_keys are also read;
# confirm against the TFReader.read_record documentation that this is intended.
train_dataset = tf_reader.read_record(auto_batch=True,
                                   keys=x_keys,
                                   batch_size=batch_size,
                                   x_keys = x_keys,
                                   y_keys = y_keys,
                                   shuffle=True,
                                   drop_remainder=True
                                  )

### Load Roberta base Model

In [13]:
# RobertaModel returns three objects; only `model` is passed on to Span_Selection_Model below.
model_layer, model, config = RobertaModel(model_name='roberta-base', return_all_layer_token_embeddings=False)
# Restore pretrained weights from a local checkpoint directory (machine-specific absolute path).
model.load_checkpoint("/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/roberta-base/")

INFO:absl:We are overwriding `is_training` is False to `is_training` to True with `use_dropout` is False, no effects on your inference pipeline


Kwargs {}


INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


### Load Span Selection Model

In [15]:

# Wrap the base model for span selection; its outputs are read later through
# the 'start_logits' and 'end_logits' keys.
span_selection_layer = Span_Selection_Model(model=model,
                                      use_all_layers=False, 
                                      is_training=True)
# get_model() returns the object that is trained, checkpointed and called below.
span_selection_model = span_selection_layer.get_model()

In [18]:
# Delete to save up memory
# NOTE(review): `del` only removes these notebook-level names; anything still
# referenced from span_selection_model presumably stays alive. Re-running the
# cells above requires these names to be re-created first.

del model
del model_layer
del span_selection_layer

### Define Loss

Loss function is simple.
* labels: 1D (batch_size) # start or end positions
* logits: 2D (batch_size x sequence_length)


In [1]:

# Cross Entropy
def span_loss(position, logits):
    """Return the mean sparse softmax cross-entropy of ``logits`` against ``position``.

    ``position`` is squeezed along axis 1 before being used as labels, so it is
    expected to carry a size-1 second dimension.
    """
    labels = tf.squeeze(position, axis=1)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    return tf.reduce_mean(per_example_loss)

# Start logits loss
def start_loss(y_true_dict, y_pred_dict):
    """Span loss between the gold ``start_position`` and the predicted ``start_logits``."""
    return span_loss(y_true_dict['start_position'], y_pred_dict['start_logits'])

# End logits loss
def end_loss(y_true_dict, y_pred_dict):
    """Span loss between the gold ``end_position`` and the predicted ``end_logits``."""
    return span_loss(y_true_dict['end_position'], y_pred_dict['end_logits'])
# (start_loss + end_loss) / 2.0
def joint_loss(y_true_dict, y_pred_dict):
    """Average of the start-position loss and the end-position loss."""
    return (start_loss(y_true_dict, y_pred_dict) + end_loss(y_true_dict, y_pred_dict))/2.0

In [None]:
# Peek at one (inputs, labels) batch to check what the dataset yields.
for batch_inputs, batch_labels in train_dataset.take(1):
    print(batch_inputs, batch_labels)

### Define Optimizer

In [29]:
# NOTE(review): train_data_size is hard-coded; confirm it matches the number of
# examples actually written to the TFRecords for this run.
train_data_size = 89000
learning_rate   = 2e-5
EPOCHS = 3

steps_per_epoch = train_data_size // batch_size
num_train_steps = EPOCHS * steps_per_epoch
# Warm up over the first 10% of all training steps.
warmup_steps = int(0.1 * num_train_steps)

# Creates an optimizer together with its learning-rate schedule.
optimizer_type = 'adamw'
optimizer, learning_rate_fn = optimization.create_optimizer(
    learning_rate,
    num_train_steps,
    warmup_steps,
    optimizer_type,
)

INFO:absl:using Adamw optimizer


### Train Using Keras :-)

- ```compile2``` lets you pass both the model outputs and the batch dataset outputs directly into the loss function, without any further complexity.

Note: For ```compile2```, ```loss``` must be None and ```custom_loss``` must be provided. Metrics are not supported for the time being.

In [27]:
# Keras Fit

# One loss function per model output name; each receives the label dict and
# the prediction dict (see start_loss / end_loss).
keras_loss_fn = {'start_logits': start_loss, 
           'end_logits': end_loss}
# NOTE(review): this compiles with a default-configured Adam rather than the
# `optimizer` (AdamW + warmup schedule, lr 2e-5) created in the optimizer cell.
# Confirm that is intended for this short demo run.
span_selection_model.compile2(optimizer=tf.keras.optimizers.Adam(), 
                            loss=None, 
                            custom_loss=keras_loss_fn
                            )
# Short smoke run: 2 epochs of 10 steps each.
history = span_selection_model.fit(train_dataset, epochs=2, steps_per_epoch=10)

Epoch 1/2
Start logits (16, None)








Start logits (16, None)














<tensorflow.python.keras.callbacks.History at 0x7f90801d0c70>

### Train using SimpleTrainer (part of tf-transformers)

In [None]:
# Custom training
# Trains with the scheduled `optimizer` and the averaged start/end `joint_loss`.
# The dataset is repeated EPOCHS+1 times, which the original author flags as
# important; presumably so the trainer never runs out of batches. TODO confirm.
history = SimpleTrainer(model = span_selection_model,
             optimizer = optimizer,
             loss_fn = joint_loss,
             dataset = train_dataset.repeat(EPOCHS+1), # This is important
             epochs = EPOCHS, 
             num_train_examples = train_data_size, 
             batch_size = batch_size, 
             steps_per_call=100, 
             gradient_accumulation_steps=None)

### Save Models 

You can save models as checkpoints using the ```.save_checkpoint``` method, which is available on all ```LegacyModels```.

In [9]:
# Directory reused below for the checkpoint, the SavedModel exports and the TFLite file.
model_save_dir = '../OFFICIAL_MODELS/squad/roberta_base2'
span_selection_model.save_checkpoint(model_save_dir)

INFO:absl:Succesful: Model checkpoints matched


### Parse validation data

We use ```TFProcessor``` to create validation data, because dev data is small

In [8]:
# Machine-specific absolute path to the SQuAD v1.1 dev set.
dev_input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/dev-v1.1.json'

# Overwrites the notebook-level flag that was set to True for the training data.
is_training = False

# Read the dev examples and report how long the read took.
start_time = time.time()
dev_examples = read_squad_examples(
      input_file=dev_input_file_path,
      is_training=is_training,
      version_2_with_negative=False
)
end_time = time.time()
print('Time taken {}'.format(end_time-start_time))
# NOTE(review): the training cell unpacks two values from post_clean_train_squad;
# here the result is kept as a single object. Confirm the helper returns only the
# cleaned examples when is_training=False.
dev_examples_cleaned = post_clean_train_squad(dev_examples, basic_tokenizer, is_training=False)
# qas_id_info is indexed by qas_id later on; dev_features is a sequence of
# per-chunk feature dicts (keys used below: 'input_ids', 'qas_id', 'doc_offset').
qas_id_info, dev_features = example_to_features_using_fast_sp_alignment_test(tokenizer, dev_examples_cleaned,  max_seq_length = 384, 
                                                           max_query_length=64, doc_stride=128, SPECIAL_PIECE=ROBERTA_SPECIAL_PEICE)



def parse_dev():
    """Yield one inference record per entry of the notebook-level ``dev_features``.

    Each record is a fresh dict with the keys ``input_ids``, ``input_type_ids``
    and ``input_mask``. Tokens are mapped to ids with the notebook-level
    ``tokenizer``; type ids are all zeros and the mask is all ones, both with
    the same length as ``input_ids``.

    A new dict is built on every iteration. Reusing a single dict across yields
    would make every previously yielded record alias the latest one as soon as
    the consumer buffers more than one item.
    """
    for f in dev_features:
        input_ids = tokenizer.convert_tokens_to_ids(f['input_ids'])
        input_type_ids = tf.zeros_like(input_ids).numpy().tolist()
        input_mask = tf.ones_like(input_ids).numpy().tolist()
        yield {
            'input_ids': input_ids,
            'input_type_ids': input_type_ids,
            'input_mask': input_mask,
        }
        

# Build the dev dataset directly from the generator (no TFRecords on disk),
# then batch it in groups of 32.
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(parse_fn=parse_dev())
dev_dataset = tf_processor.auto_batch(dev_dataset, batch_size=32)

Time taken 0.07536649703979492


### Evaluate Exact Match

* Make Predictions
* Extract Answers
* Evaluate

### Make Batch Predictions

In [28]:
def extract_from_dict(dict_items, key):
    """Return ``item[key]`` for every item of ``dict_items``, in iteration order."""
    return [entry[key] for entry in dict_items]

# Per-feature bookkeeping, index-aligned with dev_features.
qas_id_list = [dev_feature['qas_id'] for dev_feature in dev_features]
doc_offset_list = [dev_feature['doc_offset'] for dev_feature in dev_features]

# Make batch predictions: one entry per batch is collected in each list.
per_layer_start_logits = []
per_layer_end_logits   = []

start_time = time.time()
for batch_inputs in dev_dataset:
    model_outputs = span_selection_model(batch_inputs)
    per_layer_start_logits.append(model_outputs['start_logits'])
    per_layer_end_logits.append(model_outputs['end_logits'])
end_time = time.time()
print('Time taken {}'.format(end_time-start_time))

### Extract Answers (text) from Predictions

* This is a little tricky: an example whose context is longer than max_seq_length is split into multiple features, so several predictions can belong to one example.

In [None]:
# Settings for answer extraction
n_best_size = 20
max_answer_length = 30
# Load the raw dev JSON for the final evaluation; the context manager closes
# the file handle instead of leaving it open until garbage collection.
with open(dev_input_file_path) as dev_file:
    squad_dev_data = json.load(dev_file)['data']
predicted_results = []

# Unstack each batch tensor into per-feature tensors, so that position i in
# these lists lines up with dev_features[i].
start_logits_unstcaked = []
end_logits_unstacked = []
for batch_start_logits in per_layer_start_logits:
    start_logits_unstcaked.extend(tf.unstack(batch_start_logits))
for batch_end_logits in per_layer_end_logits:
    end_logits_unstacked.extend(tf.unstack(batch_end_logits))

# Group the per-feature predictions by example: a long passage/context yields
# several features for one qas_id, and the best answer is later chosen across
# all chunks of that example.
qas_id_logits = {}
for i, qas_id in enumerate(qas_id_list):
    example = qas_id_info[qas_id]
    feature = dev_features[i]
    assert qas_id == feature['qas_id']
    if qas_id not in qas_id_logits:
        # First chunk of this example: store the example-level alignment data
        # once and start the per-chunk lists empty.
        qas_id_logits[qas_id] = {
            'tok_to_orig_index': example['tok_to_orig_index'],
            'aligned_words': example['aligned_words'],
            'feature_length': [],
            'doc_offset': [],
            'passage_start_pos': [],
            'start_logits': [],
            'end_logits': [],
        }

    grouped = qas_id_logits[qas_id]
    grouped['feature_length'].append(len(feature['input_ids']))
    grouped['doc_offset'].append(doc_offset_list[i])
    # The passage starts right after the first separator token.
    grouped['passage_start_pos'].append(feature['input_ids'].index(tokenizer.sep_token) + 1)
    grouped['start_logits'].append(start_logits_unstcaked[i])
    grouped['end_logits'].append(end_logits_unstacked[i])

# Extract the answer text and associate it with its unique identifier (qas_id).
qas_id_answer = {}
skipped = []        # qas_ids whose best span could not be mapped back to words
skipped_null = []   # qas_ids for which no valid candidate span survived filtering
global_counter = 0  # running index into dev_features

# Candidate span record. Built once rather than on every loop iteration.
# The *_log_prob fields hold the raw start/end scores produced by the model.
_PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
    "PrelimPrediction",
    ["feature_index", "start_index", "end_index",
     "start_log_prob", "end_log_prob"])

for qas_id in qas_id_logits:

    current_example = qas_id_logits[qas_id]

    prelim_predictions = []
    example_features = []
    for i in range(len(current_example['start_logits'])):
        # dev_features is walked with one global counter, which relies on all
        # chunks of an example being contiguous there; the assert guards that.
        f = dev_features[global_counter]
        assert f['qas_id'] == qas_id
        example_features.append(f)
        global_counter += 1
        passage_start_pos = current_example['passage_start_pos'][i]
        feature_length = current_example['feature_length'][i]

        # Keep only the scores of real tokens (drop anything past feature_length).
        start_log_prob_list = current_example['start_logits'][i].numpy().tolist()[:feature_length]
        end_log_prob_list = current_example['end_logits'][i].numpy().tolist()[:feature_length]
        # Presumably the indexes of the n_best_size highest scores; TODO confirm.
        start_indexes = _get_best_indexes(start_log_prob_list, n_best_size)
        end_indexes   = _get_best_indexes(end_log_prob_list, n_best_size)

        for start_index in start_indexes:
            for end_index in end_indexes:
                # We could hypothetically create invalid predictions, e.g., predict
                # that the start of the span is in the question. We throw out all
                # invalid predictions.
                if start_index < passage_start_pos or end_index < passage_start_pos:
                    continue
                if end_index < start_index:
                    continue
                length = end_index - start_index + 1
                if length > max_answer_length:
                    continue
                start_log_prob = start_log_prob_list[start_index]
                end_log_prob = end_log_prob_list[end_index]
                # Store positions relative to the start of the passage.
                start_idx = start_index - passage_start_pos
                end_idx = end_index - passage_start_pos

                prelim_predictions.append(
                    _PrelimPrediction(
                        feature_index=i,
                        start_index=start_idx,
                        end_index=end_idx,
                        start_log_prob=start_log_prob,
                        end_log_prob=end_log_prob))

    # Highest combined start + end score first.
    prelim_predictions = sorted(
        prelim_predictions,
        key=lambda x: (x.start_log_prob + x.end_log_prob),
        reverse=True)

    if prelim_predictions:
        best_index = prelim_predictions[0].feature_index
        aligned_words = current_example['aligned_words']
        try:
            tok_to_orig_index = current_example['tok_to_orig_index']
            # Shift the chunk-relative positions by the chunk's doc_offset, then
            # map them to indexes into aligned_words.
            reverse_start_index_align = tok_to_orig_index[prelim_predictions[0].start_index + example_features[best_index]['doc_offset']] # aligned index
            reverse_end_index_align   = tok_to_orig_index[prelim_predictions[0].end_index + example_features[best_index]['doc_offset']]

            predicted_words = [w for w in aligned_words[reverse_start_index_align: reverse_end_index_align + 1] if w != ROBERTA_SPECIAL_PEICE]
            predicted_text = ' '.join(predicted_words)
            qas_id_answer[qas_id] = predicted_text
        except Exception:
            # Best effort: a span that cannot be mapped back gets an empty answer
            # and is recorded in `skipped`. `Exception` (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            qas_id_answer[qas_id] = ""
            skipped.append(qas_id)
    else:
        qas_id_answer[qas_id] = ""
        skipped_null.append(qas_id)


eval_results = evaluate_v1(squad_dev_data, qas_id_answer)
# {'exact_match': 81.46641438032167, 'f1': 89.72853269935702}
# {'exact_match': 81.46641438032167, 'f1': 89.72853269935702}

### Save as Serialized version 

- Now we can use ```save_as_serialize_module``` to save a model directly to saved_model

In [None]:
# Save as optimized version
# Written to <model_save_dir>/saved_model; overwrite=True is passed to the export call.
span_selection_model.save_as_serialize_module("{}/saved_model".format(model_save_dir), overwrite=True)

# Load optimized version
# The loaded object is what the production pipeline cell uses as its model.
span_selection_model_serialized = tf.saved_model.load("{}/saved_model".format(model_save_dir))

### TFLite Conversion

TFlite conversion requires:
- static batch size
- static sequence length

In [None]:
# Convert the fine-tuned model to TFLite with static shapes:
#   sequence_length = 384
#   batch_size = 1

# Rebuild the base model with a fixed sequence length and batch size.
# Dropout is disabled, which is important for TFLite.
# NOTE: this rebinds model_layer, model, config, span_selection_layer and
# span_selection_model at notebook level.
model_layer, model, config = RobertaModel(model_name='roberta-base',
                                          sequence_length=384, # Fix a sequence length for TFlite (it shouldnt be None)
                                          batch_size=1,
                                          use_dropout=False)

span_selection_layer = Span_Selection_Model(model=model,
                                      is_training=False)
span_selection_model = span_selection_layer.get_model()
# Restore the fine-tuned weights saved earlier.
span_selection_model.load_checkpoint(model_save_dir)

# Save to .pb format, we need it for tflite
span_selection_model.save_as_serialize_module("{}/saved_model_for_tflite".format(model_save_dir))

converter = tf.lite.TFLiteConverter.from_saved_model("{}/saved_model_for_tflite".format(model_save_dir)) # path to the SavedModel directory
converter.experimental_new_converter = True

tflite_model = converter.convert()

# Write through a context manager so the file is flushed and closed before the
# sanity-check cell reads it back.
with open("{}/converted_model.tflite".format(model_save_dir), "wb") as tflite_file:
    tflite_file.write(tflite_model)

### **In production**

- We can use either ```tf.keras.Model``` or ```saved_model```. I recommend ```saved_model```, which is much faster and does not require the architecture code.

In [None]:

def tokenizer_fn(features):
    """
    Turn a dict of tokenized text into model inputs.

    ``features['input_ids']`` holds tokens; they are converted to ids with the
    notebook-level ``tokenizer``. Returns a dict with ``input_ids``,
    ``input_type_ids`` (all zeros) and ``input_mask`` (all ones), the latter
    two matching ``input_ids`` in length.
    """
    token_ids = tokenizer.convert_tokens_to_ids(features['input_ids'])
    return {
        'input_ids': token_ids,
        'input_type_ids': tf.zeros_like(token_ids).numpy().tolist(),
        'input_mask': tf.ones_like(token_ids).numpy().tolist(),
    }

# Span Extraction Pipeline
# Runs on the serialized (SavedModel) version of the fine-tuned model.
# NOTE(review): doc_stride is 20 here but 128 in the train/dev feature
# conversion above; confirm the difference is intentional.
pipeline = Span_Extraction_Pipeline(model = span_selection_model_serialized,
                tokenizer = tokenizer, 
                tokenizer_fn = tokenizer_fn, 
                SPECIAL_PIECE = ROBERTA_SPECIAL_PEICE, 
                n_best_size = 20, 
                n_best = 5, 
                max_answer_length = 30, 
                max_seq_length = 384, 
                max_query_length=64, 
                doc_stride=20)

# Three alternative sample questions. Each assignment replaces the previous
# one, so only the last question is sent to the pipeline.
questions = ['What was prominent in Kerala?']

questions = ['When was Kerala formed?']

questions = ['How many districts are there in Kerala']

contexts = ['''Kerala (English: /ˈkɛrələ/; Malayalam: [ke:ɾɐɭɐm] About this soundlisten (help·info)) is a state on the southwestern Malabar Coast of India. It was formed on 1 November 1956, following the passage of the States Reorganisation Act, by combining Malayalam-speaking regions of the erstwhile states of Travancore-Cochin and Madras. Spread over 38,863 km2 (15,005 sq mi), Kerala is the twenty-first largest Indian state by area. It is bordered by Karnataka to the north and northeast, Tamil Nadu to the east and south, and the Lakshadweep Sea[14] to the west. With 33,387,677 inhabitants as per the 2011 Census, Kerala is the thirteenth-largest Indian state by population. It is divided into 14 districts with the capital being Thiruvananthapuram. Malayalam is the most widely spoken language and is also the official language of the state.[15]

The Chera Dynasty was the first prominent kingdom based in Kerala. The Ay kingdom in the deep south and the Ezhimala kingdom in the north formed the other kingdoms in the early years of the Common Era (CE). The region had been a prominent spice exporter since 3000 BCE. The region's prominence in trade was noted in the works of Pliny as well as the Periplus around 100 CE. In the 15th century, the spice trade attracted Portuguese traders to Kerala, and paved the way for European colonisation of India. At the time of Indian independence movement in the early 20th century, there were two major princely states in Kerala-Travancore State and the Kingdom of Cochin. They united to form the state of Thiru-Kochi in 1949. The Malabar region, in the northern part of Kerala, had been a part of the Madras province of British India, which later became a part of the Madras State post-independence. After the States Reorganisation Act, 1956, the modern-day state of Kerala was formed by merging the Malabar district of Madras State (excluding Gudalur taluk of Nilgiris district, Lakshadweep Islands, Topslip, the Attappadi Forest east of Anakatti), the state of Thiru-Kochi (excluding four southern taluks of Kanyakumari district, Shenkottai and Tenkasi taluks), and the taluk of Kasaragod (now Kasaragod District) in South Canara (Tulunad) which was a part of Madras State.''']

# Run the pipeline on the question and context lists defined above.
result = pipeline(questions=questions, contexts=contexts)

### Sanity Check TFlite 

In [None]:
#### lets do a sanity check
# Feed the same random input to the Keras model and to the TFLite model, then
# print the summed logits of both so they can be compared by eye.

sample_inputs = {}
# Random token ids in [0, 100) with the static (1, 384) shape used for the TFLite export.
input_ids = tf.random.uniform(minval=0, maxval=100, shape=(1, 384), dtype=tf.int32)
sample_inputs['input_ids'] = input_ids
sample_inputs['input_type_ids'] = tf.zeros_like(sample_inputs['input_ids'])
sample_inputs['input_mask'] = tf.ones_like(sample_inputs['input_ids'])

# Reference outputs from the in-memory model.
model_outputs = span_selection_model(sample_inputs)


# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="{}/converted_model.tflite".format(model_save_dir))
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# NOTE(review): the positions below assume the converted model lists its inputs
# as [input_ids, input_mask, input_type_ids]; verify against the 'name' field
# of each input_details entry.
interpreter.set_tensor(input_details[0]['index'], sample_inputs['input_ids'])

interpreter.set_tensor(input_details[1]['index'], sample_inputs['input_mask'])

interpreter.set_tensor(input_details[2]['index'], sample_inputs['input_type_ids'])

interpreter.invoke()

# NOTE(review): assumes output 0 is the end logits and output 1 the start
# logits; verify against output_details.
end_logits = interpreter.get_tensor(output_details[0]['index'])
start_logits   = interpreter.get_tensor(output_details[1]['index'])

# Assertion 
# Nothing is asserted here: the two sums are only printed for manual comparison.

print("Start logits", tf.reduce_sum(model_outputs['start_logits']), tf.reduce_sum(start_logits))
print("End logits", tf.reduce_sum(model_outputs['end_logits']), tf.reduce_sum(end_logits))

# We are good :-)

# We are good :-)