<a href="https://colab.research.google.com/github/ldivrala/Question-Answering-TF/blob/main/TFQATraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the required libraries

In [None]:
# !pip install tensorflow_text
# !pip install sentencepiece


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
import tensorflow.experimental.numpy as tnp
import tensorflow_hub as hub
import typing
from typing import Any, Tuple
import tensorflow_text
import sentencepiece as spm

Connect the Colab notebook to Google Drive

In [None]:
# Connect with Google Drive for dataset
# The Drive is mounted at /content/drive; the dataset JSON files and the ALBERT
# archives used further down are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
root_dir = "/content/drive/MyDrive/Dataset/datasets/"

# Read the train and dev JSON dumps from the dataset folder, one DataFrame each.
trainset_df_orig, devset_df_orig = (
    pd.read_json(os.path.join(root_dir, file_name), encoding='utf-8')
    for file_name in ("train-v2.0.json", "dev-v2.0.json")
)

# Show the first rows as the cell output.
trainset_df_orig.head()

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [None]:
# Peek at the first paragraph record of the first article: a 'context' passage
# together with its 'qas' list of questions and answers.
trainset_df_orig["data"][0]["paragraphs"][:1]

[{'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'qas': [{'answers': [{'answer_start': 269, 'text': 'in the late 1990s'}],
    'id': '56be85543aeaaa14008c9063',
    'is_impossible': False,
    'question': 'When did Beyonce start becoming popular?'},
   {'answers': [{'answer_start': 207, 'text': 'singing and dancing'}],
    'id': '5

Extract the ALBERT modules from Google Drive

In [None]:
# Create the extraction targets. `-p` keeps this cell re-runnable: plain `mkdir`
# fails with "File exists" the second time the cell is executed.
!mkdir -p /content/sample_data/albert_base_3
!mkdir -p /content/sample_data/albert_en_preprocess_3

# Unpack the ALBERT encoder and its preprocessing model from Drive into local storage.
!tar xf /content/drive/MyDrive/Models/Albert/albert_base_3.tar.gz -C /content/sample_data/albert_base_3
!tar xf /content/drive/MyDrive/Models/Albert/albert_en_preprocess_3.tar.gz -C /content/sample_data/albert_en_preprocess_3

Tokenizer

In [None]:
# SentencePiece tokenizer backed by the model file shipped in the extracted ALBERT assets.
tokenizer = spm.SentencePieceProcessor(
    model_file="/content/sample_data/albert_base_3/assets/30k-clean.model"
)

In [None]:
# Get our dataset

SyntaxError: ignored

In [None]:
def get_dataset(data_array, tokenize_fn=None):
    """Flatten nested article records into one row per question.

    Args:
        data_array: iterable of article dicts, each with a "paragraphs" list whose
            items carry a "context" string and a "qas" list. Every qas entry has a
            "question" and an "answers" list (optionally "plausible_answers").
        tokenize_fn: optional callable mapping a string to a sequence of tokens.
            Defaults to ``tokenizer.tokenize`` of the notebook-level SentencePiece
            tokenizer, so existing ``get_dataset(data)`` calls are unchanged.

    Returns:
        pandas.DataFrame with columns "que", "ans" and "context". "ans" is a
        two-element list ``[start, end]`` of token positions (end exclusive), or
        the string "<NO_ANSWER>" when the question has no answer candidate.

    Unlike the previous version, the input records are never modified: the
    "plausible_answers" list is only read, not written back into "answers".
    """
    if tokenize_fn is None:
        tokenize_fn = tokenizer.tokenize

    def token_count(text):
        # Texts are stripped and lower-cased before tokenizing, the same
        # normalisation the dataset generator applies to question and context.
        return len(tokenize_fn(text.strip().lower()))

    contexts = []
    questions = []
    answers = []

    for data in data_array:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]

            for qas in paragraph["qas"]:
                que = qas["question"]

                # "plausible_answers", when present, takes precedence over "answers".
                if "plausible_answers" in qas:
                    candidates = qas["plausible_answers"]
                else:
                    candidates = qas["answers"]

                if len(candidates) > 0:
                    first = candidates[0]
                    answer_start = first["answer_start"]

                    # Offset of the first answer token: question tokens, plus 2 for
                    # the [CLS] and [SEP] markers around the question, plus the
                    # context tokens that precede the answer.
                    span_start = (
                        token_count(que) + 2 + token_count(context[:answer_start])
                    )
                    ans = [span_start, span_start + token_count(first["text"])]
                else:
                    ans = "<NO_ANSWER>"

                contexts.append(context)
                questions.append(que)
                answers.append(ans)

    dataset = pd.DataFrame({"que": questions, "ans": answers, "context": contexts})
    return dataset

In [None]:
from sklearn.utils import shuffle

# Build the flat training table and shuffle it with a fixed seed, so that
# Restart & Run All produces the same row order (and the same batches) each time.
trainset_orig = shuffle(get_dataset(trainset_df_orig["data"]), random_state=42)

# The dataset generator looks rows up by position with .loc, so the index has to
# be a clean 0..n-1 range after shuffling.
trainset_orig = trainset_orig.reset_index(drop=True)

# devset_orig = get_dataset(devset_df_orig["data"])
trainset_orig.head()

Unnamed: 0,que,ans,context
0,What is Melbourne's highest temperature recorded?,"[180, 195]",Melbourne is also prone to isolated convective...
1,What wasn't the Soviet response to the Nazi-So...,"[49, 55]",In response to the publication of the secret p...
2,What country has the longest Constitution?,"[10, 11]","In India, the longest constitutional text in t..."
3,How long of a running time did the Number 199 ...,"[180, 185]","Britain had successful tested a new HAA gun, 3..."
4,The period between 640-580 BC was known as what?,"[25, 28]",In the first large-scale depictions during the...


In [None]:
# Quick look at how the SentencePiece tokenizer splits a short string into pieces.
tokenizer.EncodeAsPieces("hii")

['▁hi', 'i']

In [None]:
# Number of question rows in the flattened training table.
len(trainset_orig)

130319

Preprocessor for sentences

In [None]:
# Load the ALBERT preprocessing SavedModel. Fetched over HTTPS (the encoder layer
# further down already uses an https URL) instead of plain HTTP.
preprocessor = hub.load("https://tfhub.dev/tensorflow/albert_en_preprocess/3")

# Step 1: tokenize batches of text inputs.
text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string), tf.keras.layers.Input(shape=(), dtype=tf.string)] # This SavedModel accepts up to 2 text inputs.
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment) for segment in text_inputs]

# Step 2 (optional): modify tokenized inputs. Nothing to do here.

# Step 3: pack input sequences for the Transformer encoder.
seq_length = 512  # Your choice here.
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=seq_length))  # Optional argument.

bert_pack_output = bert_pack_inputs(tokenized_inputs)

# Keras model mapping [first_text, second_text] string batches to the packed
# encoder input dict (input_word_ids / input_mask / input_type_ids).
encoder_inputs = keras.Model(text_inputs, bert_pack_output)

In [None]:
# Smoke test: pack two (first segment, second segment) string pairs and inspect
# the resulting input_mask / input_type_ids / input_word_ids tensors.
encoder_inputs([tf.constant(["hii", "byy"]), tf.constant(["hii", "chai"])])

{'input_mask': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
 'input_word_ids': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
 array([[   2, 4148,   49, ...,    0,    0,    0],
        [   2,   34,   93, ...,    0,    0,    0]], dtype=int32)>}

Dataset pipeline creation

In [None]:
batch_size = 4
def format_dataset(dataset=None, max_seq_length=None):
    """Yield (features, labels) training examples, one per usable row.

    Args:
        dataset: DataFrame with "que", "context" and "ans" columns. Defaults to
            the notebook-level ``trainset_orig``.
        max_seq_length: length of the packed encoder input. Defaults to the
            notebook-level ``seq_length``.

    Yields:
        A pair of dicts: the stripped, lower-cased "question_inputs" and
        "context_inputs" strings, and the integer "answer_start_outputs" /
        "answer_end_outputs" positions.

    Both parameters are optional so the function can still be passed unchanged
    to ``tf.data.Dataset.from_generator``, which calls it without arguments.
    """
    if dataset is None:
        dataset = trainset_orig
    if max_seq_length is None:
        max_seq_length = seq_length

    # Column-wise iteration avoids a .loc lookup per row and does not depend on
    # the frame having a 0..n-1 index.
    for que, context, answer in zip(dataset["que"], dataset["context"], dataset["ans"]):
        # Rows without an answer span hold the "<NO_ANSWER>" string; comparing its
        # characters with an int would raise TypeError, so such rows are skipped.
        if isinstance(answer, str):
            continue

        # Work on copies of the two positions so the list stored in the DataFrame
        # is left untouched.
        start, end = answer[0], answer[1]

        # Positions that fall outside the packed sequence are clamped to its last
        # two slots (510 and 511 for the default length of 512).
        if start >= max_seq_length:
            start = max_seq_length - 2
        if end >= max_seq_length:
            end = max_seq_length - 1

        yield ({
            "question_inputs": que.strip().lower(),
            "context_inputs": context.strip().lower(),
        }, {
            "answer_start_outputs": start,
            "answer_end_outputs": end
        })
    

def make_dataset():
    """Wrap ``format_dataset`` in a cached, batched and prefetched tf.data pipeline."""
    # Scalar string features: one question and one context per example.
    feature_spec = {
        "question_inputs": tf.TensorSpec(shape=(), dtype=tf.string, name="question_inputs"),
        "context_inputs": tf.TensorSpec(shape=(), dtype=tf.string, name="context_inputs"),
    }
    # Scalar int64 labels: the answer start and end positions.
    label_spec = {
        "answer_start_outputs": tf.TensorSpec(shape=(), dtype=tf.int64, name="answer_start_outputs"),
        "answer_end_outputs": tf.TensorSpec(shape=(), dtype=tf.int64, name="answer_end_outputs"),
    }

    examples = tf.data.Dataset.from_generator(
        format_dataset,
        output_signature=(feature_spec, label_spec),
    )

    # Cache the individual examples, then group them into batches of `batch_size`.
    batched = examples.cache().batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
# Batched training pipeline consumed by the sanity check and by model.fit below.
train_ds = make_dataset()

In [None]:
# Sanity check on a single batch: print the tensor shapes, then rebuild the token
# layout for the second example and slice it with the stored start/end positions
# to confirm that the labels point at the answer tokens.
for inputs, targets in train_ds.take(1):
    print(f'inputs["question_inputs", "context_inputs"].shape: {inputs["question_inputs"].shape, inputs["context_inputs"].shape}')
    print(f"targets.shape: {targets['answer_start_outputs'].shape}")

    # Index 1 selects the second example of the batch.
    context = inputs["context_inputs"][1].numpy().decode()
    question = inputs["question_inputs"][1].numpy().decode()
    print(question)
    print(context)
    # Token layout the labels are expressed in: [CLS] question [SEP] context [SEP].
    answer = ["[CLS]"] + tokenizer.EncodeAsPieces(question) + ["[SEP]"] + tokenizer.EncodeAsPieces(context) +  ["[SEP]"]
    
    # The end position is used as an exclusive slice bound.
    answer = answer[int(targets["answer_start_outputs"][1].numpy()):int(targets["answer_end_outputs"][1].numpy())]
    print(answer)

inputs["question_inputs", "context_inputs"].shape: (TensorShape([4]), TensorShape([4]))
targets.shape: (4,)
what wasn't the soviet response to the nazi-soviet relations publication
in response to the publication of the secret protocols and other secret german–soviet relations documents in the state department edition nazi–soviet relations (1948), stalin published falsifiers of history, which included the claim that, during the pact's operation, stalin rejected hitler's claim to share in a division of the world, without mentioning the soviet offer to join the axis. that version persisted, without exception, in historical studies, official accounts, memoirs and textbooks published in the soviet union until the soviet union's dissolution.
['▁fal', 's', 'ifier', 's', '▁of', '▁history']


Create model layers

In [None]:
class TextContextualEmbedding(keras.layers.Layer):
  """Keras layer wrapping the TF-Hub ALBERT base encoder.

  Takes the packed encoder-input dict and returns the encoder's per-token
  "sequence_output"; the dict's "input_mask" entry is propagated as the Keras
  mask for downstream layers.
  """

  def __init__(self, **kwargs):
      super(TextContextualEmbedding, self).__init__(**kwargs)
      # trainable=True exposes the encoder weights for fine-tuning; whether they
      # are actually updated is controlled through this layer's `trainable` flag.
      self.albert_module = hub.KerasLayer(
          "https://tfhub.dev/tensorflow/albert_en_base/3",
          trainable=True)

  def call(self, encoder_inputs):
      """Run the encoder on the packed inputs and return its "sequence_output".

      The dict is forwarded to the hub module as-is. The previous version also
      built a renamed copy of it (input_ids / segment_ids) that was never used;
      that dead code has been removed.
      """
      albert_outputs = self.albert_module(encoder_inputs)
      return albert_outputs["sequence_output"]

  def compute_mask(self, inputs, mask = None):
      """Use the packed inputs' "input_mask" as the mask passed to later layers."""
      return inputs["input_mask"]

class TransformerEncoder(keras.layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward net.

    Args:
        embed_dim: size of the last axis of the input; also used as the
            attention key dimension and as the output size of the projection.
        dense_dim: hidden size of the feed-forward projection.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        # Let Keras pass the incoming mask into call() and on to following layers.
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention and the feed-forward block, each with a residual + layer norm.

        Args:
            inputs: tensor to encode; used as query, key and value.
            mask: optional padding mask with one entry per input position.
        """
        # Without a mask the attention layer gets attention_mask=None. Previously
        # `padding_mask` was only assigned inside the `if`, so calling the layer
        # without a mask raised UnboundLocalError.
        padding_mask = None
        if mask is not None:
            # Insert two singleton axes so the per-position mask broadcasts
            # against the attention scores.
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )

        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super(TransformerEncoder, self).get_config()

        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config

Model building

In [None]:
# Hyper-parameters of the extra Transformer block stacked on top of ALBERT.
embed_dim = 768
num_heads = 8
latent_dim = 768
# vocab_size and sequence_length are defined here but not referenced again in this cell.
vocab_size = preprocessor.tokenize.get_special_tokens_dict()["vocab_size"]
sequence_length = seq_length

# Two scalar string inputs per example: the question and the context passage.
question_inputs = keras.Input(shape=(), dtype=tf.string, name="question_inputs")
context_inputs = keras.Input(shape=(), dtype=tf.string, name="context_inputs")

# Tokenize and pack (question, context) into the encoder input dict.
input_token = encoder_inputs([question_inputs, context_inputs])

# x = bert_pack_inputs([input_token])
# NOTE: the order in which layers are created here matters, because the training
# cells address the ALBERT wrapper positionally as transformer.layers[3].
encoder_outputs = TextContextualEmbedding()(input_token)

x = TransformerEncoder(embed_dim, latent_dim, num_heads)(encoder_outputs)

x = keras.layers.Dropout(0.4)(x)
# One scalar score per token position for the answer start (x1) and end (x2).
x1 = keras.layers.Dense(1)(x)
x2 = keras.layers.Dense(1)(x)

# Drop the trailing singleton axis so each head is a flat vector over positions.
x1 = keras.layers.Reshape((-1,))(x1)
x2 = keras.layers.Reshape((-1,))(x2)

# Softmax over positions; the layer names match the label keys emitted by the dataset.
output1 = keras.layers.Activation("softmax", name ="answer_start_outputs")(x1)
output2 = keras.layers.Activation("softmax", name ="answer_end_outputs")(x2)

transformer = keras.Model([question_inputs, context_inputs], [output1, output2])
transformer.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_inputs (InputLayer)    [(None,)]            0                                            
__________________________________________________________________________________________________
context_inputs (InputLayer)     [(None,)]            0                                            
__________________________________________________________________________________________________
model (Functional)              {'input_type_ids': ( 0           question_inputs[0][0]            
                                                                 context_inputs[0][0]             
__________________________________________________________________________________________________
text_contextual_embedding (Text (None, 512, 768)     11683584    model[0][0]                

Train our model

In [None]:
# Freeze the ALBERT wrapper for the first training stage. Index 3 is the
# text_contextual_embedding layer in the model summary (the two inputs are 0 and 1,
# the packing model is 2); the index must be updated if the architecture changes.
transformer.layers[3].trainable = False

In [None]:
# Stage 1: train only the layers on top of the encoder, which was frozen in the
# preceding cell.
epochs = 1  # This should be at least 50 for convergence

transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["acc"]
)

# `use_multiprocessing` is not passed: it only applies to Python generator /
# keras.utils.Sequence inputs, has no effect on a tf.data.Dataset, and is no
# longer accepted by newer Keras releases.
transformer.fit(train_ds, epochs=epochs)

# Save the entire model as a SavedModel.
transformer.save('saved_model/my_model')
transformer.save_weights('saved_model/weights')





INFO:tensorflow:Assets written to: saved_model/my_model/assets


INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [None]:
# Stage 2: unfreeze the ALBERT wrapper (layer index 3) and fine-tune end to end.
transformer.layers[3].trainable = True
epochs = 1  # This should be at least 500 for convergence

# Re-compile so the changed `trainable` flag takes effect.
# Fine-tuning the pretrained encoder needs a small learning rate. The recorded
# run with "rmsprop" at its default rate stayed at a per-head loss of about 6.25
# (roughly ln(512), i.e. a uniform guess over positions) with ~0.2% accuracy.
transformer.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# `use_multiprocessing` is not passed: it only applies to Python generator /
# keras.utils.Sequence inputs, has no effect on a tf.data.Dataset, and is no
# longer accepted by newer Keras releases.
transformer.fit(train_ds, epochs=epochs)

# Save the entire model as a SavedModel.
transformer.save('saved_model/my_model')
transformer.save_weights('saved_model/weights')

  16660/Unknown - 11379s 682ms/step - loss: 12.5040 - answer_start_outputs_loss: 6.2524 - answer_end_outputs_loss: 6.2517 - answer_start_outputs_acc: 0.0019 - answer_end_outputs_acc: 0.0022