In [None]:
from google.colab import drive
drive.mount('/content/drive')

import zipfile

data_path = "/content/drive/MyDrive/NLP Project/quoref-train-dev-v0.1.zip"
zip_ref = zipfile.ZipFile(data_path, 'r')
zip_ref.extractall("/content")
zip_ref.close()


!pip install tokenizers
!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import warnings
warnings.filterwarnings("ignore") 
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForQuestionAnswering

max_len = 512  # every example is padded to this many tokens; longer ones are skipped in preprocessing
configuration = BertConfig()  # default parameters and configuration for BERT


# Save the slow pretrained tokenizer so that its vocab.txt exists on disk
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # using the pretrained tokenizer
save_path = "/content/bert_base_uncased/"
# Make sure the target directory exists before saving; depending on the installed
# transformers version save_pretrained() may not create it on its own.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the vocabulary file that was just saved
# (path derived from save_path so the two can never drift apart)
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)


# loading the data
train_path = '/content/quoref-train-dev-v0.1/quoref-train-v0.1.json'
eval_path = '/content/quoref-train-dev-v0.1/quoref-dev-v0.1.json'

# JSON text is UTF-8; naming the encoding keeps decoding independent of the
# platform's default locale.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

In [None]:
class DataInstance:
  """One question/answer pair together with the model features derived from it.

  Attributes set by the constructor:
    question, context, answer_text, all_answers: the raw values passed in.
    start_word_idx: character offset of the answer inside the raw `context`
      (the value is used as a character index, despite the name).
    skip: True once `preprocess()` decided the instance cannot be used.

  Attributes filled in by `preprocess()` (they stay None for skipped instances):
    input_ids, token_type_ids, attention_mask: lists of length `max_len`.
    start_token_idx, end_token_idx: first / last context token of the answer.
    context_token_to_word: per-token (start, end) character offsets into the
      whitespace-collapsed context.
  """

  def __init__(self, question, context, start_word_idx, answer_text, all_answers):
    self.question = question
    self.context = context
    self.start_word_idx = start_word_idx
    self.answer_text = answer_text
    self.all_answers = all_answers
    self.skip = False

    # Defined up front so that reading them on a skipped instance yields None
    # instead of raising AttributeError.
    self.input_ids = None
    self.token_type_ids = None
    self.attention_mask = None
    self.start_token_idx = None
    self.end_token_idx = None
    self.context_token_to_word = None


  @staticmethod
  def _collapse_whitespace(text):
    """Return `text` as a string with every whitespace run replaced by one space."""
    return " ".join(str(text).split())


  @staticmethod
  def _start_in_clean_context(raw_context, raw_start):
    """Translate a character offset in the raw context to the collapsed context."""
    prefix = raw_context[:raw_start]
    clean_prefix = " ".join(prefix.split())
    # When the raw prefix ends in whitespace the answer begins a new word, so
    # one separating space follows the collapsed prefix.
    if clean_prefix and prefix[-1].isspace():
      return len(clean_prefix) + 1
    return len(clean_prefix)


  def preprocess(self):
    """Tokenize the instance and compute padded inputs plus answer token span.

    Uses the module-level `tokenizer` and `max_len`. Sets `self.skip = True`
    and returns early when the answer does not fit inside the context, when no
    context token overlaps the answer, or when context + question exceed
    `max_len` tokens.
    """
    raw_context = str(self.context)

    # clean context, answer and question
    context = self._collapse_whitespace(raw_context)
    question = self._collapse_whitespace(self.question)
    answer = self._collapse_whitespace(self.answer_text)

    # The stored start offset refers to the raw context; collapsing whitespace
    # shifts positions, so map it onto the cleaned context first.
    start_word_idx = self._start_in_clean_context(raw_context, self.start_word_idx)

    # Find end character index (exclusive) of answer in context.
    # An answer that ends exactly at the end of the context is still valid,
    # hence `>` rather than `>=`.
    end_word_idx = start_word_idx + len(answer)
    if end_word_idx > len(context):
      self.skip = True
      return

    # Mark the character positions of the context that belong to the answer
    is_char_in_ans = [0] * len(context)
    for idx in range(start_word_idx, end_word_idx):
      is_char_in_ans[idx] = 1

    # Tokenize context
    tokenized_context = tokenizer.encode(context)

    # Find the tokens that were created from answer characters.
    # The offsets associated with each token let you slice the input string
    # and thus retrieve the original part that produced that token.
    ans_token_idx = [
        idx
        for idx, (start, end) in enumerate(tokenized_context.offsets)
        if sum(is_char_in_ans[start:end]) > 0
    ]
    if not ans_token_idx:
      self.skip = True
      return

    # Find the start and end token index for tokens from answer
    start_token_idx = ans_token_idx[0]
    end_token_idx = ans_token_idx[-1]

    # tokenize question
    tokenized_question = tokenizer.encode(question)

    # Create inputs: context ids followed by the question ids without their
    # first id. Segment ids are 0 for the context part and 1 for the question.
    question_ids = tokenized_question.ids[1:]
    input_ids = tokenized_context.ids + question_ids
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
    attention_mask = [1] * len(input_ids)

    # Pad and create attention masks.
    # Skip if truncation is needed
    padding_length = max_len - len(input_ids)
    if padding_length < 0:  # skip
      self.skip = True
      return
    padding = [0] * padding_length  # empty when the input is exactly max_len long
    input_ids = input_ids + padding
    attention_mask = attention_mask + padding
    token_type_ids = token_type_ids + padding

    self.input_ids = input_ids
    self.token_type_ids = token_type_ids
    self.attention_mask = attention_mask
    self.start_token_idx = start_token_idx
    self.end_token_idx = end_token_idx
    self.context_token_to_word = tokenized_context.offsets

In [None]:
def create_instance(raw_data):
  """Build one preprocessed DataInstance per question in a SQuAD-style dict.

  Args:
    raw_data: dict with a "data" list; each entry has "paragraphs", each
      paragraph has a "context" string and a "qas" list whose items carry a
      "question" and a non-empty "answers" list of {"text", "answer_start"}.

  Returns:
    A list of DataInstance objects, in file order, each already run through
    `preprocess()` (so some may have `skip` set).
  """
  data_instances = []
  for article in raw_data["data"]:
    for paragraph in article["paragraphs"]:
      passage = paragraph["context"]
      for qa_entry in paragraph["qas"]:
        query = qa_entry["question"]
        gold_answers = qa_entry["answers"]
        # Only the first answer provides the training span ...
        primary_text = gold_answers[0]["text"]
        # ... but every answer text is kept for exact-match scoring.
        every_answer_text = [candidate["text"] for candidate in gold_answers]
        primary_start = gold_answers[0]["answer_start"]

        instance = DataInstance(query, passage, primary_start, primary_text, every_answer_text)
        instance.preprocess()
        data_instances.append(instance)

  return data_instances


def create_input_targets(data_instances):
  """Collect the features of all non-skipped instances into numpy arrays.

  Args:
    data_instances: iterable of objects exposing `skip` plus the attributes
      `input_ids`, `token_type_ids`, `attention_mask`, `start_token_idx` and
      `end_token_idx`.

  Returns:
    (x, y) where x is [input_ids, token_type_ids, attention_mask] and y is
    [start_token_idx, end_token_idx]; every element is a numpy array with one
    row/entry per kept instance.
  """
  input_fields = ("input_ids", "token_type_ids", "attention_mask")
  target_fields = ("start_token_idx", "end_token_idx")

  # Equality with False (not plain truthiness) mirrors the original filter.
  kept = [instance for instance in data_instances if instance.skip == False]

  def column(field):
    # One array per field, stacking that attribute across the kept instances.
    return np.array([getattr(instance, field) for instance in kept])

  x = [column(field) for field in input_fields]
  y = [column(field) for field in target_fields]

  return x, y

In [None]:
train_instances = create_instance(raw_train_data)
x_train, y_train = create_input_targets(train_instances)
print("trainset size: {}".format(len(train_instances)))
# The size above also counts instances that preprocessing flagged with skip=True.
# The arrays fed to the model only hold the remaining ones, so report that too.
print("trainset usable size: {}".format(len(y_train[0])))

eval_instances = create_instance(raw_eval_data)
x_eval, y_eval = create_input_targets(eval_instances)
print("evalset size: {}".format(len(eval_instances)))
print("evalset usable size: {}".format(len(y_eval[0])))


trainset size: 19399
evalset size: 2418


In [None]:
# x_eval[0] is the input_ids array of the evaluation split: one row per
# non-skipped instance, one column per (padded) token position.
print(x_eval[0].shape)

(1662, 512)


In [None]:
def create_model():
  """Build and compile the single-span question-answering model.

  A pretrained `TFBertForQuestionAnswering` ("bert-base-uncased") is wrapped in
  a Keras functional model whose three int32 inputs have shape `(max_len,)`
  each: input ids, token type ids and attention mask. The encoder's start and
  end logits are turned into probabilities with a softmax.

  Returns:
    A compiled `keras.Model` with inputs
    `[input_ids, token_type_ids, attention_mask]` and outputs
    `[start_probs, end_probs]`, trained with sparse categorical cross-entropy
    on each output and Adam at a learning rate of 5e-5.
  """
  encoder = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased")

  # single span QA model
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
  token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
  bert_output = encoder(
                input_ids = input_ids,
                token_type_ids = token_type_ids,
                attention_mask = attention_mask,
                return_dict= True
              ) # outputs (start_logits, end_logits)

  start_logits = bert_output["start_logits"]
  end_logits =  bert_output["end_logits"]

  # The model outputs probabilities (not logits) because the evaluation code
  # multiplies start and end scores to rank candidate spans.
  start_probs = layers.Activation(keras.activations.softmax)(start_logits)
  end_probs = layers.Activation(keras.activations.softmax)(end_logits)

  model = keras.Model(
      inputs = [input_ids, token_type_ids, attention_mask],
      outputs = [start_probs, end_probs],
  )

  loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False) # cross entropy on probs of last layer
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  optimizer = keras.optimizers.Adam(learning_rate=5e-5)
  model.compile(optimizer=optimizer, loss=[loss, loss])

  return model

In [None]:
use_tpu = True  # set to False to build the model on the default device instead
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of
    # tf.distribute.TPUStrategy; only fall back to the experimental name on
    # TensorFlow builds that do not provide the stable one.
    tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None) or tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)

    # Create model (variables must be created inside the strategy scope)
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()





INFO:tensorflow:Initializing the TPU system: grpc://10.17.87.226:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.17.87.226:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.














Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_bert_for_question_answering_ TFQuestionAnsweringM 108893186   input_16[0][0]                   
                                                                 input_18[0][0]             

In [None]:
def normalize_text(text):
    """Normalize an answer string for exact-match comparison.

    The text is lower-cased, ASCII punctuation (`string.punctuation`) is
    deleted, the standalone words "a", "an" and "the" are blanked out, and
    whitespace is collapsed to single spaces with no leading/trailing space.
    """
    lowered = text.lower()

    # Remove punctuations
    punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))

    # Remove articles
    article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free, flags=re.UNICODE)

    # Remove extra white space
    return " ".join(article_free.split())

class ExactMatch(keras.callbacks.Callback):
  """Keras callback that prints exact-match (EM) scores after every epoch.

  For each evaluation example the most probable answer span is decoded from
  the predicted start/end probabilities, normalized with `normalize_text` and
  compared against all normalized reference answers.

  Relies on the module-level `eval_instances` list; its non-skipped entries
  must line up, in order, with the rows of `x_eval`.

  Args:
    x_eval: model inputs for the evaluation split.
    y_eval: evaluation targets (stored, not used for scoring).
  """

  def __init__(self, x_eval, y_eval):
    # Let the Keras base class set up its own attributes first.
    super().__init__()
    self.x_eval = x_eval
    self.y_eval = y_eval

  def on_epoch_end(self, epoch, logs=None):
    """Predict on the evaluation inputs and print the EM scores for `epoch`."""
    pred_start, pred_end = self.model.predict(self.x_eval)
    count = 0
    eval_instances_no_skip = [ele for ele in eval_instances if ele.skip==False]

    for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
      quoref_eg = eval_instances_no_skip[idx]
      offsets = quoref_eg.context_token_to_word
      # The token offsets were computed on the whitespace-collapsed context,
      # so the predicted span has to be cut from that same string.
      clean_context = " ".join(str(quoref_eg.context).split())

      # Score of every (start, end) pair; the upper triangle keeps only spans
      # whose end is not before their start.
      mat = np.triu(np.outer(start, end))
      pos = np.argmax(mat)
      start, end = divmod(pos, mat.shape[1])
      if (start >= len(offsets)):
        # Predicted start lies outside the context tokens: counted as a miss.
        continue
      pred_word_start = offsets[start][0]
      if (end < len(offsets)):
        pred_word_end = offsets[end][1]
        pred_ans = clean_context[pred_word_start:pred_word_end] # characters pred_word_start .. pred_word_end-1 of the context
      else:
        pred_ans = clean_context[pred_word_start:]

      normalized_pred_ans = normalize_text(pred_ans)
      normalized_true_ans = [normalize_text(_) for _ in quoref_eg.all_answers]
      if (normalized_pred_ans in normalized_true_ans):
        count += 1

    # NOTE(review): EM_on_whole used to divide by the same number of examples
    # as the second score, making both values identical. It now divides by all
    # evaluation instances (skipped ones count as misses) - confirm intent.
    total = len(eval_instances)
    usable = len(eval_instances_no_skip)
    acc_whole = count/total if total else 0.0
    acc  = count/usable if usable else 0.0
    print(f"epoch={epoch+1}, EM_on_whole = {acc_whole:.2f}, exact match score={acc:.2f}\n")


In [None]:
# Fine-tune the model; the callback prints exact-match scores on the
# evaluation split at the end of every epoch.
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=50,
    verbose=2,  # one summary line per epoch instead of a progress bar
    batch_size=32,
    callbacks=[exact_match_callback],
)

Epoch 1/50
























410/410 - 138s - loss: 5.7800 - activation_6_loss: 2.7873 - activation_7_loss: 2.9927












epoch=1, EM_on_whole = 0.37, exact match score=0.37

Epoch 2/50
410/410 - 55s - loss: 3.1741 - activation_6_loss: 1.4991 - activation_7_loss: 1.6750
epoch=2, EM_on_whole = 0.43, exact match score=0.43

Epoch 3/50
410/410 - 55s - loss: 2.0283 - activation_6_loss: 0.9481 - activation_7_loss: 1.0803
epoch=3, EM_on_whole = 0.43, exact match score=0.43

Epoch 4/50
410/410 - 55s - loss: 1.2891 - activation_6_loss: 0.5952 - activation_7_loss: 0.6940
epoch=4, EM_on_whole = 0.45, exact match score=0.45

Epoch 5/50
410/410 - 55s - loss: 0.9207 - activation_6_loss: 0.4271 - activation_7_loss: 0.4936
epoch=5, EM_on_whole = 0.44, exact match score=0.44

Epoch 6/50
410/410 - 55s - loss: 0.6703 - activation_6_loss: 0.3129 - activation_7_loss: 0.3574
epoch=6, EM_on_whole = 0.44, exact match score=0.44

Epoch 7/50
410/410 - 55s - loss: 0.5489 - activation_6_loss: 0.2589 - activation_7_loss: 0.2900
epoch=7, EM_on_whole = 0.45, exact match score=0.45

Epoch 8/50
410/410 - 55s - loss: 0.4220 - activation_

<tensorflow.python.keras.callbacks.History at 0x7f31b8af1f50>

In [None]:
# Final exact-match evaluation of the trained model on the evaluation split.
pred_start, pred_end = model.predict(x_eval)
count = 0
# Rows of x_eval correspond, in order, to the non-skipped evaluation instances.
eval_instances_no_skip = [ele for ele in eval_instances if ele.skip==False]

for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
  quoref_eg = eval_instances_no_skip[idx]
  offsets = quoref_eg.context_token_to_word
  # The token offsets were computed on the whitespace-collapsed context,
  # so the predicted span has to be cut from that same string.
  clean_context = " ".join(str(quoref_eg.context).split())

  # Score of every (start, end) pair; the upper triangle keeps only spans
  # whose end is not before their start.
  mat = np.triu(np.outer(start, end))
  pos = np.argmax(mat)
  start, end = divmod(pos, mat.shape[1])
  if (start >= len(offsets)):
    # Predicted start lies outside the context tokens: counted as a miss.
    continue
  pred_word_start = offsets[start][0]
  if (end < len(offsets)):
    pred_word_end = offsets[end][1]
    pred_ans = clean_context[pred_word_start:pred_word_end] # characters pred_word_start .. pred_word_end-1 of the context
  else:
    pred_ans = clean_context[pred_word_start:]

  normalized_pred_ans = normalize_text(pred_ans)
  normalized_true_ans = [normalize_text(_) for _ in quoref_eg.all_answers]
  if (normalized_pred_ans in normalized_true_ans):
    count += 1


# NOTE(review): EM_on_whole used to divide by the same number of examples as
# the second score, making both values identical. It now divides by all
# evaluation instances (skipped ones count as misses) - confirm intent.
total = len(eval_instances)
usable = len(eval_instances_no_skip)
acc_whole = count/total if total else 0.0
acc  = count/usable if usable else 0.0
print(f"EM count = {count}, EM_on_whole = {acc_whole:.2f}, exact match score={acc:.2f}\n")


EM count = 776, EM_on_whole = 0.47, exact match score=0.47

