# SQuAD in Tensorflow 2.0 (Question and Answer)

## This time, we solve the SQuAD task with TensorFlow by fine-tuning a pretrained BERT-Large model

Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.

##### For downloading the SQuAD dataset, please refer to the '03_SQuAD_in_Keras' notebook.

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
import pickle
import codecs
from tqdm import tqdm
import shutil
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

# 1. Data Preprocessing and load tokenizer

In [3]:
# Load the pretrained tokenizer for the 'bert-large-uncased' checkpoint.
# The convert_data functions defined below read this module-level `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

I0626 11:55:00.262751  6888 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\bokhy/.cache\torch\transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [4]:
# List the local data directory; later cells read train-v1.1.json and dev-v1.1.json from it.
os.listdir('./data')

['.ipynb_checkpoints',
 'bert',
 'dev-v1.1.json',
 'glove',
 'News_Category_Dataset_v2.json',
 'ratings_test.txt',
 'ratings_train.txt',
 'train-v1.1.json']

A function that converts a SQuAD JSON file into a pandas DataFrame
> Reference: https://www.kaggle.com/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe

In [5]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD training JSON file into a single pandas DataFrame.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame with one row per question: the question id (as the
    first column created by ``reset_index``), 'question', 'context', the
    answer fields found at the deepest level (e.g. 'answer_start', 'text')
    and 'c_id', an integer code identifying each distinct context.

    NOTE(review): the question and answer tables are aligned on the question
    id, so this presumably requires exactly one answer per question (as in the
    training split) -- confirm before using it on multi-answer data.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; the old
    # pd.io.json location is deprecated, so only fall back to it when needed.
    json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    record_path = list(record_path)

    if verbose:
        print("Reading the json file")
    # Use a context manager so the handle is always closed, and read as UTF-8
    # instead of relying on the platform default encoding.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")

    # One table per nesting level: answers, questions, paragraphs.
    answers_df = json_normalize(file, record_path)
    questions_df = json_normalize(file, record_path[:-1])
    paragraphs_df = json_normalize(file, record_path[:-2])

    # Repeat each paragraph's context once per question it contains, and each
    # question id once per answer, so the tables line up row by row.
    questions_df['context'] = np.repeat(paragraphs_df['context'].values,
                                        paragraphs_df['qas'].str.len())
    answers_df['q_idx'] = np.repeat(questions_df['id'].values,
                                    questions_df['answers'].str.len())

    # `axis` must be passed by keyword: recent pandas rejects positional use.
    main = pd.concat(
        [questions_df[['id', 'question', 'context']].set_index('id'),
         answers_df.set_index('q_idx')],
        axis=1, sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]

    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [7]:
# Convert the training JSON file to a DataFrame
train = squad_json_to_dataframe_train("./data/train-v1.1.json")

# New column holding the character length of the 'context' column
train['context_len'] = train['context'].str.len()

# Look at questions of at most 10 characters, which are likely malformed.
# NOTE(review): this expression is not the last one in the cell, so its result is not displayed.
train.loc[train['question'].str.len() <= 10].head(10)
# Keep only questions with at least 10 characters.
# NOTE(review): questions of exactly 10 characters are selected by the check above but kept here.
train = train.loc[train['question'].str.len() >= 10].reset_index(drop=True)

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done


In [11]:
# Same hyperparameter settings as the Keras version
# SEQ_LEN: fixed length of every (question + context + padding) model input
SEQ_LEN = 384
# Column holding the passage text
DATA_COLUMN = "context"
# Column holding the question text
QUESTION_COLUMN = "question"
# Column holding the single answer string of a training row
TEXT = "text"

In [8]:
def convert_data(data_df):
    """
    Turn a training DataFrame into fixed-length BERT inputs and answer spans.

    For every row the question and context are tokenized with the module-level
    `tokenizer`, joined as ``question + context`` (the context's leading token
    is dropped), truncated/padded to SEQ_LEN, and the tokenized answer text is
    searched for inside the context tokens.

    data_df: DataFrame with the QUESTION_COLUMN, DATA_COLUMN and TEXT columns.

    Returns a 3-tuple:
      * [token_ids, attention_masks, segment_ids] -- arrays with one row per
        kept example and SEQ_LEN columns,
      * [answer_start, answer_end] -- inclusive token positions of the answer
        inside the full input,
      * positions (row offsets into data_df) of the examples that were dropped
        because the answer tokens were not found in the (possibly truncated)
        context.

    NOTE(review): a later cell of this notebook defines another `convert_data`
    with a different return shape; once that cell has run it replaces this one.
    """
    global tokenizer
    sep_id = 102          # id this notebook appends as the [SEP] token
    pad_id = 0            # id used to pad inputs up to SEQ_LEN
    max_que_len = 64      # questions longer than this are cut (keeping [SEP])

    # Pull the columns out positionally so the loop does not depend on the
    # DataFrame having a default 0..n-1 index.
    questions = data_df[QUESTION_COLUMN].tolist()
    contexts = data_df[DATA_COLUMN].tolist()
    answers = data_df[TEXT].tolist()

    indices, segments, masks, target_start, target_end = [], [], [], [], []

    for i in tqdm(range(len(data_df))):
        que = tokenizer.encode(questions[i])
        # Drop the first token of the context ([CLS]); the question already starts with it.
        doc = tokenizer.encode(contexts[i])[1:]

        # Cut over-long questions, keeping a closing [SEP].
        if len(que) > max_que_len:
            que = que[:max_que_len - 1] + [sep_id]

        # Cut the context so question + context fits SEQ_LEN, ending in [SEP].
        if len(que) + len(doc) > SEQ_LEN:
            doc = doc[:SEQ_LEN - len(que) - 1] + [sep_id]

        used = len(que) + len(doc)
        pad = SEQ_LEN - used

        # Segment layout: question -> 0, context -> 1, padding -> 0.
        segment = [0] * len(que) + [1] * len(doc) + [0] * pad
        mask = [1] * used + [0] * pad
        ids = que + doc + [pad_id] * pad

        # Answer tokens without the surrounding special tokens.
        answer = tokenizer.encode(answers[i])[1:-1]

        # SEQ_LEN marks "answer not found"; such rows are removed below.
        ans_start = ans_end = SEQ_LEN
        if answer:  # an empty answer would otherwise match at position 0
            width = len(answer)
            for j in range(len(doc) - width + 1):
                if doc[j:j + width] == answer:
                    ans_start = j + len(que)
                    ans_end = ans_start + width - 1
                    break

        indices.append(ids)
        segments.append(segment)
        masks.append(mask)
        target_start.append(ans_start)
        target_end.append(ans_end)

    indices_x = np.array(indices)
    segments = np.array(segments)
    masks = np.array(masks)
    target_start = np.array(target_start)
    target_end = np.array(target_end)

    # Keep only the examples whose answer span was located.
    found = target_start != SEQ_LEN
    keep_idx = np.where(found)[0]
    dropped_idx = np.where(~found)[0]

    indices_x = indices_x[keep_idx]
    segments = segments[keep_idx]
    masks = masks[keep_idx]
    target_start = target_start[keep_idx]
    target_end = target_end[keep_idx]

    return [indices_x, masks, segments], [target_start, target_end], dropped_idx

In [9]:
# Cast the text columns to str and convert the DataFrame into BERT inputs

def load_data(df):
    """
    Prepare a DataFrame for `convert_data` and run the conversion.

    df: DataFrame with the DATA_COLUMN, QUESTION_COLUMN and TEXT columns.

    Returns whatever `convert_data` returns for the string-cast frame, unpacked
    as (data_x, data_y, del_list).
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    data_df = df.copy()
    for column in (DATA_COLUMN, QUESTION_COLUMN, TEXT):
        data_df[column] = data_df[column].astype(str)
    data_x, data_y, del_list = convert_data(data_df)

    return data_x, data_y, del_list

In [12]:
# Build the model inputs (train_x) and answer-span targets (train_y).
# `z` receives the third value returned by load_data (named del_list there).
train_x, train_y, z = load_data(train)

  4%|██▊                                                                         | 3247/87589 [00:14<05:00, 280.83it/s]W0626 11:56:05.136086  6888 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (718 > 512). Running this sequence through the model will result in indexing errors
W0626 11:56:05.150048  6888 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (718 > 512). Running this sequence through the model will result in indexing errors
W0626 11:56:05.164046  6888 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (718 > 512). Running this sequence through the model will result in indexing errors
  4%|██▊                                                                         | 3276/87589 [00:14<05:49, 241.33it/s]W0626 11:56:05.179003  6888 tokenization_utils.p

In [13]:
# Inspect the three input arrays returned as the first element by load_data.
train_x

[array([[  101,  2000,  3183, ...,     0,     0,     0],
        [  101,  2054,  2003, ...,     0,     0,     0],
        [  101,  1996, 13546, ...,     0,     0,     0],
        ...,
        [  101,  2007,  2054, ...,     0,     0,     0],
        [  101,  1999,  2054, ...,     0,     0,     0],
        [  101,  2054,  2003, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])]

# 2. Load Pre-trained BERT model and fine-tuning

In [14]:
def MyModel(learning_rate=1.5e-5):
    """
    Build and compile a BERT-Large span-prediction model.

    A Dense layer with two outputs is applied to the sequence output of the
    pretrained 'bert-large-uncased' encoder; the two outputs are split into
    per-token start logits and end logits.

    learning_rate: Adam learning rate. Defaults to 1.5e-5, the alternative
        noted in the original code; the previous hard-coded 0.01 is orders of
        magnitude larger than values normally used when fine-tuning BERT.

    Returns the compiled tf.keras.Model taking
    [input_word_ids, input_masks, input_segments] and producing
    [start_logits, end_logits], each with SEQ_LEN entries per example.
    """
    bert = TFBertModel.from_pretrained('bert-large-uncased')

    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    seg_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segments')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')

    # Take the first output (the per-token sequence output) by index rather
    # than tuple-unpacking, which only works when exactly two values come back.
    seq_output = bert([token_inputs, mask_inputs, seg_inputs])[0]

    # Two logits per token: one for "answer starts here", one for "answer ends here".
    logits = tf.keras.layers.Dense(
        2, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(seq_output)
    start, end = tf.split(logits, 2, axis=-1)
    start = tf.squeeze(start, axis=-1)
    end = tf.squeeze(end, axis=-1)
    bert_model2 = tf.keras.Model([token_inputs, mask_inputs, seg_inputs], [start, end])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    bert_model2.compile(
          optimizer = opt,
          loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          metrics=['sparse_categorical_accuracy'])
    bert_model2.summary()

    return bert_model2

# 3. Start Training

In [16]:
# Create the compiled model; MyModel() also prints its summary.
# NOTE(review): no call to `fit` is visible in this part of the notebook, so
# weights are presumably trained or loaded elsewhere -- confirm before relying on predictions.
bert_model2 = MyModel()

I0626 12:11:01.370805  6888 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\bokhy/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
I0626 12:11:01.371837  6888 configuration_utils.py:321] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

I0626 12:12:21.703265  6888 modeling_tf_utils.py:393] loading weights file https://cdn.huggingface.co/bert-large-uncased-tf_model.h5 f

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 384, 1024),  335141888   input_word_ids[0][0]             
                                                                 input_masks[0][0]          

In [17]:
# Save weights for later use
# bert_model2.save_weights(os.path.join(path, "bert_large_2epoch.h5"))

# 4. Check the loss 

In [None]:
# Score the model on the training inputs, treating each token position as a class.
from sklearn.metrics import classification_report

# preds[0] / preds[1] are the start / end outputs of the model
preds = bert_model2.predict(train_x)

# Predicted position = index of the largest value along the last axis
start_indexes = np.argmax(preds[0], axis=-1)
end_indexes = np.argmax(preds[1], axis=-1)

# Classification report (precision / recall / f1) for the start positions
print(classification_report(train_y[0], start_indexes))

# Classification report (precision / recall / f1) for the end positions
print(classification_report(train_y[1], end_indexes))

# 5. Inference on Test set 
### (same as Keras version)

In [None]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD dev JSON file into a pandas DataFrame, one row per question.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers' (the raw list of answer dicts of each question) and 'c_id'
    (an integer code identifying each distinct context).
    """
    # pandas >= 1.0 exposes json_normalize at the top level; the old
    # pd.io.json location is deprecated, so only fall back to it when needed.
    json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    record_path = list(record_path)

    if verbose:
        print("Reading the json file")
    # Use a context manager so the handle is always closed, and read as UTF-8
    # instead of relying on the platform default encoding.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")

    # Only the question and paragraph levels are needed here; the answers stay
    # nested inside the 'answers' column of the question table.
    questions_df = json_normalize(file, record_path[:-1])
    paragraphs_df = json_normalize(file, record_path[:-2])

    # Repeat each paragraph's context once per question it contains.
    questions_df['context'] = np.repeat(paragraphs_df['context'].values,
                                        paragraphs_df['qas'].str.len())

    # Explicit copy so adding 'c_id' below does not write into a slice.
    main = questions_df[['id', 'question', 'context', 'answers']].copy()
    main['c_id'] = main['context'].factorize()[0]

    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# Load the dev split into a DataFrame (one row per question, answers kept as lists).
input_file_path ='./data/dev-v1.1.json'
record_path = ['data','paragraphs','qas','answers']
verbose = 0
# Pass `verbose` through: it was defined above but never handed to the loader,
# so the progress messages were printed despite verbose = 0.
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path, record_path=record_path, verbose=verbose)

In [None]:
# Display the dev DataFrame produced by squad_json_to_dataframe_dev.
dev

In [None]:
# Add a column with the number of reference answers of each question
dev['answer_len'] = dev['answers'].map(len)

In [None]:
# change answers to the list
def get_text(text_len, answers):
    """
    Return the 'text' values of the first `text_len` entries of `answers`.

    text_len : number of answers to read
    answers  : list of answer dicts, e.g.
               [{'answer_start': 177, 'text': 'Denver Broncos'}, ...]
    """
    return [answers[position]['text'] for position in range(text_len)]

In [None]:
# Build the 'texts' column: for every row, the list of answer strings
# extracted from 'answers' (using 'answer_len' as the number to read)
dev['texts'] = dev.apply(lambda x: get_text(x['answer_len'], x['answers']), axis=1)

In [None]:
# Specify the column that holds the list of reference answer strings
TEXT_COLUMN = 'texts'

In [None]:
# Same data converting step as training data
def convert_data(data_df):
    """
    Turn the dev DataFrame into fixed-length BERT inputs.

    Every row is tokenized with the module-level `tokenizer`, joined as
    ``question + context`` (the context's leading token is dropped) and
    truncated/padded to SEQ_LEN. A row is kept when at least one of its
    reference answers (TEXT_COLUMN, a list of strings) can be located as a
    token sequence inside the possibly truncated context.

    data_df: DataFrame with the QUESTION_COLUMN, DATA_COLUMN and TEXT_COLUMN
        columns.

    Returns a 2-tuple:
      * [token_ids, attention_masks, segment_ids] -- arrays with one row per
        kept example and SEQ_LEN columns,
      * the positions (row offsets into data_df) of the kept examples.

    NOTE(review): this definition reuses the name of the training
    `convert_data` defined earlier in the notebook but returns two values
    instead of three, so `load_data` no longer works after this cell has run.
    """
    global tokenizer
    sep_id = 102          # id this notebook appends as the [SEP] token
    pad_id = 0            # id used to pad inputs up to SEQ_LEN
    max_que_len = 64      # questions longer than this are cut (keeping [SEP])

    # Pull the columns out positionally so the loop does not depend on the
    # DataFrame having a default 0..n-1 index.
    questions = data_df[QUESTION_COLUMN].tolist()
    contexts = data_df[DATA_COLUMN].tolist()
    answer_lists = data_df[TEXT_COLUMN].tolist()

    indices, segments, masks, answer_found = [], [], [], []

    for i in tqdm(range(len(data_df))):
        que = tokenizer.encode(questions[i])
        # Drop the first token of the context ([CLS]); the question already starts with it.
        doc = tokenizer.encode(contexts[i])[1:]

        # Cut over-long questions, keeping a closing [SEP].
        if len(que) > max_que_len:
            que = que[:max_que_len - 1] + [sep_id]

        # Cut the context so question + context fits SEQ_LEN, ending in [SEP].
        if len(que) + len(doc) > SEQ_LEN:
            doc = doc[:SEQ_LEN - len(que) - 1] + [sep_id]

        used = len(que) + len(doc)
        pad = SEQ_LEN - used

        mask = [1] * used + [0] * pad
        # Segment layout: question -> 0, context -> 1, padding -> 0.
        segment = [0] * len(que) + [1] * len(doc) + [0] * pad
        ids = que + doc + [pad_id] * pad

        # The flag starts False for every row (never stale, never undefined)
        # and the search stops at the first reference answer that matches, so
        # a later non-matching answer cannot undo an earlier match.
        found = False
        for text_element in answer_lists[i]:
            answer = tokenizer.encode(text_element)[1:-1]
            if not answer:  # an empty answer would otherwise match at position 0
                continue
            width = len(answer)
            for j in range(len(doc) - width + 1):
                if doc[j:j + width] == answer:
                    found = True
                    break
            if found:
                break

        indices.append(ids)
        segments.append(segment)
        masks.append(mask)
        answer_found.append(found)

    indices_x = np.array(indices)
    segments = np.array(segments)
    masks = np.array(masks)

    # Positions of the rows whose answer was located; only those are returned.
    del_list = np.where(np.array(answer_found, dtype=bool))[0]
    indices_x = indices_x[del_list]
    segments = segments[del_list]
    masks = masks[del_list]

    return [indices_x, masks, segments], del_list

In [None]:
# Convert the dev DataFrame into BERT inputs; the result is the
# (inputs, del_list) pair returned by convert_data
dev_bert_input = convert_data(dev)

In [None]:
# Split the conversion result into the model inputs and the row positions
# returned with them, then restrict `dev` to exactly those rows
del_list = dev_bert_input[1]
dev_bert_input = dev_bert_input[0]
dev = dev.iloc[del_list].reset_index(drop=True)

In [None]:
# Keep the token-id array (first model input) for decoding predicted spans later
indexes = dev_bert_input[0]
# Run the model on the dev inputs; the result holds the start and end outputs
bert_predictions = bert_model2.predict(dev_bert_input)

In [None]:
# Predicted start / end position = index of the largest value per example
start_indexes = np.argmax(bert_predictions[0], axis=-1)
end_indexes = np.argmax(bert_predictions[1], axis=-1)
# Keep only predictions whose start does not come after the end.
# NOTE(review): examples with an invalid span are removed here rather than
# scored as wrong, so the scores computed below cover only the remaining rows.
not_del_list = np.where(start_indexes <= end_indexes)[0]
start_indexes = start_indexes[not_del_list]
end_indexes = end_indexes[not_del_list]
indexes = indexes[not_del_list]

In [None]:
# Keep `dev` aligned with the filtered predictions (same row positions)
dev = dev.iloc[not_del_list].reset_index(drop=True)

In [None]:
# Sanity check: look up the token string for a single vocabulary id
tokenizer.convert_ids_to_tokens(197)

In [None]:
# length : number of rows left in the dev DataFrame
length = len(dev)

# sentences   : predicted answer span of each example as a list of tokens
# untokenized : the same span rebuilt into a plain string
sentences = []
untokenized = []

for row, (first, last) in enumerate(zip(start_indexes, end_indexes)):
    # Look up the token for every id inside the predicted span (end inclusive)
    sentence = [
        tokenizer.convert_ids_to_tokens(indexes[row][position].item())
        for position in range(first, last + 1)
    ]

    # A token starting with "##" continues the previous word, so it is glued
    # on without the marker; any other token starts a new word after a space
    pieces = [
        token.replace("##", "") if token.startswith("##") else " " + token
        for token in sentence
    ]
    sentence_string = "".join(pieces)

    # Drop the single leading space introduced by the first word
    if sentence_string.startswith(" "):
        sentence_string = sentence_string[1:]

    untokenized.append(sentence_string)
    sentences.append(sentence)

In [None]:
# For every dev row, collect the 'text' of each reference answer into a list
dev_answers = [
    [answer['text'] for answer in dev['answers'][row]]
    for row in range(length)
]

In [None]:
# Tokenize every reference answer; tokenizer.tokenize returns the token
# strings only, so there are no [CLS] / [SEP] entries to strip
dev_tokens = [
    [tokenizer.tokenize(answer_text) for answer_text in answer_texts]
    for answer_texts in dev_answers
]

In [None]:
# Join the tokens of every reference answer back into one space-separated string.
# The loop variables use their own names: the previous version iterated with
# `dev_answers`, which overwrote the module-level `dev_answers` list and made
# the earlier cells that read it unsafe to re-run.
dev_answer_lists = [
    [" ".join(answer_tokens) for answer_tokens in question_tokens]
    for question_tokens in dev_tokens
]

In [None]:
# Undo the word-piece split: removing " ##" glues continuation pieces back
# onto the word they belong to
dev_strings_end = [
    [answer_string.replace(" ##", "") for answer_string in answer_strings]
    for answer_strings in dev_answer_lists
]

# From here on dev_answers holds the cleaned reference answers
dev_answers = dev_strings_end

In [None]:
from collections import Counter
import string, re

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    punctuation = set(string.punctuation)

    # Order matters: lowercase, strip punctuation, blank out articles,
    # then collapse whatever whitespace is left.
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def f1_score(prediction, ground_truth):
    """
    Token-level F1 between a predicted and a reference answer.

    Both strings are normalized with `normalize_answer` and split on
    whitespace; the overlap counts every shared token at most as often as it
    appears in both. Returns 0 when no token is shared.
    """
    predicted_tokens = normalize_answer(prediction).split()
    reference_tokens = normalize_answer(ground_truth).split()

    overlap = sum((Counter(predicted_tokens) & Counter(reference_tokens)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both answers are equal after `normalize_answer`."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """
    Score `prediction` against every reference answer and return the best score.

    metric_fn: callable taking (prediction, ground_truth) and returning a score.
    ground_truths: iterable of reference answers; `max` raises ValueError when
        it is empty.
    """
    return max([metric_fn(prediction, truth) for truth in ground_truths])


In [None]:
# Average, over all predictions, of the best F1 against any reference answer
f1_sum = 0

for i, prediction in enumerate(untokenized):
    f1_sum += metric_max_over_ground_truths(f1_score, prediction, dev_answers[i])
print("f1 score : ", f1_sum / length)

In [None]:
# Share of predictions that exactly match at least one reference answer
EM_sum = 0

for i, prediction in enumerate(untokenized):
    EM_sum += metric_max_over_ground_truths(exact_match_score, prediction, dev_answers[i])
print("EM Score : ", EM_sum / length)