In [1]:
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

--2022-08-30 21:47:15--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json’


2022-08-30 21:47:16 (283 MB/s) - ‘dev-v2.0.json’ saved [4370528/4370528]

--2022-08-30 21:47:16--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.110.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2022-08-30 21:47:19 (429 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]



In [2]:
# !pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 17.7 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1


In [1]:
import pandas as pd 
import json
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers

from tokenizers import BertWordPieceTokenizer

In [2]:
# Fixed token length of every model input: shorter context+question pairs are
# zero-padded up to this length, longer ones are rejected by the preprocessing.
MAX_SEQ_LENGTH = 384

# explore the data 

In [5]:
# Load the raw SQuAD v2.0 dev split. The encoding is stated explicitly so the
# read does not depend on the platform's default codec.
with open('dev-v2.0.json', encoding='utf-8') as f:
    raw_dev_data = json.load(f)

# Peek at the first paragraph's context. This stays outside the `with` block so
# that it is the cell's last top-level expression and is actually displayed.
raw_dev_data["data"][0]['paragraphs'][0]['context']

In [6]:
# # data structure 
# data : list
#     [0]
#     'title': str
#     'paragraphs' : list
#                 [0]
#                 'qas': list
#                     [0]
#                     'question' : str
#                     'id' : str
#                     'answers': list  # if answers is an empty list, there is a plausible_answers list with the same structure
#                             'text' : str
#                             'answer_start' : int
#                     'is_impossible' :bool
#                 'context' : str

# prepare the data for model training

In [3]:
def get_text(row):
    """Extract the first annotated answer of a question record.

    Args:
        row: Mapping-like record whose 'answers' entry is a list of dicts,
            each carrying 'text' and 'answer_start'.

    Returns:
        {'text': <answer text>, 'start': <answer_start>} built from the first
        answer, or None when the answers list is empty.
    """
    answers = row['answers']
    if not answers:
        return None

    first = answers[0]
    begin, span = first['answer_start'], first['text']
    return {'text': span, 'start': begin}

def read_squad_json(file):
    """Load a SQuAD-format JSON file into a flat, one-row-per-question DataFrame.

    Args:
        file: Path to a SQuAD JSON file: a top-level 'data' list whose items
            hold 'paragraphs', each with a 'context' string and a 'qas' list.

    Returns:
        pandas.DataFrame with the normalized question fields ('question',
        'id', 'answers', ...) plus three added columns:
            'context': the paragraph text the question belongs to,
            'text':    text of the first answer (missing when there is none),
            'start':   int32 character offset of the first answer, or -1 when
                       the question has no answer.
    """
    # The context manager closes the handle; the explicit encoding keeps the
    # read independent of the platform's default codec.
    with open(file, encoding='utf-8') as handle:
        squad = json.load(handle)

    # One row per paragraph: its context and the list of its questions.
    paragraphs = pd.json_normalize(squad, ['data', 'paragraphs'])
    # One row per question, produced in the same traversal order.
    dataframe = pd.json_normalize(squad, record_path=['data', 'paragraphs', 'qas'])
    # Repeat every context once per question so it lines up row-for-row.
    dataframe['context'] = np.repeat(
        paragraphs['context'].values, paragraphs['qas'].map(len)
    )

    # Keep only the first answer of each question. The columns are built
    # directly so the result does not depend on how pandas expands a mix of
    # dict and None results, and no chained in-place fillna is needed.
    first_answers = [answers[0] if answers else None for answers in dataframe['answers']]
    dataframe['text'] = [ans['text'] if ans else None for ans in first_answers]
    dataframe['start'] = np.array(
        [ans['answer_start'] if ans else -1 for ans in first_answers],
        dtype='int32',
    )
    return dataframe


def tokenizer_output(tokenized_context, tokenized_question, seq_maxlen):
  '''
  Build the three fixed-length BERT input lists for a context/question pair.

  The context ids come first (segment 0), followed by the question ids with
  their leading [CLS] removed (segment 1). All three lists are zero-padded to
  seq_maxlen.

  input :
    tokenized_context, tokenized_question : objects exposing an `ids` list
    seq_maxlen : target length of every returned list
  output :
    dict with keys input_word_ids, input_mask, input_type_ids, or None when
    the combined sequence is longer than seq_maxlen
  '''
  context_ids = tokenized_context.ids
  question_ids = tokenized_question.ids[1:]  # skip the question's own [CLS]

  word_ids = context_ids + question_ids
  pad = seq_maxlen - len(word_ids)
  if pad < 0:
    # Too long to fit: signal the caller to skip this pair.
    return None

  filler = [0] * pad
  segment_ids = [0] * len(context_ids) + [1] * len(question_ids)
  attention = [1] * len(word_ids)
  return {
      'input_word_ids': word_ids + filler,
      'input_mask': attention + filler,
      'input_type_ids': segment_ids + filler,
  }

  
def get_start_end_tokens(context, context_tokens, answer_text, start_index):
  """
  Map a character-level answer span onto token positions of the context.

  context        : the context string
  context_tokens : object exposing `offsets`, a sequence of (start, end)
                   character offsets, one pair per token
  answer_text    : the answer string
  start_index    : character offset of the answer in the context, -1 when
                   the question has no answer

  Returns {'start_token_idx': ..., 'end_token_idx': ...} holding the first and
  last token whose character range overlaps the answer, or None when
  start_index is -1.
  """
  # Questions without an answer yield no span.
  # (disabled alternative: return MAX_SEQ_LENGTH+1 for both indices instead)
  if start_index == -1:
    return None

  answer_stop = start_index + len(answer_text)

  # Per-character flags: 1 for every character that belongs to the answer.
  char_flags = [0] * len(context)
  for pos in range(start_index, answer_stop):
    char_flags[pos] = 1

  # A token is part of the answer when its character range touches a flag.
  hits = [
      token_pos
      for token_pos, (tok_begin, tok_end) in enumerate(context_tokens.offsets)
      if any(char_flags[tok_begin:tok_end])
  ]

  return {'start_token_idx': hits[0], 'end_token_idx': hits[-1]}

def input_output_dataframe(row):
  '''
  Build one training example (BERT inputs plus answer-span labels) from a row.

  Used for training only. Relies on the notebook-level `tokenizer` and
  `MAX_SEQ_LENGTH`.

  input :
    row : mapping/Series with 'context', 'question', 'text' (answer text) and
          'start' (answer start character, -1 when there is no answer)
  output :
    dict with keys input_word_ids, input_mask, input_type_ids,
    start_token_idx, end_token_idx; or None when the row has no answer or the
    context+question pair does not fit in MAX_SEQ_LENGTH tokens
  '''
  # Unanswerable rows are discarded anyway, so check this before paying for
  # tokenization.
  if row['start'] == -1:
    return None

  # tokenize the context
  tokenized_context = tokenizer.encode(row['context'])
  # tokenize the question
  tokenized_question = tokenizer.encode(row['question'])

  output = tokenizer_output(tokenized_context, tokenized_question, MAX_SEQ_LENGTH)
  if not output:
    # Pair is longer than MAX_SEQ_LENGTH: skip it.
    return None

  # Add the answer's first/last token positions as the training labels.
  output.update(get_start_end_tokens(row['context'], tokenized_context, row['text'], row['start']))
  return output

In [4]:
#bert layer
# NOTE(review): the handle looks like a local directory holding the TF-Hub
# module - confirm it exists next to the notebook. trainable=True lets the
# BERT weights be updated during training.
bert_layer = hub.KerasLayer("bert_en_uncased_L-12_H-768_A-12_2", trainable=True)

# build tokenizer from the vocabulary and casing flag shipped with the module
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
# bool(...) converts the NumPy scalar returned by .numpy() into a plain bool.
do_lower_case = bool(bert_layer.resolved_object.do_lower_case.numpy())
# Use the module's own casing flag instead of a hard-coded True, so the
# tokenizer stays in sync with the model if a cased module is swapped in.
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=do_lower_case)


In [5]:
# Build the training table in one chain: read the questions, turn every row
# into model inputs and span labels, drop rows that produced nothing
# (unanswerable or too long) and store the label columns as int32.
train_data = (
    read_squad_json("train-v2.0.json")[['context', 'question', 'text', 'start']]
    .apply(input_output_dataframe, result_type='expand', axis=1)
    .dropna()
    .astype({'start_token_idx': 'int32', 'end_token_idx': 'int32'})
)

In [6]:
# Sanity check: how many training examples survived preprocessing, plus a
# look at the first two rows.
print(train_data.shape)
train_data.head(2)

(85935, 5)


Unnamed: 0,input_word_ids,input_mask,input_type_ids,start_token_idx,end_token_idx
0,"[101, 20773, 21025, 19358, 22815, 1011, 5708, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",67,70
1,"[101, 20773, 21025, 19358, 22815, 1011, 5708, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,57


In [9]:
# val_data = read_squad_json("dev-v2.0.json")[['context','question','text','start']]
# val_data = val_data.apply(input_output_dataframe,result_type='expand',axis=1)
# val_data.dropna(inplace = True)
# val_data[['start_token_idx'	,'end_token_idx']] = val_data[['start_token_idx'	,'end_token_idx']].astype('int32')

# model architecture

In [7]:
# Three int32 inputs of fixed length MAX_SEQ_LENGTH, fed to the BERT layer as
# [word ids, mask, type ids].
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name='input_type_ids')
# bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])


# One score per token for "answer starts here"; Flatten drops the trailing
# size-1 axis so the scores form one row per example.
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)

# Same head for "answer ends here".
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)

# Softmax over token positions turns the scores into probabilities.
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)

model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[start_probs, end_probs])
# The outputs are already probabilities, hence from_logits=False; the targets
# are integer token positions, hence the sparse loss.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` replaces the deprecated `lr` alias, which triggered the
# warning shown in this cell's output.
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
# The same loss is applied to the start head and the end head.
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 384)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 384)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 384)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 384, 768)]                'input_mask[0][0]',         

  super(Adam, self).__init__(name, **kwargs)


# train the model

In [8]:
# Stack every per-row list/label column into one array per model input and
# output, then train. Input order matches the model's inputs; label order
# matches its (start, end) outputs.
model.fit(
    [
        np.stack(train_data[column].values)
        for column in ('input_word_ids', 'input_mask', 'input_type_ids')
    ],
    [
        np.stack(train_data[column].values)
        for column in ('start_token_idx', 'end_token_idx')
    ],
    epochs=2,
    batch_size=8,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x21fb8f6ef80>

# predict

In [9]:
# Persist the trained model to the 'bert_model' directory, then load it back.
model.save('bert_model')
# NOTE(review): the reloaded copy is bound to `mm`, but the prediction code
# visible in this notebook calls `model`, not `mm` - confirm which is intended.
mm= tf.keras.models.load_model('bert_model')



INFO:tensorflow:Assets written to: bert_model\assets


INFO:tensorflow:Assets written to: bert_model\assets


In [17]:
# Sample question and context passage used to try out the trained model.
question = 'what is tensorflow?'
context = ''' TensorFlow is a free and open-source software library for machine learning and artificial intelligence.
 It can be used across a range of tasks but has a particular focus on training and inference of deep neural networks.[4][5]
TensorFlow was developed by the Google Brain team for internal Google use in research and production.[6][7][8] 
The initial version was released under the Apache License 2.0 in 2015.[1][9] Google released the updated version of TensorFlow,
 named TensorFlow 2.0, in September 2019.[10]
TensorFlow can be used in a wide variety of programming languages, most notably Python, 
as well as Javascript, C++, and Java.[11] This flexibility lends itself to a range of applications in many different sectors. '''




In [18]:
def predict_answer(context,question):
  """
  Predict the answer span for a question inside a context passage.

  Relies on the notebook-level `tokenizer`, `model` and `MAX_SEQ_LENGTH`.

  context  : passage to search for the answer
  question : question about the passage

  Returns the decoded text of the tokens between the most probable start
  position and the most probable end position (inclusive). The string is
  empty when the predicted end lies before the predicted start.

  Raises ValueError when context and question together need more than
  MAX_SEQ_LENGTH tokens.
  """
  # tokenize the context
  tokenized_context = tokenizer.encode(context)
  # tokenize the question
  tokenized_question = tokenizer.encode(question)
  inp_dict  = tokenizer_output(tokenized_context, tokenized_question, MAX_SEQ_LENGTH)

  # tokenizer_output returns None for over-long input; fail with a clear
  # message instead of an opaque TypeError on subscripting None.
  if inp_dict is None:
    needed = len(tokenized_context.ids) + len(tokenized_question.ids) - 1
    raise ValueError(
        f"context + question need {needed} tokens, "
        f"but the model accepts at most {MAX_SEQ_LENGTH}"
    )

  # Wrap each list in an outer list to form a batch of one example.
  batch = [
      np.array([inp_dict['input_word_ids']]),
      np.array([inp_dict['input_mask']]),
      np.array([inp_dict['input_type_ids']]),
  ]
  pred_start, pred_end = model.predict(batch)

  # With a single example, the flat argmax is the token position itself.
  start_idx = int(pred_start.argmax())
  end_idx = int(pred_end.argmax())
  return tokenizer.decode(inp_dict['input_word_ids'][start_idx:end_idx + 1])


In [19]:
# Ask the trained model the sample question about the sample context.
predict_answer(context,question)

'a free and open - source software library for machine learning and artificial intelligence'