In [82]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tensorflow2-question-answering/simplified-nq-train.jsonl
/kaggle/input/tensorflow2-question-answering/sample_submission.csv
/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl


In [83]:
import numpy as np 
import pandas as pd
import json
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import re
import gc
import seaborn as sns

import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D, Dense, Dropout, Input, concatenate, Conv1D, Activation, Flatten

In [84]:
#path for data files
train_path = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'
test_path = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'

# **Part 1 - identifying correct long answer to a question**

In [85]:
# define training parameters 
num_train_ques = 2000
num_val_ques = 2050
sample_rate = 15

In [86]:
def get_question_and_document(line):
    question = line['question_text']
    text = line['document_text'].split(' ')
    annotations = line['annotations'][0]    
    return question, text, annotations
                
def get_long_candidate(i, annotations, candidate):
    # check if this candidate is the correct long answer
    if i == annotations['long_answer']['candidate_index']:
        label = True
    else:
        label = False

    # get place where long answer starts and ends in the document text
    long_start = candidate['start_token']
    long_end = candidate['end_token']    
    return label, long_start, long_end

def form_data_row(question, label, text, long_start, long_end):
    row = {
        'question': question,
        'long_answer': ' '.join(text[long_start:long_end]),
        'is_long_answer': label,
    }    
    return row

In [87]:
def load_data(file_path, questions_start, questions_end):
    rows = []
    
    with open(file_path) as file:
        for i in tqdm(range(questions_start, questions_end)):
            line = file.readline()
            line = json.loads(line)
            question, text, annotations = get_question_and_document(line)

            for i, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(i, annotations, candidate)

                if label == True or (i % sample_rate == 0):
                    rows.append(
                        form_data_row(question, int(label), text, long_start, long_end)
                    )        
    return pd.DataFrame(rows)

In [88]:
train_df = load_data(train_path, 0, num_train_ques)
test_df = load_data(train_path, num_train_ques, num_val_ques)

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [89]:
train_df.head(5)

Unnamed: 0,question,long_answer,is_long_answer
0,which is the most common use of opt-in e-mail ...,<Table> <Tr> <Td> </Td> <Td> ( hide ) This art...,0
1,which is the most common use of opt-in e-mail ...,<Tr> <Td> <Ul> <Li> Pay - per - click </Li> <L...,0
2,which is the most common use of opt-in e-mail ...,<P> Email marketing has evolved rapidly alongs...,0
3,which is the most common use of opt-in e-mail ...,<Li> Advertisers can reach substantial numbers...,0
4,which is the most common use of opt-in e-mail ...,<P> A common example of permission marketing i...,1


In [90]:
test_df.head(5)

Unnamed: 0,question,long_answer,is_long_answer
0,which is the most common use of opt-in e-mail ...,<Table> <Tr> <Td> </Td> <Td> ( hide ) This art...,0
1,which is the most common use of opt-in e-mail ...,<Tr> <Td> <Ul> <Li> Pay - per - click </Li> <L...,0
2,which is the most common use of opt-in e-mail ...,<P> Email marketing has evolved rapidly alongs...,0
3,which is the most common use of opt-in e-mail ...,<Li> Advertisers can reach substantial numbers...,0
4,which is the most common use of opt-in e-mail ...,<P> A common example of permission marketing i...,1


# **Pre-processing texts**

In [91]:
#cleaning texts by removing stopwords 
def remove_stopwords(sentence):
    words = sentence.split()
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

#removing html tags 
def remove_html(sentence):
    html = re.compile(r'<.*?>` `` ')
    return html.sub(r'', sentence)

#returns the pre-processed dataframe for long answers and questions
def preprocessed_df(df):
    df['long_answer'] = df['long_answer'].apply(lambda x : remove_html(x))
    df['long_answer'] = df['long_answer'].apply(lambda x : remove_stopwords(x))

    df['question'] = df['question'].apply(lambda x : remove_html(x))
    df['question'] = df['question'].apply(lambda x : remove_stopwords(x))
    
    return df

In [94]:
train_df = preprocessed_df(train_df)
test_df = preprocessed_df(test_df)
train_df.head(5)

Unnamed: 0,question,long_answer,is_long_answer
0,common use opt-in e-mail marketing,<Table> <Tr> <Td> </Td> <Td> ( hide ) This art...,0
1,common use opt-in e-mail marketing,<Tr> <Td> <Ul> <Li> Pay - per - click </Li> <L...,0
2,common use opt-in e-mail marketing,<P> Email marketing evolved rapidly alongside ...,0
3,common use opt-in e-mail marketing,<Li> Advertisers reach substantial numbers ema...,0
4,common use opt-in e-mail marketing,<P> A common example permission marketing news...,1


# ***Pre-processing***

In [None]:
train_df.head(10)['long_answer']

In [None]:
# Shuffle training dataframe

train_df = train_df.sample(frac=1, random_state=42)
train_df.head()

In [None]:
# How many examples of each class?
train_df.is_long_answer.value_counts()

In [None]:
# Data is unbalanced, up-sampling

from sklearn.utils import resample
#create two different dataframe of majority and minority class 
df_majority = train_df[(train_df['is_long_answer']==0)] 
df_minority = train_df[(train_df['is_long_answer']==1)] 
# upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples= int((len(train_df) * 50) / 100), # to match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
train_df = pd.concat([df_minority_upsampled, df_majority])

In [None]:
train_df.is_long_answer.value_counts()

In [None]:
test_df

In [None]:
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

In [None]:
# Visualize random training examples

import random
random_index = random.randint(0, len(train_df)-5) # create random indexes not higher than the total number of samples
for row in train_df[["question", "long_answer", "is_long_answer"]][random_index:random_index+5].itertuples():
  _, q, la, ila = row
  print(f"Target: {ila}", "(true label)" if ila > 0 else "(not true label)")
  print(f"Question:\n{q}\n")
  print(f"Long answer:\n{la}\n")
  print("---\n")

In [None]:
# Splitting into training and validation sets

from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(
    train_df[["question", "long_answer"]],  # Select relevant columns
    train_df["is_long_answer"],
    test_size=0.1,
    random_state=42
)

len(train_data), len(val_data), len(train_labels), len(val_labels)

In [None]:
val_labels.value_counts()

In [None]:
train_data[:10], train_labels[:10]

In [None]:
train_data

# ***Converting text into numbers***

In [None]:
# Text vectorization (tokenization)

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Before TensorFlow 2.6
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [None]:
# Find average number of tokens (words) in long answers
round(sum([len(i.split()) for i in train_data.long_answer])/len(train_data.long_answer))

In [None]:
# Setup text vectorization with custom variables
max_vocab_length = 20000 # max number of words to have in our vocabulary; true value ~ 67k
max_length = 150 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
# Fit the text vectorizer to the training text
# Combine 'question' and 'long_answer' into a single column
train_sentences = train_df['question'] + ' ' + train_df['long_answer']

# Adapt the vectorizer on the training data
text_vectorizer.adapt(train_sentences)

In [None]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

# ***Embedding layer***

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding_layer = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding_layer

In [None]:
# Get a random sentence from training set
random_sentence = random.choice(train_data.long_answer)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding_layer(text_vectorizer([random_sentence]))
sample_embed

In [None]:
# Check out a single token's embedding
sample_embed[0][0]

# ***Model definition***

In [None]:
# question encoding-encodes the question
question_input = layers.Input(shape=(1,), dtype="string")
question_x = text_vectorizer(question_input) # turn the input text into numbers
question_x = embedding_layer(question_x) # create an embedding of the numerized numbers
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)
question_x = GlobalMaxPooling1D()(question_x) # outputs an encoded array representing the question

# answer encoding-encodes the answer
answer_input = Input(shape=(1,), dtype="string")
answer_x = text_vectorizer(answer_input)
answer_x = embedding_layer(answer_x)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)
answer_x = GlobalMaxPooling1D()(answer_x) #outputs an encoded array representing the answer

# classification
combined_x = concatenate([question_x, answer_x])
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
model_output = Dense(1, activation='sigmoid')(combined_x) # probability how close the potential answer is to the true answer to the question.

# defining model by combining above three parts
model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=model_output)

In [None]:
model.summary()

In [None]:
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam',
    metrics=['BinaryAccuracy', 'Recall', 'Precision'])

In [None]:
#define callbacks - to avoid plateauing & achieve early stopping
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),]

In [None]:
# define model parameters
epochs = 10
batch_size = 128
# class_weights = {0: 0.5, 1: 5.}

In [None]:
history = model.fit(
    x=[train_data['long_answer'], train_data['question']],
    y=train_labels,
    validation_data=([val_data['long_answer'], val_data['question']], val_labels),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    shuffle=True
)

In [None]:
#save model
model.save('long_model_no_pretraining')

In [None]:
model.evaluate([val_data['long_answer'], val_data['question']], val_labels)

In [None]:
model_prediction_probs = model.predict([test_df['long_answer'], test_df['question']])
model_prediction_probs[:10]

In [None]:
model_prediction_probs = tf.squeeze(tf.round(model_prediction_probs)) # squeeze removes single dimensions
model_prediction_probs[:20]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [None]:
model_1_results = calculate_results(y_true=test_df['is_long_answer'], 
                                    y_pred=model_prediction_probs)
model_1_results

In [None]:
#part 2

#filter records where short answers exist
def get_short_answer(annotations, long_start, long_end):
    if len(annotations['short_answers']) > 0:
        short_start = annotations['short_answers'][0]['start_token']
        short_end = annotations['short_answers'][0]['end_token']        
        short_start = short_start - long_start
        short_end = short_end - long_start        
        return short_start, short_end
    else:
        return 0, 0
    
def form_short_data_row(question, text, long_start, long_end, short_start, short_end):
    long_answer = ' '.join(text[long_start:long_end])
    short_answer = ' '.join(long_answer.split(' ')[short_start:short_end])
    row = {
        'question': question,
        'long_answer': long_answer,
        'short_answer': short_answer,
        'short_start': short_start,
        'short_end': short_end
    }    
    return row

In [None]:
#loading short answers
def load_short_data(file_path, questions_start, questions_end):
    rows = []    
    with open(file_path) as file:

        for i in tqdm(range(questions_start, questions_end)):
            line = file.readline()
            line = json.loads(line)
            question, text, annotations = get_question_and_document(line)
            for i, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(i, annotations, candidate)

                if label == True:
                    short_start, short_end = get_short_answer(annotations, long_start, long_end)
                    
                    rows.append(
                        form_short_data_row(question, text, long_start, long_end, short_start, short_end)
                    )
    return pd.DataFrame(rows)

In [None]:
train_short_data = load_short_data(train_path, 0 , num_train_ques)
test_short_data = load_short_data(train_path, num_train_ques, num_val_ques)

In [None]:
train_short_data.head(10)

In [None]:
#check count values in each column
def count_values_in_column(data,feature):
    total=data.loc[:,feature].value_counts(dropna=False)
    percentage=round(data.loc[:,feature].value_counts(dropna=False,normalize=True)*100,2)
    return pd.concat([total,percentage],axis=1,keys=['Total','Percentage'])

count_values_in_column(train_short_data, 'short_answer')

In [None]:
#tokenizing
#tokenization params
filters = '!"''#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
lower_case = True
max_len = 500 #max lenght of a sentence input in to the model

#assigning numeric index to each unique work
def define_tokenizer(series):
    sentences = pd.concat(series)    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=lower_case,filters=filters  )
    tokenizer.fit_on_texts(sentences)
    return tokenizer

#encoding
def encode(sentences, tokenizer):
    encoded_sentences = tokenizer.texts_to_sequences(sentences)
    encoded_sentences = tf.keras.preprocessing.sequence.pad_sequences(encoded_sentences,
                                                                      maxlen=max_len, padding='post')
    return encoded_sentences

tokenizer = define_tokenizer([train_df.long_answer, train_df.question, test_df.long_answer, test_df.question])

train_long_ans = encode(train_short_data['long_answer'].values, tokenizer)
train_questions = encode(train_short_data['question'].values, tokenizer)

test_long_ans = encode(test_short_data['long_answer'].values, tokenizer)
test_questions = encode(test_short_data['question'].values, tokenizer)

In [None]:
#define 2 arrays for the start index and another for the end index
def form_short_labels(df, sentence_length):
    start_labels = np.zeros((len(df), sentence_length))
    end_labels = np.zeros((len(df), sentence_length))

    #get the token indexes from short_start and short_end columns and assign it to new arrays. encoding with 1
    for i in range(len(df)):
        start = df.loc[i].short_start
        end = df.loc[i].short_end

        if start < 500 and end < 500:
            start_labels[i, start] = 1
            end_labels[i, end] = 1
        else:
            continue
    return start_labels, end_labels

train_start_labels, train_end_labels = form_short_labels(train_short_data, max_len)
test_start_labels, test_end_labels = form_short_labels(test_short_data, max_len)

In [None]:
print(train_short_data.loc[10].question)

print(train_short_data.loc[10].long_answer)
print(train_short_data.loc[10].short_answer)

print('Start index: {0}'.format(train_start_labels[10]))
print('End index: {0}'.format(train_end_labels[10]))

In [None]:
# short answer model parameters
short_epochs = 10
short_batch_size = 32
embed_size_short=200

In [None]:
#write to matrix after loading from file

vocab = train_df.long_answer + train_df.question + test_df.long_answer + test_df.question
embedding_dict = {word: i for i, word in enumerate(vocab)}


num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size_short))

for word, i in tokenizer.word_index.items():
    if i > num_words:
        continue
    
    emb_vec = embedding_dict.get(word)
    
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec
        
        
# load as tensorflow embedding
#define embedding layer for the short model
embedding_layer2 = tf.keras.layers.Embedding(
    len(tokenizer.word_index) + 1,
    embed_size_short,
    embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
    trainable = False
)

In [None]:
#encode the question input
question_input = Input(shape=(None,))
question_x = embedding_layer2(question_input)
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(200, return_sequences=True))(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)

#encode the answer input
answer_input = Input(shape=(None,))
answer_x = embedding_layer2(answer_input)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(250, return_sequences=True))(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)

combined_x = concatenate([question_x, answer_x])

#predict start idx of the short answer
start_x = Dropout(0.1)(combined_x) 
start_x = Conv1D(1,1)(start_x)
start_x = Flatten()(start_x)
start_x = Activation('softmax', name='start_token')(start_x)

#predict end idx
end_x = Dropout(0.1)(combined_x) 
end_x = Conv1D(1,1)(end_x)
end_x = Flatten()(end_x)
end_x = Activation('softmax', name='end_token')(end_x)

short_model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=[start_x, end_x])

In [None]:
short_model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam',
    metrics=['categorical_accuracy', 'Recall', 'Precision'])

In [None]:
short_model.summary()

In [None]:
# define callbacks for the short model
#inreased patience or number of epochs with no improvement after which training will be stopped.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=4, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),]

In [None]:
history = short_model.fit(
    x = [train_long_ans, train_questions], 
    y = [train_start_labels, train_end_labels],epochs = short_epochs, callbacks = callbacks,
    validation_data = ([test_long_ans, test_questions], [test_start_labels, test_end_labels]),
    batch_size = short_batch_size,  shuffle = True)

In [None]:
short_model.save('short_model_no_pretraining')

In [None]:
print('Epoch: {0}'.format(len(history.history['loss'])))
print('Loss: {0}'.format(history.history['loss'][-1]))

In [None]:
print('Training final results')

accuracy = history.history['start_token_categorical_accuracy'][-1]
recall = history.history['start_token_recall'][-1]
precision = history.history['start_token_precision'][-1]

print('--------------------------------------------------')
print('Start token accuracy: {0}'.format(accuracy))
print('Start token recall: {0}'.format(recall))
print('Start token precision: {0}'.format(precision))
print('Start token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))

print('--------------------------------------------------')

accuracy = history.history['end_token_categorical_accuracy'][-1]
recall = history.history['end_token_recall_1'][-1]
precision = history.history['end_token_precision_1'][-1]

print('End token accuracy: {0}'.format(accuracy))
print('End token recall: {0}'.format(recall))
print('End token precision: {0}'.format(precision))
print('End token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))

In [None]:
print('Validation final results')
print('--------------------------------------------------')

accuracy = history.history['val_start_token_categorical_accuracy'][-1]
recall = history.history['val_start_token_recall'][-1]
precision = history.history['val_start_token_precision'][-1]

print('Start token accuracy: {0}'.format(accuracy))
print('Start token recall: {0}'.format(recall))
print('Start token precision: {0}'.format(precision))
print('Start token F1 score: {0:.4f}'.format( 2 * (precision * recall) / (precision + recall)))

print('--------------------------------------------------')

accuracy = history.history['val_end_token_categorical_accuracy'][-1]
recall = history.history['val_end_token_recall_1'][-1]
precision = history.history['val_end_token_precision_1'][-1]

print('End token accuracy: {0}'.format(accuracy))
print('End token recall: {0}'.format(recall))
print('End token precision: {0}'.format(precision))
print('End token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))