# Sentence Picker Model

## Load preprocessed data

In [0]:
# Project folder, expressed relative to the Google Drive mount point; the next
# cell prepends the mount point to it to obtain absolute paths.
gdrive_project_pathname = '/My Drive/w266/SQuad/UniversalSentenceEncoder/'  #@param {type: "string"}

In [0]:
import os.path
from google.colab import drive

# Mount the current user's Google Drive
GOOGLE_DRIVE_MOUNT_POINT = "/content/drive"
print("Mounting Google Drive beneath %s" % GOOGLE_DRIVE_MOUNT_POINT)
drive.mount(GOOGLE_DRIVE_MOUNT_POINT, force_remount = True)

# Build the pathname to the project's folder residing beneath the current user's Google Drive.
# The value is normalised to start AND end with "/": the plain string
# concatenations below (and the paths built from them in later cells) rely on
# both separators being present, whatever was typed into the form field.
if not gdrive_project_pathname.startswith("/"):
  gdrive_project_pathname = "/" + gdrive_project_pathname
if not gdrive_project_pathname.endswith("/"):
  gdrive_project_pathname = gdrive_project_pathname + "/"
abs_project_pathname = GOOGLE_DRIVE_MOUNT_POINT + gdrive_project_pathname
print("Project folder: %s" % gdrive_project_pathname)

# Check that the subdirectories anticipated beneath the Google Drive project folder exist
# by checking for the presence of the DO_NOT_DELETE.txt file
for subfolder in ["data", "py"]:
    marker_pathname = os.path.join(abs_project_pathname, subfolder, "DO_NOT_DELETE.txt")
    if not os.path.exists(marker_pathname):
        raise FileNotFoundError("Required subfolder '" + subfolder + "' does not exist beneath the Google Drive project folder")
print("Project subfolders successfully verified")

# abs_project_pathname is guaranteed to end with "/" here, so simple
# concatenation yields well-formed paths.
abs_data_pathname = abs_project_pathname + "data"
abs_py_pathname = abs_project_pathname + "py/"

Mounting Google Drive beneath /content/drive
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
Project folder: /My Drive/w266/SQuad/UniversalSentenceEncoder/
Project subfolders successfully verified


In [0]:
import os
import sys
# Python resources developed for this project.
# Only extend sys.path once, so re-running this cell does not keep appending
# duplicate entries for the same folder.
if abs_py_pathname not in sys.path:
    sys.path.append(abs_py_pathname)
import util

In [0]:
# Install the TensorFlow, TensorFlow Datasets and matplotlib packages (quiet mode)
# before they are imported below.
!pip install -q tensorflow tensorflow-datasets matplotlib
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
# `tf` refers to the TensorFlow 2 compatibility namespace throughout the notebook.
import tensorflow.compat.v2 as tf

import tensorflow_datasets as tfds
# Turn off tfds progress bars and enable TensorFlow 2 behaviour.
tfds.disable_progress_bar()
tf.enable_v2_behavior()

In [0]:
import pandas as pd
import time
import tensorflow_hub as hub
# Disable warnings: set the TensorFlow logger threshold to ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Alternative kept for reference (currently disabled): set the
# TF_CPP_MIN_LOG_LEVEL environment variable instead.
# import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [0]:
# Version checks
import importlib
def version_greater_equal(v1, v2):
    """Return True when dotted version string ``v1`` is at least ``v2``.

    Each dot-separated field is reduced to its leading run of digits, so
    suffixed fields such as ``"0-rc2"`` compare as ``0`` instead of raising
    ``ValueError``; a field with no leading digits counts as ``0``. The shorter
    version is padded with zeros, so ``"1"`` is treated as ``"1.0"`` and is
    therefore *not* >= ``"1.10"``.

    Args:
        v1: installed version string, e.g. ``"2.2.0-rc2"``.
        v2: minimum required version string, e.g. ``"1.10"``.

    Returns:
        bool: True if ``v1`` >= ``v2`` field by field, False otherwise.
        Pre-release suffixes are ignored, so ``"2.2.0-rc2"`` compares equal
        to ``"2.2.0"``.
    """
    def _numeric_fields(version):
        # Convert "2.2.0-rc2" -> [2, 2, 0], keeping only each field's leading digits.
        fields = []
        for field in version.split('.'):
            digits = ''
            for ch in field:
                if ch not in '0123456789':
                    break
                digits += ch
            fields.append(int(digits) if digits else 0)
        return fields

    left = _numeric_fields(v1)
    right = _numeric_fields(v2)
    # Pad to equal length so missing trailing fields compare as zero.
    width = max(len(left), len(right))
    left += [0] * (width - len(left))
    right += [0] * (width - len(right))
    return left >= right
    
def version_check(libname, min_version):
    """Import ``libname`` and print whether its ``__version__`` meets ``min_version``.

    Args:
        libname: importable module name whose ``__version__`` is inspected.
        min_version: minimum acceptable dotted version string.

    Prints two lines: the detected version, then the verdict.
    """
    module = importlib.import_module(libname)
    installed = module.__version__
    print ("%s version %s is " % (libname, installed))
    if version_greater_equal(installed, min_version):
        verdict = "OK"
    else:
        verdict = "out-of-date. Please upgrade!"
    print (verdict)
    
# Report whether the installed TensorFlow is at least version 1.10.
version_check("tensorflow", "1.10")

tensorflow version 2.2.0-rc2 is 
OK


In [0]:
# Load the pickled training DataFrame from the project's data folder.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
# Earlier location / file version, kept for reference:
#train_data=pd.read_pickle('/gdrive/My Drive/Colab Notebooks/SQuad/data/train_v1_pd.pkl')
train_data=pd.read_pickle(abs_data_pathname + '/train_v2_pd.pkl')

In [0]:
# Preview the first rows of the training frame to check its columns.
train_data.head()

Unnamed: 0,ID,Context,Question,Answers,Start,Target,Context_length,Input,Expectation
0,[5733be284776f41900661182],"[Architecturally, the school has a Catholic ch...",[To whom did the Virgin Mary allegedly appear ...,[Saint Bernadette Soubirous],[515],"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",7,[To whom did the Virgin Mary allegedly appear ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1,[5733be284776f4190066117f],"[Architecturally, the school has a Catholic ch...",[What is in front of the Notre Dame Main Build...,[a copper statue of Christ],[188],"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,[What is in front of the Notre Dame Main Build...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
2,[5733be284776f41900661180],"[Architecturally, the school has a Catholic ch...",[The Basilica of the Sacred heart at Notre Dam...,[the Main Building],[279],"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,[The Basilica of the Sacred heart at Notre Dam...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
3,[5733be284776f41900661181],"[Architecturally, the school has a Catholic ch...",[What is the Grotto at Notre Dame?],[a Marian place of prayer and reflection],[381],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",7,"[What is the Grotto at Notre Dame?, Architectu...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,[5733be284776f4190066117e],"[Architecturally, the school has a Catholic ch...",[What sits on top of the Main Building at Notr...,[a golden statue of the Virgin Mary],[92],"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,[What sits on top of the Main Building at Notr...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


## **Setting Hyperparameters**


In [0]:
# Hyperparameters

# TF-Hub handle of the embedding module. Other handles that were tried:
#   "https://tfhub.dev/google/universal-sentence-encoder-qa/3"
#   "https://tfhub.dev/google/Wiki-words-250-with-normalization/2"
embedding_model = "https://tfhub.dev/google/universal-sentence-encoder-large/5"

# Must be consistent with the embedding model chosen
# (250 was the previous value — presumably for the Wiki-words-250 module).
Embedding_dimension = 512
Embedding_expansion = 1024
Layer_num = 8
num_heads = 8
batch_size = 32  # 64 was also tried

# NOTE(review): this flag is not read by any visible cell; Encoder builds the
# hub layer with trainable=True regardless.
embed_training = False

learning_rate = 0.0001

# NOTE(review): Embedding_expansion is passed to the model as `dff`, while
# Embedding_dimension is passed as `d_model` — confirm this is the intended
# divisibility check for num_heads.
assert Embedding_expansion % num_heads == 0

## Define model


In [0]:
class Encoder(tf.keras.layers.Layer):
  """Embeds every text entry with a TF-Hub module, then applies a stack of encoder layers.

  Args:
    num_layers: number of `util.EncoderLayer` blocks applied in sequence.
    d_model: model width; used to scale the embeddings and forwarded to every
      encoder layer. Expected to match the hub module's output size
      (NOTE(review): not checked here).
    num_heads: forwarded to `util.EncoderLayer`.
    dff: forwarded to `util.EncoderLayer`.
    embedding_model_link: handle/URL given to `hub.KerasLayer`.
    rate: dropout rate, used for the embedding dropout and forwarded to every
      encoder layer.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, embedding_model_link, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    # NOTE(review): the hub layer is always created trainable; the notebook-level
    # `embed_training` flag is not consulted here.
    self.embedding = hub.KerasLayer(embedding_model_link,input_shape=[],trainable=True)

    self.enc_layers = [util.EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training,mask=None):
    """Encode a batch of text sequences.

    Args:
      x: iterable over the batch; every element is passed on its own to the
        hub embedding layer (assumes a (batch_size, input_seq_len) string
        tensor — TODO confirm against `util.get_value_ts`).
      training: forwarded to the dropout layer and to every encoder layer.
      mask: optional mask forwarded unchanged to every encoder layer.

    Returns:
      Tensor of shape (batch_size, input_seq_len, d_model).
    """
    # Embed each example separately, then stack once along a new leading batch
    # axis. This yields the same tensor as growing it with tf.concat inside the
    # loop, without re-copying the accumulated batch at every step.
    x = tf.stack([self.embedding(sentences) for sentences in x],
                 axis=0)  # (batch_size, input_seq_len, d_model)

    # Scale the embeddings by sqrt(d_model) before dropout.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    x = self.dropout(x, training=training)

    for enc_layer in self.enc_layers:
      x = enc_layer(x, training, mask)

    return x  # (batch_size, input_seq_len, d_model)

In [0]:
class Mymodel(tf.keras.Model):
  """Encoder followed by a one-unit Dense scorer and a softmax over positions.

  The constructor arguments are forwarded unchanged to `Encoder`; `d_model` is
  also kept to flatten the encoder output before scoring.
  """

  def __init__(self, num_layers, d_model, num_heads, dff,embedding_model_link,rate=0.1):
    super(Mymodel, self).__init__()

    self.d_model = d_model
    self.encoder = Encoder(num_layers, d_model, num_heads, dff, embedding_model_link, rate)
    # One scalar score per encoded position.
    self.output_layer1 = tf.keras.layers.Dense(1)

  def call(self, para, training, mask=None):
    """Return a softmax distribution over every position except the first.

    Args:
      para: batch of inputs handed to the encoder.
      training: forwarded to the encoder.
      mask: optional mask; forwarded to the encoder and, after squeezing axes
        1 and 2, scaled by -1e9 and added to the scores.

    Returns:
      Tensor of shape (batch, positions - 1) whose rows are softmax outputs.
    """
    encoded = self.encoder(para, training, mask)  # (batch_size, inp_seq_len, d_model)
    position_count = encoded.shape[-2]

    # Flatten to one row per position, score it, then fold back to (batch, positions).
    flat_scores = self.output_layer1(tf.reshape(encoded, (-1, self.d_model)))
    scores = tf.reshape(flat_scores, (-1, position_count))

    if mask is not None:
      # Push masked positions towards zero probability.
      scores = scores + tf.squeeze(mask, axis=[1, 2]) * -1e9

    # Position 0 is dropped before normalising (presumably the question entry
    # of the input — confirm against the data preparation).
    return tf.nn.softmax(scores[:, 1:], axis=-1)

In [0]:
# Rebind the name first (presumably so a model left over from an earlier run of
# this cell is released before the new one is built).
AnswerLocator=""
AnswerLocator = Mymodel(
    num_layers=Layer_num,
    d_model=Embedding_dimension,
    num_heads=num_heads,
    dff=Embedding_expansion,
    embedding_model_link=embedding_model,
    rate=0.1,
)

## Training Setup


In [0]:
# Adam optimizer driven by the notebook-level learning_rate hyperparameter.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [0]:
# Categorical cross-entropy with default settings; the model already returns softmax outputs.
loss_fn= tf.keras.losses.CategoricalCrossentropy()
# Running metrics updated by train_step: mean loss and categorical accuracy.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')

In [0]:


# @tf.function()#input_signature=train_step_signature)
def train_step(context, target,mask=None):
  """Run one optimisation step on a batch and update the running metrics.

  Args:
    context: batch of model inputs.
    target: target distribution compared with the model's predictions.
    mask: optional mask forwarded to the model.
  """
  # Forward pass in training mode, recorded for differentiation.
  with tf.GradientTape() as tape:
    predictions = AnswerLocator(context, True, mask)
    loss = loss_fn(target, predictions)

  # Back-propagate and apply the update to every trainable variable.
  variables = AnswerLocator.trainable_variables
  grads = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(grads, variables))

  # Accumulate into the module-level metrics.
  train_loss(loss)
  train_accuracy(target, predictions)

In [0]:
#Check Point setup:
# Keep checkpoints beneath the mounted Google Drive project folder. The former
# hard-coded "/gdrive/My Drive/..." location is not under the mount point used
# by this notebook ("/content/drive"), so checkpoints written there did not
# land in the mounted Drive.
checkpoint_path = os.path.join(abs_project_pathname, "checkpoint", "UniversalSentenceEn", "Large")

# Track both the model and the optimizer state.
ckpt = tf.train.Checkpoint(AnswerLocator=AnswerLocator,
                           optimizer=optimizer)

# Only the two most recent checkpoints are retained.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

# if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print ('Latest checkpoint restored!!')

In [0]:
# @tf.function
def eval_step(Inputs,Validation):
  """Score one evaluation batch.

  Args:
    Inputs: batch handed to the model (inference mode, no mask).
    Validation: container of accepted indices; a prediction counts as
      correct when its argmax is a member of it.

  Returns:
    (correct, count): number of correct predictions and number of predictions.
  """
  predictions = AnswerLocator(Inputs, False, None)

  hits = 0
  total = 0
  for row in predictions:
    total += 1
    if np.argmax(row) in Validation:
      hits += 1
  return hits, total


In [0]:
# Load the pickled dev DataFrame from the project's data folder.
# NOTE(review): this is a "v1" file while the training set is loaded from a "v2" file — confirm this is intended.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
dev_data=pd.read_pickle(abs_data_pathname + '/dev_v1_pd.pkl')


In [0]:
def eval_result():
  """Evaluate the model on `dev_data` one row at a time; print accuracy and elapsed time."""
  began = time.time()

  hits = 0
  seen = 0

  for index in np.arange(len(dev_data)):
    # Each dev row is evaluated as its own single-element batch.
    Inputs = util.get_value_ts(dev_data, [index], 'Input')
    Validation = dev_data.iloc[index]['Validation']

    correct, count = eval_step(Inputs, Validation)
    hits += correct
    seen += count

  print ('Accuracy {:.4f}'.format(hits/seen))

  print ('Time taken: {} secs\n'.format(time.time() - began))

## Execute Training

In [0]:
# Training driver: runs EPOCHS passes over train_data, logging every 200
# batches and saving a checkpoint at the end of each epoch.
EPOCHS=2
# batch_list receives one entry per logged batch (every 200 batches), whereas
# loss_list / accuracy_list receive one entry per epoch.
batch_list = []
loss_list = []
accuracy_list = []

for epoch in range(EPOCHS):
  start = time.time()
  
  # Metrics are running values; reset them so each epoch is reported on its own.
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  # presumably returns the epoch's shuffled batches of row indices — see util.shuffle_data
  train_index=util.shuffle_data(train_data,batch_size)


  for batch,index in enumerate(train_index):
    Inputs=util.get_value_ts(train_data,index,'Input')
    
    target=util.get_value_ts(train_data,index,'Target')
    
    
    # Mask derived from the inputs by util.create_padding_mask; train_step forwards it to the model.
    mask=util.create_padding_mask(Inputs)
    train_step(Inputs,target,mask)
    
    if batch % 200 == 0:
      batch_list.append(batch)
      # The printed values are the running metrics since the start of the epoch.
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  
  ckpt_save_path = ckpt_manager.save()
  
  # Dev-set evaluation after each epoch is currently disabled.
  # eval_result()
  print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))
  loss_list.append(train_loss.result())
  accuracy_list.append(train_accuracy.result())

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.5935 Accuracy 0.3125
Epoch 1 Batch 200 Loss 1.5716 Accuracy 0.2463
Epoch 1 Batch 400 Loss 1.5499 Accuracy 0.2604
Epoch 1 Batch 600 Loss 1.5115 Accuracy 0.2892
Epoch 1 Batch 800 Loss 1.4565 Accuracy 0.3240
Epoch 1 Batch 1000 Loss 1.3882 Accuracy 0.3701
Epoch 1 Batch 1200 Loss 1.3200 Accuracy 0.4126
Epoch 1 Batch 1400 Loss 1.2623 Accuracy 0.4473
Epoch 1 Batch 1600 Loss 1.2122 Accuracy 0.4766
Epoch 1 Batch 1800 Loss 1.1701 Accuracy 0.5008
Epoch 1 Batch 2000 Loss 1.1326 Accuracy 0.5219
Epoch 1 Batch 2200 Loss 1.0999 Accuracy 0.5393
Epoch 1 Batch 2400 Loss 1.0724 Accuracy 0.5552
Epoch 1 Batch 2600 Loss 1.0488 Accuracy 0.5687
Saving checkpoint for epoch 1 at /gdrive/My Drive/w266/SQuad/UniversalSentenceEncoder/checkpoint/UniversalSentenceEn/Large/ckpt-1
Epoch 1 Loss 1.0320 Accuracy 0.5777
Time taken for 1 epoch: 9481.01419377327 secs

Epoch 2 Batch 0 Loss 0.6132 Accuracy 0.8438
Epoch 2 Batch 200 Loss 0.5903 Accuracy 0.8030
Epoch 2 Batch 400 Loss 0.5832 Accuracy 0.8039


In [0]:

# Extra, manually triggered checkpoint save.
# Relies on `epoch` still holding the value left behind by the training-loop cell.
ckpt_save_path = ckpt_manager.save()
print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                        ckpt_save_path))

Saving checkpoint for epoch 2 at /gdrive/My Drive/w266/SQuad/UniversalSentenceEncoder/checkpoint/UniversalSentenceEn/Large/ckpt-3


## Evaluation

In [0]:
# Print dev-set accuracy and the evaluation time.
eval_result()

Accuracy 0.8117
Time taken: 998.7151756286621 secs

