In [1]:
import json
import re
import os
import zipfile
import collections
import numpy as np
import pandas as pd
import urllib.request
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import defaultdict
from tensorflow.keras.layers import dot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, SimpleRNN, Embedding, Dense, Reshape, LSTM, GRU, Dropout, Bidirectional, BatchNormalization, Flatten
from tensorflow.keras.preprocessing.sequence import skipgrams
from collections import Counter
from operator import itemgetter
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# set path with magic
%env DATA_DIR=./data/squad 

# download the data
def download_squad(version=1):
    if version == 1:
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
    else:
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
            
download_squad(version=2)

env: DATA_DIR=./data/squad
--2021-05-30 04:59:04--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘./data/squad/train-v2.0.json’


2021-05-30 04:59:06 (119 MB/s) - ‘./data/squad/train-v2.0.json’ saved [42123633/42123633]

--2021-05-30 04:59:06--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘./data/squad/dev-v2.0.json’


2021-05-30 04:59:06 (39.8 MB/s) - 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def get_dataframe(file):
  f = open(file, 'r')
  #loading json file 
  data = json.loads(f.read())
  #creating empty lists to store df values 
  iid = []
  tit = []
  con = []
  que = []
  ans = []
  txt = []
  #root tags contains 'title' tag and 'paragraphs' list 
  for i in range(len(data['data'])):
    title = data['data'][i]['title']
    #'paragraphs' list contains 'context' tag and 'qas' list 
    for p in range(len(data['data'][i]['paragraphs'])):
      context = data['data'][i]['paragraphs'][p]['context']
      for q in range(len(data['data'][i]['paragraphs'][p]['qas'])):
        # 'qas'list contains 'question', 'Id' tag and 'answers' list 
        question = data['data'][i]['paragraphs'][p]['qas'][q]['question']
        id = data['data'][i]['paragraphs'][p]['qas'][q]['id']
        #'answers' list contains 'answer_start' and 'text' tag 
        for a in range(len(data['data'][i]['paragraphs'][p]['qas'][q]['answers'])):
          ans_start = data['data'][i]['paragraphs'][p]['qas'][q]['answers'][a]['answer_start']
          text = data['data'][i]['paragraphs'][p]['qas'][q]['answers'][a]['text']
          
          #appending values to list 
          iid.append(id)
          tit.append(title)
          con.append(context)
          que.append(question)
          ans.append(ans_start)
          txt.append(text)
  #creating dataframe from lists 
  new_df = pd.DataFrame(columns=['Id', 'title', 'context', 'question', 'ans_start', 'text'])
  new_df.Id = iid
  new_df.title=tit
  new_df.context = con
  new_df.question = que
  new_df.ans_start = ans 
  new_df.text = txt 
  #removing duplicate columns 
  final_df = new_df.drop_duplicates(keep='first')

  return final_df

In [51]:
# Get SQuAD training set
df = get_dataframe('/content/data/squad/dev-v2.0.json')

In [52]:
# Get training data from Univ.AI
df_faq = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/UNIV-AI-AI3/faq.csv')

In [53]:
# Concatenate question and answer
df['qa_pair'] = df['question'] + ' ' + df['text']
df_faq['qa_pair'] = df_faq['Question'] + ' ' + df_faq['Answer']

In [54]:
def clean_data(data, col_name): 
    data[col_name] = [re.sub("[^a-zA-Z ]", "", i) for i in data[col_name]]
    data = data.applymap(lambda s:s.lower() if type(s) == str else s)
    l = [i for i in data.qa_pair if len(i.split(" "))>5 and len(i.split(" "))<=55]
    data = pd.DataFrame(l, columns=[col_name])
    data[col_name] = '<s> '+data[col_name] + ' </s>'
    return data

In [55]:
# Clean data
df = clean_data(df, 'qa_pair')
df_faq = clean_data(df_faq, 'qa_pair')

In [56]:
# Concatenate SQuAD and Univ.AI training data
df_joint = pd.concat([df, df_faq])

In [57]:
# Size of the vocabulary
vocab_size = 5000 

# Tokenize the data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, lower=True, char_level=False, split=' ', oov_token=None, filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n')

In [58]:
# Fit the tokenizer on text
tokenizer.fit_on_texts(df_joint.qa_pair)

# Text to sequence
data = tokenizer.texts_to_sequences(df.qa_pair)
data_faq = tokenizer.texts_to_sequences(df_faq.qa_pair)

In [59]:
x_data = [i[:-1] for i in data]
y_data = [i[1:] for i in data]

x_data_faq = [i[:-1] for i in data]
y_data_faq = [i[1:] for i in data]

# Post-pad input and output (max length is 55)
x_data = tf.convert_to_tensor(sequence.pad_sequences(x_data, padding='post', maxlen=55))
y_data = tf.convert_to_tensor(sequence.pad_sequences(y_data, padding='post', maxlen=55))

x_data_faq = tf.convert_to_tensor(sequence.pad_sequences(x_data_faq, padding='post', maxlen=55))
y_data_faq = tf.convert_to_tensor(sequence.pad_sequences(y_data_faq, padding='post', maxlen=55))

In [14]:
# Simple RNN model

hidden_size = 300

tf.keras.backend.clear_session()

inputs = Input(shape=x_data.shape[1:], name='input')
x = Embedding(input_dim=vocab_size+1, output_dim=hidden_size, name="embedding", mask_zero=True)(inputs)
x = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=True, name="RNN_layer_1")(x)
x = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=True, name="RNN_layer_2")(x)
outputs = Dense(vocab_size, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs, name="Simple_RNN_model")
model.summary()

Model: "Simple_RNN_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 55)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 55, 300)           1500300   
_________________________________________________________________
RNN_layer_1 (SimpleRNN)      (None, 55, 300)           180300    
_________________________________________________________________
RNN_layer_2 (SimpleRNN)      (None, 55, 300)           180300    
_________________________________________________________________
dense (Dense)                (None, 55, 5000)          1505000   
Total params: 3,365,900
Trainable params: 3,365,900
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-2), metrics='accuracy')

In [16]:
history = model.fit(x_data, y_data, epochs=50, batch_size=512, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [42]:
# Save/load weights
model.load_weights('/content/drive/My Drive/Colab Notebooks/UNIV-AI-AI3/model_weights_1.h5')

In [43]:
# Output single prediction
output = outputs[:,-1,:]
model_pred = Model(inputs, outputs=output)
model_pred.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 55)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 55, 300)           1500300   
_________________________________________________________________
RNN_layer_1 (SimpleRNN)      (None, 55, 300)           180300    
_________________________________________________________________
RNN_layer_2 (SimpleRNN)      (None, 55, 300)           180300    
_________________________________________________________________
dense (Dense)                (None, 55, 5000)          1505000   
_________________________________________________________________
tf.__operators__.getitem_1 ( (None, 5000)              0         
Total params: 3,365,900
Trainable params: 3,365,900
Non-trainable params: 0
_________________________________________________

In [44]:
df_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/UNIV-AI-AI3/test.csv')

In [45]:
for test_num in range(len(df_test)-1):

  i = 0
  predicted_word = ""
  input = df_test.Questions[test_num].lower()

  # Predict next 10 words or until the </s> token
  while i<10 or predicted_word!="</s>":
    test_data = '<s> '+input+' </s>'
    test_data = tokenizer.texts_to_sequences([test_data])
    test_data[0] = test_data[0][:-1] 
    test_data = np.array(test_data)
    test_data = sequence.pad_sequences(test_data, padding='post',maxlen=55)
    pred = model_pred([test_data])
    index = pred[0].numpy().argmax()
    character = tokenizer.sequences_to_texts([[index]])
    predicted_word = character[0]
    input = input+" "+ predicted_word
    i+=1

  print("Question:", df_test.Questions[test_num])
  print("Answer:", input[len(df_test.Questions[test_num]):])

Question: Will the pre-class session be recorded?
Answer:  in the university decide to get rid of the football program emphasize academics </s>
Question: What is the deadline for quiz submission?
Answer:  plague spreading to the individual </s> </s> </s> </s> </s>
Question: What is the deadline for exercise submission?
Answer:  were to a higher energy content </s> </s> </s> </s>
Question: How many hours do I need to complete this course?
Answer:  like responses </s> </s> </s> </s> </s> </s> </s> </s>
Question: Who will grade the exercise?
Answer:  treaty the concept of human capital formation long </s> </s>
Question: Why is the auto-grader failing me?
Answer:  with huguenots foreign </s> </s> </s> </s> </s> </s> </s>
Question: Do I do the exercises individually?
Answer:  in farming lead to the black death </s> </s> </s>
Question: Is the lab compulsory?
Answer:  in scotland </s> </s> </s> </s> </s> </s> </s> </s>
Question: Will the sessions be recorded?
Answer:  in the amazon basin mois

In [50]:
model.trainable = False

inputs = Input(shape=x_data.shape[1:], name='input')
x = model(inputs, training=False)
x = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=True, name="RNN_layer_3")(x)
x = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=True, name="RNN_layer_4")(x)
outputs = Dense(vocab_size, activation='softmax')(x)
model2 = Model(inputs=inputs, outputs=outputs, name="Simple_RNN_model_2")
model2.summary()

Model: "Simple_RNN_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 55)]              0         
_________________________________________________________________
Simple_RNN_model (Functional (None, 55, 5000)          3365900   
_________________________________________________________________
RNN_layer_3 (SimpleRNN)      (None, 55, 300)           1590300   
_________________________________________________________________
RNN_layer_4 (SimpleRNN)      (None, 55, 300)           180300    
_________________________________________________________________
dense_3 (Dense)              (None, 55, 5000)          1505000   
Total params: 6,641,500
Trainable params: 3,275,600
Non-trainable params: 3,365,900
_________________________________________________________________


In [60]:
model2.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-2), metrics='accuracy')

In [None]:
history = model2.fit(x_data_faq, y_data_faq, epochs=50, batch_size=512, validation_split=0.2)

In [64]:
# Save/load weights
model.load_weights('/content/drive/My Drive/Colab Notebooks/UNIV-AI-AI3/model_weights_4.h5')

In [65]:
# Output single prediction
output = outputs[:,-1,:]
model_pred2 = Model(inputs, outputs=output)

In [66]:
for test_num in range(len(df_test)-1):

  i = 0
  predicted_word = ""
  input = df_test.Questions[test_num].lower()

  # Predict next 10 words or until the </s> token
  while i<10 or predicted_word!="</s>":
    test_data = '<s> '+input+' </s>'
    test_data = tokenizer.texts_to_sequences([test_data])
    test_data[0] = test_data[0][:-1] 
    test_data = np.array(test_data)
    test_data = sequence.pad_sequences(test_data, padding='post',maxlen=55)
    pred = model_pred2([test_data])
    index = pred[0].numpy().argmax()
    character = tokenizer.sequences_to_texts([[index]])
    predicted_word = character[0]
    input = input+" "+ predicted_word
    i+=1

  print("Question:", df_test.Questions[test_num])
  print("Answer:", input[len(df_test.Questions[test_num]):])

Question: Will the pre-class session be recorded?
Answer:  the the the </s> </s> </s> </s> </s> </s> </s>
Question: What is the deadline for quiz submission?
Answer:  the the the the </s> </s> </s> </s> </s> </s>
Question: What is the deadline for exercise submission?
Answer:  the the the </s> </s> </s> </s> </s> </s> </s>
Question: How many hours do I need to complete this course?
Answer:  </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>
Question: Who will grade the exercise?
Answer:  the the the the </s> </s> </s> </s> </s> </s>
Question: Why is the auto-grader failing me?
Answer:  the the the </s> </s> </s> </s> </s> </s> </s>
Question: Do I do the exercises individually?
Answer:  the the the the </s> </s> </s> </s> </s> </s>
Question: Is the lab compulsory?
Answer:  of the the the the </s> </s> </s> </s> </s>
Question: Will the sessions be recorded?
Answer:  of the the the the </s> </s> </s> </s> </s>
Question: Can I have access to the recorded videos?
Answer:  the the </s> </s> <