In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
import os
from  tqdm import tqdm
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K
from tensorflow.keras import layers, activations, models, preprocessing, utils
from gensim.models import Word2Vec
import re

In [None]:
# Load the Drive helper and mount
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
import json

arr = []
question = []
answer = []

# Parse the whole JSON document first so the file handle is released before
# the nested structure is walked.
with open("/content/sample_data/train_data.txt", "rb") as json_file:
    data = json.load(json_file)

# Layout walked below: data['data'] -> 'paragraphs' -> 'qas' -> 'answers'.
# Exactly one (question, answer) pair is kept per question: the first listed
# answer. Questions whose 'answers' list is empty contribute nothing.
for article in data['data']:
    for para in article['paragraphs']:
        for qas in para['qas']:
            if qas['answers']:
                answer.append(qas['answers'][0]['text'])
                question.append(qas['question'])

In [None]:
data['data']

In [None]:
question[:5]

['When did Beyonce start becoming popular?',
 'What areas did Beyonce compete in when she was growing up?',
 "When did Beyonce leave Destiny's Child and become a solo singer?",
 'In what city and state did Beyonce  grow up? ',
 'In which decade did Beyonce become famous?']

In [None]:
len(answer), len(question)

(86821, 86821)

In [None]:
answer[:5]

['in the late 1990s',
 'singing and dancing',
 '2003',
 'Houston, Texas',
 'late 1990s']

The training dataset is in JSON format, so we extract the question and answer text from it and store them in a data frame with the columns 'question' and 'reply'.

In [None]:
train_df = pd.DataFrame(list(zip(question, answer)), columns =['question', 'reply'])

In [None]:
train_df.head(2)

Unnamed: 0,question,reply
0,When did Beyonce start becoming popular?,in the late 1990s
1,What areas did Beyonce compete in when she was...,singing and dancing


Loading the QA dataset

In [None]:
qa_df = pd.read_csv(r'/content/sample_data/qa_dataset.csv', encoding = "ISO-8859-1", low_memory=False)

In [None]:
len(qa_df)

2917

In [None]:
qa_df.head(2)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Volta an Italian physicist?,yes,easy,easy,data/set4/a10
1,Alessandro_Volta,Is Volta buried in the city of Pittsburgh?,no,easy,easy,data/set4/a10


Loading the music dataset

In [None]:
music_df = pd.read_csv(r'/content/sample_data/music_questions.csv', encoding = "ISO-8859-1", low_memory=False)

In [None]:
len(music_df)

2976

In [None]:
music_df.head(2)

Unnamed: 0.1,Unnamed: 0,Questions,Answers
0,0,how long is this cord? the pictures looks like...,I took a photo: <http://imgur.com/G48f1C4>I bo...
1,1,Has anyone used this to split a stereo signal?...,I believe this adapter yields a mono split and...


In [None]:
grocery_df = pd.read_csv(r'/content/sample_data/grocery_questions.csv', encoding = "ISO-8859-1", low_memory=False)

In [None]:
len(grocery_df)

2997

In [None]:
grocery_df.head(2)

Unnamed: 0.1,Unnamed: 0,Questions,Answers
0,0,what are the colors that come in the package?,"All colors seen on box plus Teal, Burgundy, Bl..."
1,1,difference between meat cure and pickling salt,Pickling salt is a very pure form of salt. A m...


In [None]:
video_game_df = pd.read_csv(r'/content/sample_data/video_game_qa.csv', encoding = "ISO-8859-1", low_memory=False)

In [None]:
len(video_game_df)

1183

In [None]:
video_game_df.head(2)

Unnamed: 0.1,Unnamed: 0,Questions,Answers
0,0,"Yes, you will need to go to their website to d...","Yes, you will need to go to their website to d..."
1,1,As long as it has a USB port it should work fi...,As long as it has a USB port it should work fi...


We will be keeping only necessary columns i.e., question and reply

In [None]:
qa_df = qa_df.drop(['ArticleTitle','DifficultyFromQuestioner','DifficultyFromAnswerer','ArticleFile'],axis =1 ) 
qa_df.columns = ['question', 'reply']

In [None]:
music_df = music_df.drop(['Unnamed: 0'],axis =1)
music_df.columns = ['question', 'reply']

In [None]:
grocery_df = grocery_df.drop(['Unnamed: 0'],axis =1)
grocery_df.columns = ['question', 'reply']

In [None]:
video_game_df  = video_game_df.drop(['Unnamed: 0'],axis =1)
video_game_df.columns = ['question', 'reply']

We will take the first 100 rows of each data frame and concatenate them into a single data frame.

In [None]:
final_df = [train_df[:100], qa_df[:100],music_df[:100], grocery_df[:100], video_game_df[:100]]

In [None]:
# Stack the sampled frames into one. ignore_index=True assigns a fresh
# 0..n-1 index; without it each 100-row slice keeps its own 0..99 labels and
# the combined frame ends up with duplicated index values.
final_df = pd.concat(final_df, ignore_index=True)

In [None]:
final_df.shape

(500, 2)

Now let's clean the data and do some analysis.

In [None]:
final_df = final_df.apply(lambda x: x.astype(str).str.lower())
final_df.head(2)

Unnamed: 0,question,reply
0,when did beyonce start becoming popular?,in the late 1990s
1,what areas did beyonce compete in when she was...,singing and dancing


Removing URLs, HTML tags, digits and special characters from the questions. A stop-word list is defined below, but the stop-word removal step is currently commented out.

In [None]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would be removed in the 1st step

stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re
from bs4 import BeautifulSoup
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The irregular forms ("won't", "can't") are rewritten first; the generic
    suffix rules follow. Every rule is a plain ``re.sub`` over the whole
    string, applied in the order listed, so an apostrophe-s is always expanded
    to " is" (possessives included).
    """
    # (pattern, replacement) pairs, applied sequentially from top to bottom.
    expansions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
final_df.head(3)

Unnamed: 0,question,reply
0,when did beyonce start becoming popular?,in the late 1990s
1,what areas did beyonce compete in when she was...,singing and dancing
2,when did beyonce leave destiny's child and bec...,2003


In [None]:
from tqdm import tqdm

# Normalise every question: strip URLs and HTML markup, drop any token that
# contains a digit, then collapse each run of non-letters into a single space.
all_questions = []
for sentence in tqdm(final_df['question'].values):
    sentence = re.sub(r"http\S+", "", sentence)              # remove URLs
    sentence = BeautifulSoup(sentence, 'lxml').get_text()    # remove HTML tags
    # Raw string: "\S" and "\d" inside a plain literal are invalid escape
    # sequences and emit a DeprecationWarning/SyntaxWarning on recent Pythons.
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()     # drop tokens containing digits
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)           # keep letters only
    # Stop-word removal is currently disabled:
    # sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    all_questions.append(sentence.strip())

100%|██████████| 500/500 [00:00<00:00, 4538.78it/s]


In [None]:
final_df['question'] =  all_questions

We have 296 duplicate values in our dataset, we will not be dropping those as it will be helpful in getting multiple replies.

In [None]:
final_df.head(2)

Unnamed: 0,question,reply
0,when did beyonce start becoming popular,in the late 1990s
1,what areas did beyonce compete in when she was...,singing and dancing


In [None]:
tokenizer = preprocessing.text.Tokenizer()
# Questions and replies are fed as separate texts. Adding the two Series
# (`final_df['question'] + final_df['reply']`) would concatenate each row's
# strings with no separator, gluing the last question word to the first reply
# word (e.g. "popular" + "in" -> "popularin"). That puts bogus tokens in the
# vocabulary and can leave the real words out of it.
tokenizer.fit_on_texts( list(final_df['question']) + list(final_df['reply']) )
# +1 because the tokenizer's word indices start at 1; index 0 is left for padding.
VOCAB_SIZE = len(tokenizer.word_index)+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 3598


Preparing data for Seq2Seq model
This model requires 3 arrays encoder_input_data, decoder_input_data and decoder_output_data.

For encoder_input_data: Tokenize the questions and pad them to their maximum length.

For decoder_input_data: Tokenize the answers and pad them to their maximum length.

For decoder_output_data: Tokenize the answers and remove the first element from each of the tokenized answers. This is the element which was added earlier.

In [None]:
# Every word known to the fitted tokenizer, in word_index iteration order.
vocab = list(tokenizer.word_index)

def tokenize(sentences):
  """Split each sentence into lower-case alphabetic tokens.

  Every character outside a-z/A-Z is treated as a separator.

  Returns a pair ``(tokens_list, vocabulary)``: ``tokens_list`` holds one token
  list per input sentence, and ``vocabulary`` is the flat concatenation of all
  those tokens in order (duplicates kept).
  """
  tokens_list = [
      re.sub('[^a-zA-Z]', ' ', text.lower()).split()
      for text in sentences
  ]
  vocabulary = [token for tokens in tokens_list for token in tokens]
  return tokens_list, vocabulary

In [None]:
# encoder_input_data: questions as integer id sequences, padded at the end
# ('post') up to the length of the longest question.
tokenized_questions = tokenizer.texts_to_sequences( final_df['question'] )
maxlen_questions = max( map(len, tokenized_questions) )
padded_questions = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print(encoder_input_data.shape, maxlen_questions)

(500, 913) 913


In [None]:
# decoder_input_data: replies as integer id sequences, padded at the end
# ('post') up to the length of the longest reply.
tokenized_answers = tokenizer.texts_to_sequences( final_df['reply'] )
maxlen_answers = max( map(len, tokenized_answers) )
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

(500, 930) 930


In [None]:
# (scratch cell) The unfinished assignment `x  =` that stood here was a
# SyntaxError and stopped "Restart & Run All"; nothing else in the notebook
# reads `x`, so the statement has been removed.

In [None]:
# decoder_output_data
# Targets are the reply sequences with their first token dropped, i.e. the
# decoder input shifted left by one position.
# NOTE(review): no start/end marker is added to the replies in the visible
# cells, so the token dropped here is the reply's first real word — confirm
# this is intended.
tokenized_answers = [
    sequence[1:]
    for sequence in tokenizer.texts_to_sequences( final_df['reply'] )
]

In [None]:
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )

In [None]:
padded_answers[:500].shape

(500, 930)

In [None]:
#https://www.geeksforgeeks.org/python-keras-keras-utils-to_categorical/
# One-hot encode every target token id into a VOCAB_SIZE-wide vector.
# NOTE(review): the result is a dense (samples, timesteps, VOCAB_SIZE) array —
# (500, 930, 3598) in the recorded run — which is memory-hungry. Integer
# targets with a sparse categorical loss would avoid materialising it; that
# would also require changing the loss used when the model is compiled.
onehot_answers = utils.to_categorical(padded_answers , VOCAB_SIZE)

In [None]:
decoder_output_data = np.array( onehot_answers)
print( decoder_output_data.shape)

(500, 930, 3598)


In [None]:
# ---- Encoder ----
# Question token ids -> 200-dim embeddings (id 0 is masked as padding) -> LSTM.
# Only the LSTM's final hidden and cell states are kept; they seed the decoder.
encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [state_h , state_c]

# ---- Decoder ----
# Reply token ids -> a separate 200-dim embedding -> LSTM initialised with the
# encoder states. return_sequences=True yields one output per time step, and
# the Dense softmax layer turns each into a distribution over the vocabulary.
decoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

# Two-input model (questions, replies) -> per-time-step vocabulary probabilities.
# 'categorical_crossentropy' expects one-hot targets.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 913)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 930)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 913, 200)     719600      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 930, 200)     719600      ['input_4[0][0]']                
                                                                                            

In [None]:
# Train on (question, reply) input pairs against the one-hot targets for 150
# epochs in batches of 50. No validation data is passed, so the reported loss
# is the training loss only. The trained model is then written to 'model.h5'.
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=150 ) 
model.save('model.h5')

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

--> Here we used an encoder-decoder model with LSTM layers and were able to achieve a loss of 0.0501. However, this model cannot handle words that are outside its context, which we will try to improve by using feature engineering and applying a better model.