In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
# NOTE: despite the .txt extension this file is read as a pickle (binary mode + pickle.load).
# Unpickling can execute arbitrary code, so only load a copy obtained from a trusted source.
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [3]:
# Same format as the training file: a pickle stored under a .txt name (trusted sources only).
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [4]:
len(test_data)

1000

In [5]:
len(train_data)

10000

In [6]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [7]:
#here we can see the 3 main components of each sample:
#1> the story
#2> the question
#3> the answer

In [8]:
' '.join(train_data[0][0]) #the story

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [9]:
' '.join(train_data[0][1]) #the question

'Is Sandra in the hallway ?'

In [10]:
train_data[0][2] #the answer is already a plain string, so no join is needed (' '.join would give 'n o')

'no'

In [11]:
all_data = test_data + train_data

In [12]:
len(all_data)

11000

In [13]:
# Collect every distinct token that occurs in any story or question.
vocab = set()

for story, questions, answer in all_data:
    # update() merges both token lists into the existing set in place
    vocab.update(story, questions)

In [14]:
vocab.add('no')

In [15]:
vocab.add('yes')

In [16]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [17]:
#while we are constructing our own questions we will be limited to these words

In [18]:
#+1 because index 0 is reserved for padding, so the word ids start at 1
vocab_len = len(vocab)+1

In [19]:
vocab_len

38

In [20]:
#now lets figure out how long is the longest story and how long is the longest question

In [21]:
#Longest Story
all_story_lens = [len(data[0]) for data in all_data]

In [22]:
max_story_leng = max(all_story_lens) #we will need these later on for padding our sequences

In [23]:
#Longest Question
#Use a dedicated name so the story lengths kept in all_story_lens are not overwritten
#(re-running the max_story_leng cell afterwards would otherwise pick up question lengths).
all_question_lens = [len(data[1]) for data in all_data]
max_question_leng = max(all_question_lens)

In [24]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


# Vectorization 


In [25]:
# filters=[] disables the Tokenizer's default punctuation filtering, so '.' and '?'
# are kept as tokens (both appear in word_index).
# NOTE(review): vocab is a set, so the order in which words are fed in -- and hence the
# integer id each word receives -- is presumably not stable across kernel restarts;
# consider fitting on sorted(vocab) if the ids must be reproducible. TODO confirm.
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

In [26]:
tokenizer.word_index

{'no': 1,
 'apple': 2,
 '?': 3,
 'yes': 4,
 'grabbed': 5,
 'took': 6,
 'up': 7,
 '.': 8,
 'office': 9,
 'travelled': 10,
 'milk': 11,
 'john': 12,
 'back': 13,
 'bathroom': 14,
 'sandra': 15,
 'football': 16,
 'in': 17,
 'to': 18,
 'the': 19,
 'left': 20,
 'went': 21,
 'there': 22,
 'got': 23,
 'moved': 24,
 'put': 25,
 'journeyed': 26,
 'bedroom': 27,
 'is': 28,
 'garden': 29,
 'daniel': 30,
 'mary': 31,
 'dropped': 32,
 'hallway': 33,
 'picked': 34,
 'down': 35,
 'discarded': 36,
 'kitchen': 37}

In [27]:
train_story_text = []
train_questions_text = []
train_answers = []

In [28]:
#Split each (story, question, answer) sample into three parallel lists.
#The lists are built from scratch here (instead of appending to lists created in an
#earlier cell) so that re-running this cell does not duplicate every entry.
train_story_text = [sample[0] for sample in train_data]
train_questions_text = [sample[1] for sample in train_data]
train_answers = [sample[2] for sample in train_data]

In [29]:
#train_story_text

In [30]:
#creating sequential data format for our stories
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [31]:
#train_story_seq 

In [32]:
#each word has been replaced by its integer id from tokenizer.word_index: e.g. the first word of the first story,
#'Mary', becomes word_index['mary'] (31 in the word_index shown above; the exact id may differ from run to run)

In [33]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_leng=max_story_leng,max_question_leng=max_question_leng):
    """Convert (story, question, answer) samples into padded integer arrays.

    Parameters
    ----------
    data : iterable of (story, query, answer)
        ``story`` and ``query`` are lists of word tokens; ``answer`` is a
        single word (e.g. 'yes' / 'no').
    word_index : dict
        Maps lower-cased words to integer ids. Index 0 is reserved, so ids
        start at 1. Defaults to the fitted tokenizer's ``word_index`` as it
        was when this function was defined.
    max_story_leng, max_question_leng : int
        Length every story / question sequence is padded (or truncated) to.

    Returns
    -------
    tuple
        ``(stories, queries, answers)``: the first two are the arrays returned
        by ``pad_sequences``; ``answers`` is an array with one one-hot row of
        length ``len(word_index) + 1`` per sample.

    Raises
    ------
    KeyError
        If a story, query or answer word is not present in ``word_index``.
    """
    import warnings

    X = []   # stories as lists of word ids
    Xq = []  # questions as lists of word ids
    Y = []   # one-hot encoded answers

    for story, query, answer in data:
        # Look words up lower-cased, matching the lower-cased keys of word_index
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]

        # pad_sequences cuts over-long sequences down without any notice; make that visible
        if len(x) > max_story_leng or len(xq) > max_question_leng:
            warnings.warn(
                "sample has {} story tokens / {} question tokens but the limits are "
                "{} / {}; pad_sequences will truncate it".format(
                    len(x), len(xq), max_story_leng, max_question_leng),
                stacklevel=2)

        # Index 0 is reserved, hence the + 1 on the one-hot length
        y = np.zeros(len(word_index) + 1)
        # Lower-case the answer too, so 'Yes'/'No' are handled like story and query words
        y[word_index[answer.lower()]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    # Pad to the fixed lengths so the network receives uniformly long sequences.
    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_leng),pad_sequences(Xq, maxlen=max_question_leng), np.array(Y))

In [34]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [35]:
#these are plain integer arrays: each row is one story, left-padded with 0s, and each entry is a word's id from tokenizer.word_index
inputs_test

array([[ 0,  0,  0, ..., 19, 27,  8],
       [ 0,  0,  0, ..., 19, 29,  8],
       [ 0,  0,  0, ..., 19, 29,  8],
       ...,
       [ 0,  0,  0, ..., 19,  2,  8],
       [ 0,  0,  0, ..., 19, 29,  8],
       [ 0,  0,  0, ...,  2, 22,  8]], dtype=int32)

In [36]:
queries_test

array([[28, 12, 17, 19, 37,  3],
       [28, 12, 17, 19, 37,  3],
       [28, 12, 17, 19, 29,  3],
       ...,
       [28, 31, 17, 19, 27,  3],
       [28, 15, 17, 19, 29,  3],
       [28, 31, 17, 19, 29,  3]], dtype=int32)

In [37]:
answers_test

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
#column totals of the one-hot answers: how many test samples carry each word id as their label
answers_test.sum(axis=0)

array([  0., 503.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [39]:
tokenizer.word_index['yes']

4

In [40]:
tokenizer.word_index['no']

1

In [41]:
from keras.models import Sequential,Model

In [42]:
from keras.layers.embeddings import Embedding

In [43]:
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

In [44]:
#WE HAVE 2 INPUTS WE HAVE STORIES AND QUESTION THAT THE ENCODER HAS TO UNDERSTAND AND WE HAVE TO LINK THEM TOGETHER-
#-TO PROVIDE A LABEL I.E YES OR NO

In [45]:
#WE ARE GOING TO CREATE PLACEHOLDERS USING INPUTS 

In [46]:
#placeholder shape = (batch_size, max_story_leng); the batch dimension is left unspecified
input_sequence = Input((max_story_leng,))
#placeholder shape = (batch_size, max_question_leng)
question = Input((max_question_leng,))
#Input takes the per-sample shape, so we pass in only the max story length and the max question length.

In [47]:
#So the above are the place holders ready to receive input later on

In [48]:
#Now lets create Input Encoders,

In [49]:
vocab_size = len(vocab)+1

# Input Encoder M

In [52]:
input_encoder_m = Sequential()
#we will create 2 layers for this the Embedding layer and the dropout layer
input_encoder_m.add(Embedding(input_dim = vocab_size,output_dim = 64))
input_encoder_m.add(Dropout(0.3))


#now what this encoder is going to output is -
#-(samples,story_maxlen,embedding_dim)

Instructions for updating:
Colocations handled automatically by placer.


# Input Encoder C

In [54]:
input_encoder_c = Sequential()
#we will create 2 layers for this: the Embedding layer and the Dropout layer
#output_dim is max_question_leng (not 64) so that this encoding has the same shape as the
#story/question match tensor it is added to further down
input_encoder_c.add(Embedding(input_dim = vocab_size,output_dim = max_question_leng))
input_encoder_c.add(Dropout(0.3))

#output shape: (samples,story_maxlen,max_question_leng)

# Question Encoder

In [55]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_size,output_dim = 64,input_length = max_question_leng))
question_encoder.add(Dropout(0.3))

#(samples,query_maxlen,embedding_dim)

# Now we will encode the sequences

In [None]:
#we now pass the Input() placeholders from cell 46 through the encoders: input_sequence goes to both story encoders, question goes to the question encoder

In [58]:
#Encoded <------- Encoder(Input)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [60]:
#dot product over the embedding axis of the encoded story and the encoded question:
#the result has shape (samples, story_maxlen, query_maxlen), one score per (story word, question word) pair
match = dot([input_encoded_m,question_encoded],axes =(2,2))
#softmax turns the scores into weights (the softmax activation normalizes over the last axis)
match = Activation('softmax')(match)

In [61]:
#add the match weights to the second story encoding; both are (samples, story_maxlen, query_maxlen)
response = add([match,input_encoded_c])
#swap the last two axes -> (samples, query_maxlen, story_maxlen) so it can be concatenated with question_encoded
response = Permute((2,1))(response)

In [62]:
answer = concatenate([response,question_encoded])

In [63]:
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [64]:
#So now we have our answer tensor and we are going to reduce it using RNN here using LSTM layer

In [65]:
answer = LSTM(32)(answer)

In [66]:
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [67]:
answer =Activation('softmax')(answer)

In [68]:
model = Model([input_sequence,question],answer) #and this answer here will link all the encoders here and that is -
#-how we link our models with the encodings

In [70]:
model.compile(optimizer = 'rmsprop',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [71]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_5 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
____________________________________________________________________________________________

In [72]:
#NOTE: the test split is passed as validation_data, so the per-epoch validation metrics are
#computed on the same samples that are used for the evaluation further down.
history = model.fit([inputs_train,queries_train],answers_train,batch_size = 32,epochs = 100,
                    validation_data = ([inputs_test,queries_test],answers_test))



Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


2022-07-21 18:25:09.540112: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2
2022-07-21 18:25:09.541460: I tensorflow/core/common_runtime/process_util.cc:71] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


Train on 10000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [74]:
#Evaluating on the given test set

In [75]:
#one softmax row (a probability per word id) for every test sample
pred_results = model.predict([inputs_test, queries_test])

In [76]:
test_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

In [77]:
#join the story tokens of the first test sample back into one readable string
story = ' '.join(test_data[0][0])
print(story)

Mary got the milk there . John moved to the bedroom .


In [78]:
#join the question tokens of the first test sample back into one readable string
query = ' '.join(test_data[0][1])
print(query)

Is John in the kitchen ?


In [79]:
print("True Test Answer from Data is:",test_data[0][2])

True Test Answer from Data is: no


In [80]:
#Generate prediction from model
#index of the highest-probability entry in the softmax output for the first sample
val_max = np.argmax(pred_results[0])

#reverse lookup id -> word. Index 0 is reserved and has no word, so fall back to a
#placeholder instead of leaving k undefined (or stale from an earlier cell).
index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
k = index_to_word.get(val_max, '<no word for index {}>'.format(val_max))

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  no
Probability of certainty was:  0.99979717


# Writing my own stories and questions

In [81]:
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [82]:
#NOTE(review): this question splits into 7 tokens but the model's question input holds 6, so
#pad_sequences will cut it down to 6 (with its default truncating='pre' the leading token is dropped).
my_question = "Is the football in the garden ?"

In [83]:
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [84]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [85]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [86]:
pred_results = model.predict(([ my_story, my_ques]))

In [87]:
#Generate prediction from model
#index of the highest-probability entry in the softmax output for our own sample
val_max = np.argmax(pred_results[0])

#reverse lookup id -> word. Index 0 is reserved and has no word, so fall back to a
#placeholder instead of silently reusing the k left over from the earlier prediction cell.
index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
k = index_to_word.get(val_max, '<no word for index {}>'.format(val_max))

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.8167605
