In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [3]:
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [4]:
len(test_data)

1000

In [5]:
len(train_data)

10000

In [6]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [7]:
#here we can see 3 main components :
#1>the stroy 
#2>the question
#3>the answer

In [8]:
' '.join(train_data[0][0]) #the story

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [9]:
' '.join(train_data[0][1]) #the question

'Is Sandra in the hallway ?'

In [10]:
' '.join(train_data[0][2]) #answer

'n o'

In [11]:
all_data = test_data + train_data

In [12]:
len(all_data)

11000

In [13]:
vocab = set()

for story,questions,answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(questions))

In [14]:
vocab.add('no')

In [15]:
vocab.add('yes')

In [16]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [17]:
#while we are constructing our own questions we will be limited to these words

In [18]:
vocab_len = len(vocab)+1

In [19]:
vocab_len

38

In [20]:
#now lets figure out how long is the longest story and how long is the longest question

In [21]:
#Longest Story
all_story_lens = [len(data[0]) for data in all_data]

In [22]:
max_story_leng = max(all_story_lens) #we will need these later on for padding our sequences

In [23]:
all_story_lens = [len(data[1]) for data in all_data]
max_question_leng = max(all_story_lens)

In [24]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Vectorization 


In [28]:
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

In [29]:
tokenizer.word_index

{'got': 1,
 'no': 2,
 'back': 3,
 'football': 4,
 'bathroom': 5,
 'down': 6,
 'office': 7,
 'john': 8,
 'journeyed': 9,
 'travelled': 10,
 'up': 11,
 'kitchen': 12,
 'in': 13,
 'dropped': 14,
 'left': 15,
 'yes': 16,
 'daniel': 17,
 'the': 18,
 'put': 19,
 '?': 20,
 'sandra': 21,
 'moved': 22,
 'discarded': 23,
 'grabbed': 24,
 'to': 25,
 '.': 26,
 'milk': 27,
 'apple': 28,
 'mary': 29,
 'went': 30,
 'hallway': 31,
 'garden': 32,
 'picked': 33,
 'there': 34,
 'bedroom': 35,
 'took': 36,
 'is': 37}

In [33]:
train_story_text = []
train_questions_text = []
train_answers = []

In [35]:
for story,questions,answer in train_data:
    train_story_text.append(story)
    train_questions_text.append(questions)   
    train_answers.append(answer)    

In [42]:
#train_story_text

In [43]:
#creating sequential data format for our stories
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [47]:
#train_story_seq 

In [48]:
#here we can see that the first word has sequence number 29 which is marry in the orginal stroy. this 29 also matches-
#-with tokenizer word index where marry has index 29

In [53]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_leng=max_story_leng,max_question_leng=max_question_leng):
   
    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []
    
    
    for story, query, answer in data:
        
        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        # Grab the word index for every word in query
        xq = [word_index[word.lower()] for word in query]
        
        # Grab the Answers (either Yes/No so we don't need to use list comprehension here)
        # Index 0 is reserved so we're going to use + 1
        y = np.zeros(len(word_index) + 1)
        
        # Now that y is all zeros and we know its just Yes/No , we can use numpy logic to create this assignment
        #
        y[word_index[answer]] = 1
        
        # Append each set of story,query, and answer to their respective holding lists
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    # Finally, pad the sequences based on their max length so the RNN can be trained on uniformly long sequences.
        
    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_leng),pad_sequences(Xq, maxlen=max_question_leng), np.array(Y))

In [54]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [60]:
#these are just arrays we have  word index positon along with word index
inputs_test

array([[ 0,  0,  0, ..., 18, 35, 26],
       [ 0,  0,  0, ..., 18, 32, 26],
       [ 0,  0,  0, ..., 18, 32, 26],
       ...,
       [ 0,  0,  0, ..., 18, 28, 26],
       [ 0,  0,  0, ..., 18, 32, 26],
       [ 0,  0,  0, ..., 28, 34, 26]], dtype=int32)

In [61]:
queries_test

array([[37,  8, 13, 18, 12, 20],
       [37,  8, 13, 18, 12, 20],
       [37,  8, 13, 18, 32, 20],
       ...,
       [37, 29, 13, 18, 35, 20],
       [37, 21, 13, 18, 32, 20],
       [37, 29, 13, 18, 32, 20]], dtype=int32)

In [62]:
answers_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
sum(answers_test)

array([  0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [64]:
tokenizer.word_index['yes']

16

In [65]:
tokenizer.word_index['no']

2