___

<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>
___
# Question and Answer Chat Bots

----

------

## Loading the Data

We will be working with the bAbI dataset from Facebook Research.

Full Details: https://research.fb.com/downloads/babi/

- Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush,
  "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks",
  http://arxiv.org/abs/1502.05698


In [4]:
import pickle
import numpy as np

In [5]:
# Load the pickled training triples (story, question, answer).
# pickle can execute arbitrary code while loading, so only open files you trust.
with open("/content/train_qa.txt", "rb") as pickled_file:
    train_data = pickle.load(pickled_file)

In [6]:
# Load the pickled test triples (story, question, answer).
# pickle can execute arbitrary code while loading, so only open files you trust.
with open("/content/test_qa.txt", "rb") as pickled_file:
    test_data = pickle.load(pickled_file)

----

## Exploring the Format of the Data

In [7]:
type(test_data)

list

In [8]:
type(train_data)

list

In [9]:
len(test_data)

1000

In [10]:
len(train_data)

10000

In [11]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [12]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [13]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [14]:
train_data[0][2]

'no'

-----

## Setting up Vocabulary of All Words

In [15]:
# Create a set that holds the vocab words
vocab = set()

In [16]:
all_data = test_data + train_data

In [17]:
for story, question, answer in all_data:
    # Merge every story and question token into the vocabulary set in place;
    # duplicates are dropped automatically because vocab is a set.
    vocab.update(story, question)

In [18]:
vocab.add('no')
vocab.add('yes')

In [19]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [20]:
vocab_len = len(vocab) + 1 #we add an extra space to hold a 0 for Keras's pad_sequences

In [21]:
# Longest story (in tokens) across train and test; used as the padding length.
max_story_len = max(len(sample[0]) for sample in all_data)

In [22]:
max_story_len

156

In [23]:
# Longest question (in tokens) across train and test; used as the padding length.
max_question_len = max(len(sample[1]) for sample in all_data)

In [24]:
max_question_len

6

## Vectorizing the Data

In [25]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [26]:
# Reserve 0 for pad_sequences
vocab_size = len(vocab) + 1

-----------

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [28]:
# integer encode sequences of words
# filters=[] disables the tokenizer's character filtering so that the '.' and
# '?' tokens in the vocabulary are kept as words.
tokenizer = Tokenizer(filters=[])
# Every vocabulary word is fitted as its own one-word text. The resulting
# word_index keys are lower-cased (e.g. 'John' becomes 'john').
# NOTE(review): vocab is a set, so the index given to each word presumably
# depends on set iteration order and may differ between sessions - confirm
# before reusing weights saved in another session.
tokenizer.fit_on_texts(vocab)

In [29]:
tokenizer.word_index

{'left': 1,
 'in': 2,
 'yes': 3,
 'kitchen': 4,
 'the': 5,
 'there': 6,
 'up': 7,
 'bathroom': 8,
 'john': 9,
 'sandra': 10,
 'mary': 11,
 'apple': 12,
 'office': 13,
 'is': 14,
 'grabbed': 15,
 'no': 16,
 'back': 17,
 'daniel': 18,
 'bedroom': 19,
 'moved': 20,
 'picked': 21,
 'took': 22,
 'travelled': 23,
 'to': 24,
 'milk': 25,
 'journeyed': 26,
 'discarded': 27,
 'went': 28,
 'garden': 29,
 'down': 30,
 'football': 31,
 'hallway': 32,
 'got': 33,
 '?': 34,
 'dropped': 35,
 '.': 36,
 'put': 37}

In [30]:
# Split the training triples into three parallel lists: stories, questions
# and answers. Position i in each list belongs to the same training sample.
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    # Previously the answer was dropped and train_answers stayed empty.
    train_answers.append(answer)

In [31]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [32]:
len(train_story_text)

10000

In [33]:
len(train_story_seq)

10000

In [34]:
# word_index = tokenizer.word_index

### Functionalize Vectorization

In [35]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):
    '''
    Convert (story, query, answer) triples into padded integer arrays.

    INPUT:

    data: iterable of (story, query, answer) triples; story and query are
          lists of word tokens, answer is a single word
    word_index: dictionary mapping lower-cased words to integer indices
                (index 0 is left free for padding)
    max_story_len: the length of the longest story (used for pad_sequences function)
    max_question_len: length of the longest question (used for pad_sequences function)


    OUTPUT:

    Returns a tuple (X, Xq, Y):

    X:  story word indices, padded with pad_sequences to max_story_len
    Xq: query word indices, padded with pad_sequences to max_question_len
    Y:  one row per sample with len(word_index) + 1 entries, all zero except
        for a 1 at the index of the answer word

    Every word (including the answer) is lower-cased before it is looked up,
    so a KeyError means the word is not part of word_index.
    '''

    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []

    # Index 0 is reserved for padding, so the answer vector needs one more
    # slot than there are words.
    answer_dim = len(word_index) + 1

    for story, query, answer in data:

        # Word index for every word in the story and in the query
        X.append([word_index[word.lower()] for word in story])
        Xq.append([word_index[word.lower()] for word in query])

        # One-hot encode the answer. It is lower-cased like the story and
        # query words, so 'Yes' and 'yes' map to the same index instead of
        # raising a KeyError.
        y = np.zeros(answer_dim)
        y[word_index[answer.lower()]] = 1
        Y.append(y)

    # Pad the sequences to their max length so the RNN can be trained on
    # uniformly long sequences.

    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [36]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [37]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [38]:
inputs_test

array([[ 0,  0,  0, ...,  5, 19, 36],
       [ 0,  0,  0, ...,  5, 29, 36],
       [ 0,  0,  0, ...,  5, 29, 36],
       ...,
       [ 0,  0,  0, ...,  5, 12, 36],
       [ 0,  0,  0, ...,  5, 29, 36],
       [ 0,  0,  0, ..., 12,  6, 36]], dtype=int32)

In [39]:
queries_test

array([[14,  9,  2,  5,  4, 34],
       [14,  9,  2,  5,  4, 34],
       [14,  9,  2,  5, 29, 34],
       ...,
       [14, 11,  2,  5, 19, 34],
       [14, 10,  2,  5, 29, 34],
       [14, 11,  2,  5, 29, 34]], dtype=int32)

In [40]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
sum(answers_test)

array([  0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [42]:
tokenizer.word_index['yes']

3

In [43]:
tokenizer.word_index['no']

16

## Creating the Model

In [44]:
from keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

### Placeholders for Inputs

Recall that the model technically has two inputs: stories and questions. We therefore need one placeholder for each of them. `Input()` is used to instantiate a Keras tensor.


In [45]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

### Building the Networks

To understand why we chose this setup, make sure to read the paper we are using:

* Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
  "End-To-End Memory Networks",
  http://arxiv.org/abs/1503.08895

## Encoders

### Input Encoder m

In [46]:
# Input encoder "m": turns each story word index into a 64-dimensional
# vector and applies dropout to the embedded sequence.
input_encoder_m = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(0.3),
])

# Output shape of this encoder:
# (samples, story_maxlen, embedding_dim)

### Input Encoder c

In [47]:
# Input encoder "c": embeds each story word index into a vector whose size
# equals the question length (max_question_len), followed by dropout.
input_encoder_c = Sequential([
    Embedding(input_dim=vocab_size, output_dim=max_question_len),
    Dropout(0.3),
])
# Output shape: (samples, story_maxlen, query_maxlen)

### Question Encoder

In [48]:
# embed the question into a sequence of 64-dimensional vectors
# `input_length` is intentionally not passed: it is deprecated in Keras 3, the
# sequence length is already fixed by the `question` Input this encoder is
# applied to, and the two input encoders do not pass it either.
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)



### Encode the Sequences

In [49]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

##### Use a dot product to compute the match between the first input vector sequence and the query

In [50]:
# Score every story position against every question position by taking the
# dot product of their embedding vectors (axis 2 of both encoded tensors).
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
# Turn the raw scores into weights with a softmax
# (Keras applies it over the last axis by default).
match = Activation('softmax')(match)

#### Add this match matrix to the second input vector sequence

In [51]:
# add the match matrix with the second input vector sequence
# input_encoder_c embeds into max_question_len dimensions, which is why its
# output has the same shape as the match matrix and the two can be summed.
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
# Swap the last two axes so the response can be concatenated with the
# encoded question along the feature axis.
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

#### Concatenate

In [52]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

In [53]:
answer

<KerasTensor shape=(None, 6, 220), dtype=float32, sparse=False, name=keras_tensor_18>

In [54]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)

In [55]:
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)

In [56]:
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
# Two inputs (story indices, question indices) -> one softmax output.
model = Model([input_sequence, question], answer)
# The targets built by vectorize_stories are one-hot vectors, hence
# categorical_crossentropy; accuracy is tracked as an extra metric.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [57]:
model.summary()

In [58]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=120,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 10s 20ms/step - accuracy: 0.4828 - loss: 1.1952 - val_accuracy: 0.5030 - val_loss: 0.6946
Epoch 2/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 6s 19ms/step - accuracy: 0.5143 - loss: 0.7086 - val_accuracy: 0.5030 - val_loss: 0.6935
Epoch 3/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 5s 16ms/step - accuracy: 0.5037 - loss: 0.6976 - val_accuracy: 0.4970 - val_loss: 0.6936
Epoch 4/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 7s 22ms/step - accuracy: 0.5089 - loss: 0.6955 - val_accuracy: 0.4980 - val_loss: 0.6932
Epoch 5/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 9s 18ms/step - accuracy: 0.4946 - loss: 0.6970 - val_accuracy: 0.5030 - val_loss: 0.6932
Epoch 6/120
313/313 ━━━━━━━━━━━━━━━━━━━━ 10s 16ms/step - accuracy: 0.5119 - loss: 0.6947 - val_accuracy: 0.4970 - val_loss: 0.6934
Epoch 7/120
[output truncated]

### Saving the Model

In [59]:
# filename = 'chatbot_120_epochs.h5'
# model.save(filename)

## Evaluating the Model

### Plotting Out Training History

In [60]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

dict_keys(['accuracy', 'loss', 'val_accuracy', 'val_loss'])


KeyError: 'acc'

### Evaluating on Given Test Set

In [None]:
import os

# The cell that defines `filename` and saves the model is commented out, so
# the name is defined here instead of relying on it. Saved weights are loaded
# only if the file exists; otherwise the model trained in this session is
# used as-is.
filename = 'chatbot_120_epochs.h5'
if os.path.exists(filename):
    model.load_weights(filename)

# One row of vocabulary probabilities per test sample.
pred_results = model.predict(([inputs_test, queries_test]))

In [None]:
test_data[0][0]

In [None]:
# Rebuild the first test story as one space-separated string and show it.
story = ' '.join(test_data[0][0])
print(story)

In [None]:
# Rebuild the first test question as one space-separated string and show it.
query = ' '.join(test_data[0][1])
print(query)

In [None]:
print("True Test Answer from Data is:",test_data[0][2])

In [None]:
#Generate prediction from model
# Index of the most probable vocabulary entry for the first sample.
val_max = np.argmax(pred_results[0])

# Invert word_index (word -> index) so the predicted index can be looked up
# directly. Index 0 has no word (it is reserved for padding), so fall back to
# a placeholder instead of leaving `k` undefined or stale from an earlier run.
index_to_word = {val: key for key, val in tokenizer.word_index.items()}
k = index_to_word.get(int(val_max), '<padding>')

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

## Writing Your Own Stories and Questions

Remember that you can only use words from the existing vocab, because any other word has no entry in the tokenizer's word index.

In [None]:
vocab

In [None]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

In [None]:
my_question = "Is the football in the garden ?"

In [None]:
my_question.split()

In [None]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [None]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [None]:
pred_results = model.predict(([ my_story, my_ques]))

In [None]:
#Generate prediction from model
# Index of the most probable vocabulary entry for the custom story/question.
val_max = np.argmax(pred_results[0])

# Invert word_index (word -> index) so the predicted index can be looked up
# directly. Index 0 has no word (it is reserved for padding), so fall back to
# a placeholder instead of leaving `k` undefined or stale from an earlier run.
index_to_word = {val: key for key, val in tokenizer.word_index.items()}
k = index_to_word.get(int(val_max), '<padding>')

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

# Great Job!