# RIKI CHATBOT

### Import Libraries

In [1]:
import re
import numpy as np
import pandas as pd
import os
import random
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### GLOVE TWITTER EMBEDDING - 25 size vector per word

In [2]:
f = open('glove.twitter.27B.25d.txt', 'r+', encoding="utf8")
glove_embedding = {}
vector_size = 25
for line in f:
    word = " ".join(line.split()[0:len(line.split()) - vector_size])
    vector = np.array([float(val) for val in line.split()[-vector_size:]])
    glove_embedding[word] = vector
f.close()
glove_embedding['gooooooooossss'] = np.array([1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.,1.])
glove_embedding['eooooooooossss'] = np.array([0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5])

### MOVIE CHAT DATASET
1. Pair of conversations - (Question by a speaker, answer by the 2nd speaker)
2. Large Dataset - Pair of conversations with size 7 words or less chosen for training

In [3]:
with open('movie_lines_cleaned.txt', 'r+') as f:
    lines = f.readlines()
pairs = []
for i, chat in enumerate(lines):
    if i+1 == len(lines): 
        break
    elif len(chat.strip().split(' ')) <= 7 and len(lines[i+1].strip().split(' ')) <= 7: 
        pairs.append((chat,lines[i+1]))
    else:
        continue

### DATA PREPARATION

#### TEXT CLEANING

1. Converting conversational language to pure written language, eg: 'ain't' > 'is not'
2. Only alpha numeric characters are kept for model training
3. Final dataset: Pair of conversations (question, answer)

In [4]:
word_mapping = {"ain't": "is not", 
                "aren't": "are not",
                "can't": "cannot", 
                "'cause": "because", 
                "could've": "could have",
                "couldn't": "could not",
                "didn't": "did not", 
                "doesn't": "does not",
                "don't": "do not", 
                "hadn't": "had not", 
                "hasn't": "has not", 
                "haven't": "have not",
                "he'd": "he would",
                "he'll": "he will", 
                "he's": "he is", 
                "how'd": "how did",
                "how'd'y": "how do you", 
                "how'll": "how will",
                "how's": "how is",
                "I'd": "I would", 
                "I'd've": "I would have", 
                "I'll": "I will", 
                "I'll've": "I will have",
                "I'm": "I am",
                "I've": "I have", 
                "i'd": "i would",
                "i'd've": "i would have", 
                "i'll": "i will", 
                "i'll've": "i will have",
                "i'm": "i am", 
                "i've": "i have", 
                "isn't": "is not",
                "it'd": "it would",
                "it'd've": "it would have", 
                "it'll": "it will", 
                "it'll've": "it will have",
                "it's": "it is", 
                "let's": "let us", 
                "ma'am": "madam",
                "mayn't": "may not",
                "might've": "might have",
                "mightn't": "might not",
                "mightn't've": "might not have", 
                "must've": "must have",
                "mustn't": "must not", 
                "mustn't've": "must not have", 
                "needn't": "need not", 
                "needn't've": "need not have",
                "o'clock": "of the clock",
                "oughtn't": "ought not",
                "oughtn't've": "ought not have",
                "shan't": "shall not",
                "sha'n't": "shall not", 
                "shan't've": "shall not have",
                "she'd": "she would",
                "she'd've": "she would have", 
                "she'll": "she will",
                "she'll've": "she will have", 
                "she's": "she is",
                "should've": "should have", 
                "shouldn't": "should not",
                "shouldn't've": "should not have", 
                "so've": "so have",
                "so's": "so as",
                "this's": "this is",
                "that'd": "that would", 
                "that'd've": "that would have",
                "that's": "that is", 
                "there'd": "there would",
                "there'd've": "there would have", 
                "there's": "there is",
                "here's": "here is",
                "they'd": "they would", 
                "they'd've": "they would have",
                "they'll": "they will", 
                "they'll've": "they will have", 
                "they're": "they are",
                "they've": "they have",
                "to've": "to have",
                "wasn't": "was not", 
                "we'd": "we would", 
                "we'd've": "we would have",
                "we'll": "we will", 
                "we'll've": "we will have", 
                "we're": "we are",
                "we've": "we have", 
                "weren't": "were not", 
                "what'll": "what will", 
                "what'll've": "what will have", 
                "what're": "what are",
                "what's": "what is", 
                "what've": "what have", 
                "when's": "when is", 
                "when've": "when have",
                "where'd": "where did", 
                "where's": "where is",
                "where've": "where have", 
                "who'll": "who will", 
                "who'll've": "who will have", 
                "who's": "who is", 
                "who've": "who have",
                "why's": "why is", 
                "why've": "why have", 
                "will've": "will have", 
                "won't": "will not", 
                "won't've": "will not have",
                "would've": "would have", 
                "wouldn't": "would not", 
                "wouldn't've": "would not have", 
                "y'all": "you all",
                "y'all'd": "you all would",
                "y'all'd've": "you all would have",
                "y'all're": "you all are",
                "y'all've": "you all have",
                "you'd": "you would", 
                "you'd've": "you would have", 
                "you'll": "you will", 
                "you'll've": "you will have",
                "you're": "you are", 
                "you've": "you have", 
                "'bout": "about",
                "intellectu": "intellectally",
                "arwticle": "article",
                "dissconnected": "disconnected",
                "deaaaddddd": "dead",
                "y-y-y-you": "you", 
                "g-g-g-going": "going",
                "t-t-t-to": "to",
                "muh-muh-muh-marry": "marry",
                "Ah-ah-ah-are": "are",
                "C-C-C-C-Candy": "Candy",
                "I-I-I-I": "I",
                "th-th-think": "think"}

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub('"','', text)
    text = ' '.join([word_mapping[word] if word in word_mapping else word for word in text.split(' ')])
    text = re.sub(r"'s\b", '', text)
    text = re.sub("[^a-zA-Z0-9]", " ", text) 
    tokens = [word for word in text.split()]
    return " ".join(tokens).strip()

In [6]:
final_pairs = []
for sent1, sent2 in pairs:
    sent1 = clean_text(sent1)
    sent2 = clean_text(sent2)
    final_pairs.append((sent1, sent2))

In [7]:
final_pairs

[('they do not', 'they do to'),
 ('they do to', 'i hope so'),
 ('i hope so', 'she okay'),
 ('she okay', 'let us go'),
 ('let us go', 'wow'),
 ('like my fear of wearing pastels', 'the real you'),
 ('the real you', 'what good stuff'),
 ('what crap', 'do you listen to this crap'),
 ('do you listen to this crap', 'no'),
 ('you always been this selfish', 'but'),
 ('but', 'then that is all you had to say'),
 ('then that is all you had to say', 'well no'),
 ('tons', 'have fun tonight'),
 ('have fun tonight', 'i believe we share an art instructor'),
 ('i believe we share an art instructor', 'you know chastity'),
 ('you know chastity', 'looks like things worked out tonight huh'),
 ('looks like things worked out tonight huh', 'hi'),
 ('you got something on your mind', 'where'),
 ('where', 'there'),
 ('forget french', 'that is because it is such a nice one'),
 ('c esc ma tete this is my head', 'let me see what i can do'),
 ('great', 'joey'),
 ('joey', 'who'),
 ('you might wanna think about it', '

1. Answers/Target docs have 'goooooooooosss' and 'eooooooooooss' for the decoder model to recognize when the start and end a sequence of words

In [8]:
input_docs = [sent1 for sent1, sent2 in final_pairs]
target_docs = ['gooooooooossss '+ sent2 +' eooooooooossss' for sent1, sent2 in final_pairs]

In [9]:
# Total number of samples/conversations
len(input_docs)

88429

### TOKENIZER

1. Input/encoder and target/Decoder docs are tokenized with keras tokenizer
2. Docs are padding using keras padding function with the max length. Incase of the decoder docs, length will be max_length-1
3. Decoder input: len - 1. eg: 'gooooooooooooos hello world'
4. Decoder onput: len - 1. eg: 'hello world eooooooooos'
5. encoder target word 2 id and id 2 word dictionary created for predictions
6. decoder target word 2 id and id 2 word dictionary created for predictions
7. dictionaries are saved as npy format

In [10]:
enc_tokenizer = Tokenizer()
enc_tokenizer.fit_on_texts(input_docs)
end_tokenized_sents = enc_tokenizer.texts_to_sequences(input_docs)

max_input_length = max([len(tokens.split(' ')) for tokens in input_docs])
    
input_pad_data = pad_sequences(end_tokenized_sents, max_input_length, padding='post', value=0)
encoder_input_data = np.array(input_pad_data)

enc_target_word2id = enc_tokenizer.word_index
enc_target_id2word = dict((token, word) for word, token in enc_target_word2id.items())
enc_nbr_tokens = len(enc_target_word2id)+1
print(max_input_length, enc_nbr_tokens, encoder_input_data.shape)
np.save('enc_target_word2id.npy', enc_target_word2id)

18 16455 (88429, 18)


In [11]:
# DECODER IP: <START> HELLO WORLD
dec_tokenizer = Tokenizer(split=' ', lower=False)
dec_tokenizer.fit_on_texts(target_docs)
dec_tokenized_sents = dec_tokenizer.texts_to_sequences(target_docs)

max_target_length = max([len(tokens.split(' ')) for tokens in target_docs])
    
dec_input_data = [sent[:-1] for sent in dec_tokenized_sents]    
dec_input_pad_data = pad_sequences(dec_input_data, max_target_length-1, padding='post', value=0)
decoder_input_data = np.array(dec_input_pad_data)

dec_target_word2id = dec_tokenizer.word_index
dec_target_id2word = dict((token, word) for word, token in dec_target_word2id.items())
dec_nbr_tokens = len(dec_target_word2id)+1
print(max_target_length, dec_nbr_tokens, decoder_input_data.shape)
np.save('dec_target_word2id.npy', dec_target_word2id)

21 16478 (88429, 20)


In [12]:
# HELLO WORLD <eos>
target_output = [sent[1:] for sent in dec_tokenized_sents]

dec_output_pad_data = pad_sequences(target_output, max_target_length-1, padding='post', value=0)
decoder_output_data = np.array(dec_output_pad_data)
decoder_output_data.shape

(88429, 20)

### Data generator

1. Large datasets require a data generator per batch size for memory allocation
2. Using the glove embedding vector for training is not feasible (large data size)

In [13]:
def training_data_generator(enc_data, dec_ip, dec_op, enc_nbr_tokens, dec_nbr_tokens, batch_size=64):
    i = 0
    while i < len(enc_data):
        if i+batch_size > len(enc_data): 
            batch_size = len(enc_data) - i + 1
        enc_ip = to_categorical(enc_data[i:i+batch_size], enc_nbr_tokens)
        dec_ip = to_categorical(dec_ip[i:i+batch_size], dec_nbr_tokens)
        dec_op = to_categorical(dec_op[i:i+batch_size], dec_nbr_tokens)
        
        if i+batch_size > len(enc_data):
            i = 0
        else:
            i += batch_size
        
        yield enc_ip, dec_ip, dec_op

### Glove Embedding

1. Adding words to the glove embedding with new set of randomly generated vectors with size 25
2. Embedding matrix to be used as the initial weights for the Embedding layer in the encoder/decoder model

In [None]:
for token in list(enc_target_word2id.keys())+list(dec_target_word2id.keys()):
    if token not in glove_embedding.keys():
        glove_embedding[token] = np.array([round(random.random(),5) for _ in range(25)])
    else:
        continue

In [None]:
enc_embedding_matrix = np.zeros((enc_nbr_tokens, 25))
for word, index in enc_target_word2id.items():
    embedding_vector = glove_embedding.get(word)
    if embedding_vector is not None:
        enc_embedding_matrix[index] = embedding_vector
        
dec_embedding_matrix = np.zeros((dec_nbr_tokens, 25))
for word, index in dec_target_word2id.items():
    embedding_vector = glove_embedding.get(word)
    if embedding_vector is not None:
        dec_embedding_matrix[index] = embedding_vector

### MODEL ARCHITECTURE

1. Encoder: Input: max_enc_length X encoder vocab size
2. Encoder: LTSM layer with 256 cells
3. Encoder: output: list of hidden state and cell state memories of the encoder

4. Decoder: Input: max_dec_length-1 X decoder vocab size
5. Decoder: LTSM layer with 256 cells (initial state: encoder states)
6. Decoder: output: Dense layer (softmax activation, decoder vocab size)

### FOR INFERENCE

7. Encoder_model: Input: Encoder Input, Output: Encoder states
8. Deocder_model: Input: Encoder states, Output: Dense layer

In [14]:
def define_models(n_input, n_output, n_units):
    encoder_inputs = Input(shape=(None, n_input))
    #encoder_embedding = Embedding(enc_nbr_tokens, 25, weights=[enc_embedding_matrix], input_length=max_input_length)(encoder_inputs)
    encoder = LSTM(n_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]
    decoder_inputs = Input(shape=(None, n_output))
    #decoder_embedding = Embedding(dec_nbr_tokens, 25 , weights=[dec_embedding_matrix], input_length=max_target_length-1)(decoder_inputs)
    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(n_output, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    encoder_model = Model(encoder_inputs, encoder_states)
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    return model, encoder_model, decoder_model

In [15]:
model, infenc, infdec = define_models(enc_nbr_tokens, dec_nbr_tokens, 256)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 16455)  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 16478)  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 17113088    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  17136640    input_2[0][0]                    
                                     

In [16]:
infenc.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 16455)       0         
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 17113088  
Total params: 17,113,088
Trainable params: 17,113,088
Non-trainable params: 0
_________________________________________________________________


In [17]:
infdec.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 16478)  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  17136640    input_2[0][0]                    
                                                                 input_3[0][0]                    
          

In [18]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
X1, X2, y = next(training_data_generator(encoder_input_data, decoder_input_data, decoder_output_data, enc_nbr_tokens, dec_nbr_tokens, 512))

In [20]:
es = EarlyStopping(monitor='acc', mode='auto', verbose=1, patience=10)

In [21]:
model.fit([X1, X2], y, epochs=3*len(input_docs)//1024, callbacks=[es])

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/259
Epoch 2/259
Epoch 3/259
Epoch 4/259
Epoch 5/259
Epoch 6/259
Epoch 7/259
Epoch 8/259
Epoch 9/259
Epoch 10/259
Epoch 11/259
Epoch 12/259
Epoch 13/259
Epoch 14/259
Epoch 15/259
Epoch 16/259
Epoch 17/259
Epoch 18/259
Epoch 19/259
Epoch 20/259
Epoch 21/259
Epoch 22/259
Epoch 23/259
Epoch 24/259
Epoch 25/259
Epoch 26/259
Epoch 27/259
Epoch 28/259
Epoch 29/259
Epoch 30/259
Epoch 31/259
Epoch 32/259
Epoch 33/259
Epoch 34/259
Epoch 35/259
Epoch 36/259
Epoch 37/259
Epoch 38/259
Epoch 39/259
Epoch 40/259
Epoch 41/259
Epoch 42/259
Epoch 43/259
Epoch 44/259
Epoch 45/259
Epoch 46/259
Epoch 47/259
Epoch 48/259
Epoch 49/259
Epoch 50/259
Epoch 51/259
Epoch 52/259
Epoch 53/259
Epoch 54/259
Epoch 55/259
Epoch 56/259
Epoch 57/259
Epoch 58/259
Epoch 59/259
Epoch 60/259
Epoch 61/259
Epoch 62/259
Epoch 63/259
Epoch 64/259
Epoch 65/259
Epoch 66/259
Epoch 67/259
Epoch 68/259

Epoch 155/259
Epoch 156/259
Epoch 157/259
Epoch 158/259
Epoch 159/259
Epoch 160/259
Epoch 161/259
Epoch 162/259
Epoch 163/259
Epoch 164/259
Epoch 165/259
Epoch 166/259
Epoch 167/259
Epoch 168/259
Epoch 169/259
Epoch 170/259
Epoch 171/259
Epoch 172/259
Epoch 173/259
Epoch 174/259
Epoch 175/259
Epoch 176/259
Epoch 177/259
Epoch 178/259
Epoch 179/259
Epoch 180/259
Epoch 181/259
Epoch 182/259
Epoch 183/259
Epoch 184/259
Epoch 185/259
Epoch 186/259
Epoch 187/259
Epoch 188/259
Epoch 189/259
Epoch 190/259
Epoch 191/259
Epoch 192/259
Epoch 193/259
Epoch 194/259
Epoch 195/259
Epoch 196/259
Epoch 197/259
Epoch 198/259
Epoch 199/259
Epoch 200/259
Epoch 201/259
Epoch 202/259
Epoch 203/259
Epoch 204/259
Epoch 205/259
Epoch 206/259
Epoch 207/259
Epoch 208/259
Epoch 209/259
Epoch 210/259
Epoch 211/259
Epoch 212/259
Epoch 213/259
Epoch 214/259
Epoch 215/259
Epoch 216/259
Epoch 217/259
Epoch 218/259
Epoch 219/259
Epoch 220/259
Epoch 221/259
Epoch 222/259
Epoch 223/259
Epoch 224/259
Epoch 225/259
Epoch 

<tensorflow.python.keras.callbacks.History at 0x1c6502b3f08>

### FINAL PREDICTION

In [68]:
# input encoder shape: 1Xsent_lengthXnbr_of_tokens
def enc_text_to_seq(text):
    text = clean_text(text)
    tokens = []
    for token in text.split(' '):
        if token in enc_target_word2id.keys():
            tokens.append(enc_target_word2id[token])
    pad_data = np.zeros((max_input_length, ), dtype='int')
    for i, token in enumerate(tokens):
        pad_data[i] = token
    data = to_categorical(pad_data, enc_nbr_tokens)
    data = np.expand_dims(data, axis=0)
    return data

In [101]:
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
    state = infenc.predict(source)
    target_seq = np.array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
    output = []
    tokens = []
    for t in range(n_steps):
        yhat, h, c = infdec.predict([target_seq] + state)
        output.append(yhat[0,0,:])
        state = [h, c]
        target_seq = yhat
        if np.argmax(yhat) != 0:
            tokens.append(np.argmax(yhat))
        else:
            break
    if len(tokens) > 0:
        return ' '.join([dec_target_id2word[token] for token in tokens if dec_target_id2word[token] != 'eooooooooossss'])
    else:
        return 'No response'

In [114]:
for i in range(100,110,1):
    print(f'Question: {input_docs[i]}')
    print(f'Answer: {final_pairs[i][1]}')
    print(f'Chatbot: {predict_sequence(infenc, infdec, enc_text_to_seq(input_docs[i]), 256, dec_nbr_tokens)}')

Question: a hundred bucks a date
Answer: what
Chatbot: what what
Question: what
Answer: i just upped my price
Chatbot: he wake along
Question: two legs nice rack
Answer: what do you think
Chatbot: what do you think do
Question: what do you think
Answer: yeah
Chatbot: yeah
Question: you and verona
Answer: uh yeah we are old friend
Chatbot: uh yeah we are old friend
Question: uh yeah we are old friend
Answer: i hear you are helpin verona
Chatbot: i hear you are helpin verona
Question: we do not chat
Answer: nope just came by to chat
Chatbot: nope just came a a chat
Question: nope just came by to chat
Answer: are you lost
Chatbot: are you lost
Question: are you lost
Answer: hey
Chatbot: hey
Question: who
Answer: have you seen him
Chatbot: why you you been to before
