<a href="https://colab.research.google.com/github/hritvikgupta/Chatbot/blob/master/Chatbotipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding,LSTM, Dropout, Dense
from tensorflow.keras import utils

Using TensorFlow backend.


In [None]:

import requests, zipfile, io

# Download the chatbot corpus archive and unpack it into the working directory.
r = requests.get( 'https://github.com/shubham0204/Dataset_Archives/blob/master/chatbot_nlp.zip?raw=true', timeout=60 )
# Fail loudly on an HTTP error instead of handing an error page to ZipFile.
r.raise_for_status()
# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()


## Importing and preprocessing the data

In [None]:
import os
import yaml


dir_path = 'chatbot_nlp/data'
files_list = os.listdir(dir_path + os.sep)

# Collect one (question, answer) pair per conversation from every corpus file.
questions = list()
answers = list()
for filepath in files_list:
    # Context manager so each file handle is closed as soon as it is parsed.
    with open( dir_path + os.sep + filepath , 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            # Several replies: join them into one answer, each preceded by a space.
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

# Keep only the pairs whose answer is a string. Questions and answers are
# filtered together: removing questions one at a time by their original index
# would hit the wrong entries once the list has started to shrink.
qa_pairs = [(question, answer) for question, answer in zip(questions, answers)
            if isinstance(answer, str)]
questions = [question for question, answer in qa_pairs]
answer_with_tags = [answer for question, answer in qa_pairs]

# Wrap every answer in start/end markers; later cells look these markers up
# as 'start' and 'end' in tokenizer.word_index.
answers = ['<START>' + answer + '<END>' for answer in answer_with_tags]

# One tokenizer shared by questions and answers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(answers + questions)
vocab_size = len(tokenizer.word_index)+1  # +1 because word indices start at 1
print('vocab_Size: {}'.format(vocab_size))


vocab_Size: 1894


In [None]:
# Peek at the first three questions.
questions[:3]

['Hello', 'Hi', 'Greetings!']

In [None]:
# Peek at the first four tagged answers.
answers[:4]

['<START>Hi<END>',
 '<START>Hello<END>',
 '<START>Hello<END>',
 '<START>Greetings!<END>']

#### Preprocessing the data
* Tokenize and pad the questions.
* Tokenize and pad the answers, with the start and end tags added to every sequence.

* Tokenize and pad the answers again, remove the start tag from every sequence, and one-hot encode the result.

In [None]:

from gensim.models import Word2Vec
import re

# Vocabulary words in the same order as the tokenizer's word_index keys.
vocab = list( tokenizer.word_index )

def tokenize( sentences ):
    """Split sentences into lowercase alphabetic tokens.

    Each sentence is converted with ``str()``, lowercased, every character
    outside a-z/A-Z is replaced by a space, and the result is split on
    whitespace.

    Returns a tuple ``(tokens_list, vocabulary)``: ``tokens_list`` holds one
    token list per sentence, and ``vocabulary`` is the flat concatenation of
    all those tokens in order (duplicates kept).
    """
    non_letters = re.compile( '[^a-zA-Z]' )
    per_sentence = [
        non_letters.sub( ' ', str(text).lower() ).split()
        for text in sentences
    ]
    flat_tokens = [ token for tokens in per_sentence for token in tokens ]
    return per_sentence , flat_tokens

# Tokenize every question and answer; p[0] is the list of per-sentence token lists.
p = tokenize( questions + answers )
# Train Word2Vec on those token lists.
# NOTE(review): this binds the name `model`, which the seq2seq cell later
# rebinds to the Keras model.
model = Word2Vec(p[0])

# Zero-filled (vocab_size x 100) matrix. The loop that would copy word vectors
# into it is commented out below, so it stays all zeros; no visible cell reads it.
# NOTE(review): 100 presumably matches the Word2Vec vector size - confirm.
embedding_matrix = np.zeros( ( vocab_size, 100 ) )
#for i in range(len(tokenizer.word_index)):
 #   embedding_matrix[i] = model[vocab[i]]

In [None]:
# Run the regex tokenizer again and display its second return value,
# the flat list of all tokens.
a,c = tokenize(questions+answers)
c

['hello',
 'hi',
 'greetings',
 'hello',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'hi',
 'how',
 'is',
 'it',
 'going',
 'how',
 'are',
 'you',
 'doing',
 'how',
 'are',
 'you',
 'doing',
 'how',
 'are',
 'you',
 'doing',
 'nice',
 'to',
 'meet',
 'you',
 'how',
 'do',
 'you',
 'do',
 'how',
 'do',
 'you',
 'do',
 'hi',
 'nice',
 'to',
 'meet',
 'you',
 'it',
 'is',
 'a',
 'pleasure',
 'to',
 'meet',
 'you',
 'top',
 'of',
 'the',
 'morning',
 'to',
 'you',
 'top',
 'of',
 'the',
 'morning',
 'to',
 'you',
 'what',
 's',
 'up',
 'what',
 's',
 'up',
 'what',
 's',
 'up',
 'what',
 's',
 'up',
 'what',
 's',
 'up',
 'what',
 'is',
 'ai',
 'what',
 'is',
 'ai',
 'are',
 'you',
 'sentient',
 'are',
 'you',
 'sentient',
 'are',
 'you',
 'sentient',
 'are',
 'you',
 'sapient',
 'are',
 'you',
 'sapient',
 'are',
 'you',
 'sapient',
 'are'

In [None]:

ques = []  # not read by any visible cell; kept so the name stays defined

# The tokenizer was already fitted on answers + questions when it was created.
# Fitting it again here would add to its word counts and rebuild word_index on
# every run of this cell, so the existing vocabulary is reused as-is.
tokenized_seq = tokenizer.texts_to_sequences(questions)

# Token count of each question; the longest one sets the padded width.
length_list = [len(token_seq) for token_seq in tokenized_seq]
max_input_length = np.array(length_list).max()
print("Questions max lengths {}".format(max_input_length))

## Padding the sequence
# Zero-pad on the right so every question has max_input_length tokens.
padded_question_lines = pad_sequences(tokenized_seq, maxlen = max_input_length, padding = 'post' )
encoder_input_data =  np.array(padded_question_lines)
print("Encoder input data shape{}".format(encoder_input_data.shape))
num_encoded_tokens = len(tokenizer.word_index)+1  # +1 because word indices start at 1

Questions max lengths 22
Encoder input data shape(564, 22)


#### Preprocessing input data for the decoder (`decoder_input_data`)

The decoder will be fed the preprocessed answers.

* Add the `<START>` tag at the first position of each answer sequence.
* Add the `<END>` tag at the last position of each answer sequence.

In [None]:
## This is decoder input Data

# Integer-encode the tagged answers with the shared tokenizer.
tokenized_ans_lines = tokenizer.texts_to_sequences(answers)

# Token count of each answer; the longest one sets the padded width.
length_list1 = [ len( token_seq ) for token_seq in tokenized_ans_lines ]
max_output_length = np.array( length_list1 ).max()
print( 'Answer max length is {}'.format( max_output_length ))

# Zero-pad on the right up to max_output_length.
padded_ans_lines = pad_sequences(
    tokenized_ans_lines,
    maxlen=max_output_length,
    padding='post',
)
decoder_input_data = np.array( padded_ans_lines )
print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))

# Same vocabulary as the questions; +1 because word indices start at 1.
ans_word_dict = tokenizer.word_index
num_ans_tokens = 1 + len( ans_word_dict )
print( 'Number of Answer tokens = {}'.format( num_ans_tokens))


Answer max length is 74
Decoder input data shape -> (564, 74)
Number of Answer tokens = 1894


#### Preparing the target data for the decoder
We take a copy of `tokenized_ans_lines` and modify it like this:
1. Remove the `<START>` tag which we added earlier.
2. Convert `padded_ans_lines` to one-hot vectors.


In general machine-learning terms, this is `y`: the labels we want to predict.

In [None]:


## This is decoder Output Data

# Targets are the answers with their first token (the start tag) dropped, so
# target position t holds the token at position t + 1 of the decoder input.
tokenized_answers = [
    token_seq[1:] for token_seq in tokenizer.texts_to_sequences(answers)
]

# Pad to the same width as the decoder input, then one-hot encode every token
# over the vocabulary.
padded_ans_lines = pad_sequences(
    tokenized_answers,
    maxlen=max_output_length,
    padding='post',
)
onehot_lines = utils.to_categorical(padded_ans_lines, vocab_size)
decoder_target_data = np.array(onehot_lines)
print("Decoder_Target_input {}".format(decoder_target_data.shape))

Decoder_Target_input (564, 74, 1894)


In [None]:
# Vocabulary sizes and padded sequence lengths for the encoder and decoder.
num_encoded_tokens, num_ans_tokens, max_input_length, max_output_length

(1894, 1894, 22, 74)

In [None]:

# Shape of the one-hot decoder targets.
decoder_target_data.shape

(564, 74, 1894)

In [None]:
# Display the padded, integer-encoded questions.
encoder_input_data

array([[234,   0,   0, ...,   0,   0,   0],
       [ 87,   0,   0, ...,   0,   0,   0],
       [432,   0,   0, ...,   0,   0,   0],
       ...,
       [ 24,  45,  97, ...,   0,   0,   0],
       [537, 266,   0, ...,   0,   0,   0],
       [ 31,   5,   7, ...,   0,   0,   0]], dtype=int32)

### Defining and training the model

* The model has Embedding, LSTM and Dense layers. The basic configuration is as follows:

* 2 input layers: one for the encoder input data and the other for the decoder input data

* Embedding layer: converts token vectors to fixed-size dense vectors

* LSTM layer: provides access to long short-term memory cells


## Working:
1. The `encoder_input_data` goes into the embedding layer (`encoder_embedding`).

2. The output of the embedding layer goes to the LSTM cell, which produces 2 state vectors (`h` and `c`, the encoder states).

3. These states are set as the initial state of the decoder's LSTM cell.
4. The `decoder_input_data` goes through its own embedding layer.
5. The embeddings go into the LSTM cell (which holds the encoder states) to produce the output sequences.

In [None]:
from tensorflow.keras.layers import TimeDistributed

In [None]:
# Encoder: question token ids -> embedding -> LSTM. Only the final hidden and
# cell states are passed on; the LSTM output itself is not used.
encoder_inputs = tf.keras.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(
    input_dim=num_encoded_tokens,
    output_dim=200,
    mask_zero=True,
)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(
    units=200,
    return_state=True,
)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: answer token ids -> embedding -> LSTM started from the encoder
# states -> softmax over the answer vocabulary at every time step.
decoder_inputs = tf.keras.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(
    input_dim=num_ans_tokens,
    output_dim=200,
    mask_zero=True,
)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(
    units=200,
    return_sequences=True,
    return_state=True,
)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(units=num_ans_tokens, activation='softmax')
output = decoder_dense(decoder_outputs)

# Training model: (question ids, answer ids) -> per-step word distribution.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    378800      input_16[0][0]                   
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 200)    378800      input_17[0][0]                   
___________________________________________________________________________________________

## Training the model

In [None]:
# Encoder vocabulary size and padded question length.
num_encoded_tokens, max_input_length

(1894, 22)

In [None]:
# Shapes of the three arrays passed to model.fit.
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((564, 22), (564, 74), (564, 74, 1894))

In [None]:

# Train on (questions, tagged answers) -> one-hot shifted answers, holding out
# the last 10% of the samples for validation, then save the trained model.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=64,
    epochs=100,
    validation_split=0.1,
)
model.save('model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Inferencing the model
1) We create inference models for predicting the answers:

* The encoder inference model takes the questions as input and outputs the LSTM states (`h` and `c`).

* The decoder inference model takes 2 inputs: the LSTM states (initially the output of the encoder model) and the answer tokens fed in so far, beginning with the start tag. It outputs the predicted answer tokens together with its updated state values.

In [None]:
def make_inference_models():
  """Build the encoder and decoder models used to generate answers step by step.

  Reuses the trained layers from the notebook's globals (``encoder_inputs``,
  ``encoder_states``, ``decoder_inputs``, ``decoder_embedding``,
  ``decoder_lstm``, ``decoder_dense``).

  Returns:
    (encoder_model, decoder_model) where
    * encoder_model maps question token ids to the encoder states [h, c];
    * decoder_model maps [answer token ids, h, c] to
      [word probabilities, new h, new c].
  """
  encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

  # Take the state width from the trained decoder LSTM rather than repeating
  # the literal, so the placeholders always match the layer.
  state_size = decoder_lstm.units
  decoder_state_input_h = tf.keras.layers.Input(shape = (state_size, ))
  decoder_state_input_c = tf.keras.layers.Input(shape = (state_size, ))

  decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

  # Same decoder layers as in training, but started from caller-supplied states
  # and with the updated states exposed as outputs.
  decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_state_inputs)
  decoder_states = [state_h, state_c]
  decoder_outputs = decoder_dense(decoder_outputs)
  decoder_model = tf.keras.models.Model(
                  [decoder_inputs] + decoder_state_inputs,
                  [decoder_outputs]+ decoder_states )

  return encoder_model, decoder_model

## Talking to our bot
First we define a method `str_to_tokens` which converts a question string into a padded sequence of integer tokens.


In [None]:
def str_to_tokens(sentence : str):
  """Convert a question string into a padded row of token ids for the encoder.

  The sentence is encoded with the fitted ``tokenizer`` itself, so it gets the
  same text normalisation as the training data, and words that are not in the
  vocabulary are skipped instead of raising ``KeyError``.

  Args:
    sentence: the raw question text.

  Returns:
    Array of shape (1, max_input_length), zero-padded on the right.
  """
  token_ids = tokenizer.texts_to_sequences([sentence])
  return pad_sequences(token_ids, maxlen = max_input_length, padding = 'post')



In [None]:
enc_model , dec_model = make_inference_models()

# Answer ten questions typed by the user, decoding one word at a time.
for _ in range(10):
    # Encode the question into the initial decoder states [h, c].
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    # Prime the decoder with the start marker.
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        # Greedy decoding: take the most probable word at this step.
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        # index_word is the tokenizer's index -> word map; an index with no
        # word (e.g. padding index 0) yields None and adds nothing to the reply.
        sampled_word = tokenizer.index_word.get( int( sampled_word_index ) )
        if sampled_word is not None:
            decoded_translation += ' {}'.format( sampled_word )

        # Stop at the end marker, or once the reply exceeds the longest
        # training answer (the answer length, not the question length).
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True

        # Feed the predicted word and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    print( decoded_translation )



Enter question : hi
 hello end
Enter question : do you like football
 i am not capable of my computer i am not yet end
Enter question : what is AI
 in physics the distance measured in the direction of prograssion of a wave from the computer to the computer to put the computer
Enter question : what is ai
 in physics the distance measured in the direction of prograssion of a wave from the computer to the computer to put the computer
Enter question : what is wavelength
 in the branch of physics dealing with the transformation of eleven to the means of production and distribution by the means of production


KeyboardInterrupt: ignored

In [None]:
# Inspect the word -> index vocabulary held by the tokenizer.
tokenizer.word_index

## Using simple LSTM layer

In [None]:
# Build a two-column frame from `conversations`.
# NOTE(review): `conversations` is whatever the data-loading loop assigned last,
# i.e. the conversations of the final file only, not the whole corpus - confirm
# this is intended. Entries with more than two elements presumably do not fit
# the two named columns - verify.
df = pd.DataFrame(conversations, columns = ['questions','answers'])

In [None]:


input_texts, target_texts = [], []
input_vocabulary = set()
output_vocabulary = set()
start_token = '\t'
stop_token = '\n'
max_training_samples = min(25000, len(df) - 1)

for input_text, target_text in zip(df.questions, df.answers):
    # Wrap each answer in the start and stop characters.
    target_text = start_token + target_text + stop_token
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Sets ignore duplicates, so every character can be added directly.
    input_vocabulary.update(input_text)
    output_vocabulary.update(target_text)



In [None]:
# Freeze the character sets into sorted lists so each character gets a stable index.
input_vocabulary = sorted(input_vocabulary)
output_vocabulary = sorted(output_vocabulary)

input_vocab_size = len(input_vocabulary)
output_vocab_size = len(output_vocabulary)

# Longest question and longest wrapped answer, in characters.
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# character -> index maps ...
input_token_index = {char: i for i, char in enumerate(input_vocabulary)}
target_token_index = {char: i for i, char in enumerate(output_vocabulary)}

# ... and their index -> character inverses.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}



In [None]:
import numpy as np
 
# One-hot tensors of shape (sample, time step, character).
# These rebind the array names used by the word-level model above.
n_samples = len(input_texts)
encoder_input_data = np.zeros(
    (n_samples, max_encoder_seq_length, input_vocab_size), dtype='float32')
decoder_input_data = np.zeros(
    (n_samples, max_decoder_seq_length, output_vocab_size), dtype='float32')
decoder_target_data = np.zeros_like(decoder_input_data)

# Encoder side: mark each question character at its time step.
for i, input_text in enumerate(input_texts):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.

# Decoder side: the input holds the wrapped answer; the target holds the same
# characters one step earlier, so it omits the leading start character.
for i, target_text in enumerate(target_texts):
    for t, char in enumerate(target_text):
        char_col = target_token_index[char]
        decoder_input_data[i, t, char_col] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, char_col] = 1



In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Training hyper-parameters for the character-level model.
batch_size = 64
epochs = 100
num_neurons = 256  # LSTM width, shared by encoder and decoder

# NOTE(review): the names below (encoder_inputs, encoder_states, decoder_inputs,
# decoder_lstm, decoder_dense, model) rebind the word-level model's variables.

# Encoder: one-hot question characters -> LSTM; only the final states are kept.
encoder_inputs = Input(shape=(None, input_vocab_size))
encoder = LSTM(num_neurons, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]
 
# Decoder: one-hot answer characters -> LSTM started from the encoder states
# -> softmax over the output characters at every time step.
decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(num_neurons, return_sequences=True,
                    return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
    initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile and train, holding out 10% of the samples for validation.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['acc'])
model.fit([encoder_input_data, decoder_input_data],
    decoder_target_data, batch_size=batch_size, epochs=epochs,
    validation_split=0.1)



In [None]:
# Inference-time encoder: one-hot question -> encoder states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholders through which the decoder receives its states at each step.
thought_input = [
    Input(shape=(num_neurons,)), Input(shape=(num_neurons,))]
# Reuse the trained decoder layers, now started from the supplied states and
# exposing the updated states as outputs.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# `outputs` is the Model constructor's keyword; the singular `output` is not.
decoder_model = Model(
    inputs=[decoder_inputs] + thought_input,
    outputs=[decoder_outputs] + decoder_states)




In [None]:
def decode_sequence(input_seq):
    """Greedily decode a reply, one character at a time.

    Args:
        input_seq: one-hot encoded question accepted by ``encoder_model``.

    Returns:
        The generated characters as a string. Generation stops once the stop
        character has been produced (it is included in the result) or once the
        string is longer than ``max_decoder_seq_length``.
    """
    # Encode the question into the initial decoder states.
    thought = encoder_model.predict(input_seq)

    # Seed the decoder with the start character: every training target begins
    # with start_token, so that is the first input the decoder has learned.
    target_seq = np.zeros((1, 1, output_vocab_size))
    target_seq[0, 0, target_token_index[start_token]] = 1.
    stop_condition = False
    generated_sequence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + thought)

        # Take the most probable character at this step.
        generated_token_idx = np.argmax(output_tokens[0, -1, :])
        generated_char = reverse_target_char_index[generated_token_idx]
        generated_sequence += generated_char
        if (generated_char == stop_token or
                len(generated_sequence) > max_decoder_seq_length
                ):
            stop_condition = True

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1, output_vocab_size))
        target_seq[0, 0, generated_token_idx] = 1.
        thought = [h, c]

    return generated_sequence



In [None]:
def response(input_text):
    """Print the character-level model's reply to ``input_text``.

    The text is one-hot encoded into a (1, max_encoder_seq_length,
    input_vocab_size) array and passed to ``decode_sequence``. Text beyond
    ``max_encoder_seq_length`` characters is ignored, and characters missing
    from ``input_token_index`` leave their time step all-zero, so arbitrary
    user input cannot raise ``IndexError`` or ``KeyError``.
    """
    input_seq = np.zeros((1, max_encoder_seq_length, input_vocab_size),
        dtype='float32')
    # Clip to the encoder width so long inputs cannot index past the array.
    for t, char in enumerate(input_text[:max_encoder_seq_length]):
        char_idx = input_token_index.get(char)
        # Unknown characters have no one-hot column; skip them.
        if char_idx is not None:
            input_seq[0, t, char_idx] = 1.
    decoded_sentence = decode_sequence(input_seq)
    print('Bot Reply (Decoded sentence):', decoded_sentence)



In [None]:
# Try the character-level bot on two sample questions.
response('what is internet')
response("do you like football")

In [None]:


# Same word-level chat loop as earlier, but calling encoder_model / decoder_model.
# NOTE(review): those names are bound by the character-level cells, whose inputs
# are declared as Input(shape=(None, input_vocab_size)) one-hot sequences, while
# str_to_tokens returns a padded row of word indices - this looks like a shape
# mismatch; confirm which pair of models was intended.
for _ in range(10):
    states_values = encoder_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    # Prime the decoder with the word-level start marker.
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = decoder_model.predict([ empty_target_seq ] + states_values )
        # Most probable index at this step.
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        # Reverse lookup: scan word_index for the word with this index.
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        # Stop at the end marker or once the reply has more words than max_input_length.
        if sampled_word == 'end' or len(decoded_translation.split()) > max_input_length:
            stop_condition = True
            
        # Feed the predicted index and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


In [None]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM


In [None]:
# Experiment: a stack of four LSTM layers as a plain Sequential model.
# X is assigned here but not used by the visible cells.
X = [encoder_input_data, decoder_input_data]
y = decoder_target_data

# NOTE(review): output_dim / init / inner_init look like Keras 1.x argument
# names - confirm they are accepted by the installed Keras version.
# NOTE(review): input_shape is given the full array shape, including the
# sample axis - confirm this is intended.
model = Sequential()
model.add(LSTM(output_dim = 200, input_shape = encoder_input_data.shape, return_sequences = True, init = 'glorot_normal', inner_init = 'glorot_normal', activation = 'sigmoid'))
model.add(LSTM(output_dim = 200, input_shape = encoder_input_data.shape, return_sequences = True, init = 'glorot_normal', inner_init = 'glorot_normal', activation = 'sigmoid'))
model.add(LSTM(output_dim = 200, input_shape = encoder_input_data.shape, return_sequences = True, init = 'glorot_normal', inner_init = 'glorot_normal', activation = 'sigmoid'))
model.add(LSTM(output_dim = 200, input_shape = encoder_input_data.shape, return_sequences = True, init = 'glorot_normal', inner_init = 'glorot_normal', activation = 'sigmoid'))
model.compile(optimizer='adam', loss='cosine_proximity', metrics=['accuracy'] )



In [None]:
# Print the layer summary of the stacked-LSTM model.
model.summary()

In [None]:
# Fit the stacked-LSTM model, with a leading axis of size 1 added to the
# encoder data.
# NOTE(review): the targets are passed without that extra axis - confirm the
# input and target shapes are compatible.
model.fit(np.expand_dims(encoder_input_data, axis=0),decoder_target_data, epochs = 50)

In [None]:
# Shape of the encoder data after adding the leading axis.
np.expand_dims(encoder_input_data, axis=0).shape