<a href="https://colab.research.google.com/github/hritvikgupta/Chatbot/blob/master/WORKING_CHATBOT_Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Importing the libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding,LSTM, Dropout, Dense
from tensorflow.keras import utils

Using TensorFlow backend.


In [None]:

import requests, zipfile, io

# Download the zipped chatbot corpus and unpack it into the working directory.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being handed to ZipFile.
r = requests.get( 'https://github.com/shubham0204/Dataset_Archives/blob/master/chatbot_nlp.zip?raw=true', timeout=60 )
r.raise_for_status()
# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()


## Importing and preprocessing the data

In [None]:
import os
import yaml


# Read every YAML conversation file and collect (question, answer) pairs.
dir_path = 'chatbot_nlp/data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    # `with` closes each corpus file as soon as it has been parsed.
    with open( dir_path + os.sep + filepath , 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            # Several replies: join them into one answer, each prefixed by a space.
            questions.append(con[0])
            answers.append( ''.join( ' ' + rep for rep in con[ 1 : ] ) )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

# Keep only the pairs whose answer is a string. Filtering questions and
# answers together keeps them aligned; popping from `questions` by index while
# looping would shift every later index and drop the wrong entries.
kept_questions = list()
answer_with_tags = list()
for question, answer in zip(questions, answers):
    if isinstance(answer, str):
        kept_questions.append(question)
        answer_with_tags.append(answer)
questions = kept_questions

# Wrap each answer in the start/end markers the decoder is trained on.
# NOTE(review): the chat loop later looks up 'start' / 'end' in word_index, so
# the Tokenizer is presumably stripping '<' / '>' and lowercasing - confirm.
answers = ['<START>' + answer + '<END>' for answer in answer_with_tags]

# Fit a single shared vocabulary over answers and questions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(answers + questions)
vocab_size = len(tokenizer.word_index)+1   # +1: index 0 is reserved for padding
print('vocab_Size: {}'.format(vocab_size))


vocab_Size: 1894


In [None]:
# Peek at the first three collected questions.
questions[:3]

['Hello', 'Hi', 'Greetings!']

In [None]:
# Peek at the first four answers, each wrapped in the <START>/<END> markers.
answers[:4]

['<START>Hi<END>',
 '<START>Hello<END>',
 '<START>Hello<END>',
 '<START>Greetings!<END>']

#### Preprocessing the data
* Tokenize and pad the questions.
* Tokenize and pad the answers, keeping the start and end tags in every sequence.

* For the decoder targets, tokenize the answers again, remove the start tag from every sequence, pad, and one-hot encode the result.

In [None]:

from gensim.models import Word2Vec
import re

# Words known to the tokenizer, in word_index iteration order.
vocab = list( tokenizer.word_index )

def tokenize( sentences ):
    """Split sentences into lowercase alphabetic tokens.

    Each item is converted with ``str()``, lowercased, every character outside
    a-z / A-Z is replaced by a space, and the result is split on whitespace.

    Returns:
        A tuple ``(tokens_list, vocabulary)``: ``tokens_list`` holds one list
        of tokens per input sentence, and ``vocabulary`` is the flat
        concatenation of all those tokens in order (duplicates kept).
    """
    tokens_list = [
        re.sub( '[^a-zA-Z]', ' ', str(sentence).lower() ).split()
        for sentence in sentences
    ]
    vocabulary = [ token for tokens in tokens_list for token in tokens ]
    return tokens_list , vocabulary

# Tokenize every question and answer; p[0] is the per-sentence token lists and
# p[1] the flat token list returned by tokenize().
p = tokenize( questions + answers )
# Train a Word2Vec model on the token lists.
# NOTE(review): the name `model` is rebound to the Keras seq2seq model in a
# later cell, so this Word2Vec model is not reachable after that point.
model = Word2Vec(p[0])

# Zero-initialized (vocab_size, 100) matrix. The loop that would copy Word2Vec
# vectors into it is commented out below, so the matrix stays all zeros here.
# NOTE(review): 100 presumably matches the Word2Vec vector size - confirm.
embedding_matrix = np.zeros( ( vocab_size, 100 ) )
#for i in range(len(tokenizer.word_index)):
 #   embedding_matrix[i] = model[vocab[i]]

In [None]:

# Encoder input: questions as integer sequences, zero-padded on the right.
# The tokenizer was already fitted on answers + questions. It is deliberately
# not fitted again here: a second fit_on_texts() re-counts the question words
# and can renumber word_index each time this cell is run.
tokenized_seq = tokenizer.texts_to_sequences(questions)

length_list = [len(token_seq) for token_seq in tokenized_seq]
max_input_length = max(length_list)
print("Questions max lengths {}".format(max_input_length))

## Padding the sequence
padded_question_lines = pad_sequences(tokenized_seq, maxlen = max_input_length, padding = 'post' )
encoder_input_data =  np.array(padded_question_lines)
print("Encoder input data shape{}".format(encoder_input_data.shape))
num_encoded_tokens = len(tokenizer.word_index)+1   # +1: index 0 is the padding value

Questions max lengths 22
Encoder input data shape(564, 22)


#### Preprocessing input data for the decoder (`decoder_input_data`)

The decoder is fed the preprocessed answers:

* A `<START>` tag is placed at the first position of each answer sequence.
* An `<END>` tag is placed at the last position of each answer sequence.

In [None]:
## This is decoder input Data

# Decoder input: the tagged answers as integer sequences, right-padded to the
# longest answer.
tokenized_ans_lines = tokenizer.texts_to_sequences(answers)

length_list1 = [len(token_seq) for token_seq in tokenized_ans_lines]
max_output_length = np.max(length_list1)
print( 'Answer max length is {}'.format( max_output_length ))

padded_ans_lines = pad_sequences(
    tokenized_ans_lines,
    maxlen=max_output_length,
    padding='post',
)
decoder_input_data = np.array( padded_ans_lines )
print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))

# The answers share the tokenizer's vocabulary; +1 accounts for padding index 0.
ans_word_dict = tokenizer.word_index
num_ans_tokens = 1 + len( ans_word_dict )
print( 'Number of Answer tokens = {}'.format( num_ans_tokens))


Answer max length is 74
Decoder input data shape -> (564, 74)
Number of Answer tokens = 1894


#### Preparing the target data for the decoder
We tokenize the answers again (a copy of `tokenized_ans_lines`) and modify the result like this:
1. Remove the `<START>` tag that was added earlier.
2. Pad the sequences and convert them to one-hot vectors.


In general machine-learning terms, this is `y`: the labels we want the model to predict.

In [None]:


## This is decoder Output Data


# Decoder targets: the answer sequences with the leading start token dropped.
tokenized_answers = [
    token_ids[1:] for token_ids in tokenizer.texts_to_sequences(answers)
]

# Right-pad to the decoder input length, then one-hot encode over the vocabulary.
padded_ans_lines = pad_sequences(
    tokenized_answers,
    maxlen=max_output_length,
    padding='post',
)
onehot_lines = utils.to_categorical(padded_ans_lines, vocab_size)
decoder_target_data = np.array(onehot_lines)
print("Decoder_Target_input {}".format(decoder_target_data.shape))

Decoder_Target_input (564, 74, 1894)


In [None]:
# Display the two vocabulary sizes and the padded question / answer lengths.
num_encoded_tokens, num_ans_tokens, max_input_length, max_output_length

(1894, 1894, 22, 74)

In [None]:

# Display the shape of the one-hot decoder target array.
decoder_target_data.shape

(564, 74, 1894)

### Defining and training the model

* The model consists of Embedding, LSTM, and Dense layers. The basic configuration is as follows:

* 2 input layers: one for the encoder input data and one for the decoder input data

* Embedding layer: converts token indices to fixed-size dense vectors

* LSTM layer: provides access to long short-term memory cells


## Working:
1. The `encoder_input_data` goes into the embedding layer (`encoder_embedding`).

2. The output of the embedding layer goes to the LSTM cell, which produces 2 state vectors (`h` and `c`, the encoder states).

3. These states are set as the initial state of the decoder's LSTM cell.
4. The `decoder_input_data` goes through its own embedding layer.
5. That embedding goes into the decoder LSTM cell (which holds the encoder states) to produce the output sequences.

In [None]:
# --- Encoder: variable-length token ids -> embedding -> LSTM final states ---
encoder_inputs = tf.keras.layers.Input(shape=(None, ))
# mask_zero=True marks index 0 (the padding value) as masked.
encoder_embedding = tf.keras.layers.Embedding(num_encoded_tokens, 200, mask_zero=True)(encoder_inputs)
# state_h and state_c are the encoder LSTM's final states; they are the only
# encoder information passed on to the decoder.
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: answer token ids -> embedding -> LSTM seeded with encoder states ---
decoder_inputs = tf.keras.layers.Input(shape=(None, ))
decoder_embedding = tf.keras.layers.Embedding(num_ans_tokens, 200, mask_zero=True)(decoder_inputs)
# return_sequences=True yields an output per timestep. The layer object is
# kept in a variable because make_inference_models() reuses it.
decoder_lstm = tf.keras.layers.LSTM(200, return_state=True, return_sequences=True )
decoder_outputs, _,_ = decoder_lstm(decoder_embedding,initial_state = encoder_states)
# Softmax over the vocabulary for every timestep; also reused at inference time.
decoder_dense = tf.keras.layers.Dense(num_ans_tokens, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

# Training model: inputs are [questions, answers]; the output is a per-step
# word distribution, trained with categorical cross-entropy.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer = tf.keras.optimizers.RMSprop(), loss = 'categorical_crossentropy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    378800      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    378800      input_2[0][0]                    
______________________________________________________________________________________________

## Training the model

In [None]:
# Display the encoder vocabulary size and the padded question length.
num_encoded_tokens, max_input_length

(1894, 22)

In [None]:
# Display the shapes of the three arrays passed to model.fit below.
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((564, 22), (564, 74), (564, 74, 1894))

In [None]:
# Train on [questions, tagged answers] against the one-hot targets (the answers
# with the start token removed), in batches of 25 for 50 epochs.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=25, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f049d136358>

In [None]:
# Save the trained model to 'model.h5' (a path relative to the working directory).
model.save('model.h5')

## Inferencing the model
1) We create separate inference models for predicting the answers.

* The encoder inference model takes the question as input and outputs the LSTM states (h and c).

* The decoder inference model takes two inputs: the LSTM states (initially the output of the encoder model) and the answer tokens generated so far, beginning with the start tag. It outputs the word predictions for the answer together with its updated state values.

In [None]:
def make_inference_models():
  """Build the encoder and decoder models used for step-by-step inference.

  Reuses the layers and tensors created in the training cell
  (`encoder_inputs`, `encoder_states`, `decoder_inputs`, `decoder_embedding`,
  `decoder_lstm`, `decoder_dense`), so the inference models use those same
  layers.

  Returns:
    A tuple `(encoder_model, decoder_model)`:
      * `encoder_model` maps question token ids to the LSTM states `[h, c]`.
      * `decoder_model` maps `[decoder token ids, h, c]` to
        `[word probabilities, new h, new c]`.
  """
  encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

  # Size the state inputs from the decoder LSTM itself rather than repeating
  # the literal 200, so they cannot drift if the training cell is changed.
  latent_dim = decoder_lstm.units
  decoder_state_input_h = tf.keras.layers.Input(shape = (latent_dim, ))
  decoder_state_input_c = tf.keras.layers.Input(shape = (latent_dim, ))

  decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

  # Same LSTM as in training, but seeded from explicit state inputs so the
  # caller can feed the states back in one step at a time.
  decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_state_inputs)
  decoder_states = [state_h, state_c]
  decoder_outputs = decoder_dense(decoder_outputs)
  decoder_model = tf.keras.models.Model(
                  [decoder_inputs] + decoder_state_inputs,
                  [decoder_outputs]+ decoder_states )

  return encoder_model, decoder_model

## Talking to our bot
First we define a method `str_to_tokens`, which converts a question string into a sequence of integer tokens with padding.


In [None]:
def str_to_tokens(sentence : str):
  """Convert a raw question into a padded array of token ids for the encoder.

  The sentence goes through the fitted `tokenizer`, so it gets the same
  preprocessing as the training questions. Words that are not in the
  vocabulary are skipped instead of raising `KeyError`, which the direct
  `word_index[word]` lookup did for unknown words and for words with
  punctuation attached.

  Args:
    sentence: The user's question as plain text.

  Returns:
    The array returned by `pad_sequences` for this single question, padded
    to `max_input_length` on the right ('post'), matching how the training
    questions were padded.
  """
  tokens_list = tokenizer.texts_to_sequences([sentence])[0]
  return pad_sequences([tokens_list], maxlen = max_input_length, padding = 'post')



In [None]:
enc_model , dec_model = make_inference_models()

# Build the index -> word lookup once instead of scanning the whole
# vocabulary for every generated token.
index_to_word = { index : word for word , index in tokenizer.word_index.items() }
start_token = tokenizer.word_index['start']

for _ in range(10):
    # Encode the question into the LSTM states that seed the decoder.
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = start_token
    decoded_words = []
    # Greedy decoding, one token per step. The loop is bounded by the longest
    # training answer (max_output_length, not the question length), so it
    # always ends even if the model never emits 'end'.
    for _step in range( int( max_output_length ) ):
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        sampled_word = index_to_word.get( sampled_word_index )

        # Stop on the end marker, or on an index with no word (0 = padding).
        # The marker itself is not added to the reply.
        if sampled_word is None or sampled_word == 'end':
            break
        decoded_words.append( sampled_word )

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    decoded_translation = ' '.join( decoded_words )
    print( decoded_translation )


 i like to a lot of the universe end
 my favorite subject is 2001 end
 i am not myself end
 i like to a lot of the computer end
 i like to a lot of the computer end
 no i am not as myself as i am not yet end
 i like to a lot of the computer end
 i like to a lot of the computer end


In [None]:
# Show the first ten (word, index) pairs of the fitted vocabulary. The original
# line had stray keystrokes in front of `tokenizer.word_index`, which made it a
# syntax error; the preview is bounded to avoid dumping the whole vocabulary.
list(tokenizer.word_index.items())[:10]