In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
import re
import tqdm

In [5]:
#read dataset
data_df = pd.read_csv('chatbot.csv', index_col=False)
data_df

Unnamed: 0,question,Answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
4286,"The Hubble Space Telescope, launched into low ...",Edwin Hubble
4287,What is the name of the nearest major galaxy t...,The Andromeda Galaxy.
4288,God Save the Queen is the national anthem of w...,The United Kingdom of Great Britain
4289,"The Celtic Shelf, the seabed under the Celtic ...",Europe


In [6]:
#details of data
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4291 entries, 0 to 4290
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  4291 non-null   object
 1   Answer    4291 non-null   object
dtypes: object(2)
memory usage: 67.2+ KB


In [7]:
data_df.columns=["question","answer"]

In [8]:
def clean_text(text):
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('\n', '', text)
  text = re.sub(r'[^\w]',' ',text)
  text = re.sub('\w*\d\w*', '', text)
  return text

data_df.question = data_df.question.map(clean_text)
data_df.answer = data_df.answer.map(clean_text)

In [9]:
def add_start_end(text):
  text = f'<start> {text} <end>'
  return text

data_df.question = data_df.question.map(add_start_end)
data_df.answer = data_df.answer.map(add_start_end)

In [10]:
data_df

Unnamed: 0,question,answer
0,<start> hi how are you doing <end>,<start> i m fine how about yourself <end>
1,<start> i m fine how about yourself <end>,<start> i m pretty good thanks for asking <end>
2,<start> i m pretty good thanks for asking <end>,<start> no problem so how have you been <end>
3,<start> no problem so how have you been <end>,<start> i ve been great what about you <end>
4,<start> i ve been great what about you <end>,<start> i ve been good i m in school right no...
...,...,...
4286,<start> the hubble space telescope launched i...,<start> edwin hubble <end>
4287,<start> what is the name of the nearest major ...,<start> the andromeda galaxy <end>
4288,<start> god save the queen is the national ant...,<start> the united kingdom of great britain <end>
4289,<start> the celtic shelf the seabed under the...,<start> europe <end>


In [11]:
#nlp process
# fea ture ext
# lemmatizing
# 
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', oov_token='<OOV>'
  )
  lang_tokenizer.fit_on_texts(lang)
  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
  return tensor, lang_tokenizer

In [12]:

question_sequence, question_tokenizer = tokenize(data_df.question)
answer_sequence, answer_tokenizer = tokenize(data_df.answer)

In [13]:
#feature selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(question_sequence, 
                answer_sequence, test_size = 0.1, random_state=42) 

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3861, 24), (430, 24), (3861, 74), (430, 74))

In [14]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print('%d---> %s' % (t, lang.index_word[t]))

print('Question')
convert(question_tokenizer, x_train[0])
print()
print('Answer')
convert(answer_tokenizer, y_train[0])

Question
2---> <start>
60---> tell
20---> me
7---> a
95---> joke
3---> <end>

Answer
2---> <start>
14---> what
16---> do
5---> you
42---> get
64---> when
5---> you
139---> cross
356---> music
15---> and
72---> an
1723---> automobile
3021---> cartune
3---> <end>


In [15]:
#length of sentence
vocab_inp_size = len(question_tokenizer.word_index)+1
vocab_tar_size =  len(answer_tokenizer.word_index)+1
#number of hidden layers
embedding_dim = 256
#number of neuron
units = 1024
batch_size=32

In [16]:
#create train_dataset,test_dataset
def create_dataset(x, y, batch_size=32):
  data = tf.data.Dataset.from_tensor_slices((x, y))

  data = data.shuffle(1028)
  data = data.batch(batch_size, drop_remainder=True)

  data = data.prefetch(tf.data.experimental.AUTOTUNE)

  return data

train_dataset = create_dataset(x_train, y_train)
test_dataset = create_dataset(x_test, y_test)

In [17]:
for q, a in train_dataset.take(1):
  print(f'Question:{q.shape}\n{q}')
  
  print(f'Answer:{a.shape}\n{a}')

Question:(32, 24)
[[   2   22  102 2233    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2    6  188  786    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2    5   71    7 1142  112   91    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2    6  715   25   49   47    3    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2   13   12   78   72 1567   80 1568    3    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2    9 1969 1970    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2  106   17    9   13  108    4  246    3    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   2    7  332    4  667   58    6  260    3    0    0    0    0    0
     0    0    0    0    0    0    

In [18]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
      super(Encoder, self).__init__()

      self.batch_size = batch_size
      self.encoder_units = encoder_units
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
      self.gru = tf.keras.layers.GRU(self.encoder_units, 
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer = 'glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.encoder_units))

In [19]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
      super(Decoder, self).__init__()

      self.batch_size = batch_size
      self.decoder_units = decoder_units
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
      self.gru = tf.keras.layers.GRU(self.decoder_units, 
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer = 'glorot_uniform')
      
      self.fc = tf.keras.layers.Dense(vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, hidden = self.gru(x, initial_state = hidden)
    output = tf.reshape(output, (-1, output.shape[2]))
    x =  tf.nn.softmax(self.fc(output))
    return x, hidden


In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(q, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (32, 24, 1024)
Encoder Hidden state shape: (batch size, units) (32, 1024)


In [21]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

sample_decoder_output, _ = decoder(tf.random.uniform((batch_size, 1)), sample_hidden)

print ('Decoder output shape: (batch size, vocab_size) {}'.format(sample_decoder_output.shape))


Decoder output shape: (batch size, vocab_size) (32, 3319)


In [22]:
# create the optimizer using the Adam optimizer
optimizer = tf.keras.optimizers.Adam()
# create the loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

# define the loss function for the training
def loss_function(real, pred):
  # create the mask to ignore the padding tokens
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  # mask shape == (batch_size, sequence_length)
  # calculate the loss
  loss_ = loss_object(real, pred)
  # mask the loss
  # how the mask works:
  # if the value is 1, the loss is calculated
  # if the value is 0, the loss is ignored
    #[1,1,1,1,1,1,0,0,0,0,0] mask
    # *
    #[2,6,2,1,6,3,2,1,5,7,9] input
    # =
    #[2,6,2,1,6,3,0,0,0,0,0] output
  mask = tf.cast(mask, dtype=loss_.dtype)
  # mask shape == (batch_size, sequence_length)

  loss_ *= mask
  # calculate the average loss per batch 
  return tf.reduce_mean(loss_)

In [23]:
# create the training metric 
train_loss = tf.metrics.Mean(name='train loss')
# create the testing metric 
test_loss =tf.metrics.Mean(name='test loss')

In [24]:
# create the training step
# using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
@tf.function
# define the training step 
def train_step(inputs, target, enc_hidden):
  # the encoder_hidden is the initial hidden state of the encoder
  # enc_hidden shape == (batch_size, hidden_size)

  # inilaize the loss to zero
  loss = 0
  # create the gradient tape to record the gradient of the loss with respect to the weights

  with tf.GradientTape() as tape:
    # pass the input to the encoder
    # enc_output shape == (batch_size, 49, hidden_size)
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    # using the encoder_hidden as the initial hidden state of the decoder
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden

    # create the start token 
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([answer_tokenizer.word_index['<start>']] * inputs.shape[0], 1)
    
    # Teacher forcing - feeding the target as the next input
    
    for t in range(1, target.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)
      # calculate the loss for the current time step using the loss function
      loss += loss_function(target[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(target[:, t], 1)
  # calculate the loss for the current batch
  batch_loss = (loss / int(target.shape[1]))

  # get the trainable variables
  variables = encoder.trainable_variables + decoder.trainable_variables
  # calculate the gradients using the tape 
  gradients = tape.gradient(loss, variables)
  # update the trainable variables
  optimizer.apply_gradients(zip(gradients, variables))
  # add the loss to the training loss metric
  train_loss(batch_loss)
  return batch_loss

In [25]:
# create the training step
# using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
@tf.function 
def test_step(inputs, target, enc_hidden):
    # the encoder_hidden is the initial hidden state of the encoder
    # enc_hidden shape == (batch_size, hidden_size)
    # inilaize the loss to zero
    loss = 0
    # pass the input to the encoder 
    # enc_output shape == (batch_size, 49, hidden_size) 
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden
    # create the start token
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([answer_tokenizer.word_index['<start>']] * inputs.shape[0], 1)
    for t in range(1, target.shape[1]):
        # passing enc_output to the decoder with dec_hidden as the initial hidden state
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        # calculate the loss for the current time step using the loss function 
        loss += loss_function(target[:, t], predictions)

        # using teacher forcing
        dec_input = tf.expand_dims(target[:, t], 1)
    # calculate the loss for the current batch
    batch_loss = (loss / int(target.shape[1]))
    # add the batch loss to the test loss metric
    test_loss(batch_loss)

In [26]:
# set the epochs to 10
EPOCHS = 30
# set the old test loss to high number 

old_test_loss=1000000
# create the training loop
for epoch in range(EPOCHS):
    # reset the training loss metric
    train_loss.reset_states()
    # reset the testing loss metric
    test_loss.reset_states()

    # initalize the hidden state of the encoder to zeros 
    enc_hidden = encoder.initialize_hidden_state()
    # create the training progress bar set the total number of batches to the length of the training dataset and the batch size to the test size
    steps_per_epoch = answer_sequence.shape[0]//batch_size #=> 4356 batch in the dataset 
    bar = tf.keras.utils.Progbar(target=steps_per_epoch)
    
    count=0
    # iterate over the training dataset 
    for (batch, (inputs, target)) in enumerate(train_dataset):
        # update the progress bar
        count += 1
        # run the training step
        batch_loss = train_step(inputs, target, enc_hidden)
        bar.update(count)  # manually update the progress bar    
    # iterate over the testing dataset    
    for (batch, (inputs, target)) in enumerate(test_dataset):
        count += 1
        # run the testing step
        batch_loss = test_step(inputs, target, enc_hidden)
        bar.update(count)
    # save the best performance model on the test dataset 
    
    if old_test_loss> test_loss.result():
        # set the old test loss to the test loss 
        old_test_loss= test_loss.result()
        encoder.save_weights(filepath='/content/models/encoder')
        decoder.save_weights(filepath='/content/models/decoder')
        print('Model is saved')
    # print the training and testing loss
    print('#' * 50)
    print(f'Epoch #{epoch + 1}')
    print(f'Accuracy {test_loss.result()}')
    print('#' * 50)
  


##################################################
Epoch #1
Accuracy 0.5891615152359009
##################################################
##################################################
Epoch #2
Accuracy 0.5588918328285217
##################################################
##################################################
Epoch #3
Accuracy 0.55002361536026
##################################################
Epoch #4
Accuracy 0.5584268569946289
##################################################
Epoch #5
Accuracy 0.5645029544830322
##################################################
Epoch #6
Accuracy 0.5821784138679504
##################################################
Epoch #7
Accuracy 0.5971917510032654
##################################################
Epoch #8
Accuracy 0.5968561768531799
##################################################
Epoch #9
Accuracy 0.6266809105873108
##################################################
Epoch #10
Accuracy 0.6400573253631592
###################

In [27]:
# create the chatbot function
# the chatbot function takes in the question as input and answers the input sentence 
def chatbot(sentence):
  
  # clean the input question sentence 
  sentence = clean_text(sentence)
  # add the start token to the sentence
  sentence =add_start_end(sentence)
  # tokenize the sentence
  inputs = question_tokenizer.texts_to_sequences([sentence])
  # pad the sentence
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                         maxlen=29,
                                                         padding='post')
  
  # initalize the hidden state of the encoder to zeros
  hidden = [tf.zeros((1, units))]
  # pass the sentence to the encoder with the hidden state as the initial hidden state
  enc_out, enc_hidden = encoder(inputs, hidden)
  # set the initial decoder hidden state to the encoder hidden state
  dec_hidden = enc_hidden
  # create the start token
  # start_token shape == (batch_size, 1)
  # repeat the start token for the batch size times
  dec_input = tf.expand_dims([answer_tokenizer.word_index['<start>']], 0)
  # create the result string
  result = ''
  # loop over the length of the sentence (32)

  for t in range(32):
    # passing the encoder output and the decoder hidden state to the decoder make sure the decoder input is the previous predicted word
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # getting the predicted word index
    predicted_id = tf.argmax(predictions[0]).numpy()
    # getting the predicted word using the predicted index
    # add the predicted word to the result string 
    result += answer_tokenizer.index_word[predicted_id] + ' '
    # if the predicted word is the <end> token then stop the loop
    if answer_tokenizer.index_word[predicted_id] == '<end>':
      # remove the <start> and <end> tokens from the result string
      result = result.replace('<start> ', '')
      result = result.replace(' <end> ','')
      # remove the <start> and <end> tokens from the sentence string
      sentence = sentence.replace('<start> ', '')
      sentence = sentence.replace(' <end>', '')
      return  sentence, result

    # using the predicted word as the next decoder input
    dec_input = tf.expand_dims([predicted_id], 0)
  # remove the <start> and <end> tokens from the result string
  result = result.replace('<start> ', '')
  result = result.replace('<end>','')
  # remove the <start> and <end> tokens from the sentence string
  sentence = sentence.replace('<start> ', '')
  sentence = sentence.replace('<end>', '')
  

  
 
  
  # return the result string and the original sentence
  return sentence, result

In [28]:
chatbot("how are you today")

('how are you today', 'i m doing great what about you')

In [29]:
chatbot('what is the weather outside')

('what is the weather outside', 'it s the force that pulls everything down')

In [30]:
chatbot('can you run')

('can you run', 'what s the point')

In [31]:
chatbot('where are you going to school')


('where are you going to school', 'i m going to pcc')

In [32]:
how about a movie

SyntaxError: invalid syntax (509022788.py, line 1)

In [None]:
how are you doing today

In [None]:
chatbot('how are you doing today')

In [None]:
chatbot('what is your name')

In [None]:
chatbot('what school do you go to')

In [None]:
chatbot('fuck you')

In [None]:
chatbot('how are you')

In [None]:
chatbot('what about temprature')

In [None]:
chatbot('what about tempreature')

In [None]:
chatbot('what about the weather')

In [None]:
chatbot('what about the weather')

In [None]:
chatbot('whout the at abweather')



In [33]:
chatbot('whout the at abweather')

('whout the at abweather', 'of course not')

In [34]:
chatbot('what about the weather')

('what about the weather', 'i was a painter')

In [35]:
chatbot('how are you')

('how are you', 'fine and you')

In [36]:
chatbot('what is your name')

('what is your name',
 'an established system of political administration by which a nation state district etc is governed')

In [37]:
chatbot('what about football')

('what about football',
 'i am was never really born and therefore am effectively deathless')

In [38]:
chatbot('how are you')

('how are you', 'fine and you')

In [39]:
chatbot('what is your name')

('what is your name',
 'an established system of political administration by which a nation state district etc is governed')

In [40]:
chatbot('what school do you go to')

('what school do you go to', 'i go to pcc')

In [41]:
chatbot('what about the weather ')

('what about the weather ', 'i was a painter')

In [42]:
chatbot('can you run')

('can you run', 'what s the point')