# ChatBot

In [4]:
import pandas as pd 
import numpy as np
import random
import re
import tensorflow as tf
import unicodedata
from keras import layers
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Open and read a txt file
with open('dialogs.txt', 'r') as file:
 train = [line.rstrip() for line in file]
print(len(train))

3725


In [18]:
train[:3]

["hi, how are you doing?\ti'm fine. how about yourself?",
 "i'm fine. how about yourself?\ti'm pretty good. thanks for asking.",
 "i'm pretty good. thanks for asking.\tno problem. so how have you been?"]

In [20]:
Separator = '\t'
train_input, train_target = map(list,zip(*[pair.split(Separator) for pair in train]))

In [23]:
print(train_input[:3])
print(train_target[:3])

['hi, how are you doing?', "i'm fine. how about yourself?", "i'm pretty good. thanks for asking."]
["i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?']


In [26]:
def preprocess_sentence(s):
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = s.strip()
  return s

In [27]:
train_preprocessed_input = [preprocess_sentence(s) for s in train_input]
train_preprocessed_target = [preprocess_sentence(s) for s in train_target]

print(train_preprocessed_input[:3])

['hi , how are you doing ?', "i'm fine . how about yourself ?", "i'm pretty good . thanks for asking ."]


In [28]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)

train_tagged_preprocessed_target[:3]

["<sos> i'm fine . how about yourself ? <eos>",
 "<sos> i'm pretty good . thanks for asking . <eos>",
 '<sos> no problem . so how have you been ? <eos>']

In [29]:
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(train_preprocessed_input)
# source_tokenizer.get_config()

In [30]:
source_vocab_size = len(source_tokenizer.word_index) + 1
print(source_vocab_size)

2397


In [31]:
# Tokenizer for the English target sentences.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(train_tagged_preprocessed_target)
# target_tokenizer.get_config()

In [32]:
target_vocab_size = len(target_tokenizer.word_index) + 1
print(target_vocab_size)

2461


In [33]:
train_encoder_inputs = source_tokenizer.texts_to_sequences(train_preprocessed_input)

In [34]:
print(train_encoder_inputs[:3])
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[:3]))

[[1016, 6, 40, 18, 5, 160, 3], [29, 640, 2, 40, 37, 548, 3], [29, 152, 48, 2, 290, 28, 486, 2]]
['hi , how are you doing ?', "i'm fine . how about yourself ?", "i'm pretty good . thanks for asking ."]


In [35]:
def generate_decoder_inputs_targets(sentences, tokenizer):
  seqs = tokenizer.texts_to_sequences(sentences)
  decoder_inputs = [s[:-1] for s in seqs] # Drop the last token in the sentence.
  decoder_targets = [s[1:] for s in seqs] # Drop the first token in the sentence.
  # Example ["<sos> i'm fine . how about yourself ?"] ["i'm fine . how about yourself ? <eos>"]
  return decoder_inputs, decoder_targets

In [36]:
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_tagged_preprocessed_target,target_tokenizer)

print(train_decoder_inputs[0],'\n', train_decoder_targets[0])

[2, 38, 561, 4, 44, 39, 562, 6] 
 [38, 561, 4, 44, 39, 562, 6, 3]


In [37]:
print(target_tokenizer.sequences_to_texts(train_decoder_inputs[:1]),
      target_tokenizer.sequences_to_texts(train_decoder_targets[:1]))

["<sos> i'm fine . how about yourself ?"] ["i'm fine . how about yourself ? <eos>"]


In [38]:
max_encoding_len = len(max(train_encoder_inputs, key=len))
max_encoding_len

22

In [39]:
max_decoding_len = len(max(train_decoder_inputs, key=len))
max_decoding_len

23

In [40]:
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

print(padded_train_encoder_inputs[0])
print(padded_train_decoder_inputs[0])
print(padded_train_decoder_targets[0])

[1016    6   40   18    5  160    3    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[  2  38 561   4  44  39 562   6   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0]
[ 38 561   4  44  39 562   6   3   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0]


In [41]:
target_tokenizer.sequences_to_texts([padded_train_decoder_inputs[0]])

["<sos> i'm fine . how about yourself ? <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>"]

In [42]:
with open('dialogs.txt', 'r') as file:
    data = [line.rstrip() for line in file]

In [43]:
import random

# spliting the data into train and test
random.shuffle(data)
train_size = int(0.8 * len(data))
train = data[:train_size]
val = data[train_size:]
print(len(train), len(val))

2980 745


In [45]:
def process_dataset(dataset):

  # Split the Hungarian and English sentences into separate lists.
  input, output = map(list, zip(*[pair.split(Separator) for pair in dataset]))

  # Unicode normalization and inserting spaces around punctuation.
  preprocessed_input = [preprocess_sentence(s) for s in input]
  preprocessed_output = [preprocess_sentence(s) for s in output]

  # Tag target sentences with <sos> and <eos> tokens.
  tagged_preprocessed_output = tag_target_sentences(preprocessed_output)

  # Vectorize encoder source sentences.
  encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)

  # Vectorize and create decoder input and target sentences.
  decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output,
                                                                    target_tokenizer)

  # Pad all collections.
  padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
  padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
  padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')

  return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets

# Process validation dataset
padded_val_encoder_inputs, padded_val_decoder_inputs, padded_val_decoder_targets = process_dataset(val)

In [46]:
embedding_dim = 128
hidden_dim = 256
default_dropout=0.2
batch_size = 32
epochs = 100

# Encoder model with Masking
encoder_inputs = tf.keras.Input(shape=(None,))

encoder_masking = layers.Masking(mask_value=0.0)(encoder_inputs)

encoder_embedding = layers.Embedding(input_dim=source_vocab_size, output_dim=embedding_dim)(encoder_masking)

encoder_lstm = layers.LSTM(hidden_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

encoder_states = [state_h, state_c]

encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

# Decoder model with Masking
decoder_inputs = tf.keras.Input(shape=(None,))

decoder_masking = layers.Masking(mask_value=0.0)(decoder_inputs)

decoder_embedding = layers.Embedding(input_dim=target_vocab_size, output_dim=embedding_dim)(decoder_masking)

decoder_lstm = layers.LSTM(hidden_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_dense = layers.Dense(target_vocab_size, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

# Full model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [47]:
# Train the model
history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs],
                    np.expand_dims(padded_train_decoder_targets, -1),
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs],
                                     np.expand_dims(padded_val_decoder_targets, -1)))

Epoch 1/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 153ms/step - accuracy: 0.5988 - loss: 3.7120 - val_accuracy: 0.6769 - val_loss: 1.9754
Epoch 2/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 143ms/step - accuracy: 0.6978 - loss: 1.9216 - val_accuracy: 0.6999 - val_loss: 1.8563
Epoch 3/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 142ms/step - accuracy: 0.7077 - loss: 1.8120 - val_accuracy: 0.7011 - val_loss: 1.7945
Epoch 4/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 144ms/step - accuracy: 0.7081 - loss: 1.7639 - val_accuracy: 0.7045 - val_loss: 1.7490
Epoch 5/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 143ms/step - accuracy: 0.7084 - loss: 1.7318 - val_accuracy: 0.7069 - val_loss: 1.7073
Epoch 6/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 144ms/step - accuracy: 0.7134 - loss: 1.6805 - val_accuracy: 0.7130 - val_loss: 1.6608
Epoc

In [48]:
padded_train_decoder_targets[:3]

array([[ 38, 561,   4,  44,  39, 562,   6,   3,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 38, 174,  50,   4, 217,  28, 485,   4,   3,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 29, 175,   4,  23,  44,  19,   8, 105,   6,   3,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [49]:
# Encoder inference model
encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

In [50]:
decoder_state_input_h = tf.keras.Input(shape=(hidden_dim,))
decoder_state_input_c = tf.keras.Input(shape=(hidden_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = tf.keras.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [51]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate an empty target sequence of length 1.
    target_seq = np.zeros((1, 1)) # [[0]]
    # Populate the first character of the target sequence with the start token.
    target_seq[0, 0] = target_tokenizer.word_index['<sos>']

    # Sampling loop
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :]) # (1, 1, 10556)
        sampled_word = target_tokenizer.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == '<eos>' or len(decoded_sentence) > max_decoding_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]
    print(output_tokens.shape)
    return decoded_sentence

In [52]:
# Test with a validation sentence
test_input_seq = padded_val_encoder_inputs[0:1]  # Take the first sentence from validation set
decoded_sentence = decode_sequence(test_input_seq)
print('Input sentence:', ' '.join(source_tokenizer.sequences_to_texts(test_input_seq)))
print('Decoded sentence:', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
(1, 1, 2461)
Input sentence: it's probably around somewhere . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
Decoded sentence:  oh , that's easy .


In [53]:
input_sentence = input("Enter your sentence: ")

input_sequence = source_tokenizer.texts_to_sequences([input_sentence])

padded_input_sequence = pad_sequences(input_sequence, maxlen=32, padding='post')

decoded_sentence = decode_sequence(padded_input_sequence)

print('Input sentence:', input_sentence)
print('Decoded sentence:', decoded_sentence)

Enter your sentence: how are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
(1, 1, 2461)
Input sentence: how are you?
Decoded sentence:  you should take a break


In [54]:
def translate_without_attention(padded_source, encoder, target_tokenizer, decoder):
    # Your translation logic here, for example:
    states_value = encoder.predict(padded_source)

    # Start decoding process
    target_seq = np.zeros((1, 1))  # Initial target sequence
    target_seq[0, 0] = target_tokenizer.word_index['<sos>']  # Assuming <sos> is the start token

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder.predict([target_seq] + [states_value])

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_tokenizer.index_word.get(sampled_token_index, '<unk>')

        if sampled_word == '<eos>' or len(decoded_sentence) > max_decoding_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence  # Return the decoded sentence


In [55]:
# random.seed is just here to re-create results.
random.seed(1)
sentences = random.sample(val, 15)
sentences

['golf is a silly game.\tit certainly is.',
 'i got a dent in the parking lot.\thow did you get it?',
 'what is your favorite kind of music?\ti listen to various types of music.',
 'they forget where they came from.\tmaybe you should run for office.',
 "thank you very much, that's kind of you.\tdon't mention it.",
 'did you take pictures at the world war ii monument?\toh, yes. we all took lots of pictures.',
 "really, where did you get it?\ti got it from macy's.",
 "yes. that's why it's also the best hamburger in town.\ta great burger and great service.",
 "a good gun costs $400 or more.\twell, if you bring your receipt, maybe they'll give you $400.",
 "a lot, like the party i'm having on friday.\twell, that's cool.",
 "i bought three pounds of potatoes for a dollar.\tthat's a good deal.",
 'why do you want to move there?\tbecause i want to make a lot of money.',
 "no one at work likes him.\tcan't you report him to his supervisor?",
 "who were you writing to?\tit's to my mom.",
 "i don

In [61]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def translate_sentences(sentences, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder, max_encoder_seq_length):
    translations = {'Tokenized Original': [], 'Reference': [], 'Translation': []}

    for s in sentences:
        # Split the input into source and target sentences
        source, target = s.split(Separator)

        # Preprocess the source sentence
        source = preprocess_sentence(source)

        # Tokenize the source sentence
        tokenized_sentence = source_tokenizer.texts_to_sequences([source])[0]  # Convert to sequence

        # Pad the tokenized sentence to the maximum length
        padded_source = pad_sequences([tokenized_sentence], maxlen=max_encoder_seq_length, padding='post')

        # Call the translation function to get the translation
        translated = translation_func(padded_source, encoder, target_tokenizer, decoder)

        # Store the results in the dictionary
        translations['Tokenized Original'].append(tokenized_sentence)
        translations['Reference'].append(target)
        translations['Translation'].append(translated)

    return translations


In [62]:
# Example definition of max_encoder_seq_length
max_encoder_seq_length = 30  # Adjust this value based on your model's configuration

# Now call translate_sentences with the required parameters
translations_no_attention = pd.DataFrame(translate_sentences(
    sentences,
    translate_without_attention,
    source_tokenizer,
    encoder_model,
    target_tokenizer,
    decoder_model,  # Ensure decoder is passed correctly
    max_encoder_seq_length  # Pass the max length
))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

In [63]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()

        # No masking here. We'll handle it ourselves.
        self.embedding = layers.Embedding(source_vocab_size,
                                          embedding_dim,
                                          name='encoder_embedding_layer')

        # return_sequences is set to True this time.
        self.lstm = layers.LSTM(hidden_dim,
                                return_sequences=True,
                                return_state=True,
                                name='encoder_lstm')

    def call(self, input):
        embeddings = self.embedding(input)

        # output_seq will hold the encoder's hidden states from each time step.
        output_seq, state_h, state_c = self.lstm(embeddings)

        return output_seq, state_h, state_c


In [64]:
test_encoder = Encoder(source_vocab_size, embedding_dim, hidden_dim)

In [65]:
test_encoder_batch = padded_train_encoder_inputs[:3]
print(test_encoder_batch.shape)
test_encoder_batch

(3, 22)


array([[1016,    6,   40,   18,    5,  160,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  29,  640,    2,   40,   37,  548,    3,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  29,  152,   48,    2,  290,   28,  486,    2,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [66]:
test_encoder_outputs, state_h, state_c = test_encoder(test_encoder_batch)

In [67]:
print(test_encoder_outputs.shape)
print(state_h.shape)
print(state_c.shape)

(3, 22, 256)
(3, 256)
(3, 256)


In [68]:
class LuongAttention(tf.keras.Model):
  def __init__(self, hidden_dim):
    super(LuongAttention, self).__init__()

    self.w = layers.Dense(hidden_dim, name='encoder_outputs_dense')

  def call(self, inputs):
    encoder_output_seq, decoder_output = inputs
    z = self.w(encoder_output_seq)
    attention_scores = tf.matmul(decoder_output, z, transpose_b=True)
    attention_weights = tf.keras.activations.softmax(attention_scores, axis=-1)
    context = tf.matmul(attention_weights, encoder_output_seq)

    return attention_weights, context


In [69]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(Decoder, self).__init__()

    self.embedding_layer = layers.Embedding(vocab_size,
                                            embedding_dim,
                                            name='decoder_embedding_layer')

    self.lstm = layers.LSTM(hidden_dim,
                            return_sequences=True,
                            return_state=True,
                            name='decoder_lstm')

    self.attention = LuongAttention(hidden_dim)

    self.w = tf.keras.layers.Dense(hidden_dim, activation='tanh', name='attended_outputs_dense')

    self.dense = layers.Dense(vocab_size, name='decoder_dense')


  def call(self, inputs):
    decoder_input, encoder_output_seq, lstm_state = inputs
    embeddings = self.embedding_layer(decoder_input)

    decoder_output, state_h, state_c = self.lstm(embeddings, initial_state=lstm_state)

    weights, context = self.attention([encoder_output_seq, decoder_output])

    decoder_output_with_attention = self.w(tf.concat(
        [tf.squeeze(context, 1), tf.squeeze(decoder_output, 1)], -1))

    logits = self.dense(decoder_output_with_attention)

    return logits, state_h, state_c, weights

In [70]:
test_decoder = Decoder(target_vocab_size, embedding_dim, hidden_dim)

In [71]:
test_decoder_batch = padded_train_decoder_inputs[:3]
print(test_decoder_batch.shape)
test_decoder_batch

(3, 23)


array([[  2,  38, 561,   4,  44,  39, 562,   6,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,  38, 174,  50,   4, 217,  28, 485,   4,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,  29, 175,   4,  23,  44,  19,   8, 105,   6,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [72]:
test_decoder_batch[:, 1]

array([38, 38, 29])

In [73]:
next_decoder_inputs = tf.expand_dims(test_decoder_batch[:, 1], 1)
next_decoder_inputs

<tf.Tensor: shape=(3, 1), dtype=int32, numpy=
array([[38],
       [38],
       [29]])>

In [74]:
# Initial values for state_h and state_c are from the encoder.
test_decoder_logits, state_h, state_c, test_decoder_weights = test_decoder(
    [
      next_decoder_inputs,
      test_encoder_outputs,
      [state_h, state_c]
    ])

In [75]:
print(test_decoder_logits.shape)
print(test_decoder_weights.shape)

(3, 2461)
(3, 1, 22)


In [76]:
def loss_func(targets, logits):
  ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

  mask = tf.cast(tf.math.not_equal(targets, 0), tf.float32)

  return ce_loss(targets, logits, sample_weight=mask)

In [77]:
dataset = tf.data.Dataset.from_tensor_slices((padded_train_encoder_inputs,
                                              padded_train_decoder_inputs,
                                              padded_train_decoder_targets)).batch(batch_size, drop_remainder=True)

In [78]:
class TranslatorTrainer(tf.keras.Model):
  def __init__(self, encoder, decoder):
    super(TranslatorTrainer, self).__init__()

    self.encoder = encoder
    self.decoder = decoder

  # This method will be called by model.fit for each batch.
  @tf.function
  def train_step(self, inputs):
      loss = 0.

      encoder_input_seq, decoder_input_seq, decoder_target_seq = inputs

      with tf.GradientTape() as tape:
          encoder_output_seq, state_h, state_c = self.encoder(encoder_input_seq)

          # We need to create a loop to iterate through the target sequences
          for i in range(decoder_target_seq.shape[1]):

              # Input to the decoder must have shape of (batch_size, length)
              # so we need to expand one dimension (just like in the previous example).
              next_decoder_input = tf.expand_dims(decoder_input_seq[:, i], 1)
              logits, state_h, state_c, _ = self.decoder(
                  [next_decoder_input, encoder_output_seq, (state_h, state_c)])

              # The loss is now accumulated through the whole batch
              loss += self.loss(decoder_target_seq[:, i], logits)

      # Update the parameters and the optimizer
      variables = encoder.trainable_variables + decoder.trainable_variables
      gradients = tape.gradient(loss, variables)
      self.optimizer.apply_gradients(zip(gradients, variables))

      return {'loss': loss / decoder_target_seq.shape[1]}

In [79]:
encoder = Encoder(source_vocab_size, embedding_dim, hidden_dim)
decoder = Decoder(target_vocab_size, embedding_dim, hidden_dim)
optimizer = tf.keras.optimizers.Adam()

translator_trainer = TranslatorTrainer(encoder, decoder)
translator_trainer.compile(optimizer=optimizer, loss=loss_func)

In [80]:
epochs = 75
translator_trainer.fit(dataset, epochs=epochs)

Epoch 1/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 258ms/step - loss: 2.1550
Epoch 2/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 260ms/step - loss: 1.8967
Epoch 3/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 253ms/step - loss: 1.7858
Epoch 4/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 258ms/step - loss: 1.7040
Epoch 5/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 254ms/step - loss: 1.6401
Epoch 6/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 256ms/step - loss: 1.5792
Epoch 7/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 253ms/step - loss: 1.5204
Epoch 8/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 258ms/step - loss: 1.4674
Epoch 9/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 288ms/step - loss: 1.4230
Epoch 10/75
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x209a3beff10>

In [81]:
encoder.save_weights('attention_encoder.weights.h5')
decoder.save_weights('attention_decoder.weights.h5')

# !zip -r ./attention_weights.zip ./attention_encoder.weights.h5 ./attention_decoder.weights.h5

# files.download('./attention_weights.zip')

In [82]:
def translate_with_attention(padded_source, encoder, target_tokenizer, decoder, max_translated_len=30):
    # Prediction logic
    encoder_output, state_h, state_c = encoder.predict(padded_source)

    current_word = '<sos>'
    decoded_sentence = []

    while len(decoded_sentence) < max_translated_len:
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = target_tokenizer.word_index[current_word]

        logits, state_h, state_c, _ = decoder.predict([target_seq, encoder_output, (state_h, state_c)])
        current_token_index = np.argmax(logits[0])
        current_word = target_tokenizer.index_word.get(current_token_index, '')

        if current_word == '<eos>':
            break

        decoded_sentence.append(current_word)

    return ' '.join(decoded_sentence)


In [83]:
# Example definition of max_encoder_seq_length
max_encoder_seq_length = 30  # Adjust this value based on your model's configuration

# Now call translate_sentences with the required parameters
shorter_translations_w_attention = pd.DataFrame(translate_sentences(
    sentences,
    translate_with_attention,
    source_tokenizer,
    encoder,
    target_tokenizer,
    decoder,  # Ensure decoder is passed correctly
    max_encoder_seq_length  # Pass the max length
))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [84]:
max_decoder_seq_length = 30

shorter_translations_w_attention = pd.DataFrame(translate_sentences(sentences, translate_with_attention,
                                                                    source_tokenizer, encoder,
                                                                    target_tokenizer, decoder,
                                                                    max_decoder_seq_length))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39

In [85]:
shorter_translations_w_attention.rename(columns={'Translation': 'Translation W/ Attention'}, inplace=True)
shorter_translations_w_attention['Translation W/O Attention'] = translations_no_attention['Translation']
shorter_translations_w_attention

Unnamed: 0,Tokenized Original,Reference,Translation W/ Attention,Translation W/O Attention
0,"[627, 14, 9, 1315, 171, 2]",it certainly is.,it certainly is .,it was seven feet tall .
1,"[4, 90, 9, 1857, 19, 7, 615, 87, 2]",how did you get it?,how did you get it ?,how did you get it ?
2,"[13, 14, 42, 277, 212, 15, 401, 3]",i listen to various types of music.,i listen to various types of music .,i like not to .
3,"[24, 601, 122, 24, 366, 120, 2]",maybe you should run for office.,maybe you should run for office .,maybe you should see you
4,"[153, 5, 138, 89, 6, 38, 212, 15, 5, 2]",don't mention it.,don't mention it .,"i know , you should really"
5,"[20, 5, 115, 1375, 54, 7, 173, 633, 759, 1374, 3]","oh, yes. we all took lots of pictures.","oh , yes . we all took lots of pictures .","oh , yes . we all took lots"
6,"[51, 6, 122, 20, 5, 56, 10, 3]",i got it from macy's.,i got it from macy's .,i don't know which one i
7,"[30, 2, 38, 43, 22, 447, 7, 194, 916, 19, 937, 2]",a great burger and great service.,a great burger and great service .,a great burger and great
8,"[9, 48, 631, 414, 1351, 95, 123, 2]","well, if you bring your receipt, maybe they'll...","well , if you bring your receipt , maybe they'...","well , if you bring your"
9,"[9, 87, 6, 23, 7, 165, 29, 338, 35, 226, 2]","well, that's cool.","well , that's cool .","well , that's cool ."


While attention mechanisms have significantly improved natural language processing tasks, their effectiveness can be limited when trained on smaller datasets. For building a truly functional and reliable chatbot, it's essential to fine-tune a model that has been pre-trained on a vast amount of diverse data. Models like BERT, GPT, or LLaMA offer robust understanding and generation capabilities, making them ideal candidates for this purpose. By leveraging their extensive training, these models can better comprehend a wide range of inputs and contexts, resulting in more accurate and relevant responses in conversational applications.

In [86]:
pairs = train.copy()
pairs.sort(key=lambda s: len(s))
longer_sentences = pairs[-10:]
longer_sentences

['he sank a 20-foot putt on the last hole to win by one stroke!\the sank a 25-footer last year at the same tournament to win by one stroke.',
 "you're not going to examine it before we order dinner?\tno, i'd rather not find out that it's dirty, because i'm pretty hungry right now.",
 "i like the one where the fireman tells us why we should vote yes.\tdon't believe him! whatever the tv ads tell you, the opposite is true.",
 "i couldn't keep from laughing throughout the whole movie.\ti was laughing hysterically the whole time; my stomach muscles hurt afterwards.",
 "you don't need a good nose for thatÂ—cigarettes stink.\tbut when i sneak just one cigarette in the morning, she can smell it that evening!",
 'bluedog123 is just the street. you have to give me the city, state, and zip code.\toh, i get it. my email address is bluedog123@yahoo.com.',
 "considering that it's over ninety degrees outside, that would be weird.\texactly, it wouldn't be nice if it started raining. it's too hot.",
 '

In [88]:
def chatbot_response(input_sentence, source_tokenizer, encoder, target_tokenizer, decoder, max_encoder_seq_length=32):
  input_sequence = source_tokenizer.texts_to_sequences([input_sentence])
  padded_input_sequence = pad_sequences(input_sequence, maxlen=max_encoder_seq_length, padding='post')
  decoded_sentence = translate_with_attention(padded_input_sequence, encoder, target_tokenizer, decoder)

  return decoded_sentence

input_sentence = input("Enter your sentence: ")
response = chatbot_response(input_sentence, source_tokenizer, encoder, target_tokenizer, decoder)
print('Input sentence:', input_sentence)
print('Chatbot response:', response)

Enter your sentence: good luck with school
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Input sentence: good luck with school
Chatbot response: thank you very much .
