In [None]:
import tensorflow as tf
import os
import time
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
def load_doc(filename, encoding=None):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: Path of the file to read.
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default, as before.

    Returns:
        The complete file contents as a single string.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

def load_data_pairs(filename):
    """Load conversation pairs from a text file.

    Each non-blank line of the file is split on the literal separator
    ``' +++$+++ '``; the resulting list of parts is one entry of the result.

    Args:
        filename: Path of the conversations file, read through ``load_doc``.

    Returns:
        A list with one list of string parts per non-blank line.
    """
    data_pairs = []
    for line in load_doc(filename).split('\n'):
        # A trailing newline (or any blank line) would otherwise produce a
        # one-element pair that breaks code indexing pair[1] later on.
        if not line.strip():
            continue
        data_pairs.append(line.split(' +++$+++ '))
    return data_pairs

def vocab_frequency(data_pairs):
    """Count occurrences of every whitespace-separated word in the pairs.

    Args:
        data_pairs: Iterable of pairs, each an iterable of sentence strings.

    Returns:
        A dict mapping each word to the number of times it appears.
    """
    counts = {}
    all_words = (
        word
        for pair in data_pairs
        for conv_tile in pair
        for word in conv_tile.split()
    )
    for word in all_words:
        counts[word] = counts.get(word, 0) + 1
    return counts

def vocab(vocab_dict, threshold=2):
    """Select the words that occur more often than ``threshold``.

    Args:
        vocab_dict: Mapping of word -> occurrence count.
        threshold: Words are kept only when their count is strictly greater
            than this value. Defaults to 2, the previously hard-coded cutoff.

    Returns:
        A list of the kept words, in the iteration order of ``vocab_dict``.
    """
    return [word for word, count in vocab_dict.items() if count > threshold]

def clean(data_pairs, vocab):
  """Blank out every word that is not part of the vocabulary.

  Sentences are split on single spaces; each out-of-vocabulary word is
  replaced by an empty string and the pieces are joined with single spaces
  again, so a removed word leaves its surrounding spaces behind.

  Args:
    data_pairs: List of mutable pairs (lists) of sentence strings. The
      sentences are replaced in place.
    vocab: Iterable of words to keep.

  Returns:
    The same ``data_pairs`` object, after modification.
  """
  # Build the lookup once: testing membership in a list for every word of
  # every sentence is linear in the vocabulary size.
  known_words = set(vocab)
  for pair in data_pairs:
    for i, sentence in enumerate(pair):
      pair[i] = ' '.join(
          word if word in known_words else ''
          for word in sentence.split(' ')
      )
  return data_pairs

def tokenize(words, conv_tile):
    """Fit a tokenizer on the vocabulary and encode the given sentences.

    Args:
        words: Texts the tokenizer is fitted on (the vocabulary words).
        conv_tile: Sentences to convert to post-padded id sequences.

    Returns:
        A ``(tokenizer, tensor)`` tuple: the fitted tokenizer and the padded
        sequences produced by ``pad_sequences``.
    """
    # filters='' disables the tokenizer's character filtering.
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(words)

    sequences = tokenizer.texts_to_sequences(conv_tile)
    return tokenizer, pad_sequences(sequences, padding='post')

def dataset_for_training(words, data_pairs):
    """Turn conversation pairs into input/target tensors.

    The first element of every pair is treated as the question and the
    second as the reply; each side is encoded with its own tokenizer.

    Args:
        words: Vocabulary passed to ``tokenize`` for fitting.
        data_pairs: Sequence of pairs holding at least two sentences each.

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``.
    """
    questions = [pair[0] for pair in data_pairs]
    replies = [pair[1] for pair in data_pairs]

    input_tokenizer, input_tensor = tokenize(words, questions)
    target_tokenizer, target_tensor = tokenize(words, replies)
    return input_tensor, target_tensor, input_tokenizer, target_tokenizer

def max_length(data_pairs):
    """Return the largest number of words found in any single sentence.

    The previous implementation could never run: it called the non-existent
    ``list.appen`` and then ``split`` on a list. The length is measured in
    whitespace-separated words, which is the reading assumed here for the
    intended "max length" of a sentence.

    Args:
        data_pairs: Iterable of pairs, each an iterable of sentence strings.

    Returns:
        The maximum word count over all sentences, or 0 when there are none.
    """
    return max(
        (len(conv_tile.split()) for pair in data_pairs for conv_tile in pair),
        default=0,
    )

In [None]:
data_pairs = load_data_pairs('conversations.txt')
print('there are ' + str(len(data_pairs)) + ' conversation pairs')

# Keep the counts under their own name so the `vocab_frequency` function is
# not overwritten by its result (which made this cell fail when re-run).
word_counts = vocab_frequency(data_pairs)
# NOTE: this still rebinds `vocab` from the helper function to the word list,
# because later cells read the list through this name.
vocab = vocab(word_counts)
# One more than the number of kept words.
vocab_size = len(vocab) + 1
print('there are ' + str(vocab_size) + ' words in vocab')

data_pairs = clean(data_pairs, vocab)
print(data_pairs[:20])

input_tensor, target_tensor, input_tokenizer, target_tokenizer = dataset_for_training(vocab, data_pairs)

# A fixed seed keeps the train/validation split identical between runs, so
# validation examples do not drift into training after a kernel restart.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.3, random_state=42
)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

buffer_size = len(input_tensor_train)
batch_size = 64
# Number of full batches per epoch. This used to be the number of samples,
# which made the reported epoch loss batch_size times too small.
steps_per_epoch = len(input_tensor_train) // batch_size
embedding_dim = 360
units = 1535

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(buffer_size)
# drop_remainder=True discards the final partial batch, so every batch has
# exactly batch_size rows.
dataset = dataset.batch(batch_size, drop_remainder=True)

there are 167413 conversation pairs
there are 20482 words in vocab
[['<start> well i thought we would start with  if that is okay with you <end>', '<start> not the hacking and gagging and spitting part please <end>'], ['<start> not the hacking and gagging and spitting part please <end>', '<start> okay then how about we try out some french cuisine saturday night <end>'], ['<start> you are asking me out that is so cute that is your name again <end>', '<start> forget it <end>'], ['<start> no no it is my fault we did not have a proper introduction <end>', '<start> cameron <end>'], ['<start> gosh if only we could find kat a boyfriend <end>', '<start> let me see what i can do <end>'], ['<start>  ma  this is my head <end>', '<start> right see you are ready for the quiz <end>'], ['<start> that is because it is such a nice one <end>', '<start> forget french <end>'], ['<start> how is our little find the wench a date plan  <end>', '<start> well theres someone i think might be <end>'], ['<start> t

In [None]:
class Encoder(tf.keras.Model):
  """Embeds token ids and runs them through a bidirectional GRU."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
    super().__init__()
    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    recurrent_layer = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.gru = tf.keras.layers.Bidirectional(recurrent_layer)

  def call(self, x, hidden):
    """Encode a batch of token ids.

    Args:
      x: Token ids fed to the embedding layer.
      hidden: Initial states handed to the bidirectional GRU.

    Returns:
      ``(output, state)`` where ``state`` is the forward and backward final
      states concatenated along axis 1.
    """
    embedded = self.embedding(x)
    output, forward_h, backward_h = self.gru(embedded, initial_state=hidden)
    return output, tf.concat([forward_h, backward_h], axis=1)

  def init_hidden_state(self):
    """Return two zero tensors of shape (batch_size, enc_units)."""
    state_shape = (self.batch_size, self.enc_units)
    return [tf.zeros(state_shape) for _ in range(2)]

class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention built from three dense projections."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Score ``values`` against ``query`` and build the context vector.

    Args:
      query: Tensor that receives an extra axis at position 1 before scoring.
      values: Tensor that is weighted and summed over axis 1.

    Returns:
      ``(attention_weights, context_vector)``.
    """
    # The inserted axis lets the projected query broadcast against values.
    expanded_query = tf.expand_dims(query, 1)
    projected = self.W1(expanded_query) + self.W2(values)
    score = self.V(tf.nn.tanh(projected))

    # Normalise the scores over axis 1, then take the weighted sum.
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return attention_weights, context_vector

class Decoder(tf.keras.Model):
  """Attention-based decoder producing vocabulary logits."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
    super().__init__()
    self.batch_size = batch_size
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    recurrent_layer = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.gru = tf.keras.layers.Bidirectional(recurrent_layer)
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, dec_input, dec_hidden, enc_output):
    """Run the decoder on ``dec_input``.

    Args:
      dec_input: Token ids fed to the embedding layer.
      dec_hidden: Query for the attention layer.
      enc_output: Encoder outputs the attention layer attends over.

    Returns:
      ``(logits, state)``: the dense-layer output and the concatenated
      forward/backward GRU states.
    """
    # The attention weights are computed but not returned.
    attention_weights, context_vector = self.attention(dec_hidden, enc_output)

    embedded = self.embedding(dec_input)
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # dec_hidden only enters through attention; the GRU gets no initial_state.
    output, forward_h, backward_h = self.gru(gru_input)
    state = tf.concat([forward_h, backward_h], axis=1)

    # Merge the leading axes so each row holds one feature vector.
    flat_output = tf.reshape(output, (-1, output.shape[2]))
    return self.fc(flat_output), state

# Build the two halves of the seq2seq model; both are sized from the same
# vocab_size, embedding_dim, units and batch_size values.
encoder = Encoder(vocab_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_size, embedding_dim, units, batch_size)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so it can be masked below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring id 0.

  Positions where ``real`` equals 0 contribute a loss of zero; the mean is
  still taken over all positions.
  """
  per_element_loss = loss_object(real, pred)
  # 1.0 where the target is a non-zero id, 0.0 where it is 0.
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_element_loss.dtype)
  return tf.reduce_mean(per_element_loss * keep)

In [None]:
# Directory that receives the training checkpoints.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab - confirm before running elsewhere.
checkpoint_dir = '/content/drive/MyDrive/training_checkpoints'
# Every checkpoint file name starts with <checkpoint_dir>/ckpt.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Bundle the optimizer and both models into a single checkpoint object.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
# Disabled alternative: a manager limited to the three most recent checkpoints.
#checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a single batch.

  Relies on the module-level encoder, decoder, optimizer, target_tokenizer,
  batch_size and loss_function.

  Args:
    inp: Batch of input sequences passed to the encoder.
    targ: Batch of target sequences; column t is the target at step t.
    enc_hidden: Initial hidden state passed to the encoder.

  Returns:
    The summed step loss divided by the number of target columns.
  """
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden
    # First decoder input: one '<start>' id per batch row, as a column.
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * batch_size, 1)
    # Column 0 is skipped as a target; the loop predicts columns 1..end.
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      # Teacher forcing: the true token at step t is the next decoder input.
      dec_input = tf.expand_dims(targ[:, t], 1)
  # Reported loss is averaged over the target columns; gradients below use
  # the un-averaged sum.
  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
epochs = 10

for epoch in range(epochs):
  start = time.time()

  enc_hidden = encoder.init_hidden_state()
  total_loss = 0
  # Count the batches actually processed so the epoch average is correct
  # whatever value steps_per_epoch holds.
  num_batches = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    num_batches += 1

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # Save a checkpoint at the end of every epoch.
  checkpoint.save(file_prefix = checkpoint_prefix)

  # max(..., 1) avoids a division by zero when the dataset yields no batch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / max(num_batches, 1)))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.5889
Epoch 1 Batch 100 Loss 2.2637
Epoch 1 Batch 200 Loss 1.9607
Epoch 1 Batch 300 Loss 2.0754
Epoch 1 Batch 400 Loss 2.1382
Epoch 1 Batch 500 Loss 1.8533
Epoch 1 Batch 600 Loss 1.8045
Epoch 1 Batch 700 Loss 2.0734
Epoch 1 Batch 800 Loss 2.1117
Epoch 1 Batch 900 Loss 1.5241
Epoch 1 Batch 1000 Loss 1.9126
Epoch 1 Batch 1100 Loss 2.1467
Epoch 1 Batch 1200 Loss 1.9532
Epoch 1 Batch 1300 Loss 1.6134
Epoch 1 Batch 1400 Loss 1.6871
Epoch 1 Batch 1500 Loss 2.0645
Epoch 1 Batch 1600 Loss 1.8147
Epoch 1 Batch 1700 Loss 1.6418
Epoch 1 Batch 1800 Loss 1.4214
Epoch 1 Loss 0.0299
Time taken for 1 epoch 959.4093389511108 sec

Epoch 2 Batch 0 Loss 1.5764
Epoch 2 Batch 100 Loss 1.4888
Epoch 2 Batch 200 Loss 1.6988
Epoch 2 Batch 300 Loss 1.6182
Epoch 2 Batch 400 Loss 1.7910
Epoch 2 Batch 500 Loss 1.7959
Epoch 2 Batch 600 Loss 1.7635
Epoch 2 Batch 700 Loss 1.8053
Epoch 2 Batch 800 Loss 1.7149
Epoch 2 Batch 900 Loss 1.9843
Epoch 2 Batch 1000 Loss 1.8439
Epoch 2 Batch 1100 Loss 2.12

In [None]:
import string
import re

In [None]:
def clean_sentence(sentence):
    """Normalise a raw sentence into the '<start> ... <end>' training format.

    Steps: lower-case, expand common English contractions, strip punctuation,
    drop tokens that are not purely alphabetic, wrap the result in
    '<start>' / '<end>' and blank out every token missing from the
    module-level ``vocab`` (a blanked token leaves its spaces behind).

    "what's" now expands to "what is"; it was previously mapped to "that is".
    NOTE(review): the sample of conversations.txt suggests the corpus was
    built with the old mapping - consider regenerating it.

    Args:
        sentence: Raw user text.

    Returns:
        The cleaned sentence string.
    """
    # Applied in order; specific contractions come before generic suffixes.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
    )

    sentence = sentence.lower()
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)

    # Remove any punctuation left over, then keep alphabetic words only.
    table = str.maketrans('', '', string.punctuation)
    words = [word.translate(table) for word in sentence.split()]
    words = [word for word in words if word.isalpha()]

    tokens = ['<start>'] + words + ['<end>']
    # Out-of-vocabulary tokens become empty strings, as in training.
    return ' '.join(token if token in vocab else '' for token in tokens)

In [None]:
# Maximum number of input tokens and of decoding steps used by evaluate().
max_length = 22
def evaluate(sentence):
  """Generate a reply for ``sentence`` with greedy decoding.

  Relies on the module-level encoder, decoder, tokenizers, units and
  max_length.

  Args:
    sentence: Raw user text; it is normalised with ``clean_sentence``.

  Returns:
    ``(result, sentence)``: the generated reply (words separated and followed
    by a space) and the cleaned input sentence.
  """
  sentence = clean_sentence(sentence)

  # clean_sentence blanks out-of-vocabulary words, which leaves empty tokens;
  # skip those (and any unknown word) instead of raising KeyError.
  word_index = input_tokenizer.word_index
  inputs = [word_index[word] for word in sentence.split() if word in word_index]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # The encoder GRU is bidirectional and needs one initial state per
  # direction; the batch holds a single sentence here.
  hidden = [tf.zeros((1, units)) for _ in range(2)]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

  for t in range(max_length):
    predictions, dec_hidden = decoder(dec_input, dec_hidden,  enc_out)

    predicted_id = int(tf.argmax(predictions[0]).numpy())

    # Id 0 (padding) has no entry in index_word; emit nothing for it.
    predicted_word = target_tokenizer.index_word.get(predicted_id, '')
    if predicted_word:
      result += predicted_word + ' '

    if predicted_word == '<end>':
      return result, sentence

    # Feed the prediction back in as the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [None]:
def translate(sentence):
    """Print the cleaned input sentence and the reply generated for it."""
    reply, cleaned = evaluate(sentence)
    print(f'Input: {cleaned}')
    print(f'Reply: {reply}')

# Try the trained model on a single example prompt.
translate('you might need more training')

Input: <start> you might need more training <end>
Reply: i am not i need to be a writer i am eccentric <end> 
