In [None]:
#Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Original translation code copyright 2019 The TensorFlow Authors.

Chatbot modifications copyright 2020 Jennifer Handsel


# Chatbot with attention

I repurposed TensorFlow's seq2seq translation notebook in order to make a chatbot model.

https://www.tensorflow.org/tutorials/text/nmt_with_attention

**Changes**

*   Conversation needed a much deeper network than translation. I made the encoder a four-layer GRU instead of a single layer.
*   Added attentional feeding, as described in Luong 2015 (Effective Approaches to Attention-based Neural Machine Translation).
*   Added hidden-state passing in the decoder. In the original implementation, the output was calculated solely from the previous output and the context vector
*   Initialize embedding layer with pre-trained GLOVE word vectors. Experimented with having these weights frozen, but the model's fit was better when these weights were fine-tuned.
*   Shared embeddings for input and output
*   Implemented a beam search for finding the most probable output sentence, instead of the original greedy approach that gave less satisfactory results
*   Added live plots for training, to show loss function and gradient norm (to detect exploding gradients). I experimented with clipping the gradient norm, but I found the training loss was better without this modification.

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from IPython.display import clear_output
import collections
%matplotlib inline

import random

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd

import nltk
nltk.download('punkt')

from tqdm import tqdm

import re
import numpy as np
import os
import io
import time

In [None]:
# Environment sanity check: report the TensorFlow version and whether eager execution is active
print(tf.__version__)
print(tf.executing_eagerly())

# Plotting functions

These functions allow live plotting of the loss function

In [None]:
def live_plot(data_dict, figsize=(7,5), title=''):
    """Redraw a single line chart in place of the cell's previous output.

    Args:
        data_dict: mapping of legend label -> sequence of values to plot.
        figsize: matplotlib figure size in inches.
        title: text shown above the chart.

    Call it repeatedly while the sequences grow to get a plot that updates
    each time it is fed new data.
    """
    # wait=True delays the clear until the new figure is ready to be shown
    clear_output(wait=True)
    _, axes = plt.subplots(figsize=figsize)
    for series_name, series in data_dict.items():
        axes.plot(series, label=series_name)
    axes.set_title(title)
    axes.grid(True)
    axes.set_xlabel('epoch')
    # The curves grow to the right, so keep the legend on the left
    axes.legend(loc='center left')
    plt.show()

# fizz = collections.defaultdict(list)
# for i in range(10):
#     fizz['foo'].append(np.random.random())
#     fizz['bar'].append(np.random.random())
#     fizz['baz'].append(np.random.random())
#     live_plot(fizz)

In [None]:
def live_plot_double(loss_dict, throughput_dict, figsize=(13,5),
                                            xlabels=('iteration', 'epoch')):
    """Redraw two side-by-side line charts in place of the previous output.

    Args:
        loss_dict: mapping of legend label -> sequence, drawn in the left
            panel on a logarithmic y axis.
        throughput_dict: mapping of legend label -> sequence, drawn in the
            right panel on a linear y axis.
        figsize: size in inches of the whole two-panel figure.
        xlabels: pair of x-axis labels, (left panel, right panel).
    """
    # wait=True delays the clear until the new figure is ready to be shown
    clear_output(wait=True)

    # Honour the caller's figsize (it used to be ignored in favour of a
    # hard-coded (13, 5))
    plt.figure(figsize=figsize)

    panels = (
        (loss_dict, xlabels[0], True),
        (throughput_dict, xlabels[1], False),
    )
    for position, (series_by_label, xlabel, log_scale) in enumerate(panels, start=1):
        ax = plt.subplot(1, 2, position)
        for label, data in series_by_label.items():
            ax.plot(data, label=label)
        ax.grid(True)
        ax.set_xlabel(xlabel)
        # The curves grow to the right, so keep the legend on the left
        ax.legend(loc='center left')
        if log_scale:
            ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

# fizz = collections.defaultdict(list)
# buzz = collections.defaultdict(list)
# for i in range(10):
#     fizz['foo'].append(np.random.random())
#     fizz['bar'].append(np.random.random())
#     buzz['baz'].append(np.random.random())
#     buzz['foobar'].append(np.random.random())
#     live_plot_double(fizz, buzz)

# Load dataset

The model is trained with Cornell and OpenSubs movie subtitles datasets. We can extract sentence pairs from this dataset:

```
'Now what did you lose?' 'My son... my mind...'
```

Here are the steps we'll take to prepare the data:

1. Clean the sentences by removing special characters.
2. Add a *start* and *end* token to each reply.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.

In [None]:
def preprocess_sentence(w):
    """Normalise a raw sentence into a space-separated token string.

    Steps: lower-case and strip, replace every run of characters other than
    letters and ? . ! , ' with a single space, collapse whitespace, then run
    nltk's word tokenizer and re-join the tokens with single spaces.

    Args:
        w: raw sentence text.

    Returns:
        The cleaned sentence as one string of space-separated tokens.
    """
    w = w.lower().strip()

    # Replace everything with a space except letters and ? . ! , '
    # Very needed, as dialogues still contain '-'
    w = re.sub(r"[^a-zA-Z?.!,']+", " ", w)

    # Replace multiple spaces with a single one.
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    w = re.sub(r'\s+', ' ', w)

    # Converts to list of tokens as expected for GLOVE vectors
    # eg don't -> do n't; the dog's -> the dog 's
    tokens = nltk.word_tokenize(w)

    return ' '.join(tokens)

In [None]:
def load_sentences(data_path, max_len=25, max_sentences=30000):
    """Read a tab-separated file of (sentence, reply) rows and clean both columns.

    Args:
        data_path: path to a headerless TSV; column 0 is the prompt and
            column 1 the reply.
        max_len: maximum number of space-separated tokens allowed in either
            side of a pair; longer pairs are dropped.
        max_sentences: maximum number of pairs to return.

    Returns:
        (sentA, sentB): two parallel lists. sentA holds the cleaned prompts,
        sentB the cleaned replies wrapped in '<start> ... <end>'.
    """
    print('Loading sentences')
    rows = pd.read_csv(data_path, header=None, sep='\t', dtype=str,
                       na_filter=False).values.tolist()

    # Speed hack: only the first 2*max_sentences rows are cleaned, so fewer
    # than max_sentences pairs may survive the length filter.
    # Make sure the dataset is pre-shuffled!
    cleaned = []
    for row in tqdm(rows[:max_sentences*2]):
        cleaned.append((preprocess_sentence(row[0]), preprocess_sentence(row[1])))

    def short_enough(text):
        return len(text.split(' ')) <= max_len

    # Drop pairs where either side is too long, then cap the count
    kept = [(prompt, reply) for prompt, reply in cleaned
            if short_enough(prompt) and short_enough(reply)][0:max_sentences]

    sentA = [prompt for prompt, _ in kept]
    sentB = ['<start> ' + reply + ' <end>' for _, reply in kept]

    print('Got %d sentences pairs' % (len(sentA)))

    return sentA, sentB

In [None]:
def tokenize(input_list, target_list, max_words=25000):
    """Fit one shared tokenizer on both sides and encode them as padded id arrays.

    Args:
        input_list: list of cleaned input sentences.
        target_list: list of cleaned target sentences.
        max_words: value passed to the Keras Tokenizer as num_words.

    Returns:
        (input_tensor, target_tensor, lang_tokenizer): the two integer arrays,
        each zero-padded at the end to its own longest sentence, and the
        fitted tokenizer.
    """
    # filters='' filters no characters; unknown words map to '<unk>'
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=max_words, filters='', oov_token='<unk>')
    lang_tokenizer.fit_on_texts(input_list + target_list)

    def to_padded_ids(texts):
        # Sentences -> lists of ints -> uniform-length array padded with zeros
        id_lists = lang_tokenizer.texts_to_sequences(texts)
        return tf.keras.preprocessing.sequence.pad_sequences(id_lists, padding='post')

    input_tensor = to_padded_ids(input_list)
    target_tensor = to_padded_ids(target_list)

    return input_tensor, target_tensor, lang_tokenizer

In [None]:
def load_dataset(dataset_path, max_len=30, max_sent=30000, max_words=25000):
    """Load, clean and tokenize the conversation pairs, printing summary statistics.

    Args:
        dataset_path: path of the TSV file handed to load_sentences.
        max_len: per-sentence token limit handed to load_sentences.
        max_sent: maximum number of pairs handed to load_sentences.
        max_words: vocabulary cap handed to tokenize.

    Returns:
        (input_tensor, target_tensor, lang_tokenizer) exactly as produced by
        tokenize.
    """
    # Cleaned (input, reply) pairs
    sentA, sentB = load_sentences(dataset_path, max_len=max_len, max_sentences=max_sent)

    input_tensor, target_tensor, lang_tokenizer = tokenize(sentA, sentB, max_words=max_words)

    # Vital statistics: padded row lengths and the size of the fitted vocabulary
    longest_target = max(len(row) for row in target_tensor)
    longest_input = max(len(row) for row in input_tensor)

    print('Max length of input: %d' % longest_input)
    print('Max length of target (including <start> and <end>): %d' % longest_target)

    print('Number of items in vocab: %d' % len(lang_tokenizer.word_index))

    return input_tensor, target_tensor, lang_tokenizer

In [None]:
# Data should be tab-separated, with column one containing a sentence,
# and column two containing the response
# NOTE: placeholder path - point it at your own TSV file before running
dataset_path = '/path/to/dataset.tsv'

# Pairs where either side exceeds this many tokens are dropped by load_sentences
max_len = 30
# Upper bound on the number of sentence pairs kept
max_sent = 3000000
# Vocabulary cap handed to the Keras tokenizer as num_words
max_words = 50000
input_tensor, target_tensor, lang = load_dataset(dataset_path, max_len, max_sent, max_words)

### Example

In [None]:
# Encode a sample reply; 'fufufu' is presumably out-of-vocabulary and should
# therefore come back as the '<unk>' id - confirm against the printed output
example_tensor = lang.texts_to_sequences(['<start> how are you gentlemen fufufu <end>'])
print(example_tensor)

In [None]:
# Map the ids back to words to check the round trip through the tokenizer
print([lang.index_word[x] for x in example_tensor[0]])

# Create training set and validation set

Here we split the input conversations 80/20 into a training set and a validation set. The sets are shuffled to avoid bias within a batch.

In [None]:
# Creating training and validation sets using an 80-20 split.
# The split is seeded so that it is identical after a kernel restart: otherwise a
# model restored from a checkpoint could be validated on examples it was trained on.
SPLIT_SEED = 42
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
    train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=SPLIT_SEED)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

### Examples

In [None]:
def convert(lang, tensor):
    """Print every non-padding token id in `tensor` next to the word it encodes.

    Args:
        lang: tokenizer-like object exposing an `index_word` id -> word mapping.
        tensor: iterable of integer token ids for one sentence.
    """
    for token_id in tensor:
        if token_id == 0:
            # id 0 is the zero padding appended by pad_sequences
            continue
        print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
# Show one training pair (row 25) as id -> word listings, skipping padding
print ("Input Sentence; index to word mapping")
convert(lang, input_tensor_train[25])
print ()
print ("Target Sentence; index to word mapping")
convert(lang, target_tensor_train[25])

In [None]:
# Compare the size of the fitted word index with the num_words cap given to the tokenizer
print('Number of words found: %d' % len(lang.index_word))
print('Limiting number of words to: %d' % lang.num_words)

## Create a tf.data training set

In [None]:
# Number of items in training set
BUFFER_SIZE = len(input_tensor_train)
# Batching is necessary to prevent GPU running out of memory - so use in both training and validation
# Can evaluate batch size by measuring training sample throughput per unit time
BATCH_SIZE = 512
# Number of full batches per epoch (the trailing partial batch is dropped below)
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE

#vocab_size = len(lang.word_index)+1
# Size of the embedding table / output layer, derived from the tokenizer's num_words cap
vocab_size = lang.num_words+1

# Shuffle over the whole training set; drop_remainder=True keeps every batch at exactly
# BATCH_SIZE rows, which train_step relies on when it builds the <start> column
train_set = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
train_set = train_set.batch(BATCH_SIZE, drop_remainder=True)

### Examples

In [None]:
# Bare expression: the notebook displays the dataset's repr
train_set

In [None]:
# Pull one batch and display the shapes of its input and target tensors.
# example_input_batch is reused further down to smoke-test the encoder.
example_input_batch, example_target_batch = next(iter(train_set))
example_input_batch.shape, example_target_batch.shape
#print(example_input_batch)

## Create a tf.data validation set

In [None]:
# VAL_BUFFER SIZE is number of items in validation set
VAL_BUFFER_SIZE = len(input_tensor_val)
# Number of full batches per validation pass (the trailing partial batch is dropped below)
val_steps_per_epoch = VAL_BUFFER_SIZE//BATCH_SIZE

# Same batching as the training set: validation_step also assumes exactly BATCH_SIZE rows
val_set = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(VAL_BUFFER_SIZE)
val_set = val_set.batch(BATCH_SIZE, drop_remainder=True)

### Examples

In [None]:
# Pull one validation batch and display the shapes of its input and target tensors
example_val_input_batch, example_val_target_batch = next(iter(val_set))
example_val_input_batch.shape, example_val_target_batch.shape
#print(example_val_input_batch)

# Load Word Vectors

To improve model performance, the embedding layer will use pretrained GLOVE vectors for its weights.

Best practice is to tokenize the texts in the same way that the glove vectors were tokenized - lower case + nltk tokenizing

In [None]:
GLOVE_FILE = 'glove.6B.300d.txt'

# Parse the GloVe text file: each line is a word followed by its vector components.
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform's default codec.
embeddings_dict = {}
with open(GLOVE_FILE, encoding='utf-8') as glove_file:
    for line in tqdm(glove_file):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = coefs

# Vector length is read off a word that is expected to be present in the file
embedding_dim = len(embeddings_dict['the'])
print('Found %s word vectors, length %d.' % (len(embeddings_dict), embedding_dim))

In [None]:
# Build the (vocab_size, embedding_dim) initial weight matrix for the embedding layer.
# Row 0 (padding) and any row never assigned below stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in lang.word_index.items():
    if i >= vocab_size:
        # Beyond the vocabulary cap: no row exists for this word
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # words not found in embedding index are initialized randomly in [-0.5, 0.5)
        embedding_matrix[i] = np.asarray(np.random.random_sample(embedding_dim)-0.5, dtype='float32')

# Initialize Shared Word Embeddings

The embedding layer is shared between the encoder and decoder. This is more efficient and results in a better model.

In [None]:
# One Embedding layer, initialised from embedding_matrix and left trainable.
# Encoder.call and Decoder.call both look tokens up through this module-level object.
# NOTE(review): it is a global rather than an attribute of either model, so it is presumably
# not included in encoder/decoder.trainable_variables - confirm that train_step optimises it.
shared_embedding = tf.keras.layers.Embedding(vocab_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)

# Define Encoder

Simple 4-layer GRU

In [None]:
class Encoder(tf.keras.Model):
    """Four stacked GRU layers on top of the module-level `shared_embedding`.

    Args:
        vocab_size: unused here (the embedding lives in `shared_embedding`);
            kept so the constructor signature matches the Decoder's.
        embedding_dim: unused here, kept for the same reason.
        enc_units: hidden size of every GRU layer.
        batch_sz: batch size used by the initialize_hidden_state* helpers.
        dropout_prob: input dropout rate applied inside every GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, dropout_prob):
        # Initialize the base class
        super().__init__()
        # Initialize variables specific to this class
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Attribute names gru1..gru4 are kept: saved checkpoints are keyed on them
        self.gru1 = self._make_gru(dropout_prob)
        self.gru2 = self._make_gru(dropout_prob)
        self.gru3 = self._make_gru(dropout_prob)
        self.gru4 = self._make_gru(dropout_prob)

    def _make_gru(self, dropout_prob):
        """Build one encoder GRU layer; all four layers share this configuration."""
        return tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='orthogonal',
                                   dropout=dropout_prob,
                                   go_backwards=True)

    def call(self, x, training=True):
        """Encode a batch of token ids.

        Args:
            x: batch of token ids, looked up in `shared_embedding`.
            training: forwarded to every GRU layer (controls dropout).

        Returns:
            (output, state): the per-step output sequence of the last GRU
            layer, and the element-wise sum of the final states of all four
            layers.
        """
        x_emb = shared_embedding(x)
        # We get matrix of outputs from each step, as well as separate final state
        output, state1 = self.gru1(x_emb, training=training)
        output, state2 = self.gru2(output, training=training)
        output, state3 = self.gru3(output, training=training)
        output, state4 = self.gru4(output, training=training)
        # Summing (rather than concatenating) keeps the state at enc_units wide
        state = tf.math.accumulate_n([state1, state2, state3, state4])
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero (batch_sz, enc_units) state tensor."""
        return tf.zeros((self.batch_sz, self.enc_units))

    # For random initialization during training
    def initialize_hidden_state_random(self, stddev=0.3):
        """Return a (batch_sz, enc_units) state tensor drawn from N(0, stddev^2)."""
        # The tensor used to be built and then discarded, so callers received None
        return tf.random.normal((self.batch_sz, self.enc_units), stddev=stddev)

# Define Attention

Bahdanau-style additive attention using a trained tanh layer

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        super().__init__()
        # Activation-free Dense layers, i.e. learned matrix multiplications
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        # Output dimension 1: collapses each position to a scalar score
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over `values` using `query`.

        Args:
            query: hidden state, shape (batch_size, hidden size).
            values: sequence to attend over, shape (batch_size, max_len, hidden size).

        Returns:
            (context_vector, attention_weights) with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # (batch_size, hidden size) -> (batch_size, 1, hidden size) so that the
        # addition below broadcasts along the time axis
        projected_query = self.W1(tf.expand_dims(query, 1))
        projected_values = self.W2(values)

        # (batch_size, max_length, units) before self.V, (batch_size, max_length, 1) after
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        # Normalise over the time axis
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over time -> (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

# Define Decoder

Calculates context based on current hidden state and encoder output, and predicts following word with a fully connected layer.

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        vocab_size: number of output classes of the final Dense layer.
        embedding_dim: unused here (tokens are embedded by the module-level
            `shared_embedding`).
        dec_units: hidden size of the GRU and of the attention layer.
        batch_sz: batch size used by initialize_hidden_state.
        dropout_prob: rate used for both input and recurrent dropout of the GRU.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, dropout_prob):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units

        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='orthogonal',
                                       kernel_initializer='orthogonal',
                                       dropout=dropout_prob,
                                       recurrent_dropout=dropout_prob)

        # Dense layer producing the word logits
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention over the encoder outputs
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, dec_state, context, enc_output, training=True):
        """Advance the decoder by one position.

        Args:
            x: token ids for the current position, shape (batch_size, 1).
            dec_state: previous decoder state, shape (batch_size, hidden_size).
            context: previous context vector, shape (batch_size, hidden_size).
            enc_output: encoder outputs, shape (batch_size, max_length, hidden_size).
            training: forwarded to the GRU (controls dropout).

        Returns:
            (logits, dec_state, context, attention_weights): next-word logits
            of shape (batch_size, vocab), the new decoder state, the new
            context vector and the attention weights that produced it.
        """
        # --- New decoder state ---
        # Embedded input words, shape (batch_size, 1, embedding_dim)
        x_emb = shared_embedding(x)

        # Feed the previous context alongside the word: give the context a
        # time axis, (batch_size, 1, hidden_size), and concatenate on features
        previous_context = tf.expand_dims(context, 1)
        gru_input = tf.concat([previous_context, x_emb], axis=-1)
        _, new_state = self.gru(gru_input, initial_state=dec_state, training=training)

        # --- Logits ---
        # Fresh context computed from the *new* state
        new_context, attention_weights = self.attention(new_state, enc_output)

        # Predict the next word from [context, state]; logits shape (batch_size, vocab)
        logits = self.fc(tf.concat([new_context, new_state], axis=1))

        return logits, new_state, new_context, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero (batch_sz, dec_units) initial decoder state."""
        return tf.zeros((self.batch_sz, self.dec_units))

# Initialize encoder, decoder and loss function

In [None]:
# Dropout rate handed to every GRU: the encoder layers use it as input dropout only,
# while the decoder GRU uses it for both input dropout and recurrent dropout
DROPOUT_PROBABILITY = 0.2

# Size of hidden state
units = 256

encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE, DROPOUT_PROBABILITY)
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, DROPOUT_PROBABILITY)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)
# reduction='none' keeps one loss value per example so that loss_function can mask padding
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Padding-masked cross-entropy for one target position, averaged over the batch.

    Args:
        real: target word ids, shape (batch_size,); id 0 marks padding.
        pred: logits, shape (batch_size, vocab_size).

    Returns:
        Scalar mean of the per-example losses, with padded examples
        contributing zero (they still count in the denominator).
    """
    # One loss value per example, shape (batch_size,)
    per_example = loss_object(real, pred)

    # 1.0 where the target is a real word, 0.0 at end-padding positions
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

    return tf.reduce_mean(per_example * keep)

In [None]:
# Checkpoints are written under checkpoint_dir with the file prefix "ckpt"
checkpoint_dir = './checkpoints/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# shared_embedding is listed explicitly because it is a module-level layer,
# not an attribute of the encoder or decoder
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder,
                                 shared_embedding=shared_embedding)

### Example

In [None]:
# Bare expression: display the example batch that is fed to the encoder in the next cell
example_input_batch

In [None]:
# sample input
# Smoke test: run one batch through the untrained encoder, a throwaway attention layer
# and the decoder, printing the shape of everything they return
sample_output, sample_hidden = encoder(example_input_batch, training=False)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

# Stand-alone attention layer with 10 units, used only for this shape check
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

# Random tensors stand in for the decoder's input tokens and previous context vector
sample_decoder_output, sample_decoder_state, sample_context, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), 
                        sample_hidden, tf.random.uniform((BATCH_SIZE, units)), sample_output, training=False)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
print ('Decoder state shape: (batch_size, units) {}'.format(sample_decoder_state.shape))
print ('Context shape: (batch_size, units) {}'.format(sample_context.shape))

# Training / Validation Steps

1. Pass the *input* through the *encoder*, which returns the *encoder output* and the *encoder hidden state*.
2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) are passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to backpropagate: calculate the gradients and apply them with the optimizer.

In [None]:
# @tf.function compiles the function below into a callable TensorFlow graph
# (a separate graph is instantiated for every unique set of input shapes and
# datatypes), which speeds the code up immensely. Inputs should be TensorFlow
# datatypes.
@tf.function
def train_step(inp, targ):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        inp: input token ids, BATCH_SIZE rows.
        targ: target token ids, BATCH_SIZE rows; column 0 is expected to be
            the <start> token.

    Returns:
        (batch_loss, gnorm): the summed per-position loss divided by the
        target length, and the global norm of the gradients that were applied.
    """
    loss = 0

    # Record operations on trainable variables for differentiation
    with tf.GradientTape() as tape:
        # Get encoder output for whole input sentence
        enc_output, enc_hidden = encoder(inp, training=True)

        # Using start token as initial decoder input
        dec_x = tf.expand_dims([lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Get initial decoder state - copy from encoder
        dec_hidden = enc_hidden

        # Initial context should be blank state, same shape as encoder hidden state
        context = encoder.initialize_hidden_state()

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            logits, dec_hidden, context, _ = decoder(dec_x, dec_hidden, context, enc_output, training=True)

            # Compute scalar loss for current position in sentence
            loss += loss_function(targ[:, t], logits)

            # using teacher forcing
            dec_x = tf.expand_dims(targ[:, t], 1)

    # Reported loss is normalised by the (padded) target length
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    # shared_embedding is a module-level layer, not an attribute of either model,
    # so Keras does not list its weights under encoder/decoder.trainable_variables.
    # Add them explicitly, otherwise the embedding is never fine-tuned even though
    # it is created with trainable=True. The id check guards against duplicates.
    already_listed = {id(v) for v in variables}
    variables += [v for v in shared_embedding.trainable_variables
                  if id(v) not in already_listed]

    # Gradients are taken on the un-normalised summed loss
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    # Global gradient norm, returned so the caller can watch for exploding gradients
    gnorm = tf.linalg.global_norm(gradients)

    return batch_loss, gnorm

In [None]:
# Accuracy metric shared across calls; validation_step resets it at the start of every batch
acc_object = tf.keras.metrics.SparseCategoricalAccuracy()

@tf.function
def validation_step(inp, targ):
    """Score one batch with teacher forcing, without updating any weights.

    Args:
        inp: input token ids, BATCH_SIZE rows.
        targ: target token ids, shape (BATCH_SIZE, sentence length).

    Returns:
        (batch_loss, batch_accuracy): the summed per-position loss divided by
        the target length, and the padding-masked word accuracy accumulated
        over this batch only.
    """
    # Start this batch's accuracy from scratch
    acc_object.reset_states()

    # Send input to encoder
    enc_output, enc_hidden = encoder(inp, training=False)

    # Decoder starts from the <start> token, the encoder's state and a blank context
    dec_x = tf.expand_dims([lang.word_index['<start>']] * BATCH_SIZE, 1)
    dec_hidden = enc_hidden
    context = encoder.initialize_hidden_state()

    total_loss = 0
    for position in range(1, targ.shape[1]):
        gold_words = targ[:, position]

        # logits shape == (batch_size, vocab)
        logits, dec_hidden, context, _ = decoder(dec_x, dec_hidden, context, enc_output, training=False)

        # Scalar loss for the current position in the sentence
        total_loss += loss_function(gold_words, logits)

        # Accuracy counts only positions that hold a real word (id != 0)
        is_real_word = tf.math.logical_not(tf.math.equal(gold_words, 0))
        acc_object.update_state(gold_words, logits, sample_weight=is_real_word)

        # Teacher forcing: the true word becomes the next decoder input
        dec_x = tf.expand_dims(gold_words, 1)

    # Normalize the loss: essentially the weighted average cross-entropy
    # (log-perplexity) per symbol
    batch_loss = (total_loss / int(targ.shape[1]))

    # Accuracy is already normalized by the metric
    batch_accuracy = acc_object.result()

    return batch_loss, batch_accuracy

# Train the Network

In [None]:
EPOCHS = 30

initial_time = time.time()

# Dictionary to contain loss, accuracy, etc.
epoch_history = collections.defaultdict(list)
# Store train/validation loss at each iteration
batch_history = collections.defaultdict(list)
throughput_history = collections.defaultdict(list)

# Lowest validation loss seen so far; a checkpoint is written only when it is beaten
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()

    # Train model on training set
    train_loss = 0
    for (batch, (inp, targ)) in enumerate(train_set.take(steps_per_epoch)):
        batch_loss, gnorm = train_step(inp, targ)
        train_loss += batch_loss
        batch_history['training_loss'].append(batch_loss)
        batch_history['gradient_norm'].append(gnorm)

        if (batch + 1) % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))

    # Get timing statistics (training part of the epoch only)
    end_time = time.time()
    duration = end_time - start_time
    throughput = BUFFER_SIZE/duration
    throughput_history['sample_per_sec'].append(throughput)

    # Record training loss
    epoch_loss = train_loss / steps_per_epoch
    epoch_history['training_loss'].append(epoch_loss)

    # Calculate validation loss
    val_loss = 0
    val_accuracy = 0
    for (batch, (inp, targ)) in enumerate(val_set.take(val_steps_per_epoch)):
        vbatch_loss, vbatch_accuracy = validation_step(inp, targ)
        val_loss += vbatch_loss
        val_accuracy += vbatch_accuracy

    # Record validation loss
    val_epoch_loss = val_loss / val_steps_per_epoch
    epoch_history['validation_loss'].append(val_epoch_loss)
    val_epoch_accuracy = val_accuracy / val_steps_per_epoch
    epoch_history['validation_accuracy'].append(val_epoch_accuracy)

    # Plot per-iteration history on the left, per-epoch history on the right
    live_plot_double(batch_history, epoch_history)

    # Print results
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, epoch_loss))
    print('Validation {} Loss {:.4f}'.format(epoch + 1, val_epoch_loss))
    print('Validation {} Accuracy {:.4f}'.format(epoch + 1, val_epoch_accuracy))
    print('')
    # Get epoch with lowest validation loss
    best_epoch = np.argmin(epoch_history['validation_loss'])
    print('Minimum Validation Loss ({}) {:.4f}'.format(best_epoch+1,
                            epoch_history['validation_loss'][best_epoch]))
    print('Corresponding Validation Accuracy {:.4f}'.format(
                            epoch_history['validation_accuracy'][best_epoch]))
    print('')
    print('Time taken to train for 1 epoch {:.2f} sec'.format(duration))
    print('Processed {:.2f} samples / sec\n'.format(throughput))

    # Save a checkpoint only when this epoch beats the best validation loss so far
    # (this includes the first epoch). Comparing against just the previous epoch
    # could leave a non-best model as the latest checkpoint, which is the one
    # restored afterwards.
    current_val_loss = float(val_epoch_loss)
    if current_val_loss < best_val_loss:
        best_val_loss = current_val_loss
        checkpoint.save(file_prefix = checkpoint_prefix)

    # Early stopping: stop once the validation loss is higher than it was three epochs ago
    if len(epoch_history['validation_loss']) > 3:
        if epoch_history['validation_loss'][-1] > \
                                    epoch_history['validation_loss'][-4]:
            print('Validation loss has increased for three epochs, stopping')
            break

final_time = time.time()
total_time = final_time - initial_time
print('Training took %.2f hours' % (total_time / 3600))

# Restore the latest checkpoint

In [None]:
# restoring the latest checkpoint in checkpoint_dir
# (checkpoint_dir is re-declared so this cell can run without the cell that first defined it)
checkpoint_dir = './checkpoints/'
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# Evaluate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Randomly picks from the top two predictions at each step
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [None]:
def evaluate(sentence, max_length_targ=30, max_length_inp=30):
    """Generate a reply to `sentence` by sampling from the decoder's top two words.

    The sentence is cleaned, tokenized and encoded once. The decoder is then
    run one word at a time without teacher forcing: the word chosen at each
    step is fed back in as the next decoder input. Decoding stops as soon as
    the most likely word, or the word actually picked, is '<end>', or after
    `max_length_targ` steps.

    Args:
        sentence: raw input text; it is cleaned with `preprocess_sentence`.
        max_length_targ: maximum number of words to generate.
        max_length_inp: length the tokenized input is padded to.

    Returns:
        A tuple `(result, sentence, attention_plot)`:
        result is the generated words, each followed by a single space;
        sentence is the cleaned input sentence;
        attention_plot is a (max_length_targ, max_length_inp) array whose row
        t holds the attention weights of decoding step t (rows for steps that
        never ran stay zero).
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    # Map every word to its vocabulary id. Words missing from the vocabulary,
    # or whose id lies outside the first `vocab_size` ids, become <unk>.
    # dict.get replaces a bare `except:` that would also have hidden
    # unrelated errors (and KeyboardInterrupt).
    unk_id = lang.word_index['<unk>']
    inputs = []
    for w in sentence.split(' '):
        word_id = lang.word_index.get(w)
        if word_id is None or word_id >= vocab_size:
            word_id = unk_id
        inputs.append(word_id)

    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # The encoder output is computed once and reused at every decoding step
    enc_output, enc_hidden = encoder(inputs, training=False)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([lang.word_index['<start>']], 0)

    # Initial context vector is all zeros
    context = tf.zeros((1, units))

    for t in range(max_length_targ):
        predictions, dec_hidden, context, attention_weights = decoder(dec_input,
                                dec_hidden, context, enc_output, training=False)

        # storing the attention weights to plot later on
        # (assumes the flattened weights have max_length_inp entries — TODO confirm)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Take the two most likely next words, best first
        _, predicted_ids = tf.math.top_k(predictions[0], k=2)
        predicted_ids = predicted_ids.numpy()

        # Never emit <unk>; at most one of the two candidates can be <unk>,
        # so at least one candidate always remains
        predicted_ids = [i for i in predicted_ids if lang.index_word[i] != '<unk>']

        # Stop if most likely tag is end tag
        if lang.index_word[predicted_ids[0]] == '<end>':
            return result, sentence, attention_plot

        # Otherwise pick one of the remaining top words at random
        predicted_id = random.choice(predicted_ids)

        if lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        result += lang.index_word[predicted_id] + ' '

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    """Draw the attention weights as a heat map labelled with the words.

    Args:
        attention: 2-D array of attention weights; each row is labelled with a
            word of `predicted_sentence` and each column with a word of
            `sentence`.
        sentence: list of input tokens used as column labels.
        predicted_sentence: list of generated tokens used as row labels.
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Only label cells that exist, so surplus labels cannot widen the axes
    n_rows, n_cols = np.shape(attention)
    col_labels = list(sentence)[:n_cols]
    row_labels = list(predicted_sentence)[:n_rows]

    # Pin one tick to the centre of each labelled cell *before* assigning the
    # labels. Setting tick labels without fixed tick positions depends on
    # where the automatic locator happens to place its ticks, which is why the
    # labels previously needed a leading '' to line up.
    ax.set_xticks(list(range(len(col_labels))))
    ax.set_yticks(list(range(len(row_labels))))
    ax.set_xticklabels(col_labels, fontdict=fontdict, rotation=90)
    ax.set_yticklabels(row_labels, fontdict=fontdict)

    plt.show()

In [None]:
def respond(sentence):
    """Print the model's reply to `sentence` and plot its attention map.

    Runs `evaluate`, prints the cleaned input and the generated reply, then
    crops the attention matrix to the number of reply and input tokens and
    hands it to `plot_attention`.
    """
    reply, cleaned, attention = evaluate(sentence)

    print('Input: %s' % (cleaned))
    print('Predicted translation: {}'.format(reply))

    # Tokens are split on single spaces, exactly as they were joined
    reply_tokens = reply.split(' ')
    input_tokens = cleaned.split(' ')

    # Keep only the rows / columns that correspond to actual tokens
    cropped = attention[:len(reply_tokens), :len(input_tokens)]
    plot_attention(cropped, input_tokens, reply_tokens)

# Beam Search
This search tries to find the most probable sentence with a breadth-first search. It evaluates several predicted sentences (controlled by beam_width), keeping only the most probable sequences.

It gives better replies than a totally greedy search (beam_width=1), and is far more coherent than choosing a random word from the top k predictions at each stage.

I find a narrow beam width (5-20) gives quite vulgar replies, while a larger width (100-200) is more refined.

The final response is chosen randomly, weighted by probability. This can be relaxed somewhat to allow less probable replies with fuzzy_choice=True.

In [None]:
def evaluate_beam(sentence, max_length_inp=30, max_length_targ=None,
                        beam_width=10, fuzzy_choice=False):
    """Reply to `sentence` using a beam search over the decoder's output.

    `beam_width` partial replies are kept alive. At every step each of them is
    extended by one word and only the candidates with the highest cumulative
    log-probability are retained. A candidate whose next word is '<end>' is
    moved to the pool of finished replies and the beam shrinks by one. When
    every beam has finished, or `max_length_targ` steps have run, one finished
    reply is drawn at random (numpy's global RNG) and its spacing and
    capitalisation are tidied up. Candidates still unfinished when the step
    limit is reached are discarded.

    Args:
        sentence: raw input text; it is cleaned with `preprocess_sentence`.
        max_length_inp: length the tokenized input is padded to.
        max_length_targ: maximum number of beam-extension steps. Defaults to
            three times the number of tokens in the cleaned input.
        beam_width: number of candidate replies explored in parallel.
        fuzzy_choice: if false, finished replies are drawn with weights
            proportional to their probability (softmax of the cumulative
            log-probabilities). If true, each reply is weighted by its share
            of the summed log-probabilities, which gives the *less* probable
            replies the larger weights.

    Returns:
        The reply as a string. A canned fallback is returned when no candidate
        reached '<end>', and a canned line when the chosen reply contains
        '<unk>'.
    """
    # Finished replies and the cumulative log-probability of each of them
    final_results = []
    final_logits = []

    # Clean sentence
    sentence = preprocess_sentence(sentence)

    if max_length_targ is None:
        max_length_targ = 3*len(sentence.split(' '))

    # Map every word to its vocabulary id. Words missing from the vocabulary,
    # or whose id lies outside the first `vocab_size` ids, become <unk>.
    # dict.get replaces a bare `except:` that would also have hidden
    # unrelated errors (and KeyboardInterrupt).
    unk_id = lang.word_index['<unk>']
    inputs = []
    for w in sentence.split(' '):
        word_id = lang.word_index.get(w)
        if word_id is None or word_id >= vocab_size:
            word_id = unk_id
        inputs.append(word_id)

    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # Number of beams still being extended; shrinks as replies finish.
    # Kept separate so the `beam_width` argument itself is never modified.
    live_beams = beam_width
    result = ['' for _ in range(live_beams)]

    # Encode the input once; every beam attends over the same encoder output
    enc_output, enc_hidden = encoder(inputs, training=False)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([lang.word_index['<start>']], 0)

    # Initial context vector is all zeros
    context = tf.zeros((1, units))

    # First decoder step, run on a batch of one starting from <start>
    logits, dec_hidden, context, _ = decoder(dec_input,
                            dec_hidden, context, enc_output, training=False)

    # Work in log-probabilities so that scores along a path can be summed
    logits = tf.nn.log_softmax(logits)

    # The `live_beams` most likely first words seed the beams
    top_logits, predicted_ids = tf.math.top_k(logits[0], k=live_beams)

    # NOTE(review): an '<end>' among these first words is appended like any
    # other word instead of finishing its beam — confirm this is intended.
    for i, idx in enumerate(predicted_ids.numpy()):
        result[i] += lang.index_word[idx] + ' '

    # Replicate the decoder state, context and encoder output once per beam so
    # that all beams go through the decoder as a single batch
    dec_hidden_b = tf.concat([dec_hidden for _ in range(live_beams)], axis=0)
    context_b = tf.concat([context for _ in range(live_beams)], axis=0)
    enc_output_b = tf.concat([enc_output for _ in range(live_beams)], axis=0)

    for t in range(max_length_targ):
        # Run the next decoder step with the batch of current beams.
        # Both are reshaped to (live_beams, 1).
        predicted_ids = tf.expand_dims(predicted_ids, 1)
        top_logits = tf.expand_dims(top_logits, 1)

        logits_batch, dec_hidden_b, context_b, _ = decoder(predicted_ids,
                                dec_hidden_b, context_b, enc_output_b, training=False)

        logits_batch = tf.nn.log_softmax(logits_batch)

        # Cumulative score of every (beam, next word) pair: the beam's score
        # so far is broadcast across that beam's row of next-word scores
        logits_conditional = top_logits + logits_batch

        # Flatten so one top_k ranks the candidates of all beams together
        logits_conditional = tf.reshape(logits_conditional, [-1])

        top_logits, indices = tf.math.top_k(logits_conditional, k=live_beams)
        top_logits_np = top_logits.numpy()
        indices_np = indices.numpy()

        # A flat index encodes beam * vocab_size + word id
        path = indices_np // vocab_size

        # Each new candidate inherits the text of the beam it extends
        result = [result[p] for p in path]
        predicted_ids = [idx - p*vocab_size for p, idx in zip(path, indices_np)]

        # Append the new word, or retire the candidate if it has just ended
        ended = set()
        for i, idx in enumerate(predicted_ids):
            next_word = lang.index_word[idx]
            if next_word == '<end>':
                final_results.append(result[i].rstrip())
                final_logits.append(top_logits_np[i])
                ended.add(i)
            else:
                result[i] += next_word + ' '

        # Drop finished candidates from everything carried to the next step
        survivors = [i for i in range(len(result)) if i not in ended]
        live_beams = len(survivors)
        if live_beams == 0:
            break

        result = [result[i] for i in survivors]
        path = [path[i] for i in survivors]
        predicted_ids = [predicted_ids[i] for i in survivors]
        top_logits = tf.stack([top_logits_np[i] for i in survivors])

        # Rebuild the encoder output batch for the (possibly smaller) beam
        enc_output_b = tf.concat([enc_output for _ in range(live_beams)], axis=0)

        # Each survivor continues from the state of the beam it came from
        dec_hidden_b = tf.stack([dec_hidden_b[x] for x in path])
        context_b = tf.stack([context_b[x] for x in path])

    if len(final_results) == 0:
        return np.random.choice(["I don't understand.", "what do you mean?"])

    # Use float64 so the weights sum to 1 within np.random.choice's tolerance
    log_probs = np.asarray(final_logits, dtype=np.float64)

    # Fuzzy choice will get more erratic (and entertaining) results
    if fuzzy_choice:
        total = log_probs.sum()
        if total == 0:
            # Every reply has log-probability 0: fall back to a uniform draw
            p = np.full(len(log_probs), 1.0 / len(log_probs))
        else:
            p = log_probs / total
    else:
        # Softmax with the maximum subtracted first: same weights, but the
        # exponentials cannot all underflow to zero for long, unlikely replies
        weights = np.exp(log_probs - log_probs.max())
        p = weights / weights.sum()

    sentence = np.random.choice(final_results, p=p)

    if '<unk>' in sentence.split(' '):
        sentence = np.random.choice(['all your base are belong to us',
                                     "you are on your way to destruction",
                                     "someone set up us the bomb",
                                     "what you say?"])
    else:
        # Undo tokenizer spacing and capitalise the pronoun "I"
        sentence = sentence.replace(" i ", " I ")
        sentence = re.sub("^i ", "I ", sentence)
        sentence = sentence.replace(" n't", "n't")
        sentence = sentence.replace(" '", "'")
        sentence = sentence.replace("gon na", "gonna")
        sentence = sentence.replace("wan na", "wanna")
        sentence = sentence.replace(" .", ".")
        sentence = sentence.replace(" !", "!")
        sentence = sentence.replace(" ?", "?")
        sentence = sentence.replace(" ,", ",")

    return sentence