## Transformer Model

In [1]:
#!pip install tensorflow==2.1.0

In [2]:
%load_ext tensorboard

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime 

In [4]:
import tensorflow as tf
#import tensorflow_addons as tfa
print(tf.__version__)
from sklearn.model_selection import train_test_split
import os
#import io
import numpy as np
# import re
import unicodedata
# import urllib3
# import shutil
# import zipfile
# import itertools
from tensorflow import keras
import time

2.1.0


In [5]:
!rm -rf ./logs/

In [6]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


### Create Dataset

In [7]:
# import os
# file_list = []
# for file in os.listdir("./data/dataset"):
#     if file.endswith(".txt"):
#         file_list.append(os.path.join("./data/dataset", file))

In [8]:
def unicode_to_ascii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped. Characters that do
    not decompose are kept unchanged, so the result is not guaranteed to be
    pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [9]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in sequence boundary tokens.

    The text is lower-cased, stripped of surrounding whitespace and passed
    through ``unicode_to_ascii``; the result is returned between a leading
    ``<start>`` token and a trailing ``<end>`` token, separated by single spaces.
    """
    cleaned = unicode_to_ascii(w.lower().strip())
    # The model learns where a sequence begins and ends from these markers.
    return ''.join(['<start> ', cleaned, ' <end>'])

In [10]:
# def create_dataset(file_list):
#     dataset = []
#     for file in file_list:
#         lines = io.open(file, encoding='UTF-8').read().strip().split('\n')
#         word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines]
#         dataset.extend(word_pairs)
#     dataset = [s for s in dataset if len(s) ==2]  
#     dataset = list(set(tuple(x) for x in dataset))
#     return zip(*dataset)    

In [11]:
# equation, integration = create_dataset(file_list)

In [12]:
# import os

In [13]:
# os.mkdir('./data/cleaned_data')

In [14]:
# f = open('./data/cleaned_data/equation.txt', 'w+')
# for line in equation:
#     f.write(line + '\n')
# f.close()

In [15]:
# f = open('./data/cleaned_data/integration.txt', 'w+')
# for line in integration:
#     f.write(line + '\n')
# f.close()

#### import data

In [16]:
num_samples = None

In [17]:
# Load the two cleaned text files; line i of each file forms one
# (equation, integration) training pair. `with` guarantees both handles are
# closed (the previous version leaked them). `num_samples = None` keeps every line.
with open('./data/cleaned_data/equation.txt', 'r') as f:
    equation = f.read().splitlines()[:num_samples]
with open('./data/cleaned_data/integration.txt', 'r') as f:
    integration = f.read().splitlines()[:num_samples]

# Fail early with a clear message instead of deep inside train_test_split.
assert len(equation) == len(integration), (
    "equation.txt and integration.txt must contain the same number of lines")

In [18]:
# f = open('./equation.txt', 'r')
# equation = f.read().splitlines()[:num_samples]
# f = open('./integration.txt', 'r')
# integration = f.read().splitlines()[:num_samples]

### Preprocess dataset

In [19]:
def tokenize(inp, sequence_length):
    """Fit a Keras tokenizer on ``inp`` and return the padded id sequences.

    Args:
        inp: iterable of text strings.
        sequence_length: length every sequence is padded or truncated to
            (both applied at the end of the sequence).

    Returns:
        A ``(sequences, tokenizer)`` tuple: the padded id array produced by
        ``pad_sequences`` and the fitted ``Tokenizer``.
    """
    # filters='' disables the tokenizer's character filtering.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(inp)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(inp),
        maxlen=sequence_length,
        padding='post',
        truncating='post')
    return padded, tokenizer

#### Train test split

In [20]:
# Every sequence is padded / truncated to this many tokens.
sequence_length = 512
# Tokenize each word into index and return the tokenized list and tokenizer
X , X_tokenizer = tokenize(equation, sequence_length)
Y,  Y_tokenizer = tokenize(integration, sequence_length)
# A fixed random_state keeps the split identical across kernel restarts.
# Without it, weights restored from a checkpoint may already have been trained
# on examples that land in the new test split.
X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [21]:
# tokenize by frequency
X_tokenizer.word_index['<start>']   

9

In [22]:
# Vocabulary sizes used to dimension the embedding and output layers.
# word_index is the tokenizer's token -> id map; one extra slot is added
# because id 0 is used for padding.
input_vocab_size = len(X_tokenizer.word_index) + 1 
output_vocab_size = len(Y_tokenizer.word_index)+ 1

print("input_vocab_size : ", input_vocab_size)
print("output_vocab_size : " ,output_vocab_size)

input_vocab_size :  1737
output_vocab_size :  1079


### Build transformer 
- building in ...

In [23]:
from tensorflow.keras import models, layers
from tensorflow.keras import backend as K

In [24]:
# ### only for model test
# sequence_length = 512
# input_vocabulary_size = 1000
# output_vocabulary_size = 1000
# ###
# Shuffle buffer covers the whole training set.
BUFFER_SIZE = len(X_train)
batch_size = 64
# Width of the embeddings and of every encoder/decoder sub-layer output.
d_model = 512
# NOTE(review): not referenced elsewhere in the visible notebook; the
# Embedding layers are sized with d_model.
embedding_size = 512
# Number of stacked encoder layers and of stacked decoder layers.
num_layers = 6
num_heads = 8
# Per-head width. NOTE(review): MultiHeadAttention recomputes this itself;
# this global does not appear to be read elsewhere.
depth = d_model // num_heads
# Hidden width of the point-wise feed-forward network.
dff = 2048
dropout_rate = 0.1
learning_rate = 10**(-4)
# NOTE(review): train_step / test_step pass literal True / False, so this
# flag looks unused.
training = True
epochs = 1

In [25]:
# BUFFER_SIZE = len(X_train)
# batch_size = 250
# d_model = 128
# embedding_size = 128
# num_layers = 4
# num_heads = 4
# depth = d_model // num_heads
# dff = 512
# dropout_rate = 0.1
# learning_rate = 10**(-4)
# training = True
# epochs = 1

In [26]:
# Training data: reshuffled every epoch; the last partial batch is dropped so
# every training step sees a full batch.
dataset_train = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
)
# Evaluation data: order does not affect the aggregated metrics, so no shuffle
# buffer is allocated, and the final partial batch is kept so that every test
# example is evaluated.
dataset_test = (
    tf.data.Dataset.from_tensor_slices((X_test, Y_test))
    .batch(batch_size)
)

#### Positional encoding

In [27]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

    Args:
        pos: position index (scalar or array).
        i: embedding-dimension index (scalar or array); indices ``2k`` and
            ``2k + 1`` share the same rate.
        d_model: model width used to scale the exponent.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    rates = 1 / np.power(10000, exponent)
    return pos * rates

In [28]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions to encode.
        d_model: encoding width.

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    table = get_angles(positions, dims, d_model)

    # Even columns (2i) hold sines, odd columns (2i+1) hold cosines.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

#### Masking

Mask all the pad tokens in the batch of sequences. This ensures that the model does not treat padding as input. The mask is boolean: it is `False` at positions holding the pad value `0`, and `True` everywhere else.

In [29]:
def create_padding_mask(seq):
    """Return a boolean tensor shaped like ``seq``: True where the id is non-zero.

    Id 0 is the padding value, so False marks the padded positions.
    """
    is_padding = tf.math.equal(seq, 0)
    return tf.math.logical_not(is_padding)

#### Multi-head attention

In [30]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built from ``tf.keras.layers.Attention``.

    Each of the ``num_heads`` heads owns its own query/key/value projections of
    width ``d_model // num_heads`` and its own scaled attention layer. The head
    outputs are concatenated along the feature axis and mixed by a final Dense
    layer of width ``d_model``.

    The previous implementation projected q/k/v once and concatenated the same
    attention output ``num_heads`` times (so all heads were identical), and its
    loop read the notebook-global ``num_heads`` instead of the layer's own
    setting. Weights are now per head, so checkpoints written by the old layout
    cannot be restored into this layer.

    Args:
        d_model: output width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        causal: forwarded to ``tf.keras.layers.Attention`` (set for decoder
            self-attention so a position cannot attend to later positions).
    """

    def __init__(self, d_model, num_heads, causal=False):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # One independent projection per head and per role.
        self.wq = [tf.keras.layers.Dense(self.depth) for _ in range(self.num_heads)]
        self.wk = [tf.keras.layers.Dense(self.depth) for _ in range(self.num_heads)]
        self.wv = [tf.keras.layers.Dense(self.depth) for _ in range(self.num_heads)]
        self.attention = [tf.keras.layers.Attention(use_scale=True, causal=causal)
                          for _ in range(self.num_heads)]
        self.dense = tf.keras.layers.Dense(d_model)

    def call(self, inputs, mask):
        """Apply attention.

        Args:
            inputs: ``[query_source, key_source, value_source]`` tensors.
            mask: boolean mask for the key/value positions (False = ignore).

        Returns:
            Tensor whose last dimension is ``d_model``.
        """
        query_src, key_src, value_src = inputs[0], inputs[1], inputs[2]

        heads = []
        for wq, wk, wv, attention in zip(self.wq, self.wk, self.wv, self.attention):
            # Keras Attention expects [query, value, key] and
            # mask=[query_mask, value_mask].
            heads.append(attention(
                inputs=[wq(query_src), wv(value_src), wk(key_src)],
                mask=[None, mask]))

        concat_attention = tf.concat(heads, axis=2)
        return self.dense(concat_attention)

#### Point wise feed forward network

Point wise feed forward network consists of two fully-connected layers with a ReLU activation in between.

In [31]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer feed-forward block: Dense(dff, relu) -> Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

#### Encoder layer

In [32]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        # Sub-layers.
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # One normalization / dropout pair per sub-layer.
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run the block on ``x``; ``mask`` is the input padding mask."""
        # Self-attention: query, key and value all come from x.
        attended = self.dropout1(self.mha([x, x, x], mask), training=training)
        residual = self.layernorm1(x + attended)  # (batch_size, input_seq_len, d_model)

        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)  # (batch_size, input_seq_len, d_model)

#### Decoder layer

In [33]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block.

    Three sub-layers, each followed by dropout, a residual connection and layer
    normalization: causal self-attention over the target, attention over the
    encoder output, and a point-wise feed-forward network.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        # mha1: causal self-attention; mha2: attention over the encoder output.
        self.mha1 = MultiHeadAttention(d_model, num_heads, causal=True)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, en_padding_mask, de_padding_mask):
        """Run the block.

        Args:
            x: decoder-side input to this block.
            enc_output: output of the encoder stack.
            training: enables dropout when True.
            en_padding_mask: padding mask of the encoder input.
            de_padding_mask: padding mask of the decoder input.
        """
        self_attended = self.mha1([x, x, x], de_padding_mask)  # (batch_size, target_seq_len, d_model)
        self_attended = self.dropout1(self_attended, training=training)
        after_self = self.layernorm1(self_attended + x)

        # Queries come from the decoder; keys and values from the encoder.
        cross_attended = self.mha2(
            [after_self, enc_output, enc_output], en_padding_mask)  # (batch_size, target_seq_len, d_model)
        cross_attended = self.dropout2(cross_attended, training=training)
        after_cross = self.layernorm2(cross_attended + after_self)  # (batch_size, target_seq_len, d_model)

        transformed = self.dropout3(self.ffn(after_cross), training=training)
        return self.layernorm3(transformed + after_cross)  # (batch_size, target_seq_len, d_model)

#### Encoder

In [34]:
class Encoder(tf.keras.layers.Layer):
    """Embedding + positional encoding followed by a stack of EncoderLayer blocks."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        # Precomputed table of shape (1, maximum_position_encoding, d_model).
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode token ids ``x``; ``mask`` is the input padding mask."""
        seq_len = tf.shape(x)[1]

        # Scale the embeddings by sqrt(d_model), then add the positional encoding.
        embedded = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded + self.pos_encoding[:, :seq_len, :]

        hidden = self.dropout(embedded, training=training)

        for layer in self.enc_layers:
            hidden = layer(hidden, training, mask)

        return hidden  # (batch_size, input_seq_len, d_model)

#### Decoder

In [35]:
class Decoder(tf.keras.layers.Layer):
    """Embedding + positional encoding followed by a stack of DecoderLayer blocks."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        # Precomputed table of shape (1, maximum_position_encoding, d_model).
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, en_padding_mask, de_padding_mask):
        """Decode target token ids ``x`` given the encoder output."""
        seq_len = tf.shape(x)[1]

        # Scale the embeddings by sqrt(d_model), then add the positional encoding.
        embedded = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded + self.pos_encoding[:, :seq_len, :]

        hidden = self.dropout(embedded, training=training)

        for layer in self.dec_layers:
            hidden = layer(hidden, enc_output, training, en_padding_mask, de_padding_mask)

        return hidden

#### Create the Transformer

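The Transformer consists of the encoder, the decoder and a final dense layer with a softmax activation. The output of the decoder is the input to this final layer, whose output (a probability distribution over the target vocabulary at each position) is returned.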

In [36]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model ending in a softmax over the target vocabulary."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)

        # Softmax activation: the model returns probabilities, not logits.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size, activation="softmax")

    def call(self, inp, tar, training, en_padding_mask, de_padding_mask):
        """Return per-position probabilities over the target vocabulary.

        Args:
            inp: encoder input token ids.
            tar: decoder input token ids.
            training: enables dropout when True.
            en_padding_mask: padding mask of ``inp``.
            de_padding_mask: padding mask of ``tar``.
        """
        encoded = self.encoder(inp, training, en_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # decoded.shape == (batch_size, tar_seq_len, d_model)
        decoded = self.decoder(
            tar, encoded, training, en_padding_mask, de_padding_mask)

        return self.final_layer(decoded)  # (batch_size, tar_seq_len, target_vocab_size)

###  Optimizer

In [37]:
optimizer = tf.keras.optimizers.Adam(learning_rate)

### Loss and metrics

Since the target sequences are padded, it is important to apply a padding mask when calculating the loss.

In [38]:
# The Transformer's final Dense layer already applies a softmax, so the model
# outputs probabilities. from_logits must therefore be False; with True the
# loss would apply a second softmax on top of the probabilities.
# reduction='none' keeps the per-token losses so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

In [39]:
def loss_function(real, pred):
    """Mean per-token loss over the non-padding target positions.

    Args:
        real: target token ids; id 0 marks padding.
        pred: model predictions for every target position.

    Returns:
        Scalar loss: the sum of the unpadded token losses divided by the number
        of unpadded tokens (0 if there are none).
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalize by the count of real tokens. A plain reduce_mean would also
    # divide by the padded positions and shrink the loss in proportion to the
    # amount of padding in the batch.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [40]:
# Running metrics: updated once per batch by train_step / test_step and
# reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='test_accuracy')

### Training and checkpointing

In [41]:
# Build the model from the hyperparameter cell. Both positional-encoding
# tables are sized to sequence_length, the padded length of every example.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, output_vocab_size, 
                          pe_input=sequence_length, 
                          pe_target=sequence_length,
                          rate=dropout_rate)

Create the checkpoint path and the checkpoint manager. The manager is used to save checkpoints periodically (every 5 epochs in the training loop below) and keeps only the 5 most recent ones.

In [42]:
# Directory holding the training checkpoints.
checkpoint_path = "./checkpoints/train"

# Track both the model weights and the optimizer state.
ckpt = tf.train.Checkpoint(model=transformer,
                           optimizer=optimizer)

# Keep only the 5 most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
# NOTE(review): the model's variables do not exist yet at this point, so the
# restore is deferred until they are created on the first call. A checkpoint
# written with different hyperparameters then fails inside the first
# train_step -- see the recorded "(1737, 256) is not compatible with supplied
# shape (1737, 512)" error below. Remove stale checkpoints (or restore the
# matching hyperparameters) before training.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [43]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one optimization step on a batch and update the training metrics.

    Args:
        inp: int32 batch of encoder token ids.
        tar: int32 batch of target token ids.
    """
    # Teacher forcing: the decoder is fed the target without its last token
    # and must predict the target shifted left by one position.
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    en_padding_mask = create_padding_mask(inp)
    de_padding_mask = create_padding_mask(tar_inp)
    with tf.GradientTape() as tape:
        predictions = transformer(inp, tar_inp, 
                                 True, 
                                 en_padding_mask,
                                 de_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    # Weight the accuracy by the target padding mask so that padded positions
    # (id 0) are not counted as predictions.
    target_weights = tf.cast(
        tf.math.logical_not(tf.math.equal(tar_real, 0)), tf.float32)
    train_accuracy(tar_real, predictions, sample_weight=target_weights)

In [44]:
test_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=test_step_signature)
def test_step(inp, tar):
    """Evaluate one batch (no dropout, no weight update) and update the test metrics.

    Args:
        inp: int32 batch of encoder token ids.
        tar: int32 batch of target token ids.
    """
    # Same input/target shift as in training.
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    en_padding_mask = create_padding_mask(inp)
    de_padding_mask = create_padding_mask(tar_inp)
    predictions = transformer(inp, tar_inp, 
                                 False, 
                                 en_padding_mask,
                                 de_padding_mask)
    loss = loss_function(tar_real, predictions)

    test_loss(loss)
    # Weight the accuracy by the target padding mask so that padded positions
    # (id 0) are not counted as predictions.
    target_weights = tf.cast(
        tf.math.logical_not(tf.math.equal(tar_real, 0)), tf.float32)
    test_accuracy(tar_real, predictions, sample_weight=target_weights)

In [45]:
# One timestamped TensorBoard run directory per execution, with separate
# sub-directories (and writers) for the training and test scalars.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
test_log_dir = 'logs/gradient_tape/' + current_time + '/test'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

In [46]:
for epoch in range(epochs):
    start = time.time()

    # The metrics accumulate across batches, so clear them for each epoch.
    train_loss.reset_states()
    train_accuracy.reset_states()

    test_loss.reset_states()
    test_accuracy.reset_states()

    # inp -> equation, tar -> integration
    for (batch, (inp, tar)) in enumerate(dataset_train):
        train_step(inp, tar)

        if batch % 50 == 0:
            print ('Epoch {} Batch {} Train_Loss {:.4f} Train_Accuracy {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    # Log the epoch-level training metrics for TensorBoard.
    with train_summary_writer.as_default():
        tf.summary.scalar('train_loss', train_loss.result(), step=epoch)
        tf.summary.scalar('train_accuracy', train_accuracy.result(), step=epoch)

    for (batch, (inp, tar)) in enumerate(dataset_test):
        test_step(inp, tar)

        if batch % 50 == 0:
            print ('Epoch {} Batch {} Test_Loss {:.4f} Test_Accuracy {:.4f}'.format(
              epoch + 1, batch, test_loss.result(), test_accuracy.result()))

    # Log the epoch-level test metrics for TensorBoard.
    with test_summary_writer.as_default():
        tf.summary.scalar('test_loss', test_loss.result(), step=epoch)
        tf.summary.scalar('test_accuracy', test_accuracy.result(), step=epoch)

    # Save every 5 epochs and always after the final epoch. Without the second
    # condition a run whose epoch count is not a multiple of 5 (e.g. epochs = 1)
    # would finish without writing any checkpoint.
    if (epoch + 1) % 5 == 0 or (epoch + 1) == epochs:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

    print ('Epoch {} Train_Loss {:.4f} Train_Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

    print ('Epoch {} Test_Loss {:.4f} Test_Accuracy {:.4f}'.format(epoch + 1, 
                                                test_loss.result(), 
                                                test_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

ValueError: in converted code:

    <ipython-input-43-fd4e224152eb>:19 train_step  *
        predictions = transformer(inp, tar_inp,
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py:778 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    <ipython-input-36-1fc743cb4ce6>:16 call  *
        enc_output = self.encoder(inp, training, en_padding_mask)  # (batch_size, inp_seq_len, d_model)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py:778 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    <ipython-input-34-a1bc917f560e>:24 call  *
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py:748 __call__
        self._maybe_build(inputs)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py:2116 _maybe_build
        self.build(input_shapes)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/utils/tf_utils.py:306 wrapper
        output_shape = fn(instance, input_shape)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/layers/embeddings.py:140 build
        constraint=self.embeddings_constraint)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/keras/engine/base_layer.py:446 add_weight
        caching_device=caching_device)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/training/tracking/base.py:725 _add_variable_with_custom_getter
        name=name, shape=shape)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/training/tracking/base.py:792 _preload_simple_restoration
        checkpoint_position=checkpoint_position, shape=shape)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/training/tracking/base.py:75 __init__
        self.wrapped_value.set_shape(shape)
    /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/framework/ops.py:1088 set_shape
        (self.shape, shape))

    ValueError: Tensor's shape (1737, 256) is not compatible with supplied shape (1737, 512)


### Inference

In [None]:
def evaluate(inp_sentence):
    """Greedy-decode the model's output for one input sentence.

    Args:
        inp_sentence: raw input text; tokens are separated by single spaces.

    Returns:
        1-D int32 tensor of target token ids, beginning with the ``<start>`` id
        and not including the ``<end>`` id.

    Raises:
        KeyError: if a token of the preprocessed sentence is not in the input
            vocabulary.
    """
    # add '<start> ' and ' <end>'
    input_sentence = preprocess_sentence(inp_sentence)

    # tokenize input_sentence
    input_sentence = np.asarray([X_tokenizer.word_index[w] for w in input_sentence.split(' ')], dtype=np.int32)
    encoder_input = tf.expand_dims(input_sentence, 0)
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(encoder_input, padding='post', maxlen=sequence_length, truncating='post')
    # The encoder input never changes while decoding, so build its mask once.
    en_padding_mask = create_padding_mask(encoder_input)

    end_id = Y_tokenizer.word_index['<end>']

    # The decoder starts from the <start> token alone.
    decoder_input = np.asarray([Y_tokenizer.word_index['<start>']], dtype=np.int32)
    output = tf.expand_dims(decoder_input, 0)

    for _ in range(sequence_length):
        de_padding_mask = create_padding_mask(output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions = transformer(encoder_input, 
                                  output, 
                                  False, 
                                  en_padding_mask,
                                  de_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        # greedy decoder
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if predicted_id == end_id:
            return tf.squeeze(output, axis=0)

        # concatenate the predicted_id to the output, which is given to the
        # decoder as its input on the next step.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)



In [None]:
def translate(sentence):
    """Greedy-decode ``sentence``, print the input and the prediction, and return the predicted text."""
    result = evaluate(sentence)
    print(result.shape)
    # result[0] is the <start> id, so it is skipped when mapping ids to tokens.
    tokens = [Y_tokenizer.index_word[w.numpy()] for w in result[1:]]
    predicted_sentence = " ".join(tokens)

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    return predicted_sentence

### beam search

In [None]:
from math import log

In [None]:
def evaluate_beam_search(inp_sentence, beam_width):
    """Decode one input sentence with beam search.

    Every live hypothesis is extended with its own decoder pass, finished
    hypotheses (ending in ``<end>``) are carried forward unchanged, and scores
    are accumulated negative log-probabilities (lower is better). Scores are
    not length-normalized.

    Fixes over the previous version: the decoder input was never extended, so
    every step re-scored only ``<start>``; finished hypotheses were silently
    dropped; scores were multiplied instead of summed; a zero probability
    crashed ``log``.

    Args:
        inp_sentence: raw input text; tokens are separated by single spaces.
        beam_width: number of hypotheses kept after every step.

    Returns:
        List of ``[token_ids, score]`` pairs sorted best-first. ``token_ids``
        is a list of ints without the leading ``<start>`` id; a finished
        hypothesis ends with the ``<end>`` id.

    Raises:
        KeyError: if a token of the preprocessed sentence is not in the input
            vocabulary.
    """
    # add '<start> ' and ' <end>'
    input_sentence = preprocess_sentence(inp_sentence)

    # tokenize input_sentence
    input_sentence = np.asarray([X_tokenizer.word_index[w] for w in input_sentence.split(' ')], dtype=np.int32)
    encoder_input = tf.expand_dims(input_sentence, 0)
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(encoder_input, padding='post', maxlen=sequence_length, truncating='post')
    # The encoder input is the same for every hypothesis and step.
    en_padding_mask = create_padding_mask(encoder_input)

    start_id = Y_tokenizer.word_index['<start>']
    end_id = Y_tokenizer.word_index['<end>']

    k = beam_width
    # Each entry is [generated token ids, accumulated negative log-probability].
    sequences = [[list(), 0.0]]

    for _ in range(sequence_length):
        all_candidates = list()
        for seq, score in sequences:
            if len(seq) > 0 and seq[-1] == end_id:
                # Finished hypothesis: keep it in the running unchanged.
                all_candidates.append([seq, score])
                continue

            # Decoder input for this hypothesis: <start> plus what it generated so far.
            output = tf.constant([[start_id] + seq], dtype=tf.int32)
            de_padding_mask = create_padding_mask(output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions = transformer(encoder_input,
                                      output,
                                      False,
                                      en_padding_mask,
                                      de_padding_mask)

            # Probabilities for the next token; clamp to avoid log(0).
            probs = predictions[0, -1, :].numpy().astype(np.float64)
            neg_log_probs = -np.log(np.maximum(probs, np.finfo(np.float64).tiny))

            # Only the k best continuations of a hypothesis can reach the
            # global top k. Id 0 is the padding id and is never emitted.
            ranked = [int(j) for j in np.argsort(neg_log_probs) if j != 0]
            for j in ranked[:k]:
                all_candidates.append([seq + [j], score + float(neg_log_probs[j])])

        # order all candidates by score and select k best
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:k]

        # Stop as soon as every surviving hypothesis is finished.
        if all(len(seq) > 0 and seq[-1] == end_id for seq, _ in sequences):
            break
    return sequences

In [None]:
def translate_beam_search(sentence, beam_width):
    """Decode ``sentence`` with beam search and print every kept hypothesis.

    Args:
        sentence: raw input text.
        beam_width: number of hypotheses the beam search keeps.

    Returns:
        List of ``[predicted_sentence, score]`` pairs in the order returned by
        ``evaluate_beam_search``.
    """
    # Must call the beam-search decoder: evaluate() accepts only one argument,
    # so the previous evaluate(sentence, beam_width) call raised TypeError.
    result = evaluate_beam_search(sentence, beam_width)
    predicted_sentences = []
    print('Input: {}'.format(sentence))
    for token_ids, score in result:
        predicted_sentence = " ".join([Y_tokenizer.index_word[w] for w in token_ids]) 
        print('Predicted translation: {}'.format(predicted_sentence))
        predicted_sentences.append([predicted_sentence, score])
    return predicted_sentences

In [None]:
result = translate_beam_search("+ x 3", beam_width=2)

## Tensorboard

In [None]:
%tensorboard --logdir logs/gradient_tape