### Transformer Model

In [None]:
!pip install tensorflow==2.1.0

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
print(tf.__version__)
from sklearn.model_selection import train_test_split
import os
#import io
import numpy as np
# import re
# import unicodedata
# import urllib3
# import shutil
# import zipfile
# import itertools
from tensorflow import keras
import time

### Create Dataset

In [None]:
# import os
# file_list = []
# for file in os.listdir("./data/dataset"):
#     if file.endswith(".txt"):
#         file_list.append(os.path.join("./data/dataset", file))

In [None]:
# def unicode_to_ascii(s):
#     """ Converts the unicode file to ascii """
#     return ''.join(c for c in unicodedata.normalize('NFD', s)
#       if unicodedata.category(c) != 'Mn')

In [None]:
# def preprocess_sentence(w):
#     w = unicode_to_ascii(w.lower().strip())
#     # adding a start and an end token to the sentence
#     w = '<start> ' + w + ' <end>'
#     return w

In [None]:
# def create_dataset(file_list):
#     dataset = []
#     for file in file_list:
#         lines = io.open(file, encoding='UTF-8').read().strip().split('\n')
#         word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines]
#         dataset.extend(word_pairs)
#     dataset = [s for s in dataset if len(s) ==2]  
#     dataset = list(set(tuple(x) for x in dataset))
#     return zip(*dataset)    

In [None]:
# equation, integration = create_dataset(file_list)

In [None]:
# import os

In [None]:
# os.mkdir('./data/cleaned_data')

In [None]:
# f = open('./data/cleaned_data/equation.txt', 'w+')
# for line in equation:
#     f.write(line + '\n')
# f.close()

In [None]:
# f = open('./data/cleaned_data/integration.txt', 'w+')
# for line in integration:
#     f.write(line + '\n')
# f.close()

In [None]:
# Maximum number of lines kept from each of the equation / integration files
# when they are loaded below.
num_samples = 5000

In [None]:
# f = open('./data/cleaned_data/equation.txt', 'r')
# equation = f.read().splitlines()
# f = open('./data/cleaned_data/integration.txt', 'r')
# integration = f.read().splitlines()

In [None]:
# Load at most `num_samples` lines from each file. The two files are assumed to
# be line-aligned (line i of equation.txt pairs with line i of integration.txt)
# because the rows are later split together by train_test_split.
# `with` guarantees both handles are closed, even if reading fails.
with open('./equation.txt', 'r') as f:
    equation = f.read().splitlines()[:num_samples]
with open('./integration.txt', 'r') as f:
    integration = f.read().splitlines()[:num_samples]

### Preprocess dataset

In [None]:
def tokenize(inp, sequence_length):
    """Encode texts as fixed-length rows of integer token ids.

    A fresh Keras ``Tokenizer`` is fitted on ``inp`` with ``filters=''`` (no
    characters are filtered out), then every text is converted to ids and
    padded / truncated at the end to exactly ``sequence_length`` entries.

    Args:
        inp: collection of strings to fit on and encode.
        sequence_length: length of every returned row.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the padded id array and the fitted
        tokenizer.
    """
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    text_tokenizer.fit_on_texts(inp)

    encoded = text_tokenizer.texts_to_sequences(inp)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded,
        maxlen=sequence_length,
        padding='post',
        truncating='post',
    )
    return padded, text_tokenizer

### Train test split

In [None]:
#sequence_length = 512
sequence_length = 256
# Tokenize each word into index and return the tokenized list and tokenizer.
# Targets are one token longer than inputs: train_step feeds tar[:, :-1] to the
# decoder and scores against tar[:, 1:], so each side ends up sequence_length long.
X , X_tokenizer = tokenize(equation, sequence_length)
Y,  Y_tokenizer = tokenize(integration, sequence_length+1)
# NOTE(review): no random_state is passed, so the train/test split differs
# between runs.
X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [None]:
# Show the integer id the input tokenizer assigned to the '<start>' marker
# (ids are assigned by word frequency). Raises KeyError if the token is absent.
X_tokenizer.word_index['<start>']   

In [None]:
# Vocabulary sizes for the embedding / output layers.
# One extra slot is added because id 0 is used for padding and is not part of
# word_index.
input_vocab_size = len(X_tokenizer.word_index) + 1 
output_vocab_size = len(Y_tokenizer.word_index)+ 1

print("input_vocab_size : ", input_vocab_size)
print("output_vocab_size : " ,output_vocab_size)

### Build transformer 
- building in ...

In [None]:
from tensorflow.keras import models, layers
from tensorflow.keras import backend as K

In [None]:
# ### only for model test
# sequence_length = 512
# input_vocabulary_size = 1000
# output_vocabulary_size = 1000
# ###
# BUFFER_SIZE = len(X_train)
# batch_size = 256
# d_model = 512
# embedding_size = 512
# num_layers = 6
# num_heads = 8
# depth = d_model // num_heads
# dff = 2048
# dropout_rate = 0.1
# learning_rate = 10**(-4)
# training = True
# epochs = 1

In [None]:
# Shuffle buffer covers the whole training set (used by both datasets below).
BUFFER_SIZE = len(X_train)
batch_size = 256
# Width of embeddings and of every sub-layer output in the model.
d_model = 128
# NOTE(review): embedding_size is not referenced in the visible code; the
# embeddings are created with d_model.
embedding_size = 128
# Number of stacked encoder layers and of stacked decoder layers.
num_layers = 4
num_heads = 4
# Per-head width; MultiHeadAttention recomputes this itself from its arguments.
depth = d_model // num_heads
# Hidden width of the point-wise feed-forward network.
dff = 512
dropout_rate = 0.1
learning_rate = 10**(-4)
# NOTE(review): `training` is not referenced in the visible code; train_step
# passes True directly.
training = True
epochs = 10

In [None]:
# Build shuffled, batched pipelines of (input ids, target ids) pairs.
# drop_remainder=True discards the final partial batch, so every batch has
# exactly batch_size rows.
# NOTE(review): the test pipeline reuses the training-set BUFFER_SIZE and also
# drops its remainder, so some test rows are never seen — confirm this is intended.
dataset_train = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)

#### Positional encoding

In [None]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000**(2*(i//2)/d_model)``.

    Args:
        pos: position index (scalar or array).
        i: embedding-dimension index (scalar or array); dimensions 2k and
            2k+1 share the same rate because of the ``i // 2``.
        d_model: embedding width used to normalise the exponent.

    Returns:
        ``pos`` multiplied by the per-dimension rate, broadcast by NumPy rules.
    """
    pair_exponent = (2 * (i // 2)) / np.float32(d_model)
    rate = 1 / np.power(10000, pair_exponent)
    return pos * rate

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: embedding width (columns).

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``; even columns
        hold the sine of the angle, odd columns the cosine.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(row_index, col_index, d_model)

    # sin on even columns (2i), cos on odd columns (2i+1)
    table = np.where(col_index % 2 == 0, np.sin(angles), np.cos(angles))

    # leading axis of size 1 so the table broadcasts over the batch dimension
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

#### Masking

Mask all the pad tokens in a batch of sequences, so that the model does not treat padding as real input. The mask is a boolean tensor with the same shape as the input: it is `False` (0) wherever the pad value `0` is present and `True` (1) everywhere else.

In [None]:
def create_padding_mask(seq):
    """Return a boolean tensor that is True for real tokens and False for padding.

    Args:
        seq: tensor of token ids in which 0 denotes padding.

    Returns:
        A ``tf.bool`` tensor with the same shape as ``seq``.
    """
    return tf.math.not_equal(seq, 0)

#### Multi-head attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Attention block built on ``tf.keras.layers.Attention``.

    Query, key and value are each projected by their own Dense layer, passed
    through one scaled dot-product ``Attention`` layer, and the result is
    projected back to ``d_model`` by a final Dense layer.

    NOTE(review): the heads are not split. Every "head" sees the same
    full-width q/k/v and the same Attention layer, so all ``num_heads``
    outputs are identical. The replicated layout is kept on purpose so the
    output projection keeps its ``d_model * num_heads`` input width and
    existing checkpoints still load. True multi-head attention would reshape
    q/k/v into ``num_heads`` slices of width ``depth`` before attending.

    Args:
        d_model: width of the projections and of the layer output.
        num_heads: number of (replicated) heads; must divide ``d_model``.
        causal: forwarded to ``tf.keras.layers.Attention`` to prevent
            positions from attending to later positions.
    """

    def __init__(self, d_model, num_heads, causal=False):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.attention = tf.keras.layers.Attention(use_scale=True, causal=causal)
        self.dense = tf.keras.layers.Dense(d_model)

    def call(self, inputs, mask):
        """Apply attention to ``inputs = [query, key, value]``.

        Args:
            inputs: list of three tensors, in the order query, key, value.
            mask: boolean mask applied to the value/key positions (passed as
                the value mask; no query mask is used).

        Returns:
            Tensor with the query's batch and sequence dimensions and a last
            dimension of ``d_model``.
        """
        q = self.wq(inputs[0])
        k = self.wk(inputs[1])
        v = self.wv(inputs[2])

        # Keras Attention expects [query, value, key] and [query_mask, value_mask].
        attended = self.attention(inputs=[q, v, k], mask=[None, mask])

        # The attention call is identical for every head, so compute it once and
        # replicate it. Use the layer's own head count, not the module-level global.
        concat_attention = tf.concat([attended] * self.num_heads, axis=2)

        return self.dense(concat_attention)

## Point wise feed forward network

Point wise feed forward network consists of two fully-connected layers with a ReLU activation in between.

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer feed-forward network: Dense(dff, relu) -> Dense(d_model).

    Args:
        d_model: width of the output layer.
        dff: width of the hidden ReLU layer.

    Returns:
        A ``tf.keras.Sequential`` model holding the two Dense layers.
    """
    hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    output = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, output])

## Encoder and decoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention then feed-forward.

    Each sub-block is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: model width.
            num_heads: head count passed to MultiHeadAttention.
            dff: hidden width of the feed-forward network.
            rate: dropout rate used after each sub-block.
        """
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run the block on ``x`` of shape (batch_size, input_seq_len, d_model).

        Args:
            x: input tensor.
            training: whether dropout is active.
            mask: padding mask forwarded to the attention layer.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        # Self-attention sub-block: query, key and value are all x.
        attended = self.dropout1(self.mha([x, x, x], mask), training=training)
        attn_block_out = self.layernorm1(x + attended)

        # Feed-forward sub-block.
        transformed = self.dropout2(self.ffn(attn_block_out), training=training)
        return self.layernorm2(attn_block_out + transformed)

#### Decoder layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, encoder attention, feed-forward.

    Each sub-block is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: model width.
            num_heads: head count passed to both attention layers.
            dff: hidden width of the feed-forward network.
            rate: dropout rate used after each sub-block.
        """
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads, causal=True)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, padding_mask):
        """Run the block on ``x`` of shape (batch_size, target_seq_len, d_model).

        Args:
            x: decoder-side input tensor.
            enc_output: encoder output attended to by the second block.
            training: whether dropout is active.
            padding_mask: mask forwarded to both attention layers.
                NOTE(review): the same mask is used for the self-attention
                over ``x`` and for the attention over ``enc_output`` —
                confirm the caller's mask is valid for both.

        Returns:
            Tensor of shape (batch_size, target_seq_len, d_model).
        """
        # 1) causal self-attention over the decoder input
        self_attended = self.dropout1(
            self.mha1([x, x, x], padding_mask), training=training)
        self_block_out = self.layernorm1(self_attended + x)

        # 2) attention with decoder states as query, encoder output as key/value
        cross_attended = self.dropout2(
            self.mha2([self_block_out, enc_output, enc_output], padding_mask),
            training=training)
        cross_block_out = self.layernorm2(cross_attended + self_block_out)

        # 3) position-wise feed-forward network
        transformed = self.dropout3(self.ffn(cross_block_out), training=training)
        return self.layernorm3(transformed + cross_block_out)

#### Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by a stack of EncoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        """Create the embedding, positional table and encoder stack.

        Args:
            num_layers: number of EncoderLayer blocks.
            d_model: model / embedding width.
            num_heads: head count for each block.
            dff: feed-forward hidden width for each block.
            input_vocab_size: number of rows in the embedding table.
            maximum_position_encoding: number of positions precomputed in the
                positional-encoding table.
            rate: dropout rate.
        """
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode token ids ``x`` of shape (batch_size, input_seq_len).

        Args:
            x: integer token ids.
            training: whether dropout is active.
            mask: padding mask forwarded to every encoder layer.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]
        embedding_scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Scaled embeddings plus the positional table cut to this sequence length.
        hidden = self.embedding(x) * embedding_scale  # (batch_size, input_seq_len, d_model)
        hidden = hidden + self.pos_encoding[:, :seq_len, :]
        hidden = self.dropout(hidden, training=training)

        for encoder_layer in self.enc_layers:
            hidden = encoder_layer(hidden, training, mask)

        return hidden  # (batch_size, input_seq_len, d_model)

### Decoder

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by a stack of DecoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        """Create the embedding, positional table and decoder stack.

        Args:
            num_layers: number of DecoderLayer blocks.
            d_model: model / embedding width.
            num_heads: head count for each block.
            dff: feed-forward hidden width for each block.
            target_vocab_size: number of rows in the embedding table.
            maximum_position_encoding: number of positions precomputed in the
                positional-encoding table.
            rate: dropout rate.
        """
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, padding_mask):
        """Decode target token ids ``x`` of shape (batch_size, target_seq_len).

        Args:
            x: integer target token ids.
            enc_output: encoder output attended to by every decoder layer.
            training: whether dropout is active.
            padding_mask: mask forwarded to every decoder layer.

        Returns:
            Tensor of shape (batch_size, target_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output, training, padding_mask)

        return x

## Create the Transformer

The Transformer consists of the encoder, the decoder, and a final linear layer. The decoder output is fed to the linear layer, and the resulting logits over the target vocabulary are returned.

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model with a final Dense projection to the target vocabulary."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        """Create the encoder, decoder and output projection.

        Args:
            num_layers: number of layers in the encoder and in the decoder.
            d_model: model width.
            num_heads: head count for the attention layers.
            dff: feed-forward hidden width.
            input_vocab_size: size of the encoder embedding table.
            target_vocab_size: size of the decoder embedding table and of the
                output projection.
            pe_input: positional-encoding length for the encoder.
            pe_target: positional-encoding length for the decoder.
            rate: dropout rate.
        """
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, padding_mask):
        """Return logits of shape (batch_size, tar_seq_len, target_vocab_size).

        Args:
            inp: encoder token ids.
            tar: decoder token ids.
            training: whether dropout is active.
            padding_mask: mask passed to both the encoder and the decoder.
        """
        # (batch_size, inp_seq_len, d_model)
        encoded = self.encoder(inp, training, padding_mask)

        # (batch_size, tar_seq_len, d_model)
        decoded = self.decoder(tar, encoded, training, padding_mask)

        # (batch_size, tar_seq_len, target_vocab_size)
        return self.final_layer(decoded)

####  Optimizer

In [None]:
# Adam with a fixed learning rate (no warm-up / decay schedule).
optimizer = tf.keras.optimizers.Adam(learning_rate)

#### Loss and metrics

Since the target sequences are padded, it is important to apply a padding mask when calculating the loss.

In [None]:
# Cross-entropy on raw logits. reduction='none' keeps one loss value per target
# position so loss_function can zero out the padded positions before reducing.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Return the mean cross-entropy over non-padding target tokens.

    Args:
        real: integer target ids; 0 marks padding.
        pred: logits with one extra trailing vocabulary dimension.

    Returns:
        Scalar loss: the summed loss of the real tokens divided by the number
        of real tokens (0 if the batch contains only padding).
    """
    per_token_loss = loss_object(real, pred)
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

    # Divide by the number of real tokens rather than by every position;
    # a plain reduce_mean would shrink the loss as the amount of padding grows.
    masked_total = tf.reduce_sum(per_token_loss * mask)
    return tf.math.divide_no_nan(masked_total, tf.reduce_sum(mask))

In [None]:
# Running metrics updated by train_step; the training loop resets them at the
# start of each epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

#### Training and checkpointing

In [None]:
# Build the model. The decoder's positional table is one step longer than the
# encoder's because target sequences are padded to sequence_length + 1.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, output_vocab_size, 
                          pe_input=sequence_length, 
                          pe_target=sequence_length+1,
                          rate=dropout_rate)

Create the checkpoint path and the checkpoint manager. The training loop below uses the manager to save a checkpoint every 5 epochs, keeping at most the 5 most recent checkpoints.

In [None]:
# Directory in which checkpoints are written.
checkpoint_path = "./checkpoints/train"

# Track both the model weights and the optimizer state.
ckpt = tf.train.Checkpoint(model=transformer,
                           optimizer=optimizer)

# Keep only the 5 most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
# NOTE(review): a checkpoint saved with different hyperparameters or vocabulary
# sizes will not match the current model — clear the directory when changing them.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [None]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. Declaring (None, None) shapes in input_signature lets one trace
# serve any batch size and sequence length, so it is never re-traced.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one optimisation step and update the running loss / accuracy.

    Args:
        inp: int32 tensor of input token ids, shape (batch, inp_len).
        tar: int32 tensor of target token ids, shape (batch, tar_len).

    The target is shifted by one token (teacher forcing): ``tar[:, :-1]`` is
    fed to the decoder and ``tar[:, 1:]`` is what it must predict.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    # NOTE(review): this mask is derived from the encoder input but the model
    # also applies it inside the decoder; that is only shape-compatible because
    # tar_inp and inp have the same length here.
    padding_mask = create_padding_mask(inp)

    with tf.GradientTape() as tape:
        predictions = transformer(inp, tar_inp,
                                  True,
                                  padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    # Weight padded target positions by 0 so accuracy is measured on real
    # tokens only, consistent with the padding-masked loss.
    target_weights = tf.cast(tf.math.not_equal(tar_real, 0), tf.float32)
    train_accuracy(tar_real, predictions, sample_weight=target_weights)

In [None]:
# Main training loop: one pass over dataset_train per epoch, progress printed
# every 50 batches, and a checkpoint written every 5th epoch.
for epoch in range(epochs):
    start = time.time()
    epoch_number = epoch + 1

    # The metrics accumulate across calls, so clear them for the new epoch.
    for metric in (train_loss, train_accuracy):
        metric.reset_states()

    # inp -> equation, tar -> integration
    for batch, (inp, tar) in enumerate(dataset_train):
        train_step(inp, tar)

        if batch % 50 != 0:
            continue
        print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
            epoch_number, batch, train_loss.result(), train_accuracy.result()))

    if epoch_number % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch_number,
                                                             ckpt_save_path))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch_number,
                                                         train_loss.result(),
                                                         train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))