# TDDE09 NLP Project - Transformer Model

In [None]:
# Mount Google Drive into the Colab filesystem; the notebook later changes into a
# project folder below /content/drive and reads the corpus files from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/My Drive/NLP_Project/

/content/drive/My Drive/NLP_Project


**Import dependencies**

In [None]:
import numpy as np
import math
import re
import time

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

**Load the dataset**

In [None]:
def _read_utf8(path):
    """Return the complete contents of the UTF-8 text file at `path` as one string."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()


# English and Swedish sides of the Europarl v7 sv-en corpus, each kept as one raw string.
europarl_en = _read_utf8("europarl-v7.sv-en.en")
europarl_sv = _read_utf8("europarl-v7.sv-en.sv")

In [None]:
europarl_en[:50]

'Resumption of the session\nI declare resumed the se'

In [None]:
# Example of the dot clean-up done in the preprocessing below: "a.m" -> "a.$$$m" -> "am"

### **Data preprocess**

In [None]:
def clean_corpus(text):
    """Normalise a raw corpus string and split it into a list of sentences.

    Steps:
      1. Tag every '.' that is directly followed by an ASCII letter or digit
         (abbreviations, decimals, ...) by turning it into '.$$$'.
      2. Delete the tagged dots, e.g. "a.m" -> "a.$$$m" -> "am".
      3. Collapse runs of two or more spaces into a single space.
      4. Split on '\n', giving one list entry per line of the input.

    Args:
        text: the whole corpus as a single string.

    Returns:
        A list of cleaned lines (strings).
    """
    # NOTE: the lookahead only covers ASCII characters, so a dot followed by a
    # non-ASCII letter such as 'å', 'ä' or 'ö' is left untouched.
    text = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", text)
    # The dot is escaped so that only the literal '.$$$' marker is removed; an
    # unescaped '.' would also swallow any other character preceding a '$$$'.
    text = re.sub(r"\.\$\$\$", '', text)
    text = re.sub(r"  +", " ", text)
    return text.split('\n')


# Both languages go through exactly the same clean-up.
corpus_en = clean_corpus(europarl_en)
corpus_sv = clean_corpus(europarl_sv)

In [None]:
corpus_en[:50]

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
 "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.",
 'You have requested a debate on this subject in the course of the next few days, during this part-session.',
 "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.",
 "Please rise, then, for this minute' s silence.",
 "(The House rose and observed a minute' s silence)",
 'Madam President, on a point of order.',
 'You will be aware from the press and television that there have be

In [None]:
corpus_en[0]

'Resumption of the session'

**The tokenizer encodes each sentence as a sequence of integer subword ids and builds the vocabulary for us. It is case-sensitive (the Swedish vocabulary below contains both 'Jag_' and 'jag_'), and punctuation followed by a space, such as ', ' and '. ', gets its own subword.** The encoding is fully invertible because all out-of-vocabulary word pieces are byte-encoded, which means unknown word pieces are encoded one byte at a time.
The resulting English vocabulary has 8219 entries (close to the requested 2**13 = 8192); adding 2 ids for the start and end tokens gives 8221.

In [None]:
# Learn a subword vocabulary from the English sentences, aiming for about 2**13 entries.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_en, target_vocab_size=2**13)

In [None]:
# Learn a separate subword vocabulary from the Swedish sentences, with the same target size.
tokenizer_sv = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_sv, target_vocab_size=2**13)

In [None]:
# Two extra ids are reserved on top of the tokenizer's own vocabulary:
# VOCAB_SIZE_EN - 2 is used as the start token and VOCAB_SIZE_EN - 1 as the end token.
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2 
VOCAB_SIZE_EN

8221

In [None]:
tokenizer_sv.subwords

['att_',
 ', ',
 'och_',
 'i_',
 'som_',
 'för_',
 'en_',
 'av_',
 'det_',
 'är_',
 'de_',
 'till_',
 'om_',
 'har_',
 'på_',
 'den_',
 'med_',
 'inte_',
 'vi_',
 's_',
 'ett_',
 't_',
 'a_',
 'Jag_',
 'jag_',
 'Det_',
 'kommer_',
 'kan_',
 'måste_',
 'detta_',
 'r_',
 'er_',
 'från_',
 'Vi_',
 'n_',
 'vill_',
 'skulle_',
 'också_',
 'så_',
 'na_',
 'denna_',
 'man_',
 'EU',
 'en',
 'mycket_',
 ' - ',
 '. ',
 'alla_',
 '! ',
 'sig_',
 'men_',
 'när_',
 'Europeiska_',
 'vara_',
 'eller_',
 'talman',
 'gäller_',
 'Herr_',
 'dessa_',
 'andra_',
 'kommissionen_',
 ') ',
 'även_',
 'mot_',
 'I_',
 'ska_',
 'inom_',
 'oss_',
 'skall_',
 'e_',
 'finns_',
 'utan_',
 'under_',
 'bara_',
 'här_',
 'et_',
 'bör_',
 'er',
 'göra_',
 'genom_',
 'ta_',
 'EU_',
 'mer_',
 'mellan_',
 'var_',
 ': ',
 'eftersom_',
 'anser_',
 'na',
 'Detta_',
 'nu_',
 'kunna_',
 'än_',
 'vid_',
 'nde_',
 'vilket_',
 'europeiska_',
 'därför_',
 'få_',
 'där_',
 'ni_',
 'över_',
 'ar_',
 'ha_',
 'fram_',
 'Den_',
 'allt_'

In [None]:
# Two extra ids are reserved on top of the tokenizer's own vocabulary:
# VOCAB_SIZE_SV - 2 is used as the start token and VOCAB_SIZE_SV - 1 as the end token.
VOCAB_SIZE_SV = tokenizer_sv.vocab_size + 2
VOCAB_SIZE_SV

8234

**Add the 'start' and 'end' tokens to every sentence in the corpus**

In [None]:
# Encode every English sentence and wrap it in the two reserved ids
# (start = VOCAB_SIZE_EN - 2, end = VOCAB_SIZE_EN - 1).
START_TOKEN_EN = VOCAB_SIZE_EN - 2
END_TOKEN_EN = VOCAB_SIZE_EN - 1
inputs = [
    [START_TOKEN_EN, *tokenizer_en.encode(sentence), END_TOKEN_EN]
    for sentence in corpus_en
]

In [None]:
# Encode every Swedish sentence and wrap it in the two reserved ids
# (start = VOCAB_SIZE_SV - 2, end = VOCAB_SIZE_SV - 1).
START_TOKEN_SV = VOCAB_SIZE_SV - 2
END_TOKEN_SV = VOCAB_SIZE_SV - 1
outputs = [
    [START_TOKEN_SV, *tokenizer_sv.encode(sentence), END_TOKEN_SV]
    for sentence in corpus_sv
]

In [None]:
inputs[0]

[8219, 2561, 1009, 2044, 3, 1, 2573, 8220]

In [None]:
corpus_en[0]

'Resumption of the session'

In [None]:
outputs[0]

[8232, 3338, 78, 7341, 5898, 8, 5836, 44, 8233]

**Keep only sentence pairs in which both sides have at most 20 tokens (start and end tokens included)**

In [None]:
MAX_LENGTH = 20

# Keep a (source, target) pair only when BOTH sides fit into MAX_LENGTH tokens
# (start and end tokens included). A single pass over the zipped lists keeps the
# surviving pairs aligned and in their original order, and avoids the repeated
# `del list[idx]` calls, each of which shifts the remainder of a very long list.
kept_pairs = [
    (src, tgt)
    for src, tgt in zip(inputs, outputs)
    if len(src) <= MAX_LENGTH and len(tgt) <= MAX_LENGTH
]
inputs = [src for src, _ in kept_pairs]
outputs = [tgt for _, tgt in kept_pairs]

**Pad sentences that are shorter than the maximum length with the value 0**

In [None]:
# Right-pad ('post') every source sequence with 0 up to MAX_LENGTH so all rows have equal length.
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, value=0, padding='post', maxlen=MAX_LENGTH)

In [None]:
inputs.shape

(444842, 20)

In [None]:
# Right-pad ('post') every target sequence with 0 up to MAX_LENGTH so all rows have equal length.
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs, value=0, padding='post', maxlen=MAX_LENGTH)

In [None]:
outputs.shape

(444842, 20)

In [None]:
# Hold out the last 1000 sentence pairs as validation source / reference data.
valid_src = inputs[-1000:]
valid_ref = outputs[-1000:]
valid_src.shape, valid_ref.shape

((1000, 20), (1000, 20))

In [None]:
# Remove the held-out pairs so that only the remaining rows are used for training.
# NOTE(review): this cell is not idempotent — every re-run drops another 1000 pairs.
inputs = inputs[:-1000]
outputs = outputs[:-1000]
inputs.shape, outputs.shape

((443842, 20), (443842, 20))

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Input pipeline over the (source, target) pairs:
#   cache    - keep the elements in memory after the first pass to speed up training
#   shuffle  - randomise the order using a buffer of BUFFER_SIZE elements
#   batch    - group the elements into batches of BATCH_SIZE
#   prefetch - prepare upcoming batches in the background while the current one is
#              being consumed (affects speed only, not accuracy)
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
next(iter(dataset))

(<tf.Tensor: shape=(64, 20), dtype=int32, numpy=
 array([[8219,   11, 5611, ...,    0,    0,    0],
        [8219,   25,  554, ...,    0,    0,    0],
        [8219,   29,   43, ...,    0,    0,    0],
        ...,
        [8219,   25,  516, ...,    0,    0,    0],
        [8219, 8220,    0, ...,    0,    0,    0],
        [8219,   11,  343, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(64, 20), dtype=int32, numpy=
 array([[8232,   24, 2491, ...,  823, 8022, 8233],
        [8232, 2231,   10, ...,    0,    0,    0],
        [8232,   58,   56, ...,    0,    0,    0],
        ...,
        [8232, 7834,  182, ...,    0,    0,    0],
        [8232,   58,   56, ...,    0,    0,    0],
        [8232,   24,  404, ...,    0,    0,    0]], dtype=int32)>)

## Positional Encoding

**Embeddings represent a word in a d-dimensional space where tokens with similar meaning will be closer to each other. But the embeddings do not encode the relative position of words in a sentence. So after adding the positional encoding, words will be closer to each other based on the similarity of their meaning and also their position in the sentence. We represent this using the following formula:**<br><br>
$$\Large{PE_{(pos, 2i)} = sin(pos / 10000^{2i / d_{model}})} $$
$$\Large{PE_{(pos, 2i+1)} = cos(pos / 10000^{2i / d_{model}})} $$
<br>
**For each dimension index 'i' of an embedding, the even dimensions use the sine function and the odd dimensions use the cosine function of the position in the sequence.**

In [None]:
class PositionalEncoding(layers.Layer):
    """Adds the fixed sinusoidal positional encoding to a batch of embeddings."""

    def __init__(self):
        super().__init__()

    def get_angles(self, pos, i, d_model):
        """Return pos / 10000^(2*(i//2) / d_model) for every (position, dimension) pair.

        `pos` is passed as a column [seq_len, 1] and `i` as a row [1, d_model] by
        `call`, so the product broadcasts to a [seq_len, d_model] array.
        """
        exponent = (2*(i//2)) / np.float32(d_model)
        inverse_frequency = 1 / np.power(10000., exponent)
        return pos * inverse_frequency

    def call(self, inputs):
        """Return `inputs` plus the positional encoding.

        The sequence length and embedding size are read from the static shape of
        `inputs` (second-to-last and last axis respectively).
        """
        static_shape = inputs.shape.as_list()
        seq_length, d_model = static_shape[-2], static_shape[-1]

        positions = np.arange(seq_length)[:, np.newaxis]   # [seq, 1]
        dimensions = np.arange(d_model)[np.newaxis, :]     # [1, d_model]
        table = self.get_angles(positions, dimensions, d_model)

        # Even columns carry the sine, odd columns the cosine of the angle.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # Leading axis of size 1 so the table broadcasts over the batch: [1, seq, d_model].
        pos_encoding = tf.cast(table[np.newaxis, ...], tf.float32)
        return inputs + pos_encoding

## Scaled Dot Product

$$\Large{Attention(Q, K, V) = softmax_k(\frac{QK^T}{\sqrt{d_k}}) V} $$

In [None]:
def scaled_dot_product_attention(queries, keys, values, mask):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        queries, keys, values: tensors whose last two axes are [seq, depth]; the
            multi-head layer passes [batch, nb_proj, seq, d_proj].
        mask: None, or a tensor broadcastable to the score matrix in which 1 marks
            a position that must NOT be attended to.

    Returns:
        (attention, weights): the attended values and the softmax attention
        weights taken over the last (key) axis.
    """
    # Raw similarity scores between every query and every key: [..., seq_q, seq_k].
    scores = tf.matmul(queries, keys, transpose_b=True)

    # Scale by sqrt(d_k) so the variance of the scores does not grow with the depth.
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_scores = scores / tf.math.sqrt(keys_dim)

    if mask is not None:
        # Masked positions get a very large negative score, which the softmax
        # turns into a weight of (practically) zero.
        scaled_scores += (mask * -1e9)

    # The softmax is computed once and reused for both return values.
    weights = tf.nn.softmax(scaled_scores, axis=-1)
    attention = tf.matmul(weights, values)

    return attention, weights

In [None]:
# Shape check: attention of a random [64, 8, 20, 64] tensor with itself, without a mask.
x = tf.random.uniform((64, 8, 20, 64))
product, _ = scaled_dot_product_attention(x, x, x, mask=None)
product.shape

TensorShape([64, 8, 20, 64])

In [None]:
# Illustration only: a zero-padded token sequence (left) next to a mask in which 1 marks the padded positions (right).
[2, 4, 6, 0, 0 ,0 ] , [0, 0, 0 , 1, 1, 1]

([2, 4, 6, 0, 0, 0], [0, 0, 0, 1, 1, 1])

## Multi Head Attention 

<center><img src="https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png" width="300" alt="multi-head attention"></center>

In [None]:
class MultiHeadAttention(layers.Layer):
    """Multi-head attention: project Q, K and V, attend per head, merge the heads, project again."""

    def __init__(self, nb_proj):
        super().__init__()
        # Number of attention heads ("projections").
        self.nb_proj = nb_proj

    def build(self, input_shape):
        """Create the four dense layers once the model width is known."""
        # Model width = last axis of the shape handed to build().
        self.d_model = input_shape[-1]

        # Every head must receive an equally sized slice of the model width.
        assert self.d_model % self.nb_proj == 0
        self.d_proj = self.d_model // self.nb_proj

        # Linear projections for queries, keys and values ...
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)

        # ... and the output projection applied after the heads are merged again.
        self.final_lin = layers.Dense(units=self.d_model)

    def split_proj(self, inputs, batch_size):
        """Reshape [batch, seq, d_model] into per-head form [batch, nb_proj, seq, d_proj]."""
        per_head = tf.reshape(inputs, shape=(batch_size, -1, self.nb_proj, self.d_proj))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        """Return (outputs, weights): outputs is [batch, seq, d_model], weights are the attention weights."""
        batch_size = tf.shape(queries)[0]

        # Project first, then split the projections into heads.
        query_heads = self.split_proj(self.query_lin(queries), batch_size)
        key_heads = self.split_proj(self.key_lin(keys), batch_size)
        value_heads = self.split_proj(self.value_lin(values), batch_size)

        attention, weights = scaled_dot_product_attention(query_heads, key_heads, value_heads, mask)

        # Back to [batch, seq, nb_proj, d_proj], then merge the heads into [batch, seq, d_model].
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        merged = tf.reshape(attention, shape=(batch_size, -1, self.d_model))

        return self.final_lin(merged), weights

In [None]:
# Throw-away layer with 8 heads and a random input, used for the shape check in the next cell.
temp_mha = MultiHeadAttention(8)
y = tf.random.uniform((64, 20, 512))  # (batch_size, seq_len, d_model)

In [None]:
# Self-attention (queries = keys = values = y) without a mask; only the output shape is inspected.
out, w = temp_mha(y, y, y, mask=None)
out.shape

TensorShape([64, 20, 512])

## Encoder-Decoder Architecture

<center><img src="https://www.tensorflow.org/images/tutorials/transformer/transformer.png" width="400" alt="transformer"></center>
<br><center>Transformer Model</center>





In [None]:
class EncoderLayer(layers.Layer):
    """One encoder layer: a self-attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation. When `flag` is set, the layers at index 0 and index 3 run an
    experimental re-ordering instead: index 0 applies the attention sub-layer
    twice, index 3 applies the feed-forward sub-layer twice.
    """

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        # Self-attention sub-layer.
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Position-wise feed-forward sub-layer.
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def _attention_block(self, x, mask, training):
        """Self-attention on x, then dropout, residual add and layer norm."""
        attention, _ = self.multi_head_attention(x, x, x, mask)
        attention = self.dropout_1(attention, training=training)
        return self.norm_1(attention + x)

    def _ffn_block(self, x, training):
        """Feed-forward network on x, then dropout, residual add and layer norm."""
        outputs = self.dense_1(x)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        return self.norm_2(outputs + x)

    def call(self, inputs, mask, training, index, flag):
        """Apply the layer.

        Args:
            inputs: [batch, seq, d_model] tensor.
            mask: attention mask handed to the multi-head attention (or None).
            training: whether dropout is active.
            index: position of this layer inside the encoder stack.
            flag: enables the experimental sub-layer re-ordering.

        Returns:
            A [batch, seq, d_model] tensor.
        """
        # Test case: re-ordering of the sub-layers in the encoder.
        if index == 0 and flag:
            # Attention twice; the second pass sees the first result added onto the input.
            first_pass = self._attention_block(inputs, mask, training)
            return self._attention_block(first_pass + inputs, mask, training)

        if index == 3 and flag:
            # Feed-forward twice. The first result is now fed into the second pass,
            # in the same way as the attention-only branch above; previously the
            # second pass re-read the untouched input and the first pass was
            # silently discarded.
            first_pass = self._ffn_block(inputs, training)
            return self._ffn_block(first_pass + inputs, training)

        # Standard layer: self-attention followed by the feed-forward network.
        attention = self._attention_block(inputs, mask, training)
        return self._ffn_block(attention, training)  # [Batch, Seq, D_model]

In [None]:
# Shape check of a single encoder layer. index=4 matches neither special case
# (index 0 or 3), so the standard attention + feed-forward path runs even though flag=True.
EL = EncoderLayer(1024, 8, 0.1)
x = tf.random.uniform((64, 20, 512))
EL(x, None, False, 4, True).shape

TensorShape([64, 20, 512])

In [None]:
class Encoder(layers.Layer):
    """Encoder stack: token embedding + positional encoding followed by `nb_layers` encoder layers."""

    def __init__(self, nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size, d_model, name="encoder"):
        super().__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        # One EncoderLayer instance per level of the stack.
        self.enc_layers = [EncoderLayer(FFN_units, nb_proj, dropout_rate) for _ in range(nb_layers)]

    def call(self, inputs, mask, training, flag):
        """Encode token ids [batch, seq] into [batch, seq, d_model]."""
        # Embeddings are scaled by sqrt(d_model) before the positional encoding is added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.embedding(inputs) * scale
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        # Each layer receives its own index so it can select the experimental path when `flag` is set.
        for index, enc_layer in enumerate(self.enc_layers):
            outputs = enc_layer(outputs, mask, training, index, flag)

        return outputs

In [None]:
# Shape check of the whole encoder stack.
enc = Encoder(6, 1024, 8, 0.1, 8192, 512)
# Random integer token ids inside the vocabulary. Float samples from [0, 1) would
# all end up as id 0, so the embedding lookup would not really be exercised.
x = tf.random.uniform((64, 20), minval=0, maxval=8192, dtype=tf.int32) #[Batch, Seq]
enc(x, None, False, False).shape

TensorShape([64, 20, 512])

In [None]:
class DecoderLayer(layers.Layer):
    """One decoder layer: self-attention, attention over the encoder output, feed-forward network.

    Every sub-layer is followed by dropout, a residual connection and layer normalisation.
    """

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super().__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        # Sub-layer 1: attention of the decoder sequence with itself.
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Sub-layer 2: attention of the decoder sequence over the encoder output.
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        # Sub-layer 3: position-wise feed-forward network.
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training, index, flag):
        """Return (outputs, wb1, wb2).

        wb1 are the self-attention weights and wb2 the weights of the attention over
        the encoder output. `index` and `flag` are accepted but not used here.
        """
        # Self-attention, restricted by mask_1.
        self_attention, wb1 = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
        self_attention = self.dropout_1(self_attention, training)
        self_attention = self.norm_1(self_attention + inputs)

        # Queries come from the decoder, keys and values from the encoder output; restricted by mask_2.
        cross_attention, wb2 = self.multi_head_attention_2(self_attention, enc_outputs, enc_outputs, mask_2)
        cross_attention = self.dropout_2(cross_attention, training)
        cross_attention = self.norm_2(cross_attention + self_attention)

        # Feed-forward network.
        ffn_outputs = self.dense_2(self.dense_1(cross_attention))
        ffn_outputs = self.dropout_3(ffn_outputs, training)
        ffn_outputs = self.norm_3(ffn_outputs + cross_attention)

        return ffn_outputs, wb1, wb2

In [None]:
class Decoder(layers.Layer):
    """Decoder stack: token embedding + positional encoding followed by `nb_layers` decoder layers."""

    def __init__(self, nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size, d_model, name="decoder"):
        super().__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        # One DecoderLayer instance per level of the stack.
        self.dec_layers = [DecoderLayer(FFN_units, nb_proj, dropout_rate) for _ in range(nb_layers)]

    def call(self, inputs, enc_outputs, mask_1, mask_2, training, flag):
        """Return (outputs, weights).

        outputs is [batch, seq, d_model]; weights are the encoder-decoder attention
        weights of the LAST decoder layer only.
        """
        # Embeddings are scaled by sqrt(d_model) before the positional encoding is added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.embedding(inputs) * scale
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for index, dec_layer in enumerate(self.dec_layers):
            # Every layer overwrites the weights, so only the final layer's survive the loop.
            outputs, _, cross_weights = dec_layer(outputs, enc_outputs, mask_1, mask_2, training, index, flag)

        return outputs, cross_weights

In [None]:
# Shape check of the decoder stack on top of the encoder built earlier.
dec = Decoder(6, 1024, 8, 0.1, 8232, 512)
# Random integer token ids that are valid for both the encoder (8192 entries) and
# the decoder (8232 entries). Float samples from [0, 1) would all end up as id 0.
x = tf.random.uniform((64, 20), minval=0, maxval=8192, dtype=tf.int32) #[Batch, Seq]
y = enc(x, None, False, False)
a, b = dec(x, y, None, None, False, False) #[Batch, Seq, D_model]
a.shape, b.shape

(TensorShape([64, 20, 512]), TensorShape([64, 8, 20, 20]))

### **Now let's combine everything together to build our final transformer architecture**

In [None]:
class Transformer(tf.keras.Model):
    """Full encoder-decoder Transformer ending in a linear layer over the decoder vocabulary."""

    def __init__(self, vocab_size_enc, vocab_size_dec, d_model, nb_layers, FFN_units, nb_proj, dropout_rate, name="transformer"):
        super().__init__(name=name)

        self.encoder = Encoder(nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size_enc, d_model)
        self.decoder = Decoder(nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size_dec, d_model)
        # Produces one logit per entry of the decoder vocabulary.
        self.last_linear = layers.Dense(units=vocab_size_dec, name="final_output")

    def create_padding_mask(self, seq):
        """Return a float mask of shape [batch, 1, 1, seq] holding 1 wherever `seq` is 0 (padding).

        The two inserted axes let the mask broadcast against the attention scores,
        which have shape [batch, nb_proj, seq, seq].
        """
        is_padding = tf.math.equal(seq, 0)
        return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        """Return a [seq, seq] float mask holding 1 strictly above the diagonal.

        band_part(..., -1, 0) keeps the lower triangle including the diagonal;
        subtracting it from 1 therefore marks every later position as hidden.
        """
        seq_len = tf.shape(seq)[1]
        lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - lower_triangle

    def call(self, enc_inputs, dec_inputs, training, flag):
        """Return (outputs, weights).

        outputs are logits of shape [batch, seq, vocab_size_dec]; weights are the
        attention weights returned by the decoder.
        """
        # Padding mask of the source sentence, used by the encoder self-attention.
        enc_mask = self.create_padding_mask(enc_inputs)

        # Decoder self-attention: hide padded target positions AND all later positions.
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs),
        )

        # Encoder-decoder attention: keys and values come from the encoder, so the
        # positions to hide are the padded SOURCE positions - the same mask as enc_mask.
        dec_mask_2 = enc_mask

        enc_outputs = self.encoder(enc_inputs, enc_mask, training, flag)
        dec_outputs, weights = self.decoder(dec_inputs, enc_outputs, dec_mask_1, dec_mask_2, training, flag)

        # Project onto the decoder (Swedish) vocabulary.
        outputs = self.last_linear(dec_outputs)

        return outputs, weights  # [Batch, Seq, Vocab_size_dec]

In [None]:
# Demo: band_part(x, 0, -1) keeps the diagonal and everything above it (upper triangle).
# The look-ahead mask uses the opposite arguments (-1, 0), i.e. the lower triangle.
tf.linalg.band_part(tf.ones((10, 10)), 0, -1)

<tf.Tensor: shape=(10, 10), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>

In [None]:
def create_padding_mask(seq):
  """Return a float mask of shape [batch, 1, 1, seq] holding 1 wherever `seq` is 0 (padding)."""
  is_padding = tf.math.equal(seq, 0)
  return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

In [None]:
def create_look_ahead_mask(seq):
  """Return a [seq, seq] float mask holding 1 strictly above the diagonal (later positions)."""
  seq_len = tf.shape(seq)[1]
  # Lower triangle including the diagonal = positions that stay visible.
  lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  return 1 - lower_triangle

In [None]:
# Demo: every 0 in the sequence is treated as padding and becomes a 1 in the mask.
seq = tf.cast([[583, 288, 0, 412, 103, 0, 0, 0]], tf.int32)
create_padding_mask(seq)

<tf.Tensor: shape=(1, 1, 1, 8), dtype=float32, numpy=array([[[[0., 0., 1., 0., 0., 1., 1., 1.]]]], dtype=float32)>

In [None]:
# Demo: the look-ahead mask depends only on the length of `seq` (its second axis), not on its values.
create_look_ahead_mask(seq)

<tf.Tensor: shape=(8, 8), dtype=float32, numpy=
array([[0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [None]:
# The padding mask [batch, 1, 1, seq] broadcasts against the look-ahead mask [seq, seq];
# the element-wise maximum applies both masks at once (a position is hidden if either mask hides it).
tf.maximum(create_padding_mask(seq), create_look_ahead_mask(seq))

<tf.Tensor: shape=(1, 1, 8, 8), dtype=float32, numpy=
array([[[[0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 0., 1., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.]]]], dtype=float32)>

In [None]:
# Throw-away model plus one random source / target sequence of 20 token ids each for a shape check.
tm = Transformer(10000, 10000, 512, 6, 1024, 8, 0.1)
t_input = tf.random.uniform((1, 20), dtype=tf.int64, minval=0, maxval=200)
t_target = tf.random.uniform((1, 20), dtype=tf.int64, minval=0, maxval=200)

In [None]:
# Forward pass with training=False and flag=False; a = logits, w = attention weights returned by the decoder.
a, w = tm(t_input, t_target, False, False)
a.shape, w.shape

(TensorShape([1, 20, 10000]), TensorShape([1, 8, 20, 20]))

In [None]:
# Pick the attention matrix of a single head from the weights returned above.
head = 0
# w has shape (batch=1, num_heads, seq_len_q, seq_len_k); squeezing axis 0 removes the batch axis.
attention_heads = tf.squeeze(w, 0)
attention = attention_heads[head]
attention.shape

TensorShape([20, 20])

**Hyperparameters**

In [None]:
# Optional reset of the Keras global state, left disabled:
#tf.keras.backend.clear_session()

# Hyper-parameters of the model that is actually trained.
# Smaller values than in the paper are used for faster training; the paper's
# value is given in the trailing comment of each line.
D_MODEL = 128       # paper: 512
NB_LAYERS = 4       # paper: 6
FFN_UNITS = 512     # paper: 2048
NB_PROJ = 8         # paper: 8
DROPOUT_RATE = 0.1  # paper: 0.1

# Instantiate the transformer with the English (encoder) and Swedish (decoder) vocabulary sizes.
transformer = Transformer(VOCAB_SIZE_EN, VOCAB_SIZE_SV, D_MODEL, NB_LAYERS, FFN_UNITS, NB_PROJ, DROPOUT_RATE)

**Before we start training, we need to take a few very important steps:**<br><br>1) First, we define our loss object as sparse categorical cross-entropy (we use this loss because every output position is a classification over the target vocabulary, with the targets given as integer class ids).<br><br>2) Next, we define a loss function that creates a mask to hide the padded values so that they are not included in the computation of the loss.

In [None]:
# The model outputs raw scores (logits), hence from_logits=True.
# reduction="none" keeps one loss value per token so that the padded positions
# can be removed before the values are averaged.

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

def loss_function(target, pred):
    """Mean cross-entropy over the non-padding tokens of a batch.

    Args:
        target: integer token ids, where 0 marks padding.
        pred: logits with one more axis than `target` (the vocabulary axis).

    Returns:
        A scalar: the summed loss of the real tokens divided by the number of
        real tokens (0 if the batch contains no real token).
    """
    per_token_loss = loss_object(target, pred)

    # 1 for real tokens, 0 for padding.
    mask = tf.cast(tf.math.not_equal(target, 0), dtype=per_token_loss.dtype)
    per_token_loss *= mask

    # Normalise by the number of real tokens. A plain reduce_mean would divide by
    # ALL positions, so the loss would shrink with the amount of padding in a batch.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token_loss), tf.reduce_sum(mask))

**TODO:<br> 
1) Custom learning rate as described in the paper<br>
2) Training the model<br>
3) Evaluation on test data....** 

In [None]:
# Adam with its default settings; no custom learning-rate schedule is configured here.
optimizer = tf.keras.optimizers.Adam()
# Running mean of the batch losses, reported while training.
train_loss = tf.keras.metrics.Mean(name="track_training_loss")

#### Test training: to check that the loss decreases over time!

In [None]:
# Trial run: one pass over the training batches, printing the running mean loss every 50 batches.
EPOCHS = 1
for epoch in range(EPOCHS):
    print("epoch: ", epoch+1)
    start = time.time()
    # Start every epoch with an empty running mean.
    # NOTE(review): newer Keras releases name this method reset_state() — confirm against the installed version.
    train_loss.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        # Teacher forcing: the decoder is fed the target without its last position ...
        dec_inputs = targets[:, :-1]

        # ... and has to predict the target shifted by one position (everything after the start token).
        dec_outputs_real = targets[:, 1:]

        # Record the forward pass so the gradients can be computed afterwards.
        with tf.GradientTape() as tape:
            # Positional arguments: training=True (dropout active), flag=False (standard layer order).
            predictions, _ = transformer(enc_inputs, dec_inputs, True, False) #predict
            loss = loss_function(dec_outputs_real, predictions) #calc loss
            
        
        # Gradients of the loss with respect to every trainable weight (dL/dw).
        gradients = tape.gradient(loss, transformer.trainable_variables)
        # Update the weights.
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        # Add this batch's loss to the running mean.
        train_loss(loss)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(epoch+1, batch, train_loss.result()))

    print("Time taken for 1 epoch: ", time.time() - start)

epoch:  1
Epoch 1 Batch 0 Loss 5.8639
Epoch 1 Batch 50 Loss 4.7266
Epoch 1 Batch 100 Loss 4.5628
Epoch 1 Batch 150 Loss 4.4917
Epoch 1 Batch 200 Loss 4.4450
Epoch 1 Batch 250 Loss 4.4181
Epoch 1 Batch 300 Loss 4.3800
Epoch 1 Batch 350 Loss 4.3467
Epoch 1 Batch 400 Loss 4.3262
Epoch 1 Batch 450 Loss 4.3070
Epoch 1 Batch 500 Loss 4.2850
Epoch 1 Batch 550 Loss 4.2696
Epoch 1 Batch 600 Loss 4.2585
Epoch 1 Batch 650 Loss 4.2485
Epoch 1 Batch 700 Loss 4.2392
Epoch 1 Batch 750 Loss 4.2231
Epoch 1 Batch 800 Loss 4.2042
Epoch 1 Batch 850 Loss 4.1860
Epoch 1 Batch 900 Loss 4.1713
Epoch 1 Batch 950 Loss 4.1580
Epoch 1 Batch 1000 Loss 4.1460
Epoch 1 Batch 1050 Loss 4.1366
Epoch 1 Batch 1100 Loss 4.1239
Epoch 1 Batch 1150 Loss 4.1149
Epoch 1 Batch 1200 Loss 4.1033
Epoch 1 Batch 1250 Loss 4.0949
Epoch 1 Batch 1300 Loss 4.0859
Epoch 1 Batch 1350 Loss 4.0827
Epoch 1 Batch 1400 Loss 4.0800
Epoch 1 Batch 1450 Loss 4.0728
Epoch 1 Batch 1500 Loss 4.0691
Epoch 1 Batch 1550 Loss 4.0660
Epoch 1 Batch 1600 Lo