<a href="https://colab.research.google.com/github/kmerchan/holbertonschool-machine_learning/blob/main/Transformer_Applications.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 2.x

import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import numpy as np

class Dataset:
    """
    Loads and preps a dataset for machine translation

    class constructor:
        def __init__(self, batch_size, max_len)

    public instance attributes:
        data_train:
            the ted_hrlr_translate/pt_to_en tf.data.Dataset train split,
                loaded as_supervised, then tokenized, filtered by max_len,
                cached, shuffled, split into padded batches and prefetched
        data_valid:
            the ted_hrlr_translate/pt_to_en tf.data.Dataset validation
                split, loaded as_supervised, then tokenized,
                filtered by max_len and split into padded batches
        tokenizer_pt:
            the Portuguese tokenizer created from the training set
        tokenizer_en:
            the English tokenizer created from the training set

    instance methods:
        def tokenize_dataset(self, data):
            creates sub-word tokenizers for our dataset
        def encode(self, pt, en):
            encodes a translation into tokens
        def tf_encode(self, pt, en):
            acts as a TensorFlow wrapper for the encode method
    """
    def __init__(self, batch_size, max_len):
        """
        Class constructor

        parameters:
            batch_size [int]:
                the batch size for training/validation
            max_len [int]:
                the maximum number of tokens allowed per example sentence

        Sets the public instance attributes:
            data_train:
                the tokenized, filtered, cached, shuffled, batched and
                    prefetched train split of ted_hrlr_translate/pt_to_en
            data_valid:
                the tokenized, filtered and batched validation split
                    of ted_hrlr_translate/pt_to_en
            tokenizer_pt:
                the Portuguese tokenizer created from the training set
            tokenizer_en:
                the English tokenizer created from the training set
        """
        data_train = tfds.load("ted_hrlr_translate/pt_to_en",
                               split="train",
                               as_supervised=True)
        data_valid = tfds.load("ted_hrlr_translate/pt_to_en",
                               split="validation",
                               as_supervised=True)
        # both tokenizers are built from the training split only
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            data_train)

        # set attributes to encoded data
        self.data_train = data_train.map(self.tf_encode)
        self.data_valid = data_valid.map(self.tf_encode)

        def filter_max_length(x, y, max_len=max_len):
            """
            Keeps an example only if both sentences have
                at most max_len tokens
            """
            return tf.logical_and(tf.size(x) <= max_len,
                                  tf.size(y) <= max_len)

        # filter training and validation by max_len number of tokens
        self.data_train = self.data_train.filter(filter_max_length)
        self.data_valid = self.data_valid.filter(filter_max_length)

        # increase performance by caching training dataset
        self.data_train = self.data_train.cache()

        # shuffle the training dataset with a buffer covering every example;
        # the number of examples left after filtering is counted
        # by iterating over the dataset once
        data_size = sum(1 for _ in self.data_train)
        self.data_train = self.data_train.shuffle(data_size)

        # split training and validation datasets into padded batches;
        # padded_shapes is passed explicitly because TensorFlow 2.0/2.1
        # require it, and [None] pads to the longest sentence of each batch
        padded_shapes = ([None], [None])
        self.data_train = self.data_train.padded_batch(
            batch_size, padded_shapes=padded_shapes)
        self.data_valid = self.data_valid.padded_batch(
            batch_size, padded_shapes=padded_shapes)

        # increase performance by prefetching training dataset
        self.data_train = self.data_train.prefetch(
            tf.data.experimental.AUTOTUNE)

    def tokenize_dataset(self, data):
        """
        Creates sub-word tokenizers for our dataset

        parameters:
            data [tf.data.Dataset]:
                dataset to use whose examples are formatted as tuple (pt, en)
                pt [tf.Tensor]:
                    contains the Portuguese sentence
                en [tf.Tensor]:
                    contains the corresponding English sentence
        returns:
            tokenizer_pt, tokenizer_en:
                tokenizer_pt: the Portuguese tokenizer
                tokenizer_en: the English tokenizer
        """
        SubwordTextEncoder = tfds.deprecated.text.SubwordTextEncoder
        # each corpus is a generator, so data is iterated once per language
        tokenizer_pt = SubwordTextEncoder.build_from_corpus(
            (pt.numpy() for pt, en in data),
            target_vocab_size=(2 ** 15))
        tokenizer_en = SubwordTextEncoder.build_from_corpus(
            (en.numpy() for pt, en in data),
            target_vocab_size=(2 ** 15))
        return tokenizer_pt, tokenizer_en

    def encode(self, pt, en):
        """
        Encodes a translation into tokens

        The start token of each language is its tokenizer's vocab_size
            and the end token is vocab_size + 1

        parameters:
            pt [tf.Tensor]:
                contains the Portuguese sentence
            en [tf.Tensor]:
                contains the corresponding English sentence
                both must support .numpy(), as they do when this method
                    is run through tf.py_function (see tf_encode)
        returns:
            pt_tokens, en_tokens:
                pt_tokens [list of int]: the Portuguese tokens,
                    wrapped in the Portuguese start and end tokens
                en_tokens [list of int]: the English tokens,
                    wrapped in the English start and end tokens
        """
        pt_start_index = self.tokenizer_pt.vocab_size
        pt_end_index = pt_start_index + 1
        en_start_index = self.tokenizer_en.vocab_size
        en_end_index = en_start_index + 1
        pt_tokens = [pt_start_index] + self.tokenizer_pt.encode(
            pt.numpy()) + [pt_end_index]
        en_tokens = [en_start_index] + self.tokenizer_en.encode(
            en.numpy()) + [en_end_index]
        return pt_tokens, en_tokens

    def tf_encode(self, pt, en):
        """
        Acts as a TensorFlow wrapper for the encode method
            so that it can be used inside tf.data.Dataset.map

        parameters:
            pt [tf.Tensor]:
                contains the Portuguese sentence
            en [tf.Tensor]:
                contains the corresponding English sentence

        returns:
            pt [tf.Tensor of dtype int64 and shape (None,)]:
                encoded Portuguese sentence
            en [tf.Tensor of dtype int64 and shape (None,)]:
                encoded English sentence
        """
        pt_encoded, en_encoded = tf.py_function(func=self.encode,
                                                inp=[pt, en],
                                                Tout=[tf.int64, tf.int64])
        # declare the outputs as 1-D tensors of unknown length,
        # since the code after py_function has no static shape for them
        pt_encoded.set_shape([None])
        en_encoded.set_shape([None])
        return pt_encoded, en_encoded

def create_masks(inputs, target):
    """
    Creates all masks for training/validation

    In every mask, 1 marks a position to hide and 0 a position to keep;
        token id 0 is treated as padding

    parameters:
        inputs [tf.Tensor of shape (batch_size, seq_len_in)]:
            contains the input sentence
        target [tf.Tensor of shape (batch_size, seq_len_out)]:
            contains the target sentence
    returns:
        encoder_mask, combined_mask, decoder_mask
        encoder_mask [tf.Tensor of shape
                        (batch_size, 1, 1, seq_len_in)]:
            padding mask to be applied to the encoder
        combined_mask [tf.Tensor of shape
                        (batch_size, 1, seq_len_out, seq_len_out)]:
            mask used in the 1st attention block in the decoder to pad
                and mask future tokens in the input received by the decoder
            Maximum between look ahead mask and decoder target padding mask
        decoder_mask [tf.Tensor of shape (batch_size, 1, 1, seq_len_in)]:
            padding mask used in the 2nd attention block in the decoder
    """
    encoder_mask = tf.cast(tf.math.equal(inputs, 0), tf.float32)
    encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]

    # the decoder mask is the same padding mask over the input sentence,
    # so the already-computed tensor is reused
    decoder_mask = encoder_mask

    # use the dynamic shape: target.shape may hold None for the sequence
    # length when this function is traced inside a tf.function
    seq_len_out = tf.shape(target)[1]

    # band_part keeps the lower triangle, so 1 - it marks future positions
    look_ahead_mask = 1 - tf.linalg.band_part(
        tf.ones((seq_len_out, seq_len_out)), -1, 0)

    padding_mask = tf.cast(tf.math.equal(target, 0), tf.float32)
    padding_mask = padding_mask[:, tf.newaxis, tf.newaxis, :]

    combined_mask = tf.maximum(look_ahead_mask, padding_mask)

    return encoder_mask, combined_mask, decoder_mask

def get_angle(pos, i, dm):
    """
    Calculates the angle used by the positional encoding formulas:

    PE(pos, 2i) = sin(pos / 10000^(2i / dm))
    PE(pos, 2i + 1) = cos(pos / 10000^(2i / dm))

    parameters:
        pos: the position in the sequence
        i: the value divided by dm in the exponent
            (it is used as given, it is not doubled here)
        dm: model depth

    returns:
        pos * (1 / 10000^(i / dm))
    """
    exponent = i / dm
    denominator = 10000 ** exponent
    return pos * (1 / denominator)


def positional_encoding(max_seq_len, dm):
    """
    Calculates the positional encoding for a transformer

    PE(pos, 2i) = sin(pos / 10000^(2i / dm))
    PE(pos, 2i + 1) = cos(pos / 10000^(2i / dm))

    parameters:
        max_seq_len [int]:
            represents the maximum sequence length
        dm [int]:
            model depth; may be odd, in which case the last column
                only receives the sin term

    returns:
        [numpy.ndarray of shape (max_seq_len, dm)]:
            contains the positional encoding vectors
    """
    # column vector of positions and row vector of the even column indices,
    # broadcast together into one angle per (position, even index) pair
    positions = np.arange(max_seq_len)[:, np.newaxis]
    even_indices = np.arange(0, dm, 2)[np.newaxis, :]
    angles = positions * (1 / (10000 ** (even_indices / dm)))

    encoding = np.zeros([max_seq_len, dm])
    # sin for even indices of the encoding
    encoding[:, 0::2] = np.sin(angles)
    # cos for odd indices of the encoding; an odd dm has one fewer odd
    # column than even columns, so the surplus angle is dropped
    encoding[:, 1::2] = np.cos(angles[:, :dm // 2])
    return encoding

def sdp_attention(Q, K, V, mask=None):
    """
    Calculates the scaled dot product attention

    parameters:
        Q [tensor with last two dimensions as (..., seq_len_q, dk)]:
            contains the query matrix
        K [tensor with last two dimensions as (..., seq_len_v, dk)]:
            contains the key matrix
        V [tensor with last two dimensions as (..., seq_len_v, dv)]:
            contains the value matrix
        mask [tensor that can be broadcast into (..., seq_len_q, seq_len_v)]:
            contains the optional mask, or defaulted to None
            positions where the mask is 1 are pushed towards -1e9
                before the softmax

    returns:
        outputs, weights:
            outputs [tensor with last two dimensions as (..., seq_len_q, dv)]:
                contains the scaled dot product attention
            weights [tensor with last two dimensions as
                    (..., seq_len_q, seq_len_v)]:
                contains the attention weights
    """
    # query/key similarity, scaled by the square root of the key depth
    key_depth = tf.cast(tf.shape(K)[-1], tf.float32)
    logits = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(key_depth)

    # a large negative value makes masked positions vanish in the softmax
    if mask is not None:
        logits = logits + (mask * -1e9)

    # softmax over the last axis so the weights of each query add up to 1
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, V), weights

class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Class to perform multi-head attention

    class constructor:
        def __init__(self, dm, h)

    public instance attribute:
        h: number of heads
        dm: the dimensionality of the model
        depth: the depth of each attention head (dm // h)
        Wq: a Dense layer with dm units, used to generate the query matrix
        Wk: a Dense layer with dm units, used to generate the key matrix
        Wv: a Dense layer with dm units, used to generate the value matrix
        linear: a Dense layer with dm units, used to generate attention output

    public instance methods:
        def split_heads(self, x, batch):
            splits the last dimension of a tensor into the heads
        def call(self, Q, K, V, mask):
            generates the query, key, and value matrices and
                outputs the scaled dot product attention
    """
    def __init__(self, dm, h):
        """
        Class constructor

        parameters:
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
                dm must be divisible by h

        raises:
            TypeError: if dm or h is not an int
            ValueError: if dm is not divisible by h

        sets the public instance attributes:
            h: number of heads
            dm: the dimensionality of the model
            depth: the depth of each attention head
            Wq: a Dense layer with dm units, used to generate the query matrix
            Wk: a Dense layer with dm units, used to generate the key matrix
            Wv: a Dense layer with dm units, used to generate the value matrix
            linear: a Dense layer with dm units,
                used to generate attention output
        """
        if type(dm) is not int:
            raise TypeError(
                "dm must be int representing dimensionality of model")
        if type(h) is not int:
            raise TypeError(
                "h must be int representing number of heads")
        # dm // h would silently truncate otherwise, and split_heads would
        # then reshape the projections into heads of the wrong size
        if dm % h != 0:
            raise ValueError(
                "dm must be divisible by h")
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.dm = dm
        self.depth = dm // h
        self.Wq = tf.keras.layers.Dense(units=dm)
        self.Wk = tf.keras.layers.Dense(units=dm)
        self.Wv = tf.keras.layers.Dense(units=dm)
        self.linear = tf.keras.layers.Dense(units=dm)

    def split_heads(self, x, batch):
        """
        Splits the last dimension of tensor into (h, depth) and
            transposes the result so the shape is
            (batch, h, seq_len, depth)

        parameters:
            x [tensor of shape (batch, seq_len, dm)]:
                the tensor to split
            batch:
                the batch size, used as the first dimension of the reshape

        returns:
            [tensor of shape (batch, h, seq_len, depth)]
        """
        x = tf.reshape(x, (batch, -1, self.h, self.depth))
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        return x

    def call(self, Q, K, V, mask):
        """
        Generates the query, key, and value matrices and
            outputs the scaled dot product attention

        parameters:
            Q [tensor of shape (batch, seq_len_q, dk)]:
                contains the input to generate the query matrix
            K [tensor of shape (batch, seq_len_v, dk)]:
                contains the input to generate the key matrix
            V [tensor of shape (batch, seq_len_v, dv)]:
                contains the input to generate the value matrix
            mask:
                passed unchanged to sdp_attention; None applies no mask

        returns:
            outputs, weights:
                outputs [tensor with last two dimensions (..., seq_len_q, dm)]:
                    contains the scaled dot product attention
                weights [tensor with last dimensions
                        (..., h, seq_len_q, seq_len_v)]:
                    contains the attention weights
        """
        # dynamic batch size, so the layer also works when the static
        # batch dimension is unknown
        batch = tf.shape(Q)[0]

        q = self.Wq(Q)
        k = self.Wk(K)
        v = self.Wv(V)

        q = self.split_heads(q, batch)
        k = self.split_heads(k, batch)
        v = self.split_heads(v, batch)

        attention, weights = sdp_attention(q, k, v, mask)

        # undo split_heads: back to (batch, seq_len_q, h, depth),
        # then merge the heads into the last dimension of size dm
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch, -1, self.dm))
        outputs = self.linear(concat_attention)

        return outputs, weights

class EncoderBlock(tf.keras.layers.Layer):
    """
    Class to create an encoder block for a transformer

    class constructor:
        def __init__(self, dm, h, hidden, drop_rate=0.1)

    public instance attribute:
        mha: MultiHeadAttention layer
        dense_hidden: the hidden dense layer with hidden units, relu activation
        dense_output: the output dense layer with dm units
        layernorm1: the first layer norm layer, with epsilon=1e-6
        layernorm2: the second layer norm layer, with epsilon=1e-6
        dropout1: the first dropout layer
        dropout2: the second dropout layer

    public instance method:
        call(self, x, training, mask=None):
            calls the encoder block and returns the block's output
    """
    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
        Class constructor

        parameters:
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
            hidden [int]:
                represents the number of hidden units in fully connected layer
            drop_rate [float]:
                the dropout rate

        raises:
            TypeError: if dm, h or hidden is not an int,
                or if drop_rate is not a float

        sets the public instance attributes:
            mha: MultiHeadAttention layer
            dense_hidden: the hidden dense layer with hidden units, relu activ.
            dense_output: the output dense layer with dm units
            layernorm1: the first layer norm layer, with epsilon=1e-6
            layernorm2: the second layer norm layer, with epsilon=1e-6
            dropout1: the first dropout layer
            dropout2: the second dropout layer
        """
        # (value, exact type required, error message), checked in order
        requirements = (
            (dm, int,
             "dm must be int representing dimensionality of model"),
            (h, int,
             "h must be int representing number of heads"),
            (hidden, int,
             "hidden must be int representing number of hidden units"),
            (drop_rate, float,
             "drop_rate must be float representing dropout rate"),
        )
        for value, required_type, message in requirements:
            if type(value) is not required_type:
                raise TypeError(message)
        super().__init__()
        # sub-layers are created in a fixed order: attention,
        # feed-forward, layer norms, dropouts
        self.mha = MultiHeadAttention(dm, h)
        self.dense_hidden = tf.keras.layers.Dense(units=hidden,
                                                  activation='relu')
        self.dense_output = tf.keras.layers.Dense(units=dm)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, training, mask=None):
        """
        Calls the encoder block and returns the block's output

        parameters:
            x [tensor of shape (batch, input_seq_len, dm)]:
                contains the input to the encoder block
            training [boolean]:
                determines if the model is in training
            mask:
                mask to be applied for multi-head attention

        returns:
            [tensor of shape (batch, input_seq_len, dm)]:
                contains the block's output
        """
        # self-attention sub-layer: dropout, residual connection, layer norm
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attention_block = self.layernorm1(x + attended)

        # feed-forward sub-layer: dropout, residual connection, layer norm
        feed_forward = self.dense_output(self.dense_hidden(attention_block))
        feed_forward = self.dropout2(feed_forward, training=training)
        return self.layernorm2(attention_block + feed_forward)

class DecoderBlock(tf.keras.layers.Layer):
    """
    A single decoder block for a transformer

    class constructor:
        def __init__(self, dm, h, hidden, drop_rate=0.1)

    public instance attributes:
        mha1: the first MultiHeadAttention layer
        mha2: the second MultiHeadAttention layer
        dense_hidden: the hidden dense layer with hidden units, relu activation
        dense_output: the output dense layer with dm units
        layernorm1: the first layer norm layer, with epsilon=1e-6
        layernorm2: the second layer norm layer, with epsilon=1e-6
        layernorm3: the third layer norm layer, with epsilon=1e-6
        dropout1: the first dropout layer
        dropout2: the second dropout layer
        dropout3: the third dropout layer

    public instance method:
        def call(self, x, encoder_output, training, look_ahead_mask,
                    padding_mask):
            calls the decoder block and returns the block's output
    """
    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
        Validates the arguments and builds the block's sub-layers

        parameters:
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
            hidden [int]:
                represents the number of hidden units in fully connected layer
            drop_rate [float]:
                the dropout rate

        raises:
            TypeError: if dm, h, or hidden is not exactly an int,
                or if drop_rate is not exactly a float

        sets the public instance attributes:
            mha1, mha2: the two MultiHeadAttention layers
            dense_hidden: the hidden dense layer with hidden units, relu activ.
            dense_output: the output dense layer with dm units
            layernorm1, layernorm2, layernorm3:
                the layer norm layers, each with epsilon=1e-6
            dropout1, dropout2, dropout3:
                the dropout layers, each using drop_rate
        """
        # (value, required exact type, error message), checked in order
        requirements = (
            (dm, int,
             "dm must be int representing dimensionality of model"),
            (h, int,
             "h must be int representing number of heads"),
            (hidden, int,
             "hidden must be int representing number of hidden units"),
            (drop_rate, float,
             "drop_rate must be float representing dropout rate"),
        )
        for value, required, message in requirements:
            if type(value) is not required:
                raise TypeError(message)

        super().__init__()

        layers = tf.keras.layers
        self.mha1 = MultiHeadAttention(dm, h)
        self.mha2 = MultiHeadAttention(dm, h)
        self.dense_hidden = layers.Dense(units=hidden, activation='relu')
        self.dense_output = layers.Dense(units=dm)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(drop_rate)
        self.dropout2 = layers.Dropout(drop_rate)
        self.dropout3 = layers.Dropout(drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        """
        Runs the input through the decoder block

        Three sub-layers are applied in sequence: attention over x itself,
        attention that also receives the encoder output, and a two-layer
        feed-forward network. Each sub-layer's result goes through dropout,
        is added back to the sub-layer's input, and is layer-normalized.

        parameters:
            x [tensor of shape (batch, target_seq_len, dm)]:
                contains the input to the decoder block
            encoder_output [tensor of shape (batch, input_seq_len, dm)]:
                contains the output of the encoder
            training [boolean]:
                determines if the model is in training
            look_ahead_mask:
                mask to be applied to the first multi-head attention
            padding_mask:
                mask to be applied to the second multi-head attention

        returns:
            [tensor of shape (batch, target_seq_len, dm)]:
                contains the block's output
        """
        # First attention: x is passed for all three attention inputs
        self_attended, _ = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        target_state = self.layernorm1(x + self_attended)

        # Second attention: the normalized target state is the first input,
        # the encoder output is passed for the remaining two
        cross_attended, _ = self.mha2(
            target_state, encoder_output, encoder_output, padding_mask)
        cross_attended = self.dropout2(cross_attended, training=training)
        merged_state = self.layernorm2(target_state + cross_attended)

        # Feed-forward sub-layer: hidden dense layer, then output dense layer
        projected = self.dense_output(self.dense_hidden(merged_state))
        projected = self.dropout3(projected, training=training)

        return self.layernorm3(merged_state + projected)

class Encoder(tf.keras.layers.Layer):
    """
    The encoder stack for a transformer

    class constructor:
        def __init__(self, N, dm, h, hidden, input_vocab, max_seq_len,
                        drop_rate=0.1)

    public instance attributes:
        N: the number of blocks in the encoder
        dm: the dimensionality of the model
        embedding: the embedding layer for the inputs
        positional_encoding [numpy.ndarray of shape (max_seq_len, dm)]:
            contains the positional encodings
        blocks [list of length N]:
            contains all the EncoderBlocks
        dropout: the dropout layer, applied once the positional encodings
            have been added to the embeddings

    public instance method:
        call(self, x, training, mask):
            calls the encoder and returns the encoder's output
    """
    def __init__(self, N, dm, h, hidden, input_vocab, max_seq_len,
                 drop_rate=0.1):
        """
        Validates the arguments and builds the encoder's layers

        parameters:
            N [int]:
                represents the number of blocks in the encoder
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
            hidden [int]:
                represents the number of hidden units in fully connected layer
            input_vocab [int]:
                represents the size of the input vocabulary
            max_seq_len [int]:
                represents the maximum sequence length possible
            drop_rate [float]:
                the dropout rate

        raises:
            TypeError: if any of the integer arguments is not exactly an int,
                or if drop_rate is not exactly a float

        sets the public instance attributes:
            N: the number of blocks in the encoder
            dm: the dimensionality of the model
            embedding: the embedding layer for the inputs
            positional_encoding [numpy.ndarray of shape (max_seq_len, dm)]:
                contains the positional encodings
            blocks [list of length N]:
                contains all the EncoderBlocks
            dropout: the dropout layer
        """
        # (value, required exact type, error message), checked in order
        requirements = (
            (N, int,
             "N must be int representing number of blocks in the encoder"),
            (dm, int,
             "dm must be int representing dimensionality of model"),
            (h, int,
             "h must be int representing number of heads"),
            (hidden, int,
             "hidden must be int representing number of hidden units"),
            (input_vocab, int,
             "input_vocab must be int representing size of input vocab"),
            (max_seq_len, int,
             "max_seq_len must be int representing max sequence length"),
            (drop_rate, float,
             "drop_rate must be float representing dropout rate"),
        )
        for value, required, message in requirements:
            if type(value) is not required:
                raise TypeError(message)

        super().__init__()

        self.N = N
        self.dm = dm
        self.embedding = tf.keras.layers.Embedding(
            input_dim=input_vocab, output_dim=dm)
        # Computed once here; call() slices it to the actual sequence length
        self.positional_encoding = positional_encoding(max_seq_len, dm)
        self.blocks = [
            EncoderBlock(dm, h, hidden, drop_rate) for _ in range(N)]
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, training, mask):
        """
        Embeds the input and runs it through every encoder block

        parameters:
            x [tensor of shape (batch, input_seq_len)]:
                contains the input to the encoder; it is passed directly
                to the embedding layer
            training [boolean]:
                determines if the model is in training
            mask:
                mask to be applied for multi-head attention

        returns:
            [tensor of shape (batch, input_seq_len, dm)]:
                contains the encoder output
        """
        # Static size of axis 1, used to trim the positional encodings
        seq_len = x.shape[1]

        # Embeddings are scaled by sqrt(dm) before the encodings are added
        embedded = self.embedding(x)
        scale = tf.math.sqrt(tf.cast(self.dm, tf.float32))
        encoded = embedded * scale + self.positional_encoding[:seq_len, :]
        encoded = self.dropout(encoded, training=training)

        for index in range(self.N):
            encoder_block = self.blocks[index]
            encoded = encoder_block(encoded, training, mask)

        return encoded

class Decoder(tf.keras.layers.Layer):
    """
    Class to create the decoder for a transformer

    class constructor:
        def __init__(self, N, dm, h, hidden, target_vocab, max_seq_len,
                        drop_rate=0.1)

    public instance attributes:
        N: the number of blocks in the decoder
        dm: the dimensionality of the model
        embedding: the embedding layer for the targets
        positional_encoding [numpy.ndarray of shape (max_seq_len, dm)]:
            contains the positional encodings
        blocks [list of length N]:
            contains all the DecoderBlocks
        dropout: the dropout layer, applied once the positional encodings
            have been added to the embeddings

    public instance method:
        def call(self, x, encoder_output, training, look_ahead_mask,
                    padding_mask):
            calls the decoder and returns the decoder's output
    """
    def __init__(self, N, dm, h, hidden, target_vocab, max_seq_len,
                 drop_rate=0.1):
        """
        Class constructor

        parameters:
            N [int]:
                represents the number of blocks in the decoder
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
            hidden [int]:
                represents the number of hidden units in fully connected layer
            target_vocab [int]:
                represents the size of the target vocabulary
            max_seq_len [int]:
                represents the maximum sequence length possible
            drop_rate [float]:
                the dropout rate

        raises:
            TypeError: if any of the integer arguments is not exactly an int,
                or if drop_rate is not exactly a float

        sets the public instance attributes:
            N: the number of blocks in the decoder
            dm: the dimensionality of the model
            embedding: the embedding layer for the targets
            positional_encoding [numpy.ndarray of shape (max_seq_len, dm)]:
                contains the positional encodings
            blocks [list of length N]:
                contains all the DecoderBlocks
            dropout: the dropout layer
        """
        # (value, required exact type, error message), checked in order
        requirements = (
            (N, int,
             "N must be int representing number of blocks in the decoder"),
            (dm, int,
             "dm must be int representing dimensionality of model"),
            (h, int,
             "h must be int representing number of heads"),
            (hidden, int,
             "hidden must be int representing number of hidden units"),
            (target_vocab, int,
             "target_vocab must be int representing size of target vocab"),
            (max_seq_len, int,
             "max_seq_len must be int representing max sequence length"),
            (drop_rate, float,
             "drop_rate must be float representing dropout rate"),
        )
        for value, required, message in requirements:
            if type(value) is not required:
                raise TypeError(message)

        super(Decoder, self).__init__()
        self.N = N
        self.dm = dm
        self.embedding = tf.keras.layers.Embedding(input_dim=target_vocab,
                                                   output_dim=dm)
        # Computed once here; call() slices it to the actual sequence length
        self.positional_encoding = positional_encoding(max_seq_len, dm)
        self.blocks = [DecoderBlock(dm, h, hidden, drop_rate)
                       for _ in range(N)]
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        """
        Calls the decoder and returns the decoder's output

        parameters:
            x [tensor of shape (batch, target_seq_len)]:
                contains the input to the decoder; it is passed directly
                to the embedding layer
            encoder_output [tensor of shape (batch, input_seq_len, dm)]:
                contains the output of the encoder
            training [boolean]:
                determines if the model is in training
            look_ahead_mask:
                mask to be applied to first multi-head attention
            padding_mask:
                mask to be applied to second multi-head attention

        returns:
            [tensor of shape (batch, target_seq_len, dm)]:
                contains the decoder output
        """
        # Static size of axis 1, used to trim the positional encodings
        seq_len = x.shape[1]

        # Embeddings are scaled by sqrt(dm) before the encodings are added
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.dm, tf.float32))
        x += self.positional_encoding[:seq_len]

        x = self.dropout(x, training=training)

        for i in range(self.N):
            x = self.blocks[i](x, encoder_output, training,
                               look_ahead_mask, padding_mask)
        return x


class Transformer(tf.keras.layers.Layer):
    """
    The full transformer network: encoder, decoder, and output projection

    class constructor:
        def __init__(self, N, dm, h, hidden, input_vocab, target_vocab,
                     max_seq_input, max_seq_target, drop_rate=0.1)

    public instance attributes:
        encoder: the encoder layer
        decoder: the decoder layer
        linear: the Dense layer with target_vocab units

    public instance method:
        def call(self, inputs, target, training, encoder_mask,
                    look_ahead_mask, decoder_mask):
            calls the transformer network and returns the transformer output
    """
    def __init__(self, N, dm, h, hidden, input_vocab, target_vocab,
                 max_seq_input, max_seq_target, drop_rate=0.1):
        """
        Validates the arguments and builds the encoder, decoder, and
        final dense layer

        parameters:
            N [int]:
                represents the number of blocks in the encoder and decoder
            dm [int]:
                represents the dimensionality of the model
            h [int]:
                represents the number of heads
            hidden [int]:
                represents the number of hidden units in fully connected layer
            input_vocab [int]:
                represents the size of the input vocabulary
            target_vocab [int]:
                represents the size of the target vocabulary
            max_seq_input [int]:
                represents the maximum sequence length possible for input
            max_seq_target [int]:
                represents the maximum sequence length possible for target
            drop_rate [float]:
                the dropout rate

        raises:
            TypeError: if any of the integer arguments is not exactly an int,
                or if drop_rate is not exactly a float

        sets the public instance attributes:
            encoder: the encoder layer
            decoder: the decoder layer
            linear: the Dense layer with target_vocab units
        """
        # (value, required exact type, error message), checked in order
        requirements = (
            (N, int,
             "N must be int representing number of blocks in the encoder"),
            (dm, int,
             "dm must be int representing dimensionality of model"),
            (h, int,
             "h must be int representing number of heads"),
            (hidden, int,
             "hidden must be int representing number of hidden units"),
            (input_vocab, int,
             "input_vocab must be int representing size of input vocab"),
            (target_vocab, int,
             "target_vocab must be int representing size of target vocab"),
            (max_seq_input, int,
             "max_seq_input must be int representing max length for input"),
            (max_seq_target, int,
             "max_seq_target must be int representing max len for target"),
            (drop_rate, float,
             "drop_rate must be float representing dropout rate"),
        )
        for value, required, message in requirements:
            if type(value) is not required:
                raise TypeError(message)

        super().__init__()

        # Encoder and decoder share N, dm, h, hidden, and drop_rate; only
        # the vocabulary size and maximum sequence length differ
        self.encoder = Encoder(
            N, dm, h, hidden, input_vocab, max_seq_input, drop_rate)
        self.decoder = Decoder(
            N, dm, h, hidden, target_vocab, max_seq_target, drop_rate)
        self.linear = tf.keras.layers.Dense(units=target_vocab)

    def call(self, inputs, target, training, encoder_mask, look_ahead_mask,
             decoder_mask):
        """
        Runs the inputs and target through the encoder, the decoder, and
        the final dense layer

        parameters:
            inputs [tensor of shape (batch, input_seq_len)]:
                contains the inputs
            target [tensor of shape (batch, target_seq_len)]:
                contains the target
            training [boolean]:
                determines if the model is in training
            encoder_mask:
                padding mask to be applied to the encoder
            look_ahead_mask:
                look ahead mask to be applied to the decoder
            decoder_mask:
                padding mask to be applied to the decoder

        returns:
            [tensor of shape (batch, target_seq_len, target_vocab)]:
                contains the transformer output
        """
        encoded = self.encoder(inputs, training, encoder_mask)
        decoded = self.decoder(
            target, encoded, training, look_ahead_mask, decoder_mask)
        return self.linear(decoded)

class LearningSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Establishes learning rate schedule for training transformer model

    Utilizes given function for learning rate:
    l_rate = (dm ** -0.5) * min(
                     (step_num ** -0.5), (step_num * warmup_steps ** -1.5))

    The rate grows linearly with the step for the first warmup_steps steps
    and then decays proportionally to the inverse square root of the step.
    """
    def __init__(self, dm, warmup_steps=4000):
        """
        Class constructor

        parameters:
            dm [int]:
                dimensionality of the model
            warmup_steps [int]:
                number of warmup steps, default=4000
        """
        super(LearningSchedule, self).__init__()

        # Keep the raw constructor argument so get_config can return it
        # without having to evaluate a tensor.
        self._dm = dm
        self.dm = tf.cast(dm, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """
        Evaluates the learning rate for the given step

        parameters:
            step [scalar number or tensor]:
                current optimizer step; integer values are accepted

        returns:
            [float32 tensor]:
                the learning rate to use at this step
        """
        # Optimizers may pass their integer iteration counter; rsqrt is only
        # defined for floating point inputs, so convert first.
        step = tf.cast(step, tf.float32)
        rsqrt_dm = tf.math.rsqrt(self.dm)
        rsqrt_step_arg = tf.math.rsqrt(step)
        warmup_step_arg = step * (self.warmup_steps ** -1.5)
        return rsqrt_dm * tf.math.minimum(rsqrt_step_arg, warmup_step_arg)

    def get_config(self):
        """
        Returns the constructor arguments needed to rebuild this schedule

        returns:
            [dict]:
                keyword arguments accepted by __init__
        """
        return {"dm": self._dm, "warmup_steps": self.warmup_steps}

def train_transformer(N, dm, h, hidden, max_len, batch_size, epochs):
    """
    Creates and trains a transformer for machine translation
        of Portuguese to English using previously created dataset

    Prints the running loss and accuracy every 50 batches and once more at
    the end of every epoch. Both metrics ignore target positions whose
    token id is 0 and are restarted at the beginning of each epoch.

    parameters:
        N [int]:
            number of blocks in the encoder and decoder
        dm [int]:
            dimensionality of the model
        h [int]:
            number of heads
        hidden [int]:
            number of hidden units in the fully connected layers
        max_len [int]:
            maximum number of tokens per sequence
        batch_size [int]:
            batch size used for training
        epochs [int]:
            number of epochs to train for
    returns:
        the trained model
    """
    data = Dataset(batch_size, max_len)
    # NOTE(review): the +2 presumably reserves ids for the start/end tokens
    # added while encoding -- confirm against Dataset.encode.
    input_vocab = data.tokenizer_pt.vocab_size + 2
    target_vocab = data.tokenizer_en.vocab_size + 2

    transformer = Transformer(N, dm, h, hidden,
                              input_vocab, target_vocab,
                              max_len, max_len)

    learning_rate = LearningSchedule(dm)

    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    # reduction='none' keeps one loss value per token so that masked
    # positions can be zeroed out before averaging.
    losses = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                           reduction='none')

    def target_mask(actual):
        """
        Builds a float32 mask that is 1 where the target id is not 0
        """
        return tf.cast(tf.math.not_equal(actual, 0), tf.float32)

    def loss_function(actual, prediction):
        """
        Calculate the loss from actual value and the prediction,
            averaged over the positions whose target id is not 0
        """
        loss = losses(actual, prediction)
        mask = tf.cast(target_mask(actual), dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    for epoch in range(epochs):
        # Fresh metrics per epoch so the reported values describe this epoch
        # only instead of an average over everything trained so far.
        train_loss = tf.keras.metrics.Mean(name='train_loss')
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy')

        for batch, (inputs, target) in enumerate(data.data_train):
            # Teacher forcing: the decoder is fed the target without its last
            # token and is trained to predict the target shifted by one.
            target_input = target[:, :-1]
            target_actual = target[:, 1:]
            encoder_mask, look_ahead_mask, decoder_mask = create_masks(
                inputs, target_input)
            with tf.GradientTape() as tape:
                prediction = transformer(inputs, target_input, True,
                                         encoder_mask, look_ahead_mask,
                                         decoder_mask)
                loss = loss_function(target_actual, prediction)
            gradients = tape.gradient(loss, transformer.trainable_variables)
            optimizer.apply_gradients(zip(
                gradients, transformer.trainable_variables))

            train_loss.update_state(loss)
            # Weight each position by the mask so accuracy is measured over
            # the same positions that contribute to the loss.
            train_accuracy.update_state(
                target_actual, prediction,
                sample_weight=target_mask(target_actual))

            if batch % 50 == 0:
                print("Epoch {}, batch {}: loss {} accuracy {}".format(
                    epoch, batch,
                    train_loss.result(), train_accuracy.result()))
        # Read the metrics directly so this also works when the dataset
        # yielded no batches.
        print("Epoch {}: loss {} accuracy {}".format(
            epoch, train_loss.result(), train_accuracy.result()))
    return transformer




#data = Dataset()
#print("TASK 0:")
#for pt, en in data.data_train.take(1):
    #print(pt.numpy().decode('utf-8'))
    #print(en.numpy().decode('utf-8'))
#for pt, en in data.data_valid.take(1):
    #print(pt.numpy().decode('utf-8'))
    #print(en.numpy().decode('utf-8'))
#print(type(data.tokenizer_pt))
#print(type(data.tokenizer_en))
#print("TASK 1:")
#for pt, en in data.data_train.take(1):
    #print(data.encode(pt, en))
#for pt, en in data.data_valid.take(1):
    #print(data.encode(pt, en))
#print("TASK 2:")
#for pt, en in data.data_train.take(1):
    #print(pt, en)
#for pt, en in data.data_valid.take(1):
    #print(pt, en)
#print("TASK 3:")
#tf.compat.v1.set_random_seed(0)
#data = Dataset(32, 40)
#for pt, en in data.data_train.take(1):
    #print(pt, en)
#for pt, en in data.data_valid.take(1):
    #print(pt, en)
#print("TASK 4:")
#tf.compat.v1.set_random_seed(0)
#data = Dataset(32, 40)
#for inputs, target in data.data_train.take(1):
    #print(create_masks(inputs, target))
# Task 5 driver: train a small transformer and report the returned type.
print("TASK 5:")
# Fix the global TensorFlow seed before the model is built.
tf.compat.v1.set_random_seed(0)
# Keyword arguments make the hyperparameter order explicit.
transformer = train_transformer(N=4, dm=128, h=8, hidden=512,
                                max_len=32, batch_size=40, epochs=2)
print(type(transformer))


TASK 5:
Epoch 0, batch 0: loss 10.247076034545898 accuracy 0.0
Epoch 0, batch 50: loss 10.212627410888672 accuracy 0.0012723066611215472
Epoch 0, batch 100: loss 10.133011817932129 accuracy 0.011149213649332523
Epoch 0, batch 150: loss 10.015902519226074 accuracy 0.015223746187984943
Epoch 0, batch 200: loss 9.859915733337402 accuracy 0.01601114124059677
Epoch 0, batch 250: loss 9.667801856994629 accuracy 0.018717065453529358
Epoch 0, batch 300: loss 9.447187423706055 accuracy 0.02274213545024395
Epoch 0, batch 350: loss 9.203826904296875 accuracy 0.026899071410298347
Epoch 0, batch 400: loss 8.95121955871582 accuracy 0.03125317767262459
Epoch 0, batch 450: loss 8.706740379333496 accuracy 0.0348038487136364
Epoch 0, batch 500: loss 8.490663528442383 accuracy 0.03764767944812775
Epoch 0, batch 550: loss 8.30311393737793 accuracy 0.04032387584447861
Epoch 0, batch 600: loss 8.133316040039062 accuracy 0.04309402406215668
Epoch 0, batch 650: loss 7.976280689239502 accuracy 0.04649521782994