<a href="https://colab.research.google.com/github/icculp/holbertonschool-machine_learning/blob/main/supervised_learning/0x12-transformer_apps/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
print(tf.__version__)

2.5.0


In [2]:
def positional_encoding(max_seq_len, dm):
    """ calculates the positional encoding for a transformer
        max_seq_len is an integer representing the maximum sequence length
        dm is the model depth
        Returns: a numpy.ndarray of shape (max_seq_len, dm)
            containing the positional encoding vectors
    """
    pe = np.zeros((max_seq_len, dm))

    pe = np.array([[pos / np.power(10000, 2 * (j // 2) / dm)
                    for j in range(dm)]
                  for pos in range(max_seq_len)])
    pe[:, 0::2] = np.sin(pe[:, 0::2])
    pe[:, 1::2] = np.cos(pe[:, 1::2])
    '''
    for pos in range(max_seq_len):
        for i in range(0, dm, 2):
            pe[pos, i] = (np.sin(pos / (10000 ** ((2 * i) / dm))))
            pe[pos, i + 1] = (np.cos(pos / (10000 ** ((2 * (i + 1)) / dm))))'''
    return pe


def sdp_attention(Q, K, V, mask=None):
    """ Q is a tensor with its last two dimensions as (..., seq_len_q, dk)
            containing the query matrix
        K is a tensor with its last two dimensions as (..., seq_len_v, dk)
            containing the key matrix
        V is a tensor with its last two dimensions as (..., seq_len_v, dv)
            containing the value matrix
        mask is a tensor that can be broadcast into (..., seq_len_q, seq_len_v)
            containing the optional mask, or defaulted to None
        if mask is not None, multiply -1e9 to the mask and add it to the
            scaled matrix multiplication
        The preceding dimensions of Q, K, and V are the same
        Returns: output, weights
            output a tensor with its last two dimensions as
                (..., seq_len_q, dv) containing scaled dot product attention
            weights a tensor with its last two dimensions as
                (..., seq_len_q, seq_len_v) containing the attention weights
    """
    matmul_qk = tf.matmul(Q, K, transpose_b=True)
    dk = tf.cast(tf.shape(K)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, V)
    return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
    """  perform multi head attention """

    def __init__(self, dm, h):
        """ dm is an integer representing the dimensionality of the model
            h is an integer representing the number of heads
            dm is divisible by h

            Sets the following public instance attributes:
            h - the number of heads
            dm - the dimensionality of the model
            depth - the depth of each attention head
            Wq - a Dense layer with dm units, used to generate the query matrix
            Wk - a Dense layer with dm units, used to generate the key matrix
            Wv - a Dense layer with dm units, used to generate the value matrix
            linear - Dense layer with dm units, to generate attention output
        """
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.dm = dm
        self.depth = dm // h
        self.Wq = tf.keras.layers.Dense(dm)
        self.Wk = tf.keras.layers.Dense(dm)
        self.Wv = tf.keras.layers.Dense(dm)
        self.linear = tf.keras.layers.Dense(dm)

    def call(self, Q, K, V, mask):
        """
            Q is a tensor of shape (batch, seq_len_q, dk) containing the
                input to generate the query matrix
            K is a tensor of shape (batch, seq_len_v, dk) containing the
                input to generate the key matrix
            V is a tensor of shape (batch, seq_len_v, dv) containing the
                input to generate the value matrix
            mask is always None
            Returns: output, weights
            outputa tensor with its last two dimensions as (..., seq_len_q, dm)
                containing the scaled dot product attention
            weights a tensor with its last three dimensions as
                (..., h, seq_len_q, seq_len_v) containing the attention weights
        """
        def split_heads(x, batch_size):
            """Split the last dimension into (num_heads, depth).
                Transpose the result such that the shape is
                    (batch_size, num_heads, seq_len, depth)
            """
            x = tf.reshape(x, (batch_size, -1, self.h, self.depth))
            return tf.transpose(x, perm=[0, 2, 1, 3])

        batch_size = tf.shape(Q)[0]
        q = self.Wq(Q)
        k = self.Wk(K)
        v = self.Wv(V)
        q = split_heads(q, batch_size)
        k = split_heads(k, batch_size)
        v = split_heads(v, batch_size)

        scaled_attention, attention_weights = sdp_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.dm))

        output = self.linear(concat_attention)
        return output, attention_weights


class EncoderBlock(tf.keras.layers.Layer):
    """  perform multi head attention """

    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
            dm - the dimensionality of the model
            h - the number of heads
            hidden - the number of hidden units in the fully connected layer
            drop_rate - the dropout rate

            Sets the following public instance attributes:
            mha - a MultiHeadAttention layer
            dense_hidden - hidden dense layer w/ hidden units & relu activation
            dense_output - output dense layer w/ dm units
            layernorm1 - the first layer norm layer, with epsilon=1e-6
            layernorm2 - the second layer norm layer, with epsilon=1e-6
            dropout1 - the first dropout layer
            dropout2 - the second dropout layer
        """
        super(EncoderBlock, self).__init__()
        self.mha = MultiHeadAttention(dm, h)
        self.dense_hidden = tf.keras.layers.Dense(hidden, activation='relu')
        self.dense_output = tf.keras.layers.Dense(dm)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, training, mask=None):
        """
            x - a tensor of shape (batch, input_seq_len, dm)
                containing the input to the encoder block
            training - a boolean to determine if the model is training
            mask - the mask to be applied for multi head attention

            Returns: a tensor of shape (batch, input_seq_len, dm)
                containing the block’s output
        """
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.dense_output(self.dense_hidden(out1))
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


class DecoderBlock(tf.keras.layers.Layer):
    """  perform multi head attention """

    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
            dm - the dimensionality of the model
            h - the number of heads
            hidden - the number of hidden units in the fully connected layer
            drop_rate - the dropout rate

            Sets the following public instance attributes:
            mha1 - the first MultiHeadAttention layer
            mha2 - the second MultiHeadAttention layer
            dense_hidden - hidden dense layer w/ hidden units & relu activation
            dense_output - the output dense layer with dm units
            layernorm1 - the first layer norm layer, with epsilon=1e-6
            layernorm2 - the second layer norm layer, with epsilon=1e-6
            layernorm3 - the third layer norm layer, with epsilon=1e-6
            dropout1 - the first dropout layer
            dropout2 - the second dropout layer
            dropout3 - the third dropout layer
        """
        super(DecoderBlock, self).__init__()
        self.mha1 = MultiHeadAttention(dm, h)
        self.mha2 = MultiHeadAttention(dm, h)
        self.dense_hidden = tf.keras.layers.Dense(hidden, activation='relu')
        self.dense_output = tf.keras.layers.Dense(dm)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)
        self.dropout3 = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        """
            x - a tensor of shape (batch, target_seq_len, dm)
                containing the input to the decoder block
            encoder_output - a tensor of shape (batch, input_seq_len, dm)
                containing the output of the encoder
            training - a boolean to determine if the model is training
            look_ahead_mask - mask applied to first multi head attention layer
            padding_mask - mask applied to second multi head attention layer
            Returns: a tensor of shape (batch, target_seq_len, dm)
                containing the block’s output
        """
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        # print('51')
        attn2, attn_weights_block2 = self.mha2(
            out1, encoder_output, encoder_output, padding_mask)
        # print(54)
        attn2 = self.dropout2(attn2, training=training)
        # print(56)
        out2 = self.layernorm2(attn2 + out1)
        # print(58)
        ffn_output = self.dense_output(self.dense_hidden(out2))
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)
        return out3


class Encoder(tf.keras.layers.Layer):
    """  create encoder for transformer """

    def __init__(self, N, dm, h, hidden, input_vocab,
                 max_seq_len, drop_rate=0.1):
        """
            N - the number of blocks in the encoder
            dm - the dimensionality of the model
            h - the number of heads
            hidden - the number of hidden units in the fully connected layer
            input_vocab - the size of the input vocabulary
            max_seq_len - the maximum sequence length possible
            drop_rate - the dropout rate

            Sets the following public instance attributes:
            N - the number of blocks in the encoder
            dm - the dimensionality of the model
            embedding - the embedding layer for the inputs
            positional_encoding - a numpy.ndarray of shape (max_seq_len, dm)
                containing the positional encodings
            blocks - a list of length N containing all of the EncoderBlock‘s
            dropout - the dropout layer, applied to the positional encodings
        """
        super(Encoder, self).__init__()
        self.N = N
        self.dm = dm
        self.embedding = tf.keras.layers.Embedding(input_dim=input_vocab,
                                                   output_dim=dm)
        self.positional_encoding = positional_encoding(max_seq_len, dm)
        self.blocks = [EncoderBlock(dm, h, hidden, drop_rate)
                       for _ in range(N)]
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, training, mask):
        """
            x - a tensor of shape (batch, input_seq_len, dm)
                containing the input to the encoder
            training - a boolean to determine if the model is training
            mask - the mask to be applied for multi head attention
            Returns: a tensor of shape (batch, input_seq_len, dm)
                containing the encoder output
        """
        seq_len = x.get_shape().as_list()[1]
        # print(type(seq_len), seq_len)
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.dm, tf.float32))
        # print('000', self.positional_encoding.shape)
        x += self.positional_encoding[:seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.N):
            x = self.blocks[i](x, training, mask)

        return x


class Decoder(tf.keras.layers.Layer):
    """  create encoder for transformer """

    def __init__(self, N, dm, h, hidden, target_vocab,
                 max_seq_len, drop_rate=0.1):
        """
            N - the number of blocks in the encoder
            dm - the dimensionality of the model
            h - the number of heads
            hidden - the number of hidden units in the fully connected layer
            target_vocab - the size of the target vocabulary
            max_seq_len - the maximum sequence length possible
            drop_rate - the dropout rate

            Sets the following public instance attributes:
            N - the number of blocks in the encoder
            dm - the dimensionality of the model
            embedding - the embedding layer for the inputs
            positional_encoding - a numpy.ndarray of shape (max_seq_len, dm)
                containing the positional encodings
            blocks - a list of length N containing all of the EncoderBlock‘s
            dropout - the dropout layer, applied to the positional encodings
        """
        super(Decoder, self).__init__()
        self.N = N
        self.dm = dm
        self.embedding = tf.keras.layers.Embedding(input_dim=target_vocab,
                                                   output_dim=dm)
        self.positional_encoding = positional_encoding(max_seq_len, dm)
        self.blocks = [DecoderBlock(dm, h, hidden, drop_rate)
                       for _ in range(N)]
        self.dropout = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        """
            x - a tensor of shape (batch, target_seq_len, dm)
                containing the input to the decoder
            encoder_output - a tensor of shape (batch, input_seq_len, dm)
                containing the output of the encoder
            training - a boolean to determine if the model is training
            look_ahead_mask - mask applied to 1st multi head attention layer
            padding_mask - mask to be applied to 2nd multi head attention layer
            Returns: tensor of shape
                (batch, target_seq_len, dm) containing decoder output
        """
        seq_len = x.get_shape().as_list()[1]
        # seq_len = x.shape[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x = x * tf.math.sqrt(tf.cast(self.dm, tf.float32))
        x = x + self.positional_encoding[:seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.N):
            x = self.blocks[i](x, encoder_output, training,
                               look_ahead_mask, padding_mask)

            # attention_weights[f'decoder_layer{i+1}_block1'] = block1
            # attention_weights[f'decoder_layer{i+1}_block2'] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x  # , attention_weights


class Transformer(tf.keras.Model):
    """  create encoder for transformer """

    def __init__(self, N, dm, h, hidden, input_vocab, target_vocab,
                 max_seq_input, max_seq_target, drop_rate=0.1):
        """
            N - the number of blocks in the encoder and decoder
            dm - the dimensionality of the model
            h - the number of heads
            hidden - the number of hidden units in the fully connected layers
            input_vocab - the size of the input vocabulary
            target_vocab - the size of the target vocabulary
            max_seq_input - the maximum sequence length possible for the input
            max_seq_target - the maximum sequence length possible for target
            drop_rate - the dropout rate

            Sets the following public instance attributes:
            encoder - the encoder layer
            decoder - the decoder layer
            linear - a final Dense layer with target_vocab units
        """
        super(Transformer, self).__init__()
        self.encoder = Encoder(N, dm, h, hidden, input_vocab,
                               max_seq_input, drop_rate)
        self.decoder = Decoder(N, dm, h, hidden, target_vocab,
                               max_seq_target, drop_rate)
        self.linear = tf.keras.layers.Dense(target_vocab)

    def call(self, inputs, target, training, encoder_mask,
             look_ahead_mask, decoder_mask):
        """
            inputs - tensor (batch, input_seq_len)containing the inputs
            target - tensor (batch, target_seq_len)containing the target
            training - a boolean to determine if the model is training
            encoder_mask - the padding mask to be applied to the encoder
            look_ahead_mask - the look ahead mask to be applied to the decoder
            decoder_mask - the padding mask to be applied to the decoder
            Returns: a tensor of shape (batch, target_seq_len, target_vocab)
                containing the transformer output
        """
        enc_output = self.encoder(inputs, training, encoder_mask)
        dec_output = self.decoder(
            target, enc_output, training, look_ahead_mask, decoder_mask)

        final_output = self.linear(dec_output)

        return final_output


In [3]:
''' training pre-imports '''
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

tf.config.run_functions_eagerly(True)
# tf.data.experimental.enable.debug_mode()
class Dataset():
    """ loads and preps a dataset for machine translation """

    def __init__(self, batch_size, max_len):
        """
        batch_size is the batch size for training/validation
        max_len is the maximum number of tokens allowed per example sentence

        data_train, which contains the ted_hrlr_translate/pt_to
            en tf.data.Dataset train split, loaded as_supervided
        data_valid, which contains the ted_hrlr_translate/pt
            to_en tf.data.Dataset validate split, loaded as_supervided
        tokenizer_pt is the Portuguese tokenizer created
            from the training set
        tokenizer_en is the English tokenizer created
            from the training set
        """
        def filter(d, max):
            ''' filters dataset by max_len '''
            return tf.math.less(d, max)
        self.batch_size = batch_size
        train = tfds.load('ted_hrlr_translate/pt_to_en',
                          split='train', as_supervised=True)
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(train)
        self.data_train = train.map(self.tf_encode)
        '''tf.logical_and(
                tf.size(x) <= max_len,
                tf.size(y) <= max_len
            )'''
        self.data_train = self.data_train.filter(lambda x, y: tf.logical_and(
            tf.size(x) <= max_len, tf.size(y) <= max_len))  # filter)
        self.data_train = self.data_train.cache()
        # s = 0
        # for _ in self.data_train:
        #    buffer_size += 1
        s = sum(1 for _ in self.data_train)
        '''s = self.data_train.map(lambda x: 1,
                                num_parallel_calls=tf.data.experimental.
                                AUTOTUNE).reduce(tf.constant(0),
                                                 lambda x, _: x+1)'''
        self.data_train = self.data_train.shuffle(s)
        # self.data_train.cardinality().numpy().size)
        self.data_train = self.data_train.padded_batch(batch_size)
        # , padded_shapes=batch_size)  # , (batch_size, [None]))
        self.data_train = self.data_train.prefetch(tf.data.
                                                   experimental.AUTOTUNE)
        valid = tfds.load('ted_hrlr_translate/pt_to_en',
                          split='validation', as_supervised=True)
        self.data_valid = valid.map(self.tf_encode)
        self.data_valid = self.data_valid.filter(lambda x, y: tf.logical_and(
            tf.size(x) <= max_len, tf.size(y) <= max_len))
        self.data_valid = self.data_valid.padded_batch(batch_size)

    def tokenize_dataset(self, data):
        """ creates sub-word tokenizers for our dataset:
            data is a tf.data.Dataset whose examples are formatted as a tuple
                (pt, en)
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence
            The maximum vocab size should be set to 2**15
            Returns: tokenizer_pt, tokenizer_en
                tokenizer_pt is the Portuguese tokenizer
                tokenizer_en is the English tokenizer
        """
        STE = tfds.deprecated.text.SubwordTextEncoder
        tokenizer_pt = STE.build_from_corpus(
            (pt.numpy() for pt, en in data), target_vocab_size=2**15)
        tokenizer_en = STE.build_from_corpus(
            (en.numpy() for pt, en in data), target_vocab_size=2**15)
        return tokenizer_pt, tokenizer_en

    def encode(self, pt, en):
        """ encodes a translation into tokens
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence
            The tokenized sentences should include the start and end of
                sentence tokens
            The start token should be indexed as vocab_size
            The end token should be indexed as vocab_size + 1
            Returns: pt_tokens, en_tokens
                pt_tokens is a np.ndarray containing the Portuguese tokens
                en_tokens is a np.ndarray. containing the English tokens
        """
        vocab_en = self.tokenizer_en.vocab_size
        vocab_pt = self.tokenizer_pt.vocab_size
        p = [vocab_pt] + self.tokenizer_pt.encode(pt.numpy()) + [vocab_pt + 1]
        e = [vocab_en] + self.tokenizer_en.encode(en.numpy()) + [vocab_en + 1]
        return p, e

    def tf_encode(self, pt, en):
        ''' acts as a tensorflow wrapper for the encode instance method
            set shape of return tensors
        '''
        tp, te = tf.py_function(func=self.encode,
                                inp=[pt, en],
                                Tout=[tf.int64, tf.int64])
        tp = tf.ensure_shape(tp, [None])
        te = tf.ensure_shape(te, [None])
        # tp.set_shape([None])
        # te.set_shape([None])
        return tp, te

def create_masks(inputs, target):
    ''' creates all masks for training/validation
        inputs is a tf.Tensor of shape (batch_size, seq_len_in)
            that contains the input sentence
        target is a tf.Tensor of shape (batch_size, seq_len_out)
            that contains the target sentence
        This function should only use tensorflow operations in order to
            properly function in the training step
        Returns: encoder_mask, combined_mask, decoder_mask
            encoder_mask is the tf.Tensor padding mask of shape
                (batch_size, 1, 1, seq_len_in) to be applied in the encoder
            combined_mask is the tf.Tensor of shape
                (batch_size, 1, seq_len_out, seq_len_out) used in the 1st
                attention block in the decoder to pad and mask future tokens
                in the input received by the decoder. It takes the maximum
                between a lookaheadmask and the decoder target padding mask.
            decoder_mask is the tf.Tensor padding mask of shape
                (batch_size, 1, 1, seq_len_in) used in the 2nd attention block
                in the decoder.
    '''
    batch_size, seq_len_in = tf.shape(inputs).numpy()
    _, seq_len_out = tf.shape(target).numpy()

    def create_look_ahead_mask(size):
        ''' lookahead mask '''
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask

    def create_padding_mask(seq):
        ''' padding mask '''
        seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
        # add extra dimensions to add the padding
        # to the attention logits.
        return seq[:, tf.newaxis, tf.newaxis, :]

    encoder_mask = create_padding_mask(inputs)
    decoder_mask = create_padding_mask(inputs)
    look_ahead = create_look_ahead_mask(seq_len_out)
    decoder_target = create_padding_mask(target)
    combined_mask = tf.maximum(look_ahead, decoder_target)
    return encoder_mask, combined_mask, decoder_mask

In [20]:
def train_transformer(N, dm, h, hidden, max_len, batch_size, epochs):
    """ creates and trains a transformer model for machine translation of
            Portuguese to English using our previously created dataset
        N the number of blocks in the encoder and decoder
        dm the dimensionality of the model
        h the number of heads
        hidden the number of hidden units in the fully connected layers
        max_len the maximum number of tokens per sequence
        batch_size the batch size for training
        epochs the number of epochs to train for
        trained with Adam optimization with
            beta_1=0.9, beta_2=0.98, epsilon=1e-9
        learning rate should scheduled using the following equation:
            lrate = (d ** 0.5) * min(step_num ** -.5,
                                     step_num * warmup_steps ** -1.5)
            with warmup_steps=4000:

        sparse categorical crossentropy loss, ignoring padded tokens

        Returns the trained model
    """

    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        ''' custom schedule for learning rate '''
        def __init__(self, d_model, warmup_steps=4000):
            ''' constructor '''
            super(CustomSchedule, self).__init__()

            self.d_model = d_model
            self.d_model = tf.cast(self.d_model, tf.float32)

            self.warmup_steps = warmup_steps

        def __call__(self, step):
            ''' instance '''
            arg1 = tf.math.rsqrt(step)
            arg2 = step * (self.warmup_steps ** -1.5)

            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    data = Dataset(batch_size, max_len)
    # create_masks()
    input_v = data.tokenizer_pt.vocab_size + 2
    target_v = data.tokenizer_en.vocab_size + 2
    transformer = Transformer(N, dm, h, hidden, input_v,
                              target_v, max_len, max_len)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        ''' loss '''
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

    def accuracy_function(real, pred):
        ''' acc '''
        accuracies = tf.equal(real, tf.argmax(pred, axis=2))

        mask = tf.math.logical_not(tf.math.equal(real, 0))
        accuracies = tf.math.logical_and(mask, accuracies)

        accuracies = tf.cast(accuracies, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

    learning_rate = CustomSchedule(dm)

    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                         beta_2=0.98, epsilon=1e-9)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    train_step_signature = [
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp, tar):
        ''' train_step '''
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        enc_padding_mask, combined_mask, dec_padding_mask = \
            create_masks(inp, tar_inp)

        with tf.GradientTape() as tape:
            predictions = transformer(inp, tar_inp,
                                         True,
                                         enc_padding_mask,
                                         combined_mask,
                                         dec_padding_mask)
            loss = loss_function(tar_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients,
                                  transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(accuracy_function(tar_real, predictions))

    for epoch in range(epochs):
        batch = 0
        for (inp, tar) in data.data_train:
            train_step(inp, tar)

            if batch % 50 == 0:
                print(f'Epoch {epoch + 1} Batch {batch} Loss ' +
                      f'{train_loss.result()} Accuracy ' +
                      f'{train_accuracy.result()}')
            batch += 1
        print(f'Epock {epoch+1}: loss {train_loss.result()}')
    return model


In [21]:
tf.compat.v1.set_random_seed(0)
transformer = train_transformer(4, 128, 8, 512, 32, 40, 2)
print(type(transformer))

  "Even though the `tf.config.experimental_run_functions_eagerly` "


Epoch 1 Batch 0 Loss 10.247076034545898 Accuracy 0.0
Epoch 1 Batch 50 Loss 10.212628364562988 Accuracy 0.0024144179187715054
Epoch 1 Batch 100 Loss 10.133011817932129 Accuracy 0.02189357951283455
Epoch 1 Batch 150 Loss 10.015898704528809 Accuracy 0.02988712303340435
Epoch 1 Batch 200 Loss 9.859907150268555 Accuracy 0.031428012996912
Epoch 1 Batch 250 Loss 9.667792320251465 Accuracy 0.036626458168029785
Epoch 1 Batch 300 Loss 9.447176933288574 Accuracy 0.04432251304388046
Epoch 1 Batch 350 Loss 9.203819274902344 Accuracy 0.052301373332738876
Epoch 1 Batch 400 Loss 8.951211929321289 Accuracy 0.060824524611234665
Epoch 1 Batch 450 Loss 8.70673656463623 Accuracy 0.06777610629796982
Epoch 1 Batch 500 Loss 8.490659713745117 Accuracy 0.07332771271467209
Epoch 1 Batch 550 Loss 8.30309009552002 Accuracy 0.07851012051105499
Epoch 1 Batch 600 Loss 8.133262634277344 Accuracy 0.08390526473522186
Epoch 1 Batch 650 Loss 7.976217269897461 Accuracy 0.09054746478796005
Epoch 1 Batch 700 Loss 7.828542709

NameError: ignored