In [1]:
import numpy as np
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers, regularizers, optimizers, activations, Model
import scipy as sy


In [2]:
from keras.layers import Layer, Dense, ReLU, LayerNormalization, Dropout, Input
from keras.backend import softmax

In [3]:
from tensorflow import matmul, math, cast, float32, reshape, transpose, shape

In [4]:
from tensorflow import convert_to_tensor, string
from keras.layers import TextVectorization, Embedding
from tensorflow import data
from tensorflow.data import Dataset

In [5]:
class DotProductAttention(Layer):
    def __init__(self, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask=None):
        # Score the queres against the keys after transposing the latter, scaling
        scores = matmul(queries, keys, transpose_b=True)/ math.sqrt(cast(d_k, float32))

        #apply mask to the attention scores 
        if mask is not None:
            scores += -1e9 * mask

        #Compute the weights by a softmax operation
        weights = softmax(scores)

        #Coimputing the attention by a weighted sum ofthe value vectors

        return matmul(weights, values)

In [6]:
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_model = 512  # Dimensionality of the model sub-layers' outputs
batch_size = 64  # Batch size from the training process

In [7]:
'''vocab_size = 10
input_seq_length = 5  # Maximum length of the input sequence
output_sequence_length = 5
queries = random.random((batch_size, input_seq_length, d_k))
keys = random.random((batch_size, input_seq_length, d_k))
values = random.random((batch_size, input_seq_length, d_v))'''

'vocab_size = 10\ninput_seq_length = 5  # Maximum length of the input sequence\noutput_sequence_length = 5\nqueries = random.random((batch_size, input_seq_length, d_k))\nkeys = random.random((batch_size, input_seq_length, d_k))\nvalues = random.random((batch_size, input_seq_length, d_v))'

In [8]:
#attention = DotProductAttention()
#print(attention(queries, keys, values, d_k))

In [9]:
class MultiHeadAttention(Layer):
    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.attention = DotProductAttention() #Scaled dot product attention
        self.heads = h #number of attention heads to use
        self.d_k = d_k # dimensionality of the linearly projected queries and keys
        self.d_v = d_v # dimensionality of the linearly projected values
        self.d_model = d_model #Dimensionality of the model
        self.W_q = Dense(d_k) # Learned projectino matrix for the queries
        self.W_k = Dense(d_k) # Learned projection matrix for the keys
        self.W_v = Dense(d_v) # Learning projection matrix for the values
        self.W_o = Dense(d_model) # learning projection matrix for the multi-head output

    def reshape_tensor(self, x, heads, flag):
        if flag:
            #Tensor shape after reshaping and transpoing: (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, int(x.shape[2]/heads)))
            x = transpose(x, perm=(0,2,1,3))

        else:
            #reverting the reshaping and transpoing operations: (batch_size, seq_length, d_model)
            x = transpose(x, perm=(0,2,1,3))
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], int(x.shape[2]*x.shape[3])))

        return x

    def call(self, queries, keys, values, mask=None):
        # Rearange the queries to be able to compute all heads in parallel
        q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)

        # Rearrange the keys to be able to compute all heads in parallel
        k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)

        #Rearrange the values to be able to compute all the heads in parallel
        v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
        #resulting tensor shape: (batch_size, heads, input_seq_length, -1)

        # Compute the multi-head attention output using the reshape queries, keys, and values
        o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, self.d_k, mask)
        # Resulting tensor shape: (batch_size, heads, input_seq_length, -1)

        # rearranage back the output into concatenated form
        output = self.reshape_tensor(o_reshaped, self.heads, False)
        # Resulting tensor shape: (batch_size, self.heads, d_v)

        # apply one final linear projection to the output to generate the multi-head attention
        # resulting tensor shape: (batch_size, input_seq_length, d_model)
        return self.W_o(output)


            

In [10]:
#multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
#print(multihead_attention(queries, keys, values))

In [11]:
#Building the positional embedding 

In [12]:
class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)   
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)                                          
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )
             
    def get_position_encoding(self, seq_len, d, n=100000):
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        return P


    def call(self, inputs):        
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

In [13]:
#Building the Encoder

In [14]:
class FeedForward(Layer):
    def __init__(self, d_ff, d_model, **kwargs):
        super(FeedForward, self).__init__(**kwargs)
        self.fully_connected1 = Dense(d_ff) #first fully connected layer
        self.fully_connected2 = Dense(d_model) # Second fully connected layer
        self.activation = ReLU() # ReLU activation layer

    def call(self, x):
        #the input is passed into the fully connected layers, with a relu inbetween
        x_fc1 = self.fully_connected1(x)

        return self.fully_connected2(self.activation(x_fc1))
    

In [15]:
class AddNormalization(Layer):
    def __init__(self, **kwargs):
        super(AddNormalization, self).__init__(**kwargs)
        self.layer_norm = LayerNormalization() #Layer normalization layer

    def call(self, x, sublayer_x):
        # the sublayer input and output need to be of the same shape to be summed

        add = x + sublayer_x

        return self.layer_norm(add)

In [16]:
class EncoderLayer(Layer):
    def __init__(self, sequence_length,h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.sequence_length = sequence_length
        self.build(input_shape=[None, sequence_length, d_model])
        self.d_model = d_model

    def build_graph(self):
        input_layer = Input(shape=(self.sequence_length, self.d_model))
        return Model(inputs=[input_layer], outputs=self.call(input_layer, None, True))

    def call(self, x, padding_mask, training):
        # Multi-head attention layer
        multihead_output = self.multihead_attention(x, x,x, padding_mask)
        #Expected output shape = (batch_size, sequence_length, d_model)

        # Add in a dropout layer
        multihead_output = self.dropout1(multihead_output, training=training)

        #folling by an add and normalization layer
        addnorm_output = self.add_norm1(x, multihead_output)

        #followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output)

        #add in another dropout layer
        feedforward_output = self.dropout2(feedforward_output, training=training)

        #followed by another add and norm layer
        return self.add_norm2(addnorm_output, feedforward_output)

In [17]:
class Encoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)
        self.encoder_layer = [EncoderLayer(sequence_length,h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]

    def call(self, input_sentence, padding_mask, training):
        # Generate the positional encoding
        pos_encoding_output = self.pos_encoding(input_sentence)
        #Expected output shape = (batch_size, sequence_length, d_model)

        #add in a dropout layer
        x = self.dropout(pos_encoding_output, training=training)

        # Pass on the positional encoded values to each encoder layer
        for i, layer in enumerate(self.encoder_layer):
            x = layer(x, padding_mask, training)

        return x

In [18]:
#testing the encoder layer
'''
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_ff = 2048  # Dimensionality of the inner fully connected layer
d_model = 512  # Dimensionality of the model sub-layers' outputs
n = 6  # Number of layers in the encoder stack

batch_size = 64  # Batch size from the training process
dropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers


enc_vocab_size = 20 # Vocabulary size for the encoder
input_seq_length = 5  # Maximum length of the input sequence

input_seq = random.random((batch_size, input_seq_length))
'''


"\nh = 8  # Number of self-attention heads\nd_k = 64  # Dimensionality of the linearly projected queries and keys\nd_v = 64  # Dimensionality of the linearly projected values\nd_ff = 2048  # Dimensionality of the inner fully connected layer\nd_model = 512  # Dimensionality of the model sub-layers' outputs\nn = 6  # Number of layers in the encoder stack\n\nbatch_size = 64  # Batch size from the training process\ndropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers\n\n\nenc_vocab_size = 20 # Vocabulary size for the encoder\ninput_seq_length = 5  # Maximum length of the input sequence\n\ninput_seq = random.random((batch_size, input_seq_length))\n"

In [19]:
#Building the Decoder

In [20]:
class DecoderLayer(Layer):
    def __init__(self,sequence_length, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super(DecoderLayer, self).__init__(**kwargs)
        self.multihead_attention1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        self.multihead_attention2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()
        self.build(input_shape=[None, sequence_length, d_model])
        self.sequence_length = sequence_length
        self.d_model = d_model

    def build_graph(self):
        input_layer = Input(shape=(self.sequence_length, self.d_model))
        return Model(inputs=[input_layer], outputs=self.call(input_layer, input_layer, None, None, True))
    
    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
        # Multi-head attention layer
        multihead_output1 = self.multihead_attention1(x, x, x, lookahead_mask)
        #Expected output shape = (batch_size, sequence_length, d_model)

        #add in a dropout layer
        multihead_output1 = self.dropout1(multihead_output1, training=training)

        #followed by the add and norm layer
        addnorm_output1 = self.add_norm1(x, multihead_output1)

        #followed by another multi-head attention layer
        multihead_output2 = self.multihead_attention2(addnorm_output1, encoder_output, encoder_output, padding_mask)

        #another dropout
        multihead_output2 = self.dropout2(multihead_output2, training=training)

        #another add and norm layer
        addnorm_output2 = self.add_norm2(addnorm_output1, multihead_output2)

        # Followed by a fully connected layer
        feedforward_output = self.feed_forward(addnorm_output2)

        #and another dropout
        feedforward_output = self.dropout3(feedforward_output, training=training)

        #followed by another add and norm layer
        return self.add_norm3(addnorm_output2, feedforward_output)

In [21]:
class Decoder(Layer):
    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(sequence_length, vocab_size, d_model)
        self.dropout = Dropout(rate)
        self.decoder_layer = [DecoderLayer(sequence_length,h, d_k, d_v, d_model, d_ff, rate) for _ in range(n)]

    def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):
        #Generate the positional encoding
        pos_encoding_output = self.pos_encoding(output_target)
        #Epected output shape = (number of sentences, sequence_length, d_model)

        # add in a dropout layer

        x = self.dropout(pos_encoding_output, training=training)

        #pass on the positional encoded values to each decoder layer
        for i, layer in enumerate(self.decoder_layer):
            x = layer(x, encoder_output, lookahead_mask, padding_mask, training)

        return x


In [22]:
#testing the decoder layer
'''
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_ff = 2048  # Dimensionality of the inner fully connected layer
d_model = 512  # Dimensionality of the model sub-layers' outputs
n = 6  # Number of layers in the encoder stack

batch_size = 64  # Batch size from the training process
dropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers


dec_vocab_size = 20 # Vocabulary size for the decoder
input_seq_length = 5  # Maximum length of the input sequence

input_seq = random.random((batch_size, input_seq_length))
enc_output = random.random((batch_size, input_seq_length, d_model))


decoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)
print(decoder(input_seq, enc_output, None, True))
'''



"\nh = 8  # Number of self-attention heads\nd_k = 64  # Dimensionality of the linearly projected queries and keys\nd_v = 64  # Dimensionality of the linearly projected values\nd_ff = 2048  # Dimensionality of the inner fully connected layer\nd_model = 512  # Dimensionality of the model sub-layers' outputs\nn = 6  # Number of layers in the encoder stack\n\nbatch_size = 64  # Batch size from the training process\ndropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers\n\n\ndec_vocab_size = 20 # Vocabulary size for the decoder\ninput_seq_length = 5  # Maximum length of the input sequence\n\ninput_seq = random.random((batch_size, input_seq_length))\nenc_output = random.random((batch_size, input_seq_length, d_model))\n\n\ndecoder = Decoder(dec_vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)\nprint(decoder(input_seq, enc_output, None, True))\n"

In [23]:
# Putting it all together  + masking

In [24]:
#Masking

In [25]:
def padding_mask(input):
    #create a mask which marks the zero padding values in the input by a 1
    mask = math.equal(input, 0)
    mask = cast(mask, float32)

    return mask

In [26]:
input = np.array([1,2,3,4,0,0,0])

print(padding_mask(input))

tf.Tensor([0. 0. 0. 0. 1. 1. 1.], shape=(7,), dtype=float32)


In [27]:
from tensorflow import linalg, ones

In [28]:
def lookahead_mask(shape):
    # mask out the future entries by marking them with a 1.0
    mask = 1 - linalg.band_part(ones((shape, shape)),-1,0)

    return mask

In [29]:
print(lookahead_mask(5))

tf.Tensor(
[[0. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32)


In [30]:
#Creating the transformer model

In [31]:
from keras import Model

In [32]:
class TransformerModel(Model):
    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
        super(TransformerModel, self).__init__(**kwargs)

        #set up the encoder
        self.encoder = Encoder(enc_vocab_size, enc_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)

        #set up the decoder
        self.decoder = Decoder(dec_vocab_size, dec_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate)

        #the final dense layer
        self.model_last_layer = Dense(dec_vocab_size)

    def padding_mask(self, input):
        # Create mask which marks the zero padding values in the input by a 1.0
        mask = math.equal(input, 0)
        mask = cast(mask, float32)
 
        # The shape of the mask should be broadcastable to the shape
        # of the attention weights that it will be masking later on
        return mask[:, tf.newaxis, tf.newaxis, :]
 
    def lookahead_mask(self, shape):
        # Mask out future entries by marking them with a 1.0
        mask = 1 - linalg.band_part(ones((shape, shape)), -1, 0)
 
        return mask
    def call(self, encoder_input, decoder_input, training):

        #create a padding mask to mask the encoder inputs and the encoder outputs in the decoder
        enc_padding_mask = self.padding_mask(encoder_input)

        #Create and conbine the padding and look-ahead masks to be fed into the decoder
        dec_in_padding_mask = self.padding_mask(decoder_input)
        dec_in_lookahead_mask = self.lookahead_mask(decoder_input.shape[1])
        dec_in_lookahead_mask = math.maximum(dec_in_padding_mask, dec_in_lookahead_mask)

        #feed the input into the encoder
        encoder_output = self.encoder(encoder_input, enc_padding_mask, training)

        #feed the encoder output into the decoder
        decoder_output = self.decoder(decoder_input, encoder_output, dec_in_lookahead_mask, enc_padding_mask, training)

        #pass the deocder output through the final dense layer
        model_output = self.model_last_layer(decoder_output)

        return model_output



In [33]:
#Creating an instance of the transformer model

In [34]:
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_ff = 2048  # Dimensionality of the inner fully connected layer
d_model = 512  # Dimensionality of the model sub-layers' outputs
n = 6  # Number of layers in the encoder stack

dropout_rate = 0.1  # Frequency of dropping the input units in the dropout layers

enc_seq_length = 20  # Encoder sequence length
dec_seq_length = 75  # Decoder sequence length
enc_vocab_size = 2404  # Encoder vocabulary size
dec_vocab_size = 3864  # Decoder vocabulary size

In [35]:
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

In [36]:
encoder = EncoderLayer(enc_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate)
encoder.build_graph().summary()
 


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20, 512)]    0           []                               
                                                                                                  
 multi_head_attention_18 (Multi  (None, 20, 512)     131776      ['input_1[0][0]',                
 HeadAttention)                                                   'input_1[0][0]',                
                                                                  'input_1[0][0]']                
                                                                                                  
 dropout_32 (Dropout)           (None, 20, 512)      0           ['multi_head_attention_18[0][0]']
                                                                                              

In [37]:
decoder = DecoderLayer(dec_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate)
decoder.build_graph().summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 75, 512)]    0           []                               
                                                                                                  
 multi_head_attention_19 (Multi  (None, 75, 512)     131776      ['input_2[0][0]',                
 HeadAttention)                                                   'input_2[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 dropout_34 (Dropout)           (None, 75, 512)      0           ['multi_head_attention_19[0][0]']
                                                                                            

In [38]:
#Gathering and cleaning dataset

In [39]:
#Cleaned data available here:
#https://github.com/Rishav09/Neural-Machine-Translation-System/blob/master/english-german-both.pkl


In [40]:
#German-English sentence pairs for translation

In [41]:
from pickle import load
from numpy.random import shuffle
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow import convert_to_tensor, int64

In [42]:
def load_clean_sentences(filename):
 return load(open(filename, 'rb'))

In [43]:
dataset = load_clean_sentences('english-german-large.pkl')

In [44]:
from pickle import load, dump, HIGHEST_PROTOCOL
from numpy.random import shuffle
from numpy import savetxt

In [45]:
class PrepareDataset:
 def __init__(self, **kwargs):
    super(PrepareDataset, self).__init__(**kwargs)
    self.n_sentences = 100000  # Number of sentences to include in the dataset
    self.train_split = 0.8  # Ratio of the training data split
    self.val_split = 0.1 # Ratio of th evalidation data split
 
 # Fit a tokenizer
 def create_tokenizer(self, dataset):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataset)
 
    return tokenizer
 
 def find_seq_length(self, dataset):
    return max(len(seq.split()) for seq in dataset)
 
 def find_vocab_size(self, tokenizer, dataset):
    tokenizer.fit_on_texts(dataset)
 
    return len(tokenizer.word_index) + 1

   # Encode and pad the input sequences
 def encode_pad(self, dataset, tokenizer, seq_length):
      x = tokenizer.texts_to_sequences(dataset)
      x = pad_sequences(x, maxlen=seq_length, padding = 'post')
      x = convert_to_tensor(x, dtype=int64)

      return x
 def save_tokenizer(self, tokenizer, name):
   with open(name + '_tokenizer.pkl','wb') as handle:
      dump(tokenizer, handle, protocol=HIGHEST_PROTOCOL)
 
 def __call__(self, filename, **kwargs):
    # Load a clean dataset
    clean_dataset = load(open(filename, 'rb'))
    
    # Reduce dataset size
    dataset = clean_dataset[:self.n_sentences, :]
    
    # Include start and end of string tokens
    for i in range(dataset[:, 0].size):
        dataset[i, 0] = "<START> " + dataset[i, 0] + " <EOS>"
        dataset[i, 1] = "<START> " + dataset[i, 1] + " <EOS>"
    
    # Random shuffle the dataset
    shuffle(dataset)
    
    # Split the dataset
    train = dataset[:int(self.n_sentences * self.train_split)]
    val = dataset[int(self.n_sentences*self.train_split):int(self.n_sentences*(1-self.val_split))]
    test = dataset[int(self.n_sentences*(1-self.val_split)):]
       
    # Prepare tokenizer for the encoder input
    enc_tokenizer = self.create_tokenizer(dataset[:, 0])
    enc_seq_length = self.find_seq_length(dataset[:, 0])
    enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])

    # Prepare tokenizer for the decoder input
    dec_tokenizer = self.create_tokenizer(dataset[:, 1])
    dec_seq_length = self.find_seq_length(dataset[:, 1])
    dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])
    
    # Encode and pad the training input
    trainX = self.encode_pad(train[:, 0], enc_tokenizer, enc_seq_length)
    trainY = self.encode_pad(train[:, 1], dec_tokenizer, dec_seq_length)

   # Encode and pad the validation input
    valX = self.encode_pad(val[:, 0], enc_tokenizer, enc_seq_length)
    valY = self.encode_pad(val[:, 1], dec_tokenizer, dec_seq_length)

   # Save the encoder tokenizer
    self.save_tokenizer(enc_tokenizer, 'enc')

   # Save the decoder tokenizer
    self.save_tokenizer(dec_tokenizer, 'dec')

   # Save the testing dataset into a text file
    savetxt('test_dataset.txt', test, fmt='%s')

    return trainX, trainY, valX, valY, train, val, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size


In [46]:

from keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from keras.metrics import Mean
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, float32, GradientTape, TensorSpec, function, int64
from keras.losses import sparse_categorical_crossentropy

from time import time

In [108]:
# Define the model parameters
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_model = 512  # Dimensionality of model layers' outputs
d_ff = 2048  # Dimensionality of the inner fully connected layer
n = 6  # Number of layers in the encoder stack
 
# Define the training parameters
epochs = 30
batch_size = 64
beta_1 = 0.9
beta_2 = 0.98
epsilon = 1e-9
dropout_rate = 0.1

In [48]:
class LRScheduler(LearningRateSchedule):
    def __init__(self, d_model, warmup_steps = 4000, **kwargs):
        super(LRScheduler, self).__init__(**kwargs)
        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step_num):
        # Linearly increasing the learning rate for the first warmup_steps and then decreasing
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps  ** -1.5)

        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

In [109]:
optimizer = Adam(LRScheduler(d_model),beta_1, beta_2, epsilon)

In [50]:
# Prepare the training dataset
dataset = PrepareDataset()
trainX, trainY, valX, valY, train_orig, val_orig, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size = dataset('english-german-large.pkl')
 
print(enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

12 19 10301 19319


In [51]:
# Prepare the training dataset batches
train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))
train_dataset = train_dataset.batch(batch_size)
 
# Prepare the validation dataset batches
val_dataset = data.Dataset.from_tensor_slices((valX, valY))
val_dataset = val_dataset.batch(batch_size)

In [52]:
print(trainX[:,1])

tf.Tensor([ 5  3 24 ... 22 89  5], shape=(80000,), dtype=int64)


In [53]:
#create model
training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n , dropout_rate)

In [54]:
# Defining the loss function
def loss_fcn(target, prediction):
    # Create mask so that the zero padding values are not included in the computation of loss
    padding_mask = math.logical_not(equal(target, 0))
    padding_mask = cast(padding_mask, float32)
 
    # Compute a sparse categorical cross-entropy loss on the unmasked values
    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * padding_mask
 
    # Compute the mean loss over the unmasked values
    return reduce_sum(loss) / reduce_sum(padding_mask)
 
 
# Defining the accuracy function
def accuracy_fcn(target, prediction):
    # Create mask so that the zero padding values are not included in the computation of accuracy
    padding_mask = math.logical_not(equal(target, 0))
 
    # Find equal prediction and target values, and apply the padding mask
    accuracy = equal(target, argmax(prediction, axis=2))
    accuracy = math.logical_and(padding_mask, accuracy)
 
    # Cast the True/False values to 32-bit-precision floating-point numbers
    padding_mask = cast(padding_mask, float32)
    accuracy = cast(accuracy, float32)
 
    # Compute the mean accuracy over the unmasked values
    return reduce_sum(accuracy) / reduce_sum(padding_mask)
 
 
# Include metrics monitoring
train_loss = Mean(name='train_loss')
train_accuracy = Mean(name='train_accuracy')
val_loss = Mean(name='val_loss')
val_accuracy = Mean(name='val_accuracy')
 
# Create a checkpoint object and manager to manage multiple checkpoints
ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)
ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=None)

#Initialize dictionaries to store the training and validation metrics
train_loss_dict = {}
val_loss_dict = {}
train_accuracy_dict = {}
val_accuracy_dict = {}

# Speeding up the training process
@function
def train_step(encoder_input, decoder_input, decoder_output):
    with GradientTape() as tape:
 
        # Run the forward pass of the model to generate a prediction
        prediction = training_model(encoder_input, decoder_input, training=True)
 
        # Compute the training loss
        loss = loss_fcn(decoder_output, prediction)
 
        # Compute the training accuracy
        accuracy = accuracy_fcn(decoder_output, prediction)
 
    # Retrieve gradients of the trainable variables with respect to the training loss
    gradients = tape.gradient(loss, training_model.trainable_weights)
 
    # Update the values of the trainable variables by gradient descent
    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))
    
    train_loss(loss)
    train_accuracy(accuracy)

In [110]:
for epoch in range(epochs):
 
    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
 
    print("\nStart of epoch %d" % (epoch + 1))
 
    start_time = time()
 
    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset):
 
        # Define the encoder and decoder inputs, and the decoder output
        encoder_input = train_batchX[:, 1:]
        decoder_input = train_batchY[:, :-1]
        decoder_output = train_batchY[:, 1:]
 
        train_step(encoder_input, decoder_input, decoder_output)
 
        if step % 50 == 0:
            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
            # print("Samples so far: %s" % ((step + 1) * batch_size))
    
    #Run a validation step after every epoch of training
    for val_batchX, val_batchY in val_dataset:
        
        #Define the encoder and decoder inputs, and the decoder output
        encoder_input = val_batchX[:, 1:]
        decoder_input = val_batchY[:, :-1]
        decoder_output = val_batchY[:, 1:]

        #Generate a prediction
        prediction = training_model(encoder_input, decoder_input, training=False)

        #Compute the validation loss
        loss = loss_fcn(decoder_output, prediction)
        val_loss(loss)
        accuracy = accuracy_fcn(decoder_output,prediction)
        val_accuracy(accuracy)

    # Print epoch number and loss value at the end of every epoch
    print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f, Validation Loss %.4f, Validation Accuracy %.4f" % (epoch + 1, train_loss.result(), train_accuracy.result(), val_loss.result(), val_accuracy.result()))
 
    # Save a checkpoint after every five epochs
    if (epoch + 1) % 5 == 0:
        save_path = ckpt_manager.save()
        print("Saved checkpoint at epoch %d" % (epoch + 1))

        # Save the trained model weights
        training_model.save_weights('weights/wghts' + str(epoch+1) + '.ckpt')

        train_loss_dict[epoch] = train_loss.result()
        train_accuracy_dict[epoch] = train_accuracy.result()
        val_loss_dict[epoch] = val_loss.result()
        val_accuracy_dict[epoch] = val_accuracy.result()
# Save the training loss values
with open('./train_loss.pkl', 'wb') as file:
    dump(train_loss_dict, file)

with open('./train_accuracy.pkl', 'wb') as file:
    dump(train_accuracy_dict, file)
 
# Save the validation loss values
with open('./val_loss.pkl', 'wb') as file:
    dump(val_loss_dict, file)

with open('./val_accuracy.pkl', 'wb') as file:
    dump(val_accuracy_dict, file)
 
print("Total time taken: %.2fs" % (time() - start_time))


Start of epoch 21
Epoch 21 Step 0 Loss 0.6587 Accuracy 0.8519
Epoch 21 Step 50 Loss 0.6977 Accuracy 0.8482
Epoch 21 Step 100 Loss 0.6986 Accuracy 0.8478
Epoch 21 Step 150 Loss 0.7095 Accuracy 0.8460
Epoch 21 Step 200 Loss 0.7128 Accuracy 0.8463
Epoch 21 Step 250 Loss 0.7133 Accuracy 0.8459
Epoch 21 Step 300 Loss 0.7108 Accuracy 0.8462
Epoch 21 Step 350 Loss 0.7088 Accuracy 0.8468
Epoch 21 Step 400 Loss 0.7074 Accuracy 0.8471
Epoch 21 Step 450 Loss 0.7043 Accuracy 0.8473
Epoch 21 Step 500 Loss 0.7030 Accuracy 0.8473
Epoch 21 Step 550 Loss 0.7034 Accuracy 0.8472
Epoch 21 Step 600 Loss 0.7034 Accuracy 0.8473
Epoch 21 Step 650 Loss 0.7031 Accuracy 0.8477
Epoch 21 Step 700 Loss 0.7021 Accuracy 0.8482
Epoch 21 Step 750 Loss 0.7010 Accuracy 0.8482
Epoch 21 Step 800 Loss 0.7000 Accuracy 0.8484
Epoch 21 Step 850 Loss 0.7004 Accuracy 0.8482
Epoch 21 Step 900 Loss 0.7001 Accuracy 0.8482
Epoch 21 Step 950 Loss 0.6986 Accuracy 0.8482
Epoch 21 Step 1000 Loss 0.6978 Accuracy 0.8485
Epoch 21 Step 105

In [111]:
# Define the model parameters
h = 8  # Number of self-attention heads
d_k = 64  # Dimensionality of the linearly projected queries and keys
d_v = 64  # Dimensionality of the linearly projected values
d_model = 512  # Dimensionality of model layers' outputs
d_ff = 2048  # Dimensionality of the inner fully connected layer
n = 6  # Number of layers in the encoder stack
 
# Define the dataset parameters
enc_seq_length = 12  # Encoder sequence length
dec_seq_length = 19  # Decoder sequence length
enc_vocab_size = 10301  # Encoder vocabulary size
dec_vocab_size = 19319  # Decoder vocabulary size
 
# Create model
inferencing_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, 0)

In [68]:
from tensorflow import Module

In [69]:
class Translate(Module):
    def __init__(self, inferencing_model, **kwargs):
        super(Translate, self).__init__(**kwargs)
        self.transformer = inferencing_model

    def load_tokenizer(self, name):
        with open(name, 'rb') as handle:
            return load(handle)

    def __call__(self, sentence):
        #modify the sentence to have the start and end tokens
        sentence[0] = "<START> " + sentence[0] + " <EOS>"
        #print(sentence)

        # Load the tokenizers from the training model
        enc_tokenizer = self.load_tokenizer('enc_tokenizer.pkl')
        dec_tokenizer = self.load_tokenizer('dec_tokenizer.pkl')

        #Tokenize the sentence, pad it, and convert to a tensor
        encoder_input = enc_tokenizer.texts_to_sequences(sentence)
        encoder_input = pad_sequences(encoder_input, maxlen=enc_seq_length, padding = 'post')
        encoder_input = convert_to_tensor(encoder_input, dtype=int64)
        print(encoder_input)

        output_start = dec_tokenizer.texts_to_sequences(['<START>'])
        output_start = convert_to_tensor(output_start[0], dtype=int64)

        output_end = dec_tokenizer.texts_to_sequences(['<EOS>'])
        output_end = convert_to_tensor(output_end[0], dtype = int64)

        #Prepare the output array that will contain the translated text. 
        # Since you do not know the length of the translated sentence in advance, you will initialize the 
        # size of the output array to 0, but set its dynamic_size parameter to True so that it may grow past its initial size. 
        # You will then set the first value in this output array to the <START> token:

        decoder_output = tf.TensorArray(dtype=int64, size = 0, dynamic_size=True)
        decoder_output = decoder_output.write(0,output_start)

        #Iterate up to the sequence max length each time calling the transformer to predict a token
        #training is set to false so no dropouts occur
        #prediction with the highest score is then selected and written at the next index of hte output array
        #for loop breaks when <EOS> is predicted

        for i in range(dec_seq_length):

            prediction = self.transformer(encoder_input, transpose(decoder_output.stack()), training=False)

            prediction = prediction[:, -1,:]

            predicted_id = argmax(prediction, axis=-1)
            predicted_id = predicted_id[0][tf.newaxis]
 
            decoder_output = decoder_output.write(i + 1, predicted_id)
        
            if predicted_id == output_end:
                break

            #decode the predicted tokens into an output list and return it

        output = transpose(decoder_output.stack())[0]
        output = output.numpy()

        output_str = []

        # Decode the predicted tokens into an output list
        for i in range(output.shape[0]):

            key = output[i]
            translation = dec_tokenizer.index_word[key]
            output_str.append(translation)

        return output_str
        




In [112]:
inferencing_model.load_weights("weights\wghts30.ckpt")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1eb61745e20>

In [113]:
translator = Translate(inferencing_model)

In [124]:
sentence = ['i often listen to sad songs']

In [127]:
print(translator(sentence)[1:-1])

tf.Tensor([[   1    1    1    4  234  485    6  738 1291    2    2    2]], shape=(1, 12), dtype=int64)
['fang', 'an', 'ich', 'es', 'ist', 'oft', 'nun']
