In [1]:
import tensorflow as tf

In [2]:
class Encoder(tf.keras.Model):
    """Bidirectional GRU encoder over embedded token ids.

    Args:
        vocab_size: Vocabulary size handed to the embedding layer.
        embedding_dim: Width of each embedding vector.
        enc_units: Number of GRU units per direction.
        batch_sz: Batch size used to build the zero initial states in
            `initialize_hidden_state`.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Reference the layers module through `tf` directly so the class does
        # not depend on a notebook-level alias being defined in another cell.
        layers = tf.keras.layers
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # `return_sequences` (plural) is the GRU keyword; `return_state=True`
        # makes the Bidirectional wrapper also return one final state per
        # direction.
        self.gru = layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.bidirection = layers.Bidirectional(self.gru, merge_mode='concat')

    def call(self, x, hidden_fd, hidden_bd):
        """Embed `x` and run it through the bidirectional GRU.

        Args:
            x: Token ids fed to the embedding layer
                (assumed shape (batch, seq_len) -- TODO confirm against caller).
            hidden_fd: Initial state for the forward GRU.
            hidden_bd: Initial state for the backward GRU.

        Returns:
            Tuple `(enc_hidden_states, fd_state, bd_state)`: the per-timestep
            outputs (forward/backward concatenated on the last axis because
            `merge_mode='concat'`) and the final forward and backward states.
        """
        embedded = self.embedding(x)
        enc_hidden_states, fd_state, bd_state = self.bidirection(
            embedded, initial_state=[hidden_fd, hidden_bd]
        )
        return enc_hidden_states, fd_state, bd_state

    def initialize_hidden_state(self):
        """Return two zero tensors of shape (batch_sz, enc_units): [forward, backward]."""
        return [tf.zeros((self.batch_sz, self.enc_units)) for _ in range(2)]

In [3]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over encoder annotations.

    Args:
        units: Width of the two dense projections whose sum is passed through
            tanh before being scored.
    """

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        # Use `tf.keras.layers` directly rather than a notebook-level alias so
        # the layer can be constructed right after `import tensorflow as tf`.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, dec_hidden_state, annotations):
        """Compute the context vector and attention weights.

        Args:
            dec_hidden_state: Decoder state used as the query
                (assumed shape (batch_size, hidden_size) -- TODO confirm).
            annotations: Encoder outputs attended over
                (assumed shape (batch_size, max_length, hidden_size) -- TODO confirm).

        Returns:
            Tuple `(context_vector, attention_weights)`.
        """
        # Insert a length-1 axis at position 1 so the projected query
        # broadcasts against the projected annotations along that axis.
        dec_hidden_state_time = tf.expand_dims(dec_hidden_state, 1)

        # One scalar score per position on axis 1.
        score = self.V(tf.nn.tanh(
            self.W1(dec_hidden_state_time) + self.W2(annotations)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the annotations over axis 1;
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * annotations
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights



In [4]:
 class Decoder(tf.keras.Model):
     """Single-step GRU decoder with Bahdanau attention.

     Args:
         vocab_size: Vocabulary size for the embedding and the output layer.
         embedding_dim: Width of each embedding vector.
         dec_units: Number of GRU units; also the width of the attention
             projections.
         batch_sz: Stored on the instance; not otherwise used by this class.
     """

     def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
         super(Decoder, self).__init__()
         self.batch_sz = batch_sz
         self.dec_units = dec_units
         # Use `tf.keras.layers` directly rather than a notebook-level alias
         # so the model can be built right after `import tensorflow as tf`.
         layers = tf.keras.layers
         self.embedding = layers.Embedding(vocab_size, embedding_dim)
         self.gru = layers.GRU(self.dec_units,
                               return_sequences=True,
                               return_state=True,
                               recurrent_initializer='glorot_uniform')
         self.fc = layers.Dense(vocab_size)
         # used for attention
         self.attention = BahdanauAttention(self.dec_units)

     def call(self, x, dec_hidden_state, annotations):
         """Run one decoding step.

         Args:
             x: Token ids for the current step
                 (assumed shape (batch, 1) -- TODO confirm against caller).
             dec_hidden_state: State used as the attention query.
             annotations: Encoder outputs attended over.

         Returns:
             Tuple `(logits, state, attention_weights)`: the output-layer
             result, the GRU's final state, and the attention weights.
         """
         context_vector, attention_weights = self.attention(dec_hidden_state, annotations)

         embedded = self.embedding(x)
         # Prepend the context vector to the embedded token along the feature axis.
         gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
         # NOTE(review): `dec_hidden_state` only feeds the attention query; the
         # GRU is called without `initial_state`, so it starts from its default
         # state on every step. Kept as-is to preserve behaviour -- confirm
         # whether passing the state through was intended.
         output, state = self.gru(gru_input)
         # Merge the leading axes so the dense layer sees a 2-D input.
         output = tf.reshape(output, (-1, output.shape[2]))
         logits = self.fc(output)
         return logits, state, attention_weights


In [5]:
class NMT(tf.keras.Model):
    """Encoder-decoder translation model with custom train/test steps.

    Args:
        encoder: Model exposing `initialize_hidden_state()` and a call taking
            `(inp, hidden_fd, hidden_bd)`.
        decoder: Model whose call takes `(dec_input, dec_hidden, annotations)`
            and returns `(predictions, state, attention_weights)`.
    """

    def __init__(self, encoder, decoder):
        super(NMT, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _teacher_forced_loss(self, inp, targ):
        """Encode `inp`, decode with teacher forcing, and return the summed loss.

        The loss is the sum over target positions 1..T-1 of
        `self.compiled_loss(targ[:, t], predictions)`.
        """
        # Every sentence is different: start from fresh zero states so no
        # memory flows from one batch of sentences to the next.
        enc_hidden_fd, enc_hidden_bd = self.encoder.initialize_hidden_state()
        annotations, enc_hidden_fd, _ = self.encoder(inp, enc_hidden_fd, enc_hidden_bd)
        # Only the forward-direction final state seeds the decoder.
        dec_hidden = enc_hidden_fd
        # NOTE(review): `targ_lang` and `BATCH_SIZE` are not defined in this
        # class; presumably they come from another notebook cell -- confirm
        # they exist before `fit`/`evaluate` is called.
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        loss = 0
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, annotations)
            loss += self.compiled_loss(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
        return loss

    def train_step(self, data):
        """Run one optimisation step on an `(inp, targ)` batch.

        Returns:
            Dict with key "custom_loss": the summed loss divided by the
            target sequence length.
        """
        inp, targ = data
        with tf.GradientTape() as tape:
            loss = self._teacher_forced_loss(inp, targ)
        batch_loss = (loss / int(targ.shape[1]))
        # Use the sub-models and optimizer owned by this instance (the
        # optimizer is the one given to `compile`) instead of module globals.
        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        # Gradients are taken of the summed loss, not the length-normalised one.
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return {"custom_loss": batch_loss}

    def test_step(self, data):
        """Evaluate one `(inp, targ)` batch with teacher forcing; no weight update.

        Returns:
            Dict with key "custom_loss": the summed loss divided by the
            target sequence length.
        """
        inp, targ = data
        loss = self._teacher_forced_loss(inp, targ)
        batch_loss = (loss / int(targ.shape[1]))
        return {"custom_loss": batch_loss}
