- functioning NMT model
- attention model using classes
- bi-directional LSTMs in Encoder
- training worked; loss went down over 25 epochs
- No attention plots or embeddings yet

In [2]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Concatenate, LSTM, Embedding, Dense
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

#import plotly.graph_objects as go


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

Mounted at /content/gdrive


In [4]:
#df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub("'", '', sentence)
    sentence = sentence.replace('ü', 'ue').replace('ä', 'ae').replace('ö', 'oe').replace('ß', 'ss')
    exclude = set(string.punctuation)
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = 'start_ ' + sentence + ' _end'
    sentence = sentence.encode("ascii", "ignore")
    sentence = sentence.decode()
    return sentence

In [6]:
pairs = df_en_de
pairs = pairs.sample(frac = 0.1)
pairs['english'] = pairs['english'].apply(preprocess_sentence)
pairs['german'] = pairs['german'].apply(preprocess_sentence)

In [7]:
source = pairs['english']
target = pairs ['german']

In [8]:
# create tokenizer & tensor for source and target
source_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
source_sentence_tokenizer.fit_on_texts(source)
source_tensor = source_sentence_tokenizer.texts_to_sequences(source)
source_tensor= tf.keras.preprocessing.sequence.pad_sequences(source_tensor, padding='post' )

target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(target)
target_tensor = target_sentence_tokenizer.texts_to_sequences(target)
target_tensor= tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post' )

In [9]:
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
                                                                source_tensor, target_tensor,test_size=0.2
                                                                )

In [10]:
max_target_length= max(len(t) for t in  target_tensor)
max_source_length= max(len(t) for t in source_tensor)

In [11]:
max_target_length, max_source_length

(49, 43)

In [12]:
BATCH_SIZE = 32
#Create data in memeory 
dataset = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# shuffles the data in the batch
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [13]:
source_batch, target_batch =next(iter(dataset))
print(source_batch.shape, target_batch.shape)
print(source_batch[1])

(32, 43) (32, 49)
tf.Tensor(
[   1    3   57  324   29 1238    2    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0], shape=(43,), dtype=int32)


In [14]:
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch= BUFFER_SIZE//BATCH_SIZE
embedding_dim= 256
units= 256
attention_layer_units = 100
source_vocab_size= len(source_sentence_tokenizer.word_index)+1
target_vocab_size= len(target_sentence_tokenizer.word_index)+1

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm= Bidirectional(LSTM (encoder_units, 
                                      return_sequences=True,
                                      return_state=True,                                      
                                      recurrent_initializer='glorot_uniform'
                                     ))
    def call(self, x):
                                                                # x = (m, Tx)
                                                                # hidden = (m, encoder_units)
                                                                
        x = self.embedding(x)                                   # x = (m, Tx, embedding_dim)
        # pass input x through bi-directional LSTM
                                                                
        (enc_sequential, enc_forward_h, 
        enc_forward_c, enc_backward_h, enc_backward_c) = self.lstm(x)

        # concatenate forward and backward states
        enc_final_h = Concatenate()([enc_forward_h, enc_backward_h])
        enc_final_c = Concatenate()([enc_forward_c, enc_backward_c])

        return enc_sequential, enc_final_h, enc_final_c                     # enc_sequential = (m, Tx, 2 * encoder_units) 
                                                                            # enc_h = (m, 2 * encoder_units)
                                                                            # enc_c = (m, 2 * encoder_units)
    
    # def initialize_state(self):
    #     return tf.zeros((self.batch_size, self.encoder_units))
    

In [16]:
# check dimensions
encoder = Encoder(source_vocab_size, embedding_dim, units, BATCH_SIZE)

#initial_state = encoder.initialize_state()
enc_sequential, enc_final_h, enc_final_c = encoder(source_batch)

print (f'Encoder sequential: {enc_sequential.shape}')
print (f'Encoder final state_h: {enc_final_h.shape}')
print (f'Encoder final state_c: {enc_final_c.shape}')

Encoder sequential: (32, 43, 512)
Encoder final state_h: (32, 512)
Encoder final state_c: (32, 512)


In [17]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super( BahdanauAttention, self).__init__()
        self.W1= tf.keras.layers.Dense(units)  # decoder hidden (at time-step "t-1")
        self.W2= tf.keras.layers.Dense(units)  # encoder hidden (at time-step "t")
        self.V= tf.keras.layers.Dense(1)
    
    def call(self, dec_hidden, enc_hidden):
                                                                # dec_hidden = (m, 2*units) 
                                                                # enc_hidden:   (m, Tx, 2*units)

        dec_hidden_with_time = tf.expand_dims(dec_hidden, 1)    # dec_hidden_with_time = (m, 1, 2*units)
        
                                                                
        # W1() = (m, 1, 10) 
        # W2() = (m, Tx, 10)
        # Broadcasting happens when you add
        # W1() + W2 () = (m, Tx, 10)

        score = self.V(tf.nn.tanh(self.W1(dec_hidden_with_time) + self.W2(enc_hidden))) # (m, Tx, 1)
        
        # normalise scores with softmax
        attention_weights = tf.nn.softmax(score, axis=1)                                # (m, Tx, 1)
        
        # apply each weight to encoder hidden state at respective time-step 
        context_vector= attention_weights * enc_hidden                                  # (m, Tx, 2*units)
       
        # linear combination of enc_hidden vectors for all Tx
        # so sum along Tx axis
        context_vector = tf.reduce_sum(context_vector, axis=1)                          # (m, 2*units)
        return context_vector, attention_weights

In [18]:
attention_layer= BahdanauAttention(attention_layer_units)
attention_result, attention_weights = attention_layer(enc_final_h, enc_sequential)
print(f"context vector: (batch size, units) {attention_result.shape}")
print(f"attention weights: (batch_size, sequence_length, 1) {attention_weights.shape}")

context vector: (batch size, units) (32, 512)
attention weights: (batch_size, sequence_length, 1) (32, 43, 1)


In [19]:
# Decoder for one time-step

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_sz):
        super (Decoder,self).__init__()
        self.batch_sz= batch_sz
        self.decoder_units = decoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm= LSTM (decoder_units, 
                        return_sequences= True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')
        # Fully connected layer
        self.fc= Dense(vocab_size)      # Note, we don't use an activation here.
                                        # For the calculation of the loss, we will use 
                                        # sparse_softmax_cross_entropy_with_logits, which performs 
                                        # the softmax on the logits internally for greater efficiency
        
        # attention
        self.attention = BahdanauAttention(attention_layer_units)
    
    def call(self, y, dec_h, dec_c, enc_sequential):
                                                                                    # dec_h: (m, 2*units) 
                                                                                    # dec_c: (m, 2*units)
                                                                                    # enc_sequential: (m, Tx, 2*units) 

        context_vector, attention_weights = self.attention(dec_h, enc_sequential)   # context_vector = (m, 2*units)
        
        y= self.embedding(y)                                                        # y = (m, 1, embedding_dim)
        
        # concatenate context vector and embedding for output sequence
        y = tf.concat([tf.expand_dims(context_vector, 1), y],                       # (m, 1, 2*units) + (m, 1, embedding_dim)
                                      axis=-1)                                      # (m, 1, 2*units + embedding_dim)
        
        # passing the concatenated vector to the GRU
        output, dec_h, dec_c = self.lstm(y, initial_state = [dec_h, dec_c])                                         # output = (m, 1, 2*units)
                                                                                    # dec_h = (m, 2*units)
                                                                                    # dec_c = (m, 2*units)

        output= tf.reshape(output, (-1, output.shape[2]))                           # output = (m, 2*units)
        
        # pass the output thru Fc layers
        y = self.fc(output)                                                         # y = (m, vocab_size)
        return y, dec_h, dec_c, attention_weights

In [20]:
# Make sure to pass in "2*units", since the encoder uses bi-directional LSTM
# We're feeding final_h and final_c from Encoder as init_h and init_c for Decoder
decoder= Decoder(target_vocab_size, embedding_dim, 2*units, BATCH_SIZE)

sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
print (f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Decoder output shape: (batch_size, vocab size) (32, 12006)


In [21]:
#Define the optimizer and the loss function
optimizer = tf.keras.optimizers.Adam()

In [22]:
def loss_function(real, pred):      # "real" = (m, 1), "pred" = (m, vocab_size)
    mask = 1 - np.equal(real, 0)    # mask = 1 when "real" != 0
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [23]:
EPOCHS = 5

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    #initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 1.4018
Epoch 1 Batch 50 Loss 0.9843
Epoch 1 Batch 100 Loss 0.8379
Epoch 1 Batch 150 Loss 0.8643
Epoch 1 Batch 200 Loss 0.7574
Epoch 1 Batch 250 Loss 0.7565
Epoch 1 Batch 300 Loss 0.8579
Epoch 1 Batch 350 Loss 0.7772
Epoch 1 Batch 400 Loss 0.8775
Epoch 1 Batch 450 Loss 0.7663
Epoch 1 Batch 500 Loss 0.8439
Epoch 1 Batch 550 Loss 0.7852
Epoch 1 Batch 600 Loss 0.7404
Epoch 1 Loss 0.8419
Time taken for 1 epoch 571.0552086830139 sec

Epoch 2 Batch 0 Loss 0.7107
Epoch 2 Batch 50 Loss 0.7231
Epoch 2 Batch 100 Loss 0.6233
Epoch 2 Batch 150 Loss 0.6535
Epoch 2 Batch 200 Loss 0.7617
Epoch 2 Batch 250 Loss 0.6155
Epoch 2 Batch 300 Loss 0.5816
Epoch 2 Batch 350 Loss 0.7521
Epoch 2 Batch 400 Loss 0.6159
Epoch 2 Batch 450 Loss 0.6223
Epoch 2 Batch 500 Loss 0.6742
Epoch 2 Batch 550 Loss 0.6651
Epoch 2 Batch 600 Loss 0.6689
Epoch 2 Loss 0.6798
Time taken for 1 epoch 561.5900504589081 sec

Epoch 3 Batch 0 Loss 0.6477
Epoch 3 Batch 50 Loss 0.6275
Epoch 3 Batch 100 Loss 0.5663
Epoch 3

In [None]:
EPOCHS = 5

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    #initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 0.4228
Epoch 1 Batch 50 Loss 0.4039
Epoch 1 Batch 100 Loss 0.3948
Epoch 1 Batch 150 Loss 0.3573
Epoch 1 Batch 200 Loss 0.3884
Epoch 1 Batch 250 Loss 0.3471
Epoch 1 Batch 300 Loss 0.3367
Epoch 1 Batch 350 Loss 0.3136
Epoch 1 Batch 400 Loss 0.3891
Epoch 1 Batch 450 Loss 0.2907
Epoch 1 Batch 500 Loss 0.3416
Epoch 1 Batch 550 Loss 0.3502
Epoch 1 Batch 600 Loss 0.3179
Epoch 1 Loss 0.3460
Time taken for 1 epoch 553.4982507228851 sec

Epoch 2 Batch 0 Loss 0.2916
Epoch 2 Batch 50 Loss 0.3165
Epoch 2 Batch 100 Loss 0.2944
Epoch 2 Batch 150 Loss 0.2923
Epoch 2 Batch 200 Loss 0.2704
Epoch 2 Batch 250 Loss 0.2812
Epoch 2 Batch 300 Loss 0.2997
Epoch 2 Batch 350 Loss 0.2561
Epoch 2 Batch 400 Loss 0.3098
Epoch 2 Batch 450 Loss 0.2252
Epoch 2 Batch 500 Loss 0.3036
Epoch 2 Batch 550 Loss 0.2497
Epoch 2 Batch 600 Loss 0.2668
Epoch 2 Loss 0.2869
Time taken for 1 epoch 553.5461626052856 sec

Epoch 3 Batch 0 Loss 0.2743
Epoch 3 Batch 50 Loss 0.2441
Epoch 3 Batch 100 Loss 0.2420
Epoch 3

In [33]:
EPOCHS = 5

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    #initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 0.1368
Epoch 1 Batch 50 Loss 0.1775
Epoch 1 Batch 100 Loss 0.1402
Epoch 1 Batch 150 Loss 0.1846
Epoch 1 Batch 200 Loss 0.1682
Epoch 1 Batch 250 Loss 0.1795
Epoch 1 Batch 300 Loss 0.1342
Epoch 1 Batch 350 Loss 0.1203
Epoch 1 Batch 400 Loss 0.1433
Epoch 1 Batch 450 Loss 0.1124
Epoch 1 Batch 500 Loss 0.1609
Epoch 1 Batch 550 Loss 0.1427
Epoch 1 Batch 600 Loss 0.1332
Epoch 1 Loss 0.1370
Time taken for 1 epoch 553.1023693084717 sec

Epoch 2 Batch 0 Loss 0.1285
Epoch 2 Batch 50 Loss 0.1421
Epoch 2 Batch 100 Loss 0.1137
Epoch 2 Batch 150 Loss 0.1351
Epoch 2 Batch 200 Loss 0.1494
Epoch 2 Batch 250 Loss 0.0974
Epoch 2 Batch 300 Loss 0.1149
Epoch 2 Batch 350 Loss 0.1070
Epoch 2 Batch 400 Loss 0.1119
Epoch 2 Batch 450 Loss 0.0716
Epoch 2 Batch 500 Loss 0.1012
Epoch 2 Batch 550 Loss 0.1095
Epoch 2 Batch 600 Loss 0.0932
Epoch 2 Loss 0.1125
Time taken for 1 epoch 550.3403558731079 sec

Epoch 3 Batch 0 Loss 0.1127
Epoch 3 Batch 50 Loss 0.1101
Epoch 3 Batch 100 Loss 0.1056
Epoch 3

In [38]:
EPOCHS = 5

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    #initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 0.0451
Epoch 1 Batch 50 Loss 0.0565
Epoch 1 Batch 100 Loss 0.0497
Epoch 1 Batch 150 Loss 0.0493
Epoch 1 Batch 200 Loss 0.0503
Epoch 1 Batch 250 Loss 0.0515
Epoch 1 Batch 300 Loss 0.0537
Epoch 1 Batch 350 Loss 0.0453
Epoch 1 Batch 400 Loss 0.0473
Epoch 1 Batch 450 Loss 0.0275
Epoch 1 Batch 500 Loss 0.0484
Epoch 1 Batch 550 Loss 0.0397
Epoch 1 Batch 600 Loss 0.0483
Epoch 1 Loss 0.0485
Time taken for 1 epoch 543.0466475486755 sec

Epoch 2 Batch 0 Loss 0.0550
Epoch 2 Batch 50 Loss 0.0472
Epoch 2 Batch 100 Loss 0.0486
Epoch 2 Batch 150 Loss 0.0537
Epoch 2 Batch 200 Loss 0.0343
Epoch 2 Batch 250 Loss 0.0402
Epoch 2 Batch 300 Loss 0.0426
Epoch 2 Batch 350 Loss 0.0413
Epoch 2 Batch 400 Loss 0.0415
Epoch 2 Batch 450 Loss 0.0222
Epoch 2 Batch 500 Loss 0.0330
Epoch 2 Batch 550 Loss 0.0359
Epoch 2 Batch 600 Loss 0.0379
Epoch 2 Loss 0.0388
Time taken for 1 epoch 545.248651266098 sec

Epoch 3 Batch 0 Loss 0.0337
Epoch 3 Batch 50 Loss 0.0362
Epoch 3 Batch 100 Loss 0.0304
Epoch 3 

In [43]:
EPOCHS = 5

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    #initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 0.0200
Epoch 1 Batch 50 Loss 0.0201
Epoch 1 Batch 100 Loss 0.0144
Epoch 1 Batch 150 Loss 0.0224
Epoch 1 Batch 200 Loss 0.0236
Epoch 1 Batch 250 Loss 0.0191
Epoch 1 Batch 300 Loss 0.0175
Epoch 1 Batch 350 Loss 0.0165
Epoch 1 Batch 400 Loss 0.0150
Epoch 1 Batch 450 Loss 0.0125
Epoch 1 Batch 500 Loss 0.0119
Epoch 1 Batch 550 Loss 0.0104
Epoch 1 Batch 600 Loss 0.0131
Epoch 1 Loss 0.0157
Time taken for 1 epoch 550.7450428009033 sec

Epoch 2 Batch 0 Loss 0.0146
Epoch 2 Batch 50 Loss 0.0137
Epoch 2 Batch 100 Loss 0.0143
Epoch 2 Batch 150 Loss 0.0131
Epoch 2 Batch 200 Loss 0.0126
Epoch 2 Batch 250 Loss 0.0161
Epoch 2 Batch 300 Loss 0.0126
Epoch 2 Batch 350 Loss 0.0100
Epoch 2 Batch 400 Loss 0.0118
Epoch 2 Batch 450 Loss 0.0099
Epoch 2 Batch 500 Loss 0.0152
Epoch 2 Batch 550 Loss 0.0157
Epoch 2 Batch 600 Loss 0.0104
Epoch 2 Loss 0.0134
Time taken for 1 epoch 551.1150047779083 sec

Epoch 3 Batch 0 Loss 0.0122
Epoch 3 Batch 50 Loss 0.0151
Epoch 3 Batch 100 Loss 0.0093
Epoch 3

In [44]:
def evaluate(inputs, encoder, decoder, max_source_length, max_target_length):
    # inputs = (1, Tx)

    
    input_sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        input_sentence = input_sentence + source_sentence_tokenizer.index_word[i] + ' '
    #input_sentence = input_sentence[:-1]
    

    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    # Encoder: 
    # Input: x, init_state
    # Return: enc_sequential, enc_final_h, enc_final_c

    init_state = [tf.zeros((1, units))]
    enc_sequential, enc_final_h, enc_final_c = encoder(inputs)

    dec_h = enc_final_h
    dec_c = enc_final_c
    
    dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']], 0)             # dec_input = (1, 1)

    # Decoder:
    # Input: y, dec_h, dec_c, enc_sequential
    # Return: y, dec_h, dec_c, attention_weights

    # start decoding
    for t in range(max_target_length): # limit the length of the decoded sequence
        predictions, dec_h, dec_c, attention_weights = decoder(dec_input, dec_h, dec_c, enc_sequential)    # predictions = (1, vocab_size)
                                                                                                           # dec_h = (1, 2*units)
                                                                                                           # dec_c = (1, 2*units)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_sentence_tokenizer.index_word[predicted_id] + ' '

        # stop decoding if '_end' is predicted
        if target_sentence_tokenizer.index_word[predicted_id] == '_end':
            return result, input_sentence
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)                                         # dec_input = (1,1)  

    return result, input_sentence
  
def predict_random_val_sentence():
    
    k = np.random.randint(len(source_train_tensor))
    random_input = source_train_tensor[k]
    random_output = target_train_tensor[k]
    random_input = np.expand_dims(random_input,0)           # random_input = (1, Tx)
    result, sentence = evaluate(random_input, encoder, decoder, max_source_length, max_target_length)
    print(f'Input: {sentence[7:-5]}')                   # Want to skip "start_ " and " _end"
    print(f'Predicted translation: {result[:-5]}')
    true_translation = ''
    for i in random_output:
        if i == 0:
            break
        true_translation = true_translation + target_sentence_tokenizer.index_word[i] + ' '
    true_translation = true_translation[7:-6]               # Want to skip "start_" and " _end "
    print(f'Actual translation: {true_translation}')


In [45]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: tom bent down and picked up a stone 
Predicted translation: tom bestellte sich und nahm einen kuchen 
Actual translation: tom beugte sich herab und nahm einen stein auf
None
Input: my cat is sleeping on my bed 
Predicted translation: meine katze schlaeft auf meinem bett 
Actual translation: meine katze schlaeft auf meinem bett
None
Input: we have so many students 
Predicted translation: wir haben so viele schueler 
Actual translation: wir haben so viele schueler
None
Input: tom often complains about mosquitoes 
Predicted translation: tom beschwert sich haeufig ueber muecken 
Actual translation: tom beschwert sich haeufig ueber muecken
None
Input: tom listened to the chirping of the birds 
Predicted translation: tom hoerte wie sich in die operation versteckt 
Actual translation: tom lauschte dem gezwitscher der voegel
None


In [46]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: tom doesnt know if he can do what youre asking him to do 
Predicted translation: tom weiss nicht ob er zu dem worum du ihn batst in der lage ist 
Actual translation: tom weiss nicht ob er zu dem worum du ihn batst in der lage ist
None
Input: i thought you knew me 
Predicted translation: ich dachte du wuerdest mir verstanden 
Actual translation: ich dachte du kenntest mich
None
Input: it has to mean something 
Predicted translation: es muss etwas wofuer es bedeutet 
Actual translation: das muss was bedeuten
None
Input: tom is safe 
Predicted translation: tom ist in sicherheit 
Actual translation: tom ist in sicherheit
None
Input: were up a creek without a paddle 
Predicted translation: wir sind in einer schwierigen lage 
Actual translation: wir sind in einer schwierigen lage
None
Input: the surgeon who did toms operation is very experienced and highly regarded 
Predicted translation: der groesste junge hat tom sehr gute spieler der dunkelheit 
Actual translation: der chirurg der 

In [42]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: i work very hard 
Predicted translation: ich esse sehr gut 
Actual translation: ich arbeite sehr hart
None
Input: could you please do me a favor 
Predicted translation: koenntest du mir bitte einen gefallen tun 
Actual translation: koennten sie mir bitte einen gefallen tun
None
Input: i wish id been born a canadian 
Predicted translation: ich wuenschte ich waere als kanadierin geboren 
Actual translation: ich waere gerne als kanadier geboren worden
None
Input: have you found your glasses yet not yet 
Predicted translation: haben sie schon mal eine chance schon gestern gemacht 
Actual translation: hast du deine brille schon gefunden noch nicht
None
Input: i brought tom some fruit 
Predicted translation: ich habe tom etwas obst mitgebracht 
Actual translation: ich habe tom etwas obst mitgebracht
None
Input: can you pay attention please 
Predicted translation: kannst du bitte aufpassen 
Actual translation: koenntest du bitte aufpassen
None
Input: he is about thirty 
Predicted trans