- functioning NMT model
- attention model using classes
- bi-directional LSTMs in Encoder
- training seems to work; loss generally going down over 3 epochs
- make sure to use batch size = 32
- Check the final evaluation function – needs to be adapted

In [1]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Concatenate, LSTM, Embedding, Dense
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

#import plotly.graph_objects as go


In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [3]:
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub("'", '', sentence)
    sentence = sentence.replace('ü', 'ue').replace('ä', 'ae').replace('ö', 'oe').replace('ß', 'ss')
    exclude = set(string.punctuation)
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = 'start_ ' + sentence + ' _end'
    sentence = sentence.encode("ascii", "ignore")
    sentence = sentence.decode()
    return sentence

In [5]:
pairs = df_en_de
pairs = pairs.sample(frac = 0.01)
pairs['english'] = pairs['english'].apply(preprocess_sentence)
pairs['german'] = pairs['german'].apply(preprocess_sentence)

In [6]:
source = pairs['english']
target = pairs ['german']

In [7]:
# create tokenizer & tensor for source and target
source_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
source_sentence_tokenizer.fit_on_texts(source)
source_tensor = source_sentence_tokenizer.texts_to_sequences(source)
source_tensor= tf.keras.preprocessing.sequence.pad_sequences(source_tensor, padding='post' )

target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(target)
target_tensor = target_sentence_tokenizer.texts_to_sequences(target)
target_tensor= tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post' )

In [8]:
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
                                                                source_tensor, target_tensor,test_size=0.2
                                                                )

In [9]:
max_target_length= max(len(t) for t in  target_tensor)
max_source_length= max(len(t) for t in source_tensor)

In [10]:
max_target_length, max_source_length

(24, 23)

In [11]:
BATCH_SIZE = 32
#Create data in memeory 
dataset = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# shuffles the data in the batch
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

2022-08-10 08:35:43.152977: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
source_batch, target_batch =next(iter(dataset))
print(source_batch.shape, target_batch.shape)
print(source_batch[1])

(32, 23) (32, 24)
tf.Tensor(
[  1  50  36 218 156   2   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0], shape=(23,), dtype=int32)


In [13]:
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch= BUFFER_SIZE//BATCH_SIZE
embedding_dim=256
units=128
attention_layer_units = 100
source_vocab_size= len(source_sentence_tokenizer.word_index)+1
target_vocab_size= len(target_sentence_tokenizer.word_index)+1

In [14]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm= Bidirectional(LSTM (encoder_units, 
                                      return_sequences=True,
                                      return_state=True,                                      
                                      recurrent_initializer='glorot_uniform'
                                     ))
    def call(self, x, ini_state):
                                                                # x = (m, Tx)
                                                                # hidden = (m, encoder_units)
                                                                
        x = self.embedding(x)                                   # x = (m, Tx, embedding_dim)
        # pass input x through bi-directional LSTM
                                                                
        (enc_sequential, enc_forward_h, 
        enc_forward_c, enc_backward_h, enc_backward_c) = self.lstm(x, initial_state = [ini_state, ini_state, ini_state, ini_state])

        # concatenate forward and backward states
        enc_final_h = Concatenate()([enc_forward_h, enc_backward_h])
        enc_final_c = Concatenate()([enc_forward_c, enc_backward_c])

        return enc_sequential, enc_final_h, enc_final_c                     # enc_sequential = (m, Tx, 2 * encoder_units) 
                                                                            # enc_h = (m, 2 * encoder_units)
                                                                            # enc_c = (m, 2 * encoder_units)
    
    def initialize_state(self):
        return tf.zeros((self.batch_size, self.encoder_units))
    
    # def initialize_cell_state(self):
    #     return tf.zeros((self.batch_size, 2 * self.encoder_units))

In [15]:
# check dimensions
encoder = Encoder(source_vocab_size, embedding_dim, units, BATCH_SIZE)

initial_state = encoder.initialize_state()
enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)

print (f'Encoder sequential: {enc_sequential.shape}')
print (f'Encoder final state_h: {enc_final_h.shape}')
print (f'Encoder final state_c: {enc_final_c.shape}')

Encoder sequential: (32, 23, 256)
Encoder final state_h: (32, 256)
Encoder final state_c: (32, 256)


In [16]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super( BahdanauAttention, self).__init__()
        self.W1= tf.keras.layers.Dense(units)  # decoder hidden (at time-step "t-1")
        self.W2= tf.keras.layers.Dense(units)  # encoder hidden (at time-step "t")
        self.V= tf.keras.layers.Dense(1)
    
    def call(self, dec_hidden, enc_hidden):
                                                                # dec_hidden = (m, 2*units) 
                                                                # enc_hidden:   (m, Tx, 2*units)

        dec_hidden_with_time = tf.expand_dims(dec_hidden, 1)    # dec_hidden_with_time = (m, 1, 2*units)
        
                                                                
        # W1() = (m, 1, 10) 
        # W2() = (m, Tx, 10)
        # Broadcasting happens when you add
        # W1() + W2 () = (m, Tx, 10)

        score = self.V(tf.nn.tanh(self.W1(dec_hidden_with_time) + self.W2(enc_hidden))) # (m, Tx, 1)
        
        # normalise scores with softmax
        attention_weights = tf.nn.softmax(score, axis=1)                                # (m, Tx, 1)
        
        # apply each weight to encoder hidden state at respective time-step 
        context_vector= attention_weights * enc_hidden                                  # (m, Tx, 2*units)
       
        # linear combination of enc_hidden vectors for all Tx
        # so sum along Tx axis
        context_vector = tf.reduce_sum(context_vector, axis=1)                          # (m, 2*units)
        return context_vector, attention_weights

In [17]:
attention_layer= BahdanauAttention(attention_layer_units)
attention_result, attention_weights = attention_layer(enc_final_h, enc_sequential)
print(f"context vector: (batch size, units) {attention_result.shape}")
print(f"attention weights: (batch_size, sequence_length, 1) {attention_weights.shape}")

context vector: (batch size, units) (32, 256)
attention weights: (batch_size, sequence_length, 1) (32, 23, 1)


In [18]:
# Decoder for one time-step

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_sz):
        super (Decoder,self).__init__()
        self.batch_sz= batch_sz
        self.decoder_units = decoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm= LSTM (decoder_units, 
                        return_sequences= True,
                        return_state=True,
                        recurrent_initializer='glorot_uniform')
        # Fully connected layer
        self.fc= Dense(vocab_size)      # Note, we don't use an activation here.
                                        # For the calculation of the loss, we will use 
                                        # sparse_softmax_cross_entropy_with_logits, which performs 
                                        # the softmax on the logits internally for greater efficiency
        
        # attention
        self.attention = BahdanauAttention(attention_layer_units)
    
    def call(self, y, dec_h, dec_c, enc_sequential):
                                                                                    # dec_h: (m, 2*units) 
                                                                                    # dec_c: (m, 2*units)
                                                                                    # enc_sequential: (m, Tx, 2*units) 

        context_vector, attention_weights = self.attention(dec_h, enc_sequential)   # context_vector = (m, 2*units)
        
        y= self.embedding(y)                                                        # y = (m, 1, embedding_dim)
        
        # concatenate context vector and embedding for output sequence
        y = tf.concat([tf.expand_dims(context_vector, 1), y],                       # (m, 1, 2*units) + (m, 1, embedding_dim)
                                      axis=-1)                                      # (m, 1, 2*units + embedding_dim)
        
        # passing the concatenated vector to the GRU
        output, dec_h, dec_c = self.lstm(y, initial_state = [dec_h, dec_c])                                         # output = (m, 1, 2*units)
                                                                                    # dec_h = (m, 2*units)
                                                                                    # dec_c = (m, 2*units)

        output= tf.reshape(output, (-1, output.shape[2]))                           # output = (m, 2*units)
        
        # pass the output thru Fc layers
        y = self.fc(output)                                                         # y = (m, vocab_size)
        return y, dec_h, dec_c, attention_weights

In [19]:
# Make sure to pass in "2*units", since the encoder uses bi-directional LSTM
# We're feeding final_h and final_c from Encoder as init_h and init_c for Decoder
decoder= Decoder(target_vocab_size, embedding_dim, 2*units, BATCH_SIZE)

sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
print (f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Decoder output shape: (batch_size, vocab size) (32, 3206)


In [20]:
#Define the optimizer and the loss function
optimizer = tf.keras.optimizers.Adam()

In [21]:
def loss_function(real, pred):      # "real" = (m, 1), "pred" = (m, vocab_size)
    mask = 1 - np.equal(real, 0)    # mask = 1 when "real" != 0
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [22]:
EPOCHS = 1

# initial_state = encoder.initialize_state()
# enc_sequential, enc_final_h, enc_final_c = encoder(source_batch, initial_state)
# sample_decoder_output, _, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final_h, enc_final_c, enc_sequential)
# decoder returns: y, dec_h, dec_c, attention_weights

for epoch in range(EPOCHS):
    start = time.time()
    
    initial_state = encoder.initialize_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final_h, enc_final_c = encoder(inp, initial_state)
            
            dec_h = enc_final_h
            dec_c = enc_final_c
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_h, dec_c, _ = decoder(dec_input, dec_h, dec_c, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 2.2702
Epoch 1 Batch 50 Loss 1.7710
Epoch 1 Loss 1.9602
Time taken for 1 epoch 41.70515418052673 sec



In [25]:
def evaluate(inputs, encoder, decoder, max_source_length, max_target_length):
    # inputs = (1, Tx)

    
    input_sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        input_sentence = input_sentence + source_sentence_tokenizer.index_word[i] + ' '
    #input_sentence = input_sentence[:-1]
    

    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    # Encoder: 
    # Input: x, init_state
    # Return: enc_sequential, enc_final_h, enc_final_c

    init_state = [tf.zeros((1, units))]
    enc_sequential, enc_final_h, enc_final_c = encoder(inputs, init_state)

    dec_h = enc_final_h
    dec_c = enc_final_c
    
    dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']], 0)             # dec_input = (1, 1)

    # Decoder:
    # Input: y, dec_h, dec_c, enc_sequential
    # Return: y, dec_h, dec_c, attention_weights

    # start decoding
    for t in range(max_target_length): # limit the length of the decoded sequence
        predictions, dec_h, dec_c, attention_weights = decoder(dec_input, dec_h, dec_c, enc_sequential)    # predictions = (1, vocab_size)
                                                                                                           # dec_h = (1, 2*units)
                                                                                                           # dec_c = (1, 2*units)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_sentence_tokenizer.index_word[predicted_id] + ' '

        # stop decoding if '_end' is predicted
        if target_sentence_tokenizer.index_word[predicted_id] == '_end':
            return result, input_sentence
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)                                         # dec_input = (1,1)  

    return result, input_sentence
  
def predict_random_val_sentence():
    
    k = np.random.randint(len(source_train_tensor))
    random_input = source_train_tensor[k]
    random_output = target_train_tensor[k]
    random_input = np.expand_dims(random_input,0)           # random_input = (1, Tx)
    result, sentence = evaluate(random_input, encoder, decoder, max_source_length, max_target_length)
    print(f'Input: {sentence[7:-5]}')                   # Want to skip "start_ " and " _end"
    print(f'Predicted translation: {result[:-5]}')
    true_translation = ''
    for i in random_output:
        if i == 0:
            break
        true_translation = true_translation + target_sentence_tokenizer.index_word[i] + ' '
    true_translation = true_translation[7:-6]               # Want to skip "start_" and " _end "
    print(f'Actual translation: {true_translation}')


In [26]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

AttributeError: Exception encountered when calling layer "forward_lstm" (type LSTM).

'list' object has no attribute 'shape'

Call arguments received:
  • inputs=tf.Tensor(shape=(1, 23, 256), dtype=float32)
  • mask=None
  • training=None
  • initial_state=[['tf.Tensor(shape=(1, 128), dtype=float32)'], ['tf.Tensor(shape=(1, 128), dtype=float32)']]

In [151]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: start_ now just relax _end
Predicted translation: ich habe eine gute idee _end 
Actual translation: start_ jetzt entspann dich einfach _end 
None
Input: start_ she asked him to give her some money so she could go to a restaurant with her friends _end
Predicted translation: tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom 
Actual translation: start_ sie bat ihn um geld damit sie mit ihren freunden in ein restaurant gehen koennte _end 
None
Input: start_ lets see what we remember from the last lesson _end
Predicted translation: tom hat eine gute idee _end 
Actual translation: start_ wir wollen mal sehen was von der letzten unterrichtsstunde noch haengen geblieben ist _end 
None
Input: start_ is this house yours _end
Predicted translation: das ist _end 
Actual translation: start_ ist das dein haus _end 
None
Input: start_ this car belongs to tom _end
Predicted translation: tom ist nicht sehr _end 
Actual translation: start_ dieser 

In [152]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: start_ i thought you were older than me _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ ich dachte du waerest aelter als ich _end 
None
Input: start_ these are real _end
Predicted translation: das ist ein guter gitarrist _end 
Actual translation: start_ die sind echt _end 
None
Input: start_ do you think i dont care _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ glaubst du etwa das ist mir gleich _end 
None
Input: start_ i dont understand why people idolize criminals _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ ich verstehe nicht warum manche leute verbrecher verehren _end 
None
Input: start_ the kitchen table was bare ex