- functioning NMT model
- attention model using classes
- uni-directional gru

In [62]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

#import plotly.graph_objects as go


In [196]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [88]:
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [89]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub("'", '', sentence)
    sentence = sentence.replace('ü', 'ue').replace('ä', 'ae').replace('ö', 'oe').replace('ß', 'ss')
    exclude = set(string.punctuation)
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = 'start_ ' + sentence + ' _end'
    sentence = sentence.encode("ascii", "ignore")
    sentence = sentence.decode()
    return sentence

In [157]:
pairs = df_en_de
pairs = pairs.sample(frac = 0.1)
pairs['english'] = pairs['english'].apply(preprocess_sentence)
pairs['german'] = pairs['german'].apply(preprocess_sentence)

In [158]:
source = pairs['english']
target = pairs ['german']

In [159]:
# create tokenizer & tensor for source and target
source_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
source_sentence_tokenizer.fit_on_texts(source)
source_tensor = source_sentence_tokenizer.texts_to_sequences(source)
source_tensor= tf.keras.preprocessing.sequence.pad_sequences(source_tensor, padding='post' )

target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(target)
target_tensor = target_sentence_tokenizer.texts_to_sequences(target)
target_tensor= tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post' )

In [160]:
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
                                                                source_tensor, target_tensor,test_size=0.2
                                                                )

In [161]:
max_target_length= max(len(t) for t in  target_tensor)
max_source_length= max(len(t) for t in source_tensor)

In [162]:
max_target_length, max_source_length

(34, 33)

In [163]:
BATCH_SIZE = 64
#Create data in memeory 
dataset = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# shuffles the data in the batch
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [165]:
source_batch, target_batch =next(iter(dataset))
print(source_batch.shape, target_batch.shape)
print(source_batch[1])

(64, 33) (64, 34)
tf.Tensor(
[   1   93  194  286    4   60   29 4839 3489    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0], shape=(33,), dtype=int32)


In [184]:
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch= BUFFER_SIZE//BATCH_SIZE
embedding_dim=256
units=1024
attention_layer_units = 100
source_vocab_size= len(source_sentence_tokenizer.word_index)+1
target_vocab_size= len(target_sentence_tokenizer.word_index)+1

In [185]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding =tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru= tf.keras.layers.GRU(encoder_units, 
                                      return_sequences=True,
                                      return_state=True,                                      
                                      recurrent_initializer='glorot_uniform'
                                     )
    def call(self, x, hidden):
                                                                # x = (m, Tx)
                                                                # hidden = (m, encoder_units)
        x = self.embedding(x)                                   # (m, Tx, embedding_dim)
                                                                
        enc_sequential, enc_final = self.gru(x, initial_state = hidden)
        return enc_sequential, enc_final                                    # enc_sequential = (m, Tx, encoder_units) 
                                                                            # enc_final = (m, encoder_units)
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoder_units))

In [186]:
encoder = Encoder(source_vocab_size, embedding_dim, units, BATCH_SIZE)

enc_final = encoder.initialize_hidden_state()
enc_sequential, enc_final = encoder(source_batch, enc_final)

print (f'Encoder sequential: {enc_sequential.shape}')
print (f'Encoder final: {enc_final.shape}')

Encoder sequential: (64, 33, 1024)
Encoder final: (64, 1024)


In [187]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super( BahdanauAttention, self).__init__()
        self.W1= tf.keras.layers.Dense(units)  # decoder hidden
        self.W2= tf.keras.layers.Dense(units)  # encoder hidden
        self.V= tf.keras.layers.Dense(1)
    
    def call(self, dec_hidden, enc_hidden):
                                                                # dec_hidden = (m, units)
                                                                # enc_hidden:   (m, Tx, units)
        dec_hidden_with_time = tf.expand_dims(dec_hidden, 1)    # dec_hidden_with_time = (m, 1, units)
        
                                                                
        # W1() = (m, 1, 10) 
        # W2() = (m, Tx, 10)
        # Broadcasting happens when you add
        # W1() + W2 () = (m, Tx, 10)

        score = self.V(tf.nn.tanh(self.W1(dec_hidden_with_time) + self.W2(enc_hidden))) # (m, Tx, 1)
        
        # normalise scores with softmax
        attention_weights = tf.nn.softmax(score, axis=1)                                # (m, Tx, 1)
        
        # apply each weight to encoder hidden state at respective time-step 
        context_vector= attention_weights * enc_hidden                                  # (m, Tx, units)
       
        # linear combination of enc_hidden vectors for all Tx
        # so sum along Tx axis
        context_vector = tf.reduce_sum(context_vector, axis=1)                          # (m, units)
        return context_vector, attention_weights

In [188]:
# tens1 = tf.Variable([[[1,2],[1,2]], [[1,2],[1,2]], [[1,2],[1,2]]])
# tens2 = tf.Variable([[[2],[2]], [[2],[2]], [[2],[2]]])
# tens1 * tens2

In [189]:
attention_layer= BahdanauAttention(attention_layer_units)
attention_result, attention_weights = attention_layer(enc_final, enc_sequential)
print(f"context vector: (batch size, units) {attention_result.shape}")
print(f"attention weights: (batch_size, sequence_length, 1) {attention_weights.shape}")

context vector: (batch size, units) (64, 1024)
attention weights: (batch_size, sequence_length, 1) (64, 33, 1)


In [190]:
# Decoder for one time-step

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_sz):
        super (Decoder,self).__init__()
        self.batch_sz= batch_sz
        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, 
                                                   embedding_dim)
        self.gru= tf.keras.layers.GRU(decoder_units, 
                                      return_sequences= True,
                                      return_state=True,
                          recurrent_initializer='glorot_uniform')
        # Fully connected layer
        self.fc= tf.keras.layers.Dense(vocab_size)
        
        # attention
        #self.attention = BahdanauAttention(self.decoder_units)
        self.attention = BahdanauAttention(attention_layer_units)
    
    def call(self, y, dec_hidden, enc_hidden):
                                                                                    # dec_hidden: (m, units) 
                                                                                    # enc_hidden: (m, Tx, units) 

        context_vector, attention_weights = self.attention(dec_hidden, enc_hidden)  # context_vector = (m, units)
        
        y= self.embedding(y)                                                        # y = (m, 1, embedding_dim)
        
        # concatenate context vector and embedding for output sequence
        y = tf.concat([tf.expand_dims(context_vector, 1), y],                       # (m, 1, units) + (m, 1, embedding_dim)
                                      axis=-1)                                      # (m, 1, units + embedding_dim)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(y)                                                 # output = (m, 1, units)
                                                                                    # state = (m, units)

        output= tf.reshape(output, (-1, output.shape[2]))                           # output = (m, units)
        
        # pass the output thru Fc layers
        y = self.fc(output)                                                         # y = (m, vocab_size)
        return y, state, attention_weights

In [191]:
decoder= Decoder(target_vocab_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _= decoder(tf.random.uniform((BATCH_SIZE,1)), enc_final, enc_sequential )
print (f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Decoder output shape: (batch_size, vocab size) (64, 12072)


In [192]:
#Define the optimizer and the loss function
optimizer = tf.keras.optimizers.Adam()

In [194]:
def loss_function(real, pred):      # "real" = (m, 1), "pred" = (m, vocab_size)
    mask = 1 - np.equal(real, 0)    # mask = 1 when "real" != 0
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [195]:
EPOCHS = 5

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
                                                        # inp: (batch_size, Tx)
                                                        # targ: (batch_size, Ty)
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_sequential, enc_final = encoder(inp, hidden)
            
            dec_hidden = enc_final
            
            dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)    # (m, 1)   
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_sequential) # predictions = (m, vocab_size)
                
                loss += loss_function(targ[:, t], predictions)
                
                # update dec_input for teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 1.9135


KeyboardInterrupt: 

In [223]:
def evaluate(inputs, encoder, decoder, max_source_length, max_target_length):
    # inputs = (1, Tx)

    
    input_sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        input_sentence = input_sentence + source_sentence_tokenizer.index_word[i] + ' '
    input_sentence = input_sentence[:-1]
    

    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_sequential, enc_final = encoder(inputs, hidden)

    dec_hidden = enc_final
    dec_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']], 0)             # dec_input = (1, 1)

    # start decoding
    for t in range(max_target_length): # limit the length of the decoded sequence
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_sequential)    # predictions = (1, vocab_size)
                                                                                                       # dec_hidden = (1, units)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_sentence_tokenizer.index_word[predicted_id] + ' '

        # stop decoding if '<end>' is predicted
        if target_sentence_tokenizer.index_word[predicted_id] == '_end':
            return result, input_sentence
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)                                         # dec_input = (1,1)  

    return result, input_sentence
  
def predict_random_val_sentence():
    
    k = np.random.randint(len(source_train_tensor))
    random_input = source_train_tensor[k]
    random_output = target_train_tensor[k]
    random_input = np.expand_dims(random_input,0)           # random_input = (1, Tx)
    result, sentence = evaluate(random_input, encoder, decoder, max_source_length, max_target_length)
    print(f'Input: {sentence[7:-5]}')                   # Want to skip "start_ " and " _end"
    print(f'Predicted translation: {result[:-5]}')
    true_translation = ''
    for i in random_output:
        if i == 0:
            break
        true_translation = true_translation + target_sentence_tokenizer.index_word[i] + ' '
    true_translation = true_translation[7:-6]               # Want to skip "start_" and " _end "
    print(f'Actual translation: {true_translation}')


In [150]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: start_ shes very ill and has been in bed for a week _end
Predicted translation: tom ist der einzige der einzige der einzige der einzige der einzige der einzige der einzige der einzige der einzige der einzige der einzige der 
Actual translation: start_ sie ist sehr krank und liegt schon seit einiger woche im bett _end 
None
Input: start_ come on im taking you home _end
Predicted translation: ich habe eine gute idee _end 
Actual translation: start_ komm ich bringe dich nach hause _end 
None
Input: start_ in each beehive there can only be one queen _end
Predicted translation: tom hat sich auf die ganze zeit _end 
Actual translation: start_ in jedem bienenstock kann es nur eine bienenkoenigin geben _end 
None
Input: start_ im going to the bar _end
Predicted translation: ich habe eine grosse spinne in der lage sein _end 
Actual translation: start_ ich gehe in die kneipe _end 
None
Input: start_ its a wonder theyre still awake _end
Predicted translation: tom ist ein guter fahrer _end 

In [151]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: start_ now just relax _end
Predicted translation: ich habe eine gute idee _end 
Actual translation: start_ jetzt entspann dich einfach _end 
None
Input: start_ she asked him to give her some money so she could go to a restaurant with her friends _end
Predicted translation: tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom hat tom 
Actual translation: start_ sie bat ihn um geld damit sie mit ihren freunden in ein restaurant gehen koennte _end 
None
Input: start_ lets see what we remember from the last lesson _end
Predicted translation: tom hat eine gute idee _end 
Actual translation: start_ wir wollen mal sehen was von der letzten unterrichtsstunde noch haengen geblieben ist _end 
None
Input: start_ is this house yours _end
Predicted translation: das ist _end 
Actual translation: start_ ist das dein haus _end 
None
Input: start_ this car belongs to tom _end
Predicted translation: tom ist nicht sehr _end 
Actual translation: start_ dieser 

In [152]:
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())
print (predict_random_val_sentence())

Input: start_ i thought you were older than me _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ ich dachte du waerest aelter als ich _end 
None
Input: start_ these are real _end
Predicted translation: das ist ein guter gitarrist _end 
Actual translation: start_ die sind echt _end 
None
Input: start_ do you think i dont care _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ glaubst du etwa das ist mir gleich _end 
None
Input: start_ i dont understand why people idolize criminals _end
Predicted translation: ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich habe ich 
Actual translation: start_ ich verstehe nicht warum manche leute verbrecher verehren _end 
None
Input: start_ the kitchen table was bare ex