## Neural Machine Translation using Google's Encoder Decoder Architecture

### Importing all the dependencies

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
import tarfile
import os
import random
import matplotlib.pyplot as plt

## Turning on mixed precision policy for better performance

In [ ]:
# Make 'mixed_float16' the global Keras dtype policy for every layer created after this call.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

### Becoming one with the dataset

In [None]:
# Extract the dataset archive once; skip the work when the 'wiki' directory already exists.
if not os.path.exists('/Users/klsharma22/Desktop/EncoderDecoderExp/wiki'):
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open('/Users/klsharma22/Desktop/EncoderDecoderExp/wiki-titles.tar') as tar_ref:
        # Unpack next to the archive so the existence check above does not depend
        # on the notebook's current working directory.
        tar_ref.extractall(path='/Users/klsharma22/Desktop/EncoderDecoderExp')
    print('Files extracted')

In [None]:
# Read the Hindi-English title pairs, one pair per line.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the corpus is UTF-8 encoded - confirm against the dataset).
with open('/Users/klsharma22/Desktop/EncoderDecoderExp/wiki/hi-en/wiki-titles.hi-en', 'r', encoding='utf-8') as f:
    lines = f.readlines()

len(lines)

In [None]:
# Separate Hindi from English: the text before the first '|||' is Hindi, the text after it is English.
hin_sentences = []
eng_sentences = []
for line in lines:
    parts = line.split('|||')  # split once per line instead of once per language
    hin_sentences.append(parts[0])
    # Strip only the line terminator; slicing off the last character would eat a
    # real character on a final line that has no trailing newline.
    eng_sentences.append(parts[1].rstrip('\n'))
len(hin_sentences), len(eng_sentences)

### Visualising the dataset we have

In [None]:
# Show up to five distinct sentence pairs picked at random.
# random.sample draws without replacement, so the same pair is never printed twice;
# the min() guard keeps it valid for corpora with fewer than five sentences.
random_idx = random.sample(range(len(hin_sentences)), k=min(5, len(hin_sentences)))
for idx in random_idx:
    print(f"English sentence: {eng_sentences[idx]}")
    print(f"Hindi sentence: {hin_sentences[idx]}")
    print('--------------------\n')

In [None]:
# Number of whitespace-separated words in every sentence, per language
eng_words_sentences = list(map(lambda text: len(text.split()), eng_sentences))
hin_word_sentences = list(map(lambda text: len(text.split()), hin_sentences))

len(eng_words_sentences), len(hin_word_sentences)

In [None]:
# English vocabulary: every distinct whitespace-separated token in the corpus
eng_vocab = {token for sentence in eng_sentences for token in sentence.split()}

# Reserve entries for the start- and end-of-sequence markers
eng_vocab |= {'<SOS>', '<EOS>'}
len(eng_vocab)

In [None]:
# Wrap the English vocabulary set in a DataFrame so it can be inspected with pandas.
english_dictionary = pd.DataFrame(eng_vocab)

In [None]:
# Cross-check: the unique count reported by pandas should match len(eng_vocab).
english_dictionary.nunique()

In [None]:
# Hindi vocabulary, built the same way as the English one:
# every distinct whitespace-separated token in the corpus.
hin_vocab = {token for sentence in hin_sentences for token in sentence.split()}

# preprocess_sentence wraps the Hindi sentences in <SOS>/<EOS> as well, so the
# markers are counted here exactly as they are for eng_vocab.
hin_vocab.update(('<SOS>', '<EOS>'))

len(hin_vocab)

In [None]:
# Cross-check: load the Hindi vocabulary set into a DataFrame and count its unique
# entries; the result should match len(hin_vocab).
hindi_dictionary = pd.DataFrame(hin_vocab)
hindi_dictionary.nunique()

In [None]:
# Plot the words-per-sentence distribution of both languages side by side.
# Titles and axis labels are added so the two histograms can be told apart.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.hist(eng_words_sentences)
plt.title('English: words per sentence')
plt.xlabel('Number of words')
plt.ylabel('Number of sentences')

plt.subplot(1, 2, 2)
plt.hist(hin_word_sentences)
plt.title('Hindi: words per sentence')
plt.xlabel('Number of words')
plt.ylabel('Number of sentences')

plt.tight_layout()
plt.show()

In [None]:
# Longest sentence (in words) of each language, so that no sentence has to be truncated
max_len_eng, max_len_hin = max(eng_words_sentences), max(hin_word_sentences)
max_len_eng, max_len_hin

## Preprocess the data

In [None]:
def preprocess_sentence(sentences: list) -> list:
    """Wrap every sentence in start/end-of-sequence markers.

    Leading and trailing whitespace is stripped from each sentence before the
    '<SOS> ' prefix and ' <EOS>' suffix are attached.

    Args:
        sentences: Sentences to wrap.

    Returns:
        A new list with one wrapped sentence per input sentence, in input order.
    """
    wrapped = []
    for raw in sentences:
        wrapped.append(f"<SOS> {raw.strip()} <EOS>")
    return wrapped

In [None]:
# Add the <SOS>/<EOS> markers to both sides of the parallel corpus
eng_sentences_preprocessed, hin_sentences_preprocessed = (
    preprocess_sentence(corpus) for corpus in (eng_sentences, hin_sentences)
)

In [None]:
# Print one randomly chosen preprocessed pair: English first, then Hindi
random_idx = random.randrange(len(eng_sentences_preprocessed))
for corpus in (eng_sentences_preprocessed, hin_sentences_preprocessed):
    print(corpus[random_idx])

### Create Encoder class

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Encoder: embedding -> bidirectional LSTM -> concatenated LSTM stack -> final LSTM.

    Each intermediate LSTM receives the concatenation of the previous layer's
    (dropped-out) output and that layer's own input, so the feature width grows
    from layer to layer. The final LSTM returns its last output and both states.

    Args:
        vocab: Size of the source vocabulary (embedding ``input_dim``).
        embedding_size: Width of the token embeddings.
        units: Units of the final LSTM; the bidirectional LSTM uses ``units // 2``
            per direction.
        encoding_layers: Layer budget; ``encoding_layers - 2`` intermediate LSTMs are
            applied in ``call`` (at least one is always created).
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``, ``trainable``).
    """

    def __init__(self, vocab, embedding_size, units, encoding_layers, **kwargs):
        super().__init__(**kwargs)
        self.vocab = vocab
        self.units = units
        self.encoding_layers = encoding_layers
        self.embedding_size = embedding_size

        self.embedding = layers.Embedding(input_dim=self.vocab,
                                          output_dim=self.embedding_size,
                                          name='encoder_embedding_layer')

        # Intermediate stack: the first layer has 2 * (embedding_size + units) units and
        # every following layer is (embedding_size + units) wider than the previous one.
        # Layer names are kept exactly as they were (including 'lst_...') in case
        # existing checkpoints or configs refer to them.
        self.lstm_init_units = 2 * (self.embedding_size + self.units)
        self.lstm_units = self.lstm_init_units
        self.lstm_layers_recurrent = [
            layers.LSTM(self.lstm_init_units, return_sequences=True, name='lst_layers_recurrent_0')
        ]
        for layer_idx in range(1, self.encoding_layers - 2):
            self.lstm_units += (self.embedding_size + self.units)
            self.lstm_layers_recurrent.append(
                layers.LSTM(self.lstm_units, return_sequences=True,
                            name=f'lstm_layers_recurrent_{layer_idx}'))

        # The final layer collapses the sequence and exposes its hidden/cell state.
        self.lstm_layer_non_recurrent = layers.LSTM(self.units, return_sequences=False,
                                                    return_state=True,
                                                    name='lstm_layer_non_recurrent')
        self.bilst_layer = layers.Bidirectional(layers.LSTM(self.units // 2, return_sequences=True),
                                                name='bilst_layer')
        self.concatenate_layer = layers.Concatenate(name='concatenate_layer')
        self.dropout_layer = layers.Dropout(0.5, name='dropout_layer')

    def call(self, inputs):
        """Encode a batch of token ids.

        Args:
            inputs: Integer token ids fed to the embedding layer.

        Returns:
            A tuple ``(output, h, c)``: the dropped-out last output of the final LSTM
            and the two states it returns.
        """
        x = self.embedding(inputs)
        bilstm_output = self.bilst_layer(x)
        x = self.dropout_layer(bilstm_output)
        x = self.concatenate_layer([bilstm_output, x])
        for i in range(self.encoding_layers - 2):
            lstm_layer_output = self.lstm_layers_recurrent[i](x)
            lstm_layer_output = self.dropout_layer(lstm_layer_output)
            # Carry the layer input forward next to its output.
            x = self.concatenate_layer([lstm_layer_output, x])

        x, h, c = self.lstm_layer_non_recurrent(x)
        output = self.dropout_layer(x)

        return output, h, c

    def get_config(self):
        """Return the constructor arguments so the layer can be inspected and re-created."""
        config = super().get_config()
        config.update({
            'vocab': self.vocab,
            'embedding_size': self.embedding_size,
            'units': self.units,
            'encoding_layers': self.encoding_layers,
        })
        return config
    


In [None]:
# Hold out 20% of the sentence pairs for evaluation; English is the source (X) and
# Hindi the target (y). random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(eng_sentences_preprocessed, hin_sentences_preprocessed, test_size= 0.2,
                                                    random_state= 42)
len(X_train), len(X_test), len(y_train), len(y_test)

In [None]:
# Creating a text vectorization for the source language.
# max_tokens caps the vocabulary at the size of the raw English word set, and every
# sentence is padded/truncated to max_len_eng + 2 ids (the + 2 presumably leaves room
# for the <SOS>/<EOS> markers added during preprocessing).
# NOTE(review): pad_to_max_tokens is documented for the multi_hot/count/tf_idf output
# modes - confirm it has any effect with the default integer output.
eng_vectorizer = layers.TextVectorization(max_tokens= len(eng_vocab),
                                          pad_to_max_tokens= True,
                                          output_sequence_length= max_len_eng + 2,
                                          name= 'eng_vectorizer')

# Learn the vocabulary from the training split only.
eng_vectorizer.adapt(X_train)



In [None]:
# Vectorize one random training sentence and show the resulting ids and their shape
random_text = random.choice(X_train)
random_text_ids = eng_vectorizer(random_text)
print(f"Original text: {random_text}")
print(f"Vectorized text: {random_text_ids}")
print(f"Vector shape: {random_text_ids.shape}")

In [None]:
# Creating an instance of our encoder layer.
# vocab matches the max_tokens given to eng_vectorizer, so every id the vectorizer
# can emit has an embedding row.
encoder_layer = Encoder(vocab= len(eng_vocab),
                        embedding_size= 128,
                        units= 512,
                        encoding_layers= 8,
                        name= 'encoder_layer', trainable= True)


# Display the layer configuration.
encoder_layer.get_config()

In [None]:
# Word count of the first training sentence, then the shape of a vectorized
# batch of the first 128 training sentences.
print(len(X_train[0].split()))
print(eng_vectorizer(X_train[:128]).shape)

In [None]:
# Smoke test: run the first 128 training sentences through the encoder on the first
# GPU and print the shapes of the output and of the two returned LSTM states.
with tf.device('GPU:0'):
    output, final_memory_state, final_carry_state = encoder_layer(eng_vectorizer(X_train[:128]))
    
print(output.shape, final_memory_state.shape, final_carry_state.shape)

In [None]:
# Text vectorization for the target language.
# max_len_hin was measured before the <SOS>/<EOS> markers were added, so two extra
# positions are reserved (as eng_vectorizer does); otherwise the longest target
# sentences would be truncated and lose their trailing marker.
hin_vectorizer = tf.keras.layers.TextVectorization(max_tokens= len(hin_vocab),
                                                   pad_to_max_tokens= True,
                                                   output_sequence_length= max_len_hin + 2,
                                                   name= 'hin_vectorizer')

# Learn the vocabulary from the training targets only.
hin_vectorizer.adapt(y_train)

In [None]:
# Vectorize one random Hindi training sentence and show the resulting ids and their shape
random_text = random.choice(y_train)
random_text_ids = hin_vectorizer(random_text)

print(f"Hindi text: {random_text}")
print(f"Vectorize text: {random_text_ids}")
print(f"Vector shape: {random_text_ids.shape}")

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Decoder: embedding -> stacked LSTMs -> per-position softmax over the target vocabulary.

    Layers 0 and 1 are applied in sequence; the output of every layer from index 2 up to
    (but excluding) the last is concatenated with that layer's input; the last layer then
    feeds the output projection. All layers except the last are started from
    ``initial_state``.

    Args:
        units: Units of every LSTM layer.
        vocab_size: Size of the target vocabulary (embedding ``input_dim`` and output width).
        embedding_size: Width of the token embeddings.
        decoding_layers: Number of LSTM layers. ``call`` indexes layers 0, 1 and -1
            explicitly, so it expects at least 3 layers.
        initial_state_size: Width of the state tensors; kept for interface compatibility
            and reported by ``get_config``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``, ``trainable``).
    """

    def __init__(self, units, vocab_size, embedding_size, decoding_layers, initial_state_size, **kwargs):
        super().__init__(**kwargs)

        self.units = units
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.decoding_layers = decoding_layers
        self.initial_state_size = initial_state_size

        # No placeholder state is created: a symbolic Input cannot serve as an LSTM state.
        # When call() gets no initial_state the LSTM layers use their default state.
        self.initial_state = None
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size,
                                                   output_dim=self.embedding_size,
                                                   name='decoder_embedding_layer')
        self.concatenate_layer = tf.keras.layers.Concatenate(name='concatenate_layer')
        self.lstm_layers_recurrent = [
            tf.keras.layers.LSTM(self.units, return_sequences=True, name=f'decoder_lstm_layer_{i}')
            for i in range(self.decoding_layers)
        ]

        # The projection and the softmax are separate layers so the softmax can run in
        # float32: the notebook enables the 'mixed_float16' policy, and the Keras
        # mixed-precision guide asks for float32 model outputs for numeric stability.
        self.dense = tf.keras.layers.Dense(vocab_size, name='decode_output_layer')
        self.softmax = tf.keras.layers.Activation('softmax', dtype='float32',
                                                  name='decoder_softmax')

    def call(self, inputs, initial_state=None):
        """Decode a batch of target token ids.

        Args:
            inputs: Integer token ids fed to the embedding layer.
            initial_state: Optional state passed as ``initial_state`` to every LSTM layer
                except the last (callers in this notebook pass the encoder's
                ``[h, c]``). ``None`` leaves each LSTM on its default state.

        Returns:
            Softmax probabilities over the target vocabulary for every input position.
        """
        x = self.embedding(inputs)
        x = self.lstm_layers_recurrent[0](x, initial_state=initial_state)
        x = self.lstm_layers_recurrent[1](x, initial_state=initial_state)
        for i in range(2, self.decoding_layers - 1):
            lstm_output = self.lstm_layers_recurrent[i](x, initial_state=initial_state)
            # Keep the layer input next to its output; the feature width grows by `units`.
            x = self.concatenate_layer([x, lstm_output])

        x = self.lstm_layers_recurrent[-1](x)
        x = self.dense(x)

        return self.softmax(x)

    def get_config(self):
        """Return the constructor arguments so the layer can be inspected and re-created."""
        config = super().get_config()
        config.update({
            'units': self.units,
            'vocab_size': self.vocab_size,
            'embedding_size': self.embedding_size,
            'decoding_layers': self.decoding_layers,
            'initial_state_size': self.initial_state_size,
        })
        return config

In [None]:
# Creating an instance of our decoder layer.
# units and initial_state_size equal the encoder's `units` (512), the width of the
# encoder states that are later passed in as the decoder's initial state.
decoder_layer = Decoder(units= 512,
                        vocab_size= len(hin_vocab),
                        embedding_size= 128,
                        decoding_layers= 8,
                        initial_state_size= 512,
                        name= 'decoding_layer',
                        trainable= True)
# Display the layer configuration.
decoder_layer.get_config()

In [None]:
# Smoke test: encode the first 128 source sentences, then decode the matching 128
# target sentences starting from the encoder's final states.
with tf.device('/gpu:0'):
    encoder_output, final_memory_state, final_carry_state = encoder_layer(eng_vectorizer(X_train[:128]))
    decoder_output = decoder_layer(hin_vectorizer(y_train[:128]), initial_state= [final_memory_state, final_carry_state])
    
# Shape of the decoder's output for the sample batch.
decoder_output.shape

## Model Creation

We have created the encoder and decoder as layers in the code above using the subclassing method, and we have also created text vectorizers using the Keras `TextVectorization` layer.

Let's list the components in the order in which we want to build the model:
1. Encoder text vectorization
2. Encoder layer with a built-in embedding layer
3. Decoder text vectorization
4. Decoder layer with a built-in embedding layer, using the encoder's final states as its initial state

In [None]:
# Creating the entire encoder decoder model with the functional API.
# Both inputs are raw strings; vectorization happens inside the model.

# Encoder branch: string -> token ids -> (output, hidden state, cell state)
inputs_encoder = tf.keras.layers.Input(shape=(1,  ), dtype= 'string', name= 'encoder_input_layer')
print(inputs_encoder.shape)
encoder_text_vectors = eng_vectorizer(inputs_encoder)
print(encoder_text_vectors.shape)
encoder_outputs, final_memory_states, final_carry_states = encoder_layer(encoder_text_vectors)
# Print the shape of all three tensors (the carry state was printed as a whole tensor before).
print(encoder_outputs.shape, final_memory_states.shape, final_carry_states.shape)

# Decoder branch: string -> token ids -> per-position vocabulary probabilities,
# started from the encoder's final states.
inputs_decoder =  tf.keras.layers.Input(shape= (1,  ), dtype= 'string', name= 'decoder_input_layer')
print(inputs_decoder.shape)
decoder_text_vectors = hin_vectorizer(inputs_decoder)
print(decoder_text_vectors.shape)
decoder_outputs = decoder_layer(decoder_text_vectors, initial_state= [final_memory_states, final_carry_states])
print(decoder_outputs.shape)

model = tf.keras.Model(inputs = [inputs_encoder, inputs_decoder], outputs =[decoder_outputs])

# Integer token ids are used as labels, hence the sparse cross-entropy loss.
model.compile(loss= 'sparse_categorical_crossentropy',
              optimizer= tf.keras.optimizers.RMSprop(learning_rate= 1e-3))

model.summary()

In [None]:
# Draw the model graph, labelling every node with its layer name.
tf.keras.utils.plot_model(model, show_layer_names= True)

## Creating dataset for faster and better training of the model

Using the `tf.data` API, we are going to create a better pipeline to train the model using the batch and prefetch methods.

In [None]:
# Build next-token targets for teacher forcing.
# The model vectorizes the same Hindi strings internally as decoder input, so using
# those ids unchanged as labels would only teach the decoder to copy its input.
# The labels are therefore shifted one step to the left: position t holds token t + 1.
# A 0 (the padding id of TextVectorization) is appended so the length is unchanged.
train_labels = tf.pad(hin_vectorizer(y_train)[:, 1:], [[0, 0], [0, 1]])
test_labels = tf.pad(hin_vectorizer(y_test)[:, 1:], [[0, 0], [0, 1]])
len(X_train[0].split()), len(y_train), train_labels.shape, len(X_test), len(y_test), test_labels.shape

In [None]:
# Build the input pipelines: each element is ((source sentence, target sentence), label ids).
# The nested structure is sliced in one call, which avoids rebinding `train_labels`
# to a Dataset (re-running the cell used to break) and the positional Dataset.zip
# form that older TensorFlow releases do not accept.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(((X_train, y_train), train_labels))
    .batch(128)
    .prefetch(tf.data.AUTOTUNE)
)
valid_dataset = (
    tf.data.Dataset.from_tensor_slices(((X_test, y_test), test_labels))
    .batch(128)
    .prefetch(tf.data.AUTOTUNE)
)

train_dataset, valid_dataset


In [None]:
# Train for up to 5 epochs, validating on roughly 10% of the validation batches per epoch.
with tf.device('/gpu:0'):
    history = model.fit(
        train_dataset,
        epochs=5,
        validation_data=valid_dataset,
        # At least one batch, so val_loss always exists for the callbacks below
        # even when the validation set has fewer than ten batches.
        validation_steps=max(1, int(0.1 * len(valid_dataset))),
        callbacks=[
            # Keep only the weights of the epoch with the lowest val_loss.
            tf.keras.callbacks.ModelCheckpoint('GNMT_exp.weights.h5', monitor='val_loss',
                                               save_best_only=True, save_weights_only=True),
            # Stop after 3 epochs without val_loss improvement.
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1),
            # Lower the learning rate after 2 epochs without val_loss improvement.
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2, verbose=1),
        ],
    )

In [None]:
# Restore the checkpointed weights (the best val_loss epoch saved during training)
# and evaluate them on the full validation dataset.
model.load_weights('GNMT_exp.weights.h5')
model.evaluate(valid_dataset)

In [None]:
# Predict for every element of the validation dataset and inspect the output shape.
model_pred = model.predict(valid_dataset)
model_pred.shape

In [None]:
# Predict for the last test sentence, giving the decoder only the '<SOS>' marker as input.
# NOTE(review): this is a single forward pass over the padded decoder input, not a
# token-by-token decoding loop - confirm this is the intended inference procedure.
model_preds = model.predict([tf.expand_dims(X_test[-1], axis= 0), tf.expand_dims('<SOS>', axis= 0)])
model_preds.shape

In [None]:
# Drop the size-1 dimensions (the batch axis for a single sentence) and take the
# highest-scoring vocabulary id at every position.
# NOTE: this rebinds model_preds, so the cell should not be re-run on its own.
model_preds = tf.argmax(tf.squeeze(model_preds), axis= 1)
model_preds

In [None]:
# Look up every predicted token id in the Hindi vocabulary and join the tokens with spaces
vocab = hin_vectorizer.get_vocabulary()
" ".join(vocab[token_id] for token_id in model_preds.numpy())