# Encoder decoder translation model

In [1]:
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from lib import clean_sentence, tokenise

## Data Preprocessing

In [2]:
path_to_data = "./data/fra.txt"

In [3]:
# read data
translation_file = open(path_to_data, "r", encoding='utf-8')
raw = translation_file.read()
translation_file.close()

In [4]:
raw = raw.split('\n')
pairs = [sentence.split('\t') for sentence in raw[:-1]]

In [5]:
# clean the data
english_sen = [clean_sentence(pair[0]) for pair in pairs]
french_sen = [clean_sentence(pair[1]) for pair in pairs]

In [6]:
# tokenise the sentence
fre_txt_tokenised, fre_txt_tokeniser = tokenise(french_sen)
eng_txt_tokenised, eng_txt_tokeniser = tokenise(english_sen)

In [7]:
print(f'Max length of French sentence: {len(max(fre_txt_tokenised, key=len))}')
print(f'Max length of English sentence: {len(max(eng_txt_tokenised, key=len))}')

Max length of French sentence: 55
Max length of English sentence: 47


In [8]:
fre_vocab = len(fre_txt_tokeniser.word_index) + 1
eng_vocab = len(eng_txt_tokeniser.word_index) + 1
print(f"French vocab is of {fre_vocab}")
print(f"English vocab is of {eng_vocab}")

French vocab is of 37638
English vocab is of 16447


In [9]:
# find max length of all the sentences in each language
max_fre_len = int(len(max(fre_txt_tokenised, key=len)))
max_eng_len = int(len(max(eng_txt_tokenised, key=len)))

# add padding at the end of the sequences so that sentence of the same language all have the same length
fre_pad_sen = pad_sequences(fre_txt_tokenised, max_fre_len, padding = "post")
eng_pad_sen = pad_sequences(eng_txt_tokenised, max_eng_len, padding = "post")

# reshape data, add a dimension
fre_pad_sen = fre_pad_sen.reshape(fre_pad_sen.shape[0], fre_pad_sen.shape[1], 1)
eng_pad_sen = eng_pad_sen.reshape(eng_pad_sen.shape[0], eng_pad_sen.shape[1], 1)

## Build Model

In [10]:
# input embedding
input_sequence = Input(shape=(max_fre_len, ))
embedding = Embedding(input_dim=fre_vocab, output_dim=128, )(input_sequence)
# LSTM layer of encoder
encoder = LSTM(64, return_sequences=False)(embedding)
# hidden layer
r_vec = RepeatVector(max_eng_len)(encoder)
# LSTM layer of decoder, return sequence is True bc we want the output vector at each timestep
decoder = LSTM(64, return_sequences = True, dropout = 0.2)(r_vec)
# output layer
logits = TimeDistributed(Dense(eng_vocab))(decoder)

Input layer is a sequence of int representing indices of input words in mandarin

Output dim of the embedding vector converts any Mandarin word into a vector of shape of the specified output dim where each dim represents a characteristic defining the word. The higher dim, the more semantic meaning and the more calculations.

Each hidden state represents a summary of the input sequence up to the corresponding time step. We only want the final hidden state which is the last time step of the input sequence due to reduced computational complexity, variable length inputs and better performance. Hidden vector is repeated n times using RepeatVector so LSTM in the decoder layer receives the same vector. N is defined here as the number of time steps in the decoder, which is the max English sentence length.

Dense layer is used to predict translated word. The number of unit is shape of output vector which is the same as length of English vocabulary bc all values will be close to 0 except 1. The index of unit that outputs a 1 will then be mapped to a dictionary for a word. It predicts one word at a time using TimeDistributed bc it applies same Dense layer to every time step.

In [11]:
# stack layers to create a model
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                      optimizer = Adam(1e-3),
                      metrics=['accuracy'])
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 55)]              0         
                                                                 
 embedding (Embedding)       (None, 55, 128)           4817664   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVector  (None, 47, 64)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 47, 64)            33024     
                                                                 
 time_distributed (TimeDistr  (None, 47, 16447)        1069055   
 ibuted)                                                     

# Model Training

In [12]:
model_output = enc_dec_model.fit(fre_pad_sen, eng_pad_sen, batch_size = 32, epochs = 30)

Epoch 1/30
Epoch 2/30
 997/6529 [===>..........................] - ETA: 1:14:12 - loss: 0.8662 - accuracy: 0.8765

KeyboardInterrupt: 

In [56]:
enc_dec_model.save("enc_dec-LSTM.h5")

## Evaluation

In [13]:
def logits_to_sentence(logits, tokeniser):
    index_to_words = {idx: word for word, idx in tokeniser.word_index.items()}
    index_to_words[0] = '<empty>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

index = 5
print(f"The English sentence is {english_sen[index]}")
print(f"The French sentence is: {french_sen[index]}")

The English sentence is hi
The French sentence is: salut


In [14]:
print(logits_to_sentence(enc_dec_model.predict(fre_pad_sen[index:index+1])[0], eng_txt_tokeniser))

i you to <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>


In [15]:
fre_word = "Je suis Joyce"  # Replace with your random Chinese word
fre_seq = fre_txt_tokeniser.texts_to_sequences([fre_word])
fre_pad_seq = pad_sequences(fre_seq, maxlen=max_fre_len, padding='post')

# Generate the English translation
eng_logits = enc_dec_model.predict(fre_pad_seq)
eng_sentence = logits_to_sentence(eng_logits[0], eng_txt_tokeniser)

# Print the translation
print("French word:", fre_word)
print("English translation:", eng_sentence)

French word: Je suis Joyce
English translation: i you to <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>
