<a href="https://colab.research.google.com/github/kartthik-18/Seq2Seq-Machine-Translation-with-LSTM/blob/main/Seq2Seq_Machine_Translation_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load IPython's autoreload extension and select reload mode 1 for this kernel.
# NOTE(review): mode 1 is understood to reload only modules registered via %aimport,
# and no %aimport appears in the visible cells — confirm this mode is intended.
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import collections
import numpy as np

from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

### Verify access to the GPU

In [None]:
# Use the public tf.config API instead of tensorflow.python.client.device_lib:
# everything under tensorflow.python is internal and may change between releases.
import tensorflow as tf

# List every physical device TensorFlow can see; a device of type "GPU" in the
# printed list confirms that the runtime has GPU access.
print(tf.config.list_physical_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12852348280800136569
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14619377664
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12669869457634620505
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [None]:
import os

# Display the entries of the current working directory (the two corpus files
# are expected to be listed here before they are opened below).
os.listdir(".")


['.config', 'small_vocab_fr.txt', 'small_vocab_en.txt', 'sample_data']

In [None]:
with open('small_vocab_en.txt', 'r') as f:
    eng_sentences = f.read().split('\n')

with open('small_vocab_fr.txt', 'r') as f:
    fre_sentences = f.read().split('\n')



In [None]:
# Preview the first three sentence pairs (numbered from 1 in the printed output).
for sample_no in range(1, 4):
    english_text = eng_sentences[sample_no - 1]
    french_text = fre_sentences[sample_no - 1]
    print('English Sentence {} :  {}'.format(sample_no, english_text))
    print('French Sentence {}  :  {}\n'.format(sample_no, french_text))

English Sentence 1 :  new jersey is sometimes quiet during autumn , and it is snowy in april .
French Sentence 1  :  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

English Sentence 2 :  the united states is usually chilly during july , and it is usually freezing in november .
French Sentence 2  :  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .

English Sentence 3 :  california is usually quiet during march , and it is usually hot in june .
French Sentence 3  :  california est généralement calme en mars , et il est généralement chaud en juin .



# 2. Pre-process text
## 2.1. Tokenize function

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer


def tokenize(x, encode_start_end = False):
    """
    Fit a fresh Tokenizer on x and convert every sentence to a sequence of word ids.
    :param x: List of sentences (strings).
    :param encode_start_end: If True, each sentence is wrapped as
        "startofsentence <sentence> endofsentence" before fitting and encoding.
    :return: Tuple (tokenized sequences, fitted tokenizer)
    """
    texts = x
    if encode_start_end:
        # Add the explicit start/end markers around every sentence.
        texts = list(map(lambda sentence: "startofsentence " + sentence + " endofsentence", x))

    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(texts)

    return fitted_tokenizer.texts_to_sequences(texts), fitted_tokenizer

## 2.2. Padding function

In [None]:
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """
    Pad (or truncate) every sequence in x at its end so all share one length.
    :param x: List of sequences.
    :param length: Target length.  When None, the length of the longest sequence in x is used.
    :return: Padded numpy array of sequences
    """
    # Fall back to the longest sequence only when no explicit length was given.
    target_length = length if length is not None else max(map(len, x))

    return pad_sequences(x, maxlen = target_length, padding = 'post', truncating = 'post')

In [None]:
# Tokenize both corpora; only the French (decoder) side gets start/end markers.
eng_tokenized, eng_tokenizer = tokenize(eng_sentences)
fre_tokenized, fre_tokenizer = tokenize(fre_sentences, encode_start_end = True)

# Pad each corpus to the length of its own longest sentence.
eng_encoded, fre_encoded = pad(eng_tokenized), pad(fre_tokenized)

# Number of distinct words in each tokenizer's word index.
eng_vocab_size, fre_vocab_size = (len(tok.word_index) for tok in (eng_tokenizer, fre_tokenizer))

print("English vocabulary size: ", eng_vocab_size)
print("french vocabulary size: ", fre_vocab_size)
print()

# The padded arrays are rectangular, so axis 1 is the common sequence length.
eng_seq_len, fre_seq_len = eng_encoded.shape[1], fre_encoded.shape[1]

print("Length of longest English sentence: ", eng_seq_len)
print("Length of longest french sentence: ", fre_seq_len)
print()

English vocabulary size:  199
french vocabulary size:  346

Length of longest English sentence:  15
Length of longest french sentence:  23



# 3. Build Seq2Seq Model & Train
## 3.1. Training model

In [None]:
# Vocabulary sizes used to dimension the Embedding and output Dense layers:
# one more than the number of words in each tokenizer's word index, leaving
# room for index 0 (the value the padded positions hold).
english_vocab_size = len(eng_tokenizer.word_index) + 1
ed_french_vocab_size = len(fre_tokenizer.word_index) + 1

from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

ed_preproc_english_sentences = eng_encoded
ed_preproc_french_sentences = fre_encoded

english_input = ed_preproc_english_sentences   # (137861, 15)

# Teacher forcing: the decoder sees the French sentence without its last token
# and is trained to predict the same sentence shifted left by one position.
decoder_french_input = ed_preproc_french_sentences[:, :-1]
decoder_french_target = ed_preproc_french_sentences[:, 1:]

# Reshape to (batch_size, seq_len, 1).
# This must happen exactly once: the decoder Input is declared with
# shape (None, 1), so a second expand_dims would produce 4-D arrays
# (batch_size, seq_len, 1, 1) that no longer match the model.
decoder_french_input = np.expand_dims(decoder_french_input, -1)
decoder_french_target = np.expand_dims(decoder_french_target, -1)


# 1. Define Encoder
# The encoder takes variable-length sequences of English word ids, embeds them,
# and runs them through an LSTM whose final states are handed to the decoder.
input_seq_encoder = Input(shape = (None, ),
                          name = "encoder_input")     # (batch_size, sentence_length)

# Size of the learned word vectors produced by the Embedding layer.
embed_dim = 200
embedded_seq_encoder = Embedding(input_dim = english_vocab_size,
                                 output_dim = embed_dim)(input_seq_encoder)

# return_state = True makes the layer return its final hidden and cell states
# in addition to its output.
# NOTE(review): activation = 'relu' overrides the LSTM default; the Keras docs
# tie the fast cuDNN kernel to the default activation — confirm this is intended.
encoder_lstm = LSTM(units = 256,
                    activation = 'relu',
                    return_sequences = False,
                    return_state = True,
                    name = "encoder_LSTM")

# The LSTM output itself is discarded; only the two final states are kept.
_, last_hidden_encoder, last_cell_encoder = encoder_lstm(embedded_seq_encoder)


# 2. Define Decoder
# The decoder input carries one feature per timestep and is fed straight into
# the LSTM (there is no Embedding layer on the decoder side).
input_seq_decoder = Input(shape = (None, 1),
                          name = "decoder_input")     # (batch_size, sentence_length, 1)

# return_sequences = True: one hidden state per timestep, so a word can be
# predicted at every position. The layer object is reused by the inference decoder.
# NOTE(review): activation = 'relu' overrides the LSTM default — confirm this is intended.
decoder_lstm = LSTM(units = 256,
                    activation = 'relu',
                    return_sequences = True,
                    return_state = True,
                    name = "decoder_LSTM")

# The decoder starts from the encoder's final hidden/cell states; its own final
# states are not needed while training.
all_hidden_decoder, _, _ = decoder_lstm(input_seq_decoder,
                                        initial_state = [last_hidden_encoder, last_cell_encoder])

decoder_dense = Dense(ed_french_vocab_size,   # NOT TIMEDISTRIBUTED (NOT RECURSIVE)
                      activation = 'softmax',
                      name = "decoder_dense")
# Despite the name, `logits` holds softmax outputs (see activation above),
# one distribution over the French vocabulary per timestep.
logits = decoder_dense(all_hidden_decoder)


# 3. Define Model
# Two inputs (English ids, shifted French ids) -> per-timestep word distribution.
final_rnn_model = Model(
    inputs = [input_seq_encoder, input_seq_decoder],
    outputs = logits,
)

# Targets are integer word ids, hence the sparse variant of the cross-entropy loss.
final_rnn_model.compile(
    optimizer = Adam(learning_rate = 0.002),
    loss = sparse_categorical_crossentropy,
    metrics = ['accuracy'],
)

# 4. Fit the Model
# The final 20% of the samples are held out for validation.
final_rnn_model.fit(
    x = [english_input, decoder_french_input],
    y = decoder_french_target,
    batch_size = 1024,
    epochs = 16,
    validation_split = 0.2,
)

Epoch 1/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 102ms/step - accuracy: 0.2036 - loss: 2.4966 - val_accuracy: 0.1655 - val_loss: 0.6956
Epoch 2/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 60ms/step - accuracy: 0.1644 - loss: 0.5956 - val_accuracy: 0.1647 - val_loss: 0.3790
Epoch 3/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - accuracy: 0.1636 - loss: 0.3372 - val_accuracy: 0.1644 - val_loss: 0.1782
Epoch 4/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 60ms/step - accuracy: 0.1636 - loss: 0.1439 - val_accuracy: 0.1643 - val_loss: 0.0673
Epoch 5/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - accuracy: 0.1633 - loss: 0.0525 - val_accuracy: 0.1642 - val_loss: 0.0586
Epoch 6/16
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - accuracy: 0.1634 - loss: 0.0406 - val_accuracy: 0.1642 - val_loss: 0.0354
Epoch 7/16
[1m1

<keras.src.callbacks.history.History at 0x78047088d4d0>

In [None]:
# Print the summary of the training model built above.
final_rnn_model.summary()

## 3.2. Inference model
### 3.2.1. Encoder Model for inference

In [None]:
# Inference-time encoder: maps an encoded English sentence to the encoder LSTM's
# final [hidden, cell] states. It is built from the tensors of the training graph,
# so it reuses the same layers.
last_states_encoder = [last_hidden_encoder, last_cell_encoder]
inference_encoder_model = Model(inputs = input_seq_encoder,
                                outputs = last_states_encoder)


### 3.2.2. Decoder Model for inference

In [None]:
# Inference-time decoder: runs the decoder LSTM and Dense layers from the training
# graph, but with the initial states supplied as explicit inputs and the updated
# states returned as outputs so they can be fed back in on the next step.

# Size the state placeholders from the decoder layer itself rather than repeating
# the literal 256, so this cell stays in sync if the LSTM width is changed.
decoder_state_size = decoder_lstm.units
decoder_initial_state = [Input(shape = (decoder_state_size,)),
                         Input(shape = (decoder_state_size,))]

all_hidden_decoder, last_hidden_decoder, last_cell_decoder = decoder_lstm(input_seq_decoder,
                                                                          initial_state = decoder_initial_state)

# Softmax distribution over the French vocabulary for each decoded timestep.
logits = decoder_dense(all_hidden_decoder)

inference_decoder_model = Model(inputs  = [input_seq_decoder] + decoder_initial_state,
                                outputs = [logits, last_hidden_decoder, last_cell_decoder])


### 3.2.3. Decode Sequence Function

In [None]:
# Reverse lookup for the French tokenizer: word id -> word.
target_id_to_word = {idx: word for word, idx in fre_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    """
    Greedily translate one encoded English sentence with the inference models.

    The encoder's final states seed the decoder, which is then run one word at a
    time: each step feeds the previously predicted word id and the previous
    states back in, and keeps the most probable next word.

    :param input_seq: encoded English sentence (word ids), shaped as a batch of
                      one so it can be passed to inference_encoder_model
    :return: (str) predicted French sentence, words joined by single spaces,
             without the start/end markers
    """
    # verbose = 0 keeps Keras from printing one progress bar per predict call,
    # which would otherwise bury the translations in the cell output.
    states = inference_encoder_model.predict(input_seq, verbose = 0)

    # Initialize decoder input as a length 1 sentence containing "startofsentence"
    prev_word = np.zeros((1, 1, 1))
    prev_word[0, 0, 0] = fre_tokenizer.word_index["startofsentence"]

    # Never decode more steps than the training targets have timesteps.
    max_steps = decoder_french_target.shape[1]

    decoded_words = []
    for _ in range(max_steps):
        # 1. Predict the next-word distribution and the updated LSTM states
        probabilities, last_h, last_c = inference_decoder_model.predict(
            [prev_word] + list(states), verbose = 0)

        # 2. Greedy choice: most probable word id at the single decoded timestep
        predicted_id = int(np.argmax(probabilities[0, 0, :]))
        predicted_word = target_id_to_word.get(predicted_id, '')

        # 3. End condition: stop before the end marker reaches the output
        if predicted_word == 'endofsentence':
            break

        # Ids with no word (e.g. 0, which is absent from the lookup) are skipped
        # so they do not show up as empty tokens in the joined sentence.
        if predicted_word:
            decoded_words.append(predicted_word)

        # 4. Update decoder input and state
        prev_word[0, 0, 0] = predicted_id
        states = [last_h, last_c]

    return " ".join(decoded_words)

### 3.2.4. Prediction

In [None]:
# Translate a few sentences and print each prediction next to its reference.
for i in (100, 284, 380, 345):
    # Add a leading batch axis: (seq_len,) -> (1, seq_len).
    english_seq = ed_preproc_english_sentences[i][np.newaxis, :]
    french_translation = decode_sequence(english_seq)

    report_rows = (
        ("English Sentence            : ", eng_sentences[i]),
        ("Predicted French Translation: ", french_translation),
        ("Correct French Translation  : ", fre_sentences[i]),
    )
    for label, text in report_rows:
        print(label, text)
    print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35