# Neural Machine Translation: a seq2seq implementation to translate English to German

In [None]:
%%capture
!pip install "tensorflow-text>=2.11"

In [None]:
# import necessary libs
import numpy as np
import re
import gdown
import os
import random
import nltk
from tqdm import tqdm
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.utils as utils
import tensorflow_text as tf_text

### Introduction
This notebook implements the seq2seq model proposed by [Sutskever et al., 2014](https://arxiv.org/pdf/1409.3215.pdf).

In [None]:
# Let's define some constant variables
max_vocab_size = 20000  # upper bound (max_tokens) for each TextVectorization vocabulary
dropout = .5  # dropout rate read by the encoder and decoder LSTM layers

BUFFER_SIZE = 256  # shuffle buffer size of the tf.data pipelines
BATCH_SIZE = 64  # number of sentence pairs per batch

embedding_size = 256  # output dimension of the embedding layers
hidden_units = 128  # number of units of each LSTM layer

data_file = "deu.txt"  # name of the sentence-pair file inside data_dir
data_dir = '/content/data/'  # directory the dataset file is read from
# Create the data directory up front; exist_ok makes the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

# Data Preprocessing
We will use the [Kaggle English to German](https://www.kaggle.com/datasets/kaushal2896/english-to-german) data as the dataset on the English-German language pair.

In [None]:
# Download dataset
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1jqne1TQIir7usU7iBCCveOzCmtVLJcLU' -O en_de.zip
!unzip en_de.zip -d data/

--2024-01-17 15:02:14--  https://drive.google.com/uc?export=download&id=1jqne1TQIir7usU7iBCCveOzCmtVLJcLU
Resolving drive.google.com (drive.google.com)... 142.251.2.138, 142.251.2.102, 142.251.2.100, ...
Connecting to drive.google.com (drive.google.com)|142.251.2.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1jqne1TQIir7usU7iBCCveOzCmtVLJcLU&export=download [following]
--2024-01-17 15:02:14--  https://drive.usercontent.google.com/download?id=1jqne1TQIir7usU7iBCCveOzCmtVLJcLU&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.2.132, 2607:f8b0:4023:c0d::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8381019 (8.0M) [application/octet-stream]
Saving to: ‘en_de.zip’


2024-01-17 15:02:16 (39.9 MB/s) - ‘en_de.zip’ saved [8381019/8381019]

Arc

In [None]:
# Take a look at the first lines
# The file contains German umlauts and 'ß', so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(os.path.join(data_dir, data_file), encoding="utf-8") as f:
    for n, line in enumerate(f):
        if n >= 5:  # only the first five lines are shown
            break
        print(line)

Go.	Geh.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)

Hi.	Hallo!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)

Hi.	Grüß Gott!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)

Run!	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)

Run.	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)



To keep the text consistent and avoid losing the meaning of words, contractions are expanded into their full forms.

In [None]:
contraction_mapping = {
    # This should be wrapped as a JSON file.
    # Whole-word special cases: they must win over the generic suffix rules
    # below (e.g. "won't" must not be turned into "wo not" by the "n't" rule).
    "Let's": "Let us",
    "let's": "let us",
    "Won't": "Will not",
    "won't": "will not",
    "Can't": "Can not",
    "can't": "can not",
    "cannot": "can not",
    "'d better": " had better",
    # Generic suffix rules.
    "'s": " is",
    "'re": " are",
    "n't": " not",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
}

def expand_contraction(text, mapping=contraction_mapping):
    """
        Expand English contractions in a sentence.

    Contractions are replaced longest-first, so that a specific entry
    (e.g. "won't", "'d better") is always applied before a shorter, more
    generic entry that is a substring of it (e.g. "n't", "'d"). This makes the
    result independent of the insertion order of the mapping.

    :param text: sentence to process
    :param mapping: dict mapping a contraction to its expanded form

    :return: the sentence with every known contraction expanded
    """
    # sorted() is stable, so entries of equal length keep their mapping order.
    for contraction in sorted(mapping, key=len, reverse=True):
        text = text.replace(contraction, mapping[contraction])
    return text

In [None]:
# Let's test the function: "didn't" should come back expanded to "did not".
expand_contraction("He definitely didn't do it. He must be forced to commit crime.")

'He definitely did not do it. He must be forced to commit crime.'

The dataset is still in raw form. It needs to be preprocessed and stored in a suitable structure.

In [None]:
en_set = []
de_set = []

# Each line has the form "<english>\t<german>\t<attribution>".
# Decode explicitly as UTF-8 because the German side contains umlauts and 'ß'.
with open(os.path.join(data_dir, data_file), encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")
        # Skip blank or malformed lines instead of failing with an IndexError.
        if len(fields) < 2:
            continue
        english, german = fields[0].strip(), fields[1].strip()
        if not english or not german:
            continue
        en_set.append(expand_contraction(english))
        # The decoder needs explicit start/end-of-sentence markers.
        de_set.append("<SOS> " + german + " <EOS>")

# Show a few parsed pairs as a sanity check.
for i in range(min(5, len(en_set))):
    print("{:4} --> {:4}".format(en_set[i], de_set[i]))

en_set = np.array(en_set)
de_set = np.array(de_set)

Go.  --> <SOS> Geh. <EOS>
Hi.  --> <SOS> Hallo! <EOS>
Hi.  --> <SOS> Grüß Gott! <EOS>
Run! --> <SOS> Lauf! <EOS>
Run. --> <SOS> Lauf! <EOS>


In [None]:
# Let's examine the total number of sentence pairs
len(de_set)

221533

In [None]:
# Hold out 25,000 pairs for testing, then another 25,000 of the remainder for
# validation. The fixed seed keeps both splits reproducible.
holdout_size = 25000
split_seed = 25

train_val_en, test_en, train_val_de, test_de = train_test_split(
    en_set, de_set,
    test_size=holdout_size,
    random_state=split_seed,
)
train_en, val_en, train_de, val_de = train_test_split(
    train_val_en, train_val_de,
    test_size=holdout_size,
    random_state=split_seed,
)

In [None]:
def make_raw_dataset(source_sentences, target_sentences, shuffle=False):
    """
        Build a batched, prefetched tf.data pipeline of (source, target) string pairs.

    :param source_sentences: array of source-language sentences
    :param target_sentences: array of target-language sentences (same length)
    :param shuffle: whether to shuffle the pairs (with a buffer of BUFFER_SIZE)

    :return: a tf.data.Dataset yielding batches of BATCH_SIZE sentence pairs
    """
    dataset = tf.data.Dataset.from_tensor_slices((source_sentences, target_sentences))
    if shuffle:
        dataset = dataset.shuffle(BUFFER_SIZE)
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Only the training data is shuffled; validation and test batches stay in a
# fixed order so that evaluation is deterministic.
train_raw = make_raw_dataset(train_en, train_de, shuffle=True)
val_raw = make_raw_dataset(val_en, val_de)
test_raw = make_raw_dataset(test_en, test_de)

In [None]:
# Sentence length = number of whitespace-separated tokens.
# (Indexing a sentence with [0] / [1] would only pick a single character and
# make every "length" equal to 1.)
en_length = [len(sentence.split()) for sentence in en_set]
# Note: German sentences carry the <SOS>/<EOS> markers, so each count includes
# these two extra tokens.
de_length = [len(sentence.split()) for sentence in de_set]
time_periods = range(1, len(en_length) + 1)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))  # Adjust figsize as needed

# One bar plot per language, sharing the same styling.
for ax, lengths, language in zip(axs, (en_length, de_length), ('English', 'German')):
    ax.bar(time_periods, lengths, alpha=0.7, edgecolor='black')
    ax.set_title('Length of {} sentences'.format(language))
    ax.set_xlabel('ith')
    ax.set_ylabel('Length')

# Adjust spacing between subplots
plt.tight_layout()

# Show the subplots
plt.show()

It can be inferred from the plots that, despite some outliers, sentence length stays fairly stable across both datasets. Therefore, it is not necessary to implement bucketing by length.

# Text preprocessing
Computers obviously cannot handle raw text; it needs to be converted into numerical form for further calculations. Moreover, while both removing punctuation and lowercasing all words are common practice in NLP tasks, this is not really the case for Neural Machine Translation. Punctuation is important for marking the start or end of a sentence. Therefore, we keep
the punctuation marks and tokenize them as separate tokens.

In [None]:
def tf_lower_and_split_punct(text):
    """
        Standardize raw text before tokenization.

    Steps: Unicode NFKD normalization, lowercasing, 'ß' -> 'ss', removal of
    every character other than space, a-z and the selected punctuation, and
    isolation of punctuation marks as separate tokens.

    :param text: string tensor holding the raw sentences

    :return: string tensor with the standardized sentences
    """

    # Split accented characters.
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)

    # 'ß' has no NFKD decomposition, so the character filter below would
    # silently delete it ("dreißig" -> "dreiig"). Transliterate it first.
    text = tf.strings.regex_replace(text, 'ß', 'ss')

    # Keep space, a to z, and select punctuation.
    text = tf.strings.regex_replace(text, r'[^ a-z.?!\\,<>]', '')

    # Add spaces around punctuation.
    text = tf.strings.regex_replace(text, r'[.?!,]', r' \0 ')

    # Strip whitespace.
    text = tf.strings.strip(text)

    return text

In [None]:
# Vectorizer initialisation: both languages use the same settings.
vectorizer_config = dict(max_tokens=max_vocab_size,
                         standardize=tf_lower_and_split_punct)
en_vec = layers.TextVectorization(**vectorizer_config)
de_vec = layers.TextVectorization(**vectorizer_config)

# Fit each vocabulary on its own side of the training pairs only.
en_vec.adapt(train_raw.map(lambda src, tar: src))
de_vec.adapt(train_raw.map(lambda src, tar: tar))

In [None]:
# Vocabulary
en_voc = en_vec.get_vocabulary()
de_voc = de_vec.get_vocabulary()

# Lookup table: German token id -> token string.
text_from_ids = dict(enumerate(de_voc))

def get_text_from_ids(ids):
    """
        Map a sequence of German token ids to their token strings.

    :param ids: iterable of token ids (keys of text_from_ids)

    :return: list with the token string of each id, in order
    """
    return [text_from_ids[token_id] for token_id in ids]

In [None]:
# Vocabulary sizes of the two vectorizers; they size the embedding layers and
# the output layer of the model.
input_vocab_size = len(en_vec.get_vocabulary())
output_vocab_size = len(de_vec.get_vocabulary())
print(input_vocab_size, output_vocab_size, sep="\n")

14838
20000


# Model implementation
In seq2seq, we need an RNN block, known as the Encoder, to encode the input sequence into a fixed-length vector, and another RNN block, called the Decoder, to decode it. Each block generally consists of LSTM cells.

## Encoder
The Encoder can be defined as a multi-layered RNN network. For the sake of simplicity, I will implement it as a one-layer RNN network with one cell at each timestep.

Each RNN cell receives a source word and previous hidden state as inputs.

\begin{align*}
s_{i}=tanh(Ws_{i-1}+Ux_{i})
\end{align*}

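According to the formula, the $i^{th}$ hidden state $s_{i}$ is calculated from the $(i-1)^{th}$ hidden state and the $i^{th}$ input. Note that this is the update rule of a vanilla RNN; the implementation below uses an LSTM cell, which additionally maintains a cell state.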

In [None]:
# Let's define the encoder
class Encoder(layers.Layer):
    """Single-layer LSTM encoder that summarizes the embedded source sequence."""

    def __init__(self,
                 hidden_units,
                 dropout_rate=None):
        """
            Encoder Block in seq2seq

        :param hidden_units: dimensionality of the LSTM output / states
        :param dropout_rate: dropout rate applied to the LSTM inputs; when None,
            the module-level `dropout` constant is used
        """

        super().__init__()
        if dropout_rate is None:
            # Fall back to the notebook-wide setting.
            dropout_rate = dropout
        self.encoder_block = layers.LSTM(units=hidden_units,
                                         dropout=dropout_rate,
                                         return_state=True)

    def call(self,
             src,
             **kwargs):
        """
            Calculate vector representation.

        :param src: embedded source sequence, [batch, timesteps, embedding_size]
        :param kwargs: forwarded to the LSTM layer (e.g. `mask`, `training`)

        :return:
            encoder_outputs: LSTM output at the last timestep, [batch, hidden_units]
            state_h: final hidden state, [batch, hidden_units]
            state_c: final cell state, [batch, hidden_units]
        """

        encoder_outputs, state_h, state_c = self.encoder_block(inputs=src,
                                                               **kwargs)
        return encoder_outputs, state_h, state_c

## Decoder
The Encoder and Decoder share the same structure and number of hidden units. The only difference is that the Decoder's output at each timestep is fed to a final dense layer, which predicts the next word using a softmax.

In [None]:
# Let's define the decoder
class Decoder(tf.keras.Model):
    """Single-layer LSTM decoder that is initialized with the encoder's final states."""

    def __init__(self,
                 hidden_units,
                 dropout_rate=None):
        """
            Decoder Block in seq2seq

        :param hidden_units: dimensionality of the LSTM output / states
        :param dropout_rate: dropout rate applied to the LSTM inputs; when None,
            the module-level `dropout` constant is used
        """

        super().__init__()
        if dropout_rate is None:
            # Fall back to the notebook-wide setting.
            dropout_rate = dropout
        self.decoder_block = layers.LSTM(units=hidden_units,
                                         dropout=dropout_rate,
                                         return_sequences=True,
                                         return_state=True)

    def call(self,
             trg,
             previous_state,
             **kwargs):
        """
            Run the decoder LSTM over the embedded target sequence.

        :param trg: embedded target sequence, [batch, timesteps, embedding_size]
        :param previous_state: list [state_h, state_c] used as the initial LSTM
            state, each [batch, hidden_units]
        :param kwargs: forwarded to the LSTM layer (e.g. `mask`, `training`)

        :return:
            decoder_outputs: LSTM output at every timestep, [batch, timesteps, hidden_units]
            state_h: final hidden state, [batch, hidden_units]
            state_c: final cell state, [batch, hidden_units]
        """
        decoder_outputs, state_h, state_c = self.decoder_block(inputs=trg,
                                                               initial_state=previous_state,
                                                               **kwargs)

        return decoder_outputs, state_h, state_c

## seq2seq
Now we have got the Encoder and Decoder. Let's combine them into seq2seq model.

In [None]:
class NMT(tf.keras.Model):
    """
        Encoder-decoder (seq2seq) translation model.

    The `train` and `predict` methods tokenize raw strings with the
    module-level vectorizers `en_vec` (source) and `de_vec` (target), which
    therefore must be adapted before these methods are called.
    """

    def __init__(self,
                 input_vocab_size,
                 output_vocab_size,
                 embedding_size,
                 hidden_units):
        """
            Initialize an instance for Neural Machine Translation Task

        :param input_vocab_size: number of unique word in source language
        :param output_vocab_size: number of unique word in target language
        :param embedding_size: dimensionality of embedding layer
        :param hidden_units: dimensionality of the output
        """

        super().__init__()
        self.e_embedding = layers.Embedding(input_dim=input_vocab_size,
                                            output_dim=embedding_size)
        self.d_embedding = layers.Embedding(input_dim=output_vocab_size,
                                            output_dim=embedding_size)
        self.encoder = Encoder(hidden_units)
        self.decoder = Decoder(hidden_units)
        self.dense = layers.Dense(output_vocab_size, activation='softmax')

    def call(self,
             src,
             trg,
             training=None):
        """
            Forward pass with teacher forcing.

        :param src: source token ids, [batch, src_timesteps]
        :param trg: decoder input token ids, [batch, trg_timesteps]
        :param training: whether dropout is active; pass True while training
            and False for validation / inference

        :return:
            prediction: next-token probabilities, [batch, trg_timesteps, output_vocab_size]
            state_h: final decoder hidden state, [batch, hidden_units]
            state_c: final decoder cell state, [batch, hidden_units]
        """
        # Token id 0 is the padding id, so those timesteps are masked out.
        e_mask = tf.not_equal(src, 0)
        d_mask = tf.not_equal(trg, 0)

        embed_src = self.e_embedding(src)
        embed_trg = self.d_embedding(trg)

        _, state_h, state_c = self.encoder(src=embed_src,
                                           mask=e_mask,
                                           training=training)
        decoder_outputs, state_h, state_c = self.decoder(trg=embed_trg,
                                                         previous_state=[state_h, state_c],
                                                         mask=d_mask,
                                                         training=training)
        prediction = self.dense(decoder_outputs)

        return prediction, state_h, state_c

    def _prepare_batch(self,
                       context,
                       target):
        """Tokenize a batch of raw strings and build the teacher-forcing tensors."""
        source_ids = en_vec(context)
        target_ids = de_vec(target)

        # Teacher forcing: the decoder reads tokens [0 .. n-2] and has to
        # predict tokens [1 .. n-1].
        decoder_input = target_ids[:, :-1]
        labels = target_ids[:, 1:]
        return source_ids, decoder_input, labels

    def train(self,
              dataset,
              loss_fn,
              optimizer,
              epochs=5,
              val_set=None):
        """
            Train the model.

        After every epoch the mean loss per batch is printed for the training
        set and, when given, for the validation set.

        :param dataset: training dataset yielding (source, target) string batches
        :param loss_fn: loss function called as loss_fn(labels, prediction)
        :param optimizer: optimizer
        :param epochs: number of training epochs
        :param val_set: validation set (same structure as dataset), optional
        """

        for epoch in range(epochs):
            loss_sum = 0.0
            num_batches = 0
            for context, target in tqdm(dataset):
                source_ids, decoder_input, labels = self._prepare_batch(context, target)

                with tf.GradientTape() as tape:
                    prediction, _, _ = self(source_ids, decoder_input, training=True)
                    loss = loss_fn(labels, prediction)

                gradients = tape.gradient(loss, self.trainable_weights)
                optimizer.apply_gradients(zip(gradients, self.trainable_weights))

                # Accumulate a plain float outside of the tape.
                loss_sum += float(loss)
                num_batches += 1

            val_loss_sum = 0.0
            num_val_batches = 0
            if val_set is not None:
                for context, target in tqdm(val_set):
                    source_ids, decoder_input, labels = self._prepare_batch(context, target)

                    # training=False: dropout must be disabled for evaluation.
                    prediction, _, _ = self(source_ids, decoder_input, training=False)
                    val_loss_sum += float(loss_fn(labels, prediction))
                    num_val_batches += 1

            # Report per-batch means so that training and validation losses are
            # comparable even though the sets have different sizes.
            mean_loss = loss_sum / max(num_batches, 1)
            mean_val_loss = (val_loss_sum / num_val_batches
                             if num_val_batches else float("nan"))
            print("\nEpoch: {:1d}, loss = {:.4f}, val_loss = {:.4f}".format(epoch, mean_loss, mean_val_loss))

    def predict(self,
                inputs,
                max_length=30,
                sample=False):
        """
            Generate translation from input.

        Note: this overrides `tf.keras.Model.predict` with a different
        signature.

        :param inputs: a single source sentence (string)
        :param max_length: maximum number of decoding steps
        :param sample: if False (default) the most probable token is chosen at
            every step, which makes the translation deterministic; if True the
            next token is sampled from the predicted distribution

        :return: list of generated tokens; "[UNK]" tokens are skipped and the
            list ends with "<eos>" when the model emitted it within max_length
        """
        translation = []

        tokenized_input = en_vec(inputs)
        if tokenized_input.shape[0] == 0:
            # Nothing left to translate after standardization.
            return translation

        embed_input = self.e_embedding(tokenized_input)
        embed_input = tf.expand_dims(embed_input, axis=0)

        de_vocab = de_vec.get_vocabulary()

        _, state_h, state_c = self.encoder(src=embed_input,
                                           training=False)

        # Decoding starts from the start-of-sentence token.
        token_ids = de_vec("<sos>")

        for _ in range(max_length):
            embed_word = self.d_embedding(token_ids)
            embed_word = tf.expand_dims(embed_word, axis=0)

            decoder_outputs, state_h, state_c = self.decoder(trg=embed_word,
                                                             previous_state=[state_h, state_c],
                                                             training=False)

            dist = self.dense(decoder_outputs).numpy().squeeze()
            if sample:
                idx = int(np.random.choice(len(de_vocab), p=dist))
            else:
                idx = int(np.argmax(dist))

            # Feed the chosen id straight back instead of re-tokenizing its string.
            token_ids = tf.constant([idx], dtype=tf.int64)
            next_word = de_vocab[idx]

            if next_word == "[UNK]":
                continue

            translation.append(next_word)
            if next_word == "<eos>":
                break

        return translation

In [None]:
# Build the translation model from the vocabulary sizes and the configured dimensions.
nmt = NMT(
    input_vocab_size=input_vocab_size,
    output_vocab_size=output_vocab_size,
    embedding_size=embedding_size,
    hidden_units=hidden_units,
)

In [None]:
# The labels are integer token ids and the model's last Dense layer already
# applies a softmax, hence a sparse categorical cross-entropy on probabilities
# (the loss is created with its default arguments).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
# Adam optimizer with its default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Train for 5 epochs; the validation set is evaluated after every epoch.
nmt.train(train_raw, loss_fn, optimizer, epochs=5, val_set=val_raw)

100%|██████████| 2681/2681 [13:51<00:00,  3.22it/s]
100%|██████████| 391/391 [01:46<00:00,  3.66it/s]



Epoch: 0, loss = 4453.133789, val_loss = 712.141541


100%|██████████| 2681/2681 [13:53<00:00,  3.22it/s]
100%|██████████| 391/391 [01:46<00:00,  3.66it/s]



Epoch: 1, loss = 4134.282227, val_loss = 688.767029


100%|██████████| 2681/2681 [14:21<00:00,  3.11it/s]
100%|██████████| 391/391 [01:40<00:00,  3.88it/s]



Epoch: 2, loss = 3879.782715, val_loss = 669.163818


100%|██████████| 2681/2681 [13:35<00:00,  3.29it/s]
100%|██████████| 391/391 [02:21<00:00,  2.76it/s]



Epoch: 3, loss = 3673.938721, val_loss = 655.645447


100%|██████████| 2681/2681 [13:28<00:00,  3.32it/s]
100%|██████████| 391/391 [01:41<00:00,  3.87it/s]


Epoch: 4, loss = 3505.182129, val_loss = 644.817932





In [None]:
# Save the trained weights under the name "model_v1" so they can be reloaded later.
nmt.save_weights("model_v1")

In [None]:

# Translate a sample sentence and join the generated tokens into one string.
" ".join(nmt.predict("He asks me about his family."))

'er spricht mir uber dreiig familie . <eos>'

In [None]:
# Translate a second sample sentence.
" ".join(nmt.predict("He is my brother."))

'er ist mein sohn . <eos>'

In [None]:
# Rebuild a model with the same architecture and restore the saved weights into it.
nmt1 = NMT(
    input_vocab_size=input_vocab_size,
    output_vocab_size=output_vocab_size,
    embedding_size=embedding_size,
    hidden_units=hidden_units,
)

nmt1.load_weights("model_v1")

def extractTranslation(eng):
    """
        Translate an English sentence with the reloaded model `nmt1`.

    :param eng: English sentence (string)

    :return: the generated German tokens joined by spaces, without the
        trailing "<eos>" marker
    """
    result = nmt1.predict(eng)
    # Only strip the last token when it really is the end marker: if decoding
    # stopped at the length limit, the last token is a regular word.
    if result and result[-1] == "<eos>":
        result = result[:-1]
    return " ".join(result)

In [None]:
# Try the reloaded model on a new sentence.
extractTranslation("It is such a beautiful day!")

'es ist so schones verschoben ein schoner tag .'