# Neural Machine Translation: a seq2seq implementation to translate English to German

In [None]:
%%capture
# Install TensorFlow Text (version 2.11 or newer); the %%capture magic above
# keeps pip's output out of the notebook.
!pip install "tensorflow-text>=2.11"

In [None]:
# import necessary libs
import numpy as np
import re
import os
import random
from typing import Any, Tuple

from tqdm import tqdm
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow_text as tf_text
import tensorflow as tf
import tensorflow.keras.utils as utils
from tensorflow.keras.layers import (Layer, Dense, LSTM, Embedding,
                                     TextVectorization, Bidirectional, Add,
                                     LayerNormalization, AdditiveAttention,
                                     StringLookup, Masking)

# Introduction
This notebook implements the seq2seq model with attention proposed by [Sutskever et al., 2014](https://arxiv.org/abs/1409.3215) and [Bahdanau et al., 2015](https://arxiv.org/abs/1409.0473).


In [None]:
# Let's define some constant variables
max_vocab_size = 15000  # max_tokens of each TextVectorization layer
DROPOUT = 0.5  # NOTE(review): not referenced by any cell shown in this notebook - confirm it is still needed

BUFFER_SIZE = 1024  # shuffle buffer size of the tf.data pipelines
BATCH_SIZE = 64

embedding_size = 32
hidden_units = 64

data_file = "deu.txt"
# Relative to the working directory, which is where the download cell unzips
# the archive (`unzip ... -d data/`). The previous absolute '/content/data/'
# only pointed at the same folder when the notebook ran from /content (Colab).
data_dir = "data/"
os.makedirs(data_dir, exist_ok=True)

# Data Preprocessing
We will use the [English to German](https://www.manythings.org/anki/deu-eng.zip) dataset from Manythings.org.

In [None]:
# Download dataset
!wget --no-check-certificate 'https://www.manythings.org/anki/deu-eng.zip' -O deu-eng.zip
!unzip deu-eng.zip -d data/

--2024-04-13 15:12:14--  https://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10364105 (9.9M) [application/zip]
Saving to: ‘deu-eng.zip’


2024-04-13 15:12:15 (27.1 MB/s) - ‘deu-eng.zip’ saved [10364105/10364105]

Archive:  deu-eng.zip
  inflating: data/deu.txt            
  inflating: data/_about.txt         


In [None]:
# Take a look at the first lines
# The corpus contains umlauts and "ß", so decode it as UTF-8 explicitly instead
# of relying on the platform's default encoding.
with open(os.path.join(data_dir, data_file), encoding="utf-8") as f:
    # Pairing with range(5) stops after five lines without reading the rest.
    for _, line in zip(range(5), f):
        print(line.strip())

Go.	Geh.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)
Hi.	Hallo!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)
Hi.	Grüß Gott!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)
Run!	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)
Run.	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)


To keep the text consistent and avoid losing the meaning of words, contractions are expanded.

In [None]:
# Regex pattern -> rewrite string, applied in insertion order. Order matters:
# specific patterns ("'d better", "won't", "can't") must come before the generic
# ones ("'d", "n't") that would otherwise consume part of them.
# A capture group such as ([Ww]) keeps the original capitalisation: `\1` in the
# rewrite string is replaced by whatever the group matched.
# NOTE: "'s" is always expanded to " is", so possessives ("Tom's car") and
# "'s" = "has" are expanded incorrectly; this cannot be told apart by regex.
contraction_map = {
    # This should be wrapped as a JSON file.
    "([Ll])et's": r"\1et us",
    "'d better": " had better",
    "'s": " is",
    "'re": " are",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "([Ww])on't": r"\1ill not",
    # Without this entry the generic "n't" rule turns "can't" into "ca not".
    "([Cc])an't": r"\1an not",
    "n't": " not",
    "([Cc])annot": r"\1an not"
}

def expand_contractions(text, mapping=contraction_map):
    """Expand English contractions in ``text``.

    Every ``(pattern, rewrite)`` pair of ``mapping`` is applied in turn with
    ``tf.strings.regex_replace``, so each key is treated as a regular
    expression and each value as its rewrite string.

    :param text: string (or string tensor) to process
    :param mapping: dict of regex pattern -> rewrite string, applied in
        insertion order
    :return: the result of the successive ``tf.strings.regex_replace`` calls
    """
    for pattern, rewrite in mapping.items():
        text = tf.strings.regex_replace(text, pattern, rewrite)
    return text

In [None]:
# Let's test the function on a sentence containing "didn't", "must've" and "won't"
expand_contractions("He definitely didn't do it. He must've been forced to commit crime. He won't do it again.")

<tf.Tensor: shape=(), dtype=string, numpy=b'He definitely did not do it. He must have been forced to commit crime. He will not do it again.'>

The dataset is still in raw form. It needs to be preprocessed and stored in an appropriate form.

In [None]:
english = []
german = []

# Each line of the file is "<english>\t<german>\t<attribution>"; only the first
# two tab-separated fields are sentences.
with open(os.path.join(data_dir, data_file), encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")

        # Skip blank or malformed lines instead of failing with IndexError.
        if len(fields) < 2:
            continue

        english.append(fields[0].strip())
        german.append(fields[1].strip())

english = np.array(english)
german = np.array(german)

# Show the first pairs (fewer if the file is shorter than five lines).
for en_sentence, de_sentence in zip(english[:5], german[:5]):
    print("{:4} --> {:4}".format(en_sentence, de_sentence))

Go.  --> Geh.
Hi.  --> Hallo!
Hi.  --> Grüß Gott!
Run! --> Lauf!
Run. --> Lauf!


In [None]:
# english = english[40000:]
# german = german[40000:]

In [None]:
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
assert np.isclose(train_ratio + val_ratio + test_ratio, 1.0), "split ratios must sum to 1"

# Fixed seed: the split, and therefore the vocabularies adapted on the training
# part, are the same on every run.
split_rng = np.random.default_rng(42)

# A single uniform draw per sentence pair, cut into three disjoint intervals:
#   [0, train_ratio)                       -> train
#   [train_ratio, train_ratio + val_ratio) -> validation
#   [train_ratio + val_ratio, 1)           -> test
# (Drawing a second, independent number for the validation mask would give the
# validation set only (1 - train_ratio) * val_ratio of the data.)
split_draw = split_rng.uniform(size=len(english))
train_mask = split_draw < train_ratio
val_mask = (split_draw >= train_ratio) & (split_draw < train_ratio + val_ratio)
test_mask = split_draw >= train_ratio + val_ratio


def _make_raw_dataset(mask, shuffle):
    """Build a batched, prefetched dataset of (english, german) string pairs.

    :param mask: boolean array selecting the sentence pairs of this split
    :param shuffle: whether to shuffle with a buffer of BUFFER_SIZE elements
    :return: tf.data.Dataset yielding batches of BATCH_SIZE raw string pairs
    """
    dataset = tf.data.Dataset.from_tensor_slices((english[mask], german[mask]))
    if shuffle:
        dataset = dataset.shuffle(BUFFER_SIZE)
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Only the training data needs shuffling; evaluation results do not depend on
# the order of the examples.
train_raw = _make_raw_dataset(train_mask, shuffle=True)
val_raw = _make_raw_dataset(val_mask, shuffle=False)
test_raw = _make_raw_dataset(test_mask, shuffle=False)

It can be inferred from the plots that, despite some outliers, sentence lengths are similar across both datasets. Therefore, it is not necessary to implement bucketing by length.

# Tokenization
Computers cannot work with raw text directly; it first has to be converted into numerical form. Removing punctuation and lowercasing are both common practice in NLP tasks, but for Neural Machine Translation punctuation should be kept, because it marks where a sentence starts and ends. Therefore, we lowercase the text but keep the punctuation marks and tokenize them as separate tokens.

In [None]:
def tf_lower_and_split_punct(text):
  """Standardize English text before it is split into tokens.

  Steps: expand contractions, NFKD-normalize, lowercase, drop every character
  that is not a space, a-z or one of ``. ? ! , ¿``, put spaces around that
  punctuation and strip surrounding whitespace.

  :param text: string tensor of raw sentences
  :return: string tensor of standardized sentences
  """
  expanded = expand_contractions(text)

  # NFKD splits accented characters so the base letter survives the filter below.
  decomposed = tf_text.normalize_utf8(expanded, 'NFKD')
  lowered = tf.strings.lower(decomposed)

  # Whitelist: space, a to z, and the selected punctuation.
  filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')

  # Surround each punctuation mark with spaces (\0 is the whole match).
  spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')

  return tf.strings.strip(spaced)

In [None]:
def tf_split_punct(text):
  """Standardize German text and wrap it in ``[START]`` / ``[END]`` tokens.

  Steps: NFKD-normalize, transliterate "ß" to "ss", lowercase, drop every
  character that is not a space, a-z or one of ``. ? ! , ¿``, put spaces
  around that punctuation, strip whitespace and add the special tokens.

  NOTE: transliterating "ß" changes the adapted vocabulary, so weights saved
  with the previous standardization no longer match it.

  :param text: string tensor of raw sentences
  :return: string tensor of standardized sentences
  """
  # Split accented characters.
  text = tf_text.normalize_utf8(text, 'NFKD')

  # "ß" has no NFKD decomposition, so the a-z filter below would delete it
  # entirely ("Grüß" -> "gru"). Spell it out as "ss" first.
  text = tf.strings.regex_replace(text, '[ßẞ]', 'ss')
  text = tf.strings.lower(text)

  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')

  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')

  # Strip whitespace and add special tokens
  text = tf.strings.strip(text)
  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')

  return text

In [None]:
# Vectorizer initial
# One TextVectorization layer per language: each keeps at most max_vocab_size
# tokens and applies its own standardization function before splitting.
en_vec = TextVectorization(max_tokens=max_vocab_size,
                           standardize=tf_lower_and_split_punct)
ger_vec = TextVectorization(max_tokens=max_vocab_size,
                            standardize=tf_split_punct)

In [None]:
# Build each vocabulary from the training split only: the first element of a
# pair is the English sentence, the second the German one.
english_train_text = train_raw.map(lambda en, de: en)
german_train_text = train_raw.map(lambda en, de: de)

en_vec.adapt(english_train_text)
ger_vec.adapt(german_train_text)

In [None]:
# Vocabulary
# Token lists of the adapted vectorizers; a token's position in the list is the
# integer id the vectorizer assigns to it.
en_voc = en_vec.get_vocabulary()
ger_voc = ger_vec.get_vocabulary()

In [None]:
# Word to Idx for prediction
# Inverse of ger_voc: German token -> its position in the vocabulary list.
word_to_idx = {token: position for position, token in enumerate(ger_voc)}

In [None]:
# Assign vocab size of each vectorizer
input_vocab_size = en_vec.vocabulary_size()
output_vocab_size = ger_vec.vocabulary_size()
print(input_vocab_size, output_vocab_size, sep="\n")

15000
15000


# Data Preparation
Structure the dataset to use the tf.keras.models.Model's fit() method.

In [None]:
def process_text(context, target):
  """Turn a batch of raw sentence pairs into ``(inputs, labels)`` for ``fit()``.

  The target ids are shifted by one position: the decoder input is every token
  except the last one, the label is every token except the first one.

  :param context: batch of English strings
  :param target: batch of German strings
  :return: ``((context_ids, decoder_input_ids), label_ids)``
  """
  context_ids = en_vec(context)
  target_ids = ger_vec(target)
  decoder_input_ids = target_ids[:, :-1]
  label_ids = target_ids[:, 1:]
  return (context_ids, decoder_input_ids), label_ids

# Apply the same conversion to all three splits.
train_ds, val_ds, test_ds = (
    raw_split.map(process_text, tf.data.AUTOTUNE)
    for raw_split in (train_raw, val_raw, test_raw))

# Model implementation
In seq2seq, an RNN block called the Encoder encodes the input sequence into a fixed-length vector, and another RNN block called the Decoder decodes it. Each block generally consists of LSTM cells.

## Encoder
The Encoder can be defined as a multi-layered RNN network. For the sake of simplicity, I will implement it as a one-layer (bidirectional) RNN network with one cell at each timestep.

Each RNN cell receives a source word and previous hidden state as inputs.

\begin{align*}
h_{i}=\tanh(Wh_{i-1}+Ux_{i})
\end{align*}

According to the formula, the $i^{th}$ encoder hidden state $h_{i}$ is calculated from the $(i-1)^{th}$ hidden state and the $i^{th}$ input.

In [None]:
# Let's define the encoder
class Encoder(Layer):
    """Encoder block of the seq2seq model: embedding + one bidirectional LSTM."""

    def __init__(self,
                 tokenizer,
                 embedding_size,
                 hidden_units):
        """
        :param tokenizer: tokenizer of the source language; its vocabulary
            size sets the number of rows of the embedding table
        :param embedding_size: dimensionality of the embedding layer
        :param hidden_units: dimensionality of the LSTM outputs and states
        """
        super().__init__()
        self.hidden_units = hidden_units
        self.tokenizer = tokenizer
        self.embedding = Embedding(input_dim=tokenizer.vocabulary_size(),
                                   output_dim=embedding_size)
        # merge_mode="sum" adds the forward and backward outputs, so the
        # sequence output keeps `hidden_units` features.
        recurrent_layer = LSTM(units=hidden_units,
                               return_sequences=True,
                               return_state=True)
        self.rnn = Bidirectional(layer=recurrent_layer, merge_mode="sum")

    def call(self,
             x):
        """
        :param x: [batch, time_steps] token ids, where id 0 is treated as padding
        :return:
            sequence_output: [batch, time_steps, hidden_units]
            state_h: [batch, hidden_units], forward + backward hidden state
            state_c: [batch, hidden_units], forward + backward cell state
        """
        # True for real tokens, False for padding (id 0).
        padding_mask = tf.not_equal(x, 0)
        embedded = self.embedding(x)
        (sequence_output,
         forward_h, forward_c,
         backward_h, backward_c) = self.rnn(embedded, mask=padding_mask)

        # Merge the final states of both directions the same way as the outputs.
        state_h = forward_h + backward_h
        state_c = forward_c + backward_c
        return sequence_output, state_h, state_c

## Attention Layer
The attention mechanism used in this project is Bahdanau attention, first introduced in [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473) by Bahdanau et al., 2015.

To recap, at each decoding step $i$, the Decoder combines the previous decoder state $s_{i-1}$ with all encoder states $h=(h_1, h_2, \dots, h_{Tx})$ and uses the alignment model $a$ to score how relevant each source position $j$ is to the word generated at step $i$.

\begin{align*}
e_{ij} = a(s_{i-1}, h_j) = v_a^\top \tanh(W_a s_{i-1} + U_a h_j)
\end{align*}

Therefore, the context vector $c_i$ can be calculated as

\begin{align*}
c_i = \sum_{j=1}^{Tx} \alpha_{ij} h_j
\end{align*}

in which
\begin{align*}
\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{Tx}\exp(e_{ik})}
\end{align*}

During training we use teacher forcing: the Decoder is fed the ground-truth previous target tokens as its input $x_i$, and the context vector $c_i$ is concatenated with the Decoder output before the final dense layer.

In [None]:
class BahdanauAttention(Layer):
    """Additive attention over the encoder outputs.

    NOTE(review): padded source positions are not masked here, so they also
    receive attention weight - confirm whether a source mask should be applied.
    """

    def __init__(self,
                 hidden_units):
        """
        :param hidden_units: size of the projection used to score a
            (decoder step, source position) pair
        """
        super(BahdanauAttention, self).__init__()
        self.Va = Dense(1)
        self.Wa = Dense(hidden_units)   # projects the encoder outputs
        self.Ua = Dense(hidden_units)   # projects the decoder outputs

    def build(self, input_shape):
        super(BahdanauAttention, self).build(input_shape)

    def call(self,
             context, x):
        """
            Calculate one context vector per decoder step from all encoder
            outputs.

        :param context: tensor [batch, Tx, feature], all encoder outputs
        :param x: tensor [batch, Ty, feature], decoder outputs
        :return:
            context_vector: tensor [batch, Ty, feature], the attention-weighted
            sum of the encoder outputs for every decoder step
        """
        # Expand dims so the two projections broadcast to [batch, Ty, Tx, units]
        context = tf.expand_dims(context, axis=1)   # [batch, 1, Tx, feature]
        x = tf.expand_dims(x, axis=2)               # [batch, Ty, 1, feature]

        scores = self.Va(tf.math.tanh(self.Wa(context) + self.Ua(x)))
        # Drop ONLY the trailing size-1 axis produced by Va. A bare
        # tf.squeeze() would also remove batch / Ty / Tx axes of size 1, after
        # which the softmax below could run over the wrong axis.
        scores = tf.squeeze(scores, axis=-1)            # [batch, Ty, Tx]
        attn_weights = tf.nn.softmax(scores, axis=-1)   # normalized over Tx

        # NOTE: context shape = [batch, 1, Tx, feature] so that expand
        # dim of attention weights
        context_vector = tf.expand_dims(attn_weights, axis=-1) * context
        context_vector = tf.reduce_sum(context_vector, axis=-2)

        return context_vector

## Decoder
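The Decoder uses the same number of hidden units as the Encoder, but its LSTM is unidirectional and is followed by the attention layer and a final dense layer. That dense layer produces the logits used to predict the next word; a softmax is applied to them at inference time.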

In [None]:
# Let's define the decoder
class Decoder(Layer):
    """Decoder block of the seq2seq model: embedding, LSTM, attention, dense."""

    @classmethod
    def add_method(cls, fun):
        """Attach ``fun`` to the class under its own name and return it."""
        setattr(cls, fun.__name__, fun)
        return fun

    def __init__(self,
                 tokenizer,
                 embedding_size,
                 hidden_units):
        """
            Decoder Block in seq2seq

        :param tokenizer: tokenizer of the target language; its vocabulary
            defines both the embedding table and the size of the output layer
        :param embedding_size: dimensionality of the embedding layer
        :param hidden_units: dimensionality of the LSTM outputs and states
        """

        super(Decoder, self).__init__()
        self.hidden_units = hidden_units
        self.tokenizer = tokenizer
        self.vocab = tokenizer.get_vocabulary()
        self.vocab_size = tokenizer.vocabulary_size()
        self.embedding = Embedding(input_dim=self.vocab_size,
                                   output_dim=embedding_size)
        self.rnn = LSTM(units=hidden_units,
                        return_sequences=True,
                        return_state=True)
        self.attention = BahdanauAttention(hidden_units)
        # One logit per vocabulary entry. A hard-coded width would disagree
        # with the tokenizer whenever its vocabulary has a different size.
        self.dense = Dense(self.vocab_size)

    def call(self,
            context, x,
            encoder_state,
            return_state=False):
        """
        :param context: [batch, Tx, hidden_units], all encoder outputs
        :param x: [batch, Ty] target token ids, where id 0 is treated as padding
        :param encoder_state: [state_h, state_c] used as the initial LSTM state
        :param return_state: if True, also return the final LSTM states

        :return:
            logits: [batch, Ty, vocab_size]
            plus state_h and state_c when ``return_state`` is True
        """
        mask = tf.where(x != 0, True, False)
        x = self.embedding(x)
        decoder_outputs, state_h, state_c = self.rnn(x, initial_state=encoder_state,
                                                     mask=mask)
        context_vector = self.attention(context, decoder_outputs)
        # The prediction for each step sees both the LSTM output and its
        # attention context.
        dense_inputs = tf.concat([decoder_outputs, context_vector], axis=-1)
        logits = self.dense(dense_inputs)

        if return_state:
            return logits, state_h, state_c
        else:
            return logits

## seq2seq
Now we have got the Encoder and Decoder. Let's combine them into seq2seq model.

In [None]:
class NMT(tf.keras.Model):
    """Sequence-to-sequence translation model: an Encoder feeding a Decoder."""

    @classmethod
    def add_method(cls, fun):
        """Attach ``fun`` to the class under its own name and return it."""
        setattr(cls, fun.__name__, fun)
        return fun

    def __init__(self,
                 input_tokenizer,
                 output_tokenizer,
                 embedding_size,
                 hidden_units):
        """
            Initialize an instance for Neural Machine Translation Task

        :param input_tokenizer: tokenizer of the input language
        :param output_tokenizer: tokenizer of the output language
        :param embedding_size: dimensionality of both embedding layers
        :param hidden_units: dimensionality of the encoder and decoder outputs
        """
        super().__init__()
        self.encoder = Encoder(input_tokenizer, embedding_size, hidden_units)
        self.decoder = Decoder(output_tokenizer, embedding_size, hidden_units)

    def call(self,
             inputs):
        """
        :param inputs: pair ``(encoder_inputs, decoder_inputs)`` of token ids
        :return: the decoder's logits for every decoder input position
        """
        source_ids, target_ids = inputs
        # The encoder returns its output sequence followed by the two final
        # states; the states become the decoder's initial state.
        memory, *final_state = self.encoder(source_ids)
        return self.decoder(memory, target_ids, final_state)

In [None]:
@NMT.add_method
def predict(self, next_inputs,
            maxlen=40):
    """Translate one source sentence by sampling one target token at a time.

    NOTE: this is attached to ``NMT`` under the name ``predict`` and therefore
    replaces the ``predict`` method inherited from ``tf.keras.Model``.

    :param next_inputs: raw source sentence (a single string)
    :param maxlen: maximum number of decoding steps
    :return: the sampled translation as one space-joined string, without
        ``[START]``, ``[END]``, ``[UNK]`` or padding tokens
    """
    def sampling(logits):
        # Draw the next token id from the softmax distribution over the
        # vocabulary, so repeated calls can give different translations.
        probs = tf.nn.softmax(logits)
        dist = probs.numpy().squeeze()
        return np.random.choice(range(self.decoder.vocab_size), p=dist)

    vocab = self.decoder.vocab

    source_ids = tf.convert_to_tensor(self.encoder.tokenizer(next_inputs))
    # Add leading axes until the shape is [batch, time_steps]. `<` rather than
    # `!=` so an unexpected higher-rank input cannot loop forever.
    while source_ids.shape.rank < 2:
        source_ids = tf.expand_dims(source_ids, axis=0)

    encoder_outputs, state_h, state_c = self.encoder(source_ids)

    # Look the start token up in the decoder's own vocabulary instead of a
    # notebook-level dictionary, so the method only depends on the model.
    next_idx = vocab.index("[START]")
    translation = []

    for _ in range(maxlen):
        # Feed exactly one token: shape [batch=1, time_steps=1].
        step_input = tf.constant([[next_idx]], dtype=tf.int64)
        logits, state_h, state_c = self.decoder(encoder_outputs, step_input,
                                                [state_h, state_c],
                                                return_state=True)
        next_idx = int(sampling(logits))
        token = vocab[next_idx]

        if token == "[END]":
            break
        # Padding ("") and unknown tokens are still fed back as the next
        # input, but never written to the translation.
        if token in ("", "[UNK]"):
            continue
        translation.append(token)

    return " ".join(translation)

That is all we need for training. For the inference stage there is still work to do: ideally, the model should be able to translate multiple sentences at once.

In [None]:
def masked_loss(y_true, y_pred):
    """Sparse cross-entropy averaged over the non-padding target tokens only.

    Target batches are padded with id 0. Without this mask, every padded
    position counts as a real token to predict.
    """
    token_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True)
    keep = tf.cast(tf.not_equal(y_true, 0), token_loss.dtype)
    # maximum(..., 1) avoids dividing by zero on a batch made only of padding.
    return tf.reduce_sum(token_loss * keep) / tf.maximum(tf.reduce_sum(keep), 1.0)


def masked_acc(y_true, y_pred):
    """Share of non-padding target tokens whose arg-max prediction is correct."""
    predicted = tf.cast(tf.argmax(y_pred, axis=-1), y_true.dtype)
    keep = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    hits = tf.cast(tf.equal(y_true, predicted), tf.float32) * keep
    return tf.reduce_sum(hits) / tf.maximum(tf.reduce_sum(keep), 1.0)


model = NMT(en_vec, ger_vec, embedding_size, hidden_units)
# The plain loss and "accuracy" metric also score the padded positions, which
# inflates accuracy and dilutes the loss; the masked versions ignore them.
model.compile(optimizer='adam',
              loss=masked_loss,
              metrics=[masked_acc])

In [None]:
%%capture
# Run one translation with the untrained model; its output is captured.
# NOTE(review): presumably this forward pass is here to create the layers'
# weights before fit() - confirm.
model.predict("Hello world")

In [None]:
# Train for 5 epochs on train_ds and evaluate on val_ds after each epoch;
# `history` is the object returned by fit().
history = model.fit(train_ds,
                    epochs=5,
                    validation_data=val_ds)

Epoch 1/5
[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 38ms/step - accuracy: 0.5368 - loss: 3.2523 - val_accuracy: 0.6146 - val_loss: 2.5244
Epoch 2/5
[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 39ms/step - accuracy: 0.6931 - loss: 1.8080 - val_accuracy: 0.6630 - val_loss: 1.9527
Epoch 3/5
[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 38ms/step - accuracy: 0.7653 - loss: 1.2691 - val_accuracy: 0.7346 - val_loss: 1.3988
Epoch 4/5
[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 38ms/step - accuracy: 0.8137 - loss: 0.9277 - val_accuracy: 0.7555 - val_loss: 1.2303
Epoch 5/5
[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 39ms/step - accuracy: 0.8357 - loss: 0.7643 - val_accuracy: 0.7855 - val_loss: 1.0606


In [None]:
# Translate an example sentence. Tokens are sampled randomly, so the output
# can differ from run to run.
model.predict("She was so angry that she could not speak.")

'sie war so schnell argerlich , dass sie nichts sprechen konnte .'

In [None]:
# Save the model's weights to an HDF5 weights file in the working directory.
model.save_weights("model_v4.weights.h5")