# Language Translation using Sequence to Sequence Models

The most common sequence-to-sequence models are encoder-decoder models: an encoder RNN reads the source sentence
and compresses it into a single vector, often called the context vector. A second RNN, the decoder, then decodes the context vector to generate the output sentence one word at a time.

![Basic Seq2Seq Model](https://www.researchgate.net/publication/335801052/figure/fig1/AS:871953862754305@1584901409562/The-Basic-Seq2Seq-Model.png)

## Importing Dependencies

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

## Data Preparation

In [None]:
# Seed every random number generator used in this notebook (Python's
# `random`, NumPy and PyTorch) with the same fixed value.
seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# choose the compute device: a CUDA GPU when PyTorch reports one, otherwise the CPU
# NOTE(review): an Apple-silicon (M2) GPU would be exposed through PyTorch's
# "mps" backend, which this check never selects -- confirm the intended device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

The dataset used is a [subset](https://huggingface.co/datasets/bentrevett/multi30k) of the Multi30k dataset, which is hosted on Hugging Face.

In [None]:
# load the 'bentrevett/multi30k' dataset through the Hugging Face `datasets` library
dataset = datasets.load_dataset('bentrevett/multi30k')

In [None]:
# inspect the dataset metadata (as the last expression of the cell, its repr is displayed)
dataset

In [None]:
# pull the 'train', 'validation' and 'test' splits out of the loaded dataset
# NOTE: the raw validation split is bound to `val_data` here; the tokenized
# version created later in the notebook is bound to `valid_data`
train_data, val_data, test_data = (
    dataset['train'],
    dataset['validation'],
    dataset['test']
)

In [None]:
# download the small English and German spaCy pipelines that are loaded in the next code cell
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

## Tokenization

The tokenizer being used is from spacy.

In [None]:
# load the English and German spaCy pipelines; the cells of this notebook
# only ever call their `.tokenizer`
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

In [None]:
def tokenize(sample, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentences of one dataset example.

    Each sentence is split with the matching tokenizer, cut down to at most
    ``max_length`` tokens, optionally lower-cased, and finally wrapped in the
    start and end markers (the markers do not count towards ``max_length``).

    Args:
        sample: Mapping holding the raw sentences under ``"en"`` and ``"de"``.
        en_nlp: Object whose ``tokenizer(text)`` yields English tokens that
            expose a ``text`` attribute.
        de_nlp: Same as ``en_nlp`` but for German.
        max_length: Maximum number of tokens kept per sentence.
        lower: When truthy, every kept token is lower-cased.
        sos_token: Marker placed in front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        A dict with the keys ``"en_tokens"`` and ``"de_tokens"``.
    """

    def _prepare(nlp, text):
        # split -> truncate -> (optionally) lower-case -> add the markers
        words = [piece.text for piece in nlp.tokenizer(text)]
        words = words[:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, sample["en"]),
        "de_tokens": _prepare(de_nlp, sample["de"]),
    }

# tokenization settings shared by all three splits
max_length = 1_000  # at most this many tokens are kept per sentence (before the markers are added)
lower = True  # lower-case every token
sos_token = "<sos>"  # marker placed in front of every token list
eos_token = "<eos>"  # marker placed at the end of every token list

# extra keyword arguments that `map` forwards to `tokenize` for each example
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

# add the "en_tokens" and "de_tokens" columns to every split
# (the tokenized validation split is bound to `valid_data` from here on)
train_data = train_data.map(tokenize, fn_kwargs=fn_kwargs)
valid_data = val_data.map(tokenize, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize, fn_kwargs=fn_kwargs)

## Vocabulary

The vocabulary is used to associate each unique token obtained from tokenizing the dataset with a unique integer index. We use the `build_vocab_from_iterator` function provided by torchtext to build one vocabulary per language and, in addition to the `<sos>` and `<eos>` tokens, add two more special tokens:
1. `<unk>`, which stands in for any token that is not in the vocabulary (for example, a token that appears in the test set but not in the training set, or one that appears fewer than `min_freq` times in the training set).
2. `<pad>`, which is used to pad the shorter sentences in a batch so that every sentence in the batch has the same length.

In [None]:
# a token has to occur at least `min_freq` times in the training data to be
# given its own vocabulary entry
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# passed as `specials` so that these four tokens are part of both vocabularies
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# both vocabularies are built from the training split only
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"], min_freq=min_freq, specials=special_tokens
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"], min_freq=min_freq, specials=special_tokens
)

In [None]:
# the length of a vocabulary is the number of distinct entries it holds,
# special tokens included
en_vocab_size = len(en_vocab)
de_vocab_size = len(de_vocab)

# the model in this notebook reads German ("de_ids") as its source and
# produces English ("en_ids") as its target, so label the sizes accordingly
print(f"Number of unique tokens in target (en) vocabulary: {en_vocab_size}")
print(f"Number of unique tokens in source (de) vocabulary: {de_vocab_size}")

In [None]:
# the special-token indices are read from the English vocabulary ...
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# ... but they are reused for the German vocabulary, for padding both
# languages and for the loss, so fail loudly if the two vocabularies disagree
if de_vocab[unk_token] != unk_index or de_vocab[pad_token] != pad_index:
    raise ValueError(
        "<unk>/<pad> indices differ between the English and German vocabularies"
    )

# tokens that are missing from a vocabulary are looked up as <unk>
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

print(f"Index of <unk> token in both vocabularies: {unk_index}")
print(f"Index of <pad> token in both vocabularies: {pad_index}")

In [None]:
# convert tokens to ids
def convert_tokens_to_ids(example, en_vocab, de_vocab):
    """Look up the vocabulary indices of one example's token lists.

    Args:
        example: Mapping holding token lists under ``"en_tokens"`` and
            ``"de_tokens"``.
        en_vocab: Vocabulary used for the English tokens; must provide
            ``lookup_indices``.
        de_vocab: Vocabulary used for the German tokens; must provide
            ``lookup_indices``.

    Returns:
        A dict with the index lists under ``'en_ids'`` and ``'de_ids'``.
    """
    # (output column, input column, vocabulary) for each language
    columns = (
        ('en_ids', "en_tokens", en_vocab),
        ('de_ids', "de_tokens", de_vocab),
    )
    return {
        ids_column: vocab.lookup_indices(example[tokens_column])
        for ids_column, tokens_column, vocab in columns
    }

# extra keyword arguments that `map` forwards to `convert_tokens_to_ids`
fn_kwargs = {
    'en_vocab': en_vocab,
    'de_vocab': de_vocab
}

# add the 'en_ids' and 'de_ids' columns to every split
train_data = train_data.map(convert_tokens_to_ids, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(convert_tokens_to_ids, fn_kwargs=fn_kwargs)
test_data = test_data.map(convert_tokens_to_ids, fn_kwargs=fn_kwargs)

## Feature Transformation

Convert the `en_ids` and `de_ids` columns to PyTorch tensors; all other columns are still returned, in their original format.

In [None]:
# only the two id columns are formatted as 'torch' data
data_type = 'torch'
format_columns = ['en_ids', 'de_ids']

# `output_all_columns=True` asks for the remaining columns to be returned as
# well, so the raw sentences stay reachable (e.g. `test_data[0]["de"]` later on)
train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

test_data = test_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

## Data Loading


In [None]:
def get_collate_fn(pad_index):
    """Create a collate function that pads the id sequences of a batch.

    Args:
        pad_index: Value used to fill the positions added by padding.

    Returns:
        A function taking a list of examples (each holding 1-D tensors under
        ``'en_ids'`` and ``'de_ids'``) and returning a dict with the same two
        keys, where every value is the result of
        ``nn.utils.rnn.pad_sequence`` on that column.
    """
    def collate_fn(batch):
        padded = {}
        for column in ('en_ids', 'de_ids'):
            sequences = [example[column] for example in batch]
            # pad_sequence is called without batch_first, so the sequence
            # dimension comes first in the padded tensor
            padded[column] = nn.utils.rnn.pad_sequence(
                sequences, padding_value=pad_index
            )
        return padded

    return collate_fn

In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` whose batches are padded.

    Args:
        dataset: Dataset to draw examples from.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to ``get_collate_fn``.
        shuffle: Passed through to the ``DataLoader`` (defaults to ``False``).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [None]:
batch_size = 128
# only the training loader shuffles its examples; the validation and test
# loaders keep the default `shuffle=False`
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

## Model

### Encoder


In [None]:
class Encoder(nn.Module):
    """Embeds a source sequence and runs it through a multi-layer LSTM.

    Only the final hidden and cell states of the LSTM are returned; its
    per-step outputs are discarded.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """Build the embedding, LSTM and dropout layers.

        Args:
            input_dim: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used both on the embeddings and
                inside the LSTM.
        """
        super().__init__()
        # read by Seq2Seq to check that encoder and decoder are compatible
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

        ``src`` holds token indices; the LSTM is not created with
        ``batch_first``, so a (sequence length, batch size) layout is assumed.
        """
        embedded_src = self.embedding(src)
        embedded_src = self.dropout(embedded_src)
        _, (hidden, cell) = self.rnn(embedded_src)
        return hidden, cell

### Decoder

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """Build the embedding, LSTM, output projection and dropout layers.

        Args:
            output_dim: Number of rows of the embedding table and size of the
                prediction vector.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used both on the embeddings and
                inside the LSTM.
        """
        super().__init__()
        # read by Seq2Seq (output size and compatibility checks)
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: Token indices for the current step, one per batch element.
            hidden: Hidden state handed to the LSTM.
            cell: Cell state handed to the LSTM.

        Returns:
            ``(prediction, hidden, cell)``: the scores produced by the output
            projection and the updated LSTM states.
        """
        # add a leading sequence dimension of length one for the LSTM
        step_tokens = input.unsqueeze(0)
        step_embedded = self.dropout(self.embedding(step_tokens))
        step_output, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))
        # drop the length-one sequence dimension again before projecting
        prediction = self.fc_out(step_output.squeeze(0))
        return prediction, hidden, cell

### Seq2Seq

The sequence to sequence model will handle
- receiving the input/source sentence
- using the encoder to produce the context vectors
- using the decoder to produce the predicted output/target sentence

The final model should look like this:
![seq2seq](https://cdn-images-1.medium.com/max/1200/1*aNcybCTdPlrXsCwIo1OfTg.png)


In [None]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder into one translation model."""

    def __init__(self, encoder, decoder, device):
        """Store the sub-modules and check that their states are compatible.

        Args:
            encoder: Module returning ``(hidden, cell)`` for a source batch;
                must expose ``hidden_dim`` and ``n_layers``.
            decoder: Module performing one decoding step; must expose
                ``hidden_dim``, ``n_layers`` and ``output_dim``.
            device: Device on which the output tensor is allocated.

        Raises:
            AssertionError: If the hidden sizes or layer counts differ.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # the encoder's final states are fed to the decoder unchanged, so the
        # two networks must agree on their shape
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode ``trg.shape[0] - 1`` steps and collect the decoder's scores.

        Args:
            src: Source batch handed to the encoder.
            trg: Target batch, indexed as ``trg[step, batch_element]``.
            teacher_forcing_ratio: Probability, evaluated independently at
                every step, of feeding the ground-truth token rather than the
                decoder's own best guess as the next input.

        Returns:
            Tensor of shape ``(trg length, batch size, decoder.output_dim)``;
            row 0 is never written and therefore stays all zeros.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        outputs = torch.zeros(
            trg_length, batch_size, self.decoder.output_dim, device=self.device
        )

        hidden, cell = self.encoder(src)

        # the first decoder input is the first row of the target batch
        decoder_input = trg[0, :]
        for t in range(1, trg_length):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores
            use_ground_truth = random.random() < teacher_forcing_ratio
            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)
        return outputs

#### Training the Model

The input and output dimensions are determined by the sizes of the source and target vocabularies. The encoder and decoder must use the same number of layers and the same hidden dimension, so that the encoder's hidden and cell states can be passed directly to the decoder.

All of the model's weights are initialized from a uniform distribution between -0.08 and +0.08.

In [None]:
# German is the source language (encoder input) and English the target
# language (decoder output)
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
# hidden size and layer count are shared: Seq2Seq asserts that encoder and
# decoder agree on both
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
# rebinds `device`, this time as a `torch.device` object rather than a string
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

# the device is passed to the model (for its output buffer) and the model's
# parameters are moved to the same device
model = Seq2Seq(encoder, decoder, device).to(device)

In [None]:
def init_weights(m):
    """Overwrite every parameter of ``m`` (sub-modules included) in place.

    Each parameter tensor is refilled with values drawn from a uniform
    distribution on [-0.08, 0.08].
    """
    for weights in m.parameters():
        nn.init.uniform_(weights.data, a=-0.08, b=0.08)


# `apply` invokes `init_weights` on the model's sub-modules as well as on the
# model itself; as the last expression of the cell, the returned model is displayed
model.apply(init_weights)

In [None]:
def count_parameters(model):
    """Return how many scalar values the model's trainable parameters hold.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# report the count with thousands separators
print(f"The model has {count_parameters(model):,} trainable parameters")

#### Optimizer

We use the [Adam optimizer](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/) with its default hyperparameters.

In [None]:
# Adam over all model parameters; no hyperparameters are passed, so the
# optimizer's defaults apply
optimizer = optim.Adam(model.parameters())

#### Loss Function

We use [Cross Entropy Loss](https://www.datacamp.com/tutorial/the-cross-entropy-loss-function-in-machine-learning), ignoring every position whose target is the `<pad>` token.

In [None]:
# positions whose target equals `pad_index` are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

#### Training


In [None]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches holding ``"de_ids"`` (source) and
            ``"en_ids"`` (target) tensors; must support ``len``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss taking flattened scores and flattened targets.
        clip: Maximum gradient norm enforced before each optimizer step.
        teacher_forcing_ratio: Forwarded to the model.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        source = batch["de_ids"].to(device)
        target = batch["en_ids"].to(device)

        optimizer.zero_grad()
        scores = model(source, target, teacher_forcing_ratio)

        # the first row is skipped on both sides, then the remaining rows are
        # flattened into one long list of predictions / targets
        n_classes = scores.shape[-1]
        flat_scores = scores[1:].view(-1, n_classes)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()
        # rescale the gradients so that their total norm does not exceed `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(data_loader)

#### Evaluation Loop

In [None]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the average loss of ``model`` over ``data_loader``.

    The model is put in evaluation mode, gradients are not tracked, and the
    model is called with a teacher forcing ratio of 0.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches holding ``"de_ids"`` (source) and
            ``"en_ids"`` (target) tensors; must support ``len``.
        criterion: Loss taking flattened scores and flattened targets.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            source = batch["de_ids"].to(device)
            target = batch["en_ids"].to(device)

            # teacher forcing ratio 0: the model is always fed its own guesses
            scores = model(source, target, 0)

            # skip the first row on both sides and flatten the rest
            n_classes = scores.shape[-1]
            flat_scores = scores[1:].view(-1, n_classes)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()
    return running_loss / len(data_loader)

#### Model Training

In [None]:
n_epochs = 10
clip = 1.0  # maximum gradient norm used by `train_fn`
teacher_forcing_ratio = 0.5  # forwarded to the model during training only

# lowest validation loss seen so far; starts at infinity so that the first
# epoch always saves a checkpoint
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    # keep only the weights of the epoch with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    # perplexity (PPL) is reported as exp(loss)
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

### Model Evaluation

In [None]:
def translate(sentence, model, en_nlp, de_nlp, en_vocab, de_vocab, lower, sos_token, eos_token, device, max_output_length=25):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        sentence: Either a raw German string (tokenized with
            ``de_nlp.tokenizer``) or an iterable of already split tokens.
        model: Model exposing ``eval()``, ``encoder(tensor)`` returning
            ``(hidden, cell)`` and ``decoder(input, hidden, cell)`` returning
            ``(scores, hidden, cell)``.
        en_nlp: Unused; kept so that existing calls keep working.
        de_nlp: Object whose ``tokenizer(text)`` yields tokens exposing a
            ``text`` attribute. Only used when ``sentence`` is a string.
        en_vocab: Target vocabulary (``lookup_indices``, ``lookup_tokens``
            and item access by token).
        de_vocab: Source vocabulary (``lookup_indices``).
        lower: When truthy, the source tokens are lower-cased.
        sos_token: Start marker, added to the source and used as the first
            decoder input.
        eos_token: End marker, added to the source; decoding stops as soon as
            it is predicted.
        device: Device the input tensors are moved to.
        max_output_length: Maximum number of decoding steps.

    Returns:
        The translated tokens, starting with ``sos_token`` and ending with
        ``eos_token`` whenever the model predicted it within
        ``max_output_length`` steps.
    """
    # set model to evaluation mode
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = list(sentence)
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        indices = de_vocab.lookup_indices(tokens)
        # shape the source as one column: (sequence length, batch of one)
        tensor = torch.LongTensor(indices).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)

        eos_index = en_vocab[eos_token]  # looked up once instead of at every step
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # feed the most recent prediction back into the decoder
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == eos_index:
                break
        # Convert the indices only after the loop has finished, so the result
        # always reflects every predicted token (including a final <eos>) and
        # can never be the leftover source-token list.
        translated_tokens = en_vocab.lookup_tokens(inputs)
    return translated_tokens


In [None]:
# take the first test example: the German sentence is the model input and the
# English sentence is the reference translation
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]
sentence, expected_translation

In [None]:
# load the best checkpoint written by the training loop: the file name has to
# match the one given to `torch.save` there, and `map_location` lets a
# checkpoint saved on one device be restored on the device in use now
model.load_state_dict(torch.load("tut1-model.pt", map_location=device))

In [None]:
# translate the German test sentence picked in the earlier cell and show the
# predicted tokens
translation = translate(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

print(translation)

In [None]:
# translate a German sentence typed in by hand
arbitrary = 'Ein Mann sitzt auf dem Stuhl.'
translation_arb = translate(
    arbitrary,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

print(translation_arb)

In [None]:
# translate the entire test set
translations = [
    translate(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]

In [None]:
# load the BLEU metric through the `evaluate` library
bleu_score = evaluate.load('bleu')

In [None]:
# join each translation back into one string, dropping its first and last
# token
# NOTE(review): this assumes every translation starts with <sos> and ends
# with <eos> -- confirm against what `translate` returns
predictions = [" ".join(translation[1:-1]) for translation in translations]
# each prediction gets a list of references, here holding the single English sentence
references = [[example["en"]] for example in test_data] # ground truth

In [None]:
def get_tokenizer(nlp, lower):
    """Build a function that splits a string into a list of token texts.

    Args:
        nlp: Object whose ``tokenizer(text)`` yields tokens exposing a
            ``text`` attribute.
        lower: When truthy, the returned function lower-cases every token.

    Returns:
        A function mapping a string to its list of token strings.
    """
    def tokenizer_fn(s):
        texts = (piece.text for piece in nlp.tokenizer(s))
        if lower:
            return [text.lower() for text in texts]
        return list(texts)

    return tokenizer_fn

# tokenizer handed to the BLEU metric: the English spaCy tokenizer with the
# same lower-casing setting that was used when tokenizing the dataset
tokenizer = get_tokenizer(en_nlp, lower)

In [None]:
# sanity check: the tokenized first prediction next to its tokenized reference
tokenizer(predictions[0]), tokenizer(references[0][0])

In [None]:
# score the predictions against the references; both are split with the
# custom tokenizer defined in the earlier cell
results = bleu_score.compute(
    predictions=predictions, references=references, tokenizer=tokenizer
)
results

## Conclusion

The BLEU score for this model is 17.9, which is much lower than the BLEU score of 34.8 achieved by the original research paper.

The final model evaluation metrics are given below.

| Metric | Value |
|---|---|
| BLEU | 0.1791 |
| Precision (1-gram) | 0.5178 |
| Precision (2-gram) | 0.2687 |
| Precision (3-gram) | 0.1459 |
| Precision (4-gram) | 0.0850 |
| Brevity Penalty | 0.8790 |
| Length Ratio | 0.8857 |
| Translation Length | 11566 |
| Reference Length | 13058 |
