# Neural machine translation with attention

In [1]:
#!pip uninstall torch -y
#!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
#!pip install torchtext #==0.15.2
#!pip install opencc==1.1.6

In [2]:
import torch
import warnings

warnings.filterwarnings("ignore")
seed = 42
torch.manual_seed(seed)

print(torch.__version__)

2.0.1+cu117


## Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:

```
The cat is adorable.	這隻貓很可愛。
```

There are a variety of languages available, but we'll use the English-Ｍandarin dataset. For convenience, we've hosted a copy of this dataset on Google Cloud, but you can also download your own copy. After downloading the dataset, here are the steps we'll take to prepare the data:

1. Add a *start* and *end* token to each sentence.
2. Clean the sentences by removing special characters.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.

In [3]:
from urllib.request import Request, urlopen
from io import BytesIO
from zipfile import ZipFile
import os
zip_url = 'https://www.manythings.org/anki/cmn-eng.zip'
txt_name = 'cmn.txt'

def read_txt_from_zip_url(url, read_txt_name) -> str:
    req = Request(url)
    req.add_header('user-agent', '')  # Set a user-agent to prevent 406 error (Not Acceptable)
    response = urlopen(req)

    with response as resp:
        with BytesIO(resp.read()) as b, ZipFile(b) as zip_file:
            txt_file = zip_file.open(read_txt_name)
            text_str = txt_file.read().decode('utf8')

    return text_str

In [4]:
if txt_name in os.listdir():
    print("load txt\n")
    with open(txt_name, encoding='utf8') as file:
        text = file.read()
else:
    text = read_txt_from_zip_url(zip_url, txt_name)
    with open(txt_name, 'w', encoding='utf8') as file:
        file.write(text)
    print("saved txt\n")

print(text[:500])

load txt

Hi.	嗨。	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)
Hi.	你好。	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4857568 (musclegirlxyp)
Run.	你用跑的。	CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #3748344 (egg0073)
Stop!	住手！	CC-BY 2.0 (France) Attribution: tatoeba.org #448320 (CM) & #448321 (GlossaMatik)
Wait!	等等！	CC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #4970122 (wzhd)
Wait!	等一下！	CC-BY 2.0 (France) Attribution: ta


##Tokenization
Tokenization is the process of breaking down sentences or phrases into individual words or tokens.

In this example, different tokenization methods are applied to English and Chinese texts, respectively.

In both cases, the output requires the addition of '\<sos>' (start of sentence) and '\<eos>' (end of sentence) tokens at the beginning and the end of the tokenized sequences, respectively.

In [5]:
from torchtext.data.utils import get_tokenizer

def tokenizer_decorator(tokenizer):
    """Add <sos> and <eos>"""
    def new_tokenizer(*args, **kw):
        output = tokenizer(*args, **kw)
        return ['<sos>'] + output + ['<eos>']
    return new_tokenizer

en_tokenizer = get_tokenizer('basic_english')
zh_tokenizer = list

print(en_tokenizer("The cat is adorable."))
print(zh_tokenizer("這隻貓很可愛。"))

en_tokenizer = tokenizer_decorator(en_tokenizer)
zh_tokenizer = tokenizer_decorator(zh_tokenizer)

print()
print(en_tokenizer("The cat is adorable."))
print(zh_tokenizer("這隻貓很可愛。"))

2023-08-06 12:53:11.440851: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-06 12:53:11.630259: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-06 12:53:12.124109: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2023-08-06 12:53:12.124202: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7:

['the', 'cat', 'is', 'adorable', '.']
['這', '隻', '貓', '很', '可', '愛', '。']

['<sos>', 'the', 'cat', 'is', 'adorable', '.', '<eos>']
['<sos>', '這', '隻', '貓', '很', '可', '愛', '。', '<eos>']


### Limit the size of the dataset to experiment faster (optional)
Training on the complete dataset of >25,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 10,000 sentences (of course, translation quality degrades with less data):

In [31]:
len(data)

10000

In [33]:
train_num, valid_num = 80000, 20000
data = text.split('\n')[:train_num+valid_num]

en_data, zh_data = [], []
for line in data:
    try:
        en_sentence, zh_sentence = line.split('\t')[:2]
        en_data.append(en_tokenizer(en_sentence))
        zh_data.append(zh_tokenizer(zh_sentence))
    except:
        print(line)

print(en_data[:5])
print(zh_data[:5])


[['<sos>', 'hi', '.', '<eos>'], ['<sos>', 'hi', '.', '<eos>'], ['<sos>', 'run', '.', '<eos>'], ['<sos>', 'stop', '!', '<eos>'], ['<sos>', 'wait', '!', '<eos>']]
[['<sos>', '嗨', '。', '<eos>'], ['<sos>', '你', '好', '。', '<eos>'], ['<sos>', '你', '用', '跑', '的', '。', '<eos>'], ['<sos>', '住', '手', '！', '<eos>'], ['<sos>', '等', '等', '！', '<eos>']]


In [34]:
from torchtext.vocab import vocab
from itertools import chain
from collections import Counter

en_counter = Counter(chain.from_iterable(en_data))
en_vocab = vocab(en_counter, specials=('<sos>', '<eos>', '<pad>'))
zh_counter = Counter(chain.from_iterable(zh_data))
zh_vocab = vocab(zh_counter, specials=('<sos>', '<eos>', '<pad>'))

print("The length of the en_vocab is:", len(en_vocab))
en_stoi = en_vocab.get_stoi()
print("The index of '<sos>' is:", en_stoi['<sos>'])
en_itos = en_vocab.get_itos()
print("The token at index 2 is:", en_itos[2])
print("The token at index 5 is:", en_itos[5])

print()

print("The length of the zh_vocab is:", len(zh_vocab))
zh_stoi = zh_vocab.get_stoi()
print("The index of '<sos>' is:", zh_stoi['<sos>'])
zh_itos = zh_vocab.get_itos()
print("The token at index 2 is:", zh_itos[2])
print("The token at index 5 is:", zh_itos[5])
print()

max_length = max(map(len, en_data + zh_data))
print("The length of the longest sequence:", max_length)

The length of the en_vocab is: 7178
The index of '<sos>' is: 0
The token at index 2 is: <pad>
The token at index 5 is: run

The length of the zh_vocab is: 3650
The index of '<sos>' is: 0
The token at index 2 is: <pad>
The token at index 5 is: 你

The length of the longest sequence: 46


## Preparing your data for training with Dataset & DataLoader
While training a model, we typically want to pass samples in “mini-batches”, reshuffle the data at every epoch to reduce model overfitting.

A custom **Dataset** class must implement three functions: **\_\_init__**, **\_\_getitem__**., and **\_\_len__**.

**DataLoader** is an iterable that abstracts this complexity for us in an easy API.

In [35]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TranslateDataset(Dataset):
    def __init__(self, inp_data, tar_data):
        self.inp_data = inp_data
        self.tar_data = tar_data
        self._len = len(inp_data)

    def __getitem__(self, index):
        return self.inp_data[index], self.tar_data[index]

    def __len__(self):
        return self._len

def collate_batch(batch):
    inp_sequences, tar_sequences = zip(*batch)

    max_length_inp = max(map(len, inp_sequences))
    max_length_tar = max(map(len, tar_sequences))
    max_length = max(max_length_inp, max_length_tar)

    padded_inp_sequences = [seq + ['<pad>'] * (max_length - len(seq)) for seq in inp_sequences]
    padded_tar_sequences = [seq + ['<pad>'] * (max_length - len(seq)) for seq in tar_sequences]

    return padded_inp_sequences, padded_tar_sequences

### Here we simply take the first 8,000 data for training, and the last 2,000 data for validation.

In [36]:
train_dataset = TranslateDataset(en_data[:train_num], zh_data[:train_num])
valid_dataset = TranslateDataset(en_data[-valid_num:], zh_data[-valid_num:])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

for batch in train_loader:
    print(*batch, sep='\n')
    break

[['<sos>', 'hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'run', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'stop', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'wait', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'wait', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'begin', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'hello', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'i', 'try', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'i', 'won', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'oh', 'no', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'cheers', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'got', 'it', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<so

## Write the encoder and decoder model

Implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://github.com/tensorflow/nmt). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://github.com/tensorflow/nmt#background-on-the-attention-mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence. The below picture and formulas are an example of attention mechanism from [Luong's paper](https://arxiv.org/abs/1508.04025v5).

<img src="https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg" width="500" alt="attention mechanism">

The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*.

Here are the equations that are implemented:

<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg" alt="attention equation 0" width="800">
<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg" alt="attention equation 1" width="800">

This tutorial uses [Bahdanau attention](https://arxiv.org/pdf/1409.0473.pdf) for the encoder. Let's decide on notation before writing the simplified form:

* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder

And the pseudo-code:

* `score = FC(tanh(FC(EO) + FC(H)))`
* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.
* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.
* `embedding output` = The input to the decoder X is passed through an embedding layer.
* `merged vector = concat(embedding output, context vector)`
* This merged vector is then given to the GRU

The shapes of all the vectors at each step have been specified in the comments in the code:

In [37]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        # vacab_size=src_vocab_size=1949, embedding_dim=256 enc_units=512
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, enc_units)

    def forward(self, x, hidden=None):
        '''
        Input:
            ``x``
            shape == (seq_length, batch_size)
            dtype == tf.int64

            ``hidden``
            shape == (1, batch_size, enc_units) or None
            dtype == tf.float32

        Output:
            ``h_e``
            shape == (seq_length, batch_size, enc_units)

            ``last_h_e``
            shape == (1, batch_size, enc_units)
        '''

        # passing through embedding
        # x shape == (seq_length, batch_size, embedding_dim) -> (seq_length, 64, 256)
        x = self.embedding(x)

        # passing x and hidden state to the GRU
        # h_e shape == (seq_length, batch_size, enc_units) -> (seq_length, 64, 512)
        # last_h_e shape == (1, batch_size, enc_units) -> (1, 64, 512)
        h_e, last_h_e = self.gru(x, hidden)

        # h_e contains the whole output of the sequence, last_h_e is the hidden state of the last timestamp
        return h_e, last_h_e

In [38]:
input_batch_tensor = torch.randint(low=0, high=1949, size=(10, 64))
print('input shape == (seq_length, batch_size) ->', input_batch_tensor.shape)

# build Encoder layer
encoder = Encoder(vocab_size=1949, embedding_dim=256, enc_units=512)

# test your code
h_e, last_h_e = encoder(input_batch_tensor)
print('h_e shape == (seq_length, batch_size, units) ->', h_e.shape)
print('last_h_e shape == (1, batch_size, units) ->', last_h_e.shape)

# show last_h_e is the hidden state of the last timestamp
print(torch.any(h_e[-1,:,:] == last_h_e))

input shape == (seq_length, batch_size) -> torch.Size([10, 64])
h_e shape == (seq_length, batch_size, units) -> torch.Size([10, 64, 512])
last_h_e shape == (1, batch_size, units) -> torch.Size([1, 64, 512])
tensor(True)


In [39]:
class BahdanauAttention(nn.Module):
    def __init__(self, enc_units, dec_units, att_units):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(dec_units, att_units)
        self.W2 = nn.Linear(enc_units, att_units)
        self.V = nn.Linear(att_units, 1)
        self.softmax = nn.Softmax(1)

    def score_function(self, h_d, h_e):
        '''
        Bahdanau's additive style:
            score = V tanh( W1 h_d + W2 h_e )

        Luong's multiplicative style:
            score = h_d W h_e
        '''
        score = self.V(torch.tanh(self.W1(h_d)+self.W2(h_e)))
        return score


    def forward(self, h_d, h_e):
        '''
        Input:
            ``h_d``
            shape == (1, batch_size, dec_units)

            ``h_e``
            shape == (seq_length, batch_size, enc_units)

        Output:
            ``context_vector``
            shape == (batch_size, enc_units)

            ``attention_weights``
            shape == (batch_size, seq_length, 1)
        '''

        # we are doing this to perform addition to calculate the score
        # after permute h_d shape == (batch_size, 1, dec_units)
        # after permute h_e shape == (batch_size, seq_length, enc_units)
        h_d = h_d.permute(1, 0, 2)
        h_e = h_e.permute(1, 0, 2)

        # score shape == (batch_size, seq_length, 1) -> (64, seq_length, 1)
        score = self.score_function(h_d, h_e)

        # use softmax to normalize the score
        # attention_weights shape == (batch_size, seq_length, 1) -> (64, seq_length, 1)
        attention_weights = self.softmax(score)
        
        # context_vector = attention_weights * h_e
        # context_vector shape == (batch_size, seq_length, enc_units) -> (64, seq_length, 512)
        context_vector = attention_weights * h_e
        
        # sum up all h_e sequences
        # context_vector shape after sum == (batch_size, enc_units) -> (64, 512)
        context_vector = torch.sum(context_vector, dim=1)
        return context_vector, attention_weights

In [40]:
h_d = torch.randn(size=(1, 64, 512))
print('h_d shape == (1, batch_size, units) ->', h_d.shape)

# build BahdanauAttention layer
attention = BahdanauAttention(enc_units=512, dec_units=512, att_units=512)

# test your code
context_vector, attention_weights = attention(h_d, h_e)
print('context vector shape == (batch_size, units) ->', context_vector.shape)
print('attention weights shape == (batch_size, seq_length, 1) ->', attention_weights.shape)

h_d shape == (1, batch_size, units) -> torch.Size([1, 64, 512])
context vector shape == (batch_size, units) -> torch.Size([64, 512])
attention weights shape == (batch_size, seq_length, 1) -> torch.Size([64, 10, 1])


In [41]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, attention):
        # vocab_size=tar_vocab_size=1831, embedding_dim=256, dec_units=512
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim + enc_units, dec_units)

        # the dimension of the output is the vocab size, through the softmax function,
        # this layer will return the probability of each word in the dictory
        self.fc = nn.Linear(dec_units, vocab_size)

        # used for attention
        self.attention = attention

    def forward(self, x, hidden, enc_output):
        '''
            This function outputs a result at each timestamp.
            The length of each sentence is 1. (previous word)

        Input:
            ``x``
            shape == (1, batch_size)
            dtype == tf.int64

            ``hidden``
            shape == (1, batch_size, dec_units)

            ``enc_output``
            shape == (seq_length, batch_size, enc_units)

        Output:
            ``output``
            shape == (batch_size, vocab)

            ``state``
            shape == (1, batch_size, dec_units)

            ``attention_weights``
            shape == (batch_size, max_length, 1)
        '''
        # passing through embedding
        # x shape == (seq_length, batch_size, embedding_dim) -> (1, 64, 256)
        x = self.embedding(x)

        # The hidden state of fisrt timestamp in the decoder is the hidden state of last timestamp in the encoder
        # context_vector shape == (batch_size, enc_units) -> (64, 512)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # resize the context_vector shape == (1, batch_size, enc_units) -> (1, 64, 512)
        context_vector = context_vector.unsqueeze(0)

        # concatenate the input x and the context_vector
        # after concatenation, x shape == (1, batch_size, embedding_dim + enc_units) -> (1, 64, 256 + 512)
        x = torch.cat((x, context_vector), dim=2)

        # passing the concatenated vector to the GRU
        # output shape == (1, batch_size, dec_units) -> (1, 64, 512)
        # state shape == (1, batch_size, dec_units) -> (1, 64, 512)
        output, state = self.gru(x)

        # resize the output shape == (batch_size, dec_units) -> (64, 512)
        output = output.squeeze()

        # passing through a linear layer
        # output shape == (batch_size, vocab) -> (64, 1831)
        x = self.fc(output)

        return x, state, attention_weights

In [42]:
target_batch_tensor = torch.zeros(size=(1, 64), dtype=torch.int64)
print('target shape == (seq_length, batch_size) ->', target_batch_tensor.shape)

# build Decoder layer
decoder = Decoder(vocab_size=1831, embedding_dim=256, enc_units=512, dec_units=512, attention=attention)

# test your code
output, state, _ = decoder(target_batch_tensor, h_d, h_e)
print('output shape == (batch_size, vocab) ->', output.shape)
print('state shape == (1, batch_size, units) ->', state.shape)

target shape == (seq_length, batch_size) -> torch.Size([1, 64])
output shape == (batch_size, vocab) -> torch.Size([64, 1831])
state shape == (1, batch_size, units) -> torch.Size([1, 64, 512])


## Translator

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [43]:
import torch.optim as optim
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

class Translator(nn.Module):
    def __init__(self, config, inp_vocab, tar_vocab, loss_fn):
        super(Translator, self).__init__()
        self.__dict__.update(config)

        self.inp_stoi = inp_vocab.get_stoi()
        self.tar_stoi = tar_vocab.get_stoi()
        self.tar_itos = tar_vocab.get_itos()

        self.attention = BahdanauAttention(self.enc_units, self.dec_units, self.att_units)
        self.encoder = Encoder(self.src_vocab_size, self.src_emb_dim, self.enc_units)
        self.decoder = Decoder(self.tar_vocab_size, self.tar_emb_dim, self.enc_units, self.dec_units, self.attention)
        self.optimizer = optim.Adam(list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=self.learning_rate)
        self.loss_fn = loss_fn


    def train_step(self, inp, tar, use_teacher_forcing=False):
        """
        1. Pass the input through the encoder which return encoder output and the encoder hidden state.
        2. The encoder output, encoder hidden state and the decoder input (which is the start token) is passed to the decoder.
        3. The decoder returns the predictions and the decoder hidden state.
        4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
        5. Use teacher forcing to decide the next input to the decoder.
        6. Teacher forcing is the technique where the target word is passed as the next input to the decoder.
        7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.
        """

        # feed the <sos> as the first input of the decoder
        dec_input = torch.LongTensor([self.tar_stoi['<sos>']] * len(tar[0])).unsqueeze(0).to('cuda')

        enc_output, hidden = self.encoder(inp)

        loss = 0
        # Teacher forcing - feeding the target as the next input
        for t in range(1, tar.shape[0]):
            # passing enc_output to the decoder
            pred, hidden, _ = self.decoder(dec_input, hidden, enc_output)

            # predictions shape == (vocab_size, batch_size) -> (1831, 64)
            loss += self.loss_fn(pred, tar[t, :])

            # using teacher forcing
            if use_teacher_forcing:
                # (1, batch_size)
                dec_input = tar[t, :].unsqueeze(0)
            else:
                dec_input = torch.argmax(pred, dim=0).unsqueeze(0)

        batch_loss = loss.item() / tar.shape[0]

        self.optimizer.zero_grad()

        loss.backward()

        self.optimizer.step()

        return batch_loss
    
    def evaluate_step(self, inp, tar, use_teacher_forcing=False):
        # feed the <sos> as the first input of the decoder
        dec_input = torch.LongTensor([self.tar_stoi['<sos>']] * len(tar[0])).unsqueeze(0).to('cuda')

        enc_output, hidden = self.encoder(inp)

        loss = 0
        # Teacher forcing - feeding the target as the next input
        for t in range(1, tar.shape[0]):
            # passing enc_output to the decoder
            pred, hidden, _ = self.decoder(dec_input, hidden, enc_output)

            # predictions shape == (vocab_size, batch_size) -> (1831, 64)
            loss += self.loss_fn(pred, tar[t, :])

            # using teacher forcing
            if use_teacher_forcing:
                # (1, batch_size)
                dec_input = tar[t, :].unsqueeze(0)
            else:
                dec_input = torch.argmax(pred, dim=0).unsqueeze(0)

        batch_loss = loss.item() / tar.shape[0]
        return batch_loss
    
    def translate(self, en_sentence):
        with torch.no_grad():
            en_sentence = en_tokenizer(en_sentence)
            inputs = [self.inp_stoi[s] for s in en_sentence]
            inputs = torch.LongTensor(inputs).unsqueeze(-1).to('cuda')

            result = ['<sos>']

            # enc out shape == (1, max_length_inp, 1024) -> (1, 46, 1024)
            # enc hidden shape == (1, 1024)
            enc_out, enc_hidden = self.encoder(inputs)

            dec_hidden = enc_hidden
            dec_input = torch.LongTensor([self.tar_stoi['<sos>']]).unsqueeze(0).to('cuda')

            for t in range(20):

                predictions, dec_hidden, attention_weights = self.decoder(dec_input, dec_hidden, enc_out)

                # get the index which has the highest probability
                predicted_id = torch.argmax(predictions)
                # convert the index to the word
                word = self.tar_itos[predicted_id]
                result.append(word)

                # when the decoder predicts the end, stop prediction
                if word == '<eos>':
                    return result, en_sentence

                # the predicted id is fed back into the model
                dec_input = torch.LongTensor([predicted_id]).unsqueeze(0).to('cuda')

            return result, en_sentence


    def evaluate(self, inp_sent, tar_sent):
        translated, inp_sent = self.translate(inp_sent)
        tar_sent = zh_tokenizer(tar_sent)
        bleu = sentence_bleu([tar_sent], translated, weights=(0.25, 0.25, 0.25, 0.25))

        return bleu, ' '.join(translated), ' '.join(inp_sent)


    def save_model(self, filename='translator.pt'):
        try:
            os.makedirs(self.checkpoint_dir)
        except:
            pass

        torch.save({
            'encoder_state_dict': self.encoder.state_dict(),
            'decoder_state_dict': self.decoder.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, os.path.join(self.checkpoint_dir, filename))


    def load_model(self, filename='translator.pt'):
        try:
            path = os.path.join(self.checkpoint_dir, filename)
            checkpoint = torch.load(path)

            self.encoder.load_state_dict(checkpoint['encoder_state_dict'])
            self.decoder.load_state_dict(checkpoint['decoder_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        except Exception as e:
            print(e)

## Define the loss function

In [44]:
# When calculating the loss value, ignore all the <pad> tags in the target.
ignore_index = zh_stoi['<pad>']
loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

## Training

In [45]:
from tqdm import tqdm_notebook

def train(translator, train_loader, valid_loader, config, use_teacher_forcing=True):
    inp_stoi = translator.inp_stoi
    tar_stoi = translator.tar_stoi
    for epoch in range(config['epochs']):
        total_loss = 0
        eval_total_loss = 0

        tn = tqdm_notebook(total=len(train_loader))
        tn.set_description('Epoch: {}/{}'.format(epoch + 1, config['epochs']))
        
        # train
        for batch in train_loader:
            inp_sentences, tar_sentences = batch

            inp = torch.LongTensor(
                [list(map(inp_stoi.__getitem__, sentence)) for sentence in zip(*inp_sentences)]
            ).to('cuda')
            tar = torch.LongTensor(
                [list(map(tar_stoi.__getitem__, sentence)) for sentence in zip(*tar_sentences)]
            ).to('cuda')


            batch_loss = translator.train_step(inp, tar, use_teacher_forcing)
            total_loss += batch_loss

            tn.set_postfix(loss=batch_loss)
            tn.update(n=1)
        
        # evalute
        with torch.no_grad():
            for batch in valid_loader:
                inp_sentences, tar_sentences = batch
                inp = torch.LongTensor(
                    [list(map(inp_stoi.__getitem__, sentence)) for sentence in zip(*inp_sentences)]
                ).to('cuda')
                tar = torch.LongTensor(
                    [list(map(tar_stoi.__getitem__, sentence)) for sentence in zip(*tar_sentences)]
                ).to('cuda')
                eval_batch_loss = translator.evaluate_step(inp, tar, use_teacher_forcing)
                eval_total_loss += eval_batch_loss
        
        # saving (checkpoint) the model every 2 epochs
        if (epoch + 1) % 2 == 0:
            translator.save_model()
            
        bleu, translated, inp_sent = translator.evaluate("The cat is adorable.", "這隻貓很可愛。")
        tn.set_postfix(loss=total_loss / len(train_loader), bleu=bleu)

        print('    [Source]:', inp_sent)
        print('[Translated]:', translated)
        print(f"training loss: {total_loss}")
        print(f"validation loss: {eval_total_loss}")

In [46]:
if __name__ == '__main__':
    config = {
        'src_vocab_size': len(en_vocab),
        'tar_vocab_size': len(zh_vocab),
        'src_emb_dim': 256,
        'tar_emb_dim': 256,
        'enc_units': 512,
        'dec_units': 512,
        'att_units': 512,
        'batch_size': 64,
        'epochs': 30,
        'learning_rate': 1e-3,
        'checkpoint_dir': './checkpoints/',
        'device': 'cuda:0',
    }

    # build translator model
    translator = Translator(config, en_vocab, zh_vocab, loss_fn).to(config['device'])

    # train model
    train(translator, train_loader, valid_loader, config, use_teacher_forcing=True)

  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 的 是 一 个 。 <eos>
training loss: 1469.2644093414483
validation loss: 847.7313242174887


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 所 有 的 。 <eos>
training loss: 1052.0327899174679
validation loss: 681.9518570208543


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 所 有 的 。 <eos>
training loss: 832.099831076115
validation loss: 566.9420728311037


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 所 有 很 难 。 <eos>
training loss: 675.6015781976794
validation loss: 470.2029368660419


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 白 。 <eos>
training loss: 577.2662894820987
validation loss: 372.1735665719375


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 白 地 方 都 很 冷 。 <eos>
training loss: 465.84356672150915
validation loss: 309.5830808631293


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 很 冷 。 <eos>
training loss: 390.6665150132846
validation loss: 251.35051302831585


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 白 地 是 很 冷 。 <eos>
training loss: 328.13168661913124
validation loss: 229.67037013893122


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 很 冷 。 <eos>
training loss: 275.10410647727764
validation loss: 271.5710144274113


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 所 有 人 很 可 能 。 <eos>
training loss: 236.09665515390702
validation loss: 168.2533051045884


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 危 。 <eos>
training loss: 187.3538822272524
validation loss: 232.3167138932162


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 地 区 。 <eos>
training loss: 220.48337440171264
validation loss: 126.49653280028998


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 有 。 <eos>
training loss: 142.38324241688264
validation loss: 105.22019041332726


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 可 愛 。 <eos>
training loss: 110.10599879257703
validation loss: 77.56605690994589


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 船 是 很 危 。 <eos>
training loss: 89.67628832579696
validation loss: 77.25971273034158


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 危 险 很 可 愛 。 <eos>
training loss: 83.30999135849252
validation loss: 125.45514113349469


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 可 愛 。 <eos>
training loss: 83.52549726336886
validation loss: 89.36690908992234


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 。 <eos>
training loss: 76.45442207945234
validation loss: 73.35781662263216


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 危 险 很 危 。 <eos>
training loss: 74.1542040872521
validation loss: 97.8381220612057


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 是 湖 。 <eos>
training loss: 63.119350325262026
validation loss: 80.80101161761098


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 危 险 很 可 愛 。 <eos>
training loss: 64.81796635697609
validation loss: 57.89700406361006


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 。 <eos>
training loss: 51.57286736164841
validation loss: 51.6366244072228


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 是 黑 人 是 黑 。 <eos>
training loss: 58.91404148526248
validation loss: 72.8202497902855


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 可 愛 。 <eos>
training loss: 58.6116004933473
validation loss: 48.101496989672235


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 。 <eos>
training loss: 58.53594063271745
validation loss: 46.90209296519008


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 可 愛 。 <eos>
training loss: 43.66899895291139
validation loss: 34.667820456273404


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 它 很 可 愛 。 <eos>
training loss: 39.20054997221913
validation loss: 36.6207997017852


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 。 <eos>
training loss: 38.763050352992565
validation loss: 41.26712187958673


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 。 <eos>
training loss: 44.728595197940024
validation loss: 60.21149675563772


  0%|          | 0/462 [00:00<?, ?it/s]

    [Source]: <sos> the cat is adorable . <eos>
[Translated]: <sos> 。 <eos>
training loss: 48.053712576023486
validation loss: 59.606776980304765


## Restore the latest checkpoint and test

In [47]:
translator.load_model()
# input: English string to be translated, Chinese string reference
# output: (bleu score, translated & tokenized Chinese string, tokenized English string)
print(translator.evaluate("The cat is adorable.", '這隻貓很可愛。'))
print(translator.evaluate('I have a dog.', '我有一条狗。'))
print(translator.evaluate("It is raining.", '下雨了。'))

(1.6975624222192167e-155, '<sos> 。 <eos>', '<sos> the cat is adorable . <eos>')
(3.4139826703418696e-78, '<sos> 狗 。 <eos>', '<sos> i have a dog . <eos>')
(0.6434588841607617, '<sos> 在 下 雨 了 。 <eos>', '<sos> it is raining . <eos>')


## Applying OpenCC
Adding a Traditional Chinese to Simplified Chinese conversion module allows us to use Traditional Chinese as the input.

In [27]:
import opencc
tw2sp = opencc.OpenCC('tw2sp.json').convert
s2twp = opencc.OpenCC('s2twp.json').convert

eng_s = "The taxi has arrived."
tw_ref = "計程車到了。"
sp_ref = tw2sp(tw_ref)
print(sp_ref)

print(translator.evaluate(eng_s, sp_ref))

出租车到了。
(0.7506238537503395, '<sos> 到 出 租 车 到 了 。 <eos>', '<sos> the taxi has arrived . <eos>')
