# Neural Machine Translation
Perform machine translation with two deep learning approaches: Recurrent Neural Network (RNN) and Transformer.

Specifically, we are going to train sequence to sequence models for Spanish to English translation. **Refer** to the following resources for more details:

1.   https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
2.   https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
3. https://arxiv.org/pdf/1409.0473.pdf

We recommend runing this notebook on Google Colab instead of your local computer to avoid the hassle of installing necessary Python packages on local machine, and to get free GPU. Selecting "GPU" as the runtime type as this will speed up the training of the models. You can find this by going to <TT>Runtime > Change Runtime Type</TT> and select "GPU" from the dropdown menu.



# Step 1: Download & Prepare the Data

In [None]:
import pandas as pd
import unicodedata
import re
from torch.utils.data import Dataset
import torch
import random
import os

## Helper Functions
This cell contains helper functions for the dataloader.

In [None]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Normalizes latin chars with accent to their canonical decomposition"""
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    '''
    Preprocess the sentence to add the start, end tokens and make them lower-case
    '''
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[" "]+', ' ', w)

    w = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', w)
    
    w = w.rstrip().strip()
    w = '<start> ' + w + ' <end>'
    return w


def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded


def preprocess_data_to_tensor(dataframe, src_vocab, trg_vocab):
    # Vectorize the input and target languages
    src_tensor = [[src_vocab.word2idx[s if s in src_vocab.vocab else '<unk>'] for s in es.split(' ')] for es in dataframe['es'].values.tolist()]
    trg_tensor = [[trg_vocab.word2idx[s if s in trg_vocab.vocab else '<unk>'] for s in eng.split(' ')] for eng in dataframe['eng'].values.tolist()]

    # Calculate the max_length of input and output tensor for padding
    max_length_src, max_length_trg = max(len(t) for t in src_tensor), max(len(t) for t in trg_tensor)
    print('max_length_src: {}, max_length_trg: {}'.format(max_length_src, max_length_trg))

    # Pad all the sentences in the dataset with the max_length
    src_tensor = [pad_sequences(x, max_length_src) for x in src_tensor]
    trg_tensor = [pad_sequences(x, max_length_trg) for x in trg_tensor]

    return src_tensor, trg_tensor, max_length_src, max_length_trg


def train_test_split(src_tensor, trg_tensor):
    '''
    Create training and test sets.
    '''
    total_num_examples = len(src_tensor) - int(0.2*len(src_tensor))
    src_tensor_train, src_tensor_test = src_tensor[:int(0.75*total_num_examples)], src_tensor[int(0.75*total_num_examples):total_num_examples]
    trg_tensor_train, trg_tensor_test = trg_tensor[:int(0.75*total_num_examples)], trg_tensor[int(0.75*total_num_examples):total_num_examples]

    return src_tensor_train, src_tensor_test, trg_tensor_train, trg_tensor_test

## Download and Visualize the Data

Download the translation data, a model to translate Spanish to English will be learned

In [None]:
if __name__ == '__main__':
    os.system("wget http://www.manythings.org/anki/spa-eng.zip")
    os.system("unzip -o spa-eng.zip")

Now visualize the data.

In [None]:
if __name__ == '__main__':
    lines = open('spa.txt', encoding='UTF-8').read().strip().split('\n')
    total_num_examples = 50000 
    original_word_pairs = [[w for w in l.split('\t')][:2] for l in lines[:total_num_examples]]
    random.seed(42)
    random.shuffle(original_word_pairs)
    dat = pd.DataFrame(original_word_pairs, columns=['eng', 'es'])
    print(dat) # Visualize the data

                             eng                                      es
0               I was disgusted.                    Yo estaba indignado.
1         The prize went to him.                      Le tocó el premio.
2           You can't marry Tom.              No puedes casarte con Tom.
3         Why don't you like me?                   ¿Por qué no te gusto?
4           Are they still here?                    ¿Todavía están aquí?
...                          ...                                     ...
49995         Tom is passionate.                      Tom es apasionado.
49996  I told Tom to stay still.  Le dije a Tom que se estuviera quieto.
49997                Stay still.                           No te muevas.
49998            I like fishing.                      Me gusta la pesca.
49999    You're never satisfied.                 Jamás estás satisfecho.

[50000 rows x 2 columns]


Next, preprocess the data

In [None]:
if __name__ == '__main__':
    data = dat.copy()
    data['eng'] = dat.eng.apply(lambda w: preprocess_sentence(w))
    data['es'] = dat.es.apply(lambda w: preprocess_sentence(w))
    print(data) # visualizing the data

                                            eng                                                 es
0               <start> i was disgusted . <end>                <start> yo estaba indignado . <end>
1         <start> the prize went to him . <end>                  <start> le toco el premio . <end>
2           <start> you can t marry tom . <end>          <start> no puedes casarte con tom . <end>
3         <start> why don t you like me ? <end>              <start> ¿ por que no te gusto ? <end>
4           <start> are they still here ? <end>               <start> ¿ todavia estan aqui ? <end>
...                                         ...                                                ...
49995         <start> tom is passionate . <end>                  <start> tom es apasionado . <end>
49996  <start> i told tom to stay still . <end>  <start> le dije a tom que se estuviera quieto ...
49997                <start> stay still . <end>                       <start> no te muevas . <end>
49998     

## Vocabulary & Dataloader Classes

First a class for managing the vocabulary is caculated. Two different vocabularies are needed, one for English and one for Spanish.

Then the dataloader is prepared and make sure it returns the source sentence and target sentence.

These classes will be instantiated later on when the pretrained embeddings have been created

In [None]:
class Vocab_Lang():
    def __init__(self, vocab):
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.vocab = vocab
        
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 2 # +2 because of <pad> and <unk> token
            self.idx2word[index + 2] = word

class MyData(Dataset):
    def __init__(self, X, y):
        self.length = torch.LongTensor([np.sum(1 - np.equal(x, 0)) for x in X])
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)
    
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        return x, y

    def __len__(self):
        return len(self.data)

# Step 2: Create Pretrained Embeddings

Instead of initializing embeddings with random vectors and learning them while training, here we will use the FastText embedding method proposed by Facebook's AI Research lab to improve our translation result. In particular, we will use an implementation from the gensim library to train the embedding of our corpus.

You can read more about FastText and gensim library:
https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText

In [None]:
from gensim.models import FastText
import numpy as np
import random
from torch.utils.data import DataLoader

## Train FastText Embeddings

In [None]:
def compute_FastText_embeddings(pd_dataframe, embedding_dim):
    """
    Given dataset (pd.DataFrame as used in the beginning), train FastText embeddings
    Return FastText trained model and embeddings vectors (np array [2 + vocab_size, embedding_dim])
    """
    
    print('Computing FastText Embeddings...')
    sentences = [sen.split() for sen in pd_dataframe]
    model, embedding_vec = None, None
    
    # (1) Create FastText model to learn `embedding_dim` sized embedding vectors
    model = FastText(size=embedding_dim)

    # (2) Build vocab from sentences
    model.build_vocab(sentences=sentences)

    # (3) Train model on sentences for 10 epochs
    # Note: between total_examples and total_words, only need total_examples
    model.train(sentences=sentences, total_examples=model.corpus_count, epochs=10)

    # (4) The sentences that we used to train the embedding don't contain '<pad>', or '<unk>' 
    #     so add two all-zero or random rows in the beginning of the embedding np array for '<pad>' and '<unk>'
    # Note: the following is of np.array[vocab_size, embedding_dim]
    temp_embd_vec = model.wv.vectors
    # Note: the following add all-zero rows to make it np array [2 + vocab_size, embedding_dim]
    embedding_vec = np.concatenate((np.zeros((2, embedding_dim)), temp_embd_vec))
    return model, embedding_vec

In [None]:
if __name__ == '__main__':
    # HYPERPARAMETERS 
    BATCH_SIZE = 64
    EMBEDDING_DIM = 256

    fasttext_model_src, embedding_src = compute_FastText_embeddings(data['es'], EMBEDDING_DIM)
    fasttext_model_trg, embedding_trg = compute_FastText_embeddings(data['eng'], EMBEDDING_DIM)

Computing FastText Embeddings...
Computing FastText Embeddings...


## Instantiate Datasets

Now the pretrained embeddings have been created, the training and validation datasets can be instantiated

In [None]:
if __name__ == '__main__':
    src_vocab = Vocab_Lang(fasttext_model_src.wv.vocab)
    trg_vocab = Vocab_Lang(fasttext_model_trg.wv.vocab)

    src_tensor, trg_tensor, max_length_src, max_length_trg = preprocess_data_to_tensor(data, src_vocab, trg_vocab)
    src_tensor_train, src_tensor_val, trg_tensor_train, trg_tensor_val = train_test_split(src_tensor, trg_tensor)

    # create train and val datasets
    train_dataset = MyData(src_tensor_train, trg_tensor_train)
    train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    test_dataset = MyData(src_tensor_val, trg_tensor_val)
    test_dataset = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=False)

max_length_src: 16, max_length_trg: 12


  app.launch_new_instance()


In [None]:
if __name__ == '__main__':
    idxes = random.choices(range(len(train_dataset.dataset)), k=5)
    src, trg =  train_dataset.dataset[idxes]
    print('Source:', src)
    print('Target:', trg)

Source: tensor([[   2,   79,   18,   33,   27,  184,   71,   10, 1973,    6,    7,    0,
            0,    0,    0,    0],
        [   2,   12,    4,  920,  131,  249,    6,    7,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,   16,    1,  318, 1313,   10,    1,   21,    7,    0,    0,    0,
            0,    0,    0,    0],
        [   2,  178, 1207, 1552,  139, 1208,    6,    7,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,  142,   12,   79,  138,    6,    7,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
Target: tensor([[   2,    3,   19, 1512,    7,  415,    5,    6,    0,    0,    0,    0],
        [   2,    3,  283,   14,  124,   10,   70,    5,    6,    0,    0,    0],
        [   2,   13,    7, 1605,   39,  318,   21,    6,    0,    0,    0,    0],
        [   2,  186,    4, 1520,  134,  977,    5,    6,    0,    0,    0,    0],
        [   2,    3,   18,   14,  145,  125,  203,    5,    6,    0,    0,    0]

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm.notebook import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

# Step 3: Train a Recurrent Neural Network (RNN)

Write a recurrent model for machine translation, and then train and evaluate its results.

Here are some useful links:
1. Attention paper: https://arxiv.org/pdf/1409.0473.pdf
2. Explanation of LSTM's & GRU's: https://towardsdatascience.com/illustrated-guide-to-lstms-and-gru-s-a-step-by-step-explanation-44e9eb85bf21
3. Attention explanation: https://towardsdatascience.com/attention-in-neural-networks-e66920838742 
4. Another attention explanation: https://towardsdatascience.com/attention-and-its-different-forms-7fc3674d14dc


##Encoder Model

First a recurrent encoder model is built. Instead of using a fully connected layer as the output, a sequence of outputs of the GRU as well as the final hidden state will be returned. These will be used in the decoder.

Here, the `__init(...)` and `forward(...)` functions are implemented

In [None]:
class RnnEncoder(nn.Module):
    def __init__(self, pretrained_emb, vocab_size, embedding_dim, hidden_units):
        super(RnnEncoder, self).__init__()
        """
        Args:
            pretrained_emb: np.array, the pre-trained source embedding computed from compute_FastText_embeddings
            vocab_size: int, the size of the source vocabulary
            embedding_dim: the dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        
        # Convert pretrained_emb from np.array to torch.FloatTensor
        pretrained_emb_cnvtd = torch.from_numpy(pretrained_emb)
       
        # Initialize embedding layer with pretrained_emb
        # (see: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)
        # Note: the web site has usage example embedding = nn.Embedding.from_pretrained(weight)
        # we should do similar thing because we are also using pre-trained embedding
        self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrained_emb_cnvtd)
                                                 
        # Initialize a single directional GRU with 1 layer and batch_first=False
        # Note: the following 3 arguments just follow the order, "1" is the number of hidden layers
        # also the default for batch_first and bidirectional are both False, so no need to specify these two
        self.rnn = nn.GRU(embedding_dim, hidden_units, 1)

    def forward(self, x):
        """
        Args:
            X: source texts, [max_len, batch_size]

        Returns:
            output: [max_len, batch_size, hidden_units]
            hidden_state: [1, batch_size, hidden_units] 
        """
        
        # Pass x through an embedding layer and pass the results through the rnn
        embeddings = self.EmbeddingLayer(x)
        output, hidden_state = self.rnn(embeddings.float())

        # Return output and hidden states from the recurrent net
        return output, hidden_state

##Decoder Model
Implement a Decoder model with an attention mechanism, as provided in https://arxiv.org/pdf/1409.0473.pdf. We have broken this up into three functions: `__init__(self, ...)`, `compute_attention(self, dec_hs, enc_output)`, and `forward(self, x, dec_hs, enc_output)`:

* <b>`__init__(self, ...)`: </b> Instantiate the parameters for the model, and store them in `self` variables.

* <b>`compute_attention(self, dec_hs, enc_output)` </b>: Compute the <b>context vector</b>, which is a weighted sum of the encoder output states. Suppose the decoder hidden state at time $t$ is $\mathbf{h}_t$, and the encoder hidden state at time $s$ is $\mathbf{\bar h}_s$. The pseudocode is as follows:

  1. <b>Attention scores:</b> Compute real-valued scores for the decoder hidden state $\mathbf{h}_t$ and each encoder hidden state $\mathbf{\bar h}_s$: $$\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s)=
      \mathbf{v}_a^T \tanh(\mathbf{W}_1 \mathbf{h}_t +\mathbf{W}_2 \mathbf{\bar h}_s)
$$
  A higher score indicates a stronger "affinity" between the decoder state and a specific encoder state. Note that the matrices $\mathbf{W}_1$, $\mathbf{W}_2$ and the vector $\mathbf{v_a}$ can all be implemented with `nn.Linear(...)` in Pytorch.

 2. <b>Attention weights:</b> Normalize the attention scores to obtain a valid probability distribution: $$\alpha_{ts} = \frac{\exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s) \big)}{\sum_{s'=1}^S \exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_{s'}) \big)}$$ Notice that this is just the softmax function, and can be implemented with `torch.softmax(...)` in Pytorch.

 3. <b>Context vector:</b> Compute a context vector $\mathbf{c}_t$ that is a weighted average of the encoder hidden states, where the weights are given by the attention weights computed as above: $$\mathbf{c}_t=\sum_{s=1}^S \alpha_{ts} \mathbf{\bar h}_s$$

 This context vector should be returned along with the attention weights.



* <b>`forward(self, x, dec_hs, enc_output)`:</b> Run a <b>single</b> decoding step, resulting in a distribution over the vocabulary for the next token in the sequence. 

In [None]:
class RnnDecoder(nn.Module):
    def __init__(self, pretrained_emb, vocab_size, embedding_dim, hidden_units):
        super(RnnDecoder, self).__init__()
        """
        Args:
            pretrained_emb: The pre-trained target embedding computed from compute_FastText_embeddings (np.array)
            vocab_size: The size of the target vocabulary
            embedding_dim: The dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """

        # Convert pretrained_emb from np.array to torch.FloatTensor
        pretrained_emb_cnvtd = torch.from_numpy(pretrained_emb)

        # Initialize embedding layer with pretrained_emb
        self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrained_emb_cnvtd)

        # Initialize layers to compute attention score
        # Note: we assume hidden size to be the same across encoder & decoder
        # the following one is to calculate W1*h_t
        self.W1Ht = nn.Linear(hidden_units, hidden_units)
        # the following one is to calculate W2*h_s
        self.W2Hs = nn.Linear(hidden_units, hidden_units)
        # the following one is to score(h_t, h_s), output is scalar, so dimension is "1"
        # This is for the dot product of v_a.T with tanh(....)
        self.Scores = nn.Linear(hidden_units, 1)
        
        # Initialize a single directional GRU with 1 layer and batch_first=True
        # NOTE: Input to the RNN will be the concatenation of the embedding vector and the context vector
        # input size should be embedding_dim + hidden_units due to the concatenation
        # context vector's dimension is same as h_s (as in the above-mentioned formula), the following "1" is numebr of layers
        self.rnn = nn.GRU(embedding_dim + hidden_units, hidden_units, 1, batch_first = True)
       
        # Initialize fully connected layer
        self.FullyCnnctdLayer = nn.Linear(hidden_units, vocab_size)
    
    def compute_attention(self, dec_hs, enc_output):
        '''
        This function computes the context vector and attention weights.

        Args:
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            context_vector: Context vector, according to formula; [batch_size, hidden_units]
            attention_weights: The attention weights; [batch_size, max_len_src, 1]
        '''    
        


        # (1) Compute the attention scores for dec_hs & enc_output
        # permute from dimension [1, batch_size, hidden_units] to [batch_size, 1, hidden_units]
        dec_hs = dec_hs.permute(1, 0, 2) 
        
        # permute from dimension [max_len_src, batch_size, hidden_units] to [batch_size, max_len_src, hidden_units]
        enc_output = enc_output.permute(1, 0, 2) 
        
        # (2) Compute attention_weights by taking a softmax over the scores to normalize the distribution
        #  - Output size: [batch_size, max_len_src, 1]
        # In the following, the "+" is using bradasting to make W1Ht(dec_hs) dimensions 
        # from [batch_size, 1, hidden_units] to the same dimension as W2Hs(enc_output),
        # i.e. [batch_size, max_len_src, hidden_units]. This is for computation efficiency, avoid a loop. 
        # After going through self.Scores(), the output score's dimension is [batch_size, max_len_src, 1]
        scores = self.Scores(torch.tanh(self.W1Ht(dec_hs)+ self.W2Hs(enc_output)))
        
        # Softmax along the dim = 1, along the dimension of max_len_src, output dimension is [batch_size, max_len_src, 1]
        attention_weights = F.softmax(scores, dim = 1)

        # In the following, broadcasting makes attention_weights dimensions the same as enc_output for 
        # element-wise multiplication, i.e. the 3rd dimension of attention_weights changes from 1 to "hidden units". 
        # torch.sum adds along the 2nd dimension (dim =1), i.e. along "max_len_src", and keepsim = False, 
        # so the 2nd dimension of context_vector is squeezed, and its dimensions become [batch_size, hidden_units]
        context_vector = torch.sum(attention_weights * enc_output, dim = 1, keepdim = False)

        return context_vector, attention_weights

    def forward(self, x, dec_hs, enc_output):
        '''
        This function runs the decoder for a single time step.

        Args:
            x: Input token; [batch_size, 1]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            fc_out: (Unnormalized) output distribution [batch_size, vocab_size]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            attention_weights: The learned attention weights; [batch_size, max_len_src, 1]
        '''
 
        # (1)Compute the context vector & attention weights
        context_vector, attention_weights = self.compute_attention(dec_hs, enc_output)

        # (2) Obtain embedding vectors for input x, - Output size: [batch_size, 1, embedding_dim]      
        emb_vectors  = self.EmbeddingLayer(x)


        # (3) Concatenate the context vector & the embedding vectors along the appropriate dimension
        # unsqueeze to make context_vector dimention [batch_size, 1, embedding_dim], concat along the dimension
        # of hidden_units and embedding_dim, so output is [batch_size, 1, hidden_units+embedding_dim]
        ConcatVector = torch.cat((torch.unsqueeze(context_vector, 1), emb_vectors), dim = 2)

        # (4) Feed this result through the rnn (along with the current hidden state) to get output and new hidden state
        #            - Output sizes: [batch_size, 1, hidden_units] & [1, batch_size, hidden_units] 
        RnnOutput, dec_hs = self.rnn(ConcatVector.float(), dec_hs)
        
        # (5) Feed the output of the rnn through linear layer to get (unnormalized) output distribution (don't call softmax!)
        # squeeze to make RnnOutput dimension [batch_size, hidden_units] before passing into fully connected layer
        fc_out = self.FullyCnnctdLayer(torch.squeeze(RnnOutput, 1))

        return fc_out, dec_hs, attention_weights

## Train RNN Model

We will train the encoder and decoder using cross-entropy loss.

In [None]:
def loss_function(real, pred):
    mask = real.ge(1).float() # Only consider non-zero inputs in the loss
    
    loss_ = F.cross_entropy(pred, real) * mask 
    return torch.mean(loss_)

def train_rnn_model(encoder, decoder, dataset, optimizer, trg_vocab, device, n_epochs):
    batch_size = dataset.batch_size
    for epoch in range(n_epochs):
        start = time.time()
        n_batch = 0
        total_loss = 0
        
        encoder.train()
        decoder.train()
        
        for src, trg in tqdm(dataset):
            n_batch += 1
            loss = 0
            
            enc_output, enc_hidden = encoder(src.transpose(0,1).to(device))
            dec_hidden = enc_hidden
            
            # use teacher forcing - feeding the target as the next input (via dec_input)
            dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size)
        
            # run code below for every timestep in the ys batch
            for t in range(1, trg.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                loss += loss_function(trg[:, t].to(device), predictions.to(device))
                dec_input = trg[:, t].unsqueeze(1)
        
            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss
            
            optimizer.zero_grad()
            
            batch_loss.backward()

            # update model parameters
            optimizer.step()
        
        # Save checkpoint for model
        print('Epoch:{:2d}/{}\t Loss: {:.4f} \t({:.2f}s)'.format(epoch + 1, n_epochs, total_loss / n_batch, time.time() - start))

    print('Model trained!')

In [None]:
def count_parameters(model):
    """
    Count number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
if __name__ == '__main__':
    # HYPERPARAMETERS
    LEARNING_RATE = 0.001
    HIDDEN_UNITS=256
    N_EPOCHS=10

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  
    src_vocab_size = len(src_vocab.word2idx)
    trg_vocab_size = len(trg_vocab.word2idx)

    rnn_encoder = RnnEncoder(embedding_src, src_vocab_size, EMBEDDING_DIM, HIDDEN_UNITS).to(device)
    rnn_decoder = RnnDecoder(embedding_trg, trg_vocab_size, EMBEDDING_DIM, HIDDEN_UNITS).to(device)

    rnn_model_params = list(rnn_encoder.parameters()) + list(rnn_decoder.parameters())
    optimizer = torch.optim.Adam(rnn_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

    print('The rnn_encoder has {:,d} trainable parameters'.format(count_parameters(rnn_encoder)))
    print('The rnn_decoder has {:,d} trainable parameters'.format(count_parameters(rnn_decoder)))

Encoder and Decoder models initialized!
The rnn_encoder has 394,752 trainable parameters
The rnn_decoder has 1,337,688 trainable parameters


In [None]:
if __name__ == '__main__':
    train_rnn_model(rnn_encoder, rnn_decoder, train_dataset, optimizer, trg_vocab, device, N_EPOCHS)

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/10	 Loss: 1.7422 	(20.28s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/10	 Loss: 1.2758 	(20.17s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/10	 Loss: 1.0705 	(20.03s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/10	 Loss: 0.9280 	(20.05s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/10	 Loss: 0.8129 	(20.05s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/10	 Loss: 0.7177 	(20.19s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/10	 Loss: 0.6379 	(20.03s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/10	 Loss: 0.5689 	(19.96s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/10	 Loss: 0.5084 	(20.09s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/10	 Loss: 0.4563 	(20.13s)
Model trained!


## Inference (Decoding) Function

Now that the model has been trained, it can be used on test data.

The following inference function takes the trained model and a source sentence (Spanish), and returns its translation (English sentence). Instead of using the concept of teacher forcing, the input to the decoder at time step $t_i$ will be the prediction of the decoder at time $t_{i-1}$.

In [None]:
def decode_rnn_model(encoder, decoder, src, max_decode_len, trg_vocab, device):
    """
    Args:
        encoder: Your RnnEncoder object
        decoder: Your RnnDecoder object
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        trg_vocab: The Vocab_Lang object for the target language
        device: the device your torch tensors are on (you may need to call x.to(device) for some of your tensors)

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in your vocabulary at each time step
    """

    # Initialize variables
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))

    # Start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size)
    curr_output[:, 0] = dec_input.squeeze(1)
    
    # Obtain encoder output and hidden state by encoding src sentences
    enc_output, enc_hidden = encoder(src)
    dec_hidden = enc_hidden

    # At each time step, get the best prediction and save it
    for t in range(1, max_decode_len):
        # Obtain(unnormalized) prediction probabilities and hidden state by feeding dec_input (the best words 
        # from the previous time step), previous hidden state, and encoder output to decoder        
        predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))

        # Save the (unnormalized) prediction probabilities in curr_predictions at index t
        curr_predictions[:,t,:] = predictions

        # Obtain the new dec_input by selecting the most likely (highest probability) token
        dec_input = torch.argmax(predictions, dim=1).unsqueeze(1)

        # Save dec_input in curr_output at index t
        curr_output[:, t] = dec_input.squeeze(1)
        
    return curr_output, curr_predictions

Run the following cell to qualitatively compare some sentences generated fro the model with the some of the correct translations.

In [None]:
if __name__ == '__main__':
    idxes = random.choices(range(len(test_dataset.dataset)), k=5)
    src, trg =  train_dataset.dataset[idxes]
    curr_output, _ = decode_rnn_model(rnn_encoder, rnn_decoder, src.transpose(0,1).to(device), trg.size(1), trg_vocab, device)
    for i in range(len(src)):
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")

Source sentence: <start> tom me saca de mis <unk> . <end>
Target sentence: <start> tom gets on my <unk> . <end>
Predicted sentence: <start> tom <unk> my <unk> . <end>
----------------
Source sentence: <start> apuesto a que no lo hace . <end>
Target sentence: <start> i bet he doesn t make it . <end>
Predicted sentence: <start> i bet it s not the <unk> . <end>
----------------
Source sentence: <start> me preocupo . <end>
Target sentence: <start> i care . <end>
Predicted sentence: <start> i m angry . <end>
----------------
Source sentence: <start> me gusta tu articulo . <end>
Target sentence: <start> i like your article . <end>
Predicted sentence: <start> i like your tie . <end>
----------------
Source sentence: <start> que vengas . <end>
Target sentence: <start> come here . <end>
Predicted sentence: <start> have a nice . <end>
----------------


## Evaluate RNN Model

Use the following function to run the test set through the model and calculate BLEU scores. 
Read more about Bleu Score at :

1.   https://en.wikipedia.org/wiki/BLEU
2.   https://www.aclweb.org/anthology/P02-1040.pdf

In [None]:
def get_reference_candidate(target, pred, trg_vocab):
    def _to_token(sentence):
        lis = []
        for s in sentence[1:]:
            x = trg_vocab.idx2word[s]
            if x == "<end>": break
            lis.append(x)
        return lis
    reference = _to_token(list(target.numpy()))
    candidate = _to_token(list(pred.numpy()))
    return reference, candidate

def compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab):
    bleu_1 = 0.0
    bleu_2 = 0.0
    bleu_3 = 0.0
    bleu_4 = 0.0

    smoother = SmoothingFunction()
    save_reference = []
    save_candidate = []
    for i in range(len(target_tensor_val)):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], trg_vocab)
    
        bleu_1 += sentence_bleu(reference, candidate, weights=(1,), smoothing_function=smoother.method1)
        bleu_2 += sentence_bleu(reference, candidate, weights=(1/2, 1/2), smoothing_function=smoother.method1)
        bleu_3 += sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3), smoothing_function=smoother.method1)
        bleu_4 += sentence_bleu(reference, candidate, weights=(1/4, 1/4, 1/4, 1/4), smoothing_function=smoother.method1)

        save_reference.append(reference)
        save_candidate.append(candidate)
    
    bleu_1 = bleu_1/len(target_tensor_val)
    bleu_2 = bleu_2/len(target_tensor_val)
    bleu_3 = bleu_3/len(target_tensor_val)
    bleu_4 = bleu_4/len(target_tensor_val)

    scores = {"bleu_1": bleu_1, "bleu_2": bleu_2, "bleu_3": bleu_3, "bleu_4": bleu_4}
    print('BLEU 1-gram: %f' % (bleu_1))
    print('BLEU 2-gram: %f' % (bleu_2))
    print('BLEU 3-gram: %f' % (bleu_3))
    print('BLEU 4-gram: %f' % (bleu_4))

    return save_candidate, scores

def evaluate_rnn_model(encoder, decoder, test_dataset, target_tensor_val, trg_vocab, device):
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()
    
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0
            curr_output, curr_predictions = decode_rnn_model(encoder, decoder, src.transpose(0,1).to(device), trg.size(1), trg_vocab, device)
            for t in range(1, trg.size(1)):
                loss += loss_function(trg[:, t].to(device), curr_predictions[:,t,:].to(device))

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(1)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(1)))
            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg
            batch_loss = (loss / int(trg.size(1)))
            total_loss += batch_loss

        print('Loss {:.4f}'.format(total_loss / n_batch))
    
    # Compute BLEU scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [None]:
if __name__ == '__main__':
    rnn_save_candidate, rnn_scores = evaluate_rnn_model(rnn_encoder, rnn_decoder, test_dataset, trg_tensor_val, trg_vocab, device)

Loss 1.8789
BLEU 1-gram: 0.302554
BLEU 2-gram: 0.084362
BLEU 3-gram: 0.061512
BLEU 4-gram: 0.058509


# Step 4: Train a Transformer

Implement a transformer model for machine translation, and then train and evaluate its results. Here are some helpful links:
<ul>

1.  Original transformer paper: https://arxiv.org/pdf/1706.03762.pdf
2.  Helpful tutorial: http://jalammar.github.io/illustrated-transformer/ 
3. Another tutorial: http://peterbloem.nl/blog/transformers 
</ul>

In [None]:
import math

## <font color='red'>TODO:</font> Positional Embeddings

Similar to the RNN, we start with the Encoder model. A key component of the encoder is the Positional Embedding. As we know, word embeddings encode words in such a way that words with similar meaning have similar vectors. Because there are no recurrences in a Transformer, we need a way to tell the transformer the relative position of words in a sentence: so we will add a positional embedding to the word embeddings. Now, two words with a similar embedding will both be close in meaning and occur near each other in the sentence.

We will create a positional embedding matrix of size $(max\_len, embed\_dim)$ using the following formulae:
<br>
$\begin{align*} pe[pos,2i] &= \sin \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big )\\pe[pos,2i+1] &= \cos \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big ) \end{align*}$

In [None]:
def create_positional_embedding(max_len, embed_dim):
    '''
    Args:
        max_len: The maximum length supported for positional embeddings
        embed_dim: The size of your embeddings
    Returns:
        pe: [max_len, 1, embed_dim] computed as in the formulae above
    '''

    # create two arrays of [max_len,embed_dim] 
    # for numerators and denominators inside sin() and cos () of the pe[]formula
    array1 = np.fromfunction(lambda pos, i: pos, (max_len,embed_dim))
    array2 = np.fromfunction(lambda pos, i: ((i//2)*2/embed_dim), (max_len,embed_dim))

    # element-wise raise to power for denominators:
    array2 = np.power(10000, array2)

    # element-wise divide the two arrays:
    array1 = array1/array2
    # element-wise calculate sin and cos for even-numbered columns and odd-numbered columns
    sin_arr = np.sin(array1[:, 0::2])
    cos_arr = np.cos(array1[:, 1::2])

    # create and assemble pe[]
    array = np.empty(((max_len,embed_dim)))  #[max_len, embed_dim]
    array[:, 0::2] = sin_arr
    array[:, 1::2] = cos_arr
    pe = torch.from_numpy(array)
    pe = pe.unsqueeze(1)                  #[max_len, 1, embed_dim]
    return pe

## Encoder Model

Create the Encoder model for the transformer. Implement the `__init(...)` and `forward(...)` functions.

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, pretrained_emb, src_vocab_size, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_src, device):
        super(TransformerEncoder, self).__init__()
        self.device = device
        """
        Args:
            pretrained_emb: np.array, the pre-trained source embedding computed from compute_FastText_embeddings
            src_vocab_size: int, the size of the source vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of features in the encoder (??) hidden state 
            num_layers: the number of Transformer Encoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_src: maximum length of the source sentences
            device: the working device (may need to map the postional embedding to this device)
        """

        # Create positional embedding matrix
        # Note: obeseerved that the following .to(device) is necessary, otherwise 
        # "RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
        # will occur in forward() when adding self.position_embedding to word_embeddings. 

        self.position_embedding = create_positional_embedding(max_len_src, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that position_embedding is not a learnable parameter

        # Convert pretrained_emb from np.array to torch.FloatTensor
        pretrained_emb_cnvtd = torch.from_numpy(pretrained_emb)

        # Initialize embedding layer with pretrained_emb
        self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrained_emb_cnvtd)

        # Dropout layer
        self.DropoutLayer = nn.Dropout()
        
        # Initialize an nn.TransformerEncoder model (use embedding_dim, num_layers, num_heads, & dim_feedforward here)
        EncoderLayer = nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward)
        self.Encoder = nn.TransformerEncoder(EncoderLayer, num_layers)

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == 0 # padding idx
        return src_mask.to(self.device) # (batch_size, max_src_len)

    def forward(self, x):
        """
        Args:
            x: [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, embed_dim]
        """

        # Pass x through the word embedding
        word_embeddings = self.EmbeddingLayer(x)                # [max_len, batch_size, embed_dim]

        # Add positional embedding to the word embedding, then apply dropout
        # Adding [max_len, 1, embed_dim] to [max_len, batch_size, embed_dim], broasdcasting
        # adding the same position embedding to each sentence in a batch        
        embeddings = self.DropoutLayer(self.position_embedding + word_embeddings)

        # Call make_src_mask(x) to compute a mask: this tells us which indices in x
        # are padding, which we want to ignore for the self-attention
        src_mask = self.make_src_mask(x)

        # Call the encoder, with src_key_padding_mask = src_mask
        output = self.Encoder(embeddings.float(), src_key_padding_mask = src_mask)

        return output    

## Decoder Model
Implement a Decoder model, the `__init(...)` and `forward(...)` functions. Unlike the RNN, there is no need to explicitly compute inter-attention with the encoder; use the nn.TransformerDecoder model, which takes care this.

In [None]:
class TransformerDecoder(nn.Module):
    def __init__(self, pretrained_emb, trg_vocab_size, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_trg, device):
        super(TransformerDecoder, self).__init__()
        self.device = device
        """
        Args:
            pretrained_emb: np.array, the pre-trained target embedding computed from compute_FastText_embeddings
            trg_vocab_size: int, the size of the target vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of features in the decoder (??) hidden state 
            num_layers: the number of Transformer Decoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_trg: maximum length of the target sentences
            device: the working device (you may need to map your postional embedding to this device)
        """

        # Create positional embedding matrix
        self.position_embedding = create_positional_embedding(max_len_trg, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding) # this informs the model that positional_embedding is not a learnable parameter

        # Convert pretrained_emb from np.array to torch.FloatTensor
        pretrained_emb_cnvtd = torch.from_numpy(pretrained_emb)

        # Initialize embedding layer with pretrained_emb
        self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrained_emb_cnvtd)

        # Dropout layer
        self.DropoutLayer = nn.Dropout()
        
        # Initialize a nn.TransformerDecoder model (use embedding_dim, num_layers, num_heads, & dim_feedforward here)
        DecoderLayer = nn.TransformerDecoderLayer(embedding_dim, num_heads, dim_feedforward)
        self.Decoder = nn.TransformerDecoder(DecoderLayer, num_layers)

        # Final fully connected layer
        self.FullyCnnctdLayer = nn.Linear(embedding_dim, trg_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

        return mask

    def forward(self, dec_in, enc_out):
        """
        Args:
            dec_in: [sequence length, batch_size]
            enc_out: [max_len, batch_size, embed_dim]
        Returns:
            output: [sequence length, batch_size, trg_vocab_size]
        """

        # Compute input word and positional embeddings in similar manner to encoder
        word_embeddings = self.EmbeddingLayer(dec_in)                # [sequence length, batch_size, embed_dim]

        # In the following dec_in.size(dim=0) is equal to the first dimension's size of dec_in, 
        # which is "sequence length",  it is less or equal to max_len_trg. We need to take only the first n from the
        # first dimension of position_embedding, where n = dec_in.size(dim=0) = "sequence length", so that the
        # dimensions of word embedding and position embedding agree (with broadcasting) and the two can be added
        pos_embedding = self.position_embedding[0:dec_in.size(dim=0)]  # [sequence length, batch_size, embed_dim]
        embeddings = self.DropoutLayer(pos_embedding + word_embeddings)

        # Call generate_square_subsequent_mask() to compute a mask, the dimension of the mask
        # should be a square of sequence length by sequence length, where sequence length = dec_in.size(dim=0)
        # The mask is to prevent the decoder from attending to tokens in the "future".
        # In other words, at time step i, the decoder should only attend to tokens 1 to i-1.
        target_mask = self.generate_square_subsequent_mask(dec_in.size(dim=0))

        # add .to(self.device) to fix the RuntimeError: Expected all tensors to be on the same device, 
        # but found at least two devices, cuda:0 and cpu! 
        target_mask = target_mask.to(self.device)

        # Call the decoder, with trg_mask = trg_mask
        # the documentation of TransformerDecoder is incomplete and confusing, so the first time
        # I had the two first arguments "embeddings.float(), enc_out" swapped and had RuntimeError
        Decoder_Output = self.Decoder(embeddings.float(), enc_out, tgt_mask = target_mask)

        # Run the output through the fully-connected layer and return it
        output = self.FullyCnnctdLayer(Decoder_Output)
 
        return output    

## Train Transformer Model

Like the RNN, we train the encoder and decoder using cross-entropy loss.

In [None]:
def train_transformer_model(encoder, decoder, dataset, optimizer, device, n_epochs):
    encoder.train()
    decoder.train()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    for epoch in range(n_epochs):
        start = time.time()
        losses = []

        for src, trg in tqdm(train_dataset):
            
            src = src.to(device).transpose(0,1) # [max_src_length, batch_size]
            trg = trg.to(device).transpose(0,1) # [max_trg_length, batch_size]

            enc_out = encoder(src)
            output = decoder(trg[:-1, :], enc_out)

            output = output.reshape(-1, output.shape[2])
            trg = trg[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, trg)
            losses.append(loss.item())

            loss.backward()

            # Clip to avoid exploding grading issues
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

            optimizer.step()

        mean_loss = sum(losses) / len(losses)
        print('Epoch:{:2d}/{}\t Loss:{:.4f} ({:.2f}s)'.format(epoch + 1, n_epochs, mean_loss, time.time() - start))


In [None]:
if __name__ == '__main__':
    # HYPERPARAMETERS - feel free to play around
    LEARNING_RATE = 0.001
    DIM_FEEDFORWARD=512
    N_EPOCHS=10
    N_HEADS=2
    N_LAYERS=2

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer_encoder = TransformerEncoder(embedding_src, src_vocab_size, EMBEDDING_DIM, N_HEADS, 
                                 N_LAYERS,DIM_FEEDFORWARD,
                                 max_length_src, device).to(device)
    transformer_decoder = TransformerDecoder(embedding_trg, trg_vocab_size, EMBEDDING_DIM, N_HEADS, 
                              N_LAYERS,DIM_FEEDFORWARD,
                              max_length_trg, device).to(device)

    transformer_model_params = list(transformer_encoder.parameters()) + list(transformer_decoder.parameters())
    optimizer = torch.optim.Adam(transformer_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [None]:
if __name__ == '__main__':
    train_transformer_model(transformer_encoder, transformer_decoder, train_dataset, optimizer, device, N_EPOCHS)

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/15	 Loss:3.4171 (18.35s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/15	 Loss:2.8673 (18.00s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/15	 Loss:2.6347 (18.17s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/15	 Loss:2.4588 (18.17s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/15	 Loss:2.3299 (18.21s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/15	 Loss:2.2157 (18.22s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/15	 Loss:2.1284 (18.27s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/15	 Loss:2.0417 (18.28s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/15	 Loss:1.9666 (18.30s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/15	 Loss:1.8992 (18.33s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:11/15	 Loss:1.8418 (18.32s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:12/15	 Loss:1.7868 (18.30s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:13/15	 Loss:1.7413 (18.34s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:14/15	 Loss:1.7029 (18.38s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:15/15	 Loss:1.6633 (18.36s)


## Inference (Decoding) Function

Now that the model has been trained, it can be used on test data.

The inference function takes the trained transformer model and a source sentence (Spanish), and returns its translation (English sentence). Like the RNN, we use the prediction of the decoder as the input to the decoder for the sequence of outputs. For the RNN, at time step $t_i$ the decoder takes the hidden state $h_{i-1}$ and the previous prediction $w_{i-1}$ at each time step. However, because the transformer does not use recurrences, we do not pass a hidden state; instead, at time step $t_i$ we pass $w_1,w_2 \cdots w_{i-1}$, which is the entire sequence predicted so far.

In [None]:
def decode_transformer_model(encoder, decoder, src, max_decode_len, trg_vocab, device):
    """
    Args:
        encoder: Your RnnEncoder object
        decoder: Your RnnDecoder object
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        trg_vocab: The Vocab_Lang object for the target language
        device: the device your torch tensors are on (you may need to call x.to(device) for some of your tensors)

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized) probabilities of each
            token in your vocabulary at each time step
     """

    # Initialize variables
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))

    # We start the decoding with the start token for each example
    dec_input = torch.tensor([[trg_vocab.word2idx['<start>']]] * batch_size).transpose(0,1)
    curr_output[:, 0] = dec_input.squeeze(1)
    
    # Obtain encoder output by encoding src sentences
    enc_output = encoder(src)

    # At each time step, get the best prediction and save it
    for t in range(1, max_decode_len):
        # Obtain dec_input as the best words so far for previous time steps (you can get this from curr_output)
        dec_input = curr_output[:,:t].transpose(0,1)
        # Obtain the (unnormalized) prediction probabilities by feeding dec_input and encoder output to decoder
        output = decoder(dec_input.int().to(device), enc_output)
        output = output[-1]
        # Save the (unnormalized) prediction probabilities in curr_predictions at index t
        curr_predictions[:,t,:] = output
        # Calculate the most likely (highest probability) token and save in curr_output at timestep t
        predictions = torch.argmax(output, dim=1)
        curr_output[:, t] = predictions

    return curr_output, curr_predictions

Run the following cell to qualitatively compare some of the sentences the model generates with the some of the correct translations.

In [None]:
if __name__ == '__main__':
    idxes = random.choices(range(len(test_dataset.dataset)), k=5)
    src, trg =  train_dataset.dataset[idxes]
    curr_output, _ = decode_transformer_model(transformer_encoder, transformer_decoder, src.transpose(0,1).to(device), trg.size(1), trg_vocab, device)
    for i in range(len(src)):
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")

Source sentence: <start> ello ha estado ahi por un rato . <end>
Target sentence: <start> it s been there a while . <end>
Predicted sentence: <start> please can come and draw a while . <end> . <end>
----------------
Source sentence: <start> ¿ puedes resolver este problema ? <end>
Target sentence: <start> can you do this problem ? <end>
Predicted sentence: <start> who intervened this juice ? <end> ? <end> <end> <end> <end>
----------------
Source sentence: <start> ¿ quien intervino ? <end>
Target sentence: <start> who intervened ? <end>
Predicted sentence: <start> where are you going ? <end> ? <end> you ? <end>
----------------
Source sentence: <start> os <unk> . <end>
Target sentence: <start> i watched you . <end>
Predicted sentence: <start> we re <unk> . <end> . <end> . <end> . <end>
----------------
Source sentence: <start> tom pudo haber sido <unk> . <end>
Target sentence: <start> tom could ve been killed . <end>
Predicted sentence: <start> tom paid be <unk> . <end> . <end> . <end> .

## Evaluate Transformer Model

Run the test set through the transformer model, compute the BLEU scores. 


In [None]:
def evaluate_model(encoder, decoder, test_dataset, target_tensor_val, trg_vocab, device):
    batch_size = test_dataset.batch_size
    n_batch = 0
    total_loss = 0

    encoder.eval()
    decoder.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    losses=[]
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            loss = 0
            
            src, trg = src.transpose(0,1).to(device), trg.transpose(0,1).to(device)
            curr_output, curr_predictions = decode_transformer_model(encoder, decoder, src, trg.size(0), trg_vocab, device)

            for t in range(1, trg.size(0)):
                loss += criterion(curr_predictions[:,t,:].to(device), trg[t,:].reshape(-1).to(device))

            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(0)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(0)))

            final_output[batch*batch_size:(batch+1)*batch_size] = curr_output
            target_output[batch*batch_size:(batch+1)*batch_size] = trg.transpose(0,1)
            losses.append(loss.item() / (trg.size(0)-1))

        mean_loss = sum(losses) / len(losses)
        print('Loss {:.4f}'.format(mean_loss))
    
    # Compute Bleu scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [None]:
if __name__ == '__main__':
    transformer_save_candidate, transformer_scores = evaluate_model(transformer_encoder, transformer_decoder, test_dataset, trg_tensor_val, trg_vocab, device)

Loss 3.2614
BLEU 1-gram: 0.307520
BLEU 2-gram: 0.086041
BLEU 3-gram: 0.063189
BLEU 4-gram: 0.060484
