<div style="line-height:0.5">
<h1 style="color:#BF66F2 "> Natural Language Processing in PyTorch 2 </h1>
<h4> NLP translation (ita to eng) with Bahdanau Attention Mechanism. </h4>
<h3 style="color:lightblue"> Keywords: </h3> matplotlib ticker + unicodedata + bottom-margin in markdown + RandomSampler + FixedLocator + set_yticklabels
</div>

In [None]:
#from __future__ import division, print_function, unicode_literals
#import print_function
#import unicode_literals
#import division

In [None]:
import re
import io
import time
import math
from io import open
import random
import numpy as np
import pandas as pd

from google.colab import files

import unicodedata

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FixedLocator, FixedFormatter

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Later cells read this module-level `device` when they create tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
#%%script echo uncomment if not on Colab

# Ask the user to upload a file (Colab-only helper imported from google.colab)
uploaded = files.upload()

## Read the contents of the uploaded file
# Only the first uploaded entry is used; its value is decoded from bytes as UTF-8.
file = next(iter(uploaded))
file_content = uploaded[file].decode('utf-8')

# Load the contents of the file into a pandas dataframe
# The file is tab-separated without a header row, so the column names are supplied here.
# NOTE(review): `df` is only used for the preview below; readLangs() opens the
# '<lang1>-<lang2>.txt' file from the working directory itself.
df = pd.read_csv(io.StringIO(file_content), sep='\t', header=None, names=['English', 'Italian'])

Saving eng-ita.txt to eng-ita.txt


In [None]:
# Preview the first rows to check that the two tab-separated columns were parsed.
df.head()

Unnamed: 0,English,Italian
0,Hi.,Ciao!
1,Run!,Corri!
2,Run!,Corra!
3,Run!,Correte!
4,Who?,Chi?


<h3 style="color:#BF66F2 "> => One-hot encoding </h3>

In [None]:
""" Create a language model for text dataset.
N.B.
Start of sequence SOS (first input to a neural network model) and End of sequence (EOS) are special tokens,\\
that are added to the beginning and end of a sentence.
"""
# Reserved vocabulary indices: 0 marks the start of a sequence, 1 marks its end.
# Lang.index2word pre-populates the same two slots, so real words start at index 2.
SOS_token, EOS_token = 0, 1

class Lang:
    """ Vocabulary container for one language.

    Attributes:
        - name: Label of the language [str]
        - word2index: Word -> integer index lookup [dict]
        - word2count: Word -> number of times the word was added [dict]
        - index2word: Integer index -> word lookup, pre-filled with "SOS" (0) and "EOS" (1) [dict]
        - n_words: Size of the vocabulary, including the two special tokens [int]

    Methods:
        - addSentence(self, sentence): Register every space-separated word of a sentence
        - addWord(self, word): Register a single word
    """
    def __init__(self, name):
        """ Create an empty vocabulary that only knows the SOS and EOS tokens. """
        self.name = name
        self.word2index, self.word2count = {}, {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # The two special tokens already occupy indices 0 and 1.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """ Register each word of the sentence; words are obtained by splitting on a single space. """
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """ Register one word.

        A known word only gets its counter incremented.
        A new word receives the next free index and a counter of 1.
        """
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

<div style="line-height:0.1">

<h4 style="color:#BF66F2 "><b> Recap: </b> </h4>
</div>
The 'unicodedata' module provides access to the Unicode Character Database (UCD), which defines character properties <br>
for all Unicode characters. The UCD version bundled with the module depends on the Python release (for example, 14.0.0 in Python 3.11). <br>
It supports all of the world’s writing systems and ensures that data can be retrieved or combined using any combination of languages. <br>
The module uses the same names and symbols as defined by Unicode Standard Annex #44, “Unicode Character Database”. <br>

<h4 style="color:#BF66F2; margin-top: 5px;"> Common functions: </h4>

<div style="margin-top: -15px;">

- unicodedata.lookup(name) => Look up character by name. 
- unicodedata.name(chr[, default]) => Returns the name assigned to the character chr as a string.
- unicodedata.decimal(chr[, default]) => Returns the decimal value assigned to the character chr as integer.
- unicodedata.digit(chr[, default]) => Returns the digit value assigned to the character chr as integer.
- unicodedata.numeric(chr[, default]) => Returns the numeric value assigned to the character chr as float.
- unicodedata.category(chr) => Returns the general category assigned to the character chr as string.
- unicodedata.bidirectional(chr) => Returns the bidirectional class assigned to the character chr as string. 
- unicodedata.combining(chr) => Returns the canonical combining class assigned to the character chr as integer.
- unicodedata.east_asian_width(chr) => Returns the east asian width assigned to the character chr as string.
- unicodedata.mirrored(chr) => Returns the mirrored property assigned to the character chr as integer.
- unicodedata.decomposition(chr) => Returns the character decomposition mapping assigned to the character chr as string. 
- unicodedata.normalize(form, unistr) => Returns the normal form given by form ('NFC', 'NFKC', 'NFD' or 'NFKD') of the Unicode string unistr. 
</div>

In [None]:
def unicodeToAscii(s):
    """ Strip accents from a Unicode string.

    Details:
        1. Decompose the string with the 'NFD' normal form, which splits an accented character
           into its base character followed by the combining accent mark.
        2. Drop every character whose Unicode category is 'Mn' (non-spacing mark).
    N.B.
    Only combining marks are removed: a non-ASCII character that has no such decomposition is kept as is.

    Returns:
        The string without combining marks [str]
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """ Normalize a raw sentence before it is added to a vocabulary.

    Details:
        - Lowercase the string and strip surrounding whitespace
        - Remove accents through unicodeToAscii
        - Put a space in front of every '.', '!' and '?'
        - Collapse every run of characters other than letters, '!' and '?' into one space
          (this also removes the periods, digits and apostrophes)
        - Strip surrounding whitespace again

    Returns:
        The normalized sentence [str]
    """
    lowered = s.lower().strip()
    without_accents = unicodeToAscii(lowered)
    spaced_punctuation = re.sub(r"([.!?])", r" \1", without_accents)
    letters_only = re.sub(r"[^a-zA-Z!?]+", r" ", spaced_punctuation)
    return letters_only.strip()

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """ Read the sentence pairs of a corpus file and create the two (still empty) Lang objects.

    Parameters:
        - "lang1" and "lang2": language names; the file '<lang1>-<lang2>.txt' is read from the working directory [str]
        - "reverse": when True every pair is reversed and the two languages swap roles [bool]

    Details:
        - Read the whole file as UTF-8 and split it into lines
        - Split every line on tabs and normalize each field with normalizeString
        - With "reverse" the input language is lang2 and the output language is lang1

    Returns:
        Input and output Lang objects and the list of pairs

    Raises:
        OSError (e.g. FileNotFoundError) when the corpus file cannot be opened
    """
    # The context manager closes the file handle as soon as the text is read,
    # instead of leaving it open until garbage collection.
    with open('%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
""" Trim the data set to only relatively short and simple sentences. """

# filterPair keeps a pair only when both sentences split into fewer than MAX_LENGTH words.
# The decoders also run exactly MAX_LENGTH decoding steps.
MAX_LENGTH = 10
# Prefixes checked with str.startswith() against the second sentence of each pair.
# Entries written without a trailing space (e.g. "he is") also match longer
# continuations of the same letters.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re ")

def filterPair(p):
    """ Decide whether a sentence pair is short and simple enough to keep.

    Parameters:
        p: A pair of sentences; p[0] and p[1] are read [list or tuple of str]

    Returns:
        Boolean value => True when both sentences split into fewer than MAX_LENGTH words
        and the second sentence starts with one of the prefixes in eng_prefixes,
        False otherwise.
    """
    # Guard clauses keep the original short-circuit order: p[1] is only read once p[0] passed.
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    """ Keep only the sentence pairs accepted by filterPair.

    Parameters:
        pairs [list]: Sentence pairs

    Returns:
        New list with the accepted pairs, in their original order
    """
    return list(filter(filterPair, pairs))

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """ Load the corpus, trim it to the accepted pairs and build both vocabularies.

    Parameters:
        - lang1: The name of the first language in the file [str]
        - lang2: The name of the second language in the file [str]
        - reverse: Whether to reverse the order of the language pairs [bool]

    Details:
        - Read and normalize all the pairs with readLangs
        - Drop the pairs rejected by filterPairs
        - Add the first sentence of each kept pair to the input vocabulary and the second to the output vocabulary
        - Print progress messages and the final vocabulary sizes

    Returns:
        Input_lang, output_lang, and the filtered pairs
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs..." % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for sentence_pair in kept_pairs:
        input_lang.addSentence(sentence_pair[0])
        output_lang.addSentence(sentence_pair[1])

    print("Num of words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, kept_pairs

In [None]:
# reverse=True flips every pair, so Italian becomes the input language and English the output language.
input_lang, output_lang, pairs = prepareData('eng', 'ita', True)

Read 331799 sentence pairs...
Trimmed to 32084 sentence pairs
Counting words...
Num of words:
ita 5400
eng 3168


In [None]:
# Sanity check: show one random pair from the `pairs` list built by the previous cell.
# The corpus is not re-read here; a second prepareData() call would rebuild identical objects.
print(random.choice(pairs))

Read 331799 sentence pairs...
Trimmed to 32084 sentence pairs
Counting words...
Num of words:
ita 5400
eng 3168
['stai facendo dell ottima roba qui', 'you re doing great stuff here']


<h3 style="color:#BF66F2 "> => Seq2seq Network: Encoder-Decoder objects </h3>

In [None]:
class EncoderRNN(nn.Module):
    """ Encoder half of the sequence-to-sequence model: embedding -> dropout -> GRU.

    Parameters of the constructor:
        - input_size [int]: The size of the input vocabulary.
        - hidden_size [int]: The size of the embeddings and of the GRU hidden state.
        - dropout_p [float]: The dropout probability applied to the embeddings. Default 0.1.

    Attributes:
        - hidden_size: The value given to the constructor [int]
        - embedding: Token embedding layer [nn.Embedding]
        - gru: Single-layer, batch-first GRU [nn.GRU]
        - dropout: Dropout layer [nn.Dropout]
    """
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """ Build the layers of the encoder. """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """ Encode a batch of token index sequences.

        Parameters:
            Input tensor of token indices, shape (batch_size, seq_length)

        Returns:
            - The GRU output for every position, shape (batch_size, seq_length, hidden_size)
            - The final hidden state, shape (1, batch_size, hidden_size)
        """
        token_vectors = self.embedding(input)
        regularized = self.dropout(token_vectors)
        # nn.GRU already returns the (output, hidden) tuple.
        return self.gru(regularized)

In [None]:
class DecoderRNN(nn.Module):
    """ Plain (attention-free) decoder half of the sequence-to-sequence model.

    Parameters of the constructor:
        - hidden_size: Size of the embeddings and of the GRU hidden state [int]
        - output_size: Size of the output vocabulary [int]

    Attributes:
        - embedding: Embedding layer for the output tokens [nn.Embedding]
        - gru: Single-layer, batch-first GRU [nn.GRU]
        - out: Linear projection from the hidden size to the vocabulary size [nn.Linear]
    """
    def __init__(self, hidden_size, output_size):
        """ Build the layers of the decoder. """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """ Decode MAX_LENGTH tokens, starting from the SOS token.

        Parameters:
            - The output tensor from the encoder; only its batch size (dimension 0) is used here.
            - The hidden state tensor from the encoder, used as the initial decoder hidden state.
            - The target tensor of token indices with at least MAX_LENGTH columns (default: None)

        Details:
            - The loop always runs exactly MAX_LENGTH steps; it does not stop early on EOS.
            - With a target tensor ("teacher forcing") the ground-truth token of step i is fed
              as the input of step i + 1.
            - Without a target tensor the most likely token predicted at step i is fed
              as the input of step i + 1, detached from the autograd history.
            - The logits of all steps are concatenated and turned into log-probabilities.

        Returns:
            - Log-probabilities of shape (batch_size, MAX_LENGTH, output_size)
            - Final hidden state tensor
            - None placeholder, so the return signature matches the attention decoder
        """
        batch_size = encoder_outputs.size(0)
        use_teacher_forcing = target_tensor is not None

        step_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long, device=device)
        state = encoder_hidden
        step_logits = []

        for step in range(MAX_LENGTH):
            logits, state = self.forward_step(step_input, state)
            step_logits.append(logits)

            if use_teacher_forcing:
                # Next input is the ground-truth token of this step.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the model's own best guess, cut off from the graph.
                _, best = logits.topk(1)
                step_input = best.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)

        return log_probs, state, None

    def forward_step(self, input, hidden):
        """ Run one decoding step: embedding -> ReLU -> GRU -> linear projection.

        Parameters:
            - Input tensor of token indices, shape (batch_size, 1)
            - Hidden state tensor of shape (1, batch_size, hidden_size)

        Returns:
            Logits tensor of shape (batch_size, 1, output_size)
            Updated hidden state tensor of shape (1, batch_size, hidden_size)
        """
        activated = F.relu(self.embedding(input))
        gru_out, hidden = self.gru(activated, hidden)

        return self.out(gru_out), hidden

<h3 style="color:#BF66F2 "> Attention Decoder: </h3>
<div style="margin-top: -20px;">
Bahdanau attention, also known as additive attention, is an attention mechanism in sequence-to-sequence models. <br>
It employs a learned alignment model to compute attention scores between the encoder and decoder hidden states.<br>
It utilizes a feed-forward neural network to calculate alignment scores.
</div>

In [None]:
class BahdanauAttention(nn.Module):
    """ Additive (Bahdanau) attention.

    Args:
        - Size of the hidden state of the decoder [int]

    Attributes:
        - Wa: Linear layer applied to the query [nn.Linear]
        - Ua: Linear layer applied to the keys [nn.Linear]
        - Va: Linear layer that reduces each combined vector to one score [nn.Linear]

    Methods:
        forward(self, query, keys): Compute the context vector and the attention weights
    """
    def __init__(self, hidden_size):
        """ Build the three linear layers. """
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """ Attend over the keys with the given query.

        Parameters:
            - Query tensor of shape (batch_size, 1, hidden_size)
            - Keys tensor of shape (batch_size, seq_length, hidden_size)

        Details:
            - Project query and keys, add them (the query broadcasts over seq_length) and apply tanh
            - Reduce every position to one score: (batch_size, seq_length, 1)
            - Move the sequence axis last: (batch_size, 1, seq_length)
            - Softmax over the sequence axis gives the attention weights
            - The context is the weighted sum of the keys

        Returns:
            - Context tensor of shape (batch_size, 1, hidden_size)
            - Attention weights tensor of shape (batch_size, 1, seq_length)
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energies = self.Va(torch.tanh(projected_query + projected_keys))

        # (batch, seq, 1) -> (batch, 1, seq)
        scores = energies.transpose(1, 2)
        weights = F.softmax(scores, dim=-1)

        return torch.bmm(weights, keys), weights


class AttnDecoderRNN(nn.Module):
    """ Decoder half of the sequence-to-sequence model, with Bahdanau attention.

    Args:
        - Size of the hidden state of the decoder [int]
        - Size of the output vocabulary [int]
        - Dropout probability [float, optional (Default is 0.1)]

    Attributes:
        - embedding: Embedding layer for the output tokens [nn.Embedding]
        - attention: Bahdanau Attention module [BahdanauAttention]
        - gru: Batch-first GRU whose input is the embedding concatenated with the context,
          hence an input size of 2 * hidden_size [nn.GRU]
        - out: Linear layer that maps the GRU output to vocabulary logits [nn.Linear]
        - dropout: Dropout layer applied to the embeddings [nn.Dropout]
    """
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """ Build the layers of the decoder. """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """ Decode MAX_LENGTH tokens, attending over the encoder outputs at every step.

        Parameters:
            - Output tensor from the encoder of shape (batch_size, seq_length, hidden_size).
            - Hidden state tensor from the encoder of shape (1, batch_size, hidden_size).
            - Target tensor of token indices with at least MAX_LENGTH columns. (default: None)

        Details:
            - The first input is the SOS token and the loop always runs MAX_LENGTH steps.
            - With a target tensor (teacher forcing) the ground-truth token of each step is the next input.
            - Without it the most likely predicted token, detached from the graph, is the next input.

        Returns:
            - Log-probabilities of shape (batch_size, MAX_LENGTH, output_size).
            - Final hidden state tensor of shape (1, batch_size, hidden_size).
            - Attention weights of all steps, shape (batch_size, MAX_LENGTH, seq_length).
        """
        batch_size = encoder_outputs.size(0)
        use_teacher_forcing = target_tensor is not None

        step_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long, device=device)
        state = encoder_hidden
        step_logits, step_attentions = [], []

        for step in range(MAX_LENGTH):
            logits, state, attn_weights = self.forward_step(step_input, state, encoder_outputs)
            step_logits.append(logits)
            step_attentions.append(attn_weights)

            if use_teacher_forcing:
                # Next input is the ground-truth token of this step.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the model's own best guess, cut off from the graph.
                _, best = logits.topk(1)
                step_input = best.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        attentions = torch.cat(step_attentions, dim=1)

        return log_probs, state, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """ Run one decoding step with attention.

        Parameters:
            - The input tensor of token indices, shape (batch_size, 1)
            - The hidden state tensor of shape (1, batch_size, hidden_size)
            - The output tensor from the encoder of shape (batch_size, seq_length, hidden_size)

        Details:
            - Embed the input tokens and apply dropout
            - Swap the first two axes of the hidden state to obtain the batch-first query
            - Attend over the encoder outputs to get the context vector and the attention weights
            - Concatenate embedding and context along the feature axis and feed them to the GRU
            - Project the GRU output to vocabulary logits

        Returns:
            - Output logits tensor of shape (batch_size, 1, output_size)
            - Updated hidden state tensor of shape (1, batch_size, hidden_size)
            - Attention weights tensor of shape (batch_size, 1, seq_length)
        """
        embedded = self.dropout(self.embedding(input))
        # (1, batch, hidden) -> (batch, 1, hidden)
        query = hidden.transpose(0, 1)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((embedded, context), dim=2)
        gru_out, hidden = self.gru(gru_input, hidden)

        return self.out(gru_out), hidden, attn_weights

<h2 style="color:#BF66F2 "> Training </h2>

In [None]:
""" Preparing Training Data.
N.B.1
Need an input tensor (indexes of the words in the input sentence) + 
target tensor (indexes of the words in the target sentence).
N.B.2
Append the EOS token to both sequences.
"""
def indexesFromSentence(lang, sentence):
    """ Translate a sentence into the list of its word indices.

    Parameters:
        - Language object; only its word2index mapping is read
        - Sentence to convert, split on single spaces [str]

    Returns:
        Word indices, one per word and in sentence order [list]

    Raises:
        KeyError when a word is missing from lang.word2index
    """
    indices = []
    for token in sentence.split(' '):
        indices.append(lang.word2index[token])
    return indices

def tensorFromSentence(lang, sentence):
    """ Build the index tensor of a sentence, terminated by the EOS token.

    Parameters:
        - Language object representing the language
        - Sentence to convert [str]

    Returns:
        Long tensor of word indices on `device`, with shape (1, number_of_words + 1)
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # Add the leading batch axis of size 1.
    return flat.view(1, -1)

def tensorsFromPair(pair):
    """ Build the input and target index tensors of one sentence pair.

    The module-level input_lang and output_lang vocabularies are used for the lookup.

    Parameters:
        Pair of sentences: pair[0] is the input sentence, pair[1] the target sentence.

    Returns:
        (input_tensor, target_tensor), each of shape (1, number_of_words + 1) [tuple]
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

def get_dataloader(batch_size):
    """ Build the training DataLoader together with freshly prepared vocabularies.

    Parameters:
        Batch size for the DataLoader [int]

    Details:
        - Call prepareData('eng', 'ita', True), which reads and filters the corpus again;
          the returned objects are local and shadow the module-level ones inside this function
        - Encode every sentence as word indices followed by EOS_token, written into a row
          of a zero-initialised (n_pairs, MAX_LENGTH) array, so unused positions stay 0
        - Wrap both arrays in a TensorDataset of long tensors moved to `device`
        - Serve the dataset through a RandomSampler

    Returns:
        Input, output Language objects + training DataLoader
    """
    input_lang, output_lang, pairs = prepareData('eng', 'ita', True)

    num_pairs = len(pairs)
    input_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)

    for row, (inp, tgt) in enumerate(pairs):
        source_row = indexesFromSentence(input_lang, inp) + [EOS_token]
        target_row = indexesFromSentence(output_lang, tgt) + [EOS_token]
        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    train_data = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    train_dataloader = DataLoader(
        train_data,
        sampler=RandomSampler(train_data),
        batch_size=batch_size,
    )

    return input_lang, output_lang, train_dataloader


<div style="line-height:0.5">
<h2 style="color:#BF66F2 "> Actual Training:</h2>
</div>
The input sentence passes through the encoder; every output and the latest hidden state are tracked. <br> 
The decoder takes the &lt;SOS&gt; token as its first input, and the last hidden state of the encoder as its first hidden state. <br>

The outputs of teacher-forced networks read with coherent grammar but can wander far from the correct translation: the network has learned <br> to represent the output grammar and can pick up the meaning once the teacher tells it the first few words, 
but it has not properly learned <br> how to create the sentence from the translation in the first place. <br>
<div style="line-height:0.7">
<h3 style="color:#BF66F2 "> Steps </h3>
</div>
<div style="margin-top: -15px;">

- Start a timer    
- Initialize optimizers and criterion   
- Create set of training pairs    
- Start empty losses array for plotting   
</div>

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """ Run one pass over the training data and update both networks after every batch.

    Parameters:
        - DataLoader yielding (input_tensor, target_tensor) batches
        - Encoder object
        - Decoder object
        - optimizer for the Encoder
        - optimizer for the Decoder
        - loss function

    Details:
        For every batch:
        - Clear the gradients of both optimizers;
        - Encode the input, then decode with the target tensor (teacher forcing);
        - Flatten the decoder outputs to (batch * steps, vocabulary) and the targets to (batch * steps,)
          and compute the loss;
        - Backpropagate and step both optimizers;
        - Accumulate the scalar loss value.

    Returns:
        The average loss per batch for the epoch [float].

    Raises:
        ZeroDivisionError when the dataloader has no batches.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        vocab_size = decoder_outputs.size(-1)
        batch_loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
        batch_loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
def asMinutes(s):
    """ Format a duration in seconds as "<minutes>m <seconds>s".

    Parameters:
        - Time in seconds

    Details:
        - The whole minutes are the floor of s / 60
        - The seconds are what is left after removing those minutes
        - Both values are printed with %d, so fractional seconds are truncated

    Returns:
        Time in the format "Xm Ys" [str]
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """ Report the elapsed time and a linear estimate of the time still needed.

    Parameters:
        - Starting time as returned by time.time() (since) [float]
        - Completed fraction of the task, e.g. epoch / n_epochs [float]

    Details:
        - Elapsed time = now - since
        - Estimated total time = elapsed time / completed fraction
        - Estimated remaining time = estimated total time - elapsed time

    Returns:
        - "elapsed (- remaining)", both formatted by asMinutes [str]

    Raises:
        ZeroDivisionError when percent is 0.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def showPlot(points):
    """ Draw the recorded loss values as a line chart.

    Parameters:
        - Sequence of loss values, plotted against their position in the sequence

    Details:
        - Switch matplotlib to the 'agg' backend
        - Create one figure with one axes
        - Put the major y ticks at multiples of 0.2
        - Plot the points on that axes

    Returns:
        None
    """
    # Change the backend used for rendering plots
    # NOTE(review): 'agg' is a non-interactive backend; presumably the figure will not
    # be displayed inline after this switch - confirm that this is intended.
    plt.switch_backend('agg')
    # plt.subplots() already creates the figure. The former extra plt.figure() call
    # only left an empty figure open on every call.
    fig, ax = plt.subplots()
    # Put ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, print_every=100, plot_every=100):
    """ Train encoder and decoder for a number of epochs, printing progress and plotting the loss.

    Parameters:
        - DataLoader containing the training data.
        - Encoder object.
        - Decoder object.
        - Number of epochs to train for. [int]
        - Learning rate of both Adam optimizers. Default 0.001. [float]
        - Frequency (in epochs) at which to print the average loss. Default 100 [int] => print_every
        - Frequency (in epochs) at which to record the average loss for plotting. Default 100. [int] => plot_every

    Details:
        - Record the starting time
        - Create one Adam optimizer per network and an NLLLoss criterion
        - For every epoch run train_epoch and add its loss to two running totals
        - Every print_every epochs: print elapsed/remaining time, epoch, progress in percent
          and the average loss since the last print, then reset that total
        - Every plot_every epochs: store the average loss since the last record, then reset that total
        - Plot the stored averages with showPlot (the list is empty when n_epochs < plot_every)
    """
    started_at = time.time()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    plot_losses = []
    print_loss_total = plot_loss_total = 0

    for epoch in range(1, n_epochs + 1):
        epoch_loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += epoch_loss
        plot_loss_total += epoch_loss

        if epoch % print_every == 0:
            progress = epoch / n_epochs
            average_since_print = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                        epoch, progress * 100, average_since_print))

        if epoch % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """ Translate a single sentence with the given Encoder/Decoder pair.

    Parameters:
        - Encoder object
        - Decoder object
        - Input sentence to be translated
        - Lang object for the input language
        - Lang object for the output language

    Details:
        - Run everything with gradient tracking disabled
        - Turn the sentence into a tensor with tensorFromSentence
        - Run the Encoder, then hand its outputs and hidden state to the Decoder
        - Keep the top-1 index at every position of the Decoder outputs
        - Walk the indices and map each one to a word through output_lang.index2word
        - Stop at the first EOS_token, which is recorded as '<EOS>'

    Returns:
        Decoded words [list] and the attention weights exactly as returned by the Decoder
    """
    decoded_words = []

    with torch.no_grad():
        source_tensor = tensorFromSentence(input_lang, sentence)
        enc_outputs, enc_hidden = encoder(source_tensor)
        # The Decoder also returns its hidden state, which is not needed here
        dec_outputs, _, decoder_attn = decoder(enc_outputs, enc_hidden)

        # topk(1) returns (values, indices): only the indices are used
        best_ids = dec_outputs.topk(1)[1].squeeze()

        for token in best_ids:
            token_id = token.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

    return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """ Print the model translation of n randomly drawn sentence pairs.

    Every pair is drawn with random.choice from the notebook-level `pairs`,
    so the same pair can show up more than once.\\
    The notebook-level input_lang and output_lang are used for the translation.

    Parameters:
        - Encoder object.
        - Decoder object.
        - Number of sentences to evaluate. Default is 10.

    Details:
        - '>' marks the source sentence (first element of the pair)
        - '=' marks the reference translation (second element of the pair)
        - '<' marks the words decoded by the model
    """
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence, reference = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference)
        # The attention weights returned by evaluate are not needed here
        predicted_words = evaluate(encoder, decoder, source_sentence, input_lang, output_lang)[0]
        print('<', ' '.join(predicted_words))
        print('')

<h2 style="color:#BF66F2 "> Main #1 </h2>

In [None]:
# Hyper-parameters shared by the Encoder and the Decoder (hidden_size) and by the DataLoader (batch_size)
hidden_size = 128
batch_size = 32

# The two Lang objects and the DataLoader are kept as notebook-level names: later cells read them directly
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

# Both networks are sized from the n_words of their Lang object and moved to the selected device
# N.B. keep this order: no seed is set here, so reordering these calls would change the random initialisation
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

# 80 is the number of epochs; the average loss is printed and recorded for the plot every 5 epochs
train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

Read 331799 sentence pairs...
Trimmed to 32084 sentence pairs
Counting words...
Num of words:
ita 5400
eng 3168
1m 46s (- 26m 35s) (5 6%) 0.8322
3m 33s (- 24m 54s) (10 12%) 0.1661
5m 20s (- 23m 9s) (15 18%) 0.0822
7m 6s (- 21m 20s) (20 25%) 0.0574
8m 53s (- 19m 34s) (25 31%) 0.0465
10m 41s (- 17m 49s) (30 37%) 0.0409
12m 28s (- 16m 2s) (35 43%) 0.0371
14m 16s (- 14m 16s) (40 50%) 0.0349
16m 4s (- 12m 30s) (45 56%) 0.0332
17m 51s (- 10m 43s) (50 62%) 0.0320
19m 39s (- 8m 56s) (55 68%) 0.0310
21m 27s (- 7m 9s) (60 75%) 0.0300
23m 16s (- 5m 22s) (65 81%) 0.0292
25m 5s (- 3m 35s) (70 87%) 0.0292
26m 53s (- 1m 47s) (75 93%) 0.0286
28m 41s (- 0m 0s) (80 100%) 0.0281


In [None]:
""" Set dropout layers to eval mode """
# Switch both networks to evaluation mode before decoding (must come before evaluateRandomly)
encoder.eval()
decoder.eval()
# Print source ('>'), reference ('=') and model output ('<') for the default n=10 random pairs
evaluateRandomly(encoder, decoder)

> siamo matricole
= we re freshmen
< we re freshmen <EOS>

> non sono una celebrita
= i m not a celebrity
< i m not a celebrity <EOS>

> io ho un po paura
= i m a little scared
< i m a little scared <EOS>

> non sono interessato a farlo adesso
= i m not interested in doing that now
< i m not interested in doing that now <EOS>

> io sono ancora interessata a farlo
= i m still interested in doing that
< i m still interested in doing that <EOS>

> lui sta molto bene oggi
= he s doing very well today
< he s doing very well today <EOS>

> sono felice di sentirlo
= i m happy to hear it
< i m happy to hear you <EOS>

> voi siete manipolatori
= you re manipulative
< you re manipulative <EOS>

> tu sei oberata di lavoro
= you are overworked
< you are overworked <EOS>

> sei tutto quel che ho
= you re all i ve got
< you re all i ve got <EOS>



<h3 style="color:#BF66F2 "> => Visualizing Attention </h3>

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """ Plot the attention weights for a single input-output pair.

    Parameters:
        - Input sentence [str]
        - Decoded output words [list]
        - Attention weights for the input-output pair [Tensor]
          (assumed to hold one row per output word and one column per input word
          plus the final '<EOS>' - confirm against the caller)

    Details:
        - Create a figure with a single subplot
        - Display the attention weights as a matrix
        - Add a color bar to the figure
        - x-axis: one tick per label at positions 0, 1, 2, ... labelled with the
          input words followed by '<EOS>', rotated by 90 degrees
        - y-axis: one tick per label at positions 0, 1, 2, ... labelled with the
          output words

    N.B.
    FixedFormatter chooses a label by tick index, not by tick value, so each axis
    needs a FixedLocator with exactly one position per label.
    Do not replace the locators with ticker.MultipleLocator(1) afterwards: it also
    emits an off-screen tick at -1, which shifts every label by one cell
    (the old workaround was to prepend an empty '' label).
    Setting the locator explicitly also avoids the message
    "UserWarning: FixedFormatter should only be used together with FixedLocator"
    raised by set_xticklabels / set_yticklabels on automatic ticks.
    """
    fig, ax = plt.subplots()
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # x-axis: input words plus the end-of-sequence marker, one tick per label
    x_tick_labels = input_sentence.split() + ['<EOS>']
    x_ticks = list(range(len(x_tick_labels)))
    ax.xaxis.set_major_locator(FixedLocator(x_ticks))
    ax.xaxis.set_major_formatter(FixedFormatter(x_tick_labels))
    # Vertical labels, so that long words do not overlap
    ax.tick_params(axis='x', labelrotation=90)

    # y-axis: decoded words, one tick per label
    y_tick_labels = list(output_words)
    y_ticks = list(range(len(y_tick_labels)))
    ax.yaxis.set_major_locator(FixedLocator(y_ticks))
    ax.yaxis.set_major_formatter(FixedFormatter(y_tick_labels))

    plt.show()

def evaluateAndShowAttention(input_sentence):
    """ Translate one sentence, print the result and plot its attention weights.

    The notebook-level encoder, decoder, input_lang and output_lang are used.

    Parameters:
        Input sentence [str]

    Details:
        - Decode the sentence with evaluate
        - Print the input sentence and the decoded words joined by spaces
        - Plot only the attention rows of the words that were actually decoded
    """
    decoded, attn = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
    print('input =', input_sentence)
    translation = ' '.join(decoded)
    print('output =', translation)
    # Index 0 on the leading (presumably batch) dimension, then one row per decoded word
    attn_rows = attn[0, :len(decoded), :]
    showAttention(input_sentence, decoded, attn_rows)


<h2 style="color:#BF66F2 "> Main #2 </h2>

In [None]:
# Translate three sample sentences and plot the attention weights of each one
# (another sentence worth trying: 'io sono coraggioso')
evaluateAndShowAttention('io sono famelico')
print()
evaluateAndShowAttention('sono grata')
evaluateAndShowAttention('sei tutto quel che ho')

input = io sono famelico
output = i m famished <EOS>

input = sono grata
output = i m thankful for that <EOS>
input = sei tutto quel che ho
output = you re all i ve got <EOS>
