In [1]:
# Google Drive setup: expose the user's Drive inside the Colab runtime
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/AttentionModelsCoursera/W1_NMT_Attention/W1/por-eng

/content/drive/MyDrive/AttentionModelsCoursera/W1_NMT_Attention/W1/por-eng


In [3]:
import torch
import itertools
import numpy as np
import torch.nn as nn
from collections import Counter
from utils import (sentences, train_dataset, val_dataset, train_loader, val_loader,
                   tokenizer_eng, tokenizer_por, masked_loss, masked_acc, tokens_to_text)

# Data preparation

In [4]:
# `sentences` unpacks into the English corpus and its Portuguese translations
english_sentences, portuguese_sentences = sentences

# Show one aligned example (fifth from the end) in both languages
print(
    f"English (to translate) sentence:\n\n{english_sentences[-5]}\n",
    f"Portuguese (translation) sentence:\n\n{portuguese_sentences[-5]}",
    sep="\n",
)

English (to translate) sentence:

No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.

Portuguese (translation) sentence:

Não importa o quanto você tenta convencer os outros de que chocolate é baunilha, ele ainda será chocolate, mesmo que você possa convencer a si mesmo e poucos outros de que é baunilha.


In [5]:
# The raw corpora are no longer needed once previewed; drop the names
del portuguese_sentences, english_sentences, sentences

In [6]:
# Show the ten lowest-id entries of each vocabulary, ordered by token id
print(
    f"First 10 words of the english vocabulary:\n\n{sorted(tokenizer_eng.get_vocab().items(), key=lambda item: item[1])[:10]}\n",
    f"First 10 words of the portuguese vocabulary:\n\n{sorted(tokenizer_por.get_vocab().items(), key=lambda item: item[1])[:10]}",
    sep="\n",
)

First 10 words of the english vocabulary:

[('[PAD]', 0), ('[UNK]', 1), ('[EOS]', 2), ('[SOS]', 3), ('.', 4), ('tom', 5), ('i', 6), ('to', 7), ('you', 8), ('the', 9)]

First 10 words of the portuguese vocabulary:

[('[PAD]', 0), ('[UNK]', 1), ('[EOS]', 2), ('[SOS]', 3), ('.', 4), ('tom', 5), ('que', 6), ('o', 7), ('nao', 8), ('eu', 9)]


In [7]:
# Size of each vocabulary, read from its own tokenizer.
# (The Portuguese size was previously taken from the English tokenizer.)
vocab_size_por = tokenizer_por.get_vocab_size()
vocab_size_eng = tokenizer_eng.get_vocab_size()

print(f"Portuguese vocabulary is made up of {vocab_size_por} words")
print(f"English vocabulary is made up of {vocab_size_eng} words")

Portuguese vocabulary is made up of 12000 words
English vocabulary is made up of 12000 words


In [8]:
def word_to_id(token: str):
    """Look up `token` in the Portuguese tokenizer and return its vocabulary id.

    Thin wrapper around ``tokenizer_por.token_to_id``.
    NOTE(review): the result for a token that is not in the vocabulary is
    whatever the tokenizer returns -- confirm before relying on it.
    """
    return tokenizer_por.token_to_id(token)


def id_to_word(id: int):
    """Look up vocabulary id `id` in the Portuguese tokenizer and return its token.

    Thin wrapper around ``tokenizer_por.id_to_token``. The parameter name
    shadows the builtin ``id`` inside this function.
    NOTE(review): the result for an id outside the vocabulary is whatever the
    tokenizer returns -- confirm before relying on it.
    """
    return tokenizer_por.id_to_token(id)

In [9]:
# Resolve the ids of the special tokens and of one ordinary word in a single pass
unk_id, sos_id, eos_id, baunilha_id = map(
    word_to_id, ("[UNK]", "[SOS]", "[EOS]", "baunilha")
)

print(
    f"The id for the [UNK] token is {unk_id}",
    f"The id for the [SOS] token is {sos_id}",
    f"The id for the [EOS] token is {eos_id}",
    f"The id for baunilha (vanilla) is {baunilha_id}",
    sep="\n",
)

The id for the [UNK] token is 1
The id for the [SOS] token is 3
The id for the [EOS] token is 2
The id for baunilha (vanilla) is 5242


In [10]:
# Pull a single batch; each batch unpacks as ((source, right-shifted target), target)
first_batch = next(iter(train_loader))
(to_translate, sr_translation), translation = first_batch

# Token ids of the first example in the batch
print(
    f"Tokenized english sentence:\n{to_translate[0, :].numpy()}\n\n",
    f"Tokenized portuguese sentence (shifted to the right):\n{sr_translation[0, :].numpy()}\n\n",
    f"Tokenized portuguese sentence:\n{translation[0, :].numpy()}\n\n",
    sep="\n",
)

print()

# Padded sequence lengths of the source and of the decoder input
print(
    f"Len of Tokenized english sentence:\n{len(to_translate[0, :].numpy())}\n\n",
    f"Len of Tokenized portuguese sentence (shifted to the right):\n{len(sr_translation[0, :].numpy())}\n\n",
    sep="\n",
)


Tokenized english sentence:
[   3  173   46   66  282   66   22 2167  793    4    2    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


Tokenized portuguese sentence (shifted to the right):
[  3 103 171   6  12 744 378   4   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]


Tokenized portuguese sentence:
[103 171   6  12 744 378   4   2   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]



Len of Tokenized english sentence:
42


Len of Tokenized portuguese sentence (shifted to the right):
49




In [11]:
# Batch size = extent of the first dimension
print(to_translate.size(0))

64


# NMT model with attention

In [12]:
# Model hyper-parameters: embedding-table size and hidden width used by every layer
VOCAB_SIZE, UNITS = 12_000, 256

In [13]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a bidirectional LSTM.

    Args:
        vocab_size: number of entries in the embedding table.
        units: embedding width and LSTM hidden size (per direction).
    """

    def __init__(self, vocab_size, units):
        super().__init__()

        # Token id 0 is declared as the padding index of the embedding
        self.embedding = nn.Embedding(vocab_size, units, padding_idx=0)
        self.rnn = nn.LSTM(units, units, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of token ids, batch dimension first.

        Returns:
            Tensor with one `units`-wide vector per input position: the sum
            of the forward-direction and backward-direction LSTM outputs.
        """
        x = self.embedding(x)
        x, _ = self.rnn(x)
        # Summarizing the bidirectional RNNs to follow the TF version.
        # The split point comes from this instance's own hidden size rather
        # than the notebook-level UNITS constant, so the class works for any
        # `units` and outside the notebook.
        units = self.rnn.hidden_size
        forward_output = x[:, :, :units]
        backward_output = x[:, :, units:]

        return forward_output + backward_output

In [14]:
# Smoke test: run the sample batch through a freshly initialised encoder
encoder = Encoder(VOCAB_SIZE, UNITS)
encoder_output = encoder(to_translate)

print(
    f'Tensor of sentences in english has shape: {to_translate.shape}\n',
    f'Encoder output has shape: {encoder_output.shape}',
    sep="\n",
)

Tensor of sentences in english has shape: torch.Size([64, 42])

Encoder output has shape: torch.Size([64, 42, 256])


# Cross Attention

In [15]:
class CrossAttention(nn.Module):
    """Single-head attention from the target over the context, with a
    residual connection and layer normalisation.

    Args:
        units: feature width of the context, the target and the output.
    """

    def __init__(self, units):
        super().__init__()

        self.mha = nn.MultiheadAttention(units, 1, batch_first=True)
        self.layernorm = nn.LayerNorm(units)

    def forward(self, context, target):
        """Attend from `target` (queries) over `context` (keys and values).

        Returns the layer-normalised sum of `target` and the attention
        output; it has the same shape as `target`.
        """
        # The module returns (output, attention weights); only the output is used
        attended, _ = self.mha(target, context, context)
        return self.layernorm(target + attended)

In [16]:
# Smoke test: attend from an embedded target batch over the encoder output
attention_layer = CrossAttention(UNITS)

# Throwaway embedding, used only to give the target ids a feature dimension
target_embedding = nn.Embedding(VOCAB_SIZE, UNITS, padding_idx=0)
sr_translation_embed = target_embedding(sr_translation)

attention_result = attention_layer(context=encoder_output, target=sr_translation_embed)

print(
    f'Tensor of contexts has shape: {encoder_output.shape}',
    f'Tensor of translations has shape: {sr_translation_embed.shape}',
    f'Tensor of attention scores has shape: {attention_result.shape}',
    sep="\n",
)

Tensor of contexts has shape: torch.Size([64, 42, 256])
Tensor of translations has shape: torch.Size([64, 49, 256])
Tensor of attention scores has shape: torch.Size([64, 49, 256])


# Decoder

In [17]:
class Decoder(nn.Module):
    """Attention decoder: embedding -> LSTM -> cross attention -> LSTM ->
    linear projection -> log-softmax over the vocabulary.

    Args:
        vocab_size: size of the embedding table and of the output layer.
        units: width of the embedding and of every hidden layer.
    """

    def __init__(self, vocab_size, units):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, units, padding_idx=0)
        self.pre_attention_rnn = nn.LSTM(units, units, batch_first=True)
        self.attention = CrossAttention(units)
        self.post_attention_rnn = nn.LSTM(units, units, batch_first=True)
        self.output_layer = nn.Linear(units, vocab_size)
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, context, target, state=None, return_state=False):
        """Score the next token at every target position.

        Args:
            context: encoder output that the attention layer attends over.
            target: right-shifted target token ids.
            state: optional initial state for the pre-attention LSTM.
            return_state: when true, also return that LSTM's final state.

        Returns:
            The log-softmax of the output layer (named `logits` in this
            notebook), or `(logits, [hidden_state, cell_state])` when
            `return_state` is true.
        """
        embedded = self.embedding(target)
        pre_out, (hidden_state, cell_state) = self.pre_attention_rnn(embedded, state)
        attended = self.attention(context, pre_out)
        # The post-attention LSTM always starts from its default state
        post_out, _ = self.post_attention_rnn(attended)
        logits = self.activation(self.output_layer(post_out))

        if not return_state:
            return logits

        return logits, [hidden_state, cell_state]

In [18]:
# Smoke test: decode the sample batch against the encoder output
decoder = Decoder(VOCAB_SIZE, UNITS)
logits = decoder(context=encoder_output, target=sr_translation)

print(
    f'Tensor of contexts has shape: {encoder_output.shape}',
    f'Tensor of right-shifted translations has shape: {sr_translation.shape}',
    f'Tensor of logits has shape: {logits.shape}',
    sep="\n",
)

Tensor of contexts has shape: torch.Size([64, 42, 256])
Tensor of right-shifted translations has shape: torch.Size([64, 49])
Tensor of logits has shape: torch.Size([64, 49, 12000])


# Translator

In [19]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [20]:
class Translator(nn.Module):
    """End-to-end translation model: an Encoder feeding a Decoder.

    Args:
        vocab_size: vocabulary size passed to both encoder and decoder.
        units: hidden width passed to both encoder and decoder.
    """

    def __init__(self, vocab_size, units):
        super().__init__()

        self.encoder = Encoder(vocab_size, units)
        self.decoder = Decoder(vocab_size, units)

    def forward(self, inputs):
        """Run the encoder on the source and the decoder on the target.

        Args:
            inputs: pair `(context, targets)` -- source token ids and
                right-shifted target token ids.

        Returns:
            The decoder output for every target position.
        """
        source_tokens, shifted_targets = inputs
        return self.decoder(self.encoder(source_tokens), shifted_targets)

In [21]:
# Build the full model on the selected device and move the sample batch there too
translator = Translator(VOCAB_SIZE, UNITS).to(device)
to_translate = to_translate.to(device)
sr_translation = sr_translation.to(device)

logits = translator((to_translate, sr_translation))

print(
    f'Tensor of sentences to translate has shape: {to_translate.shape}',
    f'Tensor of right-shifted translations has shape: {sr_translation.shape}',
    f'Tensor of logits has shape: {logits.shape}',
    sep="\n",
)

Tensor of sentences to translate has shape: torch.Size([64, 42])
Tensor of right-shifted translations has shape: torch.Size([64, 49])
Tensor of logits has shape: torch.Size([64, 49, 12000])


In [22]:
# Adam with its default hyper-parameters over every parameter of the translator
optimizer = torch.optim.Adam(translator.parameters())
# Loss and metric helpers imported from utils
criterion, acc = masked_loss, masked_acc

# Training

In [None]:
NUM_EPOCHS = 20
STEPS_PER_EPOCH = 500
LOG_EVERY = 100  # mini batches between two progress reports
PATIENCE = 3     # epochs without improvement tolerated before stopping early
patience = PATIENCE
min_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    # Loss accumulated since the last progress report
    running_loss = 0.0
    # Loss accumulated over the whole epoch, used for early stopping.
    # It is tracked separately from running_loss, which is reset at every
    # report and therefore only ever covers the last LOG_EVERY batches.
    epoch_loss = 0.0
    num_batches = 0
    translator.train()

    # Using itertools for fixed length iteration over non subscriptable DataLoader
    for i, data in enumerate(itertools.islice(train_loader, STEPS_PER_EPOCH)):
        (context, target_in), target_out = data

        context, target_in, target_out = context.to(device), target_in.to(device), target_out.to(device)

        optimizer.zero_grad()
        outputs = translator((context, target_in))
        accuracy = acc(target_out, outputs)
        loss = criterion(target_out, outputs)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1

        if (i + 1) % LOG_EVERY == 0:
            # Report the mean loss of the last LOG_EVERY batches and the
            # accuracy of the most recent batch
            print(f"\n[epoch: {epoch+1}, mini batch: {i+1}] loss: {running_loss / LOG_EVERY:.4f}, accuracy: {accuracy:.4f}\n")
            running_loss = 0.0

    # The loader may yield fewer than STEPS_PER_EPOCH batches, or none at all
    if num_batches == 0:
        print("train_loader produced no batches; stopping training.")
        break

    # Mean per-batch loss, comparable across epochs of different length
    epoch_loss /= num_batches

    # NOTE(review): early stopping monitors the training loss here; the
    # validation loss would be the more usual signal.
    # Update the best loss if it's better than the previous one
    if epoch_loss < min_loss:
        min_loss = epoch_loss
        patience = PATIENCE

    else:
        # Losing patience
        patience -= 1

        if patience == 0:
            break

## Validation

In [None]:
# Evaluate the trained translator on the validation set. The early-stopping
# variables (patience, min_loss) are not used during evaluation, so they are
# no longer reset here.
LOG_EVERY = 100  # mini batches between two progress reports

# Accumulators since the last progress report
running_loss = 0.0
running_accuracy = 0.0
# Accumulators over the whole validation pass
total_loss = 0.0
total_accuracy = 0.0
num_batches = 0

translator.eval()

with torch.no_grad():
    for i, data in enumerate(itertools.islice(val_loader, STEPS_PER_EPOCH)):
        (context, target_in), target_out = data

        context, target_in, target_out = context.to(device), target_in.to(device), target_out.to(device)

        outputs = translator((context, target_in))
        loss = criterion(target_out, outputs)
        accuracy = acc(target_out, outputs)

        batch_loss = loss.item()
        batch_accuracy = float(accuracy)

        running_loss += batch_loss
        running_accuracy += batch_accuracy
        total_loss += batch_loss
        total_accuracy += batch_accuracy
        num_batches += 1

        if (i + 1) % LOG_EVERY == 0:
            # Means over the last LOG_EVERY batches, not a sum or a single batch
            print(f"\n[mini batch: {i+1}] validation loss: {running_loss / LOG_EVERY:.4f}, validation accuracy: {running_accuracy / LOG_EVERY:.4f}\n")
            running_loss = 0.0
            running_accuracy = 0.0

# Overall summary, so short validation sets (< LOG_EVERY batches) still report.
# These are means of per-batch values, which only approximate a per-token
# figure when batches contain different numbers of tokens.
if num_batches:
    print(f"\nValidation over {num_batches} mini batches: loss: {total_loss / num_batches:.4f}, accuracy: {total_accuracy / num_batches:.4f}\n")