In [186]:
import numpy as np
import torch
import pathlib
import re
import unicodedata
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from collections import Counter

torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x28e18da72d0>

In [2]:
config = {
    'MAX_VOCAB_SIZE': 13000,
    'BATCH_SIZE': 8,
    'raw_dataset_path': './dataset/por.txt',
    'MAX_SEQ_LEN': 16,
    'BUFFER_SIZE': 1000,
    'UNITS': 256
}

In [3]:
# data loader
dataset_path = pathlib.Path(config['raw_dataset_path'])
text_data = dataset_path.read_text(encoding = 'utf-8')

lines = text_data.splitlines()
pairs = [line.split('\t') for line in lines]

context_en = np.array([context for context, target, _ in pairs])
target_por = np.array([target for context, target, _ in pairs])

sentences = np.array((context_en, target_por))

In [4]:
def tokenizer(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r"[^ a-z.?!,¿]", "", text)
    text = re.sub(r"([.?!,¿])", r" \1 ", text)
    text = text.strip()
    return text.split()

tokenizer(context_en[34]), tokenizer(target_por[34])

(['go', 'on', '.'], ['siga', 'em', 'frente', '.'])

In [5]:
# build a vocabulary

class Vocabulary:
    def __init__(self, freq_threshold, max_vocab_size):
        # maintain two different mappings
        self.itos = {0: '[PAD]', 1: '[SOS]', 2: '[EOS]', 3: '[UNK]'}
        self.stoi = {'[PAD]': 0, '[SOS]': 1, '[EOS]': 2, '[UNK]': 3}
        self.freq_threshold = freq_threshold
        self.max_vocab_size = max_vocab_size
        
        self.pad_id = self.stoi['[PAD]']
        self.sos_id = self.stoi['[SOS]']
        self.eos_id = self.stoi['[EOS]']
        self.oov_id = self.stoi['[UNK]']

    def __len__(self):
        return len(self.itos)

    def vocab_size(self):
        return len(self.itos)

    def get_vocabulary(self):
        return self.stoi

    def token_to_ids(self, tokens):
        if isinstance(tokens, str): # handle a single word or sentence here
            token_list = self.tokenizer(tokens)
            return [self.stoi[t] if t in self.stoi else self.stoi['[UNK]'] for t in token_list]

        elif isinstance(tokens, list):
            return [self.stoi[t] if t in self.stoi else self.stoi['[UNK]'] for t in tokens]
        
        else:
            raise TypeError("Input must be either String or List of words.")

    def ids_to_token(self, ids):
        return [self.itos[id] for id in ids]

    # building vocab with the input sentence list
    def adapt(self, sentences, tokenizer):
        self.tokenizer = tokenizer
        idx = len(self.itos)
        token_freqs = {}

        for sentence in sentences:
            for token in self.tokenizer(sentence):
                if token not in self.stoi:
                    token_freqs[token] = 1
                else:
                    token_freqs[token] += 1
                
                if (token_freqs[token] == self.freq_threshold) and (idx < self.max_vocab_size):
                    self.itos[idx] = token
                    self.stoi[token] = idx
                    idx += 1

In [6]:
# english vocabulary
en_vocab = Vocabulary(freq_threshold = 1, max_vocab_size = config['MAX_VOCAB_SIZE'])
en_vocab.adapt(context_en, tokenizer)

# portuguese vocabulary
por_vocab = Vocabulary(freq_threshold = 1, max_vocab_size = config['MAX_VOCAB_SIZE'])
por_vocab.adapt(target_por, tokenizer)

en_vocab.vocab_size(), por_vocab.vocab_size()

(13000, 13000)

In [7]:
# test
test_idx = 789
en_translation = context_en[test_idx]
por_translation = target_por[test_idx]

print(en_translation, '--------->', por_translation)

max_seq_len = 16
context_tokens = en_vocab.token_to_ids(en_translation)
target_tokens = por_vocab.token_to_ids(por_translation)

print("\nEncoder Input IDs: ")
print([en_vocab.sos_id] + context_tokens + [en_vocab.eos_id] + (max_seq_len - len(context_tokens) - 2) * [en_vocab.pad_id])
print("\nPre-Attention Decoder Input IDs (Shifted to the Right): ")
print([en_vocab.sos_id] + target_tokens + [en_vocab.eos_id] + (max_seq_len - len(context_tokens) - 2) * [en_vocab.pad_id])
print("\nPost-Attention Decoder Input IDs: ")
print(target_tokens + [en_vocab.eos_id] + (max_seq_len - len(context_tokens) - 1) * [en_vocab.pad_id])

Here I am. ---------> Aqui estou.

Encoder Input IDs: 
[1, 194, 20, 62, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Pre-Attention Decoder Input IDs (Shifted to the Right): 
[1, 402, 47, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Post-Attention Decoder Input IDs: 
[402, 47, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Neural Machine Translation Custom Dataset 

In [68]:
class NMT_dataset(Dataset):
    def __init__(self, translation_pairs, tokenizer, vocabularies, max_seq_len):
        print(translation_pairs.shape)
        self.translation_pairs = translation_pairs
        self.tokenizer = tokenizer
        self.en_vocab, self.por_vocab = vocabularies
        self.max_seq_len = max_seq_len

        # for convenience 
        self.sos_id = self.en_vocab.sos_id
        self.eos_id = self.en_vocab.eos_id
        self.pad_id = self.en_vocab.pad_id
        self.oov_id = self.en_vocab.oov_id

    def __len__(self):
        return self.translation_pairs.shape[-1]

    def __getitem__(self, idx):
        req_pair = self.translation_pairs[:, idx]
        en_translation, por_translation = req_pair

        context_tokens = self.en_vocab.token_to_ids(en_translation)
        target_tokens = self.por_vocab.token_to_ids(por_translation)

        # encoder input tokens
        encoder_input = (
            [self.sos_id] + 
            context_tokens + 
            [self.eos_id] + 
            (self.max_seq_len - len(context_tokens) - 2) * [self.pad_id]
            )
        
        # pre-attention decoder input tokens
        pre_decoder_input = (
            [self.sos_id] + 
            target_tokens + 
            [self.eos_id] + 
            (self.max_seq_len - len(target_tokens) - 2) * [self.pad_id] 
        )

        # post-attention decoder output tokens
        post_decoder_output = (
            target_tokens + 
            [self.eos_id] + 
            (self.max_seq_len - len(target_tokens) - 1) * [self.pad_id]
        )

        encoder_input_tensor = torch.tensor(encoder_input, dtype = torch.long)
        pre_decoder_input_tensor = torch.tensor(pre_decoder_input, dtype = torch.long)
        post_decoder_output_tensor = torch.tensor(post_decoder_output, dtype = torch.long)

        return encoder_input_tensor, pre_decoder_input_tensor, post_decoder_output_tensor

In [69]:
# train and val_dataset
is_train = np.random.uniform(size = (sentences.shape[-1],)) < 0.85
train_raw_set = sentences[:, is_train]
val_raw_set = sentences[:, ~is_train]

train_raw_set.shape, val_raw_set.shape

((2, 161909), (2, 28730))

In [70]:
train_dataset = NMT_dataset(train_raw_set, tokenizer, (en_vocab, por_vocab), config['MAX_SEQ_LEN'])
val_dataset = NMT_dataset(val_raw_set, tokenizer, (en_vocab, por_vocab), config['MAX_SEQ_LEN'])

(2, 161909)
(2, 28730)


In [71]:
next(iter(train_dataset))

(tensor([1, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 6, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([6, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [201]:
def collate_fn(batch):
    # Unpack the batch (you might need to adjust this if your data is structured differently)
    contexts, target_ins, target_outs = zip(*batch)

    # Find the max length in the batch
    max_len = max([len(x) for x in contexts])

    # Pad sequences
    padded_contexts = [torch.nn.functional.pad(x, (0, max_len - len(x))) for x in contexts]
    padded_target_ins = [torch.nn.functional.pad(x, (0, max_len - len(x))) for x in target_ins]
    padded_target_outs = [torch.nn.functional.pad(x, (0, max_len - len(x))) for x in target_outs]

    # Stack the tensors to form a batch
    padded_contexts = torch.stack(padded_contexts)
    padded_target_ins = torch.stack(padded_target_ins)
    padded_target_outs = torch.stack(padded_target_outs)

    return padded_contexts, padded_target_ins, padded_target_outs


In [202]:
train_loader = DataLoader(
    train_dataset, 
    batch_size = config['BATCH_SIZE'], 
    shuffle = True,
    collate_fn = collate_fn
)

val_loader = DataLoader(
    val_dataset, 
    batch_size = config['BATCH_SIZE'], 
    shuffle = False,
    collate_fn = collate_fn
)

In [203]:
len(train_loader) * config['BATCH_SIZE'], len(val_loader) * config['BATCH_SIZE']

(161912, 28736)

In [204]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, units):
        """Initializes an instance of this class

        Args:
            vocab_size (int): Size of the vocabulary
            units (int): Number of units in the LSTM layer
        """
        super(Encoder, self).__init__()

        self.embedding = nn.Embedding(
            num_embeddings = vocab_size,
            embedding_dim = units,
            padding_idx = 0  # Assuming padding index is 0, similar to mask_zero in Keras
        )
        
        # bi-directional LSTM
        self.rnn = nn.LSTM(
            input_size = units,
            hidden_size = units,
            batch_first = True,
            bidirectional = True
        )

    def forward(self, context):
        """Forward pass of this layer

        Args:
            context (torch.Tensor): The sentence to translate

        Returns:
            torch.Tensor: Encoded sentence to translate
        """

        # Pass the context through the embedding layer
        x = self.embedding(context)

        # Pass the output of the embedding through the RNN
        x, _ = self.rnn(x)

        # Merge the bidirectional outputs by summing them
        x = x[:, :, :self.rnn.hidden_size] + x[:, :, self.rnn.hidden_size:]

        return x
    
# Example usage
vocab_size = config['MAX_VOCAB_SIZE']
units = config['UNITS']

encoder = Encoder(vocab_size, units)

input_tensor = next(iter(train_dataset))
input_tensor = input_tensor[0].unsqueeze(0)
output = encoder(input_tensor)
print(output.shape)  # Expected shape: (32, 20, units)

torch.Size([1, 16, 256])


In [205]:
class CrossAttention(nn.Module):
    def __init__(self, units):
        """Initializes an instance of this class

        Args:
            units (int): Number of units in the LSTM layer
        """
        super(CrossAttention, self).__init__()

        ### START CODE HERE ###

        self.multihead_attn = nn.MultiheadAttention(
            embed_dim = units,  # the size of Q, K, V dims is the embedding dimension
            num_heads = 1,  # Single head as in your Keras example
            batch_first = True  # This ensures the input and output shapes are (batch_size, sequence_length, embedding_dim)
        )

        ### END CODE HERE ###

        self.layernorm = nn.LayerNorm(units)
        self.add = nn.ModuleList([nn.Linear(units, units) for _ in range(2)])

    def forward(self, context, target):
        """Forward pass of this layer

        Args:
            context (torch.Tensor): Encoded sentence to translate
            target (torch.Tensor): The embedded shifted-to-the-right translation

        Returns:
            torch.Tensor: Cross attention between context and target
        """
        ### START CODE HERE ###

        # Call the MultiHeadAttention by passing in the query, key, and value
        # For this case, the query should be the translation and the key/value the encoded sentence to translate
        attn_output, _ = self.multihead_attn(query = target, key = context, value = context)
        
        x = self.add[0](target) + self.add[1](attn_output)
        return self.layernorm(x)

# Example usage
units = config['UNITS']

cross_attention = CrossAttention(units)

# Assuming context and target tensors of shape (batch_size, seq_length, units)
context_tensor = torch.randn(8, 16, units)  # Example tensor for context
target_tensor = torch.randn(8, 16, units)    # Example tensor for target

output = cross_attention(context_tensor, target_tensor)
print(output.shape)  # Expected shape: (32, 8, units)


torch.Size([8, 16, 256])


In [206]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, units):
        """Initializes an instance of this class

        Args:
            vocab_size (int): Size of the vocabulary
            units (int): Number of units in the LSTM layer
        """
        super(Decoder, self).__init__()

        ### START CODE HERE ###

        # The embedding layer
        self.embedding = nn.Embedding(vocab_size, units, padding_idx = 0)

        # The RNN before attention
        self.pre_attention_rnn = nn.LSTM(units, units, batch_first = True)

        # The attention layer
        self.attention = CrossAttention(units)

        # The RNN after attention
        self.post_attention_rnn = nn.LSTM(units, units, batch_first=True)

        # The dense layer with logsoftmax activation
        self.output_layer = nn.Linear(units, vocab_size)

        ### END CODE HERE ###

    def forward(self, context, target, state=None, return_state=False):
        """Forward pass of this layer

        Args:
            context (torch.Tensor): Encoded sentence to translate
            target (torch.Tensor): The shifted-to-the-right translation
            state (tuple(torch.Tensor, torch.Tensor), optional): Hidden state of the pre-attention LSTM. Defaults to None.
            return_state (bool, optional): If set to true, return the hidden states of the LSTM. Defaults to False.

        Returns:
            torch.Tensor: The log_softmax probabilities of predicting a particular token
        """
        ### START CODE HERE ###

        # Get the embedding of the input
        x = self.embedding(target)
        
        # Pass the embedded input into the pre-attention LSTM
        if state is None:
            x, (hidden_state, cell_state) = self.pre_attention_rnn(x)
        else:
            x, (hidden_state, cell_state) = self.pre_attention_rnn(x, state)
        
        # Perform cross attention between the context and the output of the LSTM (in that order)
        x = self.attention(context, x)

        # Do a pass through the post-attention LSTM
        x, _ = self.post_attention_rnn(x)

        # Compute the logits
        logits = self.output_layer(x)
        
        
        # Apply log softmax
        logits = F.log_softmax(logits, dim = -1)

        ### END CODE HERE ###

        if return_state:
            return logits, (hidden_state, cell_state)

        return logits

# Example usage
vocab_size = config['MAX_VOCAB_SIZE']
units = config['UNITS']

decoder = Decoder(vocab_size, units)

# Assuming context and target tensors of shape (batch_size, seq_length)
context_tensor = torch.randn(8, 15, units)  # Example tensor for context
target_tensor = torch.randint(0, vocab_size, (8, 16)).long()  # Example tensor for target

output = decoder(context_tensor, target_tensor)
print(output.shape)  # Expected shape: (32, 8, vocab_size)

torch.Size([8, 16, 13000])


In [207]:
class NMT_Translator(nn.Module):
    def __init__(self, vocab_size, units):
        super(NMT_Translator, self).__init__()
        
        self.encoder = Encoder(vocab_size, units)
        self.decoder = Decoder(vocab_size, units)
        
    def forward(self, context, target):
        encoded_context = self.encoder(context)
        logits = self.decoder(encoded_context, target)
        
        return torch.argmax(logits, dim = -1)

In [208]:
vocab_size = config['MAX_VOCAB_SIZE']
units = config['UNITS']

# Initialize the Translator model
translator = NMT_Translator(vocab_size, units)

# Example tensors
context_tensor = torch.randint(0, vocab_size, (8, 15)).long()  # (batch_size, seq_len, units)
target_tensor = torch.randint(0, vocab_size, (8, 16))  # (batch_size, seq_len)

print(context_tensor.shape, target_tensor.shape)

# Ensure the target tensor is Long type for embedding
target_tensor = target_tensor.long()

# Forward pass through the model
output = translator(context_tensor, target_tensor)
print(output.shape)  # Expected shape: (batch_size, seq_len, vocab_size)

torch.Size([8, 15]) torch.Size([8, 16])
torch.Size([8, 16])


In [209]:
class MaskedLoss(nn.Module):
    def __init__(self):
        super(MaskedLoss, self).__init__()
        self.loss_fn = nn.CrossEntropyLoss(reduction = 'none')

    def forward(self, y_pred, y_true):
        # CrossEntropyLoss expects y_true to be of shape (batch_size, seq_len)
        y_pred = y_pred.argmax(dim=-1).float()
        
        mask = (y_true != 0).float()
        y_pred *= mask
        loss = self.loss_fn(y_pred, y_true.float())

        # Create a mask where y_true is not padding (assuming 0 is the padding index)
        mask = (y_true != 0).float()
        

        # Return the average loss
        return loss.sum() / mask.sum()

class MaskedAcc(nn.Module):
    def __init__(self):
        super(MaskedAcc, self).__init__()

    def forward(self, y_pred, y_true):
        # Get the predicted class by taking argmax along the last dimension
        y_pred = y_pred.argmax(dim=-1)

        # Create a mask where y_true is not padding
        mask = (y_true != 0).float()

        # Compute the number of correct predictions
        correct = (y_true == y_pred).float() * mask

        # Return the accuracy
        return correct.sum() / mask.sum()

In [212]:
def train(model, train_loader, val_loader, optimizer, loss_fn, masked_loss_fn, masked_acc_fn, epochs=20, patience=3):
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_acc = 0

        for batch in train_loader:
            context, target_in, target_out = batch
            

            optimizer.zero_grad()
            output = model(context, target_in)
            
            
            # Compute loss and accuracy
            
            loss = loss_fn(output.float(), target_out.float())
            loss.requires_grad = True
#             acc = masked_acc_fn(output, target_out)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
#             total_acc += acc.item()

        avg_loss = total_loss / len(train_loader)
#         avg_acc = total_acc / len(train_loader)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training loss: {avg_loss:.4f}, accuracy: {avg_acc:.4f}")

        # Validation
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for batch in val_loader:
                context, target = batch
                output = model(context, target)
                loss = loss_fn(output.float(), target_out.float())
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation loss: {avg_val_loss:.4f}")

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping")
                break

    # Load the best model
    model.load_state_dict(torch.load('best_model.pth'))
    return model

In [None]:
vocab_size = config['MAX_VOCAB_SIZE']
units = config['UNITS']
model = NMT_Translator(vocab_size, units)

optimizer = optim.Adam(model.parameters(), lr = 0.001)
loss = nn.CrossEntropyLoss()
masked_loss_fn = MaskedLoss()
masked_acc_fn = MaskedAcc()

trained_model = train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    loss_fn = loss, 
    masked_loss_fn=masked_loss_fn,
    masked_acc_fn=masked_acc_fn,
    epochs=20,
    patience=3
)