# ***Coding a Transformer network from scratch [PyTorch]***

***Importing Necessary Libraries***

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

import math

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

from typing import Any

from tqdm import tqdm

import warnings

2024-07-23 15:38:03.641424: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-23 15:38:03.641522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-23 15:38:03.765639: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# ***Model***

***Input Embeddings***

In [2]:
class InputEmbeddings(nn.Module):
    """Token-id lookup table whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed the ids in `x` and multiply the vectors by sqrt(d_model)."""
        looked_up = self.embedding(x)
        gain = math.sqrt(self.d_model)
        return looked_up * gain

***Positional Encoding***

In [3]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to the input, then applies dropout.

    Args:
        d_model: Width of the table (last dimension of the input).
        seq_len: Number of positions precomputed in the table.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Column vector of positions 0 .. seq_len-1 -> (seq_len, 1)
        positions = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # exp(2i * -ln(10000) / d_model) == 10000 ** (-2i / d_model) -> (d_model / 2)
        even_dims = torch.arange(0, d_model, 2).float()
        inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))

        # angle[pos, i] = pos / 10000 ** (2i / d_model) -> (seq_len, d_model / 2)
        angles = positions * inv_freq

        table = torch.zeros(seq_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even feature indices
        table[:, 1::2] = torch.cos(angles)  # odd feature indices

        # Registered as a buffer (not a parameter) with a leading broadcast
        # dimension: (1, seq_len, d_model)
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the first `x.shape[1]` rows of the table to `x` and apply dropout."""
        pos_slice = self.pe[:, :x.shape[1], :].requires_grad_(False)
        return self.dropout(x + pos_slice)

***Layer Normalization***

In [4]:
class LayerNormalization(nn.Module):
    """Normalises the last dimension, then applies a learnable scale and shift.

    Args:
        features: Length of the learnable `alpha` and `bias` vectors.
        eps: Constant added to the standard deviation in the denominator.
    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learnable scale (starts at one) and shift (starts at zero)
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias over the last dimension."""
        mu = x.mean(dim = -1, keepdim = True)
        sigma = x.std(dim = -1, keepdim = True)
        centred = x - mu
        scaled = self.alpha * centred
        return scaled / (sigma + self.eps) + self.bias

***Feed Forward Network***

In [5]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # expansion: w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # contraction: w2 and b2

    def forward(self, x):
        """Map (..., d_model) -> (..., d_ff) -> (..., d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

***Multi Head Attention***

In [6]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Args:
        d_model: Embedding width; must be divisible by `h`.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        # Each head works on an equal slice of the embedding
        assert d_model % h == 0, "d_model is not divisible by h"
        self.d_k = d_model // h

        # Query, key, value and output projections (Wq, Wk, Wv, Wo)
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Positions where `mask == 0` are filled with -1e9 before the softmax.
        `mask` and `dropout` may each be None to skip that step.

        Returns:
            Tuple `(weights @ value, weights)`; the weights are returned so
            they can be inspected or visualised.
        """
        head_dim = query.shape[-1]
        # (..., seq_q, d_k) @ (..., d_k, seq_k) -> (..., seq_q, seq_k)
        scores = (query @ key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            scores.masked_fill_(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return (weights @ value), weights

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply Wo.

        The last attention weights are stored on `self.attention_scores`.
        """
        def split_heads(t):
            # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
            return t.view(t.shape[0], t.shape[1], self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.w_q(q))
        key = split_heads(self.w_k(k))
        value = split_heads(self.w_v(v))

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        batch_size = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.w_o(merged)

***Residual Connection***

In [7]:
class ResidualConnection(nn.Module):
    """Skip connection that normalises the input before the sublayer.

    Computes ``x + dropout(sublayer(norm(x)))``.

    Args:
        features: Size passed to the `LayerNormalization` used as `norm`.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Apply `sublayer` (any callable) to the normalised `x` and add `x` back."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

***Encoder and EncoderBlock***

In [8]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    Args:
        features: Size passed to each `ResidualConnection`.
        self_attention_block: Attention module called with the same tensor as q, k and v.
        feed_forward_block: Feed-forward module applied after attention.
        dropout: Dropout probability used by both residual wrappers.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sublayer, index 1 the feed-forward sublayer
        wrappers = [ResidualConnection(features, dropout) for _ in range(2)]
        self.residual_connections = nn.ModuleList(wrappers)

    def forward(self, x, src_mask):
        """Run `x` through both sublayers, using `src_mask` for the attention."""
        def attend(normed):
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_wrapper = self.residual_connections[0]
        feed_forward_wrapper = self.residual_connections[1]

        x = attention_wrapper(x, attend)
        return feed_forward_wrapper(x, self.feed_forward_block)

In [9]:
class Encoder(nn.Module):
    """Applies a list of encoder layers in order, then a final layer normalisation.

    Args:
        features: Size passed to the final `LayerNormalization`.
        layers: Modules called as `layer(x, mask)`.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Feed `x` through every layer with the same `mask`, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

***Decoder and DecoderBlock***

In [10]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each sublayer runs inside its own `ResidualConnection`.

    Args:
        features: Size passed to each `ResidualConnection`.
        self_attention_block: Attention over the decoder input itself.
        cross_attention_block: Attention whose keys and values are the encoder output.
        feed_forward_block: Feed-forward module applied last.
        dropout: Dropout probability used by the three residual wrappers.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0: self-attention, 1: cross-attention, 2: feed-forward
        wrappers = [ResidualConnection(features, dropout) for _ in range(3)]
        self.residual_connections = nn.ModuleList(wrappers)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers; `tgt_mask` masks self-attention, `src_mask` cross-attention."""
        def self_attend(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Queries come from the decoder; keys and values from the encoder output
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self.residual_connections[0](x, self_attend)
        x = self.residual_connections[1](x, cross_attend)
        return self.residual_connections[2](x, self.feed_forward_block)

In [11]:
class Decoder(nn.Module):
    """Applies a list of decoder layers in order, then a final layer normalisation.

    Args:
        features: Size passed to the final `LayerNormalization`.
        layers: Modules called as `layer(x, encoder_output, src_mask, tgt_mask)`.
    """

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed `x` through every layer with the same extra arguments, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

***ProjectionLayer***

In [12]:
class ProjectionLayer(nn.Module):
    """Linear layer mapping model features to one score per vocabulary entry.

    Args:
        d_model: Width of the incoming feature vectors.
        vocab_size: Number of output scores per position.
    """

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Project `x` from (..., d_model) to (..., vocab_size).

        Returns the raw linear output; no softmax is applied here.
        """
        # The method returns a tensor, so the previous `-> None` annotation was wrong.
        return self.proj(x)

***Transformer***

In [13]:
class Transformer(nn.Module):
    """Bundles the embeddings, positional encodings, encoder, decoder and projection.

    The three stages are exposed separately (`encode`, `decode`, `project`) so
    that inference can encode the source once and decode step by step.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        # Assignment order fixes the submodule registration order
        # (and therefore the order of `parameters()`), so it is kept as is.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed `src`, add positions and run the encoder with `src_mask`."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """Embed `tgt`, add positions and run the decoder against `encoder_output`."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to decoder features."""
        return self.projection_layer(x)

In [14]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    """Assemble a `Transformer` and Xavier-initialise its weight matrices.

    Args:
        src_vocab_size: Vocabulary size for the source embeddings.
        tgt_vocab_size: Vocabulary size for the target embeddings and projection.
        src_seq_len: Number of positions in the source positional table.
        tgt_seq_len: Number of positions in the target positional table.
        d_model: Embedding width.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads.
        dropout: Dropout probability used throughout.
        d_ff: Hidden width of every feed-forward block.

    Returns:
        The assembled model.
    """
    # Embeddings and positional encodings for both sides
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention followed by feed-forward
    encoder_blocks = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention, cross-attention, feed-forward
    decoder_blocks = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only tensors with more than one dimension (weight matrices, embedding
    # tables) are re-initialised; 1-D parameters keep their default values.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer

# ***Dataset***

***Tokenizer***

In [15]:
# Build (or reload) the word-level tokenizer for one language
def build_tokenizer(config, ds, lang):
    """Return the tokenizer for `lang`, training and saving it if no file exists yet.

    Args:
        config: Dictionary whose 'tokenizer_file' entry is a format string
            taking the language code.
        ds: Dataset passed to `get_all_sentences` when training is needed.
        lang: Language code used for the file name and sentence selection.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: a tokenizer file for this language is already on disk
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Otherwise create a word-level tokenizer that maps unknown words to '[UNK]'
    tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()

    # Reserve the four special tokens and pass min_frequency = 2 to the trainer
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    trainer = WordLevelTrainer(special_tokens = special_tokens, min_frequency = 2)

    # Train on the sentences of the requested language, then persist the result
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer = trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [16]:
# Lazily walk the dataset and hand back one language side of every pair
def get_all_sentences(ds, lang):
    """Yield `pair['translation'][lang]` for every `pair` in `ds`."""
    yield from (pair['translation'][lang] for pair in ds)

In [17]:
def get_ds(config):
    """Load the OpusBooks language pair named in `config` and build the data loaders.

    Args:
        config: Dictionary providing 'lang_src', 'lang_tgt', 'seq_len' and
            'batch_size' (plus whatever `build_tokenizer` reads).

    Returns:
        Tuple `(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)`.
        The validation loader uses a batch size of 1.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    # Loading the train portion of the OpusBooks dataset for the configured language pair
    ds_raw = load_dataset('opus_books', f'{lang_src}-{lang_tgt}', split = 'train')

    # Building or loading the tokenizer of each language
    tokenizer_src = build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = build_tokenizer(config, ds_raw, lang_tgt)

    # 90% of the pairs go to training, the remainder to validation.
    # NOTE: random_split is not seeded here, so the split differs between runs
    # unless the caller seeds torch beforehand.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # Wrapping both splits so that items come out tokenized, padded and masked
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    # Reporting the longest tokenized sentence on each side of the whole dataset
    max_len_src = 0
    max_len_tgt = 0
    for pair in ds_raw:
        src_ids = tokenizer_src.encode(pair['translation'][lang_src]).ids
        # The target sentence must be measured with the target tokenizer;
        # encoding it with the source tokenizer reported a wrong length.
        tgt_ids = tokenizer_tgt.encode(pair['translation'][lang_tgt]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    # Training batches use the configured size; validation is run one sentence at a time
    train_dataloader = DataLoader(train_ds, batch_size = config['batch_size'], shuffle = True)
    val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle = True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [18]:
def casual_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Row i is True for columns 0..i, so position i can only see itself and
    earlier positions.
    """
    # Keeping the lower triangle (diagonal included) is the complement of the
    # strictly-upper triangle.
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle == 1

In [19]:
class BilingualDataset(Dataset):
    """Wraps a dataset of translation pairs and returns fixed-length tensors.

    Args:
        ds: Indexable collection whose items expose `item['translation'][lang]`.
        tokenizer_src: Tokenizer used for the source text (`encode(text).ids`).
        tokenizer_tgt: Tokenizer used for the target text; also supplies the ids
            of the '[SOS]', '[EOS]' and '[PAD]' tokens used on both sides.
        src_lang: Key of the source language inside 'translation'.
        tgt_lang: Key of the target language inside 'translation'.
        seq_len: Length every returned sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()

        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Special tokens are looked up in the target tokenizer and stored as
        # 1-element int64 tensors so they can be concatenated directly.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding `count` copies of the [PAD] id."""
        # torch.full builds the tensor directly, instead of creating a Python
        # list of 1-element tensors and converting each one back to a scalar.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, index: Any) -> Any:
        """Tokenize, frame and pad the pair at `index`.

        Returns:
            Dict with 'encoder_input', 'decoder_input' and 'label' (each of
            length `seq_len`), 'encoder_mask' of shape (1, 1, seq_len),
            'decoder_mask' of shape (1, seq_len, seq_len), and the raw
            'src_text' / 'tgt_text' strings.

        Raises:
            ValueError: If either sentence does not fit into `seq_len` once
                its special tokens are added.
        """
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Tokenizing source and target texts
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder side carries both '[SOS]' and '[EOS]' (hence -2).
        # The decoder input carries only '[SOS]' and the label only '[EOS]' (hence -1).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative count means the sentence does not fit into `seq_len`
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')

        enc_ids = torch.tensor(enc_input_tokens, dtype = torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype = torch.int64)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] source tokens [EOS] [PAD]...
        encoder_input = torch.cat(
            [self.sos_token, enc_ids, self.eos_token, self._padding(enc_num_padding_tokens)]
        )

        # [SOS] target tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding])

        # target tokens [EOS] [PAD]...  (the decoder input shifted by one position)
        label = torch.cat([dec_ids, self.eos_token, dec_padding])

        # Ensuring that the length of each tensor above is equal to the defined 'seq_len'
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        # 1 where a real token is present, 0 on padding -> (1, 1, seq_len)
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        decoder_padding_mask = (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': encoder_mask,
            # Padding mask combined with the causal mask -> (1, seq_len, seq_len)
            'decoder_mask': decoder_padding_mask & casual_mask(decoder_input.size(0)),
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }

In [20]:
# Greedy decoding: repeatedly append the highest-scoring next token
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence for a single source sentence, one token at a time.

    Args:
        model: Object exposing `encode`, `decode` and `project`.
        source: Source token ids; the working tensors are created with its dtype.
        source_mask: Mask passed to both `encode` and `decode`.
        tokenizer_src: Not used by this function.
        tokenizer_tgt: Supplies the ids of '[SOS]' and '[EOS]'.
        max_len: Generation stops once the sequence holds this many tokens.
        device: Device the working tensors are moved to.

    Returns:
        1-D tensor of generated ids, starting with '[SOS]'.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The source is encoded once and reused at every decoding step
    encoder_output = model.encode(source, source_mask)

    # The generated sequence starts as a (1, 1) tensor holding only '[SOS]'
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask sized to the tokens generated so far
        step_mask = casual_mask(generated.size(1)).type_as(source_mask).to(device)

        decoded = model.decode(encoder_output, source_mask, generated, step_mask)

        # Only the last position is projected to vocabulary scores
        scores = model.project(decoded[:, -1])

        # Index of the best-scoring token
        next_word = torch.max(scores, dim=1)[1]
        next_column = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, next_column], dim=1)

        # The '[EOS]' token is kept in the output and ends the generation
        if next_word == eos_idx:
            break

    return generated.squeeze(0)

In [21]:
# Qualitative check: translate a few validation sentences and print them
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Greedy-decode up to `num_examples` validation batches and print the results.

    Args:
        model: Model switched to evaluation mode and handed to `greedy_decode`.
        validation_ds: Iterable of batches of size 1 holding 'encoder_input',
            'encoder_mask', 'src_text' and 'tgt_text'.
        tokenizer_src: Forwarded to `greedy_decode`.
        tokenizer_tgt: Forwarded to `greedy_decode` and used to decode the output ids.
        max_len: Maximum length forwarded to `greedy_decode`.
        device: Device the batch tensors are moved to.
        print_msg: Callable receiving each output line.
        global_state: Accepted but not used by this function.
        writer: Accepted but not used by this function.
        num_examples: Number of batches after which the loop stops.
    """
    model.eval()

    console_width = 80  # Width of the separator line
    separator = '-' * console_width

    # No gradients are needed while generating
    with torch.no_grad():
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            # Greedy decoding handles a single sentence at a time
            assert encoder_input.size(0) ==  1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]  # Reference translation
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            for line in (
                separator,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            ):
                print_msg(line)

            # Stop once the requested number of examples has been shown
            if count == num_examples:
                break

In [22]:
# Takes the config dictionary and the vocabulary sizes of the source and target languages
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the Transformer for the given vocabulary sizes.

    `config['seq_len']` is used as both the source and the target sequence
    length, and `config['d_model']` as the embedding width. All remaining
    `build_transformer` arguments keep their defaults.
    """
    seq_len = config['seq_len']
    d_model = config['d_model']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model)

In [23]:
# Settings used to build and train the Transformer
def get_config():
    """Return a fresh dictionary with the default training settings."""
    return dict(
        batch_size=8,
        num_epochs=20,
        lr=10**-4,
        seq_len=350,
        d_model=512,  # Embedding width; 512 as in the "Attention Is All You Need" paper
        lang_src='en',
        lang_tgt='it',
        model_folder='weights',
        model_basename='tmodel_',
        preload=None,  # Epoch tag of a checkpoint to resume from, or None
        tokenizer_file='tokenizer_{0}.json',
        experiment_name='runs/tmodel',
    )
    

# Builds the path under which the weights of a given epoch are stored
def get_weights_file_path(config, epoch: str):
    """Return './<model_folder>/<model_basename><epoch>.pt' as a string.

    Args:
        config: Dictionary providing 'model_folder' and 'model_basename'.
        epoch: Text inserted between the base name and the '.pt' suffix.
    """
    filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.', config['model_folder'], filename))

In [24]:
def train_model(config):
    """Train the translation model described by `config`, saving a checkpoint per epoch.

    Reads from `config`: 'model_folder', 'experiment_name', 'lr', 'preload',
    'num_epochs', 'seq_len', plus whatever `get_ds`, `get_model` and
    `get_weights_file_path` read.

    When `config['preload']` is set, the model weights, optimizer state, epoch
    counter and global step are all restored from that checkpoint before
    training continues.
    """
    # Using the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")

    # Creating the directory that will hold the checkpoints
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    # Dataloaders and tokenizers for the source and target languages
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    tgt_vocab_size = tokenizer_tgt.get_vocab_size()

    # Building the model and moving it to the selected device
    model = get_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(device)

    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps = 1e-9)

    initial_epoch = 0
    global_step = 0

    # Resuming from a checkpoint when one is requested
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on the GPU be loaded on a CPU-only machine
        state = torch.load(model_filename, map_location=device)

        # Restoring the weights; without this, training would resume from a
        # freshly initialised model paired with a stale optimizer state.
        model.load_state_dict(state['model_state_dict'])
        # Resuming with the epoch after the saved one
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Padding positions are ignored by the loss. The labels hold target-language
    # ids, so the '[PAD]' id is taken from the target tokenizer.
    # Label smoothing is applied as regularisation.
    loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_tgt.token_to_id('[PAD]'), label_smoothing = 0.1).to(device)

    for epoch in range(initial_epoch, config['num_epochs']):
        # run_validation leaves the model in eval mode, so training mode is
        # re-enabled at the start of every epoch
        model.train()

        # tqdm displays a progress bar over the training batches
        batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:02d}')

        for batch in batch_iterator:
            # Moving inputs, masks and labels to the selected device
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)
            label = batch['label'].to(device)

            # Forward pass through encoder, decoder and projection
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)

            # Flattening to (batch * seq_len, vocab) against (batch * seq_len) for the loss
            loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))

            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagation, parameter update, then gradient reset for the next batch
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1

        # Printing a few translated examples at the end of each epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

        # Saving everything needed to resume training after this epoch
        model_filename = get_weights_file_path(config, f'{epoch:02d}')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

    # Releasing the TensorBoard event file once training has finished
    writer.close()

In [25]:
# Entry point: build the default configuration and start training.
if __name__ == '__main__':
    warnings.filterwarnings('ignore') # Suppress all Python warnings for the rest of the session
    config = get_config() # Default settings; stays available at notebook level as `config`
    train_model(config) # Train with these settings

Using device cuda


Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Max length of source sentence: 309
Max length of target sentence: 274


Processing epoch 00: 100%|██████████| 3638/3638 [15:02<00:00,  4.03it/s, loss=6.085]


--------------------------------------------------------------------------------
SOURCE: The Tartar darted off, his coat-tails flying; and five minutes later rushed in again, with a dish of opened oysters in pearly shells and a bottle between his fingers.
TARGET: E il tartaro con le falde svolazzanti, corse via e dopo cinque minuti entrò volando con un vassoio di ostriche aperte sui gusci di madreperla e una bottiglia fra le dita.
PREDICTED: Il suo momento , e la sua , e la sua sua , e la sua sua , e la sua , e la sua .
--------------------------------------------------------------------------------
SOURCE: In the present instance, going back to the liver-pill circular, I had the symptoms, beyond all mistake, the chief among them being "a general disinclination to work of any kind."
TARGET: Nel caso presente, per ritornare all’annuncio delle pillole per il fegato, io avevo i sintomi d’una malattia di fegato, dei quali il principale era «una generale svogliatezza al lavoro di qualunque 

Processing epoch 01: 100%|██████████| 3638/3638 [15:02<00:00,  4.03it/s, loss=6.080]


--------------------------------------------------------------------------------
SOURCE: We did get the thing up at last, the two of us together. We fixed it, not exactly upside down - more sideways like - and we tied it up to the mast with the painter, which we cut off for the purpose.
TARGET: Finalmente tutti e due insieme riuscimmo a fissarla, ma esattamente sottosopra — un po’ lateralmente — e la legammo all’albero con la gomena tagliata per quello scopo.
PREDICTED: , come , , , , e non ci , e non si , e non , e , e , e , , , e , , , , .
--------------------------------------------------------------------------------
SOURCE: Oblonsky was telling his sister-in-law the pun he had made about 'dissolving marriages.'
TARGET: Stepan Arkad’ic raccontava alla cognata il suo giuoco di parole sul divorzio.
PREDICTED: Stepan Arkad ’ ic era stato stato stato di nuovo , che aveva detto il suo .


Processing epoch 02: 100%|██████████| 3638/3638 [15:01<00:00,  4.04it/s, loss=5.227]


--------------------------------------------------------------------------------
SOURCE: I have a new bodyguard – Mary Vlasyevna,' this was the midwife, a new and important personage in the Levins' family life.
TARGET: Ho una nuova guardia del corpo, Mar’ja Vlas’evna — era la levatrice, un personaggio nuovo, importante nella vita familiare di Levin. — È venuta a farmi visita.
PREDICTED: Io sono un uomo di voce , che aveva detto che la sua vita era stata la vita e la vita di vita .
--------------------------------------------------------------------------------
SOURCE: A BLUSTERING STORM WAS RUSHING and whistling between the wheels of the train and round the pillars and the corners of the station.
TARGET: Una tormenta paurosa s’era scatenata e fischiava fra le ruote della vettura, lungo le colonne, al di là dell’angolo della stazione.
PREDICTED: Il suo , e la sua luce del governatorato , la folla del quale si e la folla di .


Processing epoch 03: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=4.592]


--------------------------------------------------------------------------------
SOURCE: He was holding with one hand the window of a carriage (from which the head of a lady in a velvet bonnet and two little children's heads were leaning out) and was smilingly beckoning with his other hand to his brother-in-law.
TARGET: Si teneva con una mano al finestrino di una carrozza che s’era fermata all’angolo, dalla quale si sporgevano una testa di donna con un cappello di velluto e due testoline di bimbi, e sorrideva e faceva segno con l’altra mano al cognato.
PREDICTED: Egli si mise a guardare con la finestra , che si la testa di un ’ occhiata alla finestra e un ’ altra , che si era un ’ altra , e il dottore si era con la mano e la principessa .
--------------------------------------------------------------------------------
SOURCE: The great gates were closed and locked; but a wicket in one of them was only latched.
TARGET: Aprii la porta e la chiusi dolcemente.
PREDICTED: Il sole era un po 

Processing epoch 04: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=5.384]


--------------------------------------------------------------------------------
SOURCE: (He vividly pictured to himself Mlle Roland's roguish black eyes, and her smile.) 'Besides, as long as she was in the house I never took any liberties.
TARGET: Ma che governante! — e ricordò con vivezza il riso e gli occhi neri assassini di m.lle Rolland. — Del resto finché è stata in casa nostra, io non mi sono permesso nulla.
PREDICTED: ( con un sorriso di gioia , con gli occhi scintillanti , le lacrime , e la njanja , come egli aveva detto , con la casa , non era mai più nulla di più .
--------------------------------------------------------------------------------
SOURCE: 'Laska! Here!' he said, pointing to the other side.
TARGET: “Laska, qua!” egli disse, indicandole l’altra parte.
PREDICTED: — Laska ! — disse , indicando il loro posto .


Processing epoch 05: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=5.552]


--------------------------------------------------------------------------------
SOURCE: Oh, this is a shame!" And he flung the watch down, and sprang out of bed, and had a cold bath, and washed himself, and dressed himself, and shaved himself in cold water because there was not time to wait for the hot, and then rushed and had another look at the watch.
TARGET: Vergogna! — E scagliò lontano l’orologio, saltò dal letto, fece un bagno freddo, si lavò, si vestì, si fece la barba con l’acqua fredda, perchè non vi era tempo d’aspettare la calda, e poi si precipitò a dare un’altra occhiata all’orologio.
PREDICTED: Oh ! questa è una donna ! — gridò il Coniglio , e , , si alzò e , , si mise a guardare un lume , e si mise a guardare un ' altra volta , e poi non si poteva .
--------------------------------------------------------------------------------
SOURCE: I proceeded: at last my way opened, the trees thinned a little; presently I beheld a railing, then the house--scarce, by this dim light

Processing epoch 06: 100%|██████████| 3638/3638 [15:05<00:00,  4.02it/s, loss=4.535]


--------------------------------------------------------------------------------
SOURCE: 'Enoch, Enos...'
TARGET: — Enoch, Enos.
PREDICTED: — Tutto , un ...
--------------------------------------------------------------------------------
SOURCE: She gave the order to harness another pair of horses, and busied herself packing her handbag with things necessary for a few days.
TARGET: Ordinò di attaccare altri cavalli e si occupò di mettere in una sacca da viaggio le cose indispensabili per qualche giorno.
PREDICTED: Ella aveva mandato a prendere un altro di cavalli e , il denaro con la strada , in qualche tempo per la giornata .


Processing epoch 07: 100%|██████████| 3638/3638 [15:01<00:00,  4.03it/s, loss=4.057]


--------------------------------------------------------------------------------
SOURCE: 'I hate him: and I cannot forgive myself.'
TARGET: — Lo odio, e non riesco a perdonarmelo.
PREDICTED: — Io lo odio , e non posso perdonare .
--------------------------------------------------------------------------------
SOURCE: "Of Mr. Reed's ghost I am: he died in that room, and was laid out there.
TARGET: — Sì, ho paura dell'ombra del signor Reed, che morì in quella camera, e di là fu portato a sotterrare.
PREDICTED: — Ebbene , il signor Reed è morto ; è morto e là ci sono .


Processing epoch 08: 100%|██████████| 3638/3638 [15:01<00:00,  4.03it/s, loss=3.926]


--------------------------------------------------------------------------------
SOURCE: Oblonsky during the drive was composing the menu of their dinner.
TARGET: Stepan Arkad’ic durante il percorso componeva la lista del pranzo.
PREDICTED: Stepan Arkad ’ ic si le , a pranzo .
--------------------------------------------------------------------------------
SOURCE: I had a short jacket of goat’s skin, the skirts coming down to about the middle of the thighs, and a pair of open-kneed breeches of the same; the breeches were made of the skin of an old he-goat, whose hair hung down such a length on either side that, like pantaloons, it reached to the middle of my legs; stockings and shoes I had none, but had made me a pair of somethings, I scarce knew what to call them, like buskins, to flap over my legs, and lace on either side like spatterdashes, but of a most barbarous shape, as indeed were all the rest of my clothes.
TARGET: Il mio abito era una specie di saio di pelle di capra anch’ess

Processing epoch 09: 100%|██████████| 3638/3638 [15:01<00:00,  4.03it/s, loss=4.782]


--------------------------------------------------------------------------------
SOURCE: I call that downright wisdom, not merely as regards the present case, but with reference to our trip up the river of life, generally.
TARGET: Io la dichiaro, questa, vera saggezza, non semplicemente rispetto al nostro caso particolare, ma al nostro pellegrinaggio sul fiume della vita, in generale.
PREDICTED: Io considero che questa mancanza di non essere il diritto , ma che ci sia la nostra vita , come ci siamo per lavorare i nostri , come quelli che ci sia .
--------------------------------------------------------------------------------
SOURCE: 'I know, I know,' he said with a smile. 'I am a family man myself.
TARGET: — Lo so, lo so — disse il dottore, sorridendo — io stesso ho famiglia; ma noi mariti, in questi momenti, siamo le persone più pietose.
PREDICTED: — Io so , so — disse lui sorridendo . — Sono un uomo di famiglia .


Processing epoch 10: 100%|██████████| 3638/3638 [15:01<00:00,  4.04it/s, loss=3.387]


--------------------------------------------------------------------------------
SOURCE: During the sacrament Levin did that which, agnostic though he was, he had done a thousand times before.
TARGET: Durante la funzione Levin pregava e faceva proprio quello che lui, miscredente, aveva fatto mille volte.
PREDICTED: Levin , soprattutto , che Levin non amava , era stato a volte un ’ altra volta , e che era già .
--------------------------------------------------------------------------------
SOURCE: "Here is to your health, ministrant spirit!" he said.
TARGET: — Alla vostra salute, spirito benefico!
PREDICTED: — Qui è la vostra salute , ! — disse .


Processing epoch 11: 100%|██████████| 3638/3638 [15:01<00:00,  4.04it/s, loss=3.514]


--------------------------------------------------------------------------------
SOURCE: "So could I--with a roast onion.
TARGET: — Anch'io, con le cipolle arrostite.
PREDICTED: — Sì , potrei , un .
--------------------------------------------------------------------------------
SOURCE: I said it was the sort of thing I had often longed for myself; and we discussed the possibility of our going away, we four, to some handy, well-fitted desert island, and living there in the woods.
TARGET: Io osservai che era proprio quello che avevo sempre desiderato per me; e noi discutemmo la possibilità di andarcene, noi quattro, in qualche bell’isola deserta, a vivere nei boschi.
PREDICTED: Dissi allora che mi era parso di aver tanto paura di veder la possibilità di fare la nostra vita ; e ci a remare , per qualche distanza di distanza , e per altro che in distanza le foreste .


Processing epoch 12: 100%|██████████| 3638/3638 [15:03<00:00,  4.03it/s, loss=3.144]


--------------------------------------------------------------------------------
SOURCE: He knew that between him and her there could and should be nothing secret, and therefore he decided that it was his duty, but he had not considered how the confession might affect her: he had not put himself in her place.
TARGET: Sapeva che fra di loro non potevano e non dovevano esserci segreti e perciò aveva deciso di far così, ma non si era reso conto degli effetti che ne sarebbero potuti derivare, non si era trasferito in lei.
PREDICTED: Sapeva che l ’ avrebbe trovato fra lei e lei e l ’ unico segreto , e perciò decise di non essere sicuro , ma che , pensando , non aveva pensato a quanto avrebbe potuto , avrebbe potuto in un posto .
--------------------------------------------------------------------------------
SOURCE: What a consternation of soul was mine that dreary afternoon!
TARGET: Quale costernazione erasi insinuata nell'anima mia in quel triste pomeriggio!
PREDICTED: Che orrore !


Processing epoch 13: 100%|██████████| 3638/3638 [15:03<00:00,  4.03it/s, loss=2.136]


--------------------------------------------------------------------------------
SOURCE: But I am in a hot climate, where, if I had clothes, I could hardly wear them.
TARGET: Ma sono in un clima caldo, e ancorchè avessi vesti, non potrei comportarle. Io sono senza difesa, e senza mezzi di contrastare alla violenza degli uomini o delle bestie.
PREDICTED: Ma sono in una piccola striscia di fuoco , quando mi son caduta , se non avessi bevuto di trenta .
--------------------------------------------------------------------------------
SOURCE: She knew his habit, which had become a necessity, of reading in the evening.
TARGET: Conosceva la sua abitudine, che era ormai una necessità, di leggere la sera.
PREDICTED: Capì , ora , con quel che era stato fatto per la necessità di leggere .


Processing epoch 14: 100%|██████████| 3638/3638 [15:03<00:00,  4.03it/s, loss=2.720]


--------------------------------------------------------------------------------
SOURCE: I showed him the volume on the shelf: he took it down, and withdrawing to his accustomed window recess, he began to read it.
TARGET: Un momento dopo era al solito, nel vano della finestra a leggere.
PREDICTED: Lo feci in giro per il muro e lo feci in giro , quando gli feci posto in piedi , cominciò a leggere .
--------------------------------------------------------------------------------
SOURCE: Isn't he a fine fellow?
TARGET: Non è vero che è un gran bravo ragazzo?
PREDICTED: Non è vero , amico ?


Processing epoch 15: 100%|██████████| 3638/3638 [15:03<00:00,  4.02it/s, loss=3.392]


--------------------------------------------------------------------------------
SOURCE: "Distrust it, sir; it is not a true angel."
TARGET: — Diffidate di lui, non è un angiolo vero.
PREDICTED: — Quello che signore , non è vero , non è vero .
--------------------------------------------------------------------------------
SOURCE: I am glad to be so near you again."
TARGET: Sono felice di essere accanto a voi. — Jane Eyre!
PREDICTED: Sono contenta che siate venuta a piedi .


Processing epoch 16: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=1.969]


--------------------------------------------------------------------------------
SOURCE: This was the first time that I entertained a thought of breeding up some tame creatures, that I might have food when my powder and shot was all spent.
TARGET: Fu questa la prima volta che mi nacque il pensiere di addimesticare animali, per ritrarne nudrimento quando la mia polvere e le mie munizioni sarebbero finite del tutto.
PREDICTED: Fu questo il primo pensiere che io avessi pensato a ciò che mi avrebbe potuto scoprire una certa quantità di costoro , che mi avrebbe dato a gran voglia di dar fuoco e alle mie munizioni .
--------------------------------------------------------------------------------
SOURCE: He knew that between him and her there could and should be nothing secret, and therefore he decided that it was his duty, but he had not considered how the confession might affect her: he had not put himself in her place.
TARGET: Sapeva che fra di loro non potevano e non dovevano esserci segr

Processing epoch 17: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=2.344]


--------------------------------------------------------------------------------
SOURCE: If you ask me – the betrothal to-day and the wedding to-morrow!'
TARGET: Se domandate a me, per me, oggi la benedizione e domani le nozze.
PREDICTED: Se vuoi fare la pace , domani se domani al matrimonio ?
--------------------------------------------------------------------------------
SOURCE: I laughed at him as he said this.
TARGET: Risi nel sentirlo parlar così.
PREDICTED: Gli feci ridere come egli mi disse .


Processing epoch 18: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=2.056]


--------------------------------------------------------------------------------
SOURCE: 'No, we are very comfortable here,' replied the ambassador's wife smiling, and she continued the interrupted conversation.
TARGET: — No, stiamo tanto bene qui — rispose con un sorriso la moglie dell’ambasciatore, e riprese la conversazione di poco prima.
PREDICTED: — No , ci siamo molto contenti — rispose l ’ offesa , e , continuò a parlare del discorso .
--------------------------------------------------------------------------------
SOURCE: This is you, who have been as slippery as an eel this last month, and as thorny as a briar-rose?
TARGET: Perché da un mese mi sgusciate di mano come un'anguilla e pungete come un cespuglio di rose?
PREDICTED: Che è , che siete una , come un lavoro d ’ un mese ? Un , come un albero che non si possa avvicinarsi ?


Processing epoch 19: 100%|██████████| 3638/3638 [15:04<00:00,  4.02it/s, loss=2.065]


--------------------------------------------------------------------------------
SOURCE: My husband! Ah, yes...
TARGET: Mio marito, ah, già....
PREDICTED: Ma se mio marito ...
--------------------------------------------------------------------------------
SOURCE: Only the invalid himself did not show that desire, but on the contrary was angry because the doctor had not been fetched, and he continued taking medicine and talking of life.
TARGET: Soltanto il malato non esprimeva questo sentimento, al contrario, si irritava perché non gli portavano il dottore, e continuava a prendere la medicina e parlava di vivere.
PREDICTED: Solo il malato non aveva il desiderio di lasciarsi prendere , ma al contrario , al dottore , non era stato portato , e , la vita , si mise a parlare e si era fatto a parlare della vita .
