In [1]:
!pip install sacremoses torchmetrics

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.8.1-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadat

In [2]:
import os
import math
import torch
import torch.nn as nn
from pathlib import Path

from torch.utils.data import Dataset, DataLoader, random_split

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

In [8]:
import os
from pathlib import Path

# Transformer model parameters.
# Trailing comments give the value used in the reference paper; the values
# assigned here describe a smaller model.
MODEL_NUMBER_OF_LAYERS = 3              # paper value: 6
MODEL_DIMENSION = 256                   # paper value: 512
MODEL_NUMBER_OF_HEADS = 4               # paper value: 8
MODEL_INNER_LAYER_DIMENSION = 1024      # paper value: 2048
MODEL_DROPOUT_PROBABILITY = 0.1         # paper value: 0.1
MODEL_LABEL_SMOOTHING_VALUE = 0.1       # paper value: 0.1
# Fixed token length of every encoder/decoder sequence: EnglishToFrenchDataset
# truncates and pads each example to exactly this many positions.
SEQUENCE_LENGTH = 500

# Training parameters.
# BETA1 / BETA2 / EPSILON are handed to torch.optim.Adam in train_model();
# WARMUP_STEPS is read by learning_rate().
BATCH_SIZE = 8
NUMBER_OF_EPOCHS = 15
BETA1 = 0.9                             # paper value: 0.9
BETA2 = 0.98                            # paper value: 0.98
EPSILON = 1e-9                          # paper value: 1e-9
WARMUP_STEPS = 4000                     # paper value: 4000

# Dataset parameters.
# The language pair is loaded as "<SOURCE_LANGUAGE>-<TARGET_LANGUAGE>" by get_dataset().
DATASET_NAME = "Helsinki-NLP/opus_books"
SOURCE_LANGUAGE = "en"
TARGET_LANGUAGE = "fr"

# Saving parameters.
# Checkpoints are written to / read from "<DATASET_NAME>_<MODEL_FOLDER>/<MODEL_BASENAME><epoch>.pt".
# MODEL_PRELOAD: "latest" resumes from the file returned by latest_weights_file_path();
# any other non-empty value is treated as an epoch identifier; an empty value starts from scratch.
# EXPERIMENT_FOLDER is the TensorBoard SummaryWriter log directory.
MODEL_FOLDER = "weights"
MODEL_BASENAME = "tmodel_"
MODEL_PRELOAD = "latest"
EXPERIMENT_FOLDER = "runs/tmodel"

# Special tokens registered with both word-level tokenizers (see get_tokenizer()).
UNK_TOKEN = '[UNK]'
SOS_TOKEN = '[SOS]'
EOS_TOKEN = '[EOS]'
PAD_TOKEN = "[PAD]"

# Working directories, all rooted at the current working directory.
# NOTE(review): the weight-path helpers below build their folder from DATASET_NAME
# and MODEL_FOLDER rather than from these paths - confirm which location is intended.
CHECKPOINTS_PATH = os.path.join(os.getcwd(), 'models', 'checkpoints') # semi-trained models during training will be dumped here
BINARIES_PATH = os.path.join(os.getcwd(), 'models', 'binaries') # location where trained models are located
DATA_DIR_PATH = os.path.join(os.getcwd(), 'data') # training data will be stored here

# Create the directories up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(CHECKPOINTS_PATH, exist_ok=True)
os.makedirs(BINARIES_PATH, exist_ok=True)
os.makedirs(DATA_DIR_PATH, exist_ok=True)

def get_weights_file_path(epoch: str):
    """Return the relative path of the checkpoint file for ``epoch``.

    The file is named ``<MODEL_BASENAME><epoch>.pt`` and lives in the
    ``<DATASET_NAME>_<MODEL_FOLDER>`` directory below the current directory.
    """
    checkpoint_dir = Path('.') / f"{DATASET_NAME}_{MODEL_FOLDER}"
    checkpoint_file = checkpoint_dir / f"{MODEL_BASENAME}{epoch}.pt"
    return str(checkpoint_file)

# Find the latest weights file in the weights folder
def latest_weights_file_path():
    """Return the path of the most recent checkpoint, or ``None`` if none exists.

    Candidates are all entries of the ``<DATASET_NAME>_<MODEL_FOLDER>`` folder
    whose name starts with ``MODEL_BASENAME``. They are ranked by the numeric
    epoch that follows the basename, so ``tmodel_10.pt`` is newer than
    ``tmodel_9.pt`` even when the epoch number is not zero-padded (a plain
    lexicographic sort would pick ``tmodel_9.pt``). Files whose suffix is not
    a number rank below every numbered file and are ordered by name.

    Returns:
        str | None: path of the newest checkpoint, or ``None`` when the folder
        is missing or holds no matching file.
    """
    model_folder = f"{DATASET_NAME}_{MODEL_FOLDER}"
    prefix_length = len(MODEL_BASENAME)

    def epoch_order(path):
        # Text between the basename and the file extension, e.g. "07" in "tmodel_07.pt".
        epoch_text = path.stem[prefix_length:]
        if epoch_text.isdecimal():
            return (1, int(epoch_text), path.name)
        return (0, 0, path.name)

    weights_files = list(Path(model_folder).glob(f"{MODEL_BASENAME}*"))
    if not weights_files:
        return None
    return str(max(weights_files, key=epoch_order))

In [4]:
import math
import torch
import torch.nn as nn


class Embedding(nn.Module):
    """Token-embedding lookup whose output is scaled by ``sqrt(model_dimension)``."""

    def __init__(self, vocabulary_size, model_dimension):
        super().__init__()
        self.vocabulary_size, self.model_dimension = vocabulary_size, model_dimension
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=model_dimension)

    def forward(self, src_token_ids):
        """Look up the embedding of every token id and apply the sqrt(d_model) scaling."""
        scale = math.sqrt(self.model_dimension)
        embedded = self.embedding(src_token_ids)
        return embedded * scale


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    The table is computed once for ``max_sequence_length`` positions and stored
    as the buffer ``positional_encoding`` of shape
    ``(1, max_sequence_length, model_dimension)``. Even columns hold sines and
    odd columns hold cosines of ``position / 10000^(2i / model_dimension)``.

    Args:
        model_dimension: size of the embedding vectors; odd values are
            supported (the last sine column then has no cosine partner).
        dropout_probability: dropout applied to the sum embedding + encoding.
        max_sequence_length: number of positions pre-computed in the table.
    """

    def __init__(self, model_dimension, dropout_probability, max_sequence_length=5000):
        super().__init__()
        self.sequence_length = max_sequence_length
        self.model_dimension = model_dimension
        self.dropout = nn.Dropout(dropout_probability)

        positional_encoding = torch.zeros(max_sequence_length, model_dimension)

        # Angles are computed in float64 and cast to the float32 table on assignment.
        positions = torch.arange(0, max_sequence_length, 1, dtype=torch.float64).unsqueeze(1)
        denominator = torch.exp(torch.arange(0, model_dimension, 2, dtype=torch.float64) * -math.log(10000.) / model_dimension)
        angles = positions * denominator  # (max_sequence_length, ceil(model_dimension / 2))

        positional_encoding[:, 0::2] = torch.sin(angles)
        # For an odd model_dimension there is one cosine column fewer than sine
        # columns, so only the first model_dimension // 2 frequencies are used.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : model_dimension // 2])

        # Leading batch axis so the table broadcasts over the batch in forward().
        positional_encoding = positional_encoding.unsqueeze(0)
        self.register_buffer("positional_encoding", positional_encoding)

    def forward(self, src_embedded):
        """Return ``dropout(src_embedded + encoding)`` for the first ``src_embedded.shape[1]`` positions."""
        return self.dropout(src_embedded + self.positional_encoding[:, :src_embedded.shape[1]])


class ScaledDotProductAttention(nn.Module):
    """Compute ``softmax(Q K^T / sqrt(key_dimension)) V`` with an optional mask."""

    def __init__(self, key_dimension, value_dimension):
        super().__init__()
        self.key_dimension = key_dimension
        self.value_dimension = value_dimension

    def forward(self, queries, keys, values, mask=None):
        """Return ``(attention_output, attention_weights)``.

        Positions where ``mask == 0`` are set to ``-inf`` before the softmax,
        so they receive zero weight.
        """
        scale = math.sqrt(self.key_dimension)
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale

        if mask is not None:
            # In-place on the freshly created score tensor.
            scores.masked_fill_(mask == 0, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        attended = torch.matmul(weights, values)
        return attended, weights


class MultiHeadAttention(nn.Module):
    """Multi-head attention built from four linear projections.

    Queries, keys and values are projected, split into ``number_of_heads``
    heads of size ``model_dimension // number_of_heads``, attended with
    ``ScaledDotProductAttention``, merged back and passed through the output
    projection. When ``save_weigths`` is true, the detached attention weights
    of the latest call are kept in ``attention_weigths``.
    """

    def __init__(self, model_dimension, number_of_heads, save_weigths=False):
        super().__init__()
        self.model_dimension = model_dimension
        self.number_of_heads = number_of_heads
        head_dimension = model_dimension // number_of_heads
        self.key_dimension = head_dimension
        self.value_dimension = head_dimension

        # Input projections W_Q, W_K, W_V followed by the output projection W_O.
        self.linear_queries = nn.Linear(model_dimension, model_dimension)
        self.linear_keys = nn.Linear(model_dimension, model_dimension)
        self.linear_values = nn.Linear(model_dimension, model_dimension)
        self.linear_output = nn.Linear(model_dimension, model_dimension)

        self.attention_weigths = None
        self.save_weigths = save_weigths

    def _split_heads(self, projected, head_dimension):
        """Reshape (batch, length, heads * head_dim) into (batch, heads, length, head_dim)."""
        batch_size, length = projected.shape[0], projected.shape[1]
        per_head = projected.view(batch_size, length, self.number_of_heads, head_dimension)
        return per_head.transpose(1, 2)

    def forward(self, queries, keys, values, mask=None):
        """Attend ``queries`` over ``keys``/``values`` and return the projected result."""
        head_queries = self._split_heads(self.linear_queries(queries), self.key_dimension)
        head_keys = self._split_heads(self.linear_keys(keys), self.key_dimension)
        head_values = self._split_heads(self.linear_values(values), self.value_dimension)

        attention_function = ScaledDotProductAttention(self.key_dimension, self.value_dimension)
        context, attention_weigths = attention_function(head_queries, head_keys, head_values, mask)

        # (batch, heads, length, head_dim) -> (batch, length, heads * head_dim)
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(queries.shape[0], -1, self.number_of_heads * self.key_dimension)

        if self.save_weigths:
            self.attention_weigths = attention_weigths.detach()

        return self.linear_output(merged)


class PositionwiseFeedForwardNetwork(nn.Module):
    """Two linear layers with a ReLU in between: ``linear2(relu(linear1(x)))``."""

    def __init__(self, model_dimension, inner_layer_dimension):
        super().__init__()
        # Attribute name kept as-is (including its spelling) for compatibility.
        self.modul_dimension = model_dimension
        self.inner_layer_dimension = inner_layer_dimension

        # Expansion to the inner dimension, then projection back to the model dimension.
        self.linear1 = nn.Linear(in_features=model_dimension, out_features=inner_layer_dimension)
        self.linear2 = nn.Linear(in_features=inner_layer_dimension, out_features=model_dimension)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the feed-forward network to the last dimension of ``x``."""
        return self.linear2(self.relu(self.linear1(x)))


class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, model_dimension, multihead_attention, feedforward_network, dropout_probability):
        super().__init__()
        self.model_dimension = model_dimension

        self.multihead_attention = multihead_attention
        self.feedforward_network = feedforward_network

        self.layernorm1 = nn.LayerNorm(model_dimension)
        self.layernorm2 = nn.LayerNorm(model_dimension)

        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, src_embedded, src_mask=None):
        """Run both sub-layers on ``src_embedded`` and return the normalised result."""
        # Sub-layer 1: self-attention (queries, keys and values are all the input).
        self_attended = self.multihead_attention(src_embedded, src_embedded, src_embedded, src_mask)
        after_attention = self.layernorm1(src_embedded + self.dropout(self_attended))

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.feedforward_network(after_attention)
        return self.layernorm2(after_attention + self.dropout(transformed))


class Encoder(nn.Module):
    """Stack of ``number_of_layers`` encoder layers applied one after another."""

    def __init__(self, model_dimension, number_of_layers, number_of_heads, inner_layer_dimension, dropout_probability):
        super().__init__()
        self.model_dimension = model_dimension
        self.number_of_layers = number_of_layers

        self.layers = nn.ModuleList()
        for _ in range(number_of_layers):
            # Every layer gets its own attention and feed-forward modules.
            self_attention = MultiHeadAttention(model_dimension, number_of_heads)
            feedforward = PositionwiseFeedForwardNetwork(model_dimension, inner_layer_dimension)
            self.layers.append(
                EncoderLayer(model_dimension, self_attention, feedforward, dropout_probability)
            )

    def forward(self, src_embedded, src_mask=None):
        """Feed ``src_embedded`` through every layer in order, passing ``src_mask`` to each."""
        hidden = src_embedded
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)
        return hidden


class DecoderLayer(nn.Module):
    """One decoder block with three sub-layers.

    Masked self-attention over the target, attention over the encoder output,
    and a feed-forward network. Each sub-layer output goes through dropout,
    a residual connection and layer normalisation.
    """

    def __init__(self, model_dimension, masked_multihead_attention, multihead_attention, feedforward_network, dropout_probability):
        super().__init__()
        self.model_dimension = model_dimension

        self.masked_multihead_attention = masked_multihead_attention
        self.multihead_attention = multihead_attention
        self.feedforward_network = feedforward_network

        self.layernorm1 = nn.LayerNorm(model_dimension)
        self.layernorm2 = nn.LayerNorm(model_dimension)
        self.layernorm3 = nn.LayerNorm(model_dimension)

        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, src_encoder_output, trg_embedded, src_mask=None, trg_mask=None):
        """Return the layer output for ``trg_embedded`` given the encoder output."""
        # Sub-layer 1: self-attention over the target, restricted by trg_mask.
        self_attended = self.masked_multihead_attention(
            queries=trg_embedded, keys=trg_embedded, values=trg_embedded, mask=trg_mask
        )
        after_self_attention = self.layernorm1(trg_embedded + self.dropout(self_attended))

        # Sub-layer 2: target queries attend to the encoder output, restricted by src_mask.
        cross_attended = self.multihead_attention(
            queries=after_self_attention, keys=src_encoder_output, values=src_encoder_output, mask=src_mask
        )
        after_cross_attention = self.layernorm2(after_self_attention + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.feedforward_network(after_cross_attention)
        return self.layernorm3(after_cross_attention + self.dropout(transformed))


class Decoder(nn.Module):
    """Stack of ``number_of_layers`` decoder layers applied one after another."""

    def __init__(self, model_dimension, number_of_layers, number_of_heads, inner_layer_dimension, dropout_probability):
        super().__init__()
        self.model_dimension = model_dimension
        self.number_of_layers = number_of_layers

        self.layers = nn.ModuleList()
        for _ in range(number_of_layers):
            # Every layer gets its own attention and feed-forward modules.
            masked_self_attention = MultiHeadAttention(model_dimension, number_of_heads)
            encoder_decoder_attention = MultiHeadAttention(model_dimension, number_of_heads)
            feedforward = PositionwiseFeedForwardNetwork(model_dimension, inner_layer_dimension)
            self.layers.append(
                DecoderLayer(
                    model_dimension,
                    masked_self_attention,
                    encoder_decoder_attention,
                    feedforward,
                    dropout_probability,
                )
            )

    def forward(self, src_encoder_output, trg_embedded, src_mask=None, trg_mask=None):
        """Feed ``trg_embedded`` through every layer; each layer sees the same encoder output and masks."""
        hidden = trg_embedded
        for decoder_layer in self.layers:
            hidden = decoder_layer(src_encoder_output, hidden, src_mask=src_mask, trg_mask=trg_mask)
        return hidden


class Transformer(nn.Module):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    ``forward`` returns the raw output of the final linear projection (no
    softmax is applied). ``encode`` and ``decode`` expose the two halves
    separately so the encoder output can be reused across decoding steps.
    """

    def __init__(self, model_dimension, inner_layer_dimension, number_of_layers, number_of_heads, src_vocabulary_size, trg_vocabulary_size, dropout_probability):
        super().__init__()
        self.model_dimension = model_dimension
        self.number_of_layers = number_of_layers
        self.number_of_heads = number_of_heads
        self.src_vocabulary_size = src_vocabulary_size
        self.trg_vocabulary_size = trg_vocabulary_size

        # Separate embedding tables and positional encodings for source and target.
        self.input_embedding = Embedding(src_vocabulary_size, model_dimension)
        self.output_embedding = Embedding(trg_vocabulary_size, model_dimension)

        self.input_pos_encoding = PositionalEncoding(model_dimension, dropout_probability)
        self.output_pos_encoding = PositionalEncoding(model_dimension, dropout_probability)

        self.encoder = Encoder(model_dimension, number_of_layers, number_of_heads, inner_layer_dimension, dropout_probability)
        self.decoder = Decoder(model_dimension, number_of_layers, number_of_heads, inner_layer_dimension, dropout_probability)

        # Maps decoder states to target-vocabulary logits.
        self.linear_projection = nn.Linear(model_dimension, trg_vocabulary_size)
        self.softmax = nn.Softmax(dim=-1)

    def encode(self, src_token_ids, src_mask=None):
        """Embed the source token ids, add positions and run the encoder stack."""
        embedded_source = self.input_pos_encoding(self.input_embedding(src_token_ids))
        return self.encoder(embedded_source, src_mask)

    def decode(self, input_encoded, trg_token_ids, src_mask, trg_mask=None):
        """Embed the target token ids, add positions and run the decoder stack."""
        embedded_target = self.output_pos_encoding(self.output_embedding(trg_token_ids))
        return self.decoder(input_encoded, embedded_target, src_mask, trg_mask)

    def forward(self, src_token_ids, trg_token_ids, src_mask=None, trg_mask=None):
        """Encode the source, decode the target and return vocabulary logits."""
        input_encoded = self.encode(src_token_ids, src_mask)
        output_decoded = self.decode(input_encoded, trg_token_ids, src_mask, trg_mask)
        return self.linear_projection(output_decoded)

In [5]:
class EnglishToFrenchDataset(Dataset):
    """Wrap a translation dataset and emit fixed-length tensors for the Transformer.

    Every item is tokenised, truncated to fit ``sequence_length`` and padded:

    * ``encoder_input``: ``[SOS] source tokens [EOS] [PAD]...``
    * ``decoder_input``: ``[SOS] target tokens [PAD]...``
    * ``label``:         ``target tokens [EOS] [PAD]...``

    The encoder side is framed and masked with the *source* tokenizer's
    special-token ids and the decoder side with the *target* tokenizer's, so
    the result stays correct even if the two vocabularies assign different
    ids to the special tokens.

    Args:
        dataset: indexable collection whose items expose
            ``item['translation'][language]``.
        src_tokenizer / trg_tokenizer: tokenizers providing ``encode(text).ids``
            and ``token_to_id(token)``.
        src_language / trg_language: keys of the ``translation`` mapping.
        sequence_length: length of every returned sequence tensor.
    """

    def __init__(self, dataset, src_tokenizer, trg_tokenizer, src_language, trg_language, sequence_length):
        super().__init__()
        self.sequence_length = sequence_length

        self.dataset = dataset
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
        self.src_language = src_language
        self.trg_language = trg_language

        # Target-side special tokens (original attribute names kept).
        self.sos_token = self._special_token(trg_tokenizer, SOS_TOKEN)
        self.eos_token = self._special_token(trg_tokenizer, EOS_TOKEN)
        self.pad_token = self._special_token(trg_tokenizer, PAD_TOKEN)

        # Source-side special tokens, used for the encoder input and its mask.
        self.src_sos_token = self._special_token(src_tokenizer, SOS_TOKEN)
        self.src_eos_token = self._special_token(src_tokenizer, EOS_TOKEN)
        self.src_pad_token = self._special_token(src_tokenizer, PAD_TOKEN)

    @staticmethod
    def _special_token(tokenizer, token):
        """Return the id of ``token`` in ``tokenizer`` as a 1-element int64 tensor."""
        return torch.tensor([tokenizer.token_to_id(token)], dtype=torch.int64)

    @staticmethod
    def _padding(pad_token, count):
        """Return ``count`` copies of the pad id as an int64 tensor (empty when ``count`` is 0)."""
        return torch.full((count,), pad_token.item(), dtype=torch.int64)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tensors, masks and raw texts for example ``idx``.

        Raises:
            ValueError: if ``sequence_length`` is too small to hold the
                special tokens.
        """
        src_target_pair = self.dataset[idx]
        src_text = src_target_pair['translation'][self.src_language]
        trg_text = src_target_pair['translation'][self.trg_language]

        # Transform the text into tokens
        enc_input_tokens = self.src_tokenizer.encode(src_text).ids
        dec_input_tokens = self.trg_tokenizer.encode(trg_text).ids

        # Truncate to the room left after the special tokens: the encoder input
        # carries both SOS and EOS, while the decoder input carries only SOS
        # (and the label only EOS), so the target may keep one token more.
        enc_input_tokens = enc_input_tokens[:max(self.sequence_length - 2, 0)]
        dec_input_tokens = dec_input_tokens[:max(self.sequence_length - 1, 0)]

        enc_num_padding_tokens = self.sequence_length - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.sequence_length - len(dec_input_tokens) - 1

        # Only reachable when sequence_length cannot even hold the special tokens.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        # [SOS] source [EOS] [PAD]...
        encoder_input = torch.cat(
            [
                self.src_sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.src_eos_token,
                self._padding(self.src_pad_token, enc_num_padding_tokens),
            ],
            dim=0,
        )

        # [SOS] target [PAD]...
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self._padding(self.pad_token, dec_num_padding_tokens),
            ],
            dim=0,
        )

        # target [EOS] [PAD]...  (decoder_input shifted left by one position)
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                self._padding(self.pad_token, dec_num_padding_tokens),
            ],
            dim=0,
        )

        # Double check the size of the tensors to make sure they are all sequence_length long
        assert encoder_input.size(0) == self.sequence_length
        assert decoder_input.size(0) == self.sequence_length
        assert label.size(0) == self.sequence_length

        # True where attention is allowed: non-padding positions, and for the
        # decoder additionally only positions at or before the current one.
        encoder_mask = (encoder_input != self.src_pad_token).unsqueeze(0).unsqueeze(0)
        decoder_mask = (decoder_input != self.pad_token).unsqueeze(0) & causal_mask(decoder_input.size(0))

        return {
            "encoder_input": encoder_input,  # (sequence_length)
            "decoder_input": decoder_input,  # (sequence_length)
            "encoder_mask": encoder_mask,  # (1, 1, sequence_length)
            "decoder_mask": decoder_mask,  # (1, sequence_length) & (1, sequence_length, sequence_length)
            "label": label,  # (sequence_length)
            "src_text": src_text,
            "trg_text": trg_text,
        }

def causal_mask(size):
    """Return a ``(1, size, size)`` boolean mask that is True where column <= row.

    Row ``i`` therefore allows positions ``0..i`` and blocks every later one.
    """
    positions = torch.arange(size)
    # allowed[i][j] is True when key position j is not after query position i.
    allowed = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return allowed.unsqueeze(0)


def get_all_sentences(dataset, language):
    """Lazily yield the ``language`` side of every translation pair in ``dataset``."""
    yield from (example["translation"][language] for example in dataset)


def get_tokenizer(dataset, language):
    """Load the word-level tokenizer for ``language``, training and saving it first if needed.

    The tokenizer is cached in ``tokenizer_<language>.json`` in the current
    directory. When that file exists it is loaded as-is and ``dataset`` is
    not read.
    """
    tokenizer_path = Path("tokenizer_{0}.json".format(language))

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=[UNK_TOKEN, PAD_TOKEN, SOS_TOKEN, EOS_TOKEN],
        min_frequency=2,
    )
    sentences = get_all_sentences(dataset, language)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def get_dataset():
    """Load the corpus, build tokenizers and return train/validation dataloaders.

    Steps: load the ``train`` split of ``DATASET_NAME`` for the configured
    language pair, keep a shuffled quarter of it, build (or load) both
    tokenizers, drop pairs longer than ``SEQUENCE_LENGTH - 2`` tokens on either
    side, split 80/20 and wrap both parts in ``EnglishToFrenchDataset``.

    Returns:
        tuple: ``(train_dataloader, val_dataloader, src_tokenizer, trg_tokenizer)``.
        The validation loader uses ``batch_size=1``.
    """
    language_pair = SOURCE_LANGUAGE + "-" + TARGET_LANGUAGE
    dataset = load_dataset(DATASET_NAME, language_pair, split="train")
    # NOTE(review): shuffle() and random_split() below are unseeded, so the subset
    # and the split change on every run, while get_tokenizer() reuses a cached
    # vocabulary file - confirm this is intended.
    dataset = dataset.shuffle().select(range(len(dataset) // 4))

    src_tokenizer = get_tokenizer(dataset, SOURCE_LANGUAGE)
    trg_tokenizer = get_tokenizer(dataset, TARGET_LANGUAGE)

    def token_counts(example):
        """Return the (source, target) token counts of one translation pair."""
        pair = example['translation']
        src_count = len(src_tokenizer.encode(pair[SOURCE_LANGUAGE]).ids)
        trg_count = len(trg_tokenizer.encode(pair[TARGET_LANGUAGE]).ids)
        return src_count, trg_count

    max_length = SEQUENCE_LENGTH - 2

    def is_valid_length(example):
        src_count, trg_count = token_counts(example)
        return src_count <= max_length and trg_count <= max_length

    dataset = dataset.filter(is_valid_length)
    print(f"Dataset filtred: {len(dataset)} examples left")

    train_size = int(0.8 * len(dataset))
    validation_size = len(dataset) - train_size
    train_subset, validation_subset = random_split(dataset, [train_size, validation_size])

    train_dataset = EnglishToFrenchDataset(train_subset, src_tokenizer, trg_tokenizer, SOURCE_LANGUAGE, TARGET_LANGUAGE, SEQUENCE_LENGTH)
    validation_dataset = EnglishToFrenchDataset(validation_subset, src_tokenizer, trg_tokenizer, SOURCE_LANGUAGE, TARGET_LANGUAGE, SEQUENCE_LENGTH)

    # Report the longest source and target sentence (in tokens) that survived the filter.
    src_max_length = 0
    trg_max_length = 0
    for example in dataset:
        src_count, trg_count = token_counts(example)
        src_max_length = max(src_max_length, src_count)
        trg_max_length = max(trg_max_length, trg_count)

    print(f'Max length of source sentence: {src_max_length}')
    print(f'Max length of target sentence: {trg_max_length}')

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    # Validation sentences are processed one by one.
    val_dataloader = DataLoader(validation_dataset, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, src_tokenizer, trg_tokenizer

In [None]:
def get_transformer(src_vocab_size, trg_vocab_size):
    """Build a ``Transformer`` from the module-level hyper-parameters and the two vocabulary sizes."""
    hyperparameters = {
        "model_dimension": MODEL_DIMENSION,
        "inner_layer_dimension": MODEL_INNER_LAYER_DIMENSION,
        "number_of_layers": MODEL_NUMBER_OF_LAYERS,
        "number_of_heads": MODEL_NUMBER_OF_HEADS,
        "dropout_probability": MODEL_DROPOUT_PROBABILITY,
    }
    return Transformer(
        src_vocabulary_size=src_vocab_size,
        trg_vocabulary_size=trg_vocab_size,
        **hyperparameters,
    )


def greedy_decode(model, source, source_mask, src_tokenizer, trg_tokenizer, max_len, device):
    """Greedily decode one source sentence into target token ids.

    The encoder runs once; the decoder is then called repeatedly, each time
    appending the highest-scoring next token, until ``[EOS]`` is produced or
    the sequence holds ``max_len`` tokens.

    Args:
        model: object exposing ``encode``, ``decode`` and ``linear_projection``.
        source: source token ids with a batch dimension of 1.
        source_mask: mask passed to the encoder and to the decoder's
            encoder-decoder attention.
        src_tokenizer: unused; kept for interface compatibility.
        trg_tokenizer: target tokenizer, used to look up the special-token ids.
        max_len: maximum number of tokens in the returned sequence
            (including the leading ``[SOS]``).
        device: device on which the decoder input is built.

    Returns:
        1-D tensor of token ids starting with ``[SOS]``.
    """
    # Shared constants instead of repeated string literals, so the ids always
    # match the tokens the tokenizers were trained with.
    sos_idx = trg_tokenizer.token_to_id(SOS_TOKEN)
    eos_idx = trg_tokenizer.token_to_id(EOS_TOKEN)

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source, source_mask)
    # Initialize the decoder input with the sos token
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    # "<" rather than "==" so a max_len below 1 cannot loop without bound.
    while decoder_input.size(1) < max_len:
        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # calculate output
        out = model.decode(encoder_output, decoder_input, source_mask, decoder_mask)

        # Pick the next token from the logits of the last position; softmax is
        # monotonic, so it is not needed to find the arg-max.
        logits = model.linear_projection(out[:, -1])
        next_word = torch.argmax(logits, dim=1)
        next_token = next_word.view(1, 1).to(dtype=source.dtype, device=device)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)

        if next_word.item() == eos_idx:
            break

    return decoder_input.squeeze(0)


def run_validation(model, validation_ds, src_tokenizer, trg_tokenizer, max_len, device, print_msg, global_step, writer, num_examples=2):
    """Decode a few validation sentences, print them and log text metrics.

    Puts ``model`` in eval mode, greedily decodes batches from
    ``validation_ds`` (batch size must be 1) until ``num_examples`` have been
    processed, and reports source / target / prediction through ``print_msg``.
    When ``writer`` is truthy, character error rate, word error rate and BLEU
    of the decoded examples are written to it under ``global_step`` and
    printed.

    Args:
        model: model handed to ``greedy_decode``.
        validation_ds: iterable of batches with keys ``encoder_input``,
            ``encoder_mask``, ``src_text`` and ``trg_text``.
        src_tokenizer / trg_tokenizer: tokenizers; the target one decodes the
            predicted ids back to text.
        max_len: maximum decoded length passed to ``greedy_decode``.
        device: device the batch tensors are moved to.
        print_msg: callable taking one string.
        global_step: step index attached to the logged scalars.
        writer: object with ``add_scalar`` / ``flush``, or a falsy value to
            skip metric logging.
        num_examples: number of batches to decode before stopping.
    """
    # Local import keeps this function self-contained.
    import shutil

    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    # Width of the separator lines. Falls back to 80 columns when no terminal
    # is attached (e.g. inside a notebook) instead of spawning `stty` and
    # swallowing every exception.
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
    separator = '-' * console_width

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device)  # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device)  # (b, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, src_tokenizer, trg_tokenizer, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["trg_text"][0]
            model_out_text = trg_tokenizer.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Print the source, target and model output
            print_msg(separator)
            print_msg(f"{'SOURCE: ':>12}{source_text}")
            print_msg(f"{'TARGET: ':>12}{target_text}")
            print_msg(f"{'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg(separator)
                break

    if writer:
        # Metrics are computed over the decoded examples only.
        cer = torchmetrics.CharErrorRate()(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)

        wer = torchmetrics.WordErrorRate()(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)

        bleu = torchmetrics.BLEUScore()(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)

        writer.flush()

        print(f"CER: {cer:.4f} | WER: {wer:.4f} | BLEU: {bleu:.4f}")


def learning_rate(step):
    """Return the warm-up / inverse-square-root schedule value for ``step``.

    The value is ``MODEL_DIMENSION^-0.5 * min(step^-0.5, step * WARMUP_STEPS^-1.5)``;
    step 0 is treated as step 1.
    """
    effective_step = step if step != 0 else 1
    decay_term = effective_step ** (-0.5)
    warmup_term = effective_step * WARMUP_STEPS ** (-1.5)
    return MODEL_DIMENSION ** (-0.5) * min(decay_term, warmup_term)


def train_model():
    """Train the translation transformer, validating and checkpointing after every epoch.

    Dataloaders and tokenizers come from ``get_dataset`` and the model from
    ``get_transformer``. Optimisation uses Adam with a fixed learning rate (the
    warm-up scheduler is left disabled). When ``MODEL_PRELOAD`` is set, the model
    weights, optimizer state, epoch and global step are restored from that checkpoint
    before training continues. The per-step training loss is written to a TensorBoard
    ``SummaryWriter`` created on ``EXPERIMENT_FOLDER``; the writer is closed when
    training ends, including when it is interrupted by an exception.
    """
    # Define the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    if device.type == 'cuda':
        # Pass the torch.device itself. Evaluating `.index` on the plain "cuda" string
        # yields the bound `str.index` method, not a device ordinal.
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    torch.cuda.empty_cache()

    # Make sure the weights folder exists
    Path(f"{DATASET_NAME}_{MODEL_FOLDER}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, src_tokenizer, trg_tokenizer = get_dataset()
    trg_vocab_size = trg_tokenizer.get_vocab_size()
    model = get_transformer(src_tokenizer.get_vocab_size(), trg_vocab_size).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(BETA1, BETA2), eps=EPSILON)
    # The warm-up schedule is disabled, so the learning rate stays constant.
    #lr_scheduler = LambdaLR(optimizer=optimizer, lr_lambda=lambda step: learning_rate(step))

    # If the user specified a model to preload before training, load it
    initial_epoch = 1
    global_step = 0
    preload = MODEL_PRELOAD
    if preload == 'latest':
        model_filename = latest_weights_file_path()
    elif preload:
        model_filename = get_weights_file_path(preload)
    else:
        model_filename = None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location allows a checkpoint saved on GPU to be resumed on a CPU-only runtime
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_function = nn.CrossEntropyLoss(ignore_index=trg_tokenizer.token_to_id('[PAD]'), label_smoothing=MODEL_LABEL_SMOOTHING_VALUE).to(device)

    # Tensorboard
    writer = SummaryWriter(EXPERIMENT_FOLDER)
    try:
        for epoch in range(initial_epoch, NUMBER_OF_EPOCHS + 1):
            torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
            for batch in batch_iterator:

                encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
                decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
                encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

                # Run the tensors through the encoder, decoder and the projection layer
                model_output = model(encoder_input, decoder_input, encoder_mask, decoder_mask)

                # Compare the output with the label
                label = batch['label'].to(device) # (B, seq_len)

                # Cross entropy over the flattened (token, vocabulary) predictions
                loss = loss_function(model_output.view(-1, trg_vocab_size), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Log the loss
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Backpropagate the loss
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Update the weights
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                #lr_scheduler.step()

                global_step += 1

            # Run validation at the end of every epoch
            run_validation(model, val_dataloader, src_tokenizer, trg_tokenizer, SEQUENCE_LENGTH, device, lambda msg: batch_iterator.write(msg), global_step, writer)

            # Save the model at the end of every epoch
            model_filename = get_weights_file_path(f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Release the event-file handle even if training is interrupted
        writer.close()

In [12]:
# Silence every warning category for the duration of the training run.
warnings.simplefilter("ignore")
train_model()

Using device: cuda
Device name: Tesla T4
Device memory: 14.74127197265625 GB


Filter:   0%|          | 0/31771 [00:00<?, ? examples/s]

Dataset filtred: 31771 examples left
Max length of source sentence: 309
Max length of target sentence: 320
No model to preload, starting from scratch


Processing Epoch 01: 100%|██████████| 3177/3177 [07:42<00:00,  6.86it/s, loss=5.609]


--------------------------------------------------------------------------------
    SOURCE: "My Lord, excuse me! I speak as I can; I restrain myself.
    TARGET: -- Milord, excusez-moi! je parle comme je puis; je me contiens.
 PREDICTED: -- Je me me me me me me me , je me , je me , je me .
--------------------------------------------------------------------------------
    SOURCE: Afterwards he declared that the mates could not even know whether it was round or square, he had rushed along so swiftly.
    TARGET: Ensuite, il donna sa parole que les camarades ne devaient pas meme savoir si elle l'avait rond ou carré, tellement il galopait raide.
 PREDICTED: Il était bien que si bien qu ' il ne pouvait être pas que si bien qu ' il ne pouvait être si bien qu ' il ne pouvait être .
--------------------------------------------------------------------------------


Processing Epoch 02: 100%|██████████| 3177/3177 [07:46<00:00,  6.81it/s, loss=4.983]


--------------------------------------------------------------------------------
    SOURCE: "Yes, sir."
    TARGET: – Oui, monsieur.
 PREDICTED: -- Oui , monsieur .
--------------------------------------------------------------------------------
    SOURCE: "Always 'duty.' I am sick of the word.
    TARGET: Toujours les devoirs, je suis assommé de ces mots-là.
 PREDICTED: -- Je suis bien que je suis bien que je suis le plus de la plus .
--------------------------------------------------------------------------------


Processing Epoch 03: 100%|██████████| 3177/3177 [07:46<00:00,  6.82it/s, loss=5.276]


--------------------------------------------------------------------------------
    SOURCE: 'What pleasure, what instinct leads them to betray us?
    TARGET: Quel plaisir, quel instinct les portent à nous tromper !
 PREDICTED: – Que - vous , que nous avons dit - il , de nous ?
--------------------------------------------------------------------------------
    SOURCE: – Moi, Madame !
    TARGET: "I, Signora!
 PREDICTED: " " " " " " " " " " " " " " " " " " " " " " " " " " " "
--------------------------------------------------------------------------------


Processing Epoch 04: 100%|██████████| 3177/3177 [07:45<00:00,  6.82it/s, loss=5.404]


--------------------------------------------------------------------------------
    SOURCE: Bovary during this time did not dare to stir from his house.
    TARGET: Bovary, pendant ce temps-là, n’osait bouger de sa maison.
 PREDICTED: Il ne fallait pas de temps à ce temps , de temps , de la maison .
--------------------------------------------------------------------------------
    SOURCE: "The frigate?" Conseil replied, rolling over on his back. "I think master had best not depend on it to any great extent!"
    TARGET: -- La frégate ! répondit Conseil en se retournant sur le dos, je crois que monsieur fera bien de ne pas trop compter sur elle !
 PREDICTED: -- Le maître ! répondit Conseil , Conseil , sans doute , qui n ' avait pas de la mer ?
--------------------------------------------------------------------------------


Processing Epoch 05: 100%|██████████| 3177/3177 [07:45<00:00,  6.83it/s, loss=5.383]


--------------------------------------------------------------------------------
    SOURCE: They no longer felt the cold, these burning words had warmed them to the bone.
    TARGET: Ils ne sentaient plus le froid, ces ardentes paroles les avaient chauffés aux entrailles.
 PREDICTED: Ils avaient des mots , les paroles , les , les .
--------------------------------------------------------------------------------
    SOURCE: I approached my cheek to her lips: she would not touch it. She said I oppressed her by leaning over the bed, and again demanded water. As I laid her down--for I raised her and supported her on my arm while she drank--I covered her ice-cold and clammy hand with mine: the feeble fingers shrank from my touch--the glazing eyes shunned my gaze.
    TARGET: J'approchai ma joue de ses lèvres, mais elle ne la toucha pas: elle me dit que je l'oppressais en me penchant sur son lit, et me redemanda de l'eau; lorsque je la recouchai, car je l'avais soulevée avec mon bras pendan

Processing Epoch 06: 100%|██████████| 3177/3177 [07:45<00:00,  6.83it/s, loss=4.864]


--------------------------------------------------------------------------------
    SOURCE: Mais aussitôt que Clélia n’eut plus d’inquiétudes de ce côté, elle fut plus cruellement agitée encore par ses justes remords.
    TARGET: But as soon as Clelia had no longer any anxiety in that direction, she was even more cruelly tormented by her just remorse.
 PREDICTED: the of the of the of the of the of the of the of the of the .
--------------------------------------------------------------------------------
    SOURCE: Afterwards he declared that the mates could not even know whether it was round or square, he had rushed along so swiftly.
    TARGET: Ensuite, il donna sa parole que les camarades ne devaient pas meme savoir si elle l'avait rond ou carré, tellement il galopait raide.
 PREDICTED: Il était si il ne pouvait pas même s ’ il était si bien , il ne pouvait pas même , il ne pouvait pas même s ’ il était si le voir si bien , il était si le pouvait même , il ne pouvait même , il étai

Processing Epoch 07: 100%|██████████| 3177/3177 [07:45<00:00,  6.82it/s, loss=4.533]


--------------------------------------------------------------------------------
    SOURCE: "What’s the matter now, Planchet?" demanded d’Artagnan.
    TARGET: -- Qu'y a-t-il donc? demanda d'Artagnan.
 PREDICTED: -- Qu ' est - ce que vous ? dit d ' Artagnan .
--------------------------------------------------------------------------------
    SOURCE: I could not help it. I thought of him now--in his room--watching the sunrise; hoping I should soon come to say I would stay with him and be his.
    TARGET: Je ne pouvais m'empêcher de songer avec agonie à ce que j'avais laissé, à celui qui épiait dans sa chambre le lever du soleil, espérant me voir bientôt arriver pour lui dire que je voulais bien lui appartenir et rester près de lui.
 PREDICTED: Je ne pouvais pas me dire ; je lui dis que je ne pouvais pas me dire ; je lui dire , et je lui dis que je ne pouvais pas me dire .
--------------------------------------------------------------------------------


Processing Epoch 08: 100%|██████████| 3177/3177 [07:46<00:00,  6.81it/s, loss=4.337]


--------------------------------------------------------------------------------
    SOURCE: Il était immédiatement suivi d’un valet de chambre apportant une tasse de café infiniment petite, soutenue par un pied d’argent en filigrane ; et toutes les demi-heures un maître d’hôtel, portant épée et habit magnifique à la française, venait offrir des glaces.
    TARGET: He was immediately followed by a footman carrying an infinitesimal cup of coffee, supported on a stem of silver filigree; and every half hour a butler, wearing a sword and a magnificent coat, in the French style, brought round ices.
 PREDICTED: He was a of a , and , and a of the of a of a of a of a .
--------------------------------------------------------------------------------
    SOURCE: The commendation bestowed on him by Mrs. Reynolds was of no trifling nature.
    TARGET: Reynolds n’étaient pas de qualité ordinaire et quelle louange a plus de valeur que celle d’un serviteur intelligent ?
 PREDICTED: Le jour de Mme Fai

Processing Epoch 09: 100%|██████████| 3177/3177 [07:46<00:00,  6.81it/s, loss=4.572]


--------------------------------------------------------------------------------
    SOURCE: "That is my name," said Athos, quietly.
    TARGET: -- C'est mon nom, dit tranquillement Athos.
 PREDICTED: -- C ' est mon nom , dit Athos .
--------------------------------------------------------------------------------
    SOURCE: I was thus steeped in the marvellous ecstasy which all high summits develop in the mind; and now without giddiness, for I was beginning to be accustomed to these sublime aspects of nature. My dazzled eyes were bathed in the bright flood of the solar rays.
    TARGET: Je me plongeais ainsi dans cette prestigieuse extase que donnent les hautes cimes, et cette fois, sans vertige, car je m'accoutumais enfin à ces sublimes contemplations.
 PREDICTED: Mon esprit était ainsi , sans être ainsi ; car je me ainsi , sans être ainsi , je me ainsi , et je me mis à la nature , sans être ainsi que les yeux de la nature de l ' esprit de l ' état de la nature .
--------------------

Processing Epoch 10: 100%|██████████| 3177/3177 [07:45<00:00,  6.82it/s, loss=4.246]


--------------------------------------------------------------------------------
    SOURCE: I spent the evening in reading, writing, and thinking.
    TARGET: Je passai la soirée à lire, à écrire, à penser.
 PREDICTED: Je me mis en effet , je me mis dans la soirée .
--------------------------------------------------------------------------------
    SOURCE: I thought I might have retorted the question on him who put it: but I would not take that freedom.
    TARGET: Je pensai que j'aurais bien pu lui retourner sa question; mais n'osant pas prendre cette liberté, je lui répondis:
 PREDICTED: Je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas que je ne savais pas .
----

Processing Epoch 11: 100%|██████████| 3177/3177 [07:46<00:00,  6.82it/s, loss=4.689]


--------------------------------------------------------------------------------
    SOURCE: But in any case it was a blow to him, and it would take him some time before he could prepare himself to meet it.
    TARGET: Mais en toute éventualité, ce serait un coup, et un peu de temps lui serait nécessaire pour qu’il pût s’y préparer.
 PREDICTED: Mais il ne pouvait le faire , il lui un coup d ' oeil , et il l ' avait pu le faire un coup .
--------------------------------------------------------------------------------
    SOURCE: Miss Cunegund could scarce refrain from laughing at the good old woman, and thought it droll enough to pretend to a greater share of misfortunes than her own.
    TARGET: Cunégonde se mit presque à rire, et trouva cette bonne femme fort plaisante de prétendre être plus malheureuse qu'elle.
 PREDICTED: Mlle Temple , si la vieille femme pouvait la rendre assez grande pour la vieille , et , en faire une vieille femme de sa vieille femme .
--------------------------

Processing Epoch 12: 100%|██████████| 3177/3177 [07:46<00:00,  6.81it/s, loss=5.230]


--------------------------------------------------------------------------------
    SOURCE: Then added, repulsing him with a languid movement—
    TARGET: Puis, elle ajoutait en le repoussant d’un geste langoureux:
 PREDICTED: Alors , le , avec une .
--------------------------------------------------------------------------------
    SOURCE: They've all cleared out of the place."
    TARGET: Tous ont fichu le camp.
 PREDICTED: Ils ont fait tout le lieu de la place .
--------------------------------------------------------------------------------


Processing Epoch 13: 100%|██████████| 3177/3177 [07:46<00:00,  6.82it/s, loss=4.889]


--------------------------------------------------------------------------------
    SOURCE: The result of the rally in the last round had convinced his seconds that when it came to give-and-take hitting, their hardy and powerful man was likely to have the better of it.
    TARGET: Le résultat du repos après le dernier round avait convaincu les seconds que leur champion, avec son endurance et sa vigueur, devait avoir le dessus quand il s'agissait de recevoir et de rendre des coups.
 PREDICTED: Le dernier homme de ses et les de leur côté , quand il fut mieux pour prendre le dernier homme , il était évident que les de leur homme .
--------------------------------------------------------------------------------
    SOURCE: We moved cautiously along the track as if we were bound for the house, but Holmes halted us when we were about two hundred yards from it.
    TARGET: Nous avançâmes avec précaution sur le chemin comme si nous nous rendions à la maison, mais Holmes stoppa à deux cents mè

Processing Epoch 14: 100%|██████████| 3177/3177 [07:44<00:00,  6.84it/s, loss=4.334]


--------------------------------------------------------------------------------
    SOURCE: "Ay, pray do," said Candide, "and be sure you make them sensible of the horrid barbarity of boiling and roasting human creatures, and how little of Christianity there is in such practices."
    TARGET: Ne manquez pas, dit Candide, de leur représenter quelle est l'inhumanité affreuse de faire cuire des hommes, et combien cela est peu chrétien.
 PREDICTED: Candide est sûr de les , dit Candide , et il y a de les , et les de la petite personne de les .
--------------------------------------------------------------------------------
    SOURCE: In single file they still went on without a word, by the tiny flame of the lamps.
    TARGET: Un par un, ils allaient, ils allaient toujours, sans une parole, avec les petites flammes des lampes.
 PREDICTED: En effet , les sept heures , sans rien dire un seul mot , ils par le mot de la parole .
-----------------------------------------------------------------

Processing Epoch 15: 100%|██████████| 3177/3177 [07:43<00:00,  6.85it/s, loss=5.144]


--------------------------------------------------------------------------------
    SOURCE: The sketch of Rosamond's portrait pleased him highly: he said I must make a finished picture of it. He insisted, too, on my coming the next day to spend the evening at Vale Hall.
    TARGET: L'esquisse du portrait de Rosamonde lui plut beaucoup; il me demanda d'en faire une peinture aussi perfectionnée que possible; il me pria aussi de venir le lendemain passer la soirée à Vale-Hall.
 PREDICTED: Il me dit que le lendemain , il devait être trop vite , et il fallait en lui faire une parole , en lui une parole .
--------------------------------------------------------------------------------
    SOURCE: "Because we have very important matters to communicate to one another, and it was impossible to talk five minutes in that inn without being annoyed by all those importunate fellows, who keep coming in, saluting you, and addressing you. Here at least," said Athos, pointing to the bastion, "they will

In [9]:
def test_overfitting(num_steps=100, lr=1e-3, loss_threshold=0.1, log_every=20):
    """Sanity-check the model by trying to overfit one single training batch.

    A fresh model is built with ``get_transformer`` and trained with Adam on the first
    batch yielded by the training dataloader. The loss is printed every ``log_every``
    steps, and a verdict is printed depending on whether the last loss is below
    ``loss_threshold``.

    Args:
        num_steps: Number of optimisation steps to run (at least 1).
        lr: Learning rate passed to Adam.
        loss_threshold: Final loss below which the model is considered able to overfit.
        log_every: Interval, in steps, between loss printouts (at least 1).

    Raises:
        ValueError: If ``num_steps`` or ``log_every`` is smaller than 1.
    """
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1")
    if log_every < 1:
        raise ValueError("log_every must be at least 1")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)
    train_dataloader, val_dataloader, src_tokenizer, trg_tokenizer = get_dataset()
    trg_vocab_size = trg_tokenizer.get_vocab_size()
    model = get_transformer(src_tokenizer.get_vocab_size(), trg_vocab_size).to(device)

    # Take one single batch and move it to the device once; every step reuses it
    single_batch = next(iter(train_dataloader))
    encoder_input = single_batch['encoder_input'].to(device)
    decoder_input = single_batch['decoder_input'].to(device)
    encoder_mask = single_batch['encoder_mask'].to(device)
    decoder_mask = single_batch['decoder_mask'].to(device)
    label = single_batch['label'].to(device)

    # The loss module does not change between steps, so build it once
    loss_function = nn.CrossEntropyLoss(ignore_index=trg_tokenizer.token_to_id('[PAD]'))

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    print("Testing overfitting on single batch...")
    for step in range(num_steps):
        model_output = model(encoder_input, decoder_input, encoder_mask, decoder_mask)
        loss = loss_function(model_output.view(-1, trg_vocab_size), label.view(-1))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % log_every == 0:
            print(f"Step {step}: Loss = {loss.item():.4f}")

    # `loss` is the value computed before the last weight update
    if loss.item() < loss_threshold:
        print("Model CAN overfit - the architecture is working")
    else:
        print("Model CANNOT overfit - there's a fundamental problem")

# Run the single-batch overfitting sanity check with its default settings.
test_overfitting()

Using device: cuda


Filter:   0%|          | 0/31771 [00:00<?, ? examples/s]

Dataset filtred: 31771 examples left
Max length of source sentence: 376
Max length of target sentence: 384
Testing overfitting on single batch...
Step 0: Loss = 10.2242
Step 20: Loss = 1.0047
Step 40: Loss = 0.1244
Step 60: Loss = 0.0407
Step 80: Loss = 0.0330
Model CAN overfit - the architecture is working


In [14]:
from google.colab import drive
drive.mount('/content/drive')

import shutil
import os

# Back up the training artefacts to Google Drive.
# Every local folder is copied into a sub-folder of the same name under the project
# directory, so the contents of different folders are never merged into one directory.
source_folder = 'Helsinki-NLP'
destination_folder = '/content/drive/MyDrive/transformer_project/Helsinki-NLP'
drive_project_root = os.path.dirname(destination_folder)

for folder_name in (source_folder, 'data', 'runs', 'models'):
    # Check each folder on its own so one missing folder does not abort the backup
    if not os.path.exists(folder_name):
        print(f"Skipped folder {folder_name}: not found")
        continue
    drive_target = os.path.join(drive_project_root, folder_name)
    shutil.copytree(folder_name, drive_target, dirs_exist_ok=True)
    print(f"Copied folder {folder_name} to Google Drive")

Mounted at /content/drive
Copied folder Helsinki-NLP to Google Drive
Copied folder data to Google Drive
Copied folder runs to Google Drive
Copied folder models to Google Drive
