<a href="https://colab.research.google.com/github/johnkidsm/FT-scripts/blob/main/validate_transformer_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
from model import build_transformer
from dataset import TranslationDataset, causal_mask
from config import get_config, get_weights_file_path, latest_weights_file_path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import warnings
from tqdm import tqdm
import os
from pathlib import Path

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [114]:
%matplotlib inline

In [107]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.5.1-py3-none-any.whl (890 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.6/890.6 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.5.1


In [120]:
!tensorboard --logdir=runs

2024-10-26 20:44:24.623382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-26 20:44:24.644045: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-26 20:44:24.651189: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1729975467.281076   39642 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729975467.356598   39642 cuda_executor.cc:1015] succ

In [118]:
def get_all_sentences(ds, lang):
    for item in ds:
        yield item['translation'][lang]

def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

def get_ds(config):
    # It only has the train split, so we divide it overselves
    ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')

    # Build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Keep 90% for training, 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = TranslationDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq'])
    val_ds = TranslationDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq'])

    # Find the maximum length of each sentence in the source and target sentence
    max_len_src = 0
    max_len_tgt = 0

    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')


    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [110]:
model_path = "./tmodel_01.pt"
model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq"], config['seq'], d_model=512, N=6, h=8)  # Load model with appropriate arguments


state_dict = torch.load(model_path)

# Remove unexpected keys
for key in ['epoch', 'model_state_dict', 'optimizer_state_dict', 'global_step']:
    if key in state_dict:
        state_dict.pop(key)

model.load_state_dict(torch.load(model_path), strict=False)

model.to(device)


  state_dict = torch.load(model_path)
  model.load_state_dict(torch.load(model_path), strict=False)


Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization

In [111]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps or torch.backends.mps.is_available() else "cpu"
print("Using device:", device)
if (device == 'cuda'):
    print(f"Device name: {torch.cuda.get_device_name(device.index)}")
    print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
elif (device == 'mps'):
    print(f"Device name: <mps>")
else:
    print("NOTE: If you have a GPU, consider using it for training.")
    print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
    print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
device = torch.device(device)

Using device: cuda
Device name: Tesla T4
Device memory: 14.74810791015625 GB


In [106]:
len(val_dataloader)

5147

In [112]:
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device=device):

    model.eval()
    count = 0
    writer = SummaryWriter(config['experiment_name'])
    global_step=len(validation_ds)

    source_texts = []
    expected = []
    predicted = []

    console_width = 80  # Default width if console width cannot be retrieved

    with torch.no_grad():
        for batch in validation_ds:
            count += 1

            encoder_input = batch["encoder_input"].to(device)  # (b, seq)
            encoder_mask = batch["encoder_mask"].to(device)  # (b, 1, 1, seq)

            # Ensure batch size is 1 for validation using beam search
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = beam_search_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = model_out

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Print results if desired (modify formatting based on your preference)
            print('-' * console_width)
            print(f"{f'SOURCE: ':>12}{source_text}")
            print(f"{f'TARGET: ':>12}{target_text}")
            print(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == global_step:
                  print('-'*console_width)
                  break

    if writer:
        # Evaluate the character error rate
        # Compute the char error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)
        writer.flush()

        # Compute the BLEU metric
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)
        writer.flush()


In [119]:
# Run validation at the end of every epoch
run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq'], device)

--------------------------------------------------------------------------------
    SOURCE: »Sie ist Mr. Rochesters Mündel; er beauftragte mich, eine Gouvernante für sie zu suchen.
    TARGET: "She is Mr. Rochester's ward; he commissioned me to find a governess for her.
 PREDICTED: fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang fang bah fang fang fang fang fang fang fang fang fang bah bah bah bah bah bah bah bah bah bah bah bah bah bah bah bah bah bah recesses Saturday Saturday Saturday fang Saturday fang fang fang fang fang fang Saturday fang Saturday fang Saturday fang fang Saturday fang Saturday fang fang appeared unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable unfavourable breeding breeding breeding breeding spray spray spray spray stumble stumble bah breeding breeding stumble stumble stumble stumble stumble 

KeyboardInterrupt: 

In [98]:
def beam_search_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device, beam_size=3):
    """
    Perform beam search decoding for the Transformer model.

    Args:
    - model: The Transformer model
    - encoder_input: Source sequence
    - encoder_mask: Mask for the source sequence
    - tokenizer_src: Tokenizer for the source language
    - tokenizer_tgt: Tokenizer for the target language
    - max_len: Maximum length of the target sequence
    - device: The device to run the model on (e.g., 'cuda' or 'cpu')
    - beam_size: Size of the beam for beam search (default: 3)

    Returns:
    - best_hypothesis: The best decoded sequence
    """

    # Encode the source sequence
    encoder_output = model.encode(encoder_input, encoder_mask)

    # Get start and end token IDs
    start_symbol = tokenizer_tgt.token_to_id('[SOS]')
    end_symbol = tokenizer_tgt.token_to_id('[EOS]')

    # Initialize the beam
    beam = [(torch.tensor([start_symbol], device=device), 0)]

    for _ in range(max_len - 1):
        candidates = []

        for seq, score in beam:
            if seq[-1] == end_symbol:
                candidates.append((seq, score))
                continue

            # Prepare the target sequence
            tgt = seq.unsqueeze(0)
            tgt_mask = torch.triu(torch.ones(1, tgt.size(1), tgt.size(1)) == 0, diagonal=1).to(device)

            # Decode
            out = model.decode(encoder_output, encoder_mask, tgt, tgt_mask)
            proj = model.project(out[:, -1])
            prob = torch.nn.functional.log_softmax(proj, dim=-1)

            # Get top k candidates
            top_k_probs, top_k_ids = prob[0].topk(beam_size)

            for i in range(beam_size):
                new_seq = torch.cat([seq, top_k_ids[i].unsqueeze(0)])
                new_score = score + top_k_probs[i].item()
                candidates.append((new_seq, new_score))

        # Select top beam_size candidates
        beam = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]

        # Check if all beams have reached the end symbol
        if all(seq[-1] == end_symbol for seq, _ in beam):
            break

    # Return the best hypothesis
    best_hypothesis = beam[0][0]

    # Convert token IDs to words
    decoded_output = tokenizer_tgt.decode(best_hypothesis.tolist())

    return decoded_output



In [52]:
print(type(model))

<class 'model.Transformer'>


In [7]:
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Max length of source sentence: 479
Max length of target sentence: 466


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading