In [None]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [None]:
# Pick the best available backend, preferring CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _backend_name = "cuda"
elif torch.backends.mps.is_available():
    _backend_name = "mps"
else:
    _backend_name = "cpu"

device = torch.device(_backend_name)

print(f"Using {device} device.")

Using mps device.


### Set up model configuration 

In [None]:
# Hyper-parameters handed to GPTModel and reused by the data loaders and
# text generation further down (via "context_length").
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=256,   # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.2,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
    device=device,        # Backend selected in the device-detection cell
)

### Initialize the tokenizer

In [None]:
# Load tiktoken's "gpt2" encoding; this single tokenizer instance is shared by
# the data loaders, the evaluation helpers and text generation below.
tokenizer = tiktoken.get_encoding("gpt2")

Characters: 1820039
Tokens: 415577


### Load training and validation data files

In [None]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

# Read both splits fully into memory as UTF-8 text.
with open(train_file_path, "r", encoding="utf-8") as train_fh:
    train_data = train_fh.read()

with open(val_file_path, "r", encoding="utf-8") as val_fh:
    val_data = val_fh.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This follows the implementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_.

In [None]:
train_ratio = 0.90

# Settings common to both loaders: non-overlapping windows of one full
# context length each (stride == max_length), batches of 4, no worker processes.
_context_length = GPT_CONFIG_124M["context_length"]
_shared_loader_kwargs = dict(
    tokenizer=tokenizer,
    batch_size=4,
    max_length=_context_length,
    stride=_context_length,
    num_workers=0,
)

# Training: shuffled, incomplete final batch dropped.
train_loader = create_dataloader_v1(
    train_data, drop_last=True, shuffle=True, **_shared_loader_kwargs
)

# Validation: fixed order, every batch kept.
val_loader = create_dataloader_v1(
    val_data, drop_last=False, shuffle=False, **_shared_loader_kwargs
)

In [None]:
# Size of the whole corpus (training + validation text), in characters and in
# tokens. The two splits are concatenated once and reused for both counts.
combined_text = train_data + val_data

total_characters = len(combined_text)
total_tokens = len(
    tokenizer.encode(combined_text, allowed_special={'<|endoftext|>'})
)

print("Characters:", total_characters)
print("Tokens:", total_tokens)

In [7]:
# Sanity check
#
# The loaders are built from two separate files, so each split is checked
# against its own token count rather than against a hypothetical ratio split
# of the combined corpus.
# NOTE(review): this keeps the original `<` threshold; the loader may need one
# extra token for the shifted target sequence — confirm in data_loader_v1.py.
min_tokens = GPT_CONFIG_124M["context_length"]

split_token_counts = {
    "training": len(tokenizer.encode(train_data, allowed_special={'<|endoftext|>'})),
    "validation": len(tokenizer.encode(val_data, allowed_special={'<|endoftext|>'})),
}

for split_name, n_tokens in split_token_counts.items():
    if n_tokens < min_tokens:
        print(f"Not enough tokens for the {split_name} loader "
              f"({n_tokens} < {min_tokens}). "
              "Try to lower the `GPT_CONFIG_124M['context_length']` or "
              "provide more text for this split.")

In [8]:
import gc

def clean():
    """
    Release cached MPS (Apple GPU) memory before and after training.

    Steps:
      1. Set ``PYTORCH_MPS_HIGH_WATERMARK_RATIO`` to ``"0.0"``.
      2. Rebind every notebook-level tensor that lives on an MPS device to a
         CPU copy, so the GPU copy becomes collectable.
      3. Run the garbage collector and ask PyTorch to empty its MPS cache.
      4. Print MPS availability and the currently allocated MPS memory in MB.

    When MPS is not available, only the availability line is printed and the
    function returns without touching ``torch.mps``.
    """
    # NOTE(review): PyTorch documents 0.0 as disabling the upper allocation
    # limit; the variable may only be honoured if set before the MPS allocator
    # is initialised — confirm that setting it here has any effect.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

    mps_available = torch.backends.mps.is_available()
    if not mps_available:
        print("MPS Available:", mps_available)
        return

    # Compare on `.device.type`: a tensor on MPS reports `mps:0`, which is not
    # equal to `torch.device("mps")` (index None), so an equality test never
    # matches. `.to()` returns a new tensor, so the name must be rebound.
    namespace = globals()
    mps_tensor_names = [
        name for name, value in namespace.items()
        if isinstance(value, torch.Tensor) and value.device.type == "mps"
    ]
    for name in mps_tensor_names:
        namespace[name] = namespace[name].to("cpu")

    gc.collect()             # Drop unreachable Python objects first...
    torch.mps.empty_cache()  # ...then release the cached MPS blocks they held.

    print("MPS Available:", mps_available)
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [9]:
from pre_train import train_model_simple
import time

# Loss / token-count histories handed to `train_model_simple` on each run.
# They are module-level and are not cleared between calls to `train`.
train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5,
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):
    """
    Build a fresh GPTModel from ``GPT_CONFIG_124M`` and train it.

    Args:
        train_loader: Data loader for the training split.
        val_loader: Data loader for the validation split.
        num_epochs: Forwarded to ``train_model_simple``.
        eval_iter: Forwarded to ``train_model_simple``.
        sample_text: Forwarded to ``train_model_simple`` as ``start_context``.
        checkpoint_path: Forwarded to ``train_model_simple``.

    Returns:
        The GPTModel instance that was trained.

    Side effects:
        Seeds torch's RNG with 123, passes the module-level history lists to
        ``train_model_simple``, prints the elapsed time, and calls ``clean()``
        before and after training when running on MPS.
    """
    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    # `device` is a torch.device, and `torch.device(...) == "mps"` is always
    # False, so the backend is tested via `.type`. Wrapping in torch.device()
    # also accepts a plain string.
    on_mps = torch.device(device).type == "mps"

    if on_mps:
        clean()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen,
        tokenizer=tokenizer
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    if on_mps:
        print(50 * "=")
        clean()

    return model

### Train the model on training data

In [10]:
# train model on 3 books

# Settings for this run; the trailing `;` suppresses the returned model's repr.
run_settings = dict(
    num_epochs=7,
    eval_iter=25,
    sample_text="The horses are",
    checkpoint_path="model_and_optimizer_5.pth",
)

train(train_loader, val_loader, **run_settings);

MPS Available: True
Allocated Memory: 0.0 MB
Starting training...
Ep 1 (Step 000000): Train loss 10.521, Val loss 10.502
Ep 1 (Step 000025): Train loss 7.593, Val loss 7.567
Ep 1 (Step 000050): Train loss 6.531, Val loss 6.613
Ep 1 (Step 000075): Train loss 6.171, Val loss 6.316
Ep 1 (Step 000100): Train loss 5.955, Val loss 6.079
Ep 1 (Step 000125): Train loss 5.765, Val loss 5.947
Ep 1 (Step 000150): Train loss 5.593, Val loss 5.798
Ep 1 (Step 000175): Train loss 5.504, Val loss 5.699
Ep 1 (Step 000200): Train loss 5.366, Val loss 5.609
Ep 1 (Step 000225): Train loss 5.361, Val loss 5.547
Ep 1 (Step 000250): Train loss 5.231, Val loss 5.508
Ep 1 (Step 000275): Train loss 5.203, Val loss 5.447
Ep 1 (Step 000300): Train loss 5.135, Val loss 5.394
Ep 1 (Step 000325): Train loss 5.125, Val loss 5.357
Ep 1 (Step 000350): Train loss 5.033, Val loss 5.329
The horses are a way on such a family
Ep 2 (Step 000375): Train loss 5.000, Val loss 5.297
Ep 2 (Step 000400): Train loss 4.942, Val loss

### Load trained model

In [10]:
# Read the saved checkpoint, mapping every stored tensor onto the CPU.
checkpoint = torch.load(
    "model_and_optimizer_5.pth",
    weights_only=True,
    map_location=torch.device('cpu'),
)

# Rebuild the architecture on the CPU and restore the trained weights.
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
model.load_state_dict(checkpoint["model_state_dict"])

# Recreate the optimizer and restore its saved state.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

model.eval();

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader
from itertools import combinations
import evaluate

# Load the BLEU and ROUGE metric objects used by `compute_bleu_rouge`.
bleu_metric, rouge_metric = (
    evaluate.load(metric_name) for metric_name in ("bleu", "rouge")
)


### 1. Perplexity Calculation ###
def compute_perplexity(model, dataloader, device):
    """
    Compute the perplexity of ``model`` over every batch in ``dataloader``.

    Perplexity is ``exp`` of the token-weighted mean cross-entropy loss.

    Args:
        model: Callable module; ``model(input_ids)`` must return logits whose
            last dimension indexes the classes (vocabulary).
        dataloader: Iterable yielding ``(input_ids, target_ids)`` pairs.
        device: Device the batches are moved to before the forward pass. The
            model itself is NOT moved, so it must already live on ``device``.

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: If the dataloader yields no target tokens (the original
            code failed with ZeroDivisionError in that case).
    """
    model.eval()

    # Default reduction is the mean over all tokens in a batch, so each batch
    # loss is re-weighted by its token count below.
    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_ids, target_ids in dataloader:
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            # reshape (not view) so non-contiguous tensors are handled too.
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                target_ids.reshape(-1),
            )

            n_tokens = target_ids.numel()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: the dataloader produced no target tokens."
        )

    return np.exp(total_loss / total_tokens)


### 2. Word Embedding Association Test (WEAT) ###
def cosine_similarity(vec1, vec2):
    """Return the dot product of two vectors divided by the product of their norms."""
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / norm_product


def weat_score(model, target_words_1, target_words_2, attribute_words_1, attribute_words_2, tokenizer, device):
    """
    Measures bias by comparing how close different groups of words are in embedding space.

    For each target word, its association is the mean cosine similarity to the
    first attribute set minus the mean cosine similarity to the second. The
    score is the summed association of ``target_words_1`` minus that of
    ``target_words_2``.

    Args:
        model: Object exposing a ``tok_emb`` token-embedding layer.
        target_words_1, target_words_2: The two groups of target words.
        attribute_words_1, attribute_words_2: The two groups of attribute words.
        tokenizer: Object with ``encode(text, allowed_special=...)``.
        device: Device on which token-id tensors are created; the model is not
            moved, so ``model.tok_emb`` must already live there.

    Returns:
        The WEAT test statistic as a NumPy float.

    Raises:
        ValueError: If a word encodes to zero tokens.
    """

    def get_embedding(word):
        token_ids = tokenizer.encode(word, allowed_special={'<|endoftext|>'})
        if not token_ids:
            raise ValueError(f"Word {word!r} produced no tokens.")
        with torch.no_grad():
            sub_token_embs = model.tok_emb(torch.tensor(token_ids, device=device))
        # A word may split into several BPE sub-tokens. Averaging them uses the
        # whole word, where taking only the first sub-token would embed just a
        # prefix of it.
        return sub_token_embs.mean(dim=0).cpu().numpy().flatten()

    # Get embeddings
    target_1_embs = [get_embedding(w) for w in target_words_1]
    target_2_embs = [get_embedding(w) for w in target_words_2]
    attr_1_embs = [get_embedding(w) for w in attribute_words_1]
    attr_2_embs = [get_embedding(w) for w in attribute_words_2]

    def association(t, A, B):
        return np.mean([cosine_similarity(t, a) for a in A]) - np.mean([cosine_similarity(t, b) for b in B])

    # Compute WEAT score
    s1 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_1_embs])
    s2 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_2_embs])

    # Difference in associations. Named `score` so it does not shadow this function.
    score = s1 - s2
    return score


### 3. BLEU & ROUGE Score Calculation ###
def compute_bleu_rouge(references, predictions):
    """
    Compute BLEU and ROUGE scores between reference texts and model-generated texts.

    Args:
        references: Sequence of reference strings, one per prediction.
        predictions: Sequence of generated strings.

    Returns:
        Tuple ``(bleu, rouge_l)``: the ``'bleu'`` entry of the BLEU result and
        the ``'rougeL'`` entry of the ROUGE result.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    references = list(references)
    predictions = list(predictions)

    if len(references) != len(predictions):
        raise ValueError(
            f"Got {len(references)} references but {len(predictions)} predictions."
        )

    # Both metrics take raw strings; each prediction gets a list of acceptable
    # references (here exactly one). The references are wrapped once only:
    # wrapping them a second time for ROUGE would hand it lists where it
    # expects strings.
    wrapped_references = [[ref] for ref in references]

    bleu_score = bleu_metric.compute(predictions=predictions, references=wrapped_references)['bleu']
    rouge_score = rouge_metric.compute(predictions=predictions, references=wrapped_references)

    return bleu_score, rouge_score["rougeL"]

In [None]:
# The reloaded model lives on the CPU, while the notebook-level `device` may be
# an accelerator. Evaluate on the model's own device so inputs and weights match.
compute_perplexity(model, val_loader, next(model.parameters()).device)

In [None]:
target_male = ["gentleman", "husband", "clergyman", "brother", "captain"]
target_female = ["lady", "governess", "sister", "wife", "widow"]

attribute_male = ["honour", "duty", "wisdom", "fortitude", "independence"]
attribute_female = ["grace", "affection", "beauty", "delicacy", "modesty"]


# The reloaded model lives on the CPU, while the notebook-level `device` may be
# an accelerator. Token ids must be created on the model's own device.
weat_score(model, target_male, target_female, attribute_male, attribute_female,
           tokenizer, next(model.parameters()).device)

In [None]:
# Example test prompts: (prompt with a "___" placeholder, expected completion)
test_sentences = [
    ("A single man in possession of a good fortune, must be ___", "in want of a wife"),
    ("The servants, I suppose, forgot to tell you that Mr. Palmer was ___", "not in the house."),
    ("He rose up, and ___", "walked across the room."),
    ("I should be undeserving of the confidence ___", "you have honoured me with"),
    ("A few months had seen the beginning and ___", "the end of their acquaintance")
]

# Generation samples with temperature / top-k; seed so the reported scores are
# repeatable across runs.
# NOTE(review): assumes `generate` draws from torch's global RNG — confirm.
torch.manual_seed(123)

references = []
predictions = []

for sentence, expected in test_sentences:
    prompt = sentence.replace(" ___", "")

    generated_text = generate(
        model=model, tokenizer=tokenizer,
        prompt=prompt,
        max_new_tokens=5, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.5,
        top_k=40,
    )

    # Reference is the full sentence with the placeholder filled in.
    # NOTE(review): this presumes `generate` returns prompt + continuation, so
    # both sides contain the prompt — confirm against generate_text.py.
    references.append(sentence.replace("___", expected))
    predictions.append(generated_text)

print("references:", references)
print("predictions:", predictions)
print(50*"=")

# Compute BLEU and ROUGE scores
bleu, rouge = compute_bleu_rouge(references, predictions)
print(f"BLEU Score: {bleu:.4f}, ROUGE-L Score: {rouge:.4f}")

### Example text generation

In [15]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Sample up to 50 new tokens after the prompt.
# NOTE(review): eos_id=13 is presumably the GPT-2 token id for "." so that
# generation stops at the first full stop — verify against the tokenizer.
generation_settings = dict(
    max_new_tokens=50,
    context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=1,
    top_k=40,
    eos_id=13,
)
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="Hello, I am",
    **generation_settings,
)

# Printing the string directly renders its embedded newlines line by line.
print(text)

Hello, I am glad to have the greatest character, when we had to leave them."
But Sir John and said to her the room, and she spoke at first she would scarcely any body


In [18]:
# `device` is a torch.device, and `torch.device(...) == "mps"` is always False,
# so the backend is tested via `.type`. Wrapping in torch.device() also accepts
# a plain string.
if torch.device(device).type == "mps":
    clean()

MPS Available: True
Allocated Memory: 2113.8642578125 MB
