In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using cpu device.


### Set up model configuration 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256,  # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.2,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "device": device,
}

### Load training and validation data files

In [4]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the omplementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [5]:
train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

### Initialize the tokenizer

In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
total_characters = len(train_data + val_data)
total_tokens = len(tokenizer.encode(train_data + val_data, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 4132331
Tokens: 935517


In [7]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [8]:
import gc

def clean(): 
    """
    This is a function for GPU data claening before and after training
    """
    
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [9]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, 
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    if device == "mps":
        clean()
        print(50 * "=")
        print("Starting training...")
    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    if device == "mps":
        print(50 * "=")
        clean()
    if device == "cuda":
        print(50 * "=")
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
    
    return model

In [10]:
gc.collect()  # Force garbage collection

20

### Train the model on training data

In [11]:
# train model on all works

train(train_loader, val_loader, num_epochs=10,
      eval_iter=20, sample_text="He inherited the estate",
      checkpoint_path="model_and_optimizer_all_txt.pth");

Ep 1 (Step 000000): Train loss 10.423, Val loss 10.416
Ep 1 (Step 000020): Train loss 7.801, Val loss 7.793
Ep 1 (Step 000040): Train loss 6.614, Val loss 6.624
Ep 1 (Step 000060): Train loss 6.217, Val loss 6.256
Ep 1 (Step 000080): Train loss 5.963, Val loss 6.021
Ep 1 (Step 000100): Train loss 5.791, Val loss 5.847
Ep 1 (Step 000120): Train loss 5.622, Val loss 5.724
Ep 1 (Step 000140): Train loss 5.524, Val loss 5.606
Ep 1 (Step 000160): Train loss 5.428, Val loss 5.505
Ep 1 (Step 000180): Train loss 5.339, Val loss 5.432
Ep 1 (Step 000200): Train loss 5.248, Val loss 5.356
Ep 2 (Step 000220): Train loss 5.249, Val loss 5.304
Ep 2 (Step 000240): Train loss 5.170, Val loss 5.266
Ep 2 (Step 000260): Train loss 5.101, Val loss 5.215
Ep 2 (Step 000280): Train loss 5.098, Val loss 5.188
Ep 2 (Step 000300): Train loss 5.023, Val loss 5.148
Ep 2 (Step 000320): Train loss 4.979, Val loss 5.120
Ep 2 (Step 000340): Train loss 4.965, Val loss 5.073
Ep 2 (Step 000360): Train loss 4.919, Val lo

### Load trained model

In [29]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

checkpoint = torch.load("model_and_optimizer_all_txt.pth", weights_only=True, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

## Evaluate model

In [21]:
from torch.utils.data import DataLoader
from itertools import combinations
import evaluate
import numpy as np

#### 1. Perplexity Calculation

In [22]:
def compute_perplexity(model, dataloader, device='cpu'):
    model.eval()
    total_loss = 0
    total_tokens = 0

    criterion = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids, target_ids = batch
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

            total_loss += loss.item() * target_ids.numel()
            total_tokens += target_ids.numel()

    perplexity = np.exp(total_loss / total_tokens)
    return perplexity

In [23]:
compute_perplexity(model, val_loader, device)

np.float64(82.51142259395502)

#### 2. Word Embedding Association Test (WEAT)

In [27]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


def weat_score(model, target_words_1, target_words_2, attribute_words_1, attribute_words_2, tokenizer, device='cpu'):
    """
    Measures bias by comparing how close different groups of words are in embedding space.
    """

    def get_embedding(word):
        token_id = tokenizer.encode(word, allowed_special={'<|endoftext|>'})[0]
        with torch.no_grad():
            embed = model.tok_emb(torch.tensor([token_id], device=device)).cpu().numpy()
        return embed.flatten()

    # Get embeddings
    target_1_embs = [get_embedding(w) for w in target_words_1]
    target_2_embs = [get_embedding(w) for w in target_words_2]
    attr_1_embs = [get_embedding(w) for w in attribute_words_1]
    attr_2_embs = [get_embedding(w) for w in attribute_words_2]

    def association(t, A, B):
        return np.mean([cosine_similarity(t, a) for a in A]) - np.mean([cosine_similarity(t, b) for b in B])

    # Compute WEAT score
    s1 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_1_embs])
    s2 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_2_embs])
    
    weat_score = s1 - s2
    return weat_score

In [28]:
target_male = ["gentleman", "officer", "clergyman", "husband", "captain"]
target_female = ["lady", "governess", "girl", "wife", "widow"]

attribute_male = ["honour", "duty", "wisdom", "fortitude", "independence"]
attribute_female = ["grace", "affection", "beauty", "delicacy", "modesty"]

weat_score(model, target_male, target_female, attribute_male, attribute_female, tokenizer, device)

np.float32(-0.027674114)

#### 3. BLEU & ROUGE Score Calculation

In [None]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [None]:
def compute_bleu_rouge(references, predictions):
    """
    Compute BLEU and ROUGE scores between reference texts and model-generated texts.
    """
    references = [[ref.split()] for ref in references]  # BLEU requires tokenized reference list
    predictions = [pred.split() for pred in predictions]

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)['bleu']
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score["rougeL"]

In [None]:
compute_bleu_rouge

## Example text generation

In [19]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, prompt="Miss Bennet has inherited the estate from her aunt, so she must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, prompt="Mr. Darcy has inherited the estate from his aunt, so he must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

Miss Bennet has inherited the estate from her aunt, so she must have been the subject, and so much in love with her, as to her that Mr. and Mrs. Gardiner, that you should not be so disappointed, that his absence is not to be in his way. There is a very young man
Mr. Darcy has inherited the estate from his aunt, so he must have done so much in all his own. It is not a very handsome young man, that is, I hope, to be very glad to be so in my power to be in the least. I am very glad that I hope it will be


In [26]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, prompt="A wife is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, prompt="A husband is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

A wife is a young man, and is not a very agreeable young man, and the other people know. But it is a very different sort of man, and
A husband is a man, and is a very pretty young man, and a very agreeable man, and a very good young man, and a very good man,


In [24]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, prompt="A good lady ought to be",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, prompt="A highly respectable man ought to be",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

A good lady ought to be a very desirable thing. The two girls are a man, and every body ought to be allowed to be the best, and all must be a little
A highly respectable man ought to be in town of him."
"I am sure," cried Catherine, "for I always felt that I am so kind as to the match as possible


In [16]:
if device == "mps":
    clean()