In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using mps device.


### Set up model configuration 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 384,  # Context length
    "emb_dim": 640,         # Embedding dimension
    "n_heads": 8,          # Number of attention heads
    "n_layers": 10,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": True,      # Query-Key-Value bias
    "device": device,
}

### Initialize the tokenizer

#### GPT-2 tokenizer

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
def encode(full_text):
    return tokenizer.encode(full_text, allowed_special={'<|endoftext|>'})

### Load training and validation data files

In [6]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the omplementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [7]:
train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    encode=encode,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    encode=encode,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [8]:
full_text = train_data + val_data

word_count = len(full_text.split())
char_count = len(full_text)

tokens = tokenizer.encode(full_text, allowed_special={'<|endoftext|>'})

token_count = len(tokens)
unique_token_count = len(set(tokens))

print("Words:", word_count)
print("Characters:", char_count)
print("Tokens:", token_count)
print("Unique Tokens Used:", unique_token_count)

Words: 5789730
Characters: 32372094
Tokens: 7608098
Unique Tokens Used: 28960


In [9]:
import gc

def clean(): 
    """
    This is a function for GPU data claening before and after training
    """
    
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [10]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, lr=0.0002,
          generate_sample_text=False,
          sample_text="It is a truth universally acknowledged, that a single man in possession of a good fortune, must be",
          model_prefix="model_and_optimizer"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    if device == "mps":
        clean()
        print(50 * "=")
        print("Starting training...")
    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        generate_sample_text=generate_sample_text,
        model_prefix=model_prefix,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen,
        tokenizer=tokenizer
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    if device == "mps":
        print(50 * "=")
        clean()
    if device == "cuda":
        print(50 * "=")
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
    
    return model

In [11]:
gc.collect()  # Force garbage collection

0

### Train the model on training data

In [12]:
# train model on all works

train(train_loader, val_loader, num_epochs=6,
      eval_iter=150, model_prefix="model_640_10_8_0_1");

Ep 1 (Step 000000): Train loss 10.449, Val loss 10.457
Ep 1 (Step 000150): Train loss 6.141, Val loss 6.075
Ep 1 (Step 000300): Train loss 5.762, Val loss 5.716
Ep 1 (Step 000450): Train loss 5.631, Val loss 5.572
Ep 1 (Step 000600): Train loss 5.530, Val loss 5.458
Ep 1 (Step 000750): Train loss 5.393, Val loss 5.399
Ep 1 (Step 000900): Train loss 5.349, Val loss 5.337
Ep 1 (Step 001050): Train loss 5.309, Val loss 5.282
Ep 1 (Step 001200): Train loss 5.212, Val loss 5.193
Ep 1 (Step 001350): Train loss 5.194, Val loss 5.169
Ep 1 (Step 001500): Train loss 5.145, Val loss 5.067
Ep 1 (Step 001650): Train loss 5.045, Val loss 5.060
Ep 1 (Step 001800): Train loss 5.056, Val loss 4.991
Ep 1 (Step 001950): Train loss 4.967, Val loss 4.930
Ep 1 (Step 002100): Train loss 4.948, Val loss 4.903
Ep 1 (Step 002250): Train loss 4.875, Val loss 4.900
Ep 1 (Step 002400): Train loss 4.897, Val loss 4.862
Ep 1 (Step 002550): Train loss 4.870, Val loss 4.860
Ep 1 (Step 002700): Train loss 4.812, Val lo

Ep 6 (Step 023250): Train loss 3.627, Val loss 4.034
Ep 6 (Step 023400): Train loss 3.616, Val loss 4.032
Ep 6 (Step 023550): Train loss 3.626, Val loss 4.031
Ep 6 (Step 023700): Train loss 3.645, Val loss 4.034
Ep 6 (Step 023850): Train loss 3.635, Val loss 4.036
Ep 6 (Step 024000): Train loss 3.629, Val loss 4.032
Ep 6 (Step 024150): Train loss 3.625, Val loss 4.031
Ep 6 (Step 024300): Train loss 3.601, Val loss 4.032
Ep 6 (Step 024450): Train loss 3.619, Val loss 4.031
Ep 6 (Step 024600): Train loss 3.620, Val loss 4.031
Ep 6 (Step 024750): Train loss 3.614, Val loss 4.033
Ep 6 (Step 024900): Train loss 3.649, Val loss 4.032
Ep 6 (Step 025050): Train loss 3.629, Val loss 4.032
Ep 6 (Step 025200): Train loss 3.619, Val loss 4.031
Ep 6 (Step 025350): Train loss 3.599, Val loss 4.031
Ep 6 (Step 025500): Train loss 3.652, Val loss 4.032
Ep 6 (Step 025650): Train loss 3.641, Val loss 4.032
Ep 6 (Step 025800): Train loss 3.603, Val loss 4.031
Ep 6 (Step 025950): Train loss 3.624, Val loss

### Load trained model

In [26]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0002, weight_decay=0.05)

checkpoint = torch.load("model_768_12_12_512.pth", weights_only=True, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

In [None]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"

In [27]:
from torch.utils.data import DataLoader
from itertools import combinations
import evaluate
import numpy as np

2025-08-07 23:19:26.056087: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


RuntimeError: This version of jaxlib was built using AVX instructions, which your CPU and/or operating system do not support. This error is frequently encountered on macOS when running an x86 Python installation on ARM hardware. In this case, try installing an ARM build of Python. Otherwise, you may be able work around this issue by building jaxlib from source.

In [28]:
def compute_perplexity(model, dataloader, device='cpu'):
    model.eval()
    total_loss = 0
    total_tokens = 0

    criterion = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids, target_ids = batch
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

            total_loss += loss.item() * target_ids.numel()
            total_tokens += target_ids.numel()

    perplexity = np.exp(total_loss / total_tokens)
    return perplexity

In [29]:
compute_perplexity(model, val_loader)

NameError: name 'np' is not defined

In [None]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


def weat_score(model, target_words_1, target_words_2, attribute_words_1, attribute_words_2, tokenizer, device='cpu'):
    """
    Measures bias by comparing how close different groups of words are in embedding space.
    """

    def get_embedding(word):
        token_id = tokenizer.encode(word, allowed_special={'<|endoftext|>'})[0]
        with torch.no_grad():
            embed = model.tok_emb(torch.tensor([token_id], device=device)).cpu().numpy()
        return embed.flatten()

    # Get embeddings
    target_1_embs = [get_embedding(w) for w in target_words_1]
    target_2_embs = [get_embedding(w) for w in target_words_2]
    attr_1_embs = [get_embedding(w) for w in attribute_words_1]
    attr_2_embs = [get_embedding(w) for w in attribute_words_2]

    def association(t, A, B):
        return np.mean([cosine_similarity(t, a) for a in A]) - np.mean([cosine_similarity(t, b) for b in B])

    # Compute WEAT score
    s1 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_1_embs])
    s2 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_2_embs])
    
    weat_score = s1 - s2
    return weat_score

In [None]:
target_male = ["gentleman", "officer", "clergyman", "husband", "captain"]
target_female = ["lady", "governess", "girl", "wife", "widow"]

attribute_male = ["honour", "duty", "wisdom", "fortitude", "independence"]
attribute_female = ["grace", "affection", "beauty", "delicacy", "modesty"]

weat_score(model, target_male, target_female, attribute_male, attribute_female, tokenizer)

In [None]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [None]:
import torch
import evaluate
import re

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_bleu_rouge_from_val(model, device="cpu"):
    references = []
    predictions = []

    # Step 1: Load the validation set
    with open('val_text_data_all_txt.txt', 'r', encoding='utf-8') as f:
        data = f.read()

    # Step 2: Split into sentences & filter
    sentences = re.split(r'(?<=[.!?])\s+', data)
    filtered_sentences = [s.strip() for s in sentences if 5 <= len(s.split()) <= 60]
    filtered_sentences = filtered_sentences[:1000]

    # Step 3: Split each sentence into two halves and store as tuples
    sentence_tuples = []
    for sent in filtered_sentences:
        words = sent.split()
        mid = len(words) // 2
        first_half = ' '.join(words[:mid])
        second_half = ' '.join(words[mid:])
        sentence_tuples.append((first_half, second_half))

    # Step 4: For each (first_half, second_half), generate prediction
    for first_half, second_half in sentence_tuples:
        generated_text = generate(
            model=model, prompt=first_half,
            max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
            device=device,
            temperature=0.7,
            top_k=50
        )

        # Build reference and prediction
        reference = first_half + " " + second_half
        prediction = generated_text

        references.append(reference)
        predictions.append(prediction)

    # Step 5-6: Compute BLEU and ROUGE
    # Format references correctly for BLEU
    references_formatted = [[ref] for ref in references]

    bleu_score = bleu_metric.compute(predictions=predictions, references=references_formatted)['bleu']
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    print(f"BLEU Score: {bleu_score:.4f}, ROUGE-L Score: {rouge_score['rougeL']:.4f}")

In [None]:
compute_bleu_rouge_from_val(model)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="Miss Bennet has inherited the estate from her aunt, so she must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model,
    prompt="Mr. Darcy has inherited the estate from his aunt, so he must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="A wife is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, 
    prompt="A husband is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, 
    prompt="I shall now go",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, 
    prompt="He said",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, 
    prompt="She was",
    max_new_tokens=200, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

In [None]:
if device == "mps":
    clean()

In [66]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="a duty to",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.4,
    top_k=50
)

text

'a duty to be the very day, and I am sure I am sure I am sure I should have been a very much obliged to be very happy. I am'

In [67]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="a duty to",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.4,
    top_k=50
)

text

'a duty to go and Mrs. Weston.\n"I am very glad to think of your own family."\n"I will not like you. I am afraid'

In [92]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="she is wild to get married",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=1,
    top_k=50
)

text

'she is wild to get married from me to do the best of them. I think it is quite forgot to write to be a great deal. You must own. There is nothing'