In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using mps device.


### Set up model configuration 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256,  # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 8,          # Number of attention heads
    "n_layers": 8,         # Number of layers
    "drop_rate": 0.2,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "device": device,
}

### Initialize the tokenizer

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

Characters: 1820039
Tokens: 415577


### Load training and validation data files

In [4]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the omplementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [5]:
train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    tokenizer=tokenizer,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    tokenizer=tokenizer,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
total_characters = len(train_data + val_data)
total_tokens = len(tokenizer.encode(train_data + val_data, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

In [6]:
import gc

def clean(): 
    """
    This is a function for GPU data claening before and after training
    """
    
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [7]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, lr=0.0002,
          generate_sample_text=False,
          sample_text="It is a truth universally acknowledged, that a single man in possession of a good fortune, must be",
          checkpoint_path="model_and_optimizer.pth"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    if device == "mps":
        clean()
        print(50 * "=")
        print("Starting training...")
    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        generate_sample_text=generate_sample_text,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen,
        tokenizer=tokenizer
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    if device == "mps":
        print(50 * "=")
        clean()
    if device == "cuda":
        print(50 * "=")
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
    
    return model

In [8]:
gc.collect()  # Force garbage collection

0

### Train the model on training data

In [9]:
# train model on all works

train(train_loader, val_loader, num_epochs=10,
      eval_iter=10, checkpoint_path="model_and_optimizer_best.pth");

Ep 1 (Step 000000): Train loss 9.016, Val loss 9.021
Ep 1 (Step 000010): Train loss 7.445, Val loss 7.414
Ep 1 (Step 000020): Train loss 6.828, Val loss 6.787
Ep 1 (Step 000030): Train loss 6.487, Val loss 6.478
Ep 1 (Step 000040): Train loss 6.374, Val loss 6.376
Ep 1 (Step 000050): Train loss 6.271, Val loss 6.323
Ep 1 (Step 000060): Train loss 6.242, Val loss 6.206
Ep 1 (Step 000070): Train loss 6.109, Val loss 6.111
Ep 1 (Step 000080): Train loss 6.035, Val loss 6.028
Ep 1 (Step 000090): Train loss 5.930, Val loss 5.948
Ep 1 (Step 000100): Train loss 5.861, Val loss 5.889
Ep 1 (Step 000110): Train loss 5.812, Val loss 5.833
Ep 1 (Step 000120): Train loss 5.739, Val loss 5.800
Ep 1 (Step 000130): Train loss 5.660, Val loss 5.743
Ep 1 (Step 000140): Train loss 5.696, Val loss 5.708
Ep 1 (Step 000150): Train loss 5.663, Val loss 5.672
Ep 1 (Step 000160): Train loss 5.495, Val loss 5.634
Ep 1 (Step 000170): Train loss 5.630, Val loss 5.605
Ep 1 (Step 000180): Train loss 5.538, Val loss

Ep 5 (Step 001550): Train loss 4.456, Val loss 4.919
Ep 5 (Step 001560): Train loss 4.445, Val loss 4.908
Ep 5 (Step 001570): Train loss 4.367, Val loss 4.906
Ep 5 (Step 001580): Train loss 4.366, Val loss 4.909
Ep 5 (Step 001590): Train loss 4.380, Val loss 4.910
Ep 5 (Step 001600): Train loss 4.356, Val loss 4.903
Ep 5 (Step 001610): Train loss 4.371, Val loss 4.902
Ep 5 (Step 001620): Train loss 4.306, Val loss 4.902
Ep 5 (Step 001630): Train loss 4.389, Val loss 4.903
Ep 5 (Step 001640): Train loss 4.357, Val loss 4.905
Ep 5 (Step 001650): Train loss 4.403, Val loss 4.895
Ep 5 (Step 001660): Train loss 4.406, Val loss 4.891
Ep 5 (Step 001670): Train loss 4.330, Val loss 4.893
Ep 5 (Step 001680): Train loss 4.322, Val loss 4.888
Ep 5 (Step 001690): Train loss 4.289, Val loss 4.887
Ep 5 (Step 001700): Train loss 4.349, Val loss 4.888
Ep 5 (Step 001710): Train loss 4.316, Val loss 4.887
Ep 5 (Step 001720): Train loss 4.380, Val loss 4.886
Ep 5 (Step 001730): Train loss 4.310, Val loss

Ep 9 (Step 003100): Train loss 4.029, Val loss 4.847
Ep 9 (Step 003110): Train loss 4.046, Val loss 4.846
Ep 9 (Step 003120): Train loss 4.093, Val loss 4.846
Ep 9 (Step 003130): Train loss 4.077, Val loss 4.846
Ep 9 (Step 003140): Train loss 4.074, Val loss 4.846
Ep 10 (Step 003150): Train loss 4.054, Val loss 4.846
Ep 10 (Step 003160): Train loss 4.029, Val loss 4.846
Ep 10 (Step 003170): Train loss 4.140, Val loss 4.846
Ep 10 (Step 003180): Train loss 4.067, Val loss 4.847
Ep 10 (Step 003190): Train loss 4.024, Val loss 4.847
Ep 10 (Step 003200): Train loss 4.107, Val loss 4.847
Ep 10 (Step 003210): Train loss 4.073, Val loss 4.847
Ep 10 (Step 003220): Train loss 4.061, Val loss 4.847
Ep 10 (Step 003230): Train loss 4.097, Val loss 4.847
Ep 10 (Step 003240): Train loss 4.026, Val loss 4.847
Ep 10 (Step 003250): Train loss 4.027, Val loss 4.847
Ep 10 (Step 003260): Train loss 4.107, Val loss 4.847
Ep 10 (Step 003270): Train loss 4.066, Val loss 4.847
Ep 10 (Step 003280): Train loss 4

In [10]:
# train model on all works

train(train_loader, val_loader, num_epochs=10,
      eval_iter=10, checkpoint_path="model_and_optimizer_best_old_tok.pth");

Ep 1 (Step 000000): Train loss 10.484, Val loss 10.461
Ep 1 (Step 000010): Train loss 8.235, Val loss 8.203
Ep 1 (Step 000020): Train loss 7.011, Val loss 6.983
Ep 1 (Step 000030): Train loss 6.659, Val loss 6.617
Ep 1 (Step 000040): Train loss 6.569, Val loss 6.562
Ep 1 (Step 000050): Train loss 6.469, Val loss 6.395
Ep 1 (Step 000060): Train loss 6.120, Val loss 6.250
Ep 1 (Step 000070): Train loss 5.919, Val loss 6.099
Ep 1 (Step 000080): Train loss 5.901, Val loss 6.016
Ep 1 (Step 000090): Train loss 5.774, Val loss 5.884
Ep 1 (Step 000100): Train loss 5.647, Val loss 5.786
Ep 1 (Step 000110): Train loss 5.595, Val loss 5.718
Ep 1 (Step 000120): Train loss 5.488, Val loss 5.658
Ep 1 (Step 000130): Train loss 5.502, Val loss 5.618
Ep 1 (Step 000140): Train loss 5.392, Val loss 5.549
Ep 1 (Step 000150): Train loss 5.318, Val loss 5.516
Ep 1 (Step 000160): Train loss 5.285, Val loss 5.481
Ep 1 (Step 000170): Train loss 5.227, Val loss 5.443
Ep 1 (Step 000180): Train loss 5.306, Val lo

Ep 5 (Step 001550): Train loss 3.853, Val loss 4.701
Ep 5 (Step 001560): Train loss 3.830, Val loss 4.711
Ep 5 (Step 001570): Train loss 3.903, Val loss 4.702
Ep 5 (Step 001580): Train loss 3.810, Val loss 4.697
Ep 5 (Step 001590): Train loss 3.739, Val loss 4.700
Ep 5 (Step 001600): Train loss 3.831, Val loss 4.685
Ep 5 (Step 001610): Train loss 3.745, Val loss 4.690
Ep 5 (Step 001620): Train loss 3.849, Val loss 4.693
Ep 5 (Step 001630): Train loss 3.725, Val loss 4.681
Ep 5 (Step 001640): Train loss 3.857, Val loss 4.701
Ep 5 (Step 001650): Train loss 3.746, Val loss 4.691
Ep 5 (Step 001660): Train loss 3.781, Val loss 4.686
Ep 5 (Step 001670): Train loss 3.831, Val loss 4.691
Ep 5 (Step 001680): Train loss 3.774, Val loss 4.696
Ep 5 (Step 001690): Train loss 3.775, Val loss 4.697
Ep 5 (Step 001700): Train loss 3.673, Val loss 4.691
Ep 5 (Step 001710): Train loss 3.683, Val loss 4.689
Ep 5 (Step 001720): Train loss 3.720, Val loss 4.691
Ep 5 (Step 001730): Train loss 3.750, Val loss

KeyboardInterrupt: 

### Load trained model

In [11]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0002, weight_decay=0.05)

checkpoint = torch.load("model_and_optimizer_best_old_tok.pth", weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

In [12]:
from torch.utils.data import DataLoader
from itertools import combinations
import numpy as np

In [13]:
def compute_perplexity(model, dataloader, device='cpu'):
    model.eval()
    total_loss = 0
    total_tokens = 0

    criterion = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids, target_ids = batch
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

            total_loss += loss.item() * target_ids.numel()
            total_tokens += target_ids.numel()

    perplexity = np.exp(total_loss / total_tokens)
    return perplexity

In [14]:
compute_perplexity(model, val_loader)

127.75698927127588

In [15]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="Miss Bennet has inherited the estate from her aunt, so she must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model,  tokenizer=tokenizer,
    prompt="Mr. Darcy has inherited the estate from his aunt, so he must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

Miss Bennet has inherited the estate from her aunt, so she must have been rather.
"And I think you have no longer to be sure. My poor mother will be so very well known to be sure of Edward? And will not be able to tell you."
"Yes. You have given me in
Mr. Darcy has inherited the estate from his aunt, so he must have no less well as he, though in his behaviour, and the country. Mr. Bennet was obliged to be a way by his behaviour to the room, as he was a very often wished to have happened; and Mr. Bennet,


In [16]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="A wife is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="A husband is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

A wife is not to her.
“And you are very well.”
“I have not know, “I am not you,
A husband is not to be in the of this, I am sure I shall not mean to be sure I am sure to say to the very little to be so


In [17]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="A good lady ought to be",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, tokenizer=tokenizer,
    prompt="A highly respectable man ought to be",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

A good lady ought to be done. He saw him in his manner of his own judgment. He was not to him, he had a man, he had he had not been
A highly respectable man ought to be quite as she had seen him to take it. The house had passed, the greatest part of the house was not been the whole party of his own


In [19]:
if device == "mps":
    clean()

In [21]:
import torch

# Load the checkpoint
checkpoint = torch.load('model_and_optimizer_all_txt_updated.pth', map_location='cpu')

# Extract optimizer state dict
optimizer_state = checkpoint['optimizer_state_dict']

# Optional: print all optimizer keys to explore
print(optimizer_state.keys())

# Extract settings (if AdamW or Adam)
for param_group in optimizer_state['param_groups']:
    print("Learning Rate (lr):", param_group['lr'])
    print("Weight Decay:", param_group['weight_decay'])
    print("Betas:", optimizer_state['state'][list(optimizer_state['state'].keys())[0]]['exp_avg'])  # Optional state content
    print("Eps (may not be saved):", 'Check model code, not always stored')
    print(param_group)


dict_keys(['state', 'param_groups'])
Learning Rate (lr): 0.0002
Weight Decay: 0.01
Betas: 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

