In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using mps device.


In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "device": device,
}

In [4]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

In [5]:
from data_loader_v1 import create_dataloader_v1

train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
total_characters = len(train_data + val_data)
total_tokens = len(tokenizer.encode(train_data + val_data, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 1827883
Tokens: 454177


In [7]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [8]:
import gc

def clean(): 
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

In [9]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, 
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    clean()
    print(50 * "=")
    print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    print(50 * "=")
    clean()
    
    return model

In [10]:
# train model on 3 books

train(train_loader, val_loader, num_epochs=5,
      eval_iter=10, sample_text="The horses are",
      checkpoint_path="model_and_optimizer_3.pth")

MPS Available: True
Allocated Memory: 0.0 MB
Starting training...
Ep 1 (Step 000000): Train loss 9.750, Val loss 9.686
Ep 1 (Step 000010): Train loss 7.148, Val loss 7.087
Ep 1 (Step 000020): Train loss 6.806, Val loss 6.703
Ep 1 (Step 000030): Train loss 6.656, Val loss 6.656
Ep 1 (Step 000040): Train loss 6.488, Val loss 6.460
Ep 1 (Step 000050): Train loss 6.325, Val loss 6.317
Ep 1 (Step 000060): Train loss 6.113, Val loss 6.246
Ep 1 (Step 000070): Train loss 6.167, Val loss 6.150
Ep 1 (Step 000080): Train loss 6.018, Val loss 6.019
Ep 1 (Step 000090): Train loss 5.994, Val loss 5.959
Ep 1 (Step 000100): Train loss 5.674, Val loss 5.844
Ep 1 (Step 000110): Train loss 5.712, Val loss 5.795
Ep 1 (Step 000120): Train loss 5.618, Val loss 5.753
Ep 1 (Step 000130): Train loss 5.658, Val loss 5.683
Ep 1 (Step 000140): Train loss 5.602, Val loss 5.634
Ep 1 (Step 000150): Train loss 5.476, Val loss 5.586
Ep 1 (Step 000160): Train loss 5.392, Val loss 5.536
Ep 1 (Step 000170): Train loss 5.

Ep 4 (Step 001490): Train loss 3.838, Val loss 4.673
Ep 4 (Step 001500): Train loss 3.691, Val loss 4.661
Ep 4 (Step 001510): Train loss 3.724, Val loss 4.645
Ep 4 (Step 001520): Train loss 3.802, Val loss 4.655
Ep 4 (Step 001530): Train loss 3.744, Val loss 4.638
Ep 4 (Step 001540): Train loss 3.758, Val loss 4.631
Ep 4 (Step 001550): Train loss 3.680, Val loss 4.640
Ep 4 (Step 001560): Train loss 3.672, Val loss 4.632
Ep 4 (Step 001570): Train loss 3.640, Val loss 4.632
Ep 4 (Step 001580): Train loss 3.712, Val loss 4.624
Ep 4 (Step 001590): Train loss 3.734, Val loss 4.616
The horses are!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Ep 5 (Step 001600): Train loss 3.639, Val loss 4.618
Ep 5 (Step 001610): Train loss 3.589, Val loss 4.636
Ep 5 (Step 001620): Train loss 3.552, Val loss 4.648
Ep 5 (Step 001630): Train loss 3.619, Val loss 4.665
Ep 5 (Step 001640): Train loss 3.597, Val loss 4.683
Ep 5 (Step 001650): Train loss 3.606, Val loss 4.689
Ep 5 (Step 001660): Train loss 3.3

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): Tran

In [113]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

checkpoint = torch.load("model_and_optimizer_3.pth", weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

In [122]:
import importlib
import generate_text

importlib.reload(generate_text)
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, prompt="Hello, I am",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=1,
    top_k=40,
    eos_id=13
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

Hello, I am sure, and even the first hope for it may not seem to him and my sweet
to have long have found enough to
been the smallest intention of
his heart, because he may remember all this; a person who was too decided by Miss


In [93]:
import torch
probs = torch.load("probs.pt")
print("Minimum probability:", probs.min())
print("Maximum probability:", probs.max())
print("Probabilities mean:", probs.mean())
print(torch.argmax(probs, dim=-1, keepdim=True))

Minimum probability: tensor(1.9645e-05)
Maximum probability: tensor(0.0032)
Probabilities mean: tensor(1.9898e-05)
tensor([[11]])


In [117]:
tokenizer.encode(".")

[13]

In [75]:
import torch

tensor = torch.tensor([[1e-30, 1e-25, -1e-20, -1e-35, 1e-28, 1e-22, 0.0]])
print(tensor)
print(torch.argmax(tensor, dim=-1, keepdim=True))  # Check across the correct dimension


tensor([[ 1.0000e-30,  1.0000e-25, -1.0000e-20, -1.0000e-35,  1.0000e-28,
          1.0000e-22,  0.0000e+00]])
tensor([[5]])


In [104]:
# train model on 3 books

train(train_loader, val_loader, num_epochs=5,
      eval_iter=20, sample_text="The horses are",
      checkpoint_path="model_and_optimizer_4.pth");

MPS Available: True
Allocated Memory: 2487.265869140625 MB
Starting training...
Ep 1 (Step 000000): Train loss 3.178, Val loss 4.780
Ep 1 (Step 000020): Train loss 3.169, Val loss 4.823
Ep 1 (Step 000040): Train loss 3.033, Val loss 4.850
Ep 1 (Step 000060): Train loss 3.132, Val loss 4.869
Ep 1 (Step 000080): Train loss 3.013, Val loss 4.868
Ep 1 (Step 000100): Train loss 3.054, Val loss 4.889
Ep 1 (Step 000120): Train loss 2.984, Val loss 4.911
Ep 1 (Step 000140): Train loss 3.043, Val loss 4.901
Ep 1 (Step 000160): Train loss 2.906, Val loss 4.893
Ep 1 (Step 000180): Train loss 3.023, Val loss 4.905
Ep 1 (Step 000200): Train loss 2.933, Val loss 4.883
Ep 1 (Step 000220): Train loss 2.796, Val loss 4.917
Ep 1 (Step 000240): Train loss 2.721, Val loss 4.897
Ep 1 (Step 000260): Train loss 2.771, Val loss 4.891
Ep 1 (Step 000280): Train loss 2.727, Val loss 4.892
Ep 1 (Step 000300): Train loss 2.758, Val loss 4.894
Ep 1 (Step 000320): Train loss 2.776, Val loss 4.894
Ep 1 (Step 000340):

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0, 

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
    

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='mps:0')
The horses are!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Ep 3 (Step 000800): Train loss 1.948, Val loss 5.093
Ep 3 (Step 000820

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]],
       device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='mps:0')
tensor([[  464, 14260,   389,     0,

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='mps:0')
The horses are!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Ep 5 (Step 001600): Train loss 0.815, Val loss 5.773
Ep 5 (Step 001620): Train loss 0.733, Val loss 5.853
Ep 5 (Step 001640): Train loss 0.769, Val loss 5.910
Ep 5 (Step 001660): Train loss 0.726, Val loss 5.936
Ep 5 (Step 001680): Train loss 0.672, Val loss 5.993
Ep 5 (Step 001700): Train loss 0.713, Val loss 6.002
Ep 5 (Step 001720): Train loss 0.710, Val loss 6.035
Ep 5 (Step 001740): Train loss 0.689, Val loss 6.060
Ep 5 (Step 001760): Train loss 0.592, Val loss 6.056
Ep 5 (Step 001780): Tr

tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='mps:0')
tensor([[  464, 14260,   389,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0