In [1]:
import torch
import tiktoken
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using mps device.


In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "device": device,
}

In [4]:
from gpt_model import GPTModel
from generate_text_simple import generate_text_simple

In [5]:
# from generate_text import generate

# torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M).to(device)
# model.eval()  # disable dropout

# start_context = "Hello, I am"

# out = generate(
#     model=model,
#     prompt=start_context,
#     max_new_tokens=10,
#     context_size=GPT_CONFIG_124M["context_length"],
#     device=GPT_CONFIG_124M["device"],
#     temperature=1
# )
# print("Output text:", out)

In [6]:
file_path = 'pride_and_prejeduce.txt'

with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

In [7]:
from data_loader_v1 import create_dataloader_v1

# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=8,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=8,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [8]:
import gc

def clean(): 
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

In [9]:
train_losses, val_losses, track_tokens_seen = [], [], []

In [10]:
from pre_train import train_model_simple
import time

def train(tokenizer, train_loader, val_loader,
          num_epochs=10, eval_iter=5, 
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    clean()
    print(50 * "=")
    print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, tokenizer=tokenizer,
        checkpoint_path=checkpoint_path, cfg=GPT_CONFIG_124M,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    print(50 * "=")
    clean()
    
    return model

In [11]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 711567
Tokens: 176635


In [12]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [13]:
# train model on pride & prejiduce

train(tokenizer, train_loader,
      val_loader, num_epochs=5,
      eval_iter=10, sample_text="The horses are",
      checkpoint_path="model_and_optimizer_2.pth")

MPS Available: True
Allocated Memory: 0.0 MB
Starting training...
Ep 1 (Step 000000): Train loss 9.605, Val loss 9.692
Ep 1 (Step 000010): Train loss 6.790, Val loss 7.189
Ep 1 (Step 000020): Train loss 6.266, Val loss 6.983
Ep 1 (Step 000030): Train loss 6.087, Val loss 6.750
Ep 1 (Step 000040): Train loss 5.938, Val loss 6.629
Ep 1 (Step 000050): Train loss 5.662, Val loss 6.456
Ep 1 (Step 000060): Train loss 5.592, Val loss 6.308
Ep 1 (Step 000070): Train loss 5.384, Val loss 6.248
The horses are!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Ep 2 (Step 000080): Train loss 5.196, Val loss 6.186
Ep 2 (Step 000090): Train loss 5.047, Val loss 6.146
Ep 2 (Step 000100): Train loss 4.982, Val loss 6.103
Ep 2 (Step 000110): Train loss 4.927, Val loss 6.033
Ep 2 (Step 000120): Train loss 4.793, Val loss 5.976
Ep 2 (Step 000130): Train loss 4.817, Val loss 5.994
Ep 2 (Step 000140): Train loss 4.731, Val loss 5.982
Ep 2 (Step 000150): Train loss 4.670, Val loss 5.945
The horses are!!!!!!!

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): Tran