In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
# Choose the compute backend in priority order: CUDA first, then Apple MPS,
# and fall back to the CPU when neither accelerator is available.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device} device.")

Using mps device.


### Set up model configuration 

In [3]:
# Hyperparameters handed to GPTModel and to the data loaders below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=256,    # Context length
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of layers
    drop_rate=0.2,         # Dropout rate
    qkv_bias=False,        # Query-Key-Value bias
    device=device,         # torch.device selected in the detection cell
)

### Load training and validation data files

In [4]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

# Read each corpus fully into memory as UTF-8 text (training first, then validation).
with open(train_file_path, encoding="utf-8") as train_handle:
    train_data = train_handle.read()

with open(val_file_path, encoding="utf-8") as val_handle:
    val_data = val_handle.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the implementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [5]:
train_ratio = 0.90

# Both loaders use non-overlapping windows (stride == context length) and the
# same batch size; only the training loader shuffles and drops the last
# incomplete batch. The training loader is built first, then the validation one.
train_loader, val_loader = (
    create_dataloader_v1(
        text,
        batch_size=4,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=is_training,
        shuffle=is_training,
        num_workers=0,
    )
    for text, is_training in ((train_data, True), (val_data, False))
)

### Initialize the tokenizer

In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

# Character count of both corpora combined.
total_characters = len(train_data) + len(val_data)

# Token count of the concatenated corpora; '<|endoftext|>' is allowed as a
# special token instead of raising on it.
total_tokens = len(
    tokenizer.encode(
        train_data + val_data,
        allowed_special={'<|endoftext|>'},
    )
)

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 1820039
Tokens: 415577


In [7]:
# Sanity check: each split needs at least one full context window of tokens.
#
# NOTE(review): the split sizes here are *estimated* from `train_ratio` applied
# to the combined token count, while the loaders are fed from separate
# train/val files — confirm those files were produced with this ratio.

if total_tokens * train_ratio < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `train_ratio`")

if total_tokens * (1 - train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `train_ratio`")

In [8]:
import gc

def clean():
    """
    Release MPS (Apple GPU) memory before and after training.

    Steps, in order:
      1. Set ``PYTORCH_MPS_HIGH_WATERMARK_RATIO`` to ``"0.0"`` in the environment.
      2. If MPS is unavailable, report that and return without touching
         ``torch.mps`` (whose calls are only meaningful with an MPS backend).
      3. Rebind every module-level tensor that lives on an MPS device to a
         CPU copy, so the MPS storage is no longer referenced from globals.
      4. Run garbage collection, empty the MPS allocator cache and print the
         memory that is still allocated.

    Returns:
        None
    """
    # NOTE(review): presumably this variable is read when the MPS allocator is
    # initialised, so setting it after torch has already been used may have no
    # effect — confirm against the PyTorch MPS documentation.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

    if not torch.backends.mps.is_available():
        print("MPS Available:", False)
        return

    namespace = globals()

    # Compare on `.device.type`: an MPS tensor reports device "mps:0", which
    # does not compare equal to torch.device("mps") (index 0 vs. no index).
    mps_tensor_names = [
        name
        for name, value in list(namespace.items())
        if isinstance(value, torch.Tensor) and value.device.type == "mps"
    ]

    # `Tensor.to` returns a new tensor instead of moving in place, so the
    # global name has to be rebound for the MPS copy to become unreferenced.
    for name in mps_tensor_names:
        namespace[name] = namespace[name].to("cpu")

    gc.collect()             # Drop unreachable objects first ...
    torch.mps.empty_cache()  # ... then hand the freed blocks back to the system.

    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [9]:
from pre_train import train_model_simple
import time

# Module-level histories; they are handed to `train_model_simple` on every
# call to `train` and are never reset here, so repeated runs share the same lists.
train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5,
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):
    """
    Build a fresh GPTModel from GPT_CONFIG_124M and train it with
    `train_model_simple`.

    Args:
        train_loader: Loader passed to `train_model_simple` as the training data.
        val_loader: Loader passed to `train_model_simple` as the validation data.
        num_epochs: Forwarded to `train_model_simple`.
        eval_iter: Forwarded to `train_model_simple`.
        sample_text: Forwarded to `train_model_simple` as `start_context`.
        checkpoint_path: Forwarded to `train_model_simple`
            (presumably where the checkpoint is written — confirm in pre_train.py).

    Returns:
        The GPTModel instance that was trained.

    Side effects:
        Seeds the torch RNG with 123, prints the elapsed wall-clock time, and
        passes the module-level `train_losses`, `val_losses` and
        `track_tokens_seen` lists to `train_model_simple`. On an MPS device,
        `clean()` is called before and after training.
    """
    # A torch.device never compares equal to a plain string, so `device == "mps"`
    # is always False; compare the device type instead. Wrapping in torch.device
    # also accepts a string-valued `device`.
    on_mps = torch.device(device).type == "mps"

    if on_mps:
        clean()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.05)

    # The history lists are passed by reference; no `global` statement is
    # needed because the names are never rebound inside this function.
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    if on_mps:
        print(50 * "=")
        clean()

    return model

### Train the model on training data

In [10]:
# Train on the three-book corpus: 7 epochs, evaluating every 25 steps.
# The trailing semicolon keeps the returned model's repr out of the cell output.
train(
    train_loader,
    val_loader,
    num_epochs=7,
    eval_iter=25,
    sample_text="The horses are",
    checkpoint_path="model_and_optimizer_5.pth",
);

MPS Available: True
Allocated Memory: 0.0 MB
Starting training...
Ep 1 (Step 000000): Train loss 10.521, Val loss 10.502
Ep 1 (Step 000025): Train loss 7.593, Val loss 7.567
Ep 1 (Step 000050): Train loss 6.531, Val loss 6.613
Ep 1 (Step 000075): Train loss 6.171, Val loss 6.316
Ep 1 (Step 000100): Train loss 5.955, Val loss 6.079
Ep 1 (Step 000125): Train loss 5.765, Val loss 5.947
Ep 1 (Step 000150): Train loss 5.593, Val loss 5.798
Ep 1 (Step 000175): Train loss 5.504, Val loss 5.699
Ep 1 (Step 000200): Train loss 5.366, Val loss 5.609
Ep 1 (Step 000225): Train loss 5.361, Val loss 5.547
Ep 1 (Step 000250): Train loss 5.231, Val loss 5.508
Ep 1 (Step 000275): Train loss 5.203, Val loss 5.447
Ep 1 (Step 000300): Train loss 5.135, Val loss 5.394
Ep 1 (Step 000325): Train loss 5.125, Val loss 5.357
Ep 1 (Step 000350): Train loss 5.033, Val loss 5.329
The horses are a way on such a family
Ep 2 (Step 000375): Train loss 5.000, Val loss 5.297
Ep 2 (Step 000400): Train loss 4.942, Val loss

### Load trained model

In [11]:
# Rebuild the model on the CPU and restore the weights written during training.
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")

# NOTE(review): lr / weight_decay here differ from the values used in train();
# presumably load_state_dict below restores the saved param-group settings — confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# map_location="cpu" remaps tensors that were saved from an accelerator
# (training ran on `device`) so the checkpoint loads on a machine without
# that backend and matches the CPU placement of `model`.
checkpoint = torch.load(
    "model_and_optimizer_5.pth",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

In [40]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Sample one continuation per prompt; a rule of '=' separates the two samples.
for position, prompt in enumerate((
    "Miss Bennet has inherited the estate from her aunt, so she must",
    "Mr. Darcy has inherited the estate from his aunt, so he must",
)):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.7,
        top_k=50,
    )

    # Emit the generated text one line at a time.
    print(*text.split("\n"), sep="\n")

Miss Bennet has inherited the estate from her aunt, so she must suffer by her father to her own family.
"I am afraid it. Yes, dear aunt, I shall find you, and make it. You will not be very different great a very ill-in-morrow; and you know my word
Mr. Darcy has inherited the estate from his aunt, so he must be a very good deal more anxious to her; for she could not know, and it could not be his return, and she might not help feeling.
"I am sure I am not know, or the next morning to his conduct of him


In [41]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Same two prompts as the previous cell, sampled again; a rule of '='
# separates the two samples.
inheritance_prompts = (
    "Miss Bennet has inherited the estate from her aunt, so she must",
    "Mr. Darcy has inherited the estate from his aunt, so he must",
)

for position, prompt in enumerate(inheritance_prompts):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.7,
        top_k=50,
    )

    # Emit the generated text one line at a time.
    print(*text.split("\n"), sep="\n")

Miss Bennet has inherited the estate from her aunt, so she must have been very early to a few hours, and she knew how happy.
The evening was going to the former friends, and in the ladies of this time, she had the entrance of the house. In a few months had been an hour,
Mr. Darcy has inherited the estate from his aunt, so he must be very well; and the very often to be not blame on his coming to the first time; but he could not be his attentions to him, he might be in the other, if he had not been more than he was so many months


In [45]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Sample one continuation per prompt with a tighter top-k of 30; a rule of
# '=' separates the two samples.
for position, prompt in enumerate((
    "A good lady ought to be",
    "A highly respectable man ought to be",
)):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.7,
        top_k=30,
    )

    # Emit the generated text one line at a time.
    print(*text.split("\n"), sep="\n")

A good lady ought to be the rest of the evening, and their feelings which she was to the most natural.
"I do, my dear," said she, "you should be done for my acquaintance."
"I am not seen him."
Miss Bennet's
A highly respectable man ought to be able to a little of his wife's affection for the rest of his present. He had been spent in the matter; and had been spared only one of his father, and the same time, he had been given to the most valuable voice, had


In [13]:
# Release MPS memory once the notebook is finished with the model.
# `device` is a torch.device, which never compares equal to a plain string,
# so the device type is checked instead of `device == "mps"`.
if torch.device(device).type == "mps":
    clean()