In [1]:
import itertools
import torch
from copy import deepcopy
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from pre_train import train_model_simple

In [2]:
# Pick the fastest available backend: CUDA first, then Apple MPS, else CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device} device.")

Using cuda device.


In [3]:
# Candidate values for every GPT config key. grid_search_gpt tries the
# Cartesian product of these lists; combinations where emb_dim is not
# divisible by n_heads are skipped there.
config_grid = dict(
    vocab_size=[10000],
    context_length=[256, 512],
    emb_dim=[512, 768, 1024],
    n_heads=[4, 6, 8, 12, 16],
    n_layers=[6, 8, 12, 16],
    drop_rate=[0.3, 0.2],
    qkv_bias=[False, True],
    device=[device],
)

# Candidate values for the AdamW keyword arguments used by grid_search_gpt.
optimizer_grid = dict(
    lr=[0.0002, 0.0004],
    betas=[(0.9, 0.95), (0.9, 0.98)],
    eps=[1e-8],
    weight_decay=[0.01, 0.05, 0.001],
)


In [4]:
def _read_text(path):
    """Return the full contents of the UTF-8 text file at `path`."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

# Raw training / validation corpora, each held in memory as one string.
train_data = _read_text(train_file_path)
val_data = _read_text(val_file_path)

In [5]:
# NOTE(review): not referenced by any other cell visible in this notebook;
# training and validation text are loaded from separate files. Confirm
# whether this split ratio is still needed.
train_ratio = 0.90

In [6]:
import gc

def train_func(model, optimizer, train_loader, val_loader, cfg):
    """Train `model` for one epoch and return its last recorded validation loss.

    Args:
        model: Model passed through to `train_model_simple`.
        optimizer: Optimizer passed through to `train_model_simple`.
        train_loader: DataLoader with the training batches.
        val_loader: DataLoader with the validation batches.
        cfg: Model config dict, forwarded to `train_model_simple` as `cfg`.
            Its "device" entry (falling back to the notebook-level `device`)
            decides whether the CUDA cache is cleared before training.

    Returns:
        The last value appended to `val_losses` during training, or
        `float('inf')` if no validation loss was recorded.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []

    # A torch.device does not compare equal to a plain string, so the device
    # *type* has to be compared. torch.device() accepts both strings and
    # torch.device instances, which normalises either form of cfg["device"].
    run_device = torch.device(cfg.get("device", device))
    if run_device.type == "cuda":
        # Release cached blocks left over from a previous trial.
        torch.cuda.empty_cache()
        print(50 * "=")
        print("Starting training...")

    # The three lists are filled in place by train_model_simple.
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=1, eval_iter=10,
        cfg=cfg,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen,
        start_context="",
        generate_sample_text=False
    )

    # Fall back to +inf so a run without any evaluation can never be "best".
    final_val_loss = val_losses[-1] if val_losses else float('inf')

    gc.collect()

    return final_val_loss

In [7]:
def grid_search_gpt(config_grid, optimizer_grid, train_func):
    """
    Perform an exhaustive grid search over GPT config and optimizer parameters.

    The training and validation text are taken from the notebook-level
    `train_data` and `val_data` variables. Config combinations whose
    `emb_dim` is not divisible by `n_heads` are skipped.

    Args:
        config_grid: Dictionary mapping each GPT config key to a list of
            candidate values. Must contain "emb_dim", "n_heads",
            "context_length" and "device".
        optimizer_grid: Dictionary mapping "lr", "betas", "eps" and
            "weight_decay" to lists of candidate values for AdamW.
        train_func: Callable invoked as
            `train_func(model, optimizer, train_loader, val_loader, cfg)`
            that returns the validation loss for that trial.

    Returns:
        Tuple `(best_config, best_optimizer_params, best_val_loss)`. If no
        trial produced a loss below +inf (for example, every combination was
        skipped), the result is `(None, None, float('inf'))`.
    """

    config_keys = list(config_grid.keys())
    optimizer_keys = list(optimizer_grid.keys())

    # Optimizer combinations are reused for every model config.
    optimizer_combos = list(itertools.product(*optimizer_grid.values()))

    best_val_loss = float('inf')
    best_config = None
    best_optimizer_params = None

    # The loaders depend only on the context length, so build each pair once
    # instead of re-creating them for every single trial.
    loaders_by_context = {}

    for config_values in itertools.product(*config_grid.values()):
        gpt_cfg = deepcopy(dict(zip(config_keys, config_values)))

        # Depends only on the model config, so check before the inner loop.
        if gpt_cfg["emb_dim"] % gpt_cfg["n_heads"] != 0:
            continue

        context_length = gpt_cfg["context_length"]
        if context_length not in loaders_by_context:
            loaders_by_context[context_length] = (
                create_dataloader_v1(
                    train_data,
                    batch_size=4,
                    max_length=context_length,
                    stride=context_length,
                    drop_last=True,
                    shuffle=True,
                    num_workers=0
                ),
                create_dataloader_v1(
                    val_data,
                    batch_size=4,
                    max_length=context_length,
                    stride=context_length,
                    drop_last=False,
                    shuffle=False,
                    num_workers=0
                ),
            )
        train_loader, val_loader = loaders_by_context[context_length]

        for opt_values in optimizer_combos:
            opt_cfg = dict(zip(optimizer_keys, opt_values))

            # Fresh model and optimizer for every trial.
            model = GPTModel(gpt_cfg).to(gpt_cfg['device'])
            optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=opt_cfg['lr'],
                betas=opt_cfg['betas'],
                eps=opt_cfg['eps'],
                weight_decay=opt_cfg['weight_decay']
            )

            try:
                val_loss = train_func(model, optimizer, train_loader, val_loader, gpt_cfg)
            finally:
                # Drop this trial's model/optimizer before the next one is
                # built; otherwise the old model stays alive while the new
                # one is constructed.
                del model, optimizer
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            print(50*"=")
            print(f"Config: {gpt_cfg}, Optimizer: {opt_cfg}, Val Loss: {val_loss:.4f}")
            print(50*"=")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_config = deepcopy(gpt_cfg)
                best_optimizer_params = deepcopy(opt_cfg)

    return best_config, best_optimizer_params, best_val_loss

In [None]:
# Bind the result to names so the outcome of this long-running search stays
# available to later cells instead of living only in the cell output.
best_config, best_optimizer_params, best_val_loss = grid_search_gpt(
    config_grid, optimizer_grid, train_func
)
best_config, best_optimizer_params, best_val_loss

Ep 1 (Step 000000): Train loss 9.169, Val loss 9.168
Ep 1 (Step 000010): Train loss 7.584, Val loss 7.536
Ep 1 (Step 000020): Train loss 6.960, Val loss 6.854
Ep 1 (Step 000030): Train loss 6.540, Val loss 6.515
Ep 1 (Step 000040): Train loss 6.423, Val loss 6.410
Ep 1 (Step 000050): Train loss 6.453, Val loss 6.383
Ep 1 (Step 000060): Train loss 6.324, Val loss 6.346
Ep 1 (Step 000070): Train loss 6.261, Val loss 6.257
Ep 1 (Step 000080): Train loss 6.213, Val loss 6.190
Ep 1 (Step 000090): Train loss 6.090, Val loss 6.092
Ep 1 (Step 000100): Train loss 6.044, Val loss 6.014
Ep 1 (Step 000110): Train loss 5.891, Val loss 5.972
Ep 1 (Step 000120): Train loss 5.903, Val loss 5.929
Ep 1 (Step 000130): Train loss 5.739, Val loss 5.876
Ep 1 (Step 000140): Train loss 5.793, Val loss 5.832
Ep 1 (Step 000150): Train loss 5.729, Val loss 5.783
Ep 1 (Step 000160): Train loss 5.743, Val loss 5.758
Ep 1 (Step 000170): Train loss 5.622, Val loss 5.708
Ep 1 (Step 000180): Train loss 5.611, Val loss