In [None]:
# Clone the NanoGPT
!git clone https://github.com/karpathy/nanoGPT.git
!pip install tiktoken
!pip install wandb

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm.auto import tqdm
from contextlib import nullcontext
import nanoGPT.model as GPT
import wandb
import os

In [None]:
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# With no explicit key, wandb.login() uses the WANDB_API_KEY environment variable
# or previously stored credentials, and otherwise prompts for the key.
# NOTE: an API key was previously hard-coded in this cell; treat it as leaked and
# revoke it in the W&B account settings.
import wandb
wandb.login()

In [None]:
from dataclasses import dataclass


@dataclass
class GPTConfig:
    """Model hyperparameters handed to the nanoGPT model constructor.

    Every field has a default, so ``GPTConfig()`` yields the configuration used
    by this notebook; individual fields can be overridden by keyword, e.g.
    ``GPTConfig(n_layer=6)``.
    """
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.2
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

# Use an instance rather than the class object itself; attribute access
# (config.n_layer, config.dropout, ...) works exactly as before.
config = GPTConfig()

In [None]:
!pip install datasets

from datasets import load_dataset

ds = load_dataset("roneneldan/TinyStories")

In [None]:
import tiktoken
import os

# GPT-2 tokenizer loaded through tiktoken; used by process() to encode the
# dataset text and by the generation cell to encode the prompt / decode samples.
enc = tiktoken.get_encoding("gpt2")

# Some functions from https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py

def process(example):
    """Tokenize one dataset record.

    Args:
        example: mapping with a 'text' entry holding the raw string.

    Returns:
        dict with 'ids' (the token ids of the text) and 'len' (how many there are).
    """
    # encode_ordinary ignores any special tokens
    token_ids = enc.encode_ordinary(example['text'])
    return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize the dataset and serialise each split to '<split>.bin'.
# Every split is checked individually (not just train.bin), so a missing
# validation.bin is regenerated here instead of failing later in get_batch().
missing_splits = [split for split in ds.keys() if not os.path.exists(f'{split}.bin')]

if missing_splits:
    tokenized = ds.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
        num_proc=8,
    )
    # Named token_dtype (not dtype) so this cell cannot clobber the autocast
    # `dtype` string defined in the training-config cell.
    token_dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    total_batches = 1024

    # concatenate all the ids in each dataset into one large file we can use for training
    for split in missing_splits:
        dset = tokenized[split]
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        filename = f'{split}.bin'
        arr = np.memmap(filename, dtype=token_dtype, mode='w+', shape=(arr_len,))

        idx = 0  # write cursor into the memmap
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batch together samples for faster write
            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()

In [None]:
# Some functions from https://github.com/karpathy/nanoGPT/blob/master/train.py with slight modifications
def get_batch(split):
    """Sample a random batch of input/target token windows from a tokenized split.

    Reads the module-level `block_size`, `batch_size`, `device` and `device_type`.

    Args:
        split: 'train' reads train.bin; any other value reads validation.bin.

    Returns:
        (x, y): int64 tensors of shape (batch_size, block_size) on `device`.
        Each row of y is the matching row of x shifted one token ahead in the file.
    """
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    filename = 'train.bin' if split == 'train' else 'validation.bin'
    data = np.memmap(filename, dtype=np.uint16, mode='r')

    # Draw the offsets on the CPU and turn them into plain ints. The notebook calls
    # torch.set_default_device(device), so without device='cpu' the offsets would be
    # allocated on the GPU and each one copied back just to slice the memmap.
    ix = torch.randint(len(data) - block_size, (batch_size,), device='cpu').tolist()
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y


def estimate_loss(model):
    """Average the model's loss over `eval_iters` random batches per split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Reads the module-level `eval_iters` and `ctx`.

    Args:
        model: callable as ``model(X, Y)`` returning ``(logits, loss)``.

    Returns:
        dict mapping 'train' and 'val' to a scalar tensor holding the mean loss.
    """
    model.eval()
    results = {}
    with torch.inference_mode():
        for split_name in ('train', 'val'):
            split_losses = torch.zeros(eval_iters)
            for step in range(eval_iters):
                inputs, targets = get_batch(split_name)
                with ctx:
                    _, batch_loss = model(inputs, targets)
                split_losses[step] = batch_loss.item()
            results[split_name] = split_losses.mean()
    model.train()
    return results

In [None]:
# Training configuration: every tunable used by the cells below.

# --- Learning-rate settings ---
learning_rate = 5e-4   # lr passed to AdamW
min_lr = 1e-5          # eta_min of the cosine schedule
warmup_steps = 200     # length of the linear warmup

# --- Loop sizes ---
max_iters = 5000       # iterations of the training loop
eval_iters = 100       # batches per split in estimate_loss(); also the evaluation interval
gradient_accumulation_steps = 24  # loop iterations between optimizer steps

# --- Batch geometry ---
batch_size = 6         # sequences per batch
block_size = 1024      # tokens per sequence

# --- Device selection ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

# --- Mixed precision ---
# How to use autocast https://wandb.ai/wandb_fc/tips/reports/How-To-Use-Autocast-in-PyTorch--VmlldzoyMTk4NTky
# 'float32', 'bfloat16', or 'float16'; the GradScaler built later is enabled only for 'float16'.
dtype = (
    'bfloat16'
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else 'float16'
)
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# Autocast only on GPU; on CPU the context is a no-op.
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# New tensors and modules are created on `device` from here on.
torch.set_default_device(device)
torch.manual_seed(42)

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

# Instantiate the nanoGPT model from the notebook's config.
nanoGPT = GPT.GPT(config)

# AdamW over all model parameters, with light weight decay.
optimizer = torch.optim.AdamW(
    nanoGPT.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.95),
    weight_decay=0.01,
)

# Linear warmup for `warmup_steps`, then cosine annealing down to `min_lr`
# over the remaining iterations.
# NOTE: the training cell later rebinds `scheduler` to a ReduceLROnPlateau instance.
scheduler_warmup = LinearLR(optimizer, total_iters=warmup_steps)
scheduler_decay = CosineAnnealingLR(optimizer, T_max=max_iters - warmup_steps, eta_min=min_lr)
scheduler = SequentialLR(
    optimizer,
    schedulers=[scheduler_warmup, scheduler_decay],
    milestones=[warmup_steps],
)

# Loss scaling is only switched on for float16 autocast.
# https://stackoverflow.com/questions/72534859/is-gradscaler-necessary-with-mixed-precision-training-with-pytorch
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Start the W&B run and record the main hyperparameters with it.
wandb.init(
    project="LLM-Training-Assignment",  # Choose a meaningful project name
    config=dict(
        learning_rate=learning_rate,
        batch_size=batch_size,
        max_iters=max_iters,
        model_layers=config.n_layer,
        model_heads=config.n_head,
        model_embedding_dim=config.n_embd,
    ),
)

# Checkpoint location and loss history (one entry per evaluation).
best_model_params_path = "best_model_params.pt"
train_loss_list = []
validation_loss_list = []

# Early-stopping state.
best_val_loss = float('inf')
early_stop_count = 0
patience = 8  # Number of evaluation intervals to wait before stopping

# Reduce the LR by 10x when the validation loss has not improved for 2 evaluations.
# NOTE(review): this rebinds `scheduler`, so the warmup + cosine SequentialLR built
# in the optimizer cell is never stepped. Constructing that LinearLR presumably
# already scaled the optimizer's LR down to its start factor, in which case training
# never reaches `learning_rate` — confirm which schedule is intended.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

# Main training loop. `epoch` counts single-batch iterations, not passes over the data.
# The W&B run is always closed, even if training raises or is interrupted.
try:
    for epoch in tqdm(range(max_iters)):
        # Evaluation and logging every `eval_iters` iterations (iteration 0 is skipped).
        if epoch % eval_iters == 0 and epoch != 0:
            losses = estimate_loss(nanoGPT)
            train_loss = losses['train']
            val_loss = losses['val']

            print(f"Epoch {epoch}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
            print(f"The current learning rate: {optimizer.param_groups[0]['lr']:.6f}")

            train_loss_list.append(train_loss)
            validation_loss_list.append(val_loss)

            wandb.log({
                "epoch": epoch,
                "train/loss": train_loss,
                "val/loss": val_loss,
                "lr": optimizer.param_groups[0]['lr']
            })

            # Save the best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stop_count = 0  # Reset patience counter
                torch.save(nanoGPT.state_dict(), best_model_params_path)
                print(f"New best model saved with val loss: {val_loss:.4f}")
            else:
                early_stop_count += 1
                print(f"No improvement. Early stop count: {early_stop_count}/{patience}")

            # Early stopping condition
            if early_stop_count >= patience:
                print("Early stopping triggered. Training terminated.")
                break

            # Update learning rate based on validation loss
            scheduler.step(val_loss)

        # Training step: only the forward pass and loss run under autocast.
        X, y = get_batch("train")
        with ctx:  # Mixed precision context
            _, loss = nanoGPT(X, y)
            # Scale so the gradients summed over the accumulation window form an average.
            loss = loss / gradient_accumulation_steps
        # Backward runs outside the autocast context, as PyTorch recommends.
        scaler.scale(loss).backward()

        # Update weights at the end of each accumulation window (and on the last iteration).
        if (epoch + 1) % gradient_accumulation_steps == 0 or (epoch + 1 == max_iters):
            # Unscale first so the clip threshold applies to the true gradient norm,
            # not to gradients still multiplied by the loss scale (no-op when the
            # scaler is disabled).
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(nanoGPT.parameters(), 1.0)

            # Optimizer step (skipped by the scaler if inf/NaN gradients were found)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

finally:
    wandb.finish()


In [None]:
import matplotlib.pyplot as plt
# Move the recorded loss tensors to the CPU so matplotlib can plot them.
train_loss_list_converted = [loss_value.cpu().detach() for loss_value in train_loss_list]
validation_loss_list_converted = [loss_value.cpu().detach() for loss_value in validation_loss_list]

# Training loss in green, validation loss in red; one point per evaluation.
plt.plot(train_loss_list_converted, 'g')
plt.plot(validation_loss_list_converted, 'r')
plt.xlabel("Steps - Every 100 epochs")
plt.ylabel("Loss")
plt.show()

In [None]:
# Rebuild the model and restore the best checkpoint saved during training.
nanoGPT = GPT.GPT(config)
device =  "cuda" if torch.cuda.is_available() else "cpu"
best_model_params_path = "best_model_params.pt"
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict contains, so a tampered checkpoint cannot execute code on load.
nanoGPT.load_state_dict(
    torch.load(best_model_params_path, map_location=torch.device(device), weights_only=True)
) # load best model states
# A freshly built module is in training mode; switch to eval so dropout is
# disabled while generating. The trailing ';' suppresses the model repr.
nanoGPT.eval();


In [None]:
# Encode a short prompt, let the model continue it for 200 tokens, and print the text.
sentence = "There was a"
# Wrapping the id list in another list gives the (1, T) batch shape directly.
context = torch.tensor([enc.encode_ordinary(sentence)])
y = nanoGPT.generate(context, 200)
print(enc.decode(y[0].tolist()))