### Step 1: Install necessary packages

In [1]:
%pip install matplotlib
%pip install torch numpy transformers datasets tiktoken wandb tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

# Query CUDA once and reuse the answer for every report below.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)

# Select the device object matching that answer and show it.
device = torch.device("cuda" if has_cuda else "cpu")
print("Using device:", device)

# When a GPU is present, also report the name of device index 0.
if has_cuda:
    print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
Using device: cuda
GPU name: NVIDIA GeForce RTX 3060 Ti


### Step 2: Package imports and configuration

In [3]:
import sys
import os
# Put the parent directory on the import path; presumably that is where model.py lives.
sys.path.append(os.path.abspath(".."))
# NOTE(review): torch was already imported and queried CUDA in the previous cell, so in a
# top-to-bottom run this assignment likely has no effect on the running kernel. On a fresh
# kernel it would restrict CUDA to physical GPU index 1 — confirm that is intended,
# especially on single-GPU machines.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
# (duplicate of the `import random` above; harmless)
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# Configuration
# Divisor applied to (pos_logprob - neg_logprob) inside the preference loss.
beta = 0.5
# Plain string ('cuda' / 'cpu'); this rebinds the torch.device object created in the previous cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_lr = 1e-4 # learning rate handed to the optimizer; the scheduler scales it per step
epochs = 5
batch_size = 64
# Every encoded positive/negative text is padded or truncated to this many tokens.
max_length =64
num_samples = 1  # not referenced by any cell visible in this notebook
# Sampling settings passed to gpt.generate in the testing step.
max_new_tokens = 200
temperature = 0.7 # default is 0.8
top_k = 200
# tokenizer
# NOTE: pickle.load can execute arbitrary code — only open meta.pkl from a trusted source.
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi, itos = meta["stoi"], meta["itos"]
# Earlier strict versions, superseded by the encode/decode definitions below:
#def encode(s): return [stoi[c] for c in s]
#def decode(l): return ''.join([itos[i] for i in l])

# Id used as padding by pad_or_truncate and skipped by compute_logprob (both hard-code 0).
# NOTE(review): if the vocabulary maps a real character to id 0, that character is treated
# as padding and excluded from the loss — verify against meta["itos"].
PAD_IDX = 0
UNK_IDX = stoi.get("<unk>", stoi.get(" ", PAD_IDX))  # prefer <unk>, then space, else pad(0)

def encode(s: str):
    """Convert a string into a list of token ids, one id per character.

    Characters missing from `stoi` are replaced by `UNK_IDX` rather than
    raising KeyError.
    """
    token_ids = []
    for ch in s:
        token_ids.append(stoi.get(ch, UNK_IDX))
    return token_ids

def decode(ids):
    """Convert token ids back into a string.

    Ids outside the range [0, len(itos)) are silently dropped.
    """
    pieces = []
    for token_id in ids:
        if 0 <= token_id < len(itos):
            pieces.append(itos[token_id])
    return ''.join(pieces)

# this the default

### Step 3: Define helper functions

In [4]:
def compute_logprob(input_ids, model=None, pad_idx=0):
    """Return the mean per-token log-probability of each sequence in a batch.

    The sequence is shifted by one position: tokens [0, T-1) are fed to the
    model and tokens [1, T) are the prediction targets. Targets equal to
    `pad_idx` are excluded from the average.

    Args:
        input_ids: LongTensor of shape (B, T) holding padded token ids.
        model: callable invoked as ``model(inputs, full_seq=True)`` and
            returning ``(logits, _)`` with logits of shape (B, T-1, V).
            Defaults to the notebook-level ``gpt``, looked up at call time.
        pad_idx: token id treated as padding (default 0, matching the pad
            value used by ``pad_or_truncate``).

    Returns:
        Tensor of shape (B,) with the average log-probability of the
        non-pad target tokens. Rows with no non-pad target return 0.0
        instead of NaN.
    """
    if model is None:
        model = gpt
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = model(inputs, full_seq=True)
    B, T, V = logits.size()
    # reduction='none' keeps one NLL per position; ignore_index zeroes pad targets.
    token_nll = F.cross_entropy(
        logits.reshape(-1, V),
        targets.reshape(-1),
        ignore_index=pad_idx,
        reduction='none',
    ).reshape(B, T)
    attention_mask = (targets != pad_idx).float()
    # Clamp the count so an all-padding row divides by 1 instead of 0;
    # a single NaN row would otherwise poison the batch loss and every gradient.
    token_count = attention_mask.sum(dim=1).clamp(min=1.0)
    seq_nll = (token_nll * attention_mask).sum(dim=1) / token_count
    return -seq_nll

def pad_or_truncate(seq, max_length, pad_value=0):
    """Force a token list to exactly `max_length` items.

    Longer sequences keep their LAST `max_length` items (the head is dropped);
    shorter sequences are right-padded with `pad_value`.

    Args:
        seq: list of token ids.
        max_length: target length, must be >= 0.
        pad_value: id appended as padding (default 0).

    Returns:
        A new list of length `max_length`.

    Raises:
        ValueError: if `max_length` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    overflow = len(seq) - max_length
    if overflow > 0:
        # Slice from an explicit start index: seq[-max_length:] would return the
        # whole sequence when max_length == 0, because -0 == 0.
        return seq[overflow:]
    return seq + [pad_value] * (-overflow)

def get_batches(lines, batch_size):
    """Yield shuffled (negative, positive) token-id batches for one epoch.

    Args:
        lines: sequence of dicts, each with 'negative' and 'positive' text.
        batch_size: number of pairs per batch.

    Yields:
        (neg_tensor, pos_tensor): two LongTensors of shape
        (batch_size, max_length) on `device`. Each text has '\n\n\n\n'
        appended before encoding and is padded/truncated to `max_length`.

    A trailing group smaller than `batch_size` is dropped, so exactly
    ``len(lines) // batch_size`` batches are produced.
    """
    # Shuffle a shallow copy so the caller's list keeps its original order.
    shuffled = list(lines)
    random.shuffle(shuffled)
    suffix = '\n\n\n\n'
    # Stopping at len - batch_size + 1 skips the final partial batch.
    for start in range(0, len(shuffled) - batch_size + 1, batch_size):
        batch = shuffled[start:start + batch_size]
        neg_inputs = [pad_or_truncate(encode(p['negative'] + suffix), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + suffix), max_length) for p in batch]
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [5]:
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
ckpt = torch.load("../sft/gpt.pt", map_location=device)
state_dict = ckpt['model']

# Rename every key carrying the '_orig_mod.' prefix (presumably left by torch.compile)
# so the names match a freshly constructed GPT module.
unwanted_prefix = '_orig_mod.'
prefixed_keys = [name for name in state_dict if name.startswith(unwanted_prefix)]
for name in prefixed_keys:
    state_dict[name[len(unwanted_prefix):]] = state_dict.pop(name)

# Rebuild the architecture from the saved constructor arguments, then restore the weights.
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)
gpt.load_state_dict(state_dict)
# Training mode: the dropout layers stay active during fine-tuning.
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [6]:
# Load data from pos_neg_pairs.json
import json
import tiktoken
# Load the preference pairs, CHANGE ADDRESS IF NEEDED
with open("../pos_neg_pairs.json", "r", encoding="utf-8") as f:
    lines = json.load(f)

# Fail early with a clear message if the file does not have the layout get_batches
# expects: a list of objects carrying 'positive' and 'negative' text fields.
if not isinstance(lines, list):
    raise TypeError(f"expected a JSON list of pairs, got {type(lines).__name__}")
if lines and not (isinstance(lines[0], dict) and {"positive", "negative"} <= set(lines[0])):
    raise ValueError("each pair must be an object with 'positive' and 'negative' keys")
print(f"Loaded {len(lines)} pairs.")

Loaded 100000 pairs.


### Step 6: Optimizer and Scheduler

In [7]:
grad_clip = 1.0  # max gradient norm passed to clip_grad_norm_ in the training loop
# Weight on the positive-sample log-prob term; the training loop interpolates it
# linearly from the start value to the end value over all steps.
anchor_weight_start = 0.2
anchor_weight_end = 0.05
# Weight on the mean negative-sample log-prob term that is added to the loss.
neg_anchor_weight = 0.001

# Number of full batches per epoch (get_batches drops a trailing partial batch).
total_steps = len(lines) // batch_size

In [8]:
# AdamW optimizer
import math
optimizer = gpt.configure_optimizers(
    weight_decay=0.01,            # weight-decay coefficient handed to the model's optimizer factory
    learning_rate=base_lr,        # base LR; LambdaLR multiplies it by lr_factor(step)
    betas=(0.9, 0.95),            # Adam (beta1, beta2)
    device_type='cuda' if device=='cuda' else 'cpu'  # compares against the plain-string `device` from Step 2
)

# Schedule length: one scheduler step per full batch, over all epochs.
max_iters   = (len(lines) // batch_size) * epochs
# Warmup covers the first 3% of all steps, and at least one step.
warmup_steps = max(1, int(0.03 * max_iters))

def lr_factor(step: int, warmup=None, total=None, floor: float = 0.1):
    """Multiplicative learning-rate factor for `step` (linear warmup, then cosine decay).

    Args:
        step: zero-based scheduler step.
        warmup: number of warmup steps; defaults to the notebook-level `warmup_steps`.
        total: total number of steps; defaults to the notebook-level `max_iters`.
        floor: factor reached at the end of the decay (default 0.1).

    Returns:
        (step + 1) / warmup during warmup, then a half-cosine from 1.0 down to
        `floor`. For steps at or beyond `total` the factor stays at `floor`.
    """
    warmup = warmup_steps if warmup is None else warmup
    total = max_iters if total is None else total
    if step < warmup:
        return (step + 1) / warmup                        # rises linearly to 1.0 at the last warmup step
    t = (step - warmup) / max(1, (total - warmup))
    # Clamp progress to 1: the cosine is periodic, so without this the factor
    # would climb back toward 1.0 if training ran past `total` steps.
    t = min(1.0, t)
    return floor + (1.0 - floor) * 0.5 * (1 + math.cos(math.pi * t))

# LambdaLR sets lr = base_lr * lr_factor(step); scheduler.step() must be called once per optimizer step.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_factor)


num decayed parameter tensors: 26, with 8,834,328 parameters
num non-decayed parameter tensors: 13, with 4,524 parameters
using fused AdamW: True


### Step 7: Begin training (**students are required to complete this part!**)

This one works

In [9]:
# total_steps = len(lines) // batch_size
# for epoch in range(epochs):
#     pbar = tqdm(get_batches(lines, batch_size))
#     for step, (neg_tensor, pos_tensor) in enumerate(pbar):
#         ##########################################################################
#         neg_log = compute_logprob(neg_tensor)
#         pos_log = compute_logprob(pos_tensor)
#         loss = -F.logsigmoid((pos_log - neg_log) / beta).mean() - 0.1 * pos_log.mean()

#         global_step = epoch * total_steps + step + 1
#         optimizer.zero_grad()
#         loss.backward()
#         lr = warmup_cosine_scheduler(total_steps * epochs, base_lr, global_step)

#         for param_group in optimizer.param_groups:
#             param_group['lr'] = lr

#         optimizer.step()
#         pbar.set_description(f"epoch {epoch+1} step {step+1} loss={loss.item():.4f} lr={lr:.2e}")
#         ##########################################################################

#     ckpt_path = f"./dpo_epoch_{epoch+1}.pt"
#     torch.save({
#         "model_state_dict": gpt.state_dict(),
#         "model_args": ckpt['model_args'],
#     }, ckpt_path)
#     print(f"Saved checkpoint to {ckpt_path}")

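This one doesn't work: it calls `scheduler.step()` only once per epoch, but the warmup/cosine schedule is defined per iteration, so the learning rate never leaves the start of warmup.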

In [10]:
# for epoch in range(epochs):
#     pbar = tqdm(get_batches(lines, batch_size))
#     epoch_loss = 0.0  # Initialize the epoch loss for each epoch
#     for step, (neg_tensor, pos_tensor) in enumerate(pbar):
#         ##########################################################################
#         # Forward pass for negative and positive tensors
#         neg_logprob = compute_logprob(neg_tensor)
#         pos_logprob = compute_logprob(pos_tensor)

#         # Compute the preference term (contrastive loss)
#         logit_diff = (pos_logprob - neg_logprob) / beta
#         preference_term = -torch.nn.functional.logsigmoid(logit_diff).mean()

#         # Adaptive anchor term (adjust the importance of positive/negative samples)
#         progress = epoch * total_steps + step  # Track progress over the entire training
#         anchor_weight = anchor_weight_start * (1 - progress / (total_steps * epochs)) + anchor_weight_end * (progress / (total_steps * epochs))

#         # Dual anchoring: encourage good positives, discourage negatives
#         pos_anchor = -anchor_weight * pos_logprob.mean()
#         neg_anchor = neg_anchor_weight * neg_logprob.mean()
#         anchor_term = pos_anchor + neg_anchor

#         # Total loss (contrastive + anchor regularization)
#         loss = preference_term + anchor_term

#         # Backward pass
#         optimizer.zero_grad()  # Zero the gradients before backpropagation
#         loss.backward()

#         # Clip gradients for stability
#         torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)

#         # Optimizer step: update model parameters
#         optimizer.step()

#         # Track loss for the epoch
#         epoch_loss += loss.item()

#         # Update progress bar description with loss and learning rate info
#         lr = scheduler.get_last_lr()[0]  # Get the current learning rate from scheduler
#         pbar.set_description(f"Epoch {epoch + 1}/{epochs} | Step {step}/{total_steps} | Loss {loss.item():.4f} | LR {lr:.2e}")

#     # Update the learning rate scheduler at the end of each epoch
#     scheduler.step()

#     # Print average loss for the epoch
#     print(f"Epoch {epoch + 1} completed. Avg Loss: {epoch_loss / total_steps:.4f}")
#         ##########################################################################

#     ckpt_path = f"./dpo.pt"
#     torch.save({
#             "model_state_dict": gpt.state_dict(),
#             "model_args": ckpt['model_args'],
#         }, ckpt_path)
#     print(f"Saved checkpoint to {ckpt_path}")


Kieran's new try

In [11]:
# Preference fine-tuning loop.
# Per batch the loss is:
#   -logsigmoid((pos_logprob - neg_logprob) / beta).mean()   preference term
#   - anchor_weight * pos_logprob.mean()                      keeps positives likely
#   + neg_anchor_weight * neg_logprob.mean()                  pushes negatives down
# anchor_weight moves linearly from anchor_weight_start to anchor_weight_end over all steps.
global_step = 0
for epoch in range(epochs):
    # get_batches is a generator with no len(), so give tqdm the expected batch
    # count; otherwise it can only show a bare iteration counter with no bar or ETA.
    pbar = tqdm(get_batches(lines, batch_size), total=total_steps)
    epoch_loss = 0.0
    batches_seen = 0

    for step, (neg_tensor, pos_tensor) in enumerate(pbar):
        # forward
        neg_logprob = compute_logprob(neg_tensor)
        pos_logprob = compute_logprob(pos_tensor)
        logit_diff = (pos_logprob - neg_logprob) / beta
        preference_term = -torch.nn.functional.logsigmoid(logit_diff).mean()

        # Fraction of the whole run completed, used to interpolate the anchor weight.
        progress = epoch * total_steps + step
        anchor_weight = anchor_weight_start * (1 - progress / (total_steps * epochs)) \
                        + anchor_weight_end   * (progress / (total_steps * epochs))
        pos_anchor = -anchor_weight * pos_logprob.mean()
        neg_anchor =  neg_anchor_weight * neg_logprob.mean()
        loss = preference_term + (pos_anchor + neg_anchor)

        # backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)
        optimizer.step()
        scheduler.step()                 # the LR schedule is defined per iteration
        global_step += 1

        # Read the scalar once: each .item() call forces a device-to-host sync.
        loss_value = loss.item()
        epoch_loss += loss_value
        batches_seen += 1
        current_lr = optimizer.param_groups[0]['lr']  # read actual LR
        pbar.set_description(
            f"Epoch {epoch+1}/{epochs} | Step {step+1}/{total_steps} | "
            f"Loss {loss_value:.4f} | LR {current_lr:.2e}"
        )

    # Average over the batches actually processed; max(1, ...) avoids a
    # ZeroDivisionError when the dataset is smaller than one batch.
    print(f"Epoch {epoch + 1} completed. Avg Loss: {epoch_loss / max(1, batches_seen):.4f}")

    # save (the same file is overwritten after every epoch)
    torch.save({"model_state_dict": gpt.state_dict(),
                "model_args": ckpt['model_args']}, "./dpo.pt")
    print("Saved checkpoint to ./dpo.pt")

Epoch 1/5 | Step 1562/1562 | Loss -0.0237 | LR 9.33e-05: : 1562it [02:38,  9.88it/s]


Epoch 1 completed. Avg Loss: 0.1708
Saved checkpoint to ./dpo.pt


Epoch 2/5 | Step 1562/1562 | Loss -0.1173 | LR 7.14e-05: : 1562it [02:38,  9.86it/s]


Epoch 2 completed. Avg Loss: -0.0707
Saved checkpoint to ./dpo.pt


Epoch 3/5 | Step 1562/1562 | Loss -0.1945 | LR 4.28e-05: : 1562it [02:37,  9.94it/s]


Epoch 3 completed. Avg Loss: -0.1573
Saved checkpoint to ./dpo.pt


Epoch 4/5 | Step 1562/1562 | Loss -0.2428 | LR 1.91e-05: : 1562it [02:36,  9.99it/s]


Epoch 4 completed. Avg Loss: -0.2188
Saved checkpoint to ./dpo.pt


Epoch 5/5 | Step 1562/1562 | Loss -0.2623 | LR 1.00e-05: : 1562it [02:36,  9.98it/s]

Epoch 5 completed. Avg Loss: -0.2529
Saved checkpoint to ./dpo.pt





### Step 8: Begin testing (**students are required to complete this part!**)

In [14]:
# Load the fine-tuned model
# NOTE(review): the training cell saves to "./dpo.pt"; this path points at the same
# file only when the notebook runs from inside the dpo/ directory — confirm.
ckpt_path = "../dpo/dpo.pt"
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
# Use the configured device instead of a hard-coded .cuda(), so the model lives on
# the same device as the prompt tensors below and the cell also runs on CPU.
gpt = GPT(gptconf).to(device)
# The training cell stores weights under 'model_state_dict'; the SFT checkpoint
# loaded earlier uses 'model'. Accept either layout explicitly rather than relying
# on a bare except.
if 'model' in checkpoint:
    state_dict = checkpoint['model']
else:
    state_dict = checkpoint['model_state_dict']
# Strip the '_orig_mod.' key prefix so the names match a fresh GPT module.
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["100+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
with torch.no_grad():
    for prompt in test_set:
        prompt_ids = encode(prompt)
        ###########################################################
        # Add a batch dimension: generate receives a (1, T) LongTensor.
        prompts_ids_formatted = torch.tensor([prompt_ids], dtype=torch.long, device=device)
        result = gpt.generate(prompts_ids_formatted, max_new_tokens, temperature, top_k)
        # The first row of the result is decoded back to text (prompt included, as the output shows).
        decoded_result = decode(result[0].detach().cpu().view(-1).tolist())
        print(decoded_result)
        ###########################################################


100+19=? The answer is 119 because 100+19 equals 119.
3*17=? The answer is 57 because 3*17 equals 57.
72/4=? The answer is 17 because 72//4 equals 17.
72-x=34,x=? The answer is 40 because 72-40 equals 34.
x*11=44,x=? The answer is 34 because 34+11 equals 44.
3*17=? The answer is 67 because 3*17 equals 67.
72/4=? The answer is 15 because 72//4 equals 15.
72-x=34,x=? The answer is 38 because 72-38 equals 34.
