In [None]:
# !pip install torch torchvision --quiet

import os, math, time, random
from dataclasses import dataclass
from typing import Tuple, Dict, Any, List

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset, random_split
from torchvision import transforms, datasets, models


In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 42
random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

NUM_CLASSES = 10  # STL-10 has 10 classes
IMG_SIZE = 224    # upscale 96 -> 224 for ResNet-152
BATCH_SIZE = 64   # reduce if memory-limited
NUM_WORKERS = 4   # bump if you have CPU headroom

In [None]:


# ---------- Transforms ----------
# Use ImageNet normalization because we rely on ImageNet-pretrained weights
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]
train_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
eval_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# ---------- Data ----------
def get_stl10_dataloaders(data_root="./data", val_ratio=0.2)->Tuple[DataLoader, DataLoader, DataLoader]:
    # STL10 train split has 5k labeled images; test has 8k
    train_full = datasets.STL10(root=data_root, split='train', download=True, transform=train_tfms)
    test_set   = datasets.STL10(root=data_root, split='test',  download=True, transform=eval_tfms)

    val_size = int(len(train_full) * val_ratio)
    train_size = len(train_full) - val_size
    g = torch.Generator().manual_seed(SEED)
    train_set, val_set = random_split(train_full, [train_size, val_size], generator=g)

    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS, pin_memory=True)
    val_loader   = DataLoader(val_set,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    return train_loader, val_loader, test_loader

train_loader, val_loader, test_loader = get_stl10_dataloaders()

100%|██████████| 2.64G/2.64G [01:00<00:00, 43.4MB/s]


In [None]:
# ---------- Model helpers ----------
def make_resnet152(pretrained: bool, num_classes: int = NUM_CLASSES) -> nn.Module:
    if pretrained:
        weights = models.ResNet152_Weights.IMAGENET1K_V2
        model = models.resnet152(weights=weights)
    else:
        model = models.resnet152(weights=None)

    in_feats = model.fc.in_features
    model.fc = nn.Linear(in_feats, num_classes)  # new head
    return model

def freeze_all(m: nn.Module):
    for p in m.parameters():
        p.requires_grad = False

def unfreeze_all(m: nn.Module):
    for p in m.parameters():
        p.requires_grad = True

def trainable_params(m: nn.Module):
    return [p for p in m.parameters() if p.requires_grad]

# ---------- Optimizers / schedulers ----------
def make_optimizer(model, lr_backbone, lr_head=None, wd=1e-4, last_block_only=False, full_ft=False):
    """
    - head-only: call with lr_backbone=None and pass only head params externally
    - last_block_only: layer4 gets lr_backbone; fc gets lr_head
    - full_ft: backbone gets lr_backbone; fc gets lr_head
    """
    params = []
    if last_block_only:
        assert lr_backbone is not None and lr_head is not None
        params.append({"params": model.layer4.parameters(), "lr": lr_backbone})
        params.append({"params": model.fc.parameters(),     "lr": lr_head})
    elif full_ft:
        assert lr_backbone is not None and lr_head is not None
        # backbone: everything except fc
        backbone_params = []
        for n, p in model.named_parameters():
            if p.requires_grad and not n.startswith("fc."):
                backbone_params.append(p)
        params.append({"params": backbone_params, "lr": lr_backbone})
        params.append({"params": model.fc.parameters(), "lr": lr_head})
    else:
        # head-only handled outside (just pass model.fc.parameters()).
        pass

    opt = torch.optim.AdamW(params if params else model.fc.parameters(), lr=lr_head if lr_head else lr_backbone, weight_decay=wd)
    return opt

def make_cosine_scheduler(optimizer, warmup_steps, total_steps):
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        progress = float(step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# ---------- Training / Eval ----------
@dataclass
class TrainConfig:
    epochs: int = 20
    label_smoothing: float = 0.0
    early_stop_patience: int = 3
    max_steps_per_epoch: int = None  # set to int to cap steps for quick debug
    use_amp: bool = True

class EarlyStopper:
    def __init__(self, patience=3, mode='max'):
        self.patience = patience
        self.mode = mode
        self.best = -float('inf') if mode=='max' else float('inf')
        self.bad_epochs = 0
        self.should_stop = False

    def step(self, metric):
        improved = (metric > self.best) if self.mode=='max' else (metric < self.best)
        if improved:
            self.best = metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                self.should_stop = True

def accuracy(logits, y):
    preds = logits.argmax(1)
    return (preds == y).float().mean().item()

def evaluate(model, loader, criterion):
    model.eval()
    total_loss, total_acc, total_n = 0.0, 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
            logits = model(x)
            loss = criterion(logits, y)
            bs = y.size(0)
            total_loss += loss.item() * bs
            total_acc  += accuracy(logits, y) * bs
            total_n    += bs
    return total_loss/total_n, total_acc/total_n

def train_one_epoch(model, loader, criterion, optimizer, scaler: torch.cuda.amp.GradScaler=None, max_steps=None):
    model.train()
    total_loss, total_acc, total_n = 0.0, 0.0, 0
    step = 0
    for x, y in loader:
        x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

        bs = y.size(0)
        total_loss += loss.item() * bs
        total_acc  += accuracy(logits, y) * bs
        total_n    += bs
        step += 1
        if max_steps and step >= max_steps:
            break
    return total_loss/total_n, total_acc/total_n

def fit(model, train_loader, val_loader, optimizer, scheduler, config: TrainConfig):
    model.to(DEVICE)
    criterion = nn.CrossEntropyLoss(label_smoothing=config.label_smoothing)
    scaler = torch.cuda.amp.GradScaler() if (config.use_amp and DEVICE.type=='cuda') else None

    stopper = EarlyStopper(patience=config.early_stop_patience, mode='max')
    best = {"epoch": -1, "val_acc": -1.0, "state_dict": None}

    global_step = 0
    total_steps = config.epochs * len(train_loader)
    warmup = max(1, int(0.05 * total_steps))  # ~5% warmup; tweak per taste

    start = time.time()
    for epoch in range(1, config.epochs+1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, criterion, optimizer, scaler, max_steps=config.max_steps_per_epoch)
        val_loss, val_acc = evaluate(model, val_loader, criterion)

        # Update scheduler per step (cosine lambda depends on step)
        if scheduler:
            # Advance by number of batches seen this epoch
            for _ in range(len(train_loader if not config.max_steps_per_epoch else range(config.max_steps_per_epoch))):
                scheduler.step()
                global_step += 1

        print(f"[Epoch {epoch:02d}] train loss={tr_loss:.4f} acc={tr_acc*100:.2f}% | val loss={val_loss:.4f} acc={val_acc*100:.2f}%")

        if val_acc > best["val_acc"]:
            best.update({"epoch": epoch, "val_acc": val_acc, "state_dict": {k:v.cpu() for k,v in model.state_dict().items()}})

        stopper.step(val_acc)
        if stopper.should_stop:
            print("Early stopping.")
            break

    took = time.time() - start
    print(f"Best @ epoch {best['epoch']}: val acc={best['val_acc']*100:.2f}% | time={took/60:.1f} min")
    # load best weights back to model
    if best["state_dict"] is not None:
        model.load_state_dict({k:v.to(DEVICE) for k,v in best["state_dict"].items()})
    return best

In [None]:
# E1: Pretrained backbone frozen, train only the new head
config = TrainConfig(epochs=15, label_smoothing=0.0, early_stop_patience=3, use_amp=True)

model_e1 = make_resnet152(pretrained=True, num_classes=NUM_CLASSES)
freeze_all(model_e1)                # freeze everything
for p in model_e1.fc.parameters():  # unfreeze head
    p.requires_grad = True

# Optimizer only on head
opt_e1 = torch.optim.AdamW(model_e1.fc.parameters(), lr=5e-4, weight_decay=1e-4)
# Cosine scheduler over (epochs * steps); will be advanced inside fit()
sched_e1 = make_cosine_scheduler(opt_e1, warmup_steps=100, total_steps=config.epochs*len(train_loader))

best_e1 = fit(model_e1, train_loader, val_loader, opt_e1, sched_e1, config)

# Evaluate on test set
test_loss, test_acc = evaluate(model_e1, test_loader, nn.CrossEntropyLoss())
print(f"E1 Test acc: {test_acc*100:.2f}%")


Downloading: "https://download.pytorch.org/models/resnet152-f82ba261.pth" to /root/.cache/torch/hub/checkpoints/resnet152-f82ba261.pth


100%|██████████| 230M/230M [00:01<00:00, 163MB/s]
  scaler = torch.cuda.amp.GradScaler() if (config.use_amp and DEVICE.type=='cuda') else None
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
  with torch.cuda.amp.autocast():
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)


[Epoch 01] train loss=2.2818 acc=15.00% | val loss=2.2882 acc=12.40%
[Epoch 02] train loss=1.6920 acc=70.67% | val loss=1.2652 acc=90.70%
[Epoch 03] train loss=0.8545 acc=94.27% | val loss=0.6761 acc=94.20%
[Epoch 04] train loss=0.4893 acc=95.97% | val loss=0.4645 acc=95.30%
[Epoch 05] train loss=0.3517 acc=96.38% | val loss=0.3580 acc=95.70%
[Epoch 06] train loss=0.2799 acc=96.65% | val loss=0.3149 acc=95.60%
[Epoch 07] train loss=0.2351 acc=97.42% | val loss=0.2630 acc=96.40%
[Epoch 08] train loss=0.2061 acc=97.50% | val loss=0.2402 acc=96.70%
[Epoch 09] train loss=0.1850 acc=97.72% | val loss=0.2373 acc=95.90%
[Epoch 10] train loss=0.1719 acc=97.90% | val loss=0.2187 acc=96.60%
[Epoch 11] train loss=0.1693 acc=97.47% | val loss=0.2091 acc=96.30%
Early stopping.
Best @ epoch 8: val acc=96.70% | time=7.4 min
E1 Test acc: 97.22%


In [None]:
torch.save(best_e1["state_dict"], "resnet152_E1_head_only.pth")

In [None]:
# E2: Pretrained; unfreeze layer4 + head (discriminative LRs)
config = TrainConfig(epochs=20, label_smoothing=0.0, early_stop_patience=3, use_amp=True)

model_e2 = make_resnet152(pretrained=True, num_classes=NUM_CLASSES)
freeze_all(model_e2)
for p in model_e2.layer4.parameters():
    p.requires_grad = True
for p in model_e2.fc.parameters():
    p.requires_grad = True

# Discriminative LRs: smaller for backbone block, larger for head
opt_e2 = make_optimizer(model_e2, lr_backbone=1e-4, lr_head=5e-4, wd=1e-4, last_block_only=True)
sched_e2 = make_cosine_scheduler(opt_e2, warmup_steps=200, total_steps=config.epochs*len(train_loader))

best_e2 = fit(model_e2, train_loader, val_loader, opt_e2, sched_e2, config)

test_loss, test_acc = evaluate(model_e2, test_loader, nn.CrossEntropyLoss())
print(f"E2 Test acc: {test_acc*100:.2f}%")


  scaler = torch.cuda.amp.GradScaler() if (config.use_amp and DEVICE.type=='cuda') else None
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
  with torch.cuda.amp.autocast():
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)


[Epoch 01] train loss=2.2891 acc=12.25% | val loss=2.2917 acc=11.90%
[Epoch 02] train loss=1.5100 acc=70.70% | val loss=0.5353 acc=95.20%
[Epoch 03] train loss=0.1969 acc=96.43% | val loss=0.0983 acc=97.40%
[Epoch 04] train loss=0.0750 acc=97.95% | val loss=0.0772 acc=97.50%
[Epoch 05] train loss=0.0404 acc=98.83% | val loss=0.0965 acc=97.10%
[Epoch 06] train loss=0.0262 acc=99.28% | val loss=0.0722 acc=97.90%
[Epoch 07] train loss=0.0220 acc=99.35% | val loss=0.0928 acc=97.30%
[Epoch 08] train loss=0.0181 acc=99.45% | val loss=0.0734 acc=98.00%
[Epoch 09] train loss=0.0211 acc=99.42% | val loss=0.0694 acc=97.80%
[Epoch 10] train loss=0.0160 acc=99.72% | val loss=0.0787 acc=97.60%
[Epoch 11] train loss=0.0132 acc=99.60% | val loss=0.0945 acc=97.00%
Early stopping.
Best @ epoch 8: val acc=98.00% | time=7.6 min
E2 Test acc: 98.29%


In [None]:
torch.save(best_e2["state_dict"], "resnet152_E2_lastblock.pth")

In [None]:
# E3: Pretrained; unfreeze everything, tiny LR for backbone, larger for head
config = TrainConfig(epochs=20, label_smoothing=0.0, early_stop_patience=3, use_amp=True)

model_e3 = make_resnet152(pretrained=True, num_classes=NUM_CLASSES)
unfreeze_all(model_e3)

# Gentle LRs to avoid destroying pretrained features
opt_e3 = make_optimizer(model_e3, lr_backbone=2e-5, lr_head=8e-5, wd=1e-4, full_ft=True)
sched_e3 = make_cosine_scheduler(opt_e3, warmup_steps=300, total_steps=config.epochs*len(train_loader))

best_e3 = fit(model_e3, train_loader, val_loader, opt_e3, sched_e3, config)

test_loss, test_acc = evaluate(model_e3, test_loader, nn.CrossEntropyLoss())
print(f"E3 Test acc: {test_acc*100:.2f}%")


  scaler = torch.cuda.amp.GradScaler() if (config.use_amp and DEVICE.type=='cuda') else None
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
  with torch.cuda.amp.autocast():
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)


[Epoch 01] train loss=2.3686 acc=5.67% | val loss=2.3649 acc=5.60%
[Epoch 02] train loss=2.2835 acc=14.65% | val loss=2.2021 acc=22.60%
[Epoch 03] train loss=1.9658 acc=49.33% | val loss=1.6860 acc=69.80%
[Epoch 04] train loss=1.0933 acc=85.97% | val loss=0.5386 acc=95.30%
[Epoch 05] train loss=0.2554 acc=97.15% | val loss=0.1423 acc=97.10%
[Epoch 06] train loss=0.0824 acc=98.40% | val loss=0.0940 acc=97.70%
[Epoch 07] train loss=0.0458 acc=99.15% | val loss=0.0689 acc=98.10%
[Epoch 08] train loss=0.0267 acc=99.40% | val loss=0.0733 acc=98.00%
[Epoch 09] train loss=0.0187 acc=99.70% | val loss=0.0561 acc=98.10%
[Epoch 10] train loss=0.0184 acc=99.67% | val loss=0.0619 acc=97.50%
Early stopping.
Best @ epoch 7: val acc=98.10% | time=8.4 min
E3 Test acc: 98.52%


In [None]:
torch.save(best_e3["state_dict"], "resnet152_E3_full_pretrained.pth")

In [None]:
# E4: Scratch init; full training. Stronger regularization/aug may help.
config = TrainConfig(epochs=25, label_smoothing=0.1, early_stop_patience=4, use_amp=True)

model_e4 = make_resnet152(pretrained=False, num_classes=NUM_CLASSES)
unfreeze_all(model_e4)

# Larger LR (scratch), maybe SGD with momentum also works well; here we use AdamW
opt_e4 = torch.optim.AdamW(model_e4.parameters(), lr=1e-3, weight_decay=2e-4)
sched_e4 = make_cosine_scheduler(opt_e4, warmup_steps=500, total_steps=config.epochs*len(train_loader))

best_e4 = fit(model_e4, train_loader, val_loader, opt_e4, sched_e4, config)

test_loss, test_acc = evaluate(model_e4, test_loader, nn.CrossEntropyLoss())
print(f"E4 Test acc: {test_acc*100:.2f}%")


  scaler = torch.cuda.amp.GradScaler() if (config.use_amp and DEVICE.type=='cuda') else None
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
  with torch.cuda.amp.autocast():
  x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)


[Epoch 01] train loss=2.5049 acc=9.95% | val loss=2.4215 acc=10.20%
[Epoch 02] train loss=2.3597 acc=13.75% | val loss=2.3239 acc=19.50%
[Epoch 03] train loss=2.1495 acc=21.82% | val loss=2.1757 acc=22.80%
[Epoch 04] train loss=2.0400 acc=27.38% | val loss=2.0241 acc=28.90%
[Epoch 05] train loss=1.9129 acc=31.10% | val loss=1.9409 acc=32.80%
[Epoch 06] train loss=1.8931 acc=33.42% | val loss=1.9021 acc=35.90%
[Epoch 07] train loss=1.8375 acc=35.15% | val loss=1.9737 acc=35.40%
[Epoch 08] train loss=1.8171 acc=37.62% | val loss=1.8728 acc=38.00%
[Epoch 09] train loss=1.7657 acc=39.57% | val loss=1.7218 acc=41.30%
[Epoch 10] train loss=1.7228 acc=42.52% | val loss=1.6879 acc=41.90%
[Epoch 11] train loss=1.6531 acc=45.12% | val loss=1.7459 acc=41.00%
[Epoch 12] train loss=1.6090 acc=48.20% | val loss=1.6626 acc=44.30%
[Epoch 13] train loss=1.5732 acc=49.48% | val loss=1.7847 acc=40.40%
[Epoch 14] train loss=1.5322 acc=52.30% | val loss=1.6622 acc=48.30%
[Epoch 15] train loss=1.4648 acc=55

In [1]:
torch.save(best_e4["state_dict"], "resnet152_E4_full_scratch.pth")

NameError: name 'torch' is not defined

In [None]:
def summarize(tag, best, model):
    test_loss, test_acc = evaluate(model, test_loader, nn.CrossEntropyLoss())
    return {
        "exp": tag,
        "best_epoch": best["epoch"],
        "val_acc_best": round(best["val_acc"]*100, 2),
        "test_acc": round(test_acc*100, 2),
        "trainable_params_m": round(sum(p.numel() for p in trainable_params(model))/1e6, 2),
    }

summary = []
summary.append(summarize("E1_head_only", best_e1, model_e1))
summary.append(summarize("E2_layer4+head", best_e2, model_e2))
summary.append(summarize("E3_full_pretrained", best_e3, model_e3))
summary.append(summarize("E4_full_scratch", best_e4, model_e4))

import pandas as pd
pd.DataFrame(summary)


In [None]:
import time, torch
import pandas as pd
import matplotlib.pyplot as plt

def count_trainable_params(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

@torch.no_grad()
def eval_test_acc(model, loader):
    model.eval()
    crit = torch.nn.CrossEntropyLoss()
    tot_acc, tot_n = 0.0, 0
    for x, y in loader:
        x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
        logits = model(x)
        preds = logits.argmax(1)
        tot_acc += (preds == y).float().sum().item()
        tot_n   += y.size(0)
    return 100.0 * tot_acc / tot_n

# pull best epochs (already stored by your fit)
def best_epoch_of(best_dict):
    return best_dict.get("epoch", None) if isinstance(best_dict, dict) else None

summary_rows = []
for tag, model, best in [
    ("E1_head_only",        model_e1, best_e1),
    ("E2_layer4+head",      model_e2, best_e2),
    ("E3_full_pretrained",  model_e3, best_e3),
    ("E4_full_scratch",     model_e4, best_e4),
]:
    be = best_epoch_of(best)
    test_acc = eval_test_acc(model, test_loader)
    params_m = count_trainable_params(model)/1e6
    summary_rows.append({
        "exp": tag,
        "best_epoch": be,
        "test_acc_%": round(test_acc, 2),
        "trainable_params_M": round(params_m, 2),
        "steps_to_best": None,       # we'll fill after we estimate per-step time
        "est_step_time_s": None,     # ditto
        "est_time_to_best_min": None # ditto
    })

df = pd.DataFrame(summary_rows)
df


In [None]:
def estimate_step_time_seconds(model, loader, batches=5):
    model = model.to(DEVICE)
    model.train()
    # tiny optimizer for timing (won't be used for real training)
    opt = torch.optim.SGD((p for p in model.parameters() if p.requires_grad), lr=1e-3, momentum=0.0)
    crit = torch.nn.CrossEntropyLoss()

    n = 0
    start = None
    # warmup one batch for stable timing
    for x, y in loader:
        x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
        opt.zero_grad(set_to_none=True)
        out = model(x)
        loss = crit(out, y)
        loss.backward()
        opt.step()
        break

    if DEVICE.type == 'cuda':
        torch.cuda.synchronize()

    t0 = time.time()
    for x, y in loader:
        x, y = x.to(DEVICE, non_blocking=True), torch.tensor(y, device=DEVICE)
        opt.zero_grad(set_to_none=True)
        out = model(x)
        loss = crit(out, y)
        loss.backward()
        opt.step()
        n += 1
        if n >= batches:
            break

    if DEVICE.type == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()

    return (t1 - t0) / max(1, n)

# fill in steps_to_best (best_epoch * steps/epoch) and estimated time_to_best
steps_per_epoch = len(train_loader)

for i in range(len(df)):
    tag = df.loc[i, "exp"]
    model = {"E1_head_only": model_e1, "E2_layer4+head": model_e2,
             "E3_full_pretrained": model_e3, "E4_full_scratch": model_e4}[tag]
    be = df.loc[i, "best_epoch"]
    if be is None or be <= 0:
        continue
    step_time = estimate_step_time_seconds(model, train_loader, batches=5)
    steps_to_best = be * steps_per_epoch
    time_to_best_min = (step_time * steps_to_best) / 60.0
    df.loc[i, "steps_to_best"] = int(steps_to_best)
    df.loc[i, "est_step_time_s"] = round(step_time, 3)
    df.loc[i, "est_time_to_best_min"] = round(time_to_best_min, 2)

df.sort_values(["test_acc_%", "est_time_to_best_min"], ascending=[False, True])


In [None]:
# Accuracy vs estimated time-to-best
plt.figure()
valid = df.dropna(subset=["est_time_to_best_min"])
plt.scatter(valid["est_time_to_best_min"], valid["test_acc_%"])
for _, row in valid.iterrows():
    plt.annotate(row["exp"], (row["est_time_to_best_min"], row["test_acc_%"]))
plt.xlabel("Estimated time to best (min)")
plt.ylabel("Test accuracy (%)")
plt.title("Compute vs Accuracy (post-hoc estimate)")
plt.show()

# Accuracy vs trainable params (proxy for adaptation budget)
plt.figure()
plt.scatter(df["trainable_params_M"], df["test_acc_%"])
for _, row in df.iterrows():
    plt.annotate(row["exp"], (row["trainable_params_M"], row["test_acc_%"]))
plt.xlabel("Trainable parameters (Millions)")
plt.ylabel("Test accuracy (%)")
plt.title("Accuracy vs Adaptation Budget")
plt.show()

df
