<a href="https://colab.research.google.com/github/heewonLEE2/Data-Ai-Colab/blob/main/CIFAR10_%EC%B4%88%EA%B8%B0%ED%95%99%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# CIFAR-10 Colab Starter: Transfer Learning / From-Scratch
# =========================================================
!nvidia-smi -L || True  # GPU 확인(없어도 무시)

import os, math, random, time
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

import torchvision
import torchvision.transforms as T
from torchvision.transforms import AutoAugment, AutoAugmentPolicy
from torchvision import models

In [None]:
# ---------------------------
# Config
# ---------------------------
SEED = 42

# Dataset / model selection.
DATASET = 'CIFAR10'        # 'CIFAR10' or 'CIFAR100'
NUM_CLASSES = 10           # use 100 for CIFAR100
MODEL_NAME = 'resnet18'    # 'resnet18' or 'cnn'
FULL_FINETUNE = True       # ResNet only: fine-tune every layer (False trains just the head)

# Optimisation hyper-parameters.
BATCH_SIZE = 128
EPOCHS = 30
LR = 3e-4                  # base AdamW learning rate (chosen for ResNet)
WD = 0.05                  # weight decay
LABEL_SMOOTH = 0.1
WARMUP_EPOCHS = 3
PATIENCE = 7               # EarlyStopping patience

# One loader worker per reported CPU; fall back to 2 when the count is unknown.
NUM_WORKERS = os.cpu_count() or 2

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
cudnn.benchmark = DEVICE == 'cuda'

In [None]:
# ---------------------------
# Utils
# ---------------------------
def set_seed(seed=SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed,
                   torch.cuda.manual_seed_all):
        seeder(seed)


set_seed(SEED)

class EarlyStopping:
    """Flag when a monitored loss has stopped improving.

    `should_stop` turns True once `patience` consecutive calls to `step`
    fail to beat the best loss seen so far by more than `min_delta`.
    """

    def __init__(self, patience=PATIENCE, min_delta=0.0):
        self.patience, self.min_delta = patience, min_delta
        self.best = None           # lowest loss seen so far (None until the first step)
        self.counter = 0           # consecutive non-improving steps
        self.should_stop = False

    def step(self, val_loss):
        """Record one loss value and update `counter` / `should_stop`."""
        improved = self.best is None or val_loss < self.best - self.min_delta
        if improved:
            self.best, self.counter = val_loss, 0
            return
        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True

def get_cosine_schedule_with_warmup(optimizer, num_warmup, num_training):
    """Build a step-based LR schedule: linear warmup, then cosine decay to zero.

    Args:
        optimizer: Optimizer whose learning rate is scaled by the schedule.
        num_warmup: Number of scheduler steps over which the multiplier
            ramps linearly from 0 to 1.
        num_training: Scheduler step at which the multiplier reaches 0.

    Returns:
        A `LambdaLR` scheduler wrapping `optimizer`.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup:
            return float(current_step) / float(max(1, num_warmup))
        progress = float(current_step - num_warmup) / float(max(1, num_training - num_warmup))
        # Clamp so that stepping past `num_training` keeps the multiplier at 0
        # instead of letting the cosine swing back up.
        progress = min(1.0, progress)
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

def accuracy(output, target):
    """Return the fraction of rows in `output` whose arg-max (dim 1) equals `target`."""
    with torch.no_grad():
        hits = output.argmax(dim=1).eq(target)
        return hits.float().mean().item()

def save_checkpoint(model, path):
    """Write `model.state_dict()` to `path`, creating parent directories as needed.

    Args:
        model: Object exposing `state_dict()`.
        path: Destination file path; may be a bare file name.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if parent:
        os.makedirs(parent, exist_ok=True)
    torch.save(model.state_dict(), path)

In [None]:
# ---------------------------
# Data: transforms & loaders
# ---------------------------
# Choose normalisation statistics and augmentation for the selected model.
if MODEL_NAME == 'resnet18':
    # ImageNet statistics (recommended for ResNet).
    mean = (0.485, 0.456, 0.406); std = (0.229, 0.224, 0.225)
    train_tf = T.Compose([
        T.Resize(224),
        AutoAugment(AutoAugmentPolicy.CIFAR10),
        T.ToTensor(),
        T.Normalize(mean, std),
    ])
    test_tf = T.Compose([
        T.Resize(224),
        T.ToTensor(),
        T.Normalize(mean, std),
    ])
else:
    # CIFAR statistics (recommended for the small CNN).
    mean = (0.4914, 0.4822, 0.4465); std = (0.2470, 0.2435, 0.2616)
    train_tf = T.Compose([
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ColorJitter(0.1,0.1,0.1,0.1),
        T.ToTensor(),
        T.Normalize(mean, std),
    ])
    test_tf = T.Compose([
        T.ToTensor(),
        T.Normalize(mean, std),
    ])

DatasetClass = getattr(torchvision.datasets, DATASET)
trainset = DatasetClass(root='./data', train=True, download=True, transform=train_tf)
testset  = DatasetClass(root='./data', train=False, download=True, transform=test_tf)
# Second view of the training images using the deterministic eval transform,
# so validation metrics (which drive checkpointing and early stopping) are
# not distorted by training-time augmentation.
valset   = DatasetClass(root='./data', train=True, download=False, transform=test_tf)

# train/val split: shuffle all training indices, hold out the last `val_ratio`.
val_ratio = 0.1
num_train = len(trainset)
indices = np.arange(num_train)
np.random.shuffle(indices)
split = int(num_train * (1 - val_ratio))
train_idx, val_idx = indices[:split], indices[split:]
train_subset = torch.utils.data.Subset(trainset, train_idx)
val_subset   = torch.utils.data.Subset(valset, val_idx)

# Pinned memory and persistent workers are enabled only when running on CUDA.
pin = True if DEVICE=='cuda' else False
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=pin, persistent_workers=pin)
val_loader   = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=pin, persistent_workers=pin)
test_loader  = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=pin, persistent_workers=pin)

In [None]:
# ---------------------------
# Model
# ---------------------------
class SmallCNN(nn.Module):
    """Small convolutional classifier for 3-channel images.

    Three stages of (Conv-ReLU-BatchNorm, Conv-ReLU) with widths 64/128/256;
    the first two stages end in 2x2 max-pooling plus dropout, the last in
    global average pooling. A two-layer MLP head produces the class logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        stage_widths = (64, 128, 256)
        stage_dropout = (0.1, 0.2)   # the final stage has no pool/dropout

        layers = []
        in_ch = 3
        for stage, out_ch in enumerate(stage_widths):
            layers += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True), nn.BatchNorm2d(out_ch),
                nn.Conv2d(out_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True),
            ]
            if stage < len(stage_dropout):
                layers += [nn.MaxPool2d(2), nn.Dropout(stage_dropout[stage])]
            in_ch = out_ch
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*layers)

        head = [
            nn.Flatten(),
            nn.Linear(256, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for the image batch `x`."""
        pooled = self.features(x)
        logits = self.classifier(pooled)
        return logits

if MODEL_NAME != 'resnet18':
    model = SmallCNN(NUM_CLASSES)
else:
    # ImageNet-pretrained ResNet-18 with a freshly initialised classification head.
    weights = models.ResNet18_Weights.IMAGENET1K_V1
    model = models.resnet18(weights=weights)
    model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
    if not FULL_FINETUNE:
        # Freeze everything, then re-enable gradients for the new head only.
        model.requires_grad_(False)
        model.fc.requires_grad_(True)

model = model.to(DEVICE)

In [None]:
# ---------------------------
# Optimizer / Loss / Scheduler
# ---------------------------
# Only parameters that still require gradients are handed to the optimizer.
opt = optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=LR,
    weight_decay=WD,
)
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)
scaler = GradScaler(enabled=(DEVICE == 'cuda'))

# The scheduler is driven per optimizer step (warmup + cosine), so both
# lengths are expressed in batches rather than epochs.
total_steps = EPOCHS * len(train_loader)
warmup_steps = WARMUP_EPOCHS * len(train_loader)
scheduler = get_cosine_schedule_with_warmup(opt, warmup_steps, total_steps)

In [None]:
# ---------------------------
# Train / Eval
# ---------------------------
CKPT = '/content/best_cifar.pt'            # file that receives the best weights
early = EarlyStopping(patience=PATIENCE)
best_val = float('inf')                    # lowest validation loss seen so far

def run_one_epoch(loader, train=True):
    """Run one pass over `loader` and return `(mean_loss, mean_accuracy)`.

    With `train=True` the model is put in training mode and every batch
    performs an AMP forward/backward pass, an optimizer step and a scheduler
    step. With `train=False` the model is put in eval mode and batches are
    only evaluated under `torch.no_grad()`.

    Both means are weighted by batch size and divided by `len(loader.dataset)`.
    """
    use_amp = DEVICE == 'cuda'
    model.train(train)

    loss_sum = 0.0
    acc_sum = 0.0
    for images, labels in loader:
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        if not train:
            with torch.no_grad():
                logits = model(images)
                loss = criterion(logits, labels)
        else:
            opt.zero_grad(set_to_none=True)
            with autocast(enabled=use_amp):
                logits = model(images)
                loss = criterion(logits, labels)
            # Scaled backward pass, then optimizer and LR schedule advance once per batch.
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            scheduler.step()

        batch = images.size(0)
        loss_sum += loss.item() * batch
        acc_sum += accuracy(logits, labels) * batch

    total = len(loader.dataset)
    return loss_sum / total, acc_sum / total

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    train_loss, train_acc = run_one_epoch(train_loader, train=True)
    val_loss, val_acc = run_one_epoch(val_loader, train=False)

    # Keep the weights with the lowest validation loss on disk.
    improved = val_loss < best_val
    if improved:
        best_val = val_loss
        save_checkpoint(model, CKPT)
    early.step(val_loss)

    dt = time.time() - t0
    summary = (f"[{epoch:02d}/{EPOCHS}] "
               f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
               f"val_loss={val_loss:.4f} acc={val_acc:.4f} | {dt:.1f}s")
    print(summary)

    if early.should_stop:
        print("Early stopping triggered.")
        break

In [None]:
# ---------------------------
# Test with best checkpoint
# ---------------------------
# Restore the best weights. `weights_only=True` restricts unpickling to
# tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load(CKPT, map_location=DEVICE, weights_only=True))
model.eval()
test_loss, test_acc = run_one_epoch(test_loader, train=False)
print(f"Test: loss={test_loss:.4f}, acc={test_acc:.4f}")

# Confusion Matrix (간단 버전)
import itertools
import matplotlib.pyplot as plt

@torch.no_grad()
def confusion_matrix(model, loader, num_classes=NUM_CLASSES):
    """Count (true, predicted) class pairs over every batch of `loader`.

    Args:
        model: Callable applied to each image batch; the arg-max over dim 1
            of its output is taken as the prediction.
        loader: Iterable yielding `(images, labels)` batches.
        num_classes: Number of classes; sets the matrix size.

    Returns:
        A `(num_classes, num_classes)` int64 NumPy array whose entry `[t, p]`
        is the number of samples with label `t` predicted as `p`.

    The model's train/eval mode is not changed by this function.
    """
    cm = torch.zeros((num_classes, num_classes), dtype=torch.int64)
    for images, labels in loader:
        images = images.to(DEVICE); labels = labels.to(DEVICE)
        preds = model(images).argmax(1)
        # Encode each (true, pred) pair as one flat index and histogram the
        # whole batch at once instead of looping over samples in Python.
        flat = (labels.view(-1).long() * num_classes + preds.view(-1).long()).cpu()
        counts = torch.bincount(flat, minlength=num_classes * num_classes)
        cm += counts.view(num_classes, num_classes)
    return cm.cpu().numpy()

# Confusion matrix of the restored model on the test set, drawn as a heat map.
cm = confusion_matrix(model, test_loader, NUM_CLASSES)

plt.figure(figsize=(6, 6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.colorbar()
plt.tight_layout()
plt.show()

# ❗ 위 코드는 학습 속도가 너무 느려 1 epoch을 도는 데에도 많은 시간이 걸리므로, 성능을 일부 포기하고 더 빠르게 학습되는 방식으로 리팩토링
- 아래 방식으로 돌리니 학습 속도가 5~8배 이상 빨라짐
- 두 방식의 성능(정확도) 차이도 비교해 보면 좋겠다.

In [None]:
# =========================================================
# Fast CIFAR-10 Trainer (Colab/T4 최적화판)
# - 32×32 입력 유지 (연산량 최소화)
# - SmallCNN 기본 (아주 빠름) / CIFAR-stem ResNet18 옵션
# - AMP + 간단한 증강 + 안정적인 DataLoader 설정
# =========================================================
!nvidia-smi -L || True

import os, math, time, random
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from torch import amp  # <-- 최신 권장 autocast
from torch.cuda.amp import GradScaler
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as T

# ---------------------------
# Config (속도 우선 프로파일)
# ---------------------------
SEED = 42

# Dataset / model selection.
DATASET = 'CIFAR10'       # 'CIFAR10' or 'CIFAR100'
NUM_CLASSES = 10          # change to 100 for CIFAR100
MODEL = 'smallcnn'        # 'smallcnn' (default, very fast) or 'cifar_resnet18'

# Training schedule.
EPOCHS = 15               # default for quick experiments
BATCH_SIZE = 256          # 256-512 is usually worth trying on a T4 (adjust to memory)
LR = 1e-3                 # base AdamW learning rate
WD = 0.01                 # weight decay (kept small)
LABEL_SMOOTH = 0.0        # 0 for speed/stability (use 0.1 if desired)
WARMUP_EPOCHS = 1         # short warmup
PATIENCE = 5              # early-stopping patience

# DataLoader settings.
NUM_WORKERS = 2           # 2-4 is safe on Colab; lower to 0 if loading hangs
PIN = True
PERSISTENT = False        # avoids Colab freezes

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = DEVICE == 'cuda'

GPU 0: Tesla T4 (UUID: GPU-1f6874e9-b9d3-ca2e-b467-be622eebe3ba)


In [None]:
# ---------------------------
# Utils
# ---------------------------
def set_seed(seed=SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed()

class EarlyStopping:
    """Signal a stop after `patience` consecutive steps without improvement.

    A step counts as an improvement when the new loss is lower than the best
    seen so far by more than `min_delta`.
    """

    def __init__(self, patience=PATIENCE, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None           # best loss so far; None before the first step
        self.counter = 0           # consecutive steps without improvement
        self.should_stop = False

    def step(self, val_loss):
        """Feed one loss value and update the stop flag."""
        if self.best is not None and not (val_loss < self.best - self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True
            return
        self.best = val_loss
        self.counter = 0

def get_cosine_schedule_with_warmup(optimizer, num_warmup, num_training):
    """Return a `LambdaLR` with linear warmup followed by cosine decay to zero.

    Args:
        optimizer: Optimizer whose learning rate is scaled.
        num_warmup: Scheduler steps spent ramping the multiplier from 0 to 1.
        num_training: Scheduler step at which the multiplier reaches 0.

    Returns:
        The `LambdaLR` scheduler wrapping `optimizer`.
    """
    decay_steps = max(1, num_training - num_warmup)

    def lr_lambda(current_step):
        if current_step < num_warmup:
            return float(current_step) / float(max(1, num_warmup))
        # Cap progress at 1 so extra steps beyond `num_training` hold the
        # multiplier at 0 rather than following the cosine back upwards.
        progress = min(1.0, float(current_step - num_warmup) / float(decay_steps))
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

@torch.no_grad()
def accuracy(output, target):
    """Return the share of samples whose arg-max prediction (dim 1) matches `target`."""
    predicted = output.argmax(1)
    correct = (predicted == target).float()
    return correct.mean().item()

def save_ckpt(model, path):
    """Save `model.state_dict()` to `path`, creating missing parent directories.

    Args:
        model: Object exposing `state_dict()`.
        path: Destination file path; may be a bare file name.
    """
    folder = os.path.dirname(path)
    # os.makedirs('') raises, so skip it when `path` has no directory part.
    if folder:
        os.makedirs(folder, exist_ok=True)
    torch.save(model.state_dict(), path)

In [None]:
# ---------------------------
# Data (32×32 유지, 가벼운 증강)
# ---------------------------
if DATASET == 'CIFAR100':
    DatasetClass = torchvision.datasets.CIFAR100
    NUM_CLASSES = 100
else:
    DatasetClass = torchvision.datasets.CIFAR10
    NUM_CLASSES = 10

# Inputs stay at 32x32 with light augmentation (pad-and-crop + horizontal flip).
mean = (0.4914, 0.4822, 0.4465); std = (0.2470, 0.2435, 0.2616)
train_tf = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean, std),
])
test_tf = T.Compose([
    T.ToTensor(),
    T.Normalize(mean, std),
])

trainset = DatasetClass(root='./data', train=True,  download=True, transform=train_tf)
testset  = DatasetClass(root='./data', train=False, download=True, transform=test_tf)
# Second view of the training images with the deterministic eval transform,
# so validation loss (used for checkpointing and early stopping) is measured
# on un-augmented images.
valset   = DatasetClass(root='./data', train=True,  download=False, transform=test_tf)

# train/val split (9:1)
idx = np.arange(len(trainset)); np.random.shuffle(idx)
split = int(len(idx) * 0.9)
train_idx, val_idx = idx[:split], idx[split:]
train_subset = torch.utils.data.Subset(trainset, train_idx)
val_subset   = torch.utils.data.Subset(valset, val_idx)

# DataLoader (prefetch_factor is only set when workers > 0)
dl_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN, persistent_workers=PERSISTENT)
if NUM_WORKERS and NUM_WORKERS > 0:
    dl_kwargs.update(prefetch_factor=2)

train_loader = DataLoader(train_subset, shuffle=True, **dl_kwargs)
val_loader   = DataLoader(val_subset, shuffle=False, **dl_kwargs)
test_loader  = DataLoader(testset, shuffle=False, **dl_kwargs)

100%|██████████| 170M/170M [00:07<00:00, 24.3MB/s]


In [None]:
# ---------------------------
# Models (속도 우선)
# ---------------------------
class SmallCNN(nn.Module):
    """Narrow convolutional classifier (widths 32/64/128) kept small for speed.

    Each stage is Conv-ReLU-BatchNorm followed by Conv-ReLU; the first two
    stages end in 2x2 max-pooling plus dropout and the last in global average
    pooling. A two-layer MLP head produces the class logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        stage_widths = (32, 64, 128)
        stage_dropout = (0.05, 0.1)   # the final stage has no pool/dropout

        layers = []
        in_ch = 3
        for stage, out_ch in enumerate(stage_widths):
            layers += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True), nn.BatchNorm2d(out_ch),
                nn.Conv2d(out_ch, out_ch, 3, padding=1), nn.ReLU(inplace=True),
            ]
            if stage < len(stage_dropout):
                layers += [nn.MaxPool2d(2), nn.Dropout(stage_dropout[stage])]
            in_ch = out_ch
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.25),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class logits for the image batch `x`."""
        pooled = self.features(x)
        return self.classifier(pooled)

def build_cifar_resnet18(num_classes):
    """Return an untrained torchvision ResNet-18 adapted to 32x32 inputs.

    The stem convolution is replaced by a 3x3 stride-1 convolution, the stem
    max-pool by an identity, and the final layer by a `num_classes`-way linear
    head.
    """
    from torchvision import models

    # weights=None: trained from scratch instead of using pretrained weights.
    net = models.resnet18(weights=None)
    net.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)  # CIFAR stem
    net.maxpool = nn.Identity()
    in_features = net.fc.in_features
    net.fc = nn.Linear(in_features, num_classes)
    return net

# Any MODEL value other than 'cifar_resnet18' falls back to SmallCNN.
model_builder = build_cifar_resnet18 if MODEL == 'cifar_resnet18' else SmallCNN
model = model_builder(NUM_CLASSES)

model = model.to(DEVICE)

In [None]:
# ---------------------------
# Optim / Loss / Scheduler
# ---------------------------
opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)
# torch.cuda.amp.GradScaler is deprecated; use the device-generic torch.amp
# scaler, matching the `amp.autocast` call used during training.
scaler = amp.GradScaler('cuda', enabled=(DEVICE=='cuda'))

# The scheduler is stepped once per batch, so lengths are counted in batches.
total_steps = EPOCHS * len(train_loader)
warmup_steps = max(1, WARMUP_EPOCHS * len(train_loader))
scheduler = get_cosine_schedule_with_warmup(opt, warmup_steps, total_steps)

  scaler = GradScaler(enabled=(DEVICE=='cuda'))


In [None]:
# ---------------------------
# Train / Eval
# ---------------------------
CKPT = '/content/best_fast_cifar.pt'       # file that receives the best weights
early = EarlyStopping(patience=PATIENCE)
best_val = float('inf')                    # lowest validation loss seen so far

def run_epoch(loader, train=True):
    """Run one pass over `loader` and return `(mean_loss, mean_accuracy)`.

    With `train=True` the model is put in training mode and every batch does
    an autocast forward pass, scaled backward pass, optimizer step and
    scheduler step. With `train=False` the model is put in eval mode and
    batches are evaluated under `torch.no_grad()`.

    Both means are weighted by batch size over the samples actually seen.
    """
    model.train(train)
    loss_sum = acc_sum = 0.0
    seen = 0
    for x, y in loader:
        x = x.to(DEVICE, non_blocking=True)
        y = y.to(DEVICE, non_blocking=True)

        if not train:
            with torch.no_grad():
                logits = model(x)
                loss = criterion(logits, y)
        else:
            opt.zero_grad(set_to_none=True)
            with amp.autocast(device_type='cuda', enabled=(DEVICE == 'cuda')):
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            scheduler.step()

        batch = x.size(0)
        loss_sum += loss.item() * batch
        acc_sum += accuracy(logits, y) * batch
        seen += batch
    return loss_sum / seen, acc_sum / seen

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(val_loader, train=False)

    # Keep the weights with the lowest validation loss on disk.
    improved = va_loss < best_val
    if improved:
        best_val = va_loss
        save_ckpt(model, CKPT)
    early.step(va_loss)

    elapsed = time.time() - t0
    print(f"[{epoch:02d}/{EPOCHS}] "
          f"train_loss={tr_loss:.4f} acc={tr_acc:.4f} | "
          f"val_loss={va_loss:.4f} acc={va_acc:.4f} | "
          f"{elapsed:.1f}s")

    if early.should_stop:
        print("Early stopping.")
        break

[01/15] train_loss=1.7657 acc=0.3346 | val_loss=1.5909 acc=0.4300 | 43.1s
[02/15] train_loss=1.1862 acc=0.5688 | val_loss=1.0401 acc=0.6346 | 18.6s
[03/15] train_loss=0.9660 acc=0.6556 | val_loss=1.1108 acc=0.6186 | 22.0s
[04/15] train_loss=0.8357 acc=0.7049 | val_loss=0.8069 acc=0.7160 | 21.1s
[05/15] train_loss=0.7400 acc=0.7400 | val_loss=0.7580 acc=0.7322 | 20.2s
[06/15] train_loss=0.6695 acc=0.7657 | val_loss=0.6877 acc=0.7612 | 19.5s
[07/15] train_loss=0.6178 acc=0.7879 | val_loss=0.6687 acc=0.7736 | 18.5s
[08/15] train_loss=0.5668 acc=0.8037 | val_loss=0.5681 acc=0.8040 | 19.3s
[09/15] train_loss=0.5336 acc=0.8156 | val_loss=0.5380 acc=0.8138 | 19.0s
[10/15] train_loss=0.5047 acc=0.8252 | val_loss=0.5371 acc=0.8180 | 19.0s
[11/15] train_loss=0.4775 acc=0.8341 | val_loss=0.5195 acc=0.8224 | 19.1s
[12/15] train_loss=0.4559 acc=0.8419 | val_loss=0.4999 acc=0.8310 | 18.5s
[13/15] train_loss=0.4390 acc=0.8491 | val_loss=0.4875 acc=0.8342 | 19.5s
[14/15] train_loss=0.4320 acc=0.8522 |

In [None]:
# ---------------------------
# Test with the best checkpoint
# ---------------------------
# Restore the best weights. `weights_only=True` restricts unpickling to
# tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load(CKPT, map_location=DEVICE, weights_only=True))
model.eval()
te_loss, te_acc = run_epoch(test_loader, train=False)
print(f"Test: loss={te_loss:.4f}, acc={te_acc:.4f}")

Test: loss=0.4830, acc=0.8359


# ✅ 더 좋은 성능을 내기 위해 모델과 몇 개의 하이퍼파라미터를 변경
- 모델: SmallCNN → CIFAR-stem ResNet18 (기존 7×7 stride-2 conv를 3×3 stride-1 conv로 교체하고 maxpool 제거, 32×32 입력에 최적화).
- 증강: RandAugment(n=1) + Cutout(16) 추가(가벼우면서 일반화↑).
- 정규화 트릭: label_smoothing=0.1, weight_decay=0.02.
- 학습 스케줄: 30epoch, Cosine + Warmup(2epoch), EarlyStopping(patience=7).
- 속도 트릭: AMP, 큰 배치(384), channels_last, 안정적 DataLoader 설정.

In [None]:
# =========================================================
# CIFAR-10 (Improved Fast Baseline)
# - 32×32 입력 유지 + CIFAR-stem ResNet18 (from scratch)
# - RandAugment(n=1) + Cutout(16)
# - Label Smoothing 0.1, WD 0.02
# - AMP, 큰 배치(384), 30 epoch, 안정적 DataLoader
# =========================================================
!nvidia-smi -L || True

import os, math, time, random
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from torch import amp
from torch.cuda.amp import GradScaler
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as T

# ---------------------------
# Config
# ---------------------------
SEED = 42

# Dataset selection.
DATASET = 'CIFAR10'     # 'CIFAR10' or 'CIFAR100'
NUM_CLASSES = 10

# Training schedule.
EPOCHS = 30
BATCH_SIZE = 384        # 512 is worth trying when memory allows
LR = 1.5e-3             # raised slightly with the larger batch (256 -> 384, about 1.5x)
WD = 0.02
LABEL_SMOOTH = 0.10
WARMUP_EPOCHS = 2
PATIENCE = 7

# DataLoader settings.
NUM_WORKERS = 2         # lower to 0 if loading hangs
PIN = True
PERSISTENT = False

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = DEVICE == 'cuda'

def set_seed(s=SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `s`."""
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    torch.cuda.manual_seed_all(s)


set_seed()

# ---------------------------
# Small util
# ---------------------------
class EarlyStopping:
    """Raise `should_stop` after `patience` consecutive non-improving steps.

    A value improves on the best one only when it is lower by more than
    `min_delta`.
    """

    def __init__(self, patience=PATIENCE, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None           # best value so far; None before the first step
        self.counter = 0           # consecutive steps without improvement
        self.should_stop = False

    def step(self, v):
        """Record the monitored value `v` and update the stop flag."""
        is_better = self.best is None or v < self.best - self.min_delta
        if is_better:
            self.best = v
            self.counter = 0
            return
        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True

def get_cosine_schedule_with_warmup(optimizer, num_warmup, num_training):
    """Return a `LambdaLR` with linear warmup followed by cosine decay to zero.

    Args:
        optimizer: Optimizer whose learning rate is scaled.
        num_warmup: Scheduler steps spent ramping the multiplier from 0 to 1.
        num_training: Scheduler step at which the multiplier reaches 0.

    Returns:
        The `LambdaLR` scheduler wrapping `optimizer`.
    """
    def lr_lambda(step):
        if step < num_warmup:
            return float(step) / float(max(1, num_warmup))
        prog = float(step - num_warmup) / float(max(1, num_training - num_warmup))
        # Hold the multiplier at 0 once training is over; without the clamp
        # the cosine rises again for steps beyond `num_training`.
        prog = min(1.0, prog)
        return 0.5 * (1.0 + math.cos(math.pi * prog))
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

@torch.no_grad()
def accuracy(logits, y):
    """Return the share of samples whose arg-max over dim 1 of `logits` equals `y`."""
    matches = logits.argmax(1).eq(y)
    return matches.float().mean().item()

def save_ckpt(model, path):
    """Save `model.state_dict()` to `path`, creating missing parent directories.

    Args:
        model: Object exposing `state_dict()`.
        path: Destination file path; may be a bare file name.
    """
    target_dir = os.path.dirname(path)
    # A bare file name yields '' here, which os.makedirs rejects.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    torch.save(model.state_dict(), path)

# ---------------------------
# Augmentations (RandAugment + Cutout)
# ---------------------------
class Cutout(object):
    """Zero out one randomly centred square patch of a `[C, H, W]` tensor.

    The patch extends `size // 2` pixels on each side of a random centre and
    is clipped at the image border. The tensor is modified in place and
    returned; non-tensor inputs are returned unchanged.
    """

    def __init__(self, size=16):
        self.size = size

    def __call__(self, img):
        if isinstance(img, torch.Tensor):
            _, height, width = img.shape
            # Draw x before y so the sequence of `random` calls is stable.
            center_x = random.randint(0, width - 1)
            center_y = random.randint(0, height - 1)
            reach = self.size // 2
            left, right = max(0, center_x - reach), min(width, center_x + reach)
            top, bottom = max(0, center_y - reach), min(height, center_y + reach)
            img[:, top:bottom, left:right] = 0.0
        return img

# ---------------------------
# Data (32×32 유지)
# ---------------------------
if DATASET == 'CIFAR100':
    DatasetClass = torchvision.datasets.CIFAR100
    NUM_CLASSES = 100
else:
    DatasetClass = torchvision.datasets.CIFAR10
    NUM_CLASSES = 10

mean = (0.4914, 0.4822, 0.4465); std = (0.2470, 0.2435, 0.2616)

train_tf = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.RandAugment(num_ops=1, magnitude=7),  # light RandAugment
    T.ToTensor(),
    Cutout(size=16),                        # a single cutout patch
    T.Normalize(mean, std),
])

test_tf = T.Compose([
    T.ToTensor(),
    T.Normalize(mean, std),
])

trainset = DatasetClass(root='./data', train=True,  download=True, transform=train_tf)
testset  = DatasetClass(root='./data', train=False, download=True,  transform=test_tf)
# Second view of the training images with the deterministic eval transform,
# so validation loss (used for checkpointing and early stopping) is not
# computed on RandAugment/Cutout-distorted images.
valset   = DatasetClass(root='./data', train=True,  download=False, transform=test_tf)

# train/val split (9:1)
idx = np.arange(len(trainset)); np.random.shuffle(idx)
split = int(0.9 * len(idx))
train_idx, val_idx = idx[:split], idx[split:]
train_subset = torch.utils.data.Subset(trainset, train_idx)
val_subset   = torch.utils.data.Subset(valset, val_idx)

# prefetch_factor is only passed when worker processes are used.
dl_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN, persistent_workers=PERSISTENT)
if NUM_WORKERS and NUM_WORKERS > 0:
    dl_kwargs.update(prefetch_factor=2)

train_loader = DataLoader(train_subset, shuffle=True,  **dl_kwargs)
val_loader   = DataLoader(val_subset,   shuffle=False, **dl_kwargs)
test_loader  = DataLoader(testset,      shuffle=False, **dl_kwargs)

# ---------------------------
# CIFAR-stem ResNet18
# ---------------------------
from torchvision import models
def build_cifar_resnet18(num_classes):
    """Return an untrained torchvision ResNet-18 adapted to 32x32 inputs.

    The stem convolution becomes 3x3 with stride 1, the stem max-pool becomes
    an identity, and the final layer is replaced by a `num_classes`-way
    linear head.
    """
    # weights=None: trained from scratch rather than transferred.
    net = models.resnet18(weights=None)
    net.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    net.maxpool = nn.Identity()
    in_features = net.fc.in_features
    net.fc = nn.Linear(in_features, num_classes)
    return net

model = build_cifar_resnet18(NUM_CLASSES)
model = model.to(DEVICE)
if DEVICE == 'cuda':
    # channels_last memory format is applied on GPU only; inputs are
    # converted to the same format inside the training loop.
    model = model.to(memory_format=torch.channels_last)

# ---------------------------
# Optim / Loss / Schedule / AMP
# ---------------------------
opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)
# torch.cuda.amp.GradScaler is deprecated; use the device-generic torch.amp
# scaler, matching the `amp.autocast` call used during training.
scaler = amp.GradScaler('cuda', enabled=(DEVICE=='cuda'))

# The scheduler is stepped once per batch, so lengths are counted in batches.
total_steps = EPOCHS * len(train_loader)
warmup_steps = max(1, WARMUP_EPOCHS * len(train_loader))
scheduler = get_cosine_schedule_with_warmup(opt, warmup_steps, total_steps)

# ---------------------------
# Train / Eval
# ---------------------------
CKPT = '/content/best_cifar_improved.pt'   # file that receives the best weights
early = EarlyStopping(patience=PATIENCE)
best_val = float('inf')                    # lowest validation loss seen so far

def run_epoch(loader, train=True):
    """Run one pass over `loader` and return `(mean_loss, mean_accuracy)`.

    With `train=True` the model is put in training mode and every batch does
    an autocast forward pass, scaled backward pass, optimizer step and
    scheduler step. With `train=False` the model is put in eval mode and
    batches are evaluated under `torch.no_grad()`.

    On CUDA the inputs are converted to channels_last memory format. Both
    means are weighted by batch size over the samples actually seen.
    """
    on_gpu = DEVICE == 'cuda'
    model.train(train)
    loss_sum = acc_sum = 0.0
    seen = 0
    for x, y in loader:
        if on_gpu:
            x = x.to(DEVICE, non_blocking=True)
            x = x.contiguous(memory_format=torch.channels_last)
        else:
            x = x.to(DEVICE)
        y = y.to(DEVICE, non_blocking=True)

        if not train:
            with torch.no_grad():
                logits = model(x)
                loss = criterion(logits, y)
        else:
            opt.zero_grad(set_to_none=True)
            with amp.autocast(device_type='cuda', enabled=on_gpu):
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            scheduler.step()

        batch = x.size(0)
        loss_sum += loss.item() * batch
        acc_sum += accuracy(logits, y) * batch
        seen += batch
    return loss_sum / seen, acc_sum / seen

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(val_loader, train=False)

    # Keep the weights with the lowest validation loss on disk.
    improved = va_loss < best_val
    if improved:
        best_val = va_loss
        save_ckpt(model, CKPT)
    early.step(va_loss)

    dt = time.time() - t0
    summary = (f"[{epoch:02d}/{EPOCHS}] train_loss={tr_loss:.4f} acc={tr_acc:.4f} | "
               f"val_loss={va_loss:.4f} acc={va_acc:.4f} | {dt:.1f}s")
    print(summary)

    if early.should_stop:
        print("Early stopping.")
        break

# ---------------------------
# Test with best
# ---------------------------
# Restore the best weights. `weights_only=True` restricts unpickling to
# tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load(CKPT, map_location=DEVICE, weights_only=True))
model.eval()
te_loss, te_acc = run_epoch(test_loader, train=False)
print(f"Test: loss={te_loss:.4f}, acc={te_acc:.4f}")


GPU 0: Tesla T4 (UUID: GPU-1f6874e9-b9d3-ca2e-b467-be622eebe3ba)


  scaler = GradScaler(enabled=(DEVICE=='cuda'))


[01/30] train_loss=1.9575 acc=0.3122 | val_loss=1.9386 acc=0.3592 | 45.5s
[02/30] train_loss=1.6143 acc=0.4881 | val_loss=1.6730 acc=0.4710 | 34.0s
[03/30] train_loss=1.4155 acc=0.5829 | val_loss=1.5171 acc=0.5422 | 34.9s
[04/30] train_loss=1.2794 acc=0.6512 | val_loss=1.3868 acc=0.6060 | 34.9s
[05/30] train_loss=1.1923 acc=0.6911 | val_loss=1.2517 acc=0.6696 | 33.7s
[06/30] train_loss=1.1326 acc=0.7210 | val_loss=1.1693 acc=0.7082 | 34.5s
[07/30] train_loss=1.0845 acc=0.7419 | val_loss=1.3546 acc=0.6452 | 34.6s
[08/30] train_loss=1.0442 acc=0.7609 | val_loss=1.1871 acc=0.6972 | 34.1s
[09/30] train_loss=1.0112 acc=0.7745 | val_loss=1.1876 acc=0.6980 | 35.0s
[10/30] train_loss=0.9816 acc=0.7893 | val_loss=1.0644 acc=0.7530 | 34.5s
[11/30] train_loss=0.9548 acc=0.8000 | val_loss=1.1532 acc=0.7204 | 33.9s
[12/30] train_loss=0.9307 acc=0.8108 | val_loss=1.0000 acc=0.7764 | 34.6s
[13/30] train_loss=0.9124 acc=0.8192 | val_loss=1.0041 acc=0.7742 | 33.9s
[14/30] train_loss=0.8885 acc=0.8294 |