In [None]:
# Quietly (-q) install the packages this notebook imports: wandb, torchvision, wandb-workspaces.
!pip install -q wandb torchvision wandb-workspaces

In [None]:
import wandb
# Log in to W&B up front, before any cell below calls wandb.init().
wandb.login()

# Image Classification Demo — W&B 전체 기능 체험

## 개요

이 노트북은 **CIFAR-10** 데이터셋과 **ResNet-18** 모델을 사용하여 이미지 분류를 수행하면서,
W&B(Weights & Biases)의 핵심 기능을 전부 체험합니다.

## 다루는 W&B 기능

| 기능 | 설명 |
|------|------|
| **Experiment Tracking** | 학습 메트릭 실시간 추적 (`wandb.init`, `wandb.log`, `wandb.config`) |
| **Media Logging** | 이미지 로깅 (`wandb.Image`) |
| **Tables** | 데이터셋 미리보기 및 예측 결과 비교 (`wandb.Table`) |
| **Artifacts** | 데이터셋/모델 버저닝 및 계보(lineage) 추적 |
| **Model Registry** | 모델 등록 및 alias 관리 (staging/production) |
| **Sweeps** | 베이지안 하이퍼파라미터 최적화 |
| **Reports** | 프로그래밍 방식 실험 리포트 생성 |

## 데이터셋
- **CIFAR-10**: 60,000장 (32×32 RGB), 10개 클래스
- torchvision 내장 데이터셋으로, 첫 실행 시 `./data` 경로에 자동으로 다운로드됨 (수동 다운로드 불필요)

## 모델
- **ResNet-18** (ImageNet pretrained → CIFAR-10 fine-tune)
- 32×32 입력에 맞게 conv1을 3×3, stride 1로 교체하고 maxpool을 Identity로 대체 (교체된 conv1과 fc 레이어는 새로 초기화됨)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18, ResNet18_Weights
import random
import time

# === Training configuration ===
# Hyperparameters and run metadata; the cells that start W&B runs pass this
# dict as `config=`.
CONFIG = dict(
    batch_size=64,
    lr=1e-3,
    epochs=5,
    optimizer="adam",
    num_classes=10,
    img_size=32,
    model_name="resnet18",
    dataset="cifar10",
)

# Human-readable class names, indexed by the integer CIFAR-10 label.
CIFAR10_CLASSES = (
    "airplane automobile bird cat deer "
    "dog frog horse ship truck"
).split()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# === Data loading: transforms, datasets and DataLoaders ===

# Per-channel normalization statistics for CIFAR-10 (these differ from the ImageNet values)
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Tail shared by both pipelines: convert to a tensor, then normalize.
_tensor_and_normalize = [
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
]

# Training pipeline: random flip + padded random crop, then the shared tail.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
    ]
    + _tensor_and_normalize
)

# Evaluation pipeline: no augmentation, only the shared tail.
transform_test = transforms.Compose(list(_tensor_and_normalize))

# Both splits live under ./data and are downloaded on demand.
_dataset_kwargs = {"root": "./data", "download": True}
trainset = torchvision.datasets.CIFAR10(
    train=True, transform=transform_train, **_dataset_kwargs
)
testset = torchvision.datasets.CIFAR10(
    train=False, transform=transform_test, **_dataset_kwargs
)

# Only the training loader shuffles; batch size comes from CONFIG.
_loader_kwargs = {"batch_size": CONFIG["batch_size"], "num_workers": 2}
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **_loader_kwargs)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **_loader_kwargs)

print(f"Train: {len(trainset)}장, Test: {len(testset)}장")

In [None]:
# === Create the dataset Artifact ===

# Start a run dedicated to dataset versioning. It is left open at the end of
# this cell and finished after the dataset preview table is logged.
run = wandb.init(
    job_type="data-versioning",
    name="cifar10-data-versioning",
    project="wandb-e2e-demo-image-classification",
    config=CONFIG,
)

# Descriptive metadata stored with the artifact.
dataset_metadata = {
    "num_train": len(trainset),
    "num_test": len(testset),
    "num_classes": 10,
    "image_size": "32x32",
    "classes": CIFAR10_CLASSES,
    "source": "torchvision.datasets.CIFAR10",
}

artifact = wandb.Artifact(
    "cifar10",
    type="dataset",
    description="CIFAR-10 dataset (torchvision)",
    metadata=dataset_metadata,
)
# Add the contents of the local CIFAR-10 directory, then log the artifact to the run.
artifact.add_dir("./data/cifar-10-batches-py")
run.log_artifact(artifact)
print("데이터셋 Artifact 로깅 완료!")

In [None]:
# === Visualize sample images in a wandb.Table ===

# Load the un-normalized training images for display (ToTensor only).
raw_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=False, transform=transforms.ToTensor()
)

# 5 images per class = 50 rows in total
samples_per_class = 5
table = wandb.Table(columns=["Image", "Label", "Label_ID"])

# Bucket dataset indices by label. Reading the dataset's `targets` list avoids
# loading and transforming every training image just to look at its label.
class_indices = {class_id: [] for class_id in range(len(CIFAR10_CLASSES))}
for idx, label in enumerate(raw_dataset.targets):
    class_indices[label].append(idx)

# Draw a random sample per class; only the sampled images are actually loaded.
for class_id, candidates in class_indices.items():
    for idx in random.sample(candidates, samples_per_class):
        img, label = raw_dataset[idx]
        table.add_data(wandb.Image(img), CIFAR10_CLASSES[label], label)

# Log the table, then close the currently active run.
wandb.log({"dataset_preview": table})
wandb.finish()
print("데이터셋 미리보기 테이블 로깅 완료!")

In [None]:
# === Model definition (ResNet-18 adapted for CIFAR-10) ===

def create_model(num_classes=10):
    """Build a pretrained ResNet-18 adapted to 32x32 CIFAR-10 inputs.

    The stock ResNet-18 stem is designed for 224x224 images: its 7x7/stride-2
    conv1 followed by a 3x3/stride-2 maxpool shrinks a 32x32 input down to 1x1,
    which makes training impractical. This variant swaps conv1 for a
    3x3/stride-1/padding-1 convolution and replaces the maxpool with an
    identity. The replacement conv1 and the new fc head are newly constructed
    layers, so they do not carry the pretrained weights.

    Args:
        num_classes: Output size of the final fully connected layer.

    Returns:
        The modified ResNet-18 module (not moved to any device).
    """
    net = resnet18(weights=ResNet18_Weights.DEFAULT)

    # Stem: small kernel, no striding, no pooling.
    net.conv1 = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
    )
    net.maxpool = nn.Identity()

    # Head: keep the original input width, emit `num_classes` logits.
    head_in_features = net.fc.in_features
    net.fc = nn.Linear(head_in_features, num_classes)
    return net

# Instantiate the classifier, move it to the selected device and report its parameter count.
model = create_model(CONFIG["num_classes"])
model = model.to(device)
param_count = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {param_count:,}")

In [None]:
# === Training setup ===

# Start the baseline training run.
run = wandb.init(
    project="wandb-e2e-demo-image-classification",
    config=CONFIG,
    job_type="training",
    name="resnet18-cifar10-baseline",
)

criterion = nn.CrossEntropyLoss()

# Build the optimizer named by CONFIG["optimizer"].
if CONFIG["optimizer"] == "adam":
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
elif CONFIG["optimizer"] == "sgd":
    optimizer = optim.SGD(model.parameters(), lr=CONFIG["lr"], momentum=0.9, weight_decay=5e-4)
elif CONFIG["optimizer"] == "adamw":
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG["lr"], weight_decay=1e-2)
else:
    # Fail with a clear message instead of a NameError on `optimizer` below.
    raise ValueError(
        f"Unsupported optimizer {CONFIG['optimizer']!r}; expected 'adam', 'sgd' or 'adamw'."
    )

# Cosine learning-rate annealing over the configured number of epochs
# (the main loop steps it once per epoch).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG["epochs"])

# Log model gradients and parameters
wandb.watch(model, criterion, log="all", log_freq=100)


def train_one_epoch(model, loader, criterion, optimizer):
    """Run one training pass over `loader` and return epoch statistics.

    Each batch is moved to the module-level `device`, followed by one
    optimizer step. On every 100th batch (starting with the first) the batch
    loss and batch accuracy are sent to W&B via `wandb.log`.

    Args:
        model: Network to train; switched to train mode here.
        loader: Iterable yielding `(inputs, targets)` batches.
        criterion: Loss callable applied to `(outputs, targets)`.
        optimizer: Optimizer updating `model`'s parameters.

    Returns:
        Tuple `(loss, acc)`: the mean of the per-batch losses, and the
        accuracy in percent over all samples seen.
    """
    model.train()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for step, (inputs, targets) in enumerate(loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # Per-batch statistics, computed once and reused for logging below.
        batch_loss = loss.item()
        batch_size = targets.size(0)
        predicted = logits.max(1)[1]
        batch_correct = predicted.eq(targets).sum().item()

        loss_sum += batch_loss
        num_seen += batch_size
        num_correct += batch_correct

        if step % 100 != 0:
            continue
        wandb.log({
            "train/batch_loss": batch_loss,
            "train/batch_acc": 100.0 * batch_correct / batch_size,
        })

    return loss_sum / len(loader), 100.0 * num_correct / num_seen


def evaluate(model, loader, criterion):
    """Evaluate `model` on `loader` without tracking gradients.

    Each batch is moved to the module-level `device` before the forward pass.

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding `(inputs, targets)` batches.
        criterion: Loss callable applied to `(outputs, targets)`.

    Returns:
        Tuple `(loss, acc)`: the mean of the per-batch losses, and the
        accuracy in percent over all samples seen.
    """
    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)

            loss_sum += criterion(logits, targets).item()
            predicted = logits.max(1)[1]
            num_seen += targets.size(0)
            num_correct += predicted.eq(targets).sum().item()

    mean_loss = loss_sum / len(loader)
    accuracy = 100.0 * num_correct / num_seen
    return mean_loss, accuracy


# Main training loop: train, validate, log metrics and sample predictions,
# and checkpoint the weights with the best validation accuracy.
best_acc = 0.0

# The preview batch and the de-normalization constants do not change between
# epochs (testloader is not shuffled), so build them once instead of per epoch.
images, labels = next(iter(testloader))
mean = torch.tensor(CIFAR10_MEAN).view(3, 1, 1)
std = torch.tensor(CIFAR10_STD).view(3, 1, 1)
num_preview = min(8, len(images))  # guard against a batch smaller than 8

for epoch in range(CONFIG["epochs"]):
    # Capture the LR used for this epoch before scheduler.step(); after the
    # step, get_last_lr() already reports the next epoch's value.
    epoch_lr = scheduler.get_last_lr()[0]
    start_time = time.time()

    train_loss, train_acc = train_one_epoch(model, trainloader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, testloader, criterion)
    scheduler.step()

    epoch_time = time.time() - start_time

    # Epoch-level metrics
    wandb.log({
        "epoch": epoch + 1,
        "train/loss": train_loss,
        "train/acc": train_acc,
        "val/loss": val_loss,
        "val/acc": val_acc,
        "lr": epoch_lr,
        "epoch_time_sec": epoch_time,
    })

    # Log predictions on the preview batch (images must be de-normalized first)
    model.eval()
    with torch.no_grad():
        outputs = model(images.to(device))
        _, preds = outputs.max(1)
        preds = preds.cpu()

    wandb_images = []
    for i in range(num_preview):
        img = (images[i] * std + mean).clamp(0, 1)  # undo Normalize, keep values in [0, 1]
        caption = f"True: {CIFAR10_CLASSES[labels[i]]} | Pred: {CIFAR10_CLASSES[preds[i]]}"
        wandb_images.append(wandb.Image(img, caption=caption))
    wandb.log({"predictions": wandb_images})

    # Save the weights whenever validation accuracy improves.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")

    print(
        f"Epoch [{epoch+1}/{CONFIG['epochs']}] "
        f"Train Loss: {train_loss:.4f} Acc: {train_acc:.2f}% | "
        f"Val Loss: {val_loss:.4f} Acc: {val_acc:.2f}%"
    )

wandb.summary["best_val_acc"] = best_acc
print(f"\n학습 완료! Best Val Acc: {best_acc:.2f}%")

In [None]:
# === Validation results as a wandb.Table ===

# Restore the best checkpoint saved during training and switch to eval mode.
best_state = torch.load("best_model.pth", map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.eval()

# Un-normalized test images, used only for display
raw_testset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=False, transform=transforms.ToTensor()
)

# Fixed columns followed by one probability column per class.
columns = ["Image", "True Label", "Predicted Label", "Correct", "Confidence"] + [
    f"P({cls})" for cls in CIFAR10_CLASSES
]

results_table = wandb.Table(columns=columns)
num_samples = 200
indices = random.sample(range(len(testset)), num_samples)

for idx in indices:
    # Same index in both datasets: raw image for display, normalized one for the model.
    img_raw, label = raw_testset[idx]
    img_norm = testset[idx][0]

    with torch.no_grad():
        logits = model(img_norm.unsqueeze(0).to(device))
        probs = torch.softmax(logits, dim=1)[0].cpu()
        pred = probs.argmax().item()
        confidence = probs[pred].item()

    class_probs = [round(p, 4) for p in probs.tolist()]
    results_table.add_data(
        wandb.Image(img_raw),
        CIFAR10_CLASSES[label],
        CIFAR10_CLASSES[pred],
        label == pred,
        round(confidence, 4),
        *class_probs,
    )

wandb.log({"test_predictions": results_table})
print(f"테스트 예측 결과 {num_samples}건 로깅 완료!")

In [None]:
# === Save the model as an Artifact ===

# Descriptive metadata stored with the model artifact.
model_metadata = {
    "model_type": "classification",
    "model_architecture": "resnet18",
    "dataset": "cifar10",
    "num_classes": 10,
    "best_val_acc": best_acc,
    "classes": CIFAR10_CLASSES,
    "framework": "pytorch",
    "input_size": [3, 32, 32],
}

model_artifact = wandb.Artifact(
    "resnet18-cifar10",
    type="model",
    description="ResNet-18 fine-tuned on CIFAR-10",
    metadata=model_metadata,
)
# The local checkpoint file is stored inside the artifact as "model.pth".
model_artifact.add_file("best_model.pth", name="model.pth")
run.log_artifact(model_artifact)
print("모델 Artifact 로깅 완료!")

In [None]:
# === Register in the Model Registry ===

# Link the logged model artifact to the registry target under the "staging" alias.
registry_target = "model-registry/cifar10-classifier"
staging_aliases = ["staging"]
run.link_artifact(model_artifact, registry_target, aliases=staging_aliases)
print("Model Registry에 'staging' alias로 등록 완료!")

# Close the training run.
wandb.finish()

## Hyperparameter Sweep

**Bayesian 최적화**를 사용하여 최적의 하이퍼파라미터 조합을 탐색합니다.

| 파라미터 | 탐색 범위 |
|-----------|------------|
| Learning Rate | 1e-5 ~ 1e-2 (log uniform) |
| Batch Size | 32, 64, 128 |
| Optimizer | Adam, SGD, AdamW |

In [None]:
# === Sweep configuration and launch ===

# Search space: learning rate on a log-uniform scale, plus categorical
# choices for batch size and optimizer.
_sweep_parameters = {
    "lr": {"min": 1e-5, "max": 1e-2, "distribution": "log_uniform_values"},
    "batch_size": {"values": [32, 64, 128]},
    "optimizer": {"values": ["adam", "sgd", "adamw"]},
}

# Bayesian search that maximizes the logged "val/acc" metric.
sweep_config = dict(
    method="bayes",
    metric=dict(name="val/acc", goal="maximize"),
    parameters=_sweep_parameters,
)


def sweep_train():
    """Train and evaluate one sweep trial (no arguments, per the wandb.agent convention).

    Starts a run with CONFIG as the default config, rebuilds the DataLoaders
    with the trial's batch size, creates a fresh model and the optimizer named
    in the run config, then trains for 3 epochs and logs epoch-level metrics.

    Raises:
        ValueError: If the run config names an optimizer other than
            "adam", "sgd" or "adamw".
    """
    # Using the run as a context manager finishes it even if the trial raises.
    with wandb.init(config=CONFIG) as run:
        config = run.config

        # Rebuild the DataLoaders with this trial's batch size
        sweep_trainloader = torch.utils.data.DataLoader(
            trainset, batch_size=config.batch_size, shuffle=True, num_workers=2
        )
        sweep_testloader = torch.utils.data.DataLoader(
            testset, batch_size=config.batch_size, shuffle=False, num_workers=2
        )

        # Fresh model for every trial
        sweep_model = create_model(CONFIG["num_classes"]).to(device)
        criterion = nn.CrossEntropyLoss()

        if config.optimizer == "adam":
            opt = optim.Adam(sweep_model.parameters(), lr=config.lr)
        elif config.optimizer == "sgd":
            opt = optim.SGD(sweep_model.parameters(), lr=config.lr, momentum=0.9, weight_decay=5e-4)
        elif config.optimizer == "adamw":
            opt = optim.AdamW(sweep_model.parameters(), lr=config.lr, weight_decay=1e-2)
        else:
            # Fail with a clear message instead of a NameError on `opt` below.
            raise ValueError(
                f"Unsupported optimizer {config.optimizer!r}; expected 'adam', 'sgd' or 'adamw'."
            )

        # Sweep trials are limited to 3 epochs (keeps the demo short)
        for epoch in range(3):
            train_loss, train_acc = train_one_epoch(sweep_model, sweep_trainloader, criterion, opt)
            val_loss, val_acc = evaluate(sweep_model, sweep_testloader, criterion)

            wandb.log({
                "epoch": epoch + 1,
                "train/loss": train_loss,
                "train/acc": train_acc,
                "val/loss": val_loss,
                "val/acc": val_acc,
            })


# Create the sweep in the demo project, then start an agent limited to 5 trials.
sweep_project = "wandb-e2e-demo-image-classification"
num_trials = 5
sweep_id = wandb.sweep(sweep_config, project=sweep_project)
wandb.agent(sweep_id, function=sweep_train, count=num_trials)
print("Sweep 완료!")

In [None]:
# === Build the report ===

import wandb_workspaces.reports.v2 as wr

report_project = "wandb-e2e-demo-image-classification"

report = wr.Report(
    project=report_project,
    title="CIFAR-10 Image Classification — 실험 결과 리포트",
    description="ResNet-18 CIFAR-10 fine-tuning 실험 결과 및 Sweep 분석",
)

# Section 2: one line plot per epoch-level metric, all plotted against "epoch".
training_panels = [
    wr.LinePlot(title=plot_title, x="epoch", y=[metric_name])
    for plot_title, metric_name in (
        ("Training Loss", "train/loss"),
        ("Validation Accuracy", "val/acc"),
        ("Validation Loss", "val/loss"),
        ("Training Accuracy", "train/acc"),
    )
]

# Section 3: the three swept config values and val/acc as parallel-coordinate
# columns, plus a scalar chart and a bar plot of val/acc.
sweep_columns = [
    wr.ParallelCoordinatesPlotColumn(metric=column_metric)
    for column_metric in ("c::lr", "c::batch_size", "c::optimizer", "val/acc")
]
sweep_panels = [
    wr.ParallelCoordinatesPlot(columns=sweep_columns),
    wr.ScalarChart(title="Best Validation Accuracy", metric="val/acc"),
    wr.BarPlot(title="Val Accuracy by Run", metrics=["val/acc"]),
]

# Assemble the report body in reading order.
report.blocks = [
    wr.TableOfContents(),

    wr.H1("1. 실험 개요"),
    wr.P(
        "CIFAR-10 데이터셋에 대한 ResNet-18 이미지 분류 실험 결과를 정리합니다. "
        "W&B의 Experiment Tracking, Artifacts, Sweeps, Model Registry 기능을 활용하였습니다."
    ),

    wr.H1("2. 학습 결과"),
    wr.PanelGrid(
        runsets=[wr.Runset(project=report_project)],
        panels=training_panels,
    ),

    wr.H1("3. Sweep 분석"),
    wr.P("Bayesian 최적화를 통한 하이퍼파라미터 탐색 결과:"),
    wr.PanelGrid(
        runsets=[wr.Runset(project=report_project)],
        panels=sweep_panels,
    ),

    wr.H1("4. 다음 단계"),
    wr.P(
        "최적 모델을 Model Registry의 'production' alias로 승격하여 "
        "배포 파이프라인을 트리거합니다."
    ),
]

report.save()
print(f"Report 생성 완료! URL: {report.url}")

In [None]:
# Close any run that is still active, then print the closing messages.
wandb.finish()
for message in (
    "\n데모 완료!",
    "W&B 대시보드에서 결과를 확인하세요.",
    "\n다음 단계: models/automations/automations.ipynb 에서 모델을 'production'으로 승격하여 자동 배포를 트리거합니다.",
):
    print(message)