# Introduction to W&B

<!--- @wandbcode{dlai_01} -->

스프라이트 분류 모델 훈련에 `wandb`를 추가하여 중요한 메트릭을 추적 및 시각화하고, 모델의 동작에 대한 인사이트를 얻고, 모델 개선을 위한 정보에 기반한 의사 결정을 내릴 수 있도록 할 것입니다. 또한 다양한 실험을 비교 및 분석하고, 팀원들과 협업하며, 결과를 효과적으로 재현하는 방법도 살펴볼 것입니다.

In [1]:
# !pip install wandb

In [2]:
import math
from pathlib import Path
from types import SimpleNamespace
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from utilities import get_dataloaders

import wandb

### 스프라이트 분류

스프라이트를 분류하는 간단한 모델을 만들어 보겠습니다. 아래 이미지에서 스프라이트와 해당 클래스의 몇 가지 예를 볼 수 있습니다.

<img src="sprite_sample.png" alt="스프라이트 예시와 해당 클래스" width="700"/>

In [3]:
# Length of one flattened input sample: 3 * 16 * 16 = 768
# (presumably channels x height x width of a sprite -- confirm against the data).
INPUT_SIZE = 3 * 16 * 16
# Width of the final linear layer; equals the number of entries in CLASSES.
OUTPUT_SIZE = 5
# Width of the single hidden layer.
HIDDEN_SIZE = 256
NUM_WORKERS = 2
# Human-readable class names (five entries).
CLASSES = [
    "hero",
    "non-hero",
    "food",
    "spell",
    "side-facing",
]
# Directory handed to get_dataloaders().
DATA_DIR = Path("./data/")
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def get_model(dropout):
    """Build a one-hidden-layer MLP classifier with dropout, placed on DEVICE.

    Args:
        dropout: Value passed to ``nn.Dropout`` (applied after the ReLU).

    Returns:
        An ``nn.Sequential`` of Flatten -> Linear -> BatchNorm1d -> ReLU ->
        Dropout -> Linear, already moved to ``DEVICE``.
    """
    layers = [
        nn.Flatten(),
        nn.Linear(INPUT_SIZE, HIDDEN_SIZE),
        nn.BatchNorm1d(HIDDEN_SIZE),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE),
    ]
    mlp = nn.Sequential(*layers)
    return mlp.to(DEVICE)

In [4]:
# Hyperparameters for a run, gathered in one attribute-style object so they can
# be tweaked between runs and handed to wandb.init() as the run config.
config = SimpleNamespace(
    epochs=2,           # number of passes over the training data
    batch_size=128,     # forwarded to get_dataloaders()
    lr=1e-5,            # Adam learning rate
    dropout=0.5,        # value passed to nn.Dropout in get_model()
    slice_size=10_000,  # forwarded to get_dataloaders()
    valid_pct=0.2,      # forwarded to get_dataloaders()
)

In [5]:
def train_model(config):
    """Train a model with the given config and log its metrics to W&B.

    Starts a W&B run in the "dlai_intro" project, logs the training loss after
    every optimizer step and the validation loss/accuracy after every epoch.
    The run is used as a context manager so it is closed even when training
    raises part-way through.

    Args:
        config: Object exposing ``epochs``, ``batch_size``, ``lr``,
            ``dropout``, ``slice_size`` and ``valid_pct`` attributes. It is
            also passed to ``wandb.init`` as the run config.
    """
    with wandb.init(
        project="dlai_intro",
        config=config,
    ):
        # Get the data
        train_dl, valid_dl = get_dataloaders(DATA_DIR,
                                             config.batch_size,
                                             config.slice_size,
                                             config.valid_pct)

        # A simple MLP model
        model = get_model(config.dropout)

        # Make the loss and optimizer
        loss_func = nn.CrossEntropyLoss()
        optimizer = Adam(model.parameters(), lr=config.lr)

        # Running count of training examples seen across all epochs.
        example_ct = 0

        for epoch in tqdm(range(config.epochs), total=config.epochs):
            # validate_model() switches to eval mode, so re-enable training
            # mode at the start of every epoch.
            model.train()

            for images, labels in train_dl:
                images, labels = images.to(DEVICE), labels.to(DEVICE)

                outputs = model(images)
                train_loss = loss_func(outputs, labels)
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                example_ct += len(images)
                metrics = {
                    # Log a plain float rather than a tensor.
                    "train/train_loss": train_loss.item(),
                    "train/epoch": epoch + 1,
                    "train/example_ct": example_ct,
                }
                wandb.log(metrics)

            # Compute and log validation metrics once per epoch.
            val_loss, accuracy = validate_model(model, valid_dl, loss_func)
            val_metrics = {
                # float() accepts both a Python number and a 0-dim tensor.
                "val/val_loss": float(val_loss),
                "val/val_accuracy": accuracy,
            }
            wandb.log(val_metrics)


In [6]:
def validate_model(model, valid_dl, loss_func):
    """Compute the average loss and accuracy of the model on the validation set.

    Puts ``model`` in eval mode (it is not switched back here) and iterates
    ``valid_dl`` under ``torch.inference_mode()``.

    Args:
        model: Module called as ``model(images)``; its output is indexed along
            dim 1 to pick the predicted class.
        valid_dl: Iterable of ``(images, labels)`` batches that also exposes a
            ``dataset`` attribute supporting ``len()``.
        loss_func: Callable ``loss_func(outputs, labels)`` returning a scalar
            batch loss (assumed to be the batch mean, as with the default
            ``nn.CrossEntropyLoss`` -- confirm if a different loss is used).

    Returns:
        Tuple ``(val_loss, accuracy)`` of Python floats: the summed
        size-weighted batch losses and the number of correct predictions,
        each divided by ``len(valid_dl.dataset)``.
    """
    model.eval()
    total_loss = 0.0
    correct = 0

    with torch.inference_mode():
        for images, labels in valid_dl:
            images, labels = images.to(DEVICE), labels.to(DEVICE)

            # Forward pass
            outputs = model(images)
            # Weight each batch loss by its size so a short final batch is not
            # over-represented; .item() keeps the accumulator a Python float.
            total_loss += loss_func(outputs, labels).item() * labels.size(0)

            # Predicted class = index of the highest score along dim 1.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()

    n_examples = len(valid_dl.dataset)
    return total_loss / n_examples, correct / n_examples


### W&B account
https://wandb.ai/site 에서 무료 계정을 만든 다음 wandb 계정에 로그인하여 실험 결과를 저장하고 고급 W&B 기능을 사용하세요. 익명 모드로도 계속 학습할 수 있습니다. 기존 W&B 계정이 있고 브라우저가 자동으로 로그인하는 경우 혼동을 피하기 위해 여기에서 해당 계정을 사용해야 합니다.

In [7]:
# Authenticate with W&B. anonymous="allow" is passed here; per the note above,
# anonymous mode lets you keep going without creating an account first.
wandb.login(anonymous="allow")

[34m[1mwandb[0m: Currently logged in as: [33manony-moose-759708326952010574[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### 모델 학습
기본 구성으로 모델을 학습시키고 W&B에서 어떻게 작동하는지 확인해 보겠습니다.

In [8]:
# Baseline run with the config defined above (lr=1e-5, dropout=0.5, 2 epochs).
train_model(config)

  0%|          | 0/2 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.156908…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train/example_ct,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,█▆▇██▆▅▇▆▆▅▄▅▅▅▅▅▅▄▆▄▄▄▄▃▃▃▄▃▃▂▁▂▂▂▂▂▂▁▁
val/val_accuracy,▁█
val/val_loss,█▁

0,1
train/epoch,2.0
train/example_ct,16000.0
train/train_loss,1.28441
val/val_accuracy,0.6475
val/val_loss,1.24947


In [9]:
# So let's change the learning rate to 1e-4
# and see how this affects our results.
config.lr = 1e-4
train_model(config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016751269449984344, max=1.0…

  0%|          | 0/2 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.163866…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train/example_ct,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,█▇▇▇▆▅▅▄▄▄▄▃▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂
val/val_accuracy,▁█
val/val_loss,█▁

0,1
train/epoch,2.0
train/example_ct,16000.0
train/train_loss,0.49899
val/val_accuracy,0.941
val/val_loss,0.34998


In [10]:
# lr stays at 1e-4 (the same value as in the previous cell), so this is a
# repeat run with identical hyperparameters.
config.lr = 1e-4
train_model(config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01675102431666649, max=1.0)…

  0%|          | 0/2 [00:00<?, ?it/s]

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train/example_ct,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_loss,█▇▇▆▆▅▅▄▅▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁█
val/val_loss,█▁

0,1
train/epoch,2.0
train/example_ct,16000.0
train/train_loss,0.41657
val/val_accuracy,0.9355
val/val_loss,0.35668


In [11]:
# Lower dropout from 0.5 to 0.1 and train for a single epoch instead of two.
config.dropout = 0.1
config.epochs = 1
train_model(config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01675110348345091, max=1.0)…

  0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.075438…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/example_ct,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/train_loss,██▇▇▆▆▆▅▅▄▅▅▄▄▃▃▄▃▃▂▂▂▂▃▂▂▂▂▂▂▁▂▂▂▁▂▂▂▁▁
val/val_accuracy,▁
val/val_loss,▁

0,1
train/epoch,1.0
train/example_ct,8000.0
train/train_loss,0.48068
val/val_accuracy,0.935
val/val_loss,0.45842


In [12]:
# Raise the learning rate to 1e-3; dropout=0.1 and epochs=1 carry over from
# the previous cell because the same config object is mutated in place.
config.lr = 1e-3
train_model(config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0167516784669715, max=1.0))…

  0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.157064…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/example_ct,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/train_loss,█▆▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁
val/val_loss,▁

0,1
train/epoch,1.0
train/example_ct,8000.0
train/train_loss,0.07975
val/val_accuracy,0.99
val/val_loss,0.0729
