<a href="https://colab.research.google.com/github/jyothi8203/CMU/blob/main/WandB_Recitation_0P_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recitation 0P: Weights and Biases

This recitation notebook covers the importance of performance visualization, model tracking and version control, and the workflow of hyperparameter tuning with [WandB](https://wandb.ai/) -- a widely used ML development tool -- using the CIFAR10 dataset (loaded through torchvision) as a pedagogical example.

# Installation and Libraries

In [None]:
### installing WandB
!pip install wandb -qqq

In [None]:
import os
import wandb
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torchvision.transforms import ToTensor

from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device: ", device)

Device:  cuda


In [None]:
## Do not paste the API key into the notebook (it would be saved and shared with it).
## wandb.login() uses the WANDB_API_KEY environment variable when it is already set
## and otherwise asks for the key interactively.
wandb.login()

True

# Dataset

In [None]:
# CIFAR10 training split (train=True), stored under ./data and downloaded if needed;
# every image goes through the ToTensor() transform.
data_train = datasets.CIFAR10(root="data", train=True, download=True, transform=ToTensor())

# CIFAR10 split selected by train=False, loaded with the same settings.
data_test = datasets.CIFAR10(root="data", train=False, download=True, transform=ToTensor())

Files already downloaded and verified
Files already downloaded and verified


In [None]:
def build_data(batch_size, data_train, data_test):
    ''' wraps both datasets in DataLoaders that share one batch size

    Args:
        batch_size: number of samples per batch for both loaders.
        data_train: dataset for the first loader; shuffled on every pass.
        data_test:  dataset for the second loader; iterated in its stored order.

    Returns:
        (train_loader, test_loader)
    '''
    make_loader = torch.utils.data.DataLoader

    loaders = (
        make_loader(data_train, batch_size=batch_size, shuffle=True),
        make_loader(data_test, batch_size=batch_size, shuffle=False),
    )
    return loaders

In [None]:
train_loader, test_loader = build_data(64, data_train, data_test)

# Look at the first training batch only; x and y stay defined and x is reused
# further down to check the network's output shape.
for batch in train_loader:
    x, y = batch
    print(f"x: {x}\n")
    print(f"y: {y}")
    break

x: tensor([[[[1.0000, 1.0000, 1.0000,  ..., 0.9373, 0.9294, 0.9098],
          [1.0000, 0.9961, 0.9961,  ..., 0.9137, 0.9059, 0.8902],
          [1.0000, 1.0000, 1.0000,  ..., 0.8980, 0.8902, 0.8784],
          ...,
          [0.4824, 0.4745, 0.4824,  ..., 0.4745, 0.4706, 0.4706],
          [0.4980, 0.4863, 0.4824,  ..., 0.4706, 0.4745, 0.4706],
          [0.4902, 0.4784, 0.4784,  ..., 0.4745, 0.4784, 0.4667]],

         [[1.0000, 1.0000, 1.0000,  ..., 0.9412, 0.9412, 0.9255],
          [1.0000, 0.9961, 0.9961,  ..., 0.9176, 0.9176, 0.9098],
          [1.0000, 1.0000, 1.0000,  ..., 0.9098, 0.9098, 0.9020],
          ...,
          [0.4784, 0.4706, 0.4784,  ..., 0.4706, 0.4667, 0.4667],
          [0.4941, 0.4824, 0.4784,  ..., 0.4667, 0.4706, 0.4667],
          [0.4863, 0.4745, 0.4745,  ..., 0.4706, 0.4745, 0.4627]],

         [[1.0000, 1.0000, 1.0000,  ..., 0.9569, 0.9569, 0.9451],
          [1.0000, 0.9961, 0.9961,  ..., 0.9412, 0.9451, 0.9412],
          [1.0000, 1.0000, 1.0000,  ...

# Network

In [None]:
class Network(nn.Module):
    ''' one convolutional stage followed by a linear layer producing 10 scores per input '''

    def __init__(self):
        super().__init__()

        # conv -> batch norm -> ReLU -> 9x9 average pooling -> flatten
        feature_layers = [
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=9),
            nn.Flatten(),
        ]
        self.CNN = nn.Sequential(*feature_layers)

        # 576 input features = 64 channels x 3 x 3, the flattened size for 32x32 inputs
        self.classification = nn.Linear(576, 10)

    def forward(self, x):
        ''' feeds x through the CNN stage and returns the output of the linear layer '''
        return self.classification(self.CNN(x))

In [None]:
# Build the network, move it to the selected device and show its layers.
model = Network()
model = model.to(device)
print(model)

# Shape check on the batch x fetched earlier: one row of 10 scores per sample.
model(x.to(device)).shape

Network(
  (CNN): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): AvgPool2d(kernel_size=9, stride=9, padding=0)
    (4): Flatten(start_dim=1, end_dim=-1)
  )
  (classification): Linear(in_features=576, out_features=10, bias=True)
)


torch.Size([64, 10])

# Training Loop Helper Functions

In [None]:
def get_optim(optimizer, learning_rate, model):
    ''' returns an optimizer over model.parameters(): SGD when `optimizer` is "sgd",
        Adam for any other value; `learning_rate` is used as lr in both cases
    '''
    optimizer_cls = optim.SGD if optimizer == "sgd" else optim.Adam
    return optimizer_cls(model.parameters(), lr=learning_rate)

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler):
    ''' runs one optimisation pass over `loader` and returns the mean batch loss

    Args:
        model:      network to train; its parameters are updated in place.
        loader:     iterable of (inputs, targets) batches that supports len().
        optimizer:  optimizer stepping the parameters of `model`.
        criterion:  loss function, called as criterion(outputs, targets).
        scaler:     gradient scaler (torch.cuda.amp.GradScaler interface) used
                    for the backward pass and the optimizer step.

    Returns:
        (model, ep_loss): the same model object and the summed batch losses
        divided by the number of batches, as a float.
    '''
    # Batches follow the device the model lives on instead of an unconditional
    # .cuda(), so the loop also runs on machines without a GPU.
    model_device = next(model.parameters()).device
    use_amp = model_device.type == "cuda"     # autocast only when training on CUDA

    total_loss = 0

    for x, y in loader:
        optimizer.zero_grad()

        x = x.to(model_device)
        y = y.to(model_device)

        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(x)
            loss = criterion(outputs, y)

        total_loss += float(loss)

        # scaled backward pass, optimizer step through the scaler, then scale update
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    ep_loss = float(total_loss / len(loader))

    return model, ep_loss

In [None]:
def train(model, finish=True):
    ''' trains `model` for run_config["epochs"] epochs and logs every epoch to WandB

    don't worry if these functions don't make complete sense yet, as
    you will be very familiar with it after HW1

    Uses notebook globals that must exist before the call: run_config,
    train_loader, optimizer, criterion, scaler and run (the active WandB run).

    Args:
        model:  network to train; its parameters are updated in place.
        finish: when True, wandb.finish() is called after the last epoch; pass
                False to keep the run open (e.g. to use its artifacts afterwards).
    '''

    best_acc = 0

    # Batches follow the device the model lives on instead of an unconditional
    # .cuda(), so training also works on machines without a GPU.
    model_device = next(model.parameters()).device
    use_amp = model_device.type == "cuda"     # autocast only when training on CUDA

    for epoch in range(run_config["epochs"]):
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc="Train")

        num_correct   = 0
        num_seen      = 0     # samples processed so far; the last batch can be smaller than batch_size
        total_loss    = 0

        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            x = x.to(model_device)
            y = y.to(model_device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs   = model(x)
                loss      = criterion(outputs, y)

            num_correct   += int((torch.argmax(outputs, axis=1) == y).sum())
            num_seen      += int(y.size(0))
            total_loss    += float(loss)

            # running accuracy is measured against the samples actually seen
            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / num_seen),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_bar.update()
        batch_bar.close()

        train_loss = float(total_loss / len(train_loader))
        train_acc = 100 * num_correct / num_seen
        lr = float(optimizer.param_groups[0]['lr'])

        print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
            epoch + 1,
            run_config["epochs"],
            train_acc,
            train_loss,
            lr
            )
        )

        ## creating dictionary to log relevant model metrics
        metrics = {
            "train_loss"  : train_loss,
            "train_acc"   : train_acc,
            'lr'          : lr
        }

        ## logging metrics to WandB
        wandb.log(metrics)

        # a new model version is stored whenever the training accuracy improves
        if train_acc > best_acc:
            best_acc = train_acc

            # saving the model and optimizer states to the local file "Model"
            torch.save({
                'model_state_dict'        : model.state_dict(),
                'optimizer_state_dict'    : optimizer.state_dict()
                }, "Model")

            # creating WandB Artifact named after the model
            model_artifact = wandb.Artifact(run_config['model'], type='model')

            # adding model file to Artifact
            model_artifact.add_file("Model")

            # saving Artifact to WandB
            run.log_artifact(model_artifact)

    if finish:
        wandb.finish()

# Basic Usage

You can run this training function and log the metrics of your choice to the WandB dashboard. This simple method lets you monitor trends within a specific run configuration and compare model performance across different configurations.

In [None]:
# Hyperparameters of the basic run; train() reads 'epochs', 'batch_size' and 'model' from here.
run_config = dict(
    epochs=5,
    batch_size=64,
    model='1-2dcnn',
    optimizer='sgd',
    lr=2e-3,
)

# Training objects that train() picks up as globals.
train_loader, test_loader = build_data(run_config['batch_size'], data_train, data_test)
optimizer = get_optim(run_config['optimizer'], run_config['lr'], model)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Start the WandB run used by train(); it is named after the model and stores run_config as its config.
run = wandb.init(project="wandb-quickstart", job_type="model-training", name=run_config['model'], config=run_config)

In [None]:
# Train with the configuration above; finish defaults to True, so the WandB run is closed afterwards.
train(model)



Epoch 1/5: Train Acc 24.3047%, Train Loss 2.1170, Learning Rate 0.0020




Epoch 2/5: Train Acc 31.7335%, Train Loss 1.9620, Learning Rate 0.0020




Epoch 3/5: Train Acc 33.9914%, Train Loss 1.8957, Learning Rate 0.0020




Epoch 4/5: Train Acc 35.5878%, Train Loss 1.8547, Learning Rate 0.0020




Epoch 5/5: Train Acc 36.6368%, Train Loss 1.8239, Learning Rate 0.0020


VBox(children=(Label(value='0.168 MB of 0.168 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
lr,▁▁▁▁▁
train_acc,▁▅▆▇█
train_loss,█▄▃▂▁

0,1
lr,0.002
train_acc,36.63683
train_loss,1.82386


# Resuming Training

In [None]:
''' change RESUME_LOGGING to True and set run_id to the ID of the run to resume '''

RESUME_LOGGING = False                  ### resume run status
run_id = None                           ### replace with run ID string (from www.wandb.ai/)

if RESUME_LOGGING:
    # Fail early with a clear message instead of handing a placeholder to wandb.init.
    if not isinstance(run_id, str) or not run_id:
        raise ValueError("Set run_id to the ID string of the WandB run to resume.")

    run = wandb.init(
        id        = run_id,               ### inserting specific run ID to resume a previous run
        resume    = "must",               ### set this to "must" to resume
        project   = "wandb-quickstart",   ### WandB Project Name
    )

In [None]:
''' test code to append metrics to previously logged metrics in the run '''

test_new_metrics = {
    "train_loss"    : 1.5,
    "train_acc"     : 40,
    'lr'            : 0.001
    }

# wandb.log needs an active run: train(model) above already finished its run,
# so nothing is logged unless a run was resumed in the previous cell.
if wandb.run is not None:
    wandb.log(test_new_metrics)
else:
    print("No active WandB run: resume a run (RESUME_LOGGING = True) before logging.")

# Hyperparameter Sweeps

[Sweeps](https://docs.wandb.ai/guides/sweeps) are a way of automating hyperparameter tuning in Deep Learning models. <br>
You specify the values (or ranges of values) that the sweep should try for each hyperparameter, and then track the effect of each setting on the model's performance.

In [None]:
# the sweep configuration starts with the sampling method (grid, random, bayesian)

sweep_config = dict(method='random')        # enter "grid", "random", "bayes"

In [None]:
# objective of the sweep (minimize loss, maximize accuracy, etc.);
# 'loss' is the key that train_sweep logs after every epoch

metric = dict(name='loss', goal='minimize')

sweep_config.update(metric=metric)

In [None]:
# search space of the sweep: one entry per hyperparameter (to be visualized in the sweep)

parameters_dict = dict(
    # categorical choice between the two optimizers handled by get_optim
    optimizer=dict(values=['sgd', 'adam']),

    # sampled from the 'uniform' distribution between min and max
    learning_rate=dict(distribution='uniform', min=2e-4, max=1e-1),

    # sampled from 'q_log_uniform_values' between min and max with q=4
    batch_size=dict(distribution='q_log_uniform_values', q=4, min=16, max=128),

    # a single fixed value
    epochs=dict(value=5),
)

sweep_config['parameters'] = parameters_dict

In [None]:
# initializing the sweep in the given project; the returned ID is handed to wandb.agent below

sweep_id = wandb.sweep(sweep_config, project="CIFAR-Sweep2")

Create sweep with ID: wzbjge6q
Sweep URL: https://wandb.ai/cchilkun/CIFAR-Sweep2/sweeps/wzbjge6q


In [None]:
def train_sweep(config = None):
    ''' trains a fresh Network with the hyperparameters found in wandb.config
        and logs the loss of every epoch under the key 'loss'

    Args:
        config: passed on to wandb.init; inside the run the values are read
                back from wandb.config.
    '''
    with wandb.init(config=config) as run:
        config = wandb.config

        # readable run name assembled from the sampled hyperparameters
        run.name=f"LR:{config.learning_rate}__BS:{config.batch_size}__OPTIM:{config.optimizer}"

        # only the training loader is used here
        train_loader, _ = build_data(config.batch_size, data_train, data_test)

        # every sweep run starts from a newly initialised model
        model       = Network().to(device)
        optimizer   = get_optim(config.optimizer, config.learning_rate, model)
        criterion   = nn.CrossEntropyLoss()
        scaler      = torch.cuda.amp.GradScaler()

        for _ in range(config.epochs):
            model, epoch_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
            wandb.log({'loss': epoch_loss})

In [None]:
# running the sweep: the agent calls train_sweep for sweep_id, limited to count=5 runs

wandb.agent(sweep_id, train_sweep, count=5)

[34m[1mwandb[0m: Agent Starting Run: zoyvjdhc with config:
[34m[1mwandb[0m: 	batch_size: 48
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.09083513142349564
[34m[1mwandb[0m: 	optimizer: sgd


VBox(children=(Label(value='0.003 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.21595941702230453, max=1.…

0,1
loss,█▄▃▂▁

0,1
loss,1.25249


[34m[1mwandb[0m: Agent Starting Run: is6c1nu6 with config:
[34m[1mwandb[0m: 	batch_size: 56
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.05483909989589066
[34m[1mwandb[0m: 	optimizer: sgd


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▃▂▁

0,1
loss,1.31324


[34m[1mwandb[0m: Agent Starting Run: t4nqqsoz with config:
[34m[1mwandb[0m: 	batch_size: 36
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.013364095048078676
[34m[1mwandb[0m: 	optimizer: adam


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▂▁▁

0,1
loss,1.18276


[34m[1mwandb[0m: Agent Starting Run: 2cvfqagz with config:
[34m[1mwandb[0m: 	batch_size: 44
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.05184637989221525
[34m[1mwandb[0m: 	optimizer: adam


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▃▂▂▁

0,1
loss,1.43797


[34m[1mwandb[0m: Agent Starting Run: m397j3r6 with config:
[34m[1mwandb[0m: 	batch_size: 124
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.04060632810130305
[34m[1mwandb[0m: 	optimizer: sgd


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▅▃▂▁

0,1
loss,1.45985


# Artifact and Model Versioning

Artifacts are WandB's mechanism for versioning data and models. You can use Artifacts to store and compare versions of your model during training, which makes it easier to share data and models between team members and gives you safe version control.

In [None]:
# Hyperparameters of the artifact demo run; this replaces the earlier run_config.
run_config = dict(
    model='1-2dcnn',
    optimizer='adam',
    lr=5e-3,
    batch_size=20,
    epochs=5,
)

# Rebuild the globals that train() reads, now with the new configuration.
train_loader, test_loader = build_data(run_config['batch_size'], data_train, data_test)
optimizer = get_optim(run_config['optimizer'], run_config['lr'], model)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Start a new WandB run for the artifact demo; it is named after the model and stores run_config as its config.
run = wandb.init(project="wandb-quickstart", job_type="model-training", name=run_config['model'], config=run_config)



In [None]:
# finish=False keeps the run open, because the next cell fetches the logged artifact through `run`
train(model, finish=False)



Epoch 1/5: Train Acc 44.6620%, Train Loss 1.5556, Learning Rate 0.0050




Epoch 2/5: Train Acc 53.7480%, Train Loss 1.3205, Learning Rate 0.0050




Epoch 3/5: Train Acc 57.1420%, Train Loss 1.2293, Learning Rate 0.0050




Epoch 4/5: Train Acc 58.6000%, Train Loss 1.1885, Learning Rate 0.0050




Epoch 5/5: Train Acc 59.8800%, Train Loss 1.1565, Learning Rate 0.0050


In [None]:
## Retrieving the model

# Getting the latest version of the artifact
artifact = run.use_artifact('{}:latest'.format(run_config['model']))
# Downloading the artifact
artifact_dir = artifact.download()
# Loading the checkpoint; map_location puts its tensors on the current device,
# so a checkpoint saved on a GPU can also be restored on a CPU-only machine
model_dict = torch.load(os.path.join(artifact_dir, 'Model'), map_location=device)


# Loading weights
model.load_state_dict(model_dict['model_state_dict'])
# Loading optimizer state
optimizer.load_state_dict(model_dict['optimizer_state_dict'])

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [None]:
# Finishing the run that train(model, finish=False) left open
wandb.finish()

0,1
lr,▁▁▁▁▁
train_acc,▁▅▇▇█
train_loss,█▄▂▂▁

0,1
lr,0.005
train_acc,59.88
train_loss,1.15649
