# Copyright

<PRE>
This notebook was created as part of the "Deep learning / VITMMA19" class at
Budapest University of Technology and Economics, Hungary,
https://portal.vik.bme.hu/kepzes/targyak/VITMMA19

Any re-use or publication of any part of the notebook is only allowed with the
 written consent of the authors.

2024 (c) Mohammed Salah Al-Radhi and Tamás Gábor Csapó (malradhi@tmit.bme.hu)
</PRE>

In [1]:
### HYPEROPT: task during the class - we will do this together
# add WandB.ai integration to the code
# (help: https://docs.wandb.ai/guides/integrations/lightning )
# run at least 3 different trainings

In [9]:
# install PyTorch Lightning and the Weights & Biases client
!pip install pytorch-lightning --quiet
!pip install wandb --quiet

In [17]:
import os
# Set the server URL the wandb client talks to; this overwrites any
# WANDB_BASE_URL value already present in this process's environment.
# NOTE(review): presumably needed because the runtime had a different
# (e.g. self-hosted) endpoint configured -- confirm whether it is still required.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"

In [18]:
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader,random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import CIFAR10
from pytorch_lightning.loggers import WandbLogger
import wandb


In [19]:
# create one class to deal with data
class CifarDataModule(pl.LightningDataModule):
    """LightningDataModule that serves CIFAR-10 train/validation/test loaders.

    The 50,000-image training set is split into 45,000 training and 5,000
    validation images. The split is drawn from a seeded generator so that it
    is identical every time ``setup`` runs (the Trainer calls ``setup`` again
    even if it was already called manually) and identical across instances,
    which keeps validation metrics comparable between runs.

    Args:
        batch_size: Number of samples per batch for all three loaders.
        data_dir: Directory where the dataset is downloaded to / read from.
        num_workers: Worker processes used by each DataLoader.
        seed: Seed for the generator that draws the train/validation split.
    """

    def __init__(self, batch_size, data_dir="./", num_workers=2, seed=42):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.seed = seed
        # Convert to tensors, then normalize each RGB channel with mean 0.5 / std 0.5.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])
        self.num_classes = 10

    def prepare_data(self):
        """Download both CIFAR-10 splits into ``data_dir`` (no state is assigned here)."""
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets required for ``stage`` (all of them when ``stage`` is None)."""
        # 'validate' also needs the validation subset, so treat it like 'fit'.
        if stage in ('fit', 'validate') or stage is None:
            cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
            # A freshly seeded generator makes repeated setup() calls reproduce the same split.
            split_generator = torch.Generator().manual_seed(self.seed)
            self.cifar_train, self.cifar_val = random_split(
                cifar_full, [45000, 5000], generator=split_generator
            )

        if stage == 'test' or stage is None:
            self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        return DataLoader(self.cifar_train, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        return DataLoader(self.cifar_val, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers)

    def test_dataloader(self):
        """Return the unshuffled loader over the official CIFAR-10 test set."""
        return DataLoader(self.cifar_test, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers)

In [20]:
# callback that logs one batch of validation images to WandB, captioned with the predicted and the real class label
class ImagePredictionLogger(pl.Callback):
    """Callback that logs captioned prediction images at the end of each validation epoch.

    Args:
        val_samples: An ``(images, labels)`` pair, e.g. one batch from a validation loader.
        num_samples: Upper bound on how many leading samples of the pair are kept.
    """

    def __init__(self, val_samples, num_samples=32):
        super().__init__()
        images, labels = val_samples
        # Keep only the leading `num_samples` entries of each.
        self.val_imgs = images[:num_samples]
        self.val_labels = labels[:num_samples]

    def on_validation_epoch_end(self, trainer, pl_module):
        """Run the stored images through ``pl_module`` and log them via the trainer's logger."""
        # Move the stored images onto whatever device the module currently lives on.
        batch = self.val_imgs.to(device=pl_module.device)
        predictions = torch.argmax(pl_module(batch), 1)

        captioned = []
        for image, predicted, actual in zip(batch, predictions, self.val_labels):
            captioned.append(
                wandb.Image(image, caption=f"Pred:{predicted}, Label:{actual}")
            )

        payload = {"examples": captioned, "global_step": trainer.global_step}
        trainer.logger.experiment.log(payload)

In [21]:
class CIFAR10LitModel(pl.LightningModule):
    """Small CNN image classifier wrapped as a LightningModule.

    Layer order: conv1-relu, conv2-relu, maxpool, conv3-relu, conv4-relu,
    maxpool, then three fully connected layers; ``forward`` returns
    log-probabilities (log-softmax over the class dimension).

    Args:
        input_shape: Shape of one input sample without the batch dimension,
            e.g. ``(3, 32, 32)``; used to size the first fully connected layer.
        num_classes: Number of output classes (also used by the accuracy metrics).
        learning_rate: Learning rate passed to the optimizer.
        neurons_fc1: Output width of the first fully connected layer.
        neurons_fc2: Output width of the second fully connected layer.
        optimizer: Optimizer name, either ``'adam'`` or ``'sgd'``.
    """

    def __init__(self, input_shape, num_classes, learning_rate, neurons_fc1, neurons_fc2, optimizer):
        super().__init__()
        # Stores all constructor arguments in self.hparams (and in checkpoints / the logger).
        self.save_hyperparameters()
        self.input_shape = input_shape
        self.learning_rate = learning_rate
        self.neurons_fc1 = neurons_fc1
        self.neurons_fc2 = neurons_fc2
        self.optimizer_name = optimizer

        # model architecture
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)
        self.pool1 = nn.MaxPool2d(2)
        self.pool2 = nn.MaxPool2d(2)

        # The flattened conv output size depends on input_shape, so measure it.
        n_sizes = self._get_output_shape(input_shape)
        self.fc1 = nn.Linear(n_sizes, self.neurons_fc1)
        self.fc2 = nn.Linear(self.neurons_fc1, self.neurons_fc2)
        self.fc3 = nn.Linear(self.neurons_fc2, num_classes)

        # Use the constructor's num_classes so the metrics agree with the output layer.
        self.train_acc = Accuracy(task='multiclass', num_classes=num_classes)
        self.val_acc = Accuracy(task='multiclass', num_classes=num_classes)
        self.test_acc = Accuracy(task='multiclass', num_classes=num_classes)

    def _get_output_shape(self, shape):
        """Return the flattened feature count the conv stack produces for one sample of ``shape``."""
        batch_size = 1
        # Shape probing only: no autograd graph is needed for the dummy forward pass.
        with torch.no_grad():
            dummy_input = torch.rand(batch_size, *shape)
            output_feat = self._feature_extractor(dummy_input)
        return output_feat.view(batch_size, -1).size(1)

    def _feature_extractor(self, x):
        """Apply conv1-relu, conv2-relu, maxpool, conv3-relu, conv4-relu, maxpool."""
        x = F.relu(self.conv1(x))
        x = self.pool1(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = self.pool2(F.relu(self.conv4(x)))
        return x

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self._feature_extractor(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.log_softmax(self.fc3(x), dim=1)
        return x

    def _shared_step(self, batch, metric):
        """Compute the NLL loss and the batch accuracy (via ``metric``) for one batch."""
        x, y = batch
        logits = self(x)
        # forward() already applies log_softmax, hence nll_loss rather than cross_entropy.
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = metric(preds, y)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log loss/accuracy per step and per epoch."""
        loss, acc = self._shared_step(batch, self.train_acc)
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
        return loss

    # validation loop
    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log ``val_loss`` / ``val_acc`` (shown in the progress bar)."""
        loss, acc = self._shared_step(batch, self.val_acc)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    # test loop
    def test_step(self, batch, batch_idx):
        """Compute the test loss and log ``test_loss`` / ``test_acc``."""
        loss, acc = self._shared_step(batch, self.test_acc)
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', acc, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Build the optimizer selected by name at construction time.

        Raises:
            ValueError: If the optimizer name is neither ``'adam'`` nor ``'sgd'``.
        """
        if self.optimizer_name == 'adam':
            return torch.optim.Adam(self.parameters(), self.learning_rate)
        if self.optimizer_name == 'sgd':
            return torch.optim.SGD(self.parameters(), self.learning_rate, momentum=0.9)
        # Fail with a clear message instead of an UnboundLocalError on an unknown name.
        raise ValueError(
            f"Unsupported optimizer {self.optimizer_name!r}; expected 'adam' or 'sgd'."
        )


In [22]:
# W&B sweep definition: random search that tries to maximize the logged
# 'val_acc' metric. Every hyperparameter is drawn from a discrete list of values.
sweep_config = dict(
    method='random',  # search strategy; W&B also offers 'grid' and 'bayes'
    metric=dict(name='val_acc', goal='maximize'),
    parameters=dict(
        learning_rate=dict(values=[1e-4, 1e-3, 1e-2]),   # candidate learning rates
        optimizer=dict(values=['adam', 'sgd']),          # candidate optimizers
        neurons_fc1=dict(values=[256, 512, 1024]),       # width of the first FC layer
        neurons_fc2=dict(values=[64, 128, 256]),         # width of the second FC layer
    ),
)


def sweep_train():
    """Train and test one model using the hyperparameters of the current sweep run.

    Intended to be passed to ``wandb.agent``: it opens a W&B run, reads the
    hyperparameters chosen for that run from its config, trains for at most
    5 epochs with early stopping on ``val_acc``, and then tests the
    checkpoint with the best ``val_acc``.
    """
    # Using the run as a context manager guarantees it is finished (and marked
    # as failed on an exception) even when training raises or is interrupted.
    with wandb.init() as run:
        config = run.config  # hyperparameters selected for this run

        # Initialize the model using the config parameters
        model = CIFAR10LitModel(
            input_shape=(3, 32, 32),
            num_classes=10,
            learning_rate=config.learning_rate,
            neurons_fc1=config.neurons_fc1,
            neurons_fc2=config.neurons_fc2,
            optimizer=config.optimizer
        )

        # Initialize WandB logger
        wandb_logger = WandbLogger(project='sweep_project')

        # The Trainer calls prepare_data() and setup() on the data module itself,
        # so they are not invoked manually here.
        dm = CifarDataModule(batch_size=32)

        # Keep the checkpoint with the highest val_acc so that trainer.test()
        # below evaluates the best model rather than simply the last epoch.
        checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_acc", mode="max")
        early_stop_callback = pl.callbacks.EarlyStopping(
            monitor="val_acc", patience=3, verbose=False, mode="max"
        )

        # Define the PyTorch Lightning trainer
        trainer = pl.Trainer(
            max_epochs=5,
            logger=wandb_logger,
            callbacks=[checkpoint_callback, early_stop_callback]
        )

        # Train the model
        trainer.fit(model, datamodule=dm)

        # Test the model (no model passed: the checkpoint callback's best weights are loaded)
        trainer.test(datamodule=dm)




In [23]:
# Register the sweep definition under the 'sweep_project' project; the
# returned ID is what agents use to request hyperparameter sets.
sweep_id = wandb.sweep(
    sweep_config,
    project='sweep_project',
)

# Start an agent in this process: it calls sweep_train once per trial, 10 trials in total.
wandb.agent(
    sweep_id,
    function=sweep_train,
    count=10,
)


Create sweep with ID: mhhk7itk
Sweep URL: https://wandb.ai/nguyenbaphi-/sweep_project/sweeps/mhhk7itk


[34m[1mwandb[0m: Agent Starting Run: mlpg5fvl with config:
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	neurons_fc1: 1024
[34m[1mwandb[0m: 	neurons_fc2: 128
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: Currently logged in as: [33mnguyenbaphi[0m ([33mnguyenbaphi-[0m). Use [1m`wandb login --relogin`[0m to force relogin


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


 33%|███▎      | 56360960/170498071 [00:03<00:05, 21847199.34it/s][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


 51%|█████     | 86376448/170498071 [00:04<00:04, 18819976.40it/s]

In [None]:
# Build the CIFAR-10 data module: download the data, then create the splits
cifar = CifarDataModule(batch_size=32)
cifar.prepare_data()
cifar.setup()

# Take the first validation batch; its images are used for logging predictions
val_loader = cifar.val_dataloader()
samples = next(iter(val_loader))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 15689000.53it/s]


Extracting ./cifar-10-python.tar.gz to ./
Files already downloaded and verified


In [None]:
### WandB: you need to have an account (if you don't have one, create one)
def train_model(learning_rate, neurons_fc1, neurons_fc2, optimizer):
    """Train and test a single CIFAR10LitModel while logging everything to W&B.

    The function is self-contained: it builds its own data module, takes the
    prediction-logging samples from that module's validation loader, and tests
    on that same module, so it does not rely on notebook-level variables.

    Args:
        learning_rate: Learning rate for the optimizer.
        neurons_fc1: Width of the first fully connected layer.
        neurons_fc2: Width of the second fully connected layer.
        optimizer: Optimizer name understood by the model ('adam' or 'sgd').
    """
    wandb.login()

    exit_code = 1  # reported to W&B unless training and testing complete
    try:
        wandb_logger = WandbLogger(project='lastt', job_type='train', log_model="all")

        # instantiate classes
        dm = CifarDataModule(32)
        dm.prepare_data()
        dm.setup()

        # Grab one validation batch to log predictions on.
        # NOTE: the Trainer calls dm.setup() again during fit; if the data module's
        # split is not seeded, these images may not all stay in the validation subset.
        samples = next(iter(dm.val_dataloader()))

        model = CIFAR10LitModel(
            input_shape=(3, 32, 32),
            num_classes=10,
            learning_rate=learning_rate,
            neurons_fc1=neurons_fc1,
            neurons_fc2=neurons_fc2,
            optimizer=optimizer
        )

        wandb_logger.watch(model)

        # Keep the checkpoint with the highest val_acc so the test run below
        # evaluates the best model rather than simply the last epoch.
        checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_acc", mode="max")
        early_stop_callback = pl.callbacks.EarlyStopping(
            monitor="val_acc", patience=3, verbose=False, mode="max"
        )

        ### WandB
        trainer = pl.Trainer(
            max_epochs=5,
            logger=wandb_logger,
            callbacks=[checkpoint_callback, early_stop_callback, ImagePredictionLogger(samples)]
        )

        # Train the model
        trainer.fit(model, dm)

        # Evaluate on the test split of the same data module used for training
        trainer.test(datamodule=dm)

        exit_code = 0
    finally:
        # Always close the W&B run, marking it as failed if an exception escaped.
        wandb.finish(exit_code=exit_code)

In [None]:
best_hyperparameters = {
    'learning_rate': 0.001,
    'neurons_fc1': 512,
    'neurons_fc2': 256,
    'optimizer': 'adam'
}

# Start training the model
train_model(**best_hyperparameters)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Files already downloaded and verified
Files already downloaded and verified


[34m[1mwandb[0m: Currently logged in as: [33mkillercookie[0m ([33mkillercookie_[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


INFO:pytorch_lightning.callbacks.model_summary:
   | Name      | Type               | Params | Mode 
----------------------------------------------------------
0  | conv1     | Conv2d             | 896    | train
1  | conv2     | Conv2d             | 9.2 K  | train
2  | conv3     | Conv2d             | 18.5 K | train
3  | conv4     | Conv2d             | 36.9 K | train
4  | pool1     | MaxPool2d          | 0      | train
5  | pool2     | MaxPool2d          | 0      | train
6  | fc1       | Linear             | 819 K  | train
7  | fc2       | Linear             | 131 K  | train
8  | fc3       | Linear             | 2.6 K  | train
9  | train_acc | MulticlassAccuracy | 0      | train
10 | val_acc   | MulticlassAccuracy | 0      | train
11 | test_acc  | MulticlassAccuracy | 0      | train
----------------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.077     Total estimated model params size (MB)
12        Modul

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


Files already downloaded and verified
Files already downloaded and verified


INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at ./lastt/o5rtw1r7/checkpoints/epoch=4-step=7035.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at ./lastt/o5rtw1r7/checkpoints/epoch=4-step=7035.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='58.869 MB of 58.869 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▇▇▇▇▇█
global_step,▁▂▄▅▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▆▇█
train_acc_step,▁▁▂▂▄▅▃▃▃▃▄▃▆▄▃▄▆▇▇▅▆▅▆▆▄▅▆▇▅▄▅▆█▆▇▆▆▆▇▅
train_loss_epoch,█▅▃▂▁
train_loss_step,█▇▆█▅▅▅▆▆▅▅▃▅▄▄▅▄▃▃▄▃▃▂▃▄▄▂▂▃▂▂▃▃▃▂▂▃▂▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
val_acc,▁▆▇▇█

0,1
epoch,5.0
global_step,7035.0
test_acc,0.7457
test_loss,0.7721
train_acc_epoch,0.8034
train_acc_step,0.90625
train_loss_epoch,0.5553
train_loss_step,0.2004
trainer/global_step,7035.0
val_acc,0.7412


In [None]:
### TASK OF THE STUDENT

# extend WandB.ai integration in the code with sweeps
# (e.g. add variables like learning rate, optimizer, neurons_FC1, neurons_FC2)
# help:https://docs.wandb.ai/guides/sweeps  and
#       https://github.com/wandb/wandb/issues/5003
# store the hyperparameters and val_acc to wandb
# run at least 10 trainings
# in wandb.ai, export the result of the runs as a .csv file,
# in wandb.ai, create a report from the sweep results and share it by submitting the link in Moodle.