# Imports

In [1]:
# Torch
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as transforms

# Wandb
import wandb

# Python
import random as rand

# Numpy
import numpy as np

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Seeding
torch.manual_seed(24)

# Scheduler
from torch.optim.lr_scheduler import StepLR

# GPU/CPU

In [2]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [3]:
# Get device
device = get_default_device()

In [4]:
device

device(type='cuda')

# WANDB

In [5]:
# Define sweep
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'validation_loss',
      'goal': 'minimize'   
    },
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 3
    },
    'parameters': {
        'epochs': {
            'values': [60, 100, 120]
        },
        'learning_rate': {
            'values': [1e-1, 1e-2, 3e-2]
        },
        'step_size': {
            'values': [20, 30, 40]
        },
        'gamma': {
            'values': [0.5, 0.1]
        },
        'pool1': {
            'values': ['max', 'avg']
        },
        'pool2': {
            'values': ['max', 'avg']
        }
    }
}

# Datasets

### Load

In [6]:
# Transform
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Normalize the test set same as training set without augmentation
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load train and test set
trainset = torchvision.datasets.CIFAR10('./data', train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR10('./data', train=False, download=True, transform=transform_test)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Labels
CLASSES = trainset.classes

In [8]:
CLASSES

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

### Dataset related parameters

In [9]:
# Params
batch_size = 4000
val_size = 5000
train_size = len(trainset) - val_size

### Preparation

In [10]:
# Create train and validation
trainset, validationset = torch.utils.data.random_split(trainset, [train_size, val_size])

In [11]:
print("Validation set lenght: ", len(validationset))
print("Train set length: ", len(trainset))

Validation set lenght:  5000
Train set length:  45000


In [12]:
# Create dataloaders
test_loader = torch.utils.data.DataLoader(testset, batch_size, num_workers=4, pin_memory=True)
train_loader = torch.utils.data.DataLoader(trainset, batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = torch.utils.data.DataLoader(validationset, batch_size, shuffle=True, num_workers=4, pin_memory=True)

In [13]:
# Move dataloader to device
val_loader = DeviceDataLoader(val_loader, device)
test_loader = DeviceDataLoader(test_loader, device)
train_loader = DeviceDataLoader(train_loader, device)

# Model

### Helper Functions

In [14]:
# Accuracy
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

# Evaluation
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

# Training
def train():
    # Default values for hyper-parameters we're going to sweep over
    config_defaults = {
        'epochs': 60,
        'learning_rate': 1e-1,
        'step_size': 5,
        'gamma': 0.5
    }
    
    # Initialize a new wandb run
    wandb.init(config=config_defaults)
    
    # Config
    config = wandb.config

    # Model
    model = to_device(CIFAR10Model(config), device)
    model.train()
    
    # SGD optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.learning_rate, momentum=0.9)
    
    # gamma = decaying factor
    scheduler = StepLR(optimizer, step_size=config.step_size, gamma=config.gamma)
    
    for epoch in range(config.epochs):        
        # Cumulative loss
        closs = 0
        
        # Training Phase 
        for batch_idx, batch in enumerate(train_loader):
            # Feed forward
            loss = model.training_step(batch)
            
            # Compute gradient
            loss.backward()
            
            # Add loss to cumulative
            closs += loss.item()
            
            # Backprop
            optimizer.step()
            
            # Clean gradients
            optimizer.zero_grad()
            
            # Log wandb (batch loss)
            wandb.log({"batch_loss": loss.item()})
                
        # Adaptive learning
        scheduler.step()
        
        # Log epoch loss (train)
        wandb.log({"train_loss": closs/len(train_loader)})
        
        # Log validation loss
        wandb.log({"validation_loss":evaluate(model, val_loader)['val_loss']})

        # Log validation accuracy
        wandb.log({"validation_accuracy":evaluate(model, val_loader)['val_acc']})
    
    # Save model
    print("SAVING NOW")
    torch.save(model, f"model__{config.learning_rate}_{config.epochs}_{config.step_size}_{config.gamma}.th")

### Model Class

In [15]:
class CIFAR10Model(nn.Module):
    def __init__(self, config):
        # Init
        super().__init__()
        
        # Convolution layer
        self.conv_layer = nn.Sequential(
            # Conv Layer block 1
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2) if config.pool1 == 'max' else nn.AvgPool2d(kernel_size=2, stride=2),

            # Conv Layer block 2
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2) if config.pool1 == 'max' else nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(p=0.05),

            # Conv Layer block 3
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        
        self.fc_layer = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(4096, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.1),
            nn.Linear(512, 10)
        )
        
    def forward(self, x):
        # conv layers
        x = self.conv_layer(x)
        
        # flatten
        x = x.view(x.size(0), -1)
        
        # fc layer
        x = self.fc_layer(x)

        return x
    
    def training_step(self, batch):
        # Unpack batch
        images, labels = batch
        
        # Run forward pass
        out = self.forward(images)
        
        # Compute and return loss
        return nn.CrossEntropyLoss()(out, labels)
    
    def validation_step(self, batch):
        # Unpack batch
        images, labels = batch
        
        # Run forward pass
        out = self.forward(images)
        
        # Compute loss
        loss = nn.CrossEntropyLoss()(out, labels)
        
        # Compute accuracy
        acc = accuracy(out, labels)
        
        return {'val_loss': loss, 'val_acc': acc}
    
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(epoch, result['val_loss'], result['val_acc']))

# Training

In [16]:
# Initialize sweep
sweep_id = wandb.sweep(sweep_config, project="Assignment-4")

Create sweep with ID: tqwnsttb
Sweep URL: https://wandb.ai/itaouil/Assignment-4/sweeps/tqwnsttb


In [None]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: dkmve2pe with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	pool1: max
[34m[1mwandb[0m: 	pool2: max
[34m[1mwandb[0m: 	step_size: 40
[34m[1mwandb[0m: Currently logged in as: [33mitaouil[0m (use `wandb login --relogin` to force relogin)


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,0.00052
_step,1499.0
_runtime,733.0
_timestamp,1607614299.0
train_loss,0.0005
validation_loss,2.43788
validation_accuracy,0.74975


0,1
batch_loss,█▇█▇█▇▇▆▆▆▅▅▅▄▆▅▄▄▄▃▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,███▇▇▇▇▇▆▆▅▅▅▄▄▅▅▄▄▄▄▃▃▃▃▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁
validation_loss,█▆█▆▇▆▆▅▄▅▄▄▃▂▄▃▃▂▂▁▂▁▁▁▁▁▁▂▃▁▂▃▄▅▆▇▇▇▇█
validation_accuracy,▁▂▁▃▂▃▃▃▄▃▄▅▆▆▅▅▆▆▇▇▇▇██▇███████████████


[34m[1mwandb[0m: Agent Starting Run: 08qlcx48 with config:
[34m[1mwandb[0m: 	epochs: 60
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.03
[34m[1mwandb[0m: 	pool1: avg
[34m[1mwandb[0m: 	pool2: avg
[34m[1mwandb[0m: 	step_size: 40


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,0.00104
_step,899.0
_runtime,446.0
_timestamp,1607614749.0
train_loss,0.00099
validation_loss,2.5369
validation_accuracy,0.71263


0,1
batch_loss,██▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▅▄▄▄▄▃▃▃▄▃▂▂▁▁▂▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,██▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▄▅▄▄▄▄▃▃▃▃▃▂▂▂▁▂▁▁▁▁▁▁▁
validation_loss,▇▇▆▅▄▄▆▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▂▁▁▂▂▃▃▃▄▅▆▆▇█
validation_accuracy,▁▁▃▃▄▄▃▄▄▅▅▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇█████████████


[34m[1mwandb[0m: Agent Starting Run: rbrg14m8 with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	gamma: 0.1
[34m[1mwandb[0m: 	learning_rate: 0.03
[34m[1mwandb[0m: 	pool1: avg
[34m[1mwandb[0m: 	pool2: max
[34m[1mwandb[0m: 	step_size: 40


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,0.00538
_step,1499.0
_runtime,745.0
_timestamp,1607615499.0
train_loss,0.00499
validation_loss,2.39998
validation_accuracy,0.703


0,1
batch_loss,█▇▇▆▇▆▅▅▄▅▄▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,██▇▆▆▆▅▅▅▄▄▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▇▆▅▄▄▃▃▃▂▂▁▂▁▁▁▂▁▂▂▂▃▃▃▄▄▅▆▆▇▇▇▇███▇████
validation_accuracy,▁▂▃▄▄▅▅▆▆▇▇▆▇▇▇▇████████████████████████


[34m[1mwandb[0m: Agent Starting Run: yuxpldc6 with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	pool1: max
[34m[1mwandb[0m: 	pool2: avg
[34m[1mwandb[0m: 	step_size: 40


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,0.35661
_step,1499.0
_runtime,740.0
_timestamp,1607616245.0
train_loss,0.33782
validation_loss,0.99212
validation_accuracy,0.70525


0,1
batch_loss,█████▇▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█████▇▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▁▁▁▁▁▁
validation_loss,█████▇▆▅▄▄▄▄▄▃▄▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▂▂▁▁▁▁▁▁▁▂
validation_accuracy,▁▁▂▂▂▃▄▄▅▅▅▅▅▆▆▆▇▆▇▇▇▇▇▇▇█▇▇███▇████████


[34m[1mwandb[0m: Agent Starting Run: q5uuk6rk with config:
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	pool1: avg
[34m[1mwandb[0m: 	pool2: max
[34m[1mwandb[0m: 	step_size: 40


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,0.00016
_step,1499.0
_runtime,737.0
_timestamp,1607616987.0
train_loss,0.00018
validation_loss,4.64999
validation_accuracy,0.60462


0,1
batch_loss,████▇▇████▇▆▆▆▅▅▅▄▄▄▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,▇███▇█████▇▆▆▆▅▅▅▄▄▄▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▂▂▃▃▄▄▄▅▆▆▇▇▇███
validation_accuracy,▁▂▂▂▃▃▂▂▁▃▄▅▄▆▆▆▇▇▇▇██▇▇▇██▇████████████


[34m[1mwandb[0m: Agent Starting Run: ka8s0rfy with config:
[34m[1mwandb[0m: 	epochs: 60
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	pool1: avg
[34m[1mwandb[0m: 	pool2: max
[34m[1mwandb[0m: 	step_size: 20


SAVING NOW


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_loss,1.22883
_step,899.0
_runtime,447.0
_timestamp,1607617440.0
train_loss,1.23084
validation_loss,1.3068
validation_accuracy,0.53312


0,1
batch_loss,█████▇▇▆▆▅▅▄▅▅▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█████▇▇▆▆▆▅▅▅▅▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
validation_loss,████▇▇▆▆▆▅▄▄▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▂▂▂▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█████████████


[34m[1mwandb[0m: Agent Starting Run: lm8m8b8b with config:
[34m[1mwandb[0m: 	epochs: 120
[34m[1mwandb[0m: 	gamma: 0.5
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	pool1: max
[34m[1mwandb[0m: 	pool2: avg
[34m[1mwandb[0m: 	step_size: 40
