# 1. Hyperparameter optimization

To install Ax: run `pip install ax-platform`

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms

from ax.service.ax_client import AxClient, ObjectiveProperties
from ax.utils.tutorials.cnn_utils import evaluate, load_mnist, train
from ax.utils.notebook.plotting import init_notebook_plotting, render
from ax.modelbridge.registry import Models
from ax.modelbridge.generation_strategy import GenerationStrategy, GenerationStep
import numpy as np

# Call Ax's notebook plotting initialiser (imported above from
# ax.utils.notebook.plotting). Presumably required for the render() calls
# further down to display their figures -- confirm against the Ax docs.
init_notebook_plotting()

In [None]:
# Fix the PyTorch RNG seed for this part of the notebook.
torch.manual_seed(1)

# Tensor dtype and compute device that are passed on to the Ax
# train / evaluate helpers further down.
dtype = torch.float
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Load MNIST data

In [None]:
# Batch size handed to load_mnist.
BATCH_SIZE = 512
# load_mnist (from ax.utils.tutorials.cnn_utils) is unpacked into the training,
# validation and test loaders that train_evaluate / train_test use below.
train_loader, valid_loader, test_loader = load_mnist(batch_size=BATCH_SIZE)

## (a) Implement CNN

See the exercise sheet for the specification. Note that you need to figure out the output size of the convolutional layer and the max pooling layer to determine the input size of the fully connected layer.

In [None]:
class CNN(nn.Module):
    
    # TODO: implement the network described on the exercise sheet.
    # The constructor must accept a `kernel_size` keyword argument, because
    # train_evaluate / train_test build the model as CNN(kernel_size=...).
    # The input size of the fully connected layer follows from the output
    # sizes of the convolution and max-pooling layers, which presumably vary
    # with `kernel_size` -- compute it rather than hard-coding it.

## (b) Bayesian optimization using Ax

Set-up Ax and specify the hyperparameters that will be optimized.

See https://ax.dev/tutorials/tune_cnn_service.html for instructions on how to use Ax.

In [None]:
# Create the Ax client and register the experiment that will be optimised.
ax_client = AxClient()
ax_client.create_experiment(
    name="tune_cnn_on_mnist",  # The name of the experiment.
    # The search space must include "kernel_size": train_evaluate / train_test
    # read that key from every parameterization to build the CNN. The whole
    # parameterization is also forwarded to `train` as `parameters`.
    # (Remember the trailing comma after the value you fill in.)
    parameters=#TODO
    # A single objective named "accuracy", to be maximised (minimize=False).
    objectives={"accuracy": ObjectiveProperties(minimize=False)},
)

Wrappers for easy model training and evaluation for given hyperparameters. You need the first (`train_evaluate`, which scores on the validation set) during hyperparameter optimization; the second (`train_test`, which scores on the test set) is useful for checking whether the chosen settings are actually good.

In [None]:
def _train_and_score(parameterization, data_loader):
    """Train a fresh CNN for ``parameterization`` and score it on ``data_loader``.

    Args:
        parameterization: Mapping of hyperparameter names to values, as handed
            out by Ax. ``"kernel_size"`` is read here to build the CNN; the
            whole mapping is forwarded to ``train`` as ``parameters``.
        data_loader: Loader with the data the trained network is scored on.

    Returns:
        Whatever ``evaluate`` returns for the trained network.
    """
    # Build a new network on every call; the kernel size comes from Ax.
    net = CNN(kernel_size=parameterization.get("kernel_size"))

    # `train` runs the training loop on the training set and returns the
    # trained model.
    net = train(
        net=net,
        train_loader=train_loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
    )

    # `evaluate` computes the metric of the trained model on `data_loader`.
    return evaluate(
        net=net,
        data_loader=data_loader,
        dtype=dtype,
        device=device,
    )


def train_evaluate(parameterization):
    """Train with ``parameterization`` and score on the validation set.

    This is the objective fed back to Ax during hyperparameter optimization.
    """
    return _train_and_score(parameterization, valid_loader)


def train_test(parameterization):
    """Train with ``parameterization`` and score on the test set.

    Useful for checking whether the chosen hyperparameters are actually good.
    """
    return _train_and_score(parameterization, test_loader)

Run hyperparameter optimization using Bayesian optimization (the first 5 iterations use Sobol random sampling)

In [None]:
# Fixed trial budget: on every iteration ask Ax for the next configuration,
# train/evaluate it, and report the outcome back to Ax.
max_iters = 75
for i in range(max_iters):
    parameters, trial_index = ax_client.get_next_trial()
    result = train_evaluate(parameters)
    ax_client.complete_trial(trial_index=trial_index, raw_data=result)

Plot optimization performance (the accuracy of the CNN with the best hyperparameters found so far)

In [None]:
# Fetch the optimization trace from the Ax client and display it.
optimization_trace = ax_client.get_optimization_trace()
render(optimization_trace)

In [None]:
# TODO: Figure out how to get the best parameters and how to check whether they are good

## (c) grid search

In [None]:
# TODO: Implement here a simple grid search

# TODO: Figure out the best parameters and check how well they work

In [None]:
# Simple plotting code that plots a similar figure as we got from Ax.
# `values` must be the mapping filled in by the grid search above; presumably
# one entry per evaluated configuration (in evaluation order) -> its accuracy.

# matplotlib is only imported in a later cell of this notebook, so import it
# here to keep this cell runnable when the notebook is executed top to bottom.
import matplotlib.pyplot as plt

# Running maximum of the values seen so far, starting from 0.0.
bests = []
best_so_far = 0.0
for value in values.values():
    best_so_far = max(best_so_far, value)
    bests.append(best_so_far)

plt.plot(bests)
plt.show()

# 2. Transfer learning

First set up the data and the model. You do not need to change these.

In [None]:
import torch.nn as nn

# Define the model architecture
class CNNModel(nn.Module):
    """Three-block convolutional classifier with one linear output layer.

    Every block applies Conv2d (5x5, stride 1, padding 2) -> ReLU ->
    BatchNorm2d -> MaxPool2d(2); the channel count grows 1 -> 16 -> 32 -> 64.
    The flattened feature map is mapped to 10 outputs by ``fc1``, whose 576
    input features correspond to 64 channels of 3x3 maps (e.g. 28x28 inputs).

    Attribute names and their creation order are kept unchanged because they
    define the ``state_dict`` keys and the parameter initialisation order.
    """

    def __init__(self):
        super().__init__()

        # Block 1: 1 -> 16 channels
        self.cnn1 = nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(num_features=16)
        self.MaxPool1 = nn.MaxPool2d(2)

        # Block 2: 16 -> 32 channels
        self.cnn2 = nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2)
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(num_features=32)
        self.MaxPool2 = nn.MaxPool2d(2)

        # Block 3: 32 -> 64 channels
        self.cnn3 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(num_features=64)
        self.MaxPool3 = nn.MaxPool2d(2)

        # Classifier head on the flattened block-3 output
        self.fc1 = nn.Linear(in_features=576, out_features=10)

    def forward(self, x):
        """Return the ``(batch, 10)`` outputs for a batch of images ``x``."""
        blocks = (
            (self.cnn1, self.relu1, self.bn1, self.MaxPool1),
            (self.cnn2, self.relu2, self.bn2, self.MaxPool2),
            (self.cnn3, self.relu3, self.bn3, self.MaxPool3),
        )

        features = x
        for conv, activation, norm, pool in blocks:
            features = pool(norm(activation(conv(features))))

        # Flatten everything except the batch dimension before the linear layer
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)


In [None]:
import torch
import torchvision
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
import torchvision.transforms as transforms
import random
import matplotlib.pyplot as plt


# Set random seed for reproducibility (so that you all have the same data points)
# DO NOT CHANGE THIS
# Python's `random` is seeded because the training subset below is drawn with
# random.shuffle; the torch seeds cover the CPU and, when available, CUDA RNGs.
seed = 2024
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# set the device in use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download and load Fashion MNIST dataset
train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True,
                                                  transform=transforms.ToTensor())
test_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True,
                                                 transform=transforms.ToTensor())

# Select a few samples from the training set: shuffle all indices and keep the
# first 64, so the chosen subset is determined by the seed set above.
train_indices = list(range(len(train_dataset)))
random.shuffle(train_indices)
train_indices = train_indices[:64]
train_subset = Subset(train_dataset, train_indices)

# Define dataloaders for training and testing (no validation loader is created).
# Note: this rebinds train_loader / test_loader from the MNIST section above.
batch_size = 32
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## (a) Training the model from scratch

Standard training on the tiny labeled data.

In [None]:
# Instantiate the model
model = CNNModel()

# Define the criterion and optimizer
criterion = nn.CrossEntropyLoss()
# TODO: Choose a suitable optimizer
optimizer = ...

# Train the model on Fashion MNIST from scratch
# TODO: Implement standard optimization routine here
# The code you add must define `epochs` and the per-epoch lists `train_losses`,
# `test_losses`, `train_accs` and `test_accs` (read by the plots below), as well
# as `train_loss`, `train_acc`, `test_loss` and `test_acc` (read by the print).
for epoch in range(epochs):
    model.train()
    # Go through data and change parameters, also evaluating the training data metrics
    ...

        # Example of how to compute the classification accuracy
        # (expects `outputs`, `labels`, `total` and `correct` to be defined by
        # the surrounding batch loop you write).
        # NOTE(review): softmax is monotonic, so torch.max over the raw outputs
        # would select the same predicted class.
        _, predicted = torch.max(nn.functional.softmax(outputs.data, 1), 1)
        total += labels.size(0)
        correct += (predicted == labels).float().sum().item()
    
    # evaluate test data metrics here
    model.eval()
    ...
    
 
    # Per-epoch summary of the training and test metrics
    print(
        f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

# Loss curves over the epochs
plt.plot(range(1, epochs+1), train_losses, label='Train Loss')
plt.plot(range(1, epochs+1), test_losses, label='test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Accuracy curves over the epochs
plt.plot(range(1, epochs+1), train_accs, label='Train Accuracy')
plt.plot(range(1, epochs+1), test_accs, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()
plt.show()


## (b) and (c): Fine-tuning the model

These two parts are combined in the notebook as you re-use so much of the code. Please try to return notebooks that do not copy-paste long chunks of standard optimization code, but rather re-use the same functions.

In [None]:
for subtask in ['b','c']:

    # Instantiate the model
    model = CNNModel()

    # Load the pretrained model.
    # map_location makes the load work when the checkpoint was saved on a
    # different device than the one available here (e.g. GPU-saved weights on
    # a CPU-only machine).
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source (or pass weights_only=True on PyTorch versions
    # that support it).
    state_dict = torch.load('pretrained_MNIST_model.pt', map_location=device)
    model.load_state_dict(state_dict)

    # (b):TODO: Figure out how to freeze the parameters of all layers except the last one

    # TODO: Training the model and plotting of the results

## (d) Report here the requested final losses and accuracies for the three cases

# 3. Few-shot learning

Set up the data, this time so that we have a specific number of samples from each class. Again no need to change this.

The model to be used as the feature extractor is the same as in the previous exercise.

In [None]:
# Set random seed for reproducibility -- DO NOT CHANGE THIS
# Re-seeds Python's `random` and PyTorch (CPU and, when available, CUDA) with
# the same value that the transfer-learning section uses.
seed = 2024
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


class KShotCDataset(torch.utils.data.Dataset):
    """K-shot, C-way subset of a labelled dataset.

    Keeps the first ``k_shot`` samples (in dataset order) of each of the labels
    ``0 .. c_way - 1`` found in the wrapped dataset. If a label has fewer than
    ``k_shot`` samples, all of its samples are kept.

    The base class is referenced as ``torch.utils.data.Dataset`` because only
    ``Subset`` and ``DataLoader`` are imported by name in this notebook.

    Args:
        fashion_mnist_dataset: Indexable, iterable dataset yielding
            ``(image, label)`` pairs with integer labels.
        k_shot: Number of samples to keep per class.
        c_way: Number of classes to keep (labels ``0 .. c_way - 1``).
    """

    def __init__(self, fashion_mnist_dataset, k_shot, c_way):
        self.fashion_mnist_dataset = fashion_mnist_dataset
        self.k_shot = k_shot
        self.c_way = c_way

        # Indices into the wrapped dataset that make up this subset,
        # grouped by class in increasing label order.
        self.data_indices = []

        # label -> all indices of that label in the wrapped dataset
        self.class_indices = {label: [] for label in range(self.c_way)}
        self.create_balanced_dataset()

    def create_balanced_dataset(self):
        """Fill ``class_indices`` and pick the first ``k_shot`` per class."""
        # One pass over the wrapped dataset to bucket indices by label;
        # labels outside 0 .. c_way - 1 are ignored.
        for idx, (_, label) in enumerate(self.fashion_mnist_dataset):
            if label < self.c_way:
                self.class_indices[label].append(idx)

        for label in range(self.c_way):
            self.data_indices.extend(self.class_indices[label][:self.k_shot])

    def __len__(self):
        """Return the number of selected samples."""
        return len(self.data_indices)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair of the ``index``-th selected sample."""
        fashion_mnist_index = self.data_indices[index]
        image, label = self.fashion_mnist_dataset[fashion_mnist_index]
        return image, label


# Fashion MNIST training split; the K-shot C-way datasets are built from it
fashionmnist_dataset = torchvision.datasets.FashionMNIST(
    root='./data', train=True, download=True, transform=transforms.ToTensor()
)

# Fashion MNIST test split
test_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)


## (b) Implement the algorithm

In [None]:
# Takes as input the labeled training samples and the test samples
# Also need the model that is used for extracting the features
# Returns the predicted classes for all test samples (and perhaps already also the classification accuracy)

## (c) Run experiments

In [None]:
C = 10  # Number of classes -- ordered labels are selected, e.g. C = 3 means labels=[0, 1, 2]

# Data loader for looping through the test samples. It does not depend on K,
# so it is built once instead of on every loop iteration. `batch_size` is the
# value defined in the transfer-learning section.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for K in [1, 5]:  # Number of shots per class
    # Create the K-shot C-way dataset
    k_shot_c_dataset = KShotCDataset(fashionmnist_dataset, K, C)

    # Data loader for getting access to the training samples.
    # batch_size=K*C (K samples for each of the C classes), so a single batch
    # holds the whole K-shot dataset.
    dataloader = DataLoader(k_shot_c_dataset, batch_size=K*C, shuffle=False)

    # TODO: Run your algorithm

    # TODO: Report classification accuracy