# ML that can See: Supervised Learning with Images 

Let's load in any libraries we will use in this notebook. 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns

#import torch which has many of the functions to build deep learning models and to train them
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#import torchvision, which has lots of functions for loading and working with image data
import torchvision
import torchvision.transforms as transforms

#this is a nice progress bar representation that will be good to measure progress during training
import tqdm
import copy
import random

# Seed the Python, NumPy and PyTorch random number generators so that runs are repeatable
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda:0


# 1. The Data

We will use a subset of the **Stanford Dogs** dataset for **fine-grained image classification**. Our subset contains **20 dog breeds**, and the goal is to train a model to classify images into these classes. The dataset is already organized on disk into two main subfolders.

The `train` and `test` folders contain subdirectories corresponding to the dog breeds (i.e. the class labels).  
- The `train` folder will be used to create the **training** and **validation** sets (for hyperparameter tuning).
- The `test` folder will be used as the **test set** to report final performance.

### Task 1:

1. Load the dataset using  
   [`torchvision.datasets.ImageFolder`](https://pytorch.org/vision/stable/generated/torchvision.datasets.ImageFolder.html), which loads images from a directory structure and **automatically assigns class labels** based on subfolder names.
2. Load **one image from each class** and plot them as a **mosaic** using `matplotlib`. 


In [None]:
# Load the full training dataset first (without transforms for now)
trainval_dataset = ...
test_dataset = ...

print(f'Trainval dataset size: {len(trainval_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

# Get class names and the integer label of every sample.
# The number of classes is derived from the dataset instead of being hard-coded,
# so the mosaic and later per-class statistics always match the data on disk.
class_names = trainval_dataset.classes
num_classes = len(class_names)
dataset_labels = np.array([label for _, label in trainval_dataset.samples])

# Plot one random sample from each class as a mosaic (10 images per row)
n_cols = 10
n_rows = (num_classes + n_cols - 1) // n_cols  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows), squeeze=False)

for class_idx, ax in enumerate(axes.ravel()):
    # Hide the frame of every panel, including unused ones in a partially filled last row
    ax.axis('off')
    if class_idx >= num_classes:
        continue

    # Pick a random sample from this class
    class_indices = np.where(dataset_labels == class_idx)[0]
    sample_idx = int(np.random.choice(class_indices))

    # Load and display the image; the title is the part of the folder name after the last '-'
    img, _ = trainval_dataset[sample_idx]
    ax.imshow(img)
    ax.set_title(class_names[class_idx].split('-')[-1], fontsize=10)

plt.tight_layout()
plt.show()

We should also examine the distribution of samples across the different classes.

In [None]:

def plot_class_distribution(*datasets, dataset_names=None, figsize=(14, 6)):
    """
    Plot the number of samples per class for one or more datasets as a grouped bar chart.

    Args:
        *datasets: Datasets to compare. Each one must either expose `samples`
            (a sequence of 2-tuples whose second element is the label) and `classes`,
            like an ImageFolder, or expose `indices` plus a wrapped `dataset` with
            `targets` and `classes`, like a Subset of an ImageFolder.
        dataset_names: Legend name for each dataset, in the same order as `datasets`
            (optional; defaults to 'dataset_0', 'dataset_1', ...).
        figsize: Figure size (width, height).

    Raises:
        ValueError: If no dataset is given, or if `dataset_names` does not contain
            exactly one name per dataset.
    """
    if not datasets:
        raise ValueError('At least one dataset is required')

    if dataset_names is None:
        dataset_names = [f'dataset_{i}' for i in range(len(datasets))]
    else:
        dataset_names = list(dataset_names)
        if len(dataset_names) != len(datasets):
            # zip() below would otherwise silently drop the unmatched datasets
            raise ValueError(
                f'Expected {len(datasets)} dataset names, got {len(dataset_names)}'
            )

    # Build one row per sample, tagged with its class name and the split it belongs to
    df_list = []
    for dataset, name in zip(datasets, dataset_names):
        if hasattr(dataset, 'samples'):
            # ImageFolder-style dataset: labels are stored next to each sample
            labels_full = [label for _, label in dataset.samples]
            class_names = dataset.classes
        else:
            # Subset-style dataset: look the labels up in the wrapped dataset
            labels_full = [dataset.dataset.targets[i] for i in dataset.indices]
            class_names = dataset.dataset.classes

        df_list.append(pd.DataFrame({
            'class': [class_names[label] for label in labels_full],
            'split': name,
        }))

    # Count the rows of every (class, split) combination
    combined_df = pd.concat(df_list)
    df_grouped = combined_df.groupby(['class', 'split']).size().reset_index(name='count')

    # Create grouped bar plot
    plt.figure(figsize=figsize)
    sns.barplot(data=df_grouped, x='class', y='count', hue='split')
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.title('Distribution of Samples per Class')
    plt.tight_layout()
    plt.show()

# Plot the per-class sample counts of the trainval and test datasets side by side
plot_class_distribution(trainval_dataset, test_dataset, dataset_names=['TrainVal', 'Test'])

## Dataset Splitting
Note that the classes contain **different numbers of samples**. This class imbalance should be taken into account when splitting the `trainval` dataset into **training** and **validation** sets. To preserve the class distribution in both splits, we use **stratification**.

### Task 2:  
Split the `trainval` dataset into **training (80%)** and **validation (20%)** sets using **stratified sampling**.

**Hint:**  
You can use the [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
 function from `sklearn`, with the `stratify` argument.


In [None]:


# Perform stratified split to maintain class distribution
# (Task 2: 80% training / 20% validation, e.g. by splitting the sample indices
# with train_test_split and its `stratify` argument)

# Create subset datasets
# NOTE: later cells read `.dataset` and `.indices` on these two objects
# (plot_class_distribution and the transform assignment), so they are expected
# to behave like torch.utils.data.Subset views of `trainval_dataset`.
train_dataset = ...
val_dataset = ...

# Sanity check: the two splits together should account for every trainval sample
print(f"Total trainval samples: {len(trainval_dataset)}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
plot_class_distribution(train_dataset, val_dataset, dataset_names=['Train', 'Val'])

## Data Preprocessing, Augmentation, and Loading

Since our dataset is relatively small and the images have **different sizes**, we need to perform a few preprocessing steps before feeding them to a deep learning model. The `torchvision.transforms` module provides this functionality:

1. **Preprocessing:**  
   - [`transforms.Resize`](https://pytorch.org/vision/stable/generated/torchvision.transforms.Resize.html) – resize all images to a fixed size so they are compatible with the network (often 224 x 224 pixels).  
   - [`transforms.ToTensor`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ToTensor.html) – convert images to PyTorch tensors and scale pixel values to the range [0, 1].  
   - [`transforms.Normalize`](https://pytorch.org/vision/stable/generated/torchvision.transforms.Normalize.html) – normalize images using **ImageNet mean and standard deviation**, required when fine-tuning models pretrained on ImageNet.

2. **Data Augmentation:**  
   To artificially increase the effective dataset size and reduce overfitting, we can apply **random transformations** to training images, such as:  
   - [`RandomResizedCrop`](https://pytorch.org/vision/stable/generated/torchvision.transforms.RandomResizedCrop.html) – randomly crops and resizes images.  
   - [`RandomHorizontalFlip`](https://pytorch.org/vision/stable/generated/torchvision.transforms.RandomHorizontalFlip.html) – randomly flips images horizontally.  
   - [`RandomVerticalFlip`](https://pytorch.org/vision/stable/generated/torchvision.transforms.RandomVerticalFlip.html) – randomly flips images vertically.  
   - [`RandomRotation`](https://pytorch.org/vision/stable/generated/torchvision.transforms.RandomRotation.html) – randomly rotates images.  
   - [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) – randomly adjusts brightness, contrast, saturation, and hue.

   These transformations, applied sequentially and randomly, generate a wide variety of images from the same originals, helping the model generalize better.

3. **Loading data in batches:**  
   Deep learning models use **stochastic optimization**, processing images in batches rather than all at once. PyTorch’s [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) class efficiently loads data in batches, shuffles the training set, and can use multiple worker processes for faster loading.

### Task 3:

1. **Preprocessing:**  
   - Create a **transform pipeline** that resizes, converts to tensor, and normalizes images using ImageNet statistics.  
   - Apply this transform to the validation and test datasets.


2. **Data Augmentation:**  
   - Create a **transform pipeline for training** with at least 2–3 random augmentations from the list above. The code cell below chains it with the preprocessing transform, so it should not repeat the tensor conversion or normalization.  
   - Apply this transform to the training dataset.


3. **DataLoader:**  
   - Create a **DataLoader** for each dataset (training, validation, and test).  
   - Choose a suitable batch size.  
   - Shuffle the training DataLoader; do not shuffle validation or test DataLoaders.  
   - Optionally, use multiple workers to speed up loading.

4.  **Visualize a single batch from the training DataLoader.**


In [None]:

# ImageNet preprocessing: channel statistics used for normalization
imagenet_means = (0.485, 0.456, 0.406)
imagenet_stds = (0.229, 0.224, 0.225)

# preprocessing transform (resize -> tensor -> normalize)
transform = ...


# Data augmentation transforms for training set.
# NOTE: this pipeline is chained *before* `transform` below, so it should only
# contain the random augmentations and not repeat the tensor conversion/normalization.
train_transform = ...

# Apply transforms to the datasets.
# When train_dataset and val_dataset are Subsets of the same underlying dataset they
# share one `.dataset` object, so assigning `.transform` twice would leave both
# splits with whichever transform was assigned last (i.e. the training set would
# silently lose its augmentation). Give each split its own shallow copy of the
# underlying dataset so the two transforms stay independent; the sample list
# itself is shared by the copies, not duplicated.
train_dataset.dataset = copy.copy(train_dataset.dataset)
train_dataset.dataset.transform = transforms.Compose([train_transform, transform])
val_dataset.dataset = copy.copy(val_dataset.dataset)
val_dataset.dataset.transform = transform
test_dataset.transform = transform

# create dataloaders for train, val, test datasets
batch_size = 16
trainloader = ...
valloader = ...
testloader = ...

# Visualize a batch of augmented training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# Plot the batch on a fixed 2x8 grid
fig, axes = plt.subplots(2, 8, figsize=(20, 5))
axes = axes.ravel()
for ax in axes:
    # Hide the frame of every panel so panels left empty by a small batch stay blank
    ax.axis('off')

for idx in range(min(len(axes), len(images))):
    # Channels-first tensor -> channels-last array, as expected by imshow
    img = images[idx].numpy().transpose((1, 2, 0))
    # Undo the normalization (x * std + mean) just for visualization
    img = img * np.array(imagenet_stds) + np.array(imagenet_means)
    img = np.clip(img, 0, 1)

    axes[idx].imshow(img)
    axes[idx].set_title(class_names[int(labels[idx])])

plt.tight_layout()
plt.show()

# 2. The Model

This week we will use a pretrained ResNet18 that has been trained on ImageNet, but we will freeze certain parameters in the model so that their weights do not update. We do this to try to prevent the model from overfitting to the new, small dataset. You can also experiment with other pretrained models provided by the [PyTorch torchvision library](https://pytorch.org/vision/stable/models.html#classification).

When we create a model for transfer learning, we should follow these steps:
1. Initialise the model with pretrained weights.
2. Adapt the architecture for the new number of classes in our new dataset by changing the final linear layer.
3. If necessary, freeze any weights.

In [None]:
def setup_model(model, num_classes, freeze_backbone = False):
    """
    Adapt a pretrained classifier to a new number of classes.

    The final fully connected layer `model.fc` is replaced (in place) by a newly
    initialised `nn.Linear` with the same input width and `num_classes` outputs.

    Args:
        model: Network exposing its classification head as `model.fc`.
        num_classes: Number of output classes of the new head.
        freeze_backbone: If True, every parameter except those of the new head
            gets `requires_grad=False`, so only the head can be trained.

    Returns:
        The same `model` object, modified in place.
    """
    if freeze_backbone:
        # Freeze everything that exists right now. The head created below is a
        # brand-new layer, so it keeps the default requires_grad=True and ends up
        # as the only trainable part of the network.
        model.requires_grad_(False)

    # Swap the head for one sized for the new task
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

# Start from the default pretrained ResNet18 weights, then replace the head with a
# 20-class layer and freeze everything except that new head
backbone = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
resnet_frozen = setup_model(backbone, num_classes=20, freeze_backbone=True)
print(resnet_frozen)

# 3. Training the Model with Transfer Learning

Now that we know how to set up our model using **transfer learning**, we can start training it.

We will use a **fine-tuning approach**, where the model’s parameters are slightly adjusted to adapt its learned features to the specific nuances of the new task. In this tutorial, we will **only update the parameters of the final linear layer**, keeping all other layers frozen. You can also experiment with training from scratch or fine-tuning all layers.

In the lecture, we reviewed the general training procedure:

1. **Initialize the model.**
2. **Define a loss function** (also called cost function or objective function).
3. **Initialize the optimizer.**
4. For `n` epochs (or until the loss converges/stops changing):
    1. **Training phase:**  
       - Put the model in training mode with `model.train()`.  
       - For each batch in the **training dataset**:           
           1. Perform a forward pass to compute predictions.
           2. Calculate the **loss** and **accuracy**.
           3. Perform a backward pass to compute gradients with respect to the parameters.
           4. Update the parameters using the optimizer.
    2. **Validation phase:**  
       - Put the model in evaluation mode with `model.eval()`.  
       - For each batch in the **validation dataset**:
           1. Perform a forward pass to compute predictions.
           2. Calculate the **loss** and **accuracy**.

### Task 4
In the cells below, implement the **training** and **validation** functions as described above.


In [None]:
def train_epoch(model, dataloader, criterion, optimizer, epoch, device):
    """
    Train `model` for one pass over `dataloader` and report the epoch statistics.

    Args:
        model: Network to train; it is switched to training mode here.
        dataloader: Iterable yielding `(inputs, labels)` batches.
        criterion: Loss function, to be used by the TODO section.
        optimizer: Optimizer, to be used by the TODO section.
        epoch: Zero-based epoch index, only used in progress and log messages.
        device: Device the inputs and labels are moved to.

    Returns:
        Tuple `(mean_train_loss, train_accuracy)`: the mean of the values collected
        in `train_loss`, and `correct / total`.
    """

    # Put the model in "train" mode
    model.train() 

    # For all batches in the training dataset
    # Accumulators the TODO section must fill: one loss value per batch, plus
    # running counts of correctly classified and seen samples
    train_loss, correct, total = [], 0.0, 0.0
    for _, data in  tqdm.tqdm(enumerate(dataloader, 0), total = len(dataloader), desc = f'Epoch {epoch+1} - training phase'):
        # get the inputs and labels from the dataloader and move to device (GPU or CPU)
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # TODO: implement the training step for this batch:
        #   1. forward pass to compute the predictions
        #   2. compute the loss; append its value to `train_loss` and update
        #      `correct` / `total` (they feed the epoch statistics below)
        #   3. backward pass to compute the gradients
        #   4. parameter update with the optimizer
        # NOTE: until this is filled in, `total` stays 0 and the accuracy
        # computation below raises ZeroDivisionError.

    # average stats for the epoch
    mean_train_loss = np.mean(train_loss)
    train_accuracy = correct/total
    print(f"Training {epoch+1}: loss={mean_train_loss:.3f} acc={train_accuracy:.3f}")

    return mean_train_loss, train_accuracy


In [None]:
def eval_epoch(model, dataloader, criterion, epoch, device):
    """
    Evaluate `model` on every batch of `dataloader` without updating it.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        dataloader: Iterable yielding `(inputs, labels)` batches.
        criterion: Loss function, to be used by the TODO section.
        epoch: Zero-based epoch index, only used in progress and log messages.
        device: Device the inputs and labels are moved to.

    Returns:
        Tuple `(mean_val_loss, val_accuracy)`: `val_loss / val_total` and
        `val_correct / val_total`.
    """

    # Put the model in "eval" mode
    model.eval()
    
    #Validation loop: For all batches in the validation dataset
    # no_grad: do not build the computation graph for backpropagation, so no
    # gradients are computed or stored for the tensors involved in these operations
    with torch.no_grad():
        # Running sums the TODO section must fill: summed loss, number of correct
        # predictions and number of samples seen
        val_loss, val_correct, val_total = 0.0, 0.0, 0.0
        for i, data in  tqdm.tqdm(enumerate(dataloader, 0), total = len(dataloader), desc = f'Epoch {epoch+1} - validation phase'):
            # get the inputs and labels from the dataloader and move to device (GPU or CPU)
            inputs, labels = data            
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # TODO: implement the evaluation step for this batch:
            #   1. forward pass to compute the predictions
            #   2. compute the loss and update `val_loss`, `val_correct`, `val_total`
            # NOTE: `val_loss` is divided by the number of samples below, so it should
            # accumulate a per-sample sum. If `criterion` returns the batch mean
            # (presumably the case for the default nn.CrossEntropyLoss; confirm),
            # add loss * batch size rather than the bare loss.
            # Until this is filled in, `val_total` stays 0 and the divisions below
            # raise ZeroDivisionError.
    
    mean_val_loss = val_loss / val_total
    val_accuracy = val_correct / val_total
    print(f"Validation {epoch+1}: loss={mean_val_loss:.3f} acc={val_accuracy:.3f}")
    return mean_val_loss, val_accuracy

Now let’s put everything together and train our model. Remember to save the best model, defined as the one that achieves the highest accuracy on the validation set at any epoch.

In [None]:
# any hyperparameters
lr = 0.001
total_epochs = 10

#Initialise the model.
# A fresh pretrained backbone is created here so that re-running this cell always
# starts training from the same pretrained state instead of continuing a previous run
backbone = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
resnet_frozen = setup_model(backbone, 20, True)
resnet_frozen = resnet_frozen.to(device)
# Define a loss function
criterion = nn.CrossEntropyLoss()

# Initialise the SGD optimizer.
# NOTE(review): all parameters are passed, including the frozen backbone ones;
# presumably harmless because frozen parameters never receive gradients (confirm).
optimizer = optim.SGD(resnet_frozen.parameters(), lr=lr, momentum=0.9)

#Step 4: For n epochs (e.g. loss converged/stops changing)
# Per-epoch history, plotted after training
total_train_loss, total_val_loss = [], []
total_train_acc, total_val_acc = [], []
# Best validation accuracy seen so far (starts below any reachable value)
best_acc = -np.inf
for epoch in range(total_epochs):    

    # train for one epoch
    mean_train_loss, train_accuracy = train_epoch(resnet_frozen, trainloader, criterion, optimizer, epoch, device)
    total_train_loss.append(mean_train_loss); total_train_acc.append(train_accuracy);

    # evaluate on validation set
    mean_val_loss, val_accuracy = eval_epoch(resnet_frozen, valloader, criterion, epoch, device)
    total_val_loss.append(mean_val_loss); total_val_acc.append(val_accuracy);

    # save the best model based on validation accuracy
    # TODO: when `val_accuracy` improves on `best_acc`, update `best_acc` and save the
    # model's state dict to "resnet_frozen_best.pth" (the evaluation cell loads
    # the weights back from exactly that file)


# Plot training and validation curves: one figure for the loss, one for the accuracy
for train_curve, val_curve, metric_name in (
    (total_train_loss, total_val_loss, 'Loss'),
    (total_train_acc, total_val_acc, 'Accuracy'),
):
    plt.plot(train_curve, label = 'Train')
    plt.plot(val_curve, label = 'val')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.show()

# 4. Evaluation

Once our model is trained, the next step is to **evaluate its performance** on the **test set**. Evaluation allows us to understand how well the model generalizes to **unseen data**.

In this tutorial, we will focus on two basic metrics:

1. **Accuracy** – the proportion of correctly predicted labels over all test samples.
2. **Confusion Matrix** – a table that compares true class labels with predicted class labels, showing how many samples of each class are correctly classified and how many are misclassified into other classes. It helps identify which classes the model confuses more often.

> Note: A more detailed discussion of evaluation metrics, such as precision, recall, F1-score, and ROC curves, will be covered in the next tutorial.

For now, your task is to **run the model on the test set**, compute predictions for each sample, and report:

- The **overall accuracy** of the model
- The **confusion matrix** for the test set

### Task 5

1. Load the best model.
2. Iterate over all batches in the **test DataLoader**.  
3. For each sample, compute the **predicted label** using the trained model.  
4. Collect the predictions and the ground-truth labels.  
5. Compute and display:
   - The **overall accuracy** (DONE)
   - The **confusion matrix** (DONE)


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Load the best model.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one (and vice versa).
resnet_frozen.load_state_dict(torch.load("resnet_frozen_best.pth", map_location=device))
resnet_frozen.eval()

# Collect predictions and ground truth labels
all_predictions = []
all_labels = []

# compute predictions
# TODO: iterate over the test DataLoader, compute the predicted label of every sample
# with the model, and extend `all_predictions` / `all_labels` with the predicted and
# ground-truth class indices

# Overall accuracy
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)
test_accuracy = np.mean(all_predictions == all_labels)
print(f"Test accuracy: {test_accuracy:.4f}")

# Per-class accuracy
class_accuracies = []
for class_idx in range(num_classes):
    class_mask = all_labels == class_idx
    n_class_samples = class_mask.sum()
    if n_class_samples == 0:
        # A class without test samples has no defined accuracy
        class_accuracies.append(np.nan)
        continue
    n_class_correct = (all_predictions[class_mask] == all_labels[class_mask]).sum()
    class_accuracies.append(n_class_correct / float(n_class_samples))

# Visualize per-class accuracy as a bar plot
plt.figure(figsize=(12, 6))
plt.bar(range(len(class_accuracies)), class_accuracies)
plt.xticks(range(len(class_accuracies)), class_names, rotation=45, ha='right')
plt.xlabel('Class'); plt.ylabel('Accuracy'); plt.title('Per-class Test Accuracy');
plt.ylim([0, 1])
plt.axhline(y=test_accuracy, color='r', linestyle='--', label=f'Overall Accuracy: {test_accuracy:.4f}')
plt.legend(); plt.tight_layout(); plt.show();

# Compute and display confusion matrix using sklearn's ConfusionMatrixDisplay.
# Passing `labels` explicitly keeps one row/column per class even if some class never
# occurs in the test labels or predictions, so the matrix always matches `display_labels`.
fig, ax = plt.subplots(figsize=(12, 12))
disp = ConfusionMatrixDisplay.from_predictions(all_labels, all_predictions,
                                               labels=list(range(len(class_names))),
                                               display_labels=class_names,
                                               cmap=plt.cm.Blues, xticks_rotation='vertical', ax=ax)
plt.tight_layout()
plt.show()

# 5. Extensions and Experiments

Once you have completed training and evaluation with the current model, there are many ways you can **experiment and explore on your own** to deepen your understanding of CNNs and transfer learning:

- **Change the backbone network:**  
  Try replacing the current CNN with another architecture, such as **ResNet-50**, **EfficientNet**, or **MobileNet**, and observe how it affects accuracy and training speed.

- **Use a foundation model:**  
  You can explore **pretrained foundation models** such as **DINOv3** or other self-supervised models. Fine-tune them on this dataset and compare their performance to standard CNNs.

- **Handle class imbalance:**  
  If your dataset has classes with very different numbers of samples, try:
  - Using **loss weights** in your loss function to give more importance to underrepresented classes.
  - Using **sampling strategies** in the DataLoader (e.g., [`WeightedRandomSampler`](https://pytorch.org/docs/stable/data.html#torch.utils.data.WeightedRandomSampler)) to balance the class distribution during training.  
  These methods can help the model learn fairly across all classes.

- **Experiment with data augmentation:**  
  Try adding or modifying augmentations in your training pipeline (e.g., different crop sizes, rotations, color jitter, or even MixUp/CutMix). See how these changes impact generalization.

- **Adjust training parameters:**  
  Experiment with **learning rates**, **batch sizes**, **number of frozen layers**, or **optimizer types** to see how these hyperparameters affect convergence and final accuracy.

- **Visualize model predictions:**  
  Look at misclassified images in the test set or visualize feature maps from intermediate layers to gain insights into what the model has learned.

> Tip: Keep a notebook or log of your experiments. Compare results systematically and try to identify patterns that improve performance. This will help you develop good practices for model development and fine-tuning in real-world scenarios.


# 6. Pro Tip: Hyperparameter Tuning with Optuna

Hyperparameter selection can have a **big impact on model performance**. Instead of manually trying different values, you can use **Optuna**, a Python library for **automated hyperparameter optimization**.

Optuna allows you to:

- Define an **objective function** that trains and evaluates your model with a given set of hyperparameters.  
- Specify **search spaces** for hyperparameters such as learning rate, batch size, optimizer type, or number of frozen layers.  
- Automatically explore the hyperparameter space using **efficient sampling and pruning strategies** to find configurations that maximize performance.  

**Example hyperparameters you could tune:**

- Learning rate (`0.0001` to `0.01`)  
- Batch size (`16`, `32`, `64`)  
- Optimizer (`SGD` vs `Adam`)  
- Weight decay  
- Number of frozen layers in the backbone  

### Challenge: 

As a challenge, you can use **Optuna** in combination with your **training and validation loop**.  

> ⚠️ Note: This process can take a long time, and the study should be **saved to disk** so you can resume or analyze it later (see [Optuna documentation](https://optuna.readthedocs.io/en/stable/)).

Here, we show a minimal example tuning **only the learning rate**, just for learning purposes.   You can extend this to **other hyperparameters** or more complex workflows.

**Important:** Never use the **test set** during hyperparameter search. Doing so can **lead to overfitting** on the test set and your model may fail when deployed on unseen data.

**Can you beat my model?**


In [20]:
# install optuna
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [optuna]2m1/2[0m [optuna]
[1A[2KSuccessfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import optuna

# keep every trial short (4 epochs) so the search does not take a lot of time
total_epochs = 4

# Define objective function
def objective(trial):
    """
    Optuna objective: train a frozen-backbone ResNet18 with a sampled learning rate.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The highest validation accuracy reached in any of the `total_epochs` epochs;
        the study is set up to maximize this value.
    """
    ############## Suggest hyperparameters ############
    # learning rate, sampled on a log scale
    lr =  trial.suggest_float("lr", 1e-3, 1e1, log=True)
    # ... more can be included

    ############## Training code as before ##########
    # Every trial starts from a fresh pretrained model with only the head trainable
    model = setup_model(
        torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT),
        20,
        True,
    ).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # Track only the best validation accuracy; that is all Optuna needs
    best_acc = -np.inf
    for epoch in range(total_epochs):
        train_epoch(model, trainloader, criterion, optimizer, epoch, device)
        _, val_accuracy = eval_epoch(model, valloader, criterion, epoch, device)
        best_acc = max(best_acc, val_accuracy)

    ############## give to optuna the best metric found (i.e. validation accuracy). Optuna will try to maximize this ############
    return best_acc

# Run Optuna study.
# The study is persisted in a local SQLite file. load_if_exists=True makes re-running
# this cell resume the stored study instead of failing with DuplicatedStudyError
# because a study with this name already exists in the database.
study = optuna.create_study(
    study_name="hyperparameter Optimization",
    direction="maximize",
    storage="sqlite:///optuna_study.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=10)

print("Best hyperparameters:", study.best_params)
print("Best accuracy:", study.best_value)