# Baseline

This notebook contains code for training baseline convolutional model, to compare to ViT model. The performance of baseline is better, than ViT, since the dataset is not suited for such large models, it only serves as learning example. 

### Setup

In [None]:
import math
import os
from functools import cached_property
from typing import Any, Callable, Iterable, Tuple

import numpy as np
import seaborn as sns
import torch
import torchvision
import lovely_tensors as lt
import wandb
from skimage import io
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
lt.monkey_patch()

In [3]:
transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),  # convert PIL to tensor
    torchvision.transforms.Lambda(lambda image: image.squeeze()),  # convert shape from (BATCH_SIZE, 1, X, Y) to (BATCH_SIZE, X * Y)
])

train_dataset = torchvision.datasets.MNIST("../data", download=True, transform=transforms, train=True)
test_dataset = torchvision.datasets.MNIST("../data", download=True, transform=transforms, train=False)

In [4]:
train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

### Model

The baseline is convolutional model with 2 conv layers.

In [35]:
class MNISTClassifier(torch.nn.Module):
    """Convolutional classifier as a simple baseline model for image classification."""

    def __init__(self, num_classes: int):
        """
        :param num_classes: number of classes in the dataset
        """
        super().__init__()

        self.num_classes = num_classes
        self.stack = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=3, stride=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, kernel_size=3, stride=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2),
            torch.nn.Flatten(),
            torch.nn.Linear(9216, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes),
            torch.nn.Softmax(dim=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.stack(x.unsqueeze(1))


In [36]:
# create instance of the model and run it on sample batch of 128 examples
model = MNISTClassifier(num_classes=10).cuda()

In [37]:
for x, y in train_data_loader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([128, 28, 28])
torch.Size([128])


In [38]:
with torch.no_grad():
    y = model(x.cuda())  # x exists from previous cell

# make sure shape is correct and all outputs sum to 1 (probabilities after softmax)
y.shape, y[0].sum()

(torch.Size([128, 10]), tensor cuda:0 1.000)

### Evaluation

Create eval loop first, to make sure model behaves as expected.

In [39]:
def predict(model: torch.nn.Module, data_loader: Iterable) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Prediction loop for the given model and data loader
    :note: assumes running on CUDA enabled device
    """
    model = model.eval()
    targets, predictions = [], []

    for inputs, y_true in data_loader:
        with torch.no_grad():
            inputs = inputs.cuda()
            y_true = y_true.cuda()

            y_pred = model(inputs)

            predictions.append(y_pred)
            targets.append(y_true)

    return torch.cat(targets), torch.cat(predictions)


def evaluate(y_true: Iterable, y_pred: Iterable) -> dict[str, float]:
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "precision": metrics.precision_score(y_true, y_pred, zero_division=0, average="macro"),
        "recall": metrics.recall_score(y_true, y_pred, zero_division=0, average="macro"),
        "f1_score": metrics.f1_score(y_true, y_pred, zero_division=0, average="macro"),
    }


In [40]:
def log_training_progress(model: torch.nn.Module, epoch: int, data_loader: Iterable) -> None:
    """
    Logs progress of the model training to W&B
    :note: requires Wto be called inside W&B run
    """
    logs = {}
    targets, predictions = predict(model, data_loader)

    logs["accuracy"] = metrics.accuracy_score(targets.cpu(), predictions.cpu().argmax(dim=-1))
    logs["f1_score"] = metrics.f1_score(
        targets.cpu(), predictions.cpu().argmax(dim=-1), zero_division=0, average="macro"
    )

    wandb.log(logs, step=epoch)


def count_trainable_parameters(model: torch.nn.Module) -> int:
    """Return the number of trainable parameters in neural model"""
    return sum(param.numel() for param in model.parameters() if param.requires_grad)


Run the model on entire test dataset and compute metrics. Without training they will be random, so around 10% accuracy can be expdcted.

In [41]:
targets, predictions = predict(model, test_data_loader)
targets.shape, predictions.shape

(torch.Size([10000]), torch.Size([10000, 10]))

In [42]:
print(metrics.classification_report(targets.cpu(), predictions.cpu().argmax(dim=-1), zero_division=0))

              precision    recall  f1-score   support

           0       0.06      0.26      0.09       980
           1       0.01      0.04      0.01      1135
           2       0.00      0.00      0.00      1032
           3       0.00      0.00      0.00      1010
           4       0.00      0.00      0.00       982
           5       0.00      0.00      0.00       892
           6       0.00      0.00      0.00       958
           7       0.00      0.00      0.00      1028
           8       0.00      0.00      0.00       974
           9       0.00      0.00      0.00      1009

    accuracy                           0.03     10000
   macro avg       0.01      0.03      0.01     10000
weighted avg       0.01      0.03      0.01     10000


### Train

Define training loop and logging to W&B

In [43]:
def train_test_split_data_loader(dataset, test_size: float = 0.1):
    """Train test split function similar to sklearn applied to torch DataLoaders"""

    train_indices, validation_indices = train_test_split(range(len(dataset)), test_size=test_size, random_state=42)
    
    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    validation_dataset = torch.utils.data.Subset(dataset, validation_indices)
    
    train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
    validation_data_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=128, shuffle=False)
    
    return train_data_loader, validation_data_loader

In [44]:
def train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_function: Callable,
    dataset: Iterable,
    n_epochs: int,
    validation_size: float = 0.1,
) -> torch.nn.Module:
    """
    Train loop for the given torch model
    :note: assumes running on CUDA enabled device
    """
    train_data_loader, validation_data_loader = train_test_split_data_loader(dataset, test_size=validation_size)

    for epoch in range(n_epochs):
        for x, y in train_data_loader:
            model = model.train()
            x = x.cuda()
            y = y.cuda()

            optimizer.zero_grad()
            y_pred = model(x)

            loss_value = loss_function(y_pred, y)
            loss_value.backward()
            optimizer.step()
       
        log_training_progress(model, epoch, validation_data_loader)

    return model


def run(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_function: Callable,
    train_dataset: Iterable,
    test_dataset: Iterable,
    n_epochs: int,
    config: dict[str, Any],
    validation_size=0.1,
) -> None:
    """
    Run training and evaluation loop for the given model and data

    :param model: pytorch model to train
    :param optimizer: pytorch optimizer to use
    :param loss_function: pytorch loss function to use
    :param train_dataset: iterable dataset for training
    :param test_dataset: iterable dataset for testing
    :param n_epochs: number of epochs to train
    :param config: dictionary with configuration for W&B
    :param validation_size: size of the validation set
    """
    with wandb.init(project=config["project_name"], name=config["run_name"]):
        wandb.log(config)
        test_data_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)
        # run training and evaluation
        model = train(model, optimizer, loss_function, train_dataset, n_epochs, validation_size)
        targets, predictions = predict(model, test_data_loader)
        # log metrics and model info to W&B
        wandb.log(evaluate(targets.cpu(), predictions.cpu().argmax(dim=-1)))
        wandb.log({"n_trainable_parameters": count_trainable_parameters(model)})
        # save model to file and upload to W&B
        path = os.path.join(wandb.run.dir, "model.pt")
        torch.save(model.state_dict(), path)
        torch.save(model.state_dict(), "model.pt")
        wandb.save(path)
    
    return model

### Run

Run with W&B logging, training metrics and evaluation of the trained model on test dataset will appear in the interface and the logs in this notebook.

In [45]:
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.CrossEntropyLoss()

config = {
    "project_name": "vision-transformer",  # W&B config
    "run_name": "cnn-baseline",  # W&B config
    "dataset": "MNIST",
    "model": "FeedForward",
    "optimizer": "AdamW",
    "loss_function": "CrossEntropyLoss",
}

In [46]:
trained_transformer = run(
    model,
    optimizer,
    loss_function,
    train_dataset,
    test_dataset,
    n_epochs=30,
    config=config
)



0,1
accuracy,▁▅▆▆▆▆▇▇▇▆▇▇▇▇▇▇▇█▇█▆██▇██▇▆▇
f1_score,▁▅▆▆▆▆▇▇▇▆▇▇▇▇▇▇▇█▇█▆██▇██▇▆▇
n_trainable_parameters,▁
precision,▁
recall,▁

0,1
accuracy,0.9894
dataset,MNIST
f1_score,0.98935
loss_function,CrossEntropyLoss
model,FeedForward
n_trainable_parameters,1199882
optimizer,AdamW
precision,0.9895
project_name,vision-transformer
recall,0.98923
