In [None]:
import numpy as np
import matplotlib.pylab as plt
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from torch.utils.data import Dataset, DataLoader

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.use_deterministic_algorithms(True)

# Task 1: Digit Image Classifier

In [8]:
# plot loss history
def plot_history(train_history, test_history=None, ylabel=""):
    plt.figure(figsize=(10, 6))

    train_history = np.array(train_history)
    plt.plot(train_history[:, 0], train_history[:, 1], label="Train history")

    if test_history:
        test_history = np.array(test_history)
        plt.plot(test_history[:, 0], test_history[:, 1], label="Test history")

    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

def evaluate(X_test, y_test):
    # Evaluate the model using the test set
    classifier.eval()
    correct = 0
    eval_loss = 0.0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for X, y in zip(X_test, y_test):
            # Unpack batch dimensions
            X = X.unsqueeze(0)
            y = y.unsqueeze(0)
            # Calculate outputs by running images through the network
            y_pred = classifier(X)
            loss = softmax_loss(y_pred, y)
            # The predicted class label is the maximum prediction
            predicted = torch.argmax(y_pred, 1)
            true = torch.argmax(y, 1)
            correct += (predicted == true).sum().item()
            eval_loss += loss

    eval_loss /= X_test.shape[0]

    return eval_loss, correct

## a) Load and Visualize MNIST Dataset: Utilize sklearn’s datasets to load the MNIST digit dataset, where each image is composed of $8 \times 8$ pixels and has grayscale values, and plot some samples to understand what the images look like.

In [1]:
# TODO: Load small MNIST dataset using sklearn.datasets.load_digits.

# TODO: plot example images from MNIST dataset

### 1. Convert labels to one-hot encodings for classification:

In [2]:
# TODO: Convert y to be one-hot encoded (this is important for classification!)


### 2. Add a channel dimension to input data, as PyTorch expects inputs of the shape [B, C, H, W (, D)] where B is the batch dimension, C the number of channels of the input, H the image height, W the image width, and, for 3D CNNs, D the depth of the image.

In [3]:
# TODO: Add channel dimension to X (Before (B, H, W), After (B, 1, H, D))
# hint: as we have grayscale images the number of channels is one

### 3. Split the dataset into training and test sets using sklearn’s train_test_split, ensuring a reasonable ratio and shuffling the data.

In [4]:
# TODO: Split up the dataset into train and test sets with a reasonable amount of test samples.

### 4. Convert the data for PyTorch: Transform the numpy arrays into PyTorch tensors, preparing them for model input.

In [6]:
# TODO: Convert numpy arrays to torch tensors

## c) Build the CNN model architecture using PyTorch:
1. Create a CNN class inheriting from nn.Module.
2. Define the architecture considering parameters like the input in_channels, n_conv, n_filters, n_-
classes, and kernel_size. Consider how many convolutional layers to choose, as well as using pooling layers. For the convolutional layers choose an appropriate border padding
mode that preserves the feature maps’ shape dimensions during feature extraction.
3. Use ReLU activations for intermediate layers and the softmax activation for the output layer. The
softmax function $z(x_i)$ is defined as
<center>$z(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}$

  for each element $x_i$ in the input vector. In multi-class classification, softmax is used at the output layer because it transforms the outputs into probability distributions. Each output value represents the probability that the input belongs to one of the classes. This is particularly useful for classification tasks like digit recognition, where each class (digit) is mutually exclusive.

In [5]:
class CNN(nn.Module):
    def __init__(self, in_channels=1, n_conv=8, n_filters=128, n_classes=10, kernel_size=(3, 3)):
        super(CNN, self).__init__()
        # TODO: define the network architecture

        self.n_conv = n_conv

    def forward(self, x):
        # TODO: define how input is propagated through the layers AND activation functions
        return x

    # Count trainable weights
    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

NameError: name 'nn' is not defined

## e) Define hyperparameters for training. Therefore, set the number of epochs to train, the learning rate, and other relevant parameters, such as the number of convolutional filters and the number of hidden units. (There is not one correct solution. You can try out different parameters and see what the outcome of your training is.)

In [4]:
# TODO: Define hyperparameters

## f) Initialize and analyze the model architecture: Print the number of trainable parameters, choose an optimizer like Stochastic Gradient Descent (SGD) that accepts both the learning rate and model parameters, and define the loss function (cross entropy loss). Check initial loss before training and evaluate the model on the test set using the evaluate(...) function of the CNN.

In [7]:
# TODO: Initialize CNN

# TODO: Print number of trainable weights

# TODO: Setup optimizer and define loss function

# TODO: Evaluate the classification model on testset

## g) Training Process: Implement the loop over n epochs. For each epoch:
1. Zero the parameter gradients.
2. Forward pass the training images to calculate predictions.
3. Compute loss between prediction y_pred and y_train.
4. Perform a backward pass.
5. Update network parameters.
6. Evaluate on the test set periodically, log and track the training and test losses.

In [9]:
# TODO: Train the model using the training set
# store training and testing loss and accuracy history for later visualization

# TODO: implement training loop
for epoch in range(0, n_epochs):
    # TODO: zero the parameter gradients

    # TODO: forward pass

    # TODO: calculate loss

    # TODO: Backward pass

    # TODO: Optimize

    # TODO: Track and log losses

# TODO: Evaluate the classification model on testset
# you can use the function evaluate() for this

SyntaxError: incomplete input (<ipython-input-9-c0b336c8d1e3>, line 19)

## g) Plot results of the training and test loss history over epochs and plot the accuracy on the test set over epochs. If your training is not successful try to find possible solutions. Maybe your network size is to small. Try changing the number of filters in each convolutional layer or the number of convolutional layers.

In [1]:
# TODO: Plot loss history over epochs
# you can use the function plot_history for this
