### Question 2

In [None]:
def get_output_size(input, kernel_size, stride, padding):
    """Return the spatial size of a 2-D convolution output.

    Each of the last two entries of ``input.shape`` is mapped through
    ``(size + 2 * padding - kernel_size) // stride + 1``.

    Args:
        input: Object with a 4-entry ``shape``. The last two entries are the
            spatial dimensions (height then width in PyTorch's NCHW layout).
            The name shadows the builtin but is kept because callers may pass
            it by keyword.
        kernel_size: Kernel extent; an int, or a pair giving one value per
            spatial dimension.
        stride: Step between kernel positions; an int or a pair.
        padding: Zero padding added to both sides of a dimension; an int or
            a pair.

    Returns:
        A 2-tuple with the output extents for ``shape[-2]`` and ``shape[-1]``,
        in that order.
    """
    _, _, height, width = input.shape

    def _pair(value):
        # A bare int applies to both spatial dimensions.
        return (value, value) if isinstance(value, int) else tuple(value)

    k_h, k_w = _pair(kernel_size)
    s_h, s_w = _pair(stride)
    p_h, p_w = _pair(padding)

    out_h = (height + 2 * p_h - k_h) // s_h + 1
    out_w = (width + 2 * p_w - k_w) // s_w + 1
    return (out_h, out_w)

In [None]:
import torch.nn.functional as F
import torch

In [None]:
# Sanity check: compare the analytic output size with the shape that
# F.conv2d actually produces. The input is non-square (26 x 54), so the two
# spatial extents differ.
kernel_size = 5
stride = 2
padding = 2
filters = torch.randn(8, 4, kernel_size, kernel_size)
inputs = torch.randn(1, 4, 26, 54)
output = F.conv2d(inputs, filters, stride=stride, padding=padding)
osize = get_output_size(inputs, kernel_size, stride=stride, padding=padding)
# Last expression of the cell: evaluates to True when both sizes agree.
tuple(output.shape[-2:]) == tuple(osize[-2:])

True

### Question 6

In [None]:
import torch
import torch.nn.functional as F
from torch import nn

class Conv2DFunc(torch.autograd.Function):
    """2-D convolution (cross-correlation, no bias) built from unfold + matmul.

    The forward pass rewrites the convolution as a batched matrix product
    over the patches extracted by ``F.unfold``; the backward pass
    differentiates that product and scatters patch gradients back with
    ``F.fold``.

    Shape legend used in the comments:
        b  batch size            c  input channels
        oc output channels (= number of kernels)
        k  = c * kh * kw (values per patch)
        p  = oh * ow     (number of patches / output positions)
    """

    @staticmethod
    def forward(ctx, X, kernel, stride=1, padding=1):
        """
        Computation Graph: X-(unfold)-> U -(multiply W)-> Y' -(reshape)-> Y

        Args:
            ctx: Autograd context used to pass state to ``backward``.
            X: Input batch of shape (b, c, h, w).
            kernel: Filters of shape (oc, c, kh, kw).
            stride: Int, or a (vertical, horizontal) pair.
            padding: Int, or a (vertical, horizontal) pair of zero padding.

        Returns:
            Tensor of shape (b, oc, oh, ow).

        Raises:
            ValueError: If the channel counts of ``X`` and ``kernel`` differ.
        """
        b, c, h, w = X.shape
        kn, kc, kh, kw = kernel.shape  # kn: nr of kernels, kh/kw: kernel height/width
        if kc != c:
            raise ValueError(
                f"kernel expects {kc} input channels but X has {c}"
            )

        # Accept the same int-or-pair convention as F.unfold / F.fold.
        sh, sw = (stride, stride) if isinstance(stride, int) else stride
        ph, pw = (padding, padding) if isinstance(padding, int) else padding
        oh = (h + 2 * ph - kh) // sh + 1
        ow = (w + 2 * pw - kw) // sw + 1

        U = F.unfold(X, (kh, kw), stride=(sh, sw), padding=(ph, pw))  # (b, k, p)

        # Internal consistency check between the size formula and unfold.
        assert oh * ow == U.shape[2]

        U = U.transpose(1, 2)  # (b, k, p) --> (b, p, k)
        # reshape (not view) so that non-contiguous kernels are accepted too.
        W = kernel.reshape(kn, -1).t()  # (oc, c, kh, kw) --> (k, oc)
        Y_prime = U.matmul(W).transpose(1, 2)  # (b, p, oc) --> (b, oc, p)
        Y = Y_prime.reshape(b, kn, oh, ow)  # (b, oc, oh, ow)

        # Tensors go through save_for_backward so autograd manages their
        # lifetime and can detect in-place modification; plain Python values
        # are stored directly on ctx.
        ctx.save_for_backward(U, W)
        ctx.hw = (h, w)
        ctx.kshape = (kn, c, kh, kw)
        ctx.sp = ((sh, sw), (ph, pw))

        return Y

    @staticmethod
    def backward(ctx, grad_Y):
        """Return gradients w.r.t. ``X`` and ``kernel``.

        Args:
            ctx: Context populated by ``forward``.
            grad_Y: Gradient of the loss w.r.t. the output, (b, oc, oh, ow).

        Returns:
            ``(grad_X, grad_kernel, None, None)``; the trailing ``None`` values
            correspond to ``stride`` and ``padding``. A gradient that autograd
            does not need is returned as ``None`` without being computed.
        """
        U, W = ctx.saved_tensors  # U: (b, p, k), W: (k, oc)
        h, w = ctx.hw
        kn, c, kh, kw = ctx.kshape
        stride, padding = ctx.sp

        # Undo the final reshape/transpose of forward: (b, oc, oh, ow) -> (b, p, oc)
        grad_Y_prime = grad_Y.reshape(grad_Y.shape[0], kn, U.shape[1]).transpose(1, 2)

        input_batch_grad = None
        kernel_grad = None

        if ctx.needs_input_grad[1]:
            # dL/dW = sum over the batch of U^T @ dL/dY'  -> (k, oc)
            kernel_grad = U.transpose(1, 2).matmul(grad_Y_prime).sum(dim=0)
            kernel_grad = kernel_grad.t().reshape(kn, c, kh, kw)

        if ctx.needs_input_grad[0]:
            # dL/dU = dL/dY' @ W^T  -> (b, p, k), then back to unfold layout (b, k, p)
            grad_U = grad_Y_prime.matmul(W.t()).transpose(1, 2)
            # fold sums the contributions of overlapping patches.
            input_batch_grad = F.fold(
                grad_U, (h, w), (kh, kw), stride=stride, padding=padding
            )

        return input_batch_grad, kernel_grad, None, None


# Smoke test: one forward and one backward pass through Conv2DFunc using its
# default stride and padding and a non-square (4 x 5) kernel.
input_batch = torch.randn(16, 3, 32, 32, requires_grad=True)
kernel = torch.randn(2, 3, 4, 5, requires_grad=True)
output = Conv2DFunc.apply(input_batch, kernel)
# Backpropagate a gradient of ones for every output element.
output.backward(gradient=torch.ones_like(output))

### Question 7

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import MNIST

# Seed torch's global RNG so that repeated runs behave the same
torch.manual_seed(42)

# Preprocessing pipeline: convert each image to a tensor, then normalise it
# with mean 0.5 and standard deviation 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# MNIST training and test splits, downloaded into ./data
train = MNIST(root="./data", train=True, transform=transform, download=True)
test = MNIST(root="./data", train=False, transform=transform, download=True)

In [None]:
def _dataset_to_tensors(dataset):
    """Collect ``(instance, label)`` pairs into an instance and a label tensor.

    The dataset is traversed exactly once, so a per-item transform attached to
    it runs once per item rather than once for the instances and once more
    for the labels.

    Args:
        dataset: Iterable yielding ``(instance_tensor, label)`` pairs.

    Returns:
        ``(instances, labels)`` where ``instances`` stacks the instance tensors
        along a new leading dimension and ``labels`` is a 1-D tensor.
    """
    instances, labels = [], []
    for instance, label in dataset:
        instances.append(instance)
        labels.append(label)
    return torch.stack(instances), torch.tensor(labels)


# Split training data into training and validation sets
train_size = 50000
val_size = len(train) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(train, [train_size, val_size])

# Create tensors for training / validation instances and labels
train_instances, train_labels = _dataset_to_tensors(train_dataset)
val_instances, val_labels = _dataset_to_tensors(val_dataset)

In [None]:
# Define Model
# Baseline CNN for the tensors built above. It assumes 1 x 28 x 28 inputs
# (MNIST after ToTensor) and 10 digit classes, and returns raw class scores
# (logits), which is what the argmax in the validation phase expects.
model = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=3, padding=1),   # (b, 1, 28, 28) -> (b, 8, 28, 28)
    nn.ReLU(),
    nn.MaxPool2d(2),                             # -> (b, 8, 14, 14)
    nn.Conv2d(8, 16, kernel_size=3, padding=1),  # -> (b, 16, 14, 14)
    nn.ReLU(),
    nn.MaxPool2d(2),                             # -> (b, 16, 7, 7)
    nn.Flatten(),                                # -> (b, 16 * 7 * 7)
    nn.Linear(16 * 7 * 7, 10),                   # -> (b, 10) logits
)

# Define Loss function
# Cross-entropy takes logits and integer class labels directly.
loss_func = nn.CrossEntropyLoss()

# Define Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Hyperparameters
batch_size = 16
num_epochs = 10

num_train = len(train_instances)
# Ceiling division: the last batch of an epoch may be smaller than batch_size.
steps_per_epoch = (num_train + batch_size - 1) // batch_size

# Training loop
for epoch in range(num_epochs):

    # Training phase
    model.train()
    # `i` is the offset of the batch's first sample; `step` counts batches
    # from 1. Logging is keyed on `step`: `i` only takes multiples of
    # batch_size, so a test on `i + 1` would never fire for an even batch_size.
    for step, i in enumerate(range(0, num_train, batch_size), start=1):
        batch_instances = train_instances[i:i + batch_size]
        batch_labels = train_labels[i:i + batch_size]

        # Forward pass
        outputs = model(batch_instances)

        # Compute loss
        loss = loss_func(outputs, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print training statistics every 100 batches
        if step % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], '
                  f'Step [{step}/{steps_per_epoch}], '
                  f'Loss: {loss.item():.4f}')

    # Validation phase
    model.eval()
    with torch.no_grad():
        # Use the entire validation set at once
        val_outputs = model(val_instances)

        # Compute accuracy: the predicted class is the index of the largest score
        _, predicted = torch.max(val_outputs, 1)
        correct = (predicted == val_labels).sum().item()
        accuracy = correct / len(val_labels)
        print(f'Epoch [{epoch + 1}/{num_epochs}], '
              f'Validation Accuracy: {100 * accuracy:.2f}%')
