# DATASCI 315, Homework 9: Convolutional Neural Networks

In this homework assignment, we'll use PyTorch to implement convolutional neural networks (also known as "convnets" or "CNNs").

## Getting started

To get started, let's import some packages and download an image of a bear.

In [None]:
from pathlib import Path

import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt
from skimage import io as skimage_io
from skimage.color import rgb2gray
from skimage.transform import resize
from torch import nn

In [None]:
# Download the bear photo, cache it as bear.png, and build a half-resolution
# grayscale float32 tensor (bw_bear) that the later cells convolve.
# The download uses the standard library so that this cell does not depend on
# a package that the import cell never brings into scope.
from urllib.request import urlopen

url = (
    "https://ewscripps.brightspotcdn.com/dims4/default/e666c4b/2147483647/strip"
    "/true/crop/1280x720+0+0/resize/1280x720!/quality/90/"
    "?url=https%3A%2F%2Fewscripps.brightspotcdn.com"
    "%2Fb0%2F82%2Fe97f14ff421ca8ca7d8692e0ecdb%2Fgeneric-1280-1.png"
)
# urlopen raises on HTTP error statuses, so a failed request never ends up
# being written to disk as if it were a valid image.
with urlopen(url, timeout=30) as response:
    image_bytes = response.read()
Path("bear.png").write_bytes(image_bytes)

bear = skimage_io.imread("bear.png")
# Halve both spatial dimensions to keep the pure-Python convolution loops fast.
bear = resize(bear, (bear.shape[0] // 2, bear.shape[1] // 2), anti_aliasing=True)
bw_bear = rgb2gray(bear)
# Zero tensor with the same shape as the grayscale image; it is not used by
# any cell shown in this notebook.
bear_edges = torch.zeros(bw_bear.shape)
plt.imshow(bw_bear, cmap="gray")
plt.axis("off")
plt.show()
# Convert the NumPy image to a float32 tensor for the PyTorch cells below.
bw_bear = torch.tensor(bw_bear, dtype=torch.float32)
print(f"The shape of the bear image is : {bw_bear.shape}")

### Problem 1: Convolution with stride

Implement a 2D convolution operation with stride using only `for` loops and basic PyTorch operations (without using the `torch.nn` submodule).

**Requirements:**
- Input `X` has shape `(h_x, w_x)`
- Kernel `K` has shape `(h_k, w_k)`
- Stride `s` is a 2-tuple `(vertical_stride, horizontal_stride)`
- Use only `torch` operations (no `torch.nn`)

**Hint:** The output dimensions are `((h_x - h_k) // s[0] + 1, (w_x - w_k) // s[1] + 1)`.

In [None]:
def my_conv2d(x_input, kernel, stride):
    """Slide ``kernel`` over ``x_input`` and sum the element-wise products.

    No padding is applied and the kernel is not flipped, so every output
    entry is the sum of ``window * kernel`` for one window of the input.

    Args:
        x_input: Input tensor of shape (h_x, w_x)
        kernel: Kernel tensor of shape (h_k, w_k)
        stride: 2-tuple indicating (vertical_stride, horizontal_stride)

    Returns:
        output: float32 tensor of shape
            ((h_x - h_k) // stride[0] + 1, (w_x - w_k) // stride[1] + 1)
    """
    # BEGIN SOLUTION
    kernel_rows, kernel_cols = kernel.shape
    step_down, step_right = stride[0], stride[1]
    out_rows = (x_input.shape[0] - kernel_rows) // step_down + 1
    out_cols = (x_input.shape[1] - kernel_cols) // step_right + 1

    result = torch.zeros((out_rows, out_cols), dtype=torch.float32)
    for out_r in range(out_rows):
        # Top edge of every window in this output row.
        top = out_r * step_down
        for out_c in range(out_cols):
            left = out_c * step_right
            window = x_input[top : top + kernel_rows, left : left + kernel_cols]
            result[out_r, out_c] = (window * kernel).sum()
    return result
    # END SOLUTION

In [None]:
# Test assertions
# The input is the 4x4 grid 0..15 and the kernel is all ones, so every output
# entry is simply the sum of one 2x2 window of the grid.
test_input = torch.arange(16, dtype=torch.float32).reshape(4, 4)
test_kernel = torch.ones((2, 2), dtype=torch.float32)
test_output = my_conv2d(test_input, test_kernel, (1, 1))
assert test_output.shape == (3, 3), f"Expected shape (3, 3), got {test_output.shape}"
assert test_output[0, 0] == 10.0, f"Expected 10.0, got {test_output[0, 0]}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
test_output_stride2 = my_conv2d(test_input, test_kernel, (2, 2))
assert test_output_stride2.shape == (
    2,
    2,
), f"Expected shape (2, 2) with stride 2, got {test_output_stride2.shape}"
assert test_output_stride2[0, 0] == 10.0, f"Expected 10.0 at [0,0], got {test_output_stride2[0, 0]}"
# Bottom-left window covers rows 2-3, cols 0-1: 8 + 9 + 12 + 13 = 42.
assert test_output_stride2[1, 0] == 42.0, f"Expected 42.0 at [1,0], got {test_output_stride2[1, 0]}"
# Bottom-right window covers rows 2-3, cols 2-3: 10 + 11 + 14 + 15 = 50.
assert test_output_stride2[1, 1] == 50.0, f"Expected 50.0 at [1,1], got {test_output_stride2[1, 1]}"
# END HIDDEN TESTS

The convolution operation is very useful in image processing. With the right kernel, you can perform edge detection, blurring, bokeh effects, etc. Run the cells below to see your convolution function in action on the bear image.

In [None]:
# Centre-weighted 5x5 blur weights. The raw integers sum to 273, so they are
# rescaled in place to sum to 1.
blurr_kernel = torch.tensor(
    [
        [1, 4, 7, 4, 1],
        [4, 16, 26, 16, 4],
        [7, 26, 41, 26, 7],
        [4, 16, 26, 16, 4],
        [1, 4, 7, 4, 1],
    ],
    dtype=torch.float32,
)
blurr_kernel.div_(blurr_kernel.sum())

# 3x3 edge kernel: -1 everywhere with 8 in the centre, so the weights sum to
# zero and constant regions of the image map to 0.
edge_kernel = torch.full((3, 3), -1.0, dtype=torch.float32)
edge_kernel[1, 1] = 8.0

In [None]:
# Edge detection demo: convolve the grayscale bear with edge_kernel using a
# stride of 2 in both directions, then display the result and print its shape.
bw_bear_edge = my_conv2d(bw_bear, edge_kernel, (2, 2))
plt.imshow(bw_bear_edge.numpy(), cmap="gray")
plt.axis("off")
plt.show()
print(bw_bear_edge.shape)

In [None]:
# Blur demo: convolve the grayscale bear with the normalized blurr_kernel
# using a stride of 2 in both directions, then display the result and print
# its shape.
bw_bear_blur = my_conv2d(bw_bear, blurr_kernel, (2, 2))
plt.imshow(bw_bear_blur.numpy(), cmap="gray")
plt.axis("off")
plt.show()
print(bw_bear_blur.shape)

### Problem 2: Convolutional layer with padding and stride

Fill in the `MyConv2dModule` class to create a convolutional layer with padding and stride using the `my_conv2d` function from Problem 1. Implement this without the bias term.

**Padding rules:**
- If padding `p` is even, pad each side with `p/2` rows/columns
- If padding for rows is odd, pad the top with `p // 2` rows and the bottom with `p - p // 2` rows
- If padding for columns is odd, pad the left with `p // 2` columns and the right with `p - p // 2` columns

**Hint:** Use `torch.nn.functional.pad()` for padding.

In [None]:
class MyConv2dModule(nn.Module):
    """Bias-free 2D convolution layer built on top of ``my_conv2d``.

    The learnable kernel is drawn from a standard normal distribution. The
    input is zero-padded before the convolution is applied.

    Args:
        kernel_size: 2-tuple (h_k, w_k) for kernel dimensions
        stride: 2-tuple indicating vertical and horizontal stride
        padding: 2-tuple giving the total number of zero rows and zero
            columns to add; each total is split across the two sides, with
            the extra row/column going to the bottom/right when it is odd
    """

    def __init__(self, kernel_size, stride, padding):
        super().__init__()
        # BEGIN SOLUTION
        self.padding = padding
        self.stride = stride
        initial_kernel = torch.randn(kernel_size)
        self.weight = nn.Parameter(initial_kernel)
        # END SOLUTION

    def forward(self, inputs):
        # BEGIN SOLUTION
        row_total, col_total = self.padding[0], self.padding[1]
        pad_top = row_total // 2
        pad_left = col_total // 2
        # F.pad takes the last dimension first: (left, right, top, bottom).
        pad_spec = (pad_left, col_total - pad_left, pad_top, row_total - pad_top)
        padded = F.pad(inputs, pad_spec, mode="constant", value=0.0)
        return my_conv2d(padded, kernel=self.weight, stride=self.stride)
        # END SOLUTION

In [None]:
# Test assertions
# Seed first: the layer draws its kernel from torch.randn, and the expected
# value below depends on that draw.
torch.manual_seed(100)
convlayer = MyConv2dModule((3, 3), (2, 2), (2, 2))
test_x = torch.ones((32, 32))
test_y = convlayer(test_x)
# 32 + 2 padded rows/cols with a 3x3 kernel and stride 2: (34 - 3) // 2 + 1 = 16.
assert test_y.shape == torch.Size([16, 16]), f"Expected shape [16, 16], got {test_y.shape}"
assert torch.isclose(
    test_y[0, 0], torch.tensor(-2.8146), atol=0.01
), f"Expected -2.8146, got {test_y[0, 0]}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
torch.manual_seed(42)
convlayer2 = MyConv2dModule((5, 5), (1, 1), (4, 4))
test_x2 = torch.ones((10, 10))
test_y2 = convlayer2(test_x2)
# 10 + 4 padded rows/cols with a 5x5 kernel and stride 1: 14 - 5 + 1 = 10.
assert test_y2.shape == torch.Size([10, 10]), f"Expected shape [10, 10], got {test_y2.shape}"
# END HIDDEN TESTS

## Neural Network Blocks

A key aspect of deep learning is its modular approach, where neural networks are constructed using discrete building blocks. These blocks, which consist of collections of layers, are combined to form complete models. Let's explore this concept further.

Suppose we want to implement a block with the following architecture:

<img src='https://drive.google.com/uc?id=1Eu6XK96JwDJA0r_qBDkmpU5IihSJQYja' width=400>

To implement this block, we would write code that defines it as a distinct module. This block can then serve as a fundamental component of a more complex neural network. In PyTorch, we would extend the `nn.Module` class. The new class would include at least two functions: `__init__` for initialization and `forward` for executing the forward pass.

In [None]:
class ExampleBlock(nn.Module):
    """Two fully connected layers of width 64, each followed by a ReLU.

    Args:
        in_features: Size of the last dimension of the input tensor
    """

    def __init__(self, in_features):
        super().__init__()
        hidden_width = 64
        self.lin1 = nn.Linear(in_features, hidden_width)
        self.lin2 = nn.Linear(hidden_width, hidden_width)
        self.act = nn.ReLU()

    def forward(self, x):
        """Apply linear -> ReLU -> linear -> ReLU to ``x``."""
        hidden = self.lin1(x)
        hidden = self.act(hidden)
        hidden = self.lin2(hidden)
        return self.act(hidden)

In [None]:
# Run the example block on a random batch. nn.Linear acts on the last
# dimension only, so the 3 input features per position become 64.
blk = ExampleBlock(3)
X = torch.randn((1, 32, 32, 3))
Y = blk(X)
# Last expression of the cell, so the notebook displays the output shape.
Y.shape

### Problem 3: MysteryBlock

Similar to the example, fill in `__init__()` and `forward()` using the following flowchart to implement the `MysteryBlock` class.

<img src='https://drive.google.com/uc?id=1LoZjHA92C2Lk14y0pSHi5RqeY47otBB4' width=400>

**Important:** Initialize the weights from a normal distribution with standard deviation of 0.05 using `nn.init.normal_(layer.weight, mean=0.0, std=0.05)`.

In [None]:
class MysteryBlock(nn.Module):
    """Two-branch conv/dense block that maps a 3-channel image to 10 outputs.

    The input goes through conv -> max-pool -> ReLU. From there one branch
    applies a second conv -> max-pool -> ReLU and a dense layer, the other
    applies a dense layer and a ReLU directly. The two 1024-wide results are
    summed and passed through a final dense layer.

    Args:
        height: Height of the input images, used to size the dense layers
        width: Width of the input images, used to size the dense layers
    """

    def __init__(self, height=32, width=32):
        super().__init__()
        # BEGIN SOLUTION
        channels = 64
        window = 3

        def pooled(size):
            # Length of one spatial axis after a single pass through pool1
            # (window 3, default stride 3, padding 1).
            return (size - 1) // 3 + 1

        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=channels, kernel_size=window, padding="same"
        )
        self.conv2 = nn.Conv2d(
            in_channels=channels, out_channels=channels, kernel_size=window, padding="same"
        )
        self.pool1 = nn.MaxPool2d(kernel_size=window, padding=(window - 1) // 2)

        # Spatial size after one and after two rounds of pooling; the "same"
        # convolutions leave the size untouched.
        once_h, once_w = pooled(height), pooled(width)
        twice_h, twice_w = pooled(once_h), pooled(once_w)

        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(channels * twice_h * twice_w, 1024)
        self.dense2 = nn.Linear(channels * once_h * once_w, 1024)
        self.dense3 = nn.Linear(1024, 10)

        # Re-draw every weight matrix from N(0, 0.05^2); biases keep their
        # default initialization.
        for layer in (self.conv1, self.conv2, self.dense1, self.dense2, self.dense3):
            nn.init.normal_(layer.weight, mean=0.0, std=0.05)
        # END SOLUTION

    def forward(self, x):
        # BEGIN SOLUTION
        stage_one = torch.relu(self.pool1(self.conv1(x)))
        stage_two = torch.relu(self.pool1(self.conv2(stage_one)))
        deep_branch = self.dense1(self.flatten(stage_two))
        shallow_branch = torch.relu(self.dense2(self.flatten(stage_one)))
        return self.dense3(deep_branch + shallow_branch)
        # END SOLUTION

In [None]:
# Test assertions
# Seed first: the expected output value depends on the random initialization
# of the block's layers.
torch.manual_seed(100)
block = MysteryBlock()
test_input = torch.ones((1, 3, 32, 32))
test_output = block(test_input)
assert test_output.shape == torch.Size([1, 10]), f"Expected shape [1, 10], got {test_output.shape}"
assert torch.isclose(
    test_output[0, 0], torch.tensor(-0.6186), atol=0.01
), f"Expected -0.6186, got {test_output[0, 0]}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# A non-default image size checks that the dense layers are sized from the
# height/width arguments rather than hard-coded for 32x32 inputs.
torch.manual_seed(100)
block_64 = MysteryBlock(height=64, width=64)
test_input_64 = torch.ones((2, 3, 64, 64))
test_output_64 = block_64(test_input_64)
assert test_output_64.shape == torch.Size(
    [2, 10]
), f"Expected shape [2, 10], got {test_output_64.shape}"
# END HIDDEN TESTS

### Problem 4: Residual blocks

Similarly, using the following flowchart, fill in `__init__()` and `forward()` for the `ResidualBlock` class.

<img src='https://drive.google.com/uc?id=1P5nKQUh3IA-yGFScOtt9oJb9pTTmB3rC' width=400>

- The left flowchart shows the `ResidualBlock` architecture when `self_conv=False`
- The right flowchart shows the architecture when `self_conv=True`

The batch normalization layer (`nn.BatchNorm2d`) makes training faster and more stable by recentering and rescaling the network inputs.

**Important:** Initialize the weights in the conv layers from a normal distribution with mean 0 and standard deviation 0.05.

In [None]:
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a skip connection around them.

    With ``self_conv=False`` the input itself is added to the main path, so
    ``filters[1]`` has to equal ``in_channels``. With ``self_conv=True`` the
    first convolution uses stride 2 and the skip path is a stride-2 1x1
    convolution followed by batch norm, so ``filters[2]`` has to equal
    ``filters[1]``.

    Args:
        filters: List of filter counts for conv layers
        self_conv: If True, uses conv+batchnorm on skip connection
        in_channels: Number of input channels
    """

    def __init__(self, filters, self_conv, in_channels=3):
        super().__init__()
        # BEGIN SOLUTION
        self._self_conv = self_conv
        # Only the projection variant downsamples; the second conv always
        # uses stride 1.
        first_stride = 2 if self_conv else 1

        self.conv1 = nn.Conv2d(
            in_channels, filters[0], kernel_size=3, stride=first_stride, padding=1
        )
        self.conv2 = nn.Conv2d(filters[0], filters[1], kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=filters[0])
        self.bn2 = nn.BatchNorm2d(num_features=filters[1])

        conv_layers = [self.conv1, self.conv2]
        if self_conv:
            self.conv3 = nn.Conv2d(
                in_channels, filters[2], kernel_size=1, stride=first_stride, padding=0
            )
            self.bn3 = nn.BatchNorm2d(num_features=filters[2])
            conv_layers.append(self.conv3)

        self.act = nn.ReLU()

        # Re-draw every conv weight from N(0, 0.05^2); biases keep their
        # default initialization.
        for conv in conv_layers:
            nn.init.normal_(conv.weight, mean=0.0, std=0.05)
        # END SOLUTION

    def forward(self, x):
        # BEGIN SOLUTION
        main_path = self.act(self.bn1(self.conv1(x)))
        main_path = self.bn2(self.conv2(main_path))
        shortcut = self.bn3(self.conv3(x)) if self._self_conv else x
        return self.act(main_path + shortcut)
        # END SOLUTION

In [None]:
# Test assertions
torch.manual_seed(100)
test_input = torch.ones((10, 3, 32, 32))
# Identity skip connection: channels and spatial size must be unchanged.
block_2 = ResidualBlock([16, 3], self_conv=False, in_channels=3)
output_2 = block_2(test_input)
assert output_2.shape == torch.Size(
    [10, 3, 32, 32]
), f"Expected shape [10, 3, 32, 32], got {output_2.shape}"

# Convolutional skip connection: stride 2 halves the spatial size and the
# channel count becomes the filter count.
block_3 = ResidualBlock([8, 8, 8], self_conv=True, in_channels=3)
output_3 = block_3(test_input)
assert output_3.shape == torch.Size(
    [10, 8, 16, 16]
), f"Expected shape [10, 8, 16, 16], got {output_3.shape}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
torch.manual_seed(42)
block_hidden = ResidualBlock([32, 32], self_conv=False, in_channels=32)
hidden_input = torch.randn((4, 32, 16, 16))
hidden_output = block_hidden(hidden_input)
assert hidden_output.shape == torch.Size(
    [4, 32, 16, 16]
), f"Expected shape [4, 32, 16, 16], got {hidden_output.shape}"
# END HIDDEN TESTS

## ResNet18

ResNet (Residual Network) is a widely used convolutional neural network architecture that makes use of skip connections. These connections add the original input to the output of a series of convolutional layers, ensuring that useful information from earlier layers is preserved even if later layers fail to extract meaningful features.

In this problem, you will build ResNet18, a variant that is 18 layers deep: 17 convolutional layers on the main path plus one fully connected output layer (the 1x1 convolutions on the skip connections are not counted). The model will be trained on the CIFAR-10 dataset, which contains 60,000 color images of size 32x32 pixels (3 channels) across 10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck.

In [None]:
import torchvision
from torchvision import transforms

# Only convert the PIL images to tensors here; augmentation is applied later,
# inside the model's forward pass.
transform = transforms.Compose([transforms.ToTensor()])

# Download CIFAR-10 into ./data (if it is not there yet) and wrap both splits.
trainset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform
)
testset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transform
)

batch_size = 256
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, num_workers=2, shuffle=True
)
# we'll use the test data for validation
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=True, num_workers=2
)

# Short display names for the 10 classes, in label order.
classes = (
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)

# trainset.classes is the list of class names, so report its length here.
print(f"Number of classes = {len(trainset.classes)}")
print(f"Original train size = {len(trainset)}, test size = {len(testset)}")
print(
    f"Samples in train dataloader = {len(trainloader.sampler)}, "
    f"in validation dataloader = {len(testloader.sampler)}"
)

In [None]:
# Peek at a single batch to confirm the tensor shapes, then stop iterating.
for x, y in trainloader:
    print(f"Batch shape = {x.shape}, target shape  {y.shape}")
    break

Here is a helper function for training a network.

In [None]:
def train(model, train_dataloader, val_dataloader, criterion, lr=0.03, num_epochs=10):
    """Train ``model`` with Adam and print loss/accuracy after every epoch.

    The model is moved to the GPU when CUDA is available and otherwise stays
    on the CPU. Each epoch runs one pass over ``train_dataloader`` in
    training mode, followed by one pass over ``val_dataloader`` in evaluation
    mode with gradients disabled. Evaluation mode matters for layers such as
    batch norm: it makes them use their running statistics and stops the
    validation data from updating those statistics.

    Args:
        model: The ``nn.Module`` to optimize; its output is passed to
            ``criterion`` and to ``argmax`` over dimension 1
        train_dataloader: Loader yielding batches whose first two elements
            are (inputs, labels); must expose ``.dataset``
        val_dataloader: Loader of the same form used for validation
        criterion: Loss function called as ``criterion(outputs, labels)``
        lr: Learning rate for the Adam optimizer
        num_epochs: Number of passes over the training data

    Returns:
        (train_loss, val_loss): two lists with one entry per epoch, each the
        mean of the per-batch losses for that epoch. Accuracies are printed
        but not returned. When at least one epoch runs, the model is left in
        evaluation mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_loss = []
    train_accuracy = []
    val_loss = []
    val_accuracy = []

    for epoch in range(num_epochs):
        t_loss = 0.0
        t_correct = 0
        v_loss = 0.0
        v_correct = 0

        # training phase
        # Re-enable training mode every epoch because the validation phase
        # below switches the model to evaluation mode.
        model.train()
        for data in train_dataloader:
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            t_loss += loss.item()
            with torch.no_grad():
                predicted_class = torch.argmax(outputs, dim=1)
                t_correct += (predicted_class == labels).sum().item()

        # evaluation on val
        model.eval()
        with torch.no_grad():
            for data in val_dataloader:
                inputs, labels = data[0].to(device), data[1].to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                v_loss += loss.item()
                predicted_class = torch.argmax(outputs, dim=1)
                v_correct += (predicted_class == labels).sum().item()

        # Losses are averaged over batches, accuracies over individual samples.
        train_loss.append(t_loss / len(train_dataloader))
        val_loss.append(v_loss / len(val_dataloader))
        train_accuracy.append(t_correct / len(train_dataloader.dataset))
        val_accuracy.append(v_correct / len(val_dataloader.dataset))

        print(
            f"Epoch={epoch}: "
            f"training loss = {train_loss[-1]:.4f} "
            f"accuracy = {train_accuracy[-1]:.2f}, "
            f"validation loss = {val_loss[-1]:.4f}, "
            f"accuracy = {val_accuracy[-1]:.2f}"
        )

    return train_loss, val_loss

### Problem 5: Data augmentation

Before training the ResNet model, we need to apply data augmentation to prevent overfitting.

Create a transform using `torchvision.transforms.Compose` that has two stages:
1. Random horizontal flips (with probability 0.5)
2. Random rotations between -30 and 30 degrees

**Hint:** Use `transforms.RandomHorizontalFlip` and `transforms.RandomRotation`.

In [None]:
# Augmentation pipeline: a random horizontal flip followed by a random rotation.
data_aug_preprocess = transforms.Compose(
    [  # SOLUTION
        transforms.RandomHorizontalFlip(p=0.5),  # flip with probability 0.5
        transforms.RandomRotation(degrees=(-30, 30)),  # angle range in degrees
    ]
)

In [None]:
# Test assertions
# A Compose object exposes its stages through the .transforms attribute.
assert hasattr(data_aug_preprocess, "transforms"), "data_aug_preprocess should be a Compose object"
assert len(data_aug_preprocess.transforms) == 2, "Should have exactly 2 transforms"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Check the stage types by class name; the order of the stages is not checked.
transform_types = [type(t).__name__ for t in data_aug_preprocess.transforms]
assert "RandomHorizontalFlip" in transform_types, "Missing RandomHorizontalFlip"
assert "RandomRotation" in transform_types, "Missing RandomRotation"
# END HIDDEN TESTS

### Problem 6: ResNet18 class

Use the following flowchart and the `ResidualBlock` class from Problem 4 to implement ResNet18.

<img src='https://drive.google.com/uc?id=1y6DHU48iu54YAeg7ywx7_3zyZKHnbsdY' width=700>

**Notes:**
- The "Skip Connection" parts are `ResidualBlocks` with `self_conv=True`
- This flowchart doesn't include the output layer - include an appropriate output layer for image classification
- The `data_aug` parameter should be applied in the forward pass

**Important:** If you're using Google Colab, make sure to go to "Edit" > "Notebook Settings" and select "GPU" before running the training cells.

In [None]:
class ResNet18(nn.Module):
    """ResNet18 architecture for image classification.

    The network is a 7x7 stride-2 convolution with batch norm, ReLU and 2x2
    max pooling, followed by eight ``ResidualBlock`` modules (64, 64, 128,
    128, 256, 256, 512, 512 filters) and a linear output layer. The blocks
    that increase the filter count use ``self_conv=True`` and therefore halve
    the spatial size.

    The data augmentation transform is applied at the start of the forward
    pass, but only while the module is in training mode. After ``.eval()``
    the input is used unchanged, so validation and inference do not see
    random flips or rotations.

    The size of the output layer is computed as ``image_dim // 32``, which
    matches the final feature map when ``image_dim`` is a multiple of 32.

    Args:
        num_classes: Number of output classes
        data_aug: Data augmentation transform to apply during training;
            ``None`` disables augmentation
        in_channels: Number of input channels (default: 3 for RGB)
        image_dim: Input image dimension (default: 32 for CIFAR-10)
    """

    def __init__(self, num_classes, data_aug, in_channels=3, image_dim=32):
        super().__init__()
        self.image_dim = image_dim
        self.da = data_aug
        # BEGIN SOLUTION
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            padding=3,
            stride=2,
        )
        self.before_res_block_bn = nn.BatchNorm2d(num_features=64)
        self.maxpooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=0)
        self.resnetblock_1 = ResidualBlock([64, 64], self_conv=False, in_channels=64)
        self.resnetblock_2 = ResidualBlock([64, 64], self_conv=False, in_channels=64)
        self.resnetblock_3 = ResidualBlock([128, 128, 128], self_conv=True, in_channels=64)
        self.resnetblock_4 = ResidualBlock([128, 128], self_conv=False, in_channels=128)
        self.resnetblock_5 = ResidualBlock([256, 256, 256], self_conv=True, in_channels=128)
        self.resnetblock_6 = ResidualBlock([256, 256], self_conv=False, in_channels=256)
        self.resnetblock_7 = ResidualBlock([512, 512, 512], self_conv=True, in_channels=256)
        self.resnetblock_8 = ResidualBlock([512, 512], self_conv=False, in_channels=512)
        # Kernel size 1 means this pooling layer leaves the feature map as is.
        self.avgpooling = nn.AvgPool2d(kernel_size=1)
        self.flat = nn.Flatten()
        # Five stride-2 stages (conv1, max pool, three self_conv blocks)
        # shrink each spatial dimension by a factor of 2**5.
        out_size = image_dim // 2**5
        self.dense = nn.Linear(512 * out_size**2, num_classes)
        self.relu = nn.ReLU()
        # Plain list used only to iterate in forward(); the blocks are already
        # registered as submodules through the attributes assigned above.
        self.blocks = [
            self.resnetblock_1,
            self.resnetblock_2,
            self.resnetblock_3,
            self.resnetblock_4,
            self.resnetblock_5,
            self.resnetblock_6,
            self.resnetblock_7,
            self.resnetblock_8,
        ]
        # END SOLUTION

    def forward(self, x):
        # BEGIN SOLUTION
        output = x
        # Random augmentation is a training-time regularizer; skip it in
        # evaluation mode so predictions are made on the unmodified input.
        if self.training and self.da is not None:
            output = self.da(output)
        output = self.maxpooling(self.relu(self.before_res_block_bn(self.conv1(output))))
        for res_block in self.blocks:
            output = res_block(output)
        return self.dense(self.flat(self.avgpooling(output)))
        # END SOLUTION

In [None]:
# Test assertions
# Shape-only check: a batch of 4 CIFAR-sized images must map to 4 x 10 outputs.
model = ResNet18(10, data_aug_preprocess, in_channels=3, image_dim=32)
test_input = torch.randn(4, 3, 32, 32)
test_output = model(test_input)
assert test_output.shape == torch.Size([4, 10]), f"Expected shape [4, 10], got {test_output.shape}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# A larger image size checks that the output layer is sized from image_dim.
model_64 = ResNet18(10, data_aug_preprocess, in_channels=3, image_dim=64)
test_input_64 = torch.randn(2, 3, 64, 64)
test_output_64 = model_64(test_input_64)
assert test_output_64.shape == torch.Size(
    [2, 10]
), f"Expected shape [2, 10] for 64x64 input, got {test_output_64.shape}"
# END HIDDEN TESTS

Now train the network. First check that CUDA is available: the `train` function is written to run on a GPU, and without one it will either raise an error or be far too slow to be practical.

In [None]:
# Number of batches per epoch in the training and validation loaders.
len(trainloader), len(testloader)

In [None]:
# Displays True when PyTorch can see a CUDA-capable GPU.
torch.cuda.is_available()

In [None]:
# Build a fresh ResNet18 for the 10 CIFAR-10 classes and train it for 10
# epochs, keeping the per-epoch loss histories for the plot below.
model = ResNet18(10, data_aug_preprocess, in_channels=3, image_dim=32)
train_loss, val_loss = train(
    model, trainloader, testloader, nn.CrossEntropyLoss(), lr=0.03, num_epochs=10
)

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation losses on the same axes.
plt.plot(train_loss, label="train")
plt.plot(val_loss, label="val")
plt.legend()
plt.show()