# Exercise: Debugging TinyAlexNet

Alex is asked to build a simple binary classifier to distinguish between images of dogs and cats.
The requirements are:
- The model receives a batch of RGB (color) images. The width and height of each image are both 10.
- The model should have the following architecture:
    - A 2-D convolutional layer with 5 output channels, using a 3x3 kernel and a stride of 1
    - A ReLU activation function
    - A linear layer to predict 2 classes
- Alex should perform a single training step (forward pass, loss calculation, backward pass, and parameter update) using the Adam optimizer (lr=0.1) on a batch of 4 images.

Alex provides the following code but unfortunately there are 6 bugs in it.
Please find all of them.

In [4]:
import torch
import torch.nn as nn

class TinyAlexNet(nn.Module):
    """Tiny CNN mapping 3x10x10 RGB images to logits for 2 classes.

    Pipeline:
        Conv2d(3 -> 5, kernel 3x3, stride 1, padding 0)
        -> ReLU
        -> MaxPool2d(kernel (1, 2), stride (1, 1), padding 0)
        -> flatten to 280 features
        -> Linear(280 -> 2)

    Output-size formula used for both Conv2d and MaxPool2d, applied
    independently to height and width:

        out = floor((in + 2 * padding - kernel_size) / stride) + 1

    NOTE(review): the exercise statement lists only Conv2d -> ReLU -> Linear;
    the MaxPool2d layer is kept here unchanged -- confirm it is intended.
    """

    # Per-sample feature shape (channels, height, width) produced by
    # ``feat_extractor`` for a 3x10x10 input; derived step by step in __init__.
    _FEAT_SHAPE = (5, 8, 7)
    # Flattened size fed to the linear layer: channels * height * width.
    _FLAT_FEATURES = 5 * 8 * 7  # = 280

    def __init__(self):
        super().__init__()

        self.feat_extractor = nn.Sequential(
            # Conv2d: (N, 3, 10, 10) -> (N, 5, 8, 8)
            #   out_h = out_w = floor((10 + 2*0 - 3) / 1) + 1 = 8
            nn.Conv2d(
                in_channels=3,
                out_channels=5,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=0,
            ),  # [BUG 2]
            nn.ReLU(),
            # MaxPool2d: (N, 5, 8, 8) -> (N, 5, 8, 7)
            #   out_h = floor((8 + 2*0 - 1) / 1) + 1 = 8  (kernel 1: no reduction)
            #   out_w = floor((8 + 2*0 - 2) / 1) + 1 = 7
            nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 1), padding=0),
        )

        # Flattened size = 5 * 8 * 7 = 280, hence in_features=280.
        self.pred = nn.Linear(
            in_features=self._FLAT_FEATURES, out_features=2
        )  # [BUG 3]

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (batch_size, 2).

        Args:
            input: Image tensor of shape (batch_size, 3, 10, 10). A single
                unbatched (3, 10, 10) image yields a (1, 2) result.

        Raises:
            RuntimeError: If the extracted features are not 5x8x7 per sample,
                i.e. the input images are not 10x10.
        """
        feat = self.feat_extractor(input)

        # A bare reshape(-1, 280) succeeds whenever the total element count is
        # divisible by 280, silently mixing values from different samples
        # (e.g. a (7, 3, 10, 9) input would come out as 6 rows). Check the
        # per-sample shape first; RuntimeError matches PyTorch's own
        # shape-mismatch errors.
        if tuple(feat.shape[-3:]) != self._FEAT_SHAPE:
            raise RuntimeError(
                f"expected per-sample features of shape {self._FEAT_SHAPE}, "
                f"got {tuple(feat.shape[-3:])}; input images must be 3x10x10"
            )

        # (batch_size, 5, 8, 7) -> (batch_size, 280)
        emb = feat.reshape(-1, self._FLAT_FEATURES)
        return self.pred(emb)

# prepare data
# One batch of 4 random RGB images of size 10x10, shape (4, 3, 10, 10).
input_data = torch.randn((4, 3, 10, 10)) 
# One integer class index (0 or 1) per image in the batch.
targets = torch.tensor([0, 1, 0, 1])

# prepare model, optimizer and loss function
model = TinyAlexNet()
# Adam over all model parameters with the learning rate required by the exercise.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# Cross-entropy between the model's 2-class outputs and the integer targets.
loss_fn = nn.CrossEntropyLoss()

# Clear any previously accumulated gradients before the backward pass.
optimizer.zero_grad()

# training step: forward pass, loss calculation, backward pass, parameter update
output = model(input_data)
loss = loss_fn(output, targets)
loss.backward()
optimizer.step()