In [1]:
# Dataset: CIFAR10

# Architecture: CNN:
    # • Conv2D (in channels=3, out channels=32, kernel size=3, padding=1) → ReLU → MaxPool2D (kernel size=2, stride=2)
    # • Conv2D (32 → 64, kernel size=3, padding=1) → ReLU → MaxPool2D (kernel size=2, stride=2)
    # • Conv2D (64 → 128, kernel size=3, padding=1) → ReLU → MaxPool2D (kernel size=2, stride=2)
    # • Flatten → Linear(128 × 4 × 4 = 2048 → 256) → ReLU → Linear(256 → 10)
# Loss: CrossEntropyLoss
# Optimizer: Adam (lr=0.001)

# Training hyperparameters: samples per mini-batch and the optimizer learning rate
batch_size, learning_rate = 64, 1e-3

from tqdm import tqdm

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Preprocessing applied to every image: PIL image -> tensor, then per-channel normalization
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        (0.4914, 0.4822, 0.4465),  # per-channel mean
        (0.2470, 0.2435, 0.2616),  # per-channel std
    ),
])

# CIFAR-10 train / test splits rooted at ./data (download=True requests a download);
# both splits go through the same transform
train_dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10('./data', train=False, download=True, transform=transform)

# Mini-batch iterators: the training loader shuffles, the test loader does not
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [2]:
# Define the CNN model
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CNN(nn.Module):
    """Three-stage convolutional classifier that emits 10 logits per image.

    Each stage is Conv2d(kernel_size=3, padding=1) -> ReLU -> 2x2 max-pool, with
    channels growing 3 -> 32 -> 64 -> 128. The head is
    Linear(128 * 4 * 4 -> 256) -> ReLU -> Linear(256 -> 10), so the network
    expects 32x32 inputs (three 2x2 pools reduce 32 to 4).
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor; padding=1 with a 3x3 kernel keeps height/width unchanged
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        # Classifier head: flattened 128*4*4 features -> 256 hidden units -> 10 class logits
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: image batch of shape (N, 3, 32, 32); a single unbatched image of
               shape (3, 32, 32) is treated as a batch of one.

        Returns:
            Tensor of shape (N, 10) holding unnormalized class scores.

        Raises:
            RuntimeError: if the spatial size is not 32x32, because the
                flattened features no longer match the first linear layer.
        """
        # Promote an unbatched image to a batch of one so the result is (1, 10)
        if x.dim() == 3:
            x = x.unsqueeze(0)
        # Three conv -> ReLU -> 2x2 max-pool stages
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)
        # Flatten per sample (keep dim 0). Reshaping with a free leading dimension
        # would silently turn surplus spatial positions into extra "samples" for
        # wrong-sized inputs; this way the mismatch surfaces as an error in fc1.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        # Final layer returns raw logits (no softmax applied here)
        x = self.fc2(x)
        return x
    

In [3]:
# init:
# Select the device first and move the model onto it before building the
# optimizer, so the optimizer is constructed over the parameters where they
# will actually be trained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
# NOTE: `loss` holds the criterion module itself; bind per-batch loss values to
# a different name so this callable is not overwritten.
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Report the device that was actually selected above
if device.type == "cuda":
    print("CUDA is available. Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")

# train over 20 epochs:

# plot training loss over epochs

# plot training and testing accuracy over epochs





CUDA is available. Training on GPU.


In [4]:
# 2. 
    # different hyperparameters: for each of the following, train the model and plot the training loss and accuracy over epochs:

# a) batch size = 64, learning rate = [0.01, 0.001, 0.0001]
# b) batch size = [32, 64, 128], learning rate = 0.001
# c) batch size = 64, learning rate = 0.001, optimizer = RMSProp
    # Describe observations in the difference between RMSProp and Adam optimizers

In [5]:
# 3. 
    # for batch size = 64, learning rate = 0.001, modify architecture:

# a) add a Dropout layer (p=0.5) after the first Linear layer
    # plot training loss, training accuracy, and testing accuracy over epochs
    # discuss observations
# b) add a BatchNorm2d layer before each MaxPool2d layer
    # num_features arg of BatchNorm2d should match out_channels of preceding Conv2d layer
    # plot training loss, training accuracy, and testing accuracy over epochs
    # discuss observations