In [1]:
# pip install torchvision

In [4]:
# import necessary libraries
import os
import math
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as T
import matplotlib.pyplot as plt

In [5]:
def compute_channel_stats(data_dir="./data"):
    """
    Compute mean and std of CIFAR-10 training set.
    Returns two lists: mean, std (each of length 3 for RGB).
    """
    # Load train set
    train_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True,
        transform=T.ToTensor()
    )
    loader = DataLoader(train_set, batch_size=5000, shuffle=False, num_workers=2)

    mean = 0.
    std = 0.
    nb_samples = 0

    for data, _ in loader:
        # data shape: [batch, channels, height, width]
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # flatten H*W
        mean += data.mean(2).sum(0)
        std  += data.std(2).sum(0)
        nb_samples += batch_samples

    mean /= nb_samples
    std /= nb_samples

    return mean.tolist(), std.tolist()

In [6]:
def get_cifar10_loaders(
    data_dir="./data",
    batch_size=128,
    num_workers=2,
    drop_last=False
):
    # compute mean/std
    mean, std = compute_channel_stats(data_dir)
    print("CIFAR-10 stats:", mean, std)

    train_tfms = T.Compose([
        T.RandomCrop(32, padding=2),
        T.RandomHorizontalFlip(), 
        T.ToTensor(),
        T.Normalize(mean, std),
    ])
    test_tfms = T.Compose([
        T.ToTensor(),
        T.Normalize(mean, std),
    ])

    train_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=train_tfms
    )
    test_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=test_tfms
    )

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=True, drop_last=drop_last
    )
    test_loader = DataLoader(
        test_set, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True, drop_last=False
    )
    return train_loader, test_loader, train_set.classes

In [7]:
# Simple CNN model with less parameters
class CNN1(nn.Module):
    """
    Two conv blocks + 2x2 pools → 8x8 feature map.
    Head: 4096 → 128 → 10
    """
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 32x16x16
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 64x8x8
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*8*8, 128), nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
        )
    def forward(self, x): return self.classifier(self.features(x))


In [8]:
# Medium CNN model with parameters in medium range
class CNN2(nn.Module):
    """
    Two blocks with doubled channels and BN:
    Block1: 3→64→64 → pool (16x16)
    Block2: 64→128→128 → pool (8x8)
    Head: 8192 → 256 → 10
    """
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 64x16x16

            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 128x8x8
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128*8*8, 256), nn.ReLU(inplace=True),
            nn.Linear(256, num_classes),
        )
    def forward(self, x): return self.classifier(self.features(x))


In [9]:
# Complex CNN model with high number of parameters
class CNN3(nn.Module):
    """
    Three blocks with more channels:
    B1: 3→64→64 → pool (16x16)
    B2: 64→128→128 → pool (8x8)
    B3: 128→256→256 → pool (4x4)
    Head: 4096 → 512 → 10
    """
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 64x16x16

            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 128x8x8

            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 256x4x4
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256*4*4, 512), nn.ReLU(inplace=True),
            nn.Linear(512, num_classes),
        )
    def forward(self, x): return self.classifier(self.features(x))


In [10]:
def count_params(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [11]:
def train_one_epoch(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = loss_fn(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += xb.size(0)
    return total_loss / total, correct / total

@torch.no_grad()
def evaluate(model, loader, loss_fn, device):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = loss_fn(logits, yb)
        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += xb.size(0)
    return total_loss / total, correct / total


In [12]:
def plot_curves(history_dict, epochs, ylabel, title, out_path):
    plt.figure(figsize=(7,5))
    for name, values in history_dict.items():
        plt.plot(range(1, epochs+1), values, label=name, linewidth=2)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True, linewidth=0.3)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


In [17]:
def main():
    
    seed=42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Hyperparameters
    BATCH_SIZE   = 128
    EPOCHS       = 100
    LR           = 1e-3
    WEIGHT_DECAY = 5e-4  

    # Data
    train_loader, test_loader, class_names = get_cifar10_loaders(
        data_dir="./data", batch_size=BATCH_SIZE, num_workers=2, drop_last=False
    )
    print("Classes:", class_names)

    # Models
    models = {
        "CNN Model 1":  CNN1().to(device),
        "CNN Model 2": CNN2().to(device),
        "CNN Model 3":  CNN3().to(device),
    }
    for name, m in models.items():
        print(f"{name}: {count_params(m):,} params")

    loss_fn = nn.CrossEntropyLoss() # Cross Entropy is used as the loss function

    # Histories
    train_losses = {k: [] for k in models.keys()}
    train_accs   = {k: [] for k in models.keys()}

    # Train all models
    for name, model in models.items():
        print(f"\n     Training {name} Model\n")
        optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

        for epoch in range(1, EPOCHS+1):
            tr_loss, tr_acc = train_one_epoch(model, train_loader, optimizer, loss_fn, device)

            train_losses[name].append(tr_loss)
            train_accs[name].append(tr_acc)

            print(f"Epoch {epoch:02d}/{EPOCHS} | "
                  f"Train Loss: {tr_loss:.4f} Acc: {tr_acc*100:5.2f}%")

    # training Loss and training Accuracy plots 
    plot_curves(train_losses, EPOCHS, ylabel="Training Loss",
                title="Model training loss",
                out_path="HW_1-1-TrainOnActualDataset/Task1/cifar_loss.png")

    plot_curves(train_accs, EPOCHS, ylabel="Training Accuracy",
                title="Model training accuracy",
                out_path="HW_1-1-TrainOnActualDataset/Task1/cifar_train_acc.png")

    print("\nSaved figures:")
    print(" - cifar_loss.png")
    print(" - cifar_train_acc.png")


In [18]:
if __name__ == "__main__":
    main()

Device: cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:02<00:00, 57.9MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data




CIFAR-10 stats: [0.4913996756076813, 0.4821583926677704, 0.44653093814849854] [0.20230092108249664, 0.19941280782222748, 0.20096160471439362]
Files already downloaded and verified
Files already downloaded and verified
Classes: ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
CNN Model 1: 545,098 params
CNN Model 2: 2,360,906 params
CNN Model 3: 3,249,994 params

     Training CNN Model 1 Model

Epoch 01/100 | Train Loss: 1.4411 Acc: 48.29%
Epoch 02/100 | Train Loss: 1.0873 Acc: 61.83%
Epoch 03/100 | Train Loss: 0.9805 Acc: 65.46%
Epoch 04/100 | Train Loss: 0.9109 Acc: 68.11%
Epoch 05/100 | Train Loss: 0.8625 Acc: 69.72%
Epoch 06/100 | Train Loss: 0.8203 Acc: 71.37%
Epoch 07/100 | Train Loss: 0.7852 Acc: 72.79%
Epoch 08/100 | Train Loss: 0.7564 Acc: 73.66%
Epoch 09/100 | Train Loss: 0.7389 Acc: 74.27%
Epoch 10/100 | Train Loss: 0.7129 Acc: 75.18%
Epoch 11/100 | Train Loss: 0.6980 Acc: 75.71%
Epoch 12/100 | Train Loss: 0.6775 Acc: 76.32%
Epoch 13