In [1]:
# import necessary libraries
import os, math, random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as T

from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

In [2]:
OUT_DIR = "HW_1-2-GradientNorm"
os.makedirs(OUT_DIR, exist_ok=True)

In [3]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [4]:
def device():
    if torch.cuda.is_available(): return torch.device("cuda")
    return torch.device("cpu")

In [5]:
def grad_norm(model: nn.Module) -> float:
    grad_all = 0.0
    for p in model.parameters():
        if p.grad is not None:
            grad_all += (p.grad.detach() ** 2).sum().item()
    grad_norm = grad_all ** 0.5
    return grad_norm

In [6]:
def plot_grad_and_loss(grad_hist, loss_hist, title, out_path):
    iters = np.arange(1, len(loss_hist) + 1)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8.5, 6.5), sharex=True)
    ax1.plot(iters, grad_hist, linewidth=1)
    ax1.set_ylabel("grad"); ax1.grid(True, linewidth=0.3)
    ax2.plot(iters, loss_hist, linewidth=1)
    ax2.set_xlabel("iteration"); ax2.set_ylabel("loss"); ax2.grid(True, linewidth=0.3)
    fig.suptitle(title)
    fig.tight_layout(rect=[0,0,1,0.96])
    plt.savefig(out_path, dpi=180); plt.close()

In [7]:
# Task 1: Single-input Single-output function
def f_true(x: torch.Tensor) -> torch.Tensor:
    return torch.cos(2* math.pi*x) * (x**3)

def make_function_loaders(xmin=-3.0, xmax=3.0, n_train=256, batch=64, device=None):
    x = torch.linspace(xmin, xmax, n_train).unsqueeze(1)
    y = f_true(x)
    ds = TensorDataset(x, y)
    loader = DataLoader(ds, batch_size=batch, shuffle=True, drop_last=False)
    if device:
        pass
    return loader

class SimpleFunctionModel(nn.Module):
    def __init__(self, hidden=[18, 20, 15]):
        super().__init__()
        layers = []
        in_d = 1
        for h in hidden:
            layers += [nn.Linear(in_d, h), nn.Tanh()]
            in_d = h
        layers += [nn.Linear(in_d, 1)]
        self.net = nn.Sequential(*layers)
    def forward(self, x): return self.net(x)

In [8]:
def run_function_gradnorm(epochs=1000, lr=1e-3, wd=5e-4, batch=128):
    dev = device()
    loader = make_function_loaders(batch=batch)
    model = SimpleFunctionModel([18, 20, 15]).to(dev)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    loss_fn = nn.MSELoss()

    grad_hist, loss_hist = [], []
    for ep in range(epochs):
        model.train()
        for xb,yb in loader:
            xb,yb = xb.to(dev), yb.to(dev)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred,yb)
            loss.backward()
            grad_hist.append(grad_norm(model))
            loss_hist.append(loss.item())
            opt.step()
    plot_grad_and_loss(grad_hist, loss_hist,
                       f"Grad-norm & Loss vs Iterations [Function]",
                       f"{OUT_DIR}/gradloss_function.png")
    print("Saved function plot.")

In [9]:
def compute_channel_stats(data_dir="./data"):
    """
    Compute mean and std of CIFAR-10 training set.
    Returns two lists: mean, std (each of length 3 for RGB).
    """
    # Load train set
    train_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True,
        transform=T.ToTensor()
    )
    loader = DataLoader(train_set, batch_size=5000, shuffle=False, num_workers=2)

    mean = 0.
    std = 0.
    nb_samples = 0

    for data, _ in loader:
        # data shape: [batch, channels, height, width]
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # flatten H*W
        mean += data.mean(2).sum(0)
        std  += data.std(2).sum(0)
        nb_samples += batch_samples

    mean /= nb_samples
    std /= nb_samples

    return mean.tolist(), std.tolist()

In [10]:
def get_cifar10_loaders(
    data_dir="./data",
    batch_size=128,
    num_workers=2,
    drop_last=False
):
    # compute mean/std
    mean, std = compute_channel_stats(data_dir)
    print("CIFAR-10 stats:", mean, std)

    train_tfms = T.Compose([
        T.RandomCrop(32, padding=2),
        T.RandomHorizontalFlip(), 
        T.ToTensor(),
        T.Normalize(mean, std),
    ])
    test_tfms = T.Compose([
        T.ToTensor(),
        T.Normalize(mean, std),
    ])

    train_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=train_tfms
    )
    test_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=test_tfms
    )

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=True, drop_last=drop_last
    )
    test_loader = DataLoader(
        test_set, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True, drop_last=False
    )
    return train_loader, test_loader

In [11]:
class CNNModel(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(inplace=True),   # <-- 'features.0' is first conv
            nn.MaxPool2d(2),  # 32x16x16
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 64x8x8
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*8*8, 128), nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
        )
    def forward(self, x): return self.classifier(self.features(x))


In [12]:
def run_cifar_gradnorm(epochs=5, lr=1e-3, wd=5e-4, batch=128, data_dir="./data"):
    train_loader, _ = get_cifar10_loaders(data_dir=data_dir, batch_size=batch)

    dev = device()
    model = CNNModel().to(dev)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    loss_fn = nn.CrossEntropyLoss()

    grad_hist, loss_hist = [], []
    for ep in range(epochs):
        model.train()
        for xb,yb in train_loader:
            xb,yb = xb.to(dev), yb.to(dev)
            opt.zero_grad()
            logits = model(xb)
            loss = loss_fn(logits,yb)
            loss.backward()
            grad_hist.append(grad_norm(model))
            loss_hist.append(loss.item())
            opt.step()
    plot_grad_and_loss(grad_hist, loss_hist,
                       f"Grad-norm & Loss vs Iterations [CIFAR-10]",
                       f"{OUT_DIR}/gradloss_cifar.png")
    print("Saved CIFAR plot.")

In [13]:
def main(task="both",
         epochs=20,
         lr=1e-3,
         weight_decay=5e-4,
         batch=128,
         data_dir="./data"):

    set_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)


    if task in ("function", "both"):
        run_function_gradnorm(epochs=epochs, lr=lr, wd=weight_decay,
                              batch=batch)

    if task in ("cifar", "both"):
        run_cifar_gradnorm(epochs=epochs, lr=lr, wd=weight_decay,
                           batch=batch, data_dir=data_dir)

In [14]:
if __name__ == "__main__":
    main(task="function", epochs=5000)
    main(task="cifar", epochs=25)

Device: cuda
Saved function plot.
Device: cuda
Files already downloaded and verified




CIFAR-10 stats: [0.4913996756076813, 0.4821583926677704, 0.44653093814849854] [0.20230092108249664, 0.19941280782222748, 0.20096160471439362]
Files already downloaded and verified
Files already downloaded and verified
Saved CIFAR plot.
