## Problem 1: Classification Problem

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch

### Q1 (10 points): Implement Instance Normalization

In [2]:
class InstanceNorm1D:
    def __init__(self, D, eps=1e-5, device=None, dtype=torch.float32):
        self.D = D
        self.eps = eps

        # parameters (learned)
        self.gamma = torch.ones(D, device=device, dtype=dtype)
        self.beta  = torch.zeros(D, device=device, dtype=dtype)

        # gradients
        self.dgamma = torch.zeros_like(self.gamma)
        self.dbeta  = torch.zeros_like(self.beta)

        # cache
        self._cache = None

    def forward(self, z):
        assert z.dim() == 2 and z.size(1) == self.D, "Expected (B, D) input"

        mu = z.mean(dim=1, keepdim=True)                         # (B, 1)
        var = ((z - mu) ** 2).mean(dim=1, keepdim=True)          # (B, 1)
        invstd = torch.rsqrt(var + self.eps)                     # (B, 1)
        zhat = (z - mu) * invstd                                 # (B, D)

        a = zhat * self.gamma.view(1, -1) + self.beta.view(1, -1)

        # cache for backward
        self._cache = (zhat, invstd)
        return a

    def backward(self, da):
        zhat, invstd = self._cache
        B, D = da.shape
        assert D == self.D, "Mismatched feature dimension"

        # parameter gradients
        self.dbeta = da.sum(dim=0)               # (D,)
        self.dgamma = (da * zhat).sum(dim=0)     # (D,)

        # dL/dzhat
        dzhat = da * self.gamma.view(1, -1)      # (B, D)

        sum_dzhat = dzhat.sum(dim=1, keepdim=True)                # (B, 1)
        sum_dzhat_zhat = (dzhat * zhat).sum(dim=1, keepdim=True)  # (B, 1)

        dz = (invstd / D) * (D * dzhat - sum_dzhat - zhat * sum_dzhat_zhat)  # (B, D)
        return dz

### Q2 (10 points): Implement Dropout

In [3]:
class Dropout:
    def __init__(self, p=0.5):
        assert 0.0 <= p < 1.0, "p must be in [0, 1)"
        self.p = float(p)
        self._mask = None
        self._train = True  # default mode like PyTorch modules

    def train(self):
        self._train = True

    def eval(self):
        self._train = False

    def forward(self, z):
        if (not self._train) or self.p == 0.0:
            self._mask = None
            return z

        keep_prob = 1.0 - self.p
        # mask is 1 with prob keep_prob, else 0
        self._mask = (torch.rand_like(z) < keep_prob).to(z.dtype)
        a = (self._mask * z) / keep_prob
        return a

    def backward(self, da):
        if (not self._train) or self.p == 0.0:
            return da

        keep_prob = 1.0 - self.p
        dz = (self._mask / keep_prob) * da
        return dz

### Q3 (10 points): Implement Softmax Cross Entropy

In [4]:
class SoftmaxCrossEntropy:
    def __init__(self):
        self._cache = None  # (yhat, y)

    def forward(self, z, y):
        assert z.dim() == 2, "z must be (B, C)"
        assert y.dim() == 1 and y.size(0) == z.size(0), "y must be (B,)"

        B, C = z.shape

        # stable softmax: subtract row-wise max
        z_shift = z - z.max(dim=1, keepdim=True).values           # (B, C)
        exp_z = torch.exp(z_shift)                                 # (B, C)
        yhat = exp_z / exp_z.sum(dim=1, keepdim=True)              # (B, C)

        # cross-entropy: -mean(log prob of correct class)
        correct_probs = yhat[torch.arange(B, device=z.device), y]  # (B,)
        loss = -torch.log(correct_probs + 1e-12).mean()            # scalar

        self._cache = (yhat, y)
        return loss

    def backward(self):
        yhat, y = self._cache
        B, C = yhat.shape

        dz = yhat.clone()
        dz[torch.arange(B, device=dz.device), y] -= 1.0
        dz /= B
        return dz

### Q4 (10 points): Implement Tanh

In [5]:
class Tanh:
    def __init__(self):
        self._cache = None # tanh(z)

    def forward(self, z):
        a = torch.tanh(z)
        self._cache = a  # cache the output for backward
        return a
    
    def backward(self, da):
        assert self._cache is not None, "Must call forward before backward"
        a = self._cache
        dz = da * (1.0 - a ** 2)
        return dz

### Q5 (10 points): Build and Train a Classification Model

Reusing helpers from assignment2.ipynb


In [6]:

def generate_spiral(n_per_class=200, n_classes=2, noise=0.2, rotations=1.0, seed=0):
    rng = np.random.default_rng(seed)
    N = n_per_class * n_classes
    X = np.zeros((N, 2), dtype=np.float32)
    y = np.zeros((N,), dtype=np.int64)

    # radius grows from 0 to 1
    r = np.linspace(0.0, 1.0, n_per_class, dtype=np.float32)

    for c in range(n_classes):
        # angle: start offset for each class + rotations * 2pi * r
        offset = (2.0 * np.pi / n_classes) * c
        theta = offset + rotations * 2.0 * np.pi * r

        # base spiral
        xs = r * np.cos(theta)
        ys = r * np.sin(theta)

        # add noise
        xs += rng.normal(0.0, noise, size=n_per_class).astype(np.float32)
        ys += rng.normal(0.0, noise, size=n_per_class).astype(np.float32)

        idx0 = c * n_per_class
        idx1 = (c + 1) * n_per_class
        X[idx0:idx1, 0] = xs
        X[idx0:idx1, 1] = ys
        y[idx0:idx1] = c

    return X, y

def train_val_split(X, y, val_ratio=0.2, seed=0):
    rng = np.random.default_rng(seed)
    idx = np.arange(len(X))
    rng.shuffle(idx)
    n_val = int(len(X) * val_ratio)
    val_idx = idx[:n_val]
    tr_idx = idx[n_val:]
    return X[tr_idx], y[tr_idx], X[val_idx], y[val_idx]

@torch.no_grad()
def accuracy_from_logits(logits, y):
    pred = torch.argmax(logits, dim=1)
    return (pred == y).float().mean().item()

#### Linear Class

In [7]:
class Linear:
    def __init__(self, in_dim, out_dim, device=None, dtype=torch.float32):
        self.in_dim = int(in_dim)
        self.out_dim = int(out_dim)

        # Kaiming-ish small init (good enough for tanh; keep simple)
        w_scale = (1.0 / np.sqrt(self.in_dim)).astype(np.float32) if isinstance(self.in_dim, np.ndarray) else (1.0 / np.sqrt(self.in_dim))
        self.W = torch.randn(self.in_dim, self.out_dim, device=device, dtype=dtype) * w_scale
        self.b = torch.zeros(self.out_dim, device=device, dtype=dtype)

        self.dW = torch.zeros_like(self.W)
        self.db = torch.zeros_like(self.b)

        self._cache_x = None

    def forward(self, x):
        assert x.dim() == 2 and x.size(1) == self.in_dim, "Expected (B, in_dim)"
        self._cache_x = x
        return x @ self.W + self.b.view(1, -1)

    def backward(self, dout):
        x = self._cache_x
        assert x is not None, "Must call forward() before backward()"

        # grads
        self.dW = x.t() @ dout                      # (in_dim, out_dim)
        self.db = dout.sum(dim=0)                   # (out_dim,)
        dx = dout @ self.W.t()                      # (B, in_dim)
        return dx

    def zero_grad(self):
        self.dW.zero_()
        self.db.zero_()

    def step(self, lr):
        self.W -= lr * self.dW
        self.b -= lr * self.db

#### MLP Class

In [8]:
class MLP:
    """
    2-hidden-layer MLP:
      Linear -> InstanceNorm -> Tanh -> Dropout
      Linear -> InstanceNorm -> Tanh -> Dropout
      Linear -> logits
    """
    def __init__(self, in_dim=2, hidden_dim=64, out_dim=2, p_drop=0.2, device=None, dtype=torch.float32):
        self.l1 = Linear(in_dim, hidden_dim, device=device, dtype=dtype)
        self.n1 = InstanceNorm1D(hidden_dim, device=device, dtype=dtype)
        self.a1 = Tanh()
        self.d1 = Dropout(p_drop)

        self.l2 = Linear(hidden_dim, hidden_dim, device=device, dtype=dtype)
        self.n2 = InstanceNorm1D(hidden_dim, device=device, dtype=dtype)
        self.a2 = Tanh()
        self.d2 = Dropout(p_drop)

        self.l3 = Linear(hidden_dim, out_dim, device=device, dtype=dtype)

        self._train = True

    def train(self):
        self._train = True
        self.d1.train()
        self.d2.train()

    def eval(self):
        self._train = False
        self.d1.eval()
        self.d2.eval()

    def forward(self, x):
        z = self.l1.forward(x)
        z = self.n1.forward(z)
        z = self.a1.forward(z)
        z = self.d1.forward(z)

        z = self.l2.forward(z)
        z = self.n2.forward(z)
        z = self.a2.forward(z)
        z = self.d2.forward(z)

        logits = self.l3.forward(z)
        return logits

    def backward(self, dlogits):
        dz = self.l3.backward(dlogits)

        dz = self.d2.backward(dz)
        dz = self.a2.backward(dz)
        dz = self.n2.backward(dz)
        dz = self.l2.backward(dz)

        dz = self.d1.backward(dz)
        dz = self.a1.backward(dz)
        dz = self.n1.backward(dz)
        dz = self.l1.backward(dz)
        return dz

    def zero_grad(self):
        self.l1.zero_grad()
        self.l2.zero_grad()
        self.l3.zero_grad()
        # InstanceNorm params are plain tensors; nothing to "zero" besides overwriting dgamma/dbeta in backward
        # (they are assigned each backward call)

    def step(self, lr):
        # linear params
        self.l1.step(lr)
        self.l2.step(lr)
        self.l3.step(lr)

        # instance norm params
        self.n1.gamma -= lr * self.n1.dgamma
        self.n1.beta  -= lr * self.n1.dbeta
        self.n2.gamma -= lr * self.n2.dgamma
        self.n2.beta  -= lr * self.n2.dbeta


Generating required data, model, & loss

In [9]:
# data
X_np, y_np = generate_spiral(n_per_class=150, n_classes=2, noise=0.05, rotations=1.5, seed=42)
Xtr, ytr, Xva, yva = train_val_split(X_np, y_np, val_ratio=0.2, seed=0)

Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
ytr_t = torch.tensor(ytr, dtype=torch.int64)
Xva_t = torch.tensor(Xva, dtype=torch.float32)
yva_t = torch.tensor(yva, dtype=torch.int64)

# model + loss
model = MLP(in_dim=2, hidden_dim=64, out_dim=2, p_drop=0.2)
loss_fn = SoftmaxCrossEntropy()

Training loop (with required hyperparameters)

In [10]:
lr = 0.1
epochs = 2000
batch_size = 64
rng = np.random.default_rng(0)

for epoch in range(1, epochs + 1):
    model.train()

    # shuffle
    idx = rng.permutation(len(Xtr_t))
    Xtr_t = Xtr_t[idx]
    ytr_t = ytr_t[idx]

    total_loss = 0.0
    total_acc = 0.0
    n_batches = 0

    for start in range(0, len(Xtr_t), batch_size):
        xb = Xtr_t[start:start + batch_size]
        yb = ytr_t[start:start + batch_size]

        # zero grads
        model.zero_grad()

        # forward
        logits = model.forward(xb)

        # loss
        loss = loss_fn.forward(logits, yb)

        # backward
        dlogits = loss_fn.backward()
        model.backward(dlogits)

        # update (SGD, no weight decay)
        model.step(lr)

        total_loss += float(loss.detach().cpu())
        total_acc += accuracy_from_logits(logits, yb)
        n_batches += 1

    # validation
    model.eval()
    with torch.no_grad():
        val_logits = model.forward(Xva_t)
        val_acc = accuracy_from_logits(val_logits, yva_t)

    if epoch % 200 == 0 or epoch == 1:
        print(
            f"Epoch {epoch:4d} | "
            f"train loss {total_loss / n_batches:.4f} | "
            f"train acc {total_acc / n_batches:.3f} | "
            f"val acc {val_acc:.3f}"
        )


Epoch    1 | train loss 0.7353 | train acc 0.559 | val acc 0.550
Epoch  200 | train loss 0.6381 | train acc 0.635 | val acc 0.617
Epoch  400 | train loss 0.5926 | train acc 0.648 | val acc 0.683
Epoch  600 | train loss 0.4985 | train acc 0.725 | val acc 0.733
Epoch  800 | train loss 0.2723 | train acc 0.854 | val acc 0.867
Epoch 1000 | train loss 0.2524 | train acc 0.889 | val acc 0.917
Epoch 1200 | train loss 0.3024 | train acc 0.875 | val acc 0.917
Epoch 1400 | train loss 0.2233 | train acc 0.906 | val acc 0.883
Epoch 1600 | train loss 0.2256 | train acc 0.928 | val acc 0.917
Epoch 1800 | train loss 0.1648 | train acc 0.947 | val acc 0.917
Epoch 2000 | train loss 0.1493 | train acc 0.941 | val acc 0.917


Final report

In [11]:
model.eval()
with torch.no_grad():
    final_train_acc = accuracy_from_logits(model.forward(Xtr_t), ytr_t)
    final_val_acc = accuracy_from_logits(model.forward(Xva_t), yva_t)

print(f"Final training accuracy:   {final_train_acc:.3f}")
print(f"Final validation accuracy: {final_val_acc:.3f}")

Final training accuracy:   0.967
Final validation accuracy: 0.917


#### Q6 (25 points): InstanceNorm vs BatchNorm

Replacing InstanceNorm with BatchNormID

In [12]:
# BatchNorm1D (feature dimension D on a (B, D) tensor)
class BatchNorm1D:
    def __init__(self, D, eps=1e-5, momentum=0.9, device=None, dtype=torch.float32):
        self.D = int(D)
        self.eps = eps
        self.momentum = momentum

        # learned affine params
        self.gamma = torch.ones(self.D, device=device, dtype=dtype)
        self.beta  = torch.zeros(self.D, device=device, dtype=dtype)

        # gradients
        self.dgamma = torch.zeros_like(self.gamma)
        self.dbeta  = torch.zeros_like(self.beta)

        # running stats (for eval)
        self.running_mean = torch.zeros(self.D, device=device, dtype=dtype)
        self.running_var  = torch.ones(self.D, device=device, dtype=dtype)

        self._train = True
        self._cache = None  # (xhat, invstd)

    def train(self):
        self._train = True

    def eval(self):
        self._train = False

    def forward(self, x):
        assert x.dim() == 2 and x.size(1) == self.D, "Expected (B, D)"
        if self._train:
            mu = x.mean(dim=0, keepdim=True)                          # (1, D)
            var = ((x - mu) ** 2).mean(dim=0, keepdim=True)           # (1, D)
            invstd = torch.rsqrt(var + self.eps)                      # (1, D)
            xhat = (x - mu) * invstd                                  # (B, D)

            # update running stats (detach so they don't build graphs)
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu.squeeze(0).detach()
            self.running_var  = self.momentum * self.running_var  + (1 - self.momentum) * var.squeeze(0).detach()

            self._cache = (xhat, invstd)
        else:
            mu = self.running_mean.view(1, -1)                         # (1, D)
            var = self.running_var.view(1, -1)                         # (1, D)
            invstd = torch.rsqrt(var + self.eps)                       # (1, D)
            xhat = (x - mu) * invstd                                   # (B, D)
            self._cache = None  # no backward in eval typically

        out = xhat * self.gamma.view(1, -1) + self.beta.view(1, -1)
        return out

    def backward(self, dout):
        assert self._cache is not None, "BatchNorm backward called without forward() in train mode"
        xhat, invstd = self._cache
        B, D = dout.shape
        assert D == self.D

        # affine grads
        self.dbeta = dout.sum(dim=0)
        self.dgamma = (dout * xhat).sum(dim=0)

        # backprop through BN (alternate simplified formula)
        dxhat = dout * self.gamma.view(1, -1)                          # (B, D)
        sum_dxhat = dxhat.sum(dim=0, keepdim=True)                     # (1, D)
        sum_dxhat_xhat = (dxhat * xhat).sum(dim=0, keepdim=True)       # (1, D)

        dx = (invstd / B) * (B * dxhat - sum_dxhat - xhat * sum_dxhat_xhat)
        return dx

BatchNorm based MLP variant

In [13]:
class MLP_BN:
    """
    Same architecture as MLP, but with BatchNorm instead of InstanceNorm.
    """
    def __init__(self, in_dim=2, hidden_dim=64, out_dim=2, p_drop=0.2, device=None, dtype=torch.float32):
        self.l1 = Linear(in_dim, hidden_dim, device=device, dtype=dtype)
        self.n1 = BatchNorm1D(hidden_dim, device=device, dtype=dtype)
        self.a1 = Tanh()
        self.d1 = Dropout(p_drop)

        self.l2 = Linear(hidden_dim, hidden_dim, device=device, dtype=dtype)
        self.n2 = BatchNorm1D(hidden_dim, device=device, dtype=dtype)
        self.a2 = Tanh()
        self.d2 = Dropout(p_drop)

        self.l3 = Linear(hidden_dim, out_dim, device=device, dtype=dtype)

    def train(self):
        self.d1.train()
        self.d2.train()
        self.n1.train()
        self.n2.train()

    def eval(self):
        self.d1.eval()
        self.d2.eval()
        self.n1.eval()
        self.n2.eval()

    def forward(self, x):
        z = self.l1.forward(x)
        z = self.n1.forward(z)
        z = self.a1.forward(z)
        z = self.d1.forward(z)

        z = self.l2.forward(z)
        z = self.n2.forward(z)
        z = self.a2.forward(z)
        z = self.d2.forward(z)

        logits = self.l3.forward(z)
        return logits

    def backward(self, dlogits):
        dz = self.l3.backward(dlogits)

        dz = self.d2.backward(dz)
        dz = self.a2.backward(dz)
        dz = self.n2.backward(dz)
        dz = self.l2.backward(dz)

        dz = self.d1.backward(dz)
        dz = self.a1.backward(dz)
        dz = self.n1.backward(dz)
        dz = self.l1.backward(dz)
        return dz

    def zero_grad(self):
        self.l1.zero_grad()
        self.l2.zero_grad()
        self.l3.zero_grad()

    def step(self, lr):
        # linear params
        self.l1.step(lr)
        self.l2.step(lr)
        self.l3.step(lr)

        # norm params
        self.n1.gamma -= lr * self.n1.dgamma
        self.n1.beta  -= lr * self.n1.dbeta
        self.n2.gamma -= lr * self.n2.dgamma
        self.n2.beta  -= lr * self.n2.dbeta

Training both models with same parameters

In [14]:
def train_model(model, loss_fn, Xtr_t, ytr_t, Xva_t, yva_t, lr=0.1, epochs=2000, batch_size=64, seed=0, print_every=200):
    rng = np.random.default_rng(seed)

    for epoch in range(1, epochs + 1):
        model.train()

        idx = rng.permutation(len(Xtr_t))
        Xsh = Xtr_t[idx]
        ysh = ytr_t[idx]

        total_loss = 0.0
        total_acc = 0.0
        n_batches = 0

        for start in range(0, len(Xsh), batch_size):
            xb = Xsh[start:start + batch_size]
            yb = ysh[start:start + batch_size]

            model.zero_grad()

            logits = model.forward(xb)
            loss = loss_fn.forward(logits, yb)

            dlogits = loss_fn.backward()
            model.backward(dlogits)

            model.step(lr)

            total_loss += float(loss.detach().cpu())
            total_acc += accuracy_from_logits(logits, yb)
            n_batches += 1

        model.eval()
        with torch.no_grad():
            val_logits = model.forward(Xva_t)
            val_acc = accuracy_from_logits(val_logits, yva_t)

        if epoch % print_every == 0 or epoch == 1:
            print(
                f"Epoch {epoch:4d} | "
                f"train loss {total_loss / n_batches:.4f} | "
                f"train acc {total_acc / n_batches:.3f} | "
                f"val acc {val_acc:.3f}"
            )

    model.eval()
    with torch.no_grad():
        final_train_acc = accuracy_from_logits(model.forward(Xtr_t), ytr_t)
        final_val_acc = accuracy_from_logits(model.forward(Xva_t), yva_t)

    return final_train_acc, final_val_acc

Checking InstanceNorm vs BatchNorm comparision

In [15]:
# Ensure fair comparison: same initial seeds for weights
torch.manual_seed(0)
model_in = MLP(in_dim=2, hidden_dim=64, out_dim=2, p_drop=0.2)
loss_in = SoftmaxCrossEntropy()

torch.manual_seed(0)
model_bn = MLP_BN(in_dim=2, hidden_dim=64, out_dim=2, p_drop=0.2)
loss_bn = SoftmaxCrossEntropy()

print("Training InstanceNorm model")
in_train_acc, in_val_acc = train_model(model_in, loss_in, Xtr_t, ytr_t, Xva_t, yva_t,
                                       lr=0.1, epochs=2000, batch_size=64, seed=0)

print("\nTraining BatchNorm model")
bn_train_acc, bn_val_acc = train_model(model_bn, loss_bn, Xtr_t, ytr_t, Xva_t, yva_t,
                                       lr=0.1, epochs=2000, batch_size=64, seed=0)

print("\nFinal report")
print(f"InstanceNorm  final train acc: {in_train_acc:.3f} | final val acc: {in_val_acc:.3f}")
print(f"BatchNorm     final train acc: {bn_train_acc:.3f} | final val acc: {bn_val_acc:.3f}")

Training InstanceNorm model
Epoch    1 | train loss 0.7751 | train acc 0.546 | val acc 0.533
Epoch  200 | train loss 0.6486 | train acc 0.622 | val acc 0.600
Epoch  400 | train loss 0.5994 | train acc 0.668 | val acc 0.667
Epoch  600 | train loss 0.5382 | train acc 0.732 | val acc 0.650
Epoch  800 | train loss 0.4288 | train acc 0.788 | val acc 0.900
Epoch 1000 | train loss 0.2935 | train acc 0.902 | val acc 0.917
Epoch 1200 | train loss 0.2620 | train acc 0.887 | val acc 0.900
Epoch 1400 | train loss 0.1932 | train acc 0.928 | val acc 0.900
Epoch 1600 | train loss 0.1974 | train acc 0.928 | val acc 0.900
Epoch 1800 | train loss 0.1718 | train acc 0.935 | val acc 0.900
Epoch 2000 | train loss 0.1642 | train acc 0.928 | val acc 0.900

Training BatchNorm model
Epoch    1 | train loss 0.7812 | train acc 0.561 | val acc 0.667
Epoch  200 | train loss 0.6841 | train acc 0.592 | val acc 0.567
Epoch  400 | train loss 0.6648 | train acc 0.576 | val acc 0.667
Epoch  600 | train loss 0.6905 | tra

#### What I observed

InstanceNorm training and validation is slightly higher then BatchNorm and this can be because BatchNorm is slightly nore nosier with small batch sizes and InstanceNorm tends to be more stable when stats are unreliable because it normalizes per-sample. 

InstanceNorm computes mean/variance for each sample while BatchNorm computes across the batch for each feature which can have different affects according to batch sizes. 

#### Q7 (25 points): InstanceNorm With and Without Dropout