## Problem 1: Classification Problem

In [3]:
import torch

#### Q1 (10 points): Implement Instance Normalization

In [4]:
class InstanceNorm1D:
    """Per-sample feature normalization with a hand-written backward pass.

    Every row of a (B, D) input is standardized with that row's own mean and
    biased variance (both taken over the D features), then scaled by ``gamma``
    and shifted by ``beta``, which hold one value per feature.
    """

    def __init__(self, D, eps=1e-5, device=None, dtype=torch.float32):
        """Set up parameters, gradient slots and an empty cache.

        Args:
            D: number of features expected along dimension 1 of the input.
            eps: constant added to the variance before the reciprocal sqrt.
            device: device on which the parameter tensors are created.
            dtype: dtype of the parameter tensors.
        """
        self.D = D
        self.eps = eps

        # Scale starts at one and shift at zero, so the layer initially
        # outputs the plain standardized input.
        tensor_opts = {"device": device, "dtype": dtype}
        self.gamma = torch.ones(D, **tensor_opts)
        self.beta = torch.zeros(D, **tensor_opts)

        # Gradient slots; each backward() call overwrites them.
        self.dgamma = torch.zeros_like(self.gamma)
        self.dbeta = torch.zeros_like(self.beta)

        # Filled by forward() with (zhat, invstd) for use in backward().
        self._cache = None

    def forward(self, z):
        """Normalize each row of ``z`` and apply the affine transform.

        Args:
            z: tensor of shape (B, D).

        Returns:
            Tensor of shape (B, D): ``zhat * gamma + beta``.

        Raises:
            AssertionError: if ``z`` is not 2-D or its second dim is not D.
        """
        assert z.dim() == 2 and z.size(1) == self.D, "Expected (B, D) input"

        row_mean = z.mean(dim=1, keepdim=True)                # (B, 1)
        centered = z - row_mean                               # (B, D)
        row_var = (centered ** 2).mean(dim=1, keepdim=True)   # (B, 1)
        inv_sigma = torch.rsqrt(row_var + self.eps)           # (B, 1)
        normalized = centered * inv_sigma                     # (B, D)

        out = normalized * self.gamma.view(1, -1) + self.beta.view(1, -1)

        # Keep what backward() needs.
        self._cache = (normalized, inv_sigma)
        return out

    def backward(self, da):
        """Back-propagate through the most recent forward() call.

        Stores the parameter gradients in ``self.dgamma`` / ``self.dbeta``.

        Args:
            da: upstream gradient, shape (B, D).

        Returns:
            Gradient with respect to the forward input, shape (B, D).

        Raises:
            AssertionError: if the second dim of ``da`` is not D.
        """
        normalized, inv_sigma = self._cache
        _, n_feat = da.shape
        assert n_feat == self.D, "Mismatched feature dimension"

        # Parameters are shared across rows, so their gradients sum over B.
        self.dbeta = da.sum(dim=0)                      # (D,)
        self.dgamma = (da * normalized).sum(dim=0)      # (D,)

        # Gradient reaching the normalized activations.
        g = da * self.gamma.view(1, -1)                          # (B, D)

        # Row-wise reductions: the mean and the variance both depend on every
        # feature in the row, which produces these two correction terms.
        g_total = g.sum(dim=1, keepdim=True)                     # (B, 1)
        g_proj = (g * normalized).sum(dim=1, keepdim=True)       # (B, 1)

        bracket = n_feat * g - g_total - normalized * g_proj     # (B, D)
        return (inv_sigma / n_feat) * bracket

#### Q2 (10 points): Implement Dropout

In [5]:
class Dropout:
    """Inverted dropout with explicit forward and backward passes.

    In training mode each element is kept with probability ``1 - p`` and the
    survivors are divided by ``1 - p``. In eval mode (or when ``p == 0``) the
    layer is the identity.
    """

    def __init__(self, p=0.5):
        """Create the layer.

        Args:
            p: drop probability, must satisfy ``0 <= p < 1``.

        Raises:
            AssertionError: if ``p`` is outside ``[0, 1)``.
        """
        assert 0.0 <= p < 1.0, "p must be in [0, 1)"
        self.p = float(p)
        # Binary keep-mask from the last forward(); None when that forward
        # was an identity pass (or no forward has run yet).
        self._mask = None
        self._train = True  # default mode like PyTorch modules

    def train(self):
        """Switch to training mode (dropout active)."""
        self._train = True

    def eval(self):
        """Switch to evaluation mode (identity)."""
        self._train = False

    def forward(self, z):
        """Apply dropout to ``z``.

        Args:
            z: floating-point input tensor of any shape.

        Returns:
            ``z`` itself in eval mode or when ``p == 0``; otherwise a tensor
            of the same shape with dropped entries zeroed and kept entries
            scaled by ``1 / (1 - p)``.
        """
        if (not self._train) or self.p == 0.0:
            # Identity pass: clear the mask so backward() is the identity too.
            self._mask = None
            return z

        keep_prob = 1.0 - self.p
        # mask is 1 with prob keep_prob, else 0
        self._mask = (torch.rand_like(z) < keep_prob).to(z.dtype)
        return (self._mask * z) / keep_prob

    def backward(self, da):
        """Back-propagate through the most recent forward() call.

        The decision between the identity and the masked path is taken from
        what forward() actually did (whether it stored a mask), not from the
        current train/eval flag. Toggling the mode between forward and
        backward therefore cannot produce a gradient that disagrees with the
        forward output.

        Args:
            da: upstream gradient, same shape as the forward output.

        Returns:
            ``da`` unchanged if the last forward was an identity pass (or no
            forward has run); otherwise ``da`` masked and scaled like the
            forward.
        """
        if self._mask is None:
            return da

        keep_prob = 1.0 - self.p
        return (self._mask / keep_prob) * da

#### Q3 (10 points): Implement Softmax Cross Entropy

In [None]:
class SoftmaxCrossEntropy:
    """Softmax followed by mean cross-entropy, with a manual backward pass."""

    def __init__(self):
        # Filled by forward() with (yhat, y) for use in backward().
        self._cache = None

    def forward(self, z, y):
        """Compute the mean cross-entropy of logits ``z`` against labels ``y``.

        The loss is computed from log-probabilities
        (``z_shift - log(sum(exp(z_shift)))``) rather than from
        ``log(prob + eps)``. It therefore stays exact when the probability of
        the correct class underflows to zero, instead of being capped at
        ``-log(eps)``.

        Args:
            z: logits of shape (B, C).
            y: integer class indices of shape (B,), usable as an index
                into dimension 1 of ``z``.

        Returns:
            Scalar tensor: the mean over the batch of ``-log softmax(z)[i, y[i]]``.

        Raises:
            AssertionError: if ``z`` is not 2-D or ``y`` is not 1-D of length B.
        """
        assert z.dim() == 2, "z must be (B, C)"
        assert y.dim() == 1 and y.size(0) == z.size(0), "y must be (B,)"

        B = z.size(0)

        # stable softmax: subtract row-wise max
        z_shift = z - z.max(dim=1, keepdim=True).values           # (B, C)
        exp_z = torch.exp(z_shift)                                 # (B, C)
        # Each row's max contributes exp(0) = 1, so the sum is >= 1 and its
        # log needs no epsilon.
        sum_exp = exp_z.sum(dim=1, keepdim=True)                   # (B, 1)
        yhat = exp_z / sum_exp                                     # (B, C)
        log_probs = z_shift - torch.log(sum_exp)                   # (B, C)

        # cross-entropy: -mean(log prob of correct class)
        rows = torch.arange(B, device=z.device)
        loss = -log_probs[rows, y].mean()                          # scalar

        self._cache = (yhat, y)
        return loss

    def backward(self):
        """Gradient of the mean loss with respect to the logits.

        Returns:
            Tensor of shape (B, C): ``(softmax(z) - onehot(y)) / B`` for the
            inputs of the most recent forward() call.
        """
        yhat, y = self._cache
        B = yhat.size(0)

        # Clone so the cached probabilities are not modified in place.
        dz = yhat.clone()
        dz[torch.arange(B, device=dz.device), y] -= 1.0
        dz /= B
        return dz

#### Q4 (10 points): Implement Tanh

#### Q5 (10 points): Build and Train a Classification Model

#### Q6 (25 points): InstanceNorm vs BatchNorm

#### Q7 (25 points): InstanceNorm With and Without Dropout