In [1]:
import torch
import torch.nn as nn

#### Linear Regression

In [5]:
class LinearRegression:
    """Linear model ``X @ W + b`` fitted with hand-written gradient descent on MSE."""

    def __init__(self, in_features, out_features):
        # Weights come from a standard-normal draw; the bias starts at zero.
        self.W = torch.randn(in_features, out_features)
        self.b = torch.zeros(out_features)

    def mse_loss(self, y_pred, y_true):
        """Return the mean of the squared element-wise differences."""
        residual = y_pred - y_true
        return residual.square().mean()

    def gradients(self, y_pred, y_true, X):
        """Return ``(dw, db)``: the MSE gradients for ``W`` and ``b``.

        Both are scaled by ``2 / N`` where ``N = X.shape[0]``.
        """
        residual = y_pred - y_true
        scale = 2 / X.shape[0]

        grad_w = scale * (X.T @ residual)
        grad_b = scale * residual.sum(dim=0)
        return grad_w, grad_b

    def backward(self, dw, db, learning_rate):
        """Apply one in-place gradient-descent update to ``W`` and ``b``."""
        self.W.sub_(learning_rate * dw)
        self.b.sub_(learning_rate * db)

    def forward(self, X, y, epochs, learning_rate):
        """Run ``epochs`` full-batch gradient-descent steps on ``(X, y)``.

        Despite the name, this is the training loop; it returns ``None``.
        """
        for _ in range(epochs):
            prediction = X @ self.W + self.b
            # The loss is evaluated every epoch but is neither stored nor returned.
            self.mse_loss(prediction, y)
            grad_w, grad_b = self.gradients(prediction, y, X)
            self.backward(grad_w, grad_b, learning_rate)

#### Logistic Regression

In [None]:
class LogisticRegression:
    """Logistic regression ``sigmoid(X @ W + b)`` trained with manual gradient descent."""

    def __init__(self, in_features, n_labels):
        # Weights come from a standard-normal draw; the bias starts at zero.
        self.W = torch.randn(in_features, n_labels)
        self.b = torch.zeros(n_labels)

    def binary_cross_entropy(self, y_pred, y_true, eps=1e-8):
        """Return the mean binary cross-entropy between ``y_pred`` and ``y_true``.

        ``eps`` is added inside both logarithms so that predictions of exactly
        0 or 1 produce a large finite loss instead of ``inf``/``nan``.
        """
        positive_term = y_true * torch.log(y_pred + eps)
        negative_term = (1. - y_true) * torch.log(1. - y_pred + eps)
        return -torch.mean(positive_term + negative_term)

    def gradients(self, y_pred, y_true, X):
        """Return ``(dw, db)``, averaged over the ``X.shape[0]`` samples."""
        diff = y_pred - y_true

        dw = torch.matmul(X.T, diff) / X.shape[0]
        db = torch.mean(diff, dim=0)

        return dw, db

    def backward(self, dw, db, learning_rate):
        """Apply one in-place gradient-descent update to ``W`` and ``b``."""
        self.W -= learning_rate * dw
        self.b -= learning_rate * db

    def forward(self, X, y, epochs, learning_rate):
        """Run ``epochs`` full-batch gradient-descent steps on ``(X, y)``.

        Despite the name, this is the training loop; it returns ``None``.
        ``epochs`` is an integer step count.
        """
        # Iterate over range(epochs): iterating the int itself raises TypeError.
        for _ in range(epochs):
            y_pred = torch.sigmoid(torch.matmul(X, self.W) + self.b)
            # The loss is evaluated every epoch but is neither stored nor returned.
            self.binary_cross_entropy(y_pred, y)
            dw, db = self.gradients(y_pred, y, X)
            self.backward(dw, db, learning_rate)

#### KMeans

In [6]:
class KMeans:
    """K-means clustering: alternate nearest-centroid assignment and centroid means.

    Attributes:
        n_clusters: number of centroids to fit.
        max_iterations: upper bound on assignment/update rounds in ``fit``.
        centroids: ``(n_clusters, n_features)`` tensor after ``fit``; ``None`` before.
    """

    def __init__(self, n_clusters, max_iterations):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations

        self.centroids = None

    def _distances(self, X):
        """Return the ``(n_samples, n_clusters)`` matrix of Euclidean distances."""
        # Broadcast samples against centroids and reduce over the feature axis.
        return torch.norm(X[:, None, :] - self.centroids[None, :, :], dim=2)

    def fit(self, X, y=None):
        """Fit the centroids to ``X`` and return them.

        Args:
            X: floating-point tensor of shape ``(n_samples, n_features)``.
            y: ignored; accepted only so existing ``fit(X, y)`` calls keep working.

        Raises:
            ValueError: if there are fewer samples than clusters.
        """
        n_samples = X.shape[0]
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} exceeds the number of samples ({n_samples})"
            )

        # randperm draws distinct rows, so two centroids never start on the same sample.
        indices = torch.randperm(n_samples)[: self.n_clusters]
        self.centroids = X[indices]

        for _ in range(self.max_iterations):
            labels = torch.argmin(self._distances(X), dim=1)

            # A cluster that lost all its points keeps its previous centroid
            # (the mean of an empty selection would be NaN).
            new_centroids = torch.stack(
                [
                    X[labels == c].mean(dim=0) if torch.any(labels == c) else self.centroids[c]
                    for c in range(self.n_clusters)
                ],
                dim=0,
            )

            if torch.allclose(new_centroids, self.centroids):
                break

            self.centroids = new_centroids

        return self.centroids

    def predict(self, x, y=None):
        """Return the index of the nearest centroid for each row of ``x``.

        Args:
            x: tensor of shape ``(n_samples, n_features)``, or a single
                ``(n_features,)`` sample (a 0-d label is returned in that case).
            y: ignored; accepted only so existing ``predict(x, y)`` calls keep working.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans.predict called before fit")

        single_sample = x.dim() == 1
        samples = x[None, :] if single_sample else x
        labels = torch.argmin(self._distances(samples), dim=1)
        return labels[0] if single_sample else labels

#### KNN

In [8]:
class KNN:
    """k-nearest-neighbours classifier that stores the training set as given."""

    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        """Remember the training samples ``X`` and their labels ``y``."""
        self.X, self.y = X, y

    def _classify(self, query):
        """Return the most frequent label among the ``k`` closest training rows."""
        gaps = torch.norm(self.X - query, dim=1)
        nearest = torch.topk(gaps, k=self.k, largest=False).indices
        return torch.mode(self.y[nearest]).values

    def predict(self, X):
        """Classify every row of ``X`` and return the labels stacked along dim 0."""
        return torch.stack([self._classify(row) for row in X], dim=0)

#### PCA

In [10]:
def pca(data, n_components):
    """Project standardised ``data`` onto its top ``n_components`` principal axes.

    Args:
        data: floating-point tensor of shape ``(n_samples, n_features)``.
        n_components: number of leading components to keep.

    Returns:
        A tuple ``(reduced_data, principle_components, eigen_values)``:
        the ``(n_samples, n_components)`` projection, the
        ``(n_features, n_components)`` component matrix, and all eigenvalues
        of the covariance matrix sorted from largest to smallest.
    """
    mean = torch.mean(data, dim=0)
    std = torch.std(data, dim=0)

    # The small constant keeps constant-valued columns from dividing by zero.
    normalized = (data - mean) / (std + 1e-4)
    n_samples = data.shape[0]

    cov_matrix = torch.matmul(normalized.t(), normalized) / (n_samples - 1)

    # The covariance matrix is symmetric, so eigh applies and returns real
    # eigenpairs; the general eig solver returns complex tensors that cannot be sorted.
    eigen_values, eigen_vectors = torch.linalg.eigh(cov_matrix)

    # Tensors do not support [::-1]; request the descending order directly.
    sorted_eigen_indices = torch.argsort(eigen_values, descending=True)
    sorted_eigen_values = eigen_values[sorted_eigen_indices]
    sorted_eigen_vectors = eigen_vectors[:, sorted_eigen_indices]

    principle_components = sorted_eigen_vectors[:, :n_components]

    reduced_data = torch.matmul(normalized, principle_components)

    return reduced_data, principle_components, sorted_eigen_values

#### SVM

#### Loss Functions

In [11]:
def mse_loss(y_true, y_pred):
    """Return the mean squared error between ``y_pred`` and ``y_true``."""
    residual = y_pred - y_true
    return residual.square().mean()

In [12]:
def mae_loss(y_true, y_pred):
    """Return the mean absolute error between ``y_pred`` and ``y_true``."""
    residual = y_pred - y_true
    return residual.abs().mean()

In [None]:
def bce_loss(y_pred, y_true, eps=1e-8):
    """Return the mean binary cross-entropy; ``eps`` is added inside both logs."""
    positive_term = y_true * torch.log(y_pred + eps)
    negative_term = (1. - y_true) * torch.log(1. - y_pred + eps)
    return -(positive_term + negative_term).mean()

In [None]:
def cross_entropy_loss(y_pred, y_true, eps=1e-8):
    """Return the categorical cross-entropy, averaged over samples.

    Args:
        y_pred: predicted class probabilities with classes on the last axis.
        y_true: target distribution (e.g. one-hot) with the same shape.
        eps: added inside the logarithm so a zero probability stays finite.

    The per-sample loss is the sum over the class axis; only the remaining
    axes are averaged. Averaging over the class axis as well would shrink
    the loss by a factor of the number of classes.
    """
    per_sample = -torch.sum(y_true * torch.log(y_pred + eps), dim=-1)
    return torch.mean(per_sample)

#### Activations

In [15]:
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + torch.exp(-x)
    return 1. / denominator

In [16]:
def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x``.

    Uses ``sign(x) * (1 - e^(-2|x|)) / (1 + e^(-2|x|))``. The exponent is
    never positive, so nothing overflows; evaluating ``exp(x)`` directly
    gives ``inf / inf = nan`` for large ``|x|``.
    """
    decay = torch.exp(-2 * torch.abs(x))
    return torch.sign(x) * (1 - decay) / (1 + decay)

In [17]:
def relu(x):
    """Return ``max(x, 0)`` element-wise as a tensor shaped like ``x``.

    ``torch.max(x, 0)`` is not used because an integer second argument selects
    the reduce-along-dim overload, which returns a ``(values, indices)`` pair.
    """
    return torch.clamp(x, min=0)

In [18]:
def leaky_relu(x, alpha):
    """Return the element-wise maximum of ``x`` and ``alpha * x``."""
    scaled = alpha * x
    return torch.maximum(x, scaled)

In [19]:
class PReLU:
    """Parametric ReLU with one learnable negative-side slope ``p`` (starts at 0.25)."""

    def __init__(self):
        self.p = torch.tensor(0.25, requires_grad=True)

    def forward(self, x):
        """Return ``x`` where ``x > 0`` and ``p * x`` elsewhere.

        The branch is chosen from the sign of ``x`` rather than with
        ``max(x, p * x)``: the two only agree while ``0 <= p <= 1``, and a
        learnable ``p`` is free to leave that range.
        """
        return torch.where(x > 0, x, self.p * x)

In [20]:
def elu(x, alpha):
    """Return ``x`` where ``x > 0`` and ``alpha * (exp(x) - 1)`` elsewhere."""
    negative_branch = alpha * (torch.exp(x) - 1)
    is_positive = x > 0
    return torch.where(is_positive, x, negative_branch)

In [21]:
def softmax(x, dim=-1):
    """Return the softmax of ``x`` along ``dim`` (the last axis by default).

    The per-slice maximum is subtracted before exponentiating so large logits
    do not overflow. Normalising along ``dim`` makes each slice sum to 1;
    a whole-tensor max and sum would blend every row of a batch together.
    """
    shifted = x - torch.max(x, dim=dim, keepdim=True).values
    exp_x = torch.exp(shifted)
    return exp_x / torch.sum(exp_x, dim=dim, keepdim=True)

In [22]:
def log_softmax(x, dim=-1):
    """Return ``log(softmax(x))`` along ``dim`` (the last axis by default).

    ``torch.logsumexp`` requires an explicit ``dim``; ``keepdim=True`` lets the
    result broadcast back against ``x``.
    """
    return x - torch.logsumexp(x, dim=dim, keepdim=True)

#### Optimizers

In [23]:
class SGD:
    """Plain gradient descent: ``p <- p - learning_rate * p.grad``."""

    def __init__(self, params, learning_rate):
        # Materialise the iterable: a generator would be exhausted after the
        # first step() and every later step would silently do nothing.
        self.params = list(params)
        self.learning_rate = learning_rate

    def step(self):
        """Update every parameter in place; parameters without a gradient are skipped."""
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                p.sub_(self.learning_rate * p.grad)

In [26]:
class SGDMomentum:
    """Gradient descent on an exponential moving average of the gradients.

    The velocity is ``v <- momentum * v + (1 - momentum) * grad`` and the
    update is ``p <- p - learning_rate * v``.
    """

    def __init__(self, params, learning_rate, momentum):
        # Materialise the iterable first: building the velocity list from a
        # generator would consume it and leave step() with nothing to update.
        self.params = list(params)
        self.learning_rate = learning_rate
        self.momentum = momentum

        self.v = [torch.zeros_like(p) for p in self.params]

    def step(self):
        """Update every parameter in place; parameters without a gradient are skipped."""
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue
                self.v[i] = self.momentum * self.v[i] + (1. - self.momentum) * p.grad
                p.sub_(self.learning_rate * self.v[i])

In [None]:
class AdaGrad:
    """AdaGrad: scale each step by the root of the accumulated squared gradients.

    The update is ``p <- p - learning_rate / sqrt(cache + eps) * grad`` with
    ``cache`` the running sum of ``grad ** 2`` (``eps`` sits inside the root).
    """

    def __init__(self, params, learning_rate, eps=1e-8):
        # Materialise the iterable first: building the cache list from a
        # generator would consume it and leave step() with nothing to update.
        self.params = list(params)
        self.learning_rate = learning_rate
        self.eps = eps

        self.cache = [torch.zeros_like(p) for p in self.params]

    def step(self):
        """Update every parameter in place; parameters without a gradient are skipped."""
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue
                self.cache[i] += p.grad ** 2
                p.sub_((self.learning_rate / torch.sqrt(self.cache[i] + self.eps)) * p.grad)

In [29]:
class RMSProp:
    """RMSProp: scale each step by the root of a moving average of squared gradients.

    ``cache <- decay_rate * cache + (1 - decay_rate) * grad ** 2`` and
    ``p <- p - learning_rate / sqrt(cache + eps) * grad`` (``eps`` sits inside the root).
    """

    def __init__(self, params, learning_rate, decay_rate, eps=1e-8):
        # Materialise the iterable first: building the cache list from a
        # generator would consume it and leave step() with nothing to update.
        self.params = list(params)
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.eps = eps

        self.cache = [torch.zeros_like(p) for p in self.params]

    def step(self):
        """Update every parameter in place; parameters without a gradient are skipped."""
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue
                self.cache[i] = self.decay_rate * self.cache[i] + (1. - self.decay_rate) * p.grad ** 2
                p.sub_((self.learning_rate / torch.sqrt(self.cache[i] + self.eps)) * p.grad)

In [30]:
class Adam:
    """Adam: bias-corrected moving averages of the gradient and its square.

    Per step ``t``:
        m <- beta1 * m + (1 - beta1) * grad
        v <- beta2 * v + (1 - beta2) * grad ** 2
        p <- p - learning_rate * m_hat / (sqrt(v_hat) + eps)
    where ``m_hat = m / (1 - beta1 ** t)`` and ``v_hat = v / (1 - beta2 ** t)``.
    """

    def __init__(self, params, learning_rate, beta1, beta2, eps=1e-8):
        # Materialise the iterable first: building the state lists from a
        # generator would consume it and leave step() with nothing to update.
        self.params = list(params)
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        self.m = [torch.zeros_like(p) for p in self.params]
        self.v = [torch.zeros_like(p) for p in self.params]

        self.t = 0

    def step(self):
        """Update every parameter in place; parameters without a gradient are skipped."""
        self.t += 1

        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue

                self.m[i] = self.beta1 * self.m[i] + (1. - self.beta1) * p.grad
                self.v[i] = self.beta2 * self.v[i] + (1. - self.beta2) * p.grad ** 2

                m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                v_hat = self.v[i] / (1 - self.beta2 ** self.t)

                # eps is added to the root, not placed under it: under the root
                # it acts like sqrt(eps) and swamps small gradients.
                p.sub_(self.learning_rate * m_hat / (torch.sqrt(v_hat) + self.eps))

### Neural Network Foundations

In [33]:
### Activations and Gradients
### For loss functions: refer above

def sigmoid(z):
    """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
    denominator = 1 + torch.exp(-z)
    return 1. / denominator

def sigmoid_derivative(a):
    """Return ``a * (1 - a)``, the sigmoid derivative written in terms of its output ``a``."""
    complement = 1. - a
    return a * complement

def relu(z):
    """Return ``max(z, 0)`` element-wise as a tensor shaped like ``z``.

    ``torch.max(z, 0)`` is not used because an integer second argument selects
    the reduce-along-dim overload, which returns a ``(values, indices)`` pair.
    """
    return torch.clamp(z, min=0)

def relu_derivative(z):
    """Return 1 where ``z >= 0`` and 0 elsewhere (so ``z == 0`` maps to 1)."""
    is_active = z >= 0
    return torch.where(is_active, 1, 0)

def tanh(a):
    """Return the element-wise hyperbolic tangent of ``a``.

    Uses ``sign(a) * (1 - e^(-2|a|)) / (1 + e^(-2|a|))``. The exponent is
    never positive, so nothing overflows; evaluating ``exp(a)`` directly
    gives ``inf / inf = nan`` for large ``|a|``.
    """
    decay = torch.exp(-2 * torch.abs(a))
    return torch.sign(a) * (1 - decay) / (1 + decay)

def tanh_derivative(a):
    """Return ``1 - a ** 2``, the tanh derivative written in terms of its output ``a``."""
    squared = a ** 2
    return 1. - squared

In [34]:
class Neuron:
    """A single neuron, ``a = activation(x @ W + b)``, with manual backprop.

    Only ``activation='sigmoid'`` is implemented. ``forward`` accepts one
    sample of shape ``(in_features,)`` or a batch ``(n, in_features)``.

    Attributes:
        W: weight vector of shape ``(in_features,)``.
        b: scalar bias.
        cache: ``x``, ``z`` and ``a`` from the latest ``forward`` call.
        dw, db: parameter gradients written by ``backward``.
    """

    def __init__(self, input_features, activation='sigmoid'):
        self.in_features = input_features
        self.activation = activation

        self.W = torch.randn(input_features)
        self.b = 0.0

        self.cache = {}

    def _require_supported_activation(self):
        """Raise ``ValueError`` unless the configured activation is implemented."""
        if self.activation != 'sigmoid':
            raise ValueError(f"Unsupported activation: {self.activation!r}")

    def forward(self, x):
        """Return the activation for ``x`` and cache the intermediates.

        Raises:
            ValueError: if ``self.activation`` is not ``'sigmoid'``.
        """
        self._require_supported_activation()

        # matmul covers both one sample (in,) and a batch (n, in);
        # torch.dot would only accept the 1-D case.
        z = torch.matmul(x, self.W) + self.b
        a = torch.sigmoid(z)

        self.cache = {
            'x': x,
            'z': z,
            'a': a
        }

        return a

    def backward(self, da):
        """Backpropagate ``da`` (gradient w.r.t. the output) through the neuron.

        Stores the batch-averaged ``self.dw`` and ``self.db`` and returns the
        gradient with respect to the input, shaped like the cached ``x``.

        Raises:
            ValueError: if ``self.activation`` is not ``'sigmoid'``.
        """
        self._require_supported_activation()

        x = self.cache['x']
        a = self.cache['a']

        # Sigmoid derivative expressed through its output: a * (1 - a).
        dz = da * a * (1. - a)

        # Treat a single sample as a batch of one so both layouts share one path.
        x_rows = x.reshape(-1, self.in_features)
        dz_rows = dz.reshape(-1)

        self.dw = torch.matmul(x_rows.T, dz_rows) / x_rows.shape[0]
        self.db = torch.mean(dz_rows)

        # Each sample's input gradient is its dz times the weight vector.
        dx = torch.outer(dz_rows, self.W).reshape(x.shape)

        return dx

    def update(self, learning_rate):
        """Take one gradient-descent step using the gradients from ``backward``."""
        self.W = self.W - learning_rate * self.dw
        self.b = self.b - learning_rate * self.db


#### Q1. Explain forward propagation in a neural network.
- It is the process of passing the inputs through a sequence of layers to produce an output which is compared against the true labels using a loss function. At each layer:
    - We compute the weighted sum of the inputs
    - Apply activation functions to introduce non-linearity
    - Pass the output to the next layer

#### Q2. Why do we need activation functions? What happens without them?
- Without activation functions, neural networks become purely linear. Stacking multiple linear layers is equivalent to a single linear layer.
- Activation functions introduce non-linearity in the outputs, which helps the network learn complex patterns in the data distribution.

#### Q3. When would you use MSE vs. cross-entropy loss?
- MSE is for regression problems, where the task is to produce continuous values - it penalizes errors quadratically.
- CE is for classification because it's probabilistically motivated (it measures the divergence between the predicted and the true label distributions).

#### Q4. What's the dying ReLU problem?
- When inputs are consistently negative, ReLU neurons output 0 and have zero gradient, so they never update. 
- This can happen with bad initialization or very high learning rates. Solutions: Leaky ReLU, He initialization, lower learning rates.

### Backpropagation Deep Dive

#### Technical explanation
- Backpropagation is an algorithm to compute the gradients of the loss function with respect to all the parameters in the neural network using the chain rule of calculus.
- It's called back-propagation because we compute the gradients starting from the output layers and moving backwards towards the inputs
    - Before backprop, computing gradients for deep networks required calculating derivatives separately for each parameter - computationally not reasonable

#### Computational Graphs
- A computation graph is a directed acyclic graph where: 
    - Nodes = operations or variables
    - Edges = data dependency

In [1]:
import torch

In [2]:
def sigmoid(z):
    """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
    denominator = 1 + torch.exp(-z)
    return 1. / denominator

def sigmoid_derivative(a):
    """Return ``a * (1 - a)``, the sigmoid derivative written in terms of its output ``a``."""
    complement = 1. - a
    return a * complement

def relu(z):
    """Return ``max(z, 0)`` element-wise as a tensor shaped like ``z``.

    ``torch.max(z, 0)`` is not used because an integer second argument selects
    the reduce-along-dim overload, which returns a ``(values, indices)`` pair.
    """
    return torch.clamp(z, min=0)

def relu_derivative(a):
    """Return 1 where ``a > 0`` and 0 elsewhere (so ``a == 0`` maps to 0)."""
    is_active = a > 0
    return torch.where(is_active, 1, 0)

In [4]:
class TwoLayerNetwork:
    """Two-layer MLP (ReLU hidden layer, sigmoid output) with manual backprop.

    Shapes, with ``n`` samples per batch:
        W1: (in_features, hidden_features)     b1: (hidden_features,)
        W2: (hidden_features, out_features)    b2: (out_features,)

    ``cache`` holds the intermediates of the latest ``forward`` call and
    ``gradients`` the result of the latest ``backward`` call.
    """

    def __init__(self, in_features, hidden_features, out_features, learning_rate):
        self.in_features = in_features
        self.hidden_features = hidden_features
        self.out_features = out_features

        self.learning_rate = learning_rate

        self.W1 = torch.randn(in_features, hidden_features)
        self.b1 = torch.zeros(hidden_features)

        self.W2 = torch.randn(hidden_features, out_features)
        self.b2 = torch.zeros(out_features)

        self.cache = {}

    def forward(self, x):
        """Return the sigmoid outputs for a batch ``x`` of shape ``(n, in_features)``."""
        z1 = torch.matmul(x, self.W1) + self.b1
        # ReLU is computed inline so the class has no dependency on
        # notebook-level helper functions.
        a1 = torch.clamp(z1, min=0)

        z2 = torch.matmul(a1, self.W2) + self.b2
        a2 = torch.sigmoid(z2)

        self.cache = {
            'x': x,
            'z1': z1,
            'a1': a1,
            'z2': z2,
            'a2': a2
        }

        return a2

    def backward(self, y_true):
        """Compute batch-averaged parameter gradients from the cached forward pass.

        Returns (and stores in ``self.gradients``) a dict with keys ``dw1``,
        ``db1``, ``dw2`` and ``db2``, each shaped like its parameter.
        """
        x = self.cache['x']
        z1 = self.cache['z1']
        a1 = self.cache['a1']
        a2 = self.cache['a2']

        n_samples = x.shape[0]

        # Sigmoid output with binary cross-entropy: dL/dz2 simplifies to a2 - y.
        dldz2 = a2 - y_true

        # A weight gradient is (layer input)^T @ (gradient at the layer output),
        # so W2 pairs with a1 and W1 pairs with x.
        dldw2 = (a1.T @ dldz2) / n_samples
        # No keepdim: the bias gradients must stay 1-D to match b1 / b2 in the
        # in-place updates.
        dldb2 = torch.sum(dldz2, dim=0) / n_samples

        dlda1 = dldz2 @ self.W2.T
        # ReLU derivative: 1 where the pre-activation was positive, else 0.
        dldz1 = dlda1 * (z1 > 0).to(dlda1.dtype)

        dldw1 = (x.T @ dldz1) / n_samples
        dldb1 = torch.sum(dldz1, dim=0) / n_samples

        self.gradients = {
            'dw1': dldw1,
            'db1': dldb1,
            'dw2': dldw2,
            'db2': dldb2
        }

        return self.gradients

    def update_paramaters(self):
        """Apply one gradient-descent step using ``self.gradients``.

        The misspelled name is kept for existing callers; ``update_parameters``
        is the correctly spelled alias.
        """
        self.W1 -= self.learning_rate * self.gradients['dw1']
        self.b1 -= self.learning_rate * self.gradients['db1']
        self.W2 -= self.learning_rate * self.gradients['dw2']
        self.b2 -= self.learning_rate * self.gradients['db2']

    # Correctly spelled alias for update_paramaters.
    update_parameters = update_paramaters

    def compute_loss(self, y_pred, y_true, eps=1e-8):
        """Return the mean binary cross-entropy; ``eps`` keeps both logs finite."""
        loss = - torch.mean(y_true * torch.log(y_pred + eps) + (1. - y_true) * torch.log(1. - y_pred + eps))
        return loss

    def train_step(self, X, y):
        """Run forward, loss, backward and update once; return the pre-update loss."""
        y_pred = self.forward(X)
        loss = self.compute_loss(y_pred, y)
        self.backward(y)
        self.update_paramaters()

        return loss

#### Q5. Explain Backpropagation
- Backpropagation is an algorithm to compute the gradients of the loss function with respect to all the parameters using the chain rule of the calculus.
- Starting from the loss, we compute how much each parameter contributed to the error by propagating gradients backward through the network.
- At each layer: 
    - Receive gradient from the layer above
    - Compute local gradient through activation
    - Compute parameter gradients 
    - Pass gradient to layer below

#### Q6. Why do we need to cache values during the forward pass?
- During backprop, we need values from the forward pass to compute gradients.
- Without caching, we'd have to recompute the forward pass during backprop, doubling computation.

#### Q7. What's the vanishing gradient problem? How does it relate to backprop?
- The vanishing gradient problem occurs when gradients become exponentially small as they propagate backward through many layers. This happens because
    - Chain rule multiplies gradients
    - Sigmoid/Tanh derivatives are <1
    - Deep networks suffer the most

#### Q8. Why do we divide gradients by batch size?
- Scale invariance: Loss should be comparable regardless of batch size. 
- If we sum instead of average, larger batches would have larger gradients, requiring different learning rates.

## Core Layer Types

#### Dense Layers
- A dense layer performs a transformation followed by an activation: 
    - output = activation(Wx + b)
- Every input is connected to every output - a fully connected topology.

#### Convolution Layers
- Convolution applies a learnable filter across spatial dimensions
- Key Properties
    - Local connectivity: Each output neuron only looks at a small region (receptive field)
    - Parameter sharing: Same filter used across entire spatial dimension
    - Translation equivariance: If input shifts, output shifts equivalently
- Why convolution for images? 
    - Spatial structure: Pixels nearby are related
    - Parameter Efficiency
    - Translation equivariance: Car detector works anywhere in the image

- H_out: floor((H - K + 2P) / S) + 1
- W_out: floor((W - K + 2P) / S) + 1

- Padding Types: 
    - Valid: No padding, output shrinks
    - Same: Pad so output size = input_size

#### Pooling Layers
- Pooling performs downsampling by aggregating values in a local region
- Benefits: 
    - Reduces spatial dimension
    - Reduces computation
    - No learnable parameters
- Types
    - Average Pooling
    - Max Pooling
    - Min Pooling
    - Global Average Pooling: averages each channel over its entire spatial extent, producing one value per channel

#### Initialization Approaches
1. Xavier
2. He
3. LeCun

- Why initialization matters?
    - Too small -> vanishing gradients
    - Too big -> exploding gradients
    - Goal: Keep variance of activations roughly constant across layers

## Normalization Techniques
- Problem: During training, the distribution of each layer's inputs changes as the parameters get updated (called Internal Covariate Shift). This causes:
    - Unstable training
    - Slow convergence
    - Sensitivity to initialization (it has to be chosen carefully)

- Solution: Normalize activations to have consistent statistics

#### Batch Normalization: 
- Normalize each feature across the batch to have mean = 0, variance = 1, then let trainable scale and shift parameters learn the optimal mean/variance
- Why scale and shift?
    - Pure normalization to N(0, 1) might be too restrictive. gamma (scale) and beta (shift) let the network learn the optimal distribution.
- Training vs Inference
    - Training: Use batch statistics
    - Inference: Use running averages accumulated during training (exponential moving average)
- Suited for CNNs

#### Layer Normalization: 
- BatchNorm normalizes across the batch dimension. LayerNorm normalizes across the feature dimension for each sample independently
- Why LayerNorm ?
    - Better for Sequences
    - Used in Transformers
- Suited for RNNs and Transformers

#### Instance Normalization: 
- Normalize each channel independently for each sample. 
- Like LayerNorm but for spatial data.
- Helps with image generation in case of GANs

#### Group Normalization
- Divide channels into groups and normalize within each group. Middle ground between LayerNorm and InstanceNorm

In [5]:
class BatchNorm:
    """Batch normalisation over every axis except the last (feature) axis.

    In training mode the batch mean and biased variance are used and folded
    into the running statistics as
    ``running <- momentum * running + (1 - momentum) * batch``.
    In inference mode (``self.training = False``) the running statistics are
    used and left untouched.

    Attributes:
        gamma, beta: per-feature scale and shift, shape ``(d_model,)``.
        running_mean, running_var: moving averages of the batch statistics.
        training: selects batch statistics (True) or running statistics (False).
    """

    def __init__(self, d_model, momentum=0.9, eps=1e-5):
        self.d_model = d_model
        self.momentum = momentum
        self.eps = eps

        self.gamma = torch.ones(d_model)
        self.beta = torch.zeros(d_model)

        self.running_mean = torch.zeros(d_model)
        self.running_var = torch.ones(d_model)

        # forward() branches on this flag, so it must exist from construction.
        self.training = True

    def forward(self, x):
        """Normalise ``x`` of shape ``(..., d_model)`` and apply ``gamma`` / ``beta``.

        Raises:
            ValueError: if ``x`` has fewer than two dimensions.
        """
        if x.dim() < 2:
            raise ValueError("BatchNorm expects input of shape (..., d_model) with at least 2 dims")

        if self.training:
            # Reduce over every leading axis, e.g. (batch, seq) for 3-D input.
            reduce_dims = tuple(range(x.dim() - 1))

            mean = torch.mean(x, dim=reduce_dims)
            var = torch.var(x, dim=reduce_dims, unbiased=False)

            self.running_mean = self.momentum * self.running_mean + (1. - self.momentum) * mean
            self.running_var = self.momentum * self.running_var + (1. - self.momentum) * var
        else:
            mean = self.running_mean
            var = self.running_var

        # Statistics and parameters are (d_model,), so they broadcast over the
        # leading axes without reshaping. eps sits inside the root so a
        # zero-variance feature stays finite.
        x_normalized = (x - mean) / torch.sqrt(var + self.eps)

        return self.gamma * x_normalized + self.beta

In [6]:
class LayerNorm:
    """Layer normalisation over the last axis, with per-feature scale and shift.

    Attributes:
        gamma, beta: scale and shift of shape ``(d_model,)``.
        eps: added to the variance inside the square root.
    """

    def __init__(self, d_model, eps=1e-5):
        self.d_model = d_model
        self.eps = eps

        self.gamma = torch.ones(d_model)
        self.beta = torch.zeros(d_model)

    def forward(self, x):
        """Normalise each ``x[..., :]`` slice to zero mean and unit variance."""
        mean = torch.mean(x, dim=-1, keepdim=True)
        # Biased variance (divide by d_model): it matches the reference layer
        # norm and stays finite when d_model == 1, where the unbiased
        # estimator divides by zero.
        var = torch.var(x, dim=-1, keepdim=True, unbiased=False)

        x_normalized = (x - mean) / torch.sqrt(var + self.eps)

        out = self.gamma * x_normalized + self.beta

        return out

In [7]:
class Dropout:
    """Inverted dropout where ``p`` is the probability of KEEPING a unit.

    In training mode each element is kept with probability ``p`` and the
    survivors are scaled by ``1 / p``, so the expected activation is
    unchanged. In inference mode the input passes through untouched.

    Attributes:
        p: keep probability, in ``(0, 1]``.
        mask: boolean keep-mask from the latest training-mode ``forward``.
        training: selects dropout (True) or pass-through (False).
    """

    def __init__(self, p=0.7):
        if not 0.0 < p <= 1.0:
            raise ValueError(f"keep probability p must be in (0, 1], got {p}")
        self.p = p
        self.mask = None
        self.training = True

    def forward(self, x):
        """Apply dropout to ``x`` in training mode; return ``x`` unchanged otherwise."""
        if self.training:
            # Uniform samples in [0, 1) are below p with probability exactly p.
            # Normal samples (torch.randn) would keep a different fraction.
            self.mask = torch.rand(x.shape, device=x.device) < self.p
            out = x * self.mask / self.p
        else:
            out = x

        return out

    def backward(self, dout):
        """Route ``dout`` through the mask and scaling used by the latest ``forward``."""
        if self.training:
            return dout * self.mask / self.p
        else:
            return dout

#### Q1. Explain dropout and why it prevents overfitting.
- Dropout randomly sets a fraction of the neuron activations to zero during training. 
- For each training iteration, we use a different sub-network.

Why it prevents overfitting ?
- Prevents co-adaptation: neurons cannot rely on specific other neurons being present
- Ensemble effect: Training 2^N different sub-networks, final model is an ensemble
- Forces neurons to be more independent and useful

#### Q2. What's the difference between L1 and L2 regularization?
- Both add a penalty term but with different effects
- L2: 
    - lambda * W^2
    - Weights are never zeroed-out
    - Generalizes using all the weights
    - Use when: 
        - Want a general regularization
        - All features are potentially relevant
- L1: 
    - lambda * |W|
    - Feature selection
    - Drives weights towards zero
    - Use when: 
        - Want feature selection
        - High-dimensional data with irrelevant features

#### Q3. Explain inverted dropout. Why scale during training instead of testing? 
- During training, a sub-network contributes to the output and during inference, the full ensemble model is contributing to the output.
- The output of the sub-network and the full ensemble model need to be in the same range, as a result we need to scale during training.

#### Q4. Why does early stopping work as regularization?
- Early stopping acts as implicit regularization by limiting training time:
    - Early in training: model learns simple patterns (high bias, low variance)
    - Later in training: model fits noise (low bias, high variance)
    - Stopping early finds the sweet spot

#### Q5: What's label smoothing and why does it help?
- Label smoothing replaces hard target with soft target: 
    - To smoothen the target distribution
    - Prevent model over-confidence
    - Empirically improves test accuracy

#### Q6: Why shouldn't you use both Dropout and BatchNorm together?
- BatchNorm computes statistics over the batch
- Dropout randomly zeros activations
- BatchNorm statistics become unreliable with dropout's noise

## Optimizers
Technical: Minimize the loss function to reach the global minimum for all the parameters

#### Stochastic Gradient Descent
- Move in the direction opposite to the gradient (steepest descent)
- Batch GD, Stochastic GD, Mini-Batch GD
- Outliers cause un-stable gradients and learning
- Oscillating gradients in case of all positive or all negative activations

#### Stochastic Gradient Descent with Momentum
- Include a velocity which is an exponential moving average of the gradients
- Reduces the effect of outlier gradients since the velocity value dominates
- Dampens oscillating gradients
- Accelerates in consistent direction
- Helps escape local minimum

#### AdaGrad
- Different parameters learn and update at different rates in the training process
- As a result, they need dynamic learning rate.
- Goal: higher learning rate for slow-converging parameters
- Goal: lower learning rate for fast-converging parameters
- Problem: G only accumulates and never decreases; as a result, the effective learning rate approaches zero

#### RMSProp
- Fix AdaGrad by using an exponential moving average of the G^2 values instead of an ever-growing sum
- The effective learning rate can increase or decrease depending on the scenario

#### Adam
- Combines momentum and adaptive learning rate approaches for effective convergence
- Bias correction to handle the incorrect gradient values in the beginning of the training process

## Weight Initialization
- Bad initialization causes: 
    - Vanishing or Exploding gradients
    - Symmetry Problems 
    - Slow convergence

- The problems: 
    - All zeros: Symmetry - all neurons learn the same thing
    - Too large: Exploding activation / gradients
    - Too small: Vanishing activation / gradients

Goal: We want the variance of activations and gradients to stay roughly constant across layers

1. Xavier
    - Use with sigmoid / tanh
    - All neurons active, balance forward and backward
    - sigmoid / tanh have a symmetric nature to forward and backward pass, so both the directions need to be considered

2. He
    - Use with ReLU / LeakyReLU
    - Half neurons die, need 2x variance
    - During the forward, ~50% neurons are deactivated by ReLU, as a result the variance is halved (needs compensation)
    - Asymmetric nature of forward and the backward processes, forward pass is important for ReLU

## Learning Rate Schedulers
- Fixed learning rate is suboptimal.
- Early in training, we need large steps to explore
- Late in training, we need small steps for precise convergence

#### Step Decay
- Simple, works well
- Requires tuning drop points

#### Exponential Decay
- Smooth, continuous
- Choosing decay rate - tricky

#### Cosine annealing
- Smooth, 
- Requires knowing total epochs

#### Warmup - The Critical Addition
- Linearly increase LR from near-zero to target over the initial training steps
- Prevents early instability from large gradients and random initialization
- Why is warmup essential?
    - Prevent unstable training in the initial steps
    - Required for large batch size
    - Might help in escaping from local minima or saddle points

#### Common combinations
- Warmup + Cosine Annealing (BERT, Vision Transformers, SOTA models)
- Warmup + step decay (CNNs)