In [None]:
import marimo as mo

# Week 5: Backpropagation - The Engine of Deep Learning

**IME775: Data Driven Modeling and Optimization**

📖 **Reference**: Krishnendu Chaudhury. *Math and Architectures of Deep Learning*, Chapter 6

---

## Learning Objectives

- Understand backpropagation as reverse-mode automatic differentiation
- Derive gradients for common layers
- Implement backpropagation from scratch
- Verify gradients numerically

In [None]:
import matplotlib.pyplot as plt
import numpy as np

## 5.1 Why Backpropagation?

**Problem**: Computing gradients for millions of parameters.

**Naive approach**: Numerical (central-difference) gradients require $2n$ forward passes for $n$ parameters.

**Solution**: Backpropagation computes ALL gradients in ONE backward pass!

## 5.2 Computational Graphs

Neural network computation can be represented as a **directed acyclic graph (DAG)**:

- Nodes: Operations or variables
- Edges: Data flow

**Forward pass**: Compute outputs (left → right)

**Backward pass**: Compute gradients (right → left)

## 5.3 The Chain Rule in Action

For $L = f(g(h(x)))$:

$$\frac{\partial L}{\partial x} = \frac{\partial f}{\partial g} \cdot \frac{\partial g}{\partial h} \cdot \frac{\partial h}{\partial x}$$

Each node:

1. Computes its **local gradient** (the derivative of its own operation)
2. Multiplies it by the **upstream gradient** (the gradient arriving from its output)

In [None]:
# Visualize backpropagation on a simple network:
#   y = sigmoid(w2 * relu(w1 * x + b1) + b2)


class ComputeNode:
    """Named node of a computational graph holding a forward value and a gradient."""

    def __init__(self, name):
        self.name = name
        self.output = None  # forward value; None until a forward pass sets it
        self.grad = None    # dL/d(output); None until a backward pass sets it

    def __repr__(self):
        # Compare against None explicitly: a legitimate output of 0.0 is falsy,
        # and formatting a grad that is still None would raise TypeError.
        if self.output is None:
            return self.name
        if self.grad is None:
            return f"{self.name}: out={self.output:.4f}"
        return f"{self.name}: out={self.output:.4f}, grad={self.grad:.4f}"


def forward_backward_trace():
    """Run one forward and one backward pass through the scalar toy network.

    The network is ``y = sigmoid(w2 * relu(w1 * x + b1) + b2)`` with a squared
    error loss ``L = (y - target) ** 2``; all inputs and weights are fixed
    constants defined inside the function.

    Returns:
        (nodes, grads): ``nodes`` maps 'z1', 'h1', 'z2', 'y', 'L' to their
        forward values; ``grads`` maps 'y', 'z2', 'w2', 'b2', 'h1', 'z1',
        'w1', 'b1' to dL/d(name).
    """
    # Input and weights
    x = 2.0
    w1, b1 = 0.5, 0.1
    w2, b2 = 0.8, -0.2
    target = 0.7

    # Forward pass (with tracing)
    nodes = {}

    # z1 = w1 * x + b1
    z1 = w1 * x + b1
    nodes['z1'] = z1

    # h1 = relu(z1)
    h1 = max(0, z1)
    nodes['h1'] = h1

    # z2 = w2 * h1 + b2
    z2 = w2 * h1 + b2
    nodes['z2'] = z2

    # y = sigmoid(z2)
    y = 1 / (1 + np.exp(-z2))
    nodes['y'] = y

    # Loss = (y - target)^2
    L = (y - target) ** 2
    nodes['L'] = L

    # Backward pass: walk the graph right-to-left, multiplying each local
    # derivative by the upstream gradient.
    grads = {}

    # dL/dy
    dL_dy = 2 * (y - target)
    grads['y'] = dL_dy

    # dL/dz2 = dL/dy * dy/dz2 = dL/dy * y*(1-y)
    dy_dz2 = y * (1 - y)
    dL_dz2 = dL_dy * dy_dz2
    grads['z2'] = dL_dz2

    # dL/dw2 = dL/dz2 * dz2/dw2 = dL/dz2 * h1
    dL_dw2 = dL_dz2 * h1
    grads['w2'] = dL_dw2

    # dL/db2 = dL/dz2
    grads['b2'] = dL_dz2

    # dL/dh1 = dL/dz2 * dz2/dh1 = dL/dz2 * w2
    dL_dh1 = dL_dz2 * w2
    grads['h1'] = dL_dh1

    # dL/dz1 = dL/dh1 * dh1/dz1 (relu derivative)
    dh1_dz1 = 1 if z1 > 0 else 0
    dL_dz1 = dL_dh1 * dh1_dz1
    grads['z1'] = dL_dz1

    # dL/dw1 = dL/dz1 * dz1/dw1 = dL/dz1 * x
    dL_dw1 = dL_dz1 * x
    grads['w1'] = dL_dw1

    # dL/db1 = dL/dz1
    grads['b1'] = dL_dz1

    return nodes, grads


nodes, grads = forward_backward_trace()

# Create visualization
fig2, ax2 = plt.subplots(figsize=(14, 6))

# Node positions
positions = {
    'x': (0, 0.5),
    'z1': (1, 0.5),
    'h1': (2, 0.5),
    'z2': (3, 0.5),
    'y': (4, 0.5),
    'L': (5, 0.5),
}

# Draw the traced nodes. The input 'x' is not part of `nodes`, so it is skipped
# here and drawn separately below.
for name, (px, py) in positions.items():
    if name in nodes:
        val = nodes[name]
        color = 'lightgreen' if name == 'L' else 'lightyellow'
        # Use facecolor rather than color: `color` sets BOTH face and edge and
        # would override the black outline requested via edgecolor.
        circle = plt.Circle((px, py), 0.15, fill=True, facecolor=color,
                            edgecolor='black', linewidth=2)
        ax2.add_patch(circle)
        ax2.text(px, py + 0.02, name, ha='center', va='center', fontsize=12, fontweight='bold')
        ax2.text(px, py - 0.25, f'val={val:.3f}', ha='center', fontsize=9)
        if name in grads:
            ax2.text(px, py + 0.25, f'grad={grads[name]:.4f}', ha='center', fontsize=9, color='red')

# Draw input x
circle = plt.Circle((0, 0.5), 0.15, fill=True, facecolor='lightblue',
                    edgecolor='black', linewidth=2)
ax2.add_patch(circle)
ax2.text(0, 0.52, 'x=2', ha='center', va='center', fontsize=12, fontweight='bold')

# Draw edges with operations
edges = [
    ((0, 0.5), (1, 0.5), 'w1·x+b1'),
    ((1, 0.5), (2, 0.5), 'ReLU'),
    ((2, 0.5), (3, 0.5), 'w2·h+b2'),
    ((3, 0.5), (4, 0.5), 'σ'),
    ((4, 0.5), (5, 0.5), 'MSE'),
]
for (x1, y1), (x2, y2), op in edges:
    # Start and end the arrow at the circle boundaries (radius 0.15).
    ax2.annotate('', xy=(x2 - 0.15, y2), xytext=(x1 + 0.15, y1),
                 arrowprops=dict(arrowstyle='->', color='black', lw=1.5))
    ax2.text((x1 + x2) / 2, y1 + 0.12, op, ha='center', fontsize=10, style='italic')

ax2.set_xlim(-0.5, 5.5)
ax2.set_ylim(0, 1)
ax2.set_aspect('equal')
ax2.axis('off')
ax2.set_title('Backpropagation: Forward Values (black) and Backward Gradients (red)', fontsize=14)
plt.show()

# Print the gradients
print("Gradients computed via backpropagation:")
print("-" * 40)
for name, grad in grads.items():
    print(f"∂L/∂{name} = {grad:.6f}")

## 5.4 Layer-by-Layer Gradients

### Linear Layer: $z = xW + b$

Using the batch-first (row-vector) convention, where each row of $x$ is one sample:

- $\frac{\partial L}{\partial W} = x^T \cdot \frac{\partial L}{\partial z}$
- $\frac{\partial L}{\partial b} = \frac{\partial L}{\partial z}$ (summed over the batch dimension)
- $\frac{\partial L}{\partial x} = \frac{\partial L}{\partial z} \cdot W^T$

### ReLU: $h = \max(0, z)$

- $\frac{\partial L}{\partial z} = \frac{\partial L}{\partial h} \cdot \mathbb{1}[z > 0]$ (element-wise)

In [None]:
# Full backprop implementation


class Layer:
    """Interface for a differentiable layer: forward caches, backward differentiates."""

    def forward(self, x):
        """Compute the layer output for input ``x``."""
        raise NotImplementedError

    def backward(self, grad_output):
        """Map dL/d(output) to dL/d(input), storing any parameter gradients."""
        raise NotImplementedError


class Linear(Layer):
    """Affine layer ``z = x @ W + b`` for batch-first input of shape (batch, in_features)."""

    def __init__(self, in_features, out_features):
        # He initialization
        self.W = np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features)
        self.b = np.zeros(out_features)
        self.grad_W = None
        self.grad_b = None

    def forward(self, x):
        self.x = x  # Cache for backward
        return x @ self.W + self.b

    def backward(self, grad_output):
        # grad_output is dL/dz and already carries whatever normalisation the
        # loss applies (e.g. the 1/N of a mean). Dividing by the batch size
        # again here would shrink the parameter gradients by an extra factor of
        # batch_size and make them disagree with a numerical gradient check.
        self.grad_W = self.x.T @ grad_output
        self.grad_b = np.sum(grad_output, axis=0)
        return grad_output @ self.W.T


class ReLU(Layer):
    """Element-wise rectifier ``h = max(0, x)``."""

    def forward(self, x):
        self.mask = (x > 0)  # remembered so backward can gate the gradient
        return x * self.mask

    def backward(self, grad_output):
        return grad_output * self.mask


class Sigmoid(Layer):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""

    def forward(self, x):
        # Clip the pre-activation so np.exp cannot overflow for large |x|.
        self.output = 1 / (1 + np.exp(-np.clip(x, -500, 500)))
        return self.output

    def backward(self, grad_output):
        # d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x)), using the cached output.
        return grad_output * self.output * (1 - self.output)


class MSELoss:
    """Mean squared error averaged over every element of the prediction."""

    def forward(self, pred, target):
        self.pred = pred
        self.target = target
        return np.mean((pred - target) ** 2)

    def backward(self):
        # The forward pass averages over ALL elements, so the derivative is
        # normalised by pred.size. Using pred.shape[0] is only correct when the
        # prediction has a single output column.
        return 2 * (self.pred - self.target) / self.pred.size


print("Layer classes defined: Linear, ReLU, Sigmoid, MSELoss")

## 5.5 Gradient Checking

**Always verify analytical gradients numerically!**

$$\frac{\partial L}{\partial \theta} \approx \frac{L(\theta + \epsilon) - L(\theta - \epsilon)}{2\epsilon}$$

The relative error between the analytical and numerical gradients should be $< 10^{-5}$.

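## 5.6 Vanishing and Exploding Gradients

In deep networks, gradients can become very small (vanishing) or very large (exploding). For a network with $K$ layers and loss $L$, the gradient with respect to the first layer's weights is a product of $K - 1$ layer-to-layer Jacobians:

$$\frac{\partial L}{\partial W^{(1)}} = \frac{\partial L}{\partial h^{(K)}} \left( \prod_{l=2}^{K} \frac{\partial h^{(l)}}{\partial h^{(l-1)}} \right) \frac{\partial h^{(1)}}{\partial W^{(1)}}$$

If the factors are consistently smaller than 1 in magnitude the product shrinks toward zero; if they are consistently larger than 1 it blows up.

**Solutions**: ReLU, proper initialization, skip connections, normalization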

## Summary

| Concept | Key Point |
|---------|-----------|
| **Backpropagation** | Compute all gradients in one backward pass |
| **Chain Rule** | Multiply local gradient by upstream gradient |
| **Gradient Checking** | Verify analytical vs numerical gradients |
| **Vanishing Gradients** | Use ReLU, proper init, skip connections |

---

## References

- **Primary**: Krishnendu Chaudhury. *Math and Architectures of Deep Learning*, Chapter 6.
- **Classic**: Rumelhart, Hinton & Williams (1986). "Learning representations by back-propagating errors."

## Connection to ML Refined Curriculum

Backpropagation enables training for:

- All gradient descent methods (Weeks 2-3)
- Any supervised learning model (Weeks 4-8)