In [1]:
import torch

In [2]:
class Variable:
    """
    A scalar value with gradient tracking.
    This is the fundamental building block - similar to PyTorch's autograd.

    Each arithmetic operation returns a new ``Variable`` that remembers its
    operands (``_prev``) and a closure (``_backward``) which adds the result's
    gradient contribution onto those operands. ``backward()`` then replays the
    closures from the output back to the inputs.

    Attributes:
        data: the wrapped numeric value.
        grad: accumulated derivative of the last ``backward()`` root with
            respect to this value (starts at 0.0).
        label: optional human-readable name.
    """
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Variable(data={self.data}, grad={self.grad}, label={self.label!r})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped as constants."""
        other = other if isinstance(other, Variable) else Variable(other)
        out = Variable(self.data + other.data, (self, other), '+')

        def _backward():
            # d(a + b)/da = d(a + b)/db = 1
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        """Support ``number + Variable``."""
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped as constants."""
        other = other if isinstance(other, Variable) else Variable(other)
        out = Variable(self.data * other.data, (self, other), '*')

        def _backward():
            # Product rule: each operand receives the other operand's value.
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Support ``number * Variable``."""
        return self * other

    def __neg__(self):
        """Return ``-self`` (implemented as multiplication by -1)."""
        return self * -1

    def __sub__(self, other):
        """Return ``self - other``."""
        return self + (-other)

    def __rsub__(self, other):
        """Support ``number - Variable``."""
        return (-self) + other

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float))
        out = Variable(self.data ** other, (self,), f'**{other}')

        def _backward():
            # Power rule. The exponent-0 case is handled explicitly because
            # ``0 * 0.0 ** -1`` raises ZeroDivisionError although the
            # derivative of the constant x**0 is simply 0.
            if other == 0:
                local = 0.0
            else:
                local = other * (self.data ** (other - 1))
            self.grad += local * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """
        Compute gradients via backpropagation.
        Uses topological sort to ensure we process nodes in correct order.

        Gradients accumulate on leaf variables across calls. Gradients of
        intermediate results are cleared first so that calling ``backward()``
        again on the same graph does not count earlier passes twice.
        """
        topo = []
        visited = set()

        # Iterative post-order DFS: avoids hitting Python's recursion limit on
        # long chains of operations. A node is emitted only after everything
        # it depends on has been emitted.
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)
            elif node not in visited:
                visited.add(node)
                stack.append((node, True))
                stack.extend(
                    (child, False) for child in node._prev if child not in visited
                )

        # Intermediate nodes (those built from other nodes) start from zero.
        for node in topo:
            if node._prev:
                node.grad = 0.0

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

In [3]:
def example_1_basic():
    """
    Demo of scalar autodiff on f(x, y) = x*y + x^2 at x = 2, y = 3.

    Builds the expression from two `Variable` leaves, calls `backward()` on the
    result, and prints the computed df/dx and df/dy followed by the
    hand-derived values (df/dx = y + 2*x, df/dy = x) for comparison.
    """
    rule = "="*60
    for header_line in ("\n" + rule, "EXAMPLE 1: Basic Scalar Autodiff", rule):
        print(header_line)

    x = Variable(2.0, label='x')
    y = Variable(3.0, label='y')

    # Forward pass as one expression: (x*y) + (x**2) = 6.0 + 4.0 = 10.0
    f = x * y + x ** 2
    print(f"Forward pass: f = {f.data}")

    # Backward pass populates x.grad and y.grad
    f.backward()

    report = (
        f"df/dx = {x.grad}",  # expected: y + 2*x = 3 + 4 = 7
        f"df/dy = {y.grad}",  # expected: x = 2
        # Closed-form values for comparison
        f"\nManual verification:",
        f"df/dx = y + 2*x = {y.data} + 2*{x.data} = {y.data + 2*x.data}",
        f"df/dy = x = {x.data}",
    )
    for report_line in report:
        print(report_line)


example_1_basic()


EXAMPLE 1: Basic Scalar Autodiff
Forward pass: f = 10.0
df/dx = 7.0
df/dy = 2.0

Manual verification:
df/dx = y + 2*x = 3.0 + 2*2.0 = 7.0
df/dy = x = 2.0


In [4]:
import numpy as np

In [None]:
import numpy as np

class Tensor:
    """
    A NumPy-backed n-dimensional array with reverse-mode autodiff.

    Every operation returns a new ``Tensor`` that remembers its operands
    (``_prev``) and a closure (``_backward``) that pushes the result's gradient
    back onto those operands. Calling ``backward()`` on a result walks the
    recorded graph from the result towards the inputs and fills in ``.grad``.

    Attributes:
        data: ``np.ndarray`` of dtype float32 holding the values.
        requires_grad: when False no gradient is accumulated for this tensor.
        grad: accumulated gradient (same shape as ``data``), or None until a
            gradient has been accumulated or ``zero_grad()`` has been called.
    """

    def __init__(self, data, requires_grad=True, _children=(), _op=''):
        self.data = np.array(data, dtype=np.float32)
        self.requires_grad = requires_grad
        self._prev = _children
        self._op = _op

        self.grad = None
        self._backward = lambda: None

    @property
    def shape(self):
        """Shape of the underlying array."""
        return self.data.shape

    def zero_grad(self):
        """Reset the gradient to an array of zeros shaped like ``data``."""
        self.grad = np.zeros_like(self.data)

    def _accumulate(self, grad):
        """
        Add ``grad`` to ``self.grad`` without aliasing or mutating other arrays.

        The first contribution is stored as a private copy and later ones are
        added out-of-place. Storing the incoming array directly and then using
        ``+=`` would silently modify every other tensor that shares that array
        (for example both operands of ``a + b``, or the upstream ``out.grad``).
        """
        grad = np.asarray(grad, dtype=self.data.dtype)
        if self.grad is None:
            self.grad = grad.copy()
        else:
            self.grad = self.grad + grad

    @staticmethod
    def _unbroadcast(grad, original_shape):
        """
        Reduce gradient to match original shape by summing along broadcasted dimensions.

        This handles two cases:
        1. Prepended dimensions (added on left): Sum along axis 0 repeatedly
        2. Expanded dimensions (size 1 -> size N): Sum along that axis with keepdims
        """
        # Step 1: Remove prepended dimensions
        ndims_added = grad.ndim - len(original_shape)
        for _ in range(ndims_added):
            grad = grad.sum(axis=0)

        # Step 2: Reduce expanded dimensions (where original had size 1)
        for i, (grad_dim, orig_dim) in enumerate(zip(grad.shape, original_shape)):
            if orig_dim == 1 and grad_dim > 1:
                grad = grad.sum(axis=i, keepdims=True)

        return grad

    def __add__(self, other):
        """Element-wise ``self + other`` with NumPy broadcasting."""
        other = other if isinstance(other, Tensor) else Tensor(other, requires_grad=False)
        out = Tensor(self.data + other.data, _children=(self, other), _op='+')

        def _backward():
            if self.requires_grad:
                self._accumulate(self._unbroadcast(out.grad, self.data.shape))

            if other.requires_grad:
                other._accumulate(self._unbroadcast(out.grad, other.data.shape))

        out._backward = _backward
        return out

    def __mul__(self, other):
        """Element-wise ``self * other`` with NumPy broadcasting."""
        other = other if isinstance(other, Tensor) else Tensor(other, requires_grad=False)
        out = Tensor(self.data * other.data, _children=(self, other), _op='*')

        def _backward():
            if self.requires_grad:
                grad = out.grad * other.data
                self._accumulate(self._unbroadcast(grad, self.data.shape))

            if other.requires_grad:
                grad = out.grad * self.data
                other._accumulate(self._unbroadcast(grad, other.data.shape))

        out._backward = _backward
        return out

    def matmul(self, other):
        """
        Matrix product ``self @ other`` following ``np.matmul`` semantics.

        The backward pass supports 2-D operands, 1-D operands (treated as a row
        vector on the left / column vector on the right) and stacked matrices
        whose leading batch dimensions broadcast.
        """
        other = other if isinstance(other, Tensor) else Tensor(other, requires_grad=False)
        out = Tensor(self.data @ other.data, _children=(self, other), _op='@')

        def _backward():
            a, b = self.data, other.data
            # Promote vectors to matrices exactly the way np.matmul does, so
            # the usual matrix formulas below apply to every case.
            a_mat = a[np.newaxis, :] if a.ndim == 1 else a
            b_mat = b[:, np.newaxis] if b.ndim == 1 else b

            # Re-insert the axes np.matmul dropped from the result. The
            # trailing axis must be restored first so that axis -2 exists.
            upstream = out.grad
            if b.ndim == 1:
                upstream = np.expand_dims(upstream, -1)
            if a.ndim == 1:
                upstream = np.expand_dims(upstream, -2)

            if self.requires_grad:
                grad = upstream @ np.swapaxes(b_mat, -1, -2)
                grad = self._unbroadcast(grad, a_mat.shape).reshape(a.shape)
                self._accumulate(grad)

            if other.requires_grad:
                grad = np.swapaxes(a_mat, -1, -2) @ upstream
                grad = self._unbroadcast(grad, b_mat.shape).reshape(b.shape)
                other._accumulate(grad)

        out._backward = _backward
        return out

    def __matmul__(self, other):
        """Support the ``@`` operator as an alias for ``matmul``."""
        return self.matmul(other)

    def sum(self, axis=None, keepdim=False):
        """Sum over ``axis`` (all elements when None); ``keepdim`` keeps reduced axes."""
        out = Tensor(self.data.sum(axis=axis, keepdims=keepdim), _children=(self,), _op='sum')

        def _backward():
            if self.requires_grad:
                grad = out.grad
                # Restore the reduced axes so the gradient broadcasts back.
                if axis is not None and not keepdim:
                    grad = np.expand_dims(grad, axis=axis)
                self._accumulate(np.broadcast_to(grad, self.data.shape))

        out._backward = _backward
        return out

    def mean(self, axis=None, keepdim=False):
        """Mean over ``axis`` (int, tuple of ints, or None for all elements)."""
        out = Tensor(self.data.mean(axis=axis, keepdims=keepdim), _children=(self,), _op='mean')

        def _backward():
            if self.requires_grad:
                # Number of inputs averaged into each output element; valid
                # for axis=None, a single axis and a tuple of axes alike.
                n = self.data.size // max(out.data.size, 1)
                grad = out.grad / n
                if axis is not None and not keepdim:
                    grad = np.expand_dims(grad, axis=axis)
                self._accumulate(np.broadcast_to(grad, self.data.shape))

        out._backward = _backward
        return out

    def relu(self):
        """Element-wise ``max(x, 0)``."""
        out = Tensor(np.maximum(self.data, 0.0), _children=(self,), _op='relu')

        def _backward():
            if self.requires_grad:
                # Gradient passes only where the input was strictly positive.
                self._accumulate((self.data > 0).astype(np.float32) * out.grad)

        out._backward = _backward
        return out

    def sigmoid(self):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        s = 1. / (1. + np.exp(-self.data))
        out = Tensor(s, _children=(self,), _op='sigmoid')

        def _backward():
            if self.requires_grad:
                self._accumulate(s * (1 - s) * out.grad)

        out._backward = _backward
        return out

    def log(self):
        """Element-wise natural log of ``x + 1e-10`` (the offset avoids log(0))."""
        out = Tensor(np.log(self.data + 1e-10), _children=(self,), _op='log')

        def _backward():
            if self.requires_grad:
                self._accumulate((1.0 / (self.data + 1e-10)) * out.grad)

        out._backward = _backward
        return out

    def softmax(self, axis=-1):
        """Softmax along ``axis``, computed with the max subtracted for stability."""
        x_max = self.data.max(axis=axis, keepdims=True)
        exp_x = np.exp(self.data - x_max)
        # NumPy spells the keyword ``keepdims``; ``keepdim`` raises TypeError.
        s = exp_x / exp_x.sum(axis=axis, keepdims=True)

        out = Tensor(s, _children=(self,), _op='softmax')

        def _backward():
            if self.requires_grad:
                # Jacobian-vector product: s * (g - sum(s * g)) along ``axis``.
                sum_term = (s * out.grad).sum(axis=axis, keepdims=True)
                self._accumulate(s * (out.grad - sum_term))

        out._backward = _backward
        return out

    def reshape(self, shape):
        """Return a tensor with the same values viewed in ``shape``."""
        out = Tensor(self.data.reshape(shape), _children=(self,), _op='reshape')

        def _backward():
            if self.requires_grad:
                self._accumulate(out.grad.reshape(self.data.shape))

        out._backward = _backward
        return out

    def transpose(self, axes=None):
        """Permute axes (reverse them when ``axes`` is None)."""
        out = Tensor(self.data.transpose(axes), _children=(self,), _op='transpose')

        def _backward():
            if self.requires_grad:
                if axes is None:
                    grad = out.grad.T
                else:
                    # Undo the permutation: position axes[i] came from i.
                    inverse_axes = [0] * len(axes)
                    for i, axis in enumerate(axes):
                        inverse_axes[axis] = i
                    grad = np.transpose(out.grad, axes=inverse_axes)

                self._accumulate(grad)

        out._backward = _backward
        return out

    def batchnorm(self, gamma, beta, eps=1e-5):
        """
        Normalise over axis 0 using batch statistics, then scale and shift.

        Computes ``gamma * (x - mean) / sqrt(var + eps) + beta`` where mean and
        (biased) variance are taken over axis 0 of ``self``.

        Args:
            gamma: scale, a Tensor (or array-like constant) broadcastable to x.
            beta: shift, a Tensor (or array-like constant) broadcastable to x.
            eps: constant added to the variance before the square root.
        """
        gamma = gamma if isinstance(gamma, Tensor) else Tensor(gamma, requires_grad=False)
        beta = beta if isinstance(beta, Tensor) else Tensor(beta, requires_grad=False)

        mean = self.data.mean(axis=0, keepdims=True)
        var = self.data.var(axis=0, keepdims=True)

        std = np.sqrt(var + eps)
        x_centered = (self.data - mean)
        x_norm = x_centered / std

        y = gamma.data * x_norm + beta.data

        out = Tensor(y, _children=(self, gamma, beta), _op='batch_norm')

        def _backward():
            upstream = out.grad

            if gamma.requires_grad:
                # Summing over whatever axes gamma was broadcast along yields
                # a gradient of exactly gamma's shape.
                gamma._accumulate(self._unbroadcast(upstream * x_norm, gamma.data.shape))

            if beta.requires_grad:
                beta._accumulate(self._unbroadcast(upstream, beta.data.shape))

            if self.requires_grad:
                # Gradient w.r.t. the normalised activations includes gamma.
                grad_x_norm = upstream * gamma.data
                # Closed form of the chain rule through mean and variance:
                #   dx = (g - mean(g) - x_norm * mean(g * x_norm)) / std
                grad = (
                    grad_x_norm
                    - grad_x_norm.mean(axis=0, keepdims=True)
                    - x_norm * (grad_x_norm * x_norm).mean(axis=0, keepdims=True)
                ) / std
                self._accumulate(grad)

        out._backward = _backward
        return out

    def dropout(self, p):
        """
        Inverted dropout: zero each element with probability ``p`` and scale the
        survivors by ``1 / (1 - p)``.

        Raises:
            ValueError: if ``p`` is outside ``[0, 1]``.
        """
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability must be in [0, 1], got {p}")

        # np.asarray keeps this working for 0-d tensors, where rand() returns
        # a plain Python float and the comparison a plain bool.
        mask = np.asarray(np.random.rand(*self.data.shape) > p).astype(np.float32)
        keep_prob = 1. - p
        if keep_prob > 0.0:
            mask = mask / keep_prob
        # else: p == 1 drops everything; skip the 0/0 division that gives NaN.

        out = Tensor(self.data * mask, _children=(self,), _op='dropout')

        def _backward():
            if self.requires_grad:
                self._accumulate(out.grad * mask)

        out._backward = _backward
        return out

    def __pow__(self, other):
        """
        Element-wise ``self ** other``.

        ``other`` may be a number, array or Tensor; it is treated as a constant
        (no gradient flows into the exponent).
        """
        other_val = other.data if isinstance(other, Tensor) else other
        out = Tensor(self.data ** other_val, _children=(self,), _op=f'**{other_val}')

        def _backward():
            if self.requires_grad:
                grad = other_val * (self.data ** (other_val - 1)) * out.grad
                # The exponent may have broadcast the result to a larger shape.
                self._accumulate(self._unbroadcast(np.asarray(grad), self.data.shape))

        out._backward = _backward
        return out

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return (-self) + other

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def __repr__(self):
        return f"Tensor(shape={self.shape}, data={self.data}, grad={self.grad})"

    def backward(self):
        """
        Back-propagate from this tensor, seeding its gradient with ones.

        Gradients accumulate on leaf tensors across calls. Gradients of
        intermediate results are cleared first so that a repeated call on the
        same graph does not count earlier passes twice.
        """
        topo = []
        visited = set()

        # Iterative post-order DFS (no recursion limit on deep graphs): a node
        # is emitted only after all of its operands have been emitted.
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)
            elif node not in visited:
                visited.add(node)
                stack.append((node, True))
                stack.extend(
                    (child, False) for child in node._prev if child not in visited
                )

        # Intermediate nodes (those built from other tensors) start fresh.
        for node in topo:
            if node._prev:
                node.grad = None

        self.grad = np.ones_like(self.data)
        for node in reversed(topo):
            # A node without a gradient (e.g. requires_grad switched off by
            # hand) has nothing to propagate.
            if node.grad is not None:
                node._backward()


# ============================================================================
# Test Broadcasting
# ============================================================================

def test_broadcasting():
    """
    Check that gradients are reduced back to each operand's own shape when an
    operation broadcasts its inputs.

    Three cases are covered: mutual expansion of size-1 axes, a prepended
    leading axis, and a 0-d scalar operand. Each case prints the resulting
    gradients and asserts both their shapes and their values, so the final
    "passed" message is only reached when every check holds.

    Raises:
        AssertionError: if any gradient has the wrong shape or value.
    """
    print("="*60)
    print("Testing Broadcasting Support")
    print("="*60)

    # Test 1: Basic broadcasting
    print("\n--- Test 1: (1,3) * (3,1) -> (3,3) ---")
    a = Tensor([[1, 2, 3]])      # (1, 3)
    b = Tensor([[10],            # (3, 1)
                [20],
                [30]])

    c = a * b  # (3, 3)
    loss = c.sum()
    loss.backward()

    print(f"a.shape: {a.shape}, a.grad.shape: {a.grad.shape}")
    print(f"a.grad:\n{a.grad}")
    print(f"\nb.shape: {b.shape}, b.grad.shape: {b.grad.shape}")
    print(f"b.grad:\n{b.grad}")

    # d(sum(a*b))/da sums b over the broadcast rows (10+20+30), and vice versa.
    assert a.grad.shape == a.shape
    assert b.grad.shape == b.shape
    np.testing.assert_allclose(a.grad, [[60, 60, 60]])
    np.testing.assert_allclose(b.grad, [[6], [6], [6]])

    # Test 2: Prepended dimensions
    print("\n--- Test 2: (3,) + (2,3) -> (2,3) ---")
    a = Tensor([1, 2, 3])        # (3,)
    b = Tensor([[10, 20, 30],    # (2, 3)
                [40, 50, 60]])

    c = a + b  # (2, 3)
    loss = c.sum()
    loss.backward()

    print(f"a.shape: {a.shape}, a.grad.shape: {a.grad.shape}")
    print(f"a.grad: {a.grad}")
    print(f"\nb.shape: {b.shape}, b.grad.shape: {b.grad.shape}")
    print(f"b.grad:\n{b.grad}")

    # a is reused for both rows of b, so each of its entries receives 2.
    assert a.grad.shape == a.shape
    assert b.grad.shape == b.shape
    np.testing.assert_allclose(a.grad, [2, 2, 2])
    np.testing.assert_allclose(b.grad, np.ones((2, 3)))

    # Test 3: Scalar broadcasting
    print("\n--- Test 3: Scalar * (2,3) -> (2,3) ---")
    a = Tensor(2.0)              # scalar
    b = Tensor([[1, 2, 3],
                [4, 5, 6]])      # (2, 3)

    c = a * b  # (2, 3)
    loss = c.sum()
    loss.backward()

    print(f"a.shape: {a.shape}, a.grad.shape: {a.grad.shape}")
    print(f"a.grad: {a.grad}")
    print(f"\nb.shape: {b.shape}, b.grad.shape: {b.grad.shape}")
    print(f"b.grad:\n{b.grad}")

    # The scalar's gradient is the sum of every element of b (1+...+6 = 21).
    assert a.grad.shape == a.shape
    assert b.grad.shape == b.shape
    np.testing.assert_allclose(a.grad, 21.0)
    np.testing.assert_allclose(b.grad, np.full((2, 3), 2.0))

    print("\n" + "="*60)
    print("All broadcasting tests passed! ✅")
    print("="*60)


# Run the checks when this cell/script is executed directly.
if __name__ == "__main__":
    test_broadcasting()