In [1]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt

### Explanation of Key Concepts and Classes

#### Gradient (`grad`)
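- The `grad` attribute of a `Value` holds the derivative of the final output (the node on which `backward()` is called, typically the loss) with respect to that value. It starts at 0 and is filled in during backpropagation. Gradients are essential for updating parameters during training using optimization algorithms like gradient descent.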

#### Backpropagation (`backward`)
- Backpropagation is the process of computing gradients for all parameters in the network by traversing the computation graph in reverse order. The `backward()` method in the `Value` class implements this by building a topological order of nodes and applying the chain rule to propagate gradients backward through the graph.

#### Classes Overview
- **Value**: Represents a scalar value in the computation graph, supports basic arithmetic operations, and tracks gradients for automatic differentiation.
- **Neuron**: Models a single artificial neuron with weights and bias, computes the output using a tanh activation function.
- **Layer**: Comprises multiple neurons, processes input data through each neuron, and aggregates their outputs.
- **MLP (Multi-Layer Perceptron)**: Stacks multiple layers to form a feedforward neural network, enabling complex function approximation.

#### Training Loop
- The training loop repeatedly feeds the input data through the network, computes the loss (the sum of squared differences between predictions and targets), resets the parameter gradients to zero, performs backpropagation to calculate fresh gradients, and updates the parameters using gradient descent. This iterative process allows the network to learn from data and improve its predictions over time.

In [2]:
class Value:
  """A scalar node in a computation graph that supports reverse-mode autodiff.

  Arithmetic on ``Value`` objects produces new ``Value`` objects that remember
  their operands. Calling ``backward()`` on a result then accumulates ``grad``
  on every node that contributed to it.
  """

  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data                 # the wrapped scalar
    self.grad = 0.0                  # d(root)/d(self); accumulated by backward()
    self._backward = lambda: None    # leaves have nothing to propagate
    self._prev = set(_children)      # operand nodes this value was computed from
    self._op = _op                   # symbol of the producing operation ('' for leaves)
    self.label = label               # optional human-readable name

  def __repr__(self):
    return f"Value(data={self.data})"

  def __add__(self, other):
    """Return ``self + other``; a non-``Value`` operand is wrapped in a ``Value``."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      # Addition routes the upstream gradient to both operands unchanged.
      # += (not =) so contributions add up when a node is used more than once.
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward

    return out

  def __mul__(self, other):
    """Return ``self * other``; a non-``Value`` operand is wrapped in a ``Value``."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
      # Product rule: each operand's local derivative is the other operand.
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad
    out._backward = _backward

    return out

  def __pow__(self, other):
    """Return ``self ** other`` for a constant int/float exponent.

    Raises:
      AssertionError: if ``other`` is not an int or float.
    """
    assert isinstance(other, (int, float)), "only supporting int/float powers for now"
    out = Value(self.data**other, (self,), f'**{other}')

    def _backward():
      # Power rule: d/dx x**n = n * x**(n-1).
      self.grad += other * (self.data ** (other - 1)) * out.grad
    out._backward = _backward

    return out

  def __rmul__(self, other): # other * self
    return self * other

  def __truediv__(self, other): # self / other
    return self * other**-1

  def __rtruediv__(self, other): # other / self
    # Reflected division so that `number / Value` works like `Value / number`.
    return other * self**-1

  def __neg__(self): # -self
    return self * -1

  def __sub__(self, other): # self - other
    return self + (-other)

  def __rsub__(self, other): # other - self
    # Reflected subtraction so that `number - Value` works like `Value - number`.
    return other + (-self)

  def __radd__(self, other): # other + self
    return self + other

  def tanh(self):
    """Return ``tanh(self)`` as a new ``Value``."""
    x = self.data
    # math.tanh saturates to +/-1 for large |x|; the explicit
    # (e^{2x} - 1) / (e^{2x} + 1) form raises OverflowError once 2x > ~709.
    t = math.tanh(x)
    out = Value(t, (self, ), 'tanh')

    def _backward():
      # d/dx tanh(x) = 1 - tanh(x)^2
      self.grad += (1 - t**2) * out.grad
    out._backward = _backward

    return out

  def exp(self):
    """Return ``e ** self`` as a new ``Value``."""
    x = self.data
    out = Value(math.exp(x), (self, ), 'exp')

    def _backward():
      # d/dx e^x = e^x, which is out.data. Must be += so gradients accumulate.
      self.grad += out.data * out.grad
    out._backward = _backward

    return out

  def backward(self):
    """Backpropagate from this node, accumulating ``grad`` on its ancestors.

    Sets ``self.grad`` to 1.0 and then calls each reachable node's
    ``_backward`` in reverse topological order. Gradients of other nodes are
    not reset here; they are added to whatever value they already hold.
    """
    # Iterative post-order DFS. It visits nodes in the same order a recursive
    # walk over ``_prev`` would, but cannot hit the interpreter's recursion
    # limit on deep graphs (e.g. a long chain of additions).
    topo = []
    visited = {self}
    stack = [(self, iter(self._prev))]
    while stack:
      node, pending = stack[-1]
      for child in pending:
        if child not in visited:
          visited.add(child)
          stack.append((child, iter(child._prev)))
          break
      else:
        # All operands of `node` are already emitted, so it can follow them.
        topo.append(node)
        stack.pop()

    self.grad = 1.0
    for node in reversed(topo):
      node._backward()

In [3]:
class Neuron:
  """One artificial neuron: a weighted sum of the inputs plus a bias, passed through tanh."""

  def __init__(self, nin):
    # One weight per input, then the bias, each drawn uniformly from [-1, 1].
    weights = []
    for _ in range(nin):
      weights.append(Value(random.uniform(-1, 1)))
    self.w = weights
    self.b = Value(random.uniform(-1, 1))

  def __call__(self, x):
    """Return tanh(b + sum_i w_i * x_i) for the input sequence ``x``."""
    # Start from the bias and add one weighted input at a time, pairing
    # weights with inputs positionally (zip stops at the shorter sequence).
    total = self.b
    for weight, feature in zip(self.w, x):
      total = total + weight * feature
    return total.tanh()

  def parameters(self):
    """Return a new list holding the weights followed by the bias."""
    params = list(self.w)
    params.append(self.b)
    return params

class Layer:
  """A group of ``nout`` neurons that all receive the same ``nin`` inputs."""

  def __init__(self, nin, nout):
    neurons = []
    for _ in range(nout):
      neurons.append(Neuron(nin))
    self.neurons = neurons

  def __call__(self, x):
    """Feed ``x`` to every neuron and collect the results.

    A layer with exactly one neuron returns that neuron's output directly
    rather than a one-element list.
    """
    outs = []
    for neuron in self.neurons:
      outs.append(neuron(x))
    if len(outs) == 1:
      return outs[0]
    return outs

  def parameters(self):
    """Return the parameters of all neurons, flattened into a single list."""
    params = []
    for neuron in self.neurons:
      params.extend(neuron.parameters())
    return params

class MLP:
  """A feed-forward stack of layers applied one after another."""

  def __init__(self, nin, nouts):
    # Consecutive entries of `sizes` give each layer's (inputs, outputs).
    sizes = [nin] + nouts
    self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

  def __call__(self, x):
    """Pass ``x`` through every layer in order and return the final result."""
    activations = x
    for layer in self.layers:
      activations = layer(activations)
    return activations

  def parameters(self):
    """Return the parameters of all layers, flattened into a single list."""
    params = []
    for layer in self.layers:
      params.extend(layer.parameters())
    return params

In [4]:
def print_tree(value, indent=0):
    """Print the computation graph rooted at ``value`` as an indented tree.

    Each node is printed as ``[op:<op>]: <data> (grad: <grad>)`` and its
    operands follow, indented by six more spaces. An object that lacks the
    node attributes (for example a plain number) is printed as a ``[no_op]``
    leaf and is not descended into.

    Args:
        value: a graph node exposing ``_op``, ``data``, ``grad`` and ``_prev``,
            or a plain number.
        indent: number of leading spaces for this node's line.
    """
    prefix = " " * indent
    try:
        line = f"{prefix}{'[op:'+value._op+']'}: {value.data:.2f} (grad: {value.grad:.2f})"
        children = value._prev
    except AttributeError:
        # Not a graph node: it has no operands to walk, so stop here instead
        # of falling through to `value._prev` and raising again.
        print(f"{prefix}{'[no_op]'}: {value:.2f} (grad: None)")
        return
    print(line)
    for child in children:
        print_tree(child, indent + 6)

In [5]:
# Seed the RNG before building the network so the randomly initialised
# weights, and therefore the loss curve, are the same on every
# Restart & Run All.
SEED = 1337
random.seed(SEED)

# 3 inputs -> two layers of 4 neurons -> 1 output neuron.
mlp = MLP(3, [4, 4, 1])

# Four training examples with three features each.
xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets, one per row of xs

In [6]:
N_EPOCHS = 100        # number of full passes over the training examples
LEARNING_RATE = 0.01  # gradient-descent step size
LOG_EVERY = 10        # print the loss once every this many epochs

for epoch in range(N_EPOCHS):
    # Forward pass: one prediction per training example.
    out = [mlp(x) for x in xs]
    # Sum of squared errors over the dataset (summed, not averaged).
    loss = sum((o - y)**2 for y, o in zip(ys, out))

    # Parameters persist across epochs and backward() accumulates with +=,
    # so stale gradients must be cleared before each backward pass.
    for p in mlp.parameters():
        p.grad = 0.0
    loss.backward()

    # Gradient-descent update: step each parameter against its gradient.
    for p in mlp.parameters():
        p.data -= LEARNING_RATE * p.grad

    # `epoch` is 0-based; report 1-based epoch numbers. The loss shown is the
    # one computed before this epoch's parameter update.
    if (epoch + 1) % LOG_EVERY == 0:
        print(f"epoch {epoch+1}, loss: {loss.data:.8f}")

epoch 10, loss: 4.97816241
epoch 20, loss: 4.22803703
epoch 30, loss: 3.90322286
epoch 40, loss: 3.57281627
epoch 50, loss: 2.58125884
epoch 60, loss: 1.00777397
epoch 70, loss: 0.48989850
epoch 80, loss: 0.29139758
epoch 90, loss: 0.19592383
epoch 100, loss: 0.14277657
