<div style="text-align: right"><i>Ivy Zhang<br>2023</i></div>

# Micrograd from scratch (v2)

The only things I've copied over are the essential imports! I'm going to go through the code I wrote the first time and check it against the original micrograd to see where I can improve.

In [55]:
import math
import random
import graphviz as G
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
# Walk the graph depth-first to find all nodes and the edges between them
def trace(node):
    """Collect every node reachable from ``node`` and every child -> parent edge.

    Args:
        node: Root of the graph. Each node must be hashable and expose an
            iterable ``_prev`` holding the nodes it was computed from.

    Returns:
        A ``(nodes, edges)`` tuple of sets. ``nodes`` holds every reachable
        node (including ``node`` itself); ``edges`` holds ``(child, parent)``
        pairs, one per dependency.
    """
    nodes, edges = set(), set()
    # An explicit stack instead of recursion, so that very deep graphs
    # (e.g. a long chain of additions) cannot hit Python's recursion limit.
    pending = [node]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges

# Visualize it as a large graph
def visualize_node(node) -> "G.Digraph":
    """Build a left-to-right Graphviz drawing of the graph that produced ``node``.

    Every value becomes a record-shaped node showing its label, data and
    gradient. A value that was produced by an operation additionally gets a
    small operation node that feeds into it, and the operands are wired to
    that operation node rather than to the value directly.

    Args:
        node: Root value; it and everything reachable through ``_prev`` is drawn.

    Returns:
        The populated ``Digraph`` (SVG format).
    """
    dot = G.Digraph(format='svg', graph_attr={'rankdir': 'LR'})

    nodes, edges = trace(node)
    for v in nodes:
        # Values may be unlabeled, so the built-in id() (as a string) serves
        # as the unique Graphviz node name.
        uid = str(id(v))
        dot.node(uid, f"{{{v.label} | data: {v.data} | grad: {v.grad}}}", shape='record')
        # An empty _op string is falsy, so leaf values get no operation node.
        if v._op:
            dot.node(uid + v._op, v._op)
            dot.edge(uid + v._op, uid)

    # Connect each operand to the operation node of its result, not to the
    # result itself. Node names are strings, so id() must be converted before
    # the op suffix is appended.
    for child, parent in edges:
        dot.edge(str(id(child)), str(id(parent)) + parent._op)

    return dot

In [57]:
class Value():
    """A scalar that remembers how it was computed so gradients can flow back.

    Attributes:
        data: The numeric value.
        grad: Accumulated derivative of the final output with respect to this
            value; starts at 0.0 and is filled in by ``backward()``.
        label: Optional display name; the empty string means "anonymous".
        _prev: Set of the values this one was computed from.
        _op: Short name of the operation that produced this value ('' for leaves).
        _backward: Closure that pushes this value's ``grad`` into its operands.
    """

    # label defaults to '', so values can be created anonymously
    def __init__(self, _data, _children=(), op='', label='') -> None:
        self.data = _data
        self.label = label
        self._prev = set(_children)
        self._op = op
        self.grad = 0.0
        self._backward = lambda: None

    def __repr__(self) -> str:
        return f"Value {self.label + ' ' if self.label != '' else self.label}(data = {self.data})"

    def __neg__(self):
        return self * -1

    # It is the *output* node's _backward that propagates gradient into the
    # operands, which is why each op installs the closure on `out`.
    def __add__(self, other):
        assert isinstance(other, (Value, int, float))
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Gradients accumulate (+=) because a value may feed several ops.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __sub__(self, other):  # self - other
        return self + (-other)

    def __mul__(self, other):
        assert isinstance(other, (Value, int, float))
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __truediv__(self, other):  # self / other
        return self * other**-1

    def exp(self):
        """Return e**self as a new Value."""
        out = Value(math.exp(self.data), (self, ), 'exp')

        def _backward():
            # d/dx exp(x) = exp(x), which is exactly out.data.
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    # Python has no __exp__ protocol hook; the old name is kept only so that
    # existing callers of v.__exp__() keep working.
    __exp__ = exp

    def __radd__(self, other):  # other + self
        return self + other

    def __rsub__(self, other):  # other - self
        # Here `self` is the Value on the right-hand side and `other` is the
        # plain number on the left, so the result is (-self) + other.
        return (-self) + other

    def __rmul__(self, other):  # other * self
        return self * other

    def __rtruediv__(self, other):  # other / self
        return self**-1 * other

    def __pow__(self, other):
        """Raise to a constant int/float power (the exponent is not differentiated)."""
        assert isinstance(other, (int, float))
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            # Power rule: d/dx x**n = n * x**(n - 1). For n == 0 the result is
            # constant, and skipping the power avoids 0.0 ** -1.
            local = other * self.data ** (other - 1) if other != 0 else 0.0
            self.grad += local * out.grad

        out._backward = _backward
        return out

    def relu(self):
        out = Value(self.data if self.data > 0 else 0, (self, ), 'relu')

        def _backward():
            # Gradient passes through only where the input was positive.
            self.grad += (1 if self.data > 0 else 0) * out.grad

        out._backward = _backward
        return out

    def sigmoid(self):
        x = self.data
        # Pick the algebraically equivalent form whose exp() argument is
        # non-positive, so large |x| cannot raise OverflowError.
        if x >= 0:
            sig = 1.0 / (1.0 + math.exp(-x))
        else:
            ex = math.exp(x)
            sig = ex / (1.0 + ex)
        out = Value(sig, (self, ), 'sigmoid')

        def _backward():
            self.grad += sig * (1 - sig) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Run reverse-mode autodiff from this value.

        Sets ``self.grad`` to 1.0 and then calls ``_backward`` on every
        reachable value in reverse topological order, so each value's grad is
        complete before it is pushed into its operands. Gradients accumulate;
        reset them to 0.0 before calling again on a fresh forward pass.
        """
        visited = set()
        ordering = []

        # Iterative post-order DFS. Each node is pushed twice: once to expand
        # its children, once (flagged True) to emit it after they are done.
        # Avoiding recursion lets long chains (e.g. a sum over thousands of
        # terms) work without hitting the recursion limit.
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                ordering.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))

        self.grad = 1.0
        for node in reversed(ordering):
            node._backward()

In [58]:
class Module():
    """Base class for anything that owns trainable parameters."""

    def zero_grad(self):
        """Reset the gradient of every parameter to 0.0."""
        for p in self.parameters():
            p.grad = 0.0

    def parameters(self):
        """Return the list of trainable parameters.

        The base class owns none, so it returns an empty list (rather than
        None) to keep ``zero_grad`` usable; subclasses override this.
        """
        return []

class Neuron(Module):
    """A single unit: weighted sum of ``nin`` inputs plus a bias, squashed by a sigmoid."""

    def __init__(self, nin):
        # Weights are drawn uniformly from [-1/sqrt(nin), 1/sqrt(nin)]
        # (the original note calls this Xavier initialization); the bias starts at 0.
        bound = 1 / math.sqrt(nin)
        self.w = [Value(random.uniform(-bound, bound)) for _ in range(nin)]
        self.b = Value(0)

    def __call__(self, x):
        # Accumulate b + w0*x0 + w1*x1 + ...; zip pairs each weight with its input.
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        return act.sigmoid()

    def parameters(self):
        # Bias first, then the weights.
        return [self.b, *self.w]
    
class Layer(Module):
    """``nout`` independent neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [neuron(x) for neuron in self.neurons]
        # A single-neuron layer hands back the bare output instead of a 1-element list.
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        # Flatten every neuron's parameters into one list, in neuron order.
        return [p for neuron in self.neurons for p in neuron.parameters()]

class MLP(Module):
    """A stack of layers applied one after another.

    Args:
        nin: Number of inputs to the first layer.
        layers: Output size of each successive layer, e.g. ``[6, 6, 1]``.
            Any iterable of ints works (list, tuple, ...).
    """

    def __init__(self, nin, layers):
        # Unpacking (instead of `[nin] + layers`) accepts tuples and other
        # iterables, not only lists.
        sizes = [nin, *layers]
        # Each layer maps one size to the next: (nin -> l0), (l0 -> l1), ...
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        # Feed the output of each layer into the next.
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        # Flatten every layer's parameters into one list, in layer order.
        return [p for layer in self.layers for p in layer.parameters()]

In [59]:
# Toy dataset: four samples with three features each ...
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# ... and one target per sample, in the same order as xs.
ys = [1.0, 0.0, 0.0, 1.0]

In [97]:
# 3 inputs -> two layers of 6 neurons -> a single output neuron
n = MLP(3, [6, 6, 1])

In [99]:

# Plain full-batch gradient descent on the squared error.
steps = 500
learning_rate = 0.1
log_every = 50

for k in range(steps + 1):
    # forward pass: predict every sample, then sum the squared errors
    ypred = [n(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

    # backward pass: gradients accumulate, so clear them before each backward()
    n.zero_grad()
    loss.backward()

    # update: step each parameter against its gradient
    for p in n.parameters():
        p.data -= learning_rate * p.grad

    if k % log_every == 0:
        print(k, loss.data)
  

0 0.025188860212798078
50 0.01918036489079715
100 0.015308064337113184
150 0.01263980834088238
200 0.010706564046983363
250 0.00925039927048938
300 0.008119248146274387
350 0.007218324025923282
400 0.00648579655090566
450 0.005879782578930692
500 0.005370995122457957
