In [561]:
import math
import numpy as np

import matplotlib.pyplot as plt
import sys

In [562]:
class Value:
    def __init__(self, data, _children=(), _op='') -> None:
        self.value: int = data
        self.grad = .0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
    
    def __repr__(self) -> str:
        return f"Value(data={self.value}, size={sys.getsizeof(self)}Bytes)"
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # to make a+1.0 work
        out = Value(self.value + other.value, (self, other), "+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out 

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.value * other.value, (self, other), "*")
        def _backward():
            self.grad += other.value * out.grad
            other.grad += self.value * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        return other * self
    
    def __radd__(self, other):
        return self + other
    
    def tanh(self):
        x = self.value
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), "tanh")
        
        def _backward():
            self.grad = (1 - t**2) * out.grad

        out._backward = _backward
        return out
    
    def exp(self):
        x = self.value
        out = Value(math.exp(x), (self,))
        def _backward():
            self.grad += out.value * out.grad
        out._backward = _backward

        return out
    
    def __truediv__(self, other):
        return self * other**(-1)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supports int/float powers"
        out = Value(self.value**other, (self,))

        def _backward():
            self.grad += other * self.value**(other-1) * out.grad
        out._backward = _backward
        return out
    
    def __sub__(self, other):
        return self + (-other)
    
    def __neg__(self):
        return self * -1
    
    def backward(self):
        # build the topological graph to order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

a = Value(32)
b = Value(2)
print(a, b)
print(a+b)
print(a*b)
d = a*b + a
print(d)

d.backward()

print(d.grad, a.grad, b.grad)

Value(data=32, size=56Bytes) Value(data=2, size=56Bytes)
Value(data=34, size=56Bytes)
Value(data=64, size=56Bytes)
Value(data=96, size=56Bytes)
1.0 3.0 32.0


In [563]:
print(d._prev, d._op)

{Value(data=32, size=56Bytes), Value(data=64, size=56Bytes)} +


In [564]:
# Bug, should be 2 instead of 1, in the function _backward of addition,
#  we are overriding the gradient value (1), when self is the same object as other
#  when we use a variable more then ones
#  we should accumulate those gradients, multivariate case of chain rule
s = a + a
s.backward()
print(s.grad, a.grad)
# Expected 2, got 1 for s.grad

1.0 5.0


In [565]:
# Bug fixed
a = Value(32)
s = a + a
s.backward()
print(s.grad, a.grad)

1.0 2.0


To run back propagation, we need to store all the nodes in a topological order, then run the function in reverse order.

In [566]:
a = Value(2.0)
b = Value(4.0)
print(a/b)
a-b

Value(data=0.5, size=56Bytes)


Value(data=-2.0, size=56Bytes)

#### From scalars, gradients, neurons, and backpropagation To a MLP

In [567]:
class Neuron:
    def __init__(self, n) -> None:
        self.w = [Value(np.random.randn()) for _ in range(n)]
        self.b = Value(np.random.randn())
        
    def parameters(self):
        return self.w + [self.b]

    def __call__(self, x):
        a = sum((wi+xi for wi, xi in zip(self.w, x)), self.b)
        return a.tanh()
    
    
x = [2.0, 3.0]
n = Neuron(2)
n(x) # python will call n.__call__(x)

Value(data=0.9897511152845961, size=56Bytes)

In [568]:
class Layer:
    def __init__(self, nin, nout) -> None:
        self.nin = nin
        self.nout = nout
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
    
    def forward(self, x):
        return [neuron(x) for neuron in self.neurons]
    
    def __call__(self, x):
        return self.forward(x)[0] if self.nout == 1 else self.forward(x)
    
l = Layer(2, 3)
l(x)

[Value(data=0.999997338164193, size=56Bytes),
 Value(data=0.9999530116258976, size=56Bytes),
 Value(data=0.9998719950881702, size=56Bytes)]

In [569]:
class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [570]:
input = [2.0, 3.0]
mlp = MLP(2, [3, 4, 1]) 
mlp(input)

Value(data=0.9145138093877023, size=56Bytes)

In [571]:
# example
xs = [
    [2.0, 3.0, 1.0, 15.0],
    [1.0, 2.0, 3.0, 4.0],
    [3.0, 2.0, 1.0, 0.0]
]
target = [1.0, 0.0, 1.0]
pred = [mlp(xs[i]) for i in range(3)]
pred

[Value(data=0.9145138093877023, size=56Bytes),
 Value(data=0.8566918735805565, size=56Bytes),
 Value(data=0.9145138093877023, size=56Bytes)]

let's define a loss to mesure the error of the model, we will use the mean squared error.

In [572]:
def mse(pred, target):
    return sum((p-t)**2 for p, t in zip(pred, target))

loss = mse(pred, target)
loss

Value(data=0.7485367438297685, size=56Bytes)

In [573]:
loss.backward()

In [574]:
# check the gradients
mlp.layers[0].neurons[1].w[1].grad

0.00025144986133308965

In [575]:
# weights and biases of all the network
mlp.parameters()

[Value(data=-0.5794855995877302, size=56Bytes),
 Value(data=0.33667057157449, size=56Bytes),
 Value(data=-1.7907542989818883, size=56Bytes),
 Value(data=1.4763437050314083, size=56Bytes),
 Value(data=-0.7884670309878907, size=56Bytes),
 Value(data=0.7861819938845216, size=56Bytes),
 Value(data=-1.0336710139717482, size=56Bytes),
 Value(data=1.4241566140922832, size=56Bytes),
 Value(data=-0.23333981591318337, size=56Bytes),
 Value(data=0.8016614973491538, size=56Bytes),
 Value(data=0.49305397053304234, size=56Bytes),
 Value(data=1.5241419952773227, size=56Bytes),
 Value(data=-0.30559916173037777, size=56Bytes),
 Value(data=0.06643229635308645, size=56Bytes),
 Value(data=-1.3821969756051278, size=56Bytes),
 Value(data=-0.9402984078802232, size=56Bytes),
 Value(data=-2.477302392789485, size=56Bytes),
 Value(data=0.032850974260755256, size=56Bytes),
 Value(data=-0.24643604232438562, size=56Bytes),
 Value(data=1.6388662445323356, size=56Bytes),
 Value(data=-0.41827466568077204, size=56Bytes

In [576]:
mlp.layers[0].neurons[1].w[1].value

-0.7884670309878907

In [577]:
lr = 0.01
for p in mlp.parameters():
    p.value -= lr * p.grad

# check the gradients
mlp.layers[0].neurons[1].w[1].value

-0.788469545486504

In [578]:
# forward pass, we expect the loss to decrease
pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

Value(data=0.7325330459306472, size=56Bytes)

The loss decreased. Yaaay!

Let's train it.

In [579]:
from tqdm import tqdm

In [580]:
epochs = 100
lr = 0.01
for i in tqdm(range(epochs)):
    pred = [mlp(xs[i]) for i in range(3)]
    loss = mse(pred, target)
    loss.backward()
    for p in mlp.parameters():
        p.value -= lr * p.grad
    loss = mse(pred, target)
    print(f"Epoch: {i} -> Loss: {loss.value}")

pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

100%|██████████| 100/100 [00:00<00:00, 2342.68it/s]

Epoch: 0 -> Loss: 0.7325330459306472
Epoch: 1 -> Loss: 0.6979110704977577
Epoch: 2 -> Loss: 0.6400649335021132
Epoch: 3 -> Loss: 0.5566683828835824
Epoch: 4 -> Loss: 0.4646470756745054
Epoch: 5 -> Loss: 0.43291084595840634
Epoch: 6 -> Loss: 0.5428250453286493
Epoch: 7 -> Loss: 0.6797703315599455
Epoch: 8 -> Loss: 0.6129126201656261
Epoch: 9 -> Loss: 0.40776361660678984
Epoch: 10 -> Loss: 0.29817863502867387
Epoch: 11 -> Loss: 0.3130730187356399
Epoch: 12 -> Loss: 0.36119040189273593
Epoch: 13 -> Loss: 0.38643189811000983
Epoch: 14 -> Loss: 0.3723274489097933
Epoch: 15 -> Loss: 0.31910053907413105
Epoch: 16 -> Loss: 0.23654395678629767
Epoch: 17 -> Loss: 0.14460882493119648
Epoch: 18 -> Loss: 0.07216042802003171
Epoch: 19 -> Loss: 0.04864357789468318
Epoch: 20 -> Loss: 0.08769353975357973
Epoch: 21 -> Loss: 0.17296625063200427
Epoch: 22 -> Loss: 0.25497680113704724
Epoch: 23 -> Loss: 0.26884124846816077
Epoch: 24 -> Loss: 0.19667809519030777
Epoch: 25 -> Loss: 0.1093393329231052
Epoch: 




Value(data=0.012765924946486328, size=56Bytes)

Wooooooow! It's working! Loss decreased by a lot!

In [581]:
pred

[Value(data=0.9201407818053836, size=56Bytes),
 Value(data=0.0033068845119885054, size=56Bytes),
 Value(data=0.9201407818053836, size=56Bytes)]

It's not perfect, but it's a good start.