In [968]:
import math
import numpy as np

import matplotlib.pyplot as plt
import sys

In [969]:
class Value:
    def __init__(self, data, _children=(), _op='') -> None:
        self.value: int = data
        self.grad = .0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
    
    def __repr__(self) -> str:
        return f"Value(data={self.value}, size={sys.getsizeof(self)}Bytes)"
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # to make a+1.0 work
        out = Value(self.value + other.value, (self, other), "+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out 

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.value * other.value, (self, other), "*")
        def _backward():
            self.grad += other.value * out.grad
            other.grad += self.value * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        return other * self
    
    def __radd__(self, other):
        return self + other
    
    def tanh(self):
        x = self.value
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), "tanh")
        
        def _backward():
            self.grad = (1 - t**2) * out.grad

        out._backward = _backward
        return out
    
    def exp(self):
        x = self.value
        out = Value(math.exp(x), (self,))
        def _backward():
            self.grad += out.value * out.grad
        out._backward = _backward

        return out
    
    def __truediv__(self, other):
        return self * other**(-1)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supports int/float powers"
        out = Value(self.value**other, (self,))

        def _backward():
            self.grad += other * self.value**(other-1) * out.grad
        out._backward = _backward
        return out
    
    def __sub__(self, other):
        return self + (-other)
    
    def __neg__(self):
        return self * -1
    
    def backward(self):
        # build the topological graph to order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

a = Value(32)
b = Value(2)
print(a, b)
print(a+b)
print(a*b)
d = a*b + a
print(d)

d.backward()

print(d.grad, a.grad, b.grad)

Value(data=32, size=56Bytes) Value(data=2, size=56Bytes)
Value(data=34, size=56Bytes)
Value(data=64, size=56Bytes)
Value(data=96, size=56Bytes)
1.0 3.0 32.0


In [970]:
print(d._prev, d._op)

{Value(data=64, size=56Bytes), Value(data=32, size=56Bytes)} +


In [971]:
# Bug, should be 2 instead of 1, in the function _backward of addition,
#  we are overriding the gradient value (1), when self is the same object as other
#  when we use a variable more then ones
#  we should accumulate those gradients, multivariate case of chain rule
s = a + a
s.backward()
print(s.grad, a.grad)
# Expected 2, got 1 for s.grad

1.0 5.0


In [972]:
# Bug fixed
a = Value(32)
s = a + a
s.backward()
print(s.grad, a.grad)

1.0 2.0


To run back propagation, we need to store all the nodes in a topological order, then run the function in reverse order.

In [973]:
a = Value(2.0)
b = Value(4.0)
print(a/b)
a-b

Value(data=0.5, size=56Bytes)


Value(data=-2.0, size=56Bytes)

#### From scalars, gradients, neurons, and backpropagation To a MLP

In [974]:
class Neuron:
    def __init__(self, n) -> None:
        self.w = [Value(np.random.randn()) for _ in range(n)]
        self.b = Value(np.random.randn())
        
    def parameters(self):
        return self.w + [self.b]

    def __call__(self, x):
        a = sum((wi+xi for wi, xi in zip(self.w, x)), self.b)
        return a.tanh()
    
    
x = [2.0, 3.0]
n = Neuron(2)
n(x) # python will call n.__call__(x)

Value(data=0.9999834782734754, size=56Bytes)

In [975]:
class Layer:
    def __init__(self, nin, nout) -> None:
        self.nin = nin
        self.nout = nout
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
    
    def forward(self, x):
        return [neuron(x) for neuron in self.neurons]
    
    def __call__(self, x):
        return self.forward(x)[0] if self.nout == 1 else self.forward(x)
    
l = Layer(2, 3)
l(x)

[Value(data=0.9998321227272446, size=56Bytes),
 Value(data=0.9999970892341399, size=56Bytes),
 Value(data=0.9999997132102807, size=56Bytes)]

In [976]:
class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [977]:
input = [2.0, 3.0]
mlp = MLP(2, [3, 4, 1]) 
mlp(input)

Value(data=0.9717841984379162, size=56Bytes)

In [978]:
# example
xs = [
    [2.0, 3.0, 1.0, 15.0],
    [1.0, 2.0, 3.0, 4.0],
    [3.0, 2.0, 1.0, 0.0]
]
target = [1.0, 0.0, 1.0]
pred = [mlp(xs[i]) for i in range(3)]
pred

[Value(data=0.9717841984379162, size=56Bytes),
 Value(data=0.907588246803216, size=56Bytes),
 Value(data=0.9717841984379162, size=56Bytes)]

let's define a loss to mesure the error of the model, we will use the mean squared error.

In [979]:
def mse(pred, target):
    return sum((p-t)**2 for p, t in zip(pred, target))

loss = mse(pred, target)
loss

Value(data=0.8253086886509172, size=56Bytes)

In [980]:
loss.backward()

In [981]:
# check the gradients
mlp.layers[0].neurons[1].w[1].grad

0.05309032212159961

In [982]:
# weights and biases of all the network
mlp.parameters()

[Value(data=-2.012293840208911, size=56Bytes),
 Value(data=0.03806197044696347, size=56Bytes),
 Value(data=-0.451750955485669, size=56Bytes),
 Value(data=-0.5000302410899579, size=56Bytes),
 Value(data=-0.07118289509848812, size=56Bytes),
 Value(data=-0.7749125144913039, size=56Bytes),
 Value(data=1.8486488501662062, size=56Bytes),
 Value(data=0.7882446436816126, size=56Bytes),
 Value(data=0.9763373923984746, size=56Bytes),
 Value(data=0.7060136155333891, size=56Bytes),
 Value(data=1.2299360902008314, size=56Bytes),
 Value(data=-1.1199945670693077, size=56Bytes),
 Value(data=0.8398545059033957, size=56Bytes),
 Value(data=-0.5706141864098181, size=56Bytes),
 Value(data=-1.6878816028244596, size=56Bytes),
 Value(data=-0.9573114894061031, size=56Bytes),
 Value(data=0.5846133861466069, size=56Bytes),
 Value(data=-1.7019930085764075, size=56Bytes),
 Value(data=1.2285268781185719, size=56Bytes),
 Value(data=0.34701101078249597, size=56Bytes),
 Value(data=1.7453417621342378, size=56Bytes),
 V

In [983]:
mlp.layers[0].neurons[1].w[1].value

-0.07118289509848812

In [984]:
lr = 0.01
for p in mlp.parameters():
    p.value -= lr * p.grad

# check the gradients
mlp.layers[0].neurons[1].w[1].value

-0.0717137983197041

In [985]:
# forward pass, we expect the loss to decrease
pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

Value(data=0.8135930674366098, size=56Bytes)

The loss decreased. Yaaay!

Let's train it.

In [986]:
from tqdm import tqdm

In [987]:
epochs = 100
lr = 0.01
for i in tqdm(range(epochs)):
    pred = [mlp(xs[i]) for i in range(3)]
    loss = mse(pred, target)
    for p in mlp.parameters():
        p.grad = .0

    loss.backward()
    for p in mlp.parameters():
        p.value -= lr * p.grad
    loss = mse(pred, target)

    # lr
    if i % 100 == 0:
        lr *= 0.8

    print(f"Epoch: {i} -> Loss: {loss.value}")

pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0 -> Loss: 0.8135930674366098
Epoch: 1 -> Loss: 0.8004576068829902
Epoch: 2 -> Loss: 0.7887047518081861
Epoch: 3 -> Loss: 0.7757506672016046
Epoch: 4 -> Loss: 0.7614428013181432
Epoch: 5 -> Loss: 0.7456104823329142
Epoch: 6 -> Loss: 0.7280649734208762
Epoch: 7 -> Loss: 0.7086010993840819
Epoch: 8 -> Loss: 0.687001405818821
Epoch: 9 -> Loss: 0.6630441865526226
Epoch: 10 -> Loss: 0.6365171205308876
Epoch: 11 -> Loss: 0.6072385749142097
Epoch: 12 -> Loss: 0.5750886002720758
Epoch: 13 -> Loss: 0.540050814569879
Epoch: 14 -> Loss: 0.5022640964078682
Epoch: 15 -> Loss: 0.4620786263675388
Epoch: 16 -> Loss: 0.42010422727073643
Epoch: 17 -> Loss: 0.37723164480314153
Epoch: 18 -> Loss: 0.3346034834795187
Epoch: 19 -> Loss: 0.29351725694561065
Epoch: 20 -> Loss: 0.25526305033628155
Epoch: 21 -> Loss: 0.22092876114436624
Epoch: 22 -> Loss: 0.1912307946546154
Epoch: 23 -> Loss: 0.1664275013111327
Epoch: 24 -> Loss: 0.14634078078009635
Epoch: 25 -> Loss: 0.13046619852478147
Epoch: 26 -> Loss

100%|██████████| 100/100 [00:00<00:00, 466.33it/s]


Epoch: 33 -> Loss: 0.08078440087020584
Epoch: 34 -> Loss: 0.07824289959277722
Epoch: 35 -> Loss: 0.07592515651328174
Epoch: 36 -> Loss: 0.07377847640226096
Epoch: 37 -> Loss: 0.07176720368291065
Epoch: 38 -> Loss: 0.06986705885195496
Epoch: 39 -> Loss: 0.06806128923313325
Epoch: 40 -> Loss: 0.06633809606915808
Epoch: 41 -> Loss: 0.06468893552430495
Epoch: 42 -> Loss: 0.0631074065356759
Epoch: 43 -> Loss: 0.06158852719869091
Epoch: 44 -> Loss: 0.06012826573132915
Epoch: 45 -> Loss: 0.05872323698837785
Epoch: 46 -> Loss: 0.057370506051065334
Epoch: 47 -> Loss: 0.05606746081618458
Epoch: 48 -> Loss: 0.054811728946583545
Epoch: 49 -> Loss: 0.0536011233110112
Epoch: 50 -> Loss: 0.052433605719298844
Epoch: 51 -> Loss: 0.05130726241780678
Epoch: 52 -> Loss: 0.05022028715961525
Epoch: 53 -> Loss: 0.04917096916919771
Epoch: 54 -> Loss: 0.0481576842844845
Epoch: 55 -> Loss: 0.047178888175291214
Epoch: 56 -> Loss: 0.04623311093124581
Epoch: 57 -> Loss: 0.04531895256477804
Epoch: 58 -> Loss: 0.04




Value(data=0.023552751132743188, size=56Bytes)

Wooooooow! It's working! Loss decreased by a lot!

In [988]:
pred

[Value(data=0.8929492495800423, size=56Bytes),
 Value(data=0.02515998413733679, size=56Bytes),
 Value(data=0.8929492495800423, size=56Bytes)]

It's not perfect, but it's a good start.