In [1111]:
import math
import numpy as np

import matplotlib.pyplot as plt
import sys

In [1112]:
class Value:
    def __init__(self, data, _children=(), _op='') -> None:
        self.value: int = data
        self.grad = .0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
    
    def __repr__(self) -> str:
        return f"Value(data={self.value}, size={sys.getsizeof(self)}Bytes)"
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # to make a+1.0 work
        out = Value(self.value + other.value, (self, other), "+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out 

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.value * other.value, (self, other), "*")
        def _backward():
            self.grad += other.value * out.grad
            other.grad += self.value * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        return other * self
    
    def __radd__(self, other):
        return self + other
    
    def tanh(self):
        x = self.value
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), "tanh")
        
        def _backward():
            self.grad = (1 - t**2) * out.grad

        out._backward = _backward
        return out
    
    def exp(self):
        x = self.value
        out = Value(math.exp(x), (self,))
        def _backward():
            self.grad += out.value * out.grad
        out._backward = _backward

        return out
    
    def __truediv__(self, other):
        return self * other**(-1)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supports int/float powers"
        out = Value(self.value**other, (self,))

        def _backward():
            self.grad += other * self.value**(other-1) * out.grad
        out._backward = _backward
        return out
    
    def __sub__(self, other):
        return self + (-other)
    
    def __neg__(self):
        return self * -1
    
    def backward(self):
        # build the topological graph to order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

a = Value(32)
b = Value(2)
print(a, b)
print(a+b)
print(a*b)
d = a*b + a
print(d)

d.backward()

print(d.grad, a.grad, b.grad)

Value(data=32, size=56Bytes) Value(data=2, size=56Bytes)
Value(data=34, size=56Bytes)
Value(data=64, size=56Bytes)
Value(data=96, size=56Bytes)
1.0 3.0 32.0


In [1113]:
print(d._prev, d._op)

{Value(data=64, size=56Bytes), Value(data=32, size=56Bytes)} +


In [1114]:
# Bug, should be 2 instead of 1, in the function _backward of addition,
#  we are overriding the gradient value (1), when self is the same object as other
#  when we use a variable more then ones
#  we should accumulate those gradients, multivariate case of chain rule
s = a + a
s.backward()
print(s.grad, a.grad)
# Expected 2, got 1 for s.grad

1.0 5.0


In [1115]:
# Bug fixed
a = Value(32)
s = a + a
s.backward()
print(s.grad, a.grad)

1.0 2.0


To run back propagation, we need to store all the nodes in a topological order, then run the function in reverse order.

In [1116]:
a = Value(2.0)
b = Value(4.0)
print(a/b)
a-b

Value(data=0.5, size=56Bytes)


Value(data=-2.0, size=56Bytes)

#### From scalars, gradients, neurons, and backpropagation To a MLP

In [1117]:
class Neuron:
    def __init__(self, n) -> None:
        self.w = [Value(np.random.randn()) for _ in range(n)]
        self.b = Value(np.random.randn())
        
    def parameters(self):
        return self.w + [self.b]

    def __call__(self, x):
        a = sum((wi+xi for wi, xi in zip(self.w, x)), self.b)
        return a.tanh()
    
    
x = [2.0, 3.0]
n = Neuron(2)
n(x) # python will call n.__call__(x)

Value(data=0.9999996303486614, size=56Bytes)

In [1118]:
class Layer:
    def __init__(self, nin, nout) -> None:
        self.nin = nin
        self.nout = nout
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
    
    def forward(self, x):
        return [neuron(x) for neuron in self.neurons]
    
    def __call__(self, x):
        return self.forward(x)[0] if self.nout == 1 else self.forward(x)
    
l = Layer(2, 3)
l(x)

[Value(data=0.994048406796414, size=56Bytes),
 Value(data=0.9999721590140617, size=56Bytes),
 Value(data=0.9992224787031223, size=56Bytes)]

In [1119]:
class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [1120]:
input = [2.0, 3.0]
mlp = MLP(2, [3, 4, 1]) 
mlp(input)

Value(data=0.9941342908150541, size=56Bytes)

In [1121]:
# example
xs = [
    [2.0, 3.0, 1.0, 15.0],
    [1.0, 2.0, 3.0, 4.0],
    [3.0, 2.0, 1.0, 0.0]
]
target = [1.0, 0.0, 1.0]
pred = [mlp(xs[i]) for i in range(3)]
pred

[Value(data=0.9941342908150541, size=56Bytes),
 Value(data=0.994054980073154, size=56Bytes),
 Value(data=0.9941342908150541, size=56Bytes)]

let's define a loss to mesure the error of the model, we will use the mean squared error.

In [1122]:
def mse(pred, target):
    return sum((p-t)**2 for p, t in zip(pred, target))

loss = mse(pred, target)
loss

Value(data=0.9882141164967232, size=56Bytes)

In [1123]:
loss.backward()

In [1124]:
# check the gradients
mlp.layers[0].neurons[1].w[1].grad

6.669147383293113e-06

In [1125]:
# weights and biases of all the network
mlp.parameters()

[Value(data=-0.6529901412410457, size=56Bytes),
 Value(data=0.5297575647074153, size=56Bytes),
 Value(data=-0.5004728551461568, size=56Bytes),
 Value(data=-0.5075937218740927, size=56Bytes),
 Value(data=1.3501966815674855, size=56Bytes),
 Value(data=-1.1634127460098471, size=56Bytes),
 Value(data=0.7794399527689354, size=56Bytes),
 Value(data=-1.9021398592335295, size=56Bytes),
 Value(data=-1.9863336468039758, size=56Bytes),
 Value(data=1.422881878722518, size=56Bytes),
 Value(data=0.35895657270957204, size=56Bytes),
 Value(data=-0.7954659800749525, size=56Bytes),
 Value(data=0.9306981349597084, size=56Bytes),
 Value(data=0.6853410324777836, size=56Bytes),
 Value(data=0.645352813977939, size=56Bytes),
 Value(data=0.4229889886454633, size=56Bytes),
 Value(data=0.7830763129461916, size=56Bytes),
 Value(data=-0.6303842913269332, size=56Bytes),
 Value(data=-0.2760073839987229, size=56Bytes),
 Value(data=0.7735478094197789, size=56Bytes),
 Value(data=1.6995433778260285, size=56Bytes),
 Valu

In [1126]:
mlp.layers[0].neurons[1].w[1].value

1.3501966815674855

In [1127]:
lr = 0.01
for p in mlp.parameters():
    p.value -= lr * p.grad

# check the gradients
mlp.layers[0].neurons[1].w[1].value

1.3501966148760116

In [1128]:
# forward pass, we expect the loss to decrease
pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

Value(data=0.988186949329495, size=56Bytes)

The loss decreased. Yaaay!

Let's train it.

In [1129]:
from tqdm import tqdm

In [1130]:
epochs = 100
lr = 0.01
for i in tqdm(range(epochs)):
    pred = [mlp(xs[i]) for i in range(3)]
    loss = mse(pred, target)
    for p in mlp.parameters():
        p.grad = .0

    loss.backward()
    for p in mlp.parameters():
        p.value -= lr * p.grad
    loss = mse(pred, target)

    # lr
    # if i % 100 == 0:
    #     lr *= 0.9 # decay the learning rate
    # this is a simple problem no need to decay lr

    print(f"Epoch: {i} -> Loss: {loss.value}")

pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0 -> Loss: 0.988186949329495
Epoch: 1 -> Loss: 0.9881596582246531
Epoch: 2 -> Loss: 0.9881322423396407
Epoch: 3 -> Loss: 0.9881047008242994
Epoch: 4 -> Loss: 0.9880770328207834
Epoch: 5 -> Loss: 0.9880492374634737
Epoch: 6 -> Loss: 0.9880213138788891
Epoch: 7 -> Loss: 0.9879932611855988
Epoch: 8 -> Loss: 0.9879650784941312
Epoch: 9 -> Loss: 0.9879367649068829
Epoch: 10 -> Loss: 0.9879083195180259
Epoch: 11 -> Loss: 0.987879741413414
Epoch: 12 -> Loss: 0.9878510296704881
Epoch: 13 -> Loss: 0.9878221833581782
Epoch: 14 -> Loss: 0.9877932015368078
Epoch: 15 -> Loss: 0.9877640832579927
Epoch: 16 -> Loss: 0.9877348275645422
Epoch: 17 -> Loss: 0.9877054334903559
Epoch: 18 -> Loss: 0.9876759000603218
Epoch: 19 -> Loss: 0.9876462262902106
Epoch: 20 -> Loss: 0.9876164111865694
Epoch: 21 -> Loss: 0.9875864537466148
Epoch: 22 -> Loss: 0.9875563529581237
Epoch: 23 -> Loss: 0.9875261077993214
Epoch: 24 -> Loss: 0.9874957172387722
Epoch: 25 -> Loss: 0.9874651802352619
Epoch: 26 -> Loss: 0.987

100%|██████████| 100/100 [00:00<00:00, 1924.18it/s]


Value(data=0.9846642851194423, size=56Bytes)

Wooooooow! It's working! Loss decreased by a lot!

In [1131]:
pred

[Value(data=0.9923472916551775, size=56Bytes),
 Value(data=0.9922434969448882, size=56Bytes),
 Value(data=0.9923472916551775, size=56Bytes)]

It's not perfect, but it's a good start.

Do not forget zero grad after each epoch, because the gradients are accumulated.