In [1217]:
import math
import numpy as np
import matplotlib.pyplot as plt

In [1218]:
class Value:
    def __init__(self, data, _children=(), _op='') -> None:
        self.value: int = data
        self.grad = .0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
    
    def __repr__(self) -> str:
        return f"Value(data={self.value}"
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # to make a+1.0 work
        out = Value(self.value + other.value, (self, other), "+")

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out 

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.value * other.value, (self, other), "*")
        def _backward():
            self.grad += other.value * out.grad
            other.grad += self.value * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        return other * self
    
    def __radd__(self, other):
        return self + other
    
    def tanh(self):
        x = self.value
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), "tanh")
        
        def _backward():
            self.grad = (1 - t**2) * out.grad

        out._backward = _backward
        return out
    
    def exp(self):
        x = self.value
        out = Value(math.exp(x), (self,))
        def _backward():
            self.grad += out.value * out.grad
        out._backward = _backward

        return out
    
    def __truediv__(self, other):
        return self * other**(-1)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supports int/float powers"
        out = Value(self.value**other, (self,))

        def _backward():
            self.grad += other * self.value**(other-1) * out.grad
        out._backward = _backward
        return out
    
    def __sub__(self, other):
        return self + (-other)
    
    def __neg__(self):
        return self * -1
    
    def backward(self):
        # build the topological graph to order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

a = Value(32)
b = Value(2)
print(a, b)
print(a+b)
print(a*b)
d = a*b + a
print(d)

d.backward()

print(d.grad, a.grad, b.grad)

Value(data=32 Value(data=2
Value(data=34
Value(data=64
Value(data=96
1.0 3.0 32.0


In [1219]:
print(d._prev, d._op)

{Value(data=32, Value(data=64} +


In [1220]:
# Bug, should be 2 instead of 1, in the function _backward of addition,
#  we are overriding the gradient value (1), when self is the same object as other
#  when we use a variable more then ones
#  we should accumulate those gradients, multivariate case of chain rule
s = a + a
s.backward()
print(s.grad, a.grad)
# Expected 2, got 1 for s.grad

1.0 5.0


In [1221]:
# Bug fixed
a = Value(32)
s = a + a
s.backward()
print(s.grad, a.grad)

1.0 2.0


To run back propagation, we need to store all the nodes in a topological order, then run the function in reverse order.

In [1222]:
a = Value(2.0)
b = Value(4.0)
print(a/b)
a-b

Value(data=0.5


Value(data=-2.0

#### From scalars, gradients, neurons, and backpropagation To a MLP

In [1223]:
class Neuron:
    def __init__(self, n) -> None:
        self.w = [Value(np.random.randn()) for _ in range(n)]
        self.b = Value(np.random.randn())
        
    def parameters(self):
        return self.w + [self.b]

    def __call__(self, x):
        a = sum((wi+xi for wi, xi in zip(self.w, x)), self.b)
        return a.tanh()
    
    
x = [2.0, 3.0]
n = Neuron(2)
n(x) # python will call n.__call__(x)

Value(data=0.9990327218788304

In [1224]:
class Layer:
    def __init__(self, nin, nout) -> None:
        self.nin = nin
        self.nout = nout
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
    
    def forward(self, x):
        return [neuron(x) for neuron in self.neurons]
    
    def __call__(self, x):
        return self.forward(x)[0] if self.nout == 1 else self.forward(x)
    
l = Layer(2, 3)
l(x)

[Value(data=0.999596205659595,
 Value(data=0.9998433421296711,
 Value(data=0.991058280631276]

In [1225]:
class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [1226]:
input = [2.0, 3.0]
mlp = MLP(2, [3, 4, 1]) 
mlp(input)

Value(data=0.33283583596315486

In [1227]:
# example
xs = [
    [2.0, 3.0, 1.0, 15.0],
    [1.0, 2.0, 3.0, 4.0],
    [3.0, 2.0, 1.0, 0.0]
]
target = [1.0, 0.0, 1.0]
pred = [mlp(xs[i]) for i in range(3)]
pred

[Value(data=0.33283583596315486,
 Value(data=0.3283814997418692,
 Value(data=0.33283583596315486]

let's define a loss to mesure the error of the model, we will use the mean squared error.

In [1228]:
def mse(pred, target):
    return sum((p-t)**2 for p, t in zip(pred, target))

loss = mse(pred, target)
loss

Value(data=0.9980504529226839

In [1229]:
loss.backward()

In [1230]:
# check the gradients
mlp.layers[0].neurons[1].w[1].grad

0.006039528886460024

In [1231]:
# weights and biases of all the network
mlp.parameters()

[Value(data=0.0631581074830363,
 Value(data=1.2126200159077103,
 Value(data=0.5648248302090649,
 Value(data=0.07480969899065093,
 Value(data=-1.9751401083716467,
 Value(data=0.048336867329492744,
 Value(data=-0.12159686498545282,
 Value(data=0.4334831358751545,
 Value(data=1.0866112012114235,
 Value(data=0.7194646795587307,
 Value(data=0.8147173504410563,
 Value(data=-1.1336829860843465,
 Value(data=0.025497882664008813,
 Value(data=0.031398784797302876,
 Value(data=-0.7681163870856056,
 Value(data=-0.4916257897700847,
 Value(data=0.9443188708234825,
 Value(data=-0.4283991953581445,
 Value(data=-1.0158648100832557,
 Value(data=2.788910898271474,
 Value(data=0.5563309101972289,
 Value(data=1.0277910938614678,
 Value(data=-1.1566885349403597,
 Value(data=0.15453851050353687,
 Value(data=1.0241695195532723,
 Value(data=-1.7355261560225108,
 Value(data=0.8893762589785813,
 Value(data=-1.3011355519653087,
 Value(data=-0.31824260876749255,
 Value(data=-1.1768315220155177]

In [1232]:
mlp.layers[0].neurons[1].w[1].value

-1.9751401083716467

In [1233]:
lr = 0.01
for p in mlp.parameters():
    p.value -= lr * p.grad

# check the gradients
mlp.layers[0].neurons[1].w[1].value

-1.9752005036605114

In [1234]:
# forward pass, we expect the loss to decrease
pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

Value(data=0.8611903741909768

The loss decreased. Yaaay!

Let's train it.

In [1311]:
epochs = 100
lr = 0.01
for i in tqdm(range(epochs)):
    pred = [mlp(xs[i]) for i in range(3)]
    loss = mse(pred, target)
    for p in mlp.parameters():
        p.grad = .0

    loss.backward()
    for p in mlp.parameters():
        p.value -= lr * p.grad
    loss = mse(pred, target)

    # lr
    # if i % 100 == 0:
    #     lr *= 0.9 # decay the learning rate
    # this is a simple problem no need to decay lr

    print(f"Epoch: {i} -> Loss: {loss.value}")

pred = [mlp(xs[i]) for i in range(3)]
loss = mse(pred, target)
loss

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0 -> Loss: 0.000750808907551976
Epoch: 1 -> Loss: 0.000750693739365945
Epoch: 2 -> Loss: 0.0007505786154831024
Epoch: 3 -> Loss: 0.0007504635358783628
Epoch: 4 -> Loss: 0.0007503485005266475
Epoch: 5 -> Loss: 0.0007502335094029262
Epoch: 6 -> Loss: 0.0007501185624821552
Epoch: 7 -> Loss: 0.0007500036597393339
Epoch: 8 -> Loss: 0.0007498888011494621
Epoch: 9 -> Loss: 0.0007497739866875753
Epoch: 10 -> Loss: 0.0007496592163287161
Epoch: 11 -> Loss: 0.0007495444900479537
Epoch: 12 -> Loss: 0.0007494298078203573
Epoch: 13 -> Loss: 0.0007493151696210418
Epoch: 14 -> Loss: 0.0007492005754251255
Epoch: 15 -> Loss: 0.0007490860252077406
Epoch: 16 -> Loss: 0.0007489715189440483
Epoch: 17 -> Loss: 0.0007488570566092288
Epoch: 18 -> Loss: 0.0007487426381784638
Epoch: 19 -> Loss: 0.0007486282636269817
Epoch: 20 -> Loss: 0.0007485139329299998
Epoch: 21 -> Loss: 0.0007483996460627708
Epoch: 22 -> Loss: 0.0007482854030005731
Epoch: 23 -> Loss: 0.0007481712037186795
Epoch: 24 -> Loss: 0.0007480

100%|██████████| 100/100 [00:00<00:00, 2262.41it/s]

Epoch: 85 -> Loss: 0.0007411753338254846
Epoch: 86 -> Loss: 0.0007410638436721379
Epoch: 87 -> Loss: 0.0007409523957549446
Epoch: 88 -> Loss: 0.0007408409900503457
Epoch: 89 -> Loss: 0.0007407296265348111
Epoch: 90 -> Loss: 0.0007406183051848161
Epoch: 91 -> Loss: 0.000740507025976869
Epoch: 92 -> Loss: 0.0007403957888874795
Epoch: 93 -> Loss: 0.0007402845938931835
Epoch: 94 -> Loss: 0.0007401734409705333
Epoch: 95 -> Loss: 0.000740062330096094
Epoch: 96 -> Loss: 0.0007399512612464443
Epoch: 97 -> Loss: 0.0007398402343981988
Epoch: 98 -> Loss: 0.0007397292495279734
Epoch: 99 -> Loss: 0.0007396183066124045





Value(data=0.0007395074056281415

Wooooooow! It's working! Loss decreased by a lot!

In [1312]:
pred

[Value(data=0.9807835367697398,
 Value(data=0.0009810644567304848,
 Value(data=0.9807835367697398]

It's not perfect, but it's a good start.

Do not forget zero grad after each epoch, because the gradients are accumulated.