# Introduction to Backpropagation and Neural Networks with AutoGrad
Following the video and code of Andrej Karpathy
https://www.youtube.com/watch?v=VMj-3S1tku0&t=1072s&pp=ugMICgJlcxABGAHKBRJtaWNyb2dyYWQga2FycGF0aHk%3D

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lets start by defining a function

In [None]:
def f(x):
    """Evaluate the quadratic 3*x**2 - 4*x + 5 at x (a number or a NumPy array)."""
    quadratic_term = 3 * x**2
    linear_term = 4 * x
    return quadratic_term - linear_term + 5

We can, of course, evaluate the function

In [None]:
f(3.0)

Lets plot this

In [None]:
# Sample x from -5 (inclusive) to 5 (exclusive) in steps of 0.25,
# evaluate f element-wise on the whole array, and plot the curve.
xs = np.arange(-5, 5, 0.25)
ys = f(xs)
plt.plot(xs, ys)
plt.show()

What is the derivative measuring?

df/dx = lim h->0 (f(x+h) - f(x)) / h

In [None]:
# Forward-difference estimate of the slope of f at x = 3, using step h
h = 0.01
x = 3.0
(f(x + h) - f(x)) / h

In [None]:
h = 0.001
(f(x + h) - f(x)) / h

In [None]:
h = 0.0001
(f(x + h) - f(x)) / h

Solving analytically we find dy/dx = 6*x-4

In [None]:
6 * x - 4

The derivative at a point is the slope, i.e. the instantaneous rate of change of the function as its argument increases:
- since the derivative is positive, the function is increasing at that point
- for a small increment in the argument, the function increases by approximately 14 times that increment

Lets try a different value

In [None]:
x = 1
h = 0.0001
(f(x + h) - f(x)) / h

Now the function is also increasing, but now slower

Lets check another value

In [None]:
x = -2
h = 0.0001
(f(x + h) - f(x)) / h

Note that at x = -2 the function is decreasing, and faster (the slope is about -16).

Lets see another point

In [None]:
x = 2/3
h = 0.0001
(f(x + h) - f(x)) / h

It is very close to 0, so the function is neither increasing nor decreasing at that point

A more complex example

In [None]:
# A function of three inputs: f = a*b + c
# NOTE: this rebinds the name `f`, which until now referred to the function f(x)
# defined at the top of the notebook; re-running earlier cells after this one
# requires re-running the cell that defines f(x) first.
a = 2.0
b = -3.0
c = 10.0
f = a*b+c
f

Let's see how the function changes with respect to each of the parameters

In [None]:
# Forward-difference estimate of the partial derivative of a*b + c with respect to a:
# bump only a by h and divide the change in the result by h
h = 0.01
df_da = (((a+h)*b + c) - (a*b + c)) / h
df_da

This -3 means that the function value decreases proportional to 3 times the increment in 'a'. 

Lets see some other increments

In [None]:
# Same forward-difference estimate, bumping only b and only c respectively
df_db = ((a*(b+h) + c) - (a*b + c)) / h
df_dc = ((a*b+c+h) - (a*b+c)) / h

df_da, df_db, df_dc

You can check the values are very close to the analytical "partial derivatives"
How can I modify the parameters a, b, and c if I want to increase the value of f?

In [None]:
# Move every input in the direction given by the sign of its partial derivative
# (a down because df/da < 0; b and c up) and show the value before and after
h = 0.01
a*b+c, (a-h)*(b+h)+(c+h)

In [None]:
# Same comparison with the sign of h flipped, i.e. moving every input the opposite way
h = -0.01
a*b+c, (a-h)*(b+h)+(c+h)

# Gradient descent

Lets create a class for storing values, adding operations and parameters of each node

In [None]:
class Value:
    """A number that remembers which operands and which operation produced it."""

    def __init__(self, data, _children=(), _op=''):
        self.data = data                # the wrapped number
        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # symbol of the producing operation ('' for inputs)

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return a new Value for self + other, recording both operands."""
        total = self.data + other.data
        return Value(total, (self, other), '+')

    def __mul__(self, other):
        """Return a new Value for self * other, recording both operands."""
        product = self.data * other.data
        return Value(product, (self, other), '*')
    
a = Value(2)
b = Value(-3.0)
d = (a*b)
d, d._prev, d._op

In [None]:
# Now, let create a way to visualize the expression
from graphviz import Digraph

def trace(root):
    """Collect the expression graph that ends at root.

    Returns a pair (nodes, edges): the set of all nodes reachable from root
    through `_prev`, and the set of (child, parent) pairs linking them.
    """
    nodes, edges = set(), set()
    pending = [root]
    while pending:
        current = pending.pop()
        if current in nodes:
            continue  # already expanded via another path
        nodes.add(current)
        for child in current._prev:
            edges.add((child, current))
            pending.append(child)
    return nodes, edges

def draw_dot(root):
    """Build a left-to-right Graphviz digraph of the expression ending at root."""
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)

    for node in nodes:
        node_id = str(id(node))
        # every value is drawn as a rectangular ('record') box
        graph.node(name=node_id, label="{data: %s}" % (node.data), shape='record')
        if not node._op:
            continue
        # a value produced by an operation also gets an op node pointing at it
        op_id = node_id + node._op
        graph.node(name=op_id, label=node._op)
        graph.edge(op_id, node_id)

    for child, parent in edges:
        # operands are wired to the parent's op node rather than to the parent box
        graph.edge(str(id(child)), str(id(parent)) + parent._op)

    return graph

In [None]:
a = Value(2)
b = Value(-3.0)
c = Value(10)
d = a*b + c

draw_dot(d)

Lets add some labels to the nodes, in order to identify them with ease

In [None]:
class Value:
    """A number that records its operands, its producing operation and a display label."""

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.label = label              # human-readable name shown when drawing the graph
        self._op = _op                  # symbol of the producing operation ('' for inputs)
        self._prev = set(_children)     # operand nodes this value was computed from

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return a new (unlabelled) Value for self + other."""
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        """Return a new (unlabelled) Value for self * other."""
        return Value(self.data * other.data, (self, other), '*')
    

def draw_dot(root):
    """Build a left-to-right Graphviz digraph of the expression ending at root.

    Each value box shows the node's label and its data.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)

    for node in nodes:
        node_id = str(id(node))
        # every value is drawn as a rectangular ('record') box
        box_text = "{%s|data: %s}" % (node.label, node.data)
        graph.node(name=node_id, label=box_text, shape='record')
        if not node._op:
            continue
        # a value produced by an operation also gets an op node pointing at it
        op_id = node_id + node._op
        graph.node(name=op_id, label=node._op)
        graph.edge(op_id, node_id)

    for child, parent in edges:
        # operands are wired to the parent's op node rather than to the parent box
        graph.edge(str(id(child)), str(id(parent)) + parent._op)

    return graph

In [None]:
a = Value(2, label='a')
b = Value(-3.0, label='b')
c = Value(10, label='c')
e = a*b; e.label='e'
d = e + c; d.label='d'
f = Value(-2, label='f')
L = d*f; L.label='L'
draw_dot(L)

You can see that we have a mathematical expression linking L to four free parameters: a, b, c, and f. We are now going to run backpropagation, trying to increase the value of L by changing the values of the free parameters.
- For every single value we are going to calculate the derivative of L with respect to it, using the chain rule. With this, we will know how to change the values in order to increase L

In order to do so, we will add a property in Value to hold the derivative of L with respect to that value. We will name this property 'grad'

In [None]:
class Value:
    """A graph node holding a number, its provenance, a label and a gradient slot."""

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value
        self.label = label              # human-readable name shown when drawing the graph
        self._op = _op                  # symbol of the producing operation ('' for inputs)
        self._prev = set(_children)     # operand nodes this value was computed from

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return a new Value for self + other (its grad starts at 0.0)."""
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        """Return a new Value for self * other (its grad starts at 0.0)."""
        return Value(self.data * other.data, (self, other), '*')
    
def draw_dot(root):
    """Build a left-to-right Graphviz digraph of the expression ending at root.

    Each value box shows the node's label, its data and its grad (4 decimals).
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)

    for node in nodes:
        node_id = str(id(node))
        # every value is drawn as a rectangular ('record') box
        box_text = "{%s|data: %.4f|grad:%.4f}" % (node.label, node.data, node.grad)
        graph.node(name=node_id, label=box_text, shape='record')
        if not node._op:
            continue
        # a value produced by an operation also gets an op node pointing at it
        op_id = node_id + node._op
        graph.node(name=op_id, label=node._op)
        graph.edge(op_id, node_id)

    for child, parent in edges:
        # operands are wired to the parent's op node rather than to the parent box
        graph.edge(str(id(child)), str(id(parent)) + parent._op)

    return graph

a = Value(2, label='a')
b = Value(-3.0, label='b')
c = Value(10, label='c')
e = a*b; e.label='e'
d = e + c; d.label='d'
f = Value(-2, label='f')
L = d*f; L.label='L'
draw_dot(L)

We work from back to front, manually. We start with L

dL/dL = 1

In [None]:
L.grad = 1
draw_dot(L)

 Now, lets calculate the derivatives with respect to f and d. Since L = f*d:
 
 dL/df = d
 
 dL/dd = f
 
 Lets check by hand:

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/df for L = (a*b + c) * f."""
    h = 0.01

    def forward(f_data):
        # rebuild the whole expression from scratch and return L as a plain number
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(f_data, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(-2)
    L2 = forward(-2+h)  # HERE: only f is bumped

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/dd for L = (a*b + c) * f."""
    h = 0.01

    def forward(nudge):
        # rebuild the whole expression; when nudge is true, bump d after computing it
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        if nudge:
            d.data += h                 # HERE
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(False)
    L2 = forward(True)

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
# Since they are correct, we will set the values
# Since they are correct, we will set the values
# L = d*f, so dL/dd is the value of f and dL/df is the value of d
d.grad = f.data
f.grad = d.data

draw_dot(L)

Now, lets move to the previous values, e and c. We now need to calculate dL/de and dL/dc. From the chain rule we know that:

dL/de = dL/dd*dd/de
- We already calculated dL/dd, which is -2
- Since d = e+c, dd/de=1
- As a result **dL/de = -2 * 1 = -2**

In a similar way, **dL/dc = -2**

**Note**: The + node only passes the gradient without modification

Lets check numerically if it is correct:

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/de for L = (a*b + c) * f."""
    h = 0.01

    def forward(nudge):
        # rebuild the whole expression; when nudge is true, bump e before it is used
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        if nudge:
            e.data += h              # HERE
        d = e + c
        d.label = 'd'
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(False)
    L2 = forward(True)

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/dc for L = (a*b + c) * f."""
    h = 0.01

    def forward(c_data):
        # rebuild the whole expression from scratch and return L as a plain number
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(c_data, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(10)
    L2 = forward(10+h)  # HERE: only c is bumped

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
# After the check we know it is correct, so we update the .grad property
e.grad = -2
c.grad = -2

draw_dot(L)

Now, lets calculate the final one, the gradient for *a* and *b*, also using the chain rule.

- dL/da = dL/de * de/da
- dL/de is already known, -2
- Since e = a*b, then de/da = b

Finally, dL/da = -2 * -3 = 6

Similarly, dL/db = dL/de*de/db = -2 * a = -2 * 2 = -4

Lets check it numerically:

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/da for L = (a*b + c) * f."""
    h = 0.01

    def forward(a_data):
        # rebuild the whole expression from scratch and return L as a plain number
        a = Value(a_data, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(2)
    L2 = forward(2+h)  # HERE: only a is bumped

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
def grad_by_hand():
    """Print a forward-difference estimate of dL/db for L = (a*b + c) * f."""
    h = 0.01

    def forward(b_data):
        # rebuild the whole expression from scratch and return L as a plain number
        a = Value(2, label='a')
        b = Value(b_data, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(-3.0)
    L2 = forward(-3.0+h)  # HERE: only b is bumped

    print((L2 - L1) / h)

grad_by_hand()

In [None]:
# Since they are correct, we updated the .grad properties.
a.grad = 6
b.grad = -4

draw_dot(L)

**Note**: There are some parameters that we can change, like a, b, c, and f, while the others are calculated, so cannot be changed.

Now, let use the gradient in order to increase the value of L

In [None]:
ag, bg, cg, fg = a.grad, b.grad, c.grad, f.grad

In [None]:
def eval_by_hand(h = 0.01):
    """Print L before and after moving a, b, c and f by h times their gradients.

    Reads the module-level gradients ag, bg, cg and fg. Prints the original L,
    the new L and their difference.
    """
    def forward(a_data, b_data, c_data, f_data):
        # rebuild L = (a*b + c) * f from the given input values
        a = Value(a_data, label='a')
        b = Value(b_data, label='b')
        c = Value(c_data, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(f_data, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    L1 = forward(2, -3.0, 10, -2)
    # each input takes a step of size h along its own gradient
    L2 = forward(2 + ag * h, -3.0 + bg * h, 10 + cg * h, -2 + fg * h)

    print(L1, L2, L2-L1)

eval_by_hand(0.01)


In [None]:
eval_by_hand(-0.01)

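Note that with a positive step (moving each parameter along its gradient) L increases, while with a negative step it decreases. This is the backpropagation algorithm in action!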

Lets move to something more complex, like a neuron.

A neural network has:
- Neurons
    - Weights
    - Bias (neuron default activation in absence of inputs)
    - Activation function: 
        - Introduce nonlinearities, frequently squashing the neuron output
        
There are some common activation functions, like tanh and ReLU

In [None]:
import numpy as np 
import matplotlib.pyplot as plt

x = np.arange(-5, 5, 0.1)
plt.plot(x, np.tanh(x))

In [None]:
x = np.arange(-5, 5, 0.1)

plt.plot(x, np.where(x > 0, x, 0))

Let's create a system with one neuron and two inputs

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.7, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'

draw_dot(n)

Let's add the code for the activation function.
- Since our Value only supports sums and products, we need a new operation (node) for tanh

In [None]:
class Value:
    """A graph node supporting +, * and tanh, with a label and a gradient slot."""

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value
        self.label = label              # human-readable name shown when drawing the graph
        self._op = _op                  # name of the producing operation ('' for inputs)
        self._prev = set(_children)     # operand nodes this value was computed from

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return a new Value for self + other."""
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        """Return a new Value for self * other."""
        return Value(self.data * other.data, (self, other), '*')

    def tanh(self):
        """Return a new Value holding tanh(self), with self as its only operand."""
        squashed = np.tanh(self.data)
        return Value(squashed, (self,), 'tanh')

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.8812735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'
draw_dot(o)

Note that here the effect of tanh is small, but if we changed the bias to 10 we would see the squashing effect of tanh. Now, let's run backpropagation on the neuron.

**Note**. While training the neuron, the only parameters that we can change are the weights and biases, because the training examples are fixed.

First, since do/do=1, lets set that in the neuron.

In [None]:
o.grad = 1.0
draw_dot(o)

dtanh(x)/dx = 1 - tanh(x)^2

In [None]:
n.grad = 1 - (o.data)**2
draw_dot(o)

In [None]:
def verify():
    """Print a forward-difference estimate of do/dn for o = tanh(x1*w1 + x2*w2 + b)."""
    h = 0.001

    def forward(nudge):
        # rebuild the neuron; when nudge is true, bump n just before the tanh
        x1, x2 = Value(2.0), Value(0.0)
        w1, w2 = Value(-3.0), Value(1.0)
        b = Value(6.8812735870195432)
        x1w1 = x1*w1
        x2w2 = x2*w2
        n = (x1w1+x2w2)+b
        if nudge:
            n.data += h
        return n.tanh().data

    L1 = forward(False)
    L2 = forward(True)

    print((L2-L1)/h)
verify()


do/db = do/dn * dn/db = 0.5 * 1 = 0.5

do/d(x1w1+x2w2) = do/dn * dn/d(x1w1+x2w2) = 0.5 * 1 = 0.5

In [None]:
b.grad = 0.5
x1w1x2w2.grad = 0.5
draw_dot(o)

do/dx2w2 = do/d(x1w1+x2w2) * d(x1w1+x2w2)/dx2w2
    = 0.5 * 1 = 0.5
    
do/dx1w1 = do/d(x1w1+x2w2) * d(x1w1+x2w2)/dx1w1
    = 0.5 * 1 = 0.5

In [None]:
x2w2.grad = 0.5
x1w1.grad = 0.5
draw_dot(o)

In [None]:
def verify():
    """Print a forward-difference estimate of do/d(x1w1) for o = tanh(x1*w1 + x2*w2 + b)."""
    h = 0.001

    def forward(nudge):
        # rebuild the neuron; when nudge is true, bump the x1*w1 product before summing
        x1, x2 = Value(2.0), Value(0.0)
        w1, w2 = Value(-3.0), Value(1.0)
        b = Value(6.8812735870195432)
        x1w1 = x1*w1
        if nudge:
            x1w1.data += h
        x2w2 = x2*w2
        n = (x1w1+x2w2)+b
        return n.tanh().data

    L1 = forward(False)
    L2 = forward(True)

    print((L2-L1)/h)
verify()


do/dw1 = do/dx1w1 * dx1w1/dw1 = 0.5*x1 = 0.5*2 = 1

do/dx1 = do/dx1w1 * dx1w1/dx1 = 0.5*w1 = 0.5*-3 = -1.5

do/dw2 = do/dx2w2 * dx2w2/dw2 = 0.5*x2 = 0.5*0 = 0

do/dx2 = do/dx2w2 * dx2w2/dx2 = 0.5*w2 = 0.5*1 = 0.5

In [None]:
w1.grad = 1
x1.grad = -1.5
w2.grad = 0
x2.grad = 0.5
draw_dot(o)

Now, the parameters we can modify in order to increase the value of the function are the weights and biases.

Let's modify them to increase the value.

In [None]:
# Gradients of the neuron output with respect to the trainable parameters,
# as derived by hand in the cells above
grad_w1, grad_w2, grad_b = 1, 0, 0.5

def modify(h):
    """Move w1, w2 and b by h times their gradients and compare the neuron output.

    Returns a tuple (output before, output after, after - before).
    """
    def forward(w1_data, w2_data, b_data):
        # the inputs x1, x2 are fixed; only the weights and the bias vary
        x1, x2 = Value(2.0), Value(0.0)
        w1, w2 = Value(w1_data), Value(w2_data)
        b = Value(b_data)
        x1w1 = x1*w1
        x2w2 = x2*w2
        n = (x1w1+x2w2)+b
        return n.tanh().data

    L1 = forward(-3.0, 1.0, 6.8812735870195432)
    L2 = forward(-3.0 + h*grad_w1, 1.0+h*grad_w2, 6.8812735870195432 + h*grad_b)
    return L1, L2, L2-L1

print(modify(0.01))

In [None]:
print(modify(-0.01))

You can see that applying backpropagation is very simple, but tedious. Lets move to automatically calculate the gradient (autograd).

In [None]:
class Value:
    """A graph node whose operations also record how to push gradients backwards.

    Every operation stores on its result a `_backward` closure that, given the
    result's grad, sets the grad of each operand (chain rule, one step).
    At this stage the closures assign grads rather than accumulate them; the
    notebook revisits that choice in a later cell.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value
        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph
        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return self + other; its backward step passes the grad through unchanged."""
        result = Value(self.data + other.data, (self, other), '+')

        def propagate():
            # d(self + other)/d(self) = d(self + other)/d(other) = 1
            self.grad = 1.0 * result.grad
            other.grad = 1.0 * result.grad

        result._backward = propagate
        return result

    def __mul__(self, other):
        """Return self * other; each operand's grad is scaled by the other's data."""
        result = Value(self.data * other.data, (self, other), '*')

        def propagate():
            # d(self * other)/d(self) = other, and vice versa
            self.grad = result.grad * other.data
            other.grad = result.grad * self.data

        result._backward = propagate
        return result

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        result = Value(np.tanh(self.data), (self,), 'tanh')

        def propagate():
            self.grad = result.grad * (1 - result.data**2)

        result._backward = propagate
        return result
    

Lets put it to work

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.8812735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'

o.grad = 1
o._backward()
draw_dot(o)

In [None]:
n._backward()
draw_dot(o)

In [None]:
x1w1x2w2._backward()
draw_dot(o)

So, in order to perform everything automatically, we need to get the list of all nodes, moving from 'o' to the first nodes. 

We perform this using a topological sort.

In [None]:
# Module-level accumulators shared by every call to build_topo
visited = set()
topo = []

def build_topo(v):
    """Append v and everything it depends on to `topo`, operands before results.

    Nodes already present in `visited` are skipped, so repeated calls only add
    nodes that have not been seen before.
    """
    if v in visited:
        return
    visited.add(v)
    for operand in v._prev:
        build_topo(operand)
    # all operands have been placed, so v can follow them
    topo.append(v)

In [None]:
build_topo(o)
topo

As you can see, in this order, all the nodes referenced by a given node always appear **before** that node.

In [None]:
# Seed the output gradient, then run every node's backward step from the
# output towards the inputs (reverse topological order).
# NOTE(review): if the previous cell already ran build_topo(o), this second call
# adds nothing because every node is already in `visited` — confirm cell order.
o.grad = 1
build_topo(o)
for t in topo[::-1]:
    t._backward()
draw_dot(o)

Lets put this inside a method in the Value class.

In [None]:
class Value:
    """A graph node with automatic backpropagation through +, * and tanh.

    Every operation stores on its result a `_backward` closure that sets the
    grad of each operand from the result's grad. `backward()` runs those
    closures for the whole graph in reverse topological order.

    NOTE: the closures deliberately assign grads (`=`) instead of accumulating
    them; the cells that follow in the notebook use `b = a + a` to show why
    that is wrong when a node is used more than once.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other; its backward step passes the grad through unchanged."""
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad

        out._backward = _backward

        return out

    def __mul__(self, other):
        """Return self * other; each operand's grad is scaled by the other's data."""
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad = out.grad * other.data
            other.grad = out.grad * self.data

        out._backward = _backward

        return out

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad = out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.8812735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'

o.backward()
draw_dot(o)

There is a subtle problem in the code. Lets see in an example. 

Which is the expected value of the grad?

In [None]:
a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b)

Note that:

db/da = d(a+a)/da = da/da + da/da = 1 + 1 = 2

The problem with our code is that grad values are set, instead of updated.

In [None]:
class Value:
    """A graph node with automatic backpropagation through +, * and tanh.

    Every operation stores on its result a `_backward` closure that adds the
    operand's share of the gradient to the operand's grad. Accumulating (`+=`)
    is what makes a node that is used several times receive the sum of all
    contributions. `backward()` runs the closures in reverse topological order.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other; its backward step passes the grad through unchanged."""
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward

        return out

    def __mul__(self, other):
        """Return self * other; each operand's grad is scaled by the other's data."""
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward

        return out

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        Grads are accumulated, so calling this twice without resetting them
        adds the contributions twice.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

In [None]:
a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b)

Lets see it on a more complex example

In [None]:
a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a * b; d.label='d'
e = a + b; e.label='e'
f = d * e; f.label='f'

f.backward()
draw_dot(f)

You can check by hand that everything is working perfectly. Lets add a final manual check!!

In [None]:
def check():
    """Print a forward-difference estimate of df/da for f = (a*b) * (a+b)."""
    h = 0.001

    def forward(a_data):
        # rebuild the whole expression from scratch and return f as a plain number
        a = Value(a_data, label='a')
        b = Value(3.0, label='b')
        d = a * b
        d.label = 'd'
        e = a + b
        e.label = 'e'
        f = d * e
        f.label = 'f'
        return f.data

    L1 = forward(-2.0)
    L2 = forward(-2.0+h)

    print((L2 - L1) / h)

check()

In [None]:
def check():
    """Print a forward-difference estimate of df/db for f = (a*b) * (a+b)."""
    h = 0.001

    def forward(b_data):
        # rebuild the whole expression from scratch and return f as a plain number
        a = Value(-2.0, label='a')
        b = Value(b_data, label='b')
        d = a * b
        d.label = 'd'
        e = a + b
        e.label = 'e'
        f = d * e
        f.label = 'f'
        return f.data

    L1 = forward(3.0)
    L2 = forward(3.0+h)

    print((L2 - L1) / h)

check()

## Decomposing tanh by its components

tanh(x) = (e** (2*x)-1) / (e** (2*x)+1)

Let's modify Value so that we can use the tanh definition. We introduce some modifications so that a Value can be added to (or multiplied by) a plain constant.

In [None]:
class Value:
    """A graph node with backpropagation through +, * and tanh.

    Plain numbers are accepted as the other operand of + and * (on either
    side); they are wrapped in a Value on the fly.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward

        return out

    def __radd__(self, other):
        """Handle number + Value by delegating to Value + number."""
        return self + other

    def __mul__(self, other):
        """Return self * other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Handle number * Value by delegating to Value * number."""
        return self*other

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

Lets add now the exponentiation method, with the proper derivative

In [None]:
class Value:
    """A graph node with backpropagation through +, *, tanh and exp.

    Plain numbers are accepted as the other operand of + and * (on either
    side); they are wrapped in a Value on the fly.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward

        return out

    def __radd__(self, other):
        """Handle number + Value by delegating to Value + number."""
        return self + other

    def __mul__(self, other):
        """Return self * other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Handle number * Value by delegating to Value * number."""
        return self*other

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def exp(self):
        """Return e**self; the local derivative is the result itself."""
        out = Value(np.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.grad * out.data

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

In [None]:
a = Value(2.0)
a.exp()

For the division, we will introduce something more general.

a / b = a * (1/b) = a * b**-1

So, we need the power operator

In [None]:
class Value:
    """A graph node with backpropagation through +, *, **, /, tanh and exp.

    Plain numbers are accepted as the other operand of + and * (on either
    side); they are wrapped in a Value on the fly. Division is expressed as
    multiplication by a power of -1.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward

        return out

    def __radd__(self, other):
        """Handle number + Value by delegating to Value + number."""
        return self + other

    def __mul__(self, other):
        """Return self * other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Handle number * Value by delegating to Value * number."""
        return self*other

    def __pow__(self, exponent):
        """Return self ** exponent for a constant int or float exponent."""
        assert isinstance(exponent, (int, float)), "Only support int and float for now"
        out = Value(self.data ** exponent, (self, ), f'**{exponent}')

        def _backward():
            # power rule; accumulate like every other op so that a base used in
            # several places keeps the contributions from all of them
            self.grad += out.grad * exponent * self.data ** (exponent -1)

        out._backward = _backward
        return out

    def __truediv__(self, other):
        """Return self / other, computed as self * other**-1."""
        return self * other ** -1

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def exp(self):
        """Return e**self; the local derivative is the result itself."""
        out = Value(np.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.grad * out.data

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

In [None]:
a = Value(4.0)
a**2

In [None]:
a = Value(8.0)
b = Value(2.0)
a/b

For completeness, let's add the subtraction and negation operators

In [None]:
class Value:
    """A graph node with backpropagation through +, -, *, /, **, tanh and exp.

    Plain numbers are accepted as the other operand of the arithmetic
    operators (on either side); they are wrapped in a Value on the fly.
    Subtraction, negation and division are expressed through +, * and **,
    so they need no backward rule of their own.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # the wrapped number
        self.grad = 0.0                 # derivative of the final output w.r.t. this value

        # inputs have no operands, so their backward step does nothing
        self._backward = lambda: None

        self._prev = set(_children)     # operand nodes this value was computed from
        self._op = _op                  # name of the producing operation ('' for inputs)
        self.label = label              # human-readable name shown when drawing the graph

    def __add__(self, other):
        """Return self + other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward

        return out

    def __radd__(self, other):
        """Handle number + Value by delegating to Value + number."""
        return self + other

    def __sub__(self, other):
        """Return self - other, computed as self + (-other)."""
        return self + -other

    def __rsub__(self, other):
        """Handle number - Value, computed as (-self) + number."""
        return (-self) + other

    def __neg__(self):
        """Return -self, computed as self * -1."""
        return self * -1

    def __mul__(self, other):
        """Return self * other, where other is a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data

        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Handle number * Value by delegating to Value * number."""
        return self*other

    def __pow__(self, exponent):
        """Return self ** exponent for a constant int or float exponent."""
        assert isinstance(exponent, (int, float)), "Only support int and float for now"
        out = Value(self.data ** exponent, (self, ), f'**{exponent}')

        def _backward():
            # power rule; accumulate like every other op so that a base used in
            # several places keeps the contributions from all of them
            self.grad += out.grad * exponent * self.data ** (exponent -1)

        out._backward = _backward
        return out

    def __truediv__(self, other):
        """Return self / other, computed as self * other**-1."""
        return self * other ** -1

    def __rtruediv__(self, other):
        """Handle number / Value, computed as number * self**-1."""
        return other * self ** -1

    def tanh(self):
        """Return tanh(self); the local derivative is 1 - tanh(self)**2."""
        out = Value(np.tanh(self.data), (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)

        out._backward = _backward

        return out

    def exp(self):
        """Return e**self; the local derivative is the result itself."""
        out = Value(np.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.grad * out.data

        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node: set its grad to 1 and fill in all ancestors.

        The topological order is built locally for this call, so the result does
        not depend on (or add to) any list left over from a previous graph.
        """
        order = []
        seen = set()

        def visit(node):
            # post-order walk: a node is appended only after all of its operands
            if node not in seen:
                seen.add(node)
                for child in node._prev:
                    visit(child)
                order.append(node)

        visit(self)

        self.grad = 1.0
        for node in reversed(order):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data})"
    

In [None]:
a = Value(8.0)
b = Value(5.0)
a - b

In [None]:
-a

Lets go back to our example, in order to modify the tanh

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.8812735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'

o.backward()
draw_dot(o)

In [None]:
# tanh(x) = (e**(2*x) - 1) / (e**(2*x) + 1)

# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.8812735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'

# Replace the single tanh node with its definition in terms of exp, -, + and /
# o = n.tanh(); o.label='o'
ex = (n*2).exp(); ex.label='exp(2*n)'   # the node holds e**(2*n), so label it as such
o = (ex - 1) / (ex + 1); o.label='o'    # the label must be the string 'o', not the Value itself

o.backward()
draw_dot(o)

This example shows a very important point: **The level of detail in the components is up to your needs. You can assemble complex behaviours as units, and you only need to provide the gradient of the complex unit**.