# Introduction to Backpropagation

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt

## Manually calculating gradients

Let's create a class that stores, for each node, its value, the operation that produced it, and its operands.

In [None]:
from value import Value, draw_dot
    
# Two leaf values and their product. The product is itself a Value, so the
# whole expression can be handed to draw_dot (from the local `value` module),
# which is expected to render the expression graph rooted at d.
a = Value(2)
b = Value(-3.0)
d = (a*b)
draw_dot(d)

Lets analyze the expression: L = (a*b+c)*f

Lets add some labels to the nodes, in order to identify them with ease

In [None]:
# Build L = (a*b + c) * f step by step so every intermediate node has a name.
a = Value(2, label='a')
b = Value(-3.0, label='b')
c = Value(10, label='c')
# Intermediate nodes come out of operators rather than the constructor,
# so their labels are assigned after the fact.
e = a*b; e.label='e'
d = e + c; d.label='d'
f = Value(-2, label='f')
L = d*f; L.label='L'
draw_dot(L)

You can see that we have a mathematical expression linking L with four free parameters: a, b, c, and f. We are now going to run backpropagation to find out how to increase the value of L by changing the values of the free parameters.
- For every single value we are going to calculate the derivative of L with respect to it, using the chain rule. With these derivatives, we will know how to change each value in order to increase L.

In order to do so, we will add a property in Value to hold the derivative of L with respect to that value. We will name this property 'grad'

We work from back to front, manually, starting with L itself:

dL/dL = 1

In [None]:
# Base case of backpropagation: dL/dL = 1.
L.grad = 1
draw_dot(L)

Now, let's calculate the derivatives of L with respect to f and d. Since L = f*d:
 
 dL/df = d
 
 dL/dd = f
 
 Let's check these numerically, by nudging each value by a small step h and measuring how much L changes:

In [None]:
def grad_by_hand():
    """Print a finite-difference estimate of dL/df for L = (a*b + c) * f.

    The expression is evaluated once at the reference point and once with
    f nudged by a small step; the printed slope should be close to the
    analytic derivative dL/df = d (assuming Value implements ordinary
    arithmetic).
    """
    step = 0.01

    def forward(f_value):
        # Rebuild the full expression from scratch so that the two
        # evaluations share no state.
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(f_value, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    baseline = forward(-2)
    nudged = forward(-2+step)  # only f is perturbed

    print((nudged - baseline) / step)
    
grad_by_hand()

In [None]:
def grad_by_hand():
    """Print a finite-difference estimate of dL/dd for L = (a*b + c) * f.

    d is an intermediate node, so it cannot be nudged through a constructor
    argument; instead its .data is bumped in place after it is computed and
    before it is used to build L. The printed slope should be close to the
    analytic derivative dL/dd = f (assuming Value implements ordinary
    arithmetic).
    """
    step = 0.01

    def forward(d_bump=None):
        # Rebuild the full expression from scratch so that the two
        # evaluations share no state.
        a = Value(2, label='a')
        b = Value(-3.0, label='b')
        c = Value(10, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        if d_bump is not None:
            d.data += d_bump  # perturb the intermediate node directly
        f = Value(-2, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    baseline = forward()
    nudged = forward(step)

    print((nudged - baseline) / step)
    
grad_by_hand()

In [None]:
# The numerical estimates match the analytic derivatives of L = d*f,
# so we store them on the nodes.
d.grad = f.data  # dL/dd = f
f.grad = d.data  # dL/df = d

draw_dot(L)

Now, lets move to the previous values, e and c. We now need to calculate dL/de and dL/dc. From the chain rule we know that:

dL/de = dL/dd*dd/de
- We already calculated dL/dd, which is -2
- Since d = e+c, dd/de=1
- As a result **dL/de = -2 * 1 = -2**

In a similar way, since dd/dc = 1, **dL/dc = dL/dd * dd/dc = -2 * 1 = -2**

**Note**: The + node only passes the gradient without modification

In [None]:
# After the check we know it is correct, so we update the .grad property.
# d = e + c, so both inputs simply receive dL/dd (= -2) unchanged.
e.grad = -2
c.grad = -2

draw_dot(L)

Now, lets calculate the final one, the gradient for *a* and *b*, also using the chain rule.

- dL/da = dL/de * de/da
- dL/de is already known, -2
- Since e = a*b, then de/da = b

Finally, dL/da = -2 * -3 = 6

Similarly, dL/db = dL/de*de/db = -2 * a = -2 * 2 = -4

In [None]:
# Since they are correct, we update the .grad properties.
# Chain rule through e = a*b, with dL/de = -2:
a.grad = 6   # dL/da = dL/de * b = -2 * -3.0
b.grad = -4  # dL/db = dL/de * a = -2 * 2

draw_dot(L)

**Note**: There are some parameters that we can change, like a, b, c, and f, while the others are calculated, so cannot be changed.

Now, let's use the gradients to increase the value of L: we move each free parameter a small step h in the direction of its gradient.

In [None]:
# Snapshot the gradients of the four free parameters. eval_by_hand reads
# these module-level names, because it rebinds a, b, c and f locally.
ag, bg, cg, fg = a.grad, b.grad, c.grad, f.grad

In [None]:
def eval_by_hand(h=0.01):
    """Print L before and after moving every free parameter along its gradient.

    Each of a, b, c and f is shifted by ``h`` times its gradient, read from
    the module-level names ag, bg, cg and fg. Three numbers are printed:
    the original L, the L after the step, and their difference.

    Args:
        h: Step size applied to every parameter. A negative value steps
            against the gradient.
    """

    def forward(a_value, b_value, c_value, f_value):
        # Evaluate L = (a*b + c) * f on freshly built nodes.
        a = Value(a_value, label='a')
        b = Value(b_value, label='b')
        c = Value(c_value, label='c')
        e = a*b
        e.label = 'e'
        d = e + c
        d.label = 'd'
        f = Value(f_value, label='f')
        L = d*f
        L.label = 'L'
        return L.data

    before = forward(2, -3.0, 10, -2)
    after = forward(2 + ag * h, -3.0 + bg * h, 10 + cg * h, -2 + fg * h)

    print(before, after, after-before)
    
eval_by_hand(0.01)


In [None]:
eval_by_hand(-0.01)

Note that L increases when we step along the gradient (h = 0.01) and decreases when we step against it (h = -0.01). This is the backpropagation algorithm in action!

Lets move to something more complex, like a neuron.

A neural network has:
- Neurons
    - Weights
    - Bias (neuron default activation in absence of inputs)
    - Activation function: 
        - Introduce nonlinearities, frequently squashing the neuron output
        
There are some common activation functions, like tanh and ReLU:

In [None]:
import numpy as np 
import matplotlib.pyplot as plt

# Plot tanh over [-5, 5) to show how it squashes its input towards -1 and 1.
x = np.arange(-5, 5, 0.1)
plt.plot(x, np.tanh(x))
plt.show()

In [None]:
x = np.arange(-5, 5, 0.1)

# ReLU: keep positive inputs as they are and replace everything else with 0.
plt.plot(x, np.where(x > 0, x, 0))
plt.show()

Let's create a system with one neuron and two inputs

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.7, label='b')

# weighted inputs, one product node per input
x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

# pre-activation of the neuron: n = x1*w1 + x2*w2 + b
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'

draw_dot(n)

Now evaluate the activation function

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias: chosen so that n = x1*w1 + x2*w2 + b = asinh(1) ~= 0.8814, which
# gives o = tanh(n) = 1/sqrt(2) and do/dn = 1 - o**2 = 0.5. This keeps the
# hand-computed gradients in the following cells at round values.
b = Value(6.8813735870195432, label='b')

# weighted inputs, one product node per input
x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

# pre-activation n, then the activation o = tanh(n)
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'
draw_dot(o)

Note that here the effect of tanh is small, but if we changed the bias to 10 we would see the squashing effect of tanh. Now, let's run backpropagation on the neuron.

**Note**. While training the neuron, the only parameters that we can change are the weights and biases, because the training examples are fixed.

First, since do/do = 1, let's set that on the output node.

In [None]:
# Base case of backpropagation: do/do = 1.
o.grad = 1.0
draw_dot(o)

dtanh(x)/dx = 1 - tanh(x)^2

In [None]:
# o = tanh(n), so do/dn = 1 - tanh(n)**2 = 1 - o**2; o.data already holds tanh(n).
n.grad = 1 - (o.data)**2
draw_dot(o)

do/db = do/dn * dn/db = 0.5 * 1 = 0.5

do/d(x1w1+x2w2) = do/dn * dn/d(x1w1+x2w2) = 0.5 * 1 = 0.5

In [None]:
# n = (x1w1 + x2w2) + b: the + node passes do/dn (about 0.5, computed by
# hand) on to both of its inputs unchanged.
b.grad = 0.5
x1w1x2w2.grad = 0.5
draw_dot(o)

do/dx2w2 = do/d(x1w1+x2w2) * d(x1w1+x2w2)/dx2w2
    = 0.5 * 1 = 0.5
    
do/dx1w1 = do/d(x1w1+x2w2) * d(x1w1+x2w2)/dx1w1
    = 0.5 * 1 = 0.5

In [None]:
# Another + node: both products receive the incoming gradient 0.5 unchanged.
x2w2.grad = 0.5
x1w1.grad = 0.5
draw_dot(o)

do/dw1 = do/dx1w1 * dx1w1/dw1 = 0.5*x1 = 0.5*2 = 1

do/dx1 = do/dx1w1 * dx1w1/dx1 = 0.5*w1 = 0.5*-3 = -1.5

do/dw2 = do/dx2w2 * dx2w2/dw2 = 0.5*x2 = 0.5*0 = 0

do/dx2 = do/dx2w2 * dx2w2/dx2 = 0.5*w2 = 0.5*1 = 0.5

In [None]:
# Product nodes: each input's gradient is the incoming gradient (0.5)
# times the value of the other input.
w1.grad = 1      # 0.5 * x1 = 0.5 * 2.0
x1.grad = -1.5   # 0.5 * w1 = 0.5 * -3.0
w2.grad = 0      # 0.5 * x2 = 0.5 * 0.0
x2.grad = 0.5    # 0.5 * w2 = 0.5 * 1.0
draw_dot(o)

Now, the parameters we can modify in order to increase the value of the function are the weights and biases.

Let's modify them to increase the value.

In [None]:
# Hand-derived gradients of o with respect to the trainable parameters
# (w1, w2 and the bias). modify() reads these module-level names.
grad_w1 = 1
grad_w2 = 0
grad_b = 0.5

def modify(h):
    """Compare the neuron output before and after a step along the gradient.

    The neuron computes o = tanh(x1*w1 + x2*w2 + b) with fixed inputs
    x1 = 2.0 and x2 = 0.0. Only the trainable parameters (w1, w2, b) are
    moved, each by ``h`` times its hand-derived gradient, read from the
    module-level names grad_w1, grad_w2 and grad_b.

    Args:
        h: Step size. A positive value steps along the gradient, a negative
            value steps against it.

    Returns:
        A tuple ``(L1, L2, L2 - L1)``: the output at the original
        parameters, the output after the step, and the change.
    """
    # The bias is chosen so that the pre-activation equals asinh(1) ~= 0.8814,
    # which makes do/dn = 1 - tanh(n)**2 equal to 0.5, the value that
    # grad_w1, grad_w2 and grad_b were derived from.
    bias = 6.8813735870195432

    def neuron_output(w1_value, w2_value, b_value):
        # Forward pass on freshly built nodes; the inputs stay fixed because
        # only weights and bias are trainable.
        x1, x2 = Value(2.0), Value(0.0)
        w1, w2 = Value(w1_value), Value(w2_value)
        b = Value(b_value)
        n = x1*w1 + x2*w2 + b
        return n.tanh().data

    L1 = neuron_output(-3.0, 1.0, bias)
    L2 = neuron_output(-3.0 + h*grad_w1, 1.0 + h*grad_w2, bias + h*grad_b)
    return L1, L2, L2-L1

print(modify(0.01))

In [None]:
print(modify(-0.01))

## AutoGrad

You can see that applying backpropagation by hand is very simple, but tedious. Let's move on to calculating the gradients automatically (autograd).

In [None]:
# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias: chosen so that n = asinh(1) ~= 0.8814, hence o = tanh(n) = 1/sqrt(2)
# and do/dn = 0.5, the same round values used in the manual derivation
b = Value(6.8813735870195432, label='b')

# weighted inputs, one product node per input
x1w1 = x1*w1; x1w1.label = 'x1w1'
x2w2 = x2*w2; x2w2.label = 'x2w2'

# pre-activation n, then the activation o = tanh(n)
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label = 'x1w1+x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh(); o.label='o'

# backward() (defined on Value in the local `value` module) is expected to
# fill in .grad on every node of the graph, replacing the manual steps above.
o.backward()
draw_dot(o)

Lets see it on a more complex example

In [None]:
# Here a and b each feed two nodes (d and e), so the correct gradient of f
# with respect to each of them is the sum of the contributions from both paths.
a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a * b; d.label='d'
e = a + b; e.label='e'
f = d * e; f.label='f'

f.backward()
draw_dot(f)

You can check by hand that everything is working perfectly. Lets add a final manual check!!

In [None]:
def check():
    """Print a finite-difference estimate of df/da for f = (a*b) * (a+b).

    f is evaluated at (a, b) = (-2.0, 3.0) and again with a nudged by a
    small step; the printed slope can be compared with the gradient stored
    on a by the automatic backward pass.
    """
    eps = 0.001

    def forward(a_value, b_value):
        # Evaluate f on freshly built nodes so the two runs share no state.
        a = Value(a_value, label='a')
        b = Value(b_value, label='b')
        d = a * b
        d.label = 'd'
        e = a + b
        e.label = 'e'
        f = d * e
        f.label = 'f'
        return f.data

    base = forward(-2.0, 3.0)
    bumped = forward(-2.0+eps, 3.0)  # only a is perturbed

    print((bumped - base) / eps)
    
check()

In [None]:
def check():
    """Print a finite-difference estimate of df/db for f = (a*b) * (a+b).

    f is evaluated at (a, b) = (-2.0, 3.0) and again with b nudged by a
    small step; the printed slope can be compared with the gradient stored
    on b by the automatic backward pass.
    """
    eps = 0.001

    def forward(a_value, b_value):
        # Evaluate f on freshly built nodes so the two runs share no state.
        a = Value(a_value, label='a')
        b = Value(b_value, label='b')
        d = a * b
        d.label = 'd'
        e = a + b
        e.label = 'e'
        f = d * e
        f.label = 'f'
        return f.data

    base = forward(-2.0, 3.0)
    bumped = forward(-2.0, 3.0+eps)  # only b is perturbed

    print((bumped - base) / eps)
    
check()