# Backpropagation
```
----------------------------------------------------------------------
Filename : backpropagation.ipynb
Author   : Jaidev Deshpande
Purpose  : Training multilayer perceptrons with error backpropagation.
Libraries: Tensorflow and its dependencies
----------------------------------------------------------------------
```

In [None]:
from IPython.display import Image

# Load "mlp.png" (presumably the network diagram -- confirm against the file)
# and leave it as the cell's final expression so Jupyter renders it.
mlp_figure = Image("mlp.png", width=600, height=200)
mlp_figure

# Variables & Terminology
* ## $W_{i}$ - weights of the $i$th layer
* ## $B_{i}$ - biases of the $i$th layer
* ## $L_{a}^{i}$ - _activation_ of the $i$th layer (inner product of the weights and the previous layer's outputs, plus the bias).
* ## $L_{o}^{i}$ - _output_ of the $i$th layer. (This is $f(L_{a}^{i})$, where $f$ is the activation function)

# MLP with one input, one hidden, one output layer
* ## $X, y$ are the training samples
* ## $\mathbf{W_{1}}$ and $\mathbf{W_{2}}$ are the weights for first (hidden) and the second (output) layer.
* ## $\mathbf{B_{1}}$ and $\mathbf{B_{2}}$ are the biases for first (hidden) and the second (output) layer.
* ## $L_{a}^{0} = L_{o}^{0}$, since the first (zeroth) layer is just the input.

# Activations and outputs
* ## $L_{a}^{1} = X\mathbf{W_{1}} + \mathbf{B_{1}}$
* ## $L_{o}^{1} = \frac{1}{1 + e^{-L_{a}^{1}}}$
* ## $L_{a}^{2} = L_{o}^{1}\mathbf{W_{2}} + \mathbf{B_{2}}$
* ## $L_{o}^{2} = \frac{1}{1 + e^{-L_{a}^{2}}}$
* ## Loss $E = \frac{1}{2} \sum_{S}(y - L_{o}^{2})^{2}$

In [None]:
import numpy as np
from utils import backprop_decision_boundary, backprop_make_classification, backprop_make_moons
from sklearn.metrics import accuracy_score
import tensorflow as tf
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rc('figure', figsize=(8, 6))
%matplotlib inline

In [None]:
# Build the toy classification set and plot it, one colour per class.
X, Y = backprop_make_classification()
# Class label of each sample = index of the largest entry in its row of Y.
class_labels = np.argmax(Y, axis=1).ravel()
# X is assumed to have exactly two feature columns (the first-layer weights
# defined later are 2 x 3) -- TODO confirm against backprop_make_classification.
plt.scatter(X[:, 0], X[:, 1], c=class_labels)

## Forward Pass

In [None]:
# Fix TensorFlow's global seed so the random initial weights below -- and
# therefore the whole training run -- are the same on every Restart & Run All
# and every re-execution of this cell.
SEED = 42
tf.random.set_seed(SEED)

# Training inputs and targets as float32 tensors.
# Y is presumably one-hot encoded (labels are recovered elsewhere with
# argmax over axis 1) -- confirm against the data helper.
x = tf.constant(X, dtype='float32')
y = tf.constant(Y, dtype='float32')

# weights and biases, drawn from tf.random.uniform's default range
# hidden layer: 2 input features -> 3 units
w1 = tf.random.uniform((2, 3))
b1 = tf.random.uniform((1, 3))
# output layer: 3 hidden units -> 2 outputs
w2 = tf.random.uniform((3, 2))
b2 = tf.random.uniform((1, 2))

def predict(x, w1, b1, w2, b2):
    """Forward pass of an MLP with one hidden layer and sigmoid activations.

    Args:
        x: input batch; its last axis is contracted with the first axis of `w1`.
        w1: weights of the hidden layer.
        b1: bias of the hidden layer, added to the hidden activation.
        w2: weights of the output layer.
        b2: bias of the output layer, added to the output activation.

    Returns:
        The sigmoid of the output-layer activation, with values in [0, 1].
    """
    # tf.sigmoid is used instead of a hand-written 1 / (1 + exp(-a)): in
    # float32 the explicit exp overflows for large negative activations, and
    # differentiating through it can then yield NaN gradients.
    l1_activation = tf.tensordot(x, w1, axes=1) + b1
    l1_output = tf.sigmoid(l1_activation)
    l2_activation = tf.tensordot(l1_output, w2, axes=1) + b2
    return tf.sigmoid(l2_activation)

In [None]:
# Network outputs for the training inputs with the current weights.
preds = predict(x, w1, b1, w2, b2)
# Predicted class = index of the larger of the two outputs.
p = tf.argmax(preds, axis=1)
# Scatter the two feature columns, coloured by predicted class.
plt.scatter(X[:, 0], X[:, 1], c=p.numpy())

## Backpropagation

In [None]:
N_ITERATIONS = 500   # number of full-batch gradient steps
LEARNING_RATE = 0.2  # step size of plain gradient descent
LOG_EVERY = 100      # print the loss once every this many steps

for step in range(N_ITERATIONS):
    params = [w1, b1, w2, b2]
    with tf.GradientTape() as tape:
        # The parameters are plain tensors rather than tf.Variable objects,
        # so the tape only tracks them if asked to explicitly.
        tape.watch(params)

        prediction = predict(x, w1, b1, w2, b2)

        # Half the summed squared error over all samples and outputs.
        loss = 0.5 * tf.reduce_sum(tf.pow(y - prediction, 2))

    if step % LOG_EVERY == 0:
        print(loss.numpy())

    # Gradients come back in the same order as `params`.
    grads = tape.gradient(loss, params)

    # Gradient descent: rebind each name to a new, updated tensor.
    w1, b1, w2, b2 = [
        param - LEARNING_RATE * grad for param, grad in zip(params, grads)
    ]

In [None]:
# Re-evaluate the network on the training inputs with the current weights.
preds = predict(x, w1, b1, w2, b2)
# Raw outputs for the first five samples.
print(preds[:5])
# Predicted class = index of the larger output; colour the samples by it.
p = tf.argmax(preds, axis=1)
plt.scatter(X[:, 0], X[:, 1], c=p.numpy())

In [None]:
# Integer class label per sample (index of the largest entry in each row of Y).
y_enc = np.argmax(Y, axis=1)

# Bounding box of the two plotted features, padded by 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

# Regular grid over that box with a spacing of 0.1 along each axis.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

# Classify every grid point, then fold the labels back into the grid's shape.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
grid_scores = predict(tf.constant(grid_points, dtype='float32'), w1, b1, w2, b2)
Z = tf.reshape(tf.argmax(grid_scores, axis=1), xx.shape)

# Filled contours show the predicted class per region; the points are the
# samples coloured by y_enc.
plt.contourf(xx, yy, Z.numpy(), alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y_enc)

# Exercise: Implement an MLP with two hidden layers, for the following dataset

In [None]:
# Replace X and Y with the dataset for the exercise and plot it by class.
X, Y = backprop_make_moons()
# Class label of each sample = index of the largest entry in its row of Y.
moon_labels = np.argmax(Y, axis=1)
plt.scatter(X[:, 0], X[:, 1], c=moon_labels)

### Hints:
1. Use two hidden layers, one containing 3 and the other containing 4 neurons
2. Use learning rate $\alpha$ = 0.2
3. Try to make the network converge in 1000 iterations 

In [None]:
# enter code here

### Tips & Tricks for backpropagation:
[Efficient BackProp, LeCun et al](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)