# Building a Neural Network from scratch

Today, you'll learn how to code and train a neural network from scratch using just `numpy`.

Let's start with a toy dataset:

In [None]:
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Generate a small two-moons toy dataset (fixed random_state) and plot the
# two input features against each other, coloured by class label.
X, y = make_moons(n_samples=50, noise=0.1, random_state=42)

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm')
ax.set_xlabel('x1')
ax.set_ylabel('x2')

In [None]:
# turn the labels into a column vector of shape (n_samples, 1)
y = np.reshape(y, (-1, 1))

### Questions

* How many observations does the data have?
* How many input features are there?
* Why would a simple Logistic Regression (LogReg) model perform poorly?
* How many model parameters (weights) does a LogReg model have for this task?
* How could one make a logistic regression perform better?



### Fitting a linear model

We need:

1. A prediction function that maps the input `X` to the output `y`: $\hat{y} = F(X;w)$
2. A loss function that evaluates the goodness of fit: $L(y, \hat{y})$
3. Training data that is used to find the weights `w` that minimize the loss function.
4. Separate validation data that is used to assess the model's performance on unseen data.
5. The Gradient Descent Algorithm:

    $$
    w_{new} = w_{old} - LR \cdot \nabla_{w} L(w_{old})
    $$

### Let's start with a Log Reg model!

$$
F(X) = \mathrm{sigmoid}(w_0 \cdot 1 + w_1 X_1 + w_2 X_2) = \mathrm{sigmoid}(Xw)
$$

In [None]:
# add a column of ones to the input data

def add_bias(X):
    """Return a new array equal to ``X`` with a column of ones appended on the right.

    ``X`` is expected to be two-dimensional; the number of rows is preserved
    and the number of columns grows by one.
    """
    ones_column = np.ones((X.shape[0], 1))
    return np.concatenate((X, ones_column), axis=1)

# Append the bias column only while X still holds just the two raw features.
# Without this guard, re-running the cell would stack a second column of ones.
if X.shape[1] == 2:
    X = add_bias(X)

# a model parameter for each column: x1, x2 and the bias
assert X.shape[1] == 3, "X must have exactly 3 columns: x1, x2 and the bias"

In [None]:
# Seed NumPy's global RNG so that this and every later np.random draw in the
# notebook gives the same numbers after "Restart Kernel -> Run All".
np.random.seed(42)

# initialize three random weights from the normal distribution
w = np.random.normal(size=3)
w

In [None]:
# linear part of the model: one weighted sum per observation, i.e.
# X[:,0]*w[0] + X[:,1]*w[1] + X[:,2]*w[2], written as a matrix-vector product
X @ w

In [None]:
# calculate the sigmoid non linear transformation
def sigmoid(a):
    """Logistic sigmoid ``1 / (1 + exp(-a))``, applied element-wise.

    Parameters
    ----------
    a : scalar or array-like
        Input value(s); converted to a float array.

    Returns
    -------
    Values in the closed interval [0, 1] with the same shape as ``a``.
    """
    a = np.asarray(a, dtype=float)
    # For very negative inputs exp(-a) overflows to inf; the quotient is then
    # 0.0, which is the correct limit (NumPy may emit an overflow warning).
    return 1.0 / (1.0 + np.exp(-a))
    
# sanity check: hand-picked inputs against their sigmoid values, rounded to 2 decimals
a = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
expected = np.array([0.0, 0.27, 0.5, 0.73, 1.0])
assert (sigmoid(a).round(2) == expected).all()

In [None]:
# calculate the log loss (aka binary crossentropy)
def log_loss(y, y_pred):
    """Element-wise log loss (binary cross-entropy).

    Computes ``-(y * log(p) + (1 - y) * log(1 - p))`` for every pair of true
    label ``y`` and predicted probability ``p``. No averaging is done; the
    result has the broadcast shape of the two inputs.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities.
    """
    y = np.asarray(y, dtype=float)
    # Keep the probabilities strictly inside (0, 1) so that log() never sees 0.
    eps = 1e-15
    p = np.clip(np.asarray(y_pred, dtype=float), eps, 1.0 - eps)
    return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))

# sanity check: confident-right predictions cost ~0.01, confident-wrong ones ~4.61
a = np.array([0.0, 0.0, 1.0, 1.0])
b = np.array([0.01, 0.99, 0.01, 0.99])
expected = np.array([0.01, 4.61, 4.61, 0.01])
assert (log_loss(a, b).round(2) == expected).all()

In [None]:
# forward pass of the log reg model: sigmoid of the linear scores
y_pred = sigmoid(X @ w)
y_pred

---
# Neural Network

We build a Neural Net with 

- one hidden layer that contains 2 "neurons"/units
- one output layer with 1 unit
- a `sigmoid` **activation** function

$$
\hat{y} = F(X; w_h, w_o) = act(act(Xw_h)w_o)
$$

In [None]:
# how can we calculate the output of several LogReg models at the same time? 
# this is the first layer of a neural net!

weights = [
    # hidden layer: each column holds the 3 parameters (x1, x2, bias) of one unit
    np.random.normal(size=(3, 2)),
    # output layer, i.e. the last layer of the network;
    # binary classification needs a single unit
    np.random.normal(size=(3, 1)),
]

weights

In [None]:
# output of the first layer: one sigmoid unit per column of weights[0],
# i.e. the stacked output of two logistic regressions
X_hidden = sigmoid(X @ weights[0])

In [None]:
# The hidden activations become the input of a second layer,
# which is nothing but an ordinary logistic regression on top of them.

# the second layer gets its own bias column
X_hidden_with_bias = add_bias(X_hidden)

# final output of the network
ypred = sigmoid(X_hidden_with_bias @ weights[1])

### The feed forward function

In [None]:
def feed_forward(X, weights):
    """
    Forward pass through a network with one sigmoid hidden layer and a
    sigmoid output layer.

    1. Calculate the dot product of X
       and the weights of the first layer.

    2. Apply the sigmoid function on the result.

    3. Append an extra column of ones to the result (i.e. the bias).

    4. Calculate the dot product of the previous step
       with the weights of the second (i.e. outer) layer.

    5. Apply the sigmoid function on the result.

    6. Return all intermediate results (i.e. anything that is outputted
       by an activation function).

    Parameters
    ----------
    X : 2-D array
        Input data, bias column included.
    weights : sequence of two arrays
        ``weights[0]`` are the hidden-layer weights, ``weights[1]`` the
        output-layer weights (one extra row for the hidden bias).

    Returns
    -------
    output1 : hidden-layer activations, without the bias column (step 2)
    output2 : output-layer activations (step 5)
    """
    hidden_weights = weights[0]
    output_weights = weights[1]

    # steps 1 + 2: hidden layer
    output1 = sigmoid(X.dot(hidden_weights))

    # step 3: the bias column only feeds the next layer, it is not returned
    hidden_with_bias = add_bias(output1)

    # steps 4 + 5: output layer
    output2 = sigmoid(hidden_with_bias.dot(output_weights))

    return output1, output2

In [None]:
# initialize some random weights: 2 hidden units, 1 output unit

weights = [
    np.random.normal(size=(3, 2)),
    np.random.normal(size=(3, 1))
]

# testing: the whole dataset yields one row per observation

out1, out2 = feed_forward(X, weights)

assert out1.shape == (50, 2)
assert out2.shape == (50, 1)

# testing: a single reference observation (x1=1, x2=2, bias=1)

Xref = np.array([[1.0, 2.0, 1.0]])

out1, out2 = feed_forward(Xref, weights)

assert out1.shape == (1, 2)
assert out2.shape == (1, 1)
# both outputs come out of a sigmoid, so they have to be valid probabilities
assert np.all((out1 >= 0) & (out1 <= 1))
assert np.all((out2 >= 0) & (out2 <= 1))

# The Backpropagation algorithm

http://krspiced.pythonanywhere.com/chapters/project_deep_learning/neural_networks/backpropagation.html

In [None]:
def sigmoid_deriv(X):
    """Derivative of the sigmoid with respect to its input, evaluated at ``X``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.
    """
    activation = sigmoid(X)
    return activation * (1 - activation)

In [None]:
def log_loss_deriv(ytrue, ypred):
    """Derivative of the log loss with respect to the prediction, element-wise.

    Computes ``-(ytrue / ypred - (1 - ytrue) / (1 - ypred))``.

    Parameters
    ----------
    ytrue : array-like
        True labels (0 or 1).
    ypred : array-like
        Predicted probabilities.

    Returns
    -------
    Array with the broadcast shape of the two inputs.
    """
    ytrue = np.asarray(ytrue, dtype=float)
    # A saturated prediction (exactly 0 or 1) would divide by zero and turn the
    # gradient into inf/nan, so keep the probabilities strictly inside (0, 1).
    eps = 1e-15
    ypred = np.clip(np.asarray(ypred, dtype=float), eps, 1.0 - eps)
    loss_deriv = -(ytrue / ypred - (1.0 - ytrue) / (1.0 - ypred))
    return loss_deriv

In [None]:
def backprop(weights,
             output1,
             output2,
             ytrue,
             X_input,
             LR):
    """Perform one gradient-descent update of both weight matrices.

    Parameters
    ----------
    weights : sequence of two arrays
        ``weights[0]`` are the hidden-layer weights, ``weights[1]`` the
        output-layer weights. The last row of ``weights[1]`` belongs to the
        bias column that ``add_bias`` appends to the hidden output.
    output1 : 2-D array
        Hidden-layer activations (without bias column).
    output2 : 2-D array
        Output-layer activations, one column.
    ytrue : array-like
        True labels; flat vectors are reshaped into a single column.
    X_input : 2-D array
        Network input, bias column included.
    LR : float
        Learning rate that scales both weight updates.

    Returns
    -------
    wH_new, wO_new : the updated hidden and output weights
        (the arrays passed in via ``weights`` are not modified).
    """
    wH = weights[0]
    wO = weights[1]

    # Force the labels into a column so that they pair up row by row with
    # output2. A flat (n,) vector would broadcast against the (n, 1) output to
    # an (n, n) matrix and silently change the shape of the returned weights.
    ytrue = np.asarray(ytrue).reshape(-1, 1)

    # EQUATION A: derivative of the loss with respect to the network output
    loss_gradient = log_loss_deriv(ytrue, output2)

    # EQUATION B: gradient at the input of the output unit
    # don't forget the bias!
    hidden_out_with_bias = add_bias(output1)
    # derivative of the sigmoid function with respect to the
    # hidden output * weights
    sig_deriv_1 = sigmoid_deriv(hidden_out_with_bias.dot(wO))

    y_grad = sig_deriv_1 * loss_gradient

    # EQUATION C: update of the output weights (summed over all observations)
    delta_wo = -np.dot(y_grad.T, hidden_out_with_bias) * LR

    # and finally, old weights + delta weights -> new weights!
    wO_new = wO + delta_wo.T

    # EQUATION D: gradient at the input of the hidden units
    sig_deriv_2 = sigmoid_deriv(X_input.dot(wH))
    # exclude the bias (last row) of the outer weights,
    # since it is not backpropagated!
    H_grad = sig_deriv_2 * np.dot(y_grad, wO[:-1].T)

    # EQUATION E: update of the hidden weights (summed over all observations)
    delta_wH = -np.dot(H_grad.T, X_input) * LR

    # old weights + delta weights -> new weights!
    wH_new = wH + delta_wH.T

    # new hidden weights, new output weights
    return wH_new, wO_new

In [None]:
# setup
np.random.seed(42)  # reproducible weight initialisation for this cell
X, y = make_moons(n_samples=200, noise=0.1, random_state=42)
X = add_bias(X)
y = y.reshape(-1, 1)
weights = [
    np.random.normal(size=(3, 4)),  # hidden layer: 3 inputs (incl. bias) -> 4 units
    np.random.normal(size=(5, 1))   # output layer: 4 hidden units + bias -> 1 unit
]

# hyper-parameters; lower LR if the loss curve oscillates or diverges
LR = 0.01
N_EPOCHS = 1000

# train
history = []  # mean log loss before each weight update

for i in range(N_EPOCHS):
    # forward pass with the current weights
    out1, out2 = feed_forward(X, weights)
    # keep track of the loss to inspect convergence afterwards
    history.append(log_loss(y, out2).mean())
    # backward pass: one gradient-descent step on both weight matrices
    weights = list(backprop(weights, out1, out2, y, X, LR))


In [None]:
# loss curve: one value per training iteration, as collected in `history`
fig, ax = plt.subplots()
ax.plot(history)
ax.set_xlabel('training iteration')
ax.set_ylabel('loss')
ax.set_title('Training history');

## Extra: Visualizing the decision boundary

If the neural network has not been trained yet (i.e. the weights are still random), the decision boundary will look random!

In [None]:
# evaluation grid: every (x1, x2) combination of 200 evenly spaced values
# per axis; rows are ordered with x1 varying slowest and x2 fastest
x = np.linspace(-3, 3, 200)
grid_x1, grid_x2 = np.meshgrid(x, x, indexing='ij')
X_vis = np.column_stack((grid_x1.ravel(), grid_x2.ravel()))
# the network expects the bias column as its last input
X_vis = add_bias(X_vis)

# network output for every grid point (the hidden activations are not needed)
_, y_pred = feed_forward(X_vis, weights)

In [None]:
# Arrange the flat predictions as a square image for the contour plot.
# Column-major ('F') filling puts x2 along the rows and x1 along the columns.
side = len(x)
Z = np.reshape(y_pred, (side, side), order='F')

In [None]:
# filled contours of the network output plus the 0.5 decision line
fig, ax = plt.subplots(1, 1)
filled = ax.contourf(x, x, Z, alpha=0.8, cmap='coolwarm')
ax.contour(x, x, Z, levels=[0.5])
fig.colorbar(filled)  # colour scale for the filled contours

# overlay the original data, coloured by true class
ax.scatter(X[:, 0], X[:, 1], c=y.ravel(), cmap='coolwarm')