In [None]:
# This section (spanning two weeks) covers neural networks.
# It is the final math step, putting together everything we have learned about linear
# regression, math, and Python. After this, we can start doing more intelligent ML.

In [None]:
# Below, we generate the Xor dataset
import numpy as np

# Fixed seed so the generated dataset is identical on every Restart & Run All.
# A private generator is used so the global NumPy random state is left untouched.
DATA_SEED = 0
n_data = 1000
_data_rng = np.random.RandomState(DATA_SEED)
# One row per example, two binary features (x1, x2) drawn uniformly from {0, 1}.
x = _data_rng.randint(2, size=(n_data, 2))
# XOR label: True exactly when the two features differ.
y = x[:, 0] != x[:, 1]

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Let's train linear regression to predict the y values

In [None]:
mdl = # TODO train a Ridge regression model on x, y

# Show the fitted weights, one per input column (x1, x2).
mdl.coef_

In [None]:
# And let's check the accuracy

In [None]:
# The regression output is a real number; thresholding it at .5 gives a boolean label
# that can be compared with y in the next cell.
yhat = # TODO Calculate the yhat predictions (is the prediction greater than .5)

In [None]:
# Accuracy: the fraction of examples whose predicted label equals the true label.
np.mean(yhat == y)

In [None]:
# What the heck! Why is our accuracy 50%? That's not very good. Let's look below at the problem we're
# solving. How could we solve that? Should x1 and x2 have positive or negative coefficients?
# There isn't really an answer, so the model just sets them to close to 0.

In [None]:
from IPython.display import Image
# Render xor.png (path relative to the notebook) at 200x200.
Image(
    filename='xor.png',
    width=200,
    height=200,
)

In [None]:
# What if you had features
# x1, x2, x3=(x1 AND x2)
# (essentially, x3 is only equal to one at the top right corner of the above image, where x1 = x2 = 1)?
# Would you be able to learn a linear regression that solves this problem?
# What would you assign the weights to be?

In [None]:
# Deliberate stop: this always raises AssertionError, so execution halts here
# until the exercise below has been done and this line has been deleted.
assert(False)
# TODO: Pick features that would perfectly solve this problem, then remove the assertion above

In [None]:
# x3 must be the logical AND of the two columns: it is 1 only when x1 = x2 = 1.
# With that feature, XOR is exactly x1 + x2 - 2*x3, i.e. the weights [1, 1, -2] mentioned below.
# (A logical OR feature would instead need the weights [-1, -1, 2].)
x3 = # TODO: Calculate x3 = (x1 and x2)

In [None]:
# Expected layout: one row per example with columns ordered (x1, x2, x3), so the fitted
# coefficients line up with the [1, 1, -2] target discussed below.
x_expanded = # TODO: make a single x that contains x and x3

In [None]:
# Fit a Ridge model with default regularization on the expanded features,
# then display the learned coefficients.
mdl_improved = Ridge()
mdl_improved.fit(x_expanded, y)
mdl_improved.coef_

In [None]:
# Oh! Check it out, the model learned exactly what we wanted it to learn! (Well, almost)
# What if we reduced our regularization, do we think the model would get closer to [1, 1, -2],
# Or further away?

In [None]:
# TODO: Guess if the model's coefficients will be closer to [1, 1, -2] or further away
# alpha=0 sets the regularization strength to zero.
mdl_improved = Ridge(alpha=0)
mdl_improved.fit(x_expanded, y)
mdl_improved.coef_

In [None]:
# Wow! It's perfect! Great.
# So the problem here is, we already knew to compute x3, that new feature
# Before we improved our model.
# Let's do a few things with our original x, y dataset

In [None]:
# First, let's check if other models could solve this problem.
# Let's try a decision tree, and a neural network.

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
dt = # TODO Train a model based on a DecisionTreeRegressor
yhat_dt = dt.predict(x)
# No .5 threshold here: the raw predictions are compared directly with the boolean y,
# so only predictions equal to exactly 0 or 1 can count as correct.
(yhat_dt == y).mean()

In [None]:
from sklearn.neural_network import MLPRegressor


In [None]:
# MLPRegressor takes the hidden layer widths through its hidden_layer_sizes argument.
nn = # TODO Train a model based on a MLPRegressor (use hidden_layer_sizes=(100, 100))
# Threshold the real-valued output at .5 before comparing with the boolean labels.
yhat_nn = nn.predict(x) > .5
(yhat_nn == y).mean()

In [None]:
# Okay, so we know that it's possible for a model to automatically learn features like our
# "x_expanded" from earlier, which it can use to solve XOR.
# So let's try to implement a model like linear regression, but with this quirk.
# Let's try a bunch of linear regressions, each computing its own feature.
# Let's call the number of linear regressions we do in this first layer num_hidden,
# because we call this the hidden layer of our neural network.
# We can treat their outputs
# like features from a fake dataset, and use them to do linear regression to predict y.

In [None]:
# Let's use 100 hidden units. Feel free to change this.
# num_hidden sets the second dimension of first_lr and the length of second_lr
# in the training cells further down.
num_hidden = 100

In [None]:
# The training cells further down use shape (2, num_hidden): one column of weights per hidden unit.
first_lr = # TODO Initialize the first linear regression models (weight matrix)

In [None]:
# The training cells further down use shape (num_hidden,): one weight per hidden feature.
second_lr = # TODO Initialize the second linear regression model (weight vector)

In [None]:
# So here's the issue. If we do this directly, it will mathematically represent the same thing
# as a single linear regression from x predicting y.
# Here are some links that describe the issue. I won't go into it in detail
# https://math.stackexchange.com/questions/1948502/show-that-multiplication-of-matrices-corresponds-to-composition-of-linear-transf
# http://www.math.lsa.umich.edu/~kesmith/217worksheet2-3ALT1.pdf
# Let me propose a specific trick to get around this though. All we need is to not compose linear
# functions. So how about we take the features we computed in the first step and put them
# through this kink function (called a leaky relu), which is NOT LINEAR.
# Now we're good to go

In [None]:
def leaky_relu(x):
    """Leaky ReLU activation (exercise stub: returns None until the TODO is completed).

    Intended behavior, per the TODO below: return x where x is not negative,
    and .1 * x where x is negative. The plotting cell that follows passes a
    NumPy array, so the finished version should work elementwise.
    """
    return # TODO calculate the output of a leaky relu, which is equal to x, unless x is negative, then is .1 * x

In [None]:
import matplotlib.pyplot as plt

# Evaluate the activation on 1000 evenly spaced points in [-10, 10] and plot the curve.
xviz = np.linspace(-10, 10, num=1000)
yviz = leaky_relu(xviz)
plt.plot(xviz, yviz)

In [None]:
# Great! So now let's build it.
# Hint: x has shape (n_data, 2) and first_lr is meant to have shape (2, num_hidden),
# so a matrix product gives one row of num_hidden feature values per example.
features = # TODO calculate the hidden layer feature values. These will be used to run the final linear regression

In [None]:
# Sanity check: expected to be (n_data, num_hidden) -- confirm once the TODO above is filled in.
features.shape

In [None]:
# NOTE: this reuses the name yhat from the Ridge accuracy check earlier in the notebook.
yhat = # TODO Calculate the output y prediction vector

In [None]:
loss = # TODO Calculate our loss
# dl_dyhat should hold one entry per prediction, i.e. have the same shape as yhat.
dl_dyhat = # TODO Calculate the derivative of the loss with respect to yhat

In [None]:
# Sanity check of the gradient's shape.
dl_dyhat.shape

In [None]:
# Reminder of the parameter's shape; its gradient computed below should match it.
second_lr.shape

In [None]:
dl_dsecond_lr = # TODO Calculate the derivative of the loss with respect to second_lr
dl_dfeatures = # TODO Calculate the derivative of the loss with respect to features

In [None]:
# Should match features.shape.
dl_dfeatures.shape

In [None]:
# Check the shapes of each of our derivatives! For instance, dl_dfeatures is the same shape
# as features. This makes sense, because the derivative tells us which direction it wants us to change
# each of the values

In [None]:
dl_dfeatures.shape

In [None]:
# The result should have the same shape as first_lr, the parameter it is the gradient of.
dl_dfirst_lr = # TODO Calculate the derivative of the loss with respect to first_lr

In [None]:
# Okay! We calculated all our derivatives. Let's put this in a for loop to update our parameters

In [None]:
# Seed NumPy's global RNG so the random initial weights below are the same on every run.
np.random.seed(0)
# Presumably the learning rate (step size) used by the parameter updates in the loop.
lr = .0001
# Small random initial weights: np.random.rand draws from [0, 1), scaled down by 10000.
first_lr = np.random.rand(2, num_hidden)/10000
second_lr = np.random.rand(num_hidden)/10000
for update_step in range(100):
    # Forward pass (this version has no activation function yet; it is added in a later cell).
    features = # TODO
    yhat = # TODO
    loss = # TODO
    # Backward pass: derivatives of the loss, working back from yhat to the weights.
    dl_dyhat = # TODO
    dl_dsecond_lr = # TODO
    dl_dfeatures = # TODO
    dl_dfirst_lr = # TODO
    # Gradient-descent step: these two lines update the parameters, they do not compute derivatives.
    first_lr = # TODO Update first_lr using lr and dl_dfirst_lr
    second_lr = # TODO Update second_lr using lr and dl_dsecond_lr
    # Report progress every 10th step; accuracy thresholds yhat at 0.5.
    if update_step % 10 == 0:
        print(update_step, 'loss', loss)
        print(update_step, 'acc', ((yhat > 0.5) == y).mean())

In [None]:
# Okay, so we were able to train the model and reduce the loss, but we still didn't
# reach an accuracy greater than half.
# So let's do that thing we talked about earlier, and put the leaky relu in for our "hidden features"
# (the ones after the first linear regression)

In [None]:

# NOTE(review): there is no np.random.seed call in this cell, so the initial weights continue
# the global random stream from earlier cells instead of restarting it.
# The step size here is ten times smaller than in the previous training cell.
lr = .00001
first_lr = np.random.rand(2, num_hidden)/10000
second_lr = np.random.rand(num_hidden)/10000
for update_step in range(100):
    # Forward pass, now with the non-linearity between the two linear steps.
    features = # TODO
    features = # TODO: Apply the leaky relu
    yhat = # TODO
    loss = # TODO
    # Backward pass.
    dl_dyhat = # TODO
    dl_dsecond_lr = # TODO
    dl_dfeatures = # TODO
    # Chain rule through the leaky relu: its slope is 1 where the input is positive and .1 where
    # it is negative. features now holds the activated values, but the leaky relu keeps the sign
    # of its input, so the sign of features still identifies which slope applies.
    dl_dfeatures = # TODO Added. How do we calculate the derivative with respect to the values before the leaky relu?
    dl_dfirst_lr = # TODO
    # Gradient-descent step on both weight arrays.
    first_lr = # TODO
    second_lr = # TODO
    # Report progress every 10th step; accuracy thresholds yhat at 0.5.
    if update_step % 10 == 0:
        print(update_step, 'loss', loss)
        print(update_step, 'acc', ((yhat > 0.5) == y).mean())

In [None]:
# WOOO! We got 100% accuracy! That's called a neural network.
# We can also insert more hidden layers. Each hidden layer's job is just to compute the next
# hidden layer, before finally the last one is used to perform the final linear regression