In [None]:
import numpy as np
import seaborn as sb
import pandas
import sys
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
%matplotlib notebook

# Simple two-layer, non-linear perceptron

We are going to implement a simple perceptron with a non-linear sigmoid activation function and train it using backpropagation.

Weights are initialized with random numbers of mean 0 and are then updated iteratively using the error between the network output and the desired labels.

Initially, we are going to test very simple input-output relationships with the input consisting of a 3-dimensional vector with either 0 or 1, and the output consisting of a 1-dimensional target-value also with either 0 or 1.

In [None]:
# a non-linear activation function
# here, we use a sigmoid shape
def sigmoid(x):
    """Return the logistic sigmoid 1/(1+exp(-x)), evaluated element-wise.

    The textbook formula calls exp(-x) directly, which overflows (and
    emits a RuntimeWarning) for large negative x. Here the exponential is
    only ever taken of a non-positive number, so it cannot overflow.

    x may be a scalar or anything np.asarray accepts; a scalar input
    gives a scalar result, an array input gives an array of equal shape.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1] and therefore never overflows
    decay = np.exp(-np.abs(x))
    # x >= 0: 1/(1+e^-x)      x < 0: e^x/(1+e^x)   (algebraically identical)
    out = np.where(x >= 0, 1/(1+decay), decay/(1+decay))
    # indexing with () turns a 0-d result into a scalar, arrays are unchanged
    return out[()]

# slope of the sigmoid, written in terms of the sigmoid's own output
def dsigmoid(x):
    """Return x*(1-x), element-wise.

    Note that the callers in this file pass in the *output* of sigmoid
    (the layer activation), not the raw pre-activation value.
    """
    complement = 1 - x
    return x * complement

# implements a simple two-layer network
def twoLayer(X,y,plotting=True,maxIter=10000,alpha=1.0):
    """Train a sigmoid network that connects the inputs directly to the outputs.

    All training examples are pushed through the network at once in every
    iteration (full-batch update).

    Parameters
    ----------
    X : array, one training example per row, one input dimension per column.
    y : array of target values, one row per row of X and one column per
        output. A 1-D vector is treated as a single output column.
    plotting : if True, plot the summed squared error over the iterations.
    maxIter : number of training iterations (default 10000).
    alpha : factor applied to every weight update ("learning rate").
        The default of 1.0 applies the raw update unchanged.

    Returns
    -------
    (syn0, l1ErrorArray) : the trained weight matrix and the summed
        squared error recorded in every iteration.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # accept a flat target vector as one output column; y.shape[1] below
    # would otherwise raise an IndexError
    if y.ndim == 1:
        y = y.reshape(-1,1)

    # seed random numbers to get repeatable results
    np.random.seed(1)

    # initialize weights randomly in [-1,1) (mean 0)
    # the number of weights is determined by the number
    # of columns in both the input data X and the output
    # data y!
    syn0 = 2*np.random.random((X.shape[1],y.shape[1])) - 1

    # store the summed squared error of every iteration
    l1ErrorArray = np.zeros(maxIter)

    # output for the initial weights, so that l1 exists even if maxIter is 0
    l1 = sigmoid(np.dot(X,syn0))

    for it in range(maxIter):

        # forward propagation: push the patterns through the
        # activation function to get the output of the layer
        l1 = sigmoid(np.dot(X,syn0))

        # evaluate the error of the layer
        l1Error = y - l1

        # evaluate the summed squared error
        l1ErrorArray[it] = np.sum(l1Error*l1Error)

        # print out the summed squared error sometimes; the trailing \r
        # makes each report overwrite the previous one
        if (it%1000==0):
            sys.stdout.write("Iteration {:d}: error = {:f}\r".format(it,l1ErrorArray[it]))
            sys.stdout.flush()

        # the error determines the amount we need
        # to move along the derivative
        l1Delta = l1Error * dsigmoid(l1)

        # the weight update is the dot product between the pattern
        # input and the correction amount, scaled by alpha
        syn0 += alpha*np.dot(X.T,l1Delta)

    # l1 is the output of the last forward pass, i.e. it was computed
    # before the final weight update
    print("output after training is:\n",l1)
    if (plotting):
        fig,ax = plt.subplots(figsize=(8,6))
        ax.plot(l1ErrorArray)
    return(syn0,l1ErrorArray)
    
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output, one number per training example; row i of y belongs
# to row i of X, so effectively we learn two class labels (0 and 1)
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# run the training; keep the learned weights and the error curve
weights, errors = twoLayer(X, y)

That went pretty well. We achieved fast learning. 

Notice, however, that the first column of the training examples was fully correlated with the target values! Hence, among all the input dimensions, this relationship should have been easy to find!

In fact, let's check the weights of the network:

In [None]:
# display the weight matrix returned by the training call
weights

Predictably, the first data dimension was given a large positive weight, whereas the other dimensions were given small and/or negative weights.

Note, however, that the weights are not the weights that would be given by a linear perceptron, since the weights here sit inside the sigmoid activation function!

$Out_j = 1/(1+e^{-\sum_k w_{jk} x_k})$

So, what about a different, less correlated target value vector?

In [None]:
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output, one number per training example; row i of y belongs
# to row i of X. This target no longer equals any single input column.
y = np.array([1, 1, 1, 0]).reshape(-1, 1)

# run the training and show the learned weights
weights, errors = twoLayer(X, y)
print(weights)

That works as well. 

We have seen that networks can learn simple logical functions, so this one should do that too, right? Let's try:

In [None]:
# training patterns: all eight combinations of three binary features,
# one example per row
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1],
              [1, 0, 0],
              [1, 1, 0],
              [0, 1, 0],
              [0, 0, 0]])

# desired output: a simple logical combination of the three inputs,
#   x0 OR (x1 AND x2)
# the result is a flat boolean vector with one entry per row of X
y = (X[:, 0] != 0) | ((X[:, 1] != 0) & (X[:, 2] != 0))
print(y)

# run the training (targets passed as a single column) and show the weights
weights, errors = twoLayer(X, y[:, np.newaxis])
print(weights)

But we forgot something! Right now, what we can do is to change the weights on the sigmoids. But we have no way to shift the whole curve!! This is what the "bias" neuron can do.

To see why this is important, imagine we are trying to fit a line by only having weights:

$ y = w*x$

This can of course only change the slope of the line. But we also need to shift the line, so we need an intercept or bias:

$ y = w*x + b$


Let's add this neuron to our two-layer network:

In [None]:
def twoLayerBias(X,y,plotting=True):
    """Train the two-layer sigmoid network with an additional bias input.

    The bias is modelled by prepending a column of ones to X; the weight
    attached to that column can shift the sigmoid as a whole. Everything
    else (seeding, weight initialization, training loop, progress output,
    plotting) is exactly what twoLayer does, so the work is delegated to
    it instead of repeating the loop here.

    Parameters
    ----------
    X : array, one training example per row (without the ones column).
    y : array of target values, one row per row of X and one column per
        output.
    plotting : if True, plot the summed squared error over the iterations.

    Returns
    -------
    (syn0, l1ErrorArray) : the trained weight matrix, whose first row
        belongs to the bias input, and the summed squared error recorded
        in every iteration.
    """
    # let's add ones to the data to model the bias
    X = np.hstack((np.ones((X.shape[0],1)),X))

    # the number of weights now follows from the columns of the augmented
    # X (bias included) and the output dimensions given by y
    return twoLayer(X,y,plotting)



In [None]:
# training patterns: all eight combinations of three binary features,
# one example per row
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1],
              [1, 0, 0],
              [1, 1, 0],
              [0, 1, 0],
              [0, 0, 0]])

# desired output: a simple logical combination of the three inputs,
#   x0 OR (x1 AND x2)
# the result is a flat boolean vector with one entry per row of X
y = (X[:, 0] != 0) | ((X[:, 1] != 0) & (X[:, 2] != 0))
print(y)

# same training as before, but now with the bias input, then show the weights
weights, errors = twoLayerBias(X, y[:, np.newaxis])
print(weights)

Cool, that works.

So let's try something very different. Let's go back to our beautiful IRIS dataset and try to categorize using a two-layer network:

In [None]:
iris = load_iris()

# network inputs: the first 100 samples of the data set, one per row,
# used as-is (no normalization)
X = iris.data[:100, :]

# desired output: the class labels of those same 100 samples
# (presumably only the labels 0 and 1 occur here; check iris.target)
y = iris.target[:100]

# train the bias network (targets passed as a single column), show the weights
weights, errors = twoLayerBias(X, y[:, np.newaxis])
print(weights)

Ugh. That did not go so well. Somehow, the network did not learn. Instead the error goes pretty much immediately to high values. What went wrong?

The problem is in the input values. When using the gradient for updating the weights, the gradient can become too big and the network "overshoots". The best solution is to:

* normalize the data: `X = (X-X.mean(axis=0))/X.std(axis=0)`

In [None]:
iris = load_iris()

# network inputs: the first 100 samples of the data set, one per row
X = iris.data[:100, :]

# standardize every column to zero mean and unit standard deviation so
# that the weight updates stay at a manageable size
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# desired output: the class labels of those same 100 samples
y = iris.target[:100]

# train the bias network (targets passed as a single column), show the weights
weights, errors = twoLayerBias(X, y[:, np.newaxis])
print(weights)

Good. As we can see, the examples are correctly recognized and the weights that are given to the four dimensions are now different.

## Breaking the neural network

Let's return to our simple, 0/1 example and try a different training/testing combination:

In [None]:
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output, one number per training example; row i of y belongs
# to row i of X. The label is 1 exactly when the first two inputs agree.
y = np.array([1, 1, 0, 0]).reshape(-1, 1)

# run the training with the bias network and show the learned weights
weights, errors = twoLayerBias(X, y)
print(weights)

Ouch. So that does not work. We have left the realm of simple correlations and the output space is a weird, highly non-linear combination of the inputs.

But this is a sigmoid-network, so can our network even learn non-linear things?

In [None]:
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output: an explicitly non-linear function of the three inputs,
#   x0 + (x1 - x2) / (x1 + 0.4)
# evaluated for every row of X (flat vector, one entry per example)
y = X[:, 0] + (X[:, 1] - X[:, 2]) / (X[:, 1] + 0.4)
print(y)

# train the bias network (targets passed as a single column), show the weights
weights, errors = twoLayerBias(X, y[:, np.newaxis])
print(weights)

Apparently not. But wait!

If we look at the "y" target values, we can see that they are well outside the [0,1] range of the network inputs and, more importantly, of the network output. Since the sigmoid activation function only produces values between 0 and 1, we should of course normalize our target values to lie between 0 and 1:

In [None]:
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output: an explicitly non-linear function of the three inputs,
#   x0 + (x1 - x2) / (x1 + 0.4)
# evaluated for every row of X (flat vector, one entry per example)
y = X[:, 0] + (X[:, 1] - X[:, 2]) / (X[:, 1] + 0.4)
print(y)

# min-max rescaling: map the smallest target to 0 and the largest to 1
y = (y - np.min(y)) / (np.max(y) - np.min(y))
print(y)

# train the bias network (targets passed as a single column), show the weights
weights, errors = twoLayerBias(X, y[:, np.newaxis])
print(weights)

But, still, the example from above apparently cannot be learned. This is because, even though we have a non-linear activation, the target vectors are far outside the "span" of the input examples.

The solution? 

More layers!

## Three-layer network

Let's add a "hidden" layer to our network. We also add a bias node to the input layer of the network in order to be able to shift the outputs around.

Finally, since multi-layer networks are trained with gradient descent via backpropagation, we are in danger of getting stuck in local minima. One way to mitigate this is to use a "learning rate" that dampens the size of each weight update.

In [None]:
def threeLayerBias(X,y,numHidden=5,alpha=0.02,plotting=True,maxIter=40000):
    """Train a sigmoid network with one hidden layer and a bias input.

    A column of ones is prepended to X to model the bias of the input
    layer (the hidden layer itself gets no bias unit). All training
    examples are pushed through the network at once in every iteration.

    Parameters
    ----------
    X : array, one training example per row (without the ones column).
    y : array of target values, one row per row of X and one column per
        output. A 1-D vector is treated as a single output column.
    numHidden : number of units in the hidden layer.
    alpha : learning rate; every weight update is multiplied by it.
    plotting : if True, plot the summed squared error over the iterations.
    maxIter : number of training iterations (default 40000).

    Returns
    -------
    (syn1, l2ErrorArray, l2) : the hidden-to-output weights, the summed
        squared error recorded in every iteration, and the network output
        of the last forward pass. The input-to-hidden weights (syn0) are
        not returned.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # accept a flat target vector as one output column; y.shape[1] below
    # would otherwise raise an IndexError
    if y.ndim == 1:
        y = y.reshape(-1,1)

    # seed random numbers to get repeatable results
    np.random.seed(1)

    # let's add ones to the data to model the bias
    X = np.hstack((np.ones((X.shape[0],1)),X))

    # initialize weights randomly in [-1,1) (mean 0)
    syn0 = 2*np.random.random((X.shape[1],numHidden)) - 1
    syn1 = 2*np.random.random((numHidden,y.shape[1])) - 1

    # store the summed squared error of every iteration
    l2ErrorArray = np.zeros(maxIter)

    # output for the initial weights, so that l2 exists even if maxIter is 0
    l2 = sigmoid(np.dot(sigmoid(np.dot(X,syn0)),syn1))

    for it in range(maxIter):

        # forward propagation: push the patterns through the hidden
        # layer (l1) and then through the output layer (l2)
        l1 = sigmoid(np.dot(X,syn0))
        l2 = sigmoid(np.dot(l1,syn1))

        # evaluate the error of the output layer
        l2Error = y - l2

        # evaluate the summed squared error
        l2ErrorArray[it] = np.sum(l2Error*l2Error)

        # print out the summed squared error sometimes; the trailing \r
        # makes each report overwrite the previous one
        if (it%1000==0):
            sys.stdout.write("Iteration {:d}: error = {:f}\r".format(it,l2ErrorArray[it]))
            sys.stdout.flush()

        # the error determines the amount we need to move along the
        # derivative; the output-layer correction is propagated back
        # through syn1 to obtain the correction for the hidden layer.
        # both are computed before any weight is changed
        l2Delta = l2Error*dsigmoid(l2)
        l1Delta = l2Delta.dot(syn1.T)*dsigmoid(l1)

        # the weight update is the dot product between the layer input
        # and the correction amount, scaled by the learning rate alpha
        syn1 += alpha*np.dot(l1.T,l2Delta)
        syn0 += alpha*np.dot(X.T,l1Delta)

    # l2 is the output of the last forward pass, i.e. it was computed
    # before the final weight update
    print("output after training is:\n",l2)

    if (plotting):
        fig,ax = plt.subplots(figsize=(8,6))
        ax.plot(l2ErrorArray)
    return(syn1,l2ErrorArray,l2)

In [None]:
# training patterns: one example per row, three binary features each
X = np.array([[0, 0, 1],
              [1, 1, 1],
              [1, 0, 1],
              [0, 1, 1]])

# desired output, one number per training example; row i of y belongs
# to row i of X. The label is 1 exactly when the first two inputs agree.
y = np.array([1, 1, 0, 0]).reshape(-1, 1)

# train with three hidden units and a learning rate of 1; the network
# output (third return value) is not needed here
weights, errors, _ = threeLayerBias(X, y, numHidden=3, alpha=1)
print(weights)

Alright, now this one works. You can also clearly see the location where the optimization kicked in to do another round.

In [None]:
iris = load_iris()

# network inputs: every sample of the data set, one per row, with each
# column standardized to zero mean and unit standard deviation
X = iris.data
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# desired output: the class labels, halved
# (presumably this maps the labels 0/1/2 to 0/0.5/1 so that they fit the
# output range of the sigmoid; check iris.target)
y = iris.target/2

# train with five hidden units and a learning rate of 1 (targets passed
# as a single column), then show the hidden-to-output weights
weights, errors, pred = threeLayerBias(X, y[:, np.newaxis], numHidden=5, alpha=1)
print(weights)

# compare the network output with the targets, sample by sample
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred)
ax.plot(y)