In [None]:
import numpy as np

| Input1 | Input2 |   Input3(bias)   | Output        |
| :-------------: | :-------------: | :-------------: | :-------------: |
| 0 | 0 | 1     | 0 |
| 0 | 1 | 1     | 1 |
| 1 | 0 | 1     | 1 |
| 1 | 1 | 1     | 1 |

In [None]:
#![title](img/simple_neuralnet.png)

In [None]:
# Input data: one row per training sample, columns are (Input1, Input2, bias).
# The fourth truth-table row, [1, 1, 1] -> 1, is left out of the training set.
X = np.array([
    [0, 0, 1],
    [0, 1, 1],
    [1, 0, 1],
])

# Desired output data: one target per row of X, stored as a column.
y = np.array([[0], [1], [1]])
    

In [None]:
#X_test = np.array([[1,1,1]]).T
#X_test

In [None]:
def nonlin(x, deriv=False):
    """Sigmoid non-linearity, or its slope written in terms of the activation.

    With ``deriv`` left at its default, returns ``1 / (1 + exp(-x))``.
    With ``deriv=True``, returns ``x * (1 - x)``. ``x`` is not passed through
    the sigmoid in that branch, so the result is the sigmoid's slope only when
    ``x`` is already a sigmoid output.
    """
    if deriv == True:
        complement = 1 - x
        return x * complement

    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
np.random.seed(1)  # fixed seed: the same initial weights on every run

# Synapses (weight matrices). random() samples from [0, 1); doubling the
# samples and subtracting 1 maps them into [-1, 1).

# Input -> hidden: 3x4 matrix of weights, (2 inputs + 1 bias) x 4 hidden nodes.
syn0 = np.random.random((3, 4)) * 2 - 1
# Hidden -> output: 4x1 matrix of weights, 4 hidden nodes x 1 output.
# No bias term is added at the hidden layer.
syn1 = np.random.random((4, 1)) * 2 - 1

syn1

In [None]:
# Training step: forward pass, backpropagation and weight update, repeated on
# the whole training set.
for j in range(100000):

    # Forward pass through both weight matrices.
    l0 = X                               # layer 0: the input batch
    l1 = nonlin(np.dot(l0, syn0))        # layer 1: hidden activations
    l2 = nonlin(np.dot(l1, syn1))        # layer 2: network output

    # Backpropagation of errors using the chain rule, starting at the output.
    l2_error = y - l2
    if j % 10000 == 0:
        # Report the mean absolute error on every 10000th iteration only,
        # to save time and keep the amount of output small.
        print("Error: " + str(np.abs(l2_error).mean()))

    # Scale each error by the sigmoid slope at the corresponding activation.
    l2_delta = l2_error * nonlin(l2, deriv=True)
    # Hidden-layer error is computed with syn1 as it was BEFORE this update.
    l1_error = np.dot(l2_delta, syn1.T)
    l1_delta = l1_error * nonlin(l1, deriv=True)

    # Update weights in place (no learning rate term).
    syn1 += np.dot(l1.T, l2_delta)
    syn0 += np.dot(l0.T, l1_delta)

print("Output after training")
print(l2)


In [None]:
# Feed the sample [1, 1, 1] forward through the network using the current
# syn0 / syn1 weights. This rebinds l0, l1 and l2.
l0 = np.array([1, 1, 1])       # layer 0: Input1 = 1, Input2 = 1, bias = 1
l1 = nonlin(l0.dot(syn0))      # layer 1: hidden activations
l2 = nonlin(l1.dot(syn1))      # layer 2: network output
l2

# Sample 2

| Input1 | Input2 |   Input3 (bias)  | Output        |
| :-------------: | :-------------: | :-------------: | :-------------: |
| 0 | 0 | 1     | 0 |
| 0 | 1 | 1     | 0 |
| 1 | 0 | 1     | 1 |
| 1 | 1 | 1     | 1 |

| Input1 | Input2 |   Input3   | Output        |
| :--: |:--:|:--:|:--:|
| 1 | 0 | 0 | ? |


In [None]:
# Input data: one row per training example, columns are (Input1, Input2, bias).
X_train = np.array([[0, 0, 1],
                    [0, 1, 1],
                    [1, 0, 1],
                    [1, 1, 1]])

# Desired output for each row of X_train, stored as a 4x1 column.
# The target equals Input1 (0, 0, 1, 1 for the row order above), as in the
# "Sample 2" truth table. Labels [0, 1, 1, 0] with this row order would encode
# XOR of Input1 and Input2, which a single sigmoid neuron cannot fit.
y_train = np.array([[0, 0, 1, 1]]).T

### Model
![title](img/singleperceptron.png)

### Training process
1. Take the inputs from a training set example, adjust them by the weights, and pass them through a special formula to calculate the neuron’s output.
2. Calculate the error, which is the difference between the neuron’s output and the desired output in the training set example.
3. Depending on the direction of the error, adjust the weights slightly.
4. Repeat this process 10,000 times.

### Formula for calculating the neuron’s output
The weighted sum of the inputs is

$$\sum_i w_i x_i = w_1 x_1 + w_2 x_2 + \dots + w_n x_n$$

Next we normalise this, so the result is between 0 and 1, using the Sigmoid function:

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

Final output:

$$\text{output} = \frac{1}{1 + e^{-\sum_i w_i x_i}}$$
### Adjusting weight

1. We used the Sigmoid curve to calculate the output of the neuron.
2. If the weighted sum is a large positive or negative number (so the output is very close to 1 or 0), it signifies the neuron was quite confident one way or another.
3. From Diagram 4, we can see that at large numbers, the Sigmoid curve has a shallow gradient.
4. If the neuron is confident that the existing weight is correct, it doesn’t want to adjust it very much. Multiplying by the Sigmoid curve gradient achieves this.

Error Weighted Derivative: 
``` math
Adjust weights by = error * inputs * sigmoidCurveGradient(output)
sigmoidCurveGradient(output) = output *( 1 - output)
Hence => error * inputs * output *( 1 - output)
```

In [None]:
def __sigmoid(x):
    """Sigmoid function, which describes an S-shaped curve.

    The weighted sum of the inputs is passed through it so that the result is
    normalised to lie between 0 and 1. Computes ``1 / (1 + exp(-x))``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def __sigmoid_derivative(x):
    """Gradient of the Sigmoid curve, written in terms of the Sigmoid's output.

    Returns ``x * (1 - x)``. ``x`` is not passed through the Sigmoid here, so
    this is the curve's gradient only when ``x`` is already a Sigmoid output.
    It indicates how confident we are about the existing weight.
    """
    complement = 1 - x
    return x * complement

In [None]:
np.random.seed(1)  # fixed seed: the same starting weights on every run

# A single neuron with 3 input connections and 1 output connection:
# a 3x1 matrix of weights, (2 inputs + 1 bias) x 1 node.
# random() samples from [0, 1); doubling and subtracting 1 gives values in
# the range -1 to 1, centred on 0.
synaptic_weights = np.random.random((3, 1)) * 2 - 1
synaptic_weights

In [None]:
np.random.seed(1)

# We model a single neuron, with 3 input connections and 1 output connection.
# We assign random weights to a 3 x 1 matrix ((2 inputs + 1 bias) x 1 node),
# with values in the range -1 to 1 and mean 0.
synaptic_weights = 2 * np.random.random((3, 1)) - 1

# Training step.
for j in range(10000):

    # Forward pass: the single neuron is both layer 1 and the output layer.
    l0 = X_train     # layer 0
    output = __sigmoid(np.dot(l0, synaptic_weights))

    # Error: the difference between the desired output and the predicted output.
    output_error = y_train - output

    # Multiply the error by the input and again by the gradient of the Sigmoid curve.
    # This means less confident weights are adjusted more.
    # This means inputs, which are zero, do not cause changes to the weights.
    adjustment = np.dot(l0.T, output_error * __sigmoid_derivative(output))

    # Report progress on every 1000th iteration only. Printing the weights on
    # each of the 10000 iterations would flood the cell output.
    if j % 1000 == 0:
        print("Error: " + str(np.mean(np.abs(output_error))))
        # Weights as they were used for this iteration's forward pass.
        print("syn: ", synaptic_weights)

    # Adjust the weights in place (no learning rate term).
    synaptic_weights += adjustment

synaptic_weights

# Implementing as class

In [None]:
from numpy import exp, array, random, dot


class NeuralNetwork():
    """A single sigmoid neuron with 3 input connections and 1 output connection.

    Only the names imported alongside this class (``exp``, ``random``, ``dot``)
    are used, so the class does not rely on an ``np`` alias being in scope.
    """

    def __init__(self):
        # Seed NumPy's global random number generator, so it generates the
        # same numbers every time the program runs.
        random.seed(1)

        # 3 x 1 weight matrix with values in the range -1 to 1 and mean 0:
        # random() samples from [0, 1), which is doubled and shifted by -1.
        self.synaptic_weights = 2 * random.random((3, 1)) - 1

    def __sigmoid(self, x):
        """Sigmoid function, which describes an S-shaped curve.

        The weighted sum of the inputs is passed through it to normalise the
        result between 0 and 1.
        """
        return 1 / (1 + exp(-x))

    def __sigmoid_derivative(self, x):
        """Gradient of the Sigmoid curve, given the Sigmoid's output ``x``.

        Returns ``x * (1 - x)``. It indicates how confident we are about the
        existing weight.
        """
        return x * (1 - x)

    def train(self, training_set_inputs, training_set_outputs, number_of_training_iterations):
        """Train the neuron by trial and error, adjusting the weights each time.

        Args:
            training_set_inputs: array of input rows; ``.T`` is taken on it,
                so it must be an array rather than a plain list.
            training_set_outputs: desired outputs, one row per input row.
            number_of_training_iterations: how many full passes to run.

        Prints the mean absolute error on every 1000th iteration and updates
        ``self.synaptic_weights`` in place.
        """
        for iteration in range(number_of_training_iterations):
            # Pass the training set through our neural network (a single neuron).
            output = self.think(training_set_inputs)

            # Calculate the error (the difference between the desired output
            # and the predicted output).
            error = training_set_outputs - output

            # Only print the error every 1000 steps, to save time and limit
            # the amount of output.
            if iteration % 1000 == 0:
                print("Error: " + str(abs(error).mean()))

            # Multiply the error by the input and again by the gradient of the Sigmoid curve.
            # This means less confident weights are adjusted more.
            # This means inputs, which are zero, do not cause changes to the weights.
            adjustment = dot(training_set_inputs.T, error * self.__sigmoid_derivative(output))

            # Adjust the weights (no learning rate term).
            self.synaptic_weights += adjustment

    def think(self, inputs):
        """Pass ``inputs`` through the single neuron and return its output."""
        return self.__sigmoid(dot(inputs, self.synaptic_weights))


if __name__ == "__main__":

    # Initialise a single-neuron neural network.
    neural_network = NeuralNetwork()

    print("Random starting synaptic weights: ")
    print(neural_network.synaptic_weights)

    # The training set: 4 examples, each consisting of 3 input values
    # and 1 output value.
    training_set_inputs = array([
        [0, 0, 1],
        [1, 1, 1],
        [1, 0, 1],
        [0, 1, 1],
    ])
    # One target per input row, stored as a 4x1 column.
    training_set_outputs = array([[0], [1], [1], [0]])

    # Train on the training set for 10,000 iterations, making an adjustment
    # to the weights on each one.
    neural_network.train(training_set_inputs, training_set_outputs, 10000)

    print("New synaptic weights after training: ")
    print(neural_network.synaptic_weights)

    # Test the neural network with an input that is not in the training set.
    print("Considering new situation [1, 0, 0] -> ?: ")
    print(neural_network.think(array([1, 0, 0])))

# scikit-learn

In [81]:
from sklearn.neural_network import MLPClassifier
# Toy training set: two samples with two features each, and their class labels.
# NOTE: this rebinds the names X and y that the first network in this notebook
# also uses for its training data.
X = [[0., 0.], [1., 1.]]
y = [0, 1]

# Multi-layer perceptron with two hidden layers of 5 and 2 units.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y)

# Only the last expression of a cell is displayed, so the predictions are
# printed explicitly instead of being left as a bare expression whose value
# is discarded.
print(clf.predict([[1, 0], [0, 1]]))

# Shape of each layer's weight matrix (cell output).
[coef.shape for coef in clf.coefs_]

[(2, 5), (5, 2), (2, 1)]