from: <i><b style='color:red;'>grokking</b> <b>Deep Learning</b></i>
<p>by Andrew W. Trask</p>

<p><b>prerequisites</b></p>
$$f(\mathbf{x}) = \mathbf{w}^{*}\mathbf{x} + b^{*},$$
<p>where <b>w*</b> and b* are the optimal values of the parameters <b>w</b> and b.</p>
<p></p>
<p>perceptron</p>
<p>gradient descent</p>
<p>backpropagation</p>
<p><b>"The interface for the neural network is simple: it accepts an <i>input</i> variable as information and a <i>weights</i> variable as knowledge, and it outputs a prediction."</b></p>
<p><b>"Measuring error simplifies the problem of training neural networks to make correct predictions."</b></p>
<p><b>"Different ways of measuring the error prioritize error differently."</b></p>
<p>During each training iteration, the error is calculated and then used to adjust the weights.</p>
<p><b>"<i>alpha</i> is the simplest way to prevent overcorrecting weight updates."</b></p>

In [None]:
import numpy as np

In [None]:
# initialization, functions

def gradient_descent(prediction, target):
    ''' Squared error between a prediction and its target.

        Despite the name, no descent step happens here: the function only
        evaluates (prediction - target) squared. Works on plain numbers and,
        element-wise, on NumPy arrays.
    '''
    difference = prediction - target
    return difference ** 2

def gradient_descent_deriv(weights):
    ''' Return 2 * weights - 1.

        Despite the name, nothing is differentiated here. This file calls
        it on uniform random draws when creating the initial weights,
        which maps values from [0, 1) onto [-1, 1).
    '''
    doubled = 2 * weights
    return doubled - 1
    
def relu(x):
    ''' Rectified linear unit: positive values pass through, the rest become 0.

        Implemented by multiplying x with the boolean mask (x > 0), so it
        applies element-wise to NumPy arrays as well as to plain numbers.
    '''
    is_positive = x > 0
    return is_positive * x

def relu2deriv(output):
    ''' Slope of relu: true where output > 0, false everywhere else.

        The comparison result is returned as-is, i.e. as booleans; they act
        as 1 and 0 when multiplied with numbers.
    '''
    is_active = output > 0
    return is_active

In [None]:
# input and target
# layer 0 (the network input): four patterns, one per row, three values each
streetlights = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
])
# values to train the model on, one per row of streetlights
# (.T leaves a 1-D array unchanged)
walk_v_stop = np.array([1, 1, 0, 0]).T

In [None]:
# hyperparameters
hidden_size = 4  # width of the hidden layer (layer 1)
alpha = 0.2  # factor that scales every weight correction down to avoid overcorrecting

In [None]:
# initialization, weights
# np.random.random draws uniform values from [0, 1); gradient_descent_deriv
# computes 2 * x - 1, so both matrices start with values in [-1, 1).
# No seed is set, so every run starts from different weights.
weights_0_1 = gradient_descent_deriv(np.random.random(size=(3, hidden_size)))  # layer 0 -> layer 1
weights_1_2 = gradient_descent_deriv(np.random.random(size=(hidden_size, 1)))  # layer 1 -> layer 2

In [None]:
# training
# Supervised learning with per-example updates: each of the 300 passes visits
# every row of streetlights once and corrects both weight matrices right
# after seeing it.
for iteration in range(300):
    # every pass starts with a fresh error total
    layer_2_error = 0

    # `values` is not used below; each row is re-sliced with index:index+1
    # so that it keeps its leading axis
    for index, values in enumerate(streetlights):
        # forward pass
        layer_0 = streetlights[index:index+1]  # the input row, kept 2-D
        layer_1 = relu(layer_0.dot(weights_0_1))
        layer_2 = layer_1.dot(weights_1_2)

        # add this example's squared error (prediction vs. target) to the total
        layer_2_error += np.sum(gradient_descent(layer_2, walk_v_stop[index:index+1]))

        # backward pass: the output delta is the raw prediction error; it is
        # sent back through weights_1_2 and masked by the relu slope
        layer_2_delta = layer_2 - walk_v_stop[index:index+1]
        layer_1_delta = np.dot(layer_2_delta, weights_1_2.T) * relu2deriv(layer_1)

        # in-place weight corrections, damped by alpha; layer_1_delta was
        # computed above from weights_1_2 as it was before this update
        weights_1_2 -= alpha * np.dot(layer_1.T, layer_2_delta)
        weights_0_1 -= alpha * np.dot(layer_0.T, layer_1_delta)

    # report the accumulated error on every tenth pass
    if (iteration + 1) % 10 == 0:
        print(f"Error: {layer_2_error:.25f}")

In [None]:
%whos

In [None]:
# Persist both trained weight matrices as pickle files under ./pickle/.
import os
import pickle

# open(..., 'wb') does not create missing directories; without this call the
# first write raises FileNotFoundError when ./pickle/ does not exist yet.
os.makedirs('pickle', exist_ok=True)

# NOTE: pickle files should only ever be loaded back from trusted sources.
with open('pickle/weights_0_1.pickle.bin', 'wb') as out_file:
    pickle.dump(weights_0_1, out_file)
with open('pickle/weights_1_2.pickle.bin', 'wb') as out_file:
    pickle.dump(weights_1_2, out_file)