In [2]:
import numpy as np

In [5]:
# Streetlight configuration (one row per observation)
xs = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
])

# Walk versus stop (one target per observation, stored as a column)
ys = np.array([[0], [1], [0], [1], [1], [0]])

# Train on the first data entry only
x_i, y_i = xs[0], ys[0]

epochs = 50
lr = 0.1

# Neural network weights (one layer)
w = np.array([0.5, 0.48, -0.7])

for epoch in range(epochs):
    # Forward pass: prediction, signed error and squared-error loss
    y_hat = x_i.dot(w)
    error = y_hat - y_i
    loss = error ** 2.0

    # Step against the error scaled by the input (in-place weight update)
    w -= lr * (error * x_i)

    print('epoch: {} - loss: {}, prediction: {}'.format(epoch, loss, y_hat))

epoch: 0 - loss: [0.04], prediction: -0.19999999999999996
epoch: 1 - loss: [0.0256], prediction: -0.15999999999999992
epoch: 2 - loss: [0.016384], prediction: -0.1279999999999999
epoch: 3 - loss: [0.01048576], prediction: -0.10239999999999982
epoch: 4 - loss: [0.00671089], prediction: -0.08191999999999977
epoch: 5 - loss: [0.00429497], prediction: -0.06553599999999982
epoch: 6 - loss: [0.00274878], prediction: -0.05242879999999994
epoch: 7 - loss: [0.00175922], prediction: -0.04194304000000004
epoch: 8 - loss: [0.0011259], prediction: -0.03355443200000008
epoch: 9 - loss: [0.00072058], prediction: -0.02684354560000002
epoch: 10 - loss: [0.00046117], prediction: -0.021474836479999926
epoch: 11 - loss: [0.00029515], prediction: -0.01717986918399994
epoch: 12 - loss: [0.00018889], prediction: -0.013743895347199997
epoch: 13 - loss: [0.00012089], prediction: -0.010995116277759953
epoch: 14 - loss: [7.73712525e-05], prediction: -0.008796093022207963
epoch: 15 - loss: [4.95176016e-05], predi

Learning the whole dataset

In [2]:
# Streetlight configuration (one row per observation)
xs = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
])

# Walk versus stop (one target per observation, stored as a column)
ys = np.array([[0], [1], [0], [1], [1], [0]])

epochs = 50
lr = 0.1

# Neural network weights (one layer)
w = np.array([0.5, 0.48, -0.7])

for epoch in range(epochs):

    # Loss summed over every example seen in this epoch
    l_total = 0.0

    # Visit the examples in dataset order, pairing each input with its target
    for x_i, y_i in zip(xs, ys):

        # Forward pass: prediction and signed error
        y_hat = x_i.dot(w)
        error = y_hat - y_i

        # Accumulate the squared-error loss
        l_total += error ** 2.0

        # Updating the weights after every single example is what makes
        # this stochastic gradient descent (SGD)
        w -= lr * (error * x_i)
        print('    actual: {}, prediction: {}'.format(y_i, y_hat))

    print('epoch: {} - total loss: {}'.format(epoch, l_total))

    actual: [0], prediction: -0.19999999999999996
    actual: [1], prediction: -0.19999999999999996
    actual: [0], prediction: -0.5599999999999999
    actual: [1], prediction: 0.616
    actual: [1], prediction: 0.17279999999999995
    actual: [0], prediction: 0.17552
epoch: 0 - total loss: [2.65612311]
    actual: [0], prediction: 0.14041599999999999
    actual: [1], prediction: 0.3066464
    actual: [0], prediction: -0.34513824
    actual: [1], prediction: 1.006637344
    actual: [1], prediction: 0.4785034751999999
    actual: [0], prediction: 0.26700416768
epoch: 1 - total loss: [0.96287018]
    actual: [0], prediction: 0.213603334144
    actual: [1], prediction: 0.5347420299776
    actual: [0], prediction: -0.26067345110016
    actual: [1], prediction: 1.131942884509696
    actual: [1], prediction: 0.6274723921901568
    actual: [0], prediction: 0.25433999330650114
epoch: 2 - total loss: [0.55091659]
    actual: [0], prediction: 0.20347199464520094
    actual: [1], prediction: 0.6

A full-fledged 2-layer neural network

In [10]:
# Seed NumPy's global random state so the randomly drawn weights are the same on every run
np.random.seed(1)

def init_weights(shape):
    '''Draw initial weights uniformly from the half-open interval [-1, 1).

    Inputs:
    - shape: shape of the weight array to create

    Returns:
    - array of the requested shape, sampled from NumPy's global random state
    '''
    unit_samples = np.random.random(shape)  # uniform in [0, 1)
    return 2.0 * unit_samples - 1.0

def affine(x, w):
    '''Linear (bias-free) layer: the matrix product of the input and the weights.

    Inputs:
    - x: layer input
    - w: layer weights

    Returns:
    - the product x.dot(w); for 2-D arrays of shapes (n, d) and (d, h) the
      result has shape (n, h)
    '''
    return x.dot(w)

def affine_backwards(dl_do, x, w):
    '''Backward pass of the affine layer `o = x.dot(w)`.

    Inputs:
    - dl_do: derivative of the network loss w.r.t. this layer's output o
    - x: original input to this layer
    - w: original weights associated with this layer

    Returns:
    - [dl_dx, dl_dw]: list of two items:
        - dl_dx: dl_do propagated back to the layer input, same shape as x
        - dl_dw: dl_do propagated back to the layer weights, same shape as w
    '''
    # Chain rule through the matrix product, w.r.t. the input
    dl_dx = dl_do.dot(w.T)
    assert dl_dx.shape == x.shape

    # Chain rule through the matrix product, w.r.t. the weights
    dl_dw = x.T.dot(dl_do)
    assert dl_dw.shape == w.shape

    return [dl_dx, dl_dw]

def relu(x):
    '''A ReLU activation function, which sets all negative numbers to zero

    Inputs:
    - x: layer input

    Returns:
    - o: layer output with the same shape as x
    '''
    # Element-wise max(x, 0). Multiplying by a boolean mask instead would turn
    # -inf into NaN (0 * -inf) and negative entries into -0.0.
    o = np.maximum(x, 0)
    return o

def relu_backwards(dl_do, x):
    '''Backward pass of ReLU: the incoming gradient survives only where x > 0.

    Inputs:
    - dl_do: derivative of the network loss w.r.t. this layer's output
    - x: original input to this layer

    Returns:
    - dl_do multiplied element-wise by the mask (x > 0), i.e. zeroed wherever
      x is not strictly positive
    '''
    return dl_do * (x > 0.0)

def mse(y, y_hat):
    '''
    Computes the MSE (mean squared error) between the actual and predicted
    value(s) of `y`

    Inputs:
    - y: actual value(s)
    - y_hat: predicted value(s)

    Returns:
    - the squared differences averaged over all elements, as a single number
    '''
    squared_error = (y_hat - y) ** 2.0
    # Average rather than sum: a plain sum is the sum of squared errors, which
    # grows with the number of elements. For a single element both agree.
    return np.mean(squared_error)

def reduce_sum(x):
    '''Add up every element of `x` and return the total as a single value.'''
    total = np.sum(x)
    return total

def reduce_sum_backwards(o):
    '''Return an array of ones with the same shape and dtype as `o`.

    NOTE(review): presumably the local gradient of `reduce_sum`, which is 1 for
    every summed element -- confirm which array callers are expected to pass.
    '''
    ones = np.ones_like(o)
    return ones

# Inputs: one row per training example
xs = np.array([
    [1.0, 0.0, 1.0],
    [0.0, 1.0, 1.0],
    [0.0, 0.0, 1.0],
    [1.0, 1.0, 1.0]
])

# Targets as a column vector: one row per training example
ys = np.array([
    [1.0, 1.0, 0.0, 0.0]
]).T

# Build our model
hidden_size = 4
w0 = init_weights((3, hidden_size)) # Input -> layer 1
w1 = init_weights((hidden_size, 1)) # Layer 1 -> layer 2

# Train our model
debug = False
epochs = 60
lr = 0.2

for i in range(epochs):

    # Loss summed over every example seen in this epoch
    l_total = 0.0

    for j in range(len(xs)):

        # Slice (rather than index) so x and y stay 2-D: shapes (1, 3) and (1, 1)
        x = xs[j:j + 1]
        y = ys[j:j + 1]

        # Forward pass
        layer_0 = x
        layer_1 = relu(affine(layer_0, w0))
        layer_2 = affine(layer_1, w1)

        y_hat = layer_2

        if debug:
            print('-' * 20)
            print('layer_0 (shape): {}'.format(layer_0.shape))
            print('layer_1 (shape): {}'.format(layer_1.shape))
            print('layer_2 (shape): {}'.format(layer_2.shape))
            print('network output: {}'.format(layer_2))

        # The loss is a single, floating-point number
        l = np.sum((y_hat - y) ** 2.0)
        l_total += l

        # Backward pass. The deltas are error signals (target - prediction),
        # i.e. -1/2 times the derivative of the squared-error loss, which is
        # why the weights below are updated with `+=` rather than `-=`.
        layer_2_delta = y - y_hat
        layer_1_back, dl_dw1 = affine_backwards(layer_2_delta, layer_1, w1)

        # layer_1 is the ReLU *output*; it is positive exactly where the ReLU
        # input was positive, so it selects the same entries.
        layer_1_delta = relu_backwards(layer_1_back, layer_1)
        _, dl_dw0 = affine_backwards(layer_1_delta, layer_0, w0)

        # Update weights (only after both updates have been computed from the
        # pre-update weights)
        w1 += lr * dl_dw1
        w0 += lr * dl_dw0

    # Report the epoch loss every 10th epoch
    if (i % 10 == 9):
        print('total loss: {}'.format(l_total))

# Losses printed by an earlier run of this cell, kept for reference:
# total loss: 0.6342311598444467
# total loss: 0.35838407676317513
# total loss: 0.0830183113303298
# total loss: 0.006467054957103705
# total loss: 0.0003292669000750734
# total loss: 1.5055622665134859e-05

total loss: 0.6342311598444467
total loss: 0.35838407676317513
total loss: 0.0830183113303298
total loss: 0.006467054957103705
total loss: 0.0003292669000750734
total loss: 1.5055622665134859e-05


'\ntotal loss: 0.6342311598444467\ntotal loss: 0.35838407676317513\ntotal loss: 0.0830183113303298\ntotal loss: 0.006467054957103705\ntotal loss: 0.0003292669000750734\ntotal loss: 1.5055622665134859e-05\n'