In [1]:
import numpy as np

np.random.seed(0)

# bits are our inputs
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# parities are our labels
Y = np.array([[0], [1], [1], [0]])

for i, bits in enumerate(X):
    print(f'bits {bits} --> parity {Y[i]}')

bits [0 0] --> parity [0]
bits [0 1] --> parity [1]
bits [1 0] --> parity [1]
bits [1 1] --> parity [0]


In [2]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def delta_sigmoid(x):
    # to derive use the +1 trick from http://cs231n.github.io/optimization-2/
    return sigmoid(x) * (1 - sigmoid(x))

def analytical_gradient(f, x):
    h = 1e-9
    return (f(x + h) - f(x)) / h

print('delta sigmoid', delta_sigmoid(Y))
print('delta sigmoid analytical', analytical_gradient(sigmoid, Y))

delta sigmoid [[0.25      ]
 [0.19661193]
 [0.19661193]
 [0.25      ]]
delta sigmoid analytical [[0.25000002]
 [0.19661195]
 [0.19661195]
 [0.25000002]]


In [3]:
# X [4,2]
input_dim = X.shape[-1]
# Y [4,1]
output_dim = Y.shape[-1]
hidden_units = 2
lr = 0.1

# [2,2]
Whidden = np.random.uniform(size=(input_dim, hidden_units)) # hidden layer

# [2,1]
Woutput = np.random.uniform(size=(hidden_units, output_dim)) # output layer

for step in range(10000):
    # forward pass
    # loss = loss(output(activation(hidden(X))))

    # hidden(X) [4,2]
    hidden = X.dot(Whidden)
    
    # activation(hidden) [4,2]
    activation = sigmoid(hidden)

    # output(activation) [4,2]x[2,1] -> [4,1]
    output = activation.dot(Woutput)

    # loss(output) [4,1]
    loss = 0.5 * (output - Y)**2
    if step % 2500 == 0:
        print('loss', np.mean(loss))
    
    # backward pass
    # loss'(output) [4,1]
    dloss_output = output - Y
    
    # loss'(activation) = loss'(output) * output'(activation)
    # [4,1]x[1,2] -> [4,2]
    dloss_activation = dloss_output.dot(Woutput.T)

    # loss'(hidden) = loss'(activation) * activation'(hidden)
    # [4,2]*[4,2] -> [4,2]
    dloss_hidden = dloss_activation * delta_sigmoid(hidden)

    # Take a small step in the opposite direction of the gradient 
    
    # loss'(Woutput) = loss'(output) * output'(Woutput)
    # [2,4]x[4,1] -> [2,1]
    dloss_woutput = activation.T.dot(dloss_output)
    Woutput -= dloss_woutput * lr

    # loss'(Whidden) = loss'(hidden) * hidden'(Whidden)
    # [2,4]x[4,2] -> [2,2]
    dloss_whidden = X.T.dot(dloss_hidden)    
    Whidden -= dloss_whidden * lr

loss 0.14451072667400197
loss 0.007930633168167129
loss 0.0031754754752917323
loss 0.0021824385490060365


In [4]:
for i, prediction in enumerate(output):
    print(f'prediction {prediction} -> label {Y[i]}')

prediction [-0.08500212] -> label [0]
prediction [0.98169372] -> label [1]
prediction [0.98169457] -> label [1]
prediction [0.07744216] -> label [0]
