In [1]:
import math
import numpy as np

def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    a = sigmoid(z)
    return a * (1-a)

def relu_derivative(x):
    return x > 0
relu_derivative_v = np.vectorize(relu_derivative)

def relu(x):
    return np.maximum(0, x)

print(relu)

<function relu at 0x107eeeb90>


In [24]:
num_input_features = 3
num_hidden_features = 4
num_output_features = 1
num_epochs = 100
learning_rate = 0.3

X = np.array([[0, 0, 1], [1, 1, 0], [1, 0, 0], [0, 1, 1]]).T # (num_input_features, num_examples)
y = np.array([[0, 1, 1, 0]]) # (num_output_features, num_examples)

W1 = np.random.normal(0, 0.5, (num_hidden_features, num_input_features))
b1 = np.random.normal(0, 0.5, (num_hidden_features, 1))

W2 = np.random.normal(0, 0.5, (num_output_features, num_hidden_features))
b2 = np.random.normal(0, 0.5, (num_output_features, 1))

for _ in range(100):
    
    # Forward pass.
    z1 = np.dot(W1, X) + b1 # (num_hidden_features, num_examples)
    a1 = relu(z1) # (num_hidden_features, num_examples)
    z2 = np.dot(W2, a1) + b2 # (num_output_features, num_examples)
    a2 = sigmoid(z2) # (num_output_features, num_examples)

    mse_loss = ((y - a2)**2).mean()
    loss = mse_loss
    print(loss)
    
    # Back prop.
    # da is shape (num_output_features, num_examples)
    # da = dL/da = d/da (y^2 + a^2 - 2ay) = d/da(2a - 2y)
    da2 = 2 * a2 - 2 * y
    assert da2.shape == a2.shape
    
    # dz2 is shape (num_output_features, num_examples)
    # dz2 = dL/dz2 = dL/da2 * da2/dz2 = da2 * sigmoid_derivative(z2)
    # da2 is shape (num_output_features, num_examples)
    # sigmoid_derivative(z2) is shape (num_output_features, num_examples)
    dz2 = da2 * sigmoid_derivative(z2)
    assert dz2.shape == z2.shape

    # dW2 is shape (num_output_features, num_hidden_features)
    # dW2 = dL/dW2 = dL/dz2 * dz2/dW2 = dz2 * X
    # dz2 is shape (num_output_features, num_examples)
    # a1 is shape (num_hidden_features, num_examples)
    dW2 = np.dot(dz2, a1.T)
    assert dW2.shape == W2.shape
    
    # db2 is shape (num_output_features, 1)
    # db2 = dL/db2 = dL/dz2 * dz2/db = dz2 * 1
    # dz2 is shape (num_output_features, num_examples)
    db2 = np.mean(dz2, axis=1)
    db2 = np.expand_dims(db2, axis=1)
    assert db2.shape == b2.shape
    
    # da1 is shape (num_hidden_features, num_examples)
    # da1 = dL/da1 = dL/dz2 * dz2/da1 = dz2 * W2
    # dz2 is shape (num_output_features, num_examples)
    # W2 is shape (num_output_features, num_hidden_features)
    da1 = np.dot(W2.T, dz2)
    assert da1.shape == a1.shape

    # dz1 is shape (num_hidden_features, num_examples)
    # dz1 = dL/dz1 = dL/da1 * da1/dz1 = da1 * sigmoid_derivative(z1)
    # da1 is shape (num_hidden_features, num_examples)
    # sigmoid_derivative(z1) is shape (num_hidden_features, num_examples)
    dz1 = da1 * relu_derivative(z1)
    assert dz1.shape == z1.shape
    
    # dW1 is shape (num_hidden_features, num_input_features)
    # dW1 = dL/dW1 = dL/dz1 * dz1/dW1 = dz1 * X
    # dz1 is shape (num_hidden_features, num_examples)
    # X is shape (num_input_features, num_examples)
    dW1 = np.dot(dz1, X.T)
    assert dW1.shape == W1.shape
    
    # db1 is shape (num_hidden_features, 1)
    # db1 = dL/db1 = dL/dz1 * dz1/db1 = dz1
    # dz1 is shape (num_hidden_features, num_examples)
    db1 = np.mean(dz1, axis=1)
    db1 = np.expand_dims(db1, axis=1)
    assert db1.shape == b1.shape
    
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    

0.42045316156777185
0.3975614944764524
0.3721317087241182
0.34575261853965555
0.32012095135419233
0.2958005191249794
0.2717384205350081
0.24714853279701188
0.22481492406497705
0.19777322095507452
0.1670663912520226
0.13724168180373447
0.10789065018422446
0.08376165045039251
0.06357771295443604
0.04901829989426931
0.03815986224918186
0.03029526553978549
0.02466180182839926
0.02040608084629222
0.017177955315968538
0.01470529005527146
0.012771583668769623
0.011201716922652559
0.009928065406170415
0.008888505031119938
0.008018116389417405
0.007277590777485358
0.006646765862323312
0.006111542323380613
0.00564059077692012
0.005228445768597704
0.004867599006391757
0.004549923185457997
0.004263622955599746
0.004007490369137429
0.0037794148786679917
0.0035722530582119674
0.003383323735942252
0.003211554795112614
0.0030566498853143603
0.002912473208108843
0.002779962209643137
0.002658574586927479
0.002546528708798082
0.0024417572005713674
0.0023444508785802976
0.0022549763674549798
0.00217054228