In [1]:
import numpy as np

In [4]:
# Pull the six saved arrays (W1..W3 and b1..b3) out of the .npz archive;
# the context manager closes the archive once they have been read.
with np.load('parameters.npz') as fin:
    W1, W2, W3 = (fin[key] for key in ('W1', 'W2', 'W3'))
    b1, b2, b3 = (fin[key] for key in ('b1', 'b2', 'b3'))

In [12]:
# One example as (3, 1) column vectors: x is fed to the first layer,
# y is the one-hot vector the loss compares the prediction against.
x = np.array([1, 0, 1])[:, np.newaxis]
y = np.array([0, 0, 1])[:, np.newaxis]
x, y

(array([[1],
        [0],
        [1]]),
 array([[0],
        [0],
        [1]]))

In [13]:
def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    ``x`` may be a scalar or a NumPy array; the result has the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def softmax(x):
    """Normalise ``x`` into a probability distribution over all its entries.

    Computes ``exp(x) / sum(exp(x))`` over the whole array (no axis is
    singled out), so the returned array has the shape of ``x`` and sums to 1.

    The maximum entry is subtracted before exponentiating. This leaves the
    mathematical result unchanged but keeps ``exp`` from overflowing to
    ``inf`` for large inputs, which would otherwise produce ``nan``.

    Parameters
    ----------
    x : array_like
        Unnormalised scores (logits).

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``x``. An empty input
        yields an empty array.
    """
    z = np.asarray(x)
    if z.size == 0:
        # max() is undefined on an empty array; there is nothing to normalise.
        return np.exp(z)
    # Largest exponent becomes exp(0) == 1, so the sum is always >= 1.
    y = np.exp(z - z.max())
    return y / y.sum()

In [14]:
# Number of learnable parameters: total element count over all weights and biases
sum(param.size for param in (W1, W2, W3, b1, b2, b3))

36

In [23]:
# What is the sum of the elements of output a1?
# a1 is the first layer's pre-activation: W1 applied to x, plus the bias b1.
a1 = np.matmul(W1, x) + b1
np.sum(a1)

5.44766562448759

In [24]:
# What is the sum of the elements of output h1?
# h1 is the first layer's activation: sigmoid applied element-wise to a1.
h1 = sigmoid(a1)
np.sum(h1)

2.5749524957231924

In [25]:
# Remaining forward pass
# Layer 2: affine map of h1 followed by a sigmoid.
a2 = np.matmul(W2, h1) + b2
h2 = sigmoid(a2)
# Layer 3: affine map of h2, turned into probabilities by softmax.
a3 = np.matmul(W3, h2) + b3
yhat = softmax(a3)

In [26]:
# Element sums of a2, h2 and a3, in that order, as a quick sanity check.
[layer_output.sum() for layer_output in (a2, h2, a3)]

[6.460166773282593, 2.63139309587371, 4.874920988265704]

In [35]:
# Seems correct so far - what is the loss value?
# Cross-entropy between the target y and the prediction yhat: -sum(y * log(yhat)).
# Only entries with y > 0 contribute to the sum, so the log is evaluated on
# those alone. Otherwise a probability that underflows to exactly 0 at a
# position where y == 0 would give 0 * log(0) = 0 * -inf = nan and turn the
# whole loss into nan.
L = -np.sum(y[y > 0] * np.log(yhat[y > 0]))
L

0.8563785622753883

In [37]:
# Gradient of loss wrt a3?
# Computed as the negated difference between the target y and the prediction yhat.
grad_L_a3 = -1 * (y - yhat)
grad_L_a3

array([[ 0.23691422],
       [ 0.33838847],
       [-0.57530268]])

In [39]:
# Backpropagate

In [52]:
# Gradient of the loss with respect to a3: the negated (target - prediction) difference.
grad_a3 = np.negative(y - yhat)
grad_a3

array([[ 0.23691422],
       [ 0.33838847],
       [-0.57530268]])

In [53]:
# Gradient for W3: grad_a3 (column) times the transposed layer input h2 (row).
grad_w3 = np.matmul(grad_a3, h2.T)
grad_w3

array([[ 0.21202113,  0.18529411,  0.2260992 ],
       [ 0.30283325,  0.26465862,  0.3229412 ],
       [-0.51485438, -0.44995274, -0.5490404 ]])

In [54]:
# b3 enters a3 additively (a3 = W3 @ h2 + b3), so its gradient equals grad_a3.
# Note: this binds a second name to the same array object; it is not a copy.
grad_b3 = grad_a3

In [55]:
# Push the gradient back through the third affine map: transposed W3 times grad_a3.
grad_h2 = np.matmul(W3.T, grad_a3)
grad_h2

array([[ 0.1954864 ],
       [-0.11722488],
       [-0.08814526]])

In [59]:
# Back through the sigmoid of layer 2: scale grad_h2 element-wise by
# sigmoid(a2) * (1 - sigmoid(a2)).
grad_a2 = (grad_h2 * sigmoid(a2)) * (1 - sigmoid(a2))
# The bias gradient is the very same array as grad_a2 (an alias, not a copy).
grad_b2 = grad_a2
grad_a2

array([[ 0.01838198],
       [-0.01997644],
       [-0.0038401 ]])

In [61]:
# Gradient for W2: grad_a2 (column) times the transposed layer input h1 (row).
grad_w2 = np.matmul(grad_a2, h1.T)
grad_w2

array([[ 0.01512358,  0.0161568 ,  0.01605235],
       [-0.0164354 , -0.01755824, -0.01744473],
       [-0.0031594 , -0.00337525, -0.00335343]])

In [63]:
# Push the gradient back through the second affine map: transposed W2 times grad_a2.
grad_h1 = np.matmul(W2.T, grad_a2)
grad_h1

array([[ 0.00571305],
       [ 0.01326947],
       [-0.01908499]])

In [64]:
# Back through the sigmoid of layer 1: element-wise product of grad_h1 with
# sigmoid(a1) * (1 - sigmoid(a1)).
grad_a1 = (sigmoid(a1) * (1 - sigmoid(a1))) * grad_h1
grad_a1

array([[ 0.00083319],
       [ 0.00141185],
       [-0.00211219]])

In [65]:
# b1 enters a1 additively (a1 = W1 @ x + b1), so its gradient equals grad_a1.
# Note: this binds a second name to the same array object; it is not a copy.
grad_b1 = grad_a1

In [66]:
# Gradient for W1: grad_a1 (column) times the transposed network input x (row).
grad_w1 = np.matmul(grad_a1, x.T)

In [67]:
# Display the W1 gradient; columns matching zero entries of x come out as zero.
grad_w1

array([[ 0.00083319,  0.        ,  0.00083319],
       [ 0.00141185,  0.        ,  0.00141185],
       [-0.00211219,  0.        , -0.00211219]])

In [69]:
# One gradient-descent step: subtract each gradient from its parameter as-is,
# i.e. with an implicit step size of 1. The originals W*/b* are left untouched.
w1_new, b1_new = W1 - grad_w1, b1 - grad_b1
w2_new, b2_new = W2 - grad_w2, b2 - grad_b2
w3_new, b3_new = W3 - grad_w3, b3 - grad_b3

In [70]:
# New forward pass

In [71]:
# Forward pass with the updated parameters.
# Note: this rebinds a1, h1, a2, h2, a3 and yhat, replacing the values
# computed with the original parameters.
a1 = np.matmul(w1_new, x) + b1_new
h1 = sigmoid(a1)

a2 = np.matmul(w2_new, h1) + b2_new
h2 = sigmoid(a2)

a3 = np.matmul(w3_new, h2) + b3_new
yhat = softmax(a3)

In [72]:
# Display the prediction produced by the forward pass with the updated parameters.
yhat

array([[0.03475536],
       [0.03520987],
       [0.93003478]])

In [73]:
# Loss
# Cross-entropy -sum(y * log(yhat)), summed only over entries with y > 0.
# This keeps a probability that underflows to exactly 0 at a position where
# y == 0 from producing 0 * log(0) = nan.
-np.sum(y[y > 0] * np.log(yhat[y > 0]))

0.07253330081047048