In [58]:
import numpy as np

In [59]:
# --- configuration -----------------------------------------------------------
N = 64        # batch size
D_in = 1000   # input dimension (# features)
H = 100       # hidden layer dimension (# nodes)
D_out = 10    # output dimension (# target values per sample)

# Fixed seed so the synthetic data, the initial weights and therefore every
# loss value printed further down are reproducible after a kernel restart.
SEED = 0
rng = np.random.default_rng(SEED)

# --- synthetic data: standard-normal inputs and targets -----------------------
x = rng.standard_normal((N, D_in))    # (64, 1000)
y = rng.standard_normal((N, D_out))   # (64, 10)

# Shapes flowing through the network:
#   x      : 64 x 1000
#   w1     : 1000 x 100
#   h      : 64 x 100    (hidden activations, x . w1 through the sigmoid)
#   w2     : 100 x 10
#   y_pred : 64 x 10     (same shape as y)

# --- initial weights: standard-normal ----------------------------------------
w1 = rng.standard_normal((D_in, H))    # (1000, 100)
w2 = rng.standard_normal((H, D_out))   # (100, 10)

In [None]:
# hidden layer: element-wise sigmoid of the first-layer product x . w1
h = 1 / (1 + np.exp(-(x @ w1)))
h.shape

In [None]:
# output layer: plain linear map of the hidden activations (no activation)
y_pred = h @ w2
y_pred.shape

In [None]:
# loss: sum of squared errors between prediction and target, over all entries
loss = np.sum(np.square(y_pred - y))
loss

In [None]:
# d(loss)/d(y_pred) for the sum-of-squares loss: twice the residual
grad_y_pred = (y_pred - y) * 2.0
grad_y_pred.shape

In [None]:
# d(loss)/d(w2): h.T (100, 64) times grad_y_pred (64, 10) -> (100, 10)
grad_w2 = h.T @ grad_y_pred
grad_w2.shape

In [None]:
# d(loss)/d(h): push the output gradient back through w2
grad_h = grad_y_pred @ w2.T
grad_h.shape

In [None]:
# d(loss)/d(w1): scale grad_h element-wise by the sigmoid derivative h * (1 - h),
# then multiply by x.T:  (1000, 64) times (64, 100) -> (1000, 100)
grad_w1 = x.T @ (grad_h * h * (1 - h))
grad_w1.shape

In [60]:
# putting it all together: full-batch gradient descent on the two-layer network
N_ITERATIONS = 5000  # number of gradient-descent steps
LOG_EVERY = 20       # print the loss once every LOG_EVERY iterations
LR = 1e-4            # learning rate; loop-invariant, so it is set once up front

# NOTE: w1 and w2 are updated in place. Re-running this cell therefore resumes
# training from the current weights; re-run the weight-initialisation cell
# first to start from scratch.
for iteration in range(N_ITERATIONS):

    # forward propagation
    h = 1 / (1 + np.exp(-x.dot(w1)))    # sigmoid activation, (64, 100)
    y_pred = h.dot(w2)                  # linear output layer, (64, 10)
    loss = np.square(y_pred - y).sum()  # sum of squared errors (scalar)
    if iteration % LOG_EVERY == 0:
        print('iteration: ', iteration, 'loss: ', loss)

    # back propagation: gradients of the loss w.r.t. each weight matrix
    grad_y_pred = 2.0 * (y_pred - y)    # (64, 10)
    grad_w2 = h.T.dot(grad_y_pred)      # (100, 64) . (64, 10) = (100, 10)
    grad_h = grad_y_pred.dot(w2.T)      # (64, 10) . (10, 100) = (64, 100)
    # h * (1 - h) is the sigmoid derivative, applied element-wise to grad_h
    # before the matrix product: (1000, 64) . (64, 100) = (1000, 100)
    grad_w1 = (x.T).dot(grad_h * h * (1 - h))

    # gradient-descent step
    w1 -= LR * grad_w1
    w2 -= LR * grad_w2
    
    

iteration:  0 loss:  32536.390312386357
iteration:  20 loss:  6387.027209186369
iteration:  40 loss:  4554.042679801955
iteration:  60 loss:  3411.718669346021
iteration:  80 loss:  2668.0802819952005
iteration:  100 loss:  2129.3333675237604
iteration:  120 loss:  1747.1135979169126
iteration:  140 loss:  1471.5557584138319
iteration:  160 loss:  1239.6513629466372
iteration:  180 loss:  1065.0929166319816
iteration:  200 loss:  928.4114369686455
iteration:  220 loss:  819.6173356591263
iteration:  240 loss:  719.6786463501097
iteration:  260 loss:  642.5128103027009
iteration:  280 loss:  577.6835443353361
iteration:  300 loss:  521.2944275877046
iteration:  320 loss:  469.6195889928462
iteration:  340 loss:  425.4385104642733
iteration:  360 loss:  388.2129083307922
iteration:  380 loss:  354.6560037932403
iteration:  400 loss:  324.8662814188174
iteration:  420 loss:  297.925091885833
iteration:  440 loss:  271.9183306664873
iteration:  460 loss:  250.26493388026915
iteration:  480