Assume the network has a single hidden layer and is trained with gradient descent to fit random data by minimizing the squared Euclidean distance between the network output and the target.

In [1]:
import numpy as np

# Problem sizes: batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear maps (no bias terms).
W1 = np.random.randn(D_in, H)
W2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(100):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ W1                    # (N, H)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ W2          # (N, D_out)

    # Sum of squared errors over the whole batch.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule written out by hand, from the loss back to W1.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ W2.T
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied to the weights in place.
    W1 -= learning_rate * grad_w1
    W2 -= learning_rate * grad_w2
    

0 36654028.18116052
1 28197456.956982538
2 21242109.045911424
3 14862627.06859262
4 9696387.940155469
5 6168767.99634973
6 3996093.0025573918
7 2712226.499879354
8 1946455.300414504
9 1468762.988361637
10 1152297.2499678887
11 929597.06630546
12 764841.2601159154
13 638092.6214065547
14 537756.4153417265
15 456636.9612356018
16 390347.29433500156
17 335593.69841093617
18 289912.57062667276
19 251524.06264928984
20 219067.6696289307
21 191450.75940984854
22 167862.2932737589
23 147623.07571460304
24 130216.47740580156
25 115167.00796112863
26 102116.00817778298
27 90757.17807441804
28 80853.53070319138
29 72191.13804025375
30 64588.80390164518
31 57900.91213407631
32 52002.14056246617
33 46796.04190707661
34 42183.909421806966
35 38090.59925078027
36 34451.524199374515
37 31206.811799337796
38 28308.869315687156
39 25716.69515785575
40 23393.598653487734
41 21308.064769368808
42 19432.543454765182
43 17743.95130174278
44 16220.471900958995
45 14844.914862997764
46 13600.30254150804
47 1

In [2]:
import torch

device = torch.device('cpu')
N, D_in, H, D_out = 64, 1000, 100, 10

# Random batch of inputs and targets, moved onto the chosen device.
x = torch.randn(N, D_in).to(device)
y = torch.randn(N, D_out).to(device)

# Randomly initialised weights; none of these tensors track gradients,
# so the backward pass below is written out by hand.
w1 = torch.randn(D_in, H).to(device)
w2 = torch.randn(H, D_out).to(device)

learning_rate = 1e-6

for i in range(100):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors over the batch.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(i, loss.item())

    # Backward pass via the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Zero the gradient wherever the ReLU input was negative (fresh tensor,
    # grad_h_relu itself is left untouched).
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27176834.0
1 22766068.0
2 22151140.0
3 22036676.0
4 20487846.0
5 16815894.0
6 12086071.0
7 7769292.0
8 4718732.5
9 2862527.5
10 1821410.125
11 1244342.125
12 915261.625
13 715256.8125
14 584199.4375
15 491610.78125
16 421862.5
17 366613.9375
18 321355.5625
19 283530.78125
20 251369.46875
21 223739.109375
22 199822.421875
23 178986.296875
24 160740.3125
25 144700.796875
26 130561.3203125
27 118053.6328125
28 106941.953125
29 97050.0
30 88243.6640625
31 80361.703125
32 73288.0
33 66927.5
34 61195.32421875
35 56018.28125
36 51332.8984375
37 47087.55078125
38 43231.4765625
39 39726.3359375
40 36535.33203125
41 33627.921875
42 30976.568359375
43 28554.404296875
44 26339.52734375
45 24311.759765625
46 22453.88671875
47 20749.75390625
48 19186.29296875
49 17751.40625
50 16433.798828125
51 15221.5791015625
52 14105.7265625
53 13077.990234375
54 12130.48828125
55 11256.6923828125
56 10450.2744140625
57 9705.708984375
58 9018.15625
59 8383.12890625
60 7796.921875
61 7255.9384765625
62 6755.222

***`loss.backward()` computes the gradient of the loss with respect to every Tensor that has `requires_grad=True`***

In [3]:
# pytorch autograd

import torch

device = torch.device('cpu')
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; these are plain data, so no gradients are tracked.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# The weights are the tensors autograd should differentiate with respect to.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6

for i in range(100):
    # Forward pass; autograd records these operations.
    h = x @ w1
    h_relu = h.clamp(min=0)
    y_pred = h_relu @ w2

    loss = (y_pred - y).pow(2).sum()
    print(i, loss.item())

    # Fills w1.grad and w2.grad with d(loss)/d(weight).
    loss.backward()

    # The update itself must not be recorded in the graph, hence no_grad.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Gradients accumulate across backward() calls, so reset them by hand.
            weight.grad.zero_()

0 33453980.0
1 33688708.0
2 37670824.0
3 38762856.0
4 32185628.0
5 20498954.0
6 10425639.0
7 4907647.5
8 2518488.0
9 1536416.375
10 1094438.0
11 857644.625
12 705739.6875
13 595057.1875
14 508419.0625
15 437985.34375
16 379593.375
17 330573.75
18 289043.4375
19 253662.265625
20 223333.484375
21 197217.1875
22 174631.234375
23 155011.453125
24 137912.078125
25 123012.71875
26 109941.1328125
27 98446.140625
28 88316.546875
29 79361.453125
30 71423.984375
31 64380.64453125
32 58116.16015625
33 52528.0625
34 47535.1953125
35 43068.3203125
36 39065.4765625
37 35471.5703125
38 32240.47265625
39 29338.5
40 26726.376953125
41 24368.49609375
42 22238.58203125
43 20311.6015625
44 18567.23828125
45 16985.8046875
46 15550.9306640625
47 14248.48046875
48 13064.96875
49 11988.431640625
50 11008.490234375
51 10115.294921875
52 9300.5810546875
53 8556.375
54 7876.25244140625
55 7254.37646484375
56 6685.3623046875
57 6164.33154296875
58 5686.62939453125
59 5248.68798828125
60 4846.77783203125
61 4478.0

***The forward function computes output Tensors from input Tensors. The backward function receives the gradient of the output Tensors with respect to some scalar value and computes the gradient of the input Tensors with respect to that same scalar value.***

In [4]:
# arrange the computation into layers, some of which have learnable parameters that are optimized during training
import torch

device = torch.device('cpu')
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets for the regression problem.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define the model as a sequence of layers; Sequential
# applies them in order to produce its output. Each Linear layer holds its
# own learnable weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# reduction='sum' gives the summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
for i in range(100):
    # Forward pass: calling the module runs every layer in sequence.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(i, loss.item())

    # Clear gradients left over from the previous iteration before backward.
    model.zero_grad()
    # Backward pass: compute gradients for all learnable parameters in the model.
    loss.backward()

    # Gradient-descent step. Inside no_grad the parameters can be updated in
    # place directly; going through `.data` is unnecessary and hides the
    # in-place write from autograd's bookkeeping.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


0 692.7584838867188
1 639.4838256835938
2 593.3998413085938
3 553.3582763671875
4 518.0360717773438
5 486.5527038574219
6 458.27740478515625
7 432.5351257324219
8 408.91461181640625
9 387.0406494140625
10 366.61871337890625
11 347.3252258300781
12 329.1490478515625
13 312.07244873046875
14 295.83270263671875
15 280.4412536621094
16 265.8695373535156
17 251.9599151611328
18 238.63462829589844
19 225.9536895751953
20 213.90664672851562
21 202.43356323242188
22 191.53472900390625
23 181.18177795410156
24 171.32241821289062
25 161.94676208496094
26 153.02313232421875
27 144.55917358398438
28 136.52207946777344
29 128.8887176513672
30 121.6481704711914
31 114.76158905029297
32 108.23185729980469
33 102.06263732910156
34 96.24482727050781
35 90.75540161132812
36 85.57328033447266
37 80.68270874023438
38 76.06634521484375
39 71.7142562866211
40 67.61351013183594
41 63.74716567993164
42 60.11194610595703
43 56.69011688232422
44 53.45947265625
45 50.41518020629883
46 47.54682922363281
47 44.851

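***Optimization algorithms: the `torch.optim` package provides ready-made update rules (here Adam), so the weights no longer have to be updated by hand***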

In [8]:
import torch

# Define the device in this cell so it does not depend on a name left behind
# by an earlier cell.
device = torch.device('cpu')

N, D_in, H, D_out = 64, 1000, 100, 10
# Create the data on the same device the model is moved to.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# reduction='sum' gives the summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer updates the weights of the model; its first argument tells it
# which tensors it should update.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(100):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients from the previous iteration, then backpropagate.
    optimizer.zero_grad()
    loss.backward()
    # Apply one Adam update to every parameter handed to the optimizer.
    optimizer.step()

0 634.1229248046875
1 617.5852661132812
2 601.5380859375
3 586.152099609375
4 571.2894897460938
5 556.8876342773438
6 542.943603515625
7 529.4150390625
8 516.3544921875
9 503.6574401855469
10 491.3299255371094
11 479.3149108886719
12 467.62542724609375
13 456.3007507324219
14 445.3126220703125
15 434.6336669921875
16 424.3106384277344
17 414.3401794433594
18 404.6278381347656
19 395.12872314453125
20 385.8719482421875
21 376.87567138671875
22 368.1364440917969
23 359.66925048828125
24 351.43389892578125
25 343.4422912597656
26 335.66522216796875
27 328.06268310546875
28 320.65167236328125
29 313.38177490234375
30 306.3014831542969
31 299.384765625
32 292.6275634765625
33 285.9807434082031
34 279.45489501953125
35 273.0585632324219
36 266.7910461425781
37 260.6462097167969
38 254.59970092773438
39 248.69773864746094
40 242.93630981445312
41 237.30020141601562
42 231.76422119140625
43 226.33609008789062
44 221.00799560546875
45 215.77293395996094
46 210.63722229003906
47 205.594711303710