In [85]:
import ssl
# SECURITY NOTE: this replaces the factory that `ssl` uses for default HTTPS
# contexts with the "unverified" one, so HTTPS connections opened through the
# default context in this process no longer validate server certificates.
# NOTE(review): presumably a workaround for certificate errors when the
# pretrained weights are downloaded later in the notebook - confirm, and prefer
# fixing the local certificate store over shipping this in shared code.
ssl._create_default_https_context = ssl._create_unverified_context

In [86]:
import torch

## Forward and Backward pass without autograd (manually)

Example derived from: https://github.com/jcjohnson/pytorch-examples#pytorch-autograd

In [87]:
# Device handle passed to the tensor constructors in the cells below; this
# section runs on the CPU.
device = torch.device('cpu')

In [88]:
# Sizes of the two-layer network used in this section.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [89]:
# Inputs and regression targets drawn from a standard normal distribution.
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Weights of the two linear layers, also drawn from a standard normal.
# No requires_grad here: this section computes the gradients by hand.
w1 = torch.randn((D_in, H), device=device)
w2 = torch.randn((H, D_out), device=device)

In [90]:
learning_rate = 1e-6

for epoch in range(300):
    # --- forward ---------------------------------------------------------
    # Linear layer -> ReLU (clamp at zero) -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, reduced to a single scalar.
    loss = torch.sum((y_pred - y) ** 2)
    print(epoch, loss.item())

    # --- backward (by hand) ---------------------------------------------
    # Gradients of the loss with respect to w2 and w1, obtained by applying
    # the chain rule layer by layer, from the output back to the input.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)

    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes gradient only where its input was not negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # --- gradient-descent step (in place) -------------------------------
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)


0 43415424.0
1 44696848.0
2 43684180.0
3 33445164.0
4 18966750.0
5 8710072.0
6 4064412.25
7 2290609.0
8 1566268.5
9 1199763.25
10 968629.5
11 801704.6875
12 672338.125
13 569214.8125
14 485309.5
15 416119.875
16 358731.25
17 310751.9375
18 270388.8125
19 236253.890625
20 207237.0625
21 182534.671875
22 161286.453125
23 142934.734375
24 127015.734375
25 113161.7578125
26 101051.0078125
27 90427.078125
28 81089.28125
29 72849.84375
30 65573.0859375
31 59129.8125
32 53400.8828125
33 48296.6015625
34 43737.6875
35 39663.74609375
36 36012.859375
37 32736.626953125
38 29790.51171875
39 27138.73828125
40 24750.560546875
41 22593.486328125
42 20643.345703125
43 18878.994140625
44 17278.8984375
45 15827.1474609375
46 14508.4892578125
47 13309.392578125
48 12217.7314453125
49 11222.955078125
50 10315.6689453125
51 9487.958984375
52 8732.1689453125
53 8042.54833984375
54 7411.3671875
55 6833.11279296875
56 6303.33154296875
57 5817.6904296875
58 5372.22509765625
59 4963.30517578125
60 4587.5327148

## With Autograd + vanilla gradient descent

In [91]:
device = torch.device('cpu')

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; the two weight matrices are flagged with
# requires_grad=True so autograd tracks every operation that touches them.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

lr = 1e-6
for epoch in range(300):
    # Forward pass written as a single expression: the intermediate
    # activations are not kept because autograd records the graph needed
    # for the backward pass.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squares loss (a scalar tensor).
    loss = torch.sum((y_pred - y) ** 2)
    print(epoch, loss.item())

    # Autograd fills w1.grad and w2.grad with the gradient of the loss with
    # respect to each weight matrix.
    loss.backward()

    # The update itself must not be recorded in the graph, hence no_grad().
    with torch.no_grad():
        for weight in (w1, w2):
            # In-place gradient-descent step.
            weight.sub_(lr * weight.grad)
            # .grad accumulates across backward() calls, so reset it by hand.
            # https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch
            weight.grad.zero_()


0 25942616.0
1 20826046.0
2 21238560.0
3 24298532.0
4 27461800.0
5 27520630.0
6 23098542.0
7 15689217.0
8 9032992.0
9 4725458.5
10 2497366.5
11 1430850.25
12 924970.3125
13 668213.125
14 523413.75
15 430951.75
16 365188.375
17 314705.9375
18 273972.0625
19 240077.625
20 211379.9375
21 186838.140625
22 165708.109375
23 147398.984375
24 131448.203125
25 117494.234375
26 105251.5625
27 94476.8203125
28 84960.921875
29 76537.9375
30 69059.5078125
31 62410.171875
32 56484.2734375
33 51188.12890625
34 46455.6015625
35 42215.66796875
36 38405.8359375
37 34979.69921875
38 31894.5703125
39 29110.30078125
40 26594.232421875
41 24316.7734375
42 22253.53125
43 20383.240234375
44 18684.919921875
45 17141.2890625
46 15736.83984375
47 14458.1484375
48 13292.1416015625
49 12228.306640625
50 11257.6259765625
51 10371.16796875
52 9560.66796875
53 8818.88671875
54 8139.2236328125
55 7515.86865234375
56 6944.18505859375
57 6419.4326171875
58 5937.2744140625
59 5494.17578125
60 5086.65283203125
61 4711.687

## With Autograd + Optimizer

In [92]:
device = torch.device('cpu')

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random data plus two trainable weight matrices.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

lr = 1e-6

# The optimizer owns the update rule: it is handed the tensors to update and
# the step size, replacing the hand-written `w -= lr * w.grad` update.
optimizer = torch.optim.SGD(params=[w1, w2], lr=lr)

for epoch in range(300):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear. Autograd
    # records the graph because w1 and w2 require gradients.
    y_pred = (x @ w1).clamp(min=0) @ w2

    # Sum-of-squares loss (a scalar tensor).
    loss = ((y_pred - y) ** 2).sum()
    print(epoch, loss.item())

    # Populate w1.grad and w2.grad via autograd.
    loss.backward()

    # Apply the SGD update to both weights, then clear the accumulated
    # gradients so the next backward() starts from scratch.
    optimizer.step()
    optimizer.zero_grad()

0 35422672.0
1 31244452.0
2 28708928.0
3 24129812.0
4 17419576.0
5 10978274.0
6 6379350.0
7 3729778.0
8 2318253.5
9 1574131.125
10 1156691.0
11 901313.375
12 729261.875
13 603939.6875
14 507661.5625
15 430981.75
16 368564.375
17 317045.3125
18 274153.0625
19 238073.703125
20 207548.453125
21 181526.75
22 159270.125
23 140161.234375
24 123703.1484375
25 109522.09375
26 97207.453125
27 86468.0
28 77089.53125
29 68868.734375
30 61637.90234375
31 55276.75390625
32 49662.20703125
33 44685.2890625
34 40267.234375
35 36335.375
36 32832.31640625
37 29705.0625
38 26908.01171875
39 24408.0390625
40 22169.15234375
41 20155.203125
42 18342.9375
43 16709.94140625
44 15236.373046875
45 13905.875
46 12702.7001953125
47 11613.4140625
48 10625.9501953125
49 9729.59765625
50 8915.24609375
51 8174.51171875
52 7500.447265625
53 6886.6396484375
54 6327.103515625
55 5816.962890625
56 5350.97998046875
57 4925.408203125
58 4536.0712890625
59 4179.69970703125
60 3853.388671875
61 3554.298095703125
62 3280.0561

## With Autograd + Optimizer + nn.MSELoss

In [95]:
device = torch.device('cpu')

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random data plus two trainable weight matrices.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

lr = 1e-6
# Library loss in place of the hand-written (y_pred - y).pow(2).sum();
# reduction='sum' adds up the squared errors instead of averaging them.
mse_loss = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=[w1, w2], lr=lr)

for epoch in range(300):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear. Autograd
    # records the graph because w1 and w2 require gradients.
    y_pred = torch.clamp(x @ w1, min=0) @ w2

    # Summed squared error between prediction and target (a scalar tensor).
    loss = mse_loss(y_pred, y)
    print(epoch, loss.item())

    # Populate w1.grad and w2.grad via autograd.
    loss.backward()

    # SGD update followed by clearing the accumulated gradients.
    optimizer.step()
    optimizer.zero_grad()

0 31530432.0
1 27427456.0
2 25455544.0
3 22159162.0
4 17361252.0
5 11939529.0
6 7576283.0
7 4628189.0
8 2899822.5
9 1918916.75
10 1362040.0
11 1028755.5625
12 816119.0625
13 669863.8125
14 562692.875
15 480055.0
16 414132.09375
17 360321.8125
18 315366.1875
19 277332.28125
20 244851.703125
21 216940.359375
22 192817.59375
23 171837.0625
24 153509.84375
25 137438.78125
26 123300.484375
27 110826.3359375
28 99813.8515625
29 90043.796875
30 81356.953125
31 73628.203125
32 66745.9765625
33 60588.0078125
34 55069.015625
35 50133.65625
36 45697.0859375
37 41701.05078125
38 38097.625
39 34840.1875
40 31892.080078125
41 29220.615234375
42 26797.38671875
43 24594.669921875
44 22592.22265625
45 20769.8046875
46 19111.189453125
47 17597.154296875
48 16214.876953125
49 14952.4462890625
50 13796.84375
51 12738.6064453125
52 11768.7392578125
53 10879.9345703125
54 10063.953125
55 9314.177734375
56 8625.0712890625
57 7990.9541015625
58 7406.95458984375
59 6868.9033203125
60 6373.40625
61 5918.3569335

## Using pre-trained models

In [107]:
import torchvision

In [110]:
device = torch.device('cpu')

# Load ResNet-18 with pretrained weights and place it on the selected device so
# that changing `device` above is enough to move the whole cell.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of a `weights=` argument - confirm the installed version before switching.
model = torchvision.models.resnet18(pretrained=True).to(device)

# One random 3x64x64 input and a random 1000-element target, created on the
# same device as the model. These are placeholders, not real images or labels.
data = torch.rand(1, 3, 64, 64, device=device)
labels = torch.rand(1, 1000, device=device)

# Mean squared error between the model output and the random target.
mse_loss = torch.nn.MSELoss(reduction='mean')
# SGD with momentum over every parameter of the network.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

for epoch in range(20):
    # Forward pass through the full network.
    y_pred = model(data)

    # Scalar loss for this step.
    loss = mse_loss(y_pred, labels)
    print(epoch, loss.item())

    # Autograd computes the gradient of the loss with respect to every model
    # parameter that requires grad and stores it in that parameter's .grad.
    loss.backward()

    # Apply the update, then clear the gradients so they do not accumulate
    # into the next iteration.
    optimizer.step()
    optimizer.zero_grad()

0 1.0171806812286377
1 1.090268611907959
2 0.9514904022216797
3 0.8523459434509277
4 0.7531971335411072
5 0.6300870180130005
6 0.5352777242660522
7 0.4524187743663788
8 0.37616461515426636
9 0.3109992444515228
10 0.25395897030830383
11 0.20431816577911377
12 0.16136804223060608
13 0.12471511960029602
14 0.09422096610069275
15 0.06942766159772873
16 0.04972061514854431
17 0.03440779820084572
18 0.02287709154188633
19 0.014634357765316963
