# Numpy

In [3]:
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Draw standard-normal inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Draw standard-normal starting weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported on every iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print("iter, loss = ",t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # The ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

iter, loss =  0 27205473.81979662
iter, loss =  1 22653004.776252083
iter, loss =  2 21204419.18836749
iter, loss =  3 20066741.83247085
iter, loss =  4 17984292.525800034
iter, loss =  5 14631037.372807005
iter, loss =  6 10832995.226793382
iter, loss =  7 7385129.209125947
iter, loss =  8 4841031.296303229
iter, loss =  9 3141947.174375601
iter, loss =  10 2093712.41690197
iter, loss =  11 1452373.8589529074
iter, loss =  12 1059097.9058962846
iter, loss =  13 808710.8530664165
iter, loss =  14 642445.5045689727
iter, loss =  15 526087.700821579
iter, loss =  16 440630.0172238075
iter, loss =  17 375135.1078743287
iter, loss =  18 323163.5279022085
iter, loss =  19 280807.35051786585
iter, loss =  20 245621.3644721667
iter, loss =  21 215955.04265997352
iter, loss =  22 190633.39463575024
iter, loss =  23 168846.16991092148
iter, loss =  24 149993.24159534893
iter, loss =  25 133603.69861908592
iter, loss =  26 119305.66459088799
iter, loss =  27 106806.30107432789
iter, loss =  28 9

iter, loss =  234 0.19249438604542213
iter, loss =  235 0.18265830636727787
iter, loss =  236 0.1733271709089229
iter, loss =  237 0.16447408047550707
iter, loss =  238 0.15607816034027736
iter, loss =  239 0.14811524100759277
iter, loss =  240 0.14056835119713762
iter, loss =  241 0.13340731628439986
iter, loss =  242 0.12661329794634724
iter, loss =  243 0.12016841922717506
iter, loss =  244 0.11405497174803113
iter, loss =  245 0.10826007667316946
iter, loss =  246 0.10275783429683058
iter, loss =  247 0.09753906089925676
iter, loss =  248 0.09258816305953091
iter, loss =  249 0.08789086059249526
iter, loss =  250 0.08343704599322366
iter, loss =  251 0.07920765976458344
iter, loss =  252 0.07519452517351355
iter, loss =  253 0.0713866653027812
iter, loss =  254 0.06777474196630573
iter, loss =  255 0.06434787289798968
iter, loss =  256 0.06109467447656708
iter, loss =  257 0.05800697662118337
iter, loss =  258 0.055077182631592475
iter, loss =  259 0.05229780195512123
iter, loss = 

# Tensor

In [4]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.float
device = torch.device("cpu")
# To run on a GPU, use this instead: device = torch.device("cuda:0")

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Draw standard-normal inputs and regression targets on the chosen device.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Draw standard-normal starting weights for the two layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, pulled out as a Python number and printed.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The clamp blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied to the weights in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 26984836.0
1 20240764.0
2 17782098.0
3 16653049.0
4 15414896.0
5 13540144.0
6 11013320.0
7 8344364.5
8 5950085.5
9 4106608.0
10 2797606.5
11 1927062.125
12 1359018.75
13 991569.4375
14 749834.5625
15 587180.5625
16 473952.4375
17 392366.09375
18 331324.65625
19 284109.09375
20 246441.578125
21 215663.109375
22 190089.671875
23 168447.796875
24 149893.328125
25 133849.78125
26 119875.6796875
27 107627.671875
28 96842.6640625
29 87323.1875
30 78880.96875
31 71364.765625
32 64665.33203125
33 58679.859375
34 53317.6953125
35 48503.06640625
36 44170.1484375
37 40269.44921875
38 36749.4296875
39 33570.30859375
40 30692.109375
41 28083.755859375
42 25717.205078125
43 23567.654296875
44 21614.15625
45 19835.65234375
46 18216.673828125
47 16739.953125
48 15391.3740234375
49 14159.29296875
50 13032.5146484375
51 12001.7626953125
52 11058.3173828125
53 10195.119140625
54 9403.9248046875
55 8677.68359375
56 8011.22509765625
57 7399.77490234375
58 6838.142578125
59 6321.66943359375
60 5846.097167

474 2.8921012926730327e-05
475 2.862874316633679e-05
476 2.837955253198743e-05
477 2.7943395252805203e-05
478 2.746084464888554e-05
479 2.7368663722882047e-05
480 2.711277556954883e-05
481 2.669987952685915e-05
482 2.6248939320794307e-05
483 2.5951718271244317e-05
484 2.5838022338575684e-05
485 2.565466274973005e-05
486 2.5419822122785263e-05
487 2.5125484171439894e-05
488 2.49303466262063e-05
489 2.4730794393690303e-05
490 2.453618435538374e-05
491 2.4331731765414588e-05
492 2.3998363758437335e-05
493 2.3606216927873902e-05
494 2.3409687855746597e-05
495 2.3265009076567367e-05
496 2.3007287381915376e-05
497 2.272865822305903e-05
498 2.2606498532695696e-05
499 2.2311376596917398e-05


# Autograd

In [5]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# To run on a GPU, use this instead: device = torch.device("cuda:0")

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. They are created without requires_grad=True, so
# autograd does not compute gradients with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to record the operations applied
# to them so that backward() can fill in their .grad attributes.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> clamp at zero (ReLU) -> linear layer.
    # No intermediate result is bound to a name because the backward pass
    # is left to autograd rather than written out by hand.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squared-errors loss. It is a Tensor holding a single value;
    # .item() extracts that value as a plain Python number for printing.
    loss = torch.sum((y_pred - y).pow(2))
    print(t, loss.item())

    # Backward pass via autograd: afterwards w1.grad and w2.grad hold the
    # gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Gradient-descent step. The weights have requires_grad=True, so the
    # in-place update is done under torch.no_grad() to keep it out of the
    # autograd graph. (torch.optim.SGD is the packaged way to do this.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Clear the gradient so the next backward() starts from zero
            # rather than adding to this iteration's values.
            weight.grad.zero_()

0 38859836.0
1 35128044.0
2 30804344.0
3 23155806.0
4 14634064.0
5 8220497.5
6 4578296.0
7 2755890.5
8 1857092.5
9 1376264.125
10 1086791.75
11 890957.5
12 746705.125
13 634667.3125
14 544529.0
15 470344.125
16 408783.375
17 357163.65625
18 313466.125
19 276245.34375
20 244369.203125
21 216984.15625
22 193284.5
23 172738.609375
24 154834.828125
25 139153.078125
26 125399.7734375
27 113240.8671875
28 102463.6796875
29 92862.765625
30 84303.8359375
31 76659.390625
32 69815.171875
33 63673.41796875
34 58150.1484375
35 53174.5390625
36 48682.4609375
37 44623.8046875
38 40950.171875
39 37618.65234375
40 34594.609375
41 31844.7734375
42 29340.693359375
43 27060.654296875
44 24982.2109375
45 23085.94921875
46 21349.638671875
47 19759.037109375
48 18300.470703125
49 16961.646484375
50 15731.3271484375
51 14600.1787109375
52 13559.2431640625
53 12600.0322265625
54 11715.7333984375
55 10900.0078125
56 10146.8203125
57 9450.88671875
58 8807.29296875
59 8211.875
60 7660.6123046875
61 7150.09082031

# nn

In [7]:
# -*- coding: utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Draw standard-normal inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Describe the network as an ordered stack of nn Modules. nn.Sequential
# feeds its input through the contained Modules one after another; each
# Linear Module owns its own weight and bias Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

# Squared-error loss from the nn package; reduction="sum" adds up the
# per-element errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4
for t in range(500):
    # Forward pass: a Module is callable, so calling it on the input Tensor
    # returns the output Tensor.
    y_pred = model(x)

    # The loss function takes predictions and targets and returns a Tensor
    # holding the loss; .item() extracts it for printing.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Reset the gradients of all model parameters before backpropagating.
    model.zero_grad()

    # Backward pass: fills in .grad for every learnable parameter of the
    # model.
    loss.backward()

    # Gradient-descent step over every parameter, done under no_grad so the
    # in-place update is not recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 706.1978759765625
1 656.0972900390625
2 612.7448120117188
3 574.7098388671875
4 540.8685913085938
5 510.58843994140625
6 482.8158264160156
7 457.2683410644531
8 433.51318359375
9 411.5287780761719
10 391.0774230957031
11 371.7334289550781
12 353.4732360839844
13 336.17926025390625
14 319.80718994140625
15 304.2423095703125
16 289.4091796875
17 275.1865234375
18 261.5335998535156
19 248.40316772460938
20 235.79367065429688
21 223.70677185058594
22 212.07760620117188
23 200.94078063964844
24 190.26998901367188
25 179.98043823242188
26 170.1342315673828
27 160.7449493408203
28 151.7748260498047
29 143.2320556640625
30 135.0959014892578
31 127.3434829711914
32 119.96572875976562
33 112.9560546875
34 106.30949401855469
35 100.01708221435547
36 94.0768051147461
37 88.451416015625
38 83.14179229736328
39 78.13935852050781
40 73.4178237915039
41 68.97269439697266
42 64.78125
43 60.84278869628906
44 57.143795013427734
45 53.66612243652344
46 50.40114974975586
47 47.33778381347656
48 44.467742

409 8.470789907732978e-05
410 8.236106077674776e-05
411 8.008440636331216e-05
412 7.786506466800347e-05
413 7.570991147076711e-05
414 7.361873576883227e-05
415 7.158621156122535e-05
416 6.961266626603901e-05
417 6.768916500732303e-05
418 6.582399510079995e-05
419 6.401353311957791e-05
420 6.224928074516356e-05
421 6.0539394326042384e-05
422 5.887402221560478e-05
423 5.725562368752435e-05
424 5.568453707383014e-05
425 5.415518535301089e-05
426 5.2670802688226104e-05
427 5.122611401020549e-05
428 4.9822410801425576e-05
429 4.846032243221998e-05
430 4.7134813939919695e-05
431 4.584241469274275e-05
432 4.459134288481437e-05
433 4.337179780122824e-05
434 4.218848698656075e-05
435 4.103773244423792e-05
436 3.991723497165367e-05
437 3.8832447899039835e-05
438 3.7773374060634524e-05
439 3.6745219404110685e-05
440 3.574630318325944e-05
441 3.477152858977206e-05
442 3.382748764124699e-05
443 3.2910756999626756e-05
444 3.2019102945923805e-05
445 3.114823630312458e-05
446 3.030202969966922e-05
447

# Optimizer

In [None]:
# -*- coding: utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Draw standard-normal inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network and summed squared-error loss from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
loss_fn = torch.nn.MSELoss(reduction="sum")

# Let an Optimizer from the optim package apply the weight updates. Adam is
# used here; the package offers other optimization algorithms as well. The
# params argument tells the optimizer which Tensors it is responsible for.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and report it.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients of every Tensor the optimizer manages (the model's
    # learnable weights). backward() accumulates into the .grad buffers
    # rather than overwriting them, so they must be reset each iteration;
    # see the torch.autograd.backward docs for details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model
    # parameters.
    loss.backward()

    # One optimizer step updates the parameters from their gradients.
    optimizer.step()