In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
holding the gradient of some scalar value (such as a loss) with respect to
``x``.



In [2]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed PyTorch's global random number generator so that the random data and
# initial weights drawn below -- and therefore every printed loss -- are the
# same on each Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    # No intermediate values are kept because autograd records the graph and
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss (sum of squared errors).
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() returns the Python number held in it.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but the update itself must not
    # be tracked by autograd.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so zero the gradients after
        # updating the weights; otherwise the next iteration would add to them.
        w1.grad.zero_()
        w2.grad.zero_()

0 31789358.0
1 25885480.0
2 23663080.0
3 21537246.0
4 18126244.0
5 13664303.0
6 9327304.0
7 5948257.0
8 3715892.75
9 2363955.75
10 1575488.75
11 1111079.875
12 828229.9375
13 646115.125
14 521626.46875
15 431564.84375
16 363275.40625
17 309616.46875
18 266271.84375
19 230572.453125
20 200757.28125
21 175606.421875
22 154243.015625
23 135959.28125
24 120231.078125
25 106637.390625
26 94842.6796875
27 84572.3515625
28 75597.0703125
29 67731.953125
30 60816.01953125
31 54719.16796875
32 49327.84375
33 44545.03125
34 40294.71875
35 36510.2109375
36 33138.78125
37 30117.45703125
38 27404.576171875
39 24966.681640625
40 22771.6484375
41 20793.31640625
42 19006.869140625
43 17391.30859375
44 15927.95703125
45 14600.44921875
46 13395.732421875
47 12301.26171875
48 11304.677734375
49 10397.5791015625
50 9570.3515625
51 8815.0927734375
52 8124.875
53 7493.14404296875
54 6914.74658203125
55 6385.16552734375
56 5899.83984375
57 5455.54150390625
58 5047.20263671875
59 4672.16552734375
60 4327.23974

427 7.157091022236273e-05
428 7.034589361865073e-05
429 6.93745823809877e-05
430 6.79883305565454e-05
431 6.69727596687153e-05
432 6.576265877811238e-05
433 6.476250564446673e-05
434 6.390662747435272e-05
435 6.289723387453705e-05
436 6.175996531965211e-05
437 6.085099812480621e-05
438 6.0014590417267755e-05
439 5.8651046856539324e-05
440 5.78128092456609e-05
441 5.677770241163671e-05
442 5.600221629720181e-05
443 5.534548472496681e-05
444 5.456183134810999e-05
445 5.374953252612613e-05
446 5.2878702263114974e-05
447 5.185957343201153e-05
448 5.121323556522839e-05
449 5.03906630910933e-05
450 4.9610971473157406e-05
451 4.9265760026173666e-05
452 4.837611049879342e-05
453 4.757491115015e-05
454 4.7092558816075325e-05
455 4.6485321945510805e-05
456 4.5823820983059704e-05
457 4.501453076954931e-05
458 4.4394153519533575e-05
459 4.409451867104508e-05
460 4.3348987674107775e-05
461 4.2644081986509264e-05
462 4.22789016738534e-05
463 4.1613126086303964e-05
464 4.127011197851971e-05
465 4.068

In [28]:
# Position(s) of the largest predicted output for the first sample in the
# batch; the result has one row per matching index.
torch.nonzero(y_pred[0] == y_pred[0].max())

tensor([[5]])