In [None]:
%matplotlib inline


PyTorch: Variables and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Variables, and uses PyTorch autograd to compute gradients.

A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node
in a computational graph. If x is a Variable then x.data is a Tensor giving its
value, and x.grad is another Variable holding the gradient of some scalar value
(typically the loss) with respect to x.

PyTorch Variables have the same API as PyTorch Tensors: (almost) any operation
you can do on a Tensor you can also do on a Variable; the difference is that
operations on Variables are recorded in a computational graph, which lets
autograd compute gradients automatically.



In [3]:
import torch
from torch.autograd import Variable

# Train a two-layer fully-connected ReLU network (no biases) with plain
# gradient descent, letting autograd compute the gradients of the squared
# Euclidean loss with respect to the weights.

# Seed the global RNG so that Restart-and-Run-All reproduces the same data,
# the same initial weights and therefore the same loss trajectory.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
# NOTE: since PyTorch 0.4 Variable and Tensor are merged, so Variable(...)
# simply returns a Tensor carrying the requested requires_grad flag.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables. We do
    # not need to keep references to intermediate values since we are not
    # implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss. sum() yields a 0-dimensional (scalar)
    # tensor; indexing it with [0] raises an IndexError on current PyTorch,
    # so loss.item() is used to extract the Python float.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update the weights using gradient descent. The update itself must not be
    # recorded in the computational graph, so it runs under torch.no_grad();
    # this replaces direct manipulation of .data, which bypasses autograd's
    # bookkeeping.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad rather than overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

0 28300898.0
1 25770370.0
2 28143456.0
3 31105750.0
4 30632076.0
5 24762788.0
6 16077041.0
7 8796733.0
8 4482686.0
9 2383153.25
10 1422622.0
11 967917.9375
12 729133.8125
13 585472.5
14 487431.8125
15 414109.0625
16 355944.34375
17 308261.96875
18 268446.625
19 234831.40625
20 206198.265625
21 181669.796875
22 160542.046875
23 142278.734375
24 126483.6328125
25 112732.3671875
26 100693.2109375
27 90126.0859375
28 80829.96875
29 72632.3203125
30 65392.62890625
31 58971.00390625
32 53266.3125
33 48190.703125
34 43658.34765625
35 39605.18359375
36 35972.90625
37 32712.533203125
38 29780.93359375
39 27142.044921875
40 24764.744140625
41 22617.486328125
42 20675.08984375
43 18915.64453125
44 17320.01171875
45 15870.6611328125
46 14553.7353515625
47 13355.9296875
48 12264.4560546875
49 11269.8095703125
50 10362.1865234375
51 9533.3603515625
52 8776.220703125
53 8083.93701171875
54 7450.2119140625
55 6869.39892578125
56 6336.82666015625
57 5848.26318359375
58 5399.75146484375
59 4987.85253906

468 3.516904325806536e-05
469 3.4604967368068174e-05
470 3.416799154365435e-05
471 3.375205415068194e-05
472 3.3593449188629165e-05
473 3.3056399843189865e-05
474 3.263966937083751e-05
475 3.218786514480598e-05
476 3.17007397825364e-05
477 3.150004704366438e-05
478 3.114740684395656e-05
479 3.0876130040269345e-05
480 3.0499420972773805e-05
481 3.0021765269339085e-05
482 2.9463257305906154e-05
483 2.9329552489798516e-05
484 2.9000319045735523e-05
485 2.87780803773785e-05
486 2.8385120458551683e-05
487 2.812589445966296e-05
488 2.7845639124279842e-05
489 2.756470894382801e-05
490 2.7272928491584025e-05
491 2.6993606297764927e-05
492 2.6812818759935908e-05
493 2.6541898478171788e-05
494 2.626191962917801e-05
495 2.589997893664986e-05
496 2.5611620003473945e-05
497 2.547588519519195e-05
498 2.510590638848953e-05
499 2.48338019446237e-05
