# PyTorch with examples
Based on [this tutorial](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html).

In [3]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem dimensions.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Synthetic inputs and regression targets drawn from a standard normal.
x = np.random.standard_normal((N, D_in))
y = np.random.standard_normal((N, D_out))

# Weight matrices of the two layers, also standard-normal initialised.
w1 = np.random.standard_normal((D_in, H))
w2 = np.random.standard_normal((H, D_out))

learning_rate = 1e-6
for t in range(100):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every tenth iteration.
    loss = np.square(y_pred - y).sum()
    if not t % 10:
        print(t, loss)

    # Backward pass: the chain rule written out by hand, from the loss
    # back to each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)            # d(loss)/d(y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)     # d(loss)/d(h_relu)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)     # d(loss)/d(w2)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)               # d(loss)/d(w1)

    # Plain gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31013131.587486777
10 2222117.631989957
20 240552.15919300047
30 76235.60945676007
40 30077.89536266186
50 13449.196059237143
60 6514.647629573584
70 3331.994592094785
80 1765.236841445614
90 958.5991503771224


In [5]:
# -*- coding: utf-8 -*-
import torch

# Every tensor in this cell is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Random initial weights for the two layers.
w1 = torch.randn((D_in, H), device=device, dtype=dtype)
w2 = torch.randn((H, D_out), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a Python float, reported every tenth iteration.
    loss = torch.sum((y_pred - y) ** 2).item()
    if not t % 10:
        print(t, loss)

    # Backward pass written out by hand (no autograd in this cell).
    grad_y_pred = 2.0 * (y_pred - y)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 38760816.0
10 1050525.25
20 216995.328125
30 71692.515625
40 28023.298828125
50 11993.0703125
60 5440.0576171875
70 2573.849365234375
80 1258.4881591796875
90 632.1093139648438
100 324.8955078125
110 170.43275451660156
120 91.0814208984375
130 49.51249694824219
140 27.342105865478516
150 15.321521759033203
160 8.702713012695312
170 5.004947662353516
180 2.911710739135742
190 1.7116506099700928
200 1.0156599283218384
210 0.607783317565918
220 0.36662599444389343
230 0.22264395654201508
240 0.13602808117866516
250 0.08357734978199005
260 0.051621630787849426
270 0.03204760700464249
280 0.019998831674456596
290 0.012573858723044395
300 0.00796563271433115
310 0.005115479230880737
320 0.0033400729298591614
330 0.0022204197011888027
340 0.0015091252280399203
350 0.0010525984689593315
360 0.0007504013483412564
370 0.0005483362474478781
380 0.0004103693354409188
390 0.0003143498324789107
400 0.00024527005734853446
410 0.00019577136845327914
420 0.0001584457786520943
430 0.000129814943647943

In [6]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False): no
# gradients with respect to the data are needed during the backward pass.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass as a single expression: the backward pass is not written
    # by hand here, so the intermediate activations need no names.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # The loss is a 0-dimensional Tensor; loss.item() extracts the Python
    # scalar it holds.
    loss = torch.sum((y_pred - y) ** 2)
    if not t % 10:
        print(t, loss.item())

    # Autograd computes the gradient of loss with respect to every Tensor
    # that has requires_grad=True; afterwards w1.grad and w2.grad hold the
    # gradients of the loss with respect to w1 and w2.
    loss.backward()

    # The weight update itself must not be tracked by autograd, hence the
    # torch.no_grad() block. (torch.optim.SGD would perform the same update.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        # Gradients accumulate across backward() calls, so each one is reset
        # once it has been consumed.
        w1.grad.zero_()

        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

0 33819040.0
10 991609.5625
20 206469.78125
30 69621.9609375
40 28826.1953125
50 13523.537109375
60 6927.1484375
70 3777.9345703125
80 2157.403076171875
90 1275.5419921875
100 774.439453125
110 479.7152099609375
120 301.76995849609375
130 192.04493713378906
140 123.33074188232422
150 79.80134582519531
160 51.9127311706543
170 33.91488265991211
180 22.237552642822266
190 14.62437629699707
200 9.639455795288086
210 6.365767002105713
220 4.211150169372559
230 2.789616346359253
240 1.8501750230789185
250 1.2283086776733398
260 0.8162503838539124
270 0.5428179502487183
280 0.36122846603393555
290 0.24066799879074097
300 0.16036289930343628
310 0.1069307029247284
320 0.07132479548454285
330 0.04767594113945961
340 0.03188961371779442
350 0.02136874757707119
360 0.014364227652549744
370 0.00970419030636549
380 0.006600853055715561
390 0.004534025210887194
400 0.003146818606182933
410 0.0022102417424321175
420 0.0015824874863028526
430 0.0011530796764418483
440 0.000855477002914995
450 0.00064

In [7]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    A hand-written ReLU for autograd.

    Subclassing torch.autograd.Function and providing static forward and
    backward methods that operate on Tensors defines a custom differentiable
    operation. It is invoked through MyReLU.apply.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return a Tensor equal to input with every negative entry set to zero.

        ctx is the context object shared with backward. The input Tensor is
        stored on it via ctx.save_for_backward so that backward can see which
        entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (grad_output) into the gradient of the loss with respect to the input.

        The gradient passes through unchanged where the saved input was
        non-negative and is zero where it was negative.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (no gradients needed with respect to these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights; requires_grad=True makes autograd track them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom autograd Function is applied through its .apply method. The alias
# does not change between iterations, so it is bound once here rather than
# on every pass through the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum-of-squares loss, reported every tenth iteration.
    loss = (y_pred - y).pow(2).sum()
    if t % 10 == 0:
        print(t, loss.item())

    # Autograd computes w1.grad and w2.grad, calling MyReLU.backward for the
    # ReLU step.
    loss.backward()

    # Gradient-descent step. It runs under no_grad so that the in-place
    # update of the weights is not itself recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so reset them once
        # they have been consumed.
        w1.grad.zero_()
        w2.grad.zero_()

0 25784296.0
10 1976697.75
20 194479.90625
30 64596.33984375
40 25636.34765625
50 11353.599609375
60 5416.7080078125
70 2725.79150390625
80 1425.1949462890625
90 767.0255737304688
100 421.9708251953125
110 236.24063110351562
120 134.16421508789062
130 77.13327026367188
140 44.82170867919922
150 26.29680824279785
160 15.562889099121094
170 9.285591125488281
180 5.581443786621094
190 3.378462553024292
200 2.0582408905029297
210 1.2615464925765991
220 0.7776156663894653
230 0.48184019327163696
240 0.3000325560569763
250 0.18759432435035706
260 0.11781259626150131
270 0.07426729053258896
280 0.04702411964535713
290 0.029893962666392326
300 0.01908000186085701
310 0.012258913367986679
320 0.007936155423521996
330 0.005201319698244333
340 0.003450126852840185
350 0.0023337858729064465
360 0.0016128456918522716
370 0.001140199019573629
380 0.0008274352294392884
390 0.0006120070465840399
400 0.00046470994129776955
410 0.000358453078661114
420 0.00028207263676449656
430 0.00022523671214003116
4

### Same example with TensorFlow using static graphs

In [11]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# NOTE(review): tf.reset_default_graph, tf.placeholder, tf.random_normal,
# tf.global_variables_initializer and tf.Session are TensorFlow 1.x graph-mode
# APIs; under TensorFlow 2.x they are expected to live in tf.compat.v1 --
# confirm against the installed version.
# Presumably the reset is here so that re-running this cell starts from an
# empty default graph instead of adding to the previous one -- confirm.
tf.reset_default_graph()

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None,
# i.e. the batch size is not fixed in the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU expressed as an element-wise maximum against a one-element zero tensor
# (relies on broadcasting).
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors: the sum of squared
# differences between targets and predictions.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y.
    # NOTE(review): np.random.randn yields float64 arrays while the
    # placeholders are declared float32; the feed presumably converts them --
    # confirm.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for t in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays. Only the loss value is kept; the two updated-weight results
        # are discarded, since fetching them is what triggers the assignments.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        # Report the loss every tenth iteration.
        if t % 10 == 0:
            print(t, loss_value)

0 29138028.0
10 107727976.0
20 207902.64
30 76284.195
40 35134.402
50 18569.664
60 10750.391
70 6630.093
80 4278.241
90 2852.1404
100 1945.8875
110 1351.0537
120 951.6257
130 677.91
140 487.46964
150 353.3787
160 258.07233
170 189.64412
180 140.11513
190 104.0233
200 77.61851
210 58.136696
220 43.689873
230 32.93211
240 24.890345
250 18.85878
260 14.320541
270 10.895882
280 8.306184
290 6.3423367
300 4.8503222
310 3.714665
320 2.8485084
330 2.1872034
340 1.6810789
350 1.2934544
360 0.99617445
370 0.7679531
380 0.5925187
390 0.45747444
400 0.35347316
410 0.27330598
420 0.21158594
430 0.16381148
440 0.12695816
450 0.09845124
460 0.076423995
470 0.05933692
480 0.046135705
490 0.035926536


In [12]:
# -*- coding: utf-8 -*-
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# The model is a Sequential container: it holds other Modules and applies
# them in order. Each Linear Module computes a linear function of its input
# and owns its weight and bias Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

# Mean Squared Error loss from the nn package; with reduction='sum' the
# squared errors are summed rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules are callable: passing a Tensor of inputs returns
    # a Tensor of outputs.
    y_pred = model(x)

    # The loss function takes predictions and targets and returns a Tensor
    # holding the loss; it is reported every tenth iteration.
    loss = loss_fn(y_pred, y)
    if not t % 10:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration, then let
    # autograd fill in the gradient of the loss for every learnable parameter
    # of the model.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step on each parameter Tensor, outside of autograd
    # tracking.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 593.7945556640625
10 321.715576171875
20 190.1618194580078
30 108.57192993164062
40 59.454307556152344
50 32.058807373046875
60 17.554323196411133
70 9.887866020202637
80 5.743444919586182
90 3.4359397888183594
100 2.1027116775512695
110 1.3110263347625732
120 0.8304890990257263
130 0.5328897833824158
140 0.34608107805252075
150 0.2269282341003418
160 0.15007688105106354
170 0.09998749941587448
180 0.06703263521194458
190 0.04521980881690979
200 0.03070395067334175
210 0.020985830575227737
220 0.014427931979298592
230 0.009972568601369858
240 0.006929589435458183
250 0.004838995635509491
260 0.0033961685840040445
270 0.002395347924903035
280 0.001697704428806901
290 0.00120886682998389
300 0.000864645466208458
310 0.0006213027518242598
320 0.0004484256205614656
330 0.00032513096812181175
340 0.00023675545526202768
350 0.00017315240984316915
360 0.000127187289763242
370 9.381398558616638e-05
380 6.949006638024002e-05
390 5.168916322872974e-05
400 3.8607784517807886e-05
410 2.894985300

In [13]:
# -*- coding: utf-8 -*-
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Model and loss function, both taken from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# An Optimizer from the optim package updates the model's weights for us.
# Adam is used here; the package provides many other optimization
# algorithms. The first constructor argument tells the optimizer which
# Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Gradients are accumulated (i.e. not overwritten) whenever .backward()
    # is called, so those of the previous iteration are cleared before a new
    # backward pass. See the torch.autograd.backward docs for details.
    optimizer.zero_grad()

    # Forward pass and loss; the loss is reported every tenth iteration.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if not t % 10:
        print(t, loss.item())

    # Backward pass: gradient of the loss with respect to the model
    # parameters.
    loss.backward()

    # One optimizer step updates the parameters from those gradients.
    optimizer.step()

0 640.841552734375
10 497.96685791015625
20 391.2698669433594
30 310.4077453613281
40 245.7281951904297
50 192.78416442871094
60 149.2481231689453
70 113.42206573486328
80 84.58444213867188
90 61.79524612426758
100 44.29963302612305
110 31.216581344604492
120 21.581859588623047
130 14.609041213989258
140 9.688215255737305
150 6.279635429382324
160 3.9819421768188477
170 2.475731372833252
180 1.5093015432357788
190 0.90308678150177
200 0.5307276844978333
210 0.30638831853866577
220 0.17372949421405792
230 0.09687654674053192
240 0.053246594965457916
250 0.02890770137310028
260 0.015480522066354752
270 0.008180275559425354
280 0.004273911472409964
290 0.002207574201747775
300 0.0011260970495641232
310 0.0005682561895810068
320 0.0002837846986949444
330 0.00014060482499189675
340 6.914435653015971e-05
350 3.373126673977822e-05
360 1.6312555089825764e-05
370 7.813540833012667e-06
380 3.7047759633423993e-06
390 1.7383857766617439e-06
400 8.064277494668204e-07
410 3.6999736607867817e-07
420 

In [14]:
# -*- coding: utf-8 -*-
import torch


class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two linear layers: D_in -> H (linear1) and
        H -> D_out (linear2).
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        The Modules created in the constructor are combined here with an
        ordinary Tensor operation (the clamp that implements ReLU).
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Instantiate the model class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and optimizer. model.parameters() hands the SGD constructor
# the learnable parameters of the two nn.Linear modules that are members of
# the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(100):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass and loss; the loss is reported every tenth iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if not t % 10:
        print(t, loss.item())

    # Backward pass, then one optimizer step to update the weights.
    loss.backward()
    optimizer.step()

0 658.9791259765625
10 363.08489990234375
20 213.97409057617188
30 121.97673797607422
40 67.34693145751953
50 37.010780334472656
60 20.798660278320312
70 12.032783508300781
80 7.19368314743042
90 4.436589241027832


### An example of dynamic graphs and weight sharing
At each forward pass, the depth of the network is chosen at random: the shared middle layer is applied 0 to 3 times, reusing the same weights each time.

In [15]:
# -*- coding: utf-8 -*-
import random
import torch


class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is re-drawn on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear instances used by forward:
        D_in -> H (input), H -> H (middle) and H -> D_out (output).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Compute the output for x, applying middle_linear (followed by ReLU)
        a random number of times: 0, 1, 2 or 3.

        The computation graph is built afresh on each forward pass, so plain
        Python control flow such as loops and conditionals can shape it.

        The same middle_linear Module is reused for every repetition, which
        means all repetitions share one set of weights.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)

        # Draw how many times the shared middle layer is applied this pass.
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1

        return self.output_linear(hidden)


# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Instantiate the model class defined above.
model = DynamicNet(D_in, H, D_out)

# Loss function and optimizer. Vanilla stochastic gradient descent has a hard
# time training this unusual model, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass and loss; the loss is reported every tenth iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if not t % 10:
        print(t, loss.item())

    # Backward pass, then one optimizer step to update the weights.
    loss.backward()
    optimizer.step()

0 645.0087890625
10 527.2771606445312
20 516.1095581054688
30 72.93712615966797
40 94.55448913574219
50 177.55770874023438
60 73.06647491455078
70 61.769927978515625
80 40.94328308105469
90 34.81297302246094
100 12.391901016235352
110 13.597167015075684
120 19.191179275512695
130 6.949825763702393
140 50.0013313293457
150 20.72280502319336
160 13.736315727233887
170 2.8050537109375
180 2.631934404373169
190 8.959303855895996
200 15.282915115356445
210 24.196706771850586
220 18.369306564331055
230 3.6045737266540527
240 1.6809461116790771
250 3.4134421348571777
260 0.4704574644565582
270 2.6431336402893066
280 0.3593122959136963
290 1.6383602619171143
300 0.7336400747299194
310 1.0631821155548096
320 1.2466310262680054
330 1.0346189737319946
340 0.14329493045806885
350 0.14934679865837097
360 0.1274939477443695
370 0.26628485321998596
380 0.05094677954912186
390 0.656501829624176
400 0.4536900222301483
410 0.07217419892549515
420 0.42325571179389954
430 0.36178845167160034
440 0.0922489