# PyTorch with Examples

This is an introduction to PyTorch that works through the examples from the official tutorial:

 - [1] http://pytorch.org/tutorials/beginner/pytorch_with_examples.html

### PyTorch: Tensors


In [6]:
import torch

# Use the GPU tensor type when CUDA is available; otherwise fall back to the
# CPU tensor type so the example also runs on machines without a GPU.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor
else:
    dtype = torch.FloatTensor

# batch size (N), input dim (D_in), hidden dim (H), output dim (D_out)
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weight matrices
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    # Compute and print loss (sum of squared errors)
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        # .item() converts the 0-dim loss tensor to a plain Python number
        # (requires PyTorch >= 0.4).
        print(t, loss.item())
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # The ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27091220.0
100 263.90582275390625
200 0.7395830750465393
300 0.0033662114292383194
400 0.00012202889047330245


### PyTorch: Variables and AutoGrad


In [9]:
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # use this instead to run on the GPU

# batch size (N), input dim (D_in), hidden dim (H), output dim (D_out)
N, D_in, H, D_out = 64, 1000, 100, 10

# Create input and output tensors and wrap them in Variables.
# requires_grad=False states that gradients with respect to these Variables
# do not need to be computed during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Weights: requires_grad=True because we do want gradients for these.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass (operations can be chained since the intermediate values
    # are not needed for a hand-written backward pass)
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        # The loss is a 0-dim tensor; .item() returns its Python value.
        # (loss.data[0] raises IndexError on current PyTorch releases.)
        print(t, loss.item())
    # Autograd computes the backward pass and fills w1.grad / w2.grad
    loss.backward()
    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    # Gradients accumulate across backward() calls, so zero them manually
    # after each update
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 26133316.0
100 454.1337585449219
200 1.5466086864471436
300 0.008519974537193775
400 0.00019258004613220692


### Defining new autograd functions

In [11]:
import torch
from torch.autograd import Variable

class ReLU(torch.autograd.Function):
    """
    Custom autograd function implementing the rectified linear unit.

    Both passes are static methods that receive a context object ``ctx``.
    """
    @staticmethod
    def forward(ctx, input):
        """
        Return ``input`` with every element below zero replaced by zero.

        The input tensor is stored on ``ctx`` via ``save_for_backward`` so the
        backward pass can see which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive the gradient of the loss with respect to the output and return
        the gradient of the loss with respect to the input: a copy of
        ``grad_output`` with zeros wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors  # tensor stashed by forward()
        was_negative = saved_input < 0
        return grad_output.masked_fill(was_negative, 0)

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # use this instead to run on the GPU

# batch size (N), input dim (D_in), hidden dim (H), output dim (D_out)
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

# Custom autograd functions are invoked through their .apply attribute.
# The lookup does not change between iterations, so do it once up front.
relu = ReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass with our custom autograd function
    y_pred = relu(x.mm(w1)).mm(w2)
    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        # .item() returns the Python value of the 0-dim loss tensor
        # (loss.data[0] raises IndexError on current PyTorch releases).
        print(t, loss.item())
    # Backward pass using autograd
    loss.backward()
    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 22577980.0
100 509.048095703125
200 4.342343807220459
300 0.06760153919458389
400 0.001541274250485003


### nn module

In [13]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn module to define our model as a sequence of layers
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn module also contains loss functions. reduction='sum' adds up the
# squared errors instead of averaging them; it replaces the deprecated
# size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # The model is callable: pass it the input and it returns the prediction
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        # .item() returns the Python value of the 0-dim loss tensor
        # (loss.data[0] raises IndexError on current PyTorch releases).
        print(t, loss.item())
    # Zero gradients before running the backward pass
    model.zero_grad()
    loss.backward()
    # Update the weights using gradient descent, accessing each parameter's
    # data and gradient directly.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 667.1468505859375
100 1.7733511924743652
200 0.02600976452231407
300 0.0008200249867513776
400 3.693465987453237e-05


### optim module

In [14]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' adds up the squared errors; it replaces the deprecated
# size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# Use the optim module to define an optimizer that updates the model's
# parameters for us.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        # .item() returns the Python value of the 0-dim loss tensor
        # (loss.data[0] raises IndexError on current PyTorch releases).
        print(t, loss.item())
    # Zero gradients, run the backward pass, then let the optimizer update
    # the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 731.6021728515625
100 72.0875473022461
200 1.479520559310913
300 0.0060690464451909065
400 1.0713240953919012e-05


### Custom nn modules

In [16]:
import torch
from torch.autograd import Variable

class TwoLayerNet(torch.nn.Module):
    """Network made of two linear layers with a clamp-at-zero in between."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers: D_in -> H and H -> D_out."""
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Apply linear1, clamp the result at zero, then apply linear2."""
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets, wrapped in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct the model by instantiating the custom module
model = TwoLayerNet(D_in, H, D_out)

# reduction='sum' adds up the squared errors; it replaces the deprecated
# size_average=False argument.
loss_func = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss on every iteration. .item() returns the Python
    # value of the 0-dim loss tensor (loss.data[0] raises IndexError on
    # current PyTorch releases).
    loss = loss_func(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 657.40576171875
1 605.4139404296875
2 561.5408935546875
3 523.2501831054688
4 489.2220153808594
5 458.61395263671875
6 431.0629577636719
7 405.8373107910156
8 382.69390869140625
9 360.9817810058594
10 340.64019775390625
11 321.7035827636719
12 303.9287109375
13 287.1660461425781
14 271.30126953125
15 256.2503967285156
16 241.9177703857422
17 228.29176330566406
18 215.3031005859375
19 202.99143981933594
20 191.25765991210938
21 180.08087158203125
22 169.4584197998047
23 159.38400268554688
24 149.83360290527344
25 140.7922821044922
26 132.22720336914062
27 124.13607025146484
28 116.47787475585938
29 109.2695083618164
30 102.4994125366211
31 96.10252380371094
32 90.09012603759766
33 84.44229125976562
34 79.1405258178711
35 74.1895980834961
36 69.55003356933594
37 65.20470428466797
38 61.1375617980957
39 57.33534622192383
40 53.77180862426758
41 50.43370056152344
42 47.31364822387695
43 44.403785705566406
44 41.68169021606445
45 39.14221954345703
46 36.76384353637695
47 34.53904342651367

375 0.00010235732042929158
376 9.923728794092312e-05
377 9.621901699574664e-05
378 9.329617751063779e-05
379 9.045679325936362e-05
380 8.771116699790582e-05
381 8.50461219670251e-05
382 8.24676244519651e-05
383 7.996294880285859e-05
384 7.754081161692739e-05
385 7.519596692873165e-05
386 7.291958172572777e-05
387 7.071298023220152e-05
388 6.857894186396152e-05
389 6.650610885117203e-05
390 6.449886132031679e-05
391 6.254936306504533e-05
392 6.066340210963972e-05
393 5.883549965801649e-05
394 5.706298543373123e-05
395 5.534719457500614e-05
396 5.367810445022769e-05
397 5.2066603529965505e-05
398 5.049990795669146e-05
399 4.898415500065312e-05
400 4.751024971483275e-05
401 4.608579911291599e-05
402 4.470087151275948e-05
403 4.3362771975807846e-05
404 4.206318044452928e-05
405 4.0802104194881395e-05
406 3.958004526793957e-05
407 3.839517739834264e-05
408 3.724528505699709e-05
409 3.6135879781795666e-05
410 3.505598215269856e-05
411 3.4008549846475944e-05
412 3.299228410469368e-05
413 3.20

### Control Flow + Sharing

In [17]:
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    """
    Network whose depth is chosen at random on every forward pass by reusing
    one hidden-to-hidden layer a variable number of times.
    """

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear layers used by forward(): input (D_in -> H),
        middle (H -> H) and output (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply the input layer, then the middle layer 0, 1, 2 or 3 times (drawn
        with random.randint on each call), then the output layer. Every hidden
        activation is clamped at zero.

        The computation graph is built afresh on each forward pass, so ordinary
        Python control flow such as this loop can decide the network's shape,
        and the same middle_linear module (and its weights) is simply applied
        repeatedly within one graph.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' adds up the squared errors; it replaces the deprecated
# size_average=False argument.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss on every iteration. .item() returns the Python
    # value of the 0-dim loss tensor (loss.data[0] raises IndexError on
    # current PyTorch releases).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 678.0689697265625
1 698.9865112304688
2 658.7703247070312
3 643.0426635742188
4 624.1532592773438
5 667.052978515625
6 586.9552001953125
7 664.2916259765625
8 654.0009765625
9 541.649658203125
10 482.1560363769531
11 639.3900756835938
12 503.0950622558594
13 627.1452026367188
14 382.1196594238281
15 455.5534973144531
16 600.8511962890625
17 587.4323120117188
18 632.5256958007812
19 550.8008422851562
20 527.6943969726562
21 220.34628295898438
22 572.947998046875
23 548.740966796875
24 433.2142028808594
25 174.7963409423828
26 158.20472717285156
27 458.0625305175781
28 115.33761596679688
29 437.33477783203125
30 293.7109375
31 402.8432922363281
32 383.5705261230469
33 215.77601623535156
34 345.1801452636719
35 248.3778076171875
36 131.49179077148438
37 151.4760284423828
38 272.351318359375
39 240.56602478027344
40 108.9260482788086
41 95.68561553955078
42 131.78335571289062
43 115.88265991210938
44 96.69530487060547
45 286.2318115234375
46 149.4499969482422
47 271.1319274902344
48 238.

422 0.5662491917610168
423 0.4060453474521637
424 0.7634985446929932
425 0.6109599471092224
426 0.47424450516700745
427 0.35526612401008606
428 0.4253063201904297
429 0.6826678514480591
430 0.09122802317142487
431 0.8396509289741516
432 0.4310610890388489
433 0.7842603325843811
434 0.09663848578929901
435 0.5216723680496216
436 0.0869370773434639
437 0.0871482565999031
438 0.7221881747245789
439 0.6517196297645569
440 0.3355465531349182
441 0.5034245252609253
442 0.35649916529655457
443 0.455129474401474
444 0.3244795799255371
445 0.5439274311065674
446 0.5268503427505493
447 0.5181049108505249
448 0.45050716400146484
449 0.5589508414268494
450 0.2538915276527405
451 0.11221888661384583
452 0.6219174861907959
453 0.06911291927099228
454 0.5454586744308472
455 0.3072149157524109
456 0.3765687048435211
457 0.09236952662467957
458 0.6934877038002014
459 0.2841666042804718
460 0.03727373480796814
461 0.0409160852432251
462 0.05244668200612068
463 0.057924043387174606
464 0.0503204576671123