In [1]:
# Inject a CSS rule that sets `.container` elements to 100% width so the
# notebook page uses the full browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

# Table of Contents
### Tensors
- Warm-up: numpy
- PyTorch: Tensors

### Autograd
- Pytorch: Tensors and autograd
- Pytorch: Defining new autograd functions
- TensorFlow: Static Graphs

### nn module 
- Pytorch: nn
- Pytorch: optim
- Pytorch: Custom nn Modules
- Pytorch: Control Flow + Weight Sharing

### Examples
- Tensors
- Autograd
- nn module

In [2]:
# Warm-up: numpy
# -*- coding:  utf-8 -*-
import numpy as np

# Network dimensions: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (standard-normal samples).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU, then the prediction.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # The loss is the sum of squared errors; report it at every step.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass, written out by hand with the chain rule.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30902815.86526917
1 30136116.879921198
2 34223900.9354293
3 37080599.96942255
4 33256600.779202193
5 22751720.910071183
6 12118364.445619939
7 5688831.931059418
8 2841696.768288509
9 1684791.3367316928
10 1183858.3801894723
11 926372.5793444746
12 766368.5541321026
13 651610.303553616
14 562209.6325070871
15 489338.4593820891
16 428624.1870335536
17 377484.46767280425
18 333998.68784211425
19 296690.22555295785
20 264593.28787242307
21 236758.5509606658
22 212501.24144026978
23 191313.92173114375
24 172713.3306074434
25 156311.5626188093
26 141797.171302972
27 128911.14016403261
28 117434.15118821098
29 107189.279704241
30 98011.22089363114
31 89780.65741993685
32 82370.64169060087
33 75683.00209938161
34 69623.4783132232
35 64131.031747098765
36 59150.68458235804
37 54620.21613940573
38 50492.00283661854
39 46725.32104872957
40 43283.4878780134
41 40134.34379891494
42 37248.32467617793
43 34599.670875079726
44 32165.572620759493
45 29928.0917853514
46 27867.851015312342
47 25969.564

387 0.04185585112369558
388 0.04056223060113481
389 0.039308615895643835
390 0.03809434874525486
391 0.03691728441143633
392 0.0357768686529675
393 0.03467192494717797
394 0.03360186239835852
395 0.03256421035025024
396 0.03155885618796388
397 0.030584606264442503
398 0.029640693891214542
399 0.02872576653893203
400 0.027839337221552688
401 0.02698049747092055
402 0.026148302290596802
403 0.0253415358499329
404 0.02455976930948795
405 0.02380235376097589
406 0.02306839141182851
407 0.02235719332826875
408 0.021667804151649095
409 0.020999951328458773
410 0.020352712191869644
411 0.019725382420775893
412 0.019117413831262926
413 0.018528401672534445
414 0.01795760290004614
415 0.01740428003087186
416 0.016868115770226306
417 0.016348598849559276
418 0.015845071389385294
419 0.015357087953702726
420 0.014884303859957121
421 0.01442599053626377
422 0.013981833476342588
423 0.013551320595569086
424 0.013134227863678748
425 0.012729951937454164
426 0.012338257581440387
427 0.011958670012258

In [3]:
# Pytorch implementation
# -*- coding: utf-8 -*-

import torch

dtype = torch.float
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer, ReLU (clamp at zero), linear layer.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squared-errors loss; .item() extracts it as a Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop by hand: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # no gradient flows where the ReLU input was negative
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent update, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

cpu
0 31763704.0
1 28683640.0
2 29260632.0
3 28519670.0
4 24071810.0
5 16848814.0
6 10082588.0
7 5545345.0
8 3096470.75
9 1878387.75
10 1270155.375
11 942188.75
12 745226.8125
13 612788.875
14 515508.71875
15 439655.5625
16 378362.6875
17 327755.03125
18 285327.4375
19 249458.46875
20 218942.96875
21 192811.84375
22 170323.390625
23 150876.1875
24 133999.640625
25 119313.8984375
26 106486.171875
27 95252.4375
28 85394.34375
29 76726.9921875
30 69065.390625
31 62279.7421875
32 56252.5
33 50889.06640625
34 46108.67578125
35 41837.71875
36 38016.78125
37 34599.22265625
38 31528.322265625
39 28764.654296875
40 26273.076171875
41 24024.056640625
42 21991.435546875
43 20151.419921875
44 18483.345703125
45 16969.27734375
46 15593.39453125
47 14341.8447265625
48 13201.65625
49 12161.9111328125
50 11212.8876953125
51 10345.6533203125
52 9552.3974609375
53 8826.2958984375
54 8160.8212890625
55 7550.51953125
56 6990.19482421875
57 6475.3134765625
58 6001.87158203125
59 5566.2236328125
60 5165.037

# Autograd

If `x` is a Tensor with `x.requires_grad=True`, then after a backward pass `x.grad` is another Tensor holding the gradient of some scalar value (typically the loss) with respect to `x`.

In [4]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False), so no
# gradients are computed for them during the backward pass.
# Every element is sampled from the standard normal distribution N(0, 1)
# (mean 0, variance 1).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with respect
# to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass built from ordinary Tensor operations. The intermediate
    # values need no names of their own because the backward pass is not
    # written by hand here.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # The loss is a single-element Tensor; loss.item() reads out the Python
    # number it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Autograd computes the gradient of the loss with respect to every Tensor
    # that has requires_grad=True; afterwards w1.grad and w2.grad hold the
    # gradients for w1 and w2 respectively.
    loss.backward()

    # Plain gradient-descent step. It runs under torch.no_grad() because the
    # weights have requires_grad=True and the update itself must not be
    # tracked by autograd. (torch.optim.SGD can do this step as well.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w1.grad.zero_()  # reset, otherwise the next backward() accumulates
        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

cpu
0 32732638.0
1 31238682.0
2 36689144.0
3 42493300.0
4 40762208.0
5 28895538.0
6 14993434.0
7 6488180.5
8 2971427.0
9 1687065.75
10 1178030.0
11 925373.125
12 766528.8125
13 650068.0625
14 558038.4375
15 482816.4375
16 420390.90625
17 367913.125
18 323437.0625
19 285477.71875
20 253001.859375
21 224988.578125
22 200732.796875
23 179652.625
24 161239.109375
25 145100.125
26 130895.0625
27 118355.9765625
28 107239.84375
29 97353.0625
30 88534.015625
31 80644.9609375
32 73577.0390625
33 67227.9140625
34 61515.78515625
35 56361.40234375
36 51707.671875
37 47493.19140625
38 43666.1953125
39 40190.4765625
40 37025.95703125
41 34142.69140625
42 31511.501953125
43 29107.68359375
44 26908.5234375
45 24893.9140625
46 23047.962890625
47 21355.763671875
48 19801.28515625
49 18371.4609375
50 17055.314453125
51 15843.0703125
52 14724.92578125
53 13692.9736328125
54 12740.6396484375
55 11860.62890625
56 11047.0712890625
57 10293.8798828125
58 9596.4482421875
59 8950.3193359375
60 8351.0390625
61 7

## Defining new autograd functions
In PyTorch we can easily define our own autograd operator by
defining a subclass of `torch.autograd.Function` and implementing
its `forward` and `backward` functions.

In [5]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU implemented as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods that operate on Tensors is all that is needed; the
    function is invoked through ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor from the input Tensor.

        ``ctx`` is a context object used to pass information to the backward
        pass; the input is stashed on it with ``ctx.save_for_backward``.
        Returns the input with every negative entry clamped to zero.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (``grad_output``) into the gradient of the loss with respect to the
        input: the incoming gradient is passed through unchanged, except
        that it is set to zero wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True makes autograd
# compute gradients with respect to them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its .apply method. The alias does not
# change between iterations, so it is bound once, before the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y; the ReLU is our custom autograd
    # operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print the sum-of-squared-errors loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass (fills w1.grad and w2.grad).
    loss.backward()

    # Update weights using gradient descent; no_grad() keeps the update
    # itself out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad.
        w1.grad.zero_()
        w2.grad.zero_()

0 27174070.0
1 21823328.0
2 19802362.0
3 18134054.0
4 15746608.0
5 12508332.0
6 9156776.0
7 6264495.5
8 4158182.5
9 2749738.75
10 1862520.5
11 1308689.25
12 961195.125
13 736088.5
14 585026.0
15 479069.75
16 401573.28125
17 342657.21875
18 296255.0625
19 258703.21875
20 227661.640625
21 201572.4375
22 179375.46875
23 160273.96875
24 143693.65625
25 129234.2578125
26 116561.5234375
27 105376.765625
28 95471.7578125
29 86668.109375
30 78819.28125
31 71802.4609375
32 65520.296875
33 59879.59375
34 54809.81640625
35 50235.59765625
36 46108.66015625
37 42368.921875
38 38976.25
39 35894.91796875
40 33090.76171875
41 30536.748046875
42 28206.607421875
43 26077.814453125
44 24128.53515625
45 22341.576171875
46 20703.4453125
47 19199.28125
48 17817.72265625
49 16547.6953125
50 15378.091796875
51 14299.326171875
52 13306.16015625
53 12391.6279296875
54 11546.828125
55 10766.73046875
56 10044.896484375
57 9376.4580078125
58 8756.736328125
59 8182.14892578125
60 7649.072265625
61 7154.00048828125


In [8]:
# Custom nn Modules
# -*- coding: utf-8 -*-
import torch

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at zero) between them."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear sub-modules and store them as attributes:
        ``linear1`` maps D_in -> H and ``linear2`` maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        The sub-modules created in the constructor are combined with an
        ordinary Tensor operation (clamping negative values to zero).
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the model class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and optimizer. model.parameters() hands the SGD optimizer the
# learnable parameters of the two nn.Linear modules held by the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Clear the gradients before this step's backward pass.
    optimizer.zero_grad()

    # Forward pass: run x through the model, then compute and print the loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass, followed by the weight update.
    loss.backward()
    optimizer.step()

0 691.74072265625
1 638.0119018554688
2 591.758544921875
3 551.1732177734375
4 514.7890014648438
5 482.36328125
6 453.0075378417969
7 426.0222473144531
8 401.0595703125
9 377.8407897949219
10 356.29534912109375
11 336.39422607421875
12 317.7605285644531
13 300.30963134765625
14 283.7848205566406
15 268.19122314453125
16 253.3729705810547
17 239.35992431640625
18 226.05247497558594
19 213.41912841796875
20 201.3875732421875
21 189.9186248779297
22 179.0402374267578
23 168.70223999023438
24 158.9108123779297
25 149.62896728515625
26 140.82859802246094
27 132.48011779785156
28 124.57557678222656
29 117.07850646972656
30 109.97911834716797
31 103.2701187133789
32 96.94243621826172
33 90.97946166992188
34 85.34876251220703
35 80.06291961669922
36 75.08028411865234
37 70.3912353515625
38 65.98219299316406
39 61.84822082519531
40 57.97920227050781
41 54.358558654785156
42 50.96196365356445
43 47.78129577636719
44 44.80548858642578
45 42.019962310791016
46 39.41441345214844
47 36.9772911071777

360 5.940521805314347e-05
361 5.735551530960947e-05
362 5.538314508157782e-05
363 5.34749633516185e-05
364 5.163285823073238e-05
365 4.9864280299516395e-05
366 4.8145619075512514e-05
367 4.649238326237537e-05
368 4.489463390200399e-05
369 4.33551067544613e-05
370 4.1865117964334786e-05
371 4.0430259105050936e-05
372 3.9039710827637464e-05
373 3.7701709516113624e-05
374 3.641113653429784e-05
375 3.515949720167555e-05
376 3.395510793779977e-05
377 3.279572774772532e-05
378 3.1672228942625225e-05
379 3.0585419153794646e-05
380 2.954282535938546e-05
381 2.8529950213851407e-05
382 2.755448986135889e-05
383 2.6613161026034504e-05
384 2.5703173378133215e-05
385 2.4823955754982308e-05
386 2.397805656073615e-05
387 2.315871461178176e-05
388 2.2369624275597744e-05
389 2.1605894289677963e-05
390 2.0868845240329392e-05
391 2.015855352510698e-05
392 1.9468106984277256e-05
393 1.880622949101962e-05
394 1.8165679648518562e-05
395 1.7547039533383213e-05
396 1.6947820768109523e-05
397 1.637166860746219

In [10]:
# -*- coding: utf-8 -*-
import random 
import torch

class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear instances used by the forward pass:
        ``input_linear`` (D_in -> H), ``middle_linear`` (H -> H) and
        ``output_linear`` (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply the input layer, then reuse ``middle_linear`` a randomly chosen
        0, 1, 2 or 3 times to compute hidden-layer representations, then apply
        the output layer. Each linear layer except the last is followed by a
        clamp at zero.

        Every forward pass builds a fresh dynamic computation graph, so normal
        Python control flow (loops, conditionals) can be used when defining
        it, and the same Module can safely be reused several times within
        one graph.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)
    
# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the model class defined above.
model = DynamicNet(D_in, H, D_out)

# Loss function and optimizer. Training this strange model with vanilla
# stochastic gradient descent is tough, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Clear the gradients before this step's backward pass.
    optimizer.zero_grad()

    # Forward pass: run x through the model, then compute and print the loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass, followed by the weight update.
    loss.backward()
    optimizer.step()

0 695.11083984375
1 691.240966796875
2 693.1908569335938
3 688.9932250976562
4 707.2847290039062
5 682.9061889648438
6 687.2890625
7 594.78662109375
8 674.3424072265625
9 495.1723937988281
10 438.2010192871094
11 681.5518798828125
12 666.6635131835938
13 663.6679077148438
14 678.7455444335938
15 229.89141845703125
16 638.5836791992188
17 644.6184692382812
18 610.3211059570312
19 668.5512084960938
20 558.8927612304688
21 113.85456085205078
22 652.2705688476562
23 460.1796569824219
24 632.5022583007812
25 616.0626220703125
26 505.6253662109375
27 561.6194458007812
28 520.4752197265625
29 472.4648742675781
30 187.1783905029297
31 263.0716247558594
32 406.3441162109375
33 379.538818359375
34 205.55892944335938
35 128.68910217285156
36 163.56674194335938
37 243.52267456054688
38 265.97125244140625
39 132.24647521972656
40 100.34864044189453
41 189.18438720703125
42 192.2643280029297
43 202.2376251220703
44 118.82656860351562
45 120.51624298095703
46 83.52153015136719
47 106.77814483642578
4