# Tensor

## Warm-up: numpy

In [1]:
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed NumPy's global generator so that a fresh-kernel "Run All" reproduces
# the same data (later cells that draw from np.random inherit this state).
np.random.seed(0)

# Create random input and output data
x = np.random.randn(N, D_in)   # inputs:  one row per sample, D_in features
y = np.random.randn(N, D_out)  # targets: one row per sample, D_out values
print(x.shape, y.shape)

(64, 1000) (64, 10)


In [2]:
# Draw both weight matrices from a standard normal distribution:
# w1 maps input -> hidden (D_in x H), w2 maps hidden -> output (H x D_out).
w1, w2 = (
    np.random.randn(rows, cols)
    for rows, cols in ((D_in, H), (H, D_out))
)
print(w1.shape, w2.shape)

(1000, 100) (100, 10)


In [3]:
# Train the two-layer network with hand-written gradient descent.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every iteration
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the loss back to w1/w2
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # In-place gradient-descent step
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33454628.581842043
1 32984183.159082074
2 37185322.58679414
3 38654485.18878923
4 32311673.831666812
5 20231106.18295199
6 10016243.976181258
7 4564093.50572787
8 2305957.766076766
9 1409443.3522271148
10 1013603.2672970396
11 801586.4954424554
12 664512.3005920667
13 563883.1125875541
14 484676.1718430569
15 420118.9186103791
16 366455.74221112696
17 321259.2484831789
18 282863.1203543098
19 250081.64403837966
20 221875.7016129561
21 197477.49012804887
22 176283.78386149055
23 157765.16037740605
24 141524.63557378453
25 127242.12320460341
26 114641.05878764071
27 103468.66448141706
28 93543.75308649658
29 84715.0059945218
30 76831.95609002386
31 69770.7584288519
32 63438.03745137695
33 57751.63073076603
34 52630.71663369359
35 48014.08503354884
36 43844.64780544819
37 40074.92502676124
38 36657.971037930525
39 33557.19508612176
40 30741.460961089826
41 28182.027458833152
42 25851.787808523768
43 23729.643658198016
44 21796.018004018937
45 20030.90100855668
46 18418.49336588502
47 16

358 8.556318354579191e-05
359 8.094809982016746e-05
360 7.658092259716634e-05
361 7.245052575390433e-05
362 6.854343940850594e-05
363 6.484660833299877e-05
364 6.135076527115493e-05
365 5.8042579892357424e-05
366 5.491424845566421e-05
367 5.195477593412512e-05
368 4.915413513434154e-05
369 4.6505153823933845e-05
370 4.399938069108642e-05
371 4.162848956424315e-05
372 3.938583493410321e-05
373 3.7263922631223644e-05
374 3.525634472833183e-05
375 3.335743106603385e-05
376 3.156052587138446e-05
377 2.9860934302714847e-05
378 2.825340220140992e-05
379 2.673217539908729e-05
380 2.5292852151449876e-05
381 2.3931087180620665e-05
382 2.2642785598431947e-05
383 2.1423882216474442e-05
384 2.0270811718099685e-05
385 1.9179884891172334e-05
386 1.814770862952722e-05
387 1.7171448482253026e-05
388 1.6247459307352848e-05
389 1.5373209527833506e-05
390 1.4546555223459703e-05
391 1.3763959887586915e-05
392 1.3023580467137732e-05
393 1.2323067349528234e-05
394 1.1660222977571264e-05
395 1.10332376238415

## PyTorch: Tensors

In [4]:
# -*- coding: utf-8 -*-

import torch


# Single precision; run on the first GPU when CUDA is available, else on CPU.
dtype = torch.float
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Every tensor below is created with the same device/dtype settings.
tensor_opts = dict(device=device, dtype=dtype)

# Random input and target data
x = torch.randn(N, D_in, **tensor_opts)
y = torch.randn(N, D_out, **tensor_opts)

# Randomly initialised weights
w1 = torch.randn(D_in, H, **tensor_opts)
w2 = torch.randn(H, D_out, **tensor_opts)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear (torch.mm is a matrix multiply)
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python float for printing
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: gradients of the loss w.r.t. w1 and w2, derived by hand
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # In-place gradient-descent step
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29601946.0
1 27959502.0
2 30137454.0
3 31557264.0
4 28835280.0
5 21443920.0
6 13142053.0
7 7098435.0
8 3803822.5
9 2208188.0
10 1452314.375
11 1066643.25
12 845331.125
13 700343.875
14 594958.3125
15 512839.6875
16 446170.21875
17 390610.4375
18 343575.03125
19 303444.59375
20 268961.0625
21 239211.78125
22 213409.578125
23 190925.0625
24 171248.328125
25 154007.390625
26 138846.421875
27 125463.4609375
28 113643.6796875
29 103153.9453125
30 93811.8828125
31 85457.0859375
32 77981.015625
33 71276.3125
34 65252.625
35 59830.96484375
36 54940.4765625
37 50520.4375
38 46516.5390625
39 42879.15234375
40 39571.19921875
41 36561.6171875
42 33816.9296875
43 31311.62890625
44 29020.05859375
45 26922.51953125
46 24998.9453125
47 23233.283203125
48 21609.95703125
49 20116.73046875
50 18740.783203125
51 17472.154296875
52 16301.2705078125
53 15222.255859375
54 14225.076171875
55 13301.69921875
56 12446.5380859375
57 11653.7890625
58 10917.6572265625
59 10233.6201171875
60 9597.818359375
61 9006

452 0.0007913903100416064
453 0.0007710436475463212
454 0.0007512300508096814
455 0.0007314424146898091
456 0.0007128204451873899
457 0.0006946756620891392
458 0.0006767677259631455
459 0.0006594412843696773
460 0.0006436316180042922
461 0.0006278146174736321
462 0.0006126464577391744
463 0.0005979886627756059
464 0.0005832557799294591
465 0.0005700477631762624
466 0.0005563328741118312
467 0.0005438076332211494
468 0.0005302376812323928
469 0.0005181568558327854
470 0.0005064264987595379
471 0.0004948393325321376
472 0.00048313025035895407
473 0.00047220021951943636
474 0.00046195570030249655
475 0.00045098160626366735
476 0.00044037174666300416
477 0.00043053331319242716
478 0.000420798925915733
479 0.00041153651545755565
480 0.0004026459064334631
481 0.0003941835602745414
482 0.0003852318332064897
483 0.00037793515366502106
484 0.00036966282641515136
485 0.0003611450083553791
486 0.0003541818878147751
487 0.00034684405545704067
488 0.00033947688643820584
489 0.00033283259836025536
4

In [5]:
# Check which device `h` (the hidden-layer tensor left by the training loop above) lives on.
h.device

device(type='cuda', index=0)

## PyTorch: Tensors and autograd

In [6]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors. These are
    # the same operations as in the hand-written version, but we do not need
    # to keep references to intermediate values since we are not implementing
    # the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a zero-dimensional (scalar) Tensor; loss.item() extracts the
    # value it holds as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights: backward()
        # accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

    # Another way to write the same update is to operate on weight.data and
    # weight.grad.data (tensor.data shares storage with the tensor but does
    # not track history):
    #
    #     w1.data -= learning_rate * w1.grad.data
    #     w2.data -= learning_rate * w2.grad.data
    #
    #     w1.grad.data.zero_()
    #     w2.grad.data.zero_()

0 28638050.0
1 26403538.0
2 31648910.0
3 39787816.0
4 43305548.0
5 36229964.0
6 21422560.0
7 9617528.0
8 3908343.75
9 1829375.75
10 1089534.375
11 788441.4375
12 631709.25
13 529411.8125
14 452707.84375
15 391234.96875
16 340435.1875
17 297808.59375
18 261675.15625
19 230962.984375
20 204707.578125
21 182034.875
22 162368.328125
23 145246.90625
24 130291.84375
25 117176.3984375
26 105628.265625
27 95424.390625
28 86392.515625
29 78371.6875
30 71226.3046875
31 64841.4765625
32 59123.4140625
33 54004.16015625
34 49406.125
35 45262.59765625
36 41520.375
37 38143.25390625
38 35085.80078125
39 32310.115234375
40 29781.06640625
41 27479.5390625
42 25382.84765625
43 23469.21875
44 21720.21484375
45 20119.212890625
46 18652.9453125
47 17308.658203125
48 16073.666015625
49 14938.6865234375
50 13894.705078125
51 12933.005859375
52 12046.380859375
53 11228.255859375
54 10472.64453125
55 9774.298828125
56 9128.3447265625
57 8530.7109375
58 7976.7451171875
59 7463.04443359375
60 6986.38671875
61 65

403 0.0009618081385269761
404 0.0009311514440923929
405 0.0009015751420520246
406 0.0008729759720154107
407 0.0008451514877378941
408 0.0008196281851269305
409 0.00079219916369766
410 0.0007698500994592905
411 0.0007448464748449624
412 0.0007228501490317285
413 0.0007003578939475119
414 0.0006786875310353935
415 0.0006583480862900615
416 0.0006387548637576401
417 0.000621546758338809
418 0.0006021157023496926
419 0.0005843390244990587
420 0.0005675507709383965
421 0.00055029516806826
422 0.000535240862518549
423 0.0005201601306907833
424 0.0005055191577412188
425 0.0004915784229524434
426 0.00047862448263913393
427 0.00046546573867090046
428 0.0004528331046458334
429 0.00044144137063995004
430 0.00043007623753510416
431 0.0004179067618679255
432 0.0004069046408403665
433 0.00039600100717507303
434 0.00038556300569325686
435 0.0003750075411517173
436 0.0003655178879853338
437 0.00035649974597617984
438 0.00034756583045236766
439 0.0003385919553693384
440 0.00032997375819832087
441 0.000

## PyTorch: Defining new autograd functions

### Decorators

In [7]:
def sample_decorator(myfunc):
    """Print a banner, ignore ``myfunc``, and return the integer 0.

    A decorator's return value replaces the decorated name, so anything
    decorated with this ends up bound to 0 rather than to a function.
    """
    print("I am the decorator!")
    replacement = 0
    return replacement


@sample_decorator
def myfunc():
    return None

I am the decorator!


In [8]:
# Manual equivalent of the @sample_decorator syntax: define the function,
# then rebind its name to whatever the decorator returns.
def myfunc():
    return None


myfunc = sample_decorator(myfunc)
myfunc  # shown as the cell's output

I am the decorator!


0

In [9]:
def sample_decorator(myfunc):
    """Replace ``myfunc`` with a zero-argument function returning a fixed message.

    The wrapped function is never called; the wrapper ignores it entirely.
    """
    message = "I am the decorator!"

    def inner_func():
        return message

    return inner_func


@sample_decorator
def myfunc(text):
    """Echo ``text`` (not reachable after decoration: the name is rebound to the wrapper)."""
    return text


# The decorated name now takes no arguments and yields the decorator's message.
print(myfunc())

I am the decorator!


In [10]:
def sample_decorator(myfunc):
    """Wrap ``myfunc`` so that a banner is printed before every call.

    The wrapper forwards positional and keyword arguments unchanged, returns
    whatever ``myfunc`` returns, and keeps ``myfunc``'s name and docstring.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(myfunc)
    def inner_func(*args, **kwargs):
        print("I am the decorator!")
        # Hand the result back so decorating does not silently turn every
        # return value into None.
        return myfunc(*args, **kwargs)
    return inner_func

@sample_decorator
def myfunc(text):
    print(text)

myfunc("Blabla")

I am the decorator!
Blabla


In [11]:
def _wrap_with_banner(myfunc, banner):
    """Return a zero-argument wrapper that prints ``banner`` before and after calling ``myfunc``."""
    def inner_func():
        print(banner)
        myfunc()
        print(banner)
    return inner_func


def A(myfunc):
    """Decorator that brackets the call with the "A" banner."""
    return _wrap_with_banner(myfunc, "I am the A decorator!")


def B(myfunc):
    """Decorator that brackets the call with the "B" banner."""
    return _wrap_with_banner(myfunc, "I am the B decorator!")


# Decorators apply bottom-up: A wraps myfunc first, then B wraps A's wrapper,
# so B's banners end up outermost when the function is called.
@B
@A
def myfunc():
    print("Hello, decorator")


myfunc()

I am the B decorator!
I am the A decorator!
Hello, decorator
I am the A decorator!
I am the B decorator!


In [12]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    Hand-written ReLU expressed as a custom autograd Function: subclass
    torch.autograd.Function and provide static forward and backward methods
    that operate on Tensors. Invoke it through ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute max(input, 0) element-wise.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it with ctx.save_for_backward so the backward pass can tell
        where the input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return the
        gradient of the loss with respect to the input: grad_output with the
        entries zeroed wherever the saved input was strictly negative.
        """
        (input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output itself is untouched.
        return grad_output.masked_fill(input < 0, 0)


dtype = torch.float
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, outside the loop, since the alias never changes.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad() keeps the in-place
    # updates out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 30811548.0
1 27470242.0
2 30190638.0
3 33890128.0
4 33147330.0
5 26109060.0
6 15857724.0
7 8062010.0
8 3856619.75
9 2017842.375
10 1231209.375
11 868765.375
12 673678.0625
13 550124.875
14 461469.6875
15 392897.09375
16 337482.65625
17 291767.0625
18 253534.421875
19 221213.90625
20 193737.4375
21 170253.46875
22 150123.6875
23 132754.40625
24 117741.8046875
25 104699.9453125
26 93323.3515625
27 83367.625
28 74632.1796875
29 66945.1015625
30 60156.75
31 54152.4453125
32 48829.41796875
33 44105.703125
34 39899.7734375
35 36151.60546875
36 32807.01953125
37 29817.111328125
38 27143.9453125
39 24740.947265625
40 22578.380859375
41 20627.6171875
42 18866.55859375
43 17273.46875
44 15830.8232421875
45 14522.765625
46 13335.341796875
47 12256.01171875
48 11273.615234375
49 10378.4658203125
50 9562.0810546875
51 8817.1123046875
52 8136.4765625
53 7514.10791015625
54 6944.16455078125
55 6421.939453125
56 5942.95263671875
57 5503.27294921875
58 5099.36865234375
59 4727.96875
60 4385.998046875

372 0.00043419847497716546
373 0.0004220920382067561
374 0.0004090133588761091
375 0.0003977993910666555
376 0.0003861452278215438
377 0.0003755645884666592
378 0.0003640055365394801
379 0.0003543383500073105
380 0.00034397514536976814
381 0.0003346031589899212
382 0.0003258991346228868
383 0.00031671987380832434
384 0.0003082107868976891
385 0.00029925821581855416
386 0.0002919300750363618
387 0.00028408007347024977
388 0.0002761577779892832
389 0.00026906898710876703
390 0.00026211884687654674
391 0.00025541664217598736
392 0.00024887084146030247
393 0.00024254742311313748
394 0.00023628237249795347
395 0.00023053918266668916
396 0.00022515063756145537
397 0.0002191314852098003
398 0.00021391440532170236
399 0.0002088014007313177
400 0.00020434883481357247
401 0.00019922989304177463
402 0.00019425649952609092
403 0.00018981976609211415
404 0.0001851711713243276
405 0.00018079836445394903
406 0.000176791480043903
407 0.00017231042147614062
408 0.0001682708680164069
409 0.0001648949400

## PyTorch: nn

In [2]:
# -*- coding: utf-8 -*-
import torch
from tqdm import tqdm_notebook as tqdm

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# Passing an OrderedDict to nn.Sequential lets you name each layer; when no
# names are given, each layer's index is used as its name.
# https://pytorch.org/docs/stable/nn.html
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; here we
# use the squared-error loss summed over all elements. reduction="sum" is the
# supported spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4

pbar = tqdm(range(1000))
pbar.set_description("Processing")
for t in pbar:
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

0 691.385009765625
1 641.5481567382812
2 597.8783569335938
3 559.3121948242188
4 525.0325927734375
5 494.3526611328125
6 466.52691650390625
7 441.1400451660156
8 417.4554443359375
9 395.25848388671875
10 374.4385986328125
11 354.8420104980469
12 336.38470458984375
13 318.9403076171875
14 302.32122802734375
15 286.4977111816406
16 271.4738464355469
17 257.2354431152344
18 243.67527770996094
19 230.7335662841797
20 218.35350036621094
21 206.48699951171875
22 195.14599609375
23 184.34445190429688
24 174.05186462402344
25 164.28457641601562
26 155.01693725585938
27 146.20533752441406
28 137.82913208007812
29 129.89308166503906
30 122.38594055175781
31 115.24951171875
32 108.48282623291016
33 102.09254455566406
34 96.0729751586914
35 90.37975311279297
36 85.01124572753906
37 79.96297454833984
38 75.2249526977539
39 70.77240753173828
40 66.5898666381836
41 62.664554595947266
42 58.980255126953125
43 55.511009216308594
44 52.25522232055664
45 49.20033264160156
46 46.32809829711914
47 43.63445

353 0.0002815208863466978
354 0.00027291118749417365
355 0.000264558446360752
356 0.00025647584698162973
357 0.00024863716680556536
358 0.00024104253679979593
359 0.00023368379333987832
360 0.0002265585499117151
361 0.0002196606801589951
362 0.0002129682106897235
363 0.000206488897674717
364 0.00020021969976369292
365 0.00019414298003539443
366 0.000188242964213714
367 0.0001825310755521059
368 0.00017699510499369353
369 0.0001716262922855094
370 0.00016642511764075607
371 0.00016138787032105029
372 0.00015650727436877787
373 0.0001517741329735145
374 0.00014718780585099012
375 0.00014274354907684028
376 0.00013843622582498938
377 0.00013426097575575113
378 0.0001302162418141961
379 0.00012629589764401317
380 0.0001224948646267876
381 0.00011881087993970141
382 0.00011523860302986577
383 0.00011177502892678604
384 0.00010841812763828784
385 0.00010515944450162351
386 0.0001020084455376491
387 9.895168477669358e-05
388 9.598488395567983e-05
389 9.311151370638981e-05
390 9.03249529073946

716 1.0475308620527812e-08
717 1.0265755356897444e-08
718 1.0033829767053248e-08
719 9.838556636054818e-09
720 9.65080282355757e-09
721 9.441271764387693e-09
722 9.237193232536356e-09
723 9.047109728044234e-09
724 8.870587819842513e-09
725 8.684867047747957e-09
726 8.486748193092808e-09
727 8.332752265971521e-09
728 8.163672404748468e-09
729 8.003860685334985e-09
730 7.839355831151806e-09
731 7.700958093437293e-09
732 7.54976792194384e-09
733 7.396355528044296e-09
734 7.261882206677228e-09
735 7.115686262437748e-09
736 6.990545919904889e-09
737 6.858126067044168e-09
738 6.725763501691517e-09
739 6.5940506388528775e-09
740 6.485891823615475e-09
741 6.3569354225023744e-09
742 6.242633965314326e-09
743 6.1286637986768255e-09
744 6.031128485517456e-09
745 5.9056293189030384e-09
746 5.805925518131971e-09
747 5.710221184784814e-09
748 5.616565879051905e-09
749 5.516793688542521e-09
750 5.422568172264164e-09
751 5.322066343182996e-09
752 5.231941102579185e-09
753 5.134089597902403e-09
754 5.0

In [8]:
# Print every (name, module) pair registered on the model, container included.
for name, module in model.named_modules():
    print((name, module))

('', Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
))
('0', Linear(in_features=1000, out_features=100, bias=True))
('1', ReLU())
('2', Linear(in_features=100, out_features=10, bias=True))


In [None]:
# -*- coding: utf-8 -*-
import torch
from tqdm import tqdm_notebook as tqdm
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# Squared error summed over all elements; reduction="sum" is the supported
# spelling of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction="sum")

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in tqdm(range(500)):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check out the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()


## PyTorch: Custom nn Modules

In [10]:
# -*- coding: utf-8 -*-
import torch


class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear(D_in, H) -> ReLU -> Linear(H, D_out)."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and store them as attributes, which
        registers them as submodules named ``linear1`` and ``linear2``.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of predictions. The ReLU is
        applied as a plain Tensor operation (clamping at zero) between the
        two Linear modules created in the constructor.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# The loss is the squared error summed over all elements; reduction="sum" is
# the supported spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


0 656.0204467773438
1 606.3819580078125
2 563.4098510742188
3 526.0287475585938
4 492.7554016113281
5 462.7063903808594
6 435.3582458496094
7 410.1671447753906
8 386.9555969238281
9 365.4505310058594
10 345.50042724609375
11 326.8361511230469
12 309.347900390625
13 292.88433837890625
14 277.30853271484375
15 262.5722351074219
16 248.49688720703125
17 235.0901641845703
18 222.2703857421875
19 210.080810546875
20 198.50579833984375
21 187.4898681640625
22 177.03872680664062
23 167.0840301513672
24 157.58741760253906
25 148.53807067871094
26 139.9483184814453
27 131.8116912841797
28 124.10425567626953
29 116.79434967041016
30 109.87484741210938
31 103.33793640136719
32 97.1484603881836
33 91.29310607910156
34 85.75950622558594
35 80.53514862060547
36 75.6181411743164
37 70.97618865966797
38 66.61840057373047
39 62.526344299316406
40 58.680084228515625
41 55.06120300292969
42 51.66867446899414
43 48.487640380859375
44 45.504554748535156
45 42.71156692504883
46 40.089351654052734
47 37.6226

364 0.0002763486991170794
365 0.0002683101047296077
366 0.00026050698943436146
367 0.00025293920771218836
368 0.0002455913636367768
369 0.00023846091062296182
370 0.00023154196969699115
371 0.00022482794884126633
372 0.0002183088072342798
373 0.00021198176546022296
374 0.00020584060985129327
375 0.00019987787527497858
376 0.0001940887450473383
377 0.0001884686789708212
378 0.0001830158435041085
379 0.00017772833234630525
380 0.00017258856678381562
381 0.00016760350263211876
382 0.00016275844245683402
383 0.0001580602111062035
384 0.0001534928596811369
385 0.0001490672439103946
386 0.00014477000513579696
387 0.00014059653040021658
388 0.0001365447969874367
389 0.00013261062849778682
390 0.0001287928898818791
391 0.00012508788495324552
392 0.00012148435780545697
393 0.00011799151980085298
394 0.00011459729284979403
395 0.00011130264465464279
396 0.00010810857202159241
397 0.00010500386997591704
398 0.00010198920062975958
399 9.906129707815126e-05
400 9.62183257797733e-05
401 9.3457558250

In [12]:
# Submodules are registered under the attribute names they were assigned to.
for name, module in model.named_modules():
    print((name, module))

('', TwoLayerNet(
  (linear1): Linear(in_features=1000, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=10, bias=True)
))
('linear1', Linear(in_features=1000, out_features=100, bias=True))
('linear2', Linear(in_features=100, out_features=10, bias=True))


## PyTorch: Control Flow + Weight Sharing

In [4]:
# -*- coding: utf-8 -*-
import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers used by forward: input (D_in -> H),
        middle (H -> H) and output (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply the input layer, then the *same* middle layer 0, 1, 2 or 3 times
        (the count is drawn with random.randint on each call), then the output
        layer. A ReLU, implemented by clamping at zero, follows the input layer
        and every middle-layer application.

        Each call builds its own computation graph, so ordinary Python control
        flow can decide the depth, and reusing one Module several times within
        a single graph shares its weights across those applications.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# The loss is the squared error summed over all elements; reduction="sum" is
# the supported spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 694.4918212890625
1 641.6146240234375
2 669.7479858398438
3 665.8424682617188
4 665.8660888671875
5 667.9981079101562
6 659.9095458984375
7 652.8549194335938
8 653.382080078125
9 649.5932006835938
10 335.85345458984375
11 313.2641906738281
12 607.2434692382812
13 635.1311645507812
14 653.3431396484375
15 566.8755493164062
16 546.293701171875
17 168.37591552734375
18 493.0198974609375
19 460.218505859375
20 638.7105712890625
21 385.94110107421875
22 625.41796875
23 539.9629516601562
24 278.3038635253906
25 481.75762939453125
26 217.36444091796875
27 523.7520141601562
28 362.8290710449219
29 320.6985168457031
30 194.37408447265625
31 255.42184448242188
32 320.3872375488281
33 281.09100341796875
34 382.5536804199219
35 227.83058166503906
36 189.68760681152344
37 202.5064239501953
38 160.31298828125
39 104.53071594238281
40 76.94847106933594
41 147.7677459716797
42 57.40458679199219
43 188.9796142578125
44 110.99726104736328
45 178.54635620117188
46 93.25392150878906
47 102.1650466918945

390 1.8031177520751953
391 14.322591781616211
392 6.490121841430664
393 1.2959345579147339
394 5.548521041870117
395 1.2532612085342407
396 4.663203716278076
397 9.261096954345703
398 0.46326664090156555
399 1.2561397552490234
400 2.382652759552002
401 1.9524242877960205
402 4.081525802612305
403 1.6558030843734741
404 1.2803008556365967
405 2.4656686782836914
406 1.3823047876358032
407 0.5984993577003479
408 5.480122089385986
409 1.7394351959228516
410 1.264327883720398
411 1.6384068727493286
412 0.6461037993431091
413 1.0910154581069946
414 1.8221888542175293
415 1.3579744100570679
416 2.1851208209991455
417 0.5390334725379944
418 0.6121658086776733
419 2.672698497772217
420 0.4615043103694916
421 1.3293700218200684
422 0.35468578338623047
423 0.9830807447433472
424 1.819581151008606
425 0.4438799023628235
426 0.505510687828064
427 0.9594058990478516
428 0.6115707159042358
429 1.416797399520874
430 0.7768195867538452
431 0.9500673413276672
432 0.4830593466758728
433 0.826171815395355