# Tensor

## Warm-up: numpy

In [1]:
import numpy as np

# Problem dimensions, in order: batch size (N), input width (D_in),
# hidden-layer width (H) and output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both drawn from a standard normal
# distribution (x is sampled first, then y).
x, y = np.random.randn(N, D_in), np.random.randn(N, D_out)
print(x.shape, y.shape)

(64, 1000) (64, 10)


In [2]:
# Start both weight matrices from standard-normal noise:
# w1 has shape (D_in, H) and w2 has shape (H, D_out).
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)
print(w1.shape, w2.shape)

(1000, 100) (100, 10)


In [3]:
# Fit the two-layer network with plain gradient descent, deriving every
# gradient by hand.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported on every iteration.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2,
    # obtained by applying the chain rule from the output back to the input.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33861085.172023386
1 30440771.42961561
2 29966105.735275455
3 27651794.651332684
4 22054992.66696891
5 14787186.274565171
6 8728966.377677513
7 4892146.394620342
8 2840069.287885293
9 1795438.459501493
10 1251254.841428449
11 944237.8563070894
12 752619.3913411273
13 620644.5516419434
14 522900.2090037905
15 446667.95761314186
16 385193.7047926944
17 334575.7213066949
18 292254.1427397417
19 256528.67039262952
20 226170.96382076238
21 200214.89550294657
22 177852.07450872875
23 158534.11651059097
24 141743.51970024832
25 127095.97640058227
26 114269.30672962623
27 103000.65458165792
28 93072.91674700865
29 84293.28205413037
30 76510.39721174331
31 69604.46982001493
32 63444.64650002723
33 57930.69525514642
34 52985.55545434292
35 48540.596063767254
36 44534.296497248404
37 40917.32470309666
38 37644.45462749351
39 34683.2231229975
40 31995.415922373715
41 29551.023361293308
42 27323.723034679668
43 25289.766945457406
44 23430.771026749047
45 21733.30324540985
46 20177.132571858456
47

379 0.00222106489063013
380 0.0021297508923714395
381 0.002042228253570085
382 0.0019583621838570866
383 0.0018779965684004001
384 0.001800913388215618
385 0.0017270475709045725
386 0.0016562335968397575
387 0.0015883349575428526
388 0.001523251852033414
389 0.0014608494033851258
390 0.001401023656864208
391 0.0013436826473604679
392 0.0012887071900513736
393 0.0012360263906887334
394 0.0011854838306123076
395 0.0011370394461489373
396 0.0010905961055180473
397 0.001046052276322874
398 0.0010033429721713592
399 0.0009624011599652946
400 0.000923140507658406
401 0.0008855028632757926
402 0.0008494016860714138
403 0.0008148066551507662
404 0.0007816166237518409
405 0.0007497964206158664
406 0.0007192858907616055
407 0.0006900199823258119
408 0.0006619524968670941
409 0.0006350338944708269
410 0.0006092195133989967
411 0.0005844636608760659
412 0.0005607224411720074
413 0.0005379637093546164
414 0.000516137087580508
415 0.0004952004734455739
416 0.0004751219881717877
417 0.000455855208139

## PyTorch: Tensors

In [4]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.float
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so this cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)  # mm performs a matrix multiplication
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss; .item() converts the one-element Tensor
    # into a plain Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    # ReLU passes no gradient where its input was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 34436480.0
1 30362808.0
2 27138856.0
3 21611582.0
4 14933136.0
5 9104164.0
6 5325864.5
7 3193513.75
8 2071436.0
9 1463505.5
10 1111528.25
11 887562.875
12 732886.1875
13 617959.3125
14 528185.4375
15 455799.0
16 396230.46875
17 346542.75
18 304623.1875
19 268928.21875
20 238354.9375
21 212035.59375
22 189223.875
23 169380.1875
24 152034.65625
25 136826.625
26 123432.4921875
27 111587.703125
28 101079.5625
29 91717.5234375
30 83376.0
31 75919.546875
32 69238.2265625
33 63236.90625
34 57838.75390625
35 52967.27734375
36 48570.32421875
37 44590.6796875
38 40983.6640625
39 37708.7578125
40 34729.70703125
41 32019.39453125
42 29547.232421875
43 27288.8203125
44 25224.46484375
45 23335.103515625
46 21605.357421875
47 20020.08203125
48 18564.0859375
49 17226.404296875
50 15996.7861328125
51 14863.833984375
52 13822.1435546875
53 12862.1181640625
54 11976.345703125
55 11158.0732421875
56 10401.2666015625
57 9701.4736328125
58 9053.849609375
59 8453.8642578125
60 7897.9091796875
61 7382.33544

In [5]:
# Display the device that holds `h`; presumably this is the hidden activation
# left over from the preceding training cell (verify the kernel state).
h.device

device(type='cuda', index=0)

## PyTorch: Tensors and autograd

In [7]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so this cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() returns the Python number it holds.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

    # Another way is to operate on weight.data and weight.grad.data.
    # tensor.data gives a tensor that shares storage with tensor but does not
    # track history:
    #
    #     w1.data -= learning_rate * w1.grad.data
    #     w2.data -= learning_rate * w2.grad.data
    #
    #     # Manually zero the gradients after updating weights
    #     w1.grad.data.zero_()
    #     w2.grad.data.zero_()

0 37785780.0
1 41600804.0
2 46646140.0
3 42280308.0
4 27315388.0
5 12624326.0
6 5172827.5
7 2497890.5
8 1573831.0
9 1180943.75
10 957854.625
11 801983.125
12 681562.9375
13 584422.875
14 504459.96875
15 437884.78125
16 381993.0625
17 334631.78125
18 294334.09375
19 259883.53125
20 230248.0625
21 204628.703125
22 182380.796875
23 162997.640625
24 146072.640625
25 131233.6875
26 118166.8203125
27 106629.6015625
28 96406.96875
29 87320.90625
30 79240.046875
31 72033.0
32 65584.484375
33 59798.89453125
34 54593.35546875
35 49904.8671875
36 45677.58203125
37 41863.57421875
38 38410.46484375
39 35282.12890625
40 32445.61328125
41 29866.103515625
42 27517.169921875
43 25378.57421875
44 23426.9609375
45 21645.56640625
46 20015.396484375
47 18522.388671875
48 17155.001953125
49 15899.521484375
50 14746.091796875
51 13685.4306640625
52 12709.58984375
53 11810.05859375
54 10980.74609375
55 10218.103515625
56 9513.7529296875
57 8862.5986328125
58 8259.884765625
59 7702.37646484375
60 7186.07080078

417 0.00034064194187521935
418 0.00033303193049505353
419 0.0003240490914322436
420 0.00031662374385632575
421 0.00030901929130777717
422 0.0003009861975442618
423 0.0002940922568086535
424 0.0002871322212740779
425 0.0002806239644996822
426 0.0002745907404460013
427 0.00026818583137355745
428 0.0002623440814204514
429 0.00025641449610702693
430 0.00025040312903001904
431 0.00024576872237958014
432 0.00023991848865989596
433 0.00023485528072342277
434 0.0002296328020747751
435 0.0002245958603452891
436 0.00022045100922696292
437 0.0002152737433789298
438 0.00021108746295794845
439 0.00020633895474020392
440 0.00020248313376214355
441 0.0001986204442800954
442 0.00019413606787566096
443 0.0001899632188724354
444 0.00018664753588382155
445 0.00018222459766548127
446 0.00017861298692878336
447 0.00017503426352050155
448 0.00017202219169121236
449 0.00016880944895092398
450 0.000165249512065202
451 0.00016238566604442894
452 0.000159292874741368
453 0.00015627786342520267
454 0.00015309883

## PyTorch: Defining new autograd functions

### Decorators

In [10]:
def sample_decorator(myfunc):
    """Print a message and replace the decorated object with the integer 0.

    The body runs once, at decoration time; ``myfunc`` itself is never called.
    """
    print("I am the decorator!")
    return 0


# Applying the decorator runs sample_decorator immediately, so the name
# ``myfunc`` ends up bound to 0 rather than to a function.
@sample_decorator
def myfunc():
    return None

I am the decorator!


In [13]:
# Manual equivalent of the ``@sample_decorator`` syntax: define the function,
# then rebind its name to whatever the decorator returns.
def myfunc():
    return None

myfunc = sample_decorator(myfunc)
myfunc

I am the decorator!


0

In [18]:
def sample_decorator(myfunc):
    """Return a zero-argument stand-in that never calls ``myfunc``."""

    def inner_func():
        message = "I am the decorator!"
        return message

    return inner_func


# After decoration the name ``myfunc`` refers to ``inner_func``, so it is
# called without arguments even though the function below declares ``text``.
@sample_decorator
def myfunc(text):
    return text


print(myfunc())

I am the decorator!


In [19]:
import functools


def sample_decorator(myfunc):
    """Wrap ``myfunc`` so a message is printed before every call.

    Args:
        myfunc: The callable to wrap.

    Returns:
        A wrapper that prints "I am the decorator!", then calls ``myfunc``
        with the same positional and keyword arguments and returns its result.
    """

    # functools.wraps copies __name__, __doc__, etc. from myfunc to the wrapper.
    @functools.wraps(myfunc)
    def inner_func(*args, **kwargs):
        print("I am the decorator!")
        # Hand the wrapped function's result back to the caller.
        return myfunc(*args, **kwargs)

    return inner_func


@sample_decorator
def myfunc(text):
    print(text)


myfunc("Blabla")

I am the decorator!
Blabla


In [22]:
def A(myfunc):
    """Wrap ``myfunc`` so the "A" banner is printed before and after it runs."""

    def inner_func():
        banner = "I am the A decorator!"
        print(banner)
        myfunc()
        print(banner)

    return inner_func


def B(myfunc):
    """Wrap ``myfunc`` so the "B" banner is printed before and after it runs."""

    def inner_func():
        banner = "I am the B decorator!"
        print(banner)
        myfunc()
        print(banner)

    return inner_func


# Stacked decorators are applied bottom-up: A wraps the function first and B
# wraps A's wrapper, so B's banner is the outermost one at call time.
@B
@A
def myfunc():
    print("Hello, decorator")


myfunc()

I am the B decorator!
I am the A decorator!
Hello, decorator
I am the A decorator!
I am the B decorator!


In [23]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU implemented as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods that operate on Tensors lets autograd use this
    hand-written backward pass.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the element-wise ReLU of ``input``.

        ``ctx`` is a context object used to pass information to ``backward``;
        the input Tensor is stored on it with ``ctx.save_for_backward`` because
        the backward pass needs to know where the input was negative.
        """
        output = input.clamp(min=0)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return the
        gradient of the loss with respect to the input: a copy of
        ``grad_output`` with entries zeroed wherever the saved input was
        negative.
        """
        (saved_input,) = ctx.saved_tensors
        negative = saved_input < 0
        grad_input = grad_output.clone()
        grad_input[negative] = 0
        return grad_input


dtype = torch.float
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so this cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, outside the loop, since the alias never changes.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad() keeps the in-place
    # updates out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

0 27263964.0
1 23057172.0
2 22548934.0
3 22548540.0
4 21119038.0
5 17506728.0
6 12742477.0
7 8292836.0
8 5095510.5
9 3110481.25
10 1980137.625
11 1346215.0
12 983370.6875
13 763572.0
14 620535.875
15 520177.59375
16 445122.28125
17 386176.875
18 338169.78125
19 298110.5625
20 264194.4375
21 235089.8125
22 209909.5
23 187946.890625
24 168734.046875
25 151854.53125
26 136980.328125
27 123836.3203125
28 112182.0234375
29 101822.0078125
30 92589.125
31 84341.171875
32 76956.296875
33 70330.5234375
34 64436.72265625
35 59131.12109375
36 54335.0078125
37 49993.77734375
38 46063.15625
39 42492.02734375
40 39241.9921875
41 36279.6328125
42 33574.9609375
43 31102.609375
44 28842.34375
45 26769.837890625
46 24866.671875
47 23117.806640625
48 21508.73046875
49 20023.947265625
50 18654.439453125
51 17390.828125
52 16223.541015625
53 15144.900390625
54 14146.1787109375
55 13221.3935546875
56 12364.2041015625
57 11568.7001953125
58 10830.1953125
59 10144.0654296875
60 9505.6494140625
61 8911.9648437

469 0.00020302597840782255
470 0.0001987886062124744
471 0.00019465276272967458
472 0.0001914585445774719
473 0.0001875283633125946
474 0.0001834418944781646
475 0.00017988687613978982
476 0.0001764847111189738
477 0.00017320735787507147
478 0.00017011750605888665
479 0.00016655272338539362
480 0.00016371191304642707
481 0.00016047384997364134
482 0.0001572386536281556
483 0.00015451844956260175
484 0.00015186947712209076
485 0.00014847285638097674
486 0.00014654397091362625
487 0.00014361485955305398
488 0.00014115602243691683
489 0.00013871303235646337
490 0.00013617827789857984
491 0.00013375077105592936
492 0.00013170286547392607
493 0.0001293804671149701
494 0.00012711255112662911
495 0.0001249580382136628
496 0.00012277104542590678
497 0.0001209436304634437
498 0.00011891966278199106
499 0.00011698652087943628


## PyTorch: nn

In [24]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we use MSELoss with reduction='sum', i.e. the squared errors are summed
# rather than averaged. (reduction='sum' replaces the deprecated
# size_average=False argument.)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 692.644287109375
1 642.2822875976562
2 598.5562744140625
3 560.2967529296875
4 525.9440307617188
5 494.7234802246094
6 466.2933654785156
7 440.32208251953125
8 416.29913330078125
9 393.9095458984375
10 372.8952941894531
11 353.056884765625
12 334.2895812988281
13 316.6082458496094
14 299.8535461425781
15 283.9629821777344
16 268.8822326660156
17 254.52430725097656
18 240.86788940429688
19 227.88088989257812
20 215.50363159179688
21 203.7123565673828
22 192.49594116210938
23 181.830322265625
24 171.70201110839844
25 162.08815002441406
26 152.962646484375
27 144.36456298828125
28 136.2194061279297
29 128.4944610595703
30 121.1775131225586
31 114.26953887939453
32 107.75212860107422
33 101.60372924804688
34 95.8072738647461
35 90.35079956054688
36 85.2178955078125
37 80.38543701171875
38 75.84049987792969
39 71.56957244873047
40 67.55323791503906
41 63.782657623291016
42 60.23545455932617
43 56.899227142333984
44 53.765464782714844
45 50.81684494018555
46 48.04598617553711
47 45.4384117

437 5.3227126045385376e-05
438 5.189825969864614e-05
439 5.06094002048485e-05
440 4.934666503686458e-05
441 4.811839244212024e-05
442 4.6920958993723616e-05
443 4.575180355459452e-05
444 4.461657226784155e-05
445 4.350754898041487e-05
446 4.242845898261294e-05
447 4.137228461331688e-05
448 4.034595986013301e-05
449 3.9344955439446494e-05
450 3.836686664726585e-05
451 3.741672117030248e-05
452 3.649076825240627e-05
453 3.558565367711708e-05
454 3.4703425626503304e-05
455 3.384628507774323e-05
456 3.300928801763803e-05
457 3.219276914023794e-05
458 3.1396994017995894e-05
459 3.062021278310567e-05
460 2.9864846510463394e-05
461 2.912691888923291e-05
462 2.840801425918471e-05
463 2.770492574200034e-05
464 2.7023734219255857e-05
465 2.6358175091445446e-05
466 2.5706287487992086e-05
467 2.5073903088923544e-05
468 2.4455033781123348e-05
469 2.3852380763855763e-05
470 2.3266054995474406e-05
471 2.2692634956911206e-05
472 2.2134538085083477e-05
473 2.1589547031908296e-05
474 2.1059633581899107e