# Tensors

### Warm-up: numpy

In [1]:
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, also standard-normal initialized
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported once per step
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the loss back to w1
    grad_y_pred = (y_pred - y) * 2.0
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    # ReLU blocks the gradient wherever its input was negative
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied in place to both weight matrices
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30013392.058496468
1 28082361.084647972
2 30175607.685382314
3 31380406.525137167
4 28269767.87991444
5 20586518.895238332
6 12339558.844099157
7 6519588.328472834
8 3423615.902798177
9 1956650.9131299644
10 1268377.832901427
11 918644.5931144
12 717530.3257053932
13 585575.3076905775
14 489894.01180568384
15 415863.27894756745
16 356375.1734263998
17 307566.58953292976
18 266883.1185833
19 232631.86663543008
20 203587.89191340652
21 178855.03647419813
22 157696.96248821352
23 139512.8817015809
24 123797.95611544125
25 110144.05731891372
26 98253.849183934
27 87874.23770969531
28 78807.13333444041
29 70839.8636459379
30 63813.067639174245
31 57595.59711011514
32 52083.901534011944
33 47185.704246292284
34 42816.83705564499
35 38917.4352459254
36 35428.72397734082
37 32302.22920257105
38 29492.305134854563
39 26962.895310571592
40 24683.4478877051
41 22626.25175534031
42 20766.602072974005
43 19081.58710536235
44 17553.064105841462
45 16164.885788328636
46 14901.957529460728
47 13751.

445 0.0003423890852842839
446 0.00032941850181181736
447 0.00031693202864196816
448 0.00030491937580226784
449 0.0002933630095353592
450 0.00028224641505925313
451 0.0002715527318990977
452 0.00026126434128777754
453 0.0002513667756443877
454 0.0002418455960057307
455 0.00023268714337159766
456 0.00022387491321368034
457 0.00021539730785255048
458 0.0002072421749945621
459 0.00019939715013169056
460 0.00019184887763990297
461 0.0001845871231421828
462 0.00017760160511807812
463 0.0001708816261729252
464 0.00016441587326119478
465 0.00015819556027916754
466 0.0001522140138745387
467 0.00014645629054726625
468 0.0001409170003418993
469 0.00013558729605725814
470 0.00013045961190737745
471 0.00012552673703128907
472 0.00012078093342971662
473 0.0001162146945413291
474 0.00011182151225465779
475 0.00010759485283915948
476 0.00010352841315856047
477 9.961582976981758e-05
478 9.585146691583213e-05
479 9.22296773728022e-05
480 8.874594002231128e-05
481 8.539327190176354e-05
482 8.216749652919

### PyTorch: Tensors

In [2]:
import torch

In [3]:
# Floating-point dtype and compute device used by the tensor-creating cells below.
dtype = torch.float
# Prefer the first CUDA GPU, but fall back to the CPU when CUDA is not
# available so the notebook still runs end-to-end on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets on the configured device/dtype
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices of the two linear layers, randomly initialized
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, pulled out as a Python number for printing
    loss = torch.sum((y_pred - y).pow(2)).item()
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the loss back to w1
    grad_y_pred = (y_pred - y) * 2.0
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    # ReLU blocks the gradient wherever its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place to both weight matrices
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

0 44589148.0
1 47807200.0
2 49765508.0
3 40272152.0
4 23363580.0
5 10320578.0
6 4501752.0
7 2412513.0
8 1617921.25
9 1234532.75
10 998351.5
11 828756.8125
12 697726.4375
13 592748.8125
14 507170.03125
15 436521.75
16 377777.9375
17 328586.125
18 287067.0
19 251824.375
20 221734.65625
21 195958.65625
22 173825.125
23 154670.125
24 138015.125
25 123487.328125
26 110774.25
27 99620.515625
28 89790.421875
29 81095.03125
30 73381.453125
31 66522.953125
32 60417.1953125
33 54961.51171875
34 50074.84375
35 45692.1328125
36 41756.3203125
37 38209.6640625
38 35009.83984375
39 32115.60546875
40 29494.083984375
41 27116.5078125
42 24956.41015625
43 22992.58203125
44 21204.10546875
45 19571.974609375
46 18081.5546875
47 16719.9453125
48 15475.1083984375
49 14333.85546875
50 13287.1005859375
51 12325.40234375
52 11441.0546875
53 10627.19140625
54 9879.6708984375
55 9191.107421875
56 8555.947265625
57 7969.0693359375
58 7426.41943359375
59 6924.2763671875
60 6459.72802734375
61 6029.2783203125
62 56

378 0.007714779116213322
379 0.007476554252207279
380 0.007249271962791681
381 0.007023101672530174
382 0.006811558268964291
383 0.006601671688258648
384 0.006403641775250435
385 0.006207531318068504
386 0.006022755056619644
387 0.005839487537741661
388 0.005664903204888105
389 0.005498013459146023
390 0.00533012580126524
391 0.005170874763280153
392 0.005016004201024771
393 0.0048674060963094234
394 0.004725169390439987
395 0.00458301417529583
396 0.00445015262812376
397 0.004320953972637653
398 0.004194264765828848
399 0.0040729050524532795
400 0.003954038023948669
401 0.0038399153854697943
402 0.003731539472937584
403 0.0036242089699953794
404 0.003518252167850733
405 0.003420583438128233
406 0.0033249221742153168
407 0.0032292751129716635
408 0.0031374837271869183
409 0.003050568513572216
410 0.0029638344421982765
411 0.0028828009963035583
412 0.002802050905302167
413 0.0027244677767157555
414 0.0026496199425309896
415 0.0025785937905311584
416 0.0025083902291953564
417 0.002439177

# Autograd

### PyTorch: Tensors and autograd

In [5]:
# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is not requested for these Tensors, so
# the backward pass does not compute gradients with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same linear -> ReLU -> linear computation as the
    # hand-written version. No intermediate values are kept in variables
    # because the backward pass is not implemented by hand here.
    y_pred = torch.mm(torch.mm(x, w1).clamp(min=0), w2)

    # Sum-of-squared-errors loss. It is kept as a Tensor so backward() can be
    # called on it; loss.item() extracts the Python number for printing.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manual gradient-descent step. It runs under torch.no_grad() because the
    # weights have requires_grad=True and the update itself should not be
    # tracked by autograd. torch.optim.SGD is an alternative to doing this
    # by hand.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the stored gradients to zero once the weights are updated
        w1.grad.zero_()
        w2.grad.zero_()
    

    

0 37756080.0
1 34870064.0
2 33191706.0
3 27608946.0
4 19055258.0
5 11006836.0
6 5893689.5
7 3238826.75
8 1974515.0
9 1350509.625
10 1011164.25
11 801721.125
12 657418.875
13 549820.75
14 465576.9375
15 397644.9375
16 341900.84375
17 295670.5
18 256982.90625
19 224442.03125
20 196879.09375
21 173366.4375
22 153220.546875
23 135894.78125
24 120896.515625
25 107907.421875
26 96583.125
27 86666.96875
28 77949.9921875
29 70264.4296875
30 63476.421875
31 57459.2578125
32 52108.01953125
33 47336.8203125
34 43071.66015625
35 39250.140625
36 35818.68359375
37 32736.82421875
38 29958.4765625
39 27449.142578125
40 25179.26953125
41 23122.8828125
42 21256.443359375
43 19560.3125
44 18017.5390625
45 16612.57421875
46 15330.365234375
47 14158.8515625
48 13086.9755859375
49 12105.5166015625
50 11205.7109375
51 10379.62109375
52 9620.791015625
53 8923.1552734375
54 8281.1044921875
55 7689.6220703125
56 7144.40185546875
57 6642.052734375
58 6178.2744140625
59 5750.4013671875
60 5355.185546875
61 4989.4

370 0.00045813460019417107
371 0.0004429127147886902
372 0.0004281164729036391
373 0.00041403205250389874
374 0.0004011091950815171
375 0.00038778208545409143
376 0.00037545565282925963
377 0.0003643475938588381
378 0.0003537402953952551
379 0.0003423291491344571
380 0.00033113491372205317
381 0.0003216144978068769
382 0.00031224158010445535
383 0.00030354454065673053
384 0.0002942767459899187
385 0.00028550135903060436
386 0.0002770902938209474
387 0.0002684467181097716
388 0.0002612288808450103
389 0.0002542302827350795
390 0.0002474670473020524
391 0.00024100422160699964
392 0.00023362331558018923
393 0.00022733022342436016
394 0.00022183341206982732
395 0.0002157048147637397
396 0.0002098895492963493
397 0.00020375066378619522
398 0.0001987303839996457
399 0.00019374824478290975
400 0.00018871526117436588
401 0.0001837574236560613
402 0.00017933835624717176
403 0.0001748178619891405
404 0.00017043280240613967
405 0.00016624483396299183
406 0.00016263780707959086
407 0.0001582356344

### PyTorch: Defining new autograd functions

We can easily define our own autograd operator by subclassing `torch.autograd.Function` and implementing its `forward` and `backward` methods.

We can then use our new autograd operator by calling its `apply` method like a function, passing Tensors containing input data.

In [6]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclasses torch.autograd.Function and supplies static forward and
    backward methods, both of which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return input with every element clamped to a minimum of 0.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it with ctx.save_for_backward so that backward can read it
        back from ctx.saved_tensors.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output.
        It is passed through unchanged, except that entries where the saved
        input was negative are set to 0. grad_output itself is not modified.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)
    
    

# Floating-point dtype and compute device for this example.
dtype = torch.float
device = torch.device("cpu")
# To run on GPU, use torch.device("cuda:0") instead.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True so that
# loss.backward() populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, outside the loop, since the alias does not change per iteration.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; wrapped in torch.no_grad() so
    # the in-place updates are not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 31893116.0
1 30850552.0
2 31210794.0
3 28917272.0
4 22566736.0
5 14725250.0
6 8380022.5
7 4590935.0
8 2640690.5
9 1684833.75
10 1195375.5
11 920590.25
12 746875.1875
13 625070.5
14 532750.0625
15 459374.28125
16 399199.625
17 348871.125
18 306291.03125
19 269963.375
20 238751.4375
21 211869.21875
22 188596.8125
23 168307.75
24 150574.0625
25 135023.375
26 121339.59375
27 109271.53125
28 98592.2421875
29 89117.78125
30 80699.4453125
31 73190.8125
32 66469.328125
33 60448.65625
34 55048.828125
35 50212.0859375
36 45854.109375
37 41921.9609375
38 38367.5703125
39 35149.7421875
40 32234.0546875
41 29586.88671875
42 27180.771484375
43 24992.5546875
44 23000.40234375
45 21181.8125
46 19521.296875
47 18003.994140625
48 16618.283203125
49 15352.4150390625
50 14191.7734375
51 13127.263671875
52 12149.857421875
53 11251.5361328125
54 10425.51171875
55 9665.4482421875
56 8965.74609375
57 8320.9658203125
58 7726.2685546875
59 7177.3974609375
60 6670.3525390625
61 6201.798828125
62 5768.602050781

373 0.0014732151757925749
374 0.001426792936399579
375 0.0013806510251015425
376 0.0013358094729483128
377 0.0012940808665007353
378 0.0012534471461549401
379 0.0012146964436396956
380 0.001175496494397521
381 0.0011389683932065964
382 0.0011049802415072918
383 0.0010698401601985097
384 0.001039945986121893
385 0.001008006394840777
386 0.0009782497072592378
387 0.0009487936622463167
388 0.0009208705741912127
389 0.0008937305537983775
390 0.000867289665620774
391 0.0008436997304670513
392 0.0008197706192731857
393 0.0007957119960337877
394 0.000771719787735492
395 0.0007523054373450577
396 0.0007307795458473265
397 0.0007112104212865233
398 0.0006905628251843154
399 0.0006707995780743659
400 0.0006530766258947551
401 0.0006365948356688023
402 0.0006186420214362442
403 0.000601686944719404
404 0.0005861246027052402
405 0.0005714514991268516
406 0.0005577856209129095
407 0.0005417502834461629
408 0.0005292423302307725
409 0.0005151266814209521
410 0.0005013172049075365
411 0.0004889404517

# nn module