Use `numpy` to fit a two-layer network to random data.

In [1]:
import numpy as np

# Train a two-layer fully connected network (linear -> ReLU -> linear) on
# random data with plain NumPy, using hand-written backpropagation and
# full-batch gradient descent on the summed squared error.

# Fixed seed so that "Restart & Run All" reproduces the same inputs, initial
# weights and loss trajectory. A private RandomState instance is used so the
# global NumPy random state is left untouched.
SEED = 0
rng = np.random.RandomState(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
x = rng.randn(N, D_in)
y = rng.randn(N, D_out)

# Randomly initialize weights
w1 = rng.randn(D_in, H)
w2 = rng.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)              # (N, H) hidden pre-activations
    h_relu = np.maximum(h, 0)  # ReLU: negative entries become 0
    y_pred = h_relu.dot(w2)    # (N, D_out) predictions

    # Compute and print loss (sum of squared errors over the whole batch)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)      # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)   # (H, D_out)
    grad_h_relu = grad_y_pred.dot(w2.T)   # (N, H)
    # ReLU passes no gradient where its input was negative; work on a copy so
    # grad_h_relu itself is left unmodified.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)             # (D_in, H)

    # Update weights in place with one gradient-descent step
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29038550.1753679
1 26108901.578410245
2 27945973.078932367
3 29610046.80581097
4 27739340.486246303
5 21129319.874644823
6 13244182.145515442
7 7162028.128281277
8 3742036.689425596
9 2066742.665822919
10 1280845.0362067227
11 892184.5079821945
12 679664.9481404878
13 547679.4433304786
14 456039.8438976203
15 387103.12687274284
16 332554.728017541
17 287990.7483977186
18 250900.27109678462
19 219546.23022636372
20 192975.78830475538
21 170293.19726253377
22 150786.7470247831
23 133966.44893279634
24 119401.54650530918
25 106715.18798528195
26 95614.73856976826
27 85874.05986915628
28 77313.37777808993
29 69748.87090827306
30 63048.52590241222
31 57099.77637148828
32 51807.7254859999
33 47083.42260099482
34 42858.92609791188
35 39075.378215144934
36 35678.353262411125
37 32621.360970137124
38 29866.7699546441
39 27382.043888008622
40 25131.7075428451
41 23092.744646989926
42 21243.065721999534
43 19561.95665171762
44 18031.543998199224
45 16637.085822777524
46 15364.474801709865
47 14

403 0.00020391327282685416
404 0.00019495696369420536
405 0.00018639687356134833
406 0.00017821573252313494
407 0.00017039960982877952
408 0.0001629249988440395
409 0.00015578168874461133
410 0.00014895160758502632
411 0.00014242254533060106
412 0.00013618827850111047
413 0.00013022258479015184
414 0.00012451964821278593
415 0.00011906846078375946
416 0.00011385814647735766
417 0.0001088765010853121
418 0.00010411542500471727
419 9.956247880673735e-05
420 9.520949570279992e-05
421 9.104759511882176e-05
422 8.706954883915212e-05
423 8.326601462555519e-05
424 7.962991274531736e-05
425 7.615409306194788e-05
426 7.282969187697187e-05
427 6.965392477990823e-05
428 6.661787974427877e-05
429 6.371238615284069e-05
430 6.0934365302188865e-05
431 5.8278310324029914e-05
432 5.573867643694269e-05
433 5.331078122486084e-05
434 5.09894003082828e-05
435 4.8769156291232885e-05
436 4.664671292758125e-05
437 4.461687909786702e-05
438 4.267602064727435e-05
439 4.081976020568788e-05
440 3.904457811661124e

Use PyTorch Tensors to fit a two-layer network to random data.

In [3]:
import torch

# Train a two-layer fully connected network (linear -> ReLU -> linear) on
# random data using PyTorch tensors only: the backward pass is written by
# hand (no autograd) and the weights are updated by full-batch gradient
# descent on the summed squared error.

# Fixed seed so that "Restart & Run All" reproduces the same inputs, initial
# weights and loss trajectory.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
device = torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)              # (N, H) hidden pre-activations
    h_relu = h.clamp(min=0)   # ReLU: negative entries become 0
    y_pred = h_relu.mm(w2)    # (N, D_out) predictions

    # Compute and print loss; .item() converts the scalar tensor to a
    # Python float
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)       # d(loss)/d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)   # (H, D_out)
    grad_h_relu = grad_y_pred.mm(w2.t())   # (N, H)
    # ReLU passes no gradient where its input was negative; work on a clone
    # so grad_h_relu itself is left unmodified.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)             # (D_in, H)

    # Update weights in place using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31529772.0
1 26659518.0
2 26014494.0
3 25089752.0
4 21886846.0
5 16386723.0
6 10683678.0
7 6330129.5
8 3669816.0
9 2206883.25
10 1433644.25
11 1013260.625
12 769806.4375
13 616064.75
14 510165.15625
15 431782.5625
16 370587.71875
17 321068.0
18 280107.46875
19 245721.796875
20 216530.75
21 191555.34375
22 170068.0625
23 151462.296875
24 135288.265625
25 121187.140625
26 108834.0703125
27 97969.2734375
28 88384.9140625
29 79916.1171875
30 72405.1171875
31 65719.40625
32 59750.2421875
33 54415.3046875
34 49635.1015625
35 45338.77734375
36 41475.65234375
37 37994.203125
38 34848.55078125
39 31997.224609375
40 29411.65234375
41 27063.408203125
42 24927.4296875
43 22982.345703125
44 21208.615234375
45 19588.8828125
46 18107.990234375
47 16753.498046875
48 15514.2841796875
49 14377.4501953125
50 13333.267578125
51 12372.869140625
52 11489.078125
53 10675.5537109375
54 9925.22265625
55 9232.9765625
56 8593.775390625
57 8003.05029296875
58 7458.1005859375
59 6955.279296875
60 6489.232421875


439 0.00015820386761333793
440 0.0001549387670820579
441 0.00015162811905611306
442 0.0001478969061281532
443 0.00014562200522050261
444 0.00014257254952099174
445 0.00013960563228465617
446 0.0001369660021737218
447 0.00013378878065850586
448 0.00013151956954970956
449 0.0001288310158997774
450 0.0001261072320630774
451 0.00012394502118695527
452 0.00012186989624751732
453 0.00011963321594521403
454 0.00011692140105878934
455 0.00011473918857518584
456 0.00011264577187830582
457 0.00011054221977246925
458 0.00010866982484003529
459 0.00010647094313753769
460 0.00010442036727908999
461 0.00010246472083963454
462 0.00010072382428916171
463 9.901619341690093e-05
464 9.684445103630424e-05
465 9.518473962089047e-05
466 9.323842823505402e-05
467 9.21283572097309e-05
468 9.051179949892685e-05
469 8.911071927286685e-05
470 8.773493027547374e-05
471 8.61046282807365e-05
472 8.485247963108122e-05
473 8.332564175361767e-05
474 8.174621325451881e-05
475 8.062287815846503e-05
476 7.953577005537227