In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Network dimensions: N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed NumPy's global generator so that a fresh "Restart & Run All" draws the
# same random numbers every time (this also fixes any later draws made from
# the global generator).
SEED = 0
np.random.seed(SEED)

# Random input and target data sampled from a standard normal distribution:
# x has shape (N, D_in) and y has shape (N, D_out).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [2]:
# Draw the initial weights from a standard normal distribution:
# w1 maps input -> hidden (D_in x H), w2 maps hidden -> output (H x D_out).
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)

In [3]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass. np.dot on 2-D arrays is a matrix product:
    # [64, 1000] . [1000, 100] = [64, 100]
    h = np.dot(x, w1)
    # ReLU: clamp negative pre-activations to zero.
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, printed on every iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand to obtain the gradient of
    # the loss with respect to w2 and w1.
    grad_y_pred = 2.0 * residual
    # .T is the matrix transpose; the product has the same shape as w2.
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied in place to both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 40138530.3439616
1 53657452.213567
2 77339950.16397685
3 81123974.6650319
4 45725237.57341118
5 11758108.560189066
6 3008434.9710800187
7 1737441.1592201483
8 1376725.2357491697
9 1148221.9445042722
10 971253.7414247927
11 829316.9918905043
12 713885.4979469918
13 619195.7758853682
14 540439.4956144913
15 474342.44863551005
16 418433.9485186151
17 370891.861795267
18 330129.9465293534
19 294978.29087434127
20 264522.9643182534
21 237951.1877029059
22 214714.07614229625
23 194311.41152385168
24 176295.47944063527
25 160325.59315805172
26 146118.95287209016
27 133431.4105321026
28 122082.17964651031
29 111907.77383518656
30 102750.92874944859
31 94485.13013136383
32 87008.19886650605
33 80230.67640870268
34 74082.60586070965
35 68491.85915500319
36 63393.990646901395
37 58737.08512998747
38 54477.98233254285
39 50574.71338769396
40 46992.76456903558
41 43701.68103172023
42 40674.575883119396
43 37886.52012811681
44 35317.23791955193
45 32946.09145317591
46 30755.360633624285
47 28728.4

383 0.005472601382467706
384 0.0052557622309191325
385 0.00504768140243694
386 0.004847828977205612
387 0.004655958731592181
388 0.004471762301490717
389 0.004295003825586226
390 0.004125293352061173
391 0.003962268643506404
392 0.003805767615384509
393 0.0036555199713153696
394 0.0035112328446222432
395 0.003372657516676073
396 0.003239673044283369
397 0.0031119202154453103
398 0.002989256320882244
399 0.002871461847353352
400 0.0027583846186869185
401 0.002649768895742302
402 0.002545465635768211
403 0.0024453144077092393
404 0.0023491796861800874
405 0.002256810699168381
406 0.002168098436258291
407 0.0020829083086695054
408 0.0020011182966709214
409 0.00192254531725721
410 0.0018470815440780066
411 0.001774602308166609
412 0.0017049927592726612
413 0.001638147816865468
414 0.0015739349235955782
415 0.001512257006787831
416 0.001453017821535692
417 0.0013961361604261622
418 0.0013414862807143383
419 0.0012889932133490858
420 0.0012385865215956194
421 0.0011901692882571335
422 0.0011

In [1]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed PyTorch's random number generator so that a fresh "Restart & Run All"
# draws the same random tensors every time.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [2]:
# Weight Tensors, initialised from a standard normal distribution.
# requires_grad=True tells autograd to compute gradients with respect to
# these Tensors during the backward pass.
weight_opts = dict(device=device, dtype=dtype, requires_grad=True)
w1 = torch.randn(D_in, H, **weight_opts)
w2 = torch.randn(H, D_out, **weight_opts)

In [3]:
learning_rate = 1e-6
for t in range(2):
    # Forward pass: compute predicted y using operations on Tensors. The
    # backward pass is not implemented by hand; autograd records these
    # operations and differentiates through them.
    # The hidden pre-activation is computed once and reused for both the
    # shape print and the prediction, so the matrix product is not repeated.
    h = x.mm(w1)
    print(h.shape)
    y_pred = h.clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor;
    # loss.item() returns the value it holds as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # update in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad rather than overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()

torch.Size([64, 100])
0 29408370.0
torch.Size([64, 100])
1 23900436.0
