In [1]:
# numpy example
import numpy as np

In [2]:
# Network dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target data drawn from a standard normal distribution
x = np.random.standard_normal((N, D_in))
y = np.random.standard_normal((N, D_out))

# Randomly initialized weights for the two layers
w1 = np.random.standard_normal((D_in, H))
w2 = np.random.standard_normal((H, D_out))

# The layers are chained with plain matrix (dot) products, so the shapes go
# (64 x 1000) . (1000 x 100) . (100 x 10) -> (64 x 10)

In [3]:
# Step size for the gradient-descent weight updates
learning_rate = 1e-6

In [4]:
# Train the two-layer network with plain gradient descent, deriving every
# gradient by hand.
for t in range(500):
    # Forward pass: hidden pre-activation -> ReLU -> predicted y
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss is the sum of squared errors; report it on every iteration
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: apply the chain rule from the loss back to w1 and w2
    grad_y_pred = (y_pred - y) * 2.0  # d(loss)/d(y_pred) = 2 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 25220473.01207943
1 20436712.20574233
2 17887771.120564952
3 15674688.151887111
4 13223201.728475519
5 10466220.309567917
6 7828674.089663807
7 5575803.938385295
8 3881481.758839852
9 2682272.704659152
10 1879535.213058867
11 1347751.956711391
12 997441.0177290025
13 762349.5634726391
14 601017.2020980911
15 486557.6044887745
16 402795.84842065966
17 339363.81582998135
18 289841.2565564209
19 250154.07182404073
20 217678.44840819653
21 190634.22009661092
22 167783.3212868447
23 148315.73686835173
24 131567.69622105407
25 117060.92598512763
26 104419.59275807123
27 93372.83208870832
28 83669.79297062446
29 75118.11549764522
30 67556.7993441596
31 60851.67584236468
32 54895.0003024038
33 49597.2277741266
34 44872.15921098893
35 40650.10599673036
36 36888.9631313586
37 33533.07331042309
38 30517.899778858282
39 27802.860087475372
40 25353.7358148751
41 23143.108584431728
42 21143.981892249772
43 19339.281894549866
44 17704.55592392879
45 16220.114983228199
46 14870.813823219582
47 13643

391 1.930677548571153e-05
392 1.8363217280887622e-05
393 1.746584494658353e-05
394 1.6612666009273227e-05
395 1.5801135408149313e-05
396 1.502962555052009e-05
397 1.429578539675676e-05
398 1.359812171725985e-05
399 1.2934431800743253e-05
400 1.2303502565916743e-05
401 1.1703265867888594e-05
402 1.1132669605265784e-05
403 1.0589744381321495e-05
404 1.0073632747514874e-05
405 9.58254362076861e-06
406 9.115765183162233e-06
407 8.671537608590919e-06
408 8.249313493974419e-06
409 7.847486111578851e-06
410 7.465534092407711e-06
411 7.102005462544298e-06
412 6.756437955772525e-06
413 6.427557760467173e-06
414 6.114950500407685e-06
415 5.817432070611437e-06
416 5.5345738289676825e-06
417 5.265379607852174e-06
418 5.009462012712021e-06
419 4.765895430068454e-06
420 4.534320455531505e-06
421 4.3139370404577315e-06
422 4.1043980944614975e-06
423 3.9049753464709385e-06
424 3.7153551465814417e-06
425 3.5348897843013273e-06
426 3.3633097704198193e-06
427 3.200001813310762e-06
428 3.044709248975021e-

In [9]:
import torch

# Run on the CPU with 32-bit floats
device = torch.device('cpu')
dtype = torch.float
tensor_opts = dict(device=device, dtype=dtype)

# Batch size, input dimension, hidden dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data (no gradients needed for these)
x = torch.randn(N, D_in, **tensor_opts)
y = torch.randn(N, D_out, **tensor_opts)

# Random weight Tensors. requires_grad=True tells autograd to compute
# gradients with respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, requires_grad=True, **tensor_opts)
w2 = torch.randn(H, D_out, requires_grad=True, **tensor_opts)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear
    hidden = torch.mm(x, w1)
    y_pred = hidden.clamp(min=0).mm(w2)

    # Sum-of-squares loss; report it every 100th iteration
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Autograd fills w1.grad and w2.grad
    loss.backward()

    # Update outside of autograd tracking, then reset each gradient so it
    # does not accumulate into the next iteration.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

99 1468.939208984375
199 14.87568187713623
299 0.22394025325775146
399 0.004379940684884787
499 0.0002612062089610845


In [13]:
# PyTorch: defining new autograd functions
# PyTorch provides each autograd operator by defining a subclass of
# torch.autograd.Function and implementing the forward and backward functions.

# It is possible to implement custom autograd functions by subclassing
# torch.autograd.Function and implementing the forward and backward
# passes, which operate on Tensors.
import torch

class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function with an explicit backward."""

    @staticmethod
    def forward(ctx, input):
        """Receive the input Tensor and return it clamped below at zero.

        ctx is a context object used to stash information for the backward
        computation; the input itself is saved on it here.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return grad_output with entries zeroed where the saved input was negative."""
        (saved_input,) = ctx.saved_tensors
        # masked_fill is out-of-place, so grad_output itself is left untouched
        return grad_output.masked_fill(saved_input < 0, 0)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True makes autograd
# track them so that loss.backward() fills in w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# To apply our Function, we use the Function.apply method. The alias 'relu'
# does not change between iterations, so bind it once before the loop.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute the sum-of-squares loss and print it every 100th iteration
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad keeps the in-place
    # updates out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights so they do not
        # accumulate into the next iteration's backward pass.
        w1.grad.zero_()
        w2.grad.zero_()

99 480.1095275878906
199 3.761472225189209
299 0.04835880547761917
399 0.001113116624765098
499 0.00011630377412075177
