In [1]:
import numpy as np

In [2]:
# Problem dimensions shared by the network examples in this notebook.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [7]:
# Random input batch and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.clip(h, 0, None)
    y_pred = h_relu @ w2

    # Sum-of-squares loss; its derivative w.r.t. y_pred is 2.0 * (y_pred - y).
    residual = y_pred - y
    loss = np.sum(residual ** 2)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

# Report only the final iteration index and loss.
print(t, loss)

499 9.257942201724115e-09


In [6]:
# Inspect the dimensions of the first-layer weight matrix w1.
w1.shape

(1000, 100)

In [8]:
import torch
# Tensor settings reused by the PyTorch cells: 32-bit floats
# (torch.float is an alias of torch.float32) stored on the CPU.
dtype = torch.float32
device = torch.device("cpu")

In [10]:
# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets, created with the dtype/device chosen earlier.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights; gradients are derived by hand below,
# so autograd tracking is not requested.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, extracted as a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: chain rule applied manually.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 1040.202392578125
199 8.077804565429688
299 0.08927784115076065
399 0.0014256981667131186
499 0.0001143227273132652


In [3]:
import torch

# Tensor settings: 32-bit floats on the CPU.
dtype = torch.float
device = torch.device("cpu")

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; no gradients are requested for these.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Trainable weights: requires_grad=True makes loss.backward() fill in .grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, then the sum-of-squares loss.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)
    loss = torch.sum((y_pred - y) ** 2)

    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Autograd computes d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # torch.no_grad() disables gradient tracking inside the block, so the
    # in-place weight updates are not recorded in the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # backward() accumulates into .grad, so reset it for the next step.
            weight.grad.zero_()

99 218.39772033691406
199 0.5481513738632202
299 0.0027674189768731594
399 9.424641757505015e-05
499 2.139807475032285e-05


In [5]:
# Inspect the dimensions of the first-layer weight tensor w1.
w1.shape

torch.Size([1000, 100])

In [11]:
import torch

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network assembled from torch.nn modules: linear -> ReLU -> linear.
nn = torch.nn
model = nn.Sequential(nn.Linear(D_in, H), nn.ReLU(), nn.Linear(H, D_out))

# Summed (not averaged) squared error.
loss_fn = nn.MSELoss(reduction="sum")

learning_rate = 1e-4

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear gradients left over from the previous iteration before
    # computing fresh ones.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step with autograd tracking disabled.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

99 1.8552138805389404
199 0.02548915706574917
299 0.0007355919806286693
399 3.395331805222668e-05
499 2.0649106318160193e-06


In [12]:
import torch

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two-layer network: linear -> ReLU -> linear.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # No zero_grad() call here: in this variant the gradients are cleared
    # at the end of each iteration, after the weight update.
    loss.backward()

    # Manual gradient-descent step with autograd tracking disabled.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

    # Reset gradients so the next backward() starts from a clean slate.
    model.zero_grad()

99 2.622849464416504
199 0.04880301281809807
299 0.00208452926017344
399 0.00012665374379139394
499 8.980288839666173e-06


In [14]:
# Print the shape of every learnable tensor in the model, each followed by
# a blank line.
for param in model.parameters():
    print(param.shape, end="\n\n")

torch.Size([100, 1000])

torch.Size([100])

torch.Size([10, 100])

torch.Size([10])



In [15]:
# Display the input tensor x as the cell's output value.
x

tensor([[ 1.0440, -1.2483,  0.1135,  ..., -1.0168, -0.8968,  1.4866],
        [-1.4255,  1.1506, -2.8339,  ...,  1.8755,  0.7295,  0.4137],
        [ 0.8834, -0.3038,  0.5571,  ...,  0.7552,  1.1710,  0.5529],
        ...,
        [-0.2142,  0.7130, -0.3599,  ...,  1.1672,  1.4358, -0.8015],
        [ 1.3204, -0.2642,  0.1647,  ..., -0.6218,  0.9802,  2.1956],
        [ 0.6430, -0.2796, -0.3104,  ...,  1.2801, -1.2552, -1.3908]])

In [16]:
# Inspect the dimensions of the input tensor x.
x.shape

torch.Size([64, 1000])

In [17]:
# Bind a second name to the same tensor object; plain assignment copies no data.
x1 = x

In [36]:
# Reshape x1 into a single row holding all of its elements.
# The row length is inferred with -1 instead of being hard-coded as 64*1000,
# so this cell keeps working if the batch size or input dimension changes.
# view() requires a compatible (e.g. contiguous) memory layout and returns a
# tensor that shares storage with x1 rather than a copy.
x2 = x1.view(1, -1)
# Display the resulting shape as the cell's output value.
x2.shape

torch.Size([1, 64000])

In [20]:
# Display x1 as the cell's output value.
# NOTE(review): the recorded output for this cell looks like IPython `x1?`
# introspection (Type / String form / Length) rather than a plain repr — confirm.
x1

[0;31mType:[0m        Tensor
[0;31mString form:[0m
tensor([[ 1.0440, -1.2483,  0.1135,  ..., -1.0168, -0.8968,  1.4866],
           [-1.4255,  1.1506,  <...> 0.6218,  0.9802,  2.1956],
           [ 0.6430, -0.2796, -0.3104,  ...,  1.2801, -1.2552, -1.3908]])
[0;31mLength:[0m      64
[0;31mFile:[0m        ~/anaconda3/lib/python3.7/site-packages/torch/__init__.py
[0;31mDocstring:[0m   <no docstring>
