# PyTorch basic idea

Use NumPy to fit a third-order polynomial to the sine function, manually implementing the forward and backward passes through the network with NumPy operations.

In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import math

# Inputs are 2000 evenly spaced points on [-pi, pi]; targets are sin(x).
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# The powers of x are the same on every iteration, so build them once.
x_squared = x ** 2
x_cubed = x ** 3

# Draw the four polynomial coefficients (in the order a, b, c, d) from a
# standard normal distribution.
a, b, c, d = (np.random.randn() for _ in range(4))

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: y_pred = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss, reported on every 100th iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass by hand. d(loss)/d(y_pred) = 2 * residual, and the chain
    # rule multiplies it by d(y_pred)/d(coefficient) = 1, x, x^2, x^3 before
    # summing over all samples to get grad_a, grad_b, grad_c, grad_d.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # Plain gradient-descent step on each coefficient.
    a = a - learning_rate * grad_a
    b = b - learning_rate * grad_b
    c = c - learning_rate * grad_c
    d = d - learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1604.164923079127
199 1101.8504025964928
299 758.5537765862919
399 523.6647915661549
499 362.765239958335
599 252.42225960056317
699 176.6640372581014
799 124.59177852519753
899 88.75986981776904
999 64.07592140073385
1099 47.053012745478526
1199 35.30079790399919
1299 27.178779091550652
1399 21.559801727744492
1499 17.668547696346366
1599 14.97111735537083
1699 13.099452759851598
1799 11.799548884043336
1899 10.895924866376236
1999 10.267222398453962
Result: y = 0.03591065828729171 + 0.8399301748505651 x + -0.006195186110491026 x^2 + -0.09093921787443272 x^3


A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Behind the scenes, **Tensors can keep track of a computational graph and gradients**, but they’re also useful as a generic tool for scientific computing.

In [4]:
import torch
import math

dtype = torch.float
device = torch.device('cpu')

# Inputs are 2000 evenly spaced points on [-pi, pi]; targets are sin(x).
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The powers of x are the same on every iteration, so build them once.
x_squared = x ** 2
x_cubed = x ** 3

# Draw the four scalar (0-dim) coefficients, in the order a, b, c, d, from a
# standard normal distribution.
a, b, c, d = (torch.randn((), device=device, dtype=dtype) for _ in range(4))

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: y_pred = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss as a Python float, reported every 100th iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass by hand. d(loss)/d(y_pred) = 2 * residual, and the chain
    # rule multiplies it by d(y_pred)/d(coefficient) = 1, x, x^2, x^3 before
    # summing over all samples to get grad_a, grad_b, grad_c, grad_d.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # In-place gradient-descent step on each coefficient tensor.
    a.sub_(learning_rate * grad_a)
    b.sub_(learning_rate * grad_b)
    c.sub_(learning_rate * grad_c)
    d.sub_(learning_rate * grad_d)

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1665.4342041015625
199 1127.7601318359375
299 765.3524780273438
399 520.8475341796875
499 355.7271728515625
599 244.1062774658203
699 168.57350158691406
799 117.40811157226562
899 82.71179962158203
999 59.15825653076172
1099 43.151248931884766
1199 32.260841369628906
1299 24.843149185180664
1399 19.785053253173828
1499 16.33203125
1599 13.972024917602539
1699 12.357168197631836
1799 11.25096607208252
1899 10.492277145385742
1999 9.971384048461914
Result: y = 0.028203003108501434 + 0.8362511992454529 x + -0.0048654875718057156 x^2 + -0.09041590988636017 x^3


We can use **automatic differentiation** to automate the computation of backward passes in neural networks. The autograd package in PyTorch provides exactly this functionality. When using autograd, the **forward pass of your network will define a computational graph**; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients. Each Tensor represents a node in a computational graph. If `x` is a Tensor that has `x.requires_grad=True` then `x.grad` is another Tensor holding the gradient of `x` with respect to some scalar value.

In [6]:
import torch
import math

dtype = torch.float
# Prefer the GPU when one is visible; every tensor created below without an
# explicit device lands on this default device.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
torch.set_default_device(device)

# Inputs are 2000 evenly spaced points on [-pi, pi]; targets are sin(x).
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
y = torch.sin(x)

# x carries no gradient, so its powers can be built once outside the loop.
x_squared = x ** 2
x_cubed = x ** 3

# Draw the four scalar coefficients (in the order a, b, c, d) from a standard
# normal distribution; requires_grad=True asks autograd to track them.
a, b, c, d = (
    torch.randn((), dtype=dtype, requires_grad=True) for _ in range(4)
)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: y_pred = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss kept as a tensor so it can be backpropagated.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Autograd fills a.grad, b.grad, c.grad and d.grad with the gradient of
    # the loss with respect to each coefficient.
    loss.backward()

    # The update itself must not be recorded in the graph.
    with torch.no_grad():
        for weight in (a, b, c, d):
            # In-place gradient-descent step, then drop the gradient so the
            # next backward() starts from scratch instead of accumulating.
            weight.sub_(learning_rate * weight.grad)
            weight.grad = None

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 468.52008056640625
199 313.0732421875
299 210.2025604248047
399 142.12222290039062
499 97.0636215209961
599 67.23976135253906
699 47.498504638671875
799 34.430152893066406
899 25.778404235839844
999 20.050207138061523
1099 16.257349014282227
1199 13.74573040008545
1299 12.082385063171387
1399 10.98068904876709
1499 10.250911712646484
1599 9.767428398132324
1699 9.447076797485352
1799 9.234801292419434
1899 9.094106674194336
1999 9.000825881958008
Result: y = -0.0030657299794256687 + 0.8696101903915405 x + 0.0005288885440677404 x^2 + -0.09516093879938126 x^3


## torch.autograd.Function

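In this example we define our model as $y = a + bP_3(c + dx)$ instead of $y = a + bx + cx^2 + dx^3$, where $P_3(x) = \frac{1}{2}(5x^3 - 3x)$ is the Legendre polynomial of degree three. We implement $P_3$ as a custom `torch.autograd.Function` with its own forward and backward passes.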

In [None]:
import torch
from torch.autograd import Function
import math

class P3(Function):
    """Custom autograd Function computing P3(x) = 0.5 * (5 x^3 - 3 x).

    Call it through ``P3.apply(tensor)``; the derivative is supplied by hand
    in ``backward`` rather than derived by autograd.
    """

    @staticmethod
    def forward(ctx, input):
        """Evaluate P3 element-wise and stash ``input`` for the backward pass."""
        ctx.save_for_backward(input)
        cubed = input ** 3
        return 0.5 * (5 * cubed - 3 * input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to ``input``.

        ``grad_output`` is the gradient of the loss with respect to the
        forward output; it is multiplied by P3'(x) = 1.5 * (5 x^2 - 1).
        """
        (input,) = ctx.saved_tensors
        scaled_upstream = grad_output * 1.5
        return scaled_upstream * (5 * input ** 2 - 1)

dtype = torch.float
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)

# Inputs are 2000 evenly spaced points on [-pi, pi]; targets are sin(x).
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
y = torch.sin(x)

# Model: y = a + b * P3(c + d * x).
# Unlike the plain cubic fit, this model is not linear in its weights: c and
# d sit inside a cubic. Standard-normal draws can make |c + d * x| large
# enough that the gradients blow up and fixed-step descent diverges, so the
# weights start from fixed values not too far from the solution (the same
# values the official PyTorch tutorial uses for this example).
a = torch.full((), 0.0, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its .apply method. The alias does not
# change between iterations, so it is looked up once.
p3_fn = P3.apply

# Step size from the official tutorial for this model and initialisation.
learning_rate = 5e-6
for t in range(2000):
    # Forward pass: y_pred = a + b * P3(c + d * x), with P3 evaluated by the
    # custom autograd Function.
    y_pred = a + b * p3_fn(c + d * x)

    # Sum-of-squares loss, reported on every 100th iteration.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. After this call a.grad,
    # b.grad, c.grad and d.grad hold the gradient of the loss with respect to
    # a, b, c, d; the step through P3 uses P3.backward.
    loss.backward()

    # The update itself must not be recorded in the graph.
    with torch.no_grad():
        # Gradient-descent step on each weight.
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Drop the gradients so the next backward() does not accumulate.
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

# Report the fitted weights in the form of the model that was actually
# trained (not the plain cubic used by the earlier cells).
print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

## torch.nn

In [8]:
import torch
from torch.autograd import Function
import math


dtype = torch.float
# Prefer the GPU when one is visible; tensors and modules created below
# without an explicit device land on this default device.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
torch.set_default_device(device)

# Inputs are 2000 evenly spaced points on [-pi, pi]; targets are sin(x).
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
y = torch.sin(x)

# The output y is a linear function of (x, x^2, x^3), so the fit can be
# expressed as a single linear layer applied to those three features.
# x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,), so broadcasting
# yields a feature tensor of shape (2000, 3).
p = torch.tensor([1,2,3])
xx = x.unsqueeze(-1) ** p

# Linear(3, 1) maps each feature row to one prediction (shape (2000, 1));
# Flatten(0, 1) collapses that to shape (2000,) so it lines up with y.
linear_stage = torch.nn.Linear(3, 1)
flatten_stage = torch.nn.Flatten(0, 1)
model = torch.nn.Sequential(linear_stage, flatten_stage)

# Sum (not mean) of squared errors.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: the module computes bias + w1 x + w2 x^2 + w3 x^3.
    y_pred = model(xx)

    # Compute the loss and report it on every 100th iteration.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear gradients left over from the previous iteration.
    model.zero_grad()

    # Autograd fills param.grad for every learnable parameter of the model
    # (the linear layer's weight and bias).
    loss.backward()

    # Gradient-descent step; the update must not be recorded in the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

# The first stage of the Sequential is the linear layer.
linear_layer = model[0]

# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 1241.99853515625
199 825.1299438476562
299 549.2207641601562
399 366.595458984375
499 245.7069091796875
599 165.67897033691406
699 112.69672393798828
799 77.61705017089844
899 54.38859176635742
999 39.006038665771484
1099 28.818328857421875
1199 22.070383071899414
1299 17.600292205810547
1399 14.638751983642578
1499 12.676429748535156
1599 11.376008033752441
1699 10.514081954956055
1799 9.942709922790527
1899 9.563872337341309
1999 9.312650680541992
Result: y = 0.005428871139883995 + 0.8356849551200867 x + -0.0009365704609081149 x^2 + -0.09033537656068802 x^3
