# Warm-up: NumPy example

In [1]:
import numpy as np
import math

# Training data: 2000 evenly spaced points on [-pi, pi] and sin() at each point.
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Polynomial coefficients, each drawn from a standard normal distribution
# (drawn in the order a, b, c, d).
a, b, c, d = (np.random.randn() for _ in range(4))

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the cubic y = a + b x + c x^2 + d x^3.
    y_pred = a + b*x + c*x**2 + d*x**3

    # Sum-of-squares (L2) loss over all samples; report it every 100th step.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass by hand: dloss/dy_pred = 2 * residual, then the chain rule
    # through each term of the polynomial.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x**2).sum()
    grad_d = (grad_y_pred * x**3).sum()

    # Gradient descent step on all four coefficients.
    a, b, c, d = (
        a - learning_rate * grad_a,
        b - learning_rate * grad_b,
        c - learning_rate * grad_c,
        d - learning_rate * grad_d,
    )

print(f"Result: y = {a} + {b} x + {c} x^2 + {d} x^3")

99 2141.4946036875945
199 1517.241285263613
299 1075.7480688553105
399 763.4969389836564
499 542.6457638271887
599 386.4353050695364
699 275.942436643732
799 197.7848311238322
899 142.49818213773324
999 103.38883790059057
1099 75.72252300144964
1199 56.15067068407265
1299 42.304753233016726
1399 32.50939909289728
1499 25.579505231582402
1599 20.676746319583483
1699 17.208088035821678
1799 14.754006071217585
1899 13.017714130757355
1999 11.789250867090503
Result: y = -0.05764280966610518 + 0.8592104398627766 x + 0.00994434384789622 x^2 + -0.09368166767839492 x^3


# PyTorch: Tensor, Manual approach to backpropagation

In [2]:
import torch
import math

dtype = torch.float
device = torch.device("cpu")

# Training data: 2000 evenly spaced points on [-pi, pi] and sin() at each point.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Four 0-dim coefficient tensors drawn from a standard normal distribution
# (drawn in the order a, b, c, d).
a, b, c, d = (torch.randn((), device=device, dtype=dtype) for _ in range(4))

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the cubic at every sample.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Sum-of-squares loss, converted to a Python float; report every 100th step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass by hand: dloss/dy_pred = 2 * residual, then the chain rule
    # through each term of the polynomial.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # In-place gradient descent step on each coefficient.
    a.sub_(learning_rate * grad_a)
    b.sub_(learning_rate * grad_b)
    c.sub_(learning_rate * grad_c)
    d.sub_(learning_rate * grad_d)

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 911.1416625976562
199 613.6539916992188
299 414.5614013671875
399 281.2171630859375
499 191.84410095214844
599 131.8975830078125
699 91.65753173828125
799 64.6238784790039
899 46.44725036621094
999 34.21502685546875
1099 25.97598648071289
1199 20.42135238647461
1299 16.672897338867188
1399 14.140918731689453
1499 12.428922653198242
1599 11.27011489868164
1699 10.484951972961426
1799 9.952383041381836
1899 9.590747833251953
1999 9.344919204711914
Result: y = -0.016647376120090485 + 0.8404631614685059 x + 0.002871948992833495 x^2 + -0.0910150334239006 x^3


# PyTorch: Tensor with `Autograd`

In [3]:
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Training data. These tensors keep the default requires_grad=False, so no
# gradients are computed with respect to them during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Coefficients of the cubic y = a + b x + c x^2 + d x^3, drawn from a standard
# normal distribution. requires_grad=True asks autograd to compute gradients
# with respect to each of them.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass, built from ordinary tensor operations.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Sum-of-squares loss. .sum() with no dim returns a 0-dim tensor
    # (shape ()); loss.item() extracts the Python scalar it holds.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass via autograd: afterwards a.grad, b.grad, c.grad and d.grad
    # hold the gradient of the loss with respect to each coefficient.
    loss.backward()

    # Gradient descent step. It runs under torch.no_grad() because the
    # coefficients have requires_grad=True and the update itself must not be
    # tracked by autograd.
    with torch.no_grad():
        for weight in (a, b, c, d):
            weight.sub_(learning_rate * weight.grad)

        # Drop the gradients so the next backward() starts from scratch.
        for weight in (a, b, c, d):
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 3311.67333984375
199 2235.150146484375
299 1510.9268798828125
399 1023.2794189453125
499 694.6273193359375
599 472.92156982421875
699 323.21624755859375
799 222.0280303955078
899 153.5638427734375
999 107.19232940673828
1099 75.75115966796875
1199 54.41001510620117
1299 39.9085693359375
1399 30.043785095214844
1499 23.325590133666992
1599 18.74508285522461
1699 15.61854076385498
1799 13.481956481933594
1899 12.020190238952637
1999 11.018983840942383
Result: y = -0.03784185275435448 + 0.8271968364715576 x + 0.006528348661959171 x^2 + -0.08912800997495651 x^3


# Defining new `autograd` functions

In [4]:
import torch
import math


class LegendrePolynomial3(torch.autograd.Function):
    """
    Custom autograd Function for the degree-3 Legendre polynomial
    P3(x) = (5 x^3 - 3 x) / 2.

    Subclassing torch.autograd.Function and supplying static ``forward`` and
    ``backward`` methods lets autograd differentiate through this operation
    using the hand-written derivative below.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Evaluate P3 element-wise on ``input`` and return the result.

        ``ctx`` is the context object shared with ``backward``; the input is
        stored on it with ``ctx.save_for_backward`` because the derivative
        depends on it.
        """
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the forward input.

        ``grad_output`` is the gradient of the loss with respect to the
        forward output; it is multiplied by dP3/dx = 1.5 * (5 x^2 - 1).
        """
        (saved_input,) = ctx.saved_tensors
        local_derivative = 5 * saved_input ** 2 - 1
        return grad_output * 1.5 * local_derivative


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Training data. These tensors keep the default requires_grad=False, so no
# gradients are computed with respect to them during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Weights of the model y = a + b * P3(c + d * x). They are initialized to
# fixed values not too far from the correct result to ensure convergence.
# requires_grad=True asks autograd to compute gradients with respect to them.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its .apply method; alias it as 'P3'.
# The alias does not change between iterations, so it is created once here
# rather than on every pass through the loop.
P3 = LegendrePolynomial3.apply

learning_rate = 5e-6
for t in range(2000):
    # Forward pass: compute predicted y; P3 is evaluated by the custom
    # autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Sum-of-squares loss (a 0-dim tensor); report it every 100th step.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Gradient descent step, run under torch.no_grad() so the update itself
    # is not tracked by autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually clear the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.97850799560547
599 37.403133392333984
699 28.206867218017578
799 21.973188400268555
899 17.7457275390625
999 14.877889633178711
1099 12.931766510009766
1199 11.610918045043945
1299 10.714258193969727
1399 10.10548210144043
1499 9.692106246948242
1599 9.411375045776367
1699 9.220745086669922
1799 9.091285705566406
1899 9.003361701965332
1999 8.943639755249023
Result: y = -5.423830273798558e-09 + -2.208526849746704 * P3(1.3320399228078372e-09 + 0.2554861009120941 x)


# PyTorch: Neural network with `torch.nn`, updating parameters manually

In [5]:
import torch
import math

# Training data: 2000 evenly spaced points on [-pi, pi] and sin() at each point.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Feature matrix (x, x^2, x^3): x.unsqueeze(-1) has shape (2000, 1) and p has
# shape (3,), so broadcasting in pow() gives xx the shape (2000, 3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # 3 input features -> 1 output, shape (2000, 1)
    torch.nn.Flatten(0, 1),  # merge dims 0 and 1 -> shape (2000,), matching y
)

# reduction="sum" makes this the sum of squared errors rather than the mean.
loss_fn = torch.nn.MSELoss(reduction="sum")

# Computation graph
# xx -> Linear -> Flatten -> y_pred -> loss
learning_rate = 1e-6
for t in range(2000):
    # Forward pass through the whole model.
    y_pred = model(xx)

    # Compute the loss and report it every 100th step.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Clear gradients left over from the previous iteration before running
    # the backward pass.
    model.zero_grad()

    # Compute the gradient of the loss with respect to ALL learnable
    # parameters of the model.
    loss.backward()

    # Gradient descent step, run under torch.no_grad() so the update itself
    # is not tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

linear_layer = model[0]
# For a linear layer, the parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 931.6278076171875
199 621.7103271484375
299 415.99249267578125
399 279.4062194824219
499 188.69552612304688
599 128.4348907470703
699 88.39071655273438
799 61.772254943847656
899 44.07232666015625
999 32.298519134521484
1099 24.463703155517578
1199 19.247922897338867
1299 15.774333000183105
1399 13.459919929504395
1499 11.917160034179688
1599 10.888257026672363
1699 10.201677322387695
1799 9.743288040161133
1899 9.437076568603516
1999 9.232401847839355
Result: y = 0.009530384093523026 + 0.8389708399772644 x + -0.0016441497718915343 x^2 + -0.09080275893211365 x^3


# PyTorch: Neural networks with `torch.optim` for automatic optimization

In [None]:
import torch
import math

# Training data: 2000 evenly spaced points on [-pi, pi] and sin() at each point.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Feature matrix (x, x^2, x^3): x.unsqueeze(-1) has shape (2000, 1) and p has
# shape (3,), so broadcasting in pow() gives xx the shape (2000, 3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # 3 input features -> 1 output, shape (2000, 1)
    # Without this layer the prediction keeps shape (2000, 1) while y is
    # (2000,); flattening dims 0 and 1 makes the two shapes agree.
    torch.nn.Flatten(0, 1),
)

In [None]:
import torch
# A 1-D tensor with three elements: shape (3,).
x = torch.tensor([1, 2, 3])
print(x.shape)
# unsqueeze(0) inserts a new leading axis: shape (1, 3).
print(x.unsqueeze(0).shape)
# unsqueeze(-1) inserts a new trailing axis: shape (3, 1).
print(x.unsqueeze(-1).shape)