# Getting Started
First, an implementation of forward and backward propagation using NumPy: gradient descent fits a third-order polynomial to sin(x).

In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import math

# Build the training data: 2000 evenly spaced points on [-pi, pi] and the
# sine of each point. (The data is deterministic; only the weights are random.)
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# The powers of x never change during training, so compute them once here
# instead of recomputing them twice per iteration inside the loop.
x_sq = x ** 2
x_cu = x ** 3

# Randomly initialize weights. No seed is set in this cell, so the printed
# losses and the final coefficients differ from run to run.
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x_sq + d * x_cu

    # The residual feeds both the loss and the gradients; compute it once.
    residual = y_pred - y

    # Compute the sum-of-squared-errors loss and print it every 100th step
    loss = np.square(residual).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of the loss with respect to a, b, c, d
    # (chain rule through y_pred: d(loss)/d(y_pred) = 2 * residual)
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_sq).sum()
    grad_d = (grad_y_pred * x_cu).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 2079.436954821379
199 1469.7694806933382
299 1039.803212237527
399 736.5056523004184
499 522.5171800182621
599 371.5114041479894
699 264.93194050498425
799 189.69595137180244
899 136.57749129755962
999 99.06905968381292
1099 72.57966654985084
1199 53.869782280089176
1299 40.65309211169526
1399 31.31574371361777
1499 24.7183775562006
1599 20.05649970613814
1699 16.76198273846374
1799 14.433566561111542
1899 12.787812002047021
1999 11.624482156741202
Result: y = 0.0557218130705722 + 0.8509037691588796 x + -0.009612939969645453 x^2 + -0.09250011610589048 x^3


# Introducing PyTorch
Now the same computation, but using PyTorch tensors (running on the CPU here).

In [3]:
# -*- coding: utf-8 -*-

import torch
import math


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Build the training data: 2000 evenly spaced points on [-pi, pi] and the
# sine of each point. (The data is deterministic; only the weights are random.)
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The powers of x never change during training, so compute them once here
# instead of recomputing them twice per iteration inside the loop.
x_sq = x ** 2
x_cu = x ** 3

# Randomly initialize weights as 0-dimensional tensors. No seed is set in
# this cell, so the printed losses and final coefficients vary between runs.
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x_sq + d * x_cu

    # The residual feeds both the loss and the gradients; compute it once.
    residual = y_pred - y

    # The loss is only needed for reporting here (the gradients below are
    # derived by hand), so compute it only on reporting steps. Calling
    # .item() copies the value to a Python float, which would otherwise
    # force a device-to-host transfer on every iteration when run on a GPU.
    if t % 100 == 99:
        loss = residual.pow(2).sum().item()
        print(t, loss)

    # Backprop to compute gradients of the loss with respect to a, b, c, d
    # (chain rule through y_pred: d(loss)/d(y_pred) = 2 * residual)
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_sq).sum()
    grad_d = (grad_y_pred * x_cu).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 903.8649291992188
199 633.1228637695312
299 444.61956787109375
399 313.25360107421875
499 221.63357543945312
599 157.6856231689453
699 113.0193099975586
799 81.79891204833984
899 59.9622688293457
999 44.67908477783203
1099 33.976016998291016
1199 26.476150512695312
1299 21.217878341674805
1399 17.52927017211914
1499 14.940471649169922
1599 13.12266731262207
1699 11.845672607421875
1799 10.948190689086914
1899 10.317190170288086
1999 9.873372077941895
Result: y = 0.033149220049381256 + 0.8482986688613892 x + -0.005718793720006943 x^2 + -0.09212956577539444 x^3


# Introducing Autograd
This implementation uses autograd to compute the gradients automatically, and it also lets the script run on CUDA when a GPU is available instead of the CPU.  
With a GPU available, this implementation is very fast relative to the previous ones.

In [1]:
# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
# Use the GPU when one is available; every tensor created below is then
# placed on this device by default.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
y = torch.sin(x)

# The powers of x never change during training and do not require gradients,
# so compute them once here instead of on every iteration.
x_sq = x ** 2
x_cu = x ** 3

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), dtype=dtype, requires_grad=True)
b = torch.randn((), dtype=dtype, requires_grad=True)
c = torch.randn((), dtype=dtype, requires_grad=True)
d = torch.randn((), dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x_sq + d * x_cu

    # Compute and print loss using operations on Tensors.
    # loss is a 0-dimensional Tensor (shape ()), because .sum() reduces over
    # every element. loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually reset the gradients after updating weights; otherwise the
        # next backward() call would accumulate into the existing .grad values.
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 847.9603271484375
199 599.4351806640625
299 424.65289306640625
399 301.68487548828125
499 215.14056396484375
599 154.21051025390625
699 111.30058288574219
799 81.07230377197266
899 59.77180099487305
999 44.758384704589844
1099 34.17372131347656
1199 26.70968246459961
1299 21.44503402709961
1399 17.730958938598633
1499 15.110249519348145
1599 13.260708808898926
1699 11.955192565917969
1799 11.033523559570312
1899 10.382753372192383
1999 9.923189163208008
Result: y = 0.03477516397833824 + 0.8517211079597473 x + -0.005999294109642506 x^2 + -0.09261637926101685 x^3


Examples provided by https://pytorch.org/tutorials/beginner/pytorch_with_examples.html  
This notebook by Joe Norton