# Revision Walkthrough
This notebook shows different implementations of a fully-connected ReLU network with one hidden layer without biases, trained to predict y from x by minimising squared Euclidean distance.

## 1. NumPy
We first implement the network using numpy before PyTorch.

In [None]:
import numpy as np

In [None]:
# Problem dimensions, one per line
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [None]:
# Random inputs x (N x D_in) and targets y (N x D_out), drawn from a
# standard normal distribution. No seed is set, so values differ per run.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [None]:
# Random weight initialisation:
# w1 maps input -> hidden (D_in x H), w2 maps hidden -> output (H x D_out).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [None]:
epsilon = 1e-6  # learning rate (gradient-descent step size)
for t in range(501):
    # Forward pass: linear layer -> ReLU -> linear layer
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every 100th iteration
    loss = np.sum(np.square(y_pred - y))
    if t % 100 == 0:
        print(t, loss)

    # Backward pass: chain rule applied layer by layer, from output to input
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU lets no gradient through where its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place
    w1 -= epsilon * grad_w1
    w2 -= epsilon * grad_w2

## 2. Tensors
We use PyTorch Tensors to fit the network, but we still need to implement the forward and backward passes through the network by hand.

In [None]:
import torch

# Element type and device shared by every tensor created below
dtype = torch.float32  # torch.float is an alias of torch.float32
device = torch.device("cpu")

In [None]:
# Problem dimensions, one per line
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [None]:
# Random inputs x (N x D_in) and targets y (N x D_out), drawn from a
# standard normal distribution on the chosen device and dtype.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [None]:
# Random weight initialisation:
# w1 maps input -> hidden (D_in x H), w2 maps hidden -> output (H x D_out).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [None]:
epsilon = 1e-6  # learning rate (gradient-descent step size)
for t in range(501):
    # Forward pass: linear layer -> ReLU -> linear layer
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a Python float, reported every 100th iteration
    loss = torch.sum((y_pred - y) ** 2).item()
    if t % 100 == 0:
        print(t, loss)

    # Backward pass written out by hand (no autograd in this section)
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU lets no gradient through where its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place
    w1.sub_(epsilon * grad_w1)
    w2.sub_(epsilon * grad_w2)
    

## 3. Tensors and Autograd
Here we use PyTorch Tensors together with Autograd to implement our network, so we no longer need to implement the backward pass through the network by hand.

In [None]:
import torch

# Element type and device shared by every tensor created below
dtype = torch.float32  # torch.float is an alias of torch.float32
device = torch.device("cpu")

In [None]:
# Problem dimensions, one per line
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [None]:
# Random input and output data
# requires_grad is left at its default (False), so autograd does not
# compute gradients with respect to these tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [None]:
# Random weight initialisation
# requires_grad=True asks autograd to compute gradients with respect to
# these tensors during the backward pass (they end up in w1.grad / w2.grad).
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [None]:
epsilon = 1e-6  # learning rate (gradient-descent step size)
for t in range(501):
    # Forward pass in one expression; the intermediate activations are not
    # kept because autograd records what it needs for the backward pass.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squares loss, kept as a tensor so it can be back-propagated
    loss = torch.sum((y_pred - y) ** 2)
    if t % 100 == 0:
        print(t, loss.item())

    # Autograd fills w1.grad and w2.grad
    loss.backward()

    # The update itself must not be recorded in the graph
    with torch.no_grad():
        w1.sub_(epsilon * w1.grad)
        w2.sub_(epsilon * w2.grad)

        # Gradients accumulate across backward() calls, so reset them
        w1.grad.zero_()
        w2.grad.zero_()

## 4. Define New Autograd Functions
In this section we define our own custom autograd Function that computes ReLU, and use it in place of the built-in operation.



In [None]:
import torch

class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static forward and
    backward methods that operate on Tensors is how a user-defined
    differentiable operation is created. Invoke it via ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute max(input, 0) element-wise.

        ctx is the context object shared with backward; the input is stored
        on it with ctx.save_for_backward so the backward pass can tell which
        elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss w.r.t. the output (grad_output) into
        the gradient of the loss w.r.t. the input: a copy of grad_output
        with zeros wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is left untouched
        return grad_output.masked_fill(saved_input < 0, 0)
        

In [None]:
# Element type and device shared by every tensor created below
dtype = torch.float32  # torch.float is an alias of torch.float32
device = torch.device("cpu")

In [None]:
# Problem dimensions, one per line
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [None]:
# Random input and output data
# requires_grad is left at its default (False), so autograd does not
# compute gradients with respect to these tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [None]:
# Random weight initialisation
# requires_grad=True asks autograd to compute gradients with respect to
# these tensors during the backward pass (they end up in w1.grad / w2.grad).
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [None]:
epsilon = 1e-6  # learning rate (gradient-descent step size)

# A custom autograd Function is invoked through its .apply attribute.
# The alias never changes, so bind it once rather than on every iteration.
relu = MyReLU.apply

for t in range(501):
    # Forward pass: linear layer -> custom ReLU -> linear layer
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum-of-squares loss, reported on every 100th iteration
    loss = (y_pred - y).pow(2).sum()
    if not t % 100:
        print(t, loss.item())

    # Autograd computes the backward pass, calling MyReLU.backward on the way
    loss.backward()

    # Update weights using gradient descent; no_grad keeps the update itself
    # out of the autograd graph
    with torch.no_grad():
        w1 -= epsilon * w1.grad
        w2 -= epsilon * w2.grad

        # Gradients accumulate across backward() calls, so reset them
        w1.grad.zero_()
        w2.grad.zero_()

## 5. torch.nn Module
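The nn package provides a set of modules and loss functions for building network layers. In this example we use the nn package to implement our two-layer network. Note that `torch.nn.Linear` includes a bias term by default, unlike the bias-free networks written by hand in the earlier sections.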

In [None]:
import torch

# Problem dimensions: batch size, input, hidden and output widths
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The network as an ordered stack of layers: Linear -> ReLU -> Linear
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Squared-error loss summed (not averaged) over all elements
loss_fn = torch.nn.MSELoss(reduction='sum')

# Training loop
epsilon = 1e-4  # learning rate
for t in range(501):
    # Forward pass: calling the model on x yields the predictions
    y_pred = model(x)

    # Loss between predictions and targets, reported every 100th iteration
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear stale gradients, then back-propagate the new loss
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step over every parameter, outside autograd
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(epsilon * param.grad)

## 6. torch.optim Module
The optim package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms. \
In this example we will use the nn package to define our model, and we will optimize the model using the Adam algorithm provided by the optim package.




In [None]:
import torch

# Problem dimensions: batch size, input, hidden and output widths
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The network as an ordered stack of layers: Linear -> ReLU -> Linear
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Squared-error loss summed (not averaged) over all elements
loss_fn = torch.nn.MSELoss(reduction='sum')

# Adam is handed the model's parameters and updates them on each step()
epsilon = 1e-4  # learning rate
optimiser = torch.optim.Adam(model.parameters(), lr=epsilon)

# Training loop
for t in range(501):
    # Forward pass
    y_pred = model(x)

    # Loss between predictions and targets, reported every 100th iteration
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear stale gradients, back-propagate, then let the optimiser update
    # the parameters
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

## 7. Custom nn Modules
To specify more complex models, you can define your own Modules by subclassing nn.Module and defining a forward function. In this example we implement our two-layer network as a custom Module subclass.

In [None]:
import torch


class TwoLayerNet(torch.nn.Module):
    """Two-layer fully-connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers and store them as attributes so the
        Module registers their parameters.

        D_in, H and D_out are the input, hidden and output widths.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of inputs to a Tensor of predictions.

        Modules created in the constructor can be freely mixed with ordinary
        Tensor operations; here the ReLU is a clamp at zero.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

In [None]:
# Problem dimensions: batch size, input, hidden and output widths
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss, optimised with plain SGD
criterion = torch.nn.MSELoss(reduction='sum')
optimiser = torch.optim.SGD(model.parameters(), lr=1e-4)

# Training loop
for t in range(501):
    # Forward pass
    y_pred = model(x)

    # Loss between predictions and targets, reported every 100th iteration
    loss = criterion(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear stale gradients, back-propagate, then update the weights
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

## 8. Control Flow / Weight Sharing
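To illustrate dynamic graphs and weight sharing, the model below applies one shared hidden layer a random number of times (between 0 and 3) on each forward pass. We can use normal Python control flow to implement the loop, and we implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.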

In [None]:
import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose depth is re-drawn at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear layers used by forward: an input layer,
        a hidden-to-hidden layer that may be applied repeatedly, and an
        output layer.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply the input layer, then middle_linear a random number of times
        (0, 1, 2 or 3), then the output layer, with a ReLU after every
        layer except the last.

        Because every repetition goes through the same middle_linear Module,
        its weights are shared; reusing a Module several times while
        building a computational graph is safe.
        """
        h_relu = torch.clamp(self.input_linear(x), min=0)
        n_repeats = random.randint(0, 3)  # both endpoints are included
        for _ in range(n_repeats):
            h_relu = torch.clamp(self.middle_linear(h_relu), min=0)
        return self.output_linear(h_relu)

In [None]:
# Problem dimensions: batch size, input, hidden and output widths
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss, optimised with SGD plus momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimiser = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

# Training loop
for t in range(501):
    # Forward pass (the model picks a fresh random depth on each call)
    y_pred = model(x)

    # Loss between predictions and targets, reported every 100th iteration
    loss = criterion(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Clear stale gradients, back-propagate, then update the weights
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()