# Models, parameters, and Autograd

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cuda device


## Neural network models

In [4]:
# The Base class for all neural network modules is torch.nn.Module -- 
# every module in PyTorch subclasses torch.nn.Module. Our models should
# also subclass this class. Modules can contain other Modules, creating
# a nested structure.

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        # Defines a function for flattening the data to a series of inputs... not sure
        # why this was needed to be defined as a method.
        self.flatten = nn.Flatten()
        # Define NN stack. Takes in a 28*28 length series of inputs and connects to
        # layer of 512 nodes. Then a second layer of 512, then a layer of 10 outputs.
        # nn.Sequential is an ordered container of modules, and data passed to it
        # gets acted on by all the moduels in the sequence in the defined order.
        self.linear_relu_stack = nn.Sequential(
            # Module applies linear transformation using 512 weights and biases which
            # can be stored
            nn.Linear(28*28, 512),
            # Module applies non-linear activation to generate complex interactions
            # and improve learning
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )
    # Every nn.Module subclass implements operations on input data in the forward method.
    def forward(self, x):
        # Flatten the image into a 28*28 series of inputs
        x = self.flatten(x)
        # Transform the series data using the NN stack defined above
        logits = self.linear_relu_stack(x)
        # Return values from -inf to inf
        return logits

In [5]:
# Create instance of NeuralNetwork and move it to device
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)


In [6]:
# Give the NN some random data and predict. Note: Do not call model.forward() directly!
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
# Softmax normalizes values to be between 0,1 and sum to 1 (corresponds to probabilities
# of predictions)
pred_probab = nn.Softmax(dim=1)(logits)
# Get prediction by determining prediction with max probability
y_pred = pred_probab.argmax(1)
y_pred_prob = pred_probab.max()
print(f"Predicted class: {y_pred}")
print(f"Prediction probability: {y_pred_prob}")

Predicted class: tensor([3], device='cuda:0')
Prediction probability: 0.1043078750371933


In [7]:
input_image = torch.rand(3,28,28)
print(input_image.size())

torch.Size([3, 28, 28])


## Model parameters

In [8]:
# Each layer of the NN is often parameterized (think weights, biases). Subclasses
# of nn.Module automatically track parameters and can make them accessible.

print("Model structure: ", model, "\n\n")

# The first layer has 512 * 28 * 28 weights and 512 biases, etc
# (ie each node of the first layer has weights associated with connections to
# each of the 28*28=784 inputs, plus a bias (w_0 + w_1 x_1 + w_2 x_2 + ... + w_784 x_784)
# for a total of 785 parameters for each of the 512 nodes in the first layer)

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure:  NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
) 


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0171, -0.0021, -0.0003,  ..., -0.0348, -0.0158,  0.0267],
        [-0.0133,  0.0262,  0.0289,  ..., -0.0130, -0.0296,  0.0041]],
       device='cuda:0', grad_fn=<SliceBackward>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([ 0.0033, -0.0229], device='cuda:0', grad_fn=<SliceBackward>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0100, -0.0220,  0.0423,  ..., -0.0303,  0.0369,  0.0172],
        [ 0.0435, -0.0308, -0.0086,  ..., -0.0379, -0.0366,  0.0302]],
       device='cuda

## Autograd

In [11]:
# When training NNs, the most frequently-used algorithm is back propogation. The model 
# parameters are adjusted according to the gradient of the loss function wrt the corresponding
# parameter.

# Any time an operation is performed on a tensor that has requires_grad enabled, a computational 
# graph is built that stores both the result of the operation, but also its gradient for the back 
# prop step. Autograd is PyTorch's built-in differentiation engine, which computes the gradient
# for any computational graph.

x = torch.ones(5)  # input tensor
y = torch.zeros(3)  # expected output
# w and b are parameters that we want to optimize, so requires_grad=True
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = torch.matmul(x, w)+b
# The loss function to use for back prop
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

print('Gradient function for z =', z.grad_fn)
print('Gradient function for loss =', loss.grad_fn)

Gradient function for z = <AddBackward0 object at 0x7fcf264b8c50>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward object at 0x7fcfa8049310>


In [12]:
# To compute the gradient, use the backward() method of the loss function, which
# computes dloss/dparam. The gradient will only be available for tensors that had
# requires_grad=True.

# Note that gradient calculations can only be calculated once for a given
# computational graph for performance reasons. If for some reason multiple
# calls to backward need to be made on the same graph, we must pass
# retain_graph=True to the backward call.

loss.backward()
print(w.grad)
print(b.grad)

tensor([[0.0217, 0.0097, 0.3308],
        [0.0217, 0.0097, 0.3308],
        [0.0217, 0.0097, 0.3308],
        [0.0217, 0.0097, 0.3308],
        [0.0217, 0.0097, 0.3308]])
tensor([0.0217, 0.0097, 0.3308])


In [19]:
# We can also disable gradient tracking. Cases where we might do that are if
# we have already trained the model and just want to apply it to some new
# data. We may also want to freeze parameters of a model (such as in the case
# for finetuning a pre-trained NN)

z = torch.matmul(x, w) + b
print(z.requires_grad)

# Use the torch.no_grad to specify that gradients for the computational graph
# do not need to be tracked
with torch.no_grad():
    z_nograd = torch.matmul(x, w) + b
print(z_nograd.requires_grad)

# We can also use the detach() tensor method
z_det = z.detach()
print(z_det.requires_grad)

True
False
False
