In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms

# Preprocessing pipeline: turn each image into a tensor, then normalize its
# single channel with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Fetch the MNIST training split (download=True retrieves it when it is not
# already on disk) and serve it in shuffled batches of 64.
trainset = datasets.MNIST(
    '~/.pytorch/MNIST_data/',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=64)

In [3]:
# Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 outputs.
# There is no activation after the last layer, so the outputs are raw logits.
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
)

# Cross-entropy loss, applied directly to the logits
criterion = nn.CrossEntropyLoss()

# Pull a single batch out of the loader
images, labels = next(iter(trainloader))
# Keep the batch dimension and collapse everything else into one row per image
images = images.view(images.size(0), -1)

# Forward pass produces the logits, which are then scored against the labels
logits = model(images)
loss = criterion(logits, labels)

print(loss)

tensor(2.3079, grad_fn=<NllLossBackward>)


In my experience it's more convenient to build the model with a log-softmax output using nn.LogSoftmax or F.log_softmax (documentation). Then you can get the actual probabilities by taking the exponential, torch.exp(output). With a log-softmax output, you want to use the negative log likelihood loss, nn.NLLLoss (documentation).

Exercise: Build a model that returns the log-softmax as the output and calculate the loss using the negative log likelihood loss.

In [4]:
# Same fully connected stack as before, but finished with a log-softmax over
# dim=1 so the network outputs log-probabilities instead of raw scores.
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
    nn.LogSoftmax(dim=1),
)

# Negative log likelihood loss pairs with a log-softmax output
criterion = nn.NLLLoss()

# Grab one batch of data
images, labels = next(iter(trainloader))
# One flat row per image; the batch dimension is preserved
images = images.view(len(images), -1)

# Forward pass yields log-probabilities, which the loss compares to the labels
logps = model(images)
loss = criterion(logps, labels)

print(loss)

tensor(2.3097, grad_fn=<NllLossBackward>)


In [10]:
# A random 2x2 tensor that autograd will track
x = torch.randn(2, 2, requires_grad=True)
print(x)

y = x.pow(2)
print(y)

# grad_fn shows the function that generated this tensor
print(y.grad_fn)

# The autograd module keeps track of these operations and knows how to
# calculate the gradient for each one. In this way, it's able to calculate the
# gradients for a chain of operations, with respect to any one tensor.
# Reduce the tensor y to a scalar value, the mean.
z = y.mean()
print(z)

# No backward pass has run yet, so x has no gradient at this point
print(x.grad)

# Calling .backward() on z computes the gradient of z with respect to x.
# z is the mean of the four values x**2, so dz/dx = x/2; the final print shows
# x/2 for comparison with x.grad.
z.backward()
print(x.grad)
print(y.grad)
print(x.div(2))

tensor([[ 1.1467,  0.0764],
        [ 0.0172, -0.7530]], requires_grad=True)
tensor([[1.3149e+00, 5.8403e-03],
        [2.9566e-04, 5.6699e-01]], grad_fn=<PowBackward0>)
<PowBackward0 object at 0x00000277BC9A9E80>
tensor(0.4720, grad_fn=<MeanBackward0>)
None
tensor([[ 0.5734,  0.0382],
        [ 0.0086, -0.3765]])
None
tensor([[ 0.5734,  0.0382],
        [ 0.0086, -0.3765]], grad_fn=<DivBackward0>)


In [9]:
# Feed-forward network ending in a log-softmax over dim=1
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()

# One batch, flattened to one row per image
images, labels = next(iter(trainloader))
images = images.view(images.size(0), -1)

# Forward pass and loss
logps = model(images)
loss = criterion(logps, labels)

# When we create a network with PyTorch, all of the parameters are initialized
# with requires_grad = True. This means that when we calculate the loss and
# call loss.backward(), the gradients for the parameters are calculated.
# These gradients are used to update the weights with gradient descent.
# Below is an example of calculating the gradients using a backward pass.

first_layer = model[0]
print('Before backward pass: \n', first_layer.weight.grad)

loss.backward()

print('After backward pass: \n', first_layer.weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[-0.0024, -0.0024, -0.0024,  ..., -0.0024, -0.0024, -0.0024],
        [-0.0003, -0.0003, -0.0003,  ..., -0.0003, -0.0003, -0.0003],
        [-0.0031, -0.0031, -0.0031,  ..., -0.0031, -0.0031, -0.0031],
        ...,
        [-0.0046, -0.0046, -0.0046,  ..., -0.0046, -0.0046, -0.0046],
        [ 0.0010,  0.0010,  0.0010,  ...,  0.0010,  0.0010,  0.0010],
        [-0.0007, -0.0007, -0.0007,  ..., -0.0007, -0.0007, -0.0007]])


Training the network!
There's one last piece we need to start training, an optimizer that we'll use to update the weights with the gradients. We get these from PyTorch's optim package. For example we can use stochastic gradient descent with optim.SGD. You can see how to define an optimizer below.

In [11]:
from torch import optim

# An optimizer is given the parameters it should update plus a learning rate;
# here plain stochastic gradient descent over every parameter of the model.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

The general process for a single training step with PyTorch:

1. Make a forward pass through the network
2. Use the network output to calculate the loss
3. Perform a backward pass through the network with loss.backward() to calculate the gradients
4. Take a step with the optimizer to update the weights


Note the call to optimizer.zero_grad() in the code below. When you do multiple backward passes with the same parameters, the gradients are accumulated. This means that you need to zero the gradients on each training pass or you'll retain gradients from previous training batches.

In [13]:
print('Initial weights - ', model[0].weight)

images, labels = next(iter(trainloader))
# Flatten each image into one row, reading the batch size from the tensor.
# This replaces the in-place images.resize_(64, 784), which hard-coded a batch
# of exactly 64 and differed from the view(...) call used by the other cells.
images = images.view(images.shape[0], -1)

# Clear the gradients, do this because gradients are accumulated
optimizer.zero_grad()

# Forward pass, then backward pass to populate the parameter gradients.
# No optimizer.step() is taken in this cell, so the weights printed above are
# left unchanged; only the gradient is shown.
output = model(images)
loss = criterion(output, labels)
loss.backward()
print('Gradient -', model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[-0.0252, -0.0044, -0.0256,  ...,  0.0301, -0.0100,  0.0144],
        [ 0.0172, -0.0344, -0.0058,  ..., -0.0160,  0.0116,  0.0259],
        [ 0.0145, -0.0055, -0.0349,  ..., -0.0242,  0.0089,  0.0080],
        ...,
        [ 0.0214,  0.0269,  0.0245,  ..., -0.0228,  0.0116, -0.0026],
        [-0.0084, -0.0120, -0.0143,  ...,  0.0347, -0.0077,  0.0128],
        [-0.0295,  0.0066, -0.0133,  ...,  0.0287, -0.0120,  0.0297]],
       requires_grad=True)
Gradient - tensor([[-0.0052, -0.0052, -0.0052,  ..., -0.0052, -0.0052, -0.0052],
        [-0.0006, -0.0006, -0.0006,  ..., -0.0006, -0.0006, -0.0006],
        [-0.0016, -0.0016, -0.0016,  ..., -0.0016, -0.0016, -0.0016],
        ...,
        [-0.0060, -0.0060, -0.0060,  ..., -0.0060, -0.0060, -0.0060],
        [-0.0023, -0.0023, -0.0023,  ..., -0.0023, -0.0023, -0.0023],
        [-0.0006, -0.0006, -0.0006,  ..., -0.0006, -0.0006, -0.0006]])


In [14]:
# Fresh network for training: 784 -> 128 -> 64 -> 10 with a log-softmax output
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.003)

epochs = 5
for e in range(epochs):
    # Sum of the per-batch losses seen during this epoch
    running_loss = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.size(0), -1)

        # Training pass: reset the accumulated gradients, run forward,
        # backpropagate the loss, then let the optimizer update the weights
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss once the epoch's batches are exhausted
    print(f"Training loss: {running_loss/len(trainloader)}")

Training loss: 1.914447036887537
Training loss: 0.8376263892218503
Training loss: 0.5216996668974982
Training loss: 0.4276656572148998
Training loss: 0.38337779103883546


In [16]:
#With the network trained, we can check out it's predictions.

%matplotlib inline

images, labels = next(iter(trainloader))
img = images[0].view(1, 784)
# Turn off gradients to speed up this part
with torch.no_grad():
    logps = model(img)

# Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(logps)
print(ps)

tensor([[3.9605e-03, 2.1605e-06, 1.6611e-03, 8.6057e-05, 7.6604e-03, 3.7065e-04,
         6.8483e-05, 1.6366e-01, 4.7700e-03, 8.1777e-01]])
