In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cuda device


## Training a NN model

In [3]:
# Training, validating, and testing a model with data by optimizing its parameters.
# An iterative process, where each iteration is called an epoch. Makes a prediction
# based on the input training data, calculates the error of its guess from the loss
# function, calculates the gradient of the error with respect to parameters using
# autograd, and optimizes the parameters using gradient descent.

# The code below is straight from the dataset_dataloaders and 
# models_parameters_autograd notebooks

training_data = datasets.FashionMNIST(
    root="../data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="../data",
    train=False,
    download=True,
    transform=ToTensor()
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz


0it [00:00, ?it/s]

Extracting ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


0it [00:00, ?it/s]

Extracting ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


0it [00:00, ?it/s]

Extracting ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


0it [00:00, ?it/s]

Extracting ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw
Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [4]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [56]:
# Hyperparameters are adjustable parameters that let you control the model optimization
# process, which can impact model training and convergence rates.

# Number of epochs is the number of times we will iterate over the dataset. Each epoch
# consists of the train loop (iterating over the minibatch of the training dataset) and
# the validation/test loop, iterating over the test dataset to check if model performance
# is improving.
epochs = 10

# Batch size is number of data samples seen by the model in each epoch
batch_size = 64

# Learning rate is how much to update the parameters at each epoch
learning_rate = 1e-3

# Momentum is a parameter that speeds up gradient descent along directions in parameter
# space that have a larger improvement. It effectively adds a velocity term to the update
# step that is derived from the average over multiple steps. The momentum parameter tunes
# the impact of the velocity term.
momentum = 0.9

# Weight decay is a regularization factor that improves model generalization by preventing
# parameters from growing too large. The weight decay pre-multiplies the square (usually) of
# the parameters in the loss function.
weight_decay = 0.0005

In [57]:
# Initialize the dataloaders and model
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

model = NeuralNetwork()
model.to(device)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)

In [58]:
# We also have to choose a loss function. Common loss functions include nn.MSELoss 
# (Mean Square Error) for regression tasks, and nn.NLLLoss (Negative Log Likelihood) 
# for classification. nn.CrossEntropyLoss combines nn.LogSoftmax and nn.NLLLoss.

loss_fn = nn.CrossEntropyLoss()

In [59]:
# We also need to choose an optimization algorithm, which defines the process of
# how model parameters are updated to reduce the loss. This logic is encapsulated
# by the optimizer object. Here we use Stochastic Gradient Descent, but many other
# optimizers are available.

# The optimizer is initialized by the model parameters that need trained and the
# learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,  momentum=momentum, weight_decay=weight_decay)

In [60]:
# Inside the training loop, the optimization process happens in three steps:
# 1) Call optimizer.zero_grad() to reset the gradients of model parameters,
#    otherwise the gradients sum up by default.
# 2) Backpropogation using the loss.backwards() method, computing the gradients
#    wrt each parameter.
# 3) Call optimizer.step() method to adjust the parameters using the gradients.

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Get the input data X and label y from the dataloader
    for batch, (X, y) in enumerate(dataloader):
        # Compute the model prediction given current model parameters.
        pred = model(X.to(device))
        # Compute the loss from the prediction and the label
        loss = loss_fn(pred, y.to(device))

        # Optimization: zero gradients, backpropogation, adjust parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0
    
    # Turn off grad computation to reduce overhead of forward pass for testing.
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X.to(device))
            # Accumulate the total loss on the test data.
            test_loss += loss_fn(pred, y.to(device)).item()
            # Count the number of correct answers to calculate the accuracy.
            correct += (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()

    # Compute average loss and the overall accuracy of the model.
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [61]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 52.8%, Avg loss: 0.022875 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 58.2%, Avg loss: 0.019900 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 59.5%, Avg loss: 0.018536 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.017906 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.017549 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 60.9%, Avg loss: 0.017315 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 61.0%, Avg loss: 0.017144 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 61.2%, Avg loss: 0.017007 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 61.4%, Avg loss: 0.016885 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 61.5%, Avg loss: 0.016776 

Done!


## Saving and loading model parameters

In [11]:
# We can now save the trained model parameters using the torch.save function
torch.save(model.state_dict(), '../data/params/fmnist_model_weights.pt')

In [18]:
next(model.parameters())

Parameter containing:
tensor([[ 0.0204,  0.0131,  0.0302,  ...,  0.0210,  0.0062,  0.0017],
        [-0.0027,  0.0169, -0.0093,  ...,  0.0070,  0.0058,  0.0038],
        [ 0.0322,  0.0328,  0.0057,  ...,  0.0024, -0.0108,  0.0227],
        ...,
        [ 0.0052, -0.0266,  0.0251,  ...,  0.0288, -0.0073, -0.0038],
        [-0.0057,  0.0106,  0.0311,  ...,  0.0143,  0.0047, -0.0200],
        [ 0.0138,  0.0178, -0.0096,  ...,  0.0006, -0.0258, -0.0085]],
       device='cuda:0', requires_grad=True)

In [38]:
# We can also load the parameters into a new model that has the same architecture shape.
# This new model starts with random parameters

model_new = NeuralNetwork()
next(model_new.parameters())

Parameter containing:
tensor([[ 0.0304,  0.0180,  0.0037,  ...,  0.0074,  0.0344,  0.0216],
        [ 0.0075,  0.0092,  0.0291,  ..., -0.0263, -0.0301, -0.0340],
        [ 0.0157, -0.0128, -0.0009,  ...,  0.0076,  0.0176, -0.0340],
        ...,
        [-0.0232,  0.0247, -0.0314,  ..., -0.0115, -0.0217,  0.0031],
        [-0.0276,  0.0234,  0.0257,  ...,  0.0238,  0.0085, -0.0234],
        [-0.0356,  0.0119,  0.0042,  ...,  0.0264, -0.0078,  0.0137]],
       requires_grad=True)

In [39]:
# We can load the saved weights using the load_state_dict() method. 
# Note: Be sure to call the eval() method before inferencing, which sets the dropout 
# and batch normalization layers to evaluation mode. Failing to do this will yield 
# inconsistent inference results.

model_new.load_state_dict(torch.load("../data/params/fmnist_model_weights.pt"))
model_new.eval()

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)

In [40]:
# The new model now has the same parameters as before
next(model_new.parameters())

Parameter containing:
tensor([[ 0.0204,  0.0131,  0.0302,  ...,  0.0210,  0.0062,  0.0017],
        [-0.0027,  0.0169, -0.0093,  ...,  0.0070,  0.0058,  0.0038],
        [ 0.0322,  0.0328,  0.0057,  ...,  0.0024, -0.0108,  0.0227],
        ...,
        [ 0.0052, -0.0266,  0.0251,  ...,  0.0288, -0.0073, -0.0038],
        [-0.0057,  0.0106,  0.0311,  ...,  0.0143,  0.0047, -0.0200],
        [ 0.0138,  0.0178, -0.0096,  ...,  0.0006, -0.0258, -0.0085]],
       requires_grad=True)

In [41]:
# In this case we had to create a new model that has the same shape as the original.
# If we wanted to, we could just save the entire model with architecture as well.
torch.save(model, "../data/params/fmnist_trained_model.pt")

In [42]:
# And load the entire model again. Note that this approach relies on Python pickle
# serialization, so requires the model class definition to be available when loading.
model_new = torch.load("../data/params/fmnist_trained_model.pt")
model_new

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)

In [43]:
# The parameters are also saved.
next(model_new.parameters())

Parameter containing:
tensor([[ 0.0204,  0.0131,  0.0302,  ...,  0.0210,  0.0062,  0.0017],
        [-0.0027,  0.0169, -0.0093,  ...,  0.0070,  0.0058,  0.0038],
        [ 0.0322,  0.0328,  0.0057,  ...,  0.0024, -0.0108,  0.0227],
        ...,
        [ 0.0052, -0.0266,  0.0251,  ...,  0.0288, -0.0073, -0.0038],
        [-0.0057,  0.0106,  0.0311,  ...,  0.0143,  0.0047, -0.0200],
        [ 0.0138,  0.0178, -0.0096,  ...,  0.0006, -0.0258, -0.0085]],
       device='cuda:0', requires_grad=True)