# Training neural models with PyTorch

Start by importing the PyTorch module.

In [None]:
import torch

## Linear regression

### Load and vectorize the data

Load the sample data (housing prices in Portland, OR) and store it into lists.

In [None]:
# Parse the CSV by hand: every field is an integer. The last field of each
# row is the target value; all preceding fields are the input features.
xs, ys = [], []
with open('portland.csv') as file:
    for line in file:
        *features, target = map(int, line.rstrip().split(','))
        xs.append(features)
        ys.append(target)

Show the first few samples.

In [None]:
# Peek at the first five feature rows (still plain Python lists here).
xs[:5]

Convert the lists into tensors.

In [None]:
# Build 32-bit float tensors from the parsed lists: x holds one row of
# features per sample, y holds the matching target values.
x = torch.tensor(xs, dtype=torch.float32)
y = torch.tensor(ys, dtype=torch.float32)

This is what the first few rows of the design matrix look like:

In [None]:
# First five rows of the float design matrix.
x[:5]

The next cell prints the shape of the design matrix:

In [None]:
# Shape of the design matrix: (number of samples, number of features).
x.size()

And this is what the corresponding values of the target vector look like:

In [None]:
# First five entries of the target vector.
y[:5]

In [None]:
# Shape of the target vector: one entry per sample.
y.size()

### Plot the data

The following cell imports Matplotlib and enables inline plotting in the notebook.

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

%config InlineBackend.figure_format = 'retina'

Plot the data.

In [None]:
# Draw one scatter panel per input feature, each against the price.
axis_labels = ['Size in square feet (x1)', 'Number of bedrooms (x2)']
plt.figure(figsize=(15, 6))
for column, axis_label in enumerate(axis_labels):
    plt.subplot(1, 2, column + 1)
    plt.scatter(x[:, column], y)
    plt.xlabel(axis_label)
    plt.ylabel('Price in dollars (y)')
plt.show()

### Fit a model

Set up a linear model and train it using the mean squared error loss function.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Linear model with two input features and a single output.
model = nn.Linear(2, 1)
# NOTE(review): the very small learning rate is presumably needed because the
# input features are used unscaled -- confirm before changing it.
optimizer = optim.SGD(model.parameters(), lr=1e-7)

# Five full-batch gradient descent steps on the mean squared error.
for t in range(5):
    # Gradients accumulate across backward() calls, so reset them first.
    optimizer.zero_grad()
    # Call the module itself instead of .forward() so that registered hooks
    # run; this also matches how the models are invoked later in the notebook.
    output = model(x)
    # The model output has one column; reshape y to the same (n, 1) layout so
    # the loss compares matching elements instead of broadcasting.
    loss = F.mse_loss(output, y.view(-1, 1))
    loss.backward()
    optimizer.step()

Show the parameters (weights and biases) of the model.

In [None]:
# Materialize the parameter iterator so the notebook displays every tensor.
list(model.parameters())

Predict the prices for the training data:

In [None]:
# Inference only: no_grad() disables gradient tracking for the forward pass.
with torch.no_grad():
    # Call the module itself instead of .forward() so that registered hooks run.
    y_pred = model(x)

Here are the first few predicted prices:

In [None]:
# First five rows of the model's predictions.
y_pred[:5]

Plot the data again, this time with predictions from the trained model.

In [None]:
# Same two panels as before, with the model's predictions drawn on top of
# the observed prices.
axis_labels = ['Size in square feet (x1)', 'Number of bedrooms (x2)']
plt.figure(figsize=(15, 6))
for column, axis_label in enumerate(axis_labels):
    plt.subplot(1, 2, column + 1)
    feature = x[:, column]
    plt.scatter(feature, y)       # observed prices
    plt.scatter(feature, y_pred)  # predicted prices
    plt.xlabel(axis_label)
    plt.ylabel('Price in dollars (y)')
plt.show()

## Handwritten digit recognition

We next turn to a classical classification problem: handwritten digit recognition.

### Load and vectorize the data

The MNIST images and labels are stored as compressed bytestreams.

In [None]:
import gzip

def extract_images(file):
    """Read a gzip-compressed image file into a matrix of scaled pixel values.

    Args:
        file: Path of the gzip-compressed image file.

    Returns:
        A float32 tensor of shape (n_images, 784) in which every byte of the
        payload has been scaled from [0, 255] to [0, 1]. A file that holds
        only the header yields an empty (0, 784) tensor.
    """
    with gzip.open(file) as source:
        source.read(16)  # skip header
        # A bytearray is a writable buffer, which lets torch wrap it directly.
        raw = bytearray(source.read())
    if not raw:
        # torch.frombuffer rejects empty buffers, so handle this case here.
        return torch.empty(0, 784, dtype=torch.float32)
    # Wrap the raw bytes instead of expanding them into a Python list with one
    # int object per pixel; .float() then copies into a fresh float32 tensor.
    pixels = torch.frombuffer(raw, dtype=torch.uint8)
    return pixels.float().view(-1, 784) / 255

def extract_labels(file):
    """Read a gzip-compressed label file into a vector of class indices.

    Args:
        file: Path of the gzip-compressed label file.

    Returns:
        A 1-D int64 tensor with one entry per payload byte (empty when the
        file holds only the header).
    """
    with gzip.open(file) as source:
        source.read(8)  # skip header
        # A bytearray is a writable buffer, which lets torch wrap it directly.
        raw = bytearray(source.read())
    if not raw:
        # torch.frombuffer rejects empty buffers; keep returning an empty
        # label vector in that case.
        return torch.empty(0, dtype=torch.long)
    # Wrap the raw bytes instead of expanding them into a Python list first;
    # .long() copies into a fresh int64 tensor.
    return torch.frombuffer(raw, dtype=torch.uint8).long()

We actually have two different data sets: one for training, one for testing (validation).

In [None]:
# Training split: images and their labels.
train_x = extract_images('train-images-idx3-ubyte.gz')
train_y = extract_labels('train-labels-idx1-ubyte.gz')
print('Shapes of the training data matrices:', train_x.shape, train_y.shape)

# Held-out split used for testing (validation).
test_x = extract_images('t10k-images-idx3-ubyte.gz')
test_y = extract_labels('t10k-labels-idx1-ubyte.gz')
print('Shapes of the test data matrices:', test_x.shape, test_y.shape)

### Evaluation

We will evaluate our models using classification accuracy (the fraction of correctly classified images).

In [None]:
def accuracy(y_pred, y):
    """Return the fraction of entries in ``y_pred`` that equal ``y``.

    The two tensors are compared element-wise; the result is a zero-dimensional
    float tensor between 0 and 1.
    """
    hits = (y_pred == y).float()  # 1.0 where the prediction is right, else 0.0
    return hits.mean()

### Batching

Because the MNIST dataset is much bigger than the dataset with the housing prices, we will use stochastic gradient descent with minibatches. The following function splits the data into randomly sampled minibatches of the specified size.

In [None]:
def minibatches(x, y, batch_size):
    """Yield shuffled ``(inputs, targets)`` minibatches of exactly ``batch_size`` rows.

    A fresh random permutation of the row indices is drawn on every call.
    Only full batches are produced: rows left over after the last full batch
    are not yielded.
    """
    n_samples = x.shape[0]
    order = torch.randperm(n_samples)
    # Largest start offset that still leaves room for a complete batch.
    last_start = n_samples - batch_size
    for start in range(0, last_start + 1, batch_size):
        chosen = order[start:start + batch_size]
        yield x[chosen], y[chosen]

### Fit a softmax model

Set up a softmax model and train it using the cross entropy loss function. Also, plot the per-epoch losses and the per-epoch accuracies on the test set.

In [None]:
# Minimal training loop

def train_softmax(x, y, n_epochs=10, batch_size=50, eta=1e-1):
    """Train a softmax classifier with minibatch SGD.

    Args:
        x: Training inputs with 784 features per row.
        y: Class indices for the rows of ``x`` (the model has 10 outputs).
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per minibatch.
        eta: Learning rate of the SGD optimizer.

    Returns:
        The trained ``nn.Linear`` model, so that it can be used afterwards.
    """
    model = nn.Linear(784, 10)
    optimizer = optim.SGD(model.parameters(), lr=eta)
    for epoch in range(n_epochs):
        for bx, by in minibatches(x, y, batch_size):
            # Gradients accumulate across backward() calls, so reset them.
            optimizer.zero_grad()
            output = model(bx)
            # cross_entropy takes the raw scores and applies the softmax itself.
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
    return model

In [None]:
# Same training loop with evaluation and plotting

def train_softmax(x, y, n_epochs=10, batch_size=50, eta=1e-1, validation_data=None):
    """Train a softmax classifier with minibatch SGD and plot its progress.

    After training, a figure with two panels is drawn: the average training
    loss per epoch and the accuracy on the validation data per epoch.

    Args:
        x: Training inputs with 784 features per row.
        y: Class indices for the rows of ``x`` (the model has 10 outputs).
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per minibatch.
        eta: Learning rate of the SGD optimizer.
        validation_data: Optional ``(inputs, labels)`` pair. When given, the
            accuracy on it is computed and printed after every epoch.

    Returns:
        The trained ``nn.Linear`` model.
    """
    model = nn.Linear(784, 10)
    optimizer = optim.SGD(model.parameters(), lr=eta)
    losses = []
    accuracies = []
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0
        n_seen = 0
        for bx, by in minibatches(x, y, batch_size):
            optimizer.zero_grad()
            output = model(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
            # The loss is a batch mean; weight it by the batch size so that
            # running_loss is a sum over individual samples.
            running_loss += loss.item() * len(bx)
            n_seen += len(bx)
        # Average over the samples that were actually visited. The batch
        # iterator only yields full batches, so dividing by len(x) would
        # understate the loss whenever a partial batch is left over.
        losses.append(running_loss / max(n_seen, 1))
        if validation_data:
            test_x, test_y = validation_data
            model.eval()
            with torch.no_grad():
                test_y_pred = torch.argmax(model(test_x), dim=1)
                # Store a plain float rather than a zero-dimensional tensor.
                acc = float(accuracy(test_y_pred, test_y))
                accuracies.append(acc)
                # '\r' rewrites the same output line on every epoch.
                print('\repoch {}, accuracy {:.4f}'.format(epoch, acc), end='')
    print()
    plt.figure(figsize=(15,6))
    plt.subplot(121)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Average loss')
    plt.subplot(122)
    plt.plot(accuracies)
    plt.xlabel('Epoch')
    plt.ylabel('Test set accuracy')
    plt.ylim([0.90, 1.00])
    return model

Note that the output layer of this network is not a softmax layer but just a linear layer. This means that the outputs for each class are not normalised probabilities but just scores. During training, these scores will be combined with a cross-entropy loss, which will implicitly compute the softmax output.

In [None]:
# Train on the training split; accuracy is tracked on the test split after every epoch.
train_softmax(train_x, train_y, validation_data=(test_x, test_y))

### Fit a neural network model

Define a simple, two-layer feed-forward neural network with a ReLU.

In [None]:
class FeedForwardNetwork(nn.Module):
    """Feed-forward network: linear layer, ReLU, linear layer.

    The output of ``forward`` is the raw result of the second linear layer;
    no softmax is applied.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers with the given layer sizes."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch of inputs to one score per output unit."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

Train the model using the Adam optimizer.

In [None]:
# Minimal training loop

def train_feedforward(x, y, n_epochs=10, batch_size=50, eta=1e-3):
    """Train a feed-forward network (392 hidden units) with Adam.

    Args:
        x: Training inputs with 784 features per row.
        y: Class indices for the rows of ``x`` (the model has 10 outputs).
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per minibatch.
        eta: Learning rate of the Adam optimizer.

    Returns:
        The trained ``FeedForwardNetwork``, so that it can be used afterwards.
    """
    model = FeedForwardNetwork(784, 392, 10)
    optimizer = optim.Adam(model.parameters(), lr=eta)
    for epoch in range(n_epochs):
        for bx, by in minibatches(x, y, batch_size):
            # Gradients accumulate across backward() calls, so reset them.
            optimizer.zero_grad()
            output = model(bx)
            # cross_entropy takes the raw scores and applies the softmax itself.
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
    return model

In [None]:
# Same training loop with evaluation and plotting

def train_feedforward(x, y, n_epochs=10, batch_size=50, eta=1e-3, validation_data=None):
    """Train a feed-forward network with Adam and plot its progress.

    After training, a figure with two panels is drawn: the average training
    loss per epoch and the accuracy on the validation data per epoch.

    Args:
        x: Training inputs with 784 features per row.
        y: Class indices for the rows of ``x`` (the model has 10 outputs).
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per minibatch.
        eta: Learning rate of the Adam optimizer.
        validation_data: Optional ``(inputs, labels)`` pair. When given, the
            accuracy on it is computed and printed after every epoch.

    Returns:
        The trained ``FeedForwardNetwork``.
    """
    model = FeedForwardNetwork(784, 392, 10)
    optimizer = optim.Adam(model.parameters(), lr=eta)
    losses = []
    accuracies = []
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0
        n_seen = 0
        for bx, by in minibatches(x, y, batch_size):
            optimizer.zero_grad()
            output = model(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
            # The loss is a batch mean; weight it by the batch size so that
            # running_loss is a sum over individual samples.
            running_loss += loss.item() * len(bx)
            n_seen += len(bx)
        # Average over the samples that were actually visited. The batch
        # iterator only yields full batches, so dividing by len(x) would
        # understate the loss whenever a partial batch is left over.
        losses.append(running_loss / max(n_seen, 1))
        if validation_data:
            test_x, test_y = validation_data
            model.eval()
            with torch.no_grad():
                test_y_pred = torch.argmax(model(test_x), dim=1)
                # Store a plain float rather than a zero-dimensional tensor.
                acc = float(accuracy(test_y_pred, test_y))
                accuracies.append(acc)
                # '\r' rewrites the same output line on every epoch.
                print('\repoch {}, accuracy {:.4f}'.format(epoch, acc), end='')
    print()
    plt.figure(figsize=(15,6))
    plt.subplot(121)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Average loss')
    plt.subplot(122)
    plt.plot(accuracies)
    plt.xlabel('Epoch')
    plt.ylabel('Test set accuracy')
    plt.ylim([0.90, 1.00])
    return model

This will take some time.

In [None]:
# Train on the training split; accuracy is tracked on the test split after every epoch.
train_feedforward(train_x, train_y, validation_data=(test_x, test_y))

That's all, folks!