# Lab 9 - Introduction to PyTorch and Metrics Tracking

- **Author:** Satej Soman
- **Date:** April 2, 2025
- **Course:** INFO 251: Applied Machine Learning

## Topics
1. Python neural network libraries
2. Computational substrates
3. Typical training setup
4. Metrics tracking

## Goals
By the end of this lab, you should be able to:
1. define a neural network in Python
2. train a neural network on a cloud-hosted compute instance
3. track key metrics during neural network training

In [None]:
%pip install torch torchvision numpy scikit-learn matplotlib tensorboard

In [None]:
# old friends
import time
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression
import seaborn as sns
sns.set()

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# and new
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tensorboard import summary


# analogy:


$\hat{\beta} = (X'X)^{-1}(X'y)$ (closed-form solution)  <->  `sklearn.linear_model.LinearRegression`

$\beta_t = \beta_{t-1} - R \nabla_\beta J$ (iterative gradient descent with learning rate $R$)  <->  neural network libraries

In [None]:
# synthetic regression problem: a column of ones (intercept) plus two uniform
# features on [0, 10), with a noiseless linear response
X = np.hstack([np.ones((100, 1)), np.random.uniform(0, 10, size=(100, 2))])
y = X @ np.array([-1, 2, -2])

# one scatter panel per non-intercept column of X
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
for d, ax in enumerate(axs, start=1):
    ax.scatter(X[:, d], y)
    ax.set_title(f"synthetic regression data - dimension {d}")
    ax.set_xlabel(f"$X_{d}$")
    ax.set_ylabel("$y$")
plt.show()

In [None]:
# analytical solution:

def linear_regression_analytical(X, y):
    """Solve ordinary least squares in closed form via the normal equations.

    Args:
        X: 2-D design matrix with one row per observation; include a column
            of ones if an intercept is wanted.
        y: responses, with one entry (or row) per observation.

    Returns:
        The coefficients beta satisfying (X'X) beta = X'y.

    Raises:
        numpy.linalg.LinAlgError: if X'X is singular.
    """
    # Solve the linear system directly instead of forming the explicit
    # inverse: same answer, but cheaper and numerically better behaved.
    return np.linalg.solve(X.T @ X, X.T @ y)

# closed-form coefficients from our own implementation
regression_coefficients_analytical = linear_regression_analytical(X=X, y=y)
print(f"analytical regression coefficients: {regression_coefficients_analytical}")

# sklearn for comparison; X already contains a column of ones, so sklearn is
# told not to fit a separate intercept
linreg = LinearRegression(fit_intercept=False)
linreg.fit(X, y)
print("sklearn regression coefficients:", linreg.coef_)


In [None]:
# numerical solution:

def linear_regression_gradient_descent(X, y, R=0.01, max_iter=5000, tol=1e-6):
    """Fit least-squares coefficients by batch gradient descent.

    Starts from the zero vector and repeatedly steps along the negative
    gradient of the squared error, scaled by R / n.

    Args:
        X: 2-D design matrix (n rows, d columns).
        y: response vector with n entries.
        R: learning rate.
        max_iter: upper bound on the number of stored iterates (the initial
            zero vector counts as one).
        tol: stop once the largest absolute component of the last step is
            no greater than this.

    Returns:
        List of every iterate, first to last; the final entry is the estimate.
    """
    num_rows, num_cols = X.shape
    iterates = [np.zeros(num_cols)]
    step = float("inf")  # guarantees the first convergence check passes
    while len(iterates) < max_iter and np.max(np.abs(step)) > tol:
        current = iterates[-1]
        residuals = X @ current - y
        step = -R / num_rows * X.T @ residuals
        iterates.append(current + step)
    return iterates

# keep the full list of iterates (not just the last one); the final entry is
# the gradient descent estimate
gradient_descent_iterates = linear_regression_gradient_descent(X=X, y=y)
print(f"gradient descent regression coefficients: {gradient_descent_iterates[-1]}")

# PyTorch equivalent

- loop taken from the [PyTorch docs](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html)

- higher-level libraries (e.g. PyTorch Lightning) abstract this loop away

In [None]:
# configure training objective

class PytorchLinearRegression(nn.Module):
    """Linear regression expressed as a single `nn.Linear` layer with bias.

    Args:
        d: number of input features (the intercept is the layer's bias, so
            the inputs should not include a column of ones).
    """

    def __init__(self, d):
        super().__init__()
        self.linear = nn.Linear(d, 1, bias=True)

    def forward(self, x):
        """Return one prediction per row of `x` (array-like or tensor) as float32."""
        # as_tensor converts numpy arrays and lists exactly as torch.tensor
        # did, but passes an existing float32 tensor through untouched: no
        # copy, no "copy construct" UserWarning, and gradients can still
        # flow back to the input.
        return self.linear(torch.as_tensor(x, dtype=torch.float32))

model = PytorchLinearRegression(d=2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss = nn.MSELoss()

# the targets never change, so build the column-shaped float32 tensor once
# instead of re-converting `y` on every iteration
y_target = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

# core training loop: every step uses the whole dataset
num_epochs = 5000
for i in range(num_epochs):
    optimizer.zero_grad()        # clear gradients left over from the previous step
    y_pred = model(X[:, 1:])     # skip the ones column; the layer has its own bias
    loss_value = loss(y_pred, y_target)
    loss_value.backward()
    optimizer.step()

# bias first, then weights, to match the [intercept, slopes] order used above
print("pytorch coefficients:", np.array([*model.linear.bias.data.numpy(), *model.linear.weight.data.numpy().flatten()]))

# report training loss (computed just before the final optimizer step)
print(f"training loss: {loss_value.item()}")


# a more realistic PyTorch example

(again following the [PyTorch docs](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html))

Let's build a classifier for images from the [Fashion-MNIST dataset](https://paperswithcode.com/dataset/fashion-mnist).

In [None]:
# preprocessing applied to every image: convert to a tensor, then normalize
# the single channel with mean 0.5 and standard deviation 0.5
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# like sklearn, torchvision ships loaders for common datasets; the files are
# stored under ./data
trn_set = torchvision.datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
val_set = torchvision.datasets.FashionMNIST(root='./data', train=False, transform=transform, download=True)

# batched iterators over both splits; only the training split is shuffled
BATCH_SIZE = 4
trn_loader = torch.utils.data.DataLoader(dataset=trn_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False)

# human-readable name for each integer label, indexed by label
classes = (
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot',
)

# report split sizes
print()
print('Training set has {} instances'.format(len(trn_set)))
print('Validation set has {} instances'.format(len(val_set)))

In [None]:
# seed the RNG before drawing from the shuffled loader
torch.manual_seed(0)

# separate copy of the training split without normalization, used for display
viz_set = torchvision.datasets.FashionMNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
viz_loader = torch.utils.data.DataLoader(dataset=viz_set, batch_size=BATCH_SIZE, shuffle=True)

# pull a single batch and split it into images and labels
batch = next(iter(viz_loader))
batch_imgs, batch_labels = batch
batch


In [None]:
# Size the figure from the batch we actually hold rather than from the
# BATCH_SIZE global: later cells reassign BATCH_SIZE, and re-running this cell
# afterwards would index past the end of `batch_imgs`.
num_imgs = len(batch_imgs)
# squeeze=False keeps `axs` a 2-D array even for a single image, so the row
# below is always iterable
fig, axs = plt.subplots(1, num_imgs, figsize=(10, 4), squeeze=False)
for img, label_index, ax in zip(batch_imgs, batch_labels.tolist(), axs[0]):
    ax.imshow(np.asarray(img.squeeze()), cmap='Greys')
    label = classes[label_index]
    # a single labelled tick is used as the caption
    ax.set_xticks([1], [f"label: {label}"])
    ax.set_yticks([])


## let's define our model

In [None]:
class MNISTClassifier(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Two (5x5 convolution -> ReLU -> 2x2 max-pool) stages reduce a 28x28 input
    to 16 feature maps of size 4x4, which three fully connected layers map to
    `num_classes` raw scores (no softmax is applied).

    Args:
        num_classes: number of output scores per image.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.activation = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)

        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # 28 -> conv5 -> 24 -> pool -> 12 -> conv5 -> 8 -> pool -> 4
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Return one row of class scores per input image.

        Raises:
            ValueError: if the convolutional stages do not produce 16x4x4
                features, i.e. the input images are not 28x28.
        """
        x = self.pool(self.activation(self.conv1(x)))
        x = self.pool(self.activation(self.conv2(x)))

        # Without this check, view(-1, 256) would silently regroup elements of
        # a wrongly sized batch into a different number of rows.
        feature_shape = tuple(x.shape[-3:])
        if feature_shape != (16, 4, 4):
            raise ValueError(
                f"expected 16x4x4 features after the conv stages (28x28 input), got {feature_shape}"
            )
        x = x.view(-1, self.fc1.in_features)

        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        return self.fc3(x)


In [None]:
# what do we do in a single epoch?

def train_epoch(model, loss_fn, optimizer, trn_loader, device=torch.device("cpu")):
    """Run one pass over `trn_loader`, updating `model` after every batch.

    Prints the mean batch loss and elapsed time after every 1000 batches, and
    the total epoch time at the end.

    Args:
        model: module being trained; called on each batch of inputs.
        loss_fn: callable taking (outputs, labels) and returning a scalar loss.
        optimizer: optimizer holding the model's parameters.
        trn_loader: iterable yielding (inputs, labels) batches.
        device: device each batch is moved to before the forward pass.

    Returns:
        Mean per-batch loss over the most recent complete 1000-batch window.
        If the epoch has fewer than 1000 batches, the mean over all of its
        batches (previously 0.0 was returned in that case); 0.0 if the loader
        is empty.
    """
    report_every = 1000
    running_loss = 0.   # sum of batch losses in the current window
    window_batches = 0  # number of batches in the current window
    last_loss = None    # mean loss of the last completed window, if any

    # enumerate(trn_loader) rather than iter(trn_loader) so that we can track
    # the batch index and do some intra-epoch reporting
    epoch_start = time.time()
    start = epoch_start
    for i, (inputs, labels) in enumerate(trn_loader):
        # Every data instance is an input + label pair
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        window_batches += 1
        if window_batches == report_every:
            last_loss = running_loss / report_every  # loss per batch
            batch_time = round(time.time() - start, 2)
            print('  batch {} - loss: {}, time taken for 1k batches: {}s'.format(i + 1, last_loss, batch_time))
            running_loss = 0.
            window_batches = 0
            start = time.time()

    epoch_time = round(time.time() - epoch_start, 2)
    print(f"epoch time: {epoch_time}s")

    if last_loss is None:
        # No full window completed: report the mean over what we did see
        # instead of a misleading 0.0.
        last_loss = running_loss / window_batches if window_batches else 0.
    return last_loss

In [None]:
# entire training loop

def train_model(model, loss_fn, optimizer, trn_loader, val_loader, num_epochs, device=torch.device("cpu")):
    """Train `model` for `num_epochs` epochs, validating after each one.

    Each epoch runs `train_epoch` over `trn_loader`, then computes the mean
    per-batch loss over `val_loader` with gradients disabled, and prints both.

    Args:
        model: module to train; moved to `device`.
        loss_fn: loss module taking (outputs, labels); moved to `device`.
        optimizer: optimizer over the model's parameters.
        trn_loader: iterable of (inputs, labels) training batches.
        val_loader: iterable of (inputs, labels) validation batches.
        num_epochs: number of passes over the training data.
        device: device used for training and validation.
    """
    model = model.to(device)
    loss_fn = loss_fn.to(device)

    best_vloss = float("inf")
    best_epoch = None

    for epoch in range(num_epochs):
        print('EPOCH {}:'.format(epoch + 1))

        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss = train_epoch(model, loss_fn, optimizer, trn_loader, device=device)

        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        model.eval()

        running_vloss = 0.0
        num_vbatches = 0
        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for vinputs, vlabels in val_loader:
                vinputs = vinputs.to(device)
                vlabels = vlabels.to(device)
                voutputs = model(vinputs)
                # .item() keeps the running sum a plain float, so the report
                # below prints a number rather than a tensor repr
                running_vloss += loss_fn(voutputs, vlabels).item()
                num_vbatches += 1

        # count batches explicitly so an empty val_loader yields nan rather
        # than failing on an undefined loop index
        avg_vloss = running_vloss / num_vbatches if num_vbatches else float("nan")
        print('LOSS: trn {} val {}'.format(avg_loss, avg_vloss))

        # Track the epoch with the lowest validation loss
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            best_epoch = epoch

    # report the (0-based) best epoch so the tracking above is not discarded
    print(f"best epoch: {best_epoch}")


In [None]:
# fresh model, loss and optimizer; no device is passed, so train_model uses
# its CPU default
classifier = MNISTClassifier(num_classes=len(classes))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=classifier.parameters(), lr=0.001, momentum=0.9)

num_epochs = 5
train_model(
    model=classifier,
    loss_fn=loss_fn,
    optimizer=optimizer,
    trn_loader=trn_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
)

# training on a GPU instead of a CPU

In [None]:
# Check for GPU availability, and fall back to the CPU when CUDA is missing so
# that this cell (and the later cells that reuse `gpu`) still run
print(torch.cuda.is_available())
gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create data loaders for our datasets; shuffle for training, not for validation
BATCH_SIZE = 4
trn_loader = torch.utils.data.DataLoader(trn_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

# start from a freshly initialized model so the run is comparable to the CPU one
classifier = MNISTClassifier(len(classes))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

train_model(classifier, loss_fn, optimizer, trn_loader, val_loader, num_epochs, device=gpu)

In [None]:
# same experiment with 16 images per batch instead of 4
BATCH_SIZE = 16
trn_loader = torch.utils.data.DataLoader(dataset=trn_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False)

# re-initialize model, loss and optimizer so nothing carries over from the previous run
classifier = MNISTClassifier(num_classes=len(classes))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=classifier.parameters(), lr=0.001, momentum=0.9)

train_model(
    model=classifier,
    loss_fn=loss_fn,
    optimizer=optimizer,
    trn_loader=trn_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    device=gpu,
)

# tracking metrics

In [None]:
# how did each gradient descent coefficient evolve from step to step?

# stack the iterates into a 2-D array: one row per step, one column per coefficient
iterates = np.array(gradient_descent_iterates)
steps = list(range(len(iterates)))

for i in (0, 1, 2):
    plt.scatter(steps, iterates[:, i], label=f"β$_{i}(t)$", s=1)
plt.legend()
plt.title("gradient descent iterates vs number of steps")
plt.xlabel("$t$")
plt.ylabel("β")
plt.show()

some fancier tracking options:
- TensorBoard ([example with Keras](https://colab.research.google.com/github/tensorflow/tensorboard/blob/master/docs/tensorboard_in_notebooks.ipynb))

- Weights and Biases (wandb.ai)


In [None]:
from torch.utils.tensorboard import SummaryWriter

# Load the TensorBoard notebook extension (only once per notebook)
%load_ext tensorboard

# Start TensorBoard
%tensorboard --logdir runs

def train_model(model, loss_fn, optimizer, trn_loader, val_loader, num_epochs, device=torch.device("cpu")):
    """Train `model` for `num_epochs` epochs, logging losses to TensorBoard.

    Each epoch runs `train_epoch` over `trn_loader`, computes the mean
    per-batch loss over `val_loader` with gradients disabled, prints both, and
    writes them as the scalars 'Loss/train' and 'Loss/validation' through a
    `SummaryWriter` created with default arguments.

    Args:
        model: module to train; moved to `device`.
        loss_fn: loss module taking (outputs, labels); moved to `device`.
        optimizer: optimizer over the model's parameters.
        trn_loader: iterable of (inputs, labels) training batches.
        val_loader: iterable of (inputs, labels) validation batches.
        num_epochs: number of passes over the training data.
        device: device used for training and validation.
    """
    model = model.to(device)
    loss_fn = loss_fn.to(device)

    writer = SummaryWriter()  # Initialize SummaryWriter

    best_vloss = float("inf")
    best_epoch = None

    # try/finally so the writer is closed even if training is interrupted or raises
    try:
        for epoch in range(num_epochs):
            print('EPOCH {}:'.format(epoch + 1))

            model.train(True)
            avg_loss = train_epoch(model, loss_fn, optimizer, trn_loader, device=device)
            writer.add_scalar('Loss/train', avg_loss, epoch)  # Log training loss

            model.eval()
            running_vloss = 0.0
            num_vbatches = 0
            with torch.no_grad():
                for vinputs, vlabels in val_loader:
                    vinputs = vinputs.to(device)
                    vlabels = vlabels.to(device)
                    voutputs = model(vinputs)
                    # .item() keeps the running sum a plain float, so the
                    # report below prints a number rather than a tensor repr
                    running_vloss += loss_fn(voutputs, vlabels).item()
                    num_vbatches += 1

            # count batches explicitly so an empty val_loader yields nan
            # rather than failing on an undefined loop index
            avg_vloss = running_vloss / num_vbatches if num_vbatches else float("nan")
            print('LOSS: trn {} val {}'.format(avg_loss, avg_vloss))
            writer.add_scalar('Loss/validation', avg_vloss, epoch)  # Log validation loss
            writer.flush()

            # Track the epoch with the lowest validation loss
            if avg_vloss < best_vloss:
                best_vloss = avg_vloss
                best_epoch = epoch
    finally:
        writer.close()  # Close the writer when done

    # note: best_epoch is 0-based, while the 'EPOCH n:' headers are 1-based
    print(f"best epoch: {best_epoch}")
# fresh model, loss and optimizer for the TensorBoard-instrumented run
classifier = MNISTClassifier(num_classes=len(classes))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=classifier.parameters(), lr=0.001, momentum=0.9)

num_epochs = 5
train_model(
    model=classifier,
    loss_fn=loss_fn,
    optimizer=optimizer,
    trn_loader=trn_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    device=gpu,
)




other metrics to track:

- prediction metrics
    - classification
        - accuracy
        - recall
        - F1
        - ROC-AUC
    - regression
        - $R^2$

- model internals
    - gradient magnitudes
    - parameter values
    - maximally-activating instances for each layer