In [3]:
# --------------------------------------------------
# Training a Deep Network on Flowers102 with PyTorch
# --------------------------------------------------

import torch
from torch import nn, optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

# Pick the compute device once, up front: CUDA when PyTorch reports it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
# --------------------------
# Step 1: Data Preparation
# --------------------------

# Preprocessing applied to every image, in order:
# 1. Resize to a fixed 128x128 — the original dataset has images of varying sizes.
# 2. Convert the PIL image to a PyTorch tensor so the model can consume it.
preprocessing_steps = [
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# Load the Flowers102 train and test splits (downloaded into ./flowers if needed).
# Each sample comes back as a tuple (image_tensor, label), where label is the integer class index.
train_dataset = datasets.Flowers102(root="./flowers", split="train", transform=transform, download=True)
test_dataset = datasets.Flowers102(root="./flowers", split="test", transform=transform, download=True)

# What a Dataset object provides:
# - __len__     -> the number of samples
# - __getitem__ -> a single (image, label) pair with the transforms already applied
# so individual samples are reachable by indexing, e.g. `train_dataset[0]`.

# Take a peek at both splits (one summary per line group, same as printing each in turn)
print(train_dataset, test_dataset, sep="\n")

Dataset Flowers102
    Number of datapoints: 1020
    Root location: ./flowers
    split=train
    StandardTransform
Transform: Compose(
               Resize(size=(128, 128), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
           )
Dataset Flowers102
    Number of datapoints: 6149
    Root location: ./flowers
    split=test
    StandardTransform
Transform: Compose(
               Resize(size=(128, 128), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
           )


In [11]:
# --------------------------
# Step 2: DataLoader
# --------------------------

# Wrap datasets in DataLoaders to:
# - Automatically form mini-batches
# - Optionally shuffle data (important for SGD)
# - Use multiple worker processes for speed

# Build the training loader; every option is spelled out by keyword.
train_loader = DataLoader(
    dataset=train_dataset,
    num_workers=2,   # Load data in multiple subprocesses (especially helpful with large image datasets)
    shuffle=True,    # Crucial for SGD convergence
    batch_size=64,   # 64 is a typical batch size that balances speed and stability
)

# The DataLoader options worth knowing:
# - shuffle:     reshuffles the data each epoch, which avoids ordering artifacts.
# - num_workers: parallelizes disk access/loading; values >0 can significantly improve I/O performance.
# - drop_last:   (optional) drops the last, smaller batch when the dataset size
#                isn't divisible by the batch size.
# - pin_memory:  for GPU training, speeds up host-to-device transfer.

# Take a peek
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1db6eb958d0>

In [15]:
# --------------------------
# Step 3: Model Definition
# --------------------------

# Define a Multi-Layer Perceptron (MLP) with configurable hidden layers
# Input size is flattened 128x128 RGB image → 128*128*3 features

# Note about layer sizes: layer_sizes defines the number and size of hidden layers in the model.
# For example, [512, 512, 512] will create three hidden layers, each with 512 units.
# 
# This design makes the model size configurable — you can easily experiment with
# deeper or shallower networks by passing different layer sizes when instantiating the model.
# 
# Example usage:
# - MLP([256, 128]) creates a smaller 2-layer model
# - MLP([1024, 512, 256, 128]) creates a deeper and wider model
#
# The last layer (output layer) is always fixed to have 102 outputs (number of classes),
# so you only control the *intermediate* layers via this list.

class MLP(nn.Module):
    """Multi-Layer Perceptron classifier operating on flattened inputs.

    The network is: Flatten -> [Linear -> ReLU] per entry of ``layer_sizes``
    -> Linear(num_classes). The final layer emits raw logits; no softmax is
    applied because ``nn.CrossEntropyLoss`` expects unnormalized scores.

    Args:
        layer_sizes: Iterable with the width of each hidden layer, in order.
            An empty iterable yields a purely linear model. The default is an
            immutable tuple so the default value can never be shared and
            mutated between instances.
        input_size: Number of features per sample after flattening.
            Defaults to 128 * 128 * 3 (a 128x128 RGB image).
        num_classes: Number of output units. Defaults to 102 (Flowers102).
    """

    def __init__(self, layer_sizes=(512, 512, 512), input_size=128 * 128 * 3, num_classes=102):
        super().__init__()

        # Flatten comes first so each sample is turned into a single feature vector.
        layers = [nn.Flatten()]

        # Hidden stack: every Linear is followed by a ReLU; in_features tracks
        # the width produced by the previous layer.
        in_features = input_size
        for size in layer_sizes:
            layers.append(nn.Linear(in_features, size))
            layers.append(nn.ReLU())
            in_features = size

        # Output layer maps the last hidden width to the class logits
        # (no softmax needed with CrossEntropyLoss).
        layers.append(nn.Linear(in_features, num_classes))

        # Chain the layers so each one's output feeds the next one's input.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run a batch ``x`` through the stacked layers and return the logits."""
        return self.model(x)

# Build the network with its default configuration, then place it on the chosen device
model = MLP()
model = model.to(device)

In [16]:
# --------------------------
# Step 4: Loss and Optimizer
# --------------------------

# CrossEntropyLoss combines LogSoftmax + NLLLoss internally,
# so the model's outputs are handed to it as-is.
loss_fn = nn.CrossEntropyLoss()

# SGD with momentum for now; Adam can also be tried in later experiments.
optimizer = optim.SGD(params=model.parameters(), momentum=0.9, lr=0.001)

In [18]:
# --------------------------
# Step 5: Training Loop
# --------------------------

# One "epoch" = one full pass through the training dataset
# Typically, we train for many epochs (e.g., 10s–100s)

epochs = 10
for epoch in range(epochs):
    # Running sum of the per-batch losses seen during this epoch
    total_loss = 0.0

    for batch in train_loader:
        # A batch is an (images, labels) tuple; move both tensors to the training device
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)

        # Start every step from clean gradients
        optimizer.zero_grad()

        # Forward pass, then measure how wrong the predictions are
        preds = model(images)
        loss = loss_fn(preds, labels)

        # Backward pass computes the gradients; the optimizer step updates the weights
        loss.backward()
        optimizer.step()

        # About the two numbers reported below:
        # - loss.item() is the loss of the **most recent batch** only; it fluctuates
        #   because some batches are harder than others.
        # - total_loss is the **sum of all batch losses** over the whole epoch; it
        #   (or its average) should go down steadily across epochs.
        # Printing both shows batch-level vs. epoch-level progress.
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss Value: {loss.item()}, Total Loss: {total_loss:.4f}")

# --------------------------
# Notes & Performance Tips
# --------------------------

# - Each batch may vary in difficulty → causes small fluctuations in loss.
# - Loss should decrease over time, but not always in a straight line.
# - Low learning rate = slow convergence; high learning rate = risk of divergence or oscillation.
# - If loss increases → try lowering the learning rate.
# - If model isn’t learning, try:
#   - Using fewer or simpler layers (e.g., linear model without hidden layers)
#   - Increasing learning rate gradually
#   - Double-checking image sizes and transformations
# - Slowdowns may be due to:
#   - Data loading from disk at each batch
#   - CPU-only training
#   - Solutions: use GPU, increase num_workers, cache data in memory

# --------------------------
# Summary
# --------------------------

# A full PyTorch training pipeline includes:
# 1. Data loading and preprocessing with transforms
# 2. Dataset and DataLoader setup
# 3. Model definition with `nn.Module`
# 4. Loss function and optimizer selection
# 5. Epoch + batch-level training loop:
#    - forward → loss → backward → step
# 6. Optional: logging/printing loss for monitoring

# Training deep networks really boils down to:
# → A double for loop with a handful of function calls

# This foundational structure will stay the same even when we:
# - Switch to CNNs or transformers
# - Use fancier optimizers or loss functions
# - Add validation and testing loops


# Tip: use help() to learn more about any function or class, e.g. "help(torch.utils.data.DataLoader)"
# (pass the object itself — do not call it with parentheses).

Epoch 1, Loss Value: 4.635961055755615, Total Loss: 74.0531
Epoch 2, Loss Value: 4.586237907409668, Total Loss: 73.8435
Epoch 3, Loss Value: 4.593423366546631, Total Loss: 73.3589
Epoch 4, Loss Value: 4.436051368713379, Total Loss: 72.1184
Epoch 5, Loss Value: 4.217155456542969, Total Loss: 69.3830
Epoch 6, Loss Value: 4.009844779968262, Total Loss: 64.8913
Epoch 7, Loss Value: 3.9751429557800293, Total Loss: 62.5397
Epoch 8, Loss Value: 3.7055091857910156, Total Loss: 59.6945
Epoch 9, Loss Value: 3.7261412143707275, Total Loss: 56.3934
Epoch 10, Loss Value: 3.655364513397217, Total Loss: 54.0248
