# PyTorch Optimization Techniques
## Mini-batch gradient descent
### Dataset preparation and batch loading

In [1]:
from typing import Any
import torch
from sklearn.datasets import load_wine
from torch.utils.data import TensorDataset, DataLoader

# Wine classification data (3 classes, 13 features) converted to tensors:
# float32 features and integer (long) class labels
wine: Any = load_wine()
X = torch.tensor(wine.data, dtype=torch.float32)
y = torch.tensor(wine.target, dtype=torch.long)

# Pair each feature row with its label and serve them in shuffled
# mini-batches of 12 samples
dataset = TensorDataset(X, y)
dataloader: Any = DataLoader(dataset, shuffle=True, batch_size=12)

# One full pass over the loader to confirm the batch layout
# (the final batch holds whatever samples remain)
print(f"Batch sizes: {[features.size(0) for features, _ in dataloader]}")

Batch sizes: [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10]


### Model training with mini-batch gradient descent

In [2]:
import torch.nn as nn
import torch.optim as optim

# Fully connected classifier: 13 input features -> 16 -> 16 -> 3 class logits
model_minibatch = nn.Sequential(
    nn.Linear(in_features=13, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=3),
)

# Adam updates the model parameters; cross-entropy scores the raw logits
optimizer = optim.Adam(params=model_minibatch.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop with mini-batch gradient descent.
# Each epoch makes one pass over `dataloader`, taking one optimizer step per
# mini-batch, and records the sample-weighted mean loss of that epoch.
num_epochs = 500
loss_history = []  # one float per epoch: mean training loss over all samples
for epoch in range(num_epochs):
    model_minibatch.train()
    running_loss = 0.0

    # Process data in mini-batches
    for X_batch, y_batch in dataloader:
        # Zero gradients from previous iteration
        optimizer.zero_grad()

        # Forward pass on mini-batch
        batch_outputs = model_minibatch(X_batch)
        batch_loss = criterion(batch_outputs, y_batch)

        # Backward pass and optimization
        batch_loss.backward()
        optimizer.step()

        # Accumulate loss weighted by batch size. `.item()` converts the loss
        # to a plain Python float so the running total does not stay attached
        # to the autograd graph of every batch in the epoch.
        running_loss += batch_loss.item() * X_batch.size(0)

    # Dividing by the dataset size (not the batch count) keeps the average
    # correct even when the last batch is smaller than the others
    epoch_loss = running_loss / len(dataloader.dataset)
    loss_history.append(epoch_loss)

    # Report progress every 100 epochs
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, loss: {epoch_loss}")

Epoch: 100, loss: 0.26576852798461914
Epoch: 200, loss: 0.1520654857158661
Epoch: 300, loss: 0.1266169250011444
Epoch: 400, loss: 0.10865620523691177
Epoch: 500, loss: 0.11050953716039658


## Learning rate scheduling
### Data preparation with train-validation split

In [3]:
from typing import Any
import torch
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Reload the wine data as tensors: float32 features, integer (long) labels
wine: Any = load_wine()
X = torch.tensor(wine.data, dtype=torch.float32)
y = torch.tensor(wine.target, dtype=torch.long)

# Hold out 20% of the samples for validation; the fixed random_state keeps
# the partition identical from run to run
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Sanity-check the shape of each partition
print(f"Training X size: {X_train.shape}")
print(f"Validation X size: {X_validation.shape}")

Training X size: torch.Size([142, 13])
Validation X size: torch.Size([36, 13])


### Model training with adaptive learning rate

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

# Fully connected classifier: 13 input features -> 16 -> 16 -> 3 class logits
model_lrschedule = nn.Sequential(
    nn.Linear(in_features=13, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=3),
)

# Adam updates the model parameters; cross-entropy scores the raw logits
optimizer = optim.Adam(params=model_lrschedule.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Plateau scheduler: multiplies the learning rate by 0.1 once the monitored
# metric (to be minimized) has gone 10 epochs without improving
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer, patience=10, factor=0.1, mode="min"
)

# Full-batch training: one optimizer step per epoch, followed by a validation
# pass whose loss is handed to the learning-rate scheduler
num_epochs = 500
history = {"loss": [], "val_loss": []}
for epoch in range(num_epochs):
    # Training step on the whole training set
    model_lrschedule.train()
    optimizer.zero_grad()
    outputs = model_lrschedule(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    history["loss"].append(loss.item())

    # Validation pass; gradients are not needed for evaluation
    model_lrschedule.eval()
    with torch.no_grad():
        validation_outputs = model_lrschedule(X_validation)
        validation_loss = criterion(validation_outputs, y_validation)

    # The scheduler decides from the validation loss whether to lower the LR
    scheduler.step(validation_loss)
    history["val_loss"].append(validation_loss.item())

    # Progress line once every 100 epochs
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, loss: {loss}, validation loss: {validation_loss}")

Epoch: 100, loss: 1.1840826272964478, validation loss: 1.0858325958251953
Epoch: 200, loss: 1.1472747325897217, validation loss: 1.059104323387146
Epoch: 300, loss: 1.1240156888961792, validation loss: 1.046764850616455
Epoch: 400, loss: 1.1114978790283203, validation loss: 1.0414541959762573
Epoch: 500, loss: 1.1034529209136963, validation loss: 1.0368276834487915


## Regularization techniques
### Dropout regularization

In [5]:
import torch.nn as nn
import torch.optim as optim

# Classifier with dropout regularization: 13 -> 16 -> 16 -> 3, where each
# hidden activation is followed by dropout
model_dropout = nn.Sequential(
    nn.Linear(in_features=13, out_features=16),
    nn.ReLU(),
    nn.Dropout(p=0.2),  # zeroes 20% of the first hidden layer's activations
    nn.Linear(in_features=16, out_features=16),
    nn.ReLU(),
    nn.Dropout(p=0.2),  # zeroes 20% of the second hidden layer's activations
    nn.Linear(in_features=16, out_features=3),
)

# Adam updates the model parameters; cross-entropy scores the raw logits
optimizer = optim.Adam(params=model_dropout.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Show the layer stack for verification
print(model_dropout)

Sequential(
  (0): Linear(in_features=13, out_features=16, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=16, out_features=16, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2, inplace=False)
  (6): Linear(in_features=16, out_features=3, bias=True)
)


### Training with dropout and L2 regularization

In [6]:
import torch
import torch.optim as optim

# Training loop with combined dropout and L2 regularization.
# Runs full-batch training for `num_epochs` epochs, tracking training and
# validation loss in `history`; weight decay (L2) is switched on at epoch 500.
num_epochs = 1000
history = {"loss": [], "val_loss": []}
for epoch in range(num_epochs):
    # Training phase with dropout enabled
    model_dropout.train()  # Enables dropout during training
    optimizer.zero_grad()
    outputs = model_dropout(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    history["loss"].append(loss.item())

    # Validation phase with dropout disabled
    model_dropout.eval()  # Disables dropout during evaluation
    with torch.no_grad():
        validation_outputs = model_dropout(X_validation)
        validation_loss = criterion(validation_outputs, y_validation)
        history["val_loss"].append(validation_loss.item())

    # Enable L2 regularization halfway through training
    if (epoch + 1) == 500:
        # Turn on weight decay in the existing optimizer rather than building
        # a new one: a fresh Adam instance would throw away the moment
        # estimates and step counts accumulated over the first 500 epochs.
        for param_group in optimizer.param_groups:
            param_group["weight_decay"] = 0.02
        print(f"Epoch: {epoch + 1}, L2 regularization enabled")

    # Report progress every 200 epochs
    if (epoch + 1) % 200 == 0:
        print(f"Epoch: {epoch + 1}, loss: {loss}, validation loss: {validation_loss}")

Epoch: 200, loss: 1.0045427083969116, validation loss: 0.8699984550476074
Epoch: 400, loss: 0.9233108162879944, validation loss: 0.7323180437088013
Epoch: 500, L2 regularization enabled
Epoch: 600, loss: 0.8085758686065674, validation loss: 0.6503400802612305
Epoch: 800, loss: 0.6585947871208191, validation loss: 0.5298316478729248
Epoch: 1000, loss: 0.5619053244590759, validation loss: 0.40555843710899353
