In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [8]:
# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Prepare data
X_train = trainset.data.reshape(trainset.data.shape[0], -1).astype(np.float32)
y_train = np.array(trainset.targets)
X_test = testset.data.reshape(testset.data.shape[0], -1).astype(np.float32)
y_test = np.array(testset.targets)

# Normalize data
X_train /= 255.0
X_test /= 255.0

# Add bias term
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))
X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))

# Convert to tensors with float32 dtype
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).long()
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).long()

In [9]:
# Linear classifier
class LinearClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LinearClassifier, self).__init__()
        self.fc = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.fc(x)

In [10]:
# Loss functions
def svm_loss(scores, labels, delta=1.0):
    batch_size = scores.size(0)
    correct_class_scores = scores[torch.arange(batch_size), labels].view(-1, 1)
    margins = torch.max(torch.zeros_like(scores), scores - correct_class_scores + delta)
    margins[torch.arange(batch_size), labels] = 0
    loss = torch.mean(torch.sum(margins, dim=1))
    return loss

def softmax_loss(scores, labels):
    batch_size = scores.size(0)
    scores = scores - torch.max(scores, dim=1, keepdim=True).values
    exp_scores = torch.exp(scores)
    prob = exp_scores / torch.sum(exp_scores, dim=1, keepdim=True)
    loss = -torch.mean(torch.log(prob[torch.arange(batch_size), labels]))
    return loss

In [11]:
# Regularization techniques
def l1_regularization(model, lambda_l1=0.001):
    l1_loss = 0
    for param in model.parameters():
        l1_loss += torch.sum(torch.abs(param))
    return lambda_l1 * l1_loss

def l2_regularization(model, lambda_l2=0.001):
    l2_loss = 0
    for param in model.parameters():
        l2_loss += torch.sum(param ** 2)
    return lambda_l2 * l2_loss

def elastic_net_regularization(model, lambda_l1=0.001, lambda_l2=0.001):
    return l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2)

In [12]:
# Optimization demonstration
def train_model(model, optimizer, X_train, y_train, X_test, y_test, epochs=10, regularization=None):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        scores = model(X_train)
        loss = softmax_loss(scores, y_train)

        if regularization == 'l1':
            loss += l1_regularization(model)
        elif regularization == 'l2':
            loss += l2_regularization(model)
        elif regularization == 'elastic_net':
            loss += elastic_net_regularization(model)

        loss.backward()
        optimizer.step()

        # Learning rate decay
        if isinstance(optimizer, torch.optim.SGD) or isinstance(optimizer, torch.optim.Adam):
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.99

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    test_scores = model(X_test)
    test_loss = softmax_loss(test_scores, y_test)
    print(f"\nFinal Test Loss: {test_loss.item():.4f}")

In [14]:
# Demonstrate different optimizers
input_size = X_train_tensor.shape[1]
num_classes = 10

# Vanilla SGD
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001)
print("Training with Vanilla SGD:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# SGD with Momentum
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
print("\nTraining with SGD + Momentum:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# SGD with Nesterov Momentum
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, nesterov=True)
print("\nTraining with SGD + Nesterov Momentum:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# RMSProp
model = LinearClassifier(input_size, num_classes)
optimizer = optim.RMSprop(model.parameters(), lr=0.001)
print("\nTraining with RMSProp:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# Adam
model = LinearClassifier(input_size, num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.001)
print("\nTraining with Adam:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# AdamW
model = LinearClassifier(input_size, num_classes)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
print("\nTraining with AdamW:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# AdaGrad
model = LinearClassifier(input_size, num_classes)
optimizer = optim.Adagrad(model.parameters(), lr=0.001)
print("\nTraining with AdaGrad:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# L-BFGS
model = LinearClassifier(input_size, num_classes)

# L-BFGS requires a different training loop because it's a batch optimizer
def train_model_lbfgs(model, optimizer, X_train, y_train, X_test, y_test, epochs=10, regularization=None):
    for epoch in range(epochs):
        model.train()

        # Define closure for L-BFGS
        def closure():
            optimizer.zero_grad()
            scores = model(X_train)
            loss = softmax_loss(scores, y_train)

            if regularization == 'l1':
                loss += l1_regularization(model)
            elif regularization == 'l2':
                loss += l2_regularization(model)
            elif regularization == 'elastic_net':
                loss += elastic_net_regularization(model)

            loss.backward()
            return loss

        loss = optimizer.step(closure)
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    test_scores = model(X_test)
    test_loss = softmax_loss(test_scores, y_test)
    print(f"\nFinal Test Loss: {test_loss.item():.4f}")

optimizer = optim.LBFGS(model.parameters(), lr=0.001)
print("\nTraining with L-BFGS:")
train_model_lbfgs(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

# L1 Regularization
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001)
print("\nTraining with L1 Regularization:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, regularization='l1')

# L2 Regularization
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001)
print("\nTraining with L2 Regularization:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, regularization='l2')

# Elastic Net Regularization
model = LinearClassifier(input_size, num_classes)
optimizer = optim.SGD(model.parameters(), lr=0.001)
print("\nTraining with Elastic Net Regularization:")
train_model(model, optimizer, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, regularization='elastic_net')

Training with Vanilla SGD:
Epoch 1, Loss: 2.3303
Epoch 2, Loss: 2.3237
Epoch 3, Loss: 2.3180
Epoch 4, Loss: 2.3130
Epoch 5, Loss: 2.3086
Epoch 6, Loss: 2.3047
Epoch 7, Loss: 2.3012
Epoch 8, Loss: 2.2981
Epoch 9, Loss: 2.2954
Epoch 10, Loss: 2.2929

Final Test Loss: 2.2919

Training with SGD + Momentum:
Epoch 1, Loss: 2.3595
Epoch 2, Loss: 2.3512
Epoch 3, Loss: 2.3382
Epoch 4, Loss: 2.3244
Epoch 5, Loss: 2.3129
Epoch 6, Loss: 2.3051
Epoch 7, Loss: 2.3007
Epoch 8, Loss: 2.2988
Epoch 9, Loss: 2.2978
Epoch 10, Loss: 2.2966

Final Test Loss: 2.2956

Training with SGD + Nesterov Momentum:
Epoch 1, Loss: 2.4054
Epoch 2, Loss: 2.3770
Epoch 3, Loss: 2.3470
Epoch 4, Loss: 2.3213
Epoch 5, Loss: 2.3029
Epoch 6, Loss: 2.2919
Epoch 7, Loss: 2.2868
Epoch 8, Loss: 2.2853
Epoch 9, Loss: 2.2849
Epoch 10, Loss: 2.2840

Final Test Loss: 2.2826

Training with RMSProp:
Epoch 1, Loss: 2.3448
Epoch 2, Loss: 9.3914
Epoch 3, Loss: 15.4589
Epoch 4, Loss: 15.5058
Epoch 5, Loss: 16.7541
Epoch 6, Loss: 16.2943
Epoc