#CNN HPO with dCMA-ES, CMA-ES, BO

## 0.Load packages

In [1]:
!pip install cma
!pip install scikit-optimize

Collecting cma
  Downloading cma-4.0.0-py3-none-any.whl.metadata (8.0 kB)
Downloading cma-4.0.0-py3-none-any.whl (283 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/283.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m174.1/283.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.5/283.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cma
Successfully installed cma-4.0.0
Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.9.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.9.0-py3-non

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from cma import CMAEvolutionStrategy
from torch.utils.data import DataLoader, random_split
import time
import numpy as np
import torch
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

## 1. Load CIFAR-10 Dataset

In [None]:
# 1. Load CIFAR-10 Dataset
def load_data_train_test(batch_size=128):
    # Data preprocessing
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),  # Data augmentation
        transforms.RandomCrop(32, padding=4),  # Data augmentation
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))  # Normalize to mean/std of CIFAR-10
    ])

    full_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

    train_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [None]:
train_loader, test_loader = load_data_train_test()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 29.1MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


## 2. Define a simple convolutional neural network (CNN)

In [None]:
# CNN Model Definition
class SimpleCNN(nn.Module):
    def __init__(self, num_hidden=128, dropout=0, num_classes=10):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

## 3. Continuous CMA-ES

In [None]:
# Objective Function
def objective_function_continuous(params):
    """
    Objective function for CMA-ES optimization.
    params[0]: Learning rate (discrete: 0.00001 to 0.01)
    params[1]: Number of hidden units in the fully connected layer
    params[2]: Dropout rate (discrete: 0 to 0.6)
    """
    lr = params[0]
    num_hidden = int(params[1])
    dropout = params[2]

    # Model, Loss, Optimizer
    # model = SimpleCNN(num_hidden=num_hidden).cuda()
    model = SimpleCNN(num_hidden=num_hidden, dropout=dropout)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train for 10 epoch
    model.train()
    for epoch in range(10):
        # print(epoch)
        for inputs, labels in train_loader:
            # inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Return negative accuracy (for minimization)
    accuracy = correct / total
    print("Complete")
    return -accuracy

In [None]:
# CMA-ES Setup
initial_guess = [0.001, 256, 0.4]  # [learning_rate, num_neurons, dropout]
sigma = 1.0
bounds = [[0.00001, 64, 0.0],  # Lower bounds: learning_rate, num_neurons, dropout
          [0.01, 512,  0.6]]  # Upper bounds: learning_rate, num_neurons, dropout

es = CMAEvolutionStrategy(
        x0=initial_guess,  # Initial guess
        sigma0=sigma,           # Initial sampling standard deviation
        inopts={
            'bounds': bounds,  # Bounds
            'maxiter': 5,                         # Maximum iterations
            'popsize': 3,                          # Population size
            'tolx': 1e-6,                           # Convergence threshold for solution change
            'seed': 42                              # Reproducibility
        }
    )

(1,3mirr1)-aCMA-ES (mu_w=1.0,w_1=100%) in dimension 3 (seed=42, Mon Dec  2 05:38:43 2024)




In [None]:
start_time = time.time()
# Optimization loop
while not es.stop():
    solutions = es.ask()
    fitness = [objective_function_continuous(sol) for sol in solutions]
    es.tell(solutions, fitness)
    print(es.result)
    print(f"Current best fitness: {-es.result.fbest:.4f}")  # Accuracy is negative fitness
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken by CNN + CPU+ Continuous CMA-ES: {elapsed_time:.2f} seconds")

# Best hyperparameters
best_params = es.result.xbest
print("Best Parameters:", best_params)

CMAEvolutionStrategyResult(xbest=array([1.87331003e-03, 2.55861733e+02, 5.29537708e-01]), fbest=-0.67, evals_best=1, evaluations=3, iterations=1, xfavorite=array([1.87331003e-03, 2.55861733e+02, 5.29537708e-01]), stds=array([0.00266497, 0.81851309, 0.16670956]), stop={})
Current best fitness: 0.6700
CMAEvolutionStrategyResult(xbest=array([8.79603459e-04, 2.55579601e+02, 5.63067968e-01]), fbest=-0.6879, evals_best=4, evaluations=6, iterations=2, xfavorite=array([8.79603459e-04, 2.55579601e+02, 5.63067968e-01]), stds=array([0.0022367 , 0.64840036, 0.13923067]), stop={})
Current best fitness: 0.6879
CMAEvolutionStrategyResult(xbest=array([1.42286317e-03, 2.55346203e+02, 5.52035344e-01]), fbest=-0.6892, evals_best=9, evaluations=9, iterations=3, xfavorite=array([1.42286317e-03, 2.55346203e+02, 5.52035344e-01]), stds=array([0.00205133, 0.62387989, 0.14094156]), stop={})
Current best fitness: 0.6892
CMAEvolutionStrategyResult(xbest=array([1.42286317e-03, 2.55346203e+02, 5.52035344e-01]), fbe

In [None]:
es.result

CMAEvolutionStrategyResult(xbest=array([6.96013458e-04, 2.56260829e+02, 2.86534999e-01]), fbest=-0.7161, evals_best=13, evaluations=15, iterations=5, xfavorite=array([6.96013458e-04, 2.56260829e+02, 2.86534999e-01]), stds=array([0.00228127, 0.65727284, 0.16640399]), stop={'maxiter': 5})

## Discrete CMA-ES (dCMA-ES)

In [None]:
# Objective Function
def objective_function_discrete(params):
    """
    Objective function for CMA-ES optimization.
    params[0]: Learning rate (discrete: 0.00001 to 0.01)
    params[1]: Number of hidden units in the fully connected layer
    params[2]: Dropout rate (discrete: 0 to 0.6)
    """
    # Discretize parameters
    lr = [0.00001, 0.0001, 0.001, 0.01][int(round(params[0]))]
    num_hidden = [64, 128, 256, 512][int(round(params[1]))]
    dropout = [0, 0.2, 0.4, 0.6][int(round(params[2]))]

    # Model, Loss, Optimizer
    # model = SimpleCNN(num_hidden=num_hidden).cuda()
    model = SimpleCNN(num_hidden=num_hidden, dropout=dropout)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train for 10 epoch
    model.train()
    for epoch in range(10):
        # print(epoch)
        for inputs, labels in train_loader:
            # inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Return negative accuracy (for minimization)
    accuracy = correct / total
    return -accuracy

In [None]:
# CMA-ES Optimization
def run_cma_es():
    # Initial mean and sigma
    mean = [1.5, 1.5, 1.5]  # Approx middle of discrete ranges
    sigma = 1.0

    # Define bounds for parameters
    lower_bounds = [0, 0, 0]  # Discrete range start indices
    upper_bounds = [3, 3, 3]  # Discrete range end indices

    # CMA-ES Initialization
    es = CMAEvolutionStrategy(
        x0=mean,  # Initial guess
        sigma0=sigma,           # Initial sampling standard deviation
        inopts={
            'bounds': [lower_bounds, upper_bounds],  # Bounds
            'maxiter': 5,                         # Maximum iterations
            'popsize': 3,                          # Population size
            'tolx': 1e-6,                           # Convergence threshold for solution change
            'seed': 42                              # Reproducibility
        }
    )
    # Optimization loop
    while not es.stop():
        solutions = es.ask()
        fitness = [objective_function_discrete(sol) for sol in solutions]
        es.tell(solutions, fitness)
        print(es.result)
        print(f"Current best fitness: {-es.result.fbest:.4f}")  # Accuracy is negative fitness

    # Best result
    best_params = es.result.xbest
    best_lr = [0.00001, 0.0001, 0.001, 0.01][int(round(best_params[0]))]
    best_num_hidden = [64, 128, 256, 512][int(round(best_params[1]))]
    best_dropout = [0, 0.2, 0.4, 0.6][int(round(best_params[2]))]

    print("Best Hyperparameters:")
    print(f"Learning Rate: {best_lr}")
    print(f"Number of Hidden Units: {best_num_hidden}")
    print(f"Dropout Rate: {best_dropout}")

In [None]:
start_time = time.time()
run_cma_es()
end_time = time.time()
print(f"Total time taken discrete CMA-ES + CPU: {end_time - start_time} seconds")

(1,3mirr1)-aCMA-ES (mu_w=1.0,w_1=100%) in dimension 3 (seed=42, Mon Dec  2 14:12:14 2024)
CMAEvolutionStrategyResult(xbest=array([1.99671415, 1.36173339, 2.14771013]), fbest=-0.6995, evals_best=1, evaluations=3, iterations=1, xfavorite=array([1.99671415, 1.36173339, 2.14771013]), stds=array([0.80029006, 0.81851309, 0.83357557]), stop={})
Current best fitness: 0.6995
CMAEvolutionStrategyResult(xbest=array([1.95529706, 0.44479661, 0.19871161]), fbest=-0.7014, evals_best=5, evaluations=6, iterations=2, xfavorite=array([1.95529706, 0.44479661, 0.19871161]), stds=array([0.81733588, 0.87153666, 0.97069156]), stop={})
Current best fitness: 0.7014
CMAEvolutionStrategyResult(xbest=array([1.95529706, 0.44479661, 0.19871161]), fbest=-0.7014, evals_best=5, evaluations=9, iterations=3, xfavorite=array([1.52923168, 0.11048272, 0.65756864]), stds=array([0.87972284, 0.96393221, 1.        ]), stop={})
Current best fitness: 0.7014
CMAEvolutionStrategyResult(xbest=array([1.95529706, 0.44479661, 0.1987116

## Bayesian Optimization

In [None]:
# Define the search space for the hyperparameters
search_space = [
    Real(0.0001, 0.1, name='learning_rate'),
    Integer(64, 512, name='num_neurons'),
    Real(0.0, 0.6, name='dropout'),
]

In [None]:
# Run Bayesian Optimization
start_time = time.time()
result = gp_minimize(objective_function_continuous, search_space, n_calls=15, random_state=42)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken by CNN + CPU+ BO: {elapsed_time:.2f} seconds")

# Best Hyperparameters and Score
print("Best Hyperparameters:")
for name, value in zip([dim.name for dim in search_space], result.x):
    print(f"{name}: {value}")

print(f"Best Accuracy: {-result.fun:.4f}")

Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Complete
Time taken by CNN + CPU+ BO: 4867.62 seconds
Best Hyperparameters:
learning_rate: 0.0014445708192988278
num_neurons: 350
dropout: 0.01770426682237134
Best Accuracy: 0.7288


In [None]:
result

          fun: -0.7288
            x: [0.0014445708192988278, 350, 0.01770426682237134]
    func_vals: [-1.000e-01 -1.000e-01 ... -7.288e-01 -7.228e-01]
      x_iters: [[0.07967464438733729, 146, 0.46781460016366166], [0.059725330778854065, 264, 0.05998494949080174], [0.045978964307390145, 214, 0.08572009075316449], [0.06512375844759041, 89, 0.4331992633600949], [0.09386141563067348, 64, 0.5953269355747306], [0.061786402811808895, 338, 0.004239783131830445], [0.0024039362616374346, 299, 0.23991658302915334], [0.004761899755040182, 500, 0.13966280425818256], [0.0091515828098288, 341, 0.2294771947602977], [0.09832476549209816, 273, 0.5159642440417924], [0.00011863313541933089, 306, 0.5414215517196299], [0.0001, 302, 0.3310151499616592], [0.1, 301, 0.0], [0.0014445708192988278, 350, 0.01770426682237134], [0.0006547360999080961, 468, 0.12379459159192213]]
       models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
            