# Evolutionary Hyperparameter Optimization Experiments

This notebook implements the experimental workflow described in the project report and prepares a reproducible pipeline for benchmarking evolutionary algorithms against classical baselines on MNIST and CIFAR-10.

## 1. Setup and Reproducibility Controls

Configure the environment, ensure required packages are available, and define deterministic helpers.

In [14]:
import importlib
import itertools
import json
import math
import os
import random
import subprocess
import sys
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

REQUIRED_PACKAGES = [
    "torch",
    "torchvision",
    "deap",
    "pyswarms",
    "scipy",
    "numpy",
    "pandas",
    "matplotlib",
    "seaborn",
    "sklearn",
]

def ensure_package(pkg: str) -> None:
    try:
        importlib.import_module(pkg)
    except ImportError:
        print(f"Installing missing package: {pkg}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

for package in REQUIRED_PACKAGES:
    ensure_package(package)

import numpy as np
import pandas as pd
import seaborn as sns
import torch
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.metrics import accuracy_score
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import datasets, transforms

try:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
except AttributeError:
    pass

GLOBAL_SEEDS = [42, 123, 456]
RESULTS_DIR = Path("outputs")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def set_global_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    if hasattr(torch, "use_deterministic_algorithms"):
        torch.use_deterministic_algorithms(True, warn_only=True)


def get_device() -> torch.device:
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def format_seconds(seconds: float) -> str:
    if seconds < 60:
        return f"{seconds:.1f}s"
    minutes, secs = divmod(seconds, 60)
    if minutes < 60:
        return f"{int(minutes)}m {secs:.0f}s"
    hours, mins = divmod(minutes, 60)
    return f"{int(hours)}h {int(mins)}m"


@dataclass
class Timer:
    label: str
    start_time: float = time.time()
    elapsed: float = 0.0

    def __enter__(self) -> "Timer":
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.elapsed = time.time() - self.start_time
        print(f"[{self.label}] {format_seconds(self.elapsed)}")


device = get_device()
print(f"Using device: {device}")
for seed in GLOBAL_SEEDS:
    set_global_seed(seed)


Using device: mps


## 2. Data Loading and Preprocessing

Download datasets, apply transforms, and create reusable train/validation/test splits.

In [15]:
from collections import defaultdict

DATA_ROOT = Path("data")
DATA_ROOT.mkdir(parents=True, exist_ok=True)

TRANSFORMS = {
    "mnist": {
        "train": transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]),
        "eval": transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    },
    "cifar10": {
        "train": transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
        ]),
        "eval": transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
        ]),
    },
}

DATASETS: Dict[str, Dict[str, Dataset]] = {}
DATALOADER_CACHE: Dict[Tuple[str, int, int], Tuple[DataLoader, DataLoader, DataLoader]] = {}
DATASET_METADATA: Dict[str, Dict[str, Any]] = {}


def prepare_dataset(dataset_name: str, download: bool = True) -> None:
    dataset_name = dataset_name.lower()
    if dataset_name in DATASETS:
        return

    if dataset_name == "mnist":
        train_dataset = datasets.MNIST(
            root=DATA_ROOT,
            train=True,
            transform=TRANSFORMS["mnist"]["train"],
            download=download,
        )
        full_train_len = len(train_dataset)
        train_len = int(0.8 * full_train_len)
        val_len = int(0.1 * full_train_len)
        test_len = full_train_len - train_len - val_len
        generator = torch.Generator().manual_seed(2025)
        train_subset, val_subset, heldout_subset = torch.utils.data.random_split(
            train_dataset,
            lengths=[train_len, val_len, test_len],
            generator=generator,
        )
        eval_transform = TRANSFORMS["mnist"]["eval"]
        heldout_subset.dataset.transform = eval_transform
        val_subset.dataset.transform = eval_transform
        test_dataset = datasets.MNIST(
            root=DATA_ROOT,
            train=False,
            transform=eval_transform,
            download=download,
        )
    elif dataset_name == "cifar10":
        train_dataset = datasets.CIFAR10(
            root=DATA_ROOT,
            train=True,
            transform=TRANSFORMS["cifar10"]["train"],
            download=download,
        )
        full_train_len = len(train_dataset)
        train_len = int(0.8 * full_train_len)
        val_len = int(0.1 * full_train_len)
        test_len = full_train_len - train_len - val_len
        generator = torch.Generator().manual_seed(2025)
        train_subset, val_subset, heldout_subset = torch.utils.data.random_split(
            train_dataset,
            lengths=[train_len, val_len, test_len],
            generator=generator,
        )
        eval_transform = TRANSFORMS["cifar10"]["eval"]
        for subset in (val_subset, heldout_subset):
            subset.dataset.transform = eval_transform
        test_dataset = datasets.CIFAR10(
            root=DATA_ROOT,
            train=False,
            transform=eval_transform,
            download=download,
        )
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    DATASETS[dataset_name] = {
        "train": train_subset,
        "val": val_subset,
        "test": heldout_subset,
        "external_test": test_dataset,
    }
    DATASET_METADATA[dataset_name] = {
        "train_size": len(train_subset),
        "val_size": len(val_subset),
        "test_size": len(heldout_subset),
        "external_test_size": len(test_dataset),
    }


def build_dataloaders(
    dataset_name: str,
    batch_size: int,
    seed: int,
    num_workers: Optional[int] = None,
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    dataset_name = dataset_name.lower()
    prepare_dataset(dataset_name)
    key = (dataset_name, batch_size, seed)
    if key in DATALOADER_CACHE:
        return DATALOADER_CACHE[key]

    generator = torch.Generator().manual_seed(seed)
    num_workers = num_workers if num_workers is not None else min(4, os.cpu_count() or 1)

    train_subset = DATASETS[dataset_name]["train"]
    val_subset = DATASETS[dataset_name]["val"]
    test_subset = DATASETS[dataset_name]["test"]

    train_loader = DataLoader(
        train_subset,
        batch_size=batch_size,
        shuffle=True,
        generator=generator,
        num_workers=num_workers,
        pin_memory=device.type != "cpu",
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=device.type != "cpu",
    )
    test_loader = DataLoader(
        test_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=device.type != "cpu",
    )

    DATALOADER_CACHE[key] = (train_loader, val_loader, test_loader)
    return DATALOADER_CACHE[key]


metadata_frame = pd.DataFrame.from_dict(DATASET_METADATA, orient="index")
display(metadata_frame if not metadata_frame.empty else "Datasets will be prepared on demand.")

'Datasets will be prepared on demand.'

## 3. Model Architectures

Define adaptable PyTorch models for MNIST and CIFAR-10 driven by hyperparameter configurations.

In [16]:
class MNISTMLP(nn.Module):
    def __init__(self, hidden_layers: List[int], dropout: float):
        super().__init__()
        layers: List[nn.Module] = []
        input_dim = 28 * 28
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, 10))
        self.model = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x.view(x.size(0), -1)
        return self.model(x)


class CIFAR10CNN(nn.Module):
    def __init__(self, conv_channels: List[int], dropout: float):
        super().__init__()
        layers: List[nn.Module] = []
        in_channels = 3
        for out_channels in conv_channels:
            layers.extend(
                [
                    nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
                    nn.BatchNorm2d(out_channels),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=2),
                ]
            )
            if dropout > 0:
                layers.append(nn.Dropout2d(dropout))
            in_channels = out_channels
        self.features = nn.Sequential(*layers)
        feature_dim = conv_channels[-1] * (32 // (2 ** len(conv_channels))) ** 2
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(feature_dim, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(256, 10),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        return self.classifier(x)


PARAM_SPACE = [
    {"name": "learning_rate", "type": "continuous", "bounds": (1e-4, 5e-2), "scale": "log"},
    {"name": "batch_size", "type": "discrete", "choices": [32, 48, 64, 96, 128]},
    {"name": "dropout", "type": "continuous", "bounds": (0.0, 0.5)},
    {"name": "width_scale", "type": "continuous", "bounds": (0.5, 2.0)},
    {"name": "num_layers", "type": "discrete", "choices": [2, 3, 4]},
    {"name": "optimizer", "type": "categorical", "choices": ["adam", "sgd"]},
]


def clamp(value: float, lower: float, upper: float) -> float:
    return max(lower, min(value, upper))


def decode_param(param: Dict[str, Any], value: float) -> Any:
    if param["type"] == "continuous":
        lower, upper = param["bounds"]
        if param.get("scale") == "log":
            log_lower, log_upper = np.log10(lower), np.log10(upper)
            actual = 10 ** (log_lower + value * (log_upper - log_lower))
        else:
            actual = lower + value * (upper - lower)
        return float(actual)
    if param["type"] in {"discrete", "categorical"}:
        choices = param["choices"]
        index = int(round(clamp(value, 0.0, 1.0) * (len(choices) - 1)))
        return choices[index]
    raise ValueError(f"Unsupported parameter type: {param['type']}")


def encode_param(param: Dict[str, Any], value: Any) -> float:
    if param["type"] == "continuous":
        lower, upper = param["bounds"]
        if param.get("scale") == "log":
            log_lower, log_upper = np.log10(lower), np.log10(upper)
            return float((np.log10(value) - log_lower) / (log_upper - log_lower))
        return float((value - lower) / (upper - lower))
    if param["type"] in {"discrete", "categorical"}:
        choices = param["choices"]
        return float(choices.index(value) / (len(choices) - 1))
    raise ValueError(f"Unsupported parameter type: {param['type']}")


def decode_vector(vector: Iterable[float]) -> Dict[str, Any]:
    return {
        param["name"]: decode_param(param, clamp(v, 0.0, 1.0))
        for param, v in zip(PARAM_SPACE, vector)
    }


def encode_config(config: Dict[str, Any]) -> List[float]:
    return [encode_param(param, config[param["name"]]) for param in PARAM_SPACE]


def build_mnist_hidden_layers(num_layers: int, width_scale: float) -> List[int]:
    base_units = [256, 128, 64, 32]
    layers = [max(32, int(width_scale * base_units[i])) for i in range(num_layers)]
    return layers


def build_cifar_channels(num_layers: int, width_scale: float) -> List[int]:
    base_channels = [32, 64, 128, 256]
    channels = [int(width_scale * base_channels[i]) for i in range(num_layers)]
    channels = [max(16, (c // 8) * 8) for c in channels]
    return channels


def build_model(dataset_name: str, config: Dict[str, Any]) -> nn.Module:
    dataset_name = dataset_name.lower()
    dropout = float(config["dropout"])
    num_layers = int(config["num_layers"])
    width_scale = float(config["width_scale"])
    if dataset_name == "mnist":
        hidden_layers = build_mnist_hidden_layers(num_layers, width_scale)
        return MNISTMLP(hidden_layers=hidden_layers, dropout=dropout)
    if dataset_name == "cifar10":
        conv_channels = build_cifar_channels(num_layers, width_scale)
        return CIFAR10CNN(conv_channels=conv_channels, dropout=dropout)
    raise ValueError(f"Unsupported dataset: {dataset_name}")


def count_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


## 4. Baseline Search Strategies

Implement grid and random search orchestrators operating on the shared hyperparameter space.

In [17]:
TRAINING_CONFIG = {
    "epochs": 8,
    "patience": 3,
    "max_steps_per_epoch": None,  # Optionally cap batches per epoch for quick smoke tests
}

EVALUATION_CACHE: Dict[Tuple[str, Tuple[Tuple[str, Any], ...]], Dict[str, Any]] = {}
RESULTS_REGISTRY: List[Dict[str, Any]] = []
RECORDED_KEYS: set = set()


def config_to_key(dataset_name: str, config: Dict[str, Any]) -> Tuple[str, Tuple[Tuple[str, Any], ...]]:
    rounded_items = []
    for key, value in sorted(config.items()):
        if isinstance(value, float):
            rounded_items.append((key, round(value, 6)))
        else:
            rounded_items.append((key, value))
    return dataset_name.lower(), tuple(rounded_items)


def get_optimizer(model: nn.Module, config: Dict[str, Any]) -> torch.optim.Optimizer:
    learning_rate = float(config["learning_rate"])
    optimizer_name = config["optimizer"].lower()
    if optimizer_name == "adam":
        return torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    if optimizer_name == "sgd":
        return torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
    raise ValueError(f"Unsupported optimizer: {optimizer_name}")


def evaluate_model(model: nn.Module, loader: DataLoader, device: torch.device) -> Tuple[float, float]:
    model.eval()
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * targets.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)
    avg_loss = running_loss / max(1, total)
    accuracy = correct / max(1, total)
    return avg_loss, accuracy


def build_external_loader(dataset_name: str, batch_size: int, num_workers: Optional[int] = None) -> DataLoader:
    dataset_name = dataset_name.lower()
    prepare_dataset(dataset_name)
    dataset = DATASETS[dataset_name]["external_test"]
    num_workers = num_workers if num_workers is not None else min(4, os.cpu_count() or 1)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=device.type != "cpu",
    )


def train_single_seed(
    dataset_name: str,
    config: Dict[str, Any],
    seed: int,
    device: torch.device,
    training_config: Dict[str, Any],
) -> Dict[str, Any]:
    set_global_seed(seed)
    batch_size = int(config["batch_size"])
    train_loader, val_loader, test_loader = build_dataloaders(dataset_name, batch_size, seed)
    model = build_model(dataset_name, config).to(device)
    optimizer = get_optimizer(model, config)
    criterion = nn.CrossEntropyLoss()

    best_state = None
    best_val_acc = -np.inf
    best_epoch = -1
    epochs_without_improvement = 0
    history: List[Dict[str, float]] = []

    max_epochs = training_config["epochs"]
    patience = training_config["patience"]
    max_steps_per_epoch = training_config.get("max_steps_per_epoch")

    for epoch in range(1, max_epochs + 1):
        model.train()
        running_loss = 0.0
        total_examples = 0
        start_time = time.time()
        for step, (inputs, targets) in enumerate(train_loader, start=1):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad(set_to_none=True)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            running_loss += loss.item() * targets.size(0)
            total_examples += targets.size(0)
            if max_steps_per_epoch and step >= max_steps_per_epoch:
                break
        train_loss = running_loss / max(1, total_examples)
        val_loss, val_acc = evaluate_model(model, val_loader, device)
        test_loss, test_acc = evaluate_model(model, test_loader, device)
        epoch_duration = time.time() - start_time
        history.append(
            {
                "epoch": epoch,
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_acc": val_acc,
                "test_loss": test_loss,
                "test_acc": test_acc,
                "duration_sec": epoch_duration,
            }
        )

        improved = val_acc > best_val_acc + 1e-4
        if improved:
            best_val_acc = val_acc
            best_epoch = epoch
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if patience and epochs_without_improvement >= patience:
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    final_val_loss, final_val_acc = evaluate_model(model, val_loader, device)
    final_test_loss, final_test_acc = evaluate_model(model, test_loader, device)
    external_loader = build_external_loader(dataset_name, batch_size)
    ext_test_loss, ext_test_acc = evaluate_model(model, external_loader, device)

    return {
        "seed": seed,
        "history": history,
        "val_accuracy": final_val_acc,
        "val_loss": final_val_loss,
        "test_accuracy": final_test_acc,
        "test_loss": final_test_loss,
        "external_test_accuracy": ext_test_acc,
        "external_test_loss": ext_test_loss,
        "best_epoch": best_epoch,
        "state_dict": best_state,
    }


def evaluate_config(
    dataset_name: str,
    config: Dict[str, Any],
    method: str,
    training_config: Dict[str, Any],
) -> Dict[str, Any]:
    cache_key = config_to_key(dataset_name, config)
    if cache_key in EVALUATION_CACHE:
        return EVALUATION_CACHE[cache_key]

    seed_results = []
    wall_start = time.time()
    with Timer(label=f"{method} | {dataset_name} config evaluation"):
        for seed in GLOBAL_SEEDS:
            seed_result = train_single_seed(dataset_name, config, seed, device, training_config)
            seed_results.append(seed_result)
    total_runtime = time.time() - wall_start

    val_accs = [result["val_accuracy"] for result in seed_results]
    test_accs = [result["test_accuracy"] for result in seed_results]
    ext_accs = [result["external_test_accuracy"] for result in seed_results]

    summary = {
        "dataset": dataset_name,
        "method": method,
        "config": config,
        "val_accuracy_mean": float(np.mean(val_accs)),
        "val_accuracy_std": float(np.std(val_accs)),
        "test_accuracy_mean": float(np.mean(test_accs)),
        "test_accuracy_std": float(np.std(test_accs)),
        "external_test_accuracy_mean": float(np.mean(ext_accs)),
        "external_test_accuracy_std": float(np.std(ext_accs)),
        "runtime_sec": total_runtime,
        "seed_results": seed_results,
    }

    EVALUATION_CACHE[cache_key] = summary
    return summary


def record_result(result: Dict[str, Any]) -> None:
    global RECORDED_KEYS
    key = (result.get("method"), config_to_key(result["dataset"], result["config"]))
    if key in RECORDED_KEYS:
        return
    RECORDED_KEYS.add(key)
    RESULTS_REGISTRY.append(result)


GRID_VALUES = {
    "learning_rate": [1e-3, 5e-3],
    "batch_size": [48, 96],
    "dropout": [0.1, 0.3],
    "width_scale": [0.75, 1.25],
    "num_layers": [2, 3],
    "optimizer": ["adam", "sgd"],
}


def iter_grid_configs() -> Iterable[Dict[str, Any]]:
    keys = list(GRID_VALUES.keys())
    values = [GRID_VALUES[key] for key in keys]
    for combination in itertools.product(*values):
        yield {key: value for key, value in zip(keys, combination)}


def sample_random_config() -> Dict[str, Any]:
    config: Dict[str, Any] = {}
    for param in PARAM_SPACE:
        if param["type"] == "continuous":
            value = random.random()
            config[param["name"]] = decode_param(param, value)
        elif param["type"] in {"discrete", "categorical"}:
            config[param["name"]] = random.choice(param["choices"])
    return config


def run_grid_search(
    dataset_name: str,
    max_trials: Optional[int] = None,
    training_config: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    training_config = training_config or TRAINING_CONFIG
    records: List[Dict[str, Any]] = []
    for trial_id, config in enumerate(iter_grid_configs(), start=1):
        summary = evaluate_config(dataset_name, config, method="Grid", training_config=training_config)
        record_result({**summary, "trial_id": trial_id})
        records.append({**summary, "trial_id": trial_id})
        if max_trials and trial_id >= max_trials:
            break
    return pd.DataFrame(records)


def run_random_search(
    dataset_name: str,
    n_trials: int,
    training_config: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    training_config = training_config or TRAINING_CONFIG
    records: List[Dict[str, Any]] = []
    for trial_id in range(1, n_trials + 1):
        config = sample_random_config()
        summary = evaluate_config(dataset_name, config, method="Random", training_config=training_config)
        record_result({**summary, "trial_id": trial_id})
        records.append({**summary, "trial_id": trial_id})
    return pd.DataFrame(records)


## 5. Genetic Algorithm Optimization

Use DEAP to evolve hyperparameter vectors and evaluate them with the training loop.

In [28]:
from deap import base, creator, tools, algorithms

if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)


def clamp_individual(individual: List[float]) -> None:
    for i in range(len(individual)):
        individual[i] = float(clamp(individual[i], 0.0, 1.0))


def run_ga(
    dataset_name: str,
    population_size: int = 12,
    generations: int = 6,
    crossover_prob: float = 0.7,
    mutation_prob: float = 0.2,
    training_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    training_config = training_config or TRAINING_CONFIG

    toolbox = base.Toolbox()
    toolbox.register("attr_float", random.random)
    toolbox.register(
        "individual",
        tools.initRepeat,
        creator.Individual,
        toolbox.attr_float,
        n=len(PARAM_SPACE),
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    cache_hits: Dict[Tuple[str, Tuple[Tuple[str, Any], ...]], int] = defaultdict(int)
    trial_counter = {"count": 0}

    def evaluate_individual(individual: List[float]) -> Tuple[float]:
        clamp_individual(individual)
        config = decode_vector(individual)
        key = config_to_key(dataset_name, config)
        cache_hits[key] += 1
        summary = evaluate_config(dataset_name, config, method="GA", training_config=training_config)
        if cache_hits[key] == 1:
            trial_counter["count"] += 1
            record_result({**summary, "trial_id": trial_counter["count"]})
        return (summary["val_accuracy_mean"],)

    toolbox.register("evaluate", evaluate_individual)
    toolbox.register("mate", tools.cxBlend, alpha=0.2)
    toolbox.register("mutate", tools.mutGaussian, mu=0.0, sigma=0.2, indpb=0.5)
    toolbox.register("select", tools.selTournament, tournsize=3)

    population = toolbox.population(n=population_size)
    hof = tools.HallOfFame(5)

    stats = tools.Statistics(lambda ind: ind.fitness.values[0])
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "nevals", "avg", "std", "min", "max"]

    history_rows: List[Dict[str, Any]] = []

    invalid_individuals = [ind for ind in population if not ind.fitness.valid]
    fitnesses = list(toolbox.map(toolbox.evaluate, invalid_individuals))
    for ind, fit in zip(invalid_individuals, fitnesses):
        ind.fitness.values = fit

    hof.update(population)
    record = stats.compile(population)
    logbook.record(gen=0, nevals=len(invalid_individuals), **record)
    history_rows.append({"generation": 0, **record})

    for gen in range(1, generations + 1):
        offspring = tools.selTournament(population, len(population), tournsize=3)
        offspring = list(map(toolbox.clone, offspring))

        offspring = algorithms.varAnd(offspring, toolbox, cxpb=crossover_prob, mutpb=mutation_prob)
        for ind in offspring:
            clamp_individual(ind)

        invalid_individuals = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = list(toolbox.map(toolbox.evaluate, invalid_individuals))
        for ind, fit in zip(invalid_individuals, fitnesses):
            ind.fitness.values = fit

        population[:] = offspring
        hof.update(population)
        record = stats.compile(population)
        logbook.record(gen=gen, nevals=len(invalid_individuals), **record)
        history_rows.append({"generation": gen, **record})

    best_config = decode_vector(hof[0]) if len(hof) > 0 else None
    return {
        "population": population,
        "hall_of_fame": hof,
        "logbook": logbook,
        "history": pd.DataFrame(history_rows),
        "best_config": best_config,
    }

## 6. Differential Evolution Optimization

Wrap SciPy's differential evolution routine to search the normalized hyperparameter space.

In [19]:
from scipy.optimize import differential_evolution


def run_de(
    dataset_name: str,
    popsize: int = 15,
    max_iter: int = 8,
    mutation: Tuple[float, float] = (0.5, 1.0),
    recombination: float = 0.7,
    training_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    training_config = training_config or TRAINING_CONFIG
    bounds = [(0.0, 1.0)] * len(PARAM_SPACE)
    evaluation_records: List[Dict[str, Any]] = []
    best_tracker = {"score": -np.inf, "config": None}
    trial_counter = {"count": 0}

    def objective(vector: np.ndarray) -> float:
        vector = np.clip(vector, 0.0, 1.0)
        config = decode_vector(vector)
        key = config_to_key(dataset_name, config)
        was_cached = key in EVALUATION_CACHE
        summary = evaluate_config(dataset_name, config, method="DE", training_config=training_config)
        if not was_cached:
            trial_counter["count"] += 1
            record_result({**summary, "trial_id": trial_counter["count"]})
        score = summary["val_accuracy_mean"]
        evaluation_records.append(
            {
                "config": config,
                "score": score,
                "test_accuracy": summary["test_accuracy_mean"],
                "external_test_accuracy": summary["external_test_accuracy_mean"],
            }
        )
        if score > best_tracker["score"]:
            best_tracker["score"] = score
            best_tracker["config"] = config
        return -score

    history_rows: List[Dict[str, Any]] = []

    def callback(xk: np.ndarray, convergence: float) -> bool:
        history_rows.append(
            {
                "generation": len(history_rows) + 1,
                "best_score": best_tracker["score"],
                "convergence": convergence,
            }
        )
        return False

    result = differential_evolution(
        objective,
        bounds,
        strategy="best1bin",
        popsize=popsize,
        maxiter=max_iter,
        mutation=mutation,
        recombination=recombination,
        seed=GLOBAL_SEEDS[0],
        callback=callback,
        updating="deferred",
        workers=1,
        polish=False,
    )

    best_config = decode_vector(result.x)
    summary = evaluate_config(dataset_name, best_config, method="DE", training_config=training_config)
    record_result({**summary, "trial_id": trial_counter["count"] + 1})

    return {
        "result": result,
        "best_config": best_config,
        "history": pd.DataFrame(history_rows),
        "evaluations": pd.DataFrame(evaluation_records),
        "summary": summary,
    }


## 7. Particle Swarm Optimization

Configure a PSO loop (via PySwarms) to explore the normalized hyperparameter space with velocity clamping.

In [30]:

def run_pso(
    dataset_name: str,
    particles: int = 20,
    iterations: int = 10,
    inertia_start: float = 0.9,
    inertia_end: float = 0.5,
    cognitive: float = 1.6,
    social: float = 1.6,
    velocity_clamp: Tuple[float, float] = (-1.5, 1.5),
    training_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    training_config = training_config or TRAINING_CONFIG
    dim = len(PARAM_SPACE)
    seed_value = GLOBAL_SEEDS[1] if len(GLOBAL_SEEDS) > 1 else GLOBAL_SEEDS[0] if GLOBAL_SEEDS else None
    rng = np.random.default_rng(seed_value)
    positions = rng.random((particles, dim))
    velocities = rng.uniform(velocity_clamp[0], velocity_clamp[1], size=(particles, dim))
    personal_best_positions = positions.copy()
    personal_best_scores = np.full(particles, -np.inf)
    global_best_position = positions[0].copy()
    global_best_score = -np.inf
    trial_counter = {"count": 0}
    evaluation_records: List[Dict[str, Any]] = []
    history_rows: List[Dict[str, Any]] = []

    for iteration in range(1, iterations + 1):
        inertia = inertia_start + (inertia_end - inertia_start) * ((iteration - 1) / max(1, iterations - 1))
        for idx in range(particles):
            positions[idx] = np.clip(positions[idx], 0.0, 1.0)
            config = decode_vector(positions[idx])
            key = config_to_key(dataset_name, config)
            was_cached = key in EVALUATION_CACHE
            summary = evaluate_config(dataset_name, config, method="PSO", training_config=training_config)
            if not was_cached:
                trial_counter["count"] += 1
                record_result({**summary, "trial_id": trial_counter["count"]})
            score = summary["val_accuracy_mean"]
            evaluation_records.append(
                {
                    "iteration": iteration,
                    "particle": idx,
                    "config": config,
                    "score": score,
                    "test_accuracy": summary["test_accuracy_mean"],
                    "external_test_accuracy": summary["external_test_accuracy_mean"],
                }
            )
            if score > personal_best_scores[idx]:
                personal_best_scores[idx] = score
                personal_best_positions[idx] = positions[idx].copy()
            if score > global_best_score:
                global_best_score = score
                global_best_position = positions[idx].copy()

        r1 = rng.random((particles, dim))
        r2 = rng.random((particles, dim))
        velocities = (
            inertia * velocities
            + cognitive * r1 * (personal_best_positions - positions)
            + social * r2 * (global_best_position - positions)
        )
        velocities = np.clip(velocities, velocity_clamp[0], velocity_clamp[1])
        positions = np.clip(positions + velocities, 0.0, 1.0)
        history_rows.append(
            {
                "iteration": iteration,
                "inertia": inertia,
                "best_score": global_best_score,
            }
        )

    best_config = decode_vector(global_best_position)
    summary = evaluate_config(dataset_name, best_config, method="PSO", training_config=training_config)
    record_result({**summary, "trial_id": trial_counter["count"] + 1})

    return {
        "best_config": best_config,
        "history": pd.DataFrame(history_rows),
        "evaluations": pd.DataFrame(evaluation_records),
        "summary": summary,
    }

## 8. Unified Evaluation Pipeline

Aggregate experiment outputs and materialize accuracy/runtime tables aligned with the manuscript template.

In [31]:
EXPERIMENT_PLAN = {
    "mnist": {
        "grid_trials": 4,
        "random_trials": 6,
        "ga": {"population_size": 10, "generations": 4},
        "de": {"popsize": 12, "max_iter": 6},
        "pso": {"particles": 16, "iterations": 6},
    },
    "cifar10": {
        "grid_trials": 4,
        "random_trials": 6,
        "ga": {"population_size": 12, "generations": 5},
        "de": {"popsize": 14, "max_iter": 6},
        "pso": {"particles": 18, "iterations": 7},
    },
}


def results_dataframe() -> pd.DataFrame:
    if not RESULTS_REGISTRY:
        return pd.DataFrame()
    df = pd.json_normalize(RESULTS_REGISTRY)
    return df


def summarise_for_table(dataset_name: str) -> pd.DataFrame:
    df = results_dataframe()
    if df.empty:
        return pd.DataFrame()
    df = df[df["dataset"].str.lower() == dataset_name.lower()].copy()
    grouped = (
        df.groupby("method")
        .agg(
            mean_accuracy=("test_accuracy_mean", "mean"),
            std_accuracy=("test_accuracy_mean", "std"),
            mean_runtime=("runtime_sec", "mean"),
        )
        .reset_index()
    )
    grouped["dataset"] = dataset_name
    grouped = grouped[["method", "dataset", "mean_accuracy", "std_accuracy", "mean_runtime"]]
    grouped.sort_values(by="mean_accuracy", ascending=False, inplace=True)
    return grouped


def build_reporting_table() -> pd.DataFrame:
    frames = []
    for dataset_name in ["mnist", "cifar10"]:
        frames.append(summarise_for_table(dataset_name))
    table = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    table.rename(
        columns={
            "method": "Method",
            "dataset": "Dataset",
            "mean_accuracy": "Mean Accuracy",
            "std_accuracy": "Std Dev",
            "mean_runtime": "Runtime (sec)",
        },
        inplace=True,
    )
    return table


def run_experiment_suite(plan: Optional[Dict[str, Dict[str, Any]]] = None) -> None:
    plan = plan or EXPERIMENT_PLAN
    for dataset_name, config in plan.items():
        print(f"\n=== Running baseline searches for {dataset_name.upper()} ===")
        if config.get("grid_trials"):
            run_grid_search(dataset_name, max_trials=config["grid_trials"])
        if config.get("random_trials"):
            run_random_search(dataset_name, n_trials=config["random_trials"])

        print(f"\n=== Running evolutionary searches for {dataset_name.upper()} ===")
        ga_params = config.get("ga")
        if ga_params:
            run_ga(dataset_name, **ga_params)

        de_params = config.get("de")
        if de_params:
            run_de(dataset_name, **de_params)

        pso_params = config.get("pso")
        if pso_params:
            run_pso(dataset_name, **pso_params)

        print(f"Completed suite for {dataset_name}.")

    print("\nAll experiment suites completed. Build the reporting table once metrics are ready.")

## 9. Statistical Analysis and Visualization

Run Welch's t-tests with Holm–Bonferroni correction and produce convergence and distribution plots.

In [22]:
sns.set_theme(style="whitegrid")


def expand_seed_results() -> pd.DataFrame:
    rows: List[Dict[str, Any]] = []
    for record in RESULTS_REGISTRY:
        dataset = record["dataset"]
        method = record["method"]
        config = record["config"]
        trial_id = record.get("trial_id")
        for seed_entry in record.get("seed_results", []):
            rows.append(
                {
                    "dataset": dataset,
                    "method": method,
                    "trial_id": trial_id,
                    "seed": seed_entry["seed"],
                    "val_accuracy": seed_entry["val_accuracy"],
                    "test_accuracy": seed_entry["test_accuracy"],
                    "external_test_accuracy": seed_entry["external_test_accuracy"],
                    "best_epoch": seed_entry["best_epoch"],
                    "config": config,
                }
            )
    return pd.DataFrame(rows)


def welch_t_test(group1: Iterable[float], group2: Iterable[float]) -> Tuple[float, float]:
    statistic, pvalue = stats.ttest_ind(group1, group2, equal_var=False)
    return statistic, pvalue


def holm_bonferroni(pvalues: Dict[str, float], alpha: float = 0.05) -> pd.DataFrame:
    ordered = sorted(pvalues.items(), key=lambda item: item[1])
    m = len(ordered)
    results = []
    for rank, (label, pvalue) in enumerate(ordered, start=1):
        threshold = alpha / (m - rank + 1)
        reject = pvalue <= threshold
        results.append(
            {
                "comparison": label,
                "pvalue": pvalue,
                "threshold": threshold,
                "reject_H0": reject,
            }
        )
    return pd.DataFrame(results)


def compare_methods_with_baseline(
    dataset_name: str,
    baseline_method: str,
    candidate_methods: List[str],
    alpha: float = 0.05,
) -> pd.DataFrame:
    seed_df = expand_seed_results()
    if seed_df.empty:
        return pd.DataFrame()
    subset = seed_df[seed_df["dataset"].str.lower() == dataset_name.lower()]
    results: Dict[str, float] = {}
    for method in candidate_methods:
        baseline_scores = subset[subset["method"] == baseline_method]["test_accuracy"].values
        method_scores = subset[subset["method"] == method]["test_accuracy"].values
        if len(baseline_scores) == 0 or len(method_scores) == 0:
            continue
        statistic, pvalue = welch_t_test(method_scores, baseline_scores)
        results[f"{method} vs {baseline_method}"] = pvalue
    if not results:
        return pd.DataFrame()
    corrected = holm_bonferroni(results, alpha=alpha)
    return corrected


def plot_convergence(history: pd.DataFrame, metric: str, title: str) -> None:
    if history.empty:
        print("History is empty. Run the optimizer first.")
        return
    plt.figure(figsize=(8, 4))
    plt.plot(history[history.columns[0]], history[metric], marker="o")
    plt.title(title)
    plt.xlabel(history.columns[0].capitalize())
    plt.ylabel(metric.replace("_", " ").capitalize())
    plt.tight_layout()
    plt.show()


def plot_accuracy_distributions(dataset_name: str) -> None:
    seed_df = expand_seed_results()
    if seed_df.empty:
        print("No seed-level results available yet.")
        return
    subset = seed_df[seed_df["dataset"].str.lower() == dataset_name.lower()]
    plt.figure(figsize=(8, 4))
    sns.violinplot(data=subset, x="method", y="test_accuracy", inner="quartile", cut=0)
    plt.title(f"Test accuracy distributions for {dataset_name.upper()}")
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.show()


## 10. Experiment Logging and Export

Persist raw metrics, configuration metadata, and figures for inclusion in the report.

In [23]:
def export_results(output_dir: Path = RESULTS_DIR) -> Dict[str, Path]:
    output_dir.mkdir(parents=True, exist_ok=True)
    tables_dir = output_dir / "tables"
    raw_dir = output_dir / "raw"
    checkpoints_dir = output_dir / "checkpoints"
    figures_dir = output_dir / "figures"
    for directory in [tables_dir, raw_dir, checkpoints_dir, figures_dir]:
        directory.mkdir(parents=True, exist_ok=True)

    artefacts: Dict[str, Path] = {}
    results_df = results_dataframe()
    if not results_df.empty:
        csv_path = tables_dir / "all_results.csv"
        results_df.to_csv(csv_path, index=False)
        artefacts["all_results_csv"] = csv_path

        report_table = build_reporting_table()
        if not report_table.empty:
            table_path = tables_dir / "table_4_1_template.csv"
            report_table.to_csv(table_path, index=False)
            artefacts["report_table_csv"] = table_path

    snapshot_path = raw_dir / "results_registry.pt"
    torch.save(RESULTS_REGISTRY, snapshot_path)
    artefacts["registry_snapshot"] = snapshot_path

    metadata = []
    for record in RESULTS_REGISTRY:
        metadata.append(
            {
                key: value
                for key, value in record.items()
                if key not in {"seed_results"}
            }
        )
    metadata_path = raw_dir / "summary.json"
    with metadata_path.open("w") as fp:
        json.dump(metadata, fp, indent=2, default=str)
    artefacts["summary_json"] = metadata_path

    for record in RESULTS_REGISTRY:
        dataset = record["dataset"].lower()
        method = record["method"].lower().replace(" ", "_")
        trial_id = record.get("trial_id", "na")
        for seed_entry in record.get("seed_results", []):
            state_dict = seed_entry.get("state_dict")
            if state_dict is None:
                continue
            filename = f"{dataset}_{method}_trial{trial_id}_seed{seed_entry['seed']}.pt"
            checkpoint_path = checkpoints_dir / filename
            torch.save(state_dict, checkpoint_path)
    artefacts["checkpoints_dir"] = checkpoints_dir
    artefacts["figures_dir"] = figures_dir

    print("Export completed. Artefacts saved to:")
    for label, path in artefacts.items():
        print(f" - {label}: {path}")

    return artefacts


def save_figure(fig: plt.Figure, filename: str, figures_dir: Path = RESULTS_DIR / "figures") -> Path:
    figures_dir.mkdir(parents=True, exist_ok=True)
    path = figures_dir / filename
    fig.tight_layout()
    fig.savefig(path, dpi=300)
    print(f"Saved figure to {path}")
    return path


### Data Prefetch Utility

In [24]:
def ensure_data_availability(download: bool = True) -> pd.DataFrame:
    """Trigger dataset preparation and return the metadata frame."""
    DATASETS.clear()
    DATALOADER_CACHE.clear()
    DATASET_METADATA.clear()
    for name in ("mnist", "cifar10"):
        print(f"Preparing {name.upper()} (download={download})...")
        prepare_dataset(name, download=download)
    metadata_frame = pd.DataFrame.from_dict(DATASET_METADATA, orient="index")
    display(metadata_frame)
    return metadata_frame

### Usage Notes

1. Adjust `TRAINING_CONFIG` (epochs, patience, and optional `max_steps_per_epoch`) if you need faster smoke tests.
2. Use `run_smoke_test()` to grab datasets, execute a compact suite, and preview the reporting table before longer runs.
3. Update `EXPERIMENT_PLAN` to scale the search budget per dataset before calling `run_experiment_suite()`.
4. After experiments finish, call `build_reporting_table()` to populate the manuscript template and `export_results()` to persist metrics and checkpoints.

### Quick Smoke Test Runner

In [36]:
def run_smoke_test(override_plan: Optional[Dict[str, Dict[str, Any]]] = None) -> pd.DataFrame:
    """
    Run a compact experiment suite for validation.
    Downloads data if needed, trims training config for speed, and restores defaults afterwards.
    """
    ensure_data_availability(download=True)
    EVALUATION_CACHE.clear()
    RESULTS_REGISTRY.clear()
    RECORDED_KEYS.clear()

    plan = override_plan or {
        "mnist": {
            "random_trials": 1,
            "ga": None,
            "de": None,
            "pso": None,
        }
    }

    original_config = TRAINING_CONFIG.copy()
    original_seeds = GLOBAL_SEEDS.copy()
    try:
        TRAINING_CONFIG.update({"epochs": 1, "patience": 1, "max_steps_per_epoch": 25})
        GLOBAL_SEEDS.clear()
        GLOBAL_SEEDS.extend([original_seeds[0]])
        run_experiment_suite(plan)
        table = build_reporting_table()
        display(table)
        return table
    finally:
        TRAINING_CONFIG.update(original_config)
        GLOBAL_SEEDS.clear()
        GLOBAL_SEEDS.extend(original_seeds)



def run_full_benchmark(
    plan: Optional[Dict[str, Dict[str, Any]]] = None,
    training_override: Optional[Dict[str, Any]] = None,
    download: bool = False,
    seed_subset: Optional[List[int]] = None,
) -> pd.DataFrame:
    """Execute the full experiment suite with optional overrides and return the summary table."""
    ensure_data_availability(download=download)
    EVALUATION_CACHE.clear()
    RESULTS_REGISTRY.clear()
    RECORDED_KEYS.clear()

    original_config = TRAINING_CONFIG.copy()
    original_seeds = GLOBAL_SEEDS.copy()
    try:
        if seed_subset:
            GLOBAL_SEEDS.clear()
            GLOBAL_SEEDS.extend(seed_subset)
        if training_override:
            TRAINING_CONFIG.update(training_override)
        run_experiment_suite(plan or EXPERIMENT_PLAN)
        table = build_reporting_table()
        display(table)
        return table
    finally:
        TRAINING_CONFIG.update(original_config)
        GLOBAL_SEEDS.clear()
        GLOBAL_SEEDS.extend(original_seeds)



def generate_report_figures(
    table: Optional[pd.DataFrame] = None,
    output_dir: Path = RESULTS_DIR / "figures",
) -> Dict[str, Path]:
    """Create comparison figures for the report and return their file paths."""
    output_dir.mkdir(parents=True, exist_ok=True)
    if table is None or table.empty:
        table = build_reporting_table()
    figures: Dict[str, Path] = {}
    if table is None or table.empty:
        print("No aggregate results available. Run experiments first.")
        return figures

    accuracy_fig, ax = plt.subplots(figsize=(7, 4))
    sns.barplot(data=table, x="Dataset", y="Mean Accuracy", hue="Method", ax=ax)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Mean Accuracy")
    ax.set_title("Method Comparison by Dataset")
    figures["accuracy_comparison"] = save_figure(accuracy_fig, "accuracy_comparison.png", figures_dir=output_dir)
    plt.close(accuracy_fig)

    runtime_fig, ax = plt.subplots(figsize=(7, 4))
    sns.barplot(data=table, x="Dataset", y="Runtime (sec)", hue="Method", ax=ax)
    ax.set_ylabel("Runtime (sec)")
    ax.set_title("Runtime Comparison by Dataset")
    figures["runtime_comparison"] = save_figure(runtime_fig, "runtime_comparison.png", figures_dir=output_dir)
    plt.close(runtime_fig)

    seed_df = expand_seed_results()
    if not seed_df.empty:
        for dataset_name in sorted(seed_df["dataset"].str.lower().unique()):
            subset = seed_df[seed_df["dataset"].str.lower() == dataset_name]
            if subset.empty:
                continue
            fig, ax = plt.subplots(figsize=(7, 4))
            sns.violinplot(data=subset, x="method", y="test_accuracy", inner="quartile", cut=0, ax=ax)
            ax.set_xlabel("Method")
            ax.set_ylabel("Test Accuracy")
            ax.set_title(f"Test Accuracy Distribution - {dataset_name.upper()}")
            plt.xticks(rotation=30)
            filename = f"{dataset_name}_accuracy_distribution.png"
            figures[f"{dataset_name}_distribution"] = save_figure(fig, filename, figures_dir=output_dir)
            plt.close(fig)
    else:
        print("Seed-level results unavailable; skipping distribution plots.")

    return figures


run_smoke_test()

: 

In [35]:
full_results = run_full_benchmark(
    plan=EXPERIMENT_PLAN,
    training_override={"epochs": 3, "patience": 1, "max_steps_per_epoch": 150},
    download=False,
 )
full_results

Preparing MNIST (download=False)...
Preparing CIFAR10 (download=False)...


Unnamed: 0,train_size,val_size,test_size,external_test_size
mnist,48000,6000,6000,10000
cifar10,40000,5000,5000,10000



=== Running baseline searches for MNIST ===




[Grid | mnist config evaluation] 1m 28s
[Grid | mnist config evaluation] 1m 21s
[Grid | mnist config evaluation] 1m 27s
[Grid | mnist config evaluation] 1m 28s
[Random | mnist config evaluation] 1m 26s
[Random | mnist config evaluation] 1m 18s
[Random | mnist config evaluation] 1m 21s

=== Running evolutionary searches for MNIST ===
[GA | mnist config evaluation] 1m 21s
[GA | mnist config evaluation] 1m 9s
[GA | mnist config evaluation] 1m 21s
[GA | mnist config evaluation] 1m 20s
[GA | mnist config evaluation] 1m 17s
[GA | mnist config evaluation] 1m 19s
[GA | mnist config evaluation] 1m 17s
[GA | mnist config evaluation] 1m 20s
[GA | mnist config evaluation] 1m 15s
[GA | mnist config evaluation] 1m 18s
[GA | mnist config evaluation] 1m 18s
[GA | mnist config evaluation] 1m 20s
[GA | mnist config evaluation] 1m 19s
[GA | mnist config evaluation] 1m 20s
[GA | mnist config evaluation] 1m 15s
[GA | mnist config evaluation] 1m 19s
[GA | mnist config evaluation] 1m 22s
[GA | mnist config e

Traceback (most recent call last):
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    from multiprocessing.spawn import spawn_main; [31mspawn_main[0m[1;31m(tracker_fd=81, pipe_handle=106)[0m
                                                  [31m~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m122[0m, in [35mspawn_main[0m
    exitcode = _main(fd, parent_sentinel)
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m132[0m, in [35m_main[0m
    self = reduction.pickle.load(from_parent)
  File [35m"/Users/kaicho/Develop/comp815/Project_Report/.venv/lib/python3.13/site-packages/torch/__init__.py"[0m, line [35m54[0m, in [35m<module>[0m
    from torch._utils_internal import (
    ...<5 lines>...
    )
  

[DE | mnist config evaluation] 21.5s


KeyboardInterrupt: 