In [1]:
!pip install pandas numpy matplotlib pyarrow scikit-learn tkan temporal_linear_network scipy jax xlrd -U

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Run on: RYZEN 7900X (no cuda enable as model is small, GPU overhead is more important than training time)

In [2]:
# Standard library imports
import os
import time
from urllib.request import urlretrieve
from abc import ABC, abstractmethod

import jax
print(jax.devices())

# Set Keras backend
BACKEND = 'jax'  # You can use any backend here
os.environ['KERAS_BACKEND'] = BACKEND

# Third-party imports
import numpy as np
import pandas as pd
from scipy import stats, ndimage
from scipy.ndimage import zoom
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Keras imports
import keras
from keras import ops
from keras import layers
from keras import optimizers
from keras import callbacks
from keras.models import Sequential, Model
from keras.optimizers import Optimizer, Adam
from keras.datasets import fashion_mnist, cifar100, california_housing
from keras.layers import Dense, GlobalAveragePooling2D
from keras.applications import EfficientNetV2B0, MobileNetV2
from keras.applications.efficientnet_v2 import preprocess_input as efficientnet_v2_preprocess_input
from keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess


An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


[CpuDevice(id=0)]


In [3]:
N_MAX_EPOCHS = 1000
BATCH_SIZE = 128

def early_stopping_callback():
    return keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0.00001,
        patience=25,
        mode="min",
        restore_best_weights=True,
        start_from_epoch=12,
    )

def lr_callback():
    return keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.25,
        patience=8,
        mode="min",
        min_delta=0.00001,
        min_lr=0.000025,
        verbose=0,
    )

def get_callbacks():
    return [early_stopping_callback(), lr_callback(), keras.callbacks.TerminateOnNaN()]

In [4]:
class ScalingStrategy(ABC):
    def __init__(self, **kwargs):
        self.hyperparameters = kwargs

    @abstractmethod
    def compute_scaling_factor(self, connections, gradients, layer_info, model_info):
        raise NotImplementedError


class AdditiveMinMaxMedianConnectionScaling(ScalingStrategy):
    def compute_scaling_factor(self, connections, gradients, layer_info, model_info):
        scaling_factor = self.hyperparameters.get('scaling_factor', 0.95)
        if model_info['min_connections'] == model_info['max_connections']:
            return 1.0
        if connections <= model_info['median_connections']:
            return 1 + scaling_factor * (
                (model_info['median_connections'] - connections) / 
                (model_info['median_connections'] - model_info['min_connections'])
            )
        else:
            return 1 - scaling_factor * (
                (connections - model_info['median_connections']) / 
                (model_info['max_connections'] - model_info['median_connections'])
            )
            
    def get_config(self):
        return {"scaling_factor": self.hyperparameters.get('scaling_factor', 5.0)}

    @classmethod
    def from_config(cls, config):
        return cls(scaling_factor=config['scaling_factor'])

class MultiplicativeMinMaxMedianConnectionScaling(ScalingStrategy):
    def compute_scaling_factor(self, connections, gradients, layer_info, model_info):
        scaling_factor = self.hyperparameters.get('scaling_factor', 5.0)
        if model_info['min_connections'] == model_info['max_connections']:
            return 1.0
        
        # Normalize the connections to a range between -1 and 1
        if connections <= model_info['median_connections']:
            normalized = (model_info['median_connections'] - connections) / (model_info['median_connections'] - model_info['min_connections'])
        else:
            normalized = (connections - model_info['median_connections']) / (model_info['max_connections'] - model_info['median_connections'])
        
        # Use an exponential function to map the normalized value to a scaling factor
        # This will map -1 to 1/scaling_factor, 0 to 1, and 1 to scaling_factor
        return ops.exp(ops.log(scaling_factor) * normalized)

    def get_config(self):
        return {"scaling_factor": self.hyperparameters.get('scaling_factor', 5.0)}

    @classmethod
    def from_config(cls, config):
        return cls(scaling_factor=config['scaling_factor'])
        
class DepthConnectionScaling(ScalingStrategy):
    def compute_scaling_factor(self, connections, gradients, layer_info, model_info):
        scaling_factor = self.hyperparameters.get('scaling_factor', 1.0)
        return (1. + scaling_factor) ** ((model_info['total_depth'] - (1 + layer_info['depth'])) / model_info['total_depth'])

    def get_config(self):
        return {"scaling_factor": self.hyperparameters.get('scaling_factor', 5.0)}

    @classmethod
    def from_config(cls, config):
        return cls(scaling_factor=config['scaling_factor'])
        

class ConnectionAwareAdam(Optimizer):
    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        scaling_strategy=AdditiveMinMaxMedianConnectionScaling(scaling_factor=0.9),
        name="ConnectionAwareAdam",
        **kwargs
    ):
        super().__init__(learning_rate=learning_rate, name=name, **kwargs)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.amsgrad = amsgrad
        self.scaling_strategy = scaling_strategy
        self.connection_counts = {}
        self.min_connections = float('inf')
        self.max_connections = 0
        self.median_connections = 0
        self.model_info = {}

    def build(self, var_list):
        super().build(var_list)
        self._momentums = []
        self._velocities = []
        for var in var_list:
            self._momentums.append(
                self.add_variable_from_reference(reference_variable=var, name="m")
            )
            self._velocities.append(
                self.add_variable_from_reference(reference_variable=var, name="v")
            )
        if self.amsgrad:
            self._velocity_hats = []
            for var in var_list:
                self._velocity_hats.append(
                    self.add_variable_from_reference(reference_variable=var, name="vhat")
                )
        self._calculate_connection_counts(var_list)

    def _calculate_connection_counts(self, var_list):
        all_connections = []
        for var in var_list:
            if len(var.shape) > 1:  # Only consider variables with more than 1 dimension (i.e., not biases)
                connections = np.prod(var.shape)
                self.connection_counts[var.name] = connections
                all_connections.append(connections)

        if all_connections:
            self.min_connections = min(all_connections)
            self.max_connections = max(all_connections)
            self.median_connections = np.median(all_connections)
        else:
            self.min_connections = self.max_connections = self.median_connections = 1  # Default to avoid division by zero

        self.model_info = {
            'min_connections': self.min_connections,
            'max_connections': self.max_connections,
            'median_connections': self.median_connections,
            'total_depth': len(var_list),
        }

    def update_step(self, gradient, variable, learning_rate):
        lr = ops.cast(learning_rate, variable.dtype)
        gradient = ops.cast(gradient, variable.dtype)
        local_step = ops.cast(self.iterations + 1, variable.dtype)
        beta_1_power = ops.power(ops.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = ops.power(ops.cast(self.beta_2, variable.dtype), local_step)

        m = self._momentums[self._get_variable_index(variable)]
        v = self._velocities[self._get_variable_index(variable)]

        # Apply scaling strategy
        connections = self.connection_counts.get(variable.name, 0)
        layer_info = {
            'depth': self._get_variable_index(variable),
        }
        scaling_factor = self.scaling_strategy.compute_scaling_factor(
            connections, gradient, layer_info, self.model_info
        )

        alpha = lr * ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        alpha *= ops.cast(scaling_factor, variable.dtype)

        m.assign(self.beta_1 * m + (1 - self.beta_1) * gradient)
        v.assign(self.beta_2 * v + (1 - self.beta_2) * ops.square(gradient))

        if self.amsgrad:
            v_hat = self._velocity_hats[self._get_variable_index(variable)]
            v_hat.assign(ops.maximum(v_hat, v))
            v = v_hat

        variable.assign_sub(alpha * m / (ops.sqrt(v) + self.epsilon))

    def get_config(self):
        config = super().get_config()
        config.update({
            "beta_1": self.beta_1,
            "beta_2": self.beta_2,
            "epsilon": self.epsilon,
            "amsgrad": self.amsgrad,
            "scaling_strategy": self.scaling_strategy.__class__.__name__,
        })
        return config

In [8]:

def create_densenet_model(input_shape, num_classes):
    base_model = DenseNet121(input_shape=input_shape, include_top=False, weights=None)
    
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    predictions = layers.Dense(num_classes, activation='softmax')(x)
    
    model = Model(inputs=base_model.input, outputs=predictions)
    
    # Freeze the base model layers
    for layer in base_model.layers:
        layer.trainable = False
    
    return model

def create_mobilenet_model(input_shape, num_classes):
    base_model = MobileNetV3Small(input_shape=input_shape, include_top=False, weights=None)
    
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(512, activation='relu')(x)
    predictions = layers.Dense(num_classes, activation='softmax')(x)
    
    model = Model(inputs=base_model.input, outputs=predictions)
    
    # Unfreeze all layers for full retraining
    for layer in model.layers:
        layer.trainable = True
    
    return model

def create_mlp_model(input_shape, hidden_layers):
    model = Sequential()
    model.add(layers.Input(shape=input_shape))
    for units in hidden_layers:
        model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dense(1))  # For regression tasks
    return model

def create_mlp_model(input_shape, hidden_layers):
    model = keras.Sequential()
    model.add(layers.Input(shape=input_shape))
    for units in hidden_layers:
        model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dense(1))  # For regression tasks
    return model

def resize_image(image, target_size):
    zoom_factors = (target_size[0] / image.shape[0], target_size[1] / image.shape[1], 1)
    return ndimage.zoom(image, zoom_factors, order=1)


def load_dataset(name):
    if name == 'fashion_mnist':
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
        # Upscale to 56x56 (2x)
        x_train = np.array([zoom(img, (2, 2)) for img in x_train])
        x_test = np.array([zoom(img, (2, 2)) for img in x_test])
        # Add channel dimension and convert to RGB for MobileNetV3
        x_train = np.stack((x_train,)*3, axis=-1) / 255.
        x_test = np.stack((x_test,)*3, axis=-1) / 255.
        # Preprocess for MobileNetV2
        x_train = mobilenet_v3_preprocess_input(x_train)
        x_test = mobilenet_v3_preprocess_input(x_test)
    elif name == 'cifar100':
        (x_train, y_train), (x_test, y_test) = cifar100.load_data()
        x_train = x_train / 255.
        x_test = x_test/ 255.
        x_train = densenet_preprocess_input(x_train)
        x_test = densenet_preprocess_input(x_test)
    elif name == 'california':
        (x_train, y_train), (x_test, y_test) = california_housing.load_data()
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        # Scale target (important for regression tasks)
        scaler_y = StandardScaler()
        y_train = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
        y_test = scaler_y.transform(y_test.reshape(-1, 1)).ravel()
        
    else:
        raise ValueError(f"Unknown dataset: {name}")
    
    return x_train, x_test, y_train, y_test

def train_and_evaluate(model, X_train, y_train, x_test, y_test, optimizer, run, is_classification=False):
    if is_classification:
        loss = 'sparse_categorical_crossentropy'
        metrics = ['accuracy']
    else:
        loss = 'mse'
        metrics = ['mae']

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    
    start_time = time.time()
    history = model.fit(
        X_train, y_train,
        epochs=N_MAX_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        callbacks=get_callbacks(),
        verbose=0
    )
    end_time = time.time()
    
    train_time = end_time - start_time
    epochs_to_converge = len(history.history['loss'])

    if is_classification:
        test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
        return {
            'run': run,
            'test_accuracy': test_acc,
            'test_loss': test_loss,
            'train_time': train_time,
            'epochs_to_converge': epochs_to_converge
        }
    else:
        test_mse, test_mae = model.evaluate(x_test, y_test, verbose=0)
        return {
            'run': run,
            'test_rmse': np.sqrt(test_mse),
            'test_mae': test_mae,
            'train_time': train_time,
            'epochs_to_converge': epochs_to_converge
        }

def run_benchmark(results = [], start_learning_rate=0.001):
    datasets = ['california']
    mlp_hidden_layers_configs = [[64, 32], [128, 64, 32], [128, 128, 64], [256, 128, 64, 32], [128, 128, 128, 128, 128], [1024, 256, 64, 16]]
    n_runs = 30

    keras_optimizers = [
        ('Adam', lambda: optimizers.Adam(learning_rate=start_learning_rate)),
        ('AdamW', lambda: optimizers.AdamW(learning_rate=start_learning_rate)),
        ('Adamax', lambda: optimizers.Adamax(learning_rate=start_learning_rate)),
        ('Nadam', lambda: optimizers.Nadam(learning_rate=start_learning_rate)),
         
    ]

    connection_aware_strategies = [
        'AdditiveMinMaxMedianConnectionScaling', 'MultiplicativeMinMaxMedianConnectionScaling', 'DepthConnectionScaling',
    ]

    for dataset_name in datasets:
        print(f'On dataset: {dataset_name}')
        X_train, X_test, y_train, y_test = load_dataset(dataset_name)
        
        if dataset_name == 'fashion_mnist':
            model_configs = [('MobileNetV3Small', create_mobilenet_model, (56, 56, 3), 10)]
            is_classification = True
        elif dataset_name == 'cifar100':
            model_configs = [('DenseNet121', create_densenet_model, (32, 32, 3), 100)]
            is_classification = True
        else:  # california
            model_configs = [('MLP', create_mlp_model, X_train.shape[1:], hidden_layers) for hidden_layers in mlp_hidden_layers_configs]
            is_classification = False

        for model_name, model_func, *model_args in model_configs:
            print(f'Using model: {model_name}')
            
            initial_model = model_func(*model_args)
            initial_weights = initial_model.get_weights()
            
            for run in range(n_runs):
                print(f'Run: {run + 1}/{n_runs}')
                
                # Keras optimizers
                for opt_name, opt_func in keras_optimizers:
                    model = keras.models.clone_model(initial_model)
                    model.set_weights(initial_weights)
                    optimizer = opt_func()
                    result = train_and_evaluate(model, X_train, y_train, X_test, y_test, optimizer, run, is_classification)
                    
                    result.update({
                        'dataset': dataset_name,
                        'model': model_name,
                        'optimizer': opt_name,
                        'model_args': model_args,
                        'scaling_strategy': 'N/A',
                        'scaling_factor': 'N/A'
                    })
                    print(result)
                    results.append(result)
                    del model

                # ConnectionAwareAdam
                for strategy_name in connection_aware_strategies:
                    model = keras.models.clone_model(initial_model)
                    model.set_weights(initial_weights)
                    
                    if strategy_name == 'AdditiveMinMaxMedianConnectionScaling':
                        strategy = AdditiveMinMaxMedianConnectionScaling()
                    elif strategy_name == 'MultiplicativeMinMaxMedianConnectionScaling':
                        strategy = MultiplicativeMinMaxMedianConnectionScaling()
                    elif strategy_name == 'DepthConnectionScaling':
                        strategy = DepthConnectionScaling()
                    
                    optimizer = ConnectionAwareAdam(learning_rate=start_learning_rate, scaling_strategy=strategy)
                    result = train_and_evaluate(model, X_train, y_train, X_test, y_test, optimizer, run, is_classification)
                    result.update({
                        'dataset': dataset_name,
                        'model': model_name,
                        'optimizer': 'ConnectionAwareAdam',
                        'scaling_strategy': strategy_name,
                        'model_args': model_args,
                    })
                    print(result)
                    results.append(result)
                    del model

    df_results = pd.DataFrame(results)
    return df_results


def perform_statistical_tests(df):
    df['model_args'] = df['model_args'].astype(str)
    grouped = df.groupby(['dataset', 'model_args', 'model', 'optimizer', 'scaling_strategy',])
        
    def get_metric_and_direction(dataset):
        if dataset in ['fashion_mnist', 'cifar100']:
            return 'test_accuracy', 1  # Higher is better
        elif dataset == 'california':
            return 'test_rmse', -1  # Lower is better
        else:
            raise ValueError(f"Unknown dataset: {dataset}")
    
    summary_list = []
    statistical_results = []
    
    for dataset in df['dataset'].unique():
        metric, improvement_direction = get_metric_and_direction(dataset)
        
        dataset_summary = grouped.agg({
            metric: ['mean', 'std'],
            'train_time': ['mean', 'std'],
            'epochs_to_converge': ['mean', 'std']
        }).reset_index()
        dataset_summary = dataset_summary[dataset_summary['dataset']==dataset]
        
        dataset_summary.columns = ['dataset', 'model_args', 'model', 'optimizer', 'scaling_strategy', 
                                   f'{metric}_mean', f'{metric}_std', 'time_mean', 'time_std', 
                                   'epochs_mean', 'epochs_std']
        summary_list.append(dataset_summary)
        
        for model in df['model'].unique():
            for model_args in df['model_args'].unique():
                adam_results = df[(df['dataset'] == dataset) & 
                                  (df['model'] == model) & 
                                  (df['optimizer'] == 'Adam') &
                                  (df['model_args'] == model_args)]
                
                if adam_results.empty:
                    continue
                
                adam_metric_mean = adam_results[metric].mean()
                adam_time_mean = adam_results['train_time'].mean()
                
                for optimizer in df['optimizer'].unique():
                    if optimizer == 'Adam':
                        continue
                    
                    if optimizer == 'ConnectionAwareAdam':
                        for strategy in df[df['optimizer'] == 'ConnectionAwareAdam']['scaling_strategy'].unique():
                            opt_results = df[(df['dataset'] == dataset) & 
                                             (df['model'] == model) & 
                                             (df['optimizer'] == 'ConnectionAwareAdam') & 
                                             (df['scaling_strategy'] == strategy) &
                                             (df['model_args'] == model_args)]
                            
                            if opt_results.empty:
                                continue
                            
                            opt_metric_mean = opt_results[metric].mean()
                            opt_time_mean = opt_results['train_time'].mean()
                            
                            metric_improvement = ((opt_metric_mean - adam_metric_mean) / adam_metric_mean) * 100 * improvement_direction
                            time_improvement = ((opt_time_mean - adam_time_mean) / adam_time_mean) * 100 * -1  # Negative because lower time is better
                            
                            metric_t_stat, metric_p_value = stats.ttest_ind(adam_results[metric], opt_results[metric])
                            time_t_stat, time_p_value = stats.ttest_ind(adam_results['train_time'], opt_results['train_time'])
                            
                            statistical_results.append({
                                'dataset': dataset,
                                'model': model,
                                'optimizer': 'ConnectionAwareAdam',
                                'scaling_strategy': strategy,
                                'model_args': model_args,
                                'metric': metric,
                                f'{metric}_improvement_%': metric_improvement,
                                f'{metric}_t_statistic': metric_t_stat,
                                f'{metric}_p_value': metric_p_value,
                                'time_improvement_%': time_improvement,
                                'time_t_statistic': time_t_stat,
                                'time_p_value': time_p_value
                            })
                    else:
                        opt_results = df[(df['dataset'] == dataset) & 
                                         (df['model'] == model) & 
                                         (df['optimizer'] == optimizer) &
                                         (df['model_args'] == model_args)]
                        
                        if opt_results.empty:
                            continue
                        
                        opt_metric_mean = opt_results[metric].mean()
                        opt_time_mean = opt_results['train_time'].mean()
                        
                        metric_improvement = ((opt_metric_mean - adam_metric_mean) / adam_metric_mean) * 100 * improvement_direction
                        time_improvement = ((opt_time_mean - adam_time_mean) / adam_time_mean) * 100 * -1  # Negative because lower time is better
                        
                        metric_t_stat, metric_p_value = stats.ttest_ind(adam_results[metric], opt_results[metric])
                        time_t_stat, time_p_value = stats.ttest_ind(adam_results['train_time'], opt_results['train_time'])
                        
                        statistical_results.append({
                            'dataset': dataset,
                            'model': model,
                            'optimizer': optimizer,
                            'scaling_strategy': 'N/A',
                            'model_args': model_args,
                            'metric': metric,
                            f'{metric}_improvement_%': metric_improvement,
                            f'{metric}_t_statistic': metric_t_stat,
                            f'{metric}_p_value': metric_p_value,
                            'time_improvement_%': time_improvement,
                            'time_t_statistic': time_t_stat,
                            'time_p_value': time_p_value
                        })
    
    summary = pd.concat(summary_list, axis=0).sort_values(['dataset', 'model_args', 'optimizer', 'scaling_strategy'])
    statistical_results = pd.DataFrame(statistical_results).sort_values(['dataset', 'model_args', 'optimizer', 'scaling_strategy'], ascending=[True, False, True, True])
    return summary, statistical_results


In [6]:
# Run the benchmark
results = []
results = run_benchmark(results=results)

# Save raw results to CSV
results.to_csv('benchmark_results.csv', index=False)


On dataset: california
Using model: MLP
Run: 1/30
{'run': 0, 'test_rmse': np.float64(0.4663915430558545), 'test_mae': 0.3136950135231018, 'train_time': 3.889787435531616, 'epochs_to_converge': 119, 'dataset': 'california', 'model': 'MLP', 'optimizer': 'Adam', 'model_args': [(8,), [64, 32]], 'scaling_strategy': 'N/A', 'scaling_factor': 'N/A'}
{'run': 0, 'test_rmse': np.float64(0.46482417922935637), 'test_mae': 0.3107568919658661, 'train_time': 13.68610405921936, 'epochs_to_converge': 489, 'dataset': 'california', 'model': 'MLP', 'optimizer': 'AdamW', 'model_args': [(8,), [64, 32]], 'scaling_strategy': 'N/A', 'scaling_factor': 'N/A'}
{'run': 0, 'test_rmse': np.float64(0.46877729018561665), 'test_mae': 0.3166363835334778, 'train_time': 5.871182680130005, 'epochs_to_converge': 197, 'dataset': 'california', 'model': 'MLP', 'optimizer': 'Adamax', 'model_args': [(8,), [64, 32]], 'scaling_strategy': 'N/A', 'scaling_factor': 'N/A'}
{'run': 0, 'test_rmse': np.float64(0.4651303569799942), 'test_m

In [9]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Perform statistical tests
summary, statistical_results = perform_statistical_tests(results)

# Save processed results to CSV
summary.to_csv('benchmark_summary.csv', index=False)
statistical_results.to_csv('benchmark_statistical_results.csv', index=False)

# Display results
print("Summary:")
display(summary)
print("\nStatistical Results:")
display(statistical_results)


Summary:


Unnamed: 0,dataset,model_args,model,optimizer,scaling_strategy,test_rmse_mean,test_rmse_std,time_mean,time_std,epochs_mean,epochs_std
0,california,"[(8,), [1024, 256, 64, 16]]",MLP,Adam,,0.441815,0.002265,14.837137,1.72826,72.5,8.807325
1,california,"[(8,), [1024, 256, 64, 16]]",MLP,AdamW,,0.441797,0.002627,14.822462,1.854127,72.833333,9.645736
2,california,"[(8,), [1024, 256, 64, 16]]",MLP,Adamax,,0.451686,0.001872,18.738402,3.549944,93.0,18.192458
3,california,"[(8,), [1024, 256, 64, 16]]",MLP,ConnectionAwareAdam,AdditiveMinMaxMedianConnectionScaling,0.439638,0.002908,14.226903,1.440883,69.0,7.027041
4,california,"[(8,), [1024, 256, 64, 16]]",MLP,ConnectionAwareAdam,DepthConnectionScaling,0.439627,0.002757,14.48597,1.68762,71.033333,9.049608
5,california,"[(8,), [1024, 256, 64, 16]]",MLP,ConnectionAwareAdam,MultiplicativeMinMaxMedianConnectionScaling,0.442952,0.003823,14.072293,1.11504,68.866667,5.817532
6,california,"[(8,), [1024, 256, 64, 16]]",MLP,Nadam,,0.440843,0.00289,15.845406,2.831015,75.766667,14.6539
7,california,"[(8,), [128, 128, 128, 128, 128]]",MLP,Adam,,0.454633,0.00335,6.845449,0.625387,61.333333,6.132634
8,california,"[(8,), [128, 128, 128, 128, 128]]",MLP,AdamW,,0.45348,0.003271,6.748562,0.548952,58.266667,5.457885
9,california,"[(8,), [128, 128, 128, 128, 128]]",MLP,Adamax,,0.463989,0.001838,8.143008,0.823123,72.6,8.302887



Statistical Results:


Unnamed: 0,dataset,model,optimizer,scaling_strategy,model_args,metric,test_rmse_improvement_%,test_rmse_t_statistic,test_rmse_p_value,time_improvement_%,time_t_statistic,time_p_value
0,california,MLP,AdamW,,"[(8,), [64, 32]]",test_rmse,0.057681,0.380163,0.7052128,11.481429,1.091489,0.2795713
1,california,MLP,Adamax,,"[(8,), [64, 32]]",test_rmse,-1.242696,-8.60245,6.040607e-12,-79.171039,-3.69036,0.0004962069
3,california,MLP,ConnectionAwareAdam,AdditiveMinMaxMedianConnectionScaling,"[(8,), [64, 32]]",test_rmse,0.614282,3.529204,0.0008233814,25.897255,3.001624,0.00395601
5,california,MLP,ConnectionAwareAdam,DepthConnectionScaling,"[(8,), [64, 32]]",test_rmse,0.408312,2.228688,0.02972452,16.791727,1.686074,0.09715443
4,california,MLP,ConnectionAwareAdam,MultiplicativeMinMaxMedianConnectionScaling,"[(8,), [64, 32]]",test_rmse,2.068059,13.300243,2.8945689999999995e-19,30.114795,3.123639,0.002787146
2,california,MLP,Nadam,,"[(8,), [64, 32]]",test_rmse,0.309219,1.928972,0.05863241,18.189179,2.008789,0.04922232
18,california,MLP,AdamW,,"[(8,), [256, 128, 64, 32]]",test_rmse,0.274515,2.194996,0.03218024,-1.024145,-0.39743,0.6925091
19,california,MLP,Adamax,,"[(8,), [256, 128, 64, 32]]",test_rmse,-1.517863,-15.586083,2.124299e-22,-32.517127,-6.585111,1.453267e-08
21,california,MLP,ConnectionAwareAdam,AdditiveMinMaxMedianConnectionScaling,"[(8,), [256, 128, 64, 32]]",test_rmse,1.153082,6.804279,6.24369e-09,4.994738,1.741974,0.08681412
23,california,MLP,ConnectionAwareAdam,DepthConnectionScaling,"[(8,), [256, 128, 64, 32]]",test_rmse,0.925293,6.670653,1.045372e-08,0.414596,0.146798,0.8838004
