In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import pmlb
from model_definitions import *
from concurrent.futures import ThreadPoolExecutor

def cross_val(models, data):
    """Run shuffled 10-fold cross-validation of every model on one dataset.

    Parameters
    ----------
    models : list
        Objects exposing ``fit(X, y)`` and ``predict(X)``. The collection is
        iterated once per fold, so it must be re-iterable.
    data : pandas.DataFrame
        Table with a ``'target'`` column; every other column is used as a feature.

    Returns
    -------
    dict
        Maps ``type(model).__name__`` to the list of mean absolute errors,
        appended in fold order and, within a fold, in model order.
    """
    features = data.drop('target', axis=1).values
    targets = data['target'].values

    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    results = {}

    for fit_rows, eval_rows in splitter.split(features):
        y_fit, y_eval = targets[fit_rows], targets[eval_rows]
        # The training and held-out folds are handed to preprocess_data separately.
        X_fit, X_eval = preprocess_data(features[fit_rows], features[eval_rows])

        for model in models:
            # NOTE(review): the same model object is fitted again in every fold;
            # whether fit() starts from scratch depends on the model class -- confirm.
            model.fit(X_fit, y_fit)
            fold_mae = mean_absolute_error(y_eval, model.predict(X_eval))
            # NOTE(review): models sharing a class name share one list here.
            results.setdefault(type(model).__name__, []).append(fold_mae)

    return results

def fetch_return_filtered_pmlb_data_sets(feature_limit=7, min_instances=500, max_instances=1000):
    """Select small, all-continuous PMLB regression datasets and download them.

    All PMLB-specific code lives here so the rest of the notebook only deals
    with plain DataFrames.

    Parameters
    ----------
    feature_limit : int, default 7
        Keep datasets with strictly fewer than this many features.
    min_instances : int, default 500
        Smallest number of rows to accept (inclusive).
    max_instances : int, default 1000
        Largest number of rows to accept (inclusive).

    Returns
    -------
    tuple
        ``(filtered_datasets, datasets)``: the matching rows of the PMLB summary
        table, and a list with one fetched dataset per row, in the same order.
    """
    # The summary table is read from the directory of the installed pmlb package.
    metadata_path = os.path.join(os.path.dirname(pmlb.__file__), 'all_summary_stats.tsv')
    metadata = pd.read_csv(metadata_path, sep='\t')

    filtered_datasets = metadata[
        (metadata['n_features'] < feature_limit) &
        (metadata['n_binary_features'] == 0) &
        (metadata['n_categorical_features'] == 0) &
        (metadata['n_continuous_features'] == metadata['n_features']) &
        (metadata['n_instances'] >= min_instances) &
        (metadata['n_instances'] <= max_instances) &
        (metadata['endpoint_type'] == 'continuous') &
        (metadata['task'] == 'regression')
    ]

    # Fetch by name on a thread pool; executor.map keeps the results in input
    # order, so datasets[i] belongs to the i-th row of filtered_datasets.
    with ThreadPoolExecutor() as executor:
        datasets = list(executor.map(pmlb.fetch_data, filtered_datasets['dataset']))

    return (filtered_datasets, datasets)

# Slow step (metadata read plus dataset downloads): run it once and reuse the result.
filtered_datasets, datasets = fetch_return_filtered_pmlb_data_sets()
print(filtered_datasets)

all_results = {}

# Each fetched dataset is paired with its (index, row) item from the summary table.
for row, dataset in zip(filtered_datasets.iterrows(), datasets):
    dataset_name = row[1]['dataset']
    print(f"Evaluating dataset: {dataset_name}")

    # A new set of models is built and compiled for every dataset.
    input_dimension = row[1]['n_features']
    models = initialize_all_models(input_dimension, seed_val=42)
    compile_models(models)

    results = cross_val(models, dataset)
    all_results.update({dataset_name: results})

print(all_results)


    dataset  n_instances  n_features  n_binary_features  \
2  1029_LEV         1000           4                  0   
3  1030_ERA         1000           4                  0   

   n_categorical_features  n_continuous_features endpoint_type  n_classes  \
2                       0                      4    continuous        5.0   
3                       0                      4    continuous        9.0   

   imbalance        task  
2   0.111245  regression  
3   0.031251  regression  
Evaluating dataset: 1029_LEV




Evaluating dataset: 1030_ERA






{'1029_LEV': {'Sequential': [2.022149314954877, 0.8057132032513619, 1.6944525875151157, 1.875617575980723, 0.5450097620487213, 0.7417305302619934, 1.9036535339802503, 0.5216325622797012, 0.571788524389267, 1.617690000012517, 0.49113801166415216, 0.4915307480096817, 1.7136510978639126, 0.4535551906377077, 0.5071004921197891, 1.7111542962491513, 0.4345022637397051, 0.4587698485702276, 1.4666826477646828, 0.42295899694785477, 0.42552731562405827, 1.659765855371952, 0.48245368907228114, 0.5074503801390529, 1.8418554875254631, 0.49294900462031366, 0.5011041653156281, 1.4888701567053795, 0.4120843033492565, 0.43688630871474743], 'LookupTableModel': [1.9143181284330786, 1.914218575283885, 1.904450712596299, 1.907915610789787, 1.9126935548431356, 1.9106763733894332, 1.8900008165091275, 1.889912162795663, 1.8839742811908946, 1.888671344574541, 1.9027824209444224, 1.8980593091173796, 1.8089309713989496, 1.8083669704943894, 1.8009339591918252, 1.8141076921334025, 1.8223370199324562, 1.81980241127

In [2]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import pmlb
from pmlb import fetch_data
from model_definitions import *

def compile_models(models, optimizer='adam', loss='mean_absolute_error'):
    """Call ``compile`` on each model with the given optimizer and loss.

    Parameters
    ----------
    models : iterable
        Objects exposing ``compile(optimizer=..., loss=...)``.
    optimizer : default ``'adam'``
        Forwarded unchanged as the ``optimizer`` keyword.
    loss : default ``'mean_absolute_error'``
        Forwarded unchanged as the ``loss`` keyword.

    Returns
    -------
    None
    """
    compile_kwargs = {'optimizer': optimizer, 'loss': loss}
    for net in models:
        net.compile(**compile_kwargs)


def preprocess_data(train_data, test_data):
    """Standardise both folds with training statistics, then apply a sigmoid.

    Parameters
    ----------
    train_data : numpy.ndarray
        Training features; the per-column mean and standard deviation are
        computed from this array only (``axis=0``).
    test_data : numpy.ndarray
        Held-out features, transformed with the training statistics.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_data, test_data)`` after ``(x - mean) / std`` followed by
        ``1 / (1 + exp(-x))``. The inputs are not modified in place.
    """
    bias = np.mean(train_data, axis=0)
    scale = np.std(train_data, axis=0)
    # A column that is constant in the training fold has std == 0; dividing by
    # it would yield NaN (0/0) for training rows and NaN/inf for test rows.
    # Dividing by 1 instead leaves such a column merely centred.
    scale = np.where(scale == 0, 1.0, scale)

    train_data = (train_data - bias) / scale
    test_data = (test_data - bias) / scale

    train_data = 1 / (1 + np.exp(-train_data))
    test_data = 1 / (1 + np.exp(-test_data))

    return train_data, test_data


def cross_val(models, dataset_name, n_splits=10):
    """Fetch a dataset by name and cross-validate every model on it.

    Parameters
    ----------
    models : list
        Objects exposing ``fit(X, y)`` and ``predict(X)``. The collection is
        iterated once per fold, so it must be re-iterable.
    dataset_name : str
        Name passed to ``fetch_data``; the returned table must contain a
        ``'target'`` column, and all other columns are used as features.
    n_splits : int, default 10
        Number of shuffled K-fold splits (the split seed is fixed at 42).

    Returns
    -------
    dict
        Maps ``type(model).__name__`` to the list of mean absolute errors,
        appended in fold order and, within a fold, in model order.
    """
    table = fetch_data(dataset_name)
    features = table.drop('target', axis=1).values
    targets = table['target'].values

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = {}

    for fit_rows, eval_rows in splitter.split(features):
        y_fit, y_eval = targets[fit_rows], targets[eval_rows]
        # Scaling statistics are taken from the training fold only.
        X_fit, X_eval = preprocess_data(features[fit_rows], features[eval_rows])

        for model in models:
            # NOTE(review): the same model object is fitted again in every fold;
            # whether fit() starts from scratch depends on the model class -- confirm.
            model.fit(X_fit, y_fit)
            fold_mae = mean_absolute_error(y_eval, model.predict(X_eval))
            # NOTE(review): models sharing a class name share one list here.
            results.setdefault(type(model).__name__, []).append(fold_mae)

    return results


def initialize_all_models(input_dimension: int,
                          seed_val: int,
                          output_dim: int = 1,
                          hidden_units_wide: int = 1000,
                          hidden_units_deep: int = 16,
                          hidden_layers: int = 8,
                          num_exps: int = 6) -> list:
    """Build the list of models to evaluate.

    Every constructor receives ``input_dim``, ``output_dim`` and ``seed``.
    The list starts with a linear model, a wide ReLU network, a deep ReLU
    network and a ``LookupTableModel`` with one partition, followed by a
    ``SplineANN``, a ``LookupTableModel`` and an ``ANNEXSpline`` for each
    partition count in 1, 2, 4, 8, 10.

    Parameters
    ----------
    input_dimension : int
        Passed as ``input_dim``.
    seed_val : int
        Passed as ``seed``.
    output_dim : int, default 1
        Passed as ``output_dim``.
    hidden_units_wide : int, default 1000
        ``hidden_units`` of the wide ReLU network.
    hidden_units_deep : int, default 16
        ``hidden_units`` of the deep ReLU network.
    hidden_layers : int, default 8
        ``hidden_layers`` of the deep ReLU network.
    num_exps : int, default 6
        ``num_exps`` of every ``ANNEXSpline``.

    Returns
    -------
    list
        The constructed models, in the order described above.
    """
    shared = dict(input_dim=input_dimension, output_dim=output_dim, seed=seed_val)

    models = []
    models.append(create_linear_model(**shared))
    models.append(create_wide_relu_ann(hidden_units=hidden_units_wide, **shared))
    models.append(create_deep_relu_ann(hidden_units=hidden_units_deep,
                                       hidden_layers=hidden_layers,
                                       **shared))
    # NOTE(review): the loop below builds another LookupTableModel with
    # partition_num=1, so that configuration appears twice -- confirm intended.
    models.append(LookupTableModel(partition_num=1, default_val=-1., **shared))

    for n_parts in (1, 2, 4, 8, 10):
        models.extend([
            SplineANN(partition_num=n_parts, **shared),
            LookupTableModel(partition_num=n_parts, default_val=-1., **shared),
            ANNEXSpline(partition_num=n_parts, num_exps=num_exps, **shared),
        ])

    return models


# The summary-statistics TSV is read from the installed pmlb package directory.
metadata_path = os.path.join(os.path.dirname(pmlb.__file__), 'all_summary_stats.tsv')
metadata = pd.read_csv(metadata_path, sep='\t')

# Keep regression datasets with a continuous endpoint, fewer than 5 features
# (all of them continuous) and between 500 and 1000 rows inclusive.
filtered_datasets = metadata.loc[
    metadata['n_features'].lt(5)
    & metadata['n_binary_features'].eq(0)
    & metadata['n_categorical_features'].eq(0)
    & metadata['n_continuous_features'].eq(metadata['n_features'])
    & metadata['n_instances'].between(500, 1000)
    & metadata['endpoint_type'].eq('continuous')
    & metadata['task'].eq('regression')
]

all_results = {}

for _, row in filtered_datasets.iterrows():
    dataset_name = row['dataset']
    print(f"Evaluating dataset: {dataset_name}")

    # A new set of models is built and compiled for every dataset.
    input_dimension = row['n_features']
    models = initialize_all_models(input_dimension, seed_val=42)
    compile_models(models)

    results = cross_val(models, dataset_name)
    all_results.update({dataset_name: results})

print(all_results)

# Please wrap the filtering and data fetching in a separate function. I want the **data set** itself to be given as an input to the 
# cross_val function. Here is an example:

# Function to fetch a dataset by name
def fetch_dataset(row):
    """Fetch the dataset described by one ``DataFrame.iterrows()`` item.

    Parameters
    ----------
    row : tuple
        An ``(index, Series)`` pair as produced by ``iterrows()``; the Series
        must carry a ``'dataset'`` entry holding the dataset name.

    Returns
    -------
    The object returned by ``fetch_data(dataset_name)``.
    """
    # Look the name up by column label. An integer key such as row[1][0] on a
    # label-indexed Series relies on positional fallback, which pandas 2.x
    # deprecates, and it breaks silently if the column order changes.
    dataset_name = row[1]['dataset']
    data = fetch_data(dataset_name)
    return data

# Download every filtered dataset concurrently; map() keeps the results in row order
with ThreadPoolExecutor() as executor:
    datasets = [frame for frame in executor.map(fetch_dataset, filtered_datasets.iterrows())]

# 'datasets' now contains the datasets that meet your criteria. I want to encapsulate the PMLB-dependent code
# in one function to make my life easier if I want to use other data sets.




Evaluating dataset: 1029_LEV


Evaluating dataset: 1030_ERA


{'1029_LEV': {'Sequential': [2.0221621349453924, 0.7933430811762809, 1.6941675129532814, 1.874968957975507, 0.5584732055664062, 0.7524310970306396, 1.9036995647475123, 0.5179093956947327, 0.589298689365387, 1.618078442439437, 0.4877166260778904, 0.511274830698967, 1.7129278630018234, 0.44329070568084716, 0.49818726181983947, 1.7118319395929575, 0.4322559240460396, 0.45962959721684454, 1.4660055024921894, 0.4231223513931036, 0.4359669440239668, 1.6609746345877647, 0.4848525023832917, 0.5162886396795511, 1.8431587083637715, 0.49203173197805883, 0.5089119470492005, 1.490471440255642, 0.40738728769123556, 0.44686184383928773], 'LookupTableModel': [1.9142543955240399, 1.9142684667930008, 1.905387876444729, 1.9080076811497566, 1.9122485671692993, 1.9110672793351113, 1.8899396319687367, 1.8899350018054246, 1.8847575262491592, 1.8885278181289322, 1.9034409697019146, 1.8985536113928538, 1.8081036318838597, 1.8083110586553812, 1.8014922329195542, 1.8132911333837547, 1.8223285286518511, 1.8201774

In [None]:
# Function to fetch a dataset by name
def fetch_dataset(row):
    """Fetch the dataset described by one ``DataFrame.iterrows()`` item.

    Parameters
    ----------
    row : tuple
        An ``(index, Series)`` pair as produced by ``iterrows()``; the Series
        must carry a ``'dataset'`` entry holding the dataset name.

    Returns
    -------
    The object returned by ``fetch_data(dataset_name)``.
    """
    # Look the name up by column label. An integer key such as row[1][0] on a
    # label-indexed Series relies on positional fallback, which pandas 2.x
    # deprecates, and it breaks silently if the column order changes.
    dataset_name = row[1]['dataset']
    data = fetch_data(dataset_name)
    return data

# Download every filtered dataset concurrently; map() keeps the results in row order
with ThreadPoolExecutor() as executor:
    datasets = [frame for frame in executor.map(fetch_dataset, filtered_datasets.iterrows())]

# 'datasets' now holds one fetched dataset per row of filtered_datasets
