In [1]:
# we need a way to specify unique model name, or save model outputs based on indices.

import numpy as np
import os
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor


In [2]:
# expensive and slow operation done once.
filtered_datasets, datasets = fetch_return_filtered_pmlb_data_sets() 
print(filtered_datasets)

for dataset, row in zip(datasets, filtered_datasets.iterrows()):
    dataset_name = row[1]['dataset']
    print(f"Evaluating dataset: {dataset_name}")

    print(dataset)

    dataset  n_instances  n_features  n_binary_features  \
2  1029_LEV         1000           4                  0   
3  1030_ERA         1000           4                  0   

   n_categorical_features  n_continuous_features endpoint_type  n_classes  \
2                       0                      4    continuous        5.0   
3                       0                      4    continuous        9.0   

   imbalance        task  
2   0.111245  regression  
3   0.031251  regression  
Evaluating dataset: 1029_LEV
     In1  In2  In3  In4  target
0    4.0  2.0  3.0  0.0     3.0
1    3.0  3.0  0.0  3.0     3.0
2    2.0  4.0  1.0  0.0     2.0
3    2.0  1.0  2.0  3.0     2.0
4    2.0  3.0  4.0  2.0     2.0
..   ...  ...  ...  ...     ...
995  2.0  2.0  1.0  4.0     2.0
996  1.0  2.0  2.0  3.0     2.0
997  0.0  0.0  1.0  4.0     0.0
998  0.0  2.0  1.0  3.0     1.0
999  2.0  0.0  3.0  4.0     1.0

[1000 rows x 5 columns]
Evaluating dataset: 1030_ERA
      in1   in2   in3   in4  target
0    1

In [3]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor

def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

def train_evaluate_model(model_tuple, fold_data, epoch_number):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    history = model.fit(X_train, y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    loss = model.evaluate(X_test, y_test, verbose=0)

    predictions = model.predict(X_test)
    r_squared_value = r2_score(y_test,predictions)
    test_error = mean_squared_error(y_test,predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}
    
    return results

def evaluate_models_parallel(fold_data, dataset_name):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, 2): model for model in models}
        for future in futures:
            result = future.result()
            results.append(result)
            print(f'Trained {futures[future][1]} on {dataset_name}, fold {fold_data[4]}')
    
    return results

def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name): fold_data for fold_data in kfold_datasets}
        for future in futures:
            result = future.result()
            results.extend(result)
            print(f'Completed models evaluation on dataset {dataset_name}, fold {futures[future][4]}')
    
    return results

# expensive and slow operation done once.
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets() 
print(filtered_datasets_metadata)

num_folds = 10

list_of_kfold_datasets = []
for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
    dataset_name = row[1]['dataset']
    print(f"Evaluating dataset: {dataset_name}")
    print(dataset)
    list_of_kfold_datasets.append(generate_cross_validation_dataset(dataset, num_folds))
    
evaluate_all_folds_parallel(list_of_kfold_datasets[0], '1029_LEV')


    dataset  n_instances  n_features  n_binary_features  \
2  1029_LEV         1000           4                  0   
3  1030_ERA         1000           4                  0   

   n_categorical_features  n_continuous_features endpoint_type  n_classes  \
2                       0                      4    continuous        5.0   
3                       0                      4    continuous        9.0   

   imbalance        task  
2   0.111245  regression  
3   0.031251  regression  
Evaluating dataset: 1029_LEV
     In1  In2  In3  In4  target
0    4.0  2.0  3.0  0.0     3.0
1    3.0  3.0  0.0  3.0     3.0
2    2.0  4.0  1.0  0.0     2.0
3    2.0  1.0  2.0  3.0     2.0
4    2.0  3.0  4.0  2.0     2.0
..   ...  ...  ...  ...     ...
995  2.0  2.0  1.0  4.0     2.0
996  1.0  2.0  2.0  3.0     2.0
997  0.0  0.0  1.0  4.0     0.0
998  0.0  2.0  1.0  3.0     1.0
999  2.0  0.0  3.0  4.0     1.0

[1000 rows x 5 columns]
Evaluating dataset: 1030_ERA
      in1   in2   in3   in4  target
0    1

Trained Deep ReLU ANN on 1029_LEV, fold 4
Trained One Parameter on 1029_LEV, fold 4
Trained One Parameter on 1029_LEV, fold 7
Trained Deep ReLU ANN on 1029_LEV, fold 1
Trained One Parameter on 1029_LEV, fold 1
Trained Deep ReLU ANN on 1029_LEV, fold 8
Trained One Parameter on 1029_LEV, fold 8
Trained Deep ReLU ANN on 1029_LEV, fold 3
Trained Deep ReLU ANN on 1029_LEV, fold 2
Trained One Parameter on 1029_LEV, fold 2
Trained One Parameter on 1029_LEV, fold 6
Trained Deep ReLU ANN on 1029_LEV, fold 9
Trained One Parameter on 1029_LEV, fold 9
Trained Deep ReLU ANN on 1029_LEV, fold 5
Trained One Parameter on 1029_LEV, fold 5
Trained Deep ReLU ANN on 1029_LEV, fold 10
Trained One Parameter on 1029_LEV, fold 10
Trained Spline ANN (z=1) on 1029_LEV, fold 2
Trained Lookup Table (z=1) on 1029_LEV, fold 2
Trained Lookup Table (z=1) on 1029_LEV, fold 6
Trained Spline ANN (z=1) on 1029_LEV, fold 3
Trained Lookup Table (z=1) on 1029_LEV, fold 3
Trained Lookup Table (z=1) on 1029_LEV, fold 7
Traine

Trained Spline ANN (z=2) on 1029_LEV, fold 7
Trained Lookup Table (z=2) on 1029_LEV, fold 7
Trained ABEL-Spline (z=1) on 1029_LEV, fold 5
Trained Spline ANN (z=2) on 1029_LEV, fold 5
Trained Lookup Table (z=2) on 1029_LEV, fold 5
Trained ABEL-Spline (z=2) on 1029_LEV, fold 5
Trained Spline ANN (z=4) on 1029_LEV, fold 5
Trained Lookup Table (z=4) on 1029_LEV, fold 5
Trained ABEL-Spline (z=1) on 1029_LEV, fold 3
Trained Spline ANN (z=2) on 1029_LEV, fold 3
Trained Lookup Table (z=2) on 1029_LEV, fold 3
Trained Spline ANN (z=4) on 1029_LEV, fold 7
Trained Lookup Table (z=4) on 1029_LEV, fold 7
Trained ABEL-Spline (z=4) on 1029_LEV, fold 7
Trained Spline ANN (z=8) on 1029_LEV, fold 7
Trained Lookup Table (z=8) on 1029_LEV, fold 7
Trained ABEL-Spline (z=8) on 1029_LEV, fold 7
Trained Spline ANN (z=10) on 1029_LEV, fold 7
Trained Lookup Table (z=10) on 1029_LEV, fold 7
Trained Spline ANN (z=2) on 1029_LEV, fold 6
Trained Lookup Table (z=2) on 1029_LEV, fold 6
Trained ABEL-Spline (z=2) on 102

Completed models evaluation on dataset 1029_LEV, fold 4
Completed models evaluation on dataset 1029_LEV, fold 5
Trained ABEL-Spline (z=4) on 1029_LEV, fold 6
Trained Spline ANN (z=8) on 1029_LEV, fold 6
Trained Lookup Table (z=8) on 1029_LEV, fold 6
Trained ABEL-Spline (z=8) on 1029_LEV, fold 6
Trained Spline ANN (z=10) on 1029_LEV, fold 6
Trained Lookup Table (z=10) on 1029_LEV, fold 6
Trained ABEL-Spline (z=10) on 1029_LEV, fold 6
Completed models evaluation on dataset 1029_LEV, fold 6
Completed models evaluation on dataset 1029_LEV, fold 7
Completed models evaluation on dataset 1029_LEV, fold 8
Completed models evaluation on dataset 1029_LEV, fold 9
Completed models evaluation on dataset 1029_LEV, fold 10


[{'model': 'Linear Model',
  'fold': 1,
  'train_history': [1.9217804670333862, 1.8248345851898193],
  'val_history': [2.0039119720458984, 1.889233112335205],
  'loss': 1.889233112335205,
  'r_squared_value': -4.05973480331124,
  'test_error': 5.199383483882628},
 {'model': 'Wide ReLU ANN',
  'fold': 1,
  'train_history': [0.9965468049049377, 0.499619722366333],
  'val_history': [0.4966243803501129, 0.48317086696624756],
  'loss': 0.48317086696624756,
  'r_squared_value': 0.6376782748126593,
  'test_error': 0.37232180480251115},
 {'model': 'Deep ReLU ANN',
  'fold': 1,
  'train_history': [0.8652048110961914, 0.6868149042129517],
  'val_history': [0.7452402710914612, 0.6598928570747375],
  'loss': 0.6598928570747375,
  'r_squared_value': 0.2784967292839099,
  'test_error': 0.7414167609878539},
 {'model': 'One Parameter',
  'fold': 1,
  'train_history': [2.7675464153289795, 2.7385470867156982],
  'val_history': [2.7910003662109375, 2.762000799179077],
  'loss': 2.762000799179077,
  'r_sq

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor
from keras.models import Sequential
from keras.layers import Dense

def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    kf = KFold(n_splits=num_folds)
    dataset_list = [(X[train_index], y[train_index], X[test_index], y[test_index], fold+1) 
                    for fold, (train_index, test_index) in enumerate(kf.split(X))]
    
    return dataset_list

def train_evaluate_model(model_tuple, fold_data):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    history = model.fit(X_train, y_train,
                        epochs=2,
                        verbose=0,
                        validation_data=(X_test, y_test))

    loss = model.evaluate(X_test, y_test, verbose=0)

    predictions = model.predict(X_test)
    
    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r2_score(y_test,predictions),
        'test_error': mean_squared_error(y_test,predictions)}
    
    return results

def evaluate_models_parallel(fold_data):
    
    models = initialize_all_models(fold_data[0].shape[1])
    compile_models(models)
    
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data): model for model in models}
        results = [future.result() for future in futures]
        
    return results

def evaluate_all_folds_parallel(kfold_datasets):
    
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data): fold_data for fold_data in kfold_datasets}
        results = [future.result() for future in futures]
        
    return results

filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

num_folds = 10
list_of_kfold_datasets = [generate_cross_validation_dataset(dataset, num_folds) 
                          for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows())]

evaluate_all_folds_parallel(list_of_kfold_datasets[0])

TypeError: initialize_all_models() missing 1 required positional argument: 'seed_val'

In [1]:
# Importing necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    history = model.fit(X_train, y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    loss = model.evaluate(X_test, y_test)

    predictions = model.predict(X_test)
    r_squared_value = r2_score(y_test,predictions)
    test_error = mean_squared_error(y_test,predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}
    
    return results

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, 2): model for model in models}
        for future in futures:
            result = future.result()
            results.append(result)
    
    return results

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name): fold_data for fold_data in kfold_datasets}
        for future in futures:
            result = future.result()
            results.extend(result)
    
    return results

# Fetching data and running evaluations
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

num_folds = 10

list_of_kfold_datasets = []
for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
    dataset_name = row[1]['dataset']
    list_of_kfold_datasets.append(generate_cross_validation_dataset(dataset, num_folds))
    
evaluate_all_folds_parallel(list_of_kfold_datasets[0], '1029_LEV')

TypeError: initialize_all_models() missing 1 required positional argument: 'seed_val'

In [5]:
#evaluate_models_parallel(list_of_kfold_datasets[0][0], '1029_LEV')

In [32]:
from concurrent.futures import ThreadPoolExecutor




# usage
evaluate_all_folds_parallel(list_of_kfold_datasets[0], '1029_LEV')

Trained Linear Model on 1029_LEV, fold 2
Trained Linear Model on 1029_LEV, fold 1
Trained Linear Model on 1029_LEV, fold 3
Trained Wide ReLU ANN on 1029_LEV, fold 1
Trained Wide ReLU ANN on 1029_LEV, fold 2
Trained Wide ReLU ANN on 1029_LEV, fold 3
Trained Deep ReLU ANN on 1029_LEV, fold 3
Trained One Parameter on 1029_LEV, fold 3
Trained Deep ReLU ANN on 1029_LEV, fold 2
Trained One Parameter on 1029_LEV, fold 2
Trained Deep ReLU ANN on 1029_LEV, fold 1
Trained One Parameter on 1029_LEV, fold 1
Trained Spline ANN (z=1) on 1029_LEV, fold 3
 1/11 [=>............................] - ETA: 18sTrained Spline ANN (z=1) on 1029_LEV, fold 1
Trained Lookup Table (z=1) on 1029_LEV, fold 1
Trained Spline ANN (z=1) on 1029_LEV, fold 2
Trained Lookup Table (z=1) on 1029_LEV, fold 2
 1/11 [=>............................] - ETA: 29sTrained ABEL-Spline (z=1) on 1029_LEV, fold 1
Trained Spline ANN (z=2) on 1029_LEV, fold 1
Trained Lookup Table (z=2) on 1029_LEV, fold 1
Trained ABEL-Spline (z=1) on 1029_

[{'model': 'Linear Model',
  'fold': 1,
  'train_history': [1.945974588394165, 1.8677778244018555],
  'val_history': [1.914549708366394, 1.8404541015625],
  'loss': 1.8404541015625,
  'r_squared_value': -4.3533380471438905,
  'test_error': 5.168872563859969},
 {'model': 'Wide ReLU ANN',
  'fold': 1,
  'train_history': [1.158201813697815, 0.4991505444049835],
  'val_history': [0.6010774970054626, 0.5225627422332764],
  'loss': 0.5225627422332764,
  'r_squared_value': 0.49351350770430435,
  'test_error': 0.4890339655254219},
 {'model': 'Deep ReLU ANN',
  'fold': 1,
  'train_history': [0.8782848715782166, 0.7327370643615723],
  'val_history': [0.7937200665473938, 0.6971293091773987],
  'loss': 0.6971293091773987,
  'r_squared_value': 0.17112921698297268,
  'test_error': 0.800309528670175},
 {'model': 'One Parameter',
  'fold': 1,
  'train_history': [2.7603604793548584, 2.7393605709075928],
  'val_history': [2.7933714389801025, 2.772371530532837],
  'loss': 2.772371530532837,
  'r_squared_

In [None]:
from concurrent.futures import ProcessPoolExecutor

def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    
    results = []
    with ProcessPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name): fold_data for fold_data in kfold_datasets}
        for future in futures:
            result = future.result()
            results.extend(result)
            print(f'Completed models evaluation on dataset {dataset_name}, fold {futures[future][4]}')
    
    return results


# usage
evaluate_all_folds_parallel(list_of_kfold_datasets[0], '1029_LEV')

In [35]:
def evaluate_models_parallel(fold_data, dataset_name):
    try:
        models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
        compile_models(models)

        results = []
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(train_evaluate_model, model, fold_data): model for model in models}
            for future in futures:
                result = future.result()
                results.append(result)
                print(f'Trained {futures[future][1]} on {dataset_name}, fold {fold_data[4]}')

        return results
    except Exception as e:
        print(f"An error occurred in evaluate_models_parallel: {e}")
        
evaluate_all_folds_parallel(list_of_kfold_datasets[0], '1029_LEV')

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [4]:
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

def generate_cross_validation_models(input_dim, num_folds):
    model_lists = []
    for fold in range(num_folds):
        models = initialize_all_models(input_dim, seed_val=fold)
        compile_models(models)
        model_lists.append(models)
    return model_lists

def train_evaluate_model(model_tuple, fold_data, epoch_number):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    history = model.fit(X_train, y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    loss = model.evaluate(X_test, y_test, verbose=0)

    predictions = model.predict(X_test)
    r_squared_value = r2_score(y_test,predictions)
    test_error = mean_squared_error(y_test,predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history['loss'],
        'val_history': history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}
    
    return results    

def cross_validation(data, epoch_number, num_folds):
    
    # Generate cross validation datasets
    cv_datasets = generate_cross_validation_dataset(data, num_folds)
    cv_models = generate_cross_validation_models(X_train.shape[1], num_folds)
    
    for fold in range(num_folds):
        fold_data = cv_datasets[fold]
        models = cv_models[fold]
        for model_tuple in models:
            train_evaluate_model(model_tuple, fold_data, epoch_number)

In [None]:
def cross_validation(data, epoch_number, num_folds):
    
    # Generate cross validation datasets
    cv_datasets = generate_cross_validation_dataset(data, num_folds)
    cv_models = generate_cross_validation_models(X_train.shape[1], num_folds)
    
    for fold in range(num_folds):
        fold_data = cv_datasets[fold]
        models = cv_models[fold]
        for model_tuple in models:
            train_evaluate_model(model_tuple, fold_data, epoch_number)
            
            
    
    all_results = []
    
    for fold_data in cv_datasets:
        X_train, y_train, X_test, y_test , fold = fold_data
        
        # Generate models for this fold
        models = generate_cross_validation_models(X_train.shape[1], num_folds)
        
        fold_results = []
        
        # Train and evaluate each model on this fold's data
        for model_tuple in models:
            result = train_evaluate_model(model_tuple, fold_data, epoch_number)
            fold_results.append(result)
        
        all_results.append(fold_results)

    return all_results

all_results = {}

for dataset, row in zip(datasets, filtered_datasets.iterrows()):
    dataset_name = row[1]['dataset']
    print(f"Evaluating dataset: {dataset_name}")

    results = cross_validation(dataset)
    all_results[dataset_name] = results

# save results to a JSON file
with open('results.json', 'w') as f:
    json.dump(all_results, f)

In [None]:

# the k-fold
[(train_index, test_index) for train_index,test_index in kf.split(X)]

input_dimension = row[1]['n_features']
X_train, X_test = X[train_index], X[test_index]

models = initialize_all_models(X_train.shape[1], seed_val=fold) #input dimension specify

In [None]:
def train_evaluate_model(model_name_tuple, data, indices, fold, epochs):
    