In [1]:
# Importing necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    """Split a dataset into K-fold train/test partitions.

    Args:
        data: DataFrame-like object with a 'target' column; all remaining
            columns are used as features.
        num_folds: Number of folds, passed to ``KFold(n_splits=...)``.

    Returns:
        A list with one tuple per fold, in the order ``KFold.split`` yields
        them: ``(X_train, y_train, X_test, y_test, fold)``, where ``fold`` is
        the 1-based fold number.
    """
    features = data.drop('target', axis=1).values
    targets = data['target'].values
    splitter = KFold(n_splits=num_folds)

    # enumerate(..., start=1) supplies the 1-based fold number stored with each split.
    return [
        (features[train_idx], targets[train_idx],
         features[test_idx], targets[test_idx],
         fold_number)
        for fold_number, (train_idx, test_idx)
        in enumerate(splitter.split(features), start=1)
    ]

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number):
    """Train one model on a single fold and collect its evaluation metrics.

    Args:
        model_tuple: ``(model, name)`` pair. ``model`` must provide ``fit``,
            ``evaluate`` and ``predict`` accepting the keyword arguments used
            below (presumably a compiled Keras model -- confirm against
            ``model_data_definitions``).
        fold_data: ``(X_train, y_train, X_test, y_test, fold)`` tuple.
        epoch_number: Number of epochs passed to ``model.fit``.

    Returns:
        dict with the keys 'model', 'fold', 'train_history', 'val_history',
        'loss', 'r_squared_value' and 'test_error'.
    """
    model, name = model_tuple
    X_train, y_train, X_test, y_test, fold = fold_data

    # The held-out fold doubles as validation data, so 'val_history' tracks
    # the test-fold loss after every epoch.
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    # Loss of the trained model on the held-out fold.
    loss = model.evaluate(X_test, y_test, verbose=0)

    # verbose=0 matches fit/evaluate above: this function runs in several
    # threads at once and per-call progress output would interleave.
    predictions = model.predict(X_test, verbose=0)

    return {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r2_score(y_test, predictions),
        'test_error': mean_squared_error(y_test, predictions),
    }

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number=2):
    """Train and evaluate every model on one fold using a thread pool.

    Args:
        fold_data: ``(X_train, y_train, X_test, y_test, fold)`` tuple. The
            feature count (``X_train.shape[1]``) and the fold number are
            forwarded to ``initialize_all_models`` (the latter as ``seed_val``).
        dataset_name: Not used by this function; kept so the signature stays
            stable for existing callers.
        epoch_number: Training epochs per model. Defaults to 2, the value
            that used to be hard-coded here.

    Returns:
        List of result dicts from ``train_evaluate_model``, in the order the
        models were submitted.
    """
    X_train, fold = fold_data[0], fold_data[4]
    models = initialize_all_models(X_train.shape[1], seed_val=fold)
    compile_models(models)

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(train_evaluate_model, model, fold_data, epoch_number)
            for model in models
        ]
        # Collect in submission order; result() re-raises any worker exception.
        return [future.result() for future in futures]

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    """Run ``evaluate_models_parallel`` for every fold on a thread pool.

    Args:
        kfold_datasets: Iterable of per-fold data tuples, each handed
            unchanged to ``evaluate_models_parallel``.
        dataset_name: Forwarded to ``evaluate_models_parallel``.

    Returns:
        One flat list containing the results of all folds, ordered by fold
        submission order and, within a fold, by the order that fold returned.
    """
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(evaluate_models_parallel, fold, dataset_name)
            for fold in kfold_datasets
        ]
        # Flatten the per-fold lists while waiting on each job in turn.
        combined = [entry for job in pending for entry in job.result()]

    return combined

# Fetching data and running evaluations
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

num_folds = 4

# Build the K-fold splits for every dataset and keep each dataset's name
# (metadata column 'dataset') at the same index as its splits.
list_of_kfold_datasets = []
dataset_names = []
for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
    # iterrows() yields (index, row) pairs, so row[1] is the metadata row.
    dataset_name = row[1]['dataset']
    dataset_names.append(dataset_name)
    list_of_kfold_datasets.append(generate_cross_validation_dataset(dataset, num_folds))

# Only the first dataset is evaluated. Its name comes from the metadata rather
# than a hard-coded literal, so the label always matches the data passed in.
evaluate_all_folds_parallel(list_of_kfold_datasets[0], dataset_names[0])





[{'model': 'Linear Model',
  'fold': 1,
  'train_history': [1.9237008094787598, 1.8234695196151733],
  'val_history': [2.005307674407959, 1.887061357498169],
  'loss': 1.887061357498169,
  'r_squared_value': -4.044631305747247,
  'test_error': 5.18386312978587},
 {'model': 'Wide ReLU ANN',
  'fold': 1,
  'train_history': [1.0195107460021973, 0.5012112855911255],
  'val_history': [0.4753933846950531, 0.4933238923549652],
  'loss': 0.4933238923549652,
  'r_squared_value': 0.6282871426806469,
  'test_error': 0.3819721321813671},
 {'model': 'Deep ReLU ANN',
  'fold': 1,
  'train_history': [0.8225933909416199, 0.6690576076507568],
  'val_history': [0.7233116030693054, 0.6547183394432068],
  'loss': 0.6547183394432068,
  'r_squared_value': 0.2931374201190522,
  'test_error': 0.7263719870856618},
 {'model': 'One Parameter',
  'fold': 1,
  'train_history': [2.7675464153289795, 2.7385470867156982],
  'val_history': [2.7910003662109375, 2.762000799179077],
  'loss': 2.762000799179077,
  'r_squar

In [3]:
# Importing necessary modules
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    """Build the per-fold train/test arrays for K-fold cross-validation.

    Args:
        data: DataFrame-like object holding a 'target' column; the other
            columns form the feature matrix.
        num_folds: Value given to ``KFold(n_splits=...)``.

    Returns:
        List of ``(X_train, y_train, X_test, y_test, fold)`` tuples, one per
        split, with ``fold`` counting up from 1.
    """
    X = data.drop('target', axis=1).values
    y = data['target'].values

    def _slice(fold_number, train_rows, test_rows):
        # Pair the selected rows of X and y with the 1-based fold number.
        return (X[train_rows], y[train_rows], X[test_rows], y[test_rows], fold_number)

    index_pairs = KFold(n_splits=num_folds).split(X)
    return [
        _slice(fold_number, train_rows, test_rows)
        for fold_number, (train_rows, test_rows) in enumerate(index_pairs, start=1)
    ]

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name):
    """Train one model on a single fold and save its metrics to disk.

    Args:
        model_tuple: ``(model, name)`` pair. ``model`` must provide ``fit``,
            ``evaluate`` and ``predict`` accepting the keyword arguments used
            below (presumably a compiled Keras model -- confirm against
            ``model_data_definitions``).
        fold_data: ``(X_train, y_train, X_test, y_test, fold)`` tuple.
        epoch_number: Number of epochs passed to ``model.fit``.
        dataset_name: Used as the prefix of the output file name.

    Returns:
        None. The metrics dict is written to
        ``aggregate_results/{dataset_name}-{name}-fold{fold}.npy``.
    """
    model, name = model_tuple
    X_train, y_train, X_test, y_test, fold = fold_data

    # The held-out fold doubles as validation data, so 'val_history' tracks
    # the test-fold loss after every epoch.
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    # Loss of the trained model on the held-out fold.
    loss = model.evaluate(X_test, y_test, verbose=0)

    # verbose=0 matches fit/evaluate above: this function runs in several
    # threads at once and per-call progress output would interleave.
    predictions = model.predict(X_test, verbose=0)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r2_score(y_test, predictions),
        'test_error': mean_squared_error(y_test, predictions)}

    # exist_ok=True instead of an exists()-then-makedirs() check: with several
    # threads running this function, two of them can both see the directory
    # missing and the slower one would fail with FileExistsError.
    os.makedirs('aggregate_results', exist_ok=True)

    # np.save pickles the dict into a 0-d object array; reload it with
    # np.load(path, allow_pickle=True).item().
    np.save(f'aggregate_results/{dataset_name}-{name}-fold{fold}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number=2):
    """Train and evaluate every model on one fold using a thread pool.

    Args:
        fold_data: ``(X_train, y_train, X_test, y_test, fold)`` tuple. The
            feature count (``X_train.shape[1]``) and the fold number are
            forwarded to ``initialize_all_models`` (the latter as ``seed_val``).
        dataset_name: Forwarded to ``train_evaluate_model``.
        epoch_number: Training epochs per model. Defaults to 2, the value
            that used to be hard-coded here.

    Returns:
        None. Each ``train_evaluate_model`` task is only waited on.
    """
    X_train, fold = fold_data[0], fold_data[4]
    models = initialize_all_models(X_train.shape[1], seed_val=fold)
    compile_models(models)

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(train_evaluate_model, model, fold_data,
                            epoch_number, dataset_name)
            for model in models
        ]
        for future in futures:
            # result() blocks until the task finishes and re-raises any
            # exception from the worker, so failures are not silently lost.
            future.result()

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    """Run ``evaluate_models_parallel`` for every fold on a thread pool.

    Args:
        kfold_datasets: Iterable of per-fold data tuples, each handed
            unchanged to ``evaluate_models_parallel``.
        dataset_name: Forwarded to ``evaluate_models_parallel``.

    Returns:
        None. The function only waits for every fold job to finish.
    """
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(evaluate_models_parallel, fold, dataset_name)
            for fold in kfold_datasets
        ]
        # Wait on each job in submission order; result() surfaces worker errors.
        for job in pending:
            job.result()

# Fetching data and running evaluations
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

num_folds = 4

# Build the K-fold splits for every dataset and keep each dataset's name
# (metadata column 'dataset') at the same index as its splits.
list_of_kfold_datasets = []
dataset_names = []
for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
    # iterrows() yields (index, row) pairs, so row[1] is the metadata row.
    dataset_name = row[1]['dataset']
    dataset_names.append(dataset_name)
    list_of_kfold_datasets.append(generate_cross_validation_dataset(dataset, num_folds))

# Only the first dataset is evaluated. Its name ends up in the saved file
# names, so it is taken from the metadata rather than a hard-coded literal
# to keep the files labelled with the dataset that was actually used.
evaluate_all_folds_parallel(list_of_kfold_datasets[0], dataset_names[0])

