In [1]:
# Importing necessary modules
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data
    loss = model.evaluate(X_test, y_test, verbose=0)

    # Making predictions on the test data
    predictions = model.predict(X_test)
    
    # Calculate metrics 
    r_squared_value = r2_score(y_test,predictions)
    test_error = mean_squared_error(y_test,predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-fold{fold}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, 2, dataset_name): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations():
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    num_folds = 4

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name)

# Call the new function 
retrieve_datasets_and_run_evaluations()






In [3]:
# Importing necessary modules
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-fold{fold}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, epoch_number, dataset_name): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name, epoch_number): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number)

# Call the new function 
retrieve_datasets_and_run_evaluations(num_folds=4, epoch_number=2)







In [1]:
# Importing necessary modules
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name,num_folds):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-fold-{fold}-of-{num_folds}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number,num_folds):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, epoch_number, dataset_name,num_folds): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name, epoch_number,num_folds): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)

# Call the new function 
retrieve_datasets_and_run_evaluations(num_folds=4, epoch_number=2)







In [1]:
# Importing necessary modules
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name,num_folds):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-fold-{fold}-of-{num_folds}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number,num_folds):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, epoch_number, dataset_name,num_folds): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name, epoch_number,num_folds): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)

# Call the new function 
start_time = time.time()
retrieve_datasets_and_run_evaluations(num_folds=4, epoch_number=2, parallel=True)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"The experiment took {elapsed_time} seconds to complete.")




The experiment took 1.6836485862731934 seconds to complete.


In [None]:
# The experiment took 162.88905668258667 seconds to complete.
# The experiment took 173.05344152450562 seconds to complete.

In [None]:
# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100, parallel=True):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    if parallel:
        # Use ThreadPoolExecutor to spawn separate threads for each dataset.
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(evaluate_all_folds_parallel, generate_cross_validation_dataset(dataset, num_folds), row[1]['dataset'], epoch_number,num_folds): dataset for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows())}
            for future in futures:
                future.result()
    else:
        # Use a simple for loop to process each dataset sequentially.
        for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
            dataset_name = row[1]['dataset']
            kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
            evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)

In [None]:
# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100, parallel=True):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    if parallel:
        # Use ProcessPoolExecutor to spawn separate processes for each dataset.
        with ProcessPoolExecutor() as executor:
            executor.map(evaluate_all_folds_parallel, 
                         [generate_cross_validation_dataset(dataset, num_folds) for dataset in datasets], 
                         [row[1]['dataset'] for row in filtered_datasets_metadata.iterrows()], 
                         [epoch_number]*len(datasets), 
                         [num_folds]*len(datasets))
    else:
        # Use a simple for loop to process each dataset sequentially.
        for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
            dataset_name = row[1]['dataset']
            kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
            evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)

In [None]:
# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100, parallel=True):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    if parallel:
        # Use ProcessPoolExecutor to spawn separate processes for each dataset.
        with ProcessPoolExecutor() as executor:
            all_experiments = []
            for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
                dataset_name = row[1]['dataset']
                kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
                experiment_params = (kfold_datasets, dataset_name, epoch_number,num_folds)
                all_experiments.append(experiment_params)
            
            executor.map(evaluate_all_folds_parallel,*zip(*all_experiments))
    else:
        # Use a simple for loop to process each dataset sequentially.
        for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
            dataset_name = row[1]['dataset']
            kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
            evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)
