In [11]:
# Importing necessary modules
import gc
import copy
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train, X_test = preprocess_data(X_train, X_test)
        y_train, y_test = preprocess_target_values(y_train, y_test)
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model

def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name,num_folds, parallel=False):
    # Create deep copies of input parameters
    model_tuple_local = copy.deepcopy(model_tuple)
    fold_data_local = copy.deepcopy(fold_data)
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    print(f"Evaluating{name} for {fold}")
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=1,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

   # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')
        
    np.save(f'aggregate_results/{dataset_name}-{name}-epochs-{epoch_number}-fold-{fold}-of-{num_folds}.npy', results)
    
    # Delete local variables
    del model, history, X_train, y_train, X_test, y_test , predictions, model_tuple_local, fold_data_local
     
    # Run garbage collector
    gc.collect()

# Function to evaluate models in parallel or using for loop
def evaluate_models_parallel(fold_data, dataset_name, epoch_number,num_folds, parallel=False):

    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

    if parallel:
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(train_evaluate_model,
                                       model,
                                       fold_data,
                                       epoch_number,
                                       dataset_name,
                                       num_folds,
                                       parallel): model for model in models}
            for future in futures:
                future.result()
    else:
        for model in models:
            train_evaluate_model(model,
                                 fold_data,
                                 epoch_number,
                                 dataset_name,
                                 num_folds)

# Function to evaluate all folds in parallel or using for loop
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds, parallel=False):
    
    if parallel:
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(evaluate_models_parallel, 
                                       fold_data, 
                                       dataset_name, 
                                       epoch_number,
                                       num_folds,
                                       parallel): fold_data for fold_data in kfold_datasets}
            for future in futures:
                future.result()
    else:
        for fold_data in kfold_datasets:
            evaluate_models_parallel(fold_data,
                                     dataset_name,
                                     epoch_number,
                                     num_folds)

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=10, epoch_number=101, parallel=False):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds, parallel)

In [12]:
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()
filtered_datasets_metadata

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
0,1027_ESL,488,4,0,0,4,continuous,9.0,0.099363,regression
2,1029_LEV,1000,4,0,0,4,continuous,5.0,0.111245,regression
3,1030_ERA,1000,4,0,0,4,continuous,9.0,0.031251,regression
5,1096_FacultySalaries,50,4,0,0,4,continuous,39.0,0.004063,regression
13,192_vineyard,52,2,0,0,2,continuous,19.0,0.040475,regression
23,228_elusage,55,2,0,0,2,continuous,52.0,0.000953,regression
25,230_machine_cpu,209,6,0,0,6,continuous,116.0,0.004906,regression
29,485_analcatdata_vehicle,48,4,0,0,4,continuous,47.0,0.000434,regression
32,519_vinnie,380,2,0,0,2,continuous,16.0,0.030146,regression
34,523_analcatdata_neavote,100,2,0,0,2,continuous,8.0,0.136914,regression


In [13]:
num_folds = 3

for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
    dataset_name = row[1]['dataset']
    if dataset_name == 'titanic':
        print(row)
        #print(dataset)
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        # Check if kfold_datasets contains valid numbers
        for dataset in kfold_datasets:
            X_train, y_train, X_test, y_test , fold = dataset
            if not (np.isfinite(X_train).all() and np.isfinite(y_train).all() and np.isfinite(X_test).all() and np.isfinite(y_test).all()):
                print(f"Dataset for fold {fold} contains invalid numbers.")
            else:
                print(f"Dataset for fold {fold} is valid.")
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number=101,num_folds=10, parallel=False)
        


(270, dataset                      titanic
n_instances                     2201
n_features                         3
n_binary_features                  0
n_categorical_features             0
n_continuous_features              3
endpoint_type             continuous
n_classes                        2.0
imbalance                   0.125266
task                      regression
Name: 270, dtype: object)
Dataset for fold 1 is valid.
Dataset for fold 2 is valid.
Dataset for fold 3 is valid.
INFO:tensorflow:Assets written to: ram://9e550aa0-71c5-4e33-afa2-b1dfde52958f/assets


FileNotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ram://ad00049a-8b8b-4a92-a23a-c94520fec503/variables/variables
 You may be trying to load on a different device from the computational device. Consider setting the `experimental_io_device` option in `tf.saved_model.LoadOptions` to the io_device such as '/job:localhost'.