In [1]:
# Importing necessary modules
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name,num_folds):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-fold-{fold}-of-{num_folds}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number,num_folds):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, model, fold_data, epoch_number, dataset_name,num_folds): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, fold_data, dataset_name, epoch_number,num_folds): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)



In [3]:
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()
filtered_datasets_metadata

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
0,1027_ESL,488,4,0,0,4,continuous,9.0,0.099363,regression
2,1029_LEV,1000,4,0,0,4,continuous,5.0,0.111245,regression
3,1030_ERA,1000,4,0,0,4,continuous,9.0,0.031251,regression
5,1096_FacultySalaries,50,4,0,0,4,continuous,39.0,0.004063,regression
13,192_vineyard,52,2,0,0,2,continuous,19.0,0.040475,regression
23,228_elusage,55,2,0,0,2,continuous,52.0,0.000953,regression
25,230_machine_cpu,209,6,0,0,6,continuous,116.0,0.004906,regression
29,485_analcatdata_vehicle,48,4,0,0,4,continuous,47.0,0.000434,regression
32,519_vinnie,380,2,0,0,2,continuous,16.0,0.030146,regression
34,523_analcatdata_neavote,100,2,0,0,2,continuous,8.0,0.136914,regression


In [None]:
# Call the new function 
start_time = time.time()
retrieve_datasets_and_run_evaluations(num_folds=4, epoch_number=2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"The experiment took {elapsed_time} seconds to complete.")

In [4]:
# Importing necessary modules
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from model_data_definitions import *
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Function to generate cross validation dataset
def generate_cross_validation_dataset(data, num_folds):
    X, y = data.drop('target', axis=1).values, data['target'].values
    
    dataset_list = []
    kf = KFold(n_splits=num_folds)
    fold = 0

    # Splitting data into training and testing set for each fold in the cross-validation 
    for train_index, test_index in kf.split(X):
        fold += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train, X_test = preprocess_data(X_train, X_test)
        
        dataset_list.append((X_train, y_train, X_test, y_test , fold))
    
    return dataset_list

# Function to train and evaluate model
def train_evaluate_model(model_tuple, fold_data, epoch_number, dataset_name,num_folds):
    
    model, name = model_tuple
    X_train, y_train, X_test, y_test , fold = fold_data
    
    # Training the model 
    history = model.fit(X_train,
                        y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test,y_test))

     # Evaluating the trained model on test data 
    loss = model.evaluate(X_test,y_test)

     # Making predictions on the test data 
    predictions = model.predict(X_test)
    
     # Calculate metrics 
    r_squared_value=r2_score(y_true=y_test,y_pred=predictions)
    test_error=mean_squared_error(y_true=y_test,y_pred=predictions)

    results = {
        'model': name,
        'fold': fold,
        'train_history': history.history['loss'],
        'val_history': history.history['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    # Save results to numpy file
    if not os.path.exists('aggregate_results'):
        os.makedirs('aggregate_results')

    np.save(f'aggregate_results/{dataset_name}-{name}-epochs-{epoch_number}-fold-{fold}-of-{num_folds}.npy', results)

# Function to evaluate models in parallel
def evaluate_models_parallel(fold_data, dataset_name, epoch_number,num_folds):
    
    models = initialize_all_models(fold_data[0].shape[1], seed_val=fold_data[4])
    compile_models(models)

     # Training and evaluating all models in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(train_evaluate_model, 
                                   model, 
                                   fold_data, 
                                   epoch_number, 
                                   dataset_name,
                                   num_folds): model for model in models}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# Function to evaluate all folds in parallel
def evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds):
    
     # Evaluating all folds in parallel using ThreadPoolExecutor 
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(evaluate_models_parallel, 
                                   fold_data, 
                                   dataset_name, 
                                   epoch_number,
                                   num_folds): fold_data for fold_data in kfold_datasets}
        for future in futures:
            future.result()  # Just to make sure all tasks are finished

# New function to retrieve all datasets and their names and feed it to relevant functions with a for loop.
def retrieve_datasets_and_run_evaluations(num_folds=5, epoch_number=100):
    # Fetching data 
    filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()

    for dataset, row in zip(datasets, filtered_datasets_metadata.iterrows()):
        dataset_name = row[1]['dataset']
        kfold_datasets = generate_cross_validation_dataset(dataset, num_folds)
        evaluate_all_folds_parallel(kfold_datasets, dataset_name, epoch_number,num_folds)

In [5]:
filtered_datasets_metadata, datasets = fetch_return_filtered_pmlb_data_sets()
filtered_datasets_metadata

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
0,1027_ESL,488,4,0,0,4,continuous,9.0,0.099363,regression
2,1029_LEV,1000,4,0,0,4,continuous,5.0,0.111245,regression
3,1030_ERA,1000,4,0,0,4,continuous,9.0,0.031251,regression
5,1096_FacultySalaries,50,4,0,0,4,continuous,39.0,0.004063,regression
13,192_vineyard,52,2,0,0,2,continuous,19.0,0.040475,regression
23,228_elusage,55,2,0,0,2,continuous,52.0,0.000953,regression
25,230_machine_cpu,209,6,0,0,6,continuous,116.0,0.004906,regression
29,485_analcatdata_vehicle,48,4,0,0,4,continuous,47.0,0.000434,regression
32,519_vinnie,380,2,0,0,2,continuous,16.0,0.030146,regression
34,523_analcatdata_neavote,100,2,0,0,2,continuous,8.0,0.136914,regression


In [7]:
# Call the new function 
start_time = time.time()
retrieve_datasets_and_run_evaluations(num_folds=2, epoch_number=2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"The experiment took {elapsed_time} seconds to complete.")











TypeError: in user code:

    File "C:\ProgramData\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\ProgramData\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\ProgramData\anaconda3\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\ProgramData\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Heinrich\AppData\Local\Temp\__autograph_generated_filebpvx0xla.py", line 18, in tf__call
        scaled_input = ag__.converted_call(ag__.ld(tf).floor, (ag__.ld(inputs) * ag__.ld(self).partition_num,), None, fscope)

    TypeError: Exception encountered when calling layer "lookup_table_model" "                 f"(type LookupTableModel).
    
    in user code:
    
        File "C:\Users\Heinrich\Documents\GitHub\function_approx_tabular\model_data_definitions.py", line 217, in call  *
            scaled_input = tf.floor(inputs * self.partition_num)
    
        TypeError: Value passed to parameter 'x' has DataType int64 not in list of allowed values: bfloat16, float16, float32, float64
    
    
    Call arguments received by layer "lookup_table_model" "                 f"(type LookupTableModel):
      • inputs=tf.Tensor(shape=(None, 2), dtype=int64)
