In [1]:
# TODO: we need a way to give each model a unique name, or to save model outputs based on indices.

import json
import os
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

from model_data_definitions import *

# Expensive and slow operation, done once per kernel session.
# `fetch_return_filtered_pmlb_data_sets` is not imported by name; presumably it
# comes from the `model_data_definitions` star import -- TODO confirm.
# It returns two values: `filtered_datasets` (the table printed below, one row
# per dataset) and `datasets`, which a later cell iterates in step with the
# rows of `filtered_datasets`.
filtered_datasets, datasets = fetch_return_filtered_pmlb_data_sets() 
print(filtered_datasets)


    dataset  n_instances  n_features  n_binary_features  \
2  1029_LEV         1000           4                  0   
3  1030_ERA         1000           4                  0   

   n_categorical_features  n_continuous_features endpoint_type  n_classes  \
2                       0                      4    continuous        5.0   
3                       0                      4    continuous        9.0   

   imbalance        task  
2   0.111245  regression  
3   0.031251  regression  


In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
import multiprocessing as mp

def generate_cross_validation_dataset(data, num_folds):
    """Split ``data`` into ``num_folds`` K-fold train/test partitions.

    Args:
        data: Table exposing ``drop('target', axis=1)`` and ``['target']``
            (presumably a pandas DataFrame -- confirm against the caller).
        num_folds: Number of splits handed to ``KFold``.

    Returns:
        list of tuples ``(X_train, y_train, X_test, y_test, fold)``, one per
        split, where ``fold`` counts from 1.
    """
    features = data.drop('target', axis=1).values
    targets = data['target'].values

    splitter = KFold(n_splits=num_folds)

    # Build every fold's arrays in one pass; numbering starts at 1.
    return [
        (features[train_idx], targets[train_idx],
         features[test_idx], targets[test_idx],
         fold_number)
        for fold_number, (train_idx, test_idx)
        in enumerate(splitter.split(features), start=1)
    ]

def train_evaluate_model(params):
    """Fit one model on one fold and report its losses and test metrics.

    Args:
        params: A 3-tuple ``((model, name), fold_data, epoch_number)`` where
            ``fold_data`` is ``(X_train, y_train, X_test, y_test, fold)``.
            Everything is packed into one argument so the function can be
            handed to a ``map``-style caller.

    Returns:
        dict with the keys 'model', 'fold', 'train_history', 'val_history',
        'loss', 'r_squared_value' and 'test_error'.
    """
    (model, name), fold_data, epoch_number = params
    X_train, y_train, X_test, y_test, fold = fold_data

    # The held-out part of the fold is also used as validation data while fitting.
    fit_history = model.fit(
        X_train,
        y_train,
        epochs=epoch_number,
        verbose=0,
        validation_data=(X_test, y_test),
    )

    held_out_loss = model.evaluate(X_test, y_test, verbose=0)
    y_pred = model.predict(X_test)

    r_squared = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    return {
        'model': name,
        'fold': fold,
        'train_history': fit_history.history['loss'],
        'val_history': fit_history.history['val_loss'],
        'loss': held_out_loss,
        'r_squared_value': r_squared,
        'test_error': mse,
    }

def main(data=None, num_folds=5, epoch_number=100):
    """Cross-validate every model on ``data``, training in a process pool.

    Args:
        data: pandas DataFrame with the feature columns plus a 'target'
            column. When omitted, the original empty placeholder frame is
            used, which is rejected below with a descriptive ``KeyError``.
        num_folds: Number of K-fold splits.
        epoch_number: Training epochs per model.

    Returns:
        list: One result dict per (fold, model) pair, fold by fold.

    Raises:
        KeyError: If ``data`` has no 'target' column.
    """
    if data is None:
        # assuming some pandas dataframe here
        data = pd.DataFrame()

    # Fail early with an actionable message instead of the bare
    # "['target'] not found in axis" raised from inside DataFrame.drop.
    if 'target' not in data.columns:
        raise KeyError(
            "['target'] not found in axis: main() needs a DataFrame with a "
            "'target' column; pass one as main(data=...)")

    dataset_list = generate_cross_validation_dataset(data, num_folds)

    all_results = []

    # The context manager guarantees the worker processes are released even
    # when a training job raises.
    with mp.Pool(mp.cpu_count()) as pool:
        for fold_data in dataset_list:
            input_dim = fold_data[0].shape[1]

            # The fold number (last tuple element) doubles as the model seed.
            models = initialize_all_models(input_dim, seed_val=fold_data[-1])
            compile_models(models)

            job_params = [(model, fold_data, epoch_number) for model in models]

            # NOTE(review): pool.map pickles every (model, name) tuple to ship
            # it to a worker -- confirm the models built by
            # initialize_all_models survive pickling.
            all_results.extend(pool.map(train_evaluate_model, job_params))

    return all_results

# Run the driver only when this code executes as the main module.
if __name__ == "__main__":
    main()

KeyError: "['target'] not found in axis"

In [4]:
def generate_cross_validation_dataset(data, num_folds):
    """Produce the per-fold train/test arrays for K-fold cross-validation.

    Args:
        data: Table exposing ``drop('target', axis=1)`` and ``['target']``
            (presumably a pandas DataFrame -- confirm against the caller).
        num_folds: Number of splits handed to ``KFold``.

    Returns:
        list of ``(X_train, y_train, X_test, y_test, fold)`` tuples in split
        order; ``fold`` is 1 for the first split.
    """
    feature_matrix = data.drop('target', axis=1).values
    target_vector = data['target'].values

    folds = []
    split_pairs = KFold(n_splits=num_folds).split(feature_matrix)

    # enumerate(..., start=1) supplies the 1-based fold number.
    for fold_id, (train_rows, test_rows) in enumerate(split_pairs, start=1):
        folds.append((
            feature_matrix[train_rows],
            target_vector[train_rows],
            feature_matrix[test_rows],
            target_vector[test_rows],
            fold_id,
        ))

    return folds

def generate_cross_validation_models(input_dim, num_folds):
    """Build and compile one set of models per fold.

    Args:
        input_dim: Forwarded unchanged to ``initialize_all_models``.
        num_folds: How many model sets to create.

    Returns:
        list with ``num_folds`` entries; entry ``i`` is the object returned by
        ``initialize_all_models(input_dim, seed_val=i)`` after
        ``compile_models`` has been called on it (seeds run from 0).
    """
    def _build_compiled(seed):
        # One fresh model set per seed, compiled before it is handed back.
        fold_models = initialize_all_models(input_dim, seed_val=seed)
        compile_models(fold_models)
        return fold_models

    return [_build_compiled(seed) for seed in range(num_folds)]

def train_evaluate_model(model_tuple, fold_data, epoch_number):
    """Fit one model on one fold and collect its losses and test metrics.

    Args:
        model_tuple: ``(model, name)`` pair; ``model`` must provide ``fit``,
            ``evaluate`` and ``predict``.
        fold_data: ``(X_train, y_train, X_test, y_test, fold)`` tuple.
        epoch_number: Number of training epochs.

    Returns:
        dict with the keys 'model', 'fold', 'train_history', 'val_history',
        'loss', 'r_squared_value' and 'test_error'.
    """
    model, name = model_tuple
    X_train, y_train, X_test, y_test, fold = fold_data

    # The held-out part of the fold is also used as validation data while fitting.
    history = model.fit(X_train, y_train,
                        epochs=epoch_number,
                        verbose=0,
                        validation_data=(X_test, y_test))

    loss = model.evaluate(X_test, y_test, verbose=0)

    predictions = model.predict(X_test)
    r_squared_value = r2_score(y_test, predictions)
    test_error = mean_squared_error(y_test, predictions)

    # fit() hands back a History object; the per-epoch lists live in its
    # `.history` dict, so the object itself must not be subscripted.
    epoch_log = history.history

    results = {
        'model': name,
        'fold': fold,
        'train_history': epoch_log['loss'],
        'val_history': epoch_log['val_loss'],
        'loss': loss,
        'r_squared_value': r_squared_value,
        'test_error': test_error}

    return results

def cross_validation(data, epoch_number, num_folds):
    """Train and evaluate every model on every fold of ``data``.

    Args:
        data: Table with a 'target' column, as expected by
            ``generate_cross_validation_dataset``.
        epoch_number: Training epochs per model.
        num_folds: Number of K-fold splits.

    Returns:
        list of lists: one inner list per fold, holding the result dict of
        each model trained on that fold.
    """
    # Generate cross validation datasets
    cv_datasets = generate_cross_validation_dataset(data, num_folds)

    # Take the feature count from the first fold's training matrix; no
    # X_train exists in this function's scope.
    input_dim = cv_datasets[0][0].shape[1]
    cv_models = generate_cross_validation_models(input_dim, num_folds)

    all_results = []
    for fold_data, models in zip(cv_datasets, cv_models):
        # Keep each model's result instead of dropping it.
        all_results.append([
            train_evaluate_model(model_tuple, fold_data, epoch_number)
            for model_tuple in models
        ])

    return all_results

In [None]:
def cross_validation(data, epoch_number=100, num_folds=5):
    """Train and evaluate every model once on every fold of ``data``.

    Args:
        data: Table with a 'target' column, as expected by
            ``generate_cross_validation_dataset``.
        epoch_number: Training epochs per model (default 100).
        num_folds: Number of K-fold splits (default 5).

    Returns:
        list of lists: one inner list per fold, holding the result dict of
        each model trained on that fold.
    """
    # Generate cross validation datasets
    cv_datasets = generate_cross_validation_dataset(data, num_folds)

    # Read the feature count from the first fold's training matrix rather
    # than from a local X_train that has not been assigned yet.
    input_dim = cv_datasets[0][0].shape[1]

    # One compiled model set per fold, built a single time.
    cv_models = generate_cross_validation_models(input_dim, num_folds)

    all_results = []

    for fold_data, models in zip(cv_datasets, cv_models):
        # Train and evaluate each model on this fold's data, exactly once.
        fold_results = [
            train_evaluate_model(model_tuple, fold_data, epoch_number)
            for model_tuple in models
        ]
        all_results.append(fold_results)

    return all_results

# Cross-validation settings used for every dataset in the sweep.
EPOCH_NUMBER = 100
NUM_FOLDS = 5

all_results = {}

# `datasets` is walked in step with the rows of `filtered_datasets`;
# iterrows() yields (index, row) pairs, hence row[1].
for dataset, row in zip(datasets, filtered_datasets.iterrows()):
    dataset_name = row[1]['dataset']
    print(f"Evaluating dataset: {dataset_name}")

    # cross_validation requires the epoch and fold counts; pass them explicitly.
    results = cross_validation(dataset,
                               epoch_number=EPOCH_NUMBER,
                               num_folds=NUM_FOLDS)
    all_results[dataset_name] = results

# save results to a JSON file
with open('results.json', 'w') as f:
    json.dump(all_results, f)

In [None]:

# Scratch notes on the k-fold split.
# NOTE(review): `kf`, `X`, `fold`, `train_index` and `test_index` are not
# defined at notebook scope in the cells visible here, and `row` only exists
# as a leftover loop variable from the dataset loop -- confirm where these are
# meant to come from before running this cell.
# Materialise every (train_index, test_index) pair the splitter yields.
[(train_index, test_index) for train_index,test_index in kf.split(X)]

# NOTE(review): assigned but not used below; X_train.shape[1] is passed instead.
input_dimension = row[1]['n_features']
# Comprehension variables do not leak in Python 3, so the indices used here
# must already exist as globals for this line to run.
X_train, X_test = X[train_index], X[test_index]

models = initialize_all_models(X_train.shape[1], seed_val=fold) # first argument specifies the input dimension

In [None]:
def train_evaluate_model(model_name_tuple, data, indices, fold, epochs):
    