In [1]:
from polynomial_regression import *
from GPR import *
from random_forest import *
import ast

# Notebook configuration: label names, which labels/model types to optimize,
# the dataset to load, and where optimization results are written.

# Every label (target) column name. Used further down in the notebook to strip
# the label columns out of the feature matrix when no explicit feature list is
# supplied.
all_labels = [
    'height', 'phi', 'theta',
    'impact site x', 'impact site y', 'impact site z',
    'impact site r', 'impact site phi', 'impact site theta',
]

# Deciding which features will be kept for the optimization.
# features_to_keep = ['sqrt(angle_btw)', 'sqrt(crack len)', 'sqrt(dist btw frts)', 'angle_btw (unchanged)', 'dist btw frts * mean_kink', 'angle_btw + crack len', 'angle_btw + dist btw frts', 'crack len + dist btw frts']

# Labels for which a model will be optimized.
labels_to_predict = ['height', 'impact site x', 'impact site y']
# labels_to_predict = ['height']

# Model types to optimize. Only 'poly2' is active; the alternatives are kept
# as comments so they can be switched back on without retyping them.
# model_types = ['GPR', 'RF', 'poly2']
# model_types = ['GPR']
model_types = ['poly2']

# Load the dataset from disk (the path points at the feature-transformed CSV;
# the un-engineered alternative is kept commented out).
# full_dataset_pathname = "/Volumes/Jake_ssd/Paper_1_results_no_feature_engineering/dataset/New_Crack_Len_FULL_OG_dataframe_2023_11_16.csv"
full_dataset_pathname = "/Volumes/Jake_ssd/Paper_1_results_WITH_feature_engineering/dataset/feature_transformations_2023-11-16/height/HEIGHTALL_TRANSFORMED_FEATURES.csv"
# NOTE(review): `pd` is never imported explicitly in this notebook; it is
# presumably re-exported by one of the star imports above -- confirm, and
# prefer an explicit `import pandas as pd`.
df = pd.read_csv(full_dataset_pathname)
# Drop the 'timestep_init' column when the dataset has one.
if 'timestep_init' in df.columns:
    df = df.drop('timestep_init', axis=1)

# Root folder under which per-label / per-model optimization results are saved.
bayesian_opt_results_dir = '/Volumes/Jake_ssd/bayesian_optimization'



In [2]:
# Build all_features_to_keep[label][model_type]: for every label / model type
# pair, the feature list whose mean score across the five test folds was the
# largest in the backward feature selection results.
all_features_to_keep = {}
backward_feat_selection_results_folder = '/Volumes/Jake_ssd/Paper_1_results_WITH_feature_engineering/results'
# NOTE(review): min_features / max_features are never read in this cell; the
# row filter below uses the literals 10 and 100 instead. Confirm which bounds
# are intended before wiring these in -- doing so would change which feature
# sets get selected.
min_features = 1
max_features = 25
for label in labels_to_predict:
    all_features_to_keep[label] = {}
    for model_type in model_types:
        performances_folder = backward_feat_selection_results_folder + f'/{label}/{model_type}/performances'
        performances = pd.read_csv(performances_folder + '/test_performances.csv')

        # Keep only rows whose 'Unnamed: 0' value lies in [10, 100] (inclusive).
        # Presumably that column holds the number of features of each row --
        # verify against the code that writes test_performances.csv.
        out_of_range = (performances['Unnamed: 0'] < 10) | (performances['Unnamed: 0'] > 100)
        performances = performances.drop(performances[out_of_range].index)
        if performances.empty:
            # idxmax() on an empty column would fail with a message that does
            # not say which label / model was being processed.
            raise ValueError(
                f'no rows left in test_performances.csv for label {label!r} and model type '
                f'{model_type!r} after keeping only rows with 10 to 100 features'
            )

        # Mean score over the five folds for each feature combination.
        performances['average'] = performances[['fold0', 'fold1', 'fold2', 'fold3', 'fold4']].mean(axis=1)
        # idxmax() returns an index *label*. The frame still carries the
        # default index assigned by read_csv (drop() does not renumber it), so
        # the label equals the row's position in the CSV file.
        row_with_largest_average = performances['average'].idxmax()

        # Look up the feature list recorded at that same row position.
        # Assumes features_kept.csv is row-aligned with test_performances.csv
        # -- TODO confirm.
        features_kept = pd.read_csv(performances_folder + '/features_kept.csv')
        best_feature_combination = features_kept.iloc[row_with_largest_average]['features remaining']
        # The cell stores a Python literal as text; parse it back into an object.
        all_features_to_keep[label][model_type] = ast.literal_eval(best_feature_combination)
                

In [3]:
# Run the Bayesian hyperparameter optimization for every model type / label
# pair, saving each run's results under
# <bayesian_opt_results_dir>/<label>/<model_type>.

# Optimization entry point for each supported model type.
bayesian_optimizers = {
    'GPR': do_bayesian_optimization_GPR,
    'RF': do_bayesian_optimization_RF,
    'poly2': do_bayesian_optimization_poly_reg,
}

for model_type in model_types:
    # Fail loudly on an unknown model type instead of silently skipping it
    # (which would leave `model` unset or stale from a previous iteration).
    if model_type not in bayesian_optimizers:
        raise ValueError(
            f'unsupported model type {model_type!r}; expected one of {sorted(bayesian_optimizers)}'
        )
    run_optimization = bayesian_optimizers[model_type]

    for label in labels_to_predict:
        print(f'\n $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A {model_type} PREDICTING {label} $$$$$$$$$$$$$$$$$$$$$$$$$$ \n')
        features_to_keep = all_features_to_keep[label][model_type]
        saving_dir = f'{bayesian_opt_results_dir}/{label}'
        model_saving_folder = f'{saving_dir}/{model_type}'
        # exist_ok=True replaces the separate exists() check and also creates
        # any missing parent folders.
        os.makedirs(model_saving_folder, exist_ok=True)

        # Copy only the label column rather than the whole dataframe.
        label_df = df[label].copy()
        # If there are defined features to keep, only keep those; the sentinel
        # 'ALL' means: use every column except the labels.
        if features_to_keep != 'ALL':
            feat_df = df[features_to_keep]
        else:
            feat_df = df.drop(all_labels, axis=1)

        model = run_optimization(feat_df, label_df, num_tries=300, saving_folder=model_saving_folder)


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A GPR PREDICTING height $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for GPR predicting height $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('kernel__k1__k1__constant_value', 0.1), ('kernel__k1__k2__length_scale', 38637.086097004925), ('kernel__k2__noise_level', 25.87232476146296)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: -20.118053605562473 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A GPR PREDICTING impact site x $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for GPR predicting impact site x $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('kernel__k1__k1__constant_value', 8.82703271057051), ('kernel__k1__k2__length_scale', 69247.5659140656), ('kernel__k2__noise_level', 0.28850993959734234)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: 0.7659773628434906 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A GPR PREDICTING impact site y $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for GPR predicting impact site y $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('kernel__k1__k1__constant_value', 6.1693404865157015), ('kernel__k1__k2__length_scale', 46252.28543006315), ('kernel__k2__noise_level', 0.1431424839871945)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: 0.5203014203578659 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A RF PREDICTING height $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for RF predicting height $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('max_depth', 2), ('max_features', 2), ('min_samples_leaf', 25), ('min_samples_split', 45), ('n_estimators', 9544)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: -21.900611096315572 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A RF PREDICTING impact site x $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for RF predicting impact site x $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('max_depth', 5), ('max_features', 3), ('min_samples_leaf', 10), ('min_samples_split', 25), ('n_estimators', 7909)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: 0.7373918199735107 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A RF PREDICTING impact site y $$$$$$$$$$$$$$$$$$$$$$$$$$ 






$$$$$$$$$$$$ Results for RF predicting impact site y $$$$$$$$$$$$
$$$$$$$$$$$$ Best parameters found: OrderedDict([('max_depth', 5), ('max_features', 3), ('min_samples_leaf', 10), ('min_samples_split', 25), ('n_estimators', 8482)]) $$$$$$$$$$$$
$$$$$$$$$$$$ Best average test score across 5-fold cv: 0.6494748623086379 $$$$$$$$$$$$


 $$$$$$$$$$$$$$$$$$$$$$$$$$ OPTIMIZING A poly2 PREDICTING height $$$$$$$$$$$$$$$$$$$$$$$$$$ 



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

KeyboardInterrupt: 