### Load in our accuracies for FICO

In [None]:
import pandas as pd

# Read the 50 per-iteration baseline result files and stack them row-wise.
# (A previous run read ./parallelized_results/baselines_2024-01-28_iter_{s_iter}_MICE_imp_10_imp.csv instead.)
dfs = [
    pd.read_csv(f'./parallelized_results/baselines_2024-02-25_iter_{s_iter}_10_imp_all_50_max_coef.csv')
    for s_iter in range(50)
]
combined_acc_df = pd.concat(dfs, axis=0)

In [None]:
import pandas as pd

# Read the 50 per-iteration result files of the 2024-01-31 run and stack them
# row-wise. This overwrites any combined_acc_df built by an earlier cell.
dfs = [
    pd.read_csv(f'./parallelized_results/baselines_2024-01-31_iter_{s_iter}_10_imp_all.csv')
    for s_iter in range(50)
]
combined_acc_df = pd.concat(dfs, axis=0)


In [None]:
import pandas as pd

# Imputation method whose results are selected in the filtering cell below.
target_imp = 'MIWAE'

# Collect the per-iteration result files for each (dataset, imputation method)
# pair. Iterations without a usable result file are skipped; any other error
# (bad permissions, malformed CSV, ...) is deliberately allowed to surface
# instead of being silently swallowed.
dfs = []
for ds_name in ['PHARYNGITIS']:
    for imputation_method in ['MIWAE']:
        for s_iter in range(120):
            result_path = f'./parallelized_results/baselines_iter_{s_iter}_{ds_name}_{imputation_method}.csv'
            try:
                df = pd.read_csv(result_path)
            except (FileNotFoundError, pd.errors.EmptyDataError):
                # presumably this iteration never finished writing its results
                continue
            dfs.append(df)

# pd.concat raises ValueError if no result file could be read at all.
combined_acc_df = pd.concat(dfs, axis=0)

In [None]:
# Keep only the runs that used 10 imputations with the imputation method
# under study (target_imp).
mask = (combined_acc_df['num_imputations'] == 10) & \
        (combined_acc_df['missingness_handling'] == target_imp)
# .copy() makes cur_acc_df an independent frame, so the column assignments
# performed on it in later cells do not raise SettingWithCopyWarning and
# cannot be lost on a temporary view of combined_acc_df.
cur_acc_df = combined_acc_df[mask].copy()

In [None]:
def get_smim_tag(row):
    """Return the row's model type with a suffix recording its SMIM flag.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'use_smim' (evaluated for truthiness) and 'model_type' (a string).

    Returns:
        row['model_type'] followed by ' (SMIM)' when row['use_smim'] is
        truthy, and by ' (No SMIM)' otherwise.
    """
    suffix = ' (SMIM)' if row['use_smim'] else ' (No SMIM)'
    return row['model_type'] + suffix
# Rewrite each row's model_type as "<model_type> (SMIM)" / "<model_type> (No SMIM)"
# by applying get_smim_tag row-wise.
# NOTE: not idempotent - re-running this cell appends the tag a second time.
cur_acc_df['model_type'] = cur_acc_df.apply(get_smim_tag, axis=1)

In [None]:
# Average every numeric column within each
# (model, dataset, holdout set, metric, imputation method) group.
# numeric_only=True reproduces the legacy pandas default and keeps the cell
# working on pandas >= 2.0, where averaging a non-numeric column raises.
cur_acc_df = cur_acc_df.groupby(
    ['model_type', 'dataset', 'holdout_set', 'metric', 'missingness_handling']
).mean(numeric_only=True).reset_index()
# Number of aggregated rows per model type.
cur_acc_df['model_type'].value_counts()

In [None]:
# Because we have a distinct entry for each val set for our gams,
# we need to take the average of each value along val sets.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises instead of silently dropping it.
cur_acc_df = cur_acc_df.groupby(
    ['model_type', 'dataset', 'holdout_set', 'metric', 'missingness_handling']
).mean(numeric_only=True).reset_index()

# Keep the AUC rows only and drop the 'GAM_no_missing (SMIM)' model.
# (An earlier variant kept 'acc' for FICO and 'auc' for BREAST_CANCER.)
# .copy() detaches the result so the column assignments below act on a real
# frame rather than on a slice (avoids SettingWithCopyWarning).
cur_acc_df = cur_acc_df[
    (cur_acc_df['metric'] == 'auc')
    & (cur_acc_df['model_type'] != 'GAM_no_missing (SMIM)')
].copy()

# Human-readable labels for the plot legend; unlisted model types keep their
# raw name.
cur_acc_df['Model Type'] = cur_acc_df['model_type'].replace({
    'GAM_imputation (SMIM)': 'GAM (Imputation)',
    'GAM_ind (SMIM)': 'GAM (Indicators)',
    'GAM_aug (SMIM)': 'GAM (Interactions)',
})
cur_acc_df['dataset'] = cur_acc_df['dataset'].replace({'BREAST_CANCER': 'Breast Cancer'})

cur_acc_df = cur_acc_df.sort_values('Model Type')

In [None]:
# Display the current accuracy table for inspection.
cur_acc_df

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

# Box plot of the held-out test metric per dataset, one box per model type.
sns.set(font_scale=2.0)
figure(figsize=(6, 8), dpi=80)
ax = sns.boxplot(
    data=cur_acc_df, hue='Model Type', y='metric_value_test', x='dataset'
)
# Place the legend above the axes so it does not cover the boxes.
sns.move_legend(ax, "upper left", bbox_to_anchor=(0.0, 2.0), ncol=2)
plt.xlabel('')
# cur_acc_df was filtered to metric == 'auc' earlier in the notebook, so the
# plotted values are AUCs (the label previously read 'Test ACC').
plt.ylabel('Test AUC')
plt.xticks(rotation=0, ha='center')
plt.show()

### Considering alternative imputation strategies

Load in our timing data for each imputation method (over 10 imputations)

In [None]:
import pandas as pd

# Timing statistics produced for specific imputation methods.
df_list = [
    pd.read_csv('../../handling_missing_data/timing_stats_10_val_2024-01-28_all_BREAST_CANCER_GAIN.csv'),
    pd.read_csv('../../handling_missing_data/timing_stats_10_val_2024-01-28_all_FICO_GAIN.csv'),
    # MissForest timings for FICO are currently excluded:
    #pd.read_csv('../../handling_missing_data/timing_stats_10_val_2024-01-28_all_FICO_MissForest.csv'),
    pd.read_csv('../../handling_missing_data/timing_stats_10_val_2024-01-29_all_FICO_MIWAE.csv'),
]
# Per-dataset timing files of the 2024-01-27 run. The previous try/except
# re-read the exact same path in its handler, so it could never recover from a
# failure; the file is now read once and any error surfaces directly.
for dataset in ['BREAST_CANCER', 'FICO']:
    df_list.append(pd.read_csv(f'../../handling_missing_data/timing_stats_10_val_2024-01-27_all_{dataset}.csv'))
base_timing_df = pd.concat(df_list, axis=0)
# Row count per dataset, as a sanity check on what was loaded.
base_timing_df['dataset'].value_counts()

In [None]:
# Retain only the timing rows whose 'm' value is below 10
# (presumably the first ten imputations - TODO confirm that m is 0-based).
base_timing_df = base_timing_df.loc[base_timing_df['m'].lt(10)]
# Remaining row count per imputation method.
base_timing_df['imputation'].value_counts()

In [None]:
# Imputation cost: sum the timing rows within each validation set, then
# average those totals over the validation sets of a holdout set.
# numeric_only=True matches the legacy pandas default and avoids errors on
# non-numeric columns under pandas >= 2.0.
tmp_df = base_timing_df.groupby(['dataset', 'holdout_set', 'imputation', 'validation_set']).sum(numeric_only=True)
tmp_df = tmp_df.groupby(['dataset', 'holdout_set', 'imputation']).mean(numeric_only=True)

# For MICE, time_overall has the total time, and all the others are 0;
# The opposite holds for the other methods
tmp_df['time_overall'] = tmp_df['time_overall'] + tmp_df['time_for_test'] \
    + tmp_df['time_for_val'] + tmp_df['time_for_train'] + tmp_df['time_to_fit']
tmp_df = tmp_df.reset_index()

# Accuracy side: 10-imputation AUC results on BREAST_CANCER, excluding GAIN.
# (Alternative configuration: metric == 'acc' with dataset == 'FICO'.)
# NOTE(review): combined_acc_df is whatever the most recently executed loading
# cell produced - confirm it actually contains BREAST_CANCER rows.
mask = (combined_acc_df['num_imputations'] == 10)\
     & (combined_acc_df['missingness_handling'] != 'GAIN')\
     & (combined_acc_df['metric'] == 'auc') & (combined_acc_df['dataset'] == 'BREAST_CANCER')
acc_df = combined_acc_df[mask]
# acc_df still carries the string 'metric' column, hence numeric_only=True.
tmp_df_acc = acc_df.groupby(
    ['dataset', 'holdout_set', 'model_type', 'missingness_handling']
).mean(numeric_only=True).reset_index()
# Mirror the column name used by the timing table so the two can be joined.
tmp_df_acc['imputation'] = tmp_df_acc['missingness_handling']

merged_df = tmp_df_acc.merge(tmp_df, on=['holdout_set', 'imputation', 'dataset'], how='inner')

# Because we only want models that rely on imputations.
# .copy() detaches the filtered rows so the assignments below (and in later
# cells) do not raise SettingWithCopyWarning.
merged_df = merged_df[~merged_df['model_type'].isin(['GAM_ind', 'GAM_aug', 'GAM_no_missing'])].copy()
# Total runtime = imputation time + mean model fit time.
merged_df['time_overall'] = merged_df['time_overall'] + merged_df['mean_fit_time']

In [None]:
# Display the averaged accuracy table that was merged with the timing data.
tmp_df_acc

In [None]:
# Legend-friendly copies of the model and imputation columns; the raw
# 'GAM_imputation' name is shown as 'GAM (Imputation)'.
mask = merged_df['model_type'] == 'GAM_imputation'
merged_df['Model Type'] = merged_df['model_type'].mask(mask, 'GAM (Imputation)')
merged_df['Imputation'] = merged_df['imputation']

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

color_pal = sns.color_palette()

# Runtime (imputation + model fit) against test AUC: colour encodes the
# imputation method, marker style encodes the model type.
sns.set(font_scale=2.0)
g = sns.scatterplot(
    data=merged_df,
    x='time_overall',
    y='metric_value_test',
    hue='Imputation',
    style='Model Type'
)
g.set_xlabel('Runtime (Seconds)')
g.set_ylabel('Test AUC')
# Title the plot after the dataset(s) actually present in merged_df. It was
# previously hard-coded to 'FICO' although the accuracy mask that builds
# merged_df selects BREAST_CANCER.
dataset_labels = {'BREAST_CANCER': 'Breast Cancer'}
g.set_title(', '.join(dataset_labels.get(d, str(d)) for d in merged_df['dataset'].unique()))
# Legend sits to the right of the axes, outside the plotting area.
g.legend(loc='upper center', bbox_to_anchor=(1.5, 0.9), ncol=1)
plt.show()

### Load in data for different numbers of imputations

In [None]:
# List the columns available in the combined results frame.
combined_acc_df.columns

In [None]:
# Load in model fitting data
'''import pandas as pd
df_list = []
for n_imps in [1, 5, 10, 20, 30]:
    df_acc = pd.read_csv(f'full_baseline_results_many_clf_tmp_multi_2024-01-28_10_holdouts_FICO_{n_imps}_imp.csv')
    df_acc = df_acc[df_acc['metric'] == 'acc']
    
    df_list.append(df_acc)
df_acc = pd.concat(df_list, axis=0)
df_acc['model_type'].value_counts()'''

# Accuracy rows for MICE on FICO, restricted to the model types that are not
# one of GAM_aug / GAM_ind / GAM_no_missing.
df_acc = combined_acc_df[
    combined_acc_df['missingness_handling'].eq('MICE')
    & combined_acc_df['dataset'].eq('FICO')
    & combined_acc_df['metric'].eq('acc')
    & ~combined_acc_df['model_type'].isin(['GAM_aug', 'GAM_ind', 'GAM_no_missing'])
]

In [None]:
import pandas as pd

# Timing statistics of the 2024-01-27 run for FICO only. The previous
# try/except re-read the exact same path in its handler, so it could never
# recover from a failure; the file is now read once and errors surface.
df_list = []
for dataset in ['FICO']:
    df_list.append(pd.read_csv(f'../../handling_missing_data/timing_stats_10_val_2024-01-27_all_{dataset}.csv'))
base_timing_df = pd.concat(df_list, axis=0)
# Row count per dataset, as a sanity check on what was loaded.
base_timing_df['dataset'].value_counts()

In [None]:
# For each imputation count, pair the accuracy rows obtained with that many
# imputations with the cost of producing that many imputations.
imp_counts_to_consider = [1, 5, 10, 20, 30]
df_list = []
for n_imps in imp_counts_to_consider:
    # Filter in one step and let reset_index() return a fresh frame, so the
    # column assignment below does not act on a slice (SettingWithCopyWarning).
    keep = (df_acc['num_imputations'] == n_imps) & ~df_acc['model_type'].isin(['GAM_ind', 'GAM_aug'])
    cur_acc_df = df_acc[keep].reset_index()
    cur_acc_df['Model Type'] = cur_acc_df['model_type'].replace({'GAM_imputation': 'GAM (Imputation)'})

    # Timing rows with m < n_imps (presumably the first n_imps imputations -
    # TODO confirm m is 0-based): summed within each validation set, then
    # averaged over validation sets. numeric_only=True matches the legacy
    # pandas default and avoids errors on string columns under pandas >= 2.0.
    tmp_df = (
        base_timing_df[base_timing_df['m'] < n_imps]
        .groupby(['dataset', 'holdout_set', 'validation_set']).sum(numeric_only=True)
        .groupby(['dataset', 'holdout_set']).mean(numeric_only=True)
        .add_suffix('_agg')   # keep timing columns distinct from accuracy columns
        .reset_index()
    )
    tmp_df['num_imputations'] = n_imps

    tmp_df = tmp_df.merge(cur_acc_df, on=['num_imputations', 'holdout_set'], how='outer')

    df_list.append(tmp_df)
df_agg = pd.concat(df_list, axis=0)
# Total runtime = aggregated imputation time + mean model fit time.
df_agg['time_overall'] = df_agg['time_overall_agg'] + df_agg['mean_fit_time']

In [None]:
import seaborn as sns

sns.set(font_scale=1.1)
# Legend-friendly alias of the imputation-count column.
df_agg['Number of Imputations'] = df_agg['num_imputations']
# Runtime against test accuracy: colour encodes the number of imputations,
# marker style encodes the model type.
g = sns.scatterplot(
    x='time_overall',
    y='metric_value_test',
    hue='Number of Imputations',
    style='Model Type',
    data=df_agg,
)
g.set(xlabel='Runtime (Seconds)', ylabel='Test Accuracy')
g.legend(bbox_to_anchor=(1.3, 1.0), loc='upper center', ncol=1)

### Getting runtime for different subsets of a dataset

In [None]:
# Load in model fitting data
import pandas as pd

dataset_of_interest = 'FICO'
imputations_of_interest = ['Mean', 'MICE']

# Accuracy results: 120 iterations per imputation method, for the full dataset
# and its 0.25 subsample (the _0.5 and _0.75 subsamples are not loaded here).
dfs = [
    pd.read_csv(f'./parallelized_results/baselines_iter_{s_iter}_{ds_name}_{imputation_method}.csv')
    for ds_name in [f'{dataset_of_interest}', f'{dataset_of_interest}_0.25']
    for imputation_method in imputations_of_interest
    for s_iter in range(120)
]
combined_acc_df = pd.concat(dfs, axis=0)

# Timing results: one file per imputation method and subsample
# ('' denotes the full dataset).
df_list = [
    pd.read_csv(f'../../handling_missing_data/timing_stats_{ds_name}{subsample}_{imputation_method}_5_3.csv')
    for ds_name in [dataset_of_interest]
    for imputation_method in imputations_of_interest
    for subsample in ['', '_0.25', '_0.5', '_0.75']
]
base_timing_df = pd.concat(df_list, axis=0)

# Keep timing rows with m < 10 that belong to the imputations of interest.
base_timing_df = base_timing_df[
    base_timing_df['m'].lt(10) & base_timing_df['imputation'].isin(imputations_of_interest)
]

# Accuracy rows for 10 imputations; the metric is 'auc' for BREAST_CANCER and
# 'acc' for every other dataset.
mask = (combined_acc_df['num_imputations'] == 10)
acc_df = combined_acc_df[mask]
acc_df = acc_df[acc_df['metric'] == ('auc' if dataset_of_interest == 'BREAST_CANCER' else 'acc')]

In [None]:
# Display the filtered timing table for inspection.
base_timing_df

In [None]:
# Distinct dataset labels present in the timing table.
base_timing_df['dataset'].unique()

In [None]:
# Mean model fit time per (dataset, model type). Selecting the column before
# aggregating avoids averaging every other column and avoids the TypeError
# pandas >= 2.0 raises when a string column (e.g. 'metric') is averaged.
acc_df.groupby(['dataset', 'model_type'])['mean_fit_time'].mean()

In [None]:
# Collapse the timing rows to one total per
# (holdout set, dataset, validation set, imputation method); the grouping
# keys are kept as ordinary columns.
base_timing_df = base_timing_df.groupby(
    ['holdout_set', 'dataset', 'validation_set', 'imputation'],
    as_index=False,
).sum()

In [None]:
# Average the accuracy rows per (dataset, holdout set, model type).
# acc_df still has string columns (e.g. 'metric'), so numeric_only=True is
# required on pandas >= 2.0 and matches the legacy default on older versions.
# NOTE(review): this also averages across imputation methods, while the timing
# table below is kept per imputation method - confirm that is intended.
acc_df = acc_df.groupby(['dataset', 'holdout_set', 'model_type']).mean(numeric_only=True).reset_index()

merged_df = base_timing_df.merge(acc_df, how='inner', on=['dataset','holdout_set'])

# GAM_ind and GAM_aug are charged no imputation time; every other model pays
# the imputation time on top of its fit time.
mask = (merged_df['model_type'] != 'GAM_ind') & (merged_df['model_type'] != 'GAM_aug')
merged_df['impute_time'] = merged_df['time_overall'].where(mask, 0)
# Computed in one step rather than initialising an integer column with 0 and
# then writing floats into it, which triggers pandas' dtype-upcast warning.
merged_df['overall_time'] = merged_df['impute_time'] + merged_df['mean_fit_time']

In [None]:
# Drop the GAM_no_missing model. .copy() detaches the filtered rows so the
# in-place relabelling done on merged_df afterwards does not act on a slice
# (SettingWithCopyWarning).
merged_df = merged_df[merged_df['model_type'] != 'GAM_no_missing'].copy()

In [None]:
# Replace the raw GAM model names with the labels used in the figures; all
# other model types keep their name. (A disabled block that renamed the
# dataset column with outdated sample counts was removed: as the last
# expression of the cell it was being echoed as cell output.)
merged_df['model_type'] = merged_df['model_type'].replace({
    'GAM_aug': 'GAM (Interactions)',
    'GAM_ind': 'GAM (Indicators)',
    'GAM_imputation': 'GAM (Imputation)',
})

In [None]:
# Per-model averages of the numeric columns. merged_df also holds string
# columns (e.g. 'dataset', 'imputation'); numeric_only=True skips them, which
# pandas >= 2.0 no longer does implicitly (it raises TypeError instead).
merged_df.groupby('model_type').mean(numeric_only=True)

In [None]:
# Order rows by model type; the plotting cell derives its bar order from the
# order in which model types first appear. (A disabled dataset-reordering
# block was removed: as the last expression of the cell it was being echoed as
# cell output.)
merged_df = merged_df.sort_values('model_type')

In [None]:
color_pal = sns.color_palette()
sns.set(font_scale=1.3)

# Panel title for each value of the 'dataset' column, listed from the smallest
# subsample to the full dataset. Titles are looked up by dataset name rather
# than assigned by position, so every panel gets the sample count of the data
# it actually shows, even when only some subsamples are present.
# assumes subsamples are labelled '<DATASET>_<fraction>' - TODO confirm
if dataset_of_interest == 'FICO':
    facet_titles = {
        'FICO_0.25': 'FICO \n(2,615 Samples)',
        'FICO_0.5': 'FICO \n(5,230 Samples)',
        'FICO_0.75': 'FICO \n(7,844 Samples)',
        'FICO': 'FICO \n(10,459 Samples)',
    }
elif dataset_of_interest == 'BREAST_CANCER':
    facet_titles = {
        'BREAST_CANCER_0.25': 'Breast Cancer \n(439 Samples)',
        'BREAST_CANCER_0.5': 'Breast Cancer \n(878 Samples)',
        'BREAST_CANCER_0.75': 'Breast Cancer \n(1,317 Samples)',
        'BREAST_CANCER': 'Breast Cancer \n(1,756 Samples)',
    }
else:
    facet_titles = {}

# Known datasets first (smallest to largest), then anything unrecognised.
datasets_present = list(merged_df['dataset'].unique())
col_order = [d for d in facet_titles if d in datasets_present] \
    + [d for d in datasets_present if d not in facet_titles]
# Fix the bar and hue order explicitly so every panel draws the same
# categories in the same positions as the tick labels set below.
model_order = list(merged_df['model_type'].unique())
imputation_order = list(merged_df['imputation'].unique())

g = sns.FacetGrid(
    merged_df,
    col="dataset",
    col_order=col_order,
    col_wrap=2,
    height=4,
    sharey=True)
# Total time is drawn first; imputation time is drawn over it, so the part of
# each bar left visible corresponds to the model fit time.
g.map_dataframe(sns.barplot,
    x='model_type',
    y='overall_time',
    hue='imputation',
    order=model_order,
    hue_order=imputation_order,
    color=color_pal[1],
)
g.map_dataframe(sns.barplot,
    x='model_type',
    y='impute_time',
    hue='imputation',
    order=model_order,
    hue_order=imputation_order,
    color=color_pal[0],
)
g.set_xticklabels(model_order, rotation=45, ha='right')

# Embolden the tick labels of the two GAM variants of interest on the last two
# panels (the ones whose tick labels were styled originally).
emphasized = ('GAM (Interactions)', 'GAM (Indicators)')
for ax in g.axes.flat[-2:]:
    for tick_in, tick_label in enumerate(ax.xaxis.get_ticklabels()):
        if model_order[tick_in] in emphasized:
            tick_label.set_font({'weight': 'bold'})
        else:
            tick_label.set_color("black")

g.set_xlabels('')
g.set_ylabels('Time (Seconds)')

# Panels for datasets without a known title keep seaborn's default title.
for dataset_name, ax in g.axes_dict.items():
    if dataset_name in facet_titles:
        ax.set_title(facet_titles[dataset_name])

plt.tight_layout()
g.add_legend()
sns.move_legend(g, "upper left", bbox_to_anchor=(0.25, 1.05), ncol=2)
plt.show()