# Make figures and tables for publication

In [None]:
import pandas as pd
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

## Cohort Overview

In [None]:
data_dir = 'data'

# Raw inputs for the cohort overview table.
cohort = pd.read_csv(f'{data_dir}/cohort.csv')
S_train = pd.read_csv(f'{data_dir}/S_train.csv')
S_test = pd.read_csv(f'{data_dir}/S_test.csv')
selection_criteria = pd.read_csv('helper_files/selection_criteria.csv')

# Keep only the IDs that appear in the experiment's train/test labels.
y_train = pd.read_csv(f'{data_dir}/experiment/60_all/y_train.csv', index_col=0)
y_test = pd.read_csv(f'{data_dir}/experiment/60_all/y_test.csv', index_col=0)
cohort_ids = pd.concat([y_train, y_test]).index

in_experiment = cohort['ID'].isin(cohort_ids)
hfrs = selection_criteria[['PATIENT_ID', 'HFRS']]
cohort = (
    cohort[in_experiment]
    .merge(hfrs, left_on='ID', right_on='PATIENT_ID')
    .drop('PATIENT_ID', axis=1)
)

# Rescale END_DSB by 365.25 (presumably days -> years; confirm the unit of the raw column).
cohort['END_DSB'] = cohort['END_DSB'] / 365.25
S = pd.concat([S_train, S_test])
df = S.merge(cohort, on='ID')

# Every remaining column is summarised as a count and a percentage per (train, CASE) group.
excluded = ['ID', 'train', 'CASE', 'END_DSB', 'HFRS']
var = [col for col in df.columns if col not in excluded]
grouped = df.groupby(['train', 'CASE'])[var]
n = pd.DataFrame(grouped.sum())
p = pd.DataFrame(grouped.sum() / grouped.count())
p.columns = [f'{name} (%)' for name in p.columns]
p = p * 100

# One row per variable, one column per (train, CASE) group, cells formatted as "count (pct%)".
counts_txt = n.astype(int).astype(str)
t_table = []
for v in var:
    pct_txt = p[f'{v} (%)'].round(decimals=1).astype(str)
    t = pd.DataFrame(counts_txt[v] + ' (' + pct_txt + '%)').T
    t.index = [v]
    t_table.append(t)

t_table = pd.concat(t_table)
t_table.to_clipboard()
display(t_table)

In [None]:
# Mean +/- SD of END_DSB for each (train, CASE) group, formatted as text.
end_dsb_by_group = cohort.groupby(['train','CASE'])['END_DSB']
end_dsb_by_group.mean().round(1).astype(str) + '+/-' + end_dsb_by_group.std().round(1).astype(str)

In [None]:
# Mean +/- SD of HFRS for each (train, CASE) group, formatted as text.
hfrs_by_group = cohort.groupby(['train','CASE'])['HFRS']
hfrs_by_group.mean().round(1).astype(str) + '+/-' + hfrs_by_group.std().round(1).astype(str)

In [None]:
# Count of each CASE value within each `train` group.
cohort.groupby('train')['CASE'].value_counts()

## Optimal Rank

In [None]:
# Rank-evaluation results; `prep` below expects 'set', 'rank', 'f1', 'auc' and 'auprc' columns.
df = pd.read_csv('data/lv-pick-rank-hals/rf_rank_eval.csv')

def prep(df):
    """Return a copy of the rank-evaluation frame with publication-ready labels.

    Within the 'set' column, 'train' becomes 'Train' and 'test' becomes
    'Validation' (literal substring replacement). Columns are then renamed:
    set -> Dataset, rank -> Rank, f1 -> F1, auc -> AUROC, auprc -> AUPRC.

    The input frame is left untouched, so re-running the calling cell (or
    reusing the raw frame elsewhere) gives the same result.
    """
    # Work on a copy: assigning to df['set'] would otherwise modify the caller's frame.
    df = df.copy()
    # regex=False pins literal matching regardless of the pandas version's default.
    df['set'] = (
        df['set']
        .str.replace('train', 'Train', regex=False)
        .str.replace('test', 'Validation', regex=False)
    )
    return df.rename(columns={
        'set': 'Dataset',
        'rank': 'Rank',
        'f1': 'F1',
        'auc': 'AUROC',
        'auprc': 'AUPRC',
    })

# Relabel the dataset values and metric columns for plotting.
df = prep(df)

In [None]:
# F1 as a function of rank, one line per dataset; the legend is removed after layout.
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.5)
g = sns.lineplot(
    data=df,
    x='Rank',
    y='F1',
    hue='Dataset',
)
g.figure.tight_layout()
g.get_legend().remove()

## Phenotypes

In [None]:
def _tidy_time_factors(wide):
    """Reshape a wide time-factor frame (rows: time index, columns: phenotype ids) to long form.

    The row index becomes a `time` column equal to -index / 2 (presumably half-year
    steps expressed as years before onset -- confirm against the factorisation code),
    and the column labels become an integer `phenotype` column.
    """
    tidy = wide.reset_index().rename(columns={'index': 'time'})
    tidy['time'] = -tidy['time'] / 2
    tidy = tidy.melt(id_vars='time', var_name='phenotype')
    tidy['phenotype'] = tidy['phenotype'].astype(int)
    return tidy


dxrx_phenotypes = pd.read_csv('data/phenotypes_50_dxrx_HALS-exact/phenotypes.csv')
dxrx_time = pd.read_csv('data/phenotypes_50_dxrx_HALS-exact/time_factors.csv', index_col=0)
lv_phenotypes = pd.read_csv('data/phenotypes_30_lv_HALS-exact/phenotypes.csv')
lv_time = pd.read_csv('data/phenotypes_30_lv_HALS-exact/time_factors.csv', index_col=0)

dxrx_time = _tidy_time_factors(dxrx_time)
lv_time = _tidy_time_factors(lv_time)

#### Dx/Rx phenotype

In [None]:
# Temporal profile and ten highest-weighted features of one Dx/Rx phenotype.
n = 7
sns.set_theme(style="white")
sns.set_context("notebook", font_scale=1.5)

selected_time = dxrx_time[dxrx_time['phenotype'] == n]
g = sns.lineplot(data=selected_time, x='time', y='value', color='black')
g.figure.tight_layout()
g.set_ylabel('')
g.set_xlabel('Years before AMI onset')

selected_features = dxrx_phenotypes[dxrx_phenotypes['factor'] == n]
top_features = selected_features.sort_values('weight', ascending=False)[['feature','weight']]
top_features.head(10)

#### LV phenotype

In [None]:
# Temporal profile and ten highest-weighted features of one lab/vital phenotype.
n = 19
sns.set_theme(style="white")
sns.set_context("notebook", font_scale=1.5)

selected_time = lv_time[lv_time['phenotype'] == n]
g = sns.lineplot(data=selected_time, x='time', y='value', color='black')
g.figure.tight_layout()
g.set_ylabel('')
g.set_xlabel('Years before AMI onset')

selected_features = lv_phenotypes[lv_phenotypes['factor'] == n]
top_features = selected_features.sort_values('weight', ascending=False)[['feature','weight']]
top_features.head(10)

## CV results

In [None]:
cv = pd.read_csv('tgfnn/cv_output/experiment_cv_results.csv')

# Display names for the model abbreviations; abbreviations not listed are kept as-is.
model_labels = {
    'GFN': 'TGFNN',
}

# Display names for the feature-set identifiers.
feature_set_labels = {
    '30_phenotypes' : 'Phenotypes',
    '30_aggregate' : 'Summary statistics',
    '20_latest+demo' : 'Latest, demographics',
    '30_latest+demo+phenotypes' : 'Latest, demo., phenotypes',
    '20_latest+demo+aggregate' : 'Latest, demo., statistics',
    '60_all' : 'All',
}

# Publication-style column headers.
column_labels = {
    'model' : 'Model',
    'dataset' : 'Feature Set',
    'roc_auc' : 'AUROC',
    'auprc' : 'AUPRC',
    'f1' : 'F1',
    'precision' : 'Precision',
    'recall' : 'Recall',
}

cv['model'] = cv['model'].replace(model_labels)
cv['dataset'] = cv['dataset'].replace(feature_set_labels)
cv = cv.rename(columns=column_labels)

Feature set performance across all models

In [None]:
# Mean ± SD of every numeric column over the test rows, per feature set, ordered by mean F1.
# numeric_only=True is required because the frame still carries string columns
# ('Model', 'set'); pandas >= 2.0 raises TypeError instead of silently dropping them.
test_by_feature_set = cv[cv['set'] == 'test'].groupby('Feature Set')
means = test_by_feature_set.mean(numeric_only=True).sort_values(by='F1', ascending=False).round(3).astype(str)
stdvs = test_by_feature_set.std(numeric_only=True).round(3).astype(str).reindex(means.index)
(means + '±' + stdvs)

Model performance across all feature sets

In [None]:
# Mean ± SD of every numeric column over the test rows, per model, ordered by mean F1.
# numeric_only=True is required because the frame still carries string columns
# ('Feature Set', 'set'); pandas >= 2.0 raises TypeError instead of silently dropping them.
test_by_model = cv[cv['set'] == 'test'].groupby('Model')
means = test_by_model.mean(numeric_only=True).sort_values(by='F1', ascending=False).round(3).astype(str)
stdvs = test_by_model.std(numeric_only=True).round(3).astype(str).reindex(means.index)
(means + '±' + stdvs)

Mean of all metrics

In [None]:
# Stack the five metrics into one 'value' column, then average them per
# (Model, Feature Set) on the test rows.
cv2 = cv.melt(id_vars=['Model', 'Feature Set','set'], value_vars=['AUROC', 'AUPRC', 'F1', 'Precision', 'Recall'])
# numeric_only=True skips the string 'variable' column produced by melt, which
# would make .mean() raise TypeError on pandas >= 2.0.
(
    cv2[cv2['set'] == 'test']
    .groupby(['Model', 'Feature Set','set'])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values('value', ascending=False)
)

Looking at the best test-set run of each model (across all feature sets), according to AUROC

In [None]:
# Copy to the clipboard the test row with the highest AUROC for each model.
test_cv = cv[cv['set'] == 'test']
best_rows = test_cv.groupby(['Model'])['AUROC'].idxmax().values
test_cv.loc[best_rows, :].to_clipboard(index=False)

Full table of metrics

In [None]:
# Mean ± SD per (set, Feature Set, Model), formatted as text and copied to the clipboard.
by_combination = cv.groupby(['set','Feature Set','Model'])
mean_txt = by_combination.mean().round(3).astype(str)
std_txt = by_combination.std().round(3).astype(str)
(mean_txt + '±' + std_txt).to_clipboard()

### Plots

#### test set

In [None]:
# Test-set AUROC per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(rc={'figure.figsize':(7,4)}, style='whitegrid')
test_rows = cv[cv['set']=='test']
g = sns.barplot(data=test_rows, x='Model', y='AUROC', hue='Feature Set', errorbar='sd', palette='colorblind')
g.set_ylim(bottom=0.5, top=1)
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

In [None]:
# Test-set F1 per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(rc={'figure.figsize':(7,4)}, style='whitegrid')
test_rows = cv[cv['set']=='test']
g = sns.barplot(data=test_rows, x='Model', y='F1', hue='Feature Set', errorbar='sd', palette='colorblind')
g.set_ylim(bottom=0, top=1)
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

In [None]:
# Test-set AUPRC per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(rc={'figure.figsize':(7,4)}, style='whitegrid')
test_rows = cv[cv['set']=='test']
g = sns.barplot(data=test_rows, x='Model', y='AUPRC', hue='Feature Set', errorbar='sd', palette='colorblind')
g.set_ylim(bottom=0.3, top=1)
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

In [None]:
# Test-set precision per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(rc={'figure.figsize':(7,4)}, style='whitegrid')
test_rows = cv[cv['set']=='test']
g = sns.barplot(data=test_rows, x='Model', y='Precision', hue='Feature Set', errorbar='sd', palette='colorblind')
g.set_ylim(bottom=0, top=1)
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

In [None]:
# Test-set recall per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(rc={'figure.figsize':(7,4)}, style='whitegrid')
test_rows = cv[cv['set']=='test']
g = sns.barplot(data=test_rows, x='Model', y='Recall', hue='Feature Set', errorbar='sd', palette='colorblind')
g.set_ylim(bottom=0, top=1)
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

#### Valid set

In [None]:
# F1 on the 'valid' rows per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
valid_rows = cv[cv['set']=='valid']
g = sns.barplot(data=valid_rows, x='Model', y='F1', hue='Feature Set', errorbar='sd', palette='colorblind')
# Place the legend below the axes.
g.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=2)
plt.show()

#### Train set

In [None]:
# Training-set AUROC per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
train_rows = cv[cv['set']=='train']
g = sns.barplot(data=train_rows, x='Model', y='AUROC', hue='Feature Set', errorbar='sd', palette='colorblind')
# Anchor the legend just outside the right edge of the axes.
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Training-set F1 per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
train_rows = cv[cv['set']=='train']
g = sns.barplot(data=train_rows, x='Model', y='F1', hue='Feature Set', errorbar='sd', palette='colorblind')
# Anchor the legend just outside the right edge of the axes.
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Training-set AUPRC per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
train_rows = cv[cv['set']=='train']
g = sns.barplot(data=train_rows, x='Model', y='AUPRC', hue='Feature Set', errorbar='sd', palette='colorblind')
# Anchor the legend just outside the right edge of the axes.
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Training-set precision per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
train_rows = cv[cv['set']=='train']
g = sns.barplot(data=train_rows, x='Model', y='Precision', hue='Feature Set', errorbar='sd', palette='colorblind')
# Anchor the legend just outside the right edge of the axes.
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Training-set recall per model, coloured by feature set (error bars: standard deviation).
sns.set_theme(style='whitegrid')
train_rows = cv[cv['set']=='train']
g = sns.barplot(data=train_rows, x='Model', y='Recall', hue='Feature Set', errorbar='sd', palette='colorblind')
# Anchor the legend just outside the right edge of the axes.
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
plt.show()

## Model Interpretation

In [None]:
import pickle
import shap
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import kendalltau

In [None]:
# Experiment / feature-set selection used by every interpretation cell below.
mdl_dir = 'cv_output'
exp_name = 'experiment'
feature_set = '60_all'

# Feature matrices and test labels for that feature set (first CSV column is the index).
split_dir = f'data/{exp_name}/{feature_set}'
X_train = pd.read_csv(f'{split_dir}/X_train.csv', index_col=0)
X_test = pd.read_csv(f'{split_dir}/X_test.csv', index_col=0)
y_test = pd.read_csv(f'{split_dir}/y_test.csv', index_col=0)

### Feature importance across all models

In [None]:
def _load_cv_models(model_name):
    """Load the pickled list of fitted models saved for `model_name`.

    NOTE: pickle can execute arbitrary code on load; only open files produced
    by this project's own training runs.
    """
    path = f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_{model_name}_cv_models.pkl'
    # The context manager closes the file handle (the bare open() never did).
    with open(path, 'rb') as fh:
        return pickle.load(fh)


lr_models = _load_cv_models('LR')
rf_models = _load_cv_models('RF')
xgb_models = _load_cv_models('XGB')
tnet_models = _load_cv_models('TNET')
ebm_models = _load_cv_models('EBM')

model_lists = {
    'LR' : lr_models,
    'RF' : rf_models,
    'XGB' : xgb_models,
    'TNET' : tnet_models,
    'EBM' : ebm_models
}

# compute feature importance: one row per fitted model, one column per feature
fi_list = []
for name, models in model_lists.items():

    fi = []

    for model in models:
        if name == 'LR':
            # absolute value of the coefficients
            fi.append(np.abs(model.coef_[0]))
        elif name == 'EBM':
            # Keep only the first n_features global scores (presumably the main
            # effects, listed before interaction terms -- confirm for this EBM version).
            scores = np.array(model.explain_global().data()['scores'])[:X_test.shape[1]]
            fi.append(scores)
        else:
            fi.append(model.feature_importances_)

    fi = pd.DataFrame(fi, columns=X_train.columns)
    fi['model'] = name
    fi_list.append(fi)

# combine feature importance and scale 0-1
# (after the transpose each fitted model is a column, so scaling is per model)
scaler = MinMaxScaler()
all_fi = pd.concat(fi_list).set_index('model').T
all_fi = pd.DataFrame(scaler.fit_transform(all_fi), index=all_fi.index, columns=all_fi.columns)
all_fi = all_fi.T.reset_index()

# add replicate index to model name (LR_0, LR_1, ...); derived from the data so it
# stays correct for any number of model types or fitted models per type
replicate = all_fi.groupby('model').cumcount()
all_fi['model'] = all_fi['model'] + '_' + replicate.astype(str)

In [None]:
# Scaled importance against feature rank, where rank 0 is the feature with the
# highest mean importance across all fitted models.
long_fi = all_fi.melt(id_vars='model')
means = long_fi.groupby('variable')['value'].mean().sort_values(ascending=False)
variable_to_id = dict(zip(means.index, range(len(means.index))))
df_sorted = long_fi.sort_values(by='variable', key=lambda names: means[names], ascending=False)
df_sorted['variable_id'] = df_sorted['variable'].map(variable_to_id)
g = sns.lineplot(data=df_sorted, x='variable_id', y='value', color='black')
g.set(xlabel='Rank', ylabel='Mean importance')

In [None]:
# Mean ± SD of the scaled importance of each feature across all fitted models (top 10).
# The string 'model' column is moved to the index first: averaging over it raises
# TypeError on pandas >= 2.0.
fi_numeric = all_fi.set_index('model')
means = fi_numeric.mean().sort_values(ascending=False).round(3).astype(str)
stdvs = fi_numeric.std().round(3).astype(str).reindex(means.index)
(means + '±' + stdvs).head(10)

In [None]:
# Pairwise Kendall rank correlation between the feature-importance vectors of all
# fitted models. Each entry of `kt` is [model_a, model_b, tau]; the diagonal is set
# to 0 rather than 1 (presumably so it does not dominate the heatmap colour scale).
#
# kendalltau is applied to the importance values themselves. Both rows share the same
# column (feature) order, so values are paired per feature and tau is the rank
# correlation of the two importance rankings. Passing the importance-sorted lists of
# feature *names* instead would make scipy compare the names alphabetically: two
# exactly reversed rankings of three features could then score +1/3 instead of -1.
fi_by_model = all_fi.set_index('model')  # hoisted: does not change inside the loops
n_models = fi_by_model.shape[0]

kt = []
for i in range(n_models):
    x = fi_by_model.iloc[i, :]
    for j in range(n_models):
        y = fi_by_model.iloc[j, :]
        if i == j:
            kt.append([x.name, y.name, 0])
            continue
        corr, _ = kendalltau(x.to_numpy(dtype=float), y.to_numpy(dtype=float))
        kt.append([x.name, y.name, corr])

In [None]:
# Heatmap of the pairwise Kendall tau values (rows/columns: fitted models).
sns.set_theme(font_scale=0.85, style='white')
tau_matrix = pd.DataFrame(kt).pivot_table(index=0, columns=1, values=2)
g = sns.heatmap(tau_matrix, square=True, cmap='coolwarm', center=0, annot=False)
g.set_xlabel(None)
g.set_ylabel(None)
plt.show()

### Logistic Regression coefficients

In [None]:
# Load the pickled list of fitted logistic-regression models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_LR_cv_models.pkl', 'rb') as fh:
    lr_models = pickle.load(fh)

In [None]:
# Signed coefficients of every fitted logistic-regression model (rows: models, columns: features).
fi = pd.DataFrame([lr.coef_[0] for lr in lr_models], columns=X_train.columns)

# Mean ± SD per feature, ordered by absolute mean coefficient.
coef_by_feature = fi.melt().groupby('variable')['value']
mean_txt = coef_by_feature.mean().round(3).astype(str)
std_txt = coef_by_feature.std().round(3).astype(str)
fi_table = mean_txt + '±' + std_txt
sorted_f = coef_by_feature.mean().abs().sort_values(ascending=False).index

fi_table = fi_table[sorted_f]
fi_table.head(20)

In [None]:
# Coefficient value against feature index, with features ordered by mean value (descending).
long_fi = fi.reset_index().melt(id_vars='index')
means = long_fi.groupby('variable')['value'].mean().sort_values(ascending=False)
variable_to_id = dict(zip(means.index, range(len(means.index))))
df_sorted = long_fi.sort_values(by='variable', key=lambda names: means[names], ascending=False)
df_sorted['variable_id'] = df_sorted['variable'].map(variable_to_id)
g = sns.lineplot(data=df_sorted, x='variable_id', y='value')
g.set(xlabel='Index', ylabel='Mean importance')

In [None]:
# Human-readable labels for the features that get an explicit display name.
exact_labels = {
    'max_BP_SYS': 'Max systolic blood pressure',
    'mean_BP_DIA': 'Mean diastolic blood pressure',
    'FAMILY_CARDIAC_HX': 'Family history of cardiac diseases',
    'latest_SMOKING_STATUS_Never': 'Latest smoking status: Never',
    'mean_BMI': 'Mean body mass index',
    'min_BMI': 'Min body mass index',
    'max_Mean Corpuscular Hgb': 'Max mean corpuscular hemoglobin',
    'mean_TEMP': 'Mean temperature',
    'mean_Glucose': 'Mean glucose',
    'min_Creatinine': 'Min creatinine',
}

# Phenotype columns are recognised by pattern first; anything else falls back to
# the explicit table above, or keeps its raw name.
new_cols = []
for col in X_test.columns.tolist():
    if col.isdigit():
        label = f'Dx/Rx phenotype {col}'
    elif '_lv' in col:
        label = f'Lab/vital phenotype {col.split("_")[0]}'
    elif '_dxrx' in col:
        label = f'Dx/Rx phenotype {col.split("_")[0]}'
    else:
        label = exact_labels.get(col, col)
    new_cols.append(label)


# Average the SHAP values on X_test over all fitted logistic-regression models.
shap_values = None

for lr in lr_models:
    explainer = shap.Explainer(lr, X_train, feature_names=X_train.columns)
    fold_values = explainer.shap_values(X_test)
    if shap_values is None:
        shap_values = fold_values
    else:
        shap_values += fold_values

shap_values /= len(lr_models)

shap.summary_plot(shap_values, X_test, max_display=10, feature_names=new_cols, show=False)

### Random forest feature importance

In [None]:
# Load the pickled list of fitted random-forest models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_RF_cv_models.pkl', 'rb') as fh:
    rf_models = pickle.load(fh)

In [None]:
def _positive_class_shap(values):
    """Return the (n_samples, n_features) SHAP matrix for class index 1.

    Older shap versions return a list with one array per class; newer versions
    return a single (n_samples, n_features, n_classes) array, where a plain `[1]`
    would pick the second *sample* instead of the second class.
    """
    if isinstance(values, list):
        return values[1]
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, 1]
    # Already a single 2-D matrix: nothing to select.
    return values


# Human-readable labels: phenotype columns are recognised by pattern, a few named
# features get an explicit label, everything else keeps its raw name.
new_cols = []
for col in X_test.columns.tolist():
    if col.isdigit():
        col = f'Dx/Rx phenotype {col}'
    elif '_lv' in col:
        col = f'Lab/vital phenotype {col.split("_")[0]}'
    elif '_dxrx' in col:
        col = f'Dx/Rx phenotype {col.split("_")[0]}'
    elif col == 'max_BP_SYS':
        col = 'Max systolic blood pressure'
    elif col == 'FAMILY_CARDIAC_HX':
        col = 'Family history of cardiac diseases'
    elif col == 'latest_SMOKING_STATUS_Never':
        col = 'Latest smoking status: Never'

    new_cols.append(col)


# Average the class-1 SHAP values on X_test over all fitted random forests.
shap_values = None

for rf in rf_models:
    explainer = shap.Explainer(rf)
    fold_values = _positive_class_shap(explainer.shap_values(X_test))
    if shap_values is None:
        # copy so the in-place accumulation below never writes into shap's own array
        shap_values = np.array(fold_values, dtype=float)
    else:
        shap_values += fold_values

shap_values /= len(rf_models)

shap.summary_plot(shap_values, X_test, max_display=10, feature_names=new_cols, show=False)

In [None]:
# Impurity-based importances of every fitted random forest (rows: models, columns: features).
fi = []
for rf in rf_models:
    fi.append(rf.feature_importances_)

fi = pd.DataFrame(fi, columns=X_train.columns)

# Mean and SD per feature, joined with a LaTeX plus-minus sign and ordered by mean importance.
# The separator is a raw string: '\p' is not a valid escape sequence, so the plain
# literal triggers an invalid-escape warning on recent Python. The value is unchanged.
importance_by_feature = fi.melt().groupby('variable')['value']
fi_table = importance_by_feature.mean().round(3).astype(str) + r'$\pm$' + importance_by_feature.std().round(3).astype(str)
sorted_f = importance_by_feature.mean().sort_values(ascending=False).index

fi_table = fi_table[sorted_f]
print(fi_table.head(10))

In [None]:
# Importance against feature index, with features ordered by mean importance (descending).
long_fi = fi.reset_index().melt(id_vars='index')
means = long_fi.groupby('variable')['value'].mean().sort_values(ascending=False)
variable_to_id = dict(zip(means.index, range(len(means.index))))
df_sorted = long_fi.sort_values(by='variable', key=lambda names: means[names], ascending=False)
df_sorted['variable_id'] = df_sorted['variable'].map(variable_to_id)
g = sns.lineplot(data=df_sorted, x='variable_id', y='value')
g.set(xlabel='Index', ylabel='Mean importance')

### Decision tree

In [None]:
# Load the pickled list of fitted decision-tree models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_DT_cv_models.pkl', 'rb') as fh:
    dt_models = pickle.load(fh)

In [None]:
from sklearn import tree
import graphviz

In [None]:
# Export the first fitted decision tree as Graphviz DOT source and render it to a file named "DT".
first_tree = dt_models[0]
dot_data = tree.export_graphviz(first_tree, out_file=None, feature_names=X_train.columns, filled=True)
graph = graphviz.Source(dot_data)
graph.render("DT")

### XGBoost

In [None]:
# Load the pickled list of fitted XGBoost models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_XGB_cv_models.pkl', 'rb') as fh:
    xgb_models = pickle.load(fh)

In [None]:
# Feature importances of every fitted XGBoost model (rows: models, columns: features).
fi = []
for xgb in xgb_models:
    fi.append(xgb.feature_importances_)

fi = pd.DataFrame(fi, columns=X_train.columns)

# Mean and SD per feature, joined with a LaTeX plus-minus sign and ordered by mean importance.
# The separator is a raw string: '\p' is not a valid escape sequence, so the plain
# literal triggers an invalid-escape warning on recent Python. The value is unchanged.
importance_by_feature = fi.melt().groupby('variable')['value']
fi_table = importance_by_feature.mean().round(3).astype(str) + r'$\pm$' + importance_by_feature.std().round(3).astype(str)
sorted_f = importance_by_feature.mean().sort_values(ascending=False).index

fi_table = fi_table[sorted_f]
fi_table.head(10)

In [None]:
# Importance against feature index, with features ordered by mean importance (descending).
long_fi = fi.reset_index().melt(id_vars='index')
means = long_fi.groupby('variable')['value'].mean().sort_values(ascending=False)
variable_to_id = dict(zip(means.index, range(len(means.index))))
df_sorted = long_fi.sort_values(by='variable', key=lambda names: means[names], ascending=False)
df_sorted['variable_id'] = df_sorted['variable'].map(variable_to_id)
g = sns.lineplot(data=df_sorted, x='variable_id', y='value')
g.set(xlabel='Index', ylabel='Mean importance')

In [None]:
# Human-readable labels for the features that get an explicit display name.
exact_labels = {
    'max_BP_SYS': 'Max systolic blood pressure',
    'FAMILY_CARDIAC_HX': 'Family history of cardiac diseases',
    'latest_SMOKING_STATUS_Never': 'Latest smoking status: Never',
    'mean_BMI': 'Mean body mass index',
    'max_Mean Corpuscular Hgb': 'Max mean corpuscular hemoglobin',
    'mean_TEMP': 'Mean temperature',
}

# Phenotype columns are recognised by pattern first; anything else falls back to
# the explicit table above, or keeps its raw name.
new_cols = []
for col in X_test.columns.tolist():
    if col.isdigit():
        label = f'Dx/Rx phenotype {col}'
    elif '_lv' in col:
        label = f'Lab/vital phenotype {col.split("_")[0]}'
    elif '_dxrx' in col:
        label = f'Dx/Rx phenotype {col.split("_")[0]}'
    else:
        label = exact_labels.get(col, col)
    new_cols.append(label)


# Average the SHAP values on X_test over all fitted XGBoost models.
shap_values = None

for xgb in xgb_models:
    explainer = shap.Explainer(xgb)
    fold_values = explainer.shap_values(X_test)
    if shap_values is None:
        shap_values = fold_values
    else:
        shap_values += fold_values

shap_values /= len(xgb_models)

shap.summary_plot(shap_values, X_test, max_display=10, feature_names=new_cols, show=False)

### TabNet

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Load the pickled list of fitted TabNet models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_TNET_cv_models.pkl', 'rb') as fh:
    tnet_models = pickle.load(fh)

In [None]:
# Inspect the importances of one fitted TabNet model. Index the loaded list explicitly:
# the bare name `tnet` is only bound by a loop in a later cell, so it is undefined
# here on a fresh kernel (Restart & Run All).
tnet_models[0].feature_importances_

In [None]:
# Feature importances of every fitted TabNet model (rows: models, columns: features).
fi = pd.DataFrame([tnet.feature_importances_ for tnet in tnet_models], columns=X_train.columns)

# Mean ± SD per feature, ordered by absolute mean importance.
importance_by_feature = fi.melt().groupby('variable')['value']
mean_txt = importance_by_feature.mean().round(3).astype(str)
std_txt = importance_by_feature.std().round(3).astype(str)
fi_table = mean_txt + '±' + std_txt
sorted_f = importance_by_feature.mean().abs().sort_values(ascending=False).index

fi_table = fi_table[sorted_f]
fi_table.head(10)

### EBM

In [None]:
from interpret import show

In [None]:
# Load the pickled list of fitted EBM models.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
# The context manager closes the file handle, which the bare open() never did.
with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_EBM_cv_models.pkl', 'rb') as fh:
    ebm_models = pickle.load(fh)

In [None]:
# Inspect the global explanation of one fitted EBM. The model is taken explicitly from
# `ebm_models` instead of relying on a `model` variable left over from an earlier loop
# (which happened to end on the last EBM); explain_global() is now computed once.
ebm_example = ebm_models[-1]
data_dict = ebm_example.explain_global().data()
# Keep only the first n_features scores (presumably the main effects -- confirm).
scores = np.array(data_dict['scores'])[:len(X_test.columns.tolist())]

In [None]:
# Map EBM's positional feature names (feature_0001, feature_0002, ...) to the column names.
feature_dict = {f'feature_{i+1:04d}': col for i, col in enumerate(X_test.columns.tolist())} # map feature names

# One (variable, importance) frame per fitted EBM.
fi = []
for ebm in ebm_models:
    data_dict = ebm.explain_global().data()
    # Build the frame column-wise so `importance` keeps a float dtype; transposing a
    # [names, scores] list of lists stores both columns as object, which makes the
    # later mean/round calls dtype- and version-dependent.
    model_fi = pd.DataFrame({
        'variable': data_dict['names'],
        'importance': np.asarray(data_dict['scores'], dtype=float),
    })
    # regex=True replaces the placeholder wherever it occurs inside a term name
    model_fi['variable'] = model_fi['variable'].replace(feature_dict, regex=True)
    fi.append(model_fi)

# Inner-join on the term name so only terms present in every model are kept.
merged_fi = pd.DataFrame(fi[0])
for i in range(1, len(fi)):
    merged_fi = pd.merge(merged_fi, fi[i], on='variable', suffixes=('', f'_{i+1}'), how='inner')

# rows: models, columns: terms
fi = merged_fi.set_index('variable').T

# Mean ± SD per term, ordered by absolute mean importance.
importance_by_term = fi.melt().groupby('variable')['value']
fi_table = importance_by_term.mean().round(3).astype(str) + '±' + importance_by_term.std().round(3).astype(str)
sorted_f = importance_by_term.mean().abs().sort_values(ascending=False).index

fi_table = fi_table[sorted_f]
fi_table.head(10)

## Confusion Matrices

In [None]:
# Experiment / feature-set selection for the confusion matrices.
mdl_dir = 'cv_output'
exp_name = 'experiment'
feature_set = '60_all'

# Feature matrices and test labels for that feature set (first CSV column is the index).
X_train, X_test, y_test = (
    pd.read_csv(f'data/{exp_name}/{feature_set}/{split}.csv', index_col=0)
    for split in ('X_train', 'X_test', 'y_test')
)

In [None]:
# For each model type: predict X_test with every fitted model, normalise each confusion
# matrix by the number of test samples, and summarise the cells as "mean+/-std" text.
all_confmats = []

for model_name in ['DT','RF','LR','XGB','EBM','TNET','GFN']:
    # NOTE: pickle can execute arbitrary code on load; only open trusted files.
    # The context manager closes the file handle, which the bare open() never did.
    with open(f'tgfnn/{mdl_dir}/exp_{exp_name}_{feature_set}_{model_name}_cv_models.pkl', 'rb') as fh:
        models = pickle.load(fh)

    confmat = []
    for model in models:
        y_pred_test = model.predict(X_test.values)
        # fractions of all test samples rather than raw counts
        confmat.append(confusion_matrix(y_test, y_pred_test) / y_test.shape[0])

    # stack to (n_models, n_classes, n_classes) and reduce over the model axis
    confmat = np.array(confmat)
    confmat = pd.DataFrame(np.mean(confmat, axis=0)).round(3).astype(str) + '+/-' + pd.DataFrame(np.std(confmat, axis=0)).round(3).astype(str)
    all_confmats.append(confmat)

In [None]:
# Print the summarised confusion matrices in the order they were computed
# (the order of the model-name list used to build `all_confmats`).
for x in all_confmats:
    print(x)