In [1]:
import pandas as pd
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import numpy as np
import seaborn as sns
import pingouin as pg

In [2]:
# Display names used in place of the raw identifiers stored in the CSV.
model_labels = {'GFN': 'TGFNN'}

feature_set_labels = {
    '30_phenotypes': 'Phenotypes',
    '30_aggregate': 'Summary statistics',
    '20_latest+demo': 'Latest, demographics',
    '30_latest+demo+phenotypes': 'Latest, demo., phenotypes',
    '20_latest+demo+aggregate': 'Latest, demo., statistics',
    '60_all': 'All',
}

column_labels = {
    'model': 'Model',
    'dataset': 'Feature Set',
    'roc_auc': 'AUROC',
    'auprc': 'AUPRC',
    'f1': 'F1',
    'precision': 'Precision',
    'recall': 'Recall',
}

# Load the cross-validation results, swap in the display names for the
# model and dataset values, then give the columns their presentation titles.
cv = (
    pd.read_csv('cv_results.csv')
    .assign(
        model=lambda frame: frame['model'].replace(model_labels),
        dataset=lambda frame: frame['dataset'].replace(feature_set_labels),
    )
    .rename(columns=column_labels)
)

Assess whether the phenotype, summary-statistic, and latest/demographic feature sets differ in performance to a statistically significant degree

In [132]:
# Friedman omnibus test plus Nemenyi post-hoc comparison of the feature sets,
# computed separately for every metric on the rows where set == 'test'.
x = cv[cv['set'] == 'test']

feature_sets = x['Feature Set'].unique()
metrics = ['AUROC', 'AUPRC', 'F1', 'Precision', 'Recall']

f_res = []  # one [metric, Friedman p-value] entry per metric
n_res = []  # one [metric, pairwise Nemenyi p-value DataFrame] entry per metric
means = []  # one [metric, feature set, mean score] entry per combination

for metric in metrics:
    # One list of scores per feature set, kept in the row order of `x`.
    # NOTE(review): both tests pair the k-th score of every feature set, so
    # this assumes the rows of each feature set appear in the same order
    # (e.g. same model/fold sequence) - confirm against cv_results.csv.
    samples = []
    for feature_set in feature_sets:
        scores = x.loc[x['Feature Set'] == feature_set, metric]
        samples.append(scores.values.tolist())
        means.append([metric, feature_set, scores.mean()])

    # Only the p-value of the omnibus test is kept.
    p = friedmanchisquare(*samples).pvalue
    f_res.append([metric, p])

    # Transposing puts one feature set per column before the post-hoc test;
    # the result is relabelled with the feature-set names on both axes.
    pairwise = sp.posthoc_nemenyi_friedman(np.array(samples).T)
    n_res.append([
        metric,
        pd.DataFrame(pairwise.values, columns=feature_sets, index=feature_sets),
    ])

f_df = pd.DataFrame(f_res, columns=['Metric', 'p-value'])

means = pd.DataFrame(means, columns=['Metric', 'Feature Set', 'Mean'])

def format(entry, label='Feature Set'):
    """Turn one square matrix of pairwise p-values into a long table.

    NOTE: the name shadows Python's builtin ``format``; it is kept because
    the following statements call it by this name.

    Parameters
    ----------
    entry : sequence
        ``entry[0]`` is the metric name and ``entry[1]`` is a square
        DataFrame of p-values whose index and columns carry the same labels.
    label : str, default 'Feature Set'
        Prefix for the two identifier columns of the result
        (``'<label> 1'`` for the row label, ``'<label> 2'`` for the column
        label). The default reproduces the original column names.

    Returns
    -------
    pandas.DataFrame
        Columns ``'<label> 1'``, ``'<label> 2'``, ``'p-value'`` and
        ``'Metric'``, with one row per pair taken from strictly above the
        diagonal and a fresh 0..n-1 index. Pairs whose p-value is NaN are
        dropped together with the masked cells.
    """
    metric, pvalues = entry[0], entry[1]
    first, second = f'{label} 1', f'{label} 2'

    # True only strictly above the diagonal, so each unordered pair is kept
    # once and the self-comparisons on the diagonal are discarded.
    upper = np.triu(np.ones(pvalues.shape, dtype=bool), k=1)

    return (
        pvalues.where(upper)
        # Name the index explicitly so reset_index() produces the expected
        # column even when the incoming matrix already has a named index.
        .rename_axis(index=first)
        .reset_index()
        .melt(id_vars=first, var_name=second, value_name='p-value')
        .dropna()
        .assign(Metric=metric)
        .reset_index(drop=True)
    )

# Stack the per-metric pairwise tables, then attach the mean score of each
# side of the pair by joining against `means` twice.
n_df = (
    pd.concat([format(entry) for entry in n_res])
    .merge(
        means,
        left_on=['Metric', 'Feature Set 1'],
        right_on=['Metric', 'Feature Set'],
    )
    .merge(
        means,
        left_on=['Metric', 'Feature Set 2'],
        right_on=['Metric', 'Feature Set'],
        # Both joins contribute 'Feature Set' and 'Mean'; the suffixes tell
        # the first-side and second-side copies apart.
        suffixes=('_1', '_2'),
    )
    # The join keys duplicate 'Feature Set 1' / 'Feature Set 2'.
    .drop(columns=['Feature Set_1', 'Feature Set_2'])
    .rename(columns={
        'Mean_1': 'Mean of Feature Set 1',
        'Mean_2': 'Mean of Feature Set 2',
    })
)

In [133]:
# Metrics whose Friedman test is significant at alpha = 0.05 after a
# Bonferroni correction. f_df holds one row per test performed, so its length
# is used as the divisor instead of a hard-coded 5; the threshold then stays
# correct if the list of metrics changes.
f_df[f_df['p-value'] < 0.05 / len(f_df)]

Unnamed: 0,Metric,p-value
0,AUROC,2.787031e-20
1,AUPRC,1.862851e-10
2,F1,3.681754e-07
3,Precision,0.0001536833
4,Recall,0.0005043132


In [134]:
# Feature-set pairs whose Nemenyi post-hoc p-value is below 0.05.
# NOTE(review): unlike the Friedman filter, this threshold is not divided by
# the number of metrics - confirm that is intended.
n_df.loc[n_df['p-value'] < 0.05]

Unnamed: 0,Feature Set 1,Feature Set 2,p-value,Metric,Mean of Feature Set 1,Mean of Feature Set 2
0,Phenotypes,Summary statistics,0.001,AUROC,0.624179,0.591041
1,Phenotypes,"Latest, demographics",0.001,AUROC,0.624179,0.598922
4,Summary statistics,"Latest, demo., phenotypes",0.001,AUROC,0.591041,0.617286
5,"Latest, demographics","Latest, demo., phenotypes",0.001,AUROC,0.598922,0.617286
6,Phenotypes,"Latest, demo., statistics",0.003685,AUROC,0.624179,0.605754
7,Summary statistics,"Latest, demo., statistics",0.009258,AUROC,0.591041,0.605754
9,"Latest, demo., phenotypes","Latest, demo., statistics",0.014276,AUROC,0.617286,0.605754
11,Summary statistics,All,0.001,AUROC,0.591041,0.63309
12,"Latest, demographics",All,0.001,AUROC,0.598922,0.63309
14,"Latest, demo., statistics",All,0.001,AUROC,0.605754,0.63309


Test whether any model performed significantly better than the others

In [136]:
# Friedman omnibus test plus Nemenyi post-hoc comparison of the models,
# computed separately for every metric on the rows where set == 'test'.
x = cv[cv['set'] == 'test']

models = x['Model'].unique()
metrics = ['AUROC', 'AUPRC', 'F1', 'Precision', 'Recall']

f_res = []  # one [metric, Friedman p-value] entry per metric
n_res = []  # one [metric, pairwise Nemenyi p-value DataFrame] entry per metric
means = []  # one [metric, model, mean score] entry per combination

for metric in metrics:
    # One list of scores per model, kept in the row order of `x`.
    # NOTE(review): both tests pair the k-th score of every model, so this
    # assumes the rows of each model appear in the same order (e.g. same
    # feature-set/fold sequence) - confirm against cv_results.csv.
    samples = []
    for model in models:
        scores = x.loc[x['Model'] == model, metric]
        samples.append(scores.values.tolist())
        means.append([metric, model, scores.mean()])

    # Only the p-value of the omnibus test is kept.
    p = friedmanchisquare(*samples).pvalue
    f_res.append([metric, p])

    # Transposing puts one model per column before the post-hoc test; the
    # result is relabelled with the model names on both axes.
    pairwise = sp.posthoc_nemenyi_friedman(np.array(samples).T)
    n_res.append([
        metric,
        pd.DataFrame(pairwise.values, columns=models, index=models),
    ])

f_df = pd.DataFrame(f_res, columns=['Metric', 'p-value'])
means = pd.DataFrame(means, columns=['Metric', 'Model', 'Mean'])


def format(entry, label='Model'):
    """Turn one square matrix of pairwise p-values into a long table.

    NOTE: the name shadows Python's builtin ``format``; it is kept because
    the following statements call it by this name.

    Parameters
    ----------
    entry : sequence
        ``entry[0]`` is the metric name and ``entry[1]`` is a square
        DataFrame of p-values whose index and columns carry the same labels.
    label : str, default 'Model'
        Prefix for the two identifier columns of the result
        (``'<label> 1'`` for the row label, ``'<label> 2'`` for the column
        label). The default reproduces the original column names.

    Returns
    -------
    pandas.DataFrame
        Columns ``'<label> 1'``, ``'<label> 2'``, ``'p-value'`` and
        ``'Metric'``, with one row per pair taken from strictly above the
        diagonal and a fresh 0..n-1 index. Pairs whose p-value is NaN are
        dropped together with the masked cells.
    """
    metric, pvalues = entry[0], entry[1]
    first, second = f'{label} 1', f'{label} 2'

    # True only strictly above the diagonal, so each unordered pair is kept
    # once and the self-comparisons on the diagonal are discarded.
    upper = np.triu(np.ones(pvalues.shape, dtype=bool), k=1)

    return (
        pvalues.where(upper)
        # Name the index explicitly so reset_index() produces the expected
        # column even when the incoming matrix already has a named index.
        .rename_axis(index=first)
        .reset_index()
        .melt(id_vars=first, var_name=second, value_name='p-value')
        .dropna()
        .assign(Metric=metric)
        .reset_index(drop=True)
    )

# Stack the per-metric pairwise tables, then attach the mean score of each
# side of the pair by joining against `means` twice.
n_df = (
    pd.concat([format(entry) for entry in n_res])
    .merge(
        means,
        left_on=['Metric', 'Model 1'],
        right_on=['Metric', 'Model'],
    )
    .merge(
        means,
        left_on=['Metric', 'Model 2'],
        right_on=['Metric', 'Model'],
        # Both joins contribute 'Model' and 'Mean'; the suffixes tell the
        # first-side and second-side copies apart.
        suffixes=('_1', '_2'),
    )
    # The join keys duplicate 'Model 1' / 'Model 2'.
    .drop(columns=['Model_1', 'Model_2'])
    .rename(columns={
        'Mean_1': 'Mean of Model 1',
        'Mean_2': 'Mean of Model 2',
    })
)

In [137]:
# Metrics whose Friedman test is significant at alpha = 0.05 after a
# Bonferroni correction. f_df holds one row per test performed, so its length
# is used as the divisor instead of a hard-coded 5; the threshold then stays
# correct if the list of metrics changes.
f_df[f_df['p-value'] < 0.05 / len(f_df)]

Unnamed: 0,Metric,p-value
0,AUROC,7.24095e-27
1,AUPRC,1.061765e-23
2,F1,5.524771e-27
3,Precision,4.6646790000000005e-22
4,Recall,5.167719e-25


In [138]:
# Model pairs whose Nemenyi post-hoc p-value is below 0.05.
# NOTE(review): unlike the Friedman filter, this threshold is not divided by
# the number of metrics - confirm that is intended.
n_df.loc[n_df['p-value'] < 0.05]

Unnamed: 0,Model 1,Model 2,p-value,Metric,Mean of Model 1,Mean of Model 2
1,RF,TGFNN,0.003975,AUROC,0.632947,0.598982
2,LR,TGFNN,0.001000,AUROC,0.634851,0.598982
3,RF,XGB,0.001000,AUROC,0.632947,0.598403
4,LR,XGB,0.001000,AUROC,0.634851,0.598403
6,RF,TNET,0.001000,AUROC,0.632947,0.592901
...,...,...,...,...,...,...
96,TGFNN,EBM,0.001000,Recall,0.701117,0.095361
97,XGB,EBM,0.001000,Recall,0.512930,0.095361
101,TGFNN,DT,0.044580,Recall,0.701117,0.510052
103,TNET,DT,0.001000,Recall,0.158677,0.510052
