In [1]:
import sys
# Extend the import search path before the `from stats_fun import ...` line below;
# presumably this is the directory that contains stats_fun.py (confirm on the target machine).
sys.path.append('/ssd/kangdang/mmc_project_2023_submission/main_exp_code/step4_exp_analysis')

import pickle
import pandas as pd
import numpy as np
import os
# NOTE: `pathlib` in this notebook is an alias for os.path, NOT the standard-library
# pathlib module. Every later cell calls `pathlib.join(...)`, i.e. os.path.join.
import os.path as pathlib
from stats_fun import cal_ci95
import sklearn.metrics as metrics

# Module-level output folder; save_merged_dataframe reads this global when writing its CSV.
output_dir = '/ssd/kangdang/mmc_project_2023_submission/main_exp_code/step4_exp_analysis/table2/fusion_data'
os.makedirs(output_dir, exist_ok=True)
import warnings
# Suppress every UserWarning for the rest of the kernel session.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
def get_score_and_gt_helper(input_dir, method_id, feature_id, fusion_type='metric_only', imputation_type='mi', num_imputations=20):
    """Load validation/test labels and prediction scores for one method.

    Result files are CSVs named ``feat_[<feature_id>]_val_pred_result.csv`` and
    ``feat_[<feature_id>]_test_pred_result.csv`` with a ``gt`` and a ``pred`` column.

    Args:
        input_dir: Root folder of the experiment records.
        method_id: Sub-folder name of the method (e.g. ``'method_lr_basic'``).
        feature_id: Value inserted (via ``str``) between the brackets of the file name.
        fusion_type: Name of the innermost sub-folder (``'metric_only'`` or ``'fusion'``
            in this notebook).
        imputation_type: ``'mi'`` reads one run per imputation from
            ``<input_dir>/<method_id>/mice_imputation_<k>/<fusion_type>`` and averages the
            predictions element-wise. Any other value reads the single run stored in
            ``<input_dir>/<method_id>/<fusion_type>``.
        num_imputations: Number of ``mice_imputation_<k>`` folders (k = 1..num_imputations)
            to average when ``imputation_type == 'mi'``. Ignored otherwise.

    Returns:
        Tuple ``(val_gt, test_gt, val_pred, test_pred)`` of NumPy arrays. In the ``'mi'``
        case the labels come from the first imputation and the predictions are the mean
        over all imputations.

    Raises:
        ValueError: If ``num_imputations`` is smaller than 1, or if the labels of an
            imputation run differ from those of the first run (averaging predictions
            would then mix up rows).
    """
    def _read_split(folder, split):
        # Read one result CSV and return its (gt, pred) columns as arrays.
        csv_path = pathlib.join(folder, "feat_[" + str(feature_id) + "]_" + split + "_pred_result.csv")
        frame = pd.read_csv(csv_path)
        return np.array(frame['gt']), np.array(frame['pred'])

    if imputation_type != 'mi':
        # Single run: return the stored arrays untouched (no averaging).
        run_folder = pathlib.join(input_dir, method_id, fusion_type)
        val_gt, val_pred = _read_split(run_folder, 'val')
        test_gt, test_pred = _read_split(run_folder, 'test')
        return val_gt, test_gt, val_pred, test_pred

    if num_imputations < 1:
        raise ValueError("num_imputations must be >= 1, got " + str(num_imputations))

    method_dir = pathlib.join(input_dir, method_id)
    val_gt = None
    test_gt = None
    val_pred_runs = []
    test_pred_runs = []
    for mice_id in range(1, num_imputations + 1):
        run_folder = pathlib.join(method_dir, 'mice_imputation_' + str(mice_id), fusion_type)
        run_val_gt, run_val_pred = _read_split(run_folder, 'val')
        run_test_gt, run_test_pred = _read_split(run_folder, 'test')
        if val_gt is None:
            # Labels of the first imputation are the reference that gets returned.
            val_gt, test_gt = run_val_gt, run_test_gt
        elif not (np.array_equal(val_gt, run_val_gt) and np.array_equal(test_gt, run_test_gt)):
            # Averaging is only meaningful when every run scores the same samples in the same order.
            raise ValueError("ground-truth labels in " + run_folder + " differ from the first imputation run")
        val_pred_runs.append(run_val_pred)
        test_pred_runs.append(run_test_pred)

    # Pool the imputations by averaging the per-sample scores.
    val_pred = np.mean(np.array(val_pred_runs), axis=0)
    test_pred = np.mean(np.array(test_pred_runs), axis=0)
    return val_gt, test_gt, val_pred, test_pred


def get_score_and_gt(input_dir, feature_id, fusion_type='metric_only', imputation_type='mi', method_id_list = ['method_lr_basic', 'method_lda_basic',  'method_gbc_basic', 'method_rf_basic',  'method_lsvm_basic',  'method_rbfsvm_basic'] ):
    """Pick the method with the highest validation AUC and return its scores.

    For every entry of ``method_id_list`` the labels/predictions are loaded with
    ``get_score_and_gt_helper``, the validation and test ROC AUC are computed, and a line
    ``<method_id>:<val_auc>, <test_auc>`` (rounded to 3 decimals) is printed.

    Args:
        input_dir: Root folder of the experiment records.
        feature_id: Feature identifier forwarded to ``get_score_and_gt_helper``.
        fusion_type: Forwarded to ``get_score_and_gt_helper``.
        imputation_type: Forwarded to ``get_score_and_gt_helper``.
        method_id_list: Candidate method ids. The list is only read, never modified.

    Returns:
        Tuple ``(best_method_id, val_gt, test_gt, val_pred, test_pred)`` for the method
        with the largest validation AUC (the first one wins on ties, as with
        ``np.argmax``). ``best_method_id`` is an element of ``np.array(method_id_list)``.

    Raises:
        ValueError: If ``method_id_list`` is empty.
    """
    if len(method_id_list) == 0:
        raise ValueError("method_id_list must contain at least one method id")

    val_auc_list = []
    loaded_scores = []
    for method_id in method_id_list:
        scores = get_score_and_gt_helper(input_dir, method_id, feature_id, fusion_type, imputation_type)
        val_gt_list, test_gt_list, val_pred_list, test_pred_list = scores
        fpr, tpr, _ = metrics.roc_curve(val_gt_list, val_pred_list)
        auc = metrics.auc(fpr, tpr)
        fpr_test, tpr_test, _ = metrics.roc_curve(test_gt_list, test_pred_list)
        auc_test = metrics.auc(fpr_test, tpr_test)
        print(method_id + ':' + str(round(auc, 3)) + ', ' + str(round(auc_test, 3)))
        val_auc_list.append(auc)
        # Keep the arrays so the winner does not have to be read from disk a second time.
        loaded_scores.append(scores)

    # Model selection uses the validation AUC only; the test AUC above is printed for information.
    best_val_idx = np.argmax(val_auc_list)
    best_method_id = np.array(method_id_list)[best_val_idx]
    val_gt_list, test_gt_list, val_pred_list, test_pred_list = loaded_scores[best_val_idx]
    return best_method_id, val_gt_list, test_gt_list, val_pred_list, test_pred_list

In [3]:
def _print_ci_summary(gt, pred):
    """Print AUC / recall / sep summaries for one score set and return cal_ci95's thresholds.

    For each of 'auc', 'recall' and 'sep' three lines are printed: the label, the
    two-element interval returned by ``cal_ci95`` as ``[low,high]`` in percent, and the
    point value in percent (one decimal each). The AUC point value is recomputed with
    sklearn's ``roc_curve``/``auc``.

    Returns:
        Tuple ``(largest_youden_th, given_recall_th)`` taken from the last two values
        returned by ``cal_ci95``.
    """
    # NOTE(review): the meaning of the third argument (0.8) is defined in stats_fun.cal_ci95;
    # presumably the target recall behind `given_recall_th` -- confirm there.
    (auc95, rec, sep, _sep_90, _global_auc, global_rec, global_sep,
     _global_sep_90, largest_youden_th, given_recall_th) = cal_ci95(gt, pred, 0.8)
    fpr, tpr, _ = metrics.roc_curve(gt, pred)
    auc = metrics.auc(fpr, tpr)
    for label, interval, point in (('auc', auc95, auc), ('recall', rec, global_rec), ('sep', sep, global_sep)):
        print(label)
        print("[" + "{:.1f}".format(interval[0] * 100) + "," + "{:.1f}".format(interval[1] * 100) + "]")
        print("{:.1f}".format(point * 100))
    return largest_youden_th, given_recall_th


def save_merged_dataframe(input_dir, metric_string, save_name, imputation_type='mi', method_id_list = ['method_lr_basic'], save_dir=None):
    """Print test-set summaries for the metric-only and fusion models and save their scores.

    Both model variants are loaded through ``get_score_and_gt`` (which also prints the
    per-method validation/test AUC), their summaries are printed under the headings
    ``metric_only`` and ``fusion``, and one CSV is written with the columns
    ``target``, ``metric``, ``fusion``, ``metric_th``, ``fusion_th``.

    Args:
        input_dir: Root folder of the experiment records.
        metric_string: Feature identifier forwarded to ``get_score_and_gt``.
        save_name: File name of the CSV to write.
        imputation_type: Forwarded to ``get_score_and_gt``.
        method_id_list: Candidate method ids forwarded to ``get_score_and_gt``.
        save_dir: Folder for the CSV. ``None`` (default) uses the notebook-level
            ``output_dir`` global, which is the original behaviour.

    Returns:
        None. The result is the printed summary and the CSV file.

    Raises:
        AssertionError: If the two variants do not share identical test labels.
    """
    _, _, test_gt_list0, _, test_pred_list0 = get_score_and_gt(input_dir, metric_string, "metric_only", imputation_type, method_id_list = method_id_list)
    _, _, test_gt_list1, _, test_pred_list1 = get_score_and_gt(input_dir, metric_string, "fusion", imputation_type, method_id_list = method_id_list)

    print('metric_only')
    youden_th_metric, _ = _print_ci_summary(test_gt_list0, test_pred_list0)

    print('fusion')
    youden_th_fusion, _ = _print_ci_summary(test_gt_list1, test_pred_list1)

    # Both score columns are stored next to a single target column, so the samples must line up.
    # np.array_equal also copes with differing shapes, where `(a == b).all()` would not.
    assert np.array_equal(test_gt_list0, test_gt_list1), "metric_only and fusion test labels differ"

    # NOTE(review): the binarisation thresholds are the ones cal_ci95 returned for these same
    # test scores -- confirm that deriving them from the test split is intended.
    # Stacking the rows into one array makes NumPy promote them to a common dtype, so with
    # float scores the boolean decisions are written to the CSV as 1.0 / 0.0.
    merged_data_list = np.array([
        test_gt_list0,
        test_pred_list0,
        test_pred_list1,
        np.array(test_pred_list0) >= youden_th_metric,
        np.array(test_pred_list1) >= youden_th_fusion,
    ]).transpose()

    merged_data_frame = pd.DataFrame(merged_data_list, columns=['target', 'metric', 'fusion', 'metric_th', 'fusion_th'])
    if save_dir is None:
        save_dir = output_dir
    merged_data_frame.to_csv(pathlib.join(save_dir, save_name), index=False)
    

In [4]:
# Logistic regression on indicators '1'..'12', read from the `random_mi` experiment
# records with the default imputation_type ('mi').
save_name = 'scenario_random_metric_and_fusion_score.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_lr_basic'],
)

method_lr_basic:0.855, 0.847
method_lr_basic:0.864, 0.853
metric_only
auc
[84.0,85.5]
84.7
recall
[77.0,78.9]
77.9
sep
[74.4,77.3]
75.8
fusion
auc
[84.6,86.1]
85.3
recall
[77.3,79.2]
78.2
sep
[75.4,78.4]
76.9


In [5]:
# Logistic regression on the `random_remove_any_nan_sample` records. Any imputation_type
# other than 'mi' makes the loader read a single run folder instead of averaging imputations.
save_name = 'scenario_random_metric_and_fusion_score_no_imputation.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_remove_any_nan_sample'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    imputation_type='random_remove_any_nan_sample',
    method_id_list=['method_lr_basic'],
)

method_lr_basic:0.853, 0.844
method_lr_basic:0.862, 0.849
metric_only
auc
[83.6,85.3]
84.4
recall
[75.1,77.2]
76.2
sep
[75.5,78.6]
77.1
fusion
auc
[84.1,85.7]
84.9
recall
[76.4,78.4]
77.3
sep
[74.9,78.1]
76.6


In [6]:
# Logistic regression on the `random_simple_imputation` records (single run folder,
# because imputation_type is not 'mi').
save_name = 'scenario_random_metric_and_fusion_score_simple_imputation.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_simple_imputation'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    imputation_type='simple',
    method_id_list=['method_lr_basic'],
)

method_lr_basic:0.855, 0.847
method_lr_basic:0.864, 0.853
metric_only
auc
[84.0,85.5]
84.7
recall
[77.3,79.2]
78.2
sep
[74.1,77.0]
75.5
fusion
auc
[84.6,86.1]
85.3
recall
[78.5,80.2]
79.3
sep
[73.7,76.9]
75.3


In [7]:
# Method ids accepted by get_score_and_gt by default: 'method_lr_basic', 'method_lda_basic',
# 'method_gbc_basic', 'method_rf_basic', 'method_lsvm_basic', 'method_rbfsvm_basic'.
# This cell evaluates the LDA variant on the `random_mi` records.
save_name = 'scenario_random_metric_and_fusion_score_lda.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_lda_basic'],
)

method_lda_basic:0.853, 0.846
method_lda_basic:0.861, 0.851
metric_only
auc
[83.8,85.4]
84.6
recall
[78.7,80.5]
79.6
sep
[71.7,74.7]
73.2
fusion
auc
[84.3,85.9]
85.1
recall
[74.7,76.6]
75.6
sep
[77.1,79.9]
78.5


In [8]:
# 'method_gbc_basic' variant on the `random_mi` records, indicators '1'..'12'.
save_name = 'scenario_random_metric_and_fusion_score_gbc.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_gbc_basic'],
)

method_gbc_basic:0.854, 0.847
method_gbc_basic:0.855, 0.841
metric_only
auc
[83.9,85.5]
84.7
recall
[75.8,77.7]
76.7
sep
[75.9,78.8]
77.4
fusion
auc
[83.3,84.9]
84.1
recall
[74.0,76.0]
75.0
sep
[76.3,79.0]
77.6


In [9]:
# 'method_rf_basic' variant on the `random_mi` records, indicators '1'..'12'.
save_name = 'scenario_random_metric_and_fusion_score_rf.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_rf_basic'],
)

method_rf_basic:0.849, 0.84
method_rf_basic:0.853, 0.842
metric_only
auc
[83.2,84.8]
84.0
recall
[74.0,76.0]
75.0
sep
[76.2,79.0]
77.6
fusion
auc
[83.3,84.9]
84.2
recall
[71.7,73.7]
72.7
sep
[78.1,80.9]
79.5


In [10]:
# 'method_lsvm_basic' variant on the `random_mi` records, indicators '1'..'12'.
save_name = 'scenario_random_metric_and_fusion_score_lsvm.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_lsvm_basic'],
)

method_lsvm_basic:0.854, 0.846
method_lsvm_basic:0.861, 0.852
metric_only
auc
[83.9,85.4]
84.6
recall
[75.5,77.5]
76.5
sep
[75.3,78.1]
76.7
fusion
auc
[84.4,86.0]
85.2
recall
[74.7,76.6]
75.6
sep
[77.4,80.2]
78.8


In [11]:
# 'method_rbfsvm_basic' variant on the `random_mi` records, indicators '1'..'12'.
save_name = 'scenario_random_metric_and_fusion_score_rbf_svm.csv'
metric_string = "'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'"
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi'
save_merged_dataframe(
    input_dir=input_dir,
    metric_string=metric_string,
    save_name=save_name,
    method_id_list=['method_rbfsvm_basic'],
)

method_rbfsvm_basic:0.847, 0.836
method_rbfsvm_basic:0.855, 0.842
metric_only
auc
[82.7,84.4]
83.6
recall
[74.1,76.1]
75.0
sep
[76.3,79.1]
77.7
fusion
auc
[83.4,85.1]
84.2
recall
[77.9,79.8]
78.8
sep
[74.3,77.3]
75.7


In [16]:
# One logistic-regression evaluation per clinical indicator. The feature id passed to the
# loader is the 1-based position of the indicator in this list, wrapped in single quotes.
indicator_name_list = ['Gender', 'Age', 'Diastolic Bp', 'Systolic Bp',  'Heart Rate', 'BMI', 'Course of Diabetes', 'Hypertension', 'Hyperlipidemia', 'Cardiovascular Disease', 'Classification of smoker', 'Classification of drinker']
# The record folder is the same for every indicator, so it is set once outside the loop.
input_dir = '/mnt/eye_team/kangdang/mmc_project_2023_submission/exp_records/fusion_exps/random_mi_single_variable'
# enumerate(start=1) keeps the feature ids in step with the list length and avoids
# reassigning the loop variable inside the body.
for idx, curr_indicator_name in enumerate(indicator_name_list, start=1):
    print(curr_indicator_name)
    metric_string = "'" + str(idx) + "'"
    # The indicator name goes into the file name verbatim, spaces included.
    save_name = 'scenario_random_metric_and_fusion_score_' + curr_indicator_name + '.csv'
    save_merged_dataframe(input_dir, metric_string, save_name, method_id_list = ['method_lr_basic'])
    print('\n')

Gender
method_lr_basic:0.534, 0.536
method_lr_basic:0.795, 0.78
metric_only
auc
[52.5,54.6]
53.6
recall
[42.5,44.8]
43.7
sep
[61.8,65.2]
63.6
fusion
auc
[77.1,79.0]
78.0
recall
[67.4,69.4]
68.4
sep
[72.3,75.5]
73.9


Age
method_lr_basic:0.764, 0.766
method_lr_basic:0.804, 0.794
metric_only
auc
[75.6,77.6]
76.6
recall
[70.0,72.1]
71.1
sep
[65.9,69.2]
67.5
fusion
auc
[78.5,80.3]
79.4
recall
[72.8,74.7]
73.7
sep
[68.7,71.9]
70.2


Diastolic Bp
method_lr_basic:0.649, 0.631
method_lr_basic:0.816, 0.801
metric_only
auc
[62.0,64.2]
63.1
recall
[49.3,51.6]
50.5
sep
[67.2,70.3]
68.7
fusion
auc
[79.1,81.0]
80.1
recall
[71.4,73.4]
72.5
sep
[72.2,75.3]
73.7


Systolic Bp
method_lr_basic:0.758, 0.749
method_lr_basic:0.841, 0.83
metric_only
auc
[73.9,75.9]
74.9
recall
[59.5,61.9]
60.7
sep
[75.3,78.3]
76.8
fusion
auc
[82.1,83.8]
83.0
recall
[76.3,78.3]
77.3
sep
[71.5,74.7]
73.1


Heart Rate
method_lr_basic:0.537, 0.535
method_lr_basic:0.801, 0.786
metric_only
auc
[52.2,54.7]
53.5
recall
[68.0,70.1]
6

In [19]:
# Evaluate the saved external-test predictions of the image-only checkpoint ensemble.
# NOTE: pickle.load can execute arbitrary code contained in the file -- only load result
# files produced by this project.
# The `with` block closes the file handle, which `pickle.load(open(...))` left open.
with open('/ssd/kangdang/mmc_project_2023_submission/external_test_code/img_only/top_checkpoint_ensemble_external_test_result.pickle', 'rb') as result_file:
    test_result = pickle.load(result_file)
test_pred = test_result['pred_set']
test_gt = test_result['gt_set']

# NOTE(review): the last two names end in `_fusion` although the path above points at
# image-only results; they are kept unchanged in case later cells read them.
auc95, rec, sep, sep_90, global_auc, global_rec, global_sep, global_sep_90, global_largest_youden_th_fusion, global_given_recall_th_fusion = cal_ci95(test_gt, test_pred, 0.8)
fpr, tpr, threshold = metrics.roc_curve(test_gt, test_pred)
auc = metrics.auc(fpr, tpr)
# Each metric prints its label, the interval from cal_ci95 as [low,high] in percent,
# then the point value in percent.
print('auc')
print("[" + "{:.1f}".format(auc95[0] * 100) + "," + "{:.1f}".format(auc95[1] * 100) + "]")
print("{:.1f}".format(auc * 100))
print('recall')
print("[" + "{:.1f}".format(rec[0] * 100) + "," + "{:.1f}".format(rec[1] * 100) + "]")
print("{:.1f}".format(global_rec * 100))
print('sep')
print("[" + "{:.1f}".format(sep[0] * 100) + "," + "{:.1f}".format(sep[1] * 100) + "]")
print("{:.1f}".format(global_sep * 100))

auc
[75.7,78.6]
77.2
recall
[72.0,74.7]
73.4
sep
[65.9,70.8]
68.4
