In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as spy
from tqdm import tqdm
import time
import pickle
import glob


# 결과 확인을 용이하게 하기 위한 코드
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_rows',200)
pd.set_option('display.max_colwidth',None)

from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# icd version selection

In [2]:
ICD_VER=9

# xgboost model

In [3]:
def XGBoost(X_train, X_test, y_train, y_test):
    # Create XGBoost classifer object
    xgb = XGBClassifier()

    # Train XGBoost Classifer
    model=xgb.fit(X_train, y_train)

    # Predict the response for test dataset
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)

    # Model Accuracy, how often is the classifier correct?
    APR=metrics.average_precision_score(y_test, y_pred_prob[:,1], average='weighted')
    AUC=metrics.roc_auc_score(y_test, y_pred_prob[:,1])

    return APR, AUC, list(model.feature_names_in_), list(model.feature_importances_)

# data preparation for XGB modeling

In [4]:
def ML_data_preparation(cl1_dat, cl0_dat):
    cl1_dat_prep=cl1_dat.copy()
    cl0_dat_prep=cl0_dat.copy()
    
    cl1_dat_prep['gender']=cl1_dat_prep['gender'].replace({'M':1, 'F':2})
    cl0_dat_prep['gender']=cl0_dat_prep['gender'].replace({'M':1, 'F':2})

    cl1_dat_prep['class']=1
    cl0_dat_prep['class']=0
    
    cl0_dat_prep_sub=cl0_dat_prep.sample(len(cl1_dat_prep))
    concat_data=pd.concat([cl1_dat_prep, cl0_dat_prep_sub], axis=0)

    X=concat_data.drop(columns='class')
    y=concat_data['class']
    
    return X, y

# XGBoost modeling

In [5]:
xgb_res_df=pd.DataFrame(columns=['Hage_equalmore', 'Hage_less', 'APR','AUC', 'FTN', 'FTI'])

global_it=0
for file_path in glob.glob("./results/data_cl1_icd{}_*_*.pickle".format(ICD_VER)):
    Hage_th1, Hage_th2 = file_path.split('.')[-2].split('_')[-2:]
    file_path
    with open(file_path, 'rb') as f:
        cl1_dat=pickle.load(f).drop(columns=['subject_id','H_time', 'HF_diff'])

    with open(file_path.replace('data_cl1_icd','data_cl0_icd'), 'rb') as f:
        cl0_dat=pickle.load(f).drop(columns=['subject_id','H_time', 'HF_diff'])

    for it in range(1000):
        X,y=ML_data_preparation(cl1_dat, cl0_dat)

        ## split traing and test data
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y)

        ## machine learning model: (XGBoost)
        APR, AUC, FTN, FTI=XGBoost(X_train, X_test, y_train, y_test)
        APR, AUC, FTN, FTI=XGBoost(X_train, X_test, y_train, y_test)
        xgb_res_df.loc[global_it]=[Hage_th1, Hage_th2, APR, AUC, FTN, FTI]
        global_it+=1
        
xgb_res_df
with open('./xgb_results/XGB_res_icd{}.pickle'.format(ICD_VER), 'wb') as f:
    pickle.dump(xgb_res_df, f)

'./results/data_cl1_icd9_0_65.pickle'

'./results/data_cl1_icd9_0_120.pickle'

'./results/data_cl1_icd9_65_80.pickle'

'./results/data_cl1_icd9_80_120.pickle'

Unnamed: 0,Hage_equalmore,Hage_less,APR,AUC,FTN,FTI
0,0,65,0.639743,0.674684,"[gender, H_age, 07054, 2449, 25000, 25060, 2720, 2724, 2761, 27800, 27801, 2859, 30000, 3004, 3051, 311, 32723, 33829, 3572, 412, 41401, 42731, 486, 49390, 496, 53081, 5849, 5990, 78659, V1254, V1582, V4581, V4582, V5861, V5866, V5867]","[0.013923257, 0.019665463, 0.03626204, 0.017095285, 0.020031724, 0.024627877, 0.030178916, 0.015598282, 0.013047555, 0.01522253, 0.015419227, 0.02331943, 0.014614609, 0.022016335, 0.022907875, 0.025357332, 0.03722352, 0.026314003, 0.043897945, 0.025273139, 0.060625367, 0.10207326, 0.021984546, 0.0232356, 0.034217466, 0.021952106, 0.014520799, 0.017128866, 0.02813953, 0.057308756, 0.022089036, 0.026794866, 0.023807572, 0.010753601, 0.04022795, 0.033144392]"
1,0,65,0.666635,0.690137,"[gender, H_age, 07054, 2449, 25000, 25060, 2720, 2724, 2761, 27800, 27801, 2859, 30000, 3004, 3051, 311, 32723, 33829, 3572, 412, 41401, 42731, 486, 49390, 496, 53081, 5849, 5990, 78659, V1254, V1582, V4581, V4582, V5861, V5866, V5867]","[0.01969942, 0.017566983, 0.0271908, 0.015461154, 0.029207338, 0.05921217, 0.01103986, 0.017638251, 0.033592407, 0.019302094, 0.047441196, 0.01724356, 0.026377408, 0.029490335, 0.021516992, 0.015643522, 0.023805931, 0.011022388, 0.049974293, 0.014033188, 0.0885018, 0.06289474, 0.038193267, 0.009726856, 0.030702183, 0.01898205, 0.024512494, 0.015251405, 0.025828775, 0.015238455, 0.031473454, 0.053364888, 0.014033172, 0.028988587, 0.014782061, 0.021066615]"
2,0,65,0.633344,0.644182,"[gender, H_age, 07054, 2449, 25000, 25060, 2720, 2724, 2761, 27800, 27801, 2859, 30000, 3004, 3051, 311, 32723, 33829, 3572, 412, 41401, 42731, 486, 49390, 496, 53081, 5849, 5990, 78659, V1254, V1582, V4581, V4582, V5861, V5866, V5867]","[0.016810128, 0.016463123, 0.068234764, 0.014758523, 0.024366327, 0.024912663, 0.022261411, 0.021142587, 0.01554384, 0.016867287, 0.023449536, 0.016272854, 0.020678656, 0.012130943, 0.022604363, 0.019961871, 0.023430858, 0.021228062, 0.045253683, 0.07130547, 0.025863731, 0.04118526, 0.020820979, 0.016476035, 0.035124734, 0.021709397, 0.029657241, 0.020261576, 0.031839408, 0.013904345, 0.023279497, 0.08280482, 0.01106999, 0.023706982, 0.032410588, 0.052208513]"
3,0,65,0.600180,0.579414,"[gender, H_age, 07054, 2449, 25000, 25060, 2720, 2724, 2761, 27800, 27801, 2859, 30000, 3004, 3051, 311, 32723, 33829, 3572, 412, 41401, 42731, 486, 49390, 496, 53081, 5849, 5990, 78659, V1254, V1582, V4581, V4582, V5861, V5866, V5867]","[0.015240487, 0.023190925, 0.045828406, 0.021265887, 0.03441975, 0.059295624, 0.031185828, 0.01811754, 0.016229201, 0.022117712, 0.035204176, 0.027153764, 0.030687867, 0.035783395, 0.020950219, 0.025421454, 0.021636734, 0.019865122, 0.032322913, 0.015964542, 0.038118728, 0.04294139, 0.03810749, 0.027720876, 0.010466861, 0.02070078, 0.03757021, 0.017150544, 0.032398596, 0.014062842, 0.020194102, 0.047261745, 0.012176912, 0.030792953, 0.020486305, 0.03796814]"
4,0,65,0.643173,0.632894,"[gender, H_age, 07054, 2449, 25000, 25060, 2720, 2724, 2761, 27800, 27801, 2859, 30000, 3004, 3051, 311, 32723, 33829, 3572, 412, 41401, 42731, 486, 49390, 496, 53081, 5849, 5990, 78659, V1254, V1582, V4581, V4582, V5861, V5866, V5867]","[0.01977822, 0.019039141, 0.016608663, 0.022042232, 0.024420712, 0.048058655, 0.052736603, 0.0262055, 0.022638343, 0.024385273, 0.03333685, 0.0230999, 0.019971924, 0.015809389, 0.016216744, 0.01988931, 0.024920633, 0.016300756, 0.10106065, 0.011396671, 0.043583967, 0.05747666, 0.02472337, 0.013189823, 0.034304343, 0.022272302, 0.01269676, 0.029163837, 0.028358376, 0.018645609, 0.028063191, 0.049538616, 0.017228834, 0.012617093, 0.026049327, 0.024171632]"
...,...,...,...,...,...,...
3995,80,120,0.603513,0.636004,"[gender, H_age, 2449, 25000, 2720, 2724, 2749, 2761, 27651, 2851, 2859, 30000, 311, 3659, 412, 41400, 41401, 4241, 42731, 42789, 486, 496, 53081, 56400, 5849, 5990, 60000, 71590, 73300, V103, V1046, V1083, V1254, V1582, V4364, V4581, V4582, V5861]","[0.021235082, 0.021297494, 0.02677869, 0.027033135, 0.027963247, 0.022844924, 0.015541687, 0.018312285, 0.027783029, 0.047250755, 0.02811232, 0.029518805, 0.023459082, 0.02458, 0.016624397, 0.022091767, 0.02851007, 0.050602626, 0.034310974, 0.024137784, 0.018990273, 0.02582146, 0.02062861, 0.027972806, 0.02429327, 0.032135278, 0.02685669, 0.015961066, 0.028147357, 0.015031154, 0.043792617, 0.018648408, 0.028382102, 0.027984874, 0.022521205, 0.031988647, 0.02879112, 0.024064802]"
3996,80,120,0.582811,0.572039,"[gender, H_age, 2449, 25000, 2720, 2724, 2749, 2761, 27651, 2851, 2859, 30000, 311, 3659, 412, 41400, 41401, 4241, 42731, 42789, 486, 496, 53081, 56400, 5849, 5990, 60000, 71590, 73300, V103, V1046, V1083, V1254, V1582, V4364, V4581, V4582, V5861]","[0.026018001, 0.020871753, 0.022146149, 0.023858765, 0.022801835, 0.019103263, 0.01856355, 0.019650057, 0.03729481, 0.017730363, 0.025855176, 0.029160308, 0.025362296, 0.03504622, 0.021235244, 0.044467915, 0.028244335, 0.03127128, 0.025157727, 0.026203698, 0.022907954, 0.030615028, 0.022675073, 0.021065189, 0.028382875, 0.025726935, 0.023913039, 0.020556934, 0.020382091, 0.020754183, 0.0180127, 0.025659231, 0.0328871, 0.032672707, 0.030422222, 0.036013007, 0.03297456, 0.03433633]"
3997,80,120,0.546417,0.548440,"[gender, H_age, 2449, 25000, 2720, 2724, 2749, 2761, 27651, 2851, 2859, 30000, 311, 3659, 412, 41400, 41401, 4241, 42731, 42789, 486, 496, 53081, 56400, 5849, 5990, 60000, 71590, 73300, V103, V1046, V1083, V1254, V1582, V4364, V4581, V4582, V5861]","[0.022943638, 0.021110713, 0.02687431, 0.024058975, 0.027642952, 0.021575863, 0.01285033, 0.01680018, 0.030489765, 0.027192004, 0.026230033, 0.022141198, 0.023236774, 0.030021174, 0.017155817, 0.024977297, 0.026237229, 0.046762634, 0.029679019, 0.019923847, 0.0167065, 0.033175033, 0.023846606, 0.023344787, 0.02338798, 0.03258109, 0.026144037, 0.036878206, 0.02176894, 0.02500826, 0.028727254, 0.023583528, 0.01753039, 0.029939609, 0.024539098, 0.041377813, 0.0406379, 0.032919247]"
3998,80,120,0.603768,0.603322,"[gender, H_age, 2449, 25000, 2720, 2724, 2749, 2761, 27651, 2851, 2859, 30000, 311, 3659, 412, 41400, 41401, 4241, 42731, 42789, 486, 496, 53081, 56400, 5849, 5990, 60000, 71590, 73300, V103, V1046, V1083, V1254, V1582, V4364, V4581, V4582, V5861]","[0.018620417, 0.020184413, 0.021412779, 0.024185788, 0.02586962, 0.022228982, 0.01056532, 0.03421498, 0.042193454, 0.013612217, 0.022967158, 0.013774689, 0.026011981, 0.022258056, 0.017565558, 0.04359388, 0.026184628, 0.04408432, 0.03426424, 0.034300745, 0.015268413, 0.020011876, 0.02206783, 0.0288795, 0.021027716, 0.023849385, 0.025431057, 0.023779625, 0.029621882, 0.022532368, 0.030653445, 0.017687358, 0.028971665, 0.029163564, 0.035610456, 0.028201843, 0.04833395, 0.030814834]"


In [7]:
xgb_res_df.groupby(['Hage_equalmore','Hage_less'])['APR'].mean().sort_values(ascending=False)
xgb_res_df.groupby(['Hage_equalmore','Hage_less'])['AUC'].mean().sort_values(ascending=False)

Hage_equalmore  Hage_less
0               120          0.672327
                65           0.653877
65              80           0.605344
80              120          0.602134
Name: APR, dtype: float64

Hage_equalmore  Hage_less
0               120          0.692551
                65           0.643257
65              80           0.611525
80              120          0.600857
Name: AUC, dtype: float64