In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from glob import glob

In [None]:
# Run configuration: which site/year to model, where the data lives,
# and the last day since admission (inclusive) for which features are built.
site = "MCW"
year = 2012
pred_end = 7
datafolder = "/home/hchan2/AKI/data/"
home_directory = "/home/hchan2/AKI/AKI_Python/"

In [None]:
# Load the raw AKI extracts for the configured site.
# PATID / ENCOUNTERID are always read as strings; the DX table additionally
# keeps DX_TYPE and DX as strings.
raw_dir = datafolder + site + '/raw/'
id_dtypes = {'PATID': 'object', 'ENCOUNTERID': 'object'}
dx_dtypes = {'PATID': 'object', 'ENCOUNTERID': 'object', 'DX_TYPE': 'object', 'DX': 'object'}

onset = pd.read_csv(raw_dir + 'AKI_ONSETS.csv', sep=',', dtype=id_dtypes)
vital = pd.read_csv(raw_dir + 'AKI_VITAL.csv', sep=',', dtype=id_dtypes)
demo = pd.read_csv(raw_dir + 'AKI_DEMO.csv', sep=',', dtype=id_dtypes)
dx = pd.read_csv(raw_dir + 'AKI_DX.csv', sep=',', dtype=dx_dtypes)
px = pd.read_csv(raw_dir + 'AKI_PX.csv', sep=',', dtype=id_dtypes)
lab = pd.read_csv(raw_dir + 'AKI_LAB.csv', sep=',', dtype=id_dtypes)
amed = pd.read_csv(raw_dir + 'AKI_AMED.csv', sep=',', dtype=id_dtypes)

In [None]:
# RxNorm -> ATC: attach the ATC mapping to every administered-medication row.
# The commented line below exports the distinct MEDADMIN_CODE values for the
# external mapping step.
# pd.DataFrame(amed['MEDADMIN_CODE'].unique()).to_csv('/home/hchan2/AKI/AKI_Python/rxnormtmp.csv', sep=',', index=False, header = False)

# Manual step: run rxnorm2atcR.ipynb before this cell; it is expected to
# produce the rxnorm_out_<site>.csv file read below.

# Rename the mapping's Rxcui column to MEDADMIN_CODE so it can serve as the join key.
rxcui2atc = pd.read_csv(home_directory+'rxnorm_out_'+site+'.csv',sep=',') >> rename(MEDADMIN_CODE=X.Rxcui)
# Left join keeps every amed row, including codes without a mapping.
# NOTE(review): `amed` is overwritten, so re-running this cell joins the mapping a second time.
# NOTE(review): assumes MEDADMIN_CODE has a compatible dtype in both tables - TODO confirm.
amed = amed >> left_join(rxcui2atc, by='MEDADMIN_CODE')

In [None]:
# ICD-10 -> ICD-9: translate ICD-10 diagnoses through the GEM crosswalk so that
# all diagnoses share one code system.
#
# Read every code as text: ICD-9 codes such as "0010" would otherwise be parsed
# as numbers and lose their leading zeros.
icd10toicd09 = pd.read_csv(home_directory+'2018_I10gem.csv', sep=',', dtype=str)
icd10toicd09.columns = ['DX', 'DX09']

# The crosswalk is joined on dot-less codes. The vectorised .str.replace
# leaves missing DX values as NaN instead of raising on them.
dx['DX'] = dx['DX'].str.replace('.', '', regex=False)

is_icd10 = dx['DX_TYPE'] == '10'
# A single ICD-10 code may map to several ICD-9 codes; the left join then
# yields one row per mapping.
dx3 = dx[is_icd10].merge(icd10toicd09, on='DX', how='left')

# Only relabel rows that actually found a mapping. Unmatched ICD-10 codes stay
# as DX_TYPE '10' with their original code rather than becoming NaN '09' rows.
matched = dx3['DX09'].notna()
dx3.loc[matched, 'DX_TYPE'] = '09'
dx3.loc[matched, 'DX'] = dx3.loc[matched, 'DX09']
dx3 = dx3.drop(columns='DX09')

dx = pd.concat([dx[~is_icd10], dx3], axis=0, ignore_index=True)

In [None]:
# Get AKI1 patients: encounters admitted in `year` that have an AKI stage-1 onset.
onset['ADMIT_DATE'] = pd.to_datetime(onset['ADMIT_DATE'])

# Compare on the calendar year so admissions on Dec 31 that carry a time of day
# are not cut off by a "<= YYYY/12/31 00:00" bound.
onset_yr = onset[onset['ADMIT_DATE'].dt.year == year]
onset_yr_aki1 = onset_yr[onset_yr["AKI1_ONSET"].notnull()]

# Explicit copy: the frame is modified below and must not be a view of `onset`.
onset_yr_aki1_select = onset_yr_aki1[["PATID", "ENCOUNTERID", "AKI1_SINCE_ADMIT"]].copy()

# Onsets recorded on day 0 are moved to day 1.
onset_yr_aki1_select.loc[onset_yr_aki1_select['AKI1_SINCE_ADMIT'] == 0, 'AKI1_SINCE_ADMIT'] = 1

In [None]:
# Plain Python lists of the cohort's patient ids, encounter ids and onset days.
PATID, ENCOUNTERID, SINCE_ADMIT = (
    onset_yr_aki1_select[column].tolist()
    for column in ('PATID', 'ENCOUNTERID', 'AKI1_SINCE_ADMIT')
)

In [None]:
# TODO: rethink whether this is needed.
# Events recorded on day 0 are moved to day 1 in every event table (in place).
for event_table in (dx, px, amed, lab, vital):
    on_day_zero = event_table.DAYS_SINCE_ADMIT == 0
    event_table.loc[on_day_zero, 'DAYS_SINCE_ADMIT'] = 1

In [None]:
# Expand data: one row per encounter per day, for `pred_end` days.
#
# Rows are repeated positionally with .iloc so the column dtypes are kept
# (repeating `.values` would turn every column into object dtype).
n_encounters = onset_yr_aki1_select.shape[0]
newdf = onset_yr_aki1_select.iloc[np.repeat(np.arange(n_encounters), pred_end)].reset_index(drop=True)

# Day counter 0 .. pred_end-1 within each encounter.
# NOTE(review): the per-day feature tables are built for days 1 .. pred_end,
# so SINCE_ADMIT == 0 rows will not find matching features - confirm the intended offset.
since_admit = np.tile(np.arange(pred_end), n_encounters)
newdf['SINCE_ADMIT'] = since_admit

# FLAG: 0 before the day preceding onset, 1 on the day preceding onset,
# -1 afterwards (those rows are removed).
day_before_onset = newdf['AKI1_SINCE_ADMIT'] - 1
conditions = [
    newdf['SINCE_ADMIT'] < day_before_onset,
    newdf['SINCE_ADMIT'] == day_before_onset,
    newdf['SINCE_ADMIT'] > day_before_onset,
]
values = [0, 1, -1]
newdf['FLAG'] = np.select(conditions, values)
newdf = newdf[newdf['FLAG'] != -1].copy()

In [None]:
# Remove columns that are not used as features. Missing columns are ignored,
# so the same lists work across extracts with slightly different schemas.
vital = vital.drop(
    columns=['Row #', 'MEASURE_DATE_TIME', 'SMOKING', 'TOBACCO', 'TOBACCO_TYPE'],
    errors='ignore',
)
demo = demo.drop(
    columns=['Row #', 'DEATH_DATE', 'BIRTH_DATE', 'DDAYS_SINCE_ENC', 'DEATH_DATE_IMPUTE', 'DEATH_SOURCE'],
    errors='ignore',
)
dx = dx.drop(
    columns=['DX_SOURCE', 'DX_ORIGIN', 'PDX', 'DX_DATE'],
    errors='ignore',
)
px = px.drop(
    columns=['Row #', 'PX_SOURCE', 'DX_ORIGIN', 'PDX', 'PX_DATE', 'PX_TYPE'],
    errors='ignore',
)
lab = lab.drop(
    columns=['LAB_ORDER_DATE', 'SPECIMEN_DATE_TIME', 'RESULT_DATE_TIME', 'SPECIMEN_SOURCE',
             'LAB_PX', 'LAB_PX_TYPE', 'RESULT_QUAL', 'RESULT_UNIT'],
    errors='ignore',
)
amed = amed.drop(
    columns=['Row #', 'MEDADMIN_START_DATE_TIME', 'MEDADMIN_STOP_DATE_TIME', 'MEDADMIN_TYPE',
             'MEDADMIN_CODE', 'MEDADMIN_DOSE_ADMIN', 'MEDADMIN_ROUTE', 'MEDADMIN_SOURCE'],
    errors='ignore',
)

In [None]:
# Calculate daily averages per encounter (and per LOINC code for labs).
# numeric_only=True averages only numeric columns; without it, recent pandas
# versions raise a TypeError as soon as a text column is present.
vital_mean = (
    vital
    .groupby(['PATID', 'ENCOUNTERID', 'DAYS_SINCE_ADMIT'])
    .mean(numeric_only=True)
    .reset_index()
)
lab_mean = (
    lab
    .groupby(['PATID', 'ENCOUNTERID', 'LAB_LOINC', 'DAYS_SINCE_ADMIT'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Transform the dx table into one boolean column per diagnosis.
# Historical data: a diagnosis is True if it appears at all for the encounter
# (assumes every DAYS_SINCE_ADMIT is < 0, so the day itself is discarded).
dx_long = (
    dx
    >> mutate(DX=X.DX_TYPE + ":" + X.DX)   # label codes as "<type>:<code>"
    >> drop('DX_TYPE')
    >> drop('DAYS_SINCE_ADMIT')
    >> mutate(dummy=True)
    >> distinct()
)
dx_wide = dx_long.pivot(index=['PATID', 'ENCOUNTERID'], columns='DX', values='dummy')
# Diagnoses never recorded for an encounter become False.
dx_t = dx_wide.fillna(False).reset_index()

In [None]:
# Transform the AMED table: one boolean column per ATC4th value, one row per
# encounter and day (exact day of administration).
amed_list = []
for i in range(1, pred_end + 1):
    # Keep only the columns the pivot needs, so unrelated columns cannot
    # create duplicate (encounter, ATC4th) pairs that make pivot() fail.
    amed_day = (
        amed.loc[amed['DAYS_SINCE_ADMIT'] == i, ['PATID', 'ENCOUNTERID', 'ATC4th']]
        .fillna('NI')          # medications without an ATC mapping go to 'NI'
        .drop_duplicates()
        .assign(dummy=True)
    )
    amed_p = (
        amed_day
        .pivot(index=['PATID', 'ENCOUNTERID'], columns='ATC4th', values='dummy')
        .fillna(False)
        .reset_index()
        .assign(SINCE_ADMIT=i)
    )
    amed_list.append(amed_p)

# Columns absent on a given day are NaN after the concat; treat them as False.
# PATID / ENCOUNTERID stay as loaded (strings): casting them to float64 would
# no longer match the string ids of the encounter/day grid when merging.
amed_t = pd.concat(amed_list, axis=0, ignore_index=True).fillna(False)
#TODO test if no amed data for onset patients
# [x for x in newdf['PATID'] if x in amed_t['PATID']]

In [None]:
# Transform the LAB table: one column per LOINC code, one row per encounter and
# day. A lab without a result on a given day carries the most recent earlier
# value forward ("roll over the previous value if unknown").
lab_list = []
lab_prev = None
for i in range(1, pred_end + 1):
    lab_day = (
        lab_mean.loc[lab_mean['DAYS_SINCE_ADMIT'] == i]
        .pivot(index=['PATID', 'ENCOUNTERID'], columns='LAB_LOINC', values='RESULT_NUM')
    )
    if lab_prev is not None:
        # Both frames are indexed by (PATID, ENCOUNTERID), so values are
        # aligned per encounter: today's result wins, otherwise the previous
        # day's (already carried-forward) value is used.
        lab_day = lab_day.combine_first(lab_prev)
    lab_prev = lab_day
    lab_p = lab_day.reset_index().assign(SINCE_ADMIT=i)
    lab_list.append(lab_p)

# PATID / ENCOUNTERID stay as loaded (strings) so they match the string ids of
# the encounter/day grid when merging.
lab_t = pd.concat(lab_list, axis=0, ignore_index=True)

In [None]:
# Transform the PX table: one boolean column per procedure code, one row per
# encounter and day (exact day of the procedure).
px_list = []
for i in range(1, pred_end + 1):
    # Keep only the columns the pivot needs, so unrelated columns cannot
    # create duplicate (encounter, PX) pairs that make pivot() fail.
    px_day = (
        px.loc[px['DAYS_SINCE_ADMIT'] == i, ['PATID', 'ENCOUNTERID', 'PX']]
        .drop_duplicates()
        .assign(dummy=True)
    )
    px_p = (
        px_day
        .pivot(index=['PATID', 'ENCOUNTERID'], columns='PX', values='dummy')
        .fillna(False)
        .reset_index()
        .assign(SINCE_ADMIT=i)
    )
    px_list.append(px_p)

# Columns absent on a given day are NaN after the concat; treat them as False.
# PATID / ENCOUNTERID stay as loaded (strings): casting them to float64 would
# no longer match the string ids of the encounter/day grid when merging.
px_t = pd.concat(px_list, axis=0, ignore_index=True).fillna(False)

In [None]:
# Join the feature tables onto the encounter/day grid.
id_keys = ['PATID', 'ENCOUNTERID']
day_keys = id_keys + ['SINCE_ADMIT']


def _left_join_indicators(left, right, keys):
    """Left-join a table of boolean indicator columns onto `left`.

    Rows of `left` without a match in `right` get False (not NaN) in every
    joined indicator column. The columns are kept as object dtype because the
    following cells pick up categorical features via select_dtypes('object').
    """
    merged = pd.merge(left, right, on=keys, how='left')
    # Names of the right-hand columns as they appear after the merge
    # (pandas appends '_y' when a name already exists on the left).
    added = [c if c not in left.columns else str(c) + '_y'
             for c in right.columns if c not in keys]
    if added:
        merged[added] = merged[added].fillna(False).astype(object)
    return merged


# Vitals are matched on the row's own day. Matching on AKI1_SINCE_ADMIT would
# copy onset-day measurements onto every earlier row of the encounter.
# NOTE(review): confirm that same-day vitals are the intended feature.
newdf1 = pd.merge(newdf, vital_mean, left_on=day_keys, right_on=id_keys + ['DAYS_SINCE_ADMIT'], how='left')
newdf2 = pd.merge(newdf1, demo, on=id_keys, how='left')
newdf3 = _left_join_indicators(newdf2, amed_t, day_keys)
newdf4 = pd.merge(newdf3, lab_t, on=day_keys, how='left')
newdf5 = _left_join_indicators(newdf4, px_t, day_keys)
# dx_t holds one row per encounter (history flags, no day column), so it is
# joined on the encounter keys only.
newdf6 = _left_join_indicators(newdf5, dx_t, id_keys)

# Build the model table from the fully joined frame so procedures and
# diagnoses are not silently left out.
# NOTE(review): this adds every px/dx column as a feature - confirm this is wanted.
newdf7 = newdf6.drop(['PATID', 'ENCOUNTERID', 'AKI1_SINCE_ADMIT', 'SINCE_ADMIT', 'DAYS_SINCE_ADMIT','DAYS_SINCE_ADMIT_x'],axis=1, errors='ignore')
newdf8 = newdf7.dropna(axis=1, how='all')

In [None]:
# Imputation for categorical data: missing values in object-dtype columns are
# replaced by the placeholder category 'NI'.
# (Earlier in-place variant, kept for reference:)
#newdf8.loc[:, newdf8.dtypes == 'object'] = newdf8.loc[:, newdf8.dtypes == 'object'].fillna('NI')
# combine_first only fills positions that are NaN in newdf8, taking the value
# from the 'NI'-filled copy of the object columns; other columns are not filled.
# NOTE(review): combine_first may return the columns in a different order - verify if order matters.
newdf8 = newdf8.combine_first(newdf8.select_dtypes('object').fillna('NI'))

In [None]:
# Convert categorical (object) columns to one-hot boolean columns and combine
# them with the numeric columns into the model matrix `data`.
cat_frame = newdf8.select_dtypes('object')
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cat_frame)

# get_feature_names was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on older installations.
if hasattr(enc, 'get_feature_names_out'):
    onehot_names = enc.get_feature_names_out(cat_frame.columns)
else:
    onehot_names = enc.get_feature_names(cat_frame.columns)

newdf8_onehot_cat = pd.DataFrame(enc.transform(cat_frame).toarray(), columns=onehot_names).astype('bool')

# Keep columns that are already boolean as well; selecting only float64/int64
# would silently drop them from the model matrix.
# reset_index(drop=True) aligns the rows positionally with the one-hot frame.
data = pd.concat(
    [newdf8.select_dtypes(['float64', 'int64', 'bool']).reset_index(drop=True), newdf8_onehot_cat],
    axis=1,
)

In [None]:
# Train/test split (80/20).
# Import the splitter explicitly: `import sklearn` alone does not guarantee
# that the sklearn.model_selection submodule is loaded.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs

label = data['FLAG']
data = data.drop(columns='FLAG')
# Stratify on the label so both parts keep the same proportion of positives.
# NOTE(review): rows of one encounter can land in both train and test; consider
# a group-wise split if that leakage matters.
X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, stratify=label, random_state=SPLIT_SEED
)

In [None]:
#cat_fea = ['SEX', 'RACE', 'HISPANIC']
#newdf4 = newdf4.drop(cat_fea)
#cat_inx = [list(X_train.columns).index(x) for x in cat_fea]
#cat_noninx = [list(X_train.columns).index(x) for x in list(X_train.columns) if not(x in cat_fea)]

In [None]:
# Impute continuous values for SMOTE (the resampler is given X_train_imp below).
# The imputer is created with its default settings and fitted on the int64 /
# float64 columns of the training set only.
imp_num = sklearn.impute.SimpleImputer()
imp_num.fit(X_train.select_dtypes(['int64', 'float64']))
# reset_index() gives X_train a 0..n-1 index (the old index becomes the 'index'
# column, dropped at the end) so rows line up with the imputed frame;
# combine_first then fills the NaN cells from the imputed values.
# NOTE(review): assumes transform() returns one column per input column - TODO
# confirm for columns that are entirely missing in the training split.
# NOTE(review): X_test is not imputed in this cell.
X_train_imp = X_train.reset_index().combine_first(pd.DataFrame(imp_num.transform(X_train.select_dtypes(['int64', 'float64'])), columns=X_train.select_dtypes(['int64', 'float64']).columns)).drop('index',axis=1)
#cat_fea = [X_train_imp.columns.get_loc(c) for c in list(X_train_imp.select_dtypes('bool').columns)]

In [None]:
# SMOTE: oversample the minority class on the imputed training data.
# Boolean columns are declared categorical so SMOTENC does not interpolate them.
RESAMPLE_SEED = 42  # fixed seed so the synthetic samples are reproducible
sm = SMOTENC(
    categorical_features=(X_train_imp.dtypes == 'bool').to_numpy(),
    random_state=RESAMPLE_SEED,
)
X_res, y_res = sm.fit_resample(X_train_imp, y_train)

# fit_resample returns the original samples together with the synthetic ones.
# Appending X_train again would count every real sample twice (and
# re-introduce its un-imputed NaNs), so the resampled set is used as is.
X_train_onehot_com = X_res
y_train_com = y_res

# Give the test set the same column order as the training matrix.
X_test = X_test[X_train_onehot_com.columns]

In [None]:
# Wrap the training and test sets (features + labels) in xgboost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train_onehot_com, label=y_train_com)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Hyper-parameter grid: every key lists its candidate values.
# Only learning_rate has more than one candidate.
param_grid = {
    'max_depth': [10],
    'objective': ['binary:logistic'],
    'learning_rate': [0.01, 0.1],
    'nthread': [-1],
    'min_child_weight': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [1],
    'eval_metric': ['auc'],
}
# Options left disabled:
#   print.every.n = 50, maximize = True,
#   early_stopping_rounds = 50, num_boost_round = 1000

# Expand the grid into a list with one parameter dict per combination.
order = param_grid.keys()
params = [
    dict(zip(order, combination))
    for combination in itertools.product(*param_grid.values())
]

In [None]:
class SaveBestModel(xgb.callback.TrainingCallback):
    """Training callback that hands the per-fold boosters back to the caller.

    The list passed to the constructor is filled in place when training ends.
    """

    def __init__(self, cvboosters):
        # Caller-owned list; it is mutated, never rebound.
        self._cvboosters = cvboosters

    def after_training(self, model):
        """Replace the list contents with the `bst` of each fold in `model.cvfolds`."""
        fold_boosters = [fold.bst for fold in model.cvfolds]
        self._cvboosters[:] = fold_boosters
        return model

In [None]:
# 10-fold stratified cross-validation over the parameter grid to pick the
# starting parameters.
best_auc = 0
best_params = None
best_num_round = None
for param in params:
    cvboosters = []
    bst_10 = xgb.cv(param, dtrain, nfold=10, stratified=True, callbacks=[SaveBestModel(cvboosters), ], num_boost_round=1000, early_stopping_rounds=50, maximize=True, verbose_eval = 50)
    # Score a candidate by its best round. Averaging the AUC over all boosting
    # rounds would penalise slower learning rates for their early rounds.
    cv_auc = bst_10['test-auc-mean'].max()
    if cv_auc > best_auc:
        best_auc = cv_auc
        best_params = param
        best_num_round = int(bst_10['test-auc-mean'].idxmax()) + 1

if best_params is None:
    raise RuntimeError("No parameter combination produced a test AUC above 0.")
#    best_model = cvboosters[np.argmax(bst_10['test-auc-mean'])]

In [None]:
# Train a booster with the selected parameters, reporting AUC on both sets
# every 10 rounds and stopping early after 50 rounds without improvement.
watchlist = [(dtrain, 'Train'), (dtest, 'Test')]
bst = xgb.train(
    best_params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    maximize=True,
    verbose_eval=10,
)

In [None]:
# Fit the scikit-learn style classifier with the selected parameters.
# Example of a hand-written parameter set:
#   {'max_depth': 10, 'learning_rate': 0.1, 'objective': 'binary:logistic',
#    'eval_metric': 'auc', 'n_jobs': 23, 'verbosity': 1}
params = best_params
print(params)

eval_set = [
    (X_train_onehot_com, y_train_com),
    (X_test, y_test),
]
model = XGBClassifier(**params)
model.fit(
    X_train_onehot_com,
    y_train_com,
    eval_set=eval_set,
    early_stopping_rounds=50,
)

In [None]:
# Debugging aid: list the attributes available on the fitted classifier ...
print(dir(model))
# ... and display its n_classes_ attribute as the cell output.
model.n_classes_

In [None]:
# Plot the classifier's feature importance and save the figure next to the data.
importance_png = f"{datafolder}/{site}_{year}_feature_importance.png"
ax = xgb.plot_importance(model.get_booster())
ax.figure.savefig(importance_png)

In [None]:
# Display the per-feature scores reported by the classifier's underlying booster.
model.get_booster().get_score()

In [None]:
# Calculate per-feature contributions (pred_contribs=True) for the rows of dtest.
# (An earlier variant rebuilt dtrain from X_train_onehot_com for this step.)
shap = model.get_booster().predict(
    dtest,
    pred_contribs=True,
)

In [None]:
# Collect SHAP values: mean and standard deviation of the contribution for
# every distinct value of every feature the model actually uses.
#
# `shap` was computed on dtest, so the feature values must come from X_test
# (same rows, same column order as dtest), not from the training matrix.
# The used-feature list comes from the same model that produced `shap`.
used_feature = list(model.get_booster().get_score().keys())
used_feature_set = set(used_feature)

shap_frames = []
for i, feature in enumerate(X_test.columns):
    if feature not in used_feature_set:
        continue  # unused features are skipped up front
    df = pd.DataFrame({'Name': X_test.iloc[:, i].to_numpy(), 'val': shap[:, i]})
    plot_data = (
        df.groupby('Name')['val']
        .agg(valmean='mean', valstd='std')
        .reset_index()
        .fillna(0)          # std is NaN for values observed only once
    )
    plot_data['Feature'] = feature
    shap_frames.append(plot_data)

if shap_frames:
    shap_data = pd.concat(shap_frames)
else:
    # No feature was used by the model: keep the expected schema.
    shap_data = pd.DataFrame(columns=['Name', 'valmean', 'valstd', 'Feature'])

In [None]:
# Plot SHAP value against feature value, one figure per feature, and save each
# figure under the feature's own name.
for f in np.unique(shap_data['Feature']):
    plot_data = shap_data.loc[shap_data['Feature'] == f, ['Name', 'valmean', 'valstd']]
    # Feature values can be stored as objects after concatenation; plot them as numbers.
    x_values = plot_data['Name'].astype(float)

    fig, ax = plt.subplots()
    ax.scatter(x=x_values, y=plot_data['valmean'])
    ax.errorbar(x_values, plot_data['valmean'], yerr=plot_data['valstd'], fmt="o")
    ax.set_title(f)

    n_points = plot_data.shape[0]
    if n_points > 2:
        # Cap the polynomial degree so the fit is never underdetermined.
        degree = min(5, n_points - 1)
        spl = np.polynomial.legendre.Legendre.fit(x_values, plot_data['valmean'], degree, full=True)
        spline_x, spline_y = spl[0].linspace()
        ax.plot(spline_x, spline_y)

    # Save before showing (showing can leave an empty figure behind) and name
    # the file after the feature being plotted. Path separators inside a
    # feature name are replaced so they cannot point into another directory.
    safe_name = str(f).replace(os.sep, '_')
    fig.savefig(datafolder+'/'+site+"_"+str(year)+"_"+safe_name+'.png')
    plt.show()
    plt.close(fig)  # release the figure; many features mean many figures

In [None]:
# Get feature importance from `bst`: one row per feature with the cover, gain
# and weight (reported as "Frequency") scores, plus a rank by frequency.
importance_rows = [
    pd.DataFrame(bst.get_score(importance_type=score_type), index=[row_label])
    for row_label, score_type in (('Cover', 'cover'), ('Gain', 'gain'), ('Frequency', 'weight'))
]
model_data = pd.concat(importance_rows).transpose() >> mutate(Feature = X.index)
# Rank 1 = most frequently used feature; ties share the smallest rank.
model_data['rank'] = model_data['Frequency'].rank(method='min', ascending=False)

In [None]:
# Create the export for the meta-regression: SHAP summary joined with the
# feature-importance table, tagged with the site/year identifier, pickled.
siteyr = site+'_1d_no_fs_stg1up'+'_'+str(year)

# Keep the result of the pipeline (it was previously discarded, so the saved
# file lacked the `site` column and the renamed columns). A separate name keeps
# `shap_data` untouched, so the cell can be re-run safely.
shap_export = (
    shap_data
    >> left_join(model_data, by='Feature')
    >> mutate(site=siteyr)
    >> rename(fval=X.Name)
    >> rename(mean_val=X.valmean)
    >> rename(se_val=X.valstd)
)

export_dir = datafolder+'/model_explain/'
os.makedirs(export_dir, exist_ok=True)  # make sure the target folder exists
shap_export.to_pickle(export_dir+siteyr+'.pkl')

In [None]:
# Using the shap package (example): bar summary of the SHAP values of `bst`
# on the training matrix.
# The package is imported under an alias because the name `shap` already holds
# the contribution array computed earlier; a plain `import shap` would
# overwrite it and break re-runs of the cells that read that array.
import shap as shap_pkg

explainer = shap_pkg.TreeExplainer(bst)
shap_values = explainer.shap_values(X_train_onehot_com)
shap_pkg.summary_plot(shap_values, X_train_onehot_com, plot_type="bar")