In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
import logging

In [None]:
if __name__ == "__main__":
    # Default site/year pair, only defined when this file is run directly.
    site, year = 'MCRI', 2013

In [None]:
def bt_onset(site, year):
    """Load the onset table for one site and year.

    Reads ``data/<site>/onset_<site>_<year>.pkl`` with ``pd.read_pickle``.

    Returns the unpickled object, or ``None`` when the file does not exist.
    In that case the problem is printed and appended to ``BT.log``.
    """
    tag = site+":"+str(year)
    print('Merging onset on site '+tag, flush=True)
    onset_path = 'data/'+site+'/onset_'+site+'_'+str(year)+'.pkl'
    try:
        onset = pd.read_pickle(onset_path)
    except FileNotFoundError:
        problem = 'No onset table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        # Callers receive None when there is no onset table.
        return None
    return onset

In [None]:
def bt_px(site, year, newdf):
    """Left-join the px table of one site/year onto ``newdf``.

    Reads ``data/<site>/px_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Every missing value in the merged frame
    (not only in the px columns) is then filled with ``False``.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging px on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        px = pd.read_pickle('data/'+site+'/px_'+site+'_'+str(year)+'.pkl')
        # SINCE_ADMIT is no longer part of the join key.
        merged = pd.merge(newdf, px, left_on=join_keys, right_on=join_keys, how='left')
        return merged.fillna(False)
    except FileNotFoundError:
        problem = 'No px table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_dx(site, year, newdf):
    """Left-join the dx table of one site/year onto ``newdf``.

    Reads ``data/<site>/dx_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Every missing value in the merged frame
    (not only in the dx columns) is then filled with ``False``.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging dx on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        dx = pd.read_pickle('data/'+site+'/dx_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, dx, left_on=join_keys, right_on=join_keys, how='left')
        return merged.fillna(False)
    except FileNotFoundError:
        # The message names the dx table; it used to say "onset", which
        # pointed the log reader at the wrong file.
        problem = 'No dx table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_amed(site, year, newdf):
    """Left-join the amed table of one site/year onto ``newdf``.

    Reads ``data/<site>/amed_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Every missing value in the merged frame
    (not only in the amed columns) is then filled with ``False``.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging amed on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        amed = pd.read_pickle('data/'+site+'/amed_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, amed, left_on=join_keys, right_on=join_keys, how='left')
        return merged.fillna(False)
    except FileNotFoundError:
        problem = 'No amed table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_labcat(site, year, newdf):
    """Left-join the labcat table of one site/year onto ``newdf``.

    Reads ``data/<site>/labcat_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Every missing value in the merged frame
    (not only in the labcat columns) is then filled with ``False``.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging lab_cat on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        labcat = pd.read_pickle('data/'+site+'/labcat_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, labcat, left_on=join_keys, right_on=join_keys, how='left')
        return merged.fillna(False)
    except FileNotFoundError:
        problem = 'No lab_cat table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_demo(site, year, newdf):
    """Left-join the demo table of one site/year onto ``newdf``.

    Reads ``data/<site>/demo_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Unlike the boolean-table merges, only the
    columns that are ``bool`` in the demo table have their missing values
    filled with ``False``; other columns keep their NaNs.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging demo on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        demo = pd.read_pickle('data/'+site+'/demo_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, demo, left_on=join_keys, right_on=join_keys, how='left')
        # Patch only the demo flag columns: build a False-filled copy of
        # them and let combine_first fill the gaps in the merged frame.
        flag_columns = list(demo.select_dtypes('bool').columns)
        filled_flags = merged[flag_columns].fillna(False)
        return merged.combine_first(filled_flags)
    except FileNotFoundError:
        problem = 'No demo table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_vital(site, year, newdf):
    """Left-join the vital table of one site/year onto ``newdf``.

    Reads ``data/<site>/vital_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Missing values are left as NaN.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging vital on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        vital = pd.read_pickle('data/'+site+'/vital_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, vital, left_on=join_keys, right_on=join_keys, how='left')
        return merged
    except FileNotFoundError:
        problem = 'No vital table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def bt_labnum(site, year, newdf):
    """Left-join the labnum table of one site/year onto ``newdf``.

    Reads ``data/<site>/labnum_<site>_<year>.pkl`` and merges it on
    ``PATID`` / ``ENCOUNTERID``.  Missing values are left as NaN.

    Returns the merged frame, or ``newdf`` unchanged when the pickle file
    does not exist (the problem is printed and appended to ``BT.log``).
    """
    tag = site+":"+str(year)
    print('Merging lab_num on site '+tag, flush=True)
    join_keys = ['PATID', 'ENCOUNTERID']
    try:
        labnum = pd.read_pickle('data/'+site+'/labnum_'+site+'_'+str(year)+'.pkl')
        merged = pd.merge(newdf, labnum, left_on=join_keys, right_on=join_keys, how='left')
        return merged
    except FileNotFoundError:
        problem = 'No lab_num table!!!!! '+tag
        logging.basicConfig(filename='BT.log', filemode='a')
        print(problem, flush=True)
        logging.error(problem)
        logging.shutdown()
        return newdf

In [None]:
def drop_too_much_nan(site, year, newdf, threshold=0.05):
    """Keep only columns that are populated often enough in either FLAG class.

    ``False`` is treated like a missing value.  The rows are split into
    ``FLAG == 0`` and ``FLAG == 1``; a column is kept when, in at least one
    of the two groups, it has at least ``len(group) * threshold`` values
    that are neither NaN nor ``False``.

    Parameters
    ----------
    site, year : used only for the progress message.
    newdf : DataFrame with a ``FLAG`` column.
    threshold : minimum populated fraction within a FLAG group.

    Returns
    -------
    DataFrame
        ``newdf`` restricted to the kept columns, in their original order
        and with their original values (``False`` is not replaced).
    """
    print('Remove sparse feature on site '+site+":"+str(year), flush=True)
    sparse_view = newdf.replace(False, np.nan)

    kept = set()
    for flag_value in (0, 1):
        group = sparse_view[sparse_view['FLAG'] == flag_value]
        # NOTE(review): an empty group gives a threshold of 0, so every
        # column passes for that group and nothing is filtered.
        min_populated = len(group) * threshold
        kept.update(group.dropna(thresh=min_populated, axis=1).columns)

    # Select in the frame's own column order: going through list(set(...))
    # produced an arbitrary order that could change between runs.
    ordered = [column for column in newdf.columns if column in kept]
    return newdf[ordered]

In [None]:
def handpickremoval(site, year, newdf):
    """Drop hand-picked columns from ``newdf``.

    Removed columns:

    * columns whose name contains ``'PX'`` and whose third ``':'``-separated
      field is an integer code in 99202-99499 or 80047-89398;
    * the column ``'WT'``.

    Columns that are absent are ignored.  ``site`` and ``year`` are accepted
    for signature parity with the other pipeline steps and are not used.

    Returns a new DataFrame without those columns.
    """
    # NOTE(review): the second field (the code system) is not checked, so any
    # 'PX' column with a numeric code in these ranges is removed - confirm
    # that only CPT codes are meant.
    remlist = []
    for column in newdf.columns:
        code = _px_numeric_code(column)
        if code is None:
            continue
        if 99202 <= code <= 99499 or 80047 <= code <= 89398:
            remlist.append(column)

    # Additional drop
    remlist.append('WT')
    return newdf.drop(remlist, axis=1, errors='ignore')


def _px_numeric_code(column):
    """Return the integer third field of a 'PX' column name, else ``None``."""
    if not isinstance(column, str) or 'PX' not in column:
        return None
    fields = column.split(':')
    # Names with fewer than three fields used to raise IndexError here.
    # isdecimal() accepts exactly the digit strings int() can parse.
    if len(fields) < 3 or not fields[2].isdecimal():
        return None
    return int(fields[2])

In [None]:
# def drop_corr(site, year, newdf, threshold=0.5):
#     print('Remove correlated feature on site '+site+":"+str(year), flush = True)                        
#     corr = newdf.corr()
#     columns = np.full((corr.shape[0],), True, dtype=bool)
#     for i in range(corr.shape[0]):
#         for j in range(i+1, corr.shape[0]):
#             if corr.iloc[i,j] >= threshold:
#                 # if corr.columns[j] == 'ORIGINAL_BMI':
#                 #     if columns[i]:
#                 #         columns[i] = False
#                 if columns[j]:
#                     columns[j] = False
#     selected_columns = newdf.columns[columns]
#     return newdf[selected_columns]

In [None]:
#Pearson
def pearson_list(bt, threshold):
    """Return a boolean keep-mask over the columns of ``bt.corr()``.

    Entry ``j`` is ``False`` when column ``j`` has a correlation of at least
    ``threshold`` with any column that comes before it; the first column of
    such a pair stays ``True``.  Only ``corr >= threshold`` is tested, so
    strongly negative correlations are not flagged.
    """
    corr_matrix = bt.corr()
    n_features = corr_matrix.shape[0]
    keep = np.ones(n_features, dtype=bool)
    for later in range(1, n_features):
        # Correlations of column `later` with every earlier column.
        with_earlier = corr_matrix.iloc[:later, later]
        if (with_earlier >= threshold).any():
            keep[later] = False
    return keep

def point_biserial(btcat, btcon, threshold):
    """Return a boolean keep-mask over the columns of ``btcat``.

    Entry ``j`` is ``False`` when column ``j`` of ``btcat`` has a
    point-biserial correlation of at least ``threshold`` with any column of
    ``btcon``.  Every (btcon column, btcat column) pair is evaluated.
    """
    from scipy import stats
    n_categorical = btcat.shape[1]
    keep = np.ones(n_categorical, dtype=bool)
    for con_pos in range(btcon.shape[1]):
        continuous = btcon.iloc[:, con_pos]
        for cat_pos in range(n_categorical):
            r_value = stats.pointbiserialr(btcat.iloc[:, cat_pos], continuous)[0]
            if r_value >= threshold:
                keep[cat_pos] = False
    return keep
    
def drop_corr(site, year, bt, threshold):
    """Drop correlated columns, handling bool and non-bool columns separately.

    Columns are first sorted by name.  Non-bool columns are filtered with
    ``pearson_list``.  Bool columns are filtered with the element-wise OR of
    ``pearson_list`` (bool vs. bool) and ``point_biserial`` (bool vs.
    non-bool).  ``site`` and ``year`` are not used.

    Returns the kept non-bool columns followed by the kept bool columns.
    """
    bt2 = bt.reindex(sorted(bt.columns), axis=1)
    btcat = bt2.select_dtypes('bool')
    btcont = bt2.select_dtypes(exclude='bool')

    keep_cont = pearson_list(btcont, threshold)
    # The second argument was the undefined name `btcon`, which raised
    # NameError on every call; the continuous frame is `btcont`.
    # NOTE(review): both masks are keep-masks, so `|` keeps a bool column
    # unless BOTH tests flag it - confirm that `&` was not intended.
    keep_cat = pearson_list(btcat, threshold) | point_biserial(btcat, btcont, threshold)
    return pd.concat([btcont.loc[:, keep_cont], btcat.loc[:, keep_cat]], axis=1)

def drop_corr2(site, year, bt, threshold):
    """Return ``bt`` restricted to the columns ``pearson_list`` keeps.

    ``site`` and ``year`` are not used.
    """
    keep_mask = pearson_list(bt, threshold)
    return bt.loc[:, keep_mask]

def generate_drop_list(site, year, bt, threshold):
    """Print every column pair whose correlation is at least ``threshold``.

    For each pair ``(i, j)`` with ``i < j`` in ``bt.corr()`` one line is
    printed: ``site year <column i> <column j> <correlation>``.
    Nothing is returned.
    """
    corr = bt.corr()
    # Labels come from the correlation matrix itself so they always line up
    # with its positions.  The second label used to reference an undefined
    # name (`btcont`), which raised NameError on the first hit.
    labels = corr.columns
    n_features = corr.shape[0]
    for i in range(n_features):
        for j in range(i+1, n_features):
            if corr.iloc[i,j] >= threshold:
                print(site, year, labels[i], labels[j], corr.iloc[i,j])

In [None]:
def filter_unit(site, year, data, min_count=40):
    """Drop rarely-true boolean LAB columns.

    A column is removed when its dtype is ``bool``, its name contains
    ``'LAB'``, and it has fewer than ``min_count`` ``True`` values.

    Parameters
    ----------
    site, year : used only for the progress message.
    data : DataFrame to filter.
    min_count : minimum number of ``True`` values a boolean LAB column needs
        in order to be kept (default 40).

    Returns a new DataFrame without the rare columns.
    """
    print('Filtering units on site '+site+":"+str(year), flush=True)

    is_bool = (data.dtypes == 'bool').values
    # na=False keeps the mask strictly boolean if a label yields NaN.
    is_lab = data.columns.str.contains('LAB', na=False)
    bool_col = data.columns[np.logical_and(is_bool, is_lab)]

    true_counts = data[bool_col].sum()
    bool_col_drop = list(bool_col[(true_counts < min_count).values])
    return data.drop(bool_col_drop, axis=1)

In [None]:
def nsf_feature_filter(site, year, df):
    """Keep only the columns of ``df`` listed in ``nsfsch_feature.pkl``.

    The filter table is read from ``nsfsch_feature.pkl`` in the working
    directory; its ``type``, ``code`` and ``key`` columns are used.  The text
    before the first ``':'`` of a column name selects the rule:

    * ``LAB``  - kept when the last field (text before ``'('``) equals the
      ``key`` of a filter row with ``type == 'LAB'``.
    * ``DX`` / ``MED`` - kept when a filter row has the same ``type``, a
      ``code`` equal to the second field, and a ``key`` equal to the last
      field (text before ``'('``).
    * ``PX``   - always kept.
    * anything else - kept when some filter ``key`` contains the column
      name, or contains the part of the name before the first ``'_'``.

    ``site`` and ``year`` are used only for the progress message.
    Returns ``df`` restricted to the kept columns, in their original order.
    """
    print('Filtering NSF on site '+site+":"+str(year), flush=True)

    # NOTE(review): read_pickle can execute arbitrary code - only load a
    # filter table from a trusted source.
    df_filter = pd.read_pickle('nsfsch_feature.pkl')
    all_keys = df_filter['key']

    # Build the lookups once; the table used to be rescanned per column.
    lab_keys = set(all_keys[df_filter['type'] == 'LAB'])
    coded_keys = set(zip(df_filter['type'], df_filter['code'], all_keys))

    filter_dc = []
    for dc in df.columns:
        fields = dc.split(':')
        kind = fields[0]
        key = fields[-1].split('(')[0]

        if kind == 'LAB':
            keep = key in lab_keys
        elif kind in ('DX', 'MED'):
            # A bare 'DX' / 'MED' name has no code field and cannot match.
            keep = len(fields) > 1 and (kind, fields[1], key) in coded_keys
        elif kind == 'PX':
            keep = True
        else:
            # Literal substring test: column names are not regular
            # expressions, and names such as 'HT(cm)' either failed to
            # match themselves or raised re.error under the regex default.
            prefix = dc.split('_')[0]
            keep = bool(
                all_keys.str.contains(dc, regex=False, na=False).any()
                or all_keys.str.contains(prefix, regex=False, na=False).any()
            )

        if keep:
            filter_dc.append(dc)
    return df[filter_dc]

In [None]:
def bigtable(site, year):
    """Build and save the merged table ("big table") for one site and year.

    Steps, in order:

    1. load the onset table (``bt_onset``);
    2. merge the boolean tables px, dx, amed, labcat;
    3. merge demo, vital and labnum;
    4. apply ``handpickremoval``, ``filter_unit`` and ``nsf_feature_filter``;
    5. pickle the result to ``data/<site>/bt3_<site>_<year>.pkl``.

    An empty result is reported on stdout and in ``BT.log`` but still saved.

    Raises
    ------
    FileNotFoundError
        When there is no onset table for this site/year.
    Exception
        Any other error is printed, appended to ``BT.log`` and re-raised.
    """
    tag = site+":"+str(year)
    print('Merging bt on site '+tag, flush=True)

    try:
        newdf = bt_onset(site, year)
        if newdf is None:
            # bt_onset has already logged the missing file.  Stop here with a
            # clear error; continuing used to fail later with an unrelated
            # TypeError/AttributeError on None.
            raise FileNotFoundError('No onset table for '+tag)

        # boolean table must merge first
        newdf = bt_px(site, year, newdf)
        newdf = bt_dx(site, year, newdf)
        newdf = bt_amed(site, year, newdf)
        newdf = bt_labcat(site, year, newdf)

        newdf = bt_demo(site, year, newdf)
        newdf = bt_vital(site, year, newdf)
        newdf = bt_labnum(site, year, newdf)
        newdf = handpickremoval(site, year, newdf)

        newdf = filter_unit(site, year, newdf)
        newdf = nsf_feature_filter(site, year, newdf)

        # Currently disabled steps:
        #   drop_too_much_nan(site, year, newdf, threshold=0.05)
        #   bt_postprocess(site, year, newdf)
        #   drop_corr2(site, year, newdf, threshold=0.5)

        # Save table.  'bt3_' is the current output name; the older 'bt_'
        # (wrong 24 hours) and 'bt2_' (per-year drop nan) files are obsolete.
        newdf.to_pickle('data/'+site+'/bt3_'+site+'_'+str(year)+'.pkl')

        #consistency check
        if newdf.empty:
            logging.basicConfig(filename='BT.log', filemode='a')
            print('DATAFRAME EMPTY!!!!!! '+tag, flush=True)
            logging.error('BT: DATAFRAME EMPTY!!!!!! '+tag)
            logging.shutdown()

        print('Finished bt on site '+tag, flush=True)
    except Exception as e:
        report = 'OTHER ERROR!!!!! '+tag+'\n+++++++++++++++++\n'+str(e)+'\n-------------------\n'
        logging.basicConfig(filename='BT.log', filemode='a')
        print(report, flush=True)
        logging.error(report)
        logging.shutdown()
        raise

In [None]:
def collect_feature(site, year):
    """Save the column names of one site/year big table as a feature list.

    Reads ``data/<site>/bt3_<site>_<year>.pkl`` and writes
    ``data/<site>/bt3Features_<site>_<year>.pkl``: a DataFrame with one row
    per column name (``Feature``) and the site name (``site``).
    """
    site_dir = 'data/'+site+'/'
    suffix = '_'+site+'_'+str(year)+'.pkl'

    bigtable_df = pd.read_pickle(site_dir+'bt3'+suffix)

    features = pd.DataFrame(bigtable_df.columns)
    features.columns = ['Feature']
    features = features.assign(site=site)
    features.to_pickle(site_dir+'bt3Features'+suffix)

In [None]:
def find_common_feature(sites=None, selected_sites=None,
                        onset_root="/home/hchan2/AKI/AKI_Python/"):
    """Write the features shared by every selected site to ``common_feature.pkl``.

    For each site the admission years are taken from
    ``<onset_root>data/<site>/p0_onset_<site>.pkl`` (column ``ADMIT_DATE``),
    and the per-year feature lists are read from
    ``data/<site>/bt3Features_<site>_<year>.pkl``.  Years without a feature
    file are skipped.  A feature is "common" when it appears for every site
    in ``selected_sites``.

    Parameters
    ----------
    sites : sites whose feature files are loaded.  Defaults to the ten-site
        list below.
    selected_sites : sites that must all contain a feature.  Defaults to the
        same list without ``'MCRI'``.
    onset_root : prefix put in front of the onset path.  The default is the
        original absolute location; pass ``''`` to read relative to the
        working directory.

    Raises ``ValueError`` (from ``pd.concat``) when no feature file was loaded.
    """
    if sites is None:
        sites = ['MCRI', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW']
    if selected_sites is None:
        selected_sites = ['MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW']

    feature_df_list = []
    for site in sites:
        print(site)
        onset = pd.read_pickle(onset_root+'data/'+site+'/p0_onset_'+site+'.pkl')
        years = list(pd.to_datetime(onset['ADMIT_DATE']).dt.year.unique())
        for year in years:
            feature_path = 'data/'+site+'/bt3Features_'+site+'_'+str(year)+'.pkl'
            try:
                feature_df_list.append(pd.read_pickle(feature_path))
            except FileNotFoundError:
                # Expected: not every admission year has a feature file.
                continue
            except Exception as err:
                # Stay best-effort, but report the skip.  Unlike the former
                # bare `except`, KeyboardInterrupt/SystemExit pass through.
                print('Skipping '+feature_path+': '+str(err), flush=True)

    feature_df = pd.concat(feature_df_list).drop_duplicates()
    feature_df = feature_df[feature_df['site'].isin(selected_sites)]

    # After drop_duplicates each (Feature, site) pair occurs once, so the
    # count per feature is the number of selected sites that have it.
    fcount = feature_df.groupby('Feature').count().reset_index()
    fcount = fcount[fcount['site']==len(selected_sites)]

    fcount[['Feature']].to_pickle('common_feature.pkl')