In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
from os.path import exists
import logging
import time
from numpy import nan
import pickle
from sklearn.metrics import roc_auc_score
import xgboost
from catboost import Pool, cv
from os.path import exists

In [None]:
# Site identifiers and p0 table names iterated over by the checks below.
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC',
    'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC',
]
tablename = [
    'onset', 'px', 'dx', 'lab',
    'amed', 'vital', 'demo',
]

In [None]:
# Check existence of p0 tables: enumerate every (site, table) pair to inspect,
# site-major (all tables of the first site, then the next site, ...).
parasites = list(itertools.product(sites, tablename))
def if_p0_table_exists(site, table):
    """Report a missing or empty p0 pickle for one site/table pair.

    Looks for ``data/<site>/p0_<table>_<site>.pkl`` relative to the working
    directory. Prints a message when the file is absent, or when it loads to
    an empty DataFrame; prints nothing when the table has rows.

    Args:
        site: Site identifier used in the folder and file name.
        table: Table name used in the file name.

    Returns:
        None. Findings are reported on stdout only.
    """
    label = 'p0 '+site+' '+table
    pickle_path = 'data/'+site+'/p0_'+table+'_'+site+'.pkl'

    if not exists(pickle_path):
        print(label+' is missing')
        return

    # The file is only loaded once we know it is there.
    if pd.read_pickle(pickle_path).empty:
        print(label+' is empty')
        
# Run the p0 existence check for every (site, table) pair; problems are
# printed as they are found, followed by a final 'done'.
for site, table in parasites:
    if_p0_table_exists(site=site, table=table)
print('done')

In [None]:
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC', 'UNMC',
    'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW',
]
tablename = [
    'onset', 'px', 'dx', 'labnum', 'labcat',
    'amed', 'vital', 'demo', 'bt',
]
#tablename = ['bt']

# Build one (site, year, table) triple for every admission year present in
# each site's p0 onset table (years taken from the ADMIT_DATE column).
para_list = []
for site in sites:
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    admit_years = pd.to_datetime(onset['ADMIT_DATE']).dt.year
    years = list(admit_years.unique())
    # year-major, table-minor ordering within each site
    para_list.extend(itertools.product([site], years, tablename))

#Check p1 and p2 table exists
def if_p1_table_exists(site, table, year):
    """Report a missing, empty, or un-pivoted per-year table pickle.

    Looks for ``data/<site>/<table>_<site>_<year>.pkl`` relative to the
    working directory and prints one line when:

    * the file does not exist ("is missing"),
    * the DataFrame it holds is empty ("is empty"), or
    * the DataFrame still has a ``DAYS_SINCE_ADMIT`` column ("not pivoted").

    Nothing is printed when none of these apply.

    Args:
        site: Site identifier used in the folder and file name.
        table: Table name used in the file name.
        year: Year used in the file name (converted with ``str``).

    Returns:
        None. Findings are reported on stdout only.
    """
    pickle_path = 'data/'+site+'/'+table+'_'+site+'_'+str(year)+'.pkl'
    label = 'p1 '+site+' ' +str(year)+' '+table

    if not exists(pickle_path):
        print(label+' is missing')
        return

    # Must go through the pandas namespace: a bare `read_pickle` is not a
    # name this notebook imports and raises NameError for every existing file.
    df = pd.read_pickle(pickle_path)
    if df.empty:
        print(label+' is empty')
    elif 'DAYS_SINCE_ADMIT' in df.columns:
        print(label+' not pivoted')

# para_list holds (site, year, table) triples; the check takes (site, table, year),
# so pass the arguments by keyword to keep the mapping explicit.
for site, year, table in para_list:
    if_p1_table_exists(site=site, table=table, year=year)
print('done')

In [None]:
# merge check
# Sites and p0 tables covered by the merge check, and every (site, table)
# pair in site-major order.
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC',
    'UNMC', 'UofU', 'UPITT', 'UTHSCSA',
]
tablename = ['onset', 'px', 'dx', 'lab', 'amed', 'vital', 'demo']
parasites = list(itertools.product(sites, tablename))
def p0_merge_check(site, table):
    """Check that every row of a p0 table refers to an encounter in p0 onset.

    Loads ``data/<site>/p0_onset_<site>.pkl`` and
    ``data/<site>/p0_<table>_<site>.pkl``, left-joins the table onto the
    distinct ``(PATID, ENCOUNTERID)`` pairs of the onset table, and flags each
    row with a boolean ``dummy`` column (True when the pair exists in onset).
    If any row is unmatched, prints an "is inconsistent" line followed by the
    True/False counts of ``dummy``; otherwise prints nothing.

    Args:
        site: Site identifier used in the folder and file names.
        table: p0 table name to validate against the onset table.

    Returns:
        None. Findings are reported on stdout only.
    """
    keys = ['PATID', 'ENCOUNTERID']
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    data = pd.read_pickle('data/'+site+'/p0_'+table+'_'+site+'.pkl')

    known_encounters = onset[keys].drop_duplicates()

    # Let merge() report membership directly instead of joining a dummy column
    # and filling the gaps: the flag is then a real boolean column (no object
    # dtype, no fillna downcasting warning) and can be tested with .all().
    newdata = pd.merge(data, known_encounters, on=keys, how='left', indicator='_in_onset')
    newdata['dummy'] = newdata['_in_onset'] == 'both'
    newdata = newdata.drop(columns='_in_onset')

    if not newdata['dummy'].all():
        print('p0 '+site+' '+table+' is inconsistent')
        print(newdata['dummy'].value_counts())
        
# Validate every (site, table) pair against that site's onset table;
# only inconsistent tables produce output.
for site, table in parasites:
    p0_merge_check(site=site, table=table)
#print('done')

In [None]:
# Per year number of record
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC',
    'UNMC', 'UofU', 'UPITT', 'UTHSCSA',
]
tablename = ['onset']

# get years from site: one (site, year) entry per admission year and per
# entry of `tablename` (a single entry here, so one per year).
para_list = []
for site in sites:
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    admit_years = pd.to_datetime(onset['ADMIT_DATE']).dt.year
    years = list(admit_years.unique())
    para_list.extend((site, year) for year in years for _ in tablename)

def p1_site_sample_check(site, year):
    """Print the FLAG counts of a site-year onset table that has under 100 rows.

    Loads ``data/<site>/onset_<site>_<year>.pkl``. When the table has fewer
    than 100 rows, prints ``<site>-<year>`` followed by the value counts of
    its ``FLAG`` column; larger tables produce no output.

    Args:
        site: Site identifier used in the folder and file name.
        year: Year used in the file name (converted with ``str``).

    Returns:
        None. Findings are reported on stdout only.
    """
    site_dir = 'data/'+site+'/'
    newdf = pd.read_pickle(site_dir+'onset_'+site+'_'+str(year)+'.pkl')

    # Tables at or above the threshold are fine; nothing to report.
    if len(newdf) >= 100:
        return

    print(site+'-'+str(year))
    print(newdf['FLAG'].value_counts())
    
# Inspect every (site, year) onset table; only the small ones print anything.
for site, year in para_list:
    p1_site_sample_check(site=site, year=year)
print('done')

In [None]:
#Data time range check
# Configuration for the raw-CSV year-range check; the full site list is kept
# commented out for convenience while a single site is being inspected.
#sites = ['MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA']
sites = ['MCW']
tablename = [
    'ONSETS', 'PX', 'DX', 'LAB',
    'AMED', 'VITAL', 'DEMO',
]
# Root folder that contains <site>/raw/*.csv (trailing slash is required,
# paths are built by string concatenation).
datafolder = '/home/hchan2/AKI/data/'

def p0_year_check(site, data_dir=None):
    """Print the earliest and latest calendar year found in each raw CSV of a site.

    For each raw extract under ``<data_dir><site>/raw/`` the date column is
    parsed with ``pd.to_datetime`` and a list
    ``[site, label, first_year, last_year]`` is printed, in the order
    onset, px, dx, vital, amed, lab.

    Missing dates (NaT) are ignored when taking the minimum/maximum. The
    Python builtins ``min``/``max`` are order-dependent in the presence of NaT
    (every comparison with NaT is False), so ``Series.min``/``Series.max`` are
    used instead. Only the date column of each CSV is loaded.

    Args:
        site: Site identifier; selects the ``<site>/raw/`` sub-folder.
        data_dir: Root data folder, including the trailing path separator.
            Defaults to the notebook-level ``datafolder`` variable.

    Returns:
        None. Results are printed (flushed immediately).
    """
    root = datafolder if data_dir is None else data_dir

    # (label printed, raw file stem, date column) in reporting order
    extracts = [
        ('onset', 'AKI_ONSETS', 'ADMIT_DATE'),
        ('px', 'AKI_PX', 'PX_DATE'),
        ('dx', 'AKI_DX', 'DX_DATE'),
        ('vital', 'AKI_VITAL', 'MEASURE_DATE_TIME'),
        ('amed', 'AKI_AMED', 'MEDADMIN_START_DATE_TIME'),
        ('lab', 'AKI_LAB', 'LAB_ORDER_DATE'),
    ]

    for label, stem, date_col in extracts:
        csv_path = root+site+'/raw/'+stem+'.'+'csv'
        dates = pd.read_csv(csv_path, usecols=[date_col])[date_col]
        yrs = pd.to_datetime(dates)
        print([site, label, yrs.min().year, yrs.max().year], flush=True)
    
# Report the year range of every raw extract for each configured site.
for site in sites:
    p0_year_check(site=site)
print('done')

In [None]:
# Sites processed by ATC_match_check below.
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA',
    'UNMC', 'UofU', 'UPITT', 'KUMC',
]
#sites = ['KUMC']
def ATC_match_check(site, year):
    """Print how many medication rows of a site-year match the site's rxnorm lookup.

    Steps:
      1. Load the site's p0 amed table and the site-year onset table.
      2. Keep only amed rows whose (PATID, ENCOUNTERID) occurs in that onset table.
      3. Load ``rxnorm_out_<site>.csv`` and rename its ``Rxcui`` column to
         ``MEDADMIN_CODE``.
      4. Left-join the lookup onto the amed rows by ``MEDADMIN_CODE`` and
         print ``<site>-<year>`` followed by the True/False counts of the
         match flag (``dummy``).

    Args:
        site: Site identifier used in the folder and file names.
        year: Year of the onset table (converted with ``str``).

    Returns:
        None. Results are printed.
    """
    home_directory = "/home/hchan2/AKI/AKI_Python/"
    encounter_keys = ['PATID', 'ENCOUNTERID']

    amed = pd.read_pickle('data/'+site+'/p0_amed_'+site+'.pkl')
    newdfX = pd.read_pickle('data/'+site+'/onset_'+site+'_'+str(year)+'.pkl')
    newdfX = newdfX >> select('PATID', 'ENCOUNTERID') >> mutate(dummy = True) >> distinct()

    # Flag amed rows that belong to this year's encounters, keep only those,
    # then drop the helper column again.
    flagged = pd.merge(amed, newdfX, on=encounter_keys, how='left').fillna({'dummy': False})
    amed = (flagged >> mask(X.dummy) >> select(~X.dummy)).reset_index(drop=True)

    # Codes are read as strings so they compare equal to MEDADMIN_CODE values.
    lookup_path = home_directory+"data/"+site+'/rxnorm_out_'+site+'.csv'
    rxcui2atc = pd.read_csv(lookup_path, sep=',', dtype={"Rxcui": 'object', "MEDADMIN_CODE": 'object'})
    rxcui2atc = rxcui2atc >> rename(MEDADMIN_CODE=X.Rxcui) >> mutate(dummy = True) >> distinct()

    matched = pd.merge(amed, rxcui2atc, on=['MEDADMIN_CODE'], how='left').fillna({'dummy': False})
    print(site+'-'+str(year))
    print(matched['dummy'].value_counts())

# (site, year) combinations that are skipped.
ban_list = [('UPITT', 2013), ('UPITT', 2012), ('MCW', 2011)] #Sample size too small

# One (site, year) entry per admission year in each site's p0 onset table,
# minus the banned combinations.
para_list = []
for site in sites:
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    admit_years = pd.to_datetime(onset['ADMIT_DATE']).dt.year
    years = list(admit_years.unique())
    para_list.extend((site, year) for year in years if (site, year) not in ban_list)

for site, year in para_list:
    ATC_match_check(site=site, year=year)
print('done')

In [None]:
# Sites processed by NDC_match_check below.
sites = ['UMHC', 'UTHSCSA']
def NDC_match_check(site, year):
    """Print how many medication rows of a site-year match the site's NDC lookup.

    Steps:
      1. Load the site's p0 amed table and the site-year onset table.
      2. Keep only amed rows whose (PATID, ENCOUNTERID) occurs in that onset table.
      3. Load ``ndc_out_<site>.csv`` and rename its ``Rxcui`` column to
         ``MEDADMIN_CODE``.
      4. Left-join the lookup onto the amed rows by ``MEDADMIN_CODE`` and
         print ``<site>-<year>`` followed by the True/False counts of the
         match flag (``dummy``).

    NOTE(review): the dtype mapping names an ``ndc`` column but the rename
    uses ``Rxcui``; confirm against the CSV which column holds the code that
    should be compared with MEDADMIN_CODE.

    Args:
        site: Site identifier used in the folder and file names.
        year: Year of the onset table (converted with ``str``).

    Returns:
        None. Results are printed.
    """
    home_directory = "/home/hchan2/AKI/AKI_Python/"
    encounter_keys = ['PATID', 'ENCOUNTERID']

    amed = pd.read_pickle('data/'+site+'/p0_amed_'+site+'.pkl')
    newdfX = pd.read_pickle('data/'+site+'/onset_'+site+'_'+str(year)+'.pkl')
    newdfX = newdfX >> select('PATID', 'ENCOUNTERID') >> mutate(dummy = True) >> distinct()

    # Flag amed rows that belong to this year's encounters, keep only those,
    # then drop the helper column again.
    flagged = pd.merge(amed, newdfX, on=encounter_keys, how='left').fillna({'dummy': False})
    amed = (flagged >> mask(X.dummy) >> select(~X.dummy)).reset_index(drop=True)

    lookup_path = home_directory+"data/"+site+'/ndc_out_'+site+'.csv'
    rxcui2atc = pd.read_csv(lookup_path, sep=',', dtype={"ndc": 'object', "MEDADMIN_CODE": 'object'})
    rxcui2atc = rxcui2atc >> rename(MEDADMIN_CODE=X.Rxcui) >> mutate(dummy = True) >> distinct()

    matched = pd.merge(amed, rxcui2atc, on=['MEDADMIN_CODE'], how='left').fillna({'dummy': False})
    print(site+'-'+str(year))
    print(matched['dummy'].value_counts())

# (site, year) combinations that are skipped. None of them belongs to the
# sites configured for this cell, so the filter is currently a no-op here.
ban_list = [('UPITT', 2013), ('UPITT', 2012), ('MCW', 2011)] #Sample size too small

# One (site, year) entry per admission year in each site's p0 onset table,
# minus the banned combinations.
para_list = []
for site in sites:
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    years = list(pd.to_datetime(onset['ADMIT_DATE']).dt.year.unique())
    para_list.extend((site, year) for year in years if (site, year) not in ban_list)

# This cell defines NDC_match_check, so that is the check to run; the loop
# called ATC_match_check, which left NDC_match_check unused and compared these
# sites against the rxnorm lookup instead of the NDC one.
for site, year in para_list:
    NDC_match_check(site, year)
print('done')

In [None]:
# Sites processed by icd10toicd09_match_check below.
sites = ['MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC']

def icd10toicd09_match_check(site):
    """Print diagnostics for mapping a site's DX_TYPE '10' rows through 2018_I10gem.csv.

    Loads ``data/<site>/p0_dx_<site>.pkl`` and prints, in order:

    * the site name,
    * 'BEFORE' and the DX_TYPE counts (after normalising '9' to '09'),
    * 'MIDDLE' and how many DX_TYPE '10' rows found a match in the mapping,
    * 'AFTER' and the DX_TYPE counts once matched rows were rewritten,
    * the per-column NaN counts of the resulting frame.

    Nothing is returned or saved; the function only prints.

    NOTE(review): judging by the names, the CSV is presumably an ICD-10 to
    ICD-9 GEM crosswalk -- confirm.
    """
    datafolder = '/home/hchan2/AKI/data/'    # not used in this function
    home_directory = "/home/hchan2/AKI/AKI_Python/"        
    dx = pd.read_pickle('data/'+site+'/p0_dx_'+site+'.pkl')
    # Normalise the type code: '9' becomes '09', every other value is kept.
    dx['DX_TYPE'] = dx['DX_TYPE'].where(dx['DX_TYPE'] != '9', '09')

    print(site)
    print('BEFORE')
    print(dx['DX_TYPE'].value_counts())
      
    # Mapping table; its columns are renamed positionally to DX (source code)
    # and DX09 (target code) -- assumes the file has exactly two columns.
    icd10toicd09 = pd.read_csv(home_directory+'2018_I10gem.csv',sep=',')    
    icd10toicd09.columns = ['DX', 'DX09']
    # `dummy` marks rows coming from the mapping so matches can be counted.
    icd10toicd09 = icd10toicd09 >> mutate(dummy = True) >> distinct()    

    # Match-rate check: DX_TYPE '10' rows with the dots removed from DX,
    # left-joined onto the mapping; unmatched rows get dummy == False.
    # NOTE(review): x.replace assumes every DX value is a string -- confirm
    # there are no missing DX values.
    dxt = dx >> mask(X.DX_TYPE == '10')
    dxt['DX'] = dxt['DX'].map(lambda x: x.replace('.',''))    
    dxt = pd.merge(dxt, icd10toicd09, left_on=['DX'], right_on=['DX'], how='left').fillna({'dummy': False})
    print('MIDDLE')    
    print(dxt['dummy'].value_counts())

    # Actual conversion on the same subset: where a DX09 value was found,
    # DX_TYPE is set to '09' and DX is replaced by DX09; rows without a match
    # are kept unchanged (Series.where keeps values where the condition holds).
    dx4 = dx >> mask(X.DX_TYPE == '10')
    dx4['DX'] = dx4['DX'].map(lambda x: x.replace('.',''))
    dx4 = dx4 >> left_join(icd10toicd09, by='DX')
    dx4['DX_TYPE'] = dx4['DX_TYPE'].where(dx4['DX09'].isnull(), '09')
    dx4['DX'] = dx4['DX'].where(dx4['DX09'].isnull(), dx4['DX09'])
    dx4 = dx4.drop('DX09', axis=1)
    # Recombine with the rows that were not DX_TYPE '10'.
    # NOTE(review): the mapping's `dummy` column is still present in dx4, so
    # it appears in the combined frame and in the NaN summary below.
    dx = pd.concat([dx >> mask(X.DX_TYPE != '10'), dx4], axis=0)
    print('AFTER')
    print(dx['DX_TYPE'].value_counts())
    print(dx.isna().sum())
            
        
# Print the mapping diagnostics for each configured site.
for site in sites:
    icd10toicd09_match_check(site=site)
print('done')

In [None]:
sites = [
    'MCRI', 'MCW', 'UIOWA', 'UMHC', 'UNMC',
    'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW',
]
#tablename = ['onset', 'px', 'dx', 'labnum', 'labcat', 'amed', 'vital', 'demo', 'bt']
tablename = ['bt']

# get years from site: one (site, year) entry per admission year found in
# each site's p0 onset table.
para_list = []
for site in sites:
    onset = pd.read_pickle('data/'+site+'/p0_onset_'+site+'.pkl')
    admit_years = pd.to_datetime(onset['ADMIT_DATE']).dt.year
    years = list(admit_years.unique())
    para_list.extend((site, year) for year in years)

# Check that the bt table has the same FLAG distribution as the onset table
def check_bt_count(site, year):
    """Compare the FLAG value counts of a site-year's onset and bt tables.

    Always prints ``<site>:<year>``. Then loads
    ``data/<site>/onset_<site>_<year>.pkl`` and
    ``data/<site>/bt_<site>_<year>.pkl`` and, if the number of rows per FLAG
    value differs between the two, prints both distributions (labelled
    'newdf' and 'bt').

    Args:
        site: Site identifier used in the folder and file names.
        year: Year used in the file names (converted with ``str``).

    Returns:
        None. Findings are reported on stdout only.
    """
    print(site+':'+str(year))
    newdf = pd.read_pickle('data/'+site+'/onset_'+site+'_'+str(year)+'.pkl')
    bt = pd.read_pickle('data/'+site+'/bt_'+site+'_'+str(year)+'.pkl')
    newdf_c = newdf['FLAG'].value_counts()
    bt_c = bt['FLAG'].value_counts()

    # `newdf_c == bt_c` is element-wise: it yields a Series (ambiguous truth
    # value under `not`) and refuses differently-labelled indexes. Comparing
    # the {flag: count} mappings gives a single, order-independent answer.
    if newdf_c.to_dict() != bt_c.to_dict():
        print('newdf')
        print(newdf_c)
        print('bt')
        print(bt_c)

# Compare onset and bt FLAG distributions for every (site, year).
for site, year in para_list:
    check_bt_count(site=site, year=year)
print('done')