In [None]:
# import python packages
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pickle
import os
import importlib
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import itertools
from scipy import stats

In [None]:
# Per-site description of the raw exports. The four lists are parallel:
# position i in each list describes site_list[i].
site_list = ['KUMC', 'UTSW', 'MCW', 'UofU', 'UIOWA', 'UMHC', 'UPITT', 'UTHSCSA', 'UNMC']
ext_list = ['csv', 'dsv', 'dsv', 'csv', 'csv', 'csv', 'csv', 'csv', 'csv']
sep_list = [',', '|', '|', '|', ',', ',', ',', ',', '|']
encoding_list = ['utf-8', 'utf-8', 'utf-8', 'utf-8', 'utf-8', 'utf-8', 'windows-1252', 'utf-8', 'utf-16']

# Index into the lists above of the site this run processes.
ct = 8
site, ext, sep, encoding = site_list[ct], ext_list[ct], sep_list[ct], encoding_list[ct]

# KUMC is the only site whose raw directory name carries an extra suffix.
rawpath = (
    '/blue/yonghui.wu/hoyinchan/Data/data2022raw/'
    + (site + '_ORCALE' if site == 'KUMC' else site)
    + '/raw/'
)

# pdata: processed-data directory for the site (no trailing slash).
pdata = '/blue/yonghui.wu/hoyinchan/Data/data2022/' + site

# path[0] = raw input directory, path[1] = processed directory (both with trailing slash).
path = [rawpath, pdata + '/']

# Echo the selected site as the cell output.
site

In [None]:
def try_load_csv(dataname, site, ext, sep, path, admit, datecol=None):
    """Load one raw ``AKI_<DATANAME>`` export, normalise it, and save it as parquet.

    The file ``path[0] + <filename> + '.' + ext`` is read, known site-specific
    column problems are repaired, ``ONSETS_ENCOUNTERID`` and ``PATID`` are cast
    to ``str`` and, when ``admit`` is given, the rows are inner-joined to it on
    ``PATID``/``ENCOUNTERID`` so ``DAYS_SINCE_ADMIT`` can be recomputed in whole
    days from ``datecol``.

    Relies on two notebook-level globals: ``encoding`` (text encoding of the
    raw file) and ``pdata`` (output directory prefix).

    Parameters
    ----------
    dataname : str
        Table name, e.g. ``'amed'``; upper-cased to build the raw file name.
    site : str
        Site label used in the output file name and in log messages.
    ext : str
        Raw file extension, without the dot.
    sep : str
        Field delimiter of the raw file.
    path : list of str
        ``path[0]`` is the raw-data directory (with trailing separator). It is
        also searched for the substrings ``'UMHC'`` / ``'UofU'`` to select
        site-specific handling.
    admit : pandas.DataFrame or None
        Frame with ``PATID``, ``ENCOUNTERID`` and ``ADMIT_DATE`` columns; pass
        ``None`` to skip the join. ``ADMIT_DATE`` is assumed to be datetime-like
        so it can be subtracted from ``datecol`` -- TODO confirm with the
        producer of the onset table.
    datecol : str, optional
        Date column used to recompute ``DAYS_SINCE_ADMIT``; only used when
        ``admit`` is given.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame when ``dataname == 'onsets'``. For any other table
        the frame is written to ``pdata + '/p0_<dataname>_<site>.parquet'`` and
        ``None`` is returned. ``None`` is also returned, after printing a
        message, when loading or post-processing fails (best-effort: errors are
        reported, not raised).
    """

    # Target type of every known column; applied after loading to the columns
    # that are actually present.
    datadtypes =  {"RESULT_NUM":float,  "LAB_LOINC": str, "LAB_PX_TYPE": str, "RESULT_UNIT": str, #lab
               "RESULT_QUAL": str, "SPECIMEN_SOURCE": str, #lab
               "SEX": "category", "RACE": "category", "HISPANIC": "category", #demo, demodeath
               "DX": str, "DX_TYPE": str, #DX
               "PX": str, "PX_TYPE": str, #PX
               "MEDADMIN_TYPE": str, "MEDADMIN_CODE": str, #AMED
               "RXNORM_CUI": str, #PMED
               "NDC": str, #DMED
               "SYSTOLIC": float, "DIASTOLIC": float, "ORIGINAL_BMI": float, "WT": float, #VITAL_OLD
               "OBSCLIN_TYPE": str, "OBSCLIN_CODE": str} #VITAL

    # read_csv calls a converter once per raw cell, so each one must be a
    # scalar callable. Categorical columns are therefore read as text (codes
    # keep their literal form) and cast to 'category' after loading.
    converters = {col: (str if target == "category" else target)
                  for col, target in datadtypes.items()}

    print(f"processing : {dataname}")
    filename = 'AKI_'+dataname.upper()
    outfilename = f"/p0_{dataname}_{site}.parquet"

    # Site Specific filenames
    if 'UMHC' in path[0]:
        filename = 'DEID_'+filename
    if 'UofU' in path[0]:
        if dataname == 'vital':
            filename = 'DEID_AKI_VITAL_OLD'
        if  dataname == 'dx_current':
            filename = 'DEID_AKI_DX_CURRENT_ADMIT_DATE'

    try:
        df = pd.read_csv(path[0] +  filename + '.' + ext, sep=sep, encoding=encoding,
                         converters=converters, engine='python', on_bad_lines='skip')
    except Exception as e:
        # Nothing was loaded, so there is nothing to post-process: report and stop
        # instead of falling through with `df` undefined.
        print(f"{site}: {dataname} Failed to load from {path[0]}{filename}.{ext}: {e}")
        return None

    try:
        # handle UofU column shifted: reset_index() turns the index into a leading
        # column, one surplus column is dropped and the header is re-assigned.
        if 'UofU' in path[0] and dataname=='dx':
            df = df.reset_index()
            df = df.drop('RAW_DX_POA',axis=1)
            df.columns = ['ONSETS_ENCOUNTERID', 'DIAGNOSISID', 'PATID', 'ENCOUNTERID',
                   'ENC_TYPE', 'ADMIT_DATE', 'DX_DATE', 'PROVIDERID', 'DX', 'DX_TYPE',
                   'DX_SOURCE', 'DX_ORIGIN', 'PDX', 'DX_POA', 'RAW_DX', 'RAW_DX_TYPE',
                   'RAW_DX_SOURCE', 'RAW_PDX', 'RAW_DX_POA', 'DAYS_SINCE_ADMIT']
            df['DX_DATE'] = df['ADMIT_DATE']

        if 'UofU' in path[0] and dataname=='demo':
            df = df.reset_index()
            df = df.drop(['RAW_HISPANIC', 'RAW_RACE', 'BIOBANK_FLAG'],axis=1)
            df.columns = ['ONSETS_ENCOUNTERID', 'AGE', 'PATID',
                   'BIRTH_DATE', 'BIRTH_TIME', 'SEX', 'SEXUAL_ORIENTATION',
                   'GENDER_IDENTITY', 'HISPANIC', 'BIOBANK_FLAG', 'RACE',
                   'PAT_PREF_LANGUAGE_SPOKEN', 'RAW_SEX', 'RAW_SEXUAL_ORIENTATION',
                   'RAW_GENDER_IDENTITY', 'RAW_HISPANIC', 'RAW_RACE',
                   'RAW_PAT_PREF_LANGUAGE_SPOKEN', 'DEATH_DATE', 'DDAYS_SINCE_ENC',
                   'DEATH_DATE_IMPUTE', 'DEATH_SOURCE']

        # Some site use admit date as dx date
        if 'DX_DATE' in df.columns and df['DX_DATE'].isna().all():
            df['DX_DATE'] = df['ADMIT_DATE']

        # to handle the BOM character in UTHSCSA
        df = df.rename(columns = {'\ufeff"ONSETS_ENCOUNTERID"': 'ONSETS_ENCOUNTERID'})
        df.columns = df.columns.str.upper()
        # To handle the strange date name in KUMC. Both patterns are literal text:
        # regex=False keeps '+' and '.' from being read as regex metacharacters,
        # whatever the installed pandas default is.
        df.columns = (df.columns
                      .str.replace('"+PD.DATE_SHIFT"', '', regex=False)
                      .str.replace('AKI.', '', regex=False))
        df['ONSETS_ENCOUNTERID'] = df['ONSETS_ENCOUNTERID'].astype(str)
        df['PATID'] = df['PATID'].astype(str)

        # if not onset
        if admit is not None:
            # Join on the onset encounter id; ADMIT_DATE is taken from `admit`,
            # replacing any ADMIT_DATE carried by the raw table.
            df["ENCOUNTERID"] = df["ONSETS_ENCOUNTERID"]
            df = df.drop('ADMIT_DATE', axis=1, errors='ignore')
            df = admit[["PATID","ENCOUNTERID", 'ADMIT_DATE']].merge(df, on = ["PATID","ENCOUNTERID"], how = "inner")

            # recalculate DAYS_SINCE_ADMIT using day as unit
            if datecol is not None:
                # Drop the time-of-day part before differencing.
                df[datecol] = pd.to_datetime(pd.to_datetime(df[datecol]).dt.date)
                df['DAYS_SINCE_ADMIT'] = (df[datecol]-df['ADMIT_DATE']).dt.days
                df = df.drop('ADMIT_DATE',axis=1)

        # Convert dataype
        filtered_datadtypes = {key: datadtypes[key] for key in datadtypes if key in df.columns}
        df = df.astype(filtered_datadtypes)

        if dataname == 'onsets':
            return df
        else:
            df.to_parquet(pdata+outfilename)

    except Exception as e:
        print (f"{site}: {dataname} failed at postprocessing: {e}")
#        raise Exception(f"{site}: {dataname} failed at postprocessing: {e}")

In [None]:
# Load the site's onset table and make both join keys plain strings so they
# match the string IDs of the tables merged against it.
# NOTE(review): read_pickle executes arbitrary code from the file; only use on trusted data.
admit = (
    pd.read_pickle(pdata + '/p0_onset_' + site + '.pkl')
    .astype({'PATID': str, 'ENCOUNTERID': str})
)

In [None]:
# Tables to load for the selected site, as (dataname, date column) pairs.
# The date column is passed to try_load_csv as `datecol`.
# Entries that are commented out are currently disabled.
_tables_to_load = [
    ('amed', 'MEDADMIN_START_DATE'),
    # ('demo', 'DEATH_DATE'),
    # ('demo_death', 'DEATH_DATE'),
    ('dmed', 'DISPENSE_DATE'),
    # ('dx', 'DX_DATE'),
    # ('dx_current', 'DX_DATE'),
    # ('lab_scr', 'SPECIMEN_DATE'),
    ('pmed', 'RX_START_DATE'),
    # ('px', 'PX_DATE'),
    # ('vital_old', 'MEASURE_DATE'),
    # ('vital', 'OBSCLIN_START_DATE'),
]

for _dataname, _datecol in _tables_to_load:
    try_load_csv(_dataname, site, ext, sep, path, admit, datecol=_datecol)

In [None]:
#try_load_csv('lab', site, ext, sep, path, admit, datecol='SPECIMEN_DATE')

In [None]:
# Display an auto-playing audio widget -- presumably an audible "finished"
# signal after the long-running load cells above (confirm intent).
from IPython.display import Audio
# Relative path: resolved against the kernel's current working directory.
sound_file = 'beep-11.wav'
Audio(sound_file, autoplay=True)

In [None]:
# convert pmed to amed
# Re-shape the prescribing table (pmed) into the column layout of the medication
# administration table (amed) and save it under the amed file name.

# NOTE(review): this reads '/p0_pmed_<site>.pkl', while try_load_csv writes
# '/p0_pmed_<site>.parquet' -- confirm which file is the intended input.
outfilename = '/p0_pmed_'+site+'.pkl'
pmed = pd.read_pickle(pdata+outfilename)

# Keep only the columns that have an amed counterpart. .copy() makes the subset
# an independent frame so the column assignments below do not write into a
# slice of the loaded frame (chained assignment).
pmed = pmed[['PATID', 'ENCOUNTERID', 'ONSETS_ENCOUNTERID', 'RX_PROVIDERID',
             'PRESCRIBINGID', 'RX_START_DATE', 'RX_END_DATE',
             'RXNORM_CUI', 'RX_DOSE_ORDERED', 'RX_DOSE_ORDERED_UNIT', 'RX_ROUTE', 'RX_SOURCE',
             'RAW_RX_MED_NAME', 'RAW_RXNORM_CUI', 'RAW_RX_DOSE_ORDERED', 'RAW_RX_DOSE_ORDERED_UNIT', 'RAW_RX_ROUTE', 'DAYS_SINCE_ADMIT']].copy()

# Every converted row is tagged with code type 'RX'.
pmed['MEDADMIN_TYPE'] = 'RX'

# pmed column name -> amed column name
rename_trans = {
                'RX_START_DATE': 'MEDADMIN_START_DATE',
                'RX_END_DATE': 'MEDADMIN_STOP_DATE',
                'RXNORM_CUI': 'MEDADMIN_CODE',
                'RX_DOSE_ORDERED': 'MEDADMIN_DOSE_ADMIN',
                'RX_DOSE_ORDERED_UNIT': 'MEDADMIN_DOSE_ADMIN_UNIT',
                'RX_ROUTE': 'MEDADMIN_ROUTE',
                'RX_SOURCE': 'MEDADMIN_SOURCE',
                'RAW_RX_MED_NAME': 'RAW_MEDADMIN_MED_NAME',
                'RAW_RXNORM_CUI': 'RAW_MEDADMIN_CODE',
                'RAW_RX_DOSE_ORDERED': 'RAW_MEDADMIN_DOSE_ADMIN',
                'RAW_RX_DOSE_ORDERED_UNIT': 'RAW_MEDADMIN_DOSE_ADMIN_UNIT',
                'RAW_RX_ROUTE': 'RAW_MEDADMIN_ROUTE',
                'RX_PROVIDERID': 'MEDADMIN_PROVIDERID'
}

pmed = pmed.rename(rename_trans,axis=1)

# pmed has no time-of-day columns, so both time columns get a midnight placeholder.
# NOTE(review): pd.to_datetime('00:00:00') yields a full Timestamp whose date part
# is the day the cell is run, so the stored value changes between runs -- confirm
# that downstream code only uses the time component.
pmed['MEDADMIN_START_TIME'] = pd.to_datetime('00:00:00')
pmed['MEDADMIN_STOP_TIME'] = pd.to_datetime('00:00:00')
# Use the row index as the administration id.
pmed['MEDADMINID'] = pmed.index

# NOTE(review): written as pickle under the amed name; try_load_csv('amed', ...)
# writes a parquet file for the same table -- confirm which one downstream reads.
outfilename = '/p0_amed_'+site+'.pkl'
pmed.to_pickle(pdata+outfilename)

In [None]:
#Special work for lab
# The lab table is processed here instead of through try_load_csv: every column
# is read as text and typed afterwards with pandas nullable dtypes, and the
# result is saved as a pickle.
dataname = 'lab'
datecol = 'SPECIMEN_DATE'
outfilename = f"/p0_{dataname}_{site}.pkl"

# Target dtype per column, applied to whichever of these columns are present.
# (Each key is listed once; a repeated key in a dict literal silently keeps
# only the last value.)
datadtypes =  {"PATID": 'object', "ENCOUNTERID": 'object', "ONSETS_ENCOUNTERID": 'object', '\ufeff"ONSETS_ENCOUNTERID"': 'object', #General
           "RESULT_NUM":"Float64",  "LAB_LOINC": 'object', "LAB_PX_TYPE": 'object', "RESULT_UNIT": 'object', #lab
           "RESULT_QUAL": 'object', "SPECIMEN_SOURCE": 'object', #lab
           "SEX": 'category', "RACE": 'category', "HISPANIC": 'category', #demo, demodeath
           "DX": 'object', "DX_TYPE": 'object', #DX
           "PX": 'object', "PX_TYPE": 'object', #PX
           "MEDADMIN_TYPE": "object", "MEDADMIN_CODE": 'object', #AMED
           "RXNORM_CUI": "object",
           "SYSTOLIC": 'Float64', "DIASTOLIC": 'Float64', "ORIGINAL_BMI": 'Float64', "WT": 'Float64', #VITAL_OLD
           "OBSCLIN_TYPE": "object", "OBSCLIN_CODE": "object"} #VITAL

# Read straight into `df`. The cell must not depend on a backup frame left in
# the kernel by an earlier interactive run, or it fails on Restart & Run All.
df = pd.read_csv(path[0] +  'AKI_LAB'  + '.' + ext, sep=sep, on_bad_lines = 'skip', encoding=encoding, dtype='object', engine='python')

# Normalise a BOM-prefixed first header so the ONSETS_ENCOUNTERID lookup below
# works; this is a no-op when the header is already clean.
df = df.rename(columns={'\ufeff"ONSETS_ENCOUNTERID"': 'ONSETS_ENCOUNTERID'})

# if not onset
# Join on the onset encounter id; ADMIT_DATE is taken from `admit`, replacing
# any ADMIT_DATE carried by the raw table.
df["ENCOUNTERID"] = df["ONSETS_ENCOUNTERID"]
df = df.drop('ADMIT_DATE', axis=1, errors='ignore')

df = admit[["PATID", "ENCOUNTERID", 'ADMIT_DATE']].merge(df, on = ["PATID", "ENCOUNTERID"], how = "inner")

# recalculate DAYS_SINCE_ADMIT using day as unit
if datecol is not None:
    # Drop the time-of-day part before differencing.
    df[datecol] = pd.to_datetime(pd.to_datetime(df[datecol]).dt.date)
    df['DAYS_SINCE_ADMIT'] = (df[datecol]-df['ADMIT_DATE']).dt.days
    df = df.drop('ADMIT_DATE',axis=1)

# Convert dataype
filtered_datadtypes = {key: datadtypes[key] for key in datadtypes if key in df.columns}
df = df.astype(filtered_datadtypes)

df.to_pickle(pdata+outfilename)