In [None]:
# import python packages
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
#import utils
import pickle
import os
import importlib
from scipy.stats import chi2_contingency

from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import itertools
from scipy import stats

In [None]:
# Per-site load settings. The four lists are read positionally with the same
# index (see the cell that sets `ct`), so entry i of each list describes site i.
# NOTE(review): sep_list and encoding_list hold 10 entries while site_list and
# ext_list hold 9. Presumably a tenth site was removed from the first two lists
# only; indexes 0-8 still line up if the extra entry is the trailing one, but
# this cannot be confirmed from here -- TODO verify and trim.
site_list = ['KUMC', 'UTSW', 'MCW', 'UIOWA', 'UMHC', 'UPITT', 'MCRI', 'UTHSCSA', 'UNMC']
ext_list = ['csv','dsv', 'dsv', 'csv', 'csv', 'csv', 'csv', 'csv', 'csv']
sep_list = [',','|', '|', '|', ',', ',', ',', ',', ',', '|']
encoding_list = ['utf-8','utf-8','utf-8','utf-8','utf-8','utf-8', 'windows-1252', 'utf-8', 'utf-8','utf-16'] 

In [None]:
# Position of the site to process in the per-site settings lists.
ct = 1

# Pull the settings for that site out of the index-aligned lists in one go.
site, ext, sep, encoding = (
    settings[ct] for settings in (site_list, ext_list, sep_list, encoding_list)
)

# Filled by the next cell: [raw directory, converted-csv directory].
path = []

In [None]:
# KUMC keeps its raw export in a differently named folder; every other site
# uses '<site>/raw/'.
rawpath = '/blue/yonghui.wu/hoyinchan/Data/data2022raw/' + site + ('_ORCALE/raw/' if site == 'KUMC' else '/raw/')
path.append(rawpath)

# Output directory for this site (no trailing slash); the same directory with a
# trailing slash is the second load location used by try_load_csv.
pdata = '/blue/yonghui.wu/hoyinchan/Data/data2022/'+ site
path.append(pdata + '/')

In [None]:
def try_load_csv(filename, ext, sep, path, encoding=None):
    """Load one table, preferring the raw export and falling back to the converted csv.

    Three attempts are made, in order:

    1. ``path[0] + filename + '.' + ext`` with the pyarrow engine, using ``sep``
       and ``encoding``.
    2. ``path[1] + filename + '.csv'`` with the pyarrow engine and ``','``.
    3. The same converted csv with the default parser, skipping malformed lines.

    Parameters
    ----------
    filename : str
        File stem without directory or extension.
    ext : str
        Extension of the raw file (without the dot).
    sep : str
        Field separator of the raw file.
    path : sequence of str
        ``path[0]`` is the raw directory, ``path[1]`` the converted-csv
        directory; both are concatenated as-is, so they must end with a
        path separator.
    encoding : str, optional
        Text encoding of the raw file. When omitted, the notebook-level
        variable ``encoding`` is used if it exists (this is what the function
        previously read implicitly); otherwise pandas' default applies.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    Exception
        If all three attempts fail. The last underlying error is attached as
        ``__cause__``.
    """
    if encoding is None:
        # Backward compatible with callers that rely on the notebook global.
        encoding = globals().get('encoding')

    try:
        # Attempt 1: raw export.
        return pd.read_csv(path[0] + filename + '.' + ext, engine="pyarrow", sep=sep, encoding=encoding)
    except Exception as raw_error:
        print(f"Failed to load from {path[0]}: {raw_error}. Loading converted csv...")

    # Built only after the raw load failed, so a caller that supplies a single
    # location still works whenever the raw file is readable.
    converted_file = path[1] + filename + '.csv'

    try:
        # Attempt 2: converted csv, fast parser.
        return pd.read_csv(converted_file, engine="pyarrow", sep=',')
    except Exception as fast_error:
        # Previously swallowed silently; report it so a bad file is visible.
        print(f"pyarrow could not load {converted_file}: {fast_error}. Retrying with the default parser...")

    try:
        # Attempt 3: tolerant parser, dropping malformed lines.
        return pd.read_csv(converted_file, sep=',', on_bad_lines = 'skip')
    except Exception as final_error:
        print(f"Failed to load from {path[1]} as well: {final_error}")
        raise Exception("Could not load the file from either path.") from final_error

In [None]:
def pat_count(df, var = 'PATID'):
    """Return the number of distinct non-null values in column ``var`` of ``df``."""
    id_column = df[var]
    return id_column.nunique()

def null_count(data):
    """Return the fraction of missing values in ``data`` (per column for a DataFrame)."""
    n_rows = data.shape[0]
    n_missing = data.isnull().sum()
    return n_missing / n_rows

def enc_count(df):
    """Return the number of distinct non-null ``ENCOUNTERID`` values in ``df``."""
    encounter_ids = df['ENCOUNTERID']
    return encounter_ids.nunique()

def aki_count(df):
    """Return ``[distinct patients, distinct encounters]`` for ``df``."""
    n_patients = pat_count(df)
    n_encounters = enc_count(df)
    return [n_patients, n_encounters]

In [None]:
def inverse_MDRD(row, KDIGO_baseline):
    """Look up the estimated baseline creatinine for one patient row.

    Parameters
    ----------
    row : mapping
        Must provide ``"AGE"``, ``"MALE"`` and ``"RACE_BLACK"``; the latter two
        are interpreted by truthiness.
    KDIGO_baseline : pandas.DataFrame
        Lookup table indexed by the age-band labels ``"20-24"``, ``"25-29"``,
        ``"30-39"``, ``"40-54"``, ``"55-65"``, ``">65"`` with the columns
        ``"Black males"``, ``"Other males"``, ``"Black females"``,
        ``"Other females"``.

    Returns
    -------
    The table entry for the row's age band and sex/race column, or ``None``
    when the age is missing.

    Notes
    -----
    The bands are contiguous: an age of 24.5 belongs to ``"20-24"``. The
    previous if/elif ladders only matched whole-number ages and returned
    ``None`` for anything between two bands. Ages below 20 use the youngest
    band, as before.
    """
    age = row["AGE"]
    if pd.isna(age):
        # No age, no band; mirrors the previous implicit None.
        return None

    race_label = "Black" if row["RACE_BLACK"] else "Other"
    sex_label = "males" if row["MALE"] else "females"
    column = race_label + " " + sex_label

    # (exclusive upper age bound, band label); identical to the original
    # inclusive integer bounds for whole-number ages.
    bands_below = ((25, "20-24"), (30, "25-29"), (40, "30-39"), (55, "40-54"))
    for upper_bound, band in bands_below:
        if age < upper_bound:
            return KDIGO_baseline.loc[band, column]

    # 65 itself stays in the "55-65" band, exactly as before.
    band = "55-65" if age <= 65 else ">65"
    return KDIGO_baseline.loc[band, column]

In [None]:
def get_scr_baseline(path, ext, sep):
    """Load and tidy the four input tables used for baseline estimation.

    Reads AKI_LAB_SCR, AKI_ONSETS, AKI_DX and AKI_DEMO through try_load_csv and
    upper-cases their column names. Despite its name, this function does not
    compute a baseline; that happens in get_scr_baseline2.

    Parameters
    ----------
    path, ext, sep
        Forwarded unchanged to try_load_csv.

    Returns
    -------
    tuple (scr, onset, dx, demo)
        scr   : one row per PATID / ENCOUNTERID / SPECIMEN_DATE with the mean
                RESULT_NUM of that day and DAYS_SINCE_ADMIT recomputed from the
                dates.
        onset : columns ENCOUNTERID, PATID, ADMIT_DATE as loaded (ADMIT_DATE is
                not normalised here).
        dx    : diagnoses inner-joined to onset, with DAYS_SINCE_ADMIT
                recomputed from DX_DATE and ADMIT_DATE.
        demo  : de-duplicated PATID, ENCOUNTERID, AGE plus boolean MALE,
                RACE_WHITE, RACE_BLACK, HISPANIC.
    """
    # load the data sets 
    scr = try_load_csv('AKI_LAB_SCR', ext, sep, path) # pd.read_csv(path+"AKI_LAB_SCR."+ext, engine="pyarrow", sep=sep)
    scr.columns = scr.columns.str.upper()
    # Some exports carry the date-shift expression in the header; map it back
    # to the plain column name (no-op when the header is already clean).
    scr.rename(columns = {'SPECIMEN_DATE"+PD.DATE_SHIFT"': 'SPECIMEN_DATE'}, inplace = True)
    scr = scr[['ONSETS_ENCOUNTERID', 'PATID', 'SPECIMEN_DATE', 'RESULT_NUM', 'DAYS_SINCE_ADMIT']]
    # Positional rename: ONSETS_ENCOUNTERID becomes ENCOUNTERID.
    scr.columns = ['ENCOUNTERID', 'PATID', 'SPECIMEN_DATE', 'RESULT_NUM', 'DAYS_SINCE_ADMIT']
#     scr['ENCOUNTERID'] = scr['ENCOUNTERID'].astype(str)
#     scr['PATID'] = scr['PATID'].astype(str)
    
    
    onset = try_load_csv('AKI_ONSETS', ext, sep, path) # pd.read_csv(path+"AKI_ONSETS."+ext, engine="pyarrow", sep=sep)
    onset.columns = onset.columns.str.upper()
    onset.rename(columns = {'ADMIT_DATE"+PD.DATE_SHIFT"': 'ADMIT_DATE'}, inplace = True)
    onset = onset[['ENCOUNTERID', 'PATID', 'ADMIT_DATE']]
    # Redundant with the selection above (same names, same order); kept as-is.
    onset.columns = ['ENCOUNTERID', 'PATID', 'ADMIT_DATE']
    # onset['ENCOUNTERID'] = onset['ENCOUNTERID'].astype(str)
    # onset['PATID'] = onset['PATID'].astype(str)
    
    
    scr = scr.merge(onset, on = ['ENCOUNTERID', 'PATID'], how='left')
    # Drops every row with a missing value in ANY column, including the source
    # DAYS_SINCE_ADMIT that is overwritten three lines below.
    # NOTE(review): rows lost only because of that column may be unintended -- confirm.
    scr = scr.dropna()
    # Truncate both timestamps to the calendar day before differencing.
    scr['SPECIMEN_DATE'] = pd.to_datetime(pd.to_datetime(scr['SPECIMEN_DATE']).dt.date)
    scr['ADMIT_DATE'] = pd.to_datetime(pd.to_datetime(scr['ADMIT_DATE']).dt.date)
    scr['DAYS_SINCE_ADMIT'] = (scr['SPECIMEN_DATE']-scr['ADMIT_DATE']).dt.days
    scr = scr.sort_values('SPECIMEN_DATE')
    # Collapse several results on the same day into their mean; ADMIT_DATE is
    # dropped from scr at this point.
    scr = scr[['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'DAYS_SINCE_ADMIT', 'RESULT_NUM']].groupby(['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'DAYS_SINCE_ADMIT']).mean()
    scr = scr.sort_values(['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE'])
    scr = scr.reset_index()


    # process dx data
    dx = try_load_csv('AKI_DX', ext, sep, path) # pd.read_csv(path+"AKI_DX."+ext, sep= sep, engine="pyarrow")
    dx.columns = dx.columns.str.upper()
    dx.rename(columns = {'DX_DATE"+PD.DATE_SHIFT"': 'DX_DATE'}, inplace = True)
    dx["ENCOUNTERID"] = dx["ONSETS_ENCOUNTERID"]
    # dx['ENCOUNTERID'] = dx['ENCOUNTERID'].astype(str)
    # dx['PATID'] = dx['PATID'].astype(str)
    dx = dx[["PATID", "ENCOUNTERID", "DX", "DAYS_SINCE_ADMIT", "DX_DATE", "DX_TYPE"]]
    # Inner join: diagnoses whose encounter is not in onset are discarded.
    dx = onset[["PATID","ENCOUNTERID", 'ADMIT_DATE']].merge(dx, on = ["PATID","ENCOUNTERID"], how = "inner")
    dx['ADMIT_DATE'] = pd.to_datetime(pd.to_datetime(dx['ADMIT_DATE']).dt.date)
    dx['DX_DATE'] = pd.to_datetime(pd.to_datetime(dx['DX_DATE']).dt.date)
    # Overwrites the loaded DAYS_SINCE_ADMIT; becomes NaN where DX_DATE is missing.
    dx['DAYS_SINCE_ADMIT'] = (dx['DX_DATE']-dx['ADMIT_DATE']).dt.days
    #dx = dx[dx['DAYS_SINCE_ADMIT'] <= dx['ONSET_SINCE_ADMIT']]  # before onset

    demo = try_load_csv('AKI_DEMO', ext, sep, path) # pd.read_csv(path+'AKI_DEMO.'+ext, sep=sep,  engine="pyarrow")
    # process demo data
    demo.columns = demo.columns.str.upper()
    demo['ENCOUNTERID'] = demo['ONSETS_ENCOUNTERID']
    # demo['ENCOUNTERID'] = demo['ENCOUNTERID'].astype(str)
    # demo['PATID'] = demo['PATID'].astype(str)
    # The comparisons below are against string codes.
    # NOTE(review): if a site's RACE column is parsed as numeric (3 instead of
    # '03'), RACE_WHITE / RACE_BLACK would be False for every row -- confirm the
    # dtype after loading.
    demo['MALE'] = demo['SEX'] == 'M'
    demo['HISPANIC'] = demo['HISPANIC'] == 'Y'
    demo['RACE_WHITE'] = demo['RACE'] == '05'
    demo['RACE_BLACK'] = demo['RACE'] == '03'
    demo = demo[['PATID', 'ENCOUNTERID', 'AGE', 'MALE', 'RACE_WHITE', 'RACE_BLACK', 'HISPANIC']]
    demo = demo.drop_duplicates()

    
    return scr, onset, dx, demo

In [None]:
def get_scr_baseline2(path, ext, sep, scr, onset, dx, demo):
    """Estimate a baseline serum creatinine (SCr) for every encounter in ``onset``.

    The estimate is chosen per encounter in this order of priority:

    1. min(lowest SCr in the 7 days before admission, admission SCr);
    2. min(mean SCr between 365 and 8 days before admission, admission SCr);
    3. for encounters with neither: min(admission SCr, table lookup via
       inverse_MDRD). The table lookup is only applied to patients with no
       chronic-kidney-disease code dated on or before admission; patients with
       such a code fall back to admission SCr alone.

    "Admission SCr" is the mean of all results dated on the admission day or
    the day after.

    Parameters
    ----------
    path, ext, sep
        Unused; kept so existing calls keep working.
    scr : pandas.DataFrame
        Needs ENCOUNTERID, PATID, SPECIMEN_DATE (datetime) and RESULT_NUM.
    onset : pandas.DataFrame
        Needs ENCOUNTERID, PATID and ADMIT_DATE.
    dx : pandas.DataFrame
        Needs PATID, ENCOUNTERID, ADMIT_DATE, DX, DX_TYPE, DX_DATE and
        DAYS_SINCE_ADMIT.
    demo : pandas.DataFrame
        Needs PATID, ENCOUNTERID, AGE plus the columns read by inverse_MDRD.

    Returns
    -------
    pandas.DataFrame
        ``onset`` left-joined with ``scr`` (one row per lab result) plus the
        intermediate estimates and the final ``SERUM_CREAT_BASE``; rows for
        which no baseline could be derived are dropped.

    Notes
    -----
    ``dx`` and ``demo`` are copied before their dtypes are normalised, so the
    caller's frames are left untouched (they used to be modified in place).
    """
    pat_id_cols = ["PATID",  "ENCOUNTERID"]

    # Private copies: the dtype clean-up below must not leak into the caller.
    dx = dx.copy()
    demo = demo.copy()

    complete_df = onset.merge(scr[['ENCOUNTERID', 'PATID', 'SPECIMEN_DATE', 'RESULT_NUM']],
                              on = pat_id_cols,
                              how = 'left')
    complete_df['ADMIT_DATE'] = pd.to_datetime(pd.to_datetime(complete_df['ADMIT_DATE']).dt.date)

    # --- Admission SCr: mean of results on the admission day or the day after.
    in_admission_window = (complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE) & \
                          (complete_df.SPECIMEN_DATE <= (complete_df.ADMIT_DATE + pd.Timedelta(days=1)))
    admission_SCr = complete_df[in_admission_window].groupby(pat_id_cols)['RESULT_NUM'].mean().reset_index()
    admission_SCr = admission_SCr.rename(columns = {"RESULT_NUM": "ADMISSION_SCR"})
    complete_df = complete_df.merge(admission_SCr, on = pat_id_cols, how = "left")

    # --- 1. Lowest SCr in the 7 days before admission (admission day excluded).
    in_prior_week = (complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE - pd.Timedelta(days=7)) & \
                    (complete_df.SPECIMEN_DATE < complete_df.ADMIT_DATE)
    one_week_prior_admission = complete_df[in_prior_week].groupby(pat_id_cols)['RESULT_NUM'].min().reset_index()
    one_week_prior_admission = one_week_prior_admission.rename(columns = {"RESULT_NUM": "ONE_WEEK_SCR"})
    complete_df = complete_df.merge(one_week_prior_admission, on = pat_id_cols, how = "left")

    # take the min between one week SCr and admission SCr
    has_week = complete_df.ONE_WEEK_SCR.notna()
    complete_df.loc[has_week, "BASELINE_EST_1"] = \
        complete_df.loc[has_week, ["ONE_WEEK_SCR", "ADMISSION_SCR"]].min(axis = 1)

    # --- 2. Mean SCr from 365 days up to (but excluding) 7 days before admission.
    in_prior_year = (complete_df.SPECIMEN_DATE < (complete_df.ADMIT_DATE - pd.Timedelta(days=7))) & \
                    (complete_df.SPECIMEN_DATE >= (complete_df.ADMIT_DATE - pd.Timedelta(days=365)))
    one_year_prior_admission = complete_df.loc[in_prior_year, pat_id_cols + ["RESULT_NUM"]]
    one_year_prior_admission = one_year_prior_admission.groupby(pat_id_cols)['RESULT_NUM'].mean().reset_index()
    one_year_prior_admission = one_year_prior_admission.rename(columns = {"RESULT_NUM": "ONE_YEAR_SCR"})
    complete_df = complete_df.merge(one_year_prior_admission, on = pat_id_cols, how = "left")

    # take the min between one year SCr and admission SCr
    has_year = complete_df.ONE_YEAR_SCR.notna()
    complete_df.loc[has_year, "BASELINE_EST_2"] = \
        complete_df.loc[has_year, ["ONE_YEAR_SCR", "ADMISSION_SCR"]].min(axis = 1)

    # priority 1: 7day SCr, priority 2: one year SCr
    complete_df["BASELINE_NO_MDRD"] = \
        np.where(complete_df['BASELINE_EST_1'].isna(), complete_df['BASELINE_EST_2'], complete_df['BASELINE_EST_1'])

    # --- 3. Encounters still without a baseline: table lookup (non-CKD only).
    pat_to_MDRD = complete_df.loc[complete_df.BASELINE_NO_MDRD.isna(), pat_id_cols + ["ADMIT_DATE", "ADMISSION_SCR"]]
    # one encounter, one row
    pat_to_MDRD = pat_to_MDRD.drop_duplicates(subset=pat_id_cols, keep="first")

    # Normalise code columns so '09' and '9' are treated alike.
    dx["DX"] = dx["DX"].astype(str)
    dx["DX_TYPE"] = dx["DX_TYPE"].astype(str)
    dx['DX_TYPE'] = dx['DX_TYPE'].replace('09', '9')

    # Join on PATID only: a CKD code recorded under any of the patient's
    # encounters counts, as long as it is dated on or before this admission.
    pat_to_MDRD_check_CKD = pat_to_MDRD.merge(dx.drop(['ENCOUNTERID', 'ADMIT_DATE'], axis = 1),
                                              on = "PATID",
                                              how = "left")

    # Rebuild a missing DX_DATE from ADMIT_DATE + DAYS_SINCE_ADMIT where possible;
    # rows with neither stay NaT and are removed by the date filter below.
    missing_dx_date = pat_to_MDRD_check_CKD.DX_DATE.isna()
    pat_to_MDRD_check_CKD.loc[missing_dx_date, "DX_DATE"] = \
        pat_to_MDRD_check_CKD.loc[missing_dx_date, "ADMIT_DATE"] + \
        pd.to_timedelta(pat_to_MDRD_check_CKD.loc[missing_dx_date, 'DAYS_SINCE_ADMIT'], unit='D')

    # filter out those DX after admission
    pat_to_MDRD_check_CKD = pat_to_MDRD_check_CKD[pat_to_MDRD_check_CKD.DX_DATE <= \
                                                  pat_to_MDRD_check_CKD.ADMIT_DATE]

    CKD_code = ["585.1","585.2","585.3","585.4","585.5","585.9",
                "N18.1","N18.2","N18.3","N18.4",
                "N18.5","N18.6","N18.9"]
    # NOTE(review): matching is exact and assumes dotted codes; sites that store
    # codes without the dot would never match -- confirm against the data.
    pat_to_MDRD_with_CKD = pat_to_MDRD_check_CKD[pat_to_MDRD_check_CKD.DX.isin(CKD_code)]

    patid_without_CKD = list(pat_to_MDRD.loc[~pat_to_MDRD.PATID.isin(pat_to_MDRD_with_CKD.PATID),
                                             "PATID"].unique())

    # Lookup table consumed by inverse_MDRD (rows: age band, columns: sex/race).
    KDIGO_baseline = np.array([
        [1.5, 1.3, 1.2, 1.0],
        [1.5, 1.2, 1.1, 1.0],
        [1.4, 1.2, 1.1, 0.9],
        [1.3, 1.1, 1.0, 0.9],
        [1.3, 1.1, 1.0, 0.8],
        [1.2, 1.0, 0.9, 0.8]
    ])
    KDIGO_baseline = pd.DataFrame(KDIGO_baseline, columns = ["Black males", "Other males",
                                                             "Black females", "Other females"],
                                  index = ["20-24", "25-29", "30-39", "40-54", "55-65", ">65"])

    # Raises if any AGE is missing, as before.
    demo["AGE"] = demo["AGE"].astype(int)

    # merge DEMO with pat_to_MDRD
    pat_to_MDRD = pat_to_MDRD.merge(demo, on = pat_id_cols, how = "left")

    # Table lookup for non-CKD patients only; CKD patients keep NaN here.
    pat_to_MDRD.loc[pat_to_MDRD.PATID.isin(patid_without_CKD), "BASELINE_MDRD"] = \
        pat_to_MDRD.apply(inverse_MDRD, args = (KDIGO_baseline,), axis = 1)

    # min() skips NaN, so CKD patients without prior history end up with
    # their admission SCr.
    pat_to_MDRD["BASELINE_EST_3"] = pat_to_MDRD[["ADMISSION_SCR", "BASELINE_MDRD"]].min(axis = 1)

    # merge back the table-lookup results
    complete_df = complete_df.merge(pat_to_MDRD[pat_id_cols + ["BASELINE_EST_3"]],
                                    on = pat_id_cols,
                                    how = "left")

    # since BASELINE_NO_MDRD and BASELINE_EST_3 are mutually exclusive, just use min
    complete_df["SERUM_CREAT_BASE"] = complete_df[["BASELINE_NO_MDRD", "BASELINE_EST_3"]].min(axis = 1)

    # drop those still cannot find baseline
    complete_df = complete_df.dropna(subset=['SERUM_CREAT_BASE'])
    return complete_df

In [None]:
# Load the lab, admission, diagnosis and demographic tables for the selected site.
scr, admit, dx, demo = get_scr_baseline(path=path, ext=ext, sep=sep)

In [None]:
# Cache each tidied table next to the site's other outputs.
for frame, suffix in ((scr, '/scr.pkl'), (admit, '/admit.pkl'), (dx, '/dx.pkl'), (demo, '/demo.pkl')):
    frame.to_pickle(pdata+suffix)

In [None]:
# Estimate the per-encounter SCr baseline, cache it, and show the result.
df_base = get_scr_baseline2(path=path, ext=ext, sep=sep, scr=scr, onset=admit, dx=dx, demo=demo)
df_base.to_pickle(pdata+'/scr_baseline.pkl')
df_base

In [None]:
def get_rrt(path, ext, sep, df_y):
    """Find the earliest renal-replacement-therapy (RRT) procedure per encounter.

    Loads AKI_PX through try_load_csv, keeps kidney-transplant and dialysis
    procedure codes, joins them to the admissions in ``df_y`` and returns the
    row with the smallest ``RRT3_SINCE_ADMIT`` for each ENCOUNTERID.

    Parameters
    ----------
    path, ext, sep
        Forwarded unchanged to try_load_csv.
    df_y : pandas.DataFrame
        Admissions with PATID, ENCOUNTERID and ADMIT_DATE. Its ADMIT_DATE
        column is truncated to the calendar day IN PLACE; this side effect is
        retained on purpose because later cells pass the same frame on.

    Returns
    -------
    tuple (rrt_stage, px)
        rrt_stage : one row per ENCOUNTERID with RRT3_ONSET_DATE and
                    RRT3_SINCE_ADMIT (days, float), plus any other ``df_y``
                    columns except ADMIT_DATE.
        px        : the full procedure table with upper-cased column names and
                    an added ENCOUNTERID column.

    Notes
    -----
    Procedures without a usable date (or admissions without ADMIT_DATE) cannot
    be placed relative to admission and are now dropped before the earliest
    event is picked; previously an encounter made up only of such rows broke
    the row lookup.
    """
    px = try_load_csv('AKI_PX', ext, sep, path) # pd.read_csv(path+"AKI_PX."+ext, engine="pyarrow", sep= sep)
    px.columns = px.columns.str.upper()
    # Some exports carry the date-shift expression in the header.
    px = px.rename(columns = {'PX_DATE"+PD.DATE_SHIFT"': 'PX_DATE'})
    px["ENCOUNTERID"] = px["ONSETS_ENCOUNTERID"]

    px_type = px['PX_TYPE']
    px_code = px['PX']
    # ICD-9 procedures appear with either spelling of the type code.
    is_icd9 = px_type.isin(['09', '9'])
    is_icd10 = px_type == '10'
    is_cpt = px_type == 'CH'

    idx_transplant = (
        (is_cpt & px_code.isin(['50300','50320','50323','50325','50327','50328','50329','50340','50360','50365','50370','50380'])) |
        (is_icd9 & px_code.isin(['55.51','55.52','55.53','55.54','55.61','55.69'])) |
        (is_icd10 & px_code.isin(['0TY00Z0','0TY00Z1','0TY00Z2','0TY10Z0','0TY10Z1','0TY10Z2','0TB00ZZ','0TB10ZZ','0TT00ZZ','0TT10ZZ','0TT20ZZ']))
    )

    idx_dialysis = (
        (is_cpt & px_code.isin(['90935', '90937'])) |
        # Non-numeric codes coerce to NaN and therefore never fall in the range.
        (is_cpt & pd.to_numeric(px_code, errors='coerce').between(90940, 90999)) |
        (is_icd9 & px_code.isin(['39.93','39.95','54.98', 'V45.11'])) |
        (is_icd10 & px_code.isin(['5A1D00Z','5A1D60Z','5A1D70Z','5A1D80Z','5A1D90Z', 'Z99.2']))
    )

    # Copy so the column assignments below never write into a view of px.
    rrt_stage = px.loc[idx_transplant | idx_dialysis, ['PATID', 'ENCOUNTERID', 'PX_DATE']].copy()
    rrt_stage.columns = ['PATID', 'ENCOUNTERID', 'RRT3_ONSET_DATE']
    rrt_stage['RRT3_ONSET_DATE'] = pd.to_datetime(pd.to_datetime(rrt_stage['RRT3_ONSET_DATE']).dt.date)

    # Deliberate in-place normalisation of the caller's frame (see docstring).
    df_y['ADMIT_DATE'] = pd.to_datetime(pd.to_datetime(df_y['ADMIT_DATE']).dt.date)

    rrt_stage = rrt_stage.merge(df_y, on=['PATID', 'ENCOUNTERID'], how='inner')
    rrt_stage['RRT3_SINCE_ADMIT'] = (rrt_stage['RRT3_ONSET_DATE']-rrt_stage['ADMIT_DATE']).dt.total_seconds()/(3600*24)

    # Undated events would make idxmin yield a missing label for their group.
    rrt_stage = rrt_stage.dropna(subset=['RRT3_SINCE_ADMIT'])

    # NOTE(review): grouped by ENCOUNTERID alone, unlike the (PATID, ENCOUNTERID)
    # key used elsewhere; kept as-is, assuming ENCOUNTERID is unique per site.
    earliest_idx = rrt_stage[['ENCOUNTERID', 'RRT3_SINCE_ADMIT']].groupby('ENCOUNTERID').idxmin().reset_index()['RRT3_SINCE_ADMIT']
    rrt_stage = rrt_stage.loc[earliest_idx]
    rrt_stage = rrt_stage.drop('ADMIT_DATE', axis = 1)
    return rrt_stage, px

In [None]:
# Extract the earliest RRT event per encounter and cache both outputs.
df_rrt, px = get_rrt(path=path, ext=ext, sep=sep, df_y=admit)
px.to_pickle(pdata+'/px.pkl')
df_rrt.to_pickle(pdata+'/df_rrt.pkl')

In [None]:
#df_rrt = pd.read_pickle(pdata+'/df_rrt.pkl')

In [None]:
def _earliest_stage_event(followup, reached, stage):
    """Return, per encounter, the first row of ``followup`` where ``reached`` is True, as AKI<stage>_* columns."""
    events = followup[reached].groupby(['PATID', 'ENCOUNTERID']).first().reset_index()
    events = events.rename(columns={'SPECIMEN_DATE': f'AKI{stage}_DATE',
                                    'DAYS_SINCE_ADMIT': f'AKI{stage}_SINCE_ADMIT',
                                    'RESULT_NUM': f'AKI{stage}_SCR'})
    return events[['PATID', 'ENCOUNTERID',
                   f'AKI{stage}_DATE', f'AKI{stage}_SINCE_ADMIT', f'AKI{stage}_SCR']]


def read_onset_new(df_onset, df_y, df_rrt, df_base, part = 'either'):
    """Derive AKI onset and stage per encounter from serial SCr results.

    A result qualifies as stage-1 AKI when, on or after admission
    (``DAYS_SINCE_ADMIT >= 0``), it is

    * ``'7d'``     : at least 1.5 x the encounter baseline, or
    * ``'2d'``     : at least 0.3 above the rolling 2-day minimum, or
    * ``'both'``   : both of the above, or
    * ``'either'`` : any of the above.

    Among the qualifying results, the first one is the onset; the first
    qualifying result from onset onward reaching 2 x baseline marks stage 2,
    and 3 x baseline or a value of 4 or more marks stage 3. An RRT event on or
    after onset also marks stage 3 and replaces the SCr-based stage-3 date when
    it is earlier (or when there is none).

    Parameters
    ----------
    df_onset : pandas.DataFrame
        SCr results with PATID, ENCOUNTERID, SPECIMEN_DATE (datetime, sorted
        within each encounter), DAYS_SINCE_ADMIT and RESULT_NUM.
    df_y : pandas.DataFrame
        Admissions with PATID, ENCOUNTERID and ADMIT_DATE.
    df_rrt : pandas.DataFrame
        RRT events with PATID, ENCOUNTERID, RRT3_ONSET_DATE, RRT3_SINCE_ADMIT.
    df_base : pandas.DataFrame
        Baselines with PATID, ENCOUNTERID and SERUM_CREAT_BASE.
    part : {'2d', '7d', 'both', 'either'}
        Which stage-1 criterion to apply.

    Returns
    -------
    pandas.DataFrame
        One row per encounter with an onset: ADMIT_DATE, ONSET_DATE,
        AKI1/2/3_SINCE_ADMIT, SCR_ONSET, SCR_BASELINE, SCR_REFERENCE, AKI1_7D,
        AKI1_2D, FLAG (stage 2 or 3 reached), ONSET_SINCE_ADMIT and AKI_STAGE.

    Raises
    ------
    ValueError
        If ``part`` is not one of the four supported values (previously this
        surfaced later as an unrelated KeyError).
    """
    valid_parts = ('2d', '7d', 'both', 'either')
    if part not in valid_parts:
        raise ValueError(f"part must be one of {valid_parts}, got {part!r}")

    key_cols = ['PATID', 'ENCOUNTERID']
    labs = df_onset.copy()
    admissions = df_y.copy()

    # Attach the per-encounter baseline.
    baseline = df_base[key_cols + ['SERUM_CREAT_BASE']].drop_duplicates()
    baseline.columns = key_cols + ['RESULT_NUM_BASE_7d']
    labs = labs.merge(baseline, on = key_cols, how='left')

    # Attach the rolling 2-day minimum (window is time-based on SPECIMEN_DATE
    # and includes the current result).
    rolling_min = labs[key_cols + ['SPECIMEN_DATE', 'RESULT_NUM']].groupby(key_cols).rolling('2d', on='SPECIMEN_DATE').min().reset_index()
    rolling_min = rolling_min[key_cols + ['SPECIMEN_DATE', 'RESULT_NUM']]
    rolling_min.columns = key_cols + ['SPECIMEN_DATE', 'RESULT_NUM_BASE_2d']
    labs = labs.merge(rolling_min, on = key_cols + ['SPECIMEN_DATE'], how='left')

    # 0.3 increase in 48 hours
    # 1.5 increase in 7 days
    after_admit = labs['DAYS_SINCE_ADMIT'] >= 0
    labs['AKI1.5'] = (labs['RESULT_NUM'] >= 1.5*labs['RESULT_NUM_BASE_7d']) & after_admit
    labs['AKI0.3'] = (labs['RESULT_NUM'] - labs['RESULT_NUM_BASE_2d'] >= 0.3) & after_admit

    criteria = {
        '2d': labs['AKI0.3'],
        '7d': labs['AKI1.5'],
        'both': labs['AKI0.3'] & labs['AKI1.5'],
        'either': labs['AKI0.3'] | labs['AKI1.5'],
    }
    # Every result meeting the chosen criterion, oldest first; stage 2/3 are
    # searched within this same set.
    qualifying = labs[criteria[part]].sort_values('SPECIMEN_DATE')

    # Stage 1: first qualifying result per encounter.
    aki1 = qualifying.groupby(key_cols).first().reset_index()
    aki1 = aki1.rename(columns={'DAYS_SINCE_ADMIT': 'AKI1_SINCE_ADMIT',
                                'SPECIMEN_DATE': 'AKI1_DATE',
                                'RESULT_NUM': 'AKI1_SCR',
                                'RESULT_NUM_BASE_7d': 'SCR_BASELINE',
                                'RESULT_NUM_BASE_2d': 'SCR_REFERENCE',
                                'AKI1.5': 'AKI1_7D',
                                'AKI0.3': 'AKI1_2D'})
    aki1 = aki1[key_cols + ['SCR_BASELINE', 'SCR_REFERENCE', 'AKI1_DATE', 'AKI1_SCR',
                            'AKI1_SINCE_ADMIT', 'AKI1_7D', 'AKI1_2D']]

    # Qualifying results from onset onward, with the onset columns alongside.
    followup = aki1.merge(qualifying, on=key_cols, how='left')
    followup = followup[followup['SPECIMEN_DATE'] >= followup['AKI1_DATE']]

    # Stage 2: >= 2.0 x baseline.
    aki2 = _earliest_stage_event(followup, followup['RESULT_NUM'] >= 2*followup['SCR_BASELINE'], 2)

    # Stage 3 by SCr: >= 3.0 x baseline, or an absolute value of 4 or more.
    aki3 = _earliest_stage_event(
        followup,
        (followup['RESULT_NUM'] >= 3*followup['SCR_BASELINE']) | (followup['RESULT_NUM'] >= 4),
        3)

    # Stage 3 by initiation of RRT on or after onset.
    rrt = df_rrt.merge(aki1[key_cols + ['AKI1_DATE']], on=key_cols, how='left')
    rrt = rrt[rrt['RRT3_ONSET_DATE'] >= rrt['AKI1_DATE']]
    aki3b = aki3.merge(rrt, on = key_cols, how = 'outer')
    cond_rrt = (aki3b['RRT3_SINCE_ADMIT'] < aki3b['AKI3_SINCE_ADMIT']) | \
               (aki3b['AKI3_SINCE_ADMIT'].isna() & aki3b['RRT3_SINCE_ADMIT'].notna())
    # AKI3_SCR is left untouched when RRT takes over the stage-3 date.
    aki3b.loc[cond_rrt, 'AKI3_SINCE_ADMIT'] = aki3b.loc[cond_rrt, 'RRT3_SINCE_ADMIT']
    aki3b.loc[cond_rrt, 'AKI3_DATE'] = aki3b.loc[cond_rrt, 'RRT3_ONSET_DATE']
    aki3_all = aki3b[key_cols + ['AKI3_DATE', 'AKI3_SINCE_ADMIT', 'AKI3_SCR']]

    onset = aki1.merge(aki2, on=key_cols, how='outer').merge(aki3_all, on=key_cols, how='outer')
    onset = onset.merge(admissions, on=key_cols, how='left')

    onset.columns = onset.columns.str.upper()
    onset['ONSET_DATE'] = onset['AKI1_DATE'].copy()
    onset['SCR_ONSET'] = onset['AKI1_SCR'].copy()

    # Copy so the columns added below are not written into a slice.
    onset = onset[["PATID","ENCOUNTERID", "ADMIT_DATE",
                   'ONSET_DATE', "AKI1_SINCE_ADMIT", "AKI2_SINCE_ADMIT",
                   "AKI3_SINCE_ADMIT", "SCR_ONSET",
                   'SCR_BASELINE',  'SCR_REFERENCE', 'AKI1_7D', 'AKI1_2D']].copy()

    onset['FLAG'] = (onset['AKI2_SINCE_ADMIT'].notnull()) | (onset['AKI3_SINCE_ADMIT'].notnull())
    onset['ONSET_SINCE_ADMIT'] = onset["AKI1_SINCE_ADMIT"].copy()

    # Highest stage reached wins.
    reached_s3 = onset['AKI3_SINCE_ADMIT'].notna()
    reached_s2 = onset['AKI2_SINCE_ADMIT'].notna() & ~reached_s3
    reached_s1 = onset['AKI1_SINCE_ADMIT'].notna() & onset['AKI2_SINCE_ADMIT'].isna() & ~reached_s3

    onset['AKI_STAGE'] = 0
    onset.loc[reached_s3, 'AKI_STAGE'] = 3
    onset.loc[reached_s2, 'AKI_STAGE'] = 2
    onset.loc[reached_s1, 'AKI_STAGE'] = 1

    return onset.drop_duplicates()

In [None]:
# Stage every encounter using the 'either' stage-1 criterion, cache and show the table.
onset = read_onset_new(scr, admit, df_rrt, df_base, part='either')
onset.to_pickle(pdata+'/onset.pkl')
onset

In [None]:
# Reload the onset table from the pickle written by the previous cell, so the
# cells that follow can be run without recomputing it.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this pipeline.
onset = pd.read_pickle(pdata+'/onset.pkl')