In [None]:
# import python packages
import logging
import os
import re
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [None]:
def get_data_file_path(pdatpath, site):
    """Build the input, output and auxiliary directory paths for one site.

    Parameters
    ----------
    pdatpath : str
        Root data directory.
    site : str
        Site name; used as the sub-directory under ``pdatpath``.

    Returns
    -------
    list of str
        ``[input_path, output_path, aux_path]``, each ending in ``'/'``:
        ``<pdatpath>/<site>/``, ``<pdatpath>/<site>/aug24/`` and
        ``<pdatpath>/aux_files/``.

    Side effects
    ------------
    Creates the output directory (and any missing parents) if it does not exist.
    """
    input_path = os.path.join(pdatpath, site) + '/'
    output_path = os.path.join(pdatpath, site, 'aug24') + '/'
    aux_path = os.path.join(pdatpath, 'aux_files') + '/'

    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # raise FileExistsError if another process created the directory in between.
    os.makedirs(output_path, exist_ok=True)

    return [input_path, output_path, aux_path]

In [None]:
def load_onset_data(filepath_lst):
    """Load serum-creatinine labs and encounter dates for one site.

    Reads ``AKI_LAB_SCR.pkl`` and ``AKI_ONSETS.pkl`` from the directory prefix
    ``filepath_lst[0]``, attaches admission/discharge dates to each lab row,
    drops rows with any missing value, and averages results that share the
    same patient, encounter and specimen timestamp.

    Parameters
    ----------
    filepath_lst : list of str
        Path list as returned by ``get_data_file_path``; only element 0
        (the input directory, ending in a separator) is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * labs: columns ``PATID, ENCOUNTERID, SPECIMEN_DATE, DAYS_SINCE_ADMIT,
          ADMIT_DATE, RESULT_NUM``, sorted by patient, encounter and date.
        * encounters: de-duplicated ``ENCOUNTERID, PATID, ADMIT_DATE,
          DISCHARGE_DATE``.
    """
    input_dir = filepath_lst[0]
    key_cols = ['ENCOUNTERID', 'PATID']

    labs = pd.read_pickle(input_dir + 'AKI_LAB_SCR' + '.pkl')
    encounters = pd.read_pickle(input_dir + 'AKI_ONSETS' + '.pkl')

    encounters = encounters[key_cols + ['ADMIT_DATE', 'DISCHARGE_DATE']]
    labs = labs[key_cols + ['SPECIMEN_DATE', 'RESULT_NUM']]

    # Left join keeps every lab row; rows whose encounter is unknown end up
    # with NaN dates and are removed by the dropna that follows.
    labs = labs.merge(encounters, on=key_cols, how='left').dropna()
    labs['DAYS_SINCE_ADMIT'] = (labs['SPECIMEN_DATE'] - labs['ADMIT_DATE']).dt.days

    # Average results sharing the same specimen timestamp within an encounter.
    group_cols = ['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'DAYS_SINCE_ADMIT', 'ADMIT_DATE']
    averaged = (
        labs[group_cols + ['RESULT_NUM']]
        .groupby(group_cols)
        .mean()
        .sort_values(['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE'])
        .reset_index()
    )
    return averaged, encounters.drop_duplicates()

In [None]:
def get_scr_baseline_new(df_scr, df_admit, filepath_lst,  aggfunc_7d = 'last', aggfunc_1y = 'mean', keep_ckd = False):
    """Estimate a baseline serum creatinine (SCr) for each encounter in ``df_scr``.

    The baseline is chosen by a three-tier cascade, each tier taking the
    minimum with the admission SCr (mean of results dated from ADMIT_DATE
    through ADMIT_DATE + 1 day, inclusive):

    1. ``BASELINE_EST_1`` - SCr from the 7 days before admission, aggregated
       per encounter with ``aggfunc_7d``.
    2. ``BASELINE_EST_2`` - SCr from 365.25 to 7 days before admission,
       aggregated with ``aggfunc_1y``; used only when tier 1 is unavailable.
    3. ``BASELINE_EST_3`` - back-calculated value for encounters with neither:
       ``inverse_MDRD`` (table lookup) when no CKD stage 3-5 diagnosis dated on
       or before admission is found, ``inverse_MDRD_raw`` with a stage-specific
       default eGFR otherwise.

    Parameters
    ----------
    df_scr : pandas.DataFrame
        SCr measurements; must contain ENCOUNTERID, PATID, ADMIT_DATE,
        SPECIMEN_DATE and RESULT_NUM.
    df_admit : pandas.DataFrame
        Encounter table; must contain PATID, ENCOUNTERID and ADMIT_DATE.
    filepath_lst : list of str
        ``filepath_lst[0]`` is the directory prefix from which ``AKI_DX.pkl``
        and ``AKI_DEMO.pkl`` are read.
    aggfunc_7d, aggfunc_1y : str or callable
        Aggregations passed to ``GroupBy.agg`` for the 7-day and 1-year windows.
    keep_ckd : bool
        When False, rows flagged CKD345 that have no measured (tier 1/2)
        baseline are removed from the result.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        * The measurement-level frame (one row per SCr result, de-duplicated)
          with the intermediate estimates and the final ``SERUM_CREAT_BASE``;
          rows without a baseline are dropped.
        * ``cohort_table``: counts, arrays of encounter ids and two diagnostic
          frames (``scr_1w_df``, ``scr_1y_df``) describing which tier was used.

    Notes
    -----
    Relies on ``inverse_MDRD`` and ``inverse_MDRD_raw`` being defined in the
    notebook namespace when this function is called.
    """
    cohort_table = dict()
    
    # load & process dx data
    dx = pd.read_pickle(filepath_lst[0]+'AKI_DX.pkl') 
    
    # keep only the diagnosis columns that this site's extract actually provides
    existing_columns = [col for col in ['PATID', 'ENCOUNTERID', 'DX', 'DX_DATE', 'DX_TYPE', 'DAYS_SINCE_ADMIT']
                        if col in dx.columns]
    dx = dx[existing_columns]
    # restrict diagnoses to the encounters in df_admit and attach their admission date
    dx = df_admit[['PATID', 'ENCOUNTERID', 'ADMIT_DATE']].merge(dx, on = ['PATID', 'ENCOUNTERID'], how = 'inner')

    # derive DAYS_SINCE_ADMIT when the extract does not ship it
    if 'DAYS_SINCE_ADMIT' not in dx.columns:
        dx['DAYS_SINCE_ADMIT'] = (dx['DX_DATE']-dx['ADMIT_DATE']).dt.days
        

    # normalise code columns to strings; '09' and '9' are treated as the same DX_TYPE
    dx['DX'] = dx['DX'].astype(str)
    dx['DX_TYPE'] = dx['DX_TYPE'].astype(str)
    dx['DX_TYPE'] = dx['DX_TYPE'].replace('09', '9')
    
    # load & process demo data
    demo = pd.read_pickle(filepath_lst[0]+'AKI_DEMO'+'.pkl')  
    demo['MALE'] = demo['SEX'] == 'M'

    # NOTE(review): race codes '05' / '03' are presumably the site coding for
    # White / Black - confirm against the data dictionary.
    demo['RACE_WHITE'] = demo['RACE'] == '05'
    demo['RACE_BLACK'] = demo['RACE'] == '03'
    demo = demo[['PATID', 'ENCOUNTERID', 'AGE', 'MALE', 'RACE_WHITE', 'RACE_BLACK']]
    demo = demo.drop_duplicates()
    
    # estimate SCr Baseline
    pat_id_cols = ['PATID', 'ENCOUNTERID']
    complete_df = df_scr[['ENCOUNTERID', 'PATID', 'ADMIT_DATE', 'SPECIMEN_DATE', 'RESULT_NUM']]
 
    # 1. Tier 1: min of the 7-day pre-admission SCr and the admission SCr.
    # Admission SCr = results with ADMIT_DATE <= SPECIMEN_DATE <= ADMIT_DATE + 1 day.
    admission_SCr = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE) & \
                                (complete_df.SPECIMEN_DATE <= (complete_df.ADMIT_DATE + pd.Timedelta(days=1)))].copy()

    # Admission SCr is the mean of all results in that window
    admission_SCr = admission_SCr.groupby(pat_id_cols)['RESULT_NUM'].mean().reset_index()

    admission_SCr.rename(columns = {'RESULT_NUM': 'ADMISSION_SCR'}, inplace = True)

    # merge the ADMISSION_SCR back to the main frame
    complete_df = complete_df.merge(admission_SCr, 
                                    on = pat_id_cols,
                                    how = 'left')

    # SCr within 7 days prior to admission: ADMIT_DATE - 7d <= SPECIMEN_DATE < ADMIT_DATE
    one_week_prior_admission = complete_df[(complete_df.SPECIMEN_DATE >= complete_df.ADMIT_DATE - pd.Timedelta(days=7)) & \
                                           (complete_df.SPECIMEN_DATE < complete_df.ADMIT_DATE)].copy()
    # chronological order matters for order-dependent aggregations such as 'last'
    one_week_prior_admission = one_week_prior_admission.sort_values(by = ['PATID', 'ENCOUNTERID','SPECIMEN_DATE'])
    
    # record the timing of every 7-day-window measurement for later inspection
    scr_1w_time = one_week_prior_admission.copy() #.groupby(pat_id_cols).last().reset_index()
    scr_1w_time['days_before_admit']  = (scr_1w_time['ADMIT_DATE']  -  scr_1w_time['SPECIMEN_DATE']).dt.days
    cohort_table['scr_1w_df'] = scr_1w_time
    
    # collapse the window to one value per encounter
    one_week_prior_admission = one_week_prior_admission.groupby(pat_id_cols)['RESULT_NUM'].agg(aggfunc_7d).reset_index()

        
    one_week_prior_admission = one_week_prior_admission.rename(columns = {'RESULT_NUM': 'ONE_WEEK_SCR'})

    complete_df = complete_df.merge(one_week_prior_admission, 
                                    on = pat_id_cols,
                                    how = 'left')

    # take the min between one week SCr and admission SCr (only where a
    # one-week value exists; nanmin ignores a missing admission SCr)
    complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), 'BASELINE_EST_1'] = \
                np.nanmin(complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), ['ONE_WEEK_SCR','ADMISSION_SCR']], axis = 1)

    # encounter-level view: drop the per-measurement columns and de-duplicate
    complete_dfe = complete_df.drop(['SPECIMEN_DATE', 'RESULT_NUM'],axis=1).drop_duplicates()
    
    
    # cohort bookkeeping for tier 1 (counts are over encounter-level rows)
    cohort_table['ALL_ENCOUNTERS'] = len(complete_dfe[['PATID','ENCOUNTERID']].drop_duplicates())
    cohort_table['ALL_PATIENTS'] = complete_dfe.PATID.nunique()
    cohort_table['ADMISSION_SCR_YES'] = complete_dfe.ADMISSION_SCR.notna().sum()
    cohort_table['ADMISSION_SCR_NO'] = complete_dfe.ADMISSION_SCR.isna().sum()
    cohort_table['ONE_WEEK_SCR_YES'] = complete_dfe.ONE_WEEK_SCR.notna().sum()
    cohort_table['ONE_WEEK_SCR_NO'] = complete_dfe.ONE_WEEK_SCR.isna().sum()    
    cohort_table['ADMISSION_AND_1W_SCR'] = (complete_dfe.ADMISSION_SCR.notna() & complete_dfe.ONE_WEEK_SCR.notna()).sum()
    cohort_table['ADMISSION_AND_1W_SCR_MIN'] = (complete_dfe.BASELINE_EST_1.notna()).sum()
    cohort_table['ADMISSION_OR_1W_SCR'] = (complete_dfe.ADMISSION_SCR.notna() | complete_dfe.ONE_WEEK_SCR.notna()).sum()
    # encounters whose tier-1 baseline equals the one-week value vs. the admission value
    cohort_table['ONE_WEEK_SCR_ENC'] = complete_dfe[(complete_dfe.ONE_WEEK_SCR.notna() & (complete_dfe['ONE_WEEK_SCR']==complete_dfe['BASELINE_EST_1']))]['ENCOUNTERID'].unique()
    cohort_table['ADMISSION_SCR_1W_ENC'] = complete_dfe[(complete_dfe.ONE_WEEK_SCR.notna() & (complete_dfe['ONE_WEEK_SCR']!=complete_dfe['BASELINE_EST_1']))]['ENCOUNTERID'].unique()
        
    ori_num_unique_combinations = df_scr.groupby(['PATID', 'ENCOUNTERID']).ngroups
    # share of encounters that do not have past 7-day records
    # NOTE(review): criterion1_missing_rate is computed but never used or returned.
    criterion1_no_missing = complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), :].groupby(pat_id_cols).ngroups
    criterion1_missing_rate = 1 - criterion1_no_missing / ori_num_unique_combinations

    # 2. Tier 2: aggregate of SCr from 365.25 days up to (not including) 7 days before admission
    one_year_prior_admission = complete_df[(complete_df.SPECIMEN_DATE < (complete_df.ADMIT_DATE - pd.Timedelta(days=7))) & \
                                     (complete_df.SPECIMEN_DATE >= (complete_df.ADMIT_DATE - pd.Timedelta(days=365.25)))].copy()
    one_year_prior_admission = one_year_prior_admission.sort_values(by = ['PATID', 'ENCOUNTERID','SPECIMEN_DATE'])
    
    # record measurement timing, restricted to encounters that lack a one-week value
    # NOTE(review): scr_1y_time is a filtered selection without .copy(); the
    # assignment below can emit SettingWithCopyWarning on some pandas versions.
    scr_1y_time = one_year_prior_admission[one_year_prior_admission.ENCOUNTERID.isin(complete_dfe[complete_dfe.ONE_WEEK_SCR.isna()].ENCOUNTERID.unique())] #.groupby(pat_id_cols).last().reset_index()
    scr_1y_time['days_before_admit']  = (scr_1y_time['ADMIT_DATE']  -  scr_1y_time['SPECIMEN_DATE']).dt.days
    cohort_table['scr_1y_df'] = scr_1y_time
    
    
    one_year_prior_admission = one_year_prior_admission.loc[:, pat_id_cols + ['RESULT_NUM']]
    

    # collapse the window to one value per encounter
    one_year_prior_admission = one_year_prior_admission.groupby(pat_id_cols)['RESULT_NUM'].agg(aggfunc_1y).reset_index()

    
    one_year_prior_admission.rename(columns = {'RESULT_NUM': 'ONE_YEAR_SCR'}, inplace = True)
    
    complete_df = complete_df.merge(one_year_prior_admission, 
                                    on = pat_id_cols,
                                    how = 'left')
    
    # take the min between one year SCr and admission SCr
    complete_df.loc[complete_df.ONE_YEAR_SCR.notna(), 'BASELINE_EST_2'] = \
                np.nanmin(complete_df.loc[complete_df.ONE_YEAR_SCR.notna(), ['ONE_YEAR_SCR', 'ADMISSION_SCR']], axis = 1)

    # priority 1: 7day SCr, priority 2: one year SCr
    complete_df['BASELINE_NO_INVERT'] = \
                np.where(complete_df['BASELINE_EST_1'].isna(), complete_df['BASELINE_EST_2'], complete_df['BASELINE_EST_1'])

    # refresh the encounter-level view and record tier-2 bookkeeping; every
    # tier-2 count is limited to encounters without a one-week value
    complete_dfe = complete_df.drop(['SPECIMEN_DATE', 'RESULT_NUM'],axis=1).drop_duplicates()
    cohort_table['ONE_YEAR_SCR_YES'] = (complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.notna()).sum()
    cohort_table['ONE_YEAR_SCR_NO'] = (complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.isna()).sum()
    
    cohort_table['ADMISSION_AND_1Y_SCR'] = (complete_dfe.ADMISSION_SCR.notna() & (complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.notna())).sum()
    cohort_table['ADMISSION_AND_1Y_SCR_MIN'] = (complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.BASELINE_EST_2.notna()).sum()
    
    cohort_table['ADMISSION_OR_1Y_SCR'] = (complete_dfe.ADMISSION_SCR.notna() | (complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.notna())).sum()
    
    cohort_table['ONE_YEAR_SCR_ENC'] = complete_dfe[(complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.notna() & (complete_dfe['ONE_YEAR_SCR']==complete_dfe['BASELINE_EST_2']))]['ENCOUNTERID'].unique()
    cohort_table['ADMISSION_SCR_1Y_ENC'] = complete_dfe[(complete_dfe.ONE_WEEK_SCR.isna() & complete_dfe.ONE_YEAR_SCR.notna() & (complete_dfe['ONE_YEAR_SCR']!=complete_dfe['BASELINE_EST_2']))]['ENCOUNTERID'].unique()
    
    # 3. Tier 3: back-calculate a baseline for encounters with no measured baseline
    # get those encounters for which we need to impute baseline
    pat_to_invert = complete_df.loc[complete_df.BASELINE_NO_INVERT.isna(), pat_id_cols+['ADMIT_DATE', 'ADMISSION_SCR']]
    
    cohort_table['MDRD_TO_INVERT'] = pat_to_invert['ENCOUNTERID'].nunique()
    cohort_table['MDRD_TO_INVERT_ENC'] = pat_to_invert['ENCOUNTERID'].unique()
    # one row per (PATID, ENCOUNTERID)
    pat_to_invert.drop_duplicates(subset=pat_id_cols, keep='first', inplace = True)


    # attach ALL of the patient's diagnoses (joined on PATID only, so diagnoses
    # recorded under other encounters of the same patient are included)
    pat_dx = pat_to_invert.merge(dx.drop(['ENCOUNTERID', 'ADMIT_DATE'], axis = 1), 
                                              on = 'PATID', 
                                              how = 'left')

    # calculate DX_DATE when it is missing
    # NOTE(review): DAYS_SINCE_ADMIT comes from the diagnosis' own encounter,
    # but it is added to the ADMIT_DATE of the encounter being imputed -
    # confirm this is intended when the two encounters differ.
    pat_dx.loc[pat_dx.DX_DATE.isna(), 'DX_DATE'] = \
            pat_dx.loc[pat_dx.DX_DATE.isna(), 'ADMIT_DATE'] + \
            pd.to_timedelta(pat_dx.loc[pat_dx.DX_DATE.isna(), 'DAYS_SINCE_ADMIT'], unit='D')

#     # check patients that do not have DX in the database
#     print(pat_dx.DX_DATE.isna().mean())

    # keep only diagnoses dated on or before admission
    pat_dx = pat_dx[pat_dx.DX_DATE <= pat_dx.ADMIT_DATE]   #pat_dx.DAYS_SINCE_ADMIT <= 0
    # outer merge re-adds encounters that lost every diagnosis row in the filter above
    pat_dx = pat_dx.merge(pat_to_invert[['PATID', 'ENCOUNTERID']], 
                          on = ['PATID', 'ENCOUNTERID'], 
                          how = 'outer')
    
    
    # get the default eGFR for inversion: default to 75 for non-CKD patients, average of eGFR from staging criteria for CKD patients
    pat_dx['DFLT_eGFR'] = 75

    # stage-specific defaults: 45 (stage 3), 22.5 (stage 4), 7.5 (stage 5)
    pat_dx.loc[pat_dx['DX'].isin(['585.3', 'N18.3']), 'DFLT_eGFR'] = 90/2
    pat_dx.loc[pat_dx['DX'].isin(['585.4', 'N18.4']), 'DFLT_eGFR'] = 45/2
    pat_dx.loc[pat_dx['DX'].isin(['585.5', 'N18.5']), 'DFLT_eGFR'] = 15/2

    # the lowest default eGFR (most advanced stage found) wins per encounter
    pat_def_egfr = pat_dx.groupby(pat_id_cols)['DFLT_eGFR'].min().reset_index()
    
    
    # encounters in df_admit that carry a CKD 3 / 4 / 5 code themselves
    cohort_table['ALL_CKD3_ENC'] = dx[(dx['DX'].isin(['585.3', 'N18.3']) )  & (dx['ENCOUNTERID'].isin(df_admit['ENCOUNTERID'].unique()))]['ENCOUNTERID'].unique()
    cohort_table['ALL_CKD4_ENC'] = dx[(dx['DX'].isin(['585.4', 'N18.4']) )  & (dx['ENCOUNTERID'].isin(df_admit['ENCOUNTERID'].unique()))]['ENCOUNTERID'].unique()
    cohort_table['ALL_CKD5_ENC'] = dx[(dx['DX'].isin(['585.5', 'N18.5']) )  & (dx['ENCOUNTERID'].isin(df_admit['ENCOUNTERID'].unique()))]['ENCOUNTERID'].unique()

    
    cohort_table['MDRD_NOCKD'] = (pat_def_egfr['DFLT_eGFR'] == 75).sum()
    
    cohort_table['ADMISSION_OR_MDRD_NOCKD'] = (complete_dfe.ADMISSION_SCR.notna() | complete_dfe['ENCOUNTERID'].isin(pat_def_egfr[pat_def_egfr['DFLT_eGFR'] == 75]['ENCOUNTERID'].unique())).sum()
    
    cohort_table['MDRD_CKD3']  = (pat_def_egfr['DFLT_eGFR'] == 90/2).sum()
    cohort_table['MDRD_CKD4']  = (pat_def_egfr['DFLT_eGFR'] == 45/2).sum()
    cohort_table['MDRD_CKD5']  = (pat_def_egfr['DFLT_eGFR'] == 15/2).sum()
        
    pat_to_invert= pat_to_invert.merge(pat_def_egfr, on = pat_id_cols, how = 'left')
    pat_to_invert['DFLT_eGFR'] = pat_to_invert['DFLT_eGFR'].fillna(75)

    # any default other than 75 means a CKD stage 3-5 code was found
    pat_to_invert['CKD345'] = pat_to_invert['DFLT_eGFR'] != 75
    
    #pat_to_invert.DFLT_eGFR.value_counts()

    # Backcalculation for patients
    # merge DEMO with pat_to_invert
    pat_to_invert = pat_to_invert.merge(demo, on = pat_id_cols, how = 'left')
    
    # look-up table used by inverse_MDRD (rows: age bracket, columns: sex/race group)
    KDIGO_baseline = np.array([
        [1.5, 1.3, 1.2, 1.0],
        [1.5, 1.2, 1.1, 1.0],
        [1.4, 1.2, 1.1, 0.9],
        [1.3, 1.1, 1.0, 0.9],
        [1.3, 1.1, 1.0, 0.8],
        [1.2, 1.0, 0.9, 0.8]
    ])
    KDIGO_baseline = pd.DataFrame(KDIGO_baseline, columns = ["Black males", "Other males",
                                                            "Black females", "Other females"],
                                 index = ["20-24", "25-29", "30-39", "40-54", "55-65", ">65"])    
    
    
    # estimate SCr: table lookup for non-CKD rows, equation inversion for CKD 3-5 rows
    pat_to_invert.loc[~pat_to_invert['CKD345'], 'BASELINE_INVERT'] = pat_to_invert.loc[~pat_to_invert['CKD345'], :].apply(inverse_MDRD, args = (KDIGO_baseline,), axis = 1) 
    pat_to_invert.loc[pat_to_invert['CKD345'], 'BASELINE_INVERT'] = pat_to_invert.loc[pat_to_invert['CKD345'], :].apply(inverse_MDRD_raw, axis = 1) 

    # take minimum of inverted SCr and admission SCr
    # (np.min on a DataFrame presumably dispatches to DataFrame.min, which skips NaN - verify)
    pat_to_invert['BASELINE_EST_3'] = np.min(pat_to_invert[['ADMISSION_SCR', 'BASELINE_INVERT']], axis = 1)

    # non-CKD encounters whose tier-3 baseline is the inverted value vs. the admission value
    cohort_table['MDRD_ENC'] = pat_to_invert[(~pat_to_invert['CKD345']) & (pat_to_invert['BASELINE_EST_3'] == pat_to_invert['BASELINE_INVERT'])]['ENCOUNTERID'].unique()
    cohort_table['ADMISSION_SCR_MDRD_ENC'] = pat_to_invert[(~pat_to_invert['CKD345']) & (pat_to_invert['BASELINE_EST_3'] != pat_to_invert['BASELINE_INVERT'])]['ENCOUNTERID'].unique()
        
    
    # merge back the computation results
    complete_df = complete_df.merge(pat_to_invert[pat_id_cols + ['BASELINE_EST_3', 'CKD345']], 
                                    on = pat_id_cols,
                                    how = 'left')

    # final baseline: row-wise min over the measured and the back-calculated estimate
    complete_df['SERUM_CREAT_BASE'] = np.min(complete_df[['BASELINE_NO_INVERT', 'BASELINE_EST_3']], axis = 1)

    # optionally exclude CKD 3-5 encounters that only have a back-calculated baseline
    if not keep_ckd:
        complete_df = complete_df[~(complete_df['CKD345'] & complete_df['BASELINE_NO_INVERT'].isna())]

    complete_df = complete_df.drop('CKD345', axis=1)
        
    # drop those still cannot find baseline
    complete_df = complete_df.dropna(subset=['SERUM_CREAT_BASE'])

    return complete_df.drop_duplicates(), cohort_table

In [None]:
def inverse_MDRD_raw(row):
    """Solve the MDRD equation for serum creatinine given a target eGFR.

    Parameters
    ----------
    row : mapping
        Must provide ``DFLT_eGFR`` (target eGFR), ``AGE``, ``MALE`` and
        ``RACE_BLACK`` (the last two are read by truthiness).

    Returns
    -------
    float
        The serum creatinine that yields ``DFLT_eGFR`` under
        ``eGFR = 175 * Scr**-1.154 * age**-0.203 * sex * race``.
    """
    target_egfr = row['DFLT_eGFR']
    years = row['AGE']

    # MDRD multipliers: 0.742 for non-male rows, 1.212 for Black rows
    sex_coeff = 1.0 if row['MALE'] else 0.742
    race_coeff = 1.212 if row['RACE_BLACK'] else 1.0

    # Everything in the MDRD equation except the Scr**-1.154 term
    non_scr_part = 175 * (years ** -0.203) * sex_coeff * race_coeff
    return (target_egfr / non_scr_part) ** (1 / -1.154)

In [None]:
# Look-up table of baseline serum creatinine by age bracket (rows) and
# sex/race group (columns); consumed by inverse_MDRD.
# NOTE(review): values are presumably in mg/dL and taken from the KDIGO
# guideline table - confirm the source.
KDIGO_baseline = pd.DataFrame(
    data=[
        [1.5, 1.3, 1.2, 1.0],
        [1.5, 1.2, 1.1, 1.0],
        [1.4, 1.2, 1.1, 0.9],
        [1.3, 1.1, 1.0, 0.9],
        [1.3, 1.1, 1.0, 0.8],
        [1.2, 1.0, 0.9, 0.8],
    ],
    index=["20-24", "25-29", "30-39", "40-54", "55-65", ">65"],
    columns=["Black males", "Other males", "Black females", "Other females"],
)

def inverse_MDRD(row, KDIGO_baseline):
    """Look up a baseline serum creatinine from the age / sex / race table.

    Parameters
    ----------
    row : mapping
        Must provide ``AGE``, ``MALE`` and ``RACE_BLACK``. ``MALE`` and
        ``RACE_BLACK`` are read by truthiness, so a NaN in either counts as
        True (NOTE(review): rows with missing demographics are therefore
        treated as Black males - confirm this is acceptable).
    KDIGO_baseline : pandas.DataFrame
        Table indexed by the age brackets ``"20-24", "25-29", "30-39",
        "40-54", "55-65", ">65"`` with columns ``"Black males"``,
        ``"Other males"``, ``"Black females"``, ``"Other females"``.

    Returns
    -------
    float
        The table entry for the row's bracket and group, or NaN when ``AGE``
        is missing.

    Notes
    -----
    Brackets are contiguous: ages below 25 use ``"20-24"`` and ages above 65
    use ``">65"``. Whole-number ages map exactly as the previous integer-only
    ranges did; fractional ages (e.g. 24.5, 29.5) used to fall between the
    ranges and silently return ``None``.
    """
    age = row["AGE"]
    # NaN compares False with everything, so handle a missing age explicitly
    # instead of letting it drop into the last bracket.
    if pd.isna(age):
        return np.nan

    sex = "males" if row["MALE"] else "females"
    race = "Black" if row["RACE_BLACK"] else "Other"
    column = f"{race} {sex}"

    if age < 25:
        bracket = "20-24"
    elif age < 30:
        bracket = "25-29"
    elif age < 40:
        bracket = "30-39"
    elif age < 55:
        bracket = "40-54"
    elif age <= 65:
        bracket = "55-65"
    else:
        bracket = ">65"

    return KDIGO_baseline.loc[bracket, column]

In [None]:
def get_onset_stats(onsets, site_list=None):
    """Summarise cohort and AKI-onset counts per site.

    Parameters
    ----------
    onsets : mapping
        ``onsets[site]`` must be indexable with element 0 the onset DataFrame
        (needs ``PATID`` and ``AKI_STAGE``) and element 2 the cohort table
        dict produced by ``get_scr_baseline_new``.
    site_list : sequence of str, optional
        Sites to summarise, in output order. Defaults to the ten-site list
        used by this notebook. Pass ``list(onsets)`` to summarise only the
        sites that were actually processed.

    Returns
    -------
    pandas.DataFrame
        One row per site (index = site name). Column ``'c'`` is a consistency
        flag: True when ALL_ENCOUNTERS equals the sum of the encounters
        assigned to the 1-week, 1-year, non-CKD MDRD and CKD 3-5 MDRD tiers.

    Raises
    ------
    KeyError
        If a requested site is missing from ``onsets``.
    """
    if site_list is None:
        site_list = ['KUMC', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UTHSCSA', 'UTSW','UofU', 'UPITT', 'MCRI']

    per_site = []
    for site in site_list:
        cohort_tbl = onsets[site][2]
        onset_df = onsets[site][0]

        ckd345_count = cohort_tbl['MDRD_CKD3'] + cohort_tbl['MDRD_CKD4'] + cohort_tbl['MDRD_CKD5']
        stage = onset_df['AKI_STAGE']

        stats = {
            # Cohort table
            'c': cohort_tbl['ALL_ENCOUNTERS'] == (cohort_tbl['ONE_WEEK_SCR_YES'] +
                                                  cohort_tbl['ONE_YEAR_SCR_YES'] +
                                                  cohort_tbl['MDRD_NOCKD'] +
                                                  ckd345_count),
            'all_enc_count': cohort_tbl['ALL_ENCOUNTERS'],
            'all_pat_count': cohort_tbl['ALL_PATIENTS'],
            'scr_1w': cohort_tbl['ONE_WEEK_SCR_YES'],
            'scr_1w_no': cohort_tbl['ONE_WEEK_SCR_NO'],
            'scr_1y': cohort_tbl['ONE_YEAR_SCR_YES'],
            'scr_1y_no': cohort_tbl['ONE_YEAR_SCR_NO'],
            'mdrd_nockd': cohort_tbl['MDRD_NOCKD'],
            'ckd345': ckd345_count,
            # Onset table
            'onset_enc_count': len(onset_df),
            'onset_pat_count': onset_df['PATID'].nunique(),
            'aki1_enc_count': len(onset_df[stage == 1]),
            'aki2_enc_count': len(onset_df[stage == 2]),
            'aki3_enc_count': len(onset_df[stage == 3]),
        }
        per_site.append(pd.Series(stats, name=site))

    return pd.DataFrame(per_site)

In [None]:
#df_stats.apply('sum', axis = 0)
#df_stats.apply('sum', axis = 0)/640568

In [None]:
def eGFR_MDRD(df, scr_label):
    """Compute eGFR with the 4-variable MDRD equation.

    ``eGFR = 175 * Scr**-1.154 * AGE**-0.203 * 0.742 [if not MALE] * 1.212 [if RACE_BLACK]``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AGE``, ``MALE``, ``RACE_BLACK`` and the column named by
        ``scr_label``.
    scr_label : str
        Name of the serum-creatinine column (the original comment assumes
        mg/dL; no unit conversion is applied).

    Returns
    -------
    pandas.Series
        eGFR per row.
    """
    creatinine_term = df[scr_label] ** -1.154
    age_term = df['AGE'] ** -0.203

    # Multipliers applied to non-male and to Black rows respectively
    sex_factor = np.where(df['MALE'], 1, 0.742)
    black_factor = np.where(df['RACE_BLACK'], 1.212, 1)

    return 175 * creatinine_term * age_term * sex_factor * black_factor


def eGFR_CKDEPI09(df, scr_label):
    """Compute eGFR with the CKD-EPI 2009 creatinine equation.

    ``eGFR = 141 * min(Scr/k, 1)**a * max(Scr/k, 1)**-1.209 * 0.993**AGE
             * 1.018 [if not MALE] * 1.159 [if RACE_BLACK]``
    with ``k = 0.9, a = -0.411`` for males and ``k = 0.7, a = -0.329`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AGE``, ``MALE``, ``RACE_BLACK`` and the column named by
        ``scr_label``.
    scr_label : str
        Name of the serum-creatinine column.

    Returns
    -------
    pandas.Series
        eGFR per row; NaN where the creatinine value is missing.
    """
    # Determine kappa and alpha based on 'MALE' column
    kappa = np.where(df['MALE'], 0.9, 0.7)
    alpha = np.where(df['MALE'], -0.411, -0.329)

    # Coefficients for gender and race
    gender_coeff = np.where(df['MALE'], 1, 1.018)
    race_coeff = np.where(df['RACE_BLACK'], 1.159, 1)

    scr_over_kappa = np.asarray(df[scr_label] / kappa, dtype=float)
    # np.minimum / np.maximum propagate NaN. The previous np.where(cond, ..., 1)
    # form turned a missing creatinine into 1 for both terms (NaN comparisons
    # are False), silently producing a finite eGFR.
    min_term = np.minimum(scr_over_kappa, 1) ** alpha
    max_term = np.maximum(scr_over_kappa, 1) ** (-1.209)
    age_term = 0.993 ** df['AGE']

    # eGFR calculation
    eGFR = 141 * min_term * max_term * age_term * gender_coeff * race_coeff
    return eGFR
    

def eGFR_CKDEPI21(df, scr_label):
    """Compute eGFR with the race-free CKD-EPI 2021 creatinine equation.

    ``eGFR = 142 * min(Scr/k, 1)**a * max(Scr/k, 1)**-1.200 * 0.9938**AGE
             * 1.012 [if not MALE]``
    with ``k = 0.9, a = -0.302`` for males and ``k = 0.7, a = -0.241`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AGE``, ``MALE`` and the column named by ``scr_label``.
    scr_label : str
        Name of the serum-creatinine column.

    Returns
    -------
    pandas.Series
        eGFR per row; NaN where the creatinine value is missing.
    """
    # Determine kappa and alpha based on 'MALE' column
    kappa = np.where(df['MALE'], 0.9, 0.7)
    alpha = np.where(df['MALE'], -0.302, -0.241)

    # Coefficient for gender
    gender_coeff = np.where(df['MALE'], 1, 1.012)

    scr_over_kappa = np.asarray(df[scr_label] / kappa, dtype=float)
    # np.minimum / np.maximum propagate NaN. The previous np.where(cond, ..., 1)
    # form turned a missing creatinine into 1 for both terms (NaN comparisons
    # are False), silently producing a finite eGFR.
    min_term = np.minimum(scr_over_kappa, 1) ** alpha
    max_term = np.maximum(scr_over_kappa, 1) ** (-1.200)
    age_term = 0.9938 ** df['AGE']

    # eGFR calculation
    eGFR = 142 * min_term * max_term * age_term * gender_coeff
    return eGFR
    
def ckd_staging(df, egfr_label):
    """Bin eGFR values into six ordinal categories (0-5).

    Mapping: ``>=90 -> 0``, ``[60, 90) -> 1``, ``[45, 60) -> 2``,
    ``[30, 45) -> 3``, ``[15, 30) -> 4``, ``<15 -> 5``; NaN stays NaN.

    NOTE(review): these labels are ordinal codes, not the KDIGO G1-G5 names
    (e.g. eGFR >= 90 is coded 0) - confirm downstream users expect this.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the eGFR column.
    egfr_label : str
        Name of the eGFR column.

    Returns
    -------
    numpy.ndarray
        Float array of category codes, one per row.
    """
    egfr = df[egfr_label]

    # np.select returns the choice of the FIRST true condition, so descending
    # lower bounds are enough to express the half-open intervals.
    lower_bounds = [90, 60, 45, 30, 15]
    conditions = [egfr >= bound for bound in lower_bounds]
    conditions.append(egfr < 15)

    stage_codes = [0, 1, 2, 3, 4, 5]
    return np.select(conditions, stage_codes, default=np.nan)

In [None]:
def get_rrt(df_admit, filepath_lst):
    px = pd.read_pickle(filepath_lst[0]+'AKI_PX.pkl')   

    idx_transplant = np.logical_or(np.logical_or(
                           np.logical_and(px['PX_TYPE']=='CH',px['PX'].isin(['50300','50320','50323','50325','50327','50328','50329','50340','50360','50365','50370','50380'])),
                           np.logical_and(px['PX_TYPE']=='09',px['PX'].isin(['55.51','55.52','55.53','55.54','55.61','55.69']))),np.logical_or(
                           np.logical_and(px['PX_TYPE']=='9',px['PX'].isin(['55.51','55.52','55.53','55.54','55.61','55.69'])),                       
                           np.logical_and(px['PX_TYPE']=='10',px['PX'].isin(['0TY00Z0','0TY00Z1','0TY00Z2','0TY10Z0','0TY10Z1','0TY10Z2','0TB00ZZ','0TB10ZZ','0TT00ZZ','0TT10ZZ','0TT20ZZ']))))

    idx_dialysis =(((px['PX_TYPE']=='CH') & (px['PX'].isin(['90935', '90937']))) |  
                  ((px['PX_TYPE']=='CH') & (pd.to_numeric(px['PX'], errors='coerce').between(90940, 90999))) |   
                  ((px['PX_TYPE']=='9') & ((px['PX'].isin(['39.93','39.95','54.98', 'V45.11'])))) | 
                  ((px['PX_TYPE']=='09') & (px['PX'].isin(['39.93','39.95','54.98', 'V45.11']))) |  
                  ((px['PX_TYPE']=='10') & (px['PX'].isin(['5A1D00Z','5A1D60Z','5A1D70Z','5A1D80Z','5A1D90Z', 'Z99.2'])))) 
 
    rrt_stage =  px[idx_transplant | idx_dialysis] 

    rrt_stage = rrt_stage[['PATID','ENCOUNTERID','PX_DATE']]
    rrt_stage.columns = ['PATID','ENCOUNTERID','RRT_ONSET_DATE']

    rrt_stage = rrt_stage.merge(df_admit, on=['PATID', 'ENCOUNTERID'], how='inner')
    rrt_stage['RRT_SINCE_ADMIT'] = (rrt_stage['RRT_ONSET_DATE']-rrt_stage['ADMIT_DATE']).dt.total_seconds()/(3600*24)
    rrt_stage = rrt_stage.loc[rrt_stage[['ENCOUNTERID', 'RRT_SINCE_ADMIT']].groupby('ENCOUNTERID').idxmin().reset_index()['RRT_SINCE_ADMIT']]
    rrt_stage.drop('ADMIT_DATE', axis = 1, inplace = True)
    return rrt_stage

In [None]:
def determine_initial_aki_stage(row):
    """Return the AKI stage that was reached first during an encounter.

    Parameters
    ----------
    row : mapping
        Must provide ``AKI1_SINCE_ADMIT``, ``AKI2_SINCE_ADMIT`` and
        ``AKI3_SINCE_ADMIT`` (days since admission, NaN when not reached).

    Returns
    -------
    int or float
        The stage (1, 2 or 3) with the smallest onset day; when several stages
        share that day the highest one is returned. NaN when no stage has an
        onset day.
    """
    stage_columns = (
        (1, 'AKI1_SINCE_ADMIT'),
        (2, 'AKI2_SINCE_ADMIT'),
        (3, 'AKI3_SINCE_ADMIT'),
    )

    # Collect (stage, onset day) for the stages that actually occurred
    reached = []
    for stage, column in stage_columns:
        onset_day = row[column]
        if not pd.isnull(onset_day):
            reached.append((stage, onset_day))

    if len(reached) == 0:
        return np.nan

    earliest_day = min(day for _, day in reached)
    # Ties on the earliest day resolve to the most severe stage
    return max(stage for stage, day in reached if day == earliest_day)


def get_aki_onset(df_scr, df_admit, df_rrt, df_baseline, aki_criteria = 'either'):
    """Detect AKI onset and stage for every encounter that has a baseline SCr.

    A measurement taken on or after the admission day qualifies as AKI when

    * ``'7d'``     - it is >= 1.5 x the encounter's baseline (flag ``AKI1.5``),
    * ``'2d'``     - it exceeds the rolling 2-day minimum by >= 0.3 (flag ``AKI0.3``),
    * ``'both'``   - both flags hold,
    * ``'either'`` - at least one flag holds.

    The earliest qualifying measurement (highest value on ties) defines onset.
    Stage 2 is the first qualifying measurement >= 2 x baseline; stage 3 is the
    first >= 3 x baseline or >= 4, or an RRT procedure on/after onset, whichever
    comes first.

    Parameters
    ----------
    df_scr : pandas.DataFrame
        SCr measurements with PATID, ENCOUNTERID, SPECIMEN_DATE, RESULT_NUM and
        DAYS_SINCE_ADMIT.
    df_admit : pandas.DataFrame
        Encounter table with PATID, ENCOUNTERID, ADMIT_DATE, DISCHARGE_DATE.
    df_rrt : pandas.DataFrame
        RRT table with PATID, ENCOUNTERID, RRT_ONSET_DATE, RRT_SINCE_ADMIT.
    df_baseline : pandas.DataFrame
        Baseline table with PATID, ENCOUNTERID, SERUM_CREAT_BASE; encounters
        absent from it are excluded.
    aki_criteria : {'either', '2d', '7d', 'both'}
        Which onset rule to apply.

    Returns
    -------
    pandas.DataFrame
        De-duplicated onset table with PATID, ENCOUNTERID, ADMIT_DATE,
        DISCHARGE_DATE, ONSET_DATE, AKI1/2/3_SINCE_ADMIT, DISCHARGE_SINCE_ONSET,
        SCR_ONSET, SCR_BASELINE, SCR_REFERENCE, AKI1_7D, AKI1_2D, FLAG,
        ONSET_SINCE_ADMIT, AKI_STAGE (highest stage reached) and AKI_INIT_STG
        (stage reached first).

    Raises
    ------
    ValueError
        If ``aki_criteria`` is not one of the four supported values.
        (Previously this surfaced much later as an UnboundLocalError.)

    Notes
    -----
    Calls ``determine_initial_aki_stage``, which must be defined in the
    notebook namespace.
    """
    if aki_criteria not in ('2d', '7d', 'both', 'either'):
        raise ValueError(
            f"aki_criteria must be one of '2d', '7d', 'both', 'either'; got {aki_criteria!r}"
        )

    xxx = df_scr.copy()
    yyy = df_admit.copy()

    # Keep only encounters that have a baseline (this also removes the CKD
    # encounters that the baseline step excluded).
    valid_combinations = df_baseline[['ENCOUNTERID', 'PATID']].drop_duplicates()
    xxx = xxx.merge(valid_combinations, on=['ENCOUNTERID', 'PATID'], how='inner')
    yyy = yyy.merge(valid_combinations, on=['ENCOUNTERID', 'PATID'], how='inner')

    # Attach the encounter-level baseline to every measurement.
    zzz = df_baseline[['PATID', 'ENCOUNTERID', 'SERUM_CREAT_BASE']].drop_duplicates()
    zzz.columns= ['PATID', 'ENCOUNTERID',  'RESULT_NUM_BASE_7d']
    xxx = xxx.merge(zzz, on = ['PATID', 'ENCOUNTERID'], how='left')

    # Rolling 2-day minimum per encounter: the reference for the 0.3 rule.
    zzz2 = xxx[['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'RESULT_NUM']].groupby(['PATID', 'ENCOUNTERID']).rolling('2d', on='SPECIMEN_DATE').min().reset_index()
    zzz2 = zzz2[['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'RESULT_NUM']]
    zzz2.columns= ['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE', 'RESULT_NUM_BASE_2d']
    xxx = xxx.merge(zzz2, on = ['PATID', 'ENCOUNTERID', 'SPECIMEN_DATE'], how='left')

    # Check condition for AKI1
    # >= 1.5 x baseline, on or after the admission day
    xxx['AKI1.5'] = (xxx['RESULT_NUM']>=1.5*xxx['RESULT_NUM_BASE_7d']) & (xxx['DAYS_SINCE_ADMIT']>=0)
    # >= 0.3 above the rolling 2-day minimum, on or after the admission day
    xxx['AKI0.3'] = (xxx['RESULT_NUM']-xxx['RESULT_NUM_BASE_2d']>=0.3) & (xxx['DAYS_SINCE_ADMIT']>=0)

    # Select the qualifying measurements for the requested rule. The four rules
    # differ only in this mask; everything after it is shared.
    if aki_criteria == '2d':
        qualifies = xxx['AKI0.3']
    elif aki_criteria == '7d':
        qualifies = xxx['AKI1.5']
    elif aki_criteria == 'both':
        qualifies = xxx['AKI0.3'] & xxx['AKI1.5']
    else:  # 'either'
        qualifies = xxx['AKI0.3'] | xxx['AKI1.5']

    xxx = xxx[qualifies]
    # Earliest first; on the same timestamp the highest value first.
    xxx = xxx.sort_values(['SPECIMEN_DATE', 'RESULT_NUM'], ascending=[True, False])
    # All qualifying measurements, reused below for stage 2 / 3 detection.
    xxx_backup = xxx.copy()
    xxx = xxx.groupby(['PATID', 'ENCOUNTERID']).first().reset_index()
    xxx['RESULT_NUM_BASE'] = xxx['RESULT_NUM_BASE_7d']

    xxx['AKI1_SINCE_ADMIT'] = xxx['DAYS_SINCE_ADMIT'].copy()
    xxx['AKI1_DATE'] = xxx['SPECIMEN_DATE'].copy()
    xxx['AKI1_SCR'] = xxx['RESULT_NUM'].copy()
    xxx['SCR_BASELINE'] = xxx['RESULT_NUM_BASE'].copy()
    xxx['SCR_REFERENCE'] = xxx['RESULT_NUM_BASE_2d'].copy()
    xxx['AKI1_7D'] = xxx['AKI1.5'].copy()
    xxx['AKI1_2D'] = xxx['AKI0.3'].copy()
    xxx = xxx[['PATID', 'ENCOUNTERID', 'SCR_BASELINE', 'SCR_REFERENCE',  'AKI1_DATE', 'AKI1_SCR', 'AKI1_SINCE_ADMIT', 'AKI1_7D', 'AKI1_2D']]

    # Check condition for AKI2: first qualifying measurement >= 2.0 x baseline
    aki2 = xxx.merge(xxx_backup, on=['PATID', 'ENCOUNTERID'], how='left')
    aki2 = aki2[aki2['SPECIMEN_DATE']>=aki2['AKI1_DATE']]
    aki2 = aki2[aki2['RESULT_NUM']>=2*aki2['SCR_BASELINE']]
    aki2 = aki2.groupby(['PATID', 'ENCOUNTERID']).first().reset_index()
    aki2['AKI2_SINCE_ADMIT'] = aki2['DAYS_SINCE_ADMIT'].copy()
    aki2['AKI2_DATE'] = aki2['SPECIMEN_DATE'].copy()
    aki2['AKI2_SCR'] = aki2['RESULT_NUM'].copy()
    aki2 = aki2[['PATID', 'ENCOUNTERID', 'AKI2_DATE', 'AKI2_SCR', 'AKI2_SINCE_ADMIT']]

    # Check condition for AKI3: first qualifying measurement >= 3.0 x baseline or >= 4
    aki3 = xxx.merge(xxx_backup, on=['PATID', 'ENCOUNTERID'], how='left')
    aki3 = aki3[aki3['SPECIMEN_DATE']>=aki3['AKI1_DATE']]
    aki3 = aki3[(aki3['RESULT_NUM']>=3*aki3['SCR_BASELINE']) | (aki3['RESULT_NUM']>=4)]
    aki3 = aki3.groupby(['PATID', 'ENCOUNTERID']).first().reset_index()
    aki3['AKI3_SINCE_ADMIT'] = aki3['DAYS_SINCE_ADMIT'].copy()
    aki3['AKI3_DATE'] = aki3['SPECIMEN_DATE'].copy()
    aki3['AKI3_SCR'] = aki3['RESULT_NUM'].copy()
    aki3 = aki3[['PATID', 'ENCOUNTERID', 'AKI3_DATE', 'AKI3_SINCE_ADMIT', 'AKI3_SCR']]

    # Check condition for AKI3: initiation of RRT on or after AKI onset
    rrt = df_rrt.merge(xxx[['PATID', 'ENCOUNTERID', 'AKI1_DATE']], on=['PATID', 'ENCOUNTERID'], how='left')
    rrt = rrt[rrt['RRT_ONSET_DATE'] >= rrt['AKI1_DATE']]
    aki3b =  aki3.merge(rrt, on = ['PATID', 'ENCOUNTERID'], how = 'outer')
    # RRT defines stage 3 when it precedes the SCr-based stage 3 or when there is none
    cond_rrt = (aki3b['RRT_SINCE_ADMIT'] < aki3b['AKI3_SINCE_ADMIT']) | (aki3b['AKI3_SINCE_ADMIT'].isna() & aki3b['RRT_SINCE_ADMIT'].notna())
    aki3b.loc[cond_rrt, 'AKI3_SINCE_ADMIT'] = aki3b.loc[cond_rrt, 'RRT_SINCE_ADMIT']
    aki3b.loc[cond_rrt, 'AKI3_DATE'] = aki3b.loc[cond_rrt, 'RRT_ONSET_DATE']

    aki3_all = aki3b[['PATID', 'ENCOUNTERID', 'AKI3_DATE', 'AKI3_SINCE_ADMIT', 'AKI3_SCR']]

    # Merge AKI staging information
    onset = xxx.merge(aki2, on=['PATID', 'ENCOUNTERID'], how='outer').merge(aki3_all, on=['PATID', 'ENCOUNTERID'], how='outer')
    onset = onset.merge(yyy, on=['PATID', 'ENCOUNTERID'], how='left')

    onset.columns = onset.columns.str.upper()
    onset['ONSET_DATE'] = onset['AKI1_DATE'].copy()
    onset['SCR_ONSET'] = onset['AKI1_SCR'].copy()

    onset['DISCHARGE_SINCE_ONSET'] = (onset['DISCHARGE_DATE'] - onset['ONSET_DATE']).dt.days

    onset = onset[['PATID','ENCOUNTERID', 'ADMIT_DATE', 'DISCHARGE_DATE',
                   'ONSET_DATE', 'AKI1_SINCE_ADMIT', 'AKI2_SINCE_ADMIT',
                   'AKI3_SINCE_ADMIT',  'DISCHARGE_SINCE_ONSET','SCR_ONSET',
                   'SCR_BASELINE',  'SCR_REFERENCE', 'AKI1_7D', 'AKI1_2D']]

    # FLAG: the encounter progressed beyond stage 1
    onset['FLAG'] = (onset['AKI2_SINCE_ADMIT'].notna()) | (onset['AKI3_SINCE_ADMIT'].notna())
    onset['ONSET_SINCE_ADMIT'] = onset['AKI1_SINCE_ADMIT'].copy()

    # AKI_STAGE is the HIGHEST stage reached during the encounter (0 = none)
    onset['AKI_STAGE'] = 0
    filter_aki3 = onset['AKI3_SINCE_ADMIT'].notna()
    filter_aki2 = onset['AKI2_SINCE_ADMIT'].notna() & onset['AKI3_SINCE_ADMIT'].isna()
    filter_aki1 = onset['AKI1_SINCE_ADMIT'].notna() & onset['AKI2_SINCE_ADMIT'].isna() & onset['AKI3_SINCE_ADMIT'].isna()

    onset.loc[filter_aki3, 'AKI_STAGE'] = 3
    onset.loc[filter_aki2, 'AKI_STAGE'] = 2
    onset.loc[filter_aki1, 'AKI_STAGE'] = 1

    # AKI_INIT_STG is the stage reached FIRST (smallest onset day, ties -> higher stage)
    onset['AKI_INIT_STG'] = onset.apply(determine_initial_aki_stage, axis=1)

    return onset.drop_duplicates()

In [None]:
%%time
def process_onset_and_recovery_new(filepath_lst, aggfunc_7d, aggfunc_1y, keep_ckd):
    """Run the full AKI-onset pipeline for one site.

    Parameters
    ----------
    filepath_lst : list of str
        ``[input_path, output_path, aux_path]`` as returned by
        ``get_data_file_path``. This argument is now used as given; the
        function previously ignored it and rebuilt the paths from the
        notebook-level globals ``pdatpath`` and ``site``.
    aggfunc_7d, aggfunc_1y : str or callable
        Aggregations for the 7-day and 1-year baseline windows, forwarded to
        ``get_scr_baseline_new``.
    keep_ckd : bool
        Forwarded to ``get_scr_baseline_new``.

    Returns
    -------
    tuple
        ``(onset, df_baseline, cohort_table)``.
    """
    df_scr, df_admit = load_onset_data(filepath_lst)
    df_baseline, cohort_table = get_scr_baseline_new(df_scr, df_admit, filepath_lst, aggfunc_7d, aggfunc_1y, keep_ckd)
    df_rrt = get_rrt(df_admit, filepath_lst)

    onset = get_aki_onset(df_scr, df_admit, df_rrt, df_baseline)
    #onset.to_pickle(filepath_lst[1] + 'onset.pkl')

    # The input directory is <root>/<site>/, so its last component is the site
    # name; deriving it here removes the dependency on a global `site`.
    site_label = os.path.basename(os.path.normpath(filepath_lst[0]))
    print(f"Finish generating AKI onset and recovery for {site_label}.", flush = True)
    return onset, df_baseline, cohort_table


# Root directory holding one sub-directory per site.
# NOTE(review): absolute, user-specific path - consider making it configurable.
pdatpath = '/blue/yonghui.wu/qixu/aki/'
# Initialize logging
logging.basicConfig(filename='processing_errors.log', level=logging.ERROR)

site_list = ['KUMC', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UTHSCSA', 'UTSW','UofU', 'UPITT', 'MCRI'] #['KUMC', 'UMHC', 'UNMC', 'UTHSCSA']  

# site -> (onset, df_baseline, cohort_table); sites that fail are absent
onset_dict4 = {}

for site in site_list:
    try:
        filepath_lst = get_data_file_path(pdatpath, site)
        onset_dict4[site] = process_onset_and_recovery_new(filepath_lst, 
                                            aggfunc_7d = 'last', 
                                            aggfunc_1y = 'mean', 
                                            keep_ckd = False)
    except Exception as e:
        # Best-effort: one failing site must not stop the others.
        # logging.exception records the full traceback, not just the message.
        logging.exception(f"Error processing site {site}: {e}")
        print(f"Error processing site {site}. Check log for details.")