In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
tqdm.pandas()
pd.set_option('display.max_columns', None)
from A_Data_Merge_eric import read_AKI_ONSETS, read_AKI_LAB_SCR, read_AKI_DEMO, read_AKI_DX
#%store -r ct_names
#%store -r raw_path

# Root directory that holds the raw per-site extracts.
raw_path = '/home/hoyinchan/blue/Data/data2021raw/'
# Participating sites, one entry each. 'UNMC' used to appear twice in this list;
# the repeated entry only caused the same site to be requested a second time.
# NOTE(review): confirm the duplicate was not standing in for a different site.
ct_names = ['MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UTHSCSA', 'KUMC', 'UTSW', 'UPITT']

In [None]:
# Pick up edits made to A_Data_Merge_eric.py without restarting the kernel.
# importlib.reload needs the module object itself, so the module name must be
# bound with a plain `import` first (the `from ... import` form does not bind it).
import importlib
import A_Data_Merge_eric
importlib.reload(A_Data_Merge_eric)
# Re-bind the reader functions so they point at the freshly reloaded definitions.
from A_Data_Merge_eric import read_AKI_ONSETS, read_AKI_LAB_SCR, read_AKI_DEMO, read_AKI_DX

In [None]:
def format_and_concat(mode, dfs):
    """Format every per-site frame for `mode`, then stack them into one frame.

    Args:
        mode: passed through to format_dfs ('ONSET' or 'SCR' are handled there).
        dfs: mapping of site key -> DataFrame.

    Returns:
        A single DataFrame: the formatted site frames concatenated row-wise.
    """
    return concat_dfs_to_one(format_dfs(mode, dfs))

In [None]:
def format_dfs(mode, dfs):
    """Normalise ID/date columns of each site frame and tag it with its site.

    The frames are modified in place; the returned dict holds the same objects.

    Args:
        mode: 'ONSET' parses the admission/discharge/AKI onset dates, 'SCR'
            parses SPECIMEN_DATE. Any other value leaves date columns untouched.
        dfs: mapping of site key -> DataFrame with PATID and ONSETS_ENCOUNTERID.

    Returns:
        dict mapping each site key to its formatted DataFrame.
    """
    # Date columns to parse for each supported mode.
    date_columns_by_mode = {
        'ONSET': ['ADMIT_DATE', 'DISCHARGE_DATE', 'AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET'],
        'SCR': ['SPECIMEN_DATE'],
    }

    formatted = {}
    for center, frame in tqdm(dfs.items()):
        # IDs are compared/merged as strings everywhere downstream.
        for id_col in ('PATID', 'ONSETS_ENCOUNTERID'):
            frame[id_col] = frame[id_col].astype(str)

        for date_col in date_columns_by_mode.get(mode, []):
            frame[date_col] = pd.to_datetime(frame[date_col], format='mixed')

        # UMHC specimen timestamps are truncated to the calendar date.
        if mode == 'SCR' and center == 'UMHC':
            frame['SPECIMEN_DATE'] = pd.to_datetime(frame['SPECIMEN_DATE'].dt.date)

        # The 'KUMC_ORCALE' key is reported under the site name 'KUMC'.
        frame['CENTER_NAME'] = 'KUMC' if center == 'KUMC_ORCALE' else center

        formatted[center] = frame
    return formatted

In [None]:
def concat_dfs_to_one(dfs):
    """Stack all frames in `dfs` row-wise, in the mapping's iteration order.

    Args:
        dfs: mapping of key -> DataFrame; keys are ignored.

    Returns:
        One DataFrame with the rows of every input frame (original indexes kept).
    """
    return pd.concat(list(dfs.values()), axis=0)

In [None]:
# Load the per-site AKI onset tables and serum-creatinine (SCr) lab tables.
# Both results are consumed via .items() in format_dfs, i.e. as site-key -> DataFrame mappings.
ONSETS_dfs = read_AKI_ONSETS(ct_names,raw_path)
SCR_dfs = read_AKI_LAB_SCR(ct_names, raw_path)

In [None]:
ONSETs = format_and_concat('ONSET', ONSETS_dfs)
SCRs = format_and_concat('SCR', SCR_dfs)

In [None]:
ori_num_unique_combinations = ONSETs.groupby(['CENTER_NAME', 'PATID', 'ONSETS_ENCOUNTERID']).ngroups

In [None]:
ori_num_unique_combinations

In [None]:
# Attach every SCr measurement to its encounter. The left join keeps encounters
# that have no SCr rows at all; their lab columns come back missing.
complete_df = pd.merge(
    ONSETs,
    SCRs[['ONSETS_ENCOUNTERID', 'PATID', 'SPECIMEN_DATE', 'RESULT_NUM', 'CENTER_NAME']],
    how='left',
    on=['CENTER_NAME', 'PATID', 'ONSETS_ENCOUNTERID'],
)

In [None]:
complete_df

# Estimate SCr Baseline

In [None]:
pat_id_cols = ["CENTER_NAME", "PATID",  "ONSETS_ENCOUNTERID"]

1. Minimum of (a) the lowest SCr in the 7 days before admission and (b) the mean SCr within 24 hours after admission

In [None]:
# SCr drawn from the admission date up to one day later (both bounds inclusive);
# these rows are averaged per encounter in the next cell.
admission_SCr = complete_df[
    complete_df.SPECIMEN_DATE.between(complete_df.ADMIT_DATE,
                                      complete_df.ADMIT_DATE + pd.Timedelta(days=1))
].copy()

In [None]:
# Admission SCr is the mean of all the SCr within 24h admission
admission_SCr = admission_SCr.groupby(pat_id_cols)['RESULT_NUM'].mean().reset_index()

In [None]:
admission_SCr.rename(columns = {"RESULT_NUM": "ADMISSION_SCR"}, inplace = True)

In [None]:
#merge the ADMISSION_SCR back to the main frame
complete_df = complete_df.merge(admission_SCr, 
                                on = pat_id_cols,
                               how = "left")

In [None]:
# patient do not have admission SCr
complete_df.ADMISSION_SCR.isna().mean()

In [None]:
# SCr drawn in the 7 days before admission: [ADMIT_DATE - 7d, ADMIT_DATE),
# i.e. the admission date itself is excluded.
one_week_prior_admission = complete_df[
    complete_df.SPECIMEN_DATE.between(complete_df.ADMIT_DATE - pd.Timedelta(days=7),
                                      complete_df.ADMIT_DATE,
                                      inclusive='left')
].copy()

In [None]:
one_week_prior_admission = one_week_prior_admission.groupby(pat_id_cols)['RESULT_NUM'].min().reset_index()

In [None]:
one_week_prior_admission.rename(columns = {"RESULT_NUM": "ONE_WEEK_SCR"}, inplace = True)

In [None]:
complete_df = complete_df.merge(one_week_prior_admission, 
                                on = pat_id_cols,
                               how = "left")

In [None]:
# Baseline estimate 1: row-wise minimum of the 7-day SCr and the admission SCr,
# filled only for rows that have a 7-day SCr. A missing admission SCr is skipped
# by min(), so the 7-day value is used on its own in that case.
complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), "BASELINE_EST_1"] = (
    complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), ["ONE_WEEK_SCR", "ADMISSION_SCR"]].min(axis=1)
)

In [None]:
#get the percentage of encounters that do not have past 7-day records
criterion1_no_missing = complete_df.loc[complete_df.ONE_WEEK_SCR.notna(), :].groupby(pat_id_cols).ngroups
criterion1_missing_rate = 1 - criterion1_no_missing / ori_num_unique_combinations

In [None]:
ori_num_unique_combinations

In [None]:
ori_num_unique_combinations - criterion1_no_missing

In [None]:
criterion1_missing_rate

2. pre-admission 365-7 day mean

In [None]:
# SCr drawn between one year and one week before admission:
# [ADMIT_DATE - 365d, ADMIT_DATE - 7d), so the 7-day window used above is excluded.
one_year_prior_admission = complete_df[
    complete_df.SPECIMEN_DATE.between(complete_df.ADMIT_DATE - pd.Timedelta(days=365),
                                      complete_df.ADMIT_DATE - pd.Timedelta(days=7),
                                      inclusive='left')
].copy()

In [None]:
one_year_prior_admission = one_year_prior_admission.loc[:, pat_id_cols + ["RESULT_NUM"]]

In [None]:
# Collapse to one value per encounter: the minimum SCr in the 365-to-7-day window.
# NOTE(review): the section heading calls this criterion a "mean" but the code takes
# the min — confirm which aggregation is intended.
one_year_prior_admission = one_year_prior_admission.groupby(pat_id_cols)['RESULT_NUM'].min().reset_index()

In [None]:
one_year_prior_admission.rename(columns = {"RESULT_NUM": "ONE_YEAR_SCR"}, inplace = True)

In [None]:
complete_df = complete_df.merge(one_year_prior_admission, 
                                on = pat_id_cols,
                               how = "left")

In [None]:
# Baseline estimate 2: row-wise minimum of the one-year SCr and the admission SCr,
# filled only for rows that have a one-year SCr (missing admission SCr is skipped).
complete_df.loc[complete_df.ONE_YEAR_SCR.notna(), "BASELINE_EST_2"] = (
    complete_df.loc[complete_df.ONE_YEAR_SCR.notna(), ["ONE_YEAR_SCR", "ADMISSION_SCR"]].min(axis=1)
)

Check how many encounters are still missing a baseline

In [None]:
# Priority 1: the 7-day estimate; priority 2: the one-year estimate, used only
# where the 7-day estimate is missing.
complete_df["BASELINE_NO_MDRD"] = complete_df['BASELINE_EST_1'].fillna(complete_df['BASELINE_EST_2'])

In [None]:
#get the percentage of encounters that do not have any 1-year records
criterion12_no_missing = complete_df.loc[complete_df.BASELINE_NO_MDRD.notna(), :].groupby(pat_id_cols).ngroups
criterion12_missing_rate = 1 - criterion12_no_missing / ori_num_unique_combinations

In [None]:
ori_num_unique_combinations - criterion12_no_missing

In [None]:
criterion12_missing_rate

3. MDRD to estimate baseline (only for non-CKD patients)

In [None]:
# Encounters with neither a 7-day nor a one-year baseline fall back to MDRD.
# Keep a single row per encounter (the first one encountered).
pat_to_MDRD = (
    complete_df
    .loc[complete_df.BASELINE_NO_MDRD.isna(), pat_id_cols + ["ADMIT_DATE", "ADMISSION_SCR"]]
    .drop_duplicates(subset=pat_id_cols, keep="first")
)

In [None]:
pat_to_MDRD

In [None]:
#read in DX to check CKD
AKI_DX_dfs = read_AKI_DX(ct_names, raw_path)

In [None]:
def concat_and_format_dfs(dictionary):
    """Tag each site frame with CENTER_NAME and stack them row-wise.

    The CENTER_NAME column is written onto the input frames themselves.

    Args:
        dictionary: mapping of site key -> DataFrame.

    Returns:
        One DataFrame containing the rows of every site frame.
    """
    tagged_frames = []
    for ct_name, frame in tqdm(dictionary.items()):
        # The 'KUMC_ORCALE' key is reported under the site name 'KUMC'.
        frame["CENTER_NAME"] = "KUMC" if ct_name == "KUMC_ORCALE" else ct_name
        tagged_frames.append(frame)

    return pd.concat(tagged_frames, axis = 0)

In [None]:
DX_complete = concat_and_format_dfs(AKI_DX_dfs)

In [None]:
DX_complete

In [None]:
# Cast the ID and code columns to strings in one pass, then fold the
# zero-padded DX_TYPE '09' into '9' so both spellings compare equal.
DX_complete = DX_complete.astype({"PATID": str, "DX": str, "DX_TYPE": str})
DX_complete["DX_TYPE"] = DX_complete["DX_TYPE"].replace("09", "9")

In [None]:
pat_to_MDRD["PATID"] = pat_to_MDRD["PATID"].astype(str)
pat_to_MDRD_check_CKD = pat_to_MDRD.merge(DX_complete, 
                                          on = ["CENTER_NAME", "PATID"], 
                                          how = "left")

In [None]:
pat_to_MDRD_check_CKD.DX_DATE.isna().mean()

In [None]:
# Where DX_DATE is missing, rebuild it as ADMIT_DATE + DAYS_SINCE_ADMIT (in days).
# Rows with no matching diagnosis have no DAYS_SINCE_ADMIT either, so they stay missing.
# NOTE(review): DX_DATE is never passed through pd.to_datetime in this notebook; this
# assumes read_AKI_DX already returns it as datetime — confirm, otherwise the column
# mixes types and the later comparison with ADMIT_DATE is unreliable.
pat_to_MDRD_check_CKD.loc[pat_to_MDRD_check_CKD.DX_DATE.isna(), "DX_DATE"] = \
pat_to_MDRD_check_CKD.loc[pat_to_MDRD_check_CKD.DX_DATE.isna(), "ADMIT_DATE"] + \
pd.to_timedelta(pat_to_MDRD_check_CKD.loc[pat_to_MDRD_check_CKD.DX_DATE.isna(), 'DAYS_SINCE_ADMIT'], unit='D')

In [None]:
#still have patients that do not have DX in the database
pat_to_MDRD_check_CKD.DX_DATE.isna().mean()

In [None]:
# filter out those DX after admission
pat_to_MDRD_check_CKD = pat_to_MDRD_check_CKD[pat_to_MDRD_check_CKD.DX_DATE <= \
                                             pat_to_MDRD_check_CKD.ADMIT_DATE]

In [None]:
assert(pat_to_MDRD_check_CKD.DX_DATE.isna().mean() == 0)

In [None]:
# Diagnosis codes treated as chronic kidney disease (matched exactly against DX).
# "585.6" was added so the 585.x list mirrors the N18.x list, which already
# includes the corresponding ".6" code.
# NOTE(review): matching is exact, so more specific sub-codes (e.g. "N18.30")
# would not be caught — confirm whether such codes occur in the data.
CKD_code = ["585.1","585.2","585.3","585.4","585.5","585.6","585.9",
            "N18.1","N18.2","N18.3","N18.4",
           "N18.5","N18.6","N18.9"]

In [None]:
pat_to_MDRD_with_CKD = pat_to_MDRD_check_CKD[pat_to_MDRD_check_CKD.DX.isin(CKD_code)]

In [None]:
n_CKD_encounter = pat_to_MDRD_with_CKD.groupby(pat_id_cols).ngroups

In [None]:
n_CKD_encounter

In [None]:
CKD_no_history_rate = n_CKD_encounter / ori_num_unique_combinations

In [None]:
CKD_no_history_rate

In [None]:
# PATIDs needing MDRD that have no CKD diagnosis on or before admission.
# NOTE(review): membership is tested on PATID alone, while the DX merge above keys on
# (CENTER_NAME, PATID). If two sites can reuse the same PATID, a CKD patient at one
# site would also exclude the other site's patient — confirm PATIDs are globally unique.
patid_without_CKD = list(pat_to_MDRD.loc[~pat_to_MDRD.PATID.isin(pat_to_MDRD_with_CKD.PATID),
                                         "PATID"].unique())

In [None]:
len(patid_without_CKD)

Apply MDRD to NON-CKD patients

In [None]:
# Estimated baseline SCr lookup table: one row per age bracket, one column per
# race/sex group. inverse_MDRD reads values from it with .loc[age_label, group].
KDIGO_baseline = pd.DataFrame(
    {
        "Black males":   [1.5, 1.5, 1.4, 1.3, 1.3, 1.2],
        "Other males":   [1.3, 1.2, 1.2, 1.1, 1.1, 1.0],
        "Black females": [1.2, 1.1, 1.1, 1.0, 1.0, 0.9],
        "Other females": [1.0, 1.0, 0.9, 0.9, 0.8, 0.8],
    },
    index=["20-24", "25-29", "30-39", "40-54", "55-65", ">65"],
)

In [None]:
KDIGO_baseline

In [None]:
#read in DEMO to get age sex and race
AKI_DEMO_dfs = read_AKI_DEMO(ct_names, raw_path)

In [None]:
# Collect the per-site demographics frames, tagging each with its site name
# (the 'KUMC_ORCALE' key is reported as 'KUMC'). They are concatenated in the next cell.
DEMO_complete = []
for ct_name, demo_df in tqdm(AKI_DEMO_dfs.items()):
    demo_df["CENTER_NAME"] = "KUMC" if ct_name == "KUMC_ORCALE" else ct_name
    DEMO_complete.append(demo_df)

In [None]:
DEMO_complete = pd.concat(DEMO_complete, axis = 0)
#adjust data type
DEMO_complete["ONSETS_ENCOUNTERID"] = \
DEMO_complete["ONSETS_ENCOUNTERID"].astype(str)
DEMO_complete["PATID"] = DEMO_complete["PATID"].astype(str)
DEMO_complete["AGE"] = DEMO_complete["AGE"].astype(int)

In [None]:
pat_to_MDRD

In [None]:
#merge DEMO with pat_to_MRDR
#one patient one row 
pat_to_MDRD = pat_to_MDRD.merge(DEMO_complete, on = pat_id_cols, how = "left")

In [None]:
# Recode SEX values 'UN' and 'NI' to 'F'. inverse_MDRD already treats every value
# other than 'M' as female, so this only makes that choice explicit in the data.
# NOTE(review): this assigns the lower (female) table values to these rows — confirm intended.
pat_to_MDRD['SEX'] = pat_to_MDRD['SEX'].replace(['UN', 'NI'], 'F')

In [None]:
def inverse_MDRD(row, baseline_table=None):
    """Look up the estimated baseline SCr for one patient row.

    The value is read from an age-bracket x race/sex table. Rows are "Black"
    when RACE is "03" or "RACE:black" and "Other" otherwise; they are "males"
    when SEX is "M" and "females" otherwise.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with "AGE", "SEX", "RACE".
        baseline_table: optional DataFrame indexed by the age labels
            "20-24", "25-29", "30-39", "40-54", "55-65", ">65" with columns
            "<Black|Other> <males|females>". Defaults to the notebook-level
            KDIGO_baseline table.

    Returns:
        The table value for the row's age bracket and group, or NaN when AGE
        is missing.
    """
    if baseline_table is None:
        # Resolved at call time so the notebook-level table is the default.
        baseline_table = KDIGO_baseline

    age = row["AGE"]
    if pd.isna(age):
        # No age means no bracket; return an explicit missing value.
        return np.nan

    race_group = "Black" if row["RACE"] in ("03", "RACE:black") else "Other"
    sex_group = "males" if row["SEX"] == "M" else "females"
    column = f"{race_group} {sex_group}"

    # (inclusive upper age bound, row label). Ages below 20 share the youngest
    # bracket, matching the original `age <= 24` test.
    age_brackets = (
        (24, "20-24"),
        (29, "25-29"),
        (39, "30-39"),
        (54, "40-54"),
        (65, "55-65"),
    )
    for upper_bound, label in age_brackets:
        if age <= upper_bound:
            return baseline_table.loc[label, column]
    return baseline_table.loc[">65", column]

In [None]:
# Fill BASELINE_MDRD for non-CKD patients only.
# The lookup is evaluated for every row; the .loc assignment then keeps just the
# values whose PATID is in patid_without_CKD, leaving all other rows missing.
pat_to_MDRD.loc[pat_to_MDRD.PATID.isin(patid_without_CKD), "BASELINE_MDRD"] = pat_to_MDRD.apply(inverse_MDRD, axis = 1)

In [None]:
# Baseline estimate 3: row-wise minimum of admission SCr and the MDRD lookup.
# CKD patients have no BASELINE_MDRD, so min() falls back to their admission SCr.
pat_to_MDRD["BASELINE_EST_3"] = pat_to_MDRD[["ADMISSION_SCR", "BASELINE_MDRD"]].min(axis=1)

In [None]:
#merge back MRDR computation results
complete_df = complete_df.merge(pat_to_MDRD[pat_id_cols + ["BASELINE_EST_3"]], 
                                on = pat_id_cols,
                               how = "left")

In [None]:
# Final baseline. BASELINE_EST_3 is only computed for encounters whose
# BASELINE_NO_MDRD is missing, so at most one of the two is present per row and
# the row-wise min simply picks whichever exists.
complete_df["SERUM_CREAT_BASE"] = complete_df[["BASELINE_NO_MDRD", "BASELINE_EST_3"]].min(axis=1)

In [None]:
#drop those still cannot find baseline
complete_df = complete_df.dropna(subset=['SERUM_CREAT_BASE'])

In [None]:
assert(complete_df.SERUM_CREAT_BASE.isna().mean() == 0)

# Preprocessing

In [None]:
# Keep only SCr measurements taken during the hospitalisation, i.e. from
# ADMIT_DATE through DISCHARGE_DATE inclusive; pre-admission history is dropped.
complete_df_filter = complete_df[
    complete_df.SPECIMEN_DATE.between(complete_df.ADMIT_DATE, complete_df.DISCHARGE_DATE)
].copy()

In [None]:
num_unique_combinations = complete_df_filter.groupby(pat_id_cols).ngroups

In [None]:
num_unique_combinations

In [None]:
# Order rows by encounter and then by specimen date. The later shift()-based
# encounter-change flag and the per-encounter cummax() both rely on this ordering.
sorted_df = complete_df_filter.sort_values(pat_id_cols + ['SPECIMEN_DATE'])
complete_df_filter_sorted = sorted_df.reset_index(drop=True)

In [None]:
# The intermediate baseline columns are no longer needed once SERUM_CREAT_BASE exists.
complete_df_filter_sorted = complete_df_filter_sorted.drop(
    columns=[
        "ADMISSION_SCR", "ONE_WEEK_SCR", "ONE_YEAR_SCR",
        "BASELINE_EST_1", "BASELINE_EST_2", "BASELINE_NO_MDRD",
        "BASELINE_EST_3",
    ]
)

In [None]:
complete_df_filter_sorted

# Find AKI Stage 1

In [None]:
from multiprocessing import Pool, cpu_count

In [None]:
# get the max SCr increment within past 2 days
def calculate_max_diff(group):
    """Add TWO_DAY_MAX_DIFF: the largest SCr rise versus the prior two days.

    For each row, the look-back window holds the rows of the same group whose
    SPECIMEN_DATE is strictly earlier than the row's and at most 2 days earlier.
    The value is max(RESULT_NUM - RESULT_NUM of each window row); it is NaN
    when the window is empty.

    Args:
        group: DataFrame for one encounter with SPECIMEN_DATE (datetime) and
            RESULT_NUM columns.

    Returns:
        A new DataFrame equal to `group` plus a float TWO_DAY_MAX_DIFF column.
        The input frame is not modified.
    """
    specimen_dates = group['SPECIMEN_DATE']
    results = group['RESULT_NUM']
    lookback = pd.Timedelta(days=2)

    max_diffs = []
    for current_date, current_value in zip(specimen_dates, results):
        in_window = (specimen_dates >= current_date - lookback) & (specimen_dates < current_date)
        if in_window.any():
            max_diffs.append((current_value - results[in_window]).max())
        else:
            # np.nan (not pd.NA) keeps the column float64 rather than object dtype.
            max_diffs.append(np.nan)

    # assign() returns a copy, so no write happens on a groupby slice.
    return group.assign(TWO_DAY_MAX_DIFF=max_diffs)

In [None]:
encounter_grouped = complete_df_filter_sorted.groupby(pat_id_cols)
groups = [group for _, group in encounter_grouped]

In [None]:
len(complete_df_filter_sorted)

In [None]:
len(groups)

In [None]:
# Run calculate_max_diff on every encounter group in parallel, one worker per CPU.
# imap yields results in input order, so ret_list lines up with `groups`;
# tqdm wraps the iterator only to show progress.
with Pool(cpu_count()) as p:
    ret_list = list(tqdm(p.imap(calculate_max_diff, groups), 
                         total=len(groups), 
                         desc="Calculating Differences"))

In [None]:
finished_df = pd.concat(ret_list, axis = 0, ignore_index=True)

In [None]:
assert(len(finished_df) == len(complete_df_filter_sorted))

In [None]:
# True when a row has the same ONSETS_ENCOUNTERID as the row before it, so the
# first row of each run of identical IDs is False.
# NOTE(review): only the encounter ID is compared, not CENTER_NAME/PATID — confirm
# the same ID cannot appear back-to-back for different patients or sites.
finished_df['ONSETS_NOT_CHANGE'] = \
finished_df['ONSETS_ENCOUNTERID'] == \
finished_df['ONSETS_ENCOUNTERID'].shift()

In [None]:
#here we get the cumulative max measurements within each group
finished_df['CUMMAX_RESULT_NUM'] = \
finished_df.groupby(pat_id_cols)['RESULT_NUM'].cummax()

In [None]:
finished_df

In [None]:
# AKI stage 1, part 1: absolute SCr increase of at least 0.3 versus a measurement
# in the prior 2 days (rows with no prior measurement compare as False)
condition1 = (finished_df['TWO_DAY_MAX_DIFF'] >= 0.3)
# AKI stage 1, part 2: SCr at least 1.5x but below 2.0x the baseline
condition2 = (finished_df['RESULT_NUM'] >= 1.5 * finished_df['SERUM_CREAT_BASE']) & \
(finished_df['RESULT_NUM'] < 2.0 * finished_df['SERUM_CREAT_BASE'])
# An onset SCr must be the running maximum of its encounter so far, so that a higher
# stage overwrites a lower one. ONSETS_NOT_CHANGE additionally excludes the first
# measurement of each encounter from being an onset.
condition3 = (finished_df['ONSETS_NOT_CHANGE']) & \
(finished_df['RESULT_NUM'] == finished_df['CUMMAX_RESULT_NUM'])

In [None]:
#measurement satisfying AKI-1 
AKI_def = finished_df[(condition1 | condition2) & condition3]

In [None]:
AKI_def

In [None]:
# Earliest measurement per encounter that satisfies AKI-1.
# groupby(...).first() takes the first non-null value of each column within a group;
# this yields the earliest SPECIMEN_DATE assuming rows are still in the
# per-encounter date order set by the earlier sort.
AKI_def_first = AKI_def.groupby(pat_id_cols).first().reset_index().copy()

In [None]:
AKI_def_first.rename(columns = {'SPECIMEN_DATE': 'AKI1_ONSET_NEW'}, inplace = True)

In [None]:
# merge back AKI-1 onset date
AKI_def_all = \
finished_df.merge(AKI_def_first[pat_id_cols + ['AKI1_ONSET_NEW']], 
                  on=pat_id_cols, how='left')

In [None]:
AKI_def_all

# Find AKI Stage 2

In [None]:
condition4 = (finished_df['RESULT_NUM'] >= 2.0 * finished_df['SERUM_CREAT_BASE']) & \
(finished_df['RESULT_NUM'] < 3.0 * finished_df['SERUM_CREAT_BASE'])

In [None]:
AKI_def2 = finished_df[condition4 & condition3]
AKI_def2_first = AKI_def2.groupby(pat_id_cols).first().reset_index().copy()

In [None]:
AKI_def2_first.rename(columns = {'SPECIMEN_DATE': 'AKI2_ONSET_NEW'}, inplace = True)

In [None]:
# merge back AKI-2 onset date
AKI_def2_all = \
AKI_def_all.merge(AKI_def2_first[pat_id_cols + ['AKI2_ONSET_NEW']], 
                     on=pat_id_cols, 
                     how='left')

# Find AKI Stage 3

In [None]:
condition5 = (finished_df['RESULT_NUM'] >= 3.0 * finished_df['SERUM_CREAT_BASE'])
condition6 = (finished_df['RESULT_NUM'] >= 4.0)

In [None]:
# Stage 3 candidates: SCr at least 3x baseline or at least 4.0, and also the
# running maximum of the encounter (condition3). Then keep the first hit per encounter.
AKI_def3 = finished_df.loc[condition3 & (condition5 | condition6)]
AKI_def3_first = AKI_def3.groupby(pat_id_cols).first().reset_index().copy()

In [None]:
AKI_def3_first.rename(columns = {'SPECIMEN_DATE': 'AKI3_ONSET_NEW'}, inplace = True)

In [None]:
# merge back AKI-3 onset date
AKI_def3_all = \
AKI_def2_all.merge(AKI_def3_first[pat_id_cols + ['AKI3_ONSET_NEW']], 
                     on=pat_id_cols, 
                     how='left')

In [None]:
AKI_def3_all

# Processing before Saving

In [None]:
AKI_def3_all.drop(['AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET'], axis = 1, inplace = True)

In [None]:
AKI_def3_all.rename(columns = {
    'AKI1_ONSET_NEW':'AKI1_ONSET',
    'AKI2_ONSET_NEW':'AKI2_ONSET',
    'AKI3_ONSET_NEW':'AKI3_ONSET',
}, inplace = True)

In [None]:
# An encounter is non-AKI when none of the three stage onset dates is present.
AKI_def3_all['NONAKI_SINCE_ADMIT'] = ~(
    AKI_def3_all[['AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET']].notna().any(axis=1)
)

In [None]:
AKI_def_use = AKI_def3_all[['PATID', 'ONSETS_ENCOUNTERID', 'ADMIT_DATE', 'DISCHARGE_DATE', 'SERUM_CREAT_BASE',
                          'NONAKI_SINCE_ADMIT', 'AKI1_ONSET', 'AKI2_ONSET', 'AKI3_ONSET', 'CENTER_NAME']].copy()

In [None]:
# Keep a single row per encounter. first() returns the first non-null value of each
# column within the group, so an onset column stays missing only if it is missing
# on every row of that encounter.
AKI_def_use = AKI_def_use.groupby(pat_id_cols).first().reset_index().copy()

In [None]:
num_unique_combinations = AKI_def_use.groupby(pat_id_cols).ngroups

In [None]:
num_unique_combinations

# Save New Onsets to Folder

In [None]:
AKI_def_use.to_csv('/blue/yonghui.wu/lideyi/AKI_subphenotyping_project_v3/NEW_ONSETS/NEW_ONSETS.csv', index=False)