## Imports

In [None]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time

## Reading in file; managing columns

In [None]:
%%time

#Read in dataframe - uncomment which data frame according to which you are working on
# NOTE(review): the paths below are machine-specific absolute paths; exactly one read_csv line should be active.

#df = pd.read_csv('~/Desktop/patr/StandardizingAKI/inpatient 2014-2018 creatinine.csv') #1441707, 4
#df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\inpatient 2014-2018 creatinine.csv')
df = pd.read_csv('/Users/saranmedical-smile/csv_files/inpatient 2014-2018 creatinine.csv')
#df = pd.read_csv('/Users/saranmedical-smile/csv_files/covid creatinines.csv')

# str.strip('MR') removes any leading/trailing 'M' or 'R' characters before the integer cast
df.pat_mrn_id = df.pat_mrn_id.str.strip('MR').astype('int') #Index with integers --> it's much quicker
df.time = pd.to_datetime(df.time) # Convert to pandas datetime format
print(df.dtypes) # Confirm all the column types are as we want it

# Positional rename: assumes the CSV has exactly four columns ordered as
# (patient MRN, encounter id, time, creatinine) - TODO confirm against the file
df.columns = ['mrn', #Renaming columns for ease
              'enc',
              'time',
              'creat'] 
# Index by timestamp: the time-offset rolling windows used further down operate on the index
df.set_index(['time'], inplace=True)
#df.set_index(['mrn', 'enc'], inplace=True) #Turn the index into a hierarchical tuple (mrn, enc)

## Functions: add_rows(df),  add_delta_cols(df), returnAKIpatients(df), parallelizeAnalysis(df, num_cores)

### add_rows()

    Input: Pandas data frame 

    Output: Pandas data frame (w/ extra dummy rows containing back-calculated values)

This function is the bulk of the computation - it adds dummy rows with the appropriate back-calculated values at the first row of each encounter. If one is only interested in those patients who have AKI according to the Rolling Window definition, the code should run in less than a minute and *add_rows()* isn't necessary. If you are interested in back-calculated AKI cases, then the analysis takes a bit longer (although still under an hour) and that's where *add_rows()* comes in. It simply finds the first encounter recorded for a patient, looks back between 7 and 365 days in the past and sees whether or not there are measured creatinine values. If there are, the mean of those values is put into the dummy row. Otherwise, a value calculated based on the estimated glomerular filtration rate (eGFR) equation from "A New Equation to Estimate Glomerular Filtration Rate" (Levey et al., 2009, https://pubmed.ncbi.nlm.nih.gov/19414839/) is used instead in the dummy row.

### parallelizeAnalysis()

Parallelizes the analysis.

### returnAKIpatients()

Finds the patients with AKI according to the Rolling Window and/or the back-calculated definitions. The runtime depends significantly on which definition you use.

### add_delta_cols()

The way in which AKI is measured is based on the change in creatinine over a given period of time. As a proxy, $\Delta creat$ and $\Delta time$ columns are created to measure the change from one time point to another. This function calculates those values via the Split-Apply-Combine methodology on the original data frame: splitting by encounter, transforming via a backward shift, then taking the difference of values, then a forward shift, and then recombining the dataframe object.

In [None]:
def returnAKIpatients(df, aki_calc_type = 'rolling_window'):
    '''
    Input: Pandas data frame indexed by time, with (at least) 'enc' and 'creat' columns
    Output: New Pandas data frame indexed by (enc, time) with the running-minimum,
            running-delta and boolean 'aki' columns added

    Flags AKI with the rolling-window definition, evaluated per encounter.
    A row is flagged when either
      * creatinine is at least 0.3 above the encounter's trailing short-window minimum, or
      * creatinine exceeds the encounter's trailing long-window minimum by at least 50% of that minimum.

    The windows actually used are 52 hours and 172 hours even though the output
    columns are labelled 48hrs / 7days.
    NOTE(review): presumably a 4-hour tolerance on each window - confirm.

    aki_calc_type is accepted but not used by this implementation.
    The input frame is not modified.
    '''
    def trailing_min(window):
        # Time-offset rolling minimum of creatinine within each encounter;
        # the result is keyed by (enc, time)
        by_encounter = df.groupby('enc', sort=False)['creat']
        return by_encounter.rolling(pd.Timedelta(window), min_periods=1).min()

    min_short = trailing_min('52hours')
    min_long = trailing_min('172hours')

    # Re-key by (enc, time) so the rolling results align on assignment
    flagged = df.reset_index().set_index(['enc', 'time'])
    flagged['running_min_48hrs'] = min_short
    flagged['running_min_7days'] = min_long

    # Rounding keeps floating-point noise from tipping the >= comparisons
    flagged['running_delta_48hrs'] = (flagged['creat'] - flagged['running_min_48hrs']).round(3)
    flagged['running_delta_7days'] = (flagged['creat'] - flagged['running_min_7days']).round(3)

    absolute_rise = flagged['running_delta_48hrs'] >= 0.3
    relative_rise = flagged['running_delta_7days'] >= 0.5*flagged['running_min_7days']
    flagged['aki'] = absolute_rise | relative_rise

    return flagged

def OLD_returnAKIpatients(df, aki_calc_type = 'both', keep_extra_cols = True, num_cores=4):
    '''
    Input: Pandas data frame with 'mrn', 'enc', 'time' and 'creat' columns
    Output: Pandas data frame with delta columns and a boolean 'aki' column

    Legacy AKI flagger based on row-to-row deltas (see add_delta_cols).

    aki_calc_type selects the definition:
      'rolling_window' - deltas on the measured rows only
      'back_calc'      - dummy rows are inserted with add_rows() (in parallel);
                         only the 50% criterion is applied, and only on rows
                         whose delta_time is exactly zero
      'both'           - dummy rows are inserted, then both criteria are applied
    Any other value raises ValueError.

    keep_extra_cols is accepted but not used.
    num_cores is the number of worker processes / row chunks for add_rows().

    NOTE(review): 'rolling_window' compares delta_time with <= while the other
    modes use < ; kept as in the original - confirm whether that is intended.
    NOTE(review): rows are chunked by count, so one patient's rows can land in
    two chunks and be processed independently by add_rows() - confirm acceptable.
    '''
    valid_types = ('rolling_window', 'back_calc', 'both')
    if aki_calc_type not in valid_types:
        # Previously this fell through to `return d` and died with UnboundLocalError
        raise ValueError('aki_calc_type must be one of {}, got {!r}'.format(valid_types, aki_calc_type))

    if aki_calc_type == 'rolling_window':
        #Add the delta columns as described above
        d = add_delta_cols(df)

        #Two conditions for rolling-window definition
        condition1 = np.logical_and(d.delta_creat >= 0.3, d.delta_time <= datetime.timedelta(hours=48))
        condition2 = np.logical_and(d.delta_creat >= d.creat.shift(1)*0.5, d.delta_time <= datetime.timedelta(days=7))

        d['aki'] = condition1 | condition2
        return d

    #Both remaining modes need the dummy rows. Split row positions (same partition
    #as np.array_split on the frame) and slice with iloc, which avoids numpy
    #calling the deprecated DataFrame.swapaxes.
    split_dfs = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), num_cores)]
    with Pool(num_cores) as p:
        d_ = pd.concat(p.map(add_rows, split_dfs))
    d = add_delta_cols(d_)

    condition2 = np.logical_and(d.delta_creat >= d.creat.shift(1)*0.5, d.delta_time < datetime.timedelta(days=7))

    if aki_calc_type == 'back_calc':
        #Remember, back-calc doesn't include the 0.3 increase criterion... has to be a 50% increase
        d['aki'] = condition2 & (d.delta_time == datetime.timedelta(0))
    else:
        condition1 = np.logical_and(d.delta_creat >= 0.3, d.delta_time < datetime.timedelta(hours=48))
        d['aki'] = condition1 | condition2

    return d

def add_rows(df):
    '''
    Input: Pandas data frame with 'mrn', 'enc', 'time' and 'creat' columns
           ('time' is read as a column here, not as the index)
    Output: Pandas data frame with one extra dummy row inserted directly before
            the first row of every (mrn, enc) encounter; the input is not modified

    Each dummy row is a copy of the encounter's first row. For every encounter
    of a patient except the first one, the dummy row's 'creat' is replaced by
    the mean of that patient's creatinine values measured more than 7 and fewer
    than 365 days before the encounter's first timestamp.

    Notes on what this code does NOT do:
      * the dummy row of a patient's first encounter keeps the copied value;
      * when no measurement falls inside the look-back window the mean is NaN -
        no eGFR-based estimate (Levey et al., 2009,
        https://pubmed.ncbi.nlm.nih.gov/19414839/) is computed in this function.

    An input without any rows is returned as an empty copy.
    '''
    lookback_min = datetime.timedelta(days=7)
    lookback_max = datetime.timedelta(days=365)

    augmented = []
    for _, pat_df in df.groupby('mrn', sort=False):
        # Positional offsets, within this patient's rows, of the first row of each encounter
        first_rows = pat_df.reset_index().drop_duplicates(['mrn', 'enc']).index.to_numpy()

        new_rows = pat_df.iloc[first_rows].copy()
        creat_col = new_rows.columns.get_loc('creat')
        # Starts at 1: the patient's first encounter is left as a plain copy
        for i in range(1, len(first_rows)):
            elapsed = pat_df['time'].iloc[first_rows[i]] - pat_df['time']
            in_window = (elapsed < lookback_max) & (elapsed > lookback_min)
            new_rows.iloc[i, creat_col] = pat_df.loc[in_window, 'creat'].mean()

        # Interleave: dummy row, then the rows up to the next encounter start.
        # Plain iloc slices replace np.split(DataFrame), which depends on the
        # deprecated DataFrame.swapaxes.
        bounds = list(first_rows) + [len(pat_df)]
        pieces = []
        for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
            pieces.append(new_rows.iloc[[i]])
            pieces.append(pat_df.iloc[start:stop])
        augmented.append(pd.concat(pieces))

    if not augmented:
        # No patients (e.g. an empty chunk handed to a worker): nothing to add
        return df.copy()

    return pd.concat(augmented)

def add_delta_cols(df):
    '''
    Input: Pandas data frame with 'mrn', 'enc', 'time' and 'creat' columns
    Output: The same data frame object, modified in place, with three new columns

    delta_creat - change in creatinine from the previous row of the same
                  (mrn, enc) group, rounded to 2 decimals; missing on the first
                  row of each group
    delta_time  - time elapsed since the previous row of the same group
    first_enc   - True on the first row seen for each mrn
    '''
    # Next-row values within each encounter, minus the current row ...
    next_in_encounter = df.groupby(['mrn', 'enc']).shift(-1)
    forward_diff = next_in_encounter - df

    # ... moved down one row, so each row carries "current minus previous"
    df['delta_creat'] = forward_diff['creat'].shift(1).round(2)
    df['delta_time'] = forward_diff['time'].shift(1)

    # Positional mask of rows whose mrn has not appeared earlier in the frame
    repeated_mrn = df.reset_index().duplicated('mrn')
    df['first_enc'] = (~repeated_mrn).to_numpy()
    return df

In [None]:
# Placeholder cell: the call is commented out, so only the marker below is printed.
#returnAKIpatients(df)
print('done!')

In [None]:
%%time
# Trailing rolling minimum of creatinine per patient (grouped by 'mrn') over
# 2-day (x) and 7-day (y) time windows.
# NOTE(review): returnAKIpatients() groups by 'enc' and uses 52h / 172h windows -
# confirm which grouping and window lengths are intended here.
x = df.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('2days'), min_periods=1).min()#.reset_index('enc').creat
y = df.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('7days'), min_periods=1).min()#.reset_index('enc').creat

In [None]:
# Turn the current index back into columns and re-key the frame by (encounter, time).
df = df.reset_index().set_index(['enc', 'time'])

In [None]:
# Attach the rolling minima computed earlier and derive the AKI flag.
# NOTE(review): x and y were built with groupby('mrn'), while df is keyed by
# (enc, time) at this point; assigning a Series aligns on index labels, so
# confirm the values actually line up (the COVID cell further down assigns
# x.values / y.values instead).
df['running_min_48hrs'] = x
df['running_min_7days'] = y

# Rise above the running minimum, rounded to 3 decimals before thresholding
df['running_delta_48hrs'] = np.round(df['creat'] - df['running_min_48hrs'], decimals = 3)
df['running_delta_7days'] = np.round(df['creat'] - df['running_min_7days'], decimals = 3)    

# condition1: absolute rise of at least 0.3 over the short-window minimum
# condition2: rise of at least 50% of the long-window minimum
condition1 = df['running_delta_48hrs'] >= 0.3
condition2 = df['running_delta_7days'] >= 0.5*df['running_min_7days']

df['aki'] = condition1 | condition2

## Validate the rolling-window code

In [None]:
%%time
# Per-encounter 2-day rolling minimum, assigned positionally (.values drops the index).
# NOTE(review): positional assignment presumably relies on the groupby output
# order matching df's row order - verify. A time-offset window also needs a
# datetime-like index, so this likely has to run while df is still indexed by
# time - confirm the intended cell order.
df['running_min_48hrs'] = df.groupby('enc', sort=False)['creat'].rolling(pd.Timedelta('2days'), min_periods=1).min().values#reset_index('enc').creat

In [None]:
%%time
# Per-encounter 7-day rolling minimum, assigned positionally (.values drops the index).
# NOTE(review): same row-order and index-type assumptions as the 2-day cell - verify.
df['running_min_7days'] = df.groupby('enc', sort=False)['creat'].rolling(pd.Timedelta('7days'), min_periods=1).min().values#reset_index('enc').creat

In [None]:
%%time
# Rise above each running minimum, rounded to 3 decimals before thresholding
df['running_delta_48hrs'] = np.round(df['creat'] - df['running_min_48hrs'], decimals = 3)
df['running_delta_7days'] = np.round(df['creat'] - df['running_min_7days'], decimals = 3)

# condition1: absolute rise of at least 0.3 over the short-window minimum
# condition2: rise of at least 50% of the long-window minimum
condition1 = df['running_delta_48hrs'] >= 0.3
condition2 = df['running_delta_7days'] >= 0.5*df['running_min_7days']

df['aki'] = condition1 | condition2

### Checking against Yu's flagger values

Hey Yu, I'm looking at patient MRN 'MR1019104' in H:\Data\Standardized AKI definition\dataset\aki flagger inpatient 2014-2018.csv. We don't match on 2015-06-20 01:27:00: you flagged it, and I wasn't sure if I missed something. Would you mind taking a look when you have a moment (no rush)?

In [None]:
# Load the reference flagger output and key it by (encounter, time) so that it
# can be compared row-for-row with df.
df_yu = pd.read_csv(r'/Users/saranmedical-smile/csv_files/aki flagger inpatient 2014-2018.csv')
df_yu['aki'] = df_yu['aki'].astype('bool')
df_yu['time'] = pd.to_datetime(df_yu['time'])
#df_yu = df_yu[:10000]
df_yu.set_index(['pat_enc_csn_id', 'time'], inplace=True)
print(df_yu.shape, df.shape)
# NOTE(review): this elementwise comparison appears to require both indexes to
# have the same length (check the shapes printed above); Index.equals may be a
# more tolerant check - verify.
print(np.all(df.index == df_yu.index)) #Check whether the encounters are the same at least

In [None]:
# Number of rows flagged as AKI in each data set, and the absolute difference.
mycount = df.aki.sum()
yucount = df_yu.aki.sum()
print('My AKI count:', mycount, '| Yu AKI count:', yucount, '| Discrepancy: ', np.abs(yucount-mycount))

In [None]:
#Example case: we disagree on 1019104
#df_yu[df_yu.pat_mrn_id == 'MR1019104']#.aki == df[df.mrn == 1019104].aki
#df[df.mrn == 1019104]


#These are the patients which we disagree on
# Rows where the two 'aki' columns differ, reduced to the distinct MRNs involved.
# NOTE(review): presumably assumes df and df_yu are row-aligned - verify.
df.iloc[np.where(df.aki != df_yu.aki)[0]].mrn.unique()

#The first one - 1002080 - accounts for 9 of our discrepancies.
#(The original note broke off after "I am doing it by"; the next cell suggests the
# difference is grouping by encounter here versus grouping by patient - confirm.)

In [None]:
#We agree if I group at the level of patient instead of encounter ... cool :)
tmp_df = df.loc[df.mrn == 1002080]
tmp_df = tmp_df.reset_index('enc')
tmp_df['running_min_48hrs'] = tmp_df.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('2days'), min_periods=1).min().reset_index('mrn').creat
tmp_df['running_min_7days'] = tmp_df.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('7days'), min_periods=1).min().reset_index('mrn').creat

tmp_df['running_delta_48hrs'] = np.round(tmp_df['creat'] - tmp_df['running_min_48hrs'], decimals = 3)
tmp_df['running_delta_7days'] = np.round(tmp_df['creat'] - tmp_df['running_min_7days'], decimals = 3)

condition1 = tmp_df['running_delta_48hrs'] >= 0.3
condition2 = tmp_df['running_delta_7days'] >= 0.5*tmp_df['running_min_7days']

tmp_df['aki'] = condition1 | condition2
#tmp_df#.aki == df_yu.loc[df_yu.pat_mrn_id == 'MR1002080'].aki


## Read in COVID-19 cases, same analysis 

In [None]:
# Load the COVID-19 creatinine file and reduce it to the same four-column
# layout (mrn, enc, time, creat) used for the inpatient data.
cdf = pd.read_csv('/Users/saranmedical-smile/csv_files/covid creatinines.csv')
cdf.pat_mrn_id = cdf.pat_mrn_id.str.strip('MR').astype('int') #Index with integers --> it's much quicker
cdf.time = pd.to_datetime(cdf.time) # Convert to pandas datetime format
cdf.admission = pd.to_datetime(cdf.admission)
cdf.discharge = pd.to_datetime(cdf.discharge)
# Drop the columns not needed for flagging (this includes the admission and
# discharge columns converted just above)
cdf.drop(['enc_id',
         'age',
         'sex',
         'inpatient',
         'race',
         'admission',
         'discharge'], axis=1, inplace=True)
print(cdf.dtypes) # Confirm all the column types are as we want it
# Positional rename: assumes exactly four columns remain, ordered as
# (patient MRN, encounter id, time, creatinine) - TODO confirm against the file
cdf.columns = ['mrn', #Renaming columns for ease
              'enc',
              'time',
              'creat'] 
cdf.set_index(['time'], inplace=True)

In [None]:
%%time
# Placeholder cell: the call is commented out, so only the marker below is printed.
#returnAKIpatients(cdf)
print('done!')

In [None]:
%%time
# Trailing rolling minimum of creatinine per patient (grouped by 'mrn') over
# 52-hour (x) and 172-hour (y) time windows.
# NOTE(review): presumably 48 h and 7 days plus a 4-hour tolerance - confirm.
x = cdf.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('52hours'), min_periods=1).min()#.reset_index('enc').creat
y = cdf.groupby('mrn', sort=False)['creat'].rolling(pd.Timedelta('172hours'), min_periods=1).min()#.reset_index('enc').creat    

In [None]:
# Turn the current index back into columns and re-key the frame by (encounter, time).
cdf = cdf.reset_index().set_index(['enc', 'time'])

In [None]:
# Attach the per-patient rolling minima positionally (.values ignores the index).
# NOTE(review): this presumably relies on cdf's row order matching the order of
# the grouped rolling output (e.g. each patient's rows contiguous) - verify.
cdf['running_min_48hrs'] = x.values
cdf['running_min_7days'] = y.values

# Rise above each running minimum, rounded to 3 decimals before thresholding
cdf['running_delta_48hrs'] = np.round(cdf['creat'] - cdf['running_min_48hrs'], decimals = 3)
cdf['running_delta_7days'] = np.round(cdf['creat'] - cdf['running_min_7days'], decimals = 3)    

# condition1: absolute rise of at least 0.3 over the short-window minimum
# condition2: rise of at least 50% of the long-window minimum
condition1 = cdf['running_delta_48hrs'] >= 0.3
condition2 = cdf['running_delta_7days'] >= 0.5*cdf['running_min_7days']

cdf['aki'] = condition1 | condition2

In [None]:
# Move 'enc' back to a column, sort the remaining index, and keep a running
# count of AKI-flagged rows over the whole frame in that order.
cdf.reset_index('enc', inplace=True)
cdf.sort_index(inplace=True)
cdf['aki_cumsum'] = cdf.aki.cumsum()
#cdf['aki_cumsum_7d'] = cdf.reset_index('enc').aki.rolling(pd.Timedelta('7days')).cumsum()

In [None]:
# Write the flagged COVID-19 frame, including its index, to a machine-specific path.
cdf.to_csv('/Users/saranmedical-smile/csv_files/covid19aki.csv', index=True)

In [None]:
# Display the reference-flagger rows for patient 'MR1019104' (the example discrepancy).
df_yu[df_yu.pat_mrn_id == 'MR1019104']