In [3]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time

### Previous method: Function-based implementation

In [4]:
%%time
import akiFlagger
covid_df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\covid creatinines.csv')
covid_df['mrn'] = covid_df.pat_mrn_id.str.strip('MR').astype('int')
covid_df['enc'] = covid_df.enc_id
covid_df['time'] = pd.to_datetime(covid_df.time)
covid_df['sex'] = covid_df.sex.astype('bool')
covid_df['race'] = covid_df.race.astype('bool')
covid_df['inpatient'] = covid_df.inpatient.astype('bool')
covid_df['creat'] = covid_df['creatinine']
covid_df['admission'] = pd.to_datetime(covid_df.admission)
covid_df['discharge'] = pd.to_datetime(covid_df.discharge)
print('Shape:', covid_df.shape)
print(covid_df.dtypes)

df = covid_df[['mrn', 'enc', 'time', 'creat', 'age', 'sex', 'race', 'inpatient', 'admission', 'discharge']]
df = df[:10000]
df = df[~df.duplicated()] #The rows where the pat_enc_csn_id was lumped into enc_id become duplicates
df = df.groupby('enc', sort=False).apply(lambda d: d.sort_values('time'))
df = df.reset_index(drop=True)
p = akiFlagger.returnAKIpatients(df)
p.head()

Shape: (450399, 16)
pat_mrn_id                 object
PAT_ENC_CSN_ID              int64
time               datetime64[ns]
creatinine                float64
hosp_admsn_time            object
hosp_disch_time            object
enc_id                      int64
admission          datetime64[ns]
discharge          datetime64[ns]
age                       float64
sex                          bool
race                         bool
inpatient                    bool
mrn                         int32
enc                         int64
creat                     float64
dtype: object
Wall time: 15.8 s


Unnamed: 0,time,mrn,enc,creat,age,sex,race,inpatient,admission,discharge,mincreat_48hr,mincreat_7day,deltacreat_48hr,deltacreat_7day,rollingwindow_aki,stage1,stage2,stage3,baseline_creat,backcalc_aki
0,2020-03-13 16:20:00,1000026,214990932,0.5,18.945205,True,False,False,2020-04-10 15:53:00,2020-04-16 19:22:00,,,,,False,False,False,False,0.5,
1,2020-04-07 20:38:00,1000026,214990932,0.56,18.945205,True,False,False,2020-04-10 15:53:00,2020-04-16 19:22:00,0.56,0.56,0.0,0.0,False,False,False,False,0.5,
2,2020-04-10 18:29:00,1000026,214990932,0.59,18.945205,True,False,True,2020-04-10 15:53:00,2020-04-16 19:22:00,0.59,0.56,0.0,0.03,False,False,False,False,0.5,False
3,2020-04-12 07:46:00,1000026,214990932,0.56,18.945205,True,False,True,2020-04-10 15:53:00,2020-04-16 19:22:00,0.56,0.56,0.0,0.0,False,False,False,False,0.5,False
4,2019-07-18 12:12:00,1000037,222096628,1.5,15.260274,False,False,False,2020-07-05 20:14:00,2020-07-06 15:10:00,,,,,False,False,False,False,2.43,


## Class-based method for akiFlagger so that the required column names can be independent of my naming system

In [18]:
noad = df.loc[:,['mrn', 'creat', 'time', 'inpatient']]
noad.head()

Unnamed: 0,mrn,creat,time,inpatient
0,1000026,0.5,2020-03-13 16:20:00,False
1,1000026,0.56,2020-04-07 20:38:00,False
2,1000026,0.59,2020-04-10 18:29:00,True
3,1000026,0.56,2020-04-12 07:46:00,True
4,1000037,1.5,2019-07-18 12:12:00,False


In [42]:
class AKIFlagger:
    '''Flagger to detect patients with acute kidney injury (AKI).
    
    '''
    def __init__(self, patient_id = 'mrn', creatinine='creatinine', time = 'time', inpatient = 'inpatient', 
                 aki_calc_type=None, keep_extra_cols = False, eGFR_impute = False, add_stages = None,
                 cond1time = '48hours', cond2time = '168hours', pad1time = '0hours', pad2time = '0hours', 
                 rolling_window = False, back_calculate = False,
                 admission = 'admission', age = 'age', sex = 'sex', race = 'race', encounter_id = 'enc'):
        
        #Identifiers
        self.patient_id = patient_id
        self.encounter_id = encounter_id
        
        #Columns necessary for calculation (if admission not included it will be imputed)
        self.creatinine = creatinine
        self.time = time
        self.inpatient = inpatient
        self.admission = admission
        
        #Demographic variables
        self.age = age
        self.sex = sex
        self.race = race
        
        #Rolling-window variables
        self.cond1time = cond1time
        self.cond2time = cond2time
        self.pad1time = pad1time
        self.pad2time = pad2time
        
        #Extra options to specify what is included in the output
        self.eGFR_impute = eGFR_impute
        self.add_stages = add_stages
        self.keep_extra_cols = keep_extra_cols
        
        #Specifying the calculation type wanted in the flagger
        self.aki_calc_type = aki_calc_type
        self.rolling_window = rolling_window
        self.back_calculate = back_calculate
        
        if self.aki_calc_type is not None:
            if self.aki_calc_type == 'rolling_window':
                self.rolling_window = True
                
        
            
    def returnAKIpatients(self, df, add_stages = None, 
                          cond1time = None, cond2time = None, pad1time = None, pad2time = None):
        '''
        Returns patients with AKI according to the KDIGO guidelines. The KDIGO guidelines are as follows:

        * *Stage 1:* 0.3 increase in serum creatinine in < 48 hours OR 50% increase in serum creatinine in < 7 days (168 hours)
        * *Stage 2:* 100% increase in (or doubling of) serum creatinine in < 48 hours
        * *Stage 3:* 200% increase in (our tripling of) serum creatinine in < 48 hours

        More information can be found in the documentation at akiflagger.readthedocs.io
        Args: 

            df (pd.DataFrame): Patient dataframe, should include some sort of patient and encounter identifier(s) and age, sex, race, serum creatinine and timestamps.
        Returns:
            df (pd.DataFrame): Patient dataframe with AKI patients identified. 

        Raises:
            AssertionError: If the dataframe is missing an expected column; e.g. if there is no age/sex/race and eGFR_impute is True.

        '''
        if add_stages is None:
            add_stages = self.add_stages
        self.add_stages = add_stages
        
        if cond1time is None:
            cond1time = self.cond1time
        self.cond1time = cond1time
        
        if cond2time is None:
            cond2time = self.cond2time
        self.cond2time = cond2time
        
        if self.admission not in df.columns:
            df = self.addAdmissionColumn(df, add_encounter_col = self.encounter_id not in df.columns)
            
        if self.rolling_window: 
            df = df.groupby(self.patient_id, sort=False, as_index=False).apply(lambda d: self.addRollingWindowAKI(d))
            df = df.reset_index(level=0, drop=True).reset_index() #this will leave time as the index
            
        if self.back_calculate:
            df = df.groupby(self.patient_id, sort=False).apply(lambda d: self.addBaselineCreat(d, 
                                                                                               eGFR_impute=eGFR_impute))

            df = df.groupby(self.encounter_id, sort=False).apply(lambda d: self.addBackCalcAKI(d,
                                                                                  cond1time = self.cond1time,
                                                                                  cond2time = self.cond2time))
            df = df.reset_index(self.encounter_id, drop=True, as_index=False).reset_index()
            
        if not self.keep_extra_cols:
            if self.rolling_window or self.aki_calc_type != 'back_calculate':
                df = df.drop([self.mint1_colname, self.mint2_colname, self.delt1_colname, self.delt2_colname], axis=1)
        return df
    
    def addRollingWindowAKI(self, df):
        '''
        Adds the AKI conditions based on rolling window definition: 

        * *Stage 1:* 0.3 increase in serum creatinine in < 48 hours OR 50% increase in serum creatinine in < 7 days (168 hours)
        * *Stage 2:* 100% increase in (or doubling of) serum creatinine in < 48 hours
        * *Stage 3:* 200% increase in (our tripling of) serum creatinine in < 48 hours

        Args: 
            df (pd.DataFrame): dataframe, typically of a single encounter.
            add_stages (bool): boolean, default **True**. 
                Choose whether or not to delineate the rolling-window AKI into the three stages (if False it will just lump Stage 1/2/3 into a boolean True/False)
            cond1time (str): string, default **'48hours'**. 
                The amount of time for the rolling-window according to the first criterion; i.e. 0.3 increase in creatinine in ``cond1time`` hours. 
            cond2time (str): string, default **'168hours'**. 
                The amount of time for the rolling-window according to the second criterion; i.e. 50% increase in creatinine in ``cond2time`` hours.

        Returns: 
            df (pd.DataFrame): dataframe with rolling-window aki values added in
        '''
        df = df.set_index(self.time).sort_index()
        #Changed df.admission[0] to df[~df.admission.isnull()].admission[0]
        df_rw = df.loc[df.admission[0] - pd.Timedelta(hours=172):]
        
        t1 = pd.Timedelta(self.cond1time) + pd.Timedelta(self.pad1time)
        t2 = pd.Timedelta(self.cond2time) + pd.Timedelta(self.pad2time)
        
        minc_t1, minc_t2 = np.empty(df.shape[0]), np.empty(df.shape[0])
        minc_t1[:], minc_t2[:] = np.nan, np.nan 
        minc_t1[df.shape[0]-df_rw.shape[0]:] = df_rw[self.creatinine].rolling(pd.Timedelta(self.cond1time), min_periods=1).min().values
        minc_t2[df.shape[0]-df_rw.shape[0]:] = df_rw[self.creatinine].rolling(pd.Timedelta(self.cond2time), min_periods=1).min().values

        self.mint1_colname = 'mincreat_{}'.format(t1.days*24 + t1.seconds // 3600)
        self.mint2_colname = 'mincreat_{}'.format(t2.days*24 + t2.seconds // 3600)
        self.delt1_colname = 'deltacreat_{}'.format(t1.days*24 + t1.seconds // 3600)
        self.delt2_colname = 'deltacreat_{}'.format(t2.days*24 + t2.seconds // 3600)
        
        df[self.mint1_colname] = minc_t1
        df[self.mint2_colname] = minc_t2

        df[self.delt1_colname] = np.round(df.creat - df[self.mint1_colname], decimals = 5)
        df[self.delt2_colname] = np.round(df.creat - df[self.mint2_colname], decimals = 5)

        if self.add_stages:
            df['stage1'] = (df[self.delt1_colname] >= 0.3) | (df[self.delt2_colname] >= 0.5*df[self.mint2_colname])
            df['stage2'] = df[self.delt2_colname] >= 2*df[self.mint2_colname]
            df['stage3'] = df[self.delt2_colname] >= 3*df[self.mint2_colname]
            
        df['rollingwindow_aki'] = 1*df.stage1 + 1*df.stage2 + 1*df.stage3 if self.add_stages else (df[self.delt1_colname] >= 0.3) | (df[self.delt2_colname] >= 0.5*df[self.mint2_colname])
  
        return df
    
    def addAdmissionColumn(self, df, add_encounter_col = True):
        
        pat_gb = df.groupby(self.patient_id)
        
        #Check for those rows which are all inpatient; e.g. a hospital visit
        df.loc[:, 'all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
        df.loc[:, 'all_inp'] = df.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)

        df.loc[:,'admission'] = df.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
        df.loc[:,'admission'] = df[np.logical_or(df.admission, df.all_inp)].time
        df.loc[:,'admission'] = pat_gb.admission.apply(lambda s: s.bfill().ffill())
        
        if add_encounter_col:
            df.loc[:, 'enc'] = df.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
            df.loc[df.enc, 'enc'] = np.arange(1, df.enc.sum()+1)
            df.loc[df.enc == False, 'enc'] = np.nan
            df.loc[:,'enc'] = pat_gb.enc.apply(lambda s: s.bfill().ffill())
        df = df.drop(['all_inp'], axis=1)
        return df

In [43]:
f = AKIFlagger(rolling_window=True, creatinine = 'creat', add_stages=True)
rw = f.returnAKIpatients(noad)
rw.head()

Unnamed: 0,time,mrn,creat,inpatient,all_inp,admission,enc,stage1,stage2,stage3,rollingwindow_aki
0,2020-03-13 16:20:00,1000026,0.5,False,False,2020-04-10 18:29:00,1.0,False,False,False,0
1,2020-04-07 20:38:00,1000026,0.56,False,False,2020-04-10 18:29:00,1.0,False,False,False,0
2,2020-04-10 18:29:00,1000026,0.59,True,False,2020-04-10 18:29:00,1.0,False,False,False,0
3,2020-04-12 07:46:00,1000026,0.56,True,False,2020-04-10 18:29:00,1.0,False,False,False,0
4,2019-07-18 12:12:00,1000037,1.5,False,False,2020-07-05 22:02:00,2.0,False,False,False,0


In [40]:
p.rollingwindow_aki.sum(), rw.rollingwindow_aki.sum()

(642, 658)

In [176]:
%%time
flagger = AKIFlagger(rolling_window=True, creatinine='creat', keep_extra_cols=False)
rw = flagger.returnAKIpatients(noad)

Wall time: 5.45 s


In [177]:
#noad = df.loc[:,['mrn', 'creat', 'time', 'inpatient']]
rw[rw.rollingwindow_aki]

Unnamed: 0_level_0,mrn,creat,inpatient,all_inp,admission,admn_fill,allinp,mincreat_48hr,mincreat_7day,deltacreat_48hr,deltacreat_7day,rollingwindow_aki,stage1,stage2,stage3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-05-11 06:21:00,1000243,1.83,True,False,2020-05-08 01:02:00,2020-05-08 01:02:00,False,1.47,1.47,0.36,0.36,True,True,False,False
2020-04-09 06:26:00,1003079,2.40,True,True,2020-04-08 15:04:00,2020-04-08 15:04:00,False,1.43,1.43,0.97,0.97,True,True,False,False
2020-04-10 07:27:00,1003079,2.01,True,True,2020-04-08 15:04:00,2020-04-08 15:04:00,False,1.43,1.43,0.58,0.58,True,True,False,False
2020-04-28 10:22:00,1003079,1.18,True,True,2020-04-08 15:04:00,2020-04-08 15:04:00,False,0.84,0.84,0.34,0.34,True,True,False,False
2020-05-02 05:28:00,1003079,1.31,True,True,2020-04-08 15:04:00,2020-04-08 15:04:00,False,1.07,0.84,0.24,0.47,True,True,False,False
2020-05-15 09:44:00,1003181,0.90,True,False,2020-05-15 09:44:00,2020-05-15 09:44:00,False,0.90,0.60,0.00,0.30,True,True,False,False
2020-05-15 09:44:00,1003181,0.90,False,False,2020-07-05 16:43:00,2020-07-05 16:43:00,False,0.90,0.60,0.00,0.30,True,True,False,False
2020-03-27 05:09:00,1005100,1.89,True,False,2020-06-15 18:44:00,2020-06-15 18:44:00,False,1.58,1.58,0.31,0.31,True,True,False,False
2020-03-27 05:09:00,1005100,1.89,False,False,2020-06-15 18:44:00,2020-06-15 18:44:00,False,1.58,1.58,0.31,0.31,True,True,False,False
2020-03-27 17:38:00,1005100,2.55,False,False,2020-06-15 18:44:00,2020-06-15 18:44:00,False,1.58,1.58,0.97,0.97,True,True,False,False


In [151]:
%%time
pat_gb = noad.groupby('mrn')

#Check for those rows which are all inpatient; e.g. a hospital visit
noad.loc[:,'all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
noad.loc[:,'all_inp'] = noad.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)

noad.loc[:,'admission'] = noad.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
noad.loc[:,'admission'] = noad[np.logical_or(noad.admission, noad.all_inp)].time
noad.loc[:,'admission'] = pat_gb.admission.apply(lambda d: d.bfill().ffill())
noad

Wall time: 816 ms


Unnamed: 0,mrn,creat,time,inpatient,all_inp,admission,admn_fill
0,1000026,0.50,2020-03-13 16:20:00,False,False,2020-04-10 18:29:00,2020-04-10 18:29:00
1,1000026,0.56,2020-04-07 20:38:00,False,False,2020-04-10 18:29:00,2020-04-10 18:29:00
2,1000026,0.59,2020-04-10 18:29:00,True,False,2020-04-10 18:29:00,2020-04-10 18:29:00
3,1000026,0.56,2020-04-12 07:46:00,True,False,2020-04-10 18:29:00,2020-04-10 18:29:00
4,1000037,1.50,2019-07-18 12:12:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00
5,1000037,1.56,2019-07-25 15:14:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00
6,1000037,1.82,2019-09-21 11:22:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00
7,1000037,1.98,2019-10-05 11:32:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00
8,1000037,1.89,2019-10-17 11:29:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00
9,1000037,1.98,2019-11-09 13:29:00,False,False,2020-07-05 22:02:00,2020-07-05 22:02:00


## Edge case checks

In [171]:
#Check to make sure that when switching from mrn to mrn it's still capturing admissions 
c1 = np.logical_and(noad.inpatient, noad.inpatient.shift(1)) #where T & T
c2 = np.logical_and(noad.mrn != noad.mrn.shift(1), ~noad.all_inp)
noad[np.logical_and(c1, c2)].head(10)

#Verify that allinp is capturing the first rows
noad['all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
noad['allinp'] = noad.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)
noad[np.logical_and(c1, c2)].head(10)
#np.all(noad[noad.all_inp].groupby('mrn').head(1) == noad[noad.allinp])

Unnamed: 0,mrn,creat,time,inpatient,all_inp,admission,admn_fill,allinp
1278,1010944,0.83,2020-05-25 20:57:00,True,False,2020-05-25 20:57:00,2020-05-25 20:57:00,False
1920,1014973,1.1,2020-05-26 15:34:00,True,False,2020-05-26 15:34:00,2020-05-26 15:34:00,False
1940,1015207,0.83,2020-04-24 11:57:00,True,False,2020-04-24 11:57:00,2020-04-24 11:57:00,False
3751,1032611,0.7,2020-04-25 12:29:00,True,False,2020-04-25 12:29:00,2020-04-25 12:29:00,False
4978,1045752,1.8,2020-04-04 16:09:00,True,False,2020-04-04 16:09:00,2020-04-04 16:09:00,False
7123,1063539,0.9,2020-04-26 17:45:00,True,False,2020-04-26 17:45:00,2020-04-26 17:45:00,False
7212,1063882,0.66,2020-04-29 12:55:00,True,False,2020-04-29 12:55:00,2020-04-29 12:55:00,False
8766,1082424,1.6,2020-04-27 15:31:00,True,False,2020-04-27 15:31:00,2020-04-27 15:31:00,False
8960,1086539,0.77,2020-06-23 18:27:00,True,False,2020-06-23 18:27:00,2020-06-23 18:27:00,False
9780,109244,0.9,2020-07-10 17:52:00,True,False,2020-07-10 17:52:00,2020-07-10 17:52:00,False


## Code for generating admission column with intermediate steps as separate columns

In [139]:
noad = df.loc[:,['mrn', 'creat', 'time', 'inpatient']]
pat_gb = noad.groupby('mrn')

noad.loc[:,'admn'] = noad.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
noad.loc[:,'all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
noad.loc[:,'allinp'] = noad.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)
noad.loc[:,'admission'] = noad[np.logical_or(noad.admn, noad.allinp)].time
noad.loc[:,'admn_fill'] = pat_gb['admission'].apply(lambda d: d.bfill().ffill())