In [1]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time

### Previous method: Function-based implementation

In [2]:
%%time
import akiFlagger
covid_df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\covid creatinines.csv')
covid_df['mrn'] = covid_df.pat_mrn_id.str.strip('MR').astype('int')
covid_df['enc'] = covid_df.enc_id
covid_df['time'] = pd.to_datetime(covid_df.time)
covid_df['sex'] = covid_df.sex.astype('bool')
covid_df['race'] = covid_df.race.astype('bool')
covid_df['inpatient'] = covid_df.inpatient.astype('bool')
covid_df['creat'] = covid_df['creatinine']
covid_df['admission'] = pd.to_datetime(covid_df.admission)
covid_df['discharge'] = pd.to_datetime(covid_df.discharge)
print('Shape:', covid_df.shape)
print(covid_df.dtypes)

df = covid_df[['mrn', 'enc', 'time', 'creat', 'age', 'sex', 'race', 'inpatient', 'admission', 'discharge']]
df = df[:10000]
df = df[~df.duplicated()] #The rows where the pat_enc_csn_id was lumped into enc_id become duplicates
df = df.groupby('enc', sort=False).apply(lambda d: d.sort_values('time'))
df = df.reset_index(drop=True)
p = akiFlagger.returnAKIpatients(df, aki_calc_type = 'rolling_window')
#p.head()

Shape: (450399, 16)
pat_mrn_id                 object
PAT_ENC_CSN_ID              int64
time               datetime64[ns]
creatinine                float64
hosp_admsn_time            object
hosp_disch_time            object
enc_id                      int64
admission          datetime64[ns]
discharge          datetime64[ns]
age                       float64
sex                          bool
race                         bool
inpatient                    bool
mrn                         int32
enc                         int64
creat                     float64
dtype: object
Wall time: 9.42 s


## Class-based method for akiFlagger so that the required column names can be independent of my naming system

In [3]:
class AKIFlagger:
    '''Flagger to detect patients with acute kidney injury (AKI).
    
    '''
    def __init__(self, patient_id = 'mrn', creatinine='creatinine', time = 'time', inpatient = 'inpatient', 
                 aki_calc_type=None, keep_extra_cols = False, eGFR_impute = False, add_stages = None,
                 cond1time = '48hours', cond2time = '168hours', pad1time = '0hours', pad2time = '0hours', 
                 rolling_window = False, back_calculate = False,
                 admission = 'admission', age = 'age', sex = 'sex', race = 'race', encounter_id = 'enc'):
        
        #Identifiers
        self.patient_id = patient_id
        self.encounter_id = encounter_id
        
        #Columns necessary for calculation (if admission not included it will be imputed)
        self.creatinine = creatinine
        self.time = time
        self.inpatient = inpatient
        self.admission = admission
        
        #Demographic variables
        self.age = age
        self.sex = sex
        self.race = race
        
        #Rolling-window variables
        self.cond1time = cond1time
        self.cond2time = cond2time
        self.pad1time = pad1time
        self.pad2time = pad2time
        
        #Extra options to specify what is included in the output
        self.eGFR_impute = eGFR_impute
        self.add_stages = add_stages
        self.keep_extra_cols = keep_extra_cols
        
        #Specifying the calculation type wanted in the flagger
        self.aki_calc_type = aki_calc_type
        self.rolling_window = rolling_window
        self.back_calculate = back_calculate
        
        if self.aki_calc_type is not None:
            if self.aki_calc_type == 'rolling_window':
                self.rolling_window = True
            elif self.aki_calc_type == 'back_calculate':
                self.back_calculate = True
            elif self.aki_calc_type == 'both':
                self.rolling_window = True
                self.back_calculate = True
            
    def returnAKIpatients(self, df, add_stages = None, 
                          cond1time = None, cond2time = None, pad1time = None, pad2time = None):
        '''
        Returns patients with AKI according to the KDIGO guidelines. The KDIGO guidelines are as follows:

        * *Stage 1:* 0.3 increase in serum creatinine in < 48 hours OR 50% increase in serum creatinine in < 7 days (168 hours)
        * *Stage 2:* 100% increase in (or doubling of) serum creatinine in < 7 days (168 hours)
        * *Stage 3:* 200% increase in (our tripling of) serum creatinine in < 7 days (168 hours)

        More information can be found in the documentation at akiflagger.readthedocs.io
        Args: 
            df (pd.DataFrame): Patient dataframe, should include some sort of patient and encounter identifier(s) and age, sex, race, serum creatinine and timestamps.
        Returns:
            df (pd.DataFrame): Patient dataframe with AKI patients identified. 

        Raises:
            AssertionError: If the dataframe is missing an expected column; e.g. if there is no age/sex/race and eGFR_impute is True.

        '''
        if add_stages is None:
            add_stages = self.add_stages
        self.add_stages = add_stages
        
        if cond1time is None:
            cond1time = self.cond1time
        self.cond1time = cond1time
        
        if cond2time is None:
            cond2time = self.cond2time
        self.cond2time = cond2time
        
        if pad1time is None:
            pad1time = self.pad1time
        self.pad1time = pad1time
        
        if pad2time is None:
            pad2time = self.pad2time
        self.pad2time = pad2time
        
        if self.admission not in df.columns:
            df = self.addAdmissionColumn(df, add_encounter_col = self.encounter_id not in df.columns)
            
        if self.rolling_window: 
            df = df.groupby(self.patient_id, sort=False, as_index=False).apply(lambda d: self.addRollingWindowAKI(d))
            df = df.reset_index(level=0, drop=True).reset_index() #this will leave time as the index
            
        if self.back_calculate:
            df = df.groupby(self.patient_id, sort=False).apply(lambda d: self.addBaselineCreat(d, 
                                                                                               eGFR_impute=eGFR_impute))

            df = df.groupby(self.encounter_id, sort=False).apply(lambda d: self.addBackCalcAKI(d,
                                                                                  cond1time = self.cond1time,
                                                                                  cond2time = self.cond2time))
            df = df.reset_index(self.encounter_id, drop=True, as_index=False).reset_index()
            
        if not self.keep_extra_cols:
            if self.rolling_window or self.aki_calc_type != 'back_calculate':
                df = df.drop([self.mint1_colname, self.mint2_colname, self.delt1_colname, self.delt2_colname], axis=1)
        return df
    
    def addAdmissionColumn(self, df, add_encounter_col = None):

        pat_gb = df.groupby(self.patient_id)

        #Check for those rows which are all inpatient; e.g. a hospital visit
        df.loc[:, 'all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
        df.loc[:, 'all_inp'] = df.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)

        df.loc[:,'admission'] = df.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
        df.loc[:,'admission'] = df[np.logical_or(df.admission, df.all_inp)].time
        df.loc[:,'admission'] = pat_gb.admission.apply(lambda s: s.bfill().ffill())

        if add_encounter_col:
            df.loc[:, 'enc'] = df.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
            df.loc[df.enc, 'enc'] = np.arange(1, df.enc.sum()+1)
            df.loc[df.enc == False, 'enc'] = np.nan
            df.loc[:,'enc'] = pat_gb.enc.apply(lambda s: s.bfill().ffill())
        df = df.drop(['all_inp'], axis=1)
        
        return df
    
    def addRollingWindowAKI(self, df):
        '''
        Adds the AKI conditions based on rolling window definition: 

        * *Stage 1:* 0.3 increase in serum creatinine in < 48 hours OR 50% increase in serum creatinine in < 7 days (168 hours)
        * *Stage 2:* 100% increase in (or doubling of) serum creatinine in < 48 hours
        * *Stage 3:* 200% increase in (our tripling of) serum creatinine in < 48 hours

        Args: 
            df (pd.DataFrame): dataframe, typically of a single encounter.
            add_stages (bool): boolean, default **True**. 
                Choose whether or not to delineate the rolling-window AKI into the three stages (if False it will just lump Stage 1/2/3 into a boolean True/False)
            cond1time (str): string, default **'48hours'**. 
                The amount of time for the rolling-window according to the first criterion; i.e. 0.3 increase in creatinine in ``cond1time`` hours. 
            cond2time (str): string, default **'168hours'**. 
                The amount of time for the rolling-window according to the second criterion; i.e. 50% increase in creatinine in ``cond2time`` hours.

        Returns: 
            df (pd.DataFrame): dataframe with rolling-window aki values added in
        '''
        df = df.set_index(self.time).sort_index()
        df_rw = df.loc[df.admission[0] - pd.Timedelta(hours=172):]
        
        t1 = pd.Timedelta(self.cond1time) + pd.Timedelta(self.pad1time)
        t2 = pd.Timedelta(self.cond2time) + pd.Timedelta(self.pad2time)
        
        minc_t1, minc_t2 = np.empty(df.shape[0]), np.empty(df.shape[0])
        minc_t1[:], minc_t2[:] = np.nan, np.nan 
        minc_t1[df.shape[0]-df_rw.shape[0]:] = df_rw[self.creatinine].rolling(t1, min_periods=1).min().values
        minc_t2[df.shape[0]-df_rw.shape[0]:] = df_rw[self.creatinine].rolling(t2, min_periods=1).min().values

        self.mint1_colname = 'mincreat_{}'.format(t1.days*24 + t1.seconds // 3600)
        self.mint2_colname = 'mincreat_{}'.format(t2.days*24 + t2.seconds // 3600)
        self.delt1_colname = 'deltacreat_{}'.format(t1.days*24 + t1.seconds // 3600)
        self.delt2_colname = 'deltacreat_{}'.format(t2.days*24 + t2.seconds // 3600)
        
        df[self.mint1_colname] = minc_t1
        df[self.mint2_colname] = minc_t2

        df[self.delt1_colname] = np.round(df.creat - df[self.mint1_colname], decimals = 5)
        df[self.delt2_colname] = np.round(df.creat - df[self.mint2_colname], decimals = 5)

        if self.add_stages:
            df['stage1'] = (df[self.delt1_colname] >= 0.3) | (df[self.delt2_colname] >= 0.5*df[self.mint2_colname])
            df['stage2'] = df[self.delt2_colname] >= 2*df[self.mint2_colname]
            df['stage3'] = df[self.delt2_colname] >= 3*df[self.mint2_colname]
            
        df['rollingwindow_aki'] = 1*df.stage1 + 1*df.stage2 + 1*df.stage3 if self.add_stages else (df[self.delt1_colname] >= 0.3) | (df[self.delt2_colname] >= 0.5*df[self.mint2_colname])
  
        return df

### First, checking population-level values to make sure the output hasn't changed

In [6]:
%%time
flagger = AKIFlagger(rolling_window = True, creatinine = 'creat', add_stages = True)
rw = flagger.returnAKIpatients(df)
print('Stage 1: {} vs {}'.format(rw.stage1.sum(), p.stage1.sum()))
print('Stage 2: {} vs {}'.format(rw.stage2.sum(), p.stage2.sum()))
print('Stage 3: {} vs {}'.format(rw.stage3.sum(), p.stage3.sum()))
print('Unique mrns: {} vs {}'.format(rw.mrn.unique().shape, p.mrn.unique().shape))
print('Unique encs: {} vs {}'.format(rw.enc.unique().shape, p.enc.unique().shape))
print('All match, great!')

Stage 1: 655 vs 655
Stage 2: 21 vs 21
Stage 3: 15 vs 15
Unique mrns: (419,) vs (419,)
Unique encs: (496,) vs (496,)
All match, great!
Wall time: 5.5 s


### Now against a dataframe with the bare minimum requirements (no admission or encounter columns)

In [9]:
%%time
noad = df.loc[:,['mrn', 'creat', 'time', 'inpatient']]
rw = flagger.returnAKIpatients(noad)
print('Stage 1: {} vs {}'.format(rw.stage1.sum(), p.stage1.sum()))
print('Stage 2: {} vs {}'.format(rw.stage2.sum(), p.stage2.sum()))
print('Stage 3: {} vs {}'.format(rw.stage3.sum(), p.stage3.sum()))
print('Unique mrns: {} vs {}'.format(rw.mrn.unique().shape, p.mrn.unique().shape))
print('Unique encs: {} vs {}'.format(rw.enc.unique().shape, p.enc.unique().shape))
print('If the numbers are pretty close, we\'re happy with the imputation performed.')

Stage 1: 658 vs 655
Stage 2: 21 vs 21
Stage 3: 15 vs 15
Unique mrns: (419,) vs (419,)
Unique encs: (482,) vs (496,)
If the numbers are pretty close, we're happy with the imputation performed.
Wall time: 7.09 s


## Code for generating admission column with intermediate steps as separate columns & edge-case checks

In [10]:
#Check to make sure that when switching from mrn to mrn it's still capturing admissions 
c1 = np.logical_and(noad.inpatient, noad.inpatient.shift(1)) #where T & T
c2 = np.logical_and(noad.mrn != noad.mrn.shift(1), ~noad.all_inp)
noad[np.logical_and(c1, c2)].head(10)

#Verify that allinp is capturing the first 
pat_gb = noad.groupby('mrn')
noad['all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
noad['allinp'] = noad.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)
#noad[np.logical_and(c1, c2)]
np.all(noad[noad.all_inp].groupby('mrn').head(1) == noad[noad.allinp])

True

In [11]:
%%time
noad = df.loc[:,['mrn', 'creat', 'time', 'inpatient']]
pat_gb = noad.groupby('mrn')

#Check for those rows which are all inpatient; e.g. a hospital visit
noad.loc[:,'all_inp'] = pat_gb.inpatient.transform(lambda d: np.all(d))
noad.loc[:,'all_inp'] = noad.all_inp & ~pat_gb.all_inp.shift(1, fill_value=False)

noad.loc[:,'admission'] = noad.inpatient & ~pat_gb.inpatient.shift(1, fill_value=False)
noad.loc[:,'admission'] = noad[np.logical_or(noad.admission, noad.all_inp)].time
noad.loc[:,'admission'] = pat_gb.admission.apply(lambda d: d.bfill().ffill())
#noad.head()

Wall time: 1.01 s
