## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool

## Reading in file; managing columns

In [2]:
%%time
df = pd.read_csv('/Users/saranmedical-smile/Desktop/patr/StandardizingAKI/inpatient 2014-2018 creatinine.csv') #1441707, 4
df.pat_mrn_id = df.pat_mrn_id.str.strip('MR').astype('int')
df.time = pd.to_datetime(df.time)
print(df.dtypes)
df.columns = ['mrn',
              'enc',
              'time',
              'creat']
df.set_index(['mrn', 'enc'], inplace=True)

pat_mrn_id                 int64
pat_enc_csn_id             int64
time              datetime64[ns]
creatinine               float64
dtype: object
CPU times: user 2.07 s, sys: 238 ms, total: 2.31 s
Wall time: 2.34 s


## Creating $\Delta$creat and $\Delta$t variables

In [3]:
delta_df = df.groupby(['mrn', 'enc']).shift(-1) - df
df['delta_creat'] = delta_df['creat'].shift(1)
df['delta_time'] = delta_df['time'].shift(1)
#df[['delta_time, delta_creat']] = delta_df[['time', 'creatinine']]

## Two criterion for rolling-window definition of AKI:

#### *$creat \uparrow$ of 0.3 in < 48 hrs* OR *$creat \uparrow$ of 50% in < 7 days*

In [4]:
#Criterion 1
print(np.where(np.logical_and(df.delta_creat > 0.3, df.delta_time < datetime.timedelta(days=2))))

#Criterion 2
print(np.where(np.logical_and(df.delta_creat > df.creat.shift(1)*0.5, df.delta_time < datetime.timedelta(days=7))))

condition1 = np.logical_and(df.delta_creat > 0.3, df.delta_time < datetime.timedelta(hours=48))
condition2 = np.logical_and(df.delta_creat > df.creat.shift(1)*0.5, df.delta_time < datetime.timedelta(days=7))

(array([     95,     256,     282, ..., 1441489, 1441503, 1441653]),)
(array([     95,     256,     354, ..., 1441402, 1441418, 1441653]),)


In [5]:
#Quick question to ask: say a patient comes in on a separate encounter within 7 days
#and their creatinine has gone up by 50% since their previous encounter. Does that count as AKI?

#((df.groupby(['pat_mrn_id']).shift(-1) - df).time < datetime.timedelta(days=7)).sum()
#((df.groupby(['pat_mrn_id', 'pat_enc_csn_id']).shift(-1) - df).time < datetime.timedelta(days=7)).sum()

#df['AKI'] = np.logical_or(condition1, condition2)

#Question: Once you've got AKI you've got AKI for the rest of the encounter, right?
# df.loc['MR1000136']

#Condition 1 examples:
# MR1000136, MR1000468, MR1000507

#Condition 2 examples:
#MR1001546, MR1001806, 

#df[np.logical_xor(condition1, condition2)] gives unique examples of C1 or C2

In [6]:
firstencs = df.reset_index().drop_duplicates('mrn') #df.groupby(['mrn']).head(1)
df['first_enc'] = [i in firstencs.index for i in range(df.shape[0])]
df.reset_index(inplace=True)
df.head()

Unnamed: 0,mrn,enc,time,creat,delta_creat,delta_time,first_enc
0,1000041,115884935,2015-06-14 20:59:00,1.6,,NaT,True
1,1000041,115884935,2015-06-15 07:54:00,0.9,-0.7,0 days 10:55:00,False
2,1000041,115884935,2015-06-16 07:02:00,0.9,0.0,0 days 23:08:00,False
3,1000041,115884935,2015-06-17 07:11:00,0.9,0.0,1 days 00:09:00,False
4,1000041,117378943,2015-07-18 08:39:00,1.1,,NaT,False


In [7]:
#An idea of the numbers of those who have AKI according to the rolling window definition
print('creat > 0.3:', (df.delta_creat > 0.3).sum())
print('time < 48hrs:',(df.delta_time < datetime.timedelta(hours=48)).sum())
print('cond1:', np.logical_and(df.delta_creat > 0.3, df.delta_time < datetime.timedelta(hours=48)).sum())

print('creat_t > 1.5*creat_{t-1}:', (df.delta_creat > df.creat.shift(1)*0.5).sum())
print('time < 48hrs:',(df.delta_time < datetime.timedelta(days=7)).sum())
print('cond2:', np.logical_and(df.delta_creat > df.creat.shift(1)*0.5, df.delta_time < datetime.timedelta(days=7)).sum())

print('cond1 OR cond2:', np.logical_or(condition1, condition2).sum())

creat > 0.3: 63893
time < 48hrs: 1193853
cond1: 60601
creat_t > 1.5*creat_{t-1}: 11899
time < 48hrs: 1234986
cond2: 11825
cond1 OR cond2: 62992


## Defining *add_rows()* function, which adds dummy rows to encode where back-calculation is necessary

In [8]:
def add_rows(df):
    patient_dfs = df.groupby(['mrn'])
    patients = dict()
    for mrn, pat_df in patient_dfs:
        patients[mrn] = pat_df
        backcalc_rows = np.where(pat_df.delta_creat.isnull())[0]

        indx=None
        rows_to_add = list()
        for indx in backcalc_rows:
            lookback_indices = list()
            init_indx = indx

            if indx is not None:
                while (pat_df.iloc[indx].time - pat_df.iloc[indx-1].time) < datetime.timedelta(days=365) and pat_df.iloc[indx].mrn == pat_df.iloc[indx-1].mrn:
                    if pat_df.iloc[init_indx].time - pat_df.iloc[indx-1].time > datetime.timedelta(days=7):
                        lookback_indices.append(indx-1)
                    if indx == 0:
                        break
                    indx -= 1
            lookback_indices = np.sort(lookback_indices)
            new_row = pat_df.iloc[[init_indx]]
            new_row.loc[:, 'creat'] = pat_df.iloc[lookback_indices].creat.mean()
            rows_to_add.append(new_row)

        mini_dfs = np.split(pat_df, backcalc_rows)
        for indx, mini_df in enumerate(mini_dfs[1:]):
            mini_dfs[indx+1] = pd.concat([rows_to_add[indx], mini_df])
        pat_df = pd.concat(mini_dfs)
        patients[mrn] = pat_df
    return pd.concat(list(patients.values()))

def parallelize_analysis(df, func=add_rows, num_cores = 4):
    split_dfs = np.array_split(df, num_cores)
    with Pool(num_cores) as p:
        df = pd.concat(p.map(func, split_dfs))
    return df

#Benchmark testing:
#For df of size 10000 add_rows(df) runs in 18.3 seconds, parallelize_analysis(df) runs in 11.5 s

In [13]:
#%%time
#d = parallelize_analysis(df)

%%time
df.reset_index(inplace=True)
patient_dfs = df.groupby(['mrn'])
patients = dict()
for mrn, pat_df in patient_dfs:
    patients[mrn] = pat_df
    backcalc_rows = np.where(pat_df.delta_creat.isnull())[0]

    indx=None
    rows_to_add = list()
    for indx in backcalc_rows:
        lookback_indices = list()
        init_indx = indx

        if indx is not None:
            while (pat_df.iloc[indx].time - pat_df.iloc[indx-1].time) < datetime.timedelta(days=365) and pat_df.iloc[indx].mrn == pat_df.iloc[indx-1].mrn:
                if pat_df.iloc[init_indx].time - pat_df.iloc[indx-1].time > datetime.timedelta(days=7):
                    lookback_indices.append(indx-1)
                if indx == 0:
                    break
                indx -= 1
        lookback_indices = np.sort(lookback_indices)
        new_row = pat_df.iloc[[init_indx]]
        new_row.loc[:, 'creat'] = pat_df.iloc[lookback_indices].creat.mean()
        rows_to_add.append(new_row)

    mini_dfs = np.split(pat_df, backcalc_rows)
    for indx, mini_df in enumerate(mini_dfs[1:]):
        mini_dfs[indx+1] = pd.concat([rows_to_add[indx], mini_df])
    pat_df = pd.concat(mini_dfs)
    patients[mrn] = pat_df
pd.concat(list(patients.values()))

In [10]:
def eGFR(creat, age, black, female):
    '''
    Calculates the estimated glomerular filtration rate based on the serum creatinine levels, age, sex, and race (black or not black);
    Based on the formula in the paper A New Equation to Estimate Glomerular Filtration Rate (Levey et. Al, 2009) linked below
    
    https://pubmed.ncbi.nlm.nih.gov/19414839/
    
    '''
    #Term 2 - np.clip(creat/(0.9-0.2*female, a_min=1, a_max=None) is the same as taking min(1, creat/k)
    #Term 3 - np.clip(creat/(0.9-0.2*female, a_min=None, a_max=None) is the same as taking max(1, creat/k)
    #where k is the data-derived constant given in the paper: 0.7 for females and 0.9 for males
    
    return 141*(np.clip(creat/(0.9-0.2*female), a_min=1, a_max=None)**(-0.411+0.082*female))*(np.clip(creat/(0.9-0.2*female), a_min=None, a_max=1)**-1.209)*(0.993**age)*(1+female*0.018)*(1+black*0.159)

#Sample test data
creat = np.random.normal(loc=1, scale=0.2, size=10)
age = np.random.normal(loc=55, scale=10, size=10)
black = np.random.rand(10) > 0.5
female = np.random.rand(10) > 0.5

eGFR(creat, age, black, female) 
#values seem pretty reasonable (80-120)

array([113.89903888, 100.8786952 ,  87.46893682,  97.14803991,
        91.94999961, 112.69153215,  96.81254917, 129.33209018,
       126.9104726 ,  93.73315848])

In [11]:
#groupby and drop_duplicates operate similarly - groupby retains index dd doesn't
#df.groupby(['mrn']).head(1)
#df.reset_index().drop_duplicates('mrn')

In [12]:
df['age'] = None
df['sex'] = None
mrn_gb = df.groupby(['mrn'])
tmp = mrn_gb.head(1)
tmp['sex'] = np.random.rand(tmp.shape[0]) > 0.5
tmp['age'] = np.random.normal(loc=50, scale=10, size=tmp.shape[0])
df['age'] = tmp['age'] #So this works! I can match by index the ages & sex
df['sex'] = tmp['sex'] #even though df and tmp are different shapes... awesome! 
df['age'] = mrn_gb['age'].transform(lambda x: x.ffill())
df['sex'] = mrn_gb['sex'].transform(lambda x: x.ffill())
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


KeyboardInterrupt: 

#For pat mrn 212
##0 --> 0
#11 --> 0.29818181818181827
#18 --> 0.31500000000000006
#26 --> 0.33666666666666667

pat_df = patient_dfs.get_group(212)
#backcalc_rows = np.where(np.logical_and(pat_df.delta_creat.isnull(), ~pat_df.first_enc))[0]
backcalc_rows = np.where(pat_df.delta_creat.isnull())[0]
print(backcalc_rows)

indx=None
rows_to_add = list()
for indx in backcalc_rows:
    lookback_indices = list()
    init_indx = indx
    
    if indx is not None:
        while (pat_df.iloc[indx].time - pat_df.iloc[indx-1].time) < datetime.timedelta(days=365) and pat_df.iloc[indx].mrn == pat_df.iloc[indx-1].mrn:
            if pat_df.iloc[init_indx].time - pat_df.iloc[indx-1].time > datetime.timedelta(days=7):
                lookback_indices.append(indx-1)
            if indx == 0:
                break
            indx -= 1
    lookback_indices = np.sort(lookback_indices)
    new_row = pat_df.iloc[[init_indx]]
    new_row.loc[:, 'creat'] = pat_df.iloc[lookback_indices].creat.mean()
    rows_to_add.append(new_row)
    
mini_dfs = np.split(pat_df, backcalc_rows)

for indx, mini_df in enumerate(mini_dfs[1:]):
    mini_dfs[indx+1] = pd.concat([rows_to_add[indx], mini_df])
pat_df = pd.concat(mini_dfs)