## Imports

In [None]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time

## Reading in file; managing columns

In [None]:
%%time
# Load the flagger output and parse its timestamp column.
df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\output.csv')
df['time'] = pd.to_datetime(df['time'])
# 'Unnamed: 0' is presumably an index column saved by an earlier to_csv -- confirm.
df = df.drop(columns=['Unnamed: 0'])

# Load the second table (creatinines with running/historical stages and the COVID test flag).
yu = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\aki flagger covid creatinines with covidtest.csv')
# Strip leading/trailing 'M' and 'R' characters so MRNs can be cast to integers
# (indexing with integers is quicker).
yu['pat_mrn_id'] = yu['pat_mrn_id'].str.strip('MR').astype('int')
yu['time'] = pd.to_datetime(yu['time'])
yu['covidtest'] = yu['covidtest'].astype('bool')
# Positional rename: this relies on the CSV having exactly these eight columns in this order.
yu.columns = [
    'mrn',
    'pat_enc_id',
    'time',
    'creat',
    'enc',
    'running_aki_stage',
    'historical_aki_stage',
    'covid',
]

# Per encounter, keep only the first row for each timestamp, then flatten the
# index that groupby/apply produces back to a plain positional one.
yu = (
    yu.groupby('enc')
      .apply(lambda grp: grp[~grp['time'].duplicated()])
      .reset_index(drop=True)
)
#yu = yu.set_index(['mrn', 'enc', 'time'])
#yu.index = yu.sort_values(['time', 'enc', 'mrn']).index
print(yu.shape)

In [None]:
%%time
# Select the rows of df whose (mrn, enc, time) key occurs in yu, then apply the
# same per-encounter timestamp de-duplication that was applied to yu.
df_subset = (
    df.set_index(['mrn', 'enc', 'time'])
      .loc[yu.set_index(['mrn', 'enc', 'time']).index]
      .reset_index()
      .groupby('enc')
      .apply(lambda grp: grp[~grp['time'].duplicated()])
      .reset_index(drop=True)
)

# Sanity checks: the two frames should agree row-for-row on every key column.
print(np.all(df_subset['time'] == yu['time']))
print(np.all(df_subset['enc'] == yu['enc']))
print(np.all(df_subset['mrn'] == yu['mrn']))
#print('Subset shape:',df_subset.shape, 'Yu\'s df shape:',yu.shape, 'Original df shape:',df.shape)

In [None]:
# Replace missing rolling-window flags with 0 so they can be compared with yu's flag.
df_subset.loc[df_subset['rollingwindow_aki'].isna(), 'rollingwindow_aki'] = 0

In [None]:
# Collapse the running stage into a boolean flag: any stage above 0 counts as AKI.
yu['running_aki'] = yu['running_aki_stage'].gt(0)

In [None]:
# Display the df_subset rows whose rolling-window flag differs from yu's running flag.
# (The original line was missing the closing bracket of .iloc[...] and did not parse.)
df_subset.iloc[np.where(df_subset.rollingwindow_aki != yu.running_aki)[0]]

In [None]:
# Number of rows where the rolling-window flag equals yu's running flag.
(df_subset['rollingwindow_aki'] == yu['running_aki']).sum()

In [None]:
# Replace missing back-calculated flags with 0 before comparing against yu.
df_subset.loc[df_subset['backcalc_aki'].isna(), 'backcalc_aki'] = 0

In [None]:
# Hard-coded tallies: matching rows divided by total rows compared.
# NOTE(review): presumably copied from an earlier run of the comparison cells;
# they are not recomputed here -- confirm they are still current.
# (The original print had a stray ", ," in its arguments and did not parse.)
N_COMPARED_ROWS = 127254
backcalc_match_rate = 119982 / N_COMPARED_ROWS
rolling_match_rate = 123471 / N_COMPARED_ROWS
# The labels say "count" but the printed values are fractions of N_COMPARED_ROWS.
print('Back-calc match count:', backcalc_match_rate, 'Rolling-window match count:', rolling_match_rate)

In [None]:
# (rows where the back-calculated value equals yu's historical stage, shape of df_subset)
# NOTE(review): this compares against the raw stage, whereas the rolling-window
# check compares against a boolean (stage > 0) -- confirm that is intended.
(df_subset['backcalc_aki'] == yu['historical_aki_stage']).sum(), df_subset.shape

In [None]:
# Display the yu rows at the positions where the back-calculated value
# disagrees with yu's historical stage.
yu.iloc[
    np.where(df_subset['backcalc_aki'] != yu['historical_aki_stage'])[0]
]

In [None]:
# Spot-check: every df row for one specific MRN.
df[df['mrn'] == 2307280]

In [None]:
# Spot-check: every yu row for the same MRN, to compare side by side with df.
yu[yu['mrn'] == 2307280]

In [None]:
# Display the df_subset rows at the positions where the back-calculated value
# disagrees with yu's historical stage.
df_subset.iloc[
    np.where(df_subset['backcalc_aki'] != yu['historical_aki_stage'])[0]
]

In [None]:
%%time
# One boolean per df row: True when that row's MRN never appears in yu.
# The original rebuilt both MRN sets on every iteration (quadratic in the row
# count) and looked rows up by label, which only works with a default
# RangeIndex. A value in df.mrn is always in df's own MRN set, so membership in
# "df MRNs minus yu MRNs" reduces to "not in yu", which isin() computes in one pass.
mrn_diffs = (~df['mrn'].isin(yu['mrn'])).tolist()

In [None]:
# Shape of the array of distinct enc_id values in the COVID test-result file.
pd.read_csv(r"H:\Data\Standardized AKI definition\dataset\covid test result.csv")['enc_id'].unique().shape

## Read in COVID-19 cases, same analysis 

In [None]:
# Load the COVID creatinine table and reduce it to (mrn, enc, creat) indexed by time.
cdf = pd.read_csv('/Users/saranmedical-smile/csv_files/covid creatinines.csv')

# Bracket assignment is used throughout: attribute-style assignment
# (cdf.col = ...) only sets a plain Python attribute when the column is absent,
# so a renamed column would be missed silently.
# Strip leading/trailing 'M' and 'R' characters so MRNs can be cast to integers
# (integer keys index much quicker).
cdf['pat_mrn_id'] = cdf['pat_mrn_id'].str.strip('MR').astype('int')
cdf['time'] = pd.to_datetime(cdf['time'])  # Convert to pandas datetime format

# 'admission' and 'discharge' are no longer parsed to datetime first: they are
# dropped here, so the conversion was wasted work.
cdf = cdf.drop(
    columns=[
        'enc_id',
        'age',
        'sex',
        'inpatient',
        'race',
        'admission',
        'discharge',
    ]
)
print(cdf.dtypes)  # Confirm all the column types are as we want them

# Positional rename: this relies on exactly four columns remaining, in this order.
cdf.columns = [
    'mrn',
    'enc',
    'time',
    'creat',
]
cdf = cdf.set_index('time')

In [None]:
%%time
# Placeholder timing cell: the returnAKIpatients call is disabled, so only the
# completion message is printed.
#returnAKIpatients(cdf)
print("done!")

In [None]:
%%time
# Per-patient trailing minimum of creatinine over two time windows; the result
# is indexed by (mrn, time) with groups in first-appearance order (sort=False).
# NOTE(review): the windows are 52 h and 172 h, while later column names call
# them "48hrs" and "7days" -- presumably 4 h of slack was added on purpose; confirm.
x = (
    cdf.groupby('mrn', sort=False)['creat']
       .rolling(pd.Timedelta(hours=52), min_periods=1)
       .min()
)
y = (
    cdf.groupby('mrn', sort=False)['creat']
       .rolling(pd.Timedelta(hours=172), min_periods=1)
       .min()
)

In [None]:
# Move the current index back into columns, then index by (enc, time).
cdf = cdf.reset_index()
cdf = cdf.set_index(['enc', 'time'])

In [None]:
def _require_same_row_order(rolling_min):
    """Raise ValueError unless rolling_min's (mrn, time) index matches cdf row for row.

    The rolling minima are attached to cdf by position. If cdf's rows are not in
    the order the per-mrn grouping emitted, that assignment would pair each
    minimum with the wrong row without raising any error.
    """
    same_mrn = np.array_equal(rolling_min.index.get_level_values(0).values,
                              cdf['mrn'].values)
    same_time = np.array_equal(rolling_min.index.get_level_values(-1).values,
                               cdf.index.get_level_values('time').values)
    if not (same_mrn and same_time):
        raise ValueError('rolling minima are not in the same row order as cdf; '
                         'sort cdf by mrn and time before computing them')


_require_same_row_order(x)
_require_same_row_order(y)

# Attach the trailing minima by position (row order verified above).
cdf['running_min_48hrs'] = x.values
cdf['running_min_7days'] = y.values

# Rise of each creatinine over its trailing minimum, rounded to 3 decimals.
cdf['running_delta_48hrs'] = (cdf['creat'] - cdf['running_min_48hrs']).round(3)
cdf['running_delta_7days'] = (cdf['creat'] - cdf['running_min_7days']).round(3)

# Flag AKI when the short-window rise is at least 0.3, or when the long-window
# rise is at least half of the long-window minimum.
condition1 = cdf['running_delta_48hrs'] >= 0.3
condition2 = cdf['running_delta_7days'] >= 0.5*cdf['running_min_7days']

cdf['aki'] = condition1 | condition2

In [None]:
# Turn 'enc' back into a column, order the rows by the remaining index, and keep
# a running total of AKI-flagged rows.
# NOTE(review): the running total spans the whole table, not each patient or
# encounter -- confirm that is intended.
cdf = cdf.reset_index('enc').sort_index()
cdf['aki_cumsum'] = cdf['aki'].cumsum()
#cdf['aki_cumsum_7d'] = cdf.reset_index('enc').aki.rolling(pd.Timedelta('7days')).cumsum()

## Reading in df

In [None]:
%%time

# Read in the dataframe -- uncomment the line for the dataset you are working on.

#df = pd.read_csv('~/Desktop/patr/StandardizingAKI/inpatient 2014-2018 creatinine.csv') #1441707, 4
#df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\inpatient 2014-2018 creatinine.csv')
#df = pd.read_csv('/Users/saranmedical-smile/csv_files/inpatient 2014-2018 creatinine.csv')
#df = pd.read_csv('/Users/saranmedical-smile/csv_files/covid creatinines.csv')
df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\aki flagger covid creatinines with covidtest.csv')

# Strip leading/trailing 'M' and 'R' characters so MRNs can be cast to integers
# (integer keys index much quicker).
df['pat_mrn_id'] = df['pat_mrn_id'].str.strip('MR').astype('int')
df['time'] = pd.to_datetime(df['time'])  # Convert to pandas datetime format
df['covidtest'] = df['covidtest'].astype('bool')
print(df.dtypes)  # Confirm all the column types are as we want them

#df.drop(['running_aki_stage',
#         'historical_aki_stage'], axis=1, inplace=True)
# Positional rename: this relies on the CSV having exactly these eight columns in this order.
df.columns = [
    'mrn',
    'pat_enc_id',
    'time',
    'creat',
    'enc',
    'running_aki_stage',
    'historical_aki_stage',
    'covid',
]

print(len(df['enc'].unique()), "unique encounters")
#df.set_index(['time'], inplace=True)
#df.set_index(['mrn', 'enc'], inplace=True) #Turn the index into a hierarchical tuple (mrn, enc)