## Imports

In [74]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time
import os

## Reading in files; managing columns

In [189]:
%%time
df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\output.csv')
df['time'] = pd.to_datetime(df.time)
df = df.drop(['Unnamed: 0'], axis=1)

yu = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\aki flagger covid creatinines with covidtest.csv')
yu['pat_mrn_id'] = yu.pat_mrn_id.str.strip('MR').astype('int') #indexing with integers is quicker
yu['time'] = pd.to_datetime(yu.time)
yu['covidtest'] = yu.covidtest.astype('bool')
yu.columns = ['mrn', #Renaming columns for ease
              'pat_enc_id',
              'time',
              'creat',
              'enc',
              'running_aki_stage',
              'historical_aki_stage',
              'covid'] 

yu['running_aki'] = yu.running_aki_stage > 0 
yu['historical_aki'] = yu.historical_aki_stage > 0

yu = yu.groupby('enc').apply(lambda d: d[~d.time.duplicated()])
yu = yu.reset_index(drop=True)

print(yu.shape)

(127254, 10)
Wall time: 24.8 s


In [190]:
%%time
df_subset = df.set_index(['mrn', 'enc', 'time']).loc[yu.set_index(['mrn','enc','time']).index]
df_subset = df_subset.reset_index().groupby('enc').apply(lambda d: d[~d.time.duplicated()])
df_subset = df_subset.reset_index(drop=True)
print(np.all(df_subset.time == yu.time))
print(np.all(df_subset.enc == yu.enc))
print(np.all(df_subset.mrn == yu.mrn))
print('Subset shape:',df_subset.shape, 'Yu\'s df shape:',yu.shape)
df_subset.loc[df_subset.rollingwindow_aki.isnull(), 'rollingwindow_aki'] = 0
df_subset.loc[df_subset.backcalc_aki.isnull(), 'backcalc_aki'] = 0

True
True
True
Subset shape: (127254, 17) Yu's df shape: (127254, 10)
Wall time: 34 s


### Check mismatch 

###### Back-calc match: 127105 / 127254; 99.92%

###### Rolling-window match: 127215 / 127254; 99.97%

In [191]:
print('Back-calc match:', (df_subset.backcalc_aki == yu.historical_aki).sum(), '/', df_subset.shape[0],
      ';',(df_subset.backcalc_aki == yu.historical_aki).sum() / df_subset.shape[0])
print('Rolling-window match:', (df_subset.rollingwindow_aki == yu.running_aki).sum(), '/', yu.shape[0],
      ';', (df_subset.rollingwindow_aki == yu.running_aki).sum() / yu.shape[0])

Back-calc match: 127175 / 127254 ; 0.9993791943671712
Rolling-window match: 127215 / 127254 ; 0.9996935263331604


### Back-calc mismatch

1248799, 4488514

In [216]:
yu_mismatch = yu.iloc[np.where(df_subset.backcalc_aki != yu.historical_aki)[0]] #we don't match on 75 patients
print(yu_mismatch.shape)
print(yu_mismatch.historical_aki.sum()) # He says they're all false

(79, 10)
0


In [215]:
df_mismatch = df_subset.iloc[np.where(df_subset.backcalc_aki != yu.historical_aki)[0]]
print(df_mismatch.shape)
print(df_mismatch.backcalc_aki.sum()) # I say they're all true

(79, 17)
79


### Rolling-window mismatch

We mismatch on mrn: 6496928, 6500618, 5446541, ...

In [213]:
yu_mismatch = yu.iloc[np.where(df_subset.rollingwindow_aki != yu.running_aki)[0]]
print(yu_mismatch.shape)
print(yu_mismatch.running_aki.sum()) #Again, he says all false

(39, 10)
0


In [214]:
df_mismatch = df_subset.iloc[np.where(df_subset.rollingwindow_aki != yu.running_aki)[0]]
print(df_mismatch.shape)
print(df_mismatch.rollingwindow_aki.sum())  #Again, I say all True

(39, 17)
39


## Reading in other dataframes, trying to make heads and tails of what data is included and what isn't

In [124]:
yu_covid_mrn = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines with covidtest mrn.csv'))
print(yu_covid_mrn.shape)

yu_covid = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines with covidtest.csv'))
print(yu_covid.shape)

yu = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines.csv')) #these were all tested for covid
print(yu.shape)

covid_test_result = pd.read_csv(os.path.join(baseFolder, 'covid test result.csv'))
covid_test_result['mrn'] = covid_test_result.pat_mrn_id.str.strip('MR').astype('int')
covid_test_result['covidtest'] = covid_test_result.covidtest.astype('bool')
print(covid_test_result.shape)

covid_creats = pd.read_csv(os.path.join(baseFolder, 'covid creatinines.csv'))

covid_creats['mrn'] = covid_creats.pat_mrn_id.str.strip('MR').astype('int')
print(covid_creats.shape, df.shape)

(1089, 3)
(127272, 8)
(127272, 6)
(25265, 5)
(299718, 12) (215121, 17)


In [233]:
%%time
indices = [df.mrn[indx] in covid_test_result.mrn.values for indx in range(df.shape[0])]
yu_indices = [yu.mrn[indx] in covid_test_result.mrn.values for indx in range(yu.shape[0])]
cc_indices = [covid_creats.mrn[indx] in yu.mrn.values for indx in range(covid_creats.shape[0])]
df_indices = [df.mrn[indx] in yu.mrn.values for indx in range(df.shape[0])]
df_enc_indices = [df.enc[indx] in yu.enc.values for indx in range(df.shape[0])]
covid_mrn_indices = [df.mrn[indx] in covid_result.mrn.values for indx in range(df.shape[0])]
covid_enc_indices = [df.enc[indx] in covidpos.enc.values for indx in range(df.shape[0])]

Wall time: 1min 25s


In [4]:
%%time
baseFolder = r'H:\Data\Standardized AKI definition\dataset'
covid_result = pd.read_csv(os.path.join(baseFolder, 'covid test result.csv'))
covid_result['mrn'] = covid_result.pat_mrn_id.str.strip('MR').astype('int')
covid_result['enc'] = covid_result.enc_id
covid_result['covidtest'] = covid_result.covidtest.astype('bool')
covid_result['time'] = pd.to_datetime(covid_result.time)
covid_result = covid_result.drop(['pat_enc_csn_id','pat_mrn_id', 'enc_id'], axis=1)

print('covid_result shape:',covid_result.shape, 'df shape:',df.shape,'yu shape:',yu.shape)
print('yu encs:', yu.enc.unique().shape[0], 'yu mrns:', yu.mrn.unique().shape[0])
print('covid+ tests:', covid_result.covidtest.sum())
print('covid encs:',covid_result.enc.unique().shape[0], 'covid mrns:',covid_result.mrn.unique().shape[0])

covid_result shape: (25265, 4) df shape: (295435, 17) yu shape: (127254, 10)
yu encs: 15730 yu mrns: 13821
covid+ tests: 4688
covid encs: 17441 covid mrns: 15454
Wall time: 120 ms
