## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime, random

from multiprocessing import Pool
import time
import os

## Reading in files; managing columns

In [2]:
%%time
df = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\output.csv')
df['time'] = pd.to_datetime(df.time)
df = df.drop(['Unnamed: 0'], axis=1)

yu = pd.read_csv(r'H:\Data\Standardized AKI definition\dataset\new aki flagger.csv')
yu['mrn'] = yu.pat_mrn_id.str.strip('MR').astype('int') #indexing with integers is quicker
yu['enc'] = yu.enc_id
yu['time'] = pd.to_datetime(yu.time)
#yu['covidtest'] = yu.covidtest.astype('bool')

# yu.columns = ['mrn', #Renaming columns for ease
#              'pat_enc_id',
#              'time',
#              'creat',
#              'enc',
#              'running_aki_stage',
#              'historical_aki_stage',
#              'covid'] 

yu['running_aki'] = yu.running_aki_stage > 0 
yu['historical_aki'] = yu.historical_aki_stage > 0

yu = yu.groupby('enc').apply(lambda d: d[~d.time.duplicated()])
yu = yu.reset_index(drop=True)

print(yu.shape)

(127254, 23)
Wall time: 49.9 s


In [3]:
%%time
df_subset = df.set_index(['mrn', 'enc', 'time']).loc[yu.set_index(['mrn','enc','time']).index]
df_subset = df_subset.reset_index().groupby('enc').apply(lambda d: d[~d.time.duplicated()])
df_subset = df_subset.reset_index(drop=True)
print(np.all(df_subset.time == yu.time))
print(np.all(df_subset.enc == yu.enc))
print(np.all(df_subset.mrn == yu.mrn))
print('Subset shape:',df_subset.shape, 'Yu\'s df shape:',yu.shape)
df_subset.loc[df_subset.rollingwindow_aki.isnull(), 'rollingwindow_aki'] = 0
df_subset.loc[df_subset.backcalc_aki.isnull(), 'backcalc_aki'] = 0

True
True
True
Subset shape: (127254, 17) Yu's df shape: (127254, 23)
Wall time: 34 s


### Check mismatch 

###### Back-calc match: 127254 / 127254 ; 100% Match 

###### Rolling-window match: 127236 / 127254; 99.99%

In [4]:
print('Back-calc match:', (df_subset.backcalc_aki == yu.historical_aki).sum(), '/', df_subset.shape[0],
      ';',(df_subset.backcalc_aki == yu.historical_aki).sum() / df_subset.shape[0])

print('Rolling-window match:', (df_subset.rollingwindow_aki == yu.running_aki).sum(), '/', yu.shape[0],
      ';', (df_subset.rollingwindow_aki == yu.running_aki).sum() / yu.shape[0])

Back-calc match: 127254 / 127254 ; 1.0
Rolling-window match: 127236 / 127254 ; 0.9998585506153048


### Back-calc mismatch - FULLY MATCHED NOW

~~1248799, 4488514~~

In [56]:
yu_mismatch = yu.iloc[np.where(df_subset.backcalc_aki != yu.historical_aki)[0]] #we don't match on 75 patients
print(yu_mismatch.shape)
print(yu_mismatch.historical_aki.sum()) # He says they're all false

(0, 23)
0


### Rolling-window mismatch

We mismatch on mrn: 2258332, 6492074, ...

In [5]:
yu_mismatch = yu.iloc[np.where(df_subset.rollingwindow_aki != yu.running_aki)[0]]
print(yu_mismatch.shape)
print(yu_mismatch.running_aki.sum()) #Again, he says all false

(18, 23)
17


In [6]:
yu_mismatch.mrn.unique()

array([2258332, 5446541,  503537, 6492074, 6510158, 1275931, 1914532,
         75841, 1199641, 4934022, 5447238, 3465859, 1983855, 2505988],
      dtype=int64)

In [7]:
df_mismatch = df_subset.iloc[np.where(df_subset.rollingwindow_aki != yu.running_aki)[0]]
print(df_mismatch.shape)
print(df_mismatch.rollingwindow_aki.sum())  #Again, I say all True

(18, 17)
1


In [16]:
#yu.loc[yu.mrn == 6492074]

In [17]:
#df_subset.loc[df_subset.mrn == 6492074]

## Reading in other dataframes, trying to make heads and tails of what data is included and what isn't

In [9]:
yu_covid_mrn = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines with covidtest mrn.csv'))
print(yu_covid_mrn.shape)

yu_covid = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines with covidtest.csv'))
print(yu_covid.shape)

yu = pd.read_csv(os.path.join(baseFolder, 'aki flagger covid creatinines.csv')) #these were all tested for covid
print(yu.shape)

covid_test_result = pd.read_csv(os.path.join(baseFolder, 'covid test result.csv'))
covid_test_result['mrn'] = covid_test_result.pat_mrn_id.str.strip('MR').astype('int')
covid_test_result['covidtest'] = covid_test_result.covidtest.astype('bool')
print(covid_test_result.shape)

covid_creats = pd.read_csv(os.path.join(baseFolder, 'covid creatinines.csv'))

covid_creats['mrn'] = covid_creats.pat_mrn_id.str.strip('MR').astype('int')
print(covid_creats.shape, df.shape)

NameError: name 'baseFolder' is not defined

In [None]:
%%time
indices = [df.mrn[indx] in covid_test_result.mrn.values for indx in range(df.shape[0])]
yu_indices = [yu.mrn[indx] in covid_test_result.mrn.values for indx in range(yu.shape[0])]
cc_indices = [covid_creats.mrn[indx] in yu.mrn.values for indx in range(covid_creats.shape[0])]
df_indices = [df.mrn[indx] in yu.mrn.values for indx in range(df.shape[0])]
df_enc_indices = [df.enc[indx] in yu.enc.values for indx in range(df.shape[0])]
covid_mrn_indices = [df.mrn[indx] in covid_result.mrn.values for indx in range(df.shape[0])]
covid_enc_indices = [df.enc[indx] in covidpos.enc.values for indx in range(df.shape[0])]

In [None]:
%%time
baseFolder = r'H:\Data\Standardized AKI definition\dataset'
covid_result = pd.read_csv(os.path.join(baseFolder, 'covid test result.csv'))
covid_result['mrn'] = covid_result.pat_mrn_id.str.strip('MR').astype('int')
covid_result['enc'] = covid_result.enc_id
covid_result['covidtest'] = covid_result.covidtest.astype('bool')
covid_result['time'] = pd.to_datetime(covid_result.time)
covid_result = covid_result.drop(['pat_enc_csn_id','pat_mrn_id', 'enc_id'], axis=1)

print('covid_result shape:',covid_result.shape, 'df shape:',df.shape,'yu shape:',yu.shape)
print('yu encs:', yu.enc.unique().shape[0], 'yu mrns:', yu.mrn.unique().shape[0])
print('covid+ tests:', covid_result.covidtest.sum())
print('covid encs:',covid_result.enc.unique().shape[0], 'covid mrns:',covid_result.mrn.unique().shape[0])