In [1]:
import datetime
import os
os.chdir('../')
from utils.hosp_preprocess_util import *

In [2]:
mimic4_path = "./mimic-iv-1.0/"

adm = read_admissions_table(mimic4_path)
pts = read_patients_table(mimic4_path)

In [3]:
# Get the MAXIMUM possible year that the anchor_year corresponds to
pts['min_year_group'] = pts.anchor_year_group.str.slice(start=0, stop=4).astype(int)
pts['max_year_group'] = pts.anchor_year_group.str.slice(start=-4).astype(int)

# Youngest possible age occurs by finding the diff between the max year and 2008, then subtracting from anchor_age
# This ensures you don't accidentally include someone who might've been <18 in that anchor year range during the 2008-2019 window
pts['min_age'] = pts.anchor_age.values - (pts.max_year_group.values - 2008)

pts = pts.loc[pts.min_age >= 18]

In [4]:
# Merge pts with adm to ensure patients MUST be at least 18 when considering an admission
adm_cohort = adm.merge(pts[['subject_id', 'anchor_year','max_year_group','min_year_group']], how='inner', left_on=['subject_id'], right_on=['subject_id'])
adm_cohort['admit_year'] = adm_cohort.admittime.dt.year
adm_cohort

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,ethnicity,anchor_year,max_year_group,min_year_group,admit_year
0,10292548,26653546,2120-01-07 05:51:00,2120-01-12 13:45:00,NaT,ASIAN,2118,2010,2008,2120
1,10292548,24067979,2119-07-26 04:35:00,2119-07-27 19:35:00,NaT,ASIAN,2118,2010,2008,2119
2,19120008,24459786,2185-09-18 11:15:00,2185-09-20 15:30:00,NaT,ASIAN,2185,2010,2008,2185
3,19120008,28496347,2187-07-27 01:04:00,2187-08-02 18:16:00,NaT,ASIAN,2185,2010,2008,2187
4,19120008,27495024,2190-06-30 01:16:00,2190-07-18 18:07:00,NaT,ASIAN,2185,2010,2008,2190
...,...,...,...,...,...,...,...,...,...,...
442579,17137356,25525683,2125-01-28 18:59:00,2125-01-30 15:25:00,NaT,BLACK/AFRICAN AMERICAN,2125,2010,2008,2125
442580,11043367,25542639,2153-06-06 23:14:00,2153-06-09 12:31:00,NaT,OTHER,2151,2010,2008,2153
442581,11084272,25138318,2135-11-03 08:00:00,2135-11-07 14:26:00,NaT,OTHER,2135,2010,2008,2135
442582,14582634,24338797,2170-04-17 23:23:00,2170-04-21 14:22:00,NaT,BLACK/AFRICAN AMERICAN,2170,2013,2011,2170


In [5]:
adm_cohort['timedelta'] = get_range(adm_cohort, 'admit_year')
adm_cohort

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,ethnicity,anchor_year,max_year_group,min_year_group,admit_year,timedelta
0,10292548,26653546,2120-01-07 05:51:00,2120-01-12 13:45:00,NaT,ASIAN,2118,2010,2008,2120,2.0
1,10292548,24067979,2119-07-26 04:35:00,2119-07-27 19:35:00,NaT,ASIAN,2118,2010,2008,2119,1.0
2,19120008,24459786,2185-09-18 11:15:00,2185-09-20 15:30:00,NaT,ASIAN,2185,2010,2008,2185,0.0
3,19120008,28496347,2187-07-27 01:04:00,2187-08-02 18:16:00,NaT,ASIAN,2185,2010,2008,2187,2.0
4,19120008,27495024,2190-06-30 01:16:00,2190-07-18 18:07:00,NaT,ASIAN,2185,2010,2008,2190,5.0
...,...,...,...,...,...,...,...,...,...,...,...
442579,17137356,25525683,2125-01-28 18:59:00,2125-01-30 15:25:00,NaT,BLACK/AFRICAN AMERICAN,2125,2010,2008,2125,0.0
442580,11043367,25542639,2153-06-06 23:14:00,2153-06-09 12:31:00,NaT,OTHER,2151,2010,2008,2153,2.0
442581,11084272,25138318,2135-11-03 08:00:00,2135-11-07 14:26:00,NaT,OTHER,2135,2010,2008,2135,0.0
442582,14582634,24338797,2170-04-17 23:23:00,2170-04-21 14:22:00,NaT,BLACK/AFRICAN AMERICAN,2170,2013,2011,2170,3.0


In [6]:
adm_cohort = adm_cohort.dropna(subset=['timedelta'])
death_ids = adm_cohort.loc[(adm_cohort.timedelta <= 6) & (~adm_cohort.deathtime.isna())]
alive_df = adm_cohort.loc[~adm_cohort.subject_id.isin(death_ids)]

In [7]:
# Set of IDs where a patient has admission history within 2008-2016 AND a CKD diagnosis within 2017-2019 (were readmitted in 2017-2019)
case = set(alive_df.loc[alive_df.timedelta <= 6].subject_id.unique()).intersection(set(alive_df.loc[alive_df.timedelta > 6].subject_id.unique()))
case = pd.Series(list(case), name='subject_id')
print("IDs with history in 2008-2016 AND admission in 2017-2019:      ", len(case))

# Set of IDs where a patient has admission history within 2008-2016 but not admitted in 2017-2019
ctrl = set(alive_df.loc[alive_df.timedelta <= 6].subject_id.unique()).difference(set(alive_df.loc[alive_df.timedelta > 6].subject_id.unique()))
ctrl = pd.Series(list(ctrl), name='subject_id')
print("IDs with history in 2008-2016 but no admission in 2017-2019:    ", len(ctrl))

IDs with history in 2008-2016 AND admission in 2017-2019:       24789
IDs with history in 2008-2016 but no admission in 2017-2019:     119259


In [10]:
caseinfo = alive_df.merge(case, how='inner', left_on='subject_id', right_on='subject_id')
caseinfo = caseinfo.loc[caseinfo.timedelta <= 6]

ctrlinfo = alive_df.merge(ctrl, how='inner', left_on='subject_id', right_on='subject_id')

In [11]:
adm_groups = caseinfo.groupby(by='subject_id')
record_df = pd.DataFrame({
        'subject_id':[x for x,y in adm_groups],
        'admit_year_range': [y.admit_year.max() - y.admit_year.min() for x,y in adm_groups],
        'admit_days_range': [y.admittime.max() - y.admittime.min() for x,y in adm_groups],
        'record_count': [y.shape[0] for x, y in adm_groups],
        'unique_admit_years': [y.admit_year.nunique() for x, y in adm_groups]
    })

record_df.describe()

Unnamed: 0,subject_id,admit_year_range,admit_days_range,record_count,unique_admit_years
count,24789.0,24789.0,24789,24789.0,24789.0
mean,14997120.0,1.385171,524 days 06:08:44.039695024,3.148655,1.80326
std,2881628.0,2.066843,760 days 22:03:01.764682280,4.736366,1.300865
min,10000980.0,0.0,0 days 00:00:00,1.0,1.0
25%,12493670.0,0.0,0 days 00:00:00,1.0,1.0
50%,15004290.0,0.0,21 days 10:41:00,2.0,1.0
75%,17481340.0,3.0,994 days 11:53:00,3.0,2.0
max,19999040.0,6.0,2550 days 05:59:00,169.0,7.0


In [12]:
adm_groups = ctrlinfo.groupby(by='subject_id')
record_df = pd.DataFrame({
        'subject_id':[x for x,y in adm_groups],
        'admit_year_range': [y.admit_year.max() - y.admit_year.min() for x,y in adm_groups],
        'admit_days_range': [y.admittime.max() - y.admittime.min() for x,y in adm_groups],
        'record_count': [y.shape[0] for x, y in adm_groups],
        'unique_admit_years': [y.admit_year.nunique() for x, y in adm_groups]
    })

record_df.describe()

Unnamed: 0,subject_id,admit_year_range,admit_days_range,record_count,unique_admit_years
count,119259.0,119259.0,119259,119259.0,119259.0
mean,15018930.0,0.577474,215 days 03:46:59.112519808,1.915009,1.325644
std,2883892.0,1.324439,482 days 20:00:43.711726152,2.272057,0.736203
min,10000030.0,0.0,0 days 00:00:00,1.0,1.0
25%,12520900.0,0.0,0 days 00:00:00,1.0,1.0
50%,15026500.0,0.0,0 days 00:00:00,1.0,1.0
75%,17514390.0,0.0,84 days 21:35:30,2.0,1.0
max,19999990.0,6.0,2546 days 00:41:00,94.0,7.0


In [24]:
adm_cohort.loc[adm_cohort.timedelta <= 6].subject_id.nunique()

144048

In [26]:
cohort = adm_cohort.merge(
    pd.concat([pd.concat([case, pd.Series(np.ones(len(case)), name='label', dtype=int)], axis=1), pd.concat([ctrl, pd.Series(np.zeros(len(ctrl)), name='label', dtype=int)], axis=1)], axis=0),
    how='inner',
    left_on='subject_id',
    right_on='subject_id'
)

In [28]:
cohort.subject_id.nunique()

144048

In [36]:
cohort.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,ethnicity,anchor_year,max_year_group,min_year_group,admit_year,timedelta,label
0,10292548,26653546,2120-01-07 05:51:00,2120-01-12 13:45:00,NaT,ASIAN,2118,2010,2008,2120,2.0,0
1,10292548,24067979,2119-07-26 04:35:00,2119-07-27 19:35:00,NaT,ASIAN,2118,2010,2008,2119,1.0,0
2,19120008,24459786,2185-09-18 11:15:00,2185-09-20 15:30:00,NaT,ASIAN,2185,2010,2008,2185,0.0,0
3,19120008,28496347,2187-07-27 01:04:00,2187-08-02 18:16:00,NaT,ASIAN,2185,2010,2008,2187,2.0,0
4,19120008,27495024,2190-06-30 01:16:00,2190-07-18 18:07:00,NaT,ASIAN,2185,2010,2008,2190,5.0,0


## Save Datasets

In [37]:
# Admissions info for our cohort
cohort.to_pickle("./data/adm_cohort.gzip", compression='gzip')

# Patients table with only the cohort subjects and the label col
pts.merge(cohort[['subject_id', 'label']].drop_duplicates(), how='inner', left_on='subject_id', right_on='subject_id').to_pickle("./data/cohort.pkl", compression='gzip')

In [35]:
pts.merge(cohort[['subject_id', 'label']].drop_duplicates(), how='inner', left_on='subject_id', right_on='subject_id')

Unnamed: 0,subject_id,gender,dod,anchor_age,anchor_year,anchor_year_group,yob,min_year_group,max_year_group,min_age,label
0,10018928,F,NaT,31,2125,2008 - 2010,2094,2008,2010,29,1
1,10074117,F,NaT,55,2118,2008 - 2010,2063,2008,2010,53,0
2,10076543,F,NaT,78,2187,2008 - 2010,2109,2008,2010,76,0
3,10098428,F,NaT,85,2119,2008 - 2010,2034,2008,2010,83,0
4,10127185,M,NaT,60,2141,2008 - 2010,2081,2008,2010,58,0
...,...,...,...,...,...,...,...,...,...,...,...
144043,19997448,F,NaT,52,2121,2014 - 2016,2069,2014,2016,44,1
144044,19997887,F,NaT,57,2112,2011 - 2013,2055,2011,2013,52,1
144045,19998203,M,NaT,29,2132,2011 - 2013,2103,2011,2013,24,0
144046,19998350,M,NaT,52,2127,2011 - 2013,2075,2011,2013,47,1
