In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
with open('central_line_duration_df.pickle', 'rb') as read_file:
    df_cld = pickle.load(read_file)
df_cld.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38211 entries, 0 to 38210
Data columns (total 4 columns):
icustay_id        38181 non-null float64
starttime         38211 non-null datetime64[ns]
endtime           38211 non-null datetime64[ns]
duration_hours    38211 non-null float64
dtypes: datetime64[ns](2), float64(2)
memory usage: 1.2 MB


In [3]:
with open('diagnoses_hadmID_updated_codes_df.pickle', 'rb') as read_file:
    dia_hID = pickle.load(read_file)
dia_hID.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58976 entries, 0 to 58975
Data columns (total 22 columns):
CLABSI                    58976 non-null uint8
No diagnosis              58976 non-null uint8
blood                     58976 non-null uint8
circulatory               58976 non-null uint8
congenital                58976 non-null uint8
digestive                 58976 non-null uint8
endocrine                 58976 non-null uint8
external injury           58976 non-null uint8
genitourinary             58976 non-null uint8
infectious                58976 non-null uint8
injury and poisoning      58976 non-null uint8
mental                    58976 non-null uint8
misc                      58976 non-null uint8
muscular                  58976 non-null uint8
neoplasms                 58976 non-null uint8
nervous                   58976 non-null uint8
perinatal                 58976 non-null uint8
pregnancy                 58976 non-null uint8
respiratory               58976 non-null uint

In [4]:
with open('pia_redone_dummies.pickle', 'rb') as read_file:
    pia = pickle.load(read_file)
pia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53432 entries, 0 to 61531
Data columns (total 57 columns):
subject_id                                      53432 non-null float64
hadm_id                                         53432 non-null float64
icustay_id                                      53432 non-null float64
dbsource                                        53432 non-null object
last_careunit                                   53432 non-null object
first_wardid                                    53432 non-null float64
last_wardid                                     53432 non-null float64
intime                                          53432 non-null datetime64[ns]
outtime                                         53429 non-null datetime64[ns]
los                                             53429 non-null float64
admittime                                       53432 non-null datetime64[ns]
dischtime                                       53432 non-null datetime64[ns]
deathtime    

## Merge Strategy

Identifiers in each DataFrame
* pia - icustay_id (unique), hadm_id, subject_id
* cld - icustay_id
* dia - hadm_id

1. Merge pia onto CLD - Left join only interested in patients with central lines
3. Merge diagnostics onto this via hadm_id (unique to dia) - left join

In [8]:
# Merge - only looking at patients with central line/ catheders
df_merge = pd.merge(df_cld,pia,how='left', on='icustay_id')
df_merge.head()

Unnamed: 0,icustay_id,starttime,endtime,duration_hours,subject_id,hadm_id,dbsource,last_careunit,first_wardid,last_wardid,...,admission_location_TRANSFER FROM HOSP/EXTRAM,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,insurance_Medicaid,insurance_Medicare,insurance_Private,insurance_Self Pay,gender_F
0,200001.0,2181-11-25 22:04:00,2181-11-28 20:59:00,70.916667,55973.0,152234.0,metavision,MICU,23.0,23.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,200003.0,2199-08-03 01:00:00,2199-08-08 15:00:00,134.0,27513.0,163557.0,carevue,SICU,57.0,57.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,200009.0,2189-11-30 18:15:00,2189-12-02 08:00:00,37.75,29904.0,129607.0,carevue,CSRU,15.0,15.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,200010.0,2132-08-05 00:55:00,2132-08-05 20:36:00,19.683333,11861.0,192256.0,metavision,MICU,50.0,50.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,200021.0,2114-12-26 23:00:00,2114-12-27 22:46:00,23.766667,61691.0,109307.0,metavision,SICU,33.0,33.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [12]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38211 entries, 0 to 38210
Data columns (total 60 columns):
icustay_id                                      38181 non-null float64
starttime                                       38211 non-null datetime64[ns]
endtime                                         38211 non-null datetime64[ns]
duration_hours                                  38211 non-null float64
subject_id                                      38181 non-null float64
hadm_id                                         38181 non-null float64
dbsource                                        38181 non-null object
last_careunit                                   38181 non-null object
first_wardid                                    38181 non-null float64
last_wardid                                     38181 non-null float64
intime                                          38181 non-null datetime64[ns]
outtime                                         38181 non-null datetime64[ns]
los          

In [10]:
# check that all patients have central lines (duration is not null)
df_merge['duration_hours'].isna().sum()

0

In [15]:
merge_all = pd.merge(df_merge,dia_hID,how='left', on='hadm_id')
merge_all.head()

Unnamed: 0,icustay_id,starttime,endtime,duration_hours,subject_id,hadm_id,dbsource,last_careunit,first_wardid,last_wardid,...,mental,misc,muscular,neoplasms,nervous,perinatal,pregnancy,respiratory,skin,special considerations
0,200001.0,2181-11-25 22:04:00,2181-11-28 20:59:00,70.916667,55973.0,152234.0,metavision,MICU,23.0,23.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,3.0
1,200003.0,2199-08-03 01:00:00,2199-08-08 15:00:00,134.0,27513.0,163557.0,carevue,SICU,57.0,57.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,200009.0,2189-11-30 18:15:00,2189-12-02 08:00:00,37.75,29904.0,129607.0,carevue,CSRU,15.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,200010.0,2132-08-05 00:55:00,2132-08-05 20:36:00,19.683333,11861.0,192256.0,metavision,MICU,50.0,50.0,...,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,200021.0,2114-12-26 23:00:00,2114-12-27 22:46:00,23.766667,61691.0,109307.0,metavision,SICU,33.0,33.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
