In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline

import icu_data_defs
import transformers
import extract_transform_load as etl
from units import MedicalUreg
import mimic
from constants import column_names,variable_type,clinical_source

In [3]:
# Configuration files that drive the ETL run.
data_definitions_path = 'config/data_definitions.xlsx'
medical_units_path = 'config/medical_units.txt'

# Shared objects used by every later cell: the data dictionary, the unit
# registry, and the MIMIC category map built from the data dictionary.
data_dict = icu_data_defs.data_dictionary(data_definitions_path)
ureg = MedicalUreg(medical_units_path)
category_map = mimic.mimic_category_map(data_dict)

In [4]:
reload(transformers)

# Cleaning steps, listed in the order the pipeline runs them. The first step
# hands the aggregator a function that keeps the first row of each group.
cleaning_steps = [
    ('aggregate_same_datetime', transformers.same_index_aggregator(lambda grp: grp.iloc[0])),
    ('split_dtype', transformers.split_dtype()),
    ('standardize_columns', transformers.column_standardizer(data_dict, ureg)),
    ('standardize_categories', transformers.standardize_categories(data_dict, category_map)),
    ('split_bad_categories', transformers.split_bad_categories(data_dict)),
    ('one_hotter', transformers.nominal_to_onehot()),
    ('drop_oob_values', transformers.oob_value_remover(data_dict)),
]
cleaners = Pipeline(cleaning_steps)

In [5]:
reload(etl)
class ETLConverter(etl.ETLManager):
    """ETL manager that replays frames already saved in an HDF5 file.

    Instead of extracting and transforming from scratch, both steps read
    previously stored frames from ``SOURCE_HDF5_PATH`` under the keys
    ``extracted/<component>`` and ``transformed/<component>``.
    """

    # HDF5 file holding the previously saved extracted/transformed frames.
    SOURCE_HDF5_PATH = 'data/mimic_data.h5'
    # Number of leading rows of the stored transformed frame to keep.
    # Set to None to keep the whole frame.
    TRANSFORM_SAMPLE_ROWS = 1000

    def extract(self,component):
        """Return the frame stored under ``extracted/<component>``."""
        return pd.read_hdf(self.SOURCE_HDF5_PATH,'extracted/' + component)

    def transform(self,df,component):
        """Return the leading rows of the frame stored under ``transformed/<component>``.

        ``df`` is ignored: the transformed frame is read from disk rather than
        computed from it. At most ``TRANSFORM_SAMPLE_ROWS`` rows are returned.
        """
        df_transformed = pd.read_hdf(self.SOURCE_HDF5_PATH,'transformed/' + component)
        if self.TRANSFORM_SAMPLE_ROWS is None:
            return df_transformed
        return df_transformed.head(self.TRANSFORM_SAMPLE_ROWS)

    def extracted_ids(self,df_extracted):
        """Return the unique values of the ``column_names.ID`` column as a list."""
        return df_extracted[column_names.ID].unique().tolist()

    def extracted_data_count(self,df_extracted):
        """Return ``.count()`` of the ``column_names.VALUE`` column (pandas counts non-missing entries)."""
        return df_extracted[column_names.VALUE].count()

etlM = ETLConverter(data_dict,cleaners,'data/mimic_data_test.h5')

In [15]:
# Limit this run to two components.
components_to_run = [
    data_dict.components.LACTATE,
    data_dict.components.HEART_RATE,
]
specs = {'component': components_to_run}

# overwrite=True is passed explicitly; the returned frame is displayed in the next cell.
df = etlM.etl(specs, overwrite=True)

(2017-07-06 20:42:18)<< --- (40.0s)
(2017-07-06 20:42:18)>> Nominal to OneHot
(2017-07-06 20:42:18)<< --- (0.0s)
(2017-07-06 20:42:18)>> Drop OOB data | (1000, 2)
(2017-07-06 20:42:18)>>>> heart rate, beats/min, 1000
(2017-07-06 20:42:18)<<<< --- (0.0s)
(2017-07-06 20:42:18)<< --- (0.0s)
(2017-07-06 20:42:20)>> Nominal to OneHot
(2017-07-06 20:42:20)<< --- (0.0s)
(2017-07-06 20:42:20)>> Drop OOB data | (1000, 4)
(2017-07-06 20:42:20)>>>> lactate, mmol/L, 2115
(2017-07-06 20:42:20)<<<< --- (0.0s)
(2017-07-06 20:42:20)<< --- (0.0s)


In [16]:
# Display the frame returned by etlM.etl: per-component id_count / data_count
# for the EXTRACTED, TRANSFORMED and CLEANED stages.
df

stage,EXTRACTED,EXTRACTED,TRANSFORMED,TRANSFORMED,CLEANED,CLEANED
stat,id_count,data_count,id_count,data_count,id_count,data_count
heart rate,56716,7952939,11,1000,11,1000
lactate,34319,393608,197,2115,197,2115


In [65]:
# Open the lactate frame via the ETL manager and summarise every column.
lactate_df = etlM.open_df(data_dict.components.LACTATE)
lactate_df.describe(include='all')

component,lactate,lactate,lactate,lactate,lactate,lactate,lactate
status,known,known,known,known,unknown,unknown,unknown
variable_type,qn,qn,qn,qn,qn,qn,qn
units,mmol/L,mmol/L,mmol/L,mmol/L,no_units,no_units,no_units
description,1531,225668,50813,818,225668,50813,818
count,63130.0,69386.0,176767.0,72911.0,14.0,1.0,1.0
mean,2.773528,2.505725,2.63512,2.842171,52.835714,1.1,0.8
std,2.909538,2.326406,2.590268,3.020533,82.567153,,
min,0.0,0.05,0.0,0.0,0.0,1.1,0.8
25%,1.2,1.2,1.2,1.2,1.175,1.1,0.8
50%,1.8,1.8,1.8,1.8,3.4,1.1,0.8
75%,3.0,2.9,2.9,3.1,115.4,1.1,0.8
max,36.0,32.0,36.0,36.0,203.0,1.1,0.8


In [6]:
reload(mimic)

# Same data dictionary, cleaning pipeline and HDF5 path as the ETLConverter
# above, plus an item-map CSV.
# NOTE(review): presumably the HDF5 path is the output store and the CSV maps
# MIMIC items to components; confirm against MimicETLManager.
mimic_etlM = mimic.MimicETLManager(
    data_dict,
    cleaners,
    'data/mimic_data_test.h5',
    'config/mimic_item_map.csv',
)

In [7]:
# Run the ETL with an empty data_specs and panel_id=12.
# NOTE(review): the logged run below was interrupted (KeyboardInterrupt) during
# the first component's clean step, after extract and transform had each taken
# over a minute. Expect this cell to be slow.
etl_info = mimic_etlM.etl(
    data_specs={},
    panel_id=12,
    overwrite=True,
    save_steps=True,
)

(2017-07-06 21:11:59) Begin ETL: [u'heart rate', u'blood pressure systolic', u'blood pressure diastolic', u'blood pressure mean', u'respiratory rate', u'temperature body', u'oxygen saturation pulse oximetry', u'weight body', u'output urine', u'glasgow coma scale motor', u'glasgow coma scale eye opening', u'glasgow coma scale verbal', u'normal saline', u'lactated ringers', u'norepinephrine', u'vasopressin', u'hemoglobin', u'lactate']
(2017-07-06 21:11:59)>> HEART RATE
(2017-07-06 21:11:59)>>>> Extract...
(2017-07-06 21:12:00)>>>>>> Extracting 5 items from chartevents
(2017-07-06 21:13:00)<<<<<< --- (60.0s)
(2017-07-06 21:13:00)>>>>>> Combine DF
(2017-07-06 21:13:00)<<<<<< --- (0.0s)
(2017-07-06 21:13:00)>>>>>> Clean UOM
(2017-07-06 21:13:20)<<<<<< --- (20.0s)
(2017-07-06 21:13:20)<<<< --- (81.0s)
(2017-07-06 21:13:20)>>>> Transform...
(2017-07-06 21:15:06)<<<< --- (106.0s)
(2017-07-06 21:15:06)>>>> Clean...
(2017-07-06 21:15:54)>>>>>> Nominal to OneHot
(2017-07-06 21:15:54)<<<<<< --- (0

KeyboardInterrupt: 

In [38]:
# Open the HDF5 file that both ETL managers above were pointed at.
# The handle stays open until store.close() is called in a later cell.
store = pd.HDFStore('data/mimic_data_test.h5')

In [40]:
# Remove the 'lactate' entry from the open store.
del store['lactate']

In [41]:
# Remove the 'heart rate' entry from the open store.
del store['heart rate']

In [43]:
# Close the HDFStore handle opened in the earlier cell.
store.close()