In [1]:
import pandas
import numpy

# sig_data

In [2]:
def filter_sig_name(name):
    """Normalise a raw signal name, or reject it.

    Parameters
    ----------
    name : object
        Raw signal name. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing CSV cell) is rejected.

    Returns
    -------
    str or None
        ``None`` when ``name`` is not a string or contains any of the
        characters ``!?][`|<(-``; otherwise the upper-cased name with every
        ``'PLETH '`` occurrence collapsed to ``'PLETH'``.
    """
    # Missing values arrive as float NaN; `ch in name` would raise TypeError.
    if not isinstance(name, str):
        return None
    if any(ch in name for ch in '!?][`|<(-'):
        return None
    # Upper-case first so the 'PLETH ' replacement is case-insensitive.
    return name.upper().replace('PLETH ', 'PLETH')

In [3]:
# Replacement values for missing entries, applied via fillna() before the
# integer casts below (integer dtypes cannot represent NaN).
nanvals = {
    'adc_res': -1,
    'adc_zero': -1,
    'init_value': 0,
    'checksum': 0
}

# Target dtype for every column written to the HDF store.
dtypes = {
    'rec_id': 'int32',
    'seg_id': 'int16',
    'sig_index': 'int8',
    'sig_name': 'category',
    'baseline': 'int16',
    'adc_gain': 'float32',
    'adc_res': 'int8',
    'adc_zero': 'int16',
    'fmt': 'int8',
    'init_value': 'int16',
    'checksum': 'int16',
    'units': 'category'
}

sig_data = pandas.read_csv('/scr-ssd/mimic/sig_data.csv')

# Work on a copy so the raw frame stays available as `sig_data`.
data = sig_data.copy()

# Gains with magnitude above 1e30 are treated as missing.
data.loc[data['adc_gain'].abs() > 1e30, 'adc_gain'] = numpy.nan
# Guard: the float32 cast below must leave the remaining gains unchanged
# (within numpy.allclose tolerance).
assert numpy.allclose(data['adc_gain'], data['adc_gain'].astype('float32'), equal_nan=True)

# Clamp baselines below the int16 minimum so they equal it before the cast.
data.loc[data['baseline'] < -2**15, 'baseline'] = -2**15

# Whole-column assignment. `.at` is a scalar-only accessor; using it with a
# slice is rejected by recent pandas releases.
data['sig_name'] = data['sig_name'].apply(filter_sig_name)
data = data.fillna(nanvals).astype(dtypes)
# Cast to int16 first (via `dtypes`), then to category, replacing the column
# so the categorical dtype is actually kept.
data['adc_zero'] = data['adc_zero'].astype('category')

# `key` is passed by keyword: positional use is deprecated in pandas 2.x.
data.to_hdf('/scr-ssd/mimic/sig_data.hdf', key='sig_data', format='table', mode='w', complevel=6)

In [4]:
# Report the on-disk size of the source CSV and of the HDF file, for comparison.
! du -h /scr-ssd/mimic/sig_data.csv
! du -h /scr-ssd/mimic/sig_data.hdf

498M	/scr-ssd/mimic/sig_data.csv
42M	/scr-ssd/mimic/sig_data.hdf


In [5]:
# Time loading the signal table from CSV versus from the HDF file.
# Note: the second line rebinds `sig_data` to the frame read from HDF.
%time sig_data_old = pandas.read_csv('/scr-ssd/mimic/sig_data.csv')
%time sig_data = pandas.read_hdf('/scr-ssd/mimic/sig_data.hdf')

CPU times: user 6.76 s, sys: 1.09 s, total: 7.85 s
Wall time: 7.85 s
CPU times: user 3.18 s, sys: 224 ms, total: 3.4 s
Wall time: 3.4 s


# metadata

In [2]:
# Replacement values for missing entries, applied via fillna() before the
# integer casts below (integer dtypes cannot represent NaN).
# NOTE(review): 'init_value' and 'checksum' look carried over from the
# sig_data cell; fillna() skips keys that are not columns — confirm and prune.
nanvals = {
    'age': -1,
    'height': -1,
    'init_value': 0,
    'checksum': 0
}

# Target dtype for the listed columns; columns not listed keep the dtype
# produced by read_csv.
# NOTE(review): int8 holds -128..127 — confirm every 'age' and 'height'
# value fits, otherwise the cast can silently wrap.
dtypes = {
    'rec_id': 'int32',
    'seg_id': 'int16',
    'hadm_id': 'int32',
    'subject_id': 'int32',
    'gender': 'category',
    'age': 'int8',
    'weight': 'float32',
    'height': 'int8',
    'sig_len': 'int32',
}

metadata = pandas.read_csv('/scr-ssd/mimic/metadata.csv', parse_dates=['time', 'dob'])

# drop() returns a new frame, so the raw `metadata` frame is left untouched.
data = (
    metadata
    .drop(columns=['icd_codes', 'sig_name'])
    .fillna(nanvals)
    .astype(dtypes)
)

# `key` is passed by keyword: positional use is deprecated in pandas 2.x.
data.to_hdf('/scr-ssd/mimic/metadata.hdf', key='metadata', format='table', mode='w', complevel=6)

In [3]:
# Report the on-disk size of the source CSV and of the HDF file, for comparison.
! du -h /scr-ssd/mimic/metadata.csv
! du -h /scr-ssd/mimic/metadata.hdf

277M	/scr-ssd/mimic/metadata.csv
12M	/scr-ssd/mimic/metadata.hdf


In [6]:
# Time loading the metadata table from CSV versus from the HDF file.
# Note: this CSV read omits the parse_dates=['time', 'dob'] used by the
# conversion cell, so the CSV timing excludes date parsing.
# The second line rebinds `metadata` to the frame read from HDF.
%time metadata_old = pandas.read_csv('/scr-ssd/mimic/metadata.csv')
%time metadata = pandas.read_hdf('/scr-ssd/mimic/metadata.hdf')

CPU times: user 3 s, sys: 392 ms, total: 3.39 s
Wall time: 3.39 s
CPU times: user 1.18 s, sys: 120 ms, total: 1.3 s
Wall time: 1.29 s
