In [1]:
import tensorflow as tf
from time import time
import numpy
import os
import json
import pickle
import pandas
import datetime
from functools import partial, reduce

import sys
sys.path.append('../libs')

import data_pipeline
import conv_model
import initialize
import prepare_data
import flacdb

In [2]:
# Load the project hyper-parameters and override four entries for this
# notebook; every other key returned by load_hypes() is kept unchanged.
H = dict(
    initialize.load_hypes(),
    epochs=16,
    window_size=512,
    steps_per_epoch=32,
    validation_steps=64,
)

In [3]:
def load_initial_data(metadata_path='/scr-ssd/mimic/metadata.hdf',
                      sig_data_path='/scr-ssd/mimic/sig_data.hdf'):
    """Load record metadata and per-signal data and align them on one index.

    Args:
        metadata_path: HDF file holding the record metadata frame.
        sig_data_path: HDF file holding one row per (rec_id, segment, sig_name).

    Returns:
        (sig_data, metadata): two frames reindexed to the same sorted
        (subject_id, rec_id, segment) index. ``sig_data`` has two-level columns
        (field, sig_name) for the fields 'sig_index', 'baseline' and 'adc_gain';
        only rows where the 'ABP' signal is present are kept.
    """
    metadata = pandas.read_hdf(metadata_path)
    # .copy() so the column assignments below write into an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    metadata = metadata[metadata['sig_len'] > prepare_data.CHUNK_SIZE].copy()
    chunk_counts = metadata['sig_len'].apply(prepare_data.get_chunk_count)
    metadata['chunk_count'] = chunk_counts.astype('uint8')
    # Disabled step kept for reference: restrict metadata to the records
    # returned by prepare_data.get_downloaded().

    # Rows whose subject_id is -1 get a unique negative placeholder derived
    # from the first level of the original index.
    missing = metadata['subject_id'] == -1
    fake_ids = -metadata.loc[missing].index.get_level_values(0)
    metadata.loc[missing, 'subject_id'] = fake_ids
    subject_ids = metadata['subject_id']
    metadata = metadata.reset_index()
    metadata = metadata.set_index(
        ['subject_id', 'rec_id', 'segment'], verify_integrity=True
    ).sort_index()

    index_names = ['rec_id', 'segment', 'sig_name']
    columns = index_names + ['sig_index', 'baseline', 'adc_gain']
    sig_data = pandas.read_hdf(sig_data_path, columns=columns)
    sig_data = sig_data.drop_duplicates(subset=index_names)
    sig_data = sig_data.astype({'sig_name': str})
    # Remember the per-field dtypes; unstack() below may widen them.
    dtypes = sig_data.dtypes
    sig_data = sig_data.set_index(index_names)
    # Shift by one so that the 0 written by unstack(fill_value=0) can only
    # mean "signal absent" (see the 'ABP' > 0 filter below).
    sig_data['sig_index'] += 1
    # NOTE(review): subject_ids is indexed by the original metadata index while
    # sig_data also carries 'sig_name'; this relies on pandas aligning the two
    # on their shared leading levels -- confirm after any pandas upgrade.
    sig_data.loc[:, 'subject_id'] = subject_ids
    sig_data = sig_data.reset_index()
    sig_data = sig_data.set_index(
        ['subject_id'] + index_names, verify_integrity=True
    )
    sig_data = sig_data.unstack(fill_value=0)
    sig_data = sig_data.astype({(k, s): dtypes[k] for k, s in sig_data})
    sig_data = sig_data[sig_data['sig_index']['ABP'] > 0]

    # Index.intersection is the explicit set operation; `&` on Index objects
    # is deprecated and no longer means intersection in pandas 2.
    index = metadata.index.intersection(sig_data.index).sort_values()
    metadata = metadata.reindex(index)
    sig_data = sig_data.reindex(index)
    return sig_data, metadata

In [4]:
%%time
# Build the aligned frames once and cache them as pickles for later cells.
sig_data, metadata = load_initial_data()
path = '/scr1/mimic/initial_data/'
sig_data.to_pickle(os.path.join(path, 'sig_data.pkl'))
metadata.to_pickle(os.path.join(path, 'metadata.pkl'))

CPU times: user 1min 3s, sys: 15.3 s, total: 1min 18s
Wall time: 1min 18s


In [5]:
def load_diagnosis(codes, metadata):
    """Build a per-row table of diagnosis and demographic flags.

    Args:
        codes: ICD9 codes to keep as individual columns; every other code is
            collapsed into a single 'other' column.
        metadata: frame indexed by (subject_id, rec_id, segment) that provides
            'hadm_id', categorical 'gender' and 'ethnicity', and 'death_time'.

    Returns:
        Frame reindexed to ``metadata.index`` with one column per kept code,
        'other', one 'gender_<cat>' / 'ethnicity_<cat>' column per category and
        'died'. Values are 1 where the flag is set for the row's admission,
        -1 where the flag is never set for a subject whose rows all have
        hadm_id > 0, and 0 otherwise.
    """
    diagnosis = pandas.read_csv(initialize.clinic_file('diagnoses_icd'))
    new_names = {i.upper(): i for i in ['subject_id', 'hadm_id']}
    new_names['ICD9_CODE'] = 'code'
    # Select with a list of labels; indexing a frame with a dict is rejected
    # by current pandas.
    diagnosis = diagnosis[list(new_names)].rename(columns=new_names)
    diagnosis.loc[~diagnosis['code'].isin(codes), 'code'] = 'other'
    diagnosis = diagnosis.drop_duplicates()
    diagnosis = diagnosis.set_index(['subject_id', 'hadm_id', 'code'])
    diagnosis = diagnosis.sort_index()
    diagnosis['present'] = True
    # One boolean column per code, one row per (subject_id, hadm_id).
    diagnosis = diagnosis.unstack(fill_value=False)['present'].astype('bool')

    # One row per (subject_id, hadm_id) for subjects with a positive id.
    matched_data = metadata.reset_index()
    matched_data = matched_data[matched_data['subject_id'] > 0]
    matched_data = matched_data.drop_duplicates(['subject_id', 'hadm_id'])
    matched_data = matched_data.set_index(['subject_id', 'hadm_id'])
    matched_data = matched_data.sort_index()

    diagnosis = diagnosis.reindex(matched_data.index)

    # One-hot encode the categorical demographics; rows with a missing value
    # are left unset. `.loc` is used because `.at` is a scalar-only accessor.
    for i in ['gender', 'ethnicity']:
        values = matched_data[i]
        for j in values.dtype.categories:
            diagnosis.loc[values.notna(), i + '_' + j] = values == j

    died = matched_data['death_time'].notna()
    # Only rows whose hadm_id is >= 0 receive the 'died' flag.
    diagnosis.loc[(slice(None), slice(0, None)), 'died'] = died

    # Expand from one row per admission to one row per metadata row.
    index = metadata.reset_index()[diagnosis.index.names]
    index = pandas.MultiIndex.from_frame(index)
    diagnosis = diagnosis.reindex(index).reset_index()
    frames = [diagnosis, metadata.reset_index()[['rec_id', 'segment']]]
    diagnosis = pandas.concat(frames, sort=False, axis=1)

    diagnosis = diagnosis.set_index(['subject_id', 'rec_id', 'segment'])
    diagnosis = diagnosis.sort_index()

    # Per-subject reductions via groupby; the `level=` keyword of any()/all()
    # has been removed from pandas. sort=False keeps the existing group order.
    flags = diagnosis.drop(columns='hadm_id')
    is_negative_always = ~flags.groupby(level=0, sort=False).any()
    has_admission = diagnosis['hadm_id'] > 0
    is_diagnosed_always = has_admission.groupby(level=0, sort=False).all()

    is_negative = is_negative_always[is_diagnosed_always]
    diagnosis = diagnosis.drop(columns='hadm_id')
    bool_to_int = {True: 1, False: 0, numpy.nan: 0}
    diagnosis = diagnosis.replace(bool_to_int)
    # Turn 0 into -1 for flags never set on a fully diagnosed subject.
    diagnosis.loc[is_negative.index] -= is_negative.replace(bool_to_int)
    diagnosis = diagnosis.reset_index()
    diagnosis = diagnosis.set_index(metadata.index.names)
    diagnosis = diagnosis.reindex(metadata.index)

    return diagnosis

In [6]:
# Reload the cached frames and keep only the signal columns named in
# H['input_sigs'] and H['output_sigs'] (second column level).
path = '/scr1/mimic/initial_data/'
metadata = pandas.read_pickle(os.path.join(path, 'metadata.pkl'))
sig_data = pandas.read_pickle(os.path.join(path, 'sig_data.pkl')).loc[
    :, pandas.IndexSlice[:, H['input_sigs'] + H['output_sigs']]
]

In [7]:
%%time
# Build the diagnosis table for the ICD codes listed in H['icd_codes'];
# load_diagnosis returns it reindexed to metadata.index.
diagnosis = load_diagnosis(H['icd_codes'], metadata)

CPU times: user 6.76 s, sys: 708 ms, total: 7.47 s
Wall time: 7.45 s


In [184]:
def get_tensors(H, metadata, sig_data, diagnosis):
    """Pack the aligned metadata, signal and diagnosis frames into ragged tensors.

    The three frames must share one index; index levels 0 and 1 drive the
    grouping (subject_id and rec_id for the frames built in this notebook).

    Args:
        H: hyper-parameter mapping; only 'input_sigs' and 'output_sigs' are read.
        metadata: frame providing 'height', 'weight', 'age' and 'chunk_count'.
        sig_data: frame whose 'sig_index', 'adc_gain' and 'baseline' column
            groups are keyed by signal name.
        diagnosis: frame of per-row diagnosis values.

    Returns:
        dict of tf.RaggedTensor keyed by 'rec_ids', 'diagnosis', 'height',
        'weight', 'age', 'chunk_count', 'sig_index', 'adc_gain', 'baseline'.
    """
    assert (metadata.index == sig_data.index).all()
    assert (metadata.index == diagnosis.index).all()

    def unique_rec_ids(group):
        # Returned inside a list so apply() keeps it as one cell per group.
        return [group.index.get_level_values(1).unique()]

    tensors = {}
    per_subject_rec_ids = metadata.groupby(level=0).apply(unique_rec_ids)
    tensors['rec_ids'] = tf.ragged.constant(per_subject_rec_ids, dtype='int32')

    # Nested row lengths: level-1 groups per level-0 group, then rows per
    # (level 0, level 1) group.
    rows_per_record = metadata.groupby(level=[0, 1]).apply(len)
    rows_by_subject = rows_per_record.groupby(level=0).apply(
        lambda counts: counts.values
    )
    row_lengths = [
        rows_by_subject.apply(len).values,
        numpy.concatenate(rows_by_subject.values),
    ]

    def to_nested_ragged_tensor(df, k):
        """One entry per row, nested by level 0 and then level 1."""
        flat = df.values.astype(initialize.TENSOR_DTYPES[k])
        return tf.RaggedTensor.from_nested_row_lengths(flat, row_lengths)

    def largest(group):
        return group.max()

    def average(group):
        return group.mean()

    def first_row(group):
        return group.iloc[0]

    reducers = {
        'height': largest,
        'weight': average,
        'age': largest,
        'diagnosis': first_row,
    }

    def to_ragged_tensor(df, k):
        """One reduced entry per (level 0, level 1) group, nested by level 0."""
        per_record = df.groupby(level=[0, 1]).apply(reducers[k])

        def as_typed_list(group):
            return [group.values.astype(initialize.TENSOR_DTYPES[k])]

        return tf.ragged.constant(per_record.groupby(level=0).apply(as_typed_list))

    tensors['diagnosis'] = to_ragged_tensor(diagnosis, 'diagnosis')

    for name in ('height', 'weight', 'age'):
        tensors[name] = to_ragged_tensor(metadata[name], name)

    tensors['chunk_count'] = to_nested_ragged_tensor(
        metadata['chunk_count'], 'chunk_count'
    )

    signal_names = H['input_sigs'] + H['output_sigs']
    for name in ('sig_index', 'adc_gain', 'baseline'):
        tensors[name] = to_nested_ragged_tensor(sig_data[name][signal_names], name)

    return tensors

In [185]:
%%time

# Convert the aligned frames into a dict of ragged tensors.
tensors = get_tensors(H, metadata, sig_data, diagnosis)
# Slice that dict along the first dimension of every tensor to get a dataset.
dataset = tf.data.Dataset.from_tensor_slices(tensors)

CPU times: user 1min 5s, sys: 672 ms, total: 1min 5s
Wall time: 1min 5s


In [188]:
# Peek at one shuffled element; the shuffle buffer is sized to the first
# dimension of tensors['rec_ids'].
next(iter(dataset.shuffle(tensors['rec_ids'].shape[0])))

{'rec_ids': <tf.RaggedTensor [[3158849, 3933362]]>,
 'diagnosis': <tf.RaggedTensor [[[-1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1], [-1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1]]]>,
 'height': <tf.RaggedTensor [[-1, -1]]>,
 'weight': <tf.RaggedTensor [[nan, nan]]>,
 'age': <tf.RaggedTensor [[84, 84]]>,
 'chunk_count': <tf.RaggedTensor [[16, 16, 16, 16], [16, 2, 4, 1, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]]>,
 'sig_index': <tf.RaggedTensor [[[0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0]], [[0, 3, 0, 5, 6, 0, 2, 1, 4, 0, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 7, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 0, 0, 0], [0, 3, 0, 5, 6, 0, 2, 1, 4, 0, 0, 0], [0, 

In [159]:
# Distinct positive subject ids among rows where every signal listed in
# H['input_sigs_validation'] has a sig_index above zero.
s = (
    metadata
    .loc[sig_data['sig_index'][H['input_sigs_validation']].gt(0).all(axis=1)]
    .reset_index()
    .loc[:, 'subject_id']
)
s.loc[s.gt(0)].nunique()

1792

In [22]:
# Distinct positive subject ids across all rows of metadata.
s = metadata.reset_index().loc[:, 'subject_id']
s.loc[s.gt(0)].nunique()

5064

In [6]:
# Distinct positive subject ids among rows where every signal listed in
# H['input_sigs_validation'] has a sig_index above zero.
s = (
    metadata
    .loc[sig_data['sig_index'][H['input_sigs_validation']].gt(0).all(axis=1)]
    .reset_index()
    .loc[:, 'subject_id']
)
s.loc[s.gt(0)].nunique()

4095

In [10]:
# Distinct positive subject ids among rows where every signal listed in
# H['input_sigs_validation'] has a sig_index above zero.
s = (
    metadata
    .loc[sig_data['sig_index'][H['input_sigs_validation']].gt(0).all(axis=1)]
    .reset_index()
    .loc[:, 'subject_id']
)
s.loc[s.gt(0)].nunique()

4210

In [27]:
# Distinct positive subject ids among rows where both the 'II' and 'PLETH'
# signals have a sig_index above zero.
s = (
    metadata
    .loc[sig_data['sig_index'][['II', 'PLETH']].gt(0).all(axis=1)]
    .reset_index()
    .loc[:, 'subject_id']
)
s.loc[s.gt(0)].nunique()

7068

In [73]:
# Number of distinct positive values in level 0 of diagnosis.index.
(diagnosis.index.get_level_values(0).unique() > 0).sum()

10067

In [12]:
# Distinct positive subject ids across all rows of metadata.
s = metadata.reset_index().loc[:, 'subject_id']
s.loc[s.gt(0)].nunique()

10099

In [252]:
# How each column is collapsed to one value per index level-0 group.
reducers = {
    'height': lambda column: column.max(),
    'weight': lambda column: column.mean(),
    'ethnicity': lambda column: column.mode()
}

# Rows of matched_data whose second index level is >= 0, reduced column by
# column per level-0 group.
with_hadm = matched_data.loc[pandas.IndexSlice[:, 0:], :]
reduced = {
    column: with_hadm[column].groupby(level=0).apply(reducer)
    for column, reducer in reducers.items()
}

In [253]:
# Write each reduced column into the rows whose second index level is -1.
# `.loc` is the label-based setter for slices; `.at` only addresses a single
# scalar cell.
for k in reduced:
    matched_data.loc[(slice(None), -1), k] = reduced[k]

In [285]:
# Scratch check: a one-entry Series built from a dict (label 'hadm_id', value -1).
pandas.Series({'hadm_id': -1})

hadm_id   -1
dtype: int64

In [308]:
# Reduce 'height' per index level-0 group over the rows whose second index
# level is >= 0, then re-key the result with hadm_id = -1.
k = 'height'
with_hadm = matched_data.loc[(slice(None), slice(0, None)), k]
# NOTE(review): the only visible definition of `reduced` is a plain dict, which
# has no `.at`; this line presumably ran against a DataFrame left over from an
# earlier execution -- confirm, and prefer `.loc` / item assignment.
reduced.at[:, k] = with_hadm.groupby(level=0).apply(reducers[k])
reduced = pandas.DataFrame(reduced)
# NOTE(review): `.at` is a scalar accessor; a whole-column write is normally
# spelled `.loc[:, 'hadm_id']` or `reduced['hadm_id']`.
reduced.at[:, 'hadm_id'] = -1
reduced = reduced.reset_index().set_index(['subject_id', 'hadm_id'])
reduced.sort_index(inplace=True)

In [322]:
# Number of index entries (rows) in matched_data.
matched_data.index.shape

(5433,)

In [135]:
def load_diagnosis_old(codes, metadata):
    """Earlier variant of load_diagnosis without the demographic columns.

    Args:
        codes: ICD9 codes to keep as individual columns; every other code is
            collapsed into a single 'other' column.
        metadata: frame indexed by (subject_id, rec_id, segment) that provides
            'hadm_id'.

    Returns:
        Frame reindexed to ``metadata.index`` with one column per kept code and
        'other'. Values are 1 where the code is recorded for the row's
        admission, -1 where the code is never recorded for a subject whose rows
        all have hadm_id > 0, and 0 otherwise.
    """
    # Use the helper through the imported module, as load_diagnosis does; a
    # bare `clinic_file` is not defined by this notebook's imports.
    diagnosis = pandas.read_csv(initialize.clinic_file('diagnoses_icd'))
    new_names = {'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}
    new_names['ICD9_CODE'] = 'code'
    # Select with a list of labels; indexing a frame with a dict is rejected
    # by current pandas.
    diagnosis = diagnosis[list(new_names)].rename(columns=new_names)
    diagnosis.loc[~diagnosis['code'].isin(codes), 'code'] = 'other'
    diagnosis = diagnosis.drop_duplicates()
    diagnosis = diagnosis.set_index(['subject_id', 'hadm_id', 'code'])
    diagnosis = diagnosis.sort_index()
    diagnosis['present'] = True
    # One boolean column per code, one row per (subject_id, hadm_id).
    diagnosis = diagnosis.unstack(fill_value=False)['present'].astype('bool')

    # Expand from one row per admission to one row per metadata row.
    index = metadata.reset_index()[diagnosis.index.names]
    index = pandas.MultiIndex.from_frame(index)
    diagnosis = diagnosis.reindex(index).reset_index()
    frames = [diagnosis, metadata.reset_index()[['rec_id', 'segment']]]
    diagnosis = pandas.concat(frames, sort=False, axis=1)

    diagnosis = diagnosis.set_index(['subject_id', 'rec_id', 'segment'])
    diagnosis = diagnosis.sort_index()

    # Per-subject reductions via groupby; the `level=` keyword of any()/all()
    # has been removed from pandas. sort=False keeps the existing group order.
    flags = diagnosis.drop(columns='hadm_id')
    is_negative_always = ~flags.groupby(level=0, sort=False).any()
    has_admission = diagnosis['hadm_id'] > 0
    is_diagnosed_always = has_admission.groupby(level=0, sort=False).all()

    is_negative = is_negative_always[is_diagnosed_always]
    diagnosis = diagnosis.drop(columns='hadm_id')
    bool_to_int = {True: 1, False: 0, numpy.nan: 0}
    diagnosis = diagnosis.replace(bool_to_int)
    # Turn 0 into -1 for codes never recorded on a fully diagnosed subject.
    diagnosis.loc[is_negative.index] -= is_negative.replace(bool_to_int)
    diagnosis = diagnosis.reset_index()
    diagnosis = diagnosis.set_index(metadata.index.names)
    diagnosis = diagnosis.reindex(metadata.index)
    return diagnosis

In [6]:
%%time

%time metadata, subject_ids = load_metadata()
%time sig_data = load_sig_data(H['input_sigs'] + H['output_sigs'], subject_ids)
%time diagnosis = load_diagnosis(H['icd_codes'])
index = (metadata.index & sig_data.index).sort_values()
metadata = metadata.reindex(index)
sig_data = sig_data.reindex(index)

CPU times: user 9.32 s, sys: 2.9 s, total: 12.2 s
Wall time: 20.6 s
CPU times: user 45.6 s, sys: 12.7 s, total: 58.3 s
Wall time: 58.3 s
CPU times: user 57.4 s, sys: 15.8 s, total: 1min 13s
Wall time: 1min 21s


In [127]:
def get_tensors(H, metadata, sig_data):
    """Pack aligned metadata and signal frames into tensors (no diagnosis).

    NOTE(review): this notebook also defines a four-argument get_tensors;
    whichever definition cell runs last replaces the other.

    Args:
        H: hyper-parameter mapping; only 'input_sigs' and 'output_sigs' are read.
        metadata: frame sharing its index with ``sig_data``; provides 'segment',
            'subject_id', 'gender', 'height' and 'weight' after reset_index().
        sig_data: frame whose 'sig_index', 'adc_gain' and 'baseline' column
            groups are keyed by signal name.

    Returns:
        dict keyed by 'rec_ids', 'segments', 'sig_index', 'adc_gain',
        'baseline' (ragged tensors) and 'gender', 'height', 'weight'
        (pandas Series).
    """
    assert (metadata.index == sig_data.index).all()

    def unique_rec_ids(group):
        # Returned inside a list so apply() keeps it as one cell per group.
        return [group.index.get_level_values(1).unique()]

    tensors = {}
    per_subject_rec_ids = metadata.groupby(level=0).apply(unique_rec_ids)
    tensors['rec_ids'] = tf.ragged.constant(per_subject_rec_ids, dtype='int32')

    # Nested row lengths: level-1 groups per level-0 group, then rows per
    # (level 0, level 1) group.
    rows_per_record = metadata.groupby(level=[0, 1]).apply(len)
    rows_by_subject = rows_per_record.groupby(level=0).apply(
        lambda counts: counts.values
    )
    row_lengths = [
        rows_by_subject.apply(len).values,
        numpy.concatenate(rows_by_subject.values),
    ]

    flat_metadata = metadata.reset_index()

    segment_values = flat_metadata['segment'].values.astype('int32')
    tensors['segments'] = tf.RaggedTensor.from_nested_row_lengths(
        segment_values, row_lengths
    )

    signal_names = H['input_sigs'] + H['output_sigs']
    for field in ('sig_index', 'adc_gain', 'baseline'):
        field_values = sig_data[field][signal_names].values
        tensors[field] = tf.RaggedTensor.from_nested_row_lengths(
            field_values, row_lengths
        )

    # Encode gender as 'M' -> 1, 'F' -> -1, missing -> 0, one entry per
    # distinct (subject_id, gender) pair.
    gender = flat_metadata[['subject_id', 'gender']].drop_duplicates()['gender']
    gender = gender.astype(object).replace({'M': 1, 'F': -1})
    tensors['gender'] = gender.fillna(0).astype('int8')

    # An 'ethnicity' reducer based on mode() is deliberately left out here.
    reducers = {
        'height': lambda group: group.max(),
        'weight': lambda group: group.mean(),
    }

    for name, reducer in reducers.items():
        per_subject = flat_metadata[['subject_id', name]]
        per_subject = per_subject.groupby('subject_id').apply(reducer)
        tensors[name] = per_subject[name]

    return tensors

In [128]:
%%time

# Disabled variant that used only index labels 0 through 200 of both frames:
# tensors = get_tensors(H, metadata.loc[0:200], sig_data.loc[0:200])
# Uses the three-argument get_tensors (no diagnosis frame).
tensors = get_tensors(H, metadata, sig_data)
dataset = tf.data.Dataset.from_tensor_slices(tensors)
# Peek at one shuffled element; the buffer is sized to the first dimension of
# tensors['rec_ids'].
next(iter(dataset.shuffle(tensors['rec_ids'].shape[0])))

CPU times: user 38.7 s, sys: 136 ms, total: 38.8 s
Wall time: 38.8 s


{'rec_ids': <tf.RaggedTensor [[3563275, 3783672]]>,
 'segments': <tf.RaggedTensor [[1, 2, 3, 6, 10], [9, 10, 13, 16, 17]]>,
 'sig_index': <tf.RaggedTensor [[[0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]], [[0, 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0], [0, 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]]]>,
 'adc_gain': <tf.RaggedTensor [[[0.0, 40.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0], [0.0, 39.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0], [0.0, 39.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0], [0.0, 41.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0], [0.0, 44.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0]], [[0.0, 50.0, 0.0, 120.0, 0.0, 0.0, 0.0, 0.0, 1.25, 0.0, 0.0, 0.0], [0.0, 50.0, 0.0, 120.0, 0.0, 0.0, 0.0,

In [11]:
# Print the first five dataset elements, one per line.
# NOTE(review): every dataset built in the visible cells comes from a dict of
# tensors, while this loop unpacks two values per element -- it presumably
# targets an earlier dataset layout; confirm before re-running.
for i, j in dataset.take(5):
    # Two tabs when i has one row, one tab when it has two.
    tabs = '\t'*(2 - i[0].shape[0])
    print(*i.to_list()[0], tabs, *j.to_list())

3544749 	 [5]
3842928 3887555  [5] [9, 20]
3860035 	 [29, 31]
3485814 	 [7]
3255538 	 [3, 4]


In [278]:
# One row per (subject_id, hadm_id) for subjects with a positive id, indexed
# and sorted by that pair; verify_integrity rejects duplicate index entries.
matched_data = (
    metadata.reset_index()
    .loc[lambda frame: frame['subject_id'] > 0]
    .drop_duplicates(['subject_id', 'hadm_id'])
    .set_index(['subject_id', 'hadm_id'], verify_integrity=True)
    .sort_index()
)

In [204]:
# Distribution of the spread (max - min) of 'age' within each index level-0
# group of matched_data.
matched_data.groupby(level=0).apply(lambda i: i['age'].max() - i['age'].min()).value_counts()

0     4869
1       87
2       35
3       24
4       21
6        6
5        5
10       3
8        3
11       2
7        2
13       2
9        2
12       2
14       1
dtype: int64

In [206]:
# Spread (max - min) of 'weight' within each index level-0 group, then the
# largest spread overall.
weight_diffs = matched_data.groupby(level=0).apply(
    lambda group: group['weight'].max() - group['weight'].min()
)
weight_diffs.max()

40.19999694824219

In [207]:
# Spread of 'height' within each index level-0 group, taking the minimum over
# positive heights only, then the largest spread overall.
height_diffs = matched_data.groupby(level=0).apply(
    lambda group: (
        group['height'].max()
        - group['height'][group['height'] > 0].min()
    )
)
height_diffs.max()

18.0

In [208]:
# Histogram of the height spreads after treating NaN as 0 and casting to int32.
height_diffs.fillna(0).astype('int32').value_counts()

0     5036
1       13
3        4
2        4
6        2
4        2
18       1
5        1
8        1
dtype: int64

In [209]:
# Histogram of the weight spreads after treating NaN as 0 and casting to int32.
weight_diffs.fillna(0).astype('int32').value_counts()

0     4950
3       15
1       13
6       12
7       10
2        9
4        8
5        5
8        5
11       4
13       4
9        4
22       3
12       3
10       2
15       2
27       2
14       2
20       1
32       1
40       1
31       1
17       1
25       1
29       1
18       1
30       1
19       1
37       1
dtype: int64

In [210]:
# Total number of entries covered by the weight-spread histogram.
weight_diffs.fillna(0).astype('int32').value_counts().sum()

5064

In [211]:
# Number of distinct 'ethnicity' values within each index level-0 group, and
# how often each count occurs.
race_diffs = matched_data.groupby(level=0).apply(
    lambda group: group['ethnicity'].nunique()
)
race_diffs.value_counts()

1    3865
0    1199
dtype: int64