In [4]:
import tensorflow as tf
from time import time
import numpy
import os
import json
import pickle
import pandas
import datetime
from functools import partial, reduce

import sys
sys.path.append('../libs')

import data_pipeline
import conv_model
import initialize
import prepare_data
import flacdb

In [3]:
# Project default hyper-parameters, with the overrides used in this notebook
# layered on top (later keys win over the loaded ones).
H = {
    **initialize.load_hypes(),
    'epochs': 32,
    'window_size': 512,
    'steps_per_epoch': 64,
    'validation_steps': 128,
}
# `path` is only built here; nothing in this cell reads it.
path = initialize.DATA_ROOT + 'initial_data_{}.hdf'.format(H['epochs'])
data = initialize.run(H)

92 years,  188592 record segments


In [3]:
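# Disabled on purpose: when uncommented, this recomputes `data` with
# initialize.run(H) and writes it to `path` through initialize.save.
# path = initialize.DATA_ROOT + 'initial_data_{}.hdf'.format(H['epochs'])
# data = initialize.run(H)
# initialize.save(path, data)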

92 years,  188592 record segments


In [18]:
# --- Build a per-segment table of ICD diagnosis flags -----------------------
# Raw column names in the diagnoses file -> names used in this notebook.
new_names = {'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id', 'ICD9_CODE': 'code'}
diagnosis = pandas.read_csv(initialize.clinic_file('diagnoses_icd'))
# Select the columns with an explicit list of labels rather than the dict itself.
diagnosis = diagnosis[list(new_names)].rename(columns=new_names)

# Every code that is not listed in H['icd_codes'] is collapsed into 'other'.
diagnosis.loc[~diagnosis['code'].isin(H['icd_codes']), 'code'] = 'other'
diagnosis = (
    diagnosis
    .drop_duplicates()
    .set_index(['subject_id', 'hadm_id', 'code'])
    .sort_index()
)
# Whole-column assignment; `.at` is documented for single-value access only.
diagnosis['present'] = True
# One row per (subject_id, hadm_id), one boolean column per code.
diagnosis = diagnosis.unstack(fill_value=False)['present'].astype('bool')

metadata = data['metadata']

# Look up the diagnosis row of every metadata row; rows whose
# (subject_id, hadm_id) pair has no diagnosis entry come back as NaN.
index = metadata.reset_index()[diagnosis.index.names]
index = pandas.MultiIndex.from_frame(index)
diagnosis = diagnosis.reindex(index)

# Both frames carry a fresh positional index after reset_index(), so the
# column-wise concat pairs them row by row in metadata order.
dataframes = [
    diagnosis.reset_index(),
    metadata.reset_index()[['rec_id', 'segment']]
]
diagnosis = pandas.concat(dataframes, sort=False, verify_integrity=True, axis=1)
diagnosis = diagnosis.set_index(['rec_id', 'segment'], verify_integrity=True)

# Tally of flag values over the selected codes.
diagnosis[H['icd_codes']].stack().value_counts()

False    902951
True     231094
dtype: int64

In [19]:
# Re-key the flag table by (subject_id, rec_id, segment) and sort it so that
# level-0 (per-subject) operations in later cells see contiguous groups.
diagnosis = (
    diagnosis
    .reset_index()
    .set_index(['subject_id', 'rec_id', 'segment'])
    .sort_index()
)

In [21]:
# Per-group summaries over index level 0 (assumed to be subject_id, as set
# by the cell that re-keys `diagnosis` -- confirm if cells are re-ordered).
# groupby(level=0) is used instead of the `level=` keyword of any()/all(),
# which newer pandas releases no longer accept; sort=False keeps groups in
# order of first appearance.
# True where the group has at least one True flag for a code.
is_positive_once = diagnosis.drop(columns='hadm_id').groupby(level=0, sort=False).any()
# True where the group never has a True flag for a code.
is_negative_always = ~is_positive_once
# True where every row of the group has hadm_id > 0.
is_diagnosed_always = (diagnosis['hadm_id'] > 0).groupby(level=0, sort=False).all()
# True where at least one row of the group has hadm_id > 0.
is_diagnosed_once = (diagnosis['hadm_id'] > 0).groupby(level=0, sort=False).any()

In [36]:
# Build the `fixed1` label matrix from the per-segment diagnosis flags.
# After the replace below: 1 where a flag is True, 0 where it is False or missing.
# Keep only the groups selected by `is_diagnosed_always`.
is_negative = is_negative_always[is_diagnosed_always]
fixed1 = diagnosis.drop(columns='hadm_id')
fixed1 = fixed1.replace({True: 1, False: 0, numpy.nan: 0})
# Subtract 1 wherever `is_negative` is True, so those entries become -1.
# NOTE(review): relies on pandas aligning the group-level frame against the
# multi-level row index of `fixed1` -- confirm on the pandas version in use.
fixed1.loc[is_negative.index] -= is_negative.replace({True: 1, False: 0, numpy.nan: 0})
# Tally of label values over the H['icd_codes'] columns.
fixed1[H['icd_codes']].stack().value_counts()

 0    1743430
-1     854356
 1     231094
dtype: int64

In [37]:
# Build the `fixed2` label matrix: same steps as for `fixed1`, but the groups
# are selected with `is_diagnosed_once` instead of `is_diagnosed_always`.
is_negative = is_negative_always[is_diagnosed_once]
fixed2 = diagnosis.drop(columns='hadm_id')
# 1 where a flag is True, 0 where it is False or missing.
fixed2 = fixed2.replace({True: 1, False: 0, numpy.nan: 0})
# Subtract 1 wherever `is_negative` is True, so those entries become -1.
# NOTE(review): relies on pandas aligning the group-level frame against the
# multi-level row index of `fixed2` -- confirm on the pandas version in use.
fixed2.loc[is_negative.index] -= is_negative.replace({True: 1, False: 0, numpy.nan: 0})
# Tally of label values over the H['icd_codes'] columns.
fixed2[H['icd_codes']].stack().value_counts()

 0    1680166
-1     917620
 1     231094
dtype: int64

In [38]:
# Baseline encoding without any per-subject correction:
# True -> 1, False -> -1, missing -> 0.
unfixed = (
    diagnosis
    .drop(columns='hadm_id')
    .replace({True: 1, False: -1, numpy.nan: 0})
)
# Tally of label values over the selected codes.
unfixed[H['icd_codes']].stack().value_counts()

 0    1694835
-1     902951
 1     231094
dtype: int64

In [49]:
# Sanity check: every index entry of `tmp` equals the entry at the same
# position in `metadata`.
# NOTE: `tmp` is assigned in a cell that appears further down in this file,
# so this check only works after that cell has been executed.
(tmp.index == metadata.index).all()

True

In [47]:
# Re-key `fixed1` with the same index levels as `metadata`, then put its rows
# into exactly the order of `metadata`.
tmp = (
    fixed1
    .reset_index()
    .set_index(metadata.index.names)
    .reindex(metadata.index)
)

In [45]:
# Preview only the first rows instead of rendering the whole frame.
tmp.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_id,25000,2724,2762,2859,4019,41401,4240,4241,42731,4280,51881,5849,5859,78552,99592,other
rec_id,segment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3275052,15,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3164127,29,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3493248,204,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3242351,33,6539,-1,1,-1,-1,1,-1,-1,-1,1,1,-1,1,1,-1,1,1
3559156,42,14096,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3946181,21,3372,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,1
3317417,50,58662,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,1,-1,1,1,1
3186550,21,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3526492,24,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3376043,77,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# Per code: number of level-0 groups with at least one -1 label in `fixed1`.
# groupby(level=0) replaces any(level=0), whose `level=` keyword newer pandas
# releases no longer accept; sort=False mirrors the old grouping order.
(fixed1[H['icd_codes']] == -1).groupby(level=0, sort=False).any().sum(axis=0)

4019     2495
4280     3309
42731    3166
41401    3165
2724     3423
25000    3679
5859     4234
5849     3738
51881    3562
2859     3984
4240     4245
4241     4199
78552    4181
99592    4076
2762     3994
dtype: int64

In [40]:
# Per code: number of level-0 groups with at least one -1 label in `fixed2`.
# groupby(level=0) replaces any(level=0), whose `level=` keyword newer pandas
# releases no longer accept; sort=False mirrors the old grouping order.
(fixed2[H['icd_codes']] == -1).groupby(level=0, sort=False).any().sum(axis=0)

4019     2604
4280     3432
42731    3291
41401    3296
2724     3562
25000    3810
5859     4393
5849     3872
51881    3695
2859     4134
4240     4407
4241     4352
78552    4324
99592    4213
2762     4146
dtype: int64

In [41]:
# Per code: number of level-0 groups with at least one -1 label in `unfixed`.
# groupby(level=0) replaces any(level=0), whose `level=` keyword newer pandas
# releases no longer accept; sort=False mirrors the old grouping order.
(unfixed[H['icd_codes']] == -1).groupby(level=0, sort=False).any().sum(axis=0)

4019     2636
4280     3465
42731    3308
41401    3335
2724     3603
25000    3833
5859     4414
5849     3918
51881    3751
2859     4164
4240     4425
4241     4363
78552    4352
99592    4244
2762     4187
dtype: int64

In [25]:
# Shape of the array of distinct subject_id values among rows with hadm_id > 0.
metadata.loc[metadata['hadm_id'] > 0, 'subject_id'].unique().shape

(4710,)

In [26]:
# Per code: number of level-0 groups with at least one True flag.
# groupby(level=0) replaces any(level=0), whose `level=` keyword newer pandas
# releases no longer accept; sort=False mirrors the old grouping order.
diagnosis.drop(columns='hadm_id').groupby(level=0, sort=False).any().sum(axis=0)

25000     900
2724     1148
2762      564
2859      576
4019     2106
41401    1414
4240      303
4241      358
42731    1419
4280     1278
51881    1015
5849      838
5859      317
78552     386
99592     497
dtype: int64

In [34]:
# Total number of non-missing cells in `unfixed` (per-column counts, summed).
unfixed.count().sum()

2828880

In [35]:
# Total number of non-missing cells in `fixed1` (per-column counts, summed).
fixed1.count().sum()

2828880

In [36]:
# Total number of non-missing cells in `fixed2` (per-column counts, summed).
fixed2.count().sum()

2828880

In [45]:
# Show the first ten rows of the flag table.
diagnosis.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hadm_id,25000,2724,2762,2859,4019,41401,4240,4241,42731,4280,51881,5849,5859,78552,99592
subject_id,rec_id,segment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
-1,3000063,6,-1,,,,,,,,,,,,,,,
-1,3000063,7,-1,,,,,,,,,,,,,,,
-1,3000063,10,-1,,,,,,,,,,,,,,,
-1,3000063,11,-1,,,,,,,,,,,,,,,
-1,3000063,12,-1,,,,,,,,,,,,,,,
-1,3000063,13,-1,,,,,,,,,,,,,,,
-1,3000063,14,-1,,,,,,,,,,,,,,,
-1,3000063,15,-1,,,,,,,,,,,,,,,
-1,3000063,16,-1,,,,,,,,,,,,,,,
-1,3000063,18,-1,,,,,,,,,,,,,,,


In [53]:
# All column labels except the first one.
# NOTE(review): assumes the first column is `hadm_id`, so that the remainder
# are the code columns -- confirm before relying on the positional slice.
diagnosis.columns[1:]

Index(['25000', '2724', '2762', '2859', '4019', '41401', '4240', '4241',
       '42731', '4280', '51881', '5849', '5859', '78552', '99592'],
      dtype='object')

In [54]:
# IPython help lookup (trailing `?`): displays the signature and docstring of
# DataFrame.drop_duplicates. Interactive aid only; it computes nothing.
diagnosis.drop_duplicates?

[0;31mSignature:[0m [0mdiagnosis[0m[0;34m.[0m[0mdrop_duplicates[0m[0;34m([0m[0msubset[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mkeep[0m[0;34m=[0m[0;34m'first'[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return DataFrame with duplicate rows removed, optionally only
considering certain columns. Indexes, including time indexes
are ignored.

Parameters
----------
subset : column label or sequence of labels, optional
    Only consider certain columns for identifying duplicates, by
    default use all of the columns
keep : {'first', 'last', False}, default 'first'
    - ``first`` : Drop duplicates except for the first occurrence.
    - ``last`` : Drop duplicates except for the last occurrence.
    - False : Drop all duplicates.
inplace : boolean, default False
    Whether to drop duplicates in place or to return a copy

Returns
-------
DataFrame
[0;31mFile:[0m      ~/blood-pressure/env3p7/lib/py

In [69]:
# Columns that define a duplicate: the subject plus every column after the
# first one of `diagnosis`.
cols = ['subject_id', *diagnosis.columns[1:]]
# Keep the first row of each distinct combination, then key the result by
# (subject_id, rec_id); verify_integrity raises if that key is not unique.
tmp = (
    diagnosis
    .reset_index()
    .drop_duplicates(subset=cols)
    .set_index(['subject_id', 'rec_id'], verify_integrity=True)
    .sort_index()
)
# Shape of the rows that have hadm_id > 0.
tmp.loc[tmp['hadm_id'] > 0].shape

(4856, 17)

In [68]:
# Number of distinct subject_id values among rows with hadm_id > 0
# (subject_id is read from the index level of the same name).
len(set(diagnosis.loc[diagnosis['hadm_id'] > 0].index.get_level_values('subject_id')))

4710

In [71]:
# Number of distinct hadm_id values in `tmp`.
# NOTE(review): there is no `hadm_id > 0` filter here, so non-positive
# placeholder values (if any are present) are counted as well.
len(set(tmp['hadm_id']))

4857

In [82]:
# Distribution of the number of rows (hadm_id > 0) per level-0 group.
tmp.loc[tmp['hadm_id'] > 0].groupby(level=0).size().value_counts()

1    4547
2     133
3      13
4       1
dtype: int64

In [81]:
# Rows (hadm_id > 0) belonging to level-0 groups that have more than one row.
tmp.loc[tmp['hadm_id'] > 0].groupby(level=0).filter(lambda rows: len(rows) > 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,segment,hadm_id,25000,2724,2762,2859,4019,41401,4240,4241,42731,4280,51881,5849,5859,78552,99592
subject_id,rec_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2513,3490144,4,117823,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False
2513,3814737,13,173094,False,True,False,False,True,True,True,False,False,True,False,False,False,False,False
2639,3090785,11,158120,False,True,False,False,True,True,False,False,False,True,True,False,False,False,False
2639,3323235,6,185150,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False
2639,3403369,21,157999,False,True,False,False,True,False,True,False,False,True,True,False,False,True,True
2747,3308699,2,163539,False,False,True,False,True,False,False,True,True,False,False,False,False,False,False
2747,3839676,1,160561,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False
4113,3172335,6,182591,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4113,3592294,1,191180,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False
4599,3083798,6,130848,,,,,,,,,,,,,,,
