In [2]:
import tensorflow as tf
import numpy
import os
import json, pickle
import pandas
from functools import partial, reduce
import importlib

import sys
sys.path.append('../libs')

import flacdb
import prepare_data
import initialize
import data_pipeline
import loss_metrics
import conv_model
import plot_batch

In [2]:
! nvidia-smi

Tue Apr 21 20:33:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.78       Driver Version: 410.78       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX TIT...  On   | 00000000:05:00.0 Off |                  N/A |
| 22%   28C    P8    16W / 250W |  11837MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [3]:
icd_data = json.load(open('/scr-ssd/icd/icd9.json'))
icd_data[0]

[{'code': '001-139',
  'depth': 1,
  'href': '/index.php?action=child&recordid=1',
  'descr': 'INFECTIOUS AND PARASITIC DISEASES '},
 {'code': '001-009',
  'depth': 2,
  'href': '/index.php?action=child&recordid=2',
  'descr': 'INTESTINAL INFECTIOUS DISEASES '},
 {'code': '001',
  'depth': 3,
  'href': '/index.php?action=child&recordid=3',
  'descr': 'Cholera'},
 {'code': '001.9', 'depth': 4, 'href': None, 'descr': 'Cholera, unspecified'}]

In [4]:
def _to_string(i, with_dot):
    a = str(i[0]).zfill(3)
    b = ''.join(str(j) for j in i[1:])
    dot = '.' if with_dot and len(i) > 1 else ''
    return a + dot + b

def code_tuple_to_string(i):
    return _to_string(i, with_dot=False)

def code_string_to_tuple(i):
    return (int(i[:3]), *[int(j) for j in i[3:]])

def group_string_to_tuple(i):
    if '-' in i:
        return tuple((int(j),) for j in i.split('-'))
    elif '.' in i:
        a, b = i.split('.')
        return ((int(a), *[int(j) for j in b]),)
    else:
        return ((int(i),),)

def group_tuple_to_string(i):
    return '-'.join(_to_string(j, with_dot=True) for j in i)

In [5]:
root = '/scr1/mimic/'
clinic_file = lambda i: root + 'clinic/{}.csv'.format(i.upper())
diagnosis_info = pandas.read_csv(clinic_file('d_icd_diagnoses'))
diagnosis_info = diagnosis_info.drop(columns='ROW_ID').set_index('ICD9_CODE')
diagnosis_info.rename(columns={'LONG_TITLE': 'Condition'}, inplace=True)
diagnosis = pandas.read_csv(clinic_file('diagnoses_icd')).drop(columns='ROW_ID')
new_names = {i.upper(): i for i in ['subject_id', 'hadm_id']}
new_names['ICD9_CODE'] = 'code'
diagnosis = diagnosis[new_names].rename(columns=new_names)
diagnosis[:5]

Unnamed: 0,subject_id,hadm_id,code
0,109,172335,40301
1,109,172335,486
2,109,172335,58281
3,109,172335,5855
4,109,172335,4254


In [6]:
code_counts = diagnosis['code'].value_counts()
code_counts = pandas.concat([
    diagnosis_info.reindex(code_counts.index)[['Condition']], 
    pandas.Series(code_counts, name='Count')
], axis=1)
code_counts[:10]

Unnamed: 0,Condition,Count
4019,Unspecified essential hypertension,20703
4280,"Congestive heart failure, unspecified",13111
42731,Atrial fibrillation,12891
41401,Coronary atherosclerosis of native coronary ar...,12429
5849,"Acute kidney failure, unspecified",9119
25000,Diabetes mellitus without mention of complicat...,9058
2724,Other and unspecified hyperlipidemia,8690
51881,Acute respiratory failure,7497
5990,"Urinary tract infection, site not specified",6555
53081,Esophageal reflux,6326


In [7]:
code_strings = {i for i in code_counts.index if i[0] in '0123456789'}

code_tuples = {code_string_to_tuple(i) for i in code_strings}

group_strings = {
    j['code']: j['descr'].title()
    for i in icd_data for j in i 
    if j['code'] is not None and j['code'][0] in '0123456789'   
}

group_tuples = {group_string_to_tuple(i) for i in group_strings}

assert(len(code_tuples) == len(code_strings))

assert(all(
    code_tuple_to_string(code_string_to_tuple(i)) == i 
    for i in code_strings
))

assert(len(group_tuples) == len(group_strings))

assert(all(
    group_tuple_to_string(group_string_to_tuple(i)) == i 
    for i in group_strings
))

In [8]:
groups = {group_string_to_tuple(i): group_strings[i] for i in group_strings}
pandas.DataFrame.from_dict({'name': {str(k): groups[k] for k in groups}})

Unnamed: 0,name
"((1, 0),)",Due To Vibrio Cholerae
"((1, 1),)",Due To Vibrio Cholerae El Tor
"((1, 9),)","Cholera, Unspecified"
"((1,), (139,))",Infectious And Parasitic Diseases
"((1,), (9,))",Intestinal Infectious Diseases
...,...
"((999, 8, 2),)",Extravasation Of Other Vesicant Agent
"((999, 8, 8),)",Other Infusion Reaction
"((999, 8, 9),)",Other Transfusion Reaction
"((999, 9),)",Other And Unspecified Complications Of Medical...


In [9]:
def is_contained(code, group):
    if len(group) == 1:
        node = group[0]
        if node[0] != code[0] or len(node) > len(code):
            return False
        return all(node[i] == code[i] for i in range(len(node)))
    else:
        return group[0][0] <= code[0] <= group[1][0]

In [10]:
%%time

code_groups = {
    c: [g for g in groups if is_contained(c, g)] 
    for c in code_tuples
}

CPU times: user 16.6 s, sys: 24 ms, total: 16.6 s
Wall time: 16.6 s


In [11]:
diag_ = diagnosis.drop_duplicates()
diag_ = diag_.set_index(['subject_id', 'hadm_id', 'code'])
diag_ = diag_.sort_index()

In [12]:
%%time
group_diagnosis = {}
for i in diag_.index.get_level_values('subject_id').unique():
    for j in diag_.loc[i].index.get_level_values('hadm_id').unique():
        group_diagnosis[i, j] = set()
        for k in diag_.loc[(i, j)].index:
            if type(k) is str and k[0] in '0123456789':
                code = code_string_to_tuple(k)
                group_diagnosis[i, j].update(code_groups[code])

CPU times: user 2min 28s, sys: 380 ms, total: 2min 28s
Wall time: 2min 28s


In [13]:
group_counts = {group: 0 for group in groups}
for i in group_diagnosis:
    for group in group_diagnosis[i]:
        group_counts[group] += 1

df = pandas.DataFrame.from_dict({
    'name': {group_tuple_to_string(k): groups[k] for k in groups},
    'count': {group_tuple_to_string(k): group_counts[k] for k in group_counts}
})
df.sort_values(['count'], ascending=False, inplace=True)
df[:50]

Unnamed: 0,name,count
390-459,Diseases Of The Circulatory System,42650
240-279,"Endocrine, Nutritional And Metabolic Diseases,...",35606
270-279,Other Metabolic And Immunity Disorders,28210
401-405,Hypertensive Disease,27912
420-429,Other Forms Of Heart Disease,26025
460-519,Diseases Of The Respiratory System,25260
800-999,Injury And Poisoning,22686
580-629,Diseases Of The Genitourinary System,21779
401,Essential Hypertension,21305
520-579,Diseases Of The Digestive System,21010


In [22]:
df[df.apply(lambda i: 'malignant' in i['name'].lower(), axis=1)][:50]

Unnamed: 0,name,count
190-199,Malignant Neoplasm Of Other And Unspecified Si...,3401
198,Secondary Malignant Neoplasm Of Other Specifie...,1810
197,Secondary Malignant Neoplasm Of Respiratory An...,1748
150-159,Malignant Neoplasm Of Digestive Organs And Per...,1570
200-208,Malignant Neoplasm Of Lymphatic And Hematopoie...,1481
160-165,Malignant Neoplasm Of Respiratory And Intratho...,1247
162,"Malignant Neoplasm Of Trachea, Bronchus, And Lung",1171
179-189,Malignant Neoplasm Of Genitourinary Organs,788
196,Secondary And Unspecified Malignant Neoplasm O...,709
202,Other Malignant Neoplasms Of Lymphoid And Hist...,498


In [26]:
df.index.map(lambda i: '03' in i)

Index([False, False, False, False, False, False, False, False, False, False,
       ...
       False, False, False, False, False, False, False, False, False, False],
      dtype='object', length=9719)

In [29]:
df[df.index.map(lambda i: i[0] == '7')][:50]

Unnamed: 0,name,count
780-799,"Symptoms, Signs, And Ill-Defined Conditions",20182
780-789,Symptoms,16901
710-739,Diseases Of The Musculoskeletal System And Con...,10053
760-779,Certain Conditions Originating In The Perinata...,5488
764-779,Other Conditions Originating In The Perinatal ...,5458
780,General Symptoms,5449
785,Symptoms Involving Cardiovascular System,5351
790-796,Nonspecific Abnormal Findings,4586
785.5,Shock Without Mention Of Trauma,4283
730-739,"Osteopathies, Chondropathies, And Acquired Mus...",3794


In [32]:
df[df.index.map(lambda i: i[:2] == '45')][:50]

Unnamed: 0,name,count
451-459,"Diseases Of Veins And Lymphatics, And Other Di...",8792
458,Hypotension,5045
458.2,Iatrogenic Hypotension,2453
458.29,Other Iatrogenic Hypotension,2121
458.9,"Hypotension, Unspecified",2051
453,Other Venous Embolism And Thrombosis,1556
456,Varicose Veins Of Other Sites,1000
453.8,Of Other Specified Veins,886
459,Other Disorders Of Circulatory System,828
456.2,Esophageal Varices In Diseases Classified Else...,770


In [14]:
df[300:350]

Unnamed: 0,name,count
415,Acute Pulmonary Heart Disease,1116
413,Angina Pectoris,1115
300.4,Dysthymic Disorder,1109
415.1,Pulmonary Embolism And Infarction,1109
519,Other Diseases Of Respiratory System,1107
576,Other Disorders Of Biliary Tract,1104
574,Cholelithiasis,1100
413.9,Other And Unspecified Angina Pectoris,1100
428.22,Chronic,1097
569,Other Disorders Of Intestine,1096


In [15]:
df[350:400]

Unnamed: 0,name,count
787.2,Dysphagia,913
362.0,Diabetic Retinopathy,908
780.5,Sleep Disturbances,907
396,Diseases Of Mitral And Aortic Valves,906
041.0,Streptococcus,899
453.8,Of Other Specified Veins,886
765.27,33-34 Weeks Of Gestation,881
745,Bulbus Cordis Anomalies And Anomalies Of Cardi...,876
960-979,"Poisoning By Drugs, Medicinal And Biological S...",876
362.01,Background Diabetic Retinopathy,875


In [16]:
df[400:450]

Unnamed: 0,name,count
365.0,Glaucoma,742
70.7,Unspecified Viral Hepatitis C,740
557.0,Vascular Insufficiency Of Intestine,736
585.3,"Chronic Kidney Disease, Stage Iii (Moderate)",735
349.0,Other And Unspecified Disorders Of The Nervous...,733
682.6,"Leg, Except Foot",732
537.0,Other Disorders Of Stomach And Duodenum,730
348.4,Compression Of Brain,711
365.9,Unspecified Glaucoma,711
250.8,Diabetes With Other Specified Manifestations,711


In [17]:
df[450:500]

Unnamed: 0,name,count
569.8,Other Specified Disorders Of Intestine,632
342.9,"Hemiplegia, Unspecified",632
930-939,Effects Of Foreign Body Entering Through Orifice,628
730,"Osteomyelitis, Periostitis, And Other Infectio...",620
583.8,With Other Specified Pathological Lesion In Ki...,617
041.04,Group D [Enterococcus],614
294.10,Dementia In Conditions Classified Elsewhere Wi...,613
255,Disorders Of Adrenal Glands,610
438.8,Other Late Effects Of Cerebrovascular Disease,608
785.0,"Tachycardia, Unspecified",607


In [18]:
df[500:550]

Unnamed: 0,name,count
426.0,"Atrioventricular Block, Complete",530
380-389,Diseases Of The Ear And Mastoid Process,523
344,Other Paralytic Syndromes,521
996.81,Kidney,515
038.11,Methicillin Susceptible Staphylococcus Aureus ...,515
532,Duodenal Ulcer,513
721,Spondylosis And Allied Disorders,513
255.4,Corticoadrenal Insufficiency,511
332,Parkinson'S Disease,510
536.3,Gastroparesis,509


In [37]:
selected_groups = open('selected_icd_groups.txt').readlines()
selected_groups = [i.split()[0] for i in selected_groups]
selected_groups = [group_string_to_tuple(i) for i in selected_groups]
selected_groups[:5]

[((401,), (405,)), ((460,), (519,)), ((401,),), ((280,), (289,)), ((427,),)]

In [88]:
%%time

selected_group_diagnosis = {
    i: {group_tuple_to_string(j) for j in group_diagnosis[i] if j in selected_groups}
    for i in group_diagnosis
}

data = [
    [i[0], i[1], j]
    for i in selected_group_diagnosis
    for j in selected_group_diagnosis[i]
]

CPU times: user 7.02 s, sys: 36 ms, total: 7.05 s
Wall time: 7.05 s


In [90]:
df = pandas.DataFrame(data, columns=['subject_id', 'hadm_id', 'code'])
df.set_index(['subject_id', 'hadm_id', 'code'], inplace=True)
df.sort_index(inplace=True)
df.at[:, 'present'] = True
df = df.unstack(fill_value=False)['present'].astype('bool')
df

Unnamed: 0_level_0,code,038,070,150-159,160-165,162,190-199,200-208,244,250,250.4,...,769,780.3,785.51,785.52,789.5,799.0,799.02,995.9,995.91,995.92
subject_id,hadm_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3,145834,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,185777,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,107064,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,150750,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
11,194540,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99985,176670,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,True,False,True
99991,151118,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,True,False,False,True,False,True
99992,197084,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
99995,137810,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [1]:
df

NameError: name 'df' is not defined