In [1]:
import tensorflow as tf
import numpy
import os
import json, pickle
import pandas
from functools import partial, reduce
import importlib

import sys
sys.path.append('../libs')

import flacdb
import prepare_data
import initialize
import data_pipeline
import loss_metrics
import conv_model
import plot_batch

In [2]:
icd_data = json.load(open('/scr-ssd/icd/icd9.json'))

In [3]:
groups = {j['code'] for i in icd_data for j in i}
groups = {i for i in groups if i is not None and i[0] in '0123456789'}
print(len(groups))
sorted(groups)[:10]

9719


['001',
 '001-009',
 '001-139',
 '001.0',
 '001.1',
 '001.9',
 '002',
 '002.0',
 '002.1',
 '002.2']

In [None]:
groups = []
for i in icd_data:
    for j in i:
        c = j['code']
        if c is not None and c[0] in '0123456789':
            if '-' in c:
                group = tuple((int(k),) for k in c.split('-'))
            elif '.' in c:
                a, b = c.split('.')
                group = ((int(a), *[int(k) for k in b]),)
            else:
                group = ((int(c),),)
            groups.append(group)

groups = set(groups)


print(len(groups))
print(sorted(groups)[:5])
print(sorted(groups)[-5:])

In [181]:
diagnosis

Unnamed: 0,subject_id,hadm_id,code
0,109,172335,40301
1,109,172335,486
2,109,172335,58281
3,109,172335,5855
4,109,172335,4254
...,...,...,...
651042,97503,188195,20280
651043,97503,188195,V5869
651044,97503,188195,V1279
651045,97503,188195,5275


In [4]:
root = '/scr1/mimic/'
clinic_file = lambda i: root + 'clinic/{}.csv'.format(i.upper())
diagnoses_info = pandas.read_csv(clinic_file('d_icd_diagnoses'))
diagnoses_info = diagnoses_info.drop(columns='ROW_ID').set_index('ICD9_CODE')
diagnoses_info.rename(columns={'LONG_TITLE': 'Condition'}, inplace=True)
diagnosis = pandas.read_csv(clinic_file('diagnoses_icd')).drop(columns='ROW_ID')
new_names = {i.upper(): i for i in ['subject_id', 'hadm_id']}
new_names['ICD9_CODE'] = 'code'
diagnosis = diagnosis[new_names].rename(columns=new_names)
code_counts = diagnosis['code'].value_counts()
code_counts = pandas.concat([
    diagnoses_info.reindex(code_counts.index)[['Condition']], 
    pandas.Series(code_counts, name='Count')
], axis=1)
code_counts[:10]

Unnamed: 0,Condition,Count
4019,Unspecified essential hypertension,20703
4280,"Congestive heart failure, unspecified",13111
42731,Atrial fibrillation,12891
41401,Coronary atherosclerosis of native coronary ar...,12429
5849,"Acute kidney failure, unspecified",9119
25000,Diabetes mellitus without mention of complicat...,9058
2724,Other and unspecified hyperlipidemia,8690
51881,Acute respiratory failure,7497
5990,"Urinary tract infection, site not specified",6555
53081,Esophageal reflux,6326


In [5]:
def code_string_to_tuple(i):
    return (int(i[:3]), *[int(j) for j in i[3:]])

def code_tuple_to_string(i):
    return str(i[0]).zfill(3) + ''.join(str(j) for j in i[1:])

In [6]:
all(
    code_tuple_to_string(code_string_to_tuple(i)) == i 
    for i in code_counts.index if i[0] in '0123456789'
)

True

In [7]:
codes = [i for i in code_counts.index if i[0] in '0123456789']
codes = [code_string_to_tuple(i) for i in codes]
print(len(set(codes)), 'codes')
codes[:5]

5991 codes


[(401, 9), (428, 0), (427, 3, 1), (414, 0, 1), (584, 9)]

In [8]:
all(len(j) == 1 for i in groups if len(i) == 2 for j in i)

True

In [9]:
def is_contained(code, group):
    if len(group) == 1:
        node = group[0]
        if node[0] != code[0] or len(node) > len(code):
            return False
        return all(node[i] == code[i] for i in range(len(node)))
    else:
        return group[0][0] <= code[0] <= group[1][0]

In [10]:
code_groups = {c: [g for g in groups if is_contained(c, g)] for c in codes}

TypeError: '<=' not supported between instances of 'str' and 'int'

In [99]:
sum(any(is_contained(c, g) for g in groups) for c in codes)

5991

In [102]:
sum(len(code_groups[i]) > 0 for i in code_groups)

5991

In [169]:
code_groups[(401, 9)]

[((401, 9),), ((401,),), ((401,), (405,)), ((390,), (459,))]

In [165]:
group_counts = {group: 0 for group in groups}
for code in code_groups:
    code_string = code_tuple_to_string(code)
    for group in code_groups[code]:
        group_counts[group] += code_counts['Count'][code_string]

In [166]:
group_counts[((240,),)]

75

In [176]:
counts_ = {str(k): group_counts[k] for k in group_counts}
df = pandas.DataFrame.from_dict({'count': counts_})
df.sort_values(['count'], ascending=False, inplace=True)
df[:50]

Unnamed: 0,count
"((390,), (459,))",140257
"((240,), (279,))",69592
"((420,), (429,))",53211
"((460,), (519,))",44825
"((800,), (999,))",42948
"((270,), (279,))",42900
"((520,), (579,))",38527
"((580,), (629,))",32867
"((780,), (799,))",29691
"((401,), (405,))",27988


In [167]:
group_counts_df = pandas.concat([
    pandas.Series(group_counts, name='name'),
    pandas.Series(group_counts, name='count')
], axis=1)
group_counts_df.sort_values(['count'], ascending=False, inplace=True)
group_counts_df[:50]

Unnamed: 0,name,count
"(390,)",140257,140257
"(240,)",69592,69592
"(420,)",53211,53211
"(460,)",44825,44825
"(800,)",42948,42948
"(270,)",42900,42900
"(520,)",38527,38527
"(580,)",32867,32867
"(780,)",29691,29691
"(401,)",27988,27988


In [139]:
max(group_counts.values())

139422

In [107]:
min(group_counts.values())

0