In [1]:
%matplotlib widget

import tensorflow as tf
from matplotlib import pyplot
import matplotlib
import matplotlib.cm as colormap
import numpy
import os
import json, pickle
import pandas
from functools import partial, reduce
import importlib
from sklearn import manifold
from scipy import stats

import sys
sys.path.append('../libs')

import flacdb
import prepare_data
import initialize
import data_pipeline
import conv_model
import plot_batch
import generate_report_kfold
import icd_util

pyplot.style.use('dark_background')

In [2]:
group_names = icd_util.load_group_strings()
metadata = pandas.read_hdf('/scr-ssd/mimic/metadata.hdf')

In [3]:
M = metadata.loc[(slice(None), 1), 'admission_diagnosis']
plain = M[M.notna()].astype(str)
counts = plain.value_counts()

In [45]:
plain.shape

(18362,)

In [54]:
import string
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [69]:
for i in plain.values[:100]:
    print(i.upper())
    print('\n')

S/P VFIB ARREST


PANCREATITS


ENDOCARDITIS


CHOLANGIO CANCER


MITAL VALVE PROLAPSE\MITRAL VALVE REPLACEMENT/SDA


CORONARY ARTERY DISEASE\CATH


S/P FALL


AORTIC STENOSIS\AORTIC VALVE REPLACEMENT/SDA


CELLULITIS


INTRACRANIAL HEMORRHAGE


ST SEGMENT ELEVATION MYOCARDIAL INFARCTION\CATH


ANGINA


AORTIC STENOSIS\AORTIC VALVE REPLACEMENT 


BILIARY OBSTRUCTION\ERCP 


SUBDURAL HEMATOMA;INTRAPARENCHYMAL HEMORRHAGE


GASTROINTESTINAL BLEED


AORTIC STENOSIS\CORONARY ARTERY BYPASS GRAFT WITH AVR /SDA


HYPOTENSION


ALTERED MENTAL STATUS


BI-LATERAL SUBDURAL HEMATOMA


CHEST PAIN


MR;AORTIC STENOSIS;AFIB\AORTIC VALVE REPLACEMENT;MITRAL VALVE REPLACEMENT; MAZE PROCEDURE/SDA


ELEVATED LIVER FUNCTION TESTS


TVR


SHORTNESS OF BREATH;WEAKNESS


EVOLVING ANTERIOR M.I.


ALTERED MENTAL STATUS


ABDOMINAL PAIN


FB IN ESOPHAGUS


BLUNT TRAUMA


SUBARACHNOID HEMORRHAGE/SDA


ENDOCARDITIS


RIGHT RENAL CELL CANCER W/VENA CAVA THROMBUS/SDA


CHEST PAIN


SUBARACHNOID HEMORRHAGE


NSTEMI;A

In [64]:
import string

X0 = [
    (i4 if 'S/P' not in i3 and 'R/O' not in i3 else i3).replace('?', '').strip() 
    for i1 in plain.values 
    for i2 in i1.split(';') 
    for i3 in i2.split('\\')
    for i4 in i3.split('/')
]
X0 = [' '.join(i.split()) for i in X0]
C = string.ascii_lowercase + ' '
X0 = [''.join(c for c in i if c.lower() in C).title() for i in X0]
X0 = [i.replace('Acute ', '') for i in X0]
X0 = [i for i in X0 if i not in ['Sda', 'Telemetry']]
X, counts = numpy.unique(X0, return_counts=True)
print(sum(counts), 'Problems,', len(set(X0)), 'Unique')
I = numpy.argsort(-counts)
for i in I[:sum(counts>100)]:
    print(counts[i], X[i])
print(counts[counts > 100].sum())

txt = ''
for i in I:
    txt += str(counts[i]) + ' ' + X[i] + '\n\n'
with open('unique_problems_new.txt', 'w') as f:
    f.write(txt)

24537 Problems, 2971 Unique
1007 Coronary Artery Disease
653 Congestive Heart Failure
596 Chest Pain
538 Pneumonia
473 Intracranial Hemorrhage
453 Sepsis
441 Stroke
427 Cath
426 Aortic Stenosis
382 Coronary Artery Bypass Graft
351 Subarachnoid Hemorrhage
334 Altered Mental Status
282 Transient Ischemic Attack
276 Renal Failure
272 Sp Fall
272 Myocardial Infarction
270 Subdural Hematoma
267 Gastrointestinal Bleed
264 Upper Gi Bleed
253 Abdominal Pain
228 Hypotension
196 Seizure
187 Fever
181 Lower Gi Bleed
177 Aortic Valve Replacement
176 Pancreatitis
165 Brain Mass
154 Cardiac Cath
134 Blunt Trauma
132 Shortness Of Breath
126 Coronary Syndrome
126 Liver Failure
125 Diabetic Ketoacidosis
116 Unstable Angina
116 Urinary Tract Infection
115 Left Heart Catheterization
114 Dyspnea
114 Respiratory Failure
109 Mitral Valve Replacement
11028


In [7]:
Y = numpy.array(list(group_names.values()))
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    analyzer='char_wb', 
    ngram_range=(2**1, 2**3)
)
Z = vectorizer.fit_transform(list(X) + list(Y))
print(Z.shape)

(12769, 108242)


In [8]:
D = Z.dot(Z.T).toarray()
DX = D[:X.size, :X.size]
DX -= numpy.diag(numpy.diag(DX))
DY = D[-Y.size:, -Y.size:]
DY -= numpy.diag(numpy.diag(DY))
DXY = D[:X.size, -Y.size:]

In [9]:
matches = numpy.argmax(DX, axis=1)
scores = numpy.max(DX, axis=1)
I = numpy.argsort(-scores)
for i in I[2500:2510]:
    print(round(scores[i], 3), '\n', X[i], '\n', X[matches[i]])

0.506 
 Right To Left Shunt 
 Right
0.506 
 Met Renal Cell Ca-Trachel Obstruction 
 Obstruction
0.505 
 Fever Tachy Cardia 
 Fever
0.505 
 Ascending Aorta To Supraciliac Aorta Bypass 
 Ascending Aorta Replacement
0.505 
 Intracutaneous Fistula 
 Enterocutaneous Fistula
0.505 
 Enterocutaneous Fistula 
 Intracutaneous Fistula
0.504 
 V-Tach 
 V-Tach, Heart Failure
0.504 
 Varicella Zoster Encephalitis 
 Varicella Pneumonia
0.504 
 Hip Fracture,Right Side 
 Right Hip Fracture
0.503 
 Icd Shocks 
 Icd Shock For Vt


In [38]:
from scipy import sparse
graph = sparse.csr_matrix(DX > 0.95)
unique_count, labels = sparse.csgraph.connected_components(csgraph=graph, directed=False)
print(unique_count, max((labels == i).sum() for i in labels))

2967 4


In [41]:
txt = ''
for i in range(unique_count):
    I = labels == i
    A, B = counts[I], X[I]
    J = numpy.argsort(A)
    if counts[I].sum() > 10:
        for j in numpy.argsort(-counts[I]):
            txt += str(A[j]) + '\t' + B[j] + '\n'
        txt += '\n'

# print(txt)
# with open('clusters.txt', 'w') as f:
#     f.write(txt)