In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
proceds = pd.read_csv('PROCEDURES_ICD.csv')
d_proceds = pd.read_csv('D_ICD_PROCEDURES.csv')

## Topic Modelling

Topic modelling uses unsupervised ML to identify clusters or groups of similar words within a body of text

LSA: Latent semantic analysis is a statistical technique for extracting and representing the main ideas in a body of text. LSA is based on the principle that words that are close in meaning tend to be used together in context.

Latent Dirichlet analysis is one of the most popular topic modeling methods. It uncovers the hidden structure in a set of observations by looking at the relationships between words in a document and grouping them into topics.

In [6]:
admissions = pd.read_csv('ADMISSIONS.csv')
# get all the individual hadm_id
hadm_ids = admissions['HADM_ID'].unique()

In [7]:
# combine the two tables
df_proceds = pd.merge(proceds, d_proceds, on='ICD9_CODE', how='inner')

In [8]:
df_proceds = df_proceds[['HADM_ID','ICD9_CODE', 'LONG_TITLE']] # extract the columns we need
df_proceds['ICD9_CODE'] = df_proceds['ICD9_CODE'].astype(str) # convert the ICD9_CODE to string
agg_proceds = df_proceds.groupby('HADM_ID')['ICD9_CODE'].apply(list)

In [9]:
unique_procedures = d_proceds['ICD9_CODE'].unique()
num_unique_prods, num_ids = len(unique_procedures), len(hadm_ids)
num_unique_prods, num_ids

(3809, 58976)

In [10]:
# convert to string
unique_procedures = [str(x) for x in unique_procedures]

In [11]:
vector_length = num_unique_prods  # Assuming IDs start from 0 and are contiguous
binary_tensor = np.zeros((num_ids, vector_length), dtype=int)
d = pd.DataFrame(binary_tensor, columns=unique_procedures)
d.index = hadm_ids
# change name of index column
d.index.name = 'HADM_ID'
d.reset_index(inplace=True)
d

Unnamed: 0,HADM_ID,851,852,859,861,862,863,864,869,870,...,9233,9239,9241,9957,9958,9959,9960,9961,9962,9963
0,165315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,152223,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124321,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,161859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,129635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,191113,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58972,101071,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58973,122631,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58974,170407,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
agg_proceds = agg_proceds.explode().reset_index()

In [13]:
# populating the OHV, this takes a while
for i in range(len(agg_proceds)):
    hadm = agg_proceds.loc[i]['HADM_ID']
    icd9 = agg_proceds.loc[i]['ICD9_CODE']
    d.loc[d['HADM_ID'] == hadm, icd9] = 1
    

In [35]:
# find HADM_IDs that have diagnosis with 5990
diagnoses = pd.read_csv('DIAGNOSES_ICD.csv')
d_diagnoses = pd.read_csv('D_ICD_DIAGNOSES.csv')

# combine the two tables, based on ICD9_CODE, without short title
df_diag = pd.merge(diagnoses, d_diagnoses, on='ICD9_CODE', how='left')
df_diag = df_diag.dropna(subset=['ICD9_CODE'])
df_diag['ICD9_CODE'] = df_diag['ICD9_CODE'].astype(str)
df_diag = df_diag[['HADM_ID','ICD9_CODE', 'LONG_TITLE']]

In [37]:
# check if the diagnosis is 5990
df_diag['UTI'] = df_diag['ICD9_CODE'].apply(lambda x: 1 if x == '5990' else 0)
# get the hadm_id that has diagnosis 5990
hadm_ids_5990 = df_diag[df_diag['UTI'] == 1]['HADM_ID'].unique()
hadm_ids_5990

array([114585, 134369, 191817, ..., 122472, 109999, 161999])

In [42]:
d

Unnamed: 0,HADM_ID,851,852,859,861,862,863,864,869,870,...,9239,9241,9957,9958,9959,9960,9961,9962,9963,UTI
0,165315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,152223,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124321,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,161859,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,129635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,191113,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58972,101071,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58973,122631,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58974,170407,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [48]:
from sklearn.model_selection import train_test_split

# split the data into train and test
X = d.drop(['HADM_ID', 'UTI'], axis=1)
y = d['UTI']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Autoencoders

We reduce the sparse OHV down to 32 dimensions.

In [51]:
# autoencoder

from keras.layers import Input, Dense
from keras.models import Model

3809

In [53]:
input_dim = x_train.shape[1]
encoding_dim = 32

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="relu")(input_layer)

autoencoder = Model(inputs=input_layer, outputs=encoder)

In [54]:
encoded = autoencoder.predict(x_train)



In [56]:
encoded.shape

(47180, 32)

In [57]:
x_train.shape

(47180, 3809)