In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
def convert_to_icd9(dxStr):
    """Insert the decimal point into a compact ICD-9 code string.

    Codes starting with 'E' keep four characters before the point, all other
    codes keep three. Codes that are not longer than that prefix are returned
    unchanged.
    """
    prefix_len = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= prefix_len:
        return dxStr
    return dxStr[:prefix_len] + '.' + dxStr[prefix_len:]
    
def convert_to_3digit_icd9(dxStr):
    """Truncate a compact ICD-9 code string to its category prefix.

    Codes starting with 'E' are cut to four characters, all other codes to
    three. Shorter codes come back unchanged, because slicing past the end of
    a string simply returns the whole string.
    """
    prefix_len = 4 if dxStr.startswith('E') else 3
    return dxStr[:prefix_len]

In [None]:
# Both input tables live in the same directory, relative to the notebook.
DATA_DIR = Path('../data')
admission_file = DATA_DIR / 'ADMISSIONS.csv'
diagnosis_file = DATA_DIR / 'DIAGNOSES_ICD.csv'

In [None]:
# Load the admissions table and parse every timestamp column to datetime.
admissions_df = pd.read_csv(admission_file)
for time_column in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime'):
    admissions_df[time_column] = pd.to_datetime(admissions_df[time_column])
admissions_df.head()

In [None]:
# Load the diagnosis table and derive the dotted and the truncated code columns.
#
# The code column is read explicitly as text: with dtype inference a
# numeric-looking code can be parsed as an integer (dropping leading zeros),
# and a missing code becomes a float NaN. Either would make the string-based
# converters below fail on `.startswith`.
diagnosis_df = (
    pd.read_csv(diagnosis_file, dtype={'icd9_code': str})
    # Rows without a code carry no diagnosis information and cannot be converted.
    .dropna(subset=['icd9_code'])
    .reset_index(drop=True)
)
diagnosis_df['icd9_code_converted'] = diagnosis_df['icd9_code'].apply(convert_to_icd9)
diagnosis_df['icd9_code_converted_3digits'] = diagnosis_df['icd9_code'].apply(convert_to_3digit_icd9)
diagnosis_df.head()

In [None]:
# Step 1: collect all diagnosis codes of one admission into lists (one row per hadm_id).
codes_per_admission = diagnosis_df.groupby('hadm_id').agg({
    'icd9_code': list,
    'icd9_code_converted': list,
    'icd9_code_converted_3digits': list,
})

# Step 2: attach the code lists to the admission rows (inner join, so admissions
# without any diagnosis row are dropped).
combined_df = pd.merge(admissions_df, codes_per_admission, on=['hadm_id'])

# Step 3: collect the admissions of one subject into lists (one row per subject_id).
# The rows are sorted by admission time first: groupby keeps the row order inside
# each group, and the per-subject lists are later used as visit histories, so
# they must be chronological rather than in file order.
admissions_per_subject = (
    combined_df
    .sort_values(['subject_id', 'admittime'])
    .groupby('subject_id')
    .agg({
        'hadm_id': list,
        'admittime': list,
        'diagnosis': list,
        'icd9_code': list,
        'icd9_code_converted': list,
        'icd9_code_converted_3digits': list,
    })
)
admissions_per_subject['num_admissions'] = admissions_per_subject['hadm_id'].apply(len)
admissions_per_subject.head()

In [None]:
# A next-admission target only exists for subjects with at least two admissions.
has_multiple_admissions = admissions_per_subject['num_admissions'].ge(2)
relevant_data = admissions_per_subject.loc[has_multiple_admissions]
relevant_data

In [None]:
# Build the code -> index vocabulary over every 3-digit code of every admission
# of every selected subject.
#
# The codes are sorted before indices are assigned: iterating a plain set of
# strings yields a different order in every Python process (string hashing is
# randomised), which would make the indices - and anything that refers to them
# by number - change after each kernel restart.
all_symptoms = sorted({
    code
    for admissions in relevant_data['icd9_code_converted_3digits']
    for codes in admissions
    for code in codes
})
vocab = {symptom: index for index, symptom in enumerate(all_symptoms)}

len(vocab)

In [None]:
# One entry per subject: a list of admissions, each a list of 3-digit codes.
code_sequences = relevant_data['icd9_code_converted_3digits']

# Largest number of admissions of any subject.
max_sequence_length = code_sequences.map(len).max()
# Largest total number of codes over all admissions of any subject.
max_symptoms_per_sequence = code_sequences.map(
    lambda admissions: sum(len(codes) for codes in admissions)
).max()

# Hold out 10% of the subjects; the fixed seed keeps the split reproducible.
train_sequences, test_sequences = train_test_split(
    code_sequences,
    test_size=0.1,
    random_state=12345,
)
train_sequences.tolist()[5]

In [None]:
def split_sequence(sequence):
    """Expand one sequence into (history, next element) pairs.

    For every position ``i`` from 1 to ``len(sequence) - 1`` a dict is built
    whose ``'x'`` is the prefix ``sequence[0:i]`` and whose ``'y'`` is
    ``sequence[i]``. Sequences with fewer than two elements yield ``[]``.
    """
    return [
        {'x': sequence[0:cut], 'y': sequence[cut]}
        for cut in range(1, len(sequence))
    ]

def split_sequences(sequences):
    """Apply ``split_sequence`` to every sequence and concatenate the results.

    Returns one flat list holding the ``{'x': ..., 'y': ...}`` dicts of all
    input sequences, in input order.
    """
    return [
        pair
        for sequence in sequences
        for pair in split_sequence(sequence)
    ]

def transform_symptoms(symptoms, vocab):
    """Encode a collection of codes as a multi-hot tensor of length ``len(vocab)``.

    The position ``vocab[symptom]`` is set to 1 for every given symptom and
    all other positions stay 0; an empty collection gives an all-zero vector.
    A symptom missing from ``vocab`` raises ``KeyError``.
    """
    positions = np.asarray([vocab[symptom] for symptom in symptoms], dtype=np.intp)
    multi_hot = np.zeros(len(vocab))
    multi_hot[positions] = 1
    return tf.convert_to_tensor(multi_hot)

def translate_and_pad_x_flat(splitted, vocab, max_sequence_length):
    """Encode the history ``splitted['x']`` with one multi-hot vector per element.

    All-zero vectors are placed in front so that the list reaches
    ``max_sequence_length`` entries (no padding is added when the history is
    already that long or longer). The list is stored in ``splitted['x_vecs']``
    and its stacked tensor in ``splitted['x_vecs_stacked']``; the dict is
    modified in place and nothing is returned.
    """
    vectors = splitted['x_vecs'] = []
    history = splitted['x']
    padding_count = max_sequence_length - len(history)
    # Leading padding first, then the real steps in their original order.
    vectors.extend(transform_symptoms([], vocab) for _ in range(padding_count))
    vectors.extend(transform_symptoms(step, vocab) for step in history)
    splitted['x_vecs_stacked'] = tf.stack(vectors)

def translate_and_pad_x_wide(splitted, vocab, max_symptoms_per_sequence):
    """Encode the history ``splitted['x']`` with one one-hot vector per symptom.

    The elements of the history are flattened into a single list of symptoms,
    each of which becomes its own vector. All-zero vectors are placed in front
    so that the list reaches ``max_symptoms_per_sequence`` entries (no padding
    when there are already that many symptoms or more). The list is stored in
    ``splitted['x_vecs']`` and its stacked tensor in
    ``splitted['x_vecs_stacked']``; the dict is modified in place.
    """
    vectors = splitted['x_vecs'] = []
    flattened = [symptom for step in splitted['x'] for symptom in step]
    padding_count = max_symptoms_per_sequence - len(flattened)
    # Leading padding first, then one vector per symptom in original order.
    vectors.extend(transform_symptoms([], vocab) for _ in range(padding_count))
    vectors.extend(transform_symptoms([symptom], vocab) for symptom in flattened)
    splitted['x_vecs_stacked'] = tf.stack(vectors)

def translate_and_pad(splitted, vocab, max_sequence_length, max_symptoms_per_sequence, flat):
    """Vectorise one ``{'x': ..., 'y': ...}`` example in place.

    The target ``splitted['y']`` is always encoded as a multi-hot vector in
    ``splitted['y_vec']``. The history is encoded by
    ``translate_and_pad_x_flat`` when ``flat`` is truthy and by
    ``translate_and_pad_x_wide`` otherwise.
    """
    splitted['y_vec'] = transform_symptoms(splitted['y'], vocab)
    if not flat:
        translate_and_pad_x_wide(splitted, vocab, max_symptoms_per_sequence)
        return
    translate_and_pad_x_flat(splitted, vocab, max_sequence_length)
    

def transform_sequences(sequences, vocab, max_sequence_length, max_symptoms_per_sequence, flat=True):
    """Turn raw sequences into vectorised (history, target) examples.

    Every sequence is expanded by ``split_sequences`` and each resulting dict
    is vectorised in place by ``translate_and_pad``. Returns the list of
    example dicts.
    """
    examples = split_sequences(sequences)
    for example in examples:
        translate_and_pad(example, vocab, max_sequence_length, max_symptoms_per_sequence, flat)

    return examples


# Smoke test: run the wide encoding on a single training sequence (index 5)
# and show the stacked inputs; only the last expression is displayed.
sample_sequences = train_sequences.tolist()[5:6]
transformed_5 = transform_sequences(sample_sequences, vocab, max_sequence_length, max_symptoms_per_sequence, flat=False)
tf.stack([[example['y_vec']] for example in transformed_5])
tf.stack([example['x_vecs_stacked'] for example in transformed_5])

In [None]:
# Vectorise the train and test sequences with the "flat" encoding
# (one multi-hot vector per history element).
train_transformed_flat = transform_sequences(train_sequences.tolist(), vocab, max_sequence_length, max_symptoms_per_sequence, flat=True)
test_transformed_flat = transform_sequences(test_sequences.tolist(), vocab, max_sequence_length, max_symptoms_per_sequence, flat=True)

# Inputs: one stacked history per example.
# Targets: one multi-hot vector per example, i.e. (examples, len(vocab)).
# The target vectors are stacked directly, without an extra singleton axis:
# the model ends in LSTM(return_sequences=False) + Dense and therefore emits
# one vector per example, and a target of a different rank either broadcasts
# against the whole batch inside the loss or is rejected as a shape mismatch.
train_x_flat = tf.stack([example['x_vecs_stacked'] for example in train_transformed_flat])
train_y_flat = tf.stack([example['y_vec'] for example in train_transformed_flat])
test_x_flat = tf.stack([example['x_vecs_stacked'] for example in test_transformed_flat])
test_y_flat = tf.stack([example['y_vec'] for example in test_transformed_flat])

print(train_x_flat.shape)
print(train_y_flat.shape)

In [None]:
# Vectorise the train and test sequences with the "wide" encoding
# (one one-hot vector per individual symptom of the history).
train_transformed_wide = transform_sequences(train_sequences.tolist(), vocab, max_sequence_length, max_symptoms_per_sequence, flat=False)
test_transformed_wide = transform_sequences(test_sequences.tolist(), vocab, max_sequence_length, max_symptoms_per_sequence, flat=False)

# Inputs: one stacked history per example.
# Targets: one multi-hot vector per example, i.e. (examples, len(vocab)).
# The target vectors are stacked directly, without an extra singleton axis:
# the model ends in LSTM(return_sequences=False) + Dense and therefore emits
# one vector per example, and a target of a different rank either broadcasts
# against the whole batch inside the loss or is rejected as a shape mismatch.
train_x_wide = tf.stack([example['x_vecs_stacked'] for example in train_transformed_wide])
train_y_wide = tf.stack([example['y_vec'] for example in train_transformed_wide])
test_x_wide = tf.stack([example['x_vecs_stacked'] for example in test_transformed_wide])
test_y_wide = tf.stack([example['y_vec'] for example in test_transformed_wide])

print(train_x_wide.shape)
print(train_y_wide.shape)

In [None]:
# Flat model: Dense projection of every multi-hot step -> LSTM -> one score per code.
input_layer_flat = tf.keras.layers.Input(shape=(max_sequence_length,len(vocab)))
# The input and projection layers are kept in variables so they can be reused
# below to read out the learned 64-dimensional projections.
emb_layer_flat = tf.keras.layers.Dense(64)
lstm_model_flat = tf.keras.models.Sequential([
    input_layer_flat,
    emb_layer_flat,
    tf.keras.layers.LSTM(32, return_sequences=False),
    # Sigmoid, not ReLU: BinaryCrossentropy (from_logits=False) expects
    # independent per-code probabilities in [0, 1]. ReLU outputs are unbounded
    # and have zero gradient for negative pre-activations, so units can get
    # stuck at 0 and the loss is only meaningful after clipping.
    tf.keras.layers.Dense(len(vocab), activation='sigmoid'),
])
lstm_model_flat.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.optimizers.Adam())
lstm_model_flat.fit(x=train_x_flat, y=train_y_flat, epochs=100)

# Sub-model sharing the trained projection layer: maps inputs to their projections.
emb_model_flat = tf.keras.models.Sequential([
    input_layer_flat,
    emb_layer_flat,
])
print(emb_model_flat.predict(train_x_flat).shape)

lstm_model_flat.evaluate(test_x_flat, test_y_flat)

In [None]:
# Wide model: Dense projection of every one-hot symptom -> LSTM -> one score per code.
input_layer_wide = tf.keras.layers.Input(shape=(max_symptoms_per_sequence,len(vocab)))
# The input and projection layers are kept in variables so they can be reused
# below to read out the learned 64-dimensional projections.
emb_layer_wide = tf.keras.layers.Dense(64)
lstm_model_wide = tf.keras.models.Sequential([
    input_layer_wide,
    emb_layer_wide,
    tf.keras.layers.LSTM(32, return_sequences=False),
    # Sigmoid, not ReLU: BinaryCrossentropy (from_logits=False) expects
    # independent per-code probabilities in [0, 1]. ReLU outputs are unbounded
    # and have zero gradient for negative pre-activations, so units can get
    # stuck at 0 and the loss is only meaningful after clipping.
    tf.keras.layers.Dense(len(vocab), activation='sigmoid'),
])
lstm_model_wide.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.optimizers.Adam())
lstm_model_wide.fit(x=train_x_wide, y=train_y_wide, epochs=100)

# Sub-model sharing the trained projection layer: maps inputs to their projections.
emb_model_wide = tf.keras.models.Sequential([
    input_layer_wide,
    emb_layer_wide,
])
print(emb_model_wide.predict(train_x_wide).shape)

lstm_model_wide.predict(test_x_wide)

In [None]:
# Scores of the wide model for the whole test set; inspect the first example
# and list the vocabulary indices whose score exceeds 0.5.
test_predictions_wide = lstm_model_wide.predict(test_x_wide)
prediction = test_predictions_wide[0]
np.argwhere(prediction > 0.5)

In [None]:
# Ground-truth code indices for the same test example (index 0) whose
# prediction is inspected via `lstm_model_wide.predict(test_x_wide)[0]`;
# comparing against a different example would not be meaningful.
np.argwhere(test_y_wide[0] == 1)

In [None]:
# Reverse lookup: which codes are stored at vocabulary indices 7 and 26?
[(code, position) for code, position in vocab.items() if position in (7, 26)]