In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
def convert_to_icd9(dxStr):
    """Insert the decimal point into an undotted ICD-9 code string.

    Codes starting with 'E' are split after the fourth character; all other
    codes are split after the third. A code that is not longer than its
    split position is returned unchanged.

    Args:
        dxStr: ICD-9 code without a decimal point (must be a ``str``).

    Returns:
        The code with a '.' inserted at the split position, or ``dxStr``
        itself when it is too short to need one.
    """
    split_at = 4 if dxStr.startswith('E') else 3
    # Nothing follows the category part, so there is nothing to separate.
    if len(dxStr) <= split_at:
        return dxStr
    return f"{dxStr[:split_at]}.{dxStr[split_at:]}"
    
def convert_to_3digit_icd9(dxStr):
    """Truncate an undotted ICD-9 code string to its leading category part.

    Codes starting with 'E' keep their first four characters; all other
    codes keep their first three. Shorter codes come back unchanged.

    Args:
        dxStr: ICD-9 code without a decimal point (must be a ``str``).

    Returns:
        The truncated code string.
    """
    prefix_length = 4 if dxStr.startswith('E') else 3
    # Slicing past the end of a string simply yields the whole string, so
    # short codes need no separate length check.
    return dxStr[:prefix_length]

In [None]:
# CSV inputs for this notebook; both paths are relative, so they resolve
# against the current working directory.
diagnosis_file = Path('../data/DIAGNOSES_ICD.csv')
admission_file = Path('../data/ADMISSIONS.csv')

In [None]:
# Load the admissions table and parse its timestamp columns into datetimes.
admissions_df = pd.read_csv(admission_file)
for time_column in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime'):
    admissions_df[time_column] = pd.to_datetime(admissions_df[time_column])
admissions_df.head()

In [None]:
# Load the diagnosis table and derive the dotted and truncated ICD-9 forms.
diagnosis_df = (
    # Force the code column to text: left to dtype inference, a chunk that
    # holds only numeric-looking codes can be parsed as integers, which drops
    # leading zeros and breaks the str-based converters below.
    pd.read_csv(diagnosis_file, dtype={'icd9_code': str})
    # The converters call str methods on every value, so a row with a missing
    # code (NaN) would raise; such rows carry no diagnosis and are dropped.
    .dropna(subset=['icd9_code'])
    .assign(
        icd9_code_converted=lambda frame: frame['icd9_code'].apply(convert_to_icd9),
        icd9_code_converted_3digits=lambda frame: frame['icd9_code'].apply(convert_to_3digit_icd9),
    )
)
diagnosis_df.head()

In [None]:
# Step 1: for every admission (hadm_id), gather its diagnosis codes into
# lists, one list per code representation.
code_columns = ['icd9_code', 'icd9_code_converted', 'icd9_code_converted_3digits']
codes_per_admission = (
    diagnosis_df
    .groupby('hadm_id')
    .agg({column: list for column in code_columns})
)

# Step 2: attach those code lists to the admission rows; hadm_id is a column
# on the left side and the group index on the right side.
combined_df = admissions_df.merge(codes_per_admission, on=['hadm_id'])

# Step 3: collapse to one row per subject. Every aggregated column becomes a
# list with one entry per admission, so the code columns are lists of lists.
# NOTE(review): entries follow the row order of combined_df; nothing here
# sorts admissions by admittime - confirm that downstream code does not
# assume chronological order.
per_subject_columns = ['hadm_id', 'admittime', 'diagnosis'] + code_columns
admissions_per_subject = (
    combined_df
    .groupby('subject_id')
    .agg({column: list for column in per_subject_columns})
)
admissions_per_subject['num_admissions'] = admissions_per_subject['hadm_id'].apply(len)
admissions_per_subject.head()

In [None]:
# Keep only the subjects that have two or more admissions.
has_repeat_admissions = admissions_per_subject['num_admissions'] >= 2
relevant_data = admissions_per_subject[has_repeat_admissions]
relevant_data

In [None]:
# Build the vocabulary: every distinct truncated ICD-9 code found in
# relevant_data is mapped to an integer id.
#
# Each cell of the column is a list of admissions and each admission is a
# list of codes, so both levels are flattened explicitly. This avoids
# Series.agg with a per-element callable, which recent pandas deprecates in
# favour of always passing the whole Series to the callable.
#
# The codes are sorted so that ids are identical on every run; the iteration
# order of a set of strings varies between interpreter processes because of
# hash randomization.
all_symptoms = sorted({
    code
    for admissions in relevant_data['icd9_code_converted_3digits']
    for codes in admissions
    for code in codes
})

# Assign consecutive ids, starting at 0, in sorted code order.
vocab = {}
index = 0
for symptom in all_symptoms:
    vocab[symptom] = index
    index += 1

len(vocab)

In [None]:
# Each cell of this column holds one subject's admissions, and each admission
# is a list of codes.
subject_code_sequences = relevant_data['icd9_code_converted_3digits']

# Largest number of admissions recorded for a single subject.
max_sequence_length = subject_code_sequences.apply(len).max()

# Largest total number of codes across all admissions of a single subject.
max_symptoms_per_sequence = subject_code_sequences.apply(
    lambda admissions: sum(len(codes) for codes in admissions)
).max()