In [1]:
# Import libraries
import numpy as np
import pandas as pd
import pickle


In [2]:
# Load the pickled diagnoses table from disk into `df`.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# run this cell on a pickle file that you created yourself or otherwise trust.
with open('diagnoses_icd_df.pickle', 'rb') as read_file:
    df = pickle.load(read_file)

In [3]:
# Summarize the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651047 entries, 0 to 651046
Data columns (total 5 columns):
row_id        651047 non-null int64
subject_id    651047 non-null int64
hadm_id       651047 non-null int64
seq_num       651000 non-null float64
icd9_code     651000 non-null object
dtypes: float64(1), int64(3), object(1)
memory usage: 24.8+ MB


**Credit to https://github.com/daniel-codes/hospital-los-predictor/blob/master/hospital_los_prediction.ipynb**

In [4]:
# Ordered lookup table used by lookup_code() to bin ICD-9 codes into categories.
# Each entry is [exclusive_upper_bound, category]: a code is assigned to the
# FIRST entry whose bound is strictly greater than the code.
# Codes and bounds are both strings, so the comparison is lexicographic
# (character by character), not numeric -- e.g. '40301' < '460' and
# '486' < '520'. The entries must therefore stay in ascending string order.
# NOTE(review): the '-2' and '0' bounds sort below every digit-leading code, so
# they can only match strings starting with a character below '0' (or the
# empty string). They look like leftovers from a numeric version of this
# table -- confirm before relying on them.
code_formatting = [
    ['-2','special considerations'],
    ['0', 'external injury'],
    ['0389', 'infectious'],
    ['0390', 'CLABSI'],
    ['140', 'infectious'],
    ['240', 'neoplasms'],
    ['280', 'endocrine'],
    ['290', 'blood'],
    ['320', 'mental'],
    ['390', 'nervous'],
    ['460', 'circulatory'],
    ['520', 'respiratory'],
    ['580', 'digestive'],
    ['630', 'genitourinary'],
    ['680', 'pregnancy'],
    ['710', 'skin'],
    ['740', 'muscular'],
    ['760', 'congenital'],
    ['780', 'perinatal'],
    ['800', 'misc'],
    ['99591', 'injury and poisoning'],
    ['99592', 'CLABSI'],
    ['99930', 'injury and poisoning'],
    ['99934', 'CLABSI'],
    ['99999', 'injury and poisoning'],
    ['A', 'error'],
    ['F', 'external injury'],
    ['W', 'special considerations']
]

def lookup_code(num):
    """Return the diagnosis category for a single ICD-9 code.

    Parameters
    ----------
    num : str or None
        ICD-9 code as a string (e.g. '40301'). ``None`` or a float NaN is
        treated as a missing diagnosis.

    Returns
    -------
    str
        'No diagnosis' for a missing code, otherwise the category of the first
        ``code_formatting`` entry whose bound is lexicographically greater
        than ``num``.

    Raises
    ------
    ValueError
        If ``num`` does not sort below any bound in ``code_formatting``
        (i.e. it is >= the last bound, 'W').
    """
    # Handle missing values once, up front. NaN is the only float that is not
    # equal to itself; without this check `NaN < str` would raise TypeError.
    if num is None or (isinstance(num, float) and num != num):
        return 'No diagnosis'
    for code_max, title in code_formatting:
        if num < code_max:
            return title
    raise ValueError(f'{num} is not a valid code')

In [5]:
# Bin every ICD-9 code into its diagnosis category, one element at a time.
df['bin_code'] = df['icd9_code'].map(lookup_code)

In [6]:
# Preview the first rows to spot-check the new bin_code column.
df.head()

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code,bin_code
0,1297,109,172335,1.0,40301,circulatory
1,1298,109,172335,2.0,486,respiratory
2,1299,109,172335,3.0,58281,genitourinary
3,1300,109,172335,4.0,5855,genitourinary
4,1301,109,172335,5.0,4254,circulatory


In [7]:
# Number of diagnosis rows falling into each category, most frequent first.
df['bin_code'].value_counts()

circulatory               140257
special considerations     74705
endocrine                  69592
respiratory                44825
injury and poisoning       41342
digestive                  38527
genitourinary              32867
misc                       29691
blood                      25262
mental                     24876
nervous                    23334
external injury            22544
perinatal                  20160
infectious                 16539
neoplasms                  14235
muscular                   13392
skin                        8789
CLABSI                      5331
congenital                  4078
pregnancy                    654
No diagnosis                  47
Name: bin_code, dtype: int64

### Aggregate info into lists for each hadm_id

**Diagnostic codes**

In [9]:
# Gather all diagnosis categories of each hospital admission into one list,
# giving a two-column frame: hadm_id and its list of bin_code values.
hadm_list = (
    df.groupby('hadm_id')['bin_code']
    .apply(list)
    .reset_index()
)
hadm_list.head()

Unnamed: 0,hadm_id,bin_code
0,100001,"[endocrine, nervous, genitourinary, digestive,..."
1,100003,"[digestive, blood, infectious, digestive, circ..."
2,100006,"[respiratory, respiratory, respiratory, neopla..."
3,100007,"[digestive, digestive, injury and poisoning, r..."
4,100009,"[circulatory, injury and poisoning, circulator..."


In [10]:
# Convert diagnoses list into hospital admission-item matrix:
# one row per admission (same index as hadm_list), one column per category,
# each cell holding how many times that category occurs for the admission.
# `.sum(level=0)` was deprecated in pandas 1.3 and removed in 2.0;
# `.groupby(level=0).sum()` is the equivalent that works on every version.
hadm_item = (
    pd.get_dummies(hadm_list['bin_code'].apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
)
hadm_item.head()

Unnamed: 0,CLABSI,No diagnosis,blood,circulatory,congenital,digestive,endocrine,external injury,genitourinary,infectious,...,mental,misc,muscular,neoplasms,nervous,perinatal,pregnancy,respiratory,skin,special considerations
0,0,0,0,2,0,2,5,0,2,0,...,0,0,0,0,2,0,0,0,1,2
1,0,0,1,2,0,4,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,1,0,1,0,0,0,3,0,2
3,0,0,0,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,7,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,6


In [11]:
# Join back with HADM_ID (index-aligned with hadm_list).
# Drop any 'hadm_id' column left by a previous execution first: without this,
# re-running the cell fails because the joined column overlaps an existing one.
hadm_item = (
    hadm_item
    .drop(columns='hadm_id', errors='ignore')
    .join(hadm_list['hadm_id'], how="outer")
)
hadm_item.head()

Unnamed: 0,CLABSI,No diagnosis,blood,circulatory,congenital,digestive,endocrine,external injury,genitourinary,infectious,...,misc,muscular,neoplasms,nervous,perinatal,pregnancy,respiratory,skin,special considerations,hadm_id
0,0,0,0,2,0,2,5,0,2,0,...,0,0,0,2,0,0,0,1,2,100001
1,0,0,1,2,0,4,0,0,0,1,...,1,0,0,0,0,0,0,0,0,100003
2,0,0,0,0,0,0,1,0,0,0,...,1,0,1,0,0,0,3,0,2,100006
3,0,0,0,1,0,2,0,0,0,0,...,0,0,0,0,0,0,1,0,0,100007
4,0,0,1,7,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,6,100009
