# Preprocessing: diagnoses_icd Table

In [1]:
import os
import pickle
# Step up one level from the current working directory so that the relative
# paths used below (./utils, ./data, ./mimic-iv-1.0) resolve.
# Presumably this lands in the project root -- TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel moves the working directory up one more level each time.
os.chdir('../')
# NOTE(review): the star-import hides where names come from; preproc_icd_module
# (used below) is presumably defined in this module -- verify.
from utils.hosp_preprocess_util import *    # module of preprocessing functions

### Getting the Diagnoses table into long format with timedelta

In [2]:
# Build the long-format diagnoses table with the project's ICD preprocessing helper.
# Positional arguments, in order:
#   1. path to the diagnoses_icd table
#   2. path to the gzipped admissions-cohort table
#   3. (optional) path to the ICD-9 -> ICD-10 mapping table
# The cell output lists the codes that the mapping table could not map.
diag = preproc_icd_module(
    "./mimic-iv-1.0/hosp/diagnoses_icd.csv.gz",
    './data/adm_cohort.gzip',
    './utils/mappings/ICD9_to_ICD10_mapping.txt',
    map_code_colname='diagnosis_code',
)

Error on code 007
Error on code 007
Error on code 007
Error on code 013
Error on code 013
Error on code 013
Error on code 013
Error on code 013
Error on code 013
Error on code 013
Error on code 014
Error on code 014
Error on code 014
Error on code 014
Error on code 014
Error on code 016
Error on code 016
Error on code 017
Error on code 017
Error on code 017
Error on code 017
Error on code 017
Error on code 036
Error on code 036
Error on code 036
Error on code 045
Error on code 046
Error on code 046
Error on code 046
Error on code 046
Error on code 047
Error on code 047
Error on code 049
Error on code 049
Error on code 049
Error on code 049
Error on code 058
Error on code 058
Error on code 058
Error on code 058
Error on code 065
Error on code 066
Error on code 066
Error on code 066
Error on code 066
Error on code 066
Error on code 077
Error on code 077
Error on code 077
Error on code 077
Error on code 083
Error on code 083
Error on code 086
Error on code 086
Error on code 087
Error on c

In [3]:
# Preview the first rows of the processed table.
# Columns appended to the original diagnoses_icd data:
#   - label and the timedelta columns: joined from the admissions-cohort dataset
#   - root_icd10_convert: ICD-10 codes generated from ICD-9 root categories
#     (codes that were already ICD-10 are carried over unchanged)
#   - root: the root category of root_icd10_convert, i.e. its first 3 characters
diag.head(n=5)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,label,timedelta_days,timedelta_years,root_icd10_convert,root
0,15734973,20475282,3,2825,9,0,32,0.0,D580,D58
1,15734973,20475282,2,V0251,9,0,32,0.0,,
2,15734973,20475282,5,V270,9,0,32,0.0,,
3,15734973,20475282,1,64891,9,0,32,0.0,,
4,15734973,20475282,4,66481,9,0,32,0.0,,


In [4]:
# Report how many distinct (non-null) values each code column holds
print("Number of unique codes in 'icd_code' col:      ", diag["icd_code"].dropna().nunique())
print("Unique ICD10s in root converted col:             ", diag["root_icd10_convert"].dropna().nunique())
print("Unique ICD10 roots in root converted col:        ", diag["root"].dropna().nunique())

Number of unique codes in 'icd_code' col:       18705
Unique ICD10s in root converted col:              10035
Unique ICD10 roots in root converted col:         1537


In [5]:
# Save two versions of the long-format diagnoses dataset:
#   1. long_diag_icd10.gzip       -- every converted ICD-10 code (root_icd10_convert)
#   2. long_diag_icd10_roots.gzip -- only the root of each converted code (root)
# Rows with a missing value in any of the selected columns are dropped before saving.
# NOTE: these are gzip-compressed pickles. pandas does not infer compression from a
# ".gzip" extension, so pass compression='gzip' explicitly when reading them back.

# Create the output directory first so the save works on a fresh checkout.
os.makedirs("./data/long_format/diag", exist_ok=True)
diag[['subject_id', 'hadm_id', 'root_icd10_convert', 'timedelta_days']].dropna().to_pickle("./data/long_format/diag/long_diag_icd10.gzip", compression='gzip')
diag[['subject_id', 'hadm_id', 'root', 'timedelta_days']].dropna().to_pickle("./data/long_format/diag/long_diag_icd10_roots.gzip", compression='gzip')

### One Hot Encoding

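Old code, kept for reference only: the cells below are commented out and are not executed, so the table shown after them is output left over from an earlier run. The code pivoted the long-format data into the form: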
```
                                    || feature 1 || ... || feature n ||
|| subject_id || label || timedelta ||
```

In [6]:
# pivot_diag = pivot_cohort(diag[['subject_id', 'label','timedelta', 'root_icd10_convert']], prefix='diag_', target_col='root_icd10_convert', ohe=True, max_features=100)
# pivot_diag_root = pivot_cohort(diag[['subject_id', 'label','timedelta', 'root']], prefix='diag_', target_col='root', ohe=True, max_features=100)

In [10]:
# # Save output
# pivot_diag.fillna(0).astype(int).to_pickle('./data/diag/pivot_icd10.gzip', compression='gzip')
# pivot_diag_root.fillna(0).astype(int).to_pickle('./data/diag/pivot_icd10_root.gzip', compression='gzip')

In [13]:
# # pivot_diag.loc[pivot_diag.index]
# pivot_diag.xs(6, level=1, drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,diag_A170,diag_B871,diag_B9781,diag_C241,diag_C561,diag_D017,diag_D070,diag_D175,diag_D6109,diag_F1423,...,diag_W2203XA,diag_X16XXXA,diag_Y35893A,diag_Y92231,diag_Y9269,diag_Y9373,diag_Z21,diag_Z471,diag_Z593,diag_Z9621
subject_id,timedelta,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10000032,6.0,,,,,,,,,,,...,,,,,,,1.0,,,
10002013,6.0,,,,,,,,,,,...,,,,,,,,,,
10002930,6.0,,,,,,,,,,,...,,,,,,,1.0,,,
10007920,6.0,,,,,,,,,,,...,,,,,,,1.0,,,
10009129,6.0,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19987150,6.0,,,,,,,,,,,...,,,,,,,,,,
19988669,6.0,,,,,,,,,,,...,,,,,,,1.0,,,
19988951,6.0,,,,,,,,,,,...,,,,,,,,,,
19992875,6.0,,,,,,,,,,,...,,,,,,,,,,
