# Data Preprocessing

In [17]:
import pandas as pd
import simple_icd_10 as icd
import numpy as np

## Input Data

In [18]:
# Read the lab events table (one row per lab measurement, with HPO term
# columns) and preview its first ten rows.
lab_events_df = pd.read_csv(filepath_or_buffer='data/LABEVENTS_HPO.csv')
lab_events_df.head(n=10)

Unnamed: 0,row_id,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag,possible_hpo_features,active_hpo_features
0,6244563,10006,,50868,2164-09-24 20:21:00,19.0,19.0,mEq/L,,HP:0031963;HP:0031961;HP:0031962,
1,6244564,10006,,50882,2164-09-24 20:21:00,27.0,27.0,mEq/L,,HP:0032066;HP:0032065;HP:0032067,
2,6244565,10006,,50893,2164-09-24 20:21:00,10.0,10.0,mg/dL,,HP:0002901;HP:0004363;HP:0003072,
3,6244566,10006,,50902,2164-09-24 20:21:00,97.0,97.0,mEq/L,,HP:0003113;HP:0011422;HP:0011423,
4,6244567,10006,,50912,2164-09-24 20:21:00,7.0,7.0,mg/dL,abnormal,HP:0012101;HP:0012100;HP:0003259,HP:0012101;HP:0003259
5,6244568,10006,,50931,2164-09-24 20:21:00,126.0,126.0,mg/dL,abnormal,HP:0001943;HP:0011015;HP:0003074,HP:0001943;HP:0003074
6,6244569,10006,,50960,2164-09-24 20:21:00,2.3,2.3,mg/dL,,HP:0002917;HP:0004921;HP:0002918,
7,6244570,10006,,50970,2164-09-24 20:21:00,5.6,5.6,mg/dL,abnormal,HP:0002148;HP:0100529;HP:0002905,HP:0002148;HP:0002905
8,6244571,10006,,50971,2164-09-24 20:21:00,4.3,4.3,mEq/L,,HP:0002900;HP:0011042;HP:0002153,
9,6244572,10006,,50983,2164-09-24 20:21:00,139.0,139.0,mEq/L,,HP:0002902;HP:0010931;HP:0003228,


In [19]:
# Read the diagnoses table (ICD codes with their mapped HPO features) and
# preview its first ten rows.
diagnose_df = pd.read_csv(filepath_or_buffer='data/DIAGNOSE_ICD_hpo.csv')
diagnose_df.head(n=10)

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code,icd10_codes,hpo_features
0,112344,10006,142345,1,99591,A419,HP:0100806
1,112345,10006,142345,2,99662,T827XXA,
2,112346,10006,142345,3,5672,,
3,112347,10006,142345,4,40391,I120,HP:0004421;HP:0009741;HP:0004972;HP:0005117
4,112348,10006,142345,5,42731,I4891,
5,112349,10006,142345,6,4280,I50814;I509,HP:0001635
6,112350,10006,142345,7,4241,I350;I351;I352;I358;I359,HP:0004963;HP:0004942;HP:0001680;HP:0001650;HP...
7,112351,10006,142345,8,4240,I340;I348,HP:0001653
8,112352,10006,142345,9,2874,,
9,112353,10006,142345,10,3819,A411,HP:0100806


## Transformations

### Lab Results

In [20]:
# Build one row per subject holding all of its active HPO features:
#   1. discard lab rows without any active HPO feature,
#   2. keep only the join key (subject_id) and the feature string,
#   3. concatenate the feature strings of each subject with ';'.
# NOTE(review): at least one subject shows character-level joins in the
# result (e.g. 'H;P;:;0;...'); presumably the input CSV already contains such
# values - confirm against the upstream export.
lab_events_df = (
    lab_events_df
    .dropna(subset=['active_hpo_features'])
    .loc[:, ['subject_id', 'active_hpo_features']]
    .groupby(['subject_id'])['active_hpo_features']
    .apply(';'.join)
    .reset_index()
)

In [21]:
# Display the aggregated lab table: one row per subject_id with its ';'-joined active HPO features.
lab_events_df

Unnamed: 0,subject_id,active_hpo_features
0,10006,HP:0012101;HP:0003259;HP:0001943;HP:0003074;HP...
1,10011,HP:0031964;HP:0003073;HP:0012117;HP:0003282;HP...
2,10013,HP:0031956;HP:0003113;HP:0011423;HP:0003236;HP...
3,10017,HP:0011905;HP:0040217;HP:0001943;HP:0003074;HP...
4,10019,HP:0031964;HP:0003073;HP:0012117;HP:0003282;HP...
...,...,...
95,44083,HP:0031851;HP:0001899;HP:0020062;HP:0020063;HP...
96,44154,H;P;:;0;4;1;0;1;7;2;H;P;:;0;5;0;0;1;1;6;HP:000...
97,44212,HP:0003073;HP:0012117;HP:0031956;HP:0032066;HP...
98,44222,HP:0012101;HP:0003259;HP:0001943;HP:0003074;HP...


### Diagnoses

In [22]:
# Reduce the diagnoses table to usable primary diagnoses:
#   - rows lacking either the HPO features or the ICD-10 codes are discarded,
#   - only rows with seq_num == 1 are kept (presumably the first-listed
#     diagnosis of an admission - verify against the data dictionary),
#   - only the subject id and the disease information columns are retained.
diagnose_df = (
    diagnose_df
    .dropna(subset=['hpo_features', 'icd10_codes'])
    .loc[
        lambda frame: frame['seq_num'].eq(1),
        ['subject_id', 'icd10_codes', 'hpo_features'],
    ]
)

In [23]:
# Concatenate the HPO features of every remaining diagnosis row of a subject
# into a single ';'-separated string (one row per subject_id).
joined_hpos = diagnose_df.groupby(['subject_id'])['hpo_features'].apply(';'.join).reset_index()

# Re-attach the ICD-10 codes. A subject that still has several rows in
# diagnose_df yields one merged row per original row, each carrying the same
# joined HPO string. Rows that also share the same ICD-10 code would be exact
# duplicates and could later end up in different train/validation/test
# splits, so they are collapsed here.
diagnose_df = (
    pd.merge(joined_hpos, diagnose_df[['subject_id', 'icd10_codes']], on="subject_id")
    .drop_duplicates()
    .reset_index(drop=True)
)

In [24]:
# Preview the diagnoses table after joining the HPO features per subject.
diagnose_df.head(5)

Unnamed: 0,subject_id,hpo_features,icd10_codes
0,10006,HP:0100806,A419
1,10013,HP:0100806,A419
2,10019,HP:0100806,A419
3,10026,HP:0001342,I619
4,10027,HP:0001635,I50814;I509


## Final Dataset

In [25]:
# Combine lab-derived features with the diagnoses; the inner join keeps only
# subjects that are present in both tables.
df = lab_events_df.merge(diagnose_df, how="inner", on="subject_id")

In [26]:
# Keep only rows whose icd10_codes entry holds a single code, i.e. does not
# contain the ';' separator. Missing codes are treated as "not single" and
# are dropped as well.
has_multiple_codes = df["icd10_codes"].str.contains(";", regex=False, na=True)
df = df[~has_multiple_codes]

In [27]:
# Preview the final dataset: lab HPO features, diagnosis HPO features and ICD-10 code per subject.
df.head(3)

Unnamed: 0,subject_id,active_hpo_features,hpo_features,icd10_codes
0,10006,HP:0012101;HP:0003259;HP:0001943;HP:0003074;HP...,HP:0100806,A419
1,10013,HP:0031956;HP:0003113;HP:0011423;HP:0003236;HP...,HP:0100806,A419
2,10019,HP:0031964;HP:0003073;HP:0012117;HP:0003282;HP...,HP:0100806,A419


In [28]:
# Persist the full dataset. The row index is written too (pandas default),
# so it comes back as an unnamed first column when the file is re-read.
df.to_csv(path_or_buf="dataset.csv")

In [29]:
# Train, validation, test split

# Fractions of the shuffled dataset assigned to the train and validation
# sets; whatever remains becomes the test set.
train_size = 0.6
validate_size = 0.2

# Fixed seed so the shuffle, and therefore the split written to disk, is the
# same on every Restart & Run All.
split_seed = 42

shuffled_df = df.sample(frac=1, random_state=split_seed)
train_end = int(train_size * len(shuffled_df))
val_end = int((validate_size + train_size) * len(shuffled_df))

# Positional slicing keeps every part a DataFrame; np.split on a DataFrame
# goes through DataFrame.swapaxes, which pandas has deprecated.
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

# Every row must land in exactly one of the three splits.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

In [30]:
# Write each split to its own CSV file (row index included, pandas default).
for csv_path, split_df in (
    ("train_data.csv", train_df),
    ("val_data.csv", val_df),
    ("test_data.csv", test_df),
):
    split_df.to_csv(csv_path)