In [36]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [37]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from fancyimpute import KNN, IterativeImputer, SimpleFill, SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler

In [38]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

## 1. Dataset
- training : TrainingWiDS2021.csv
- test : UnlabeledWiDS2021.csv
- descriptions : DataDictionaryWiDS2021.csv

In [39]:
# Load the labelled training set and discard the 'Unnamed: 0' column
# (presumably a row index written out with the CSV — confirm against the file).
df_tr = pd.read_csv('../dataset/WiDS2021/TrainingWiDS2021.csv').drop(columns=['Unnamed: 0'])
df_tr.shape

(130157, 180)

In [40]:
# Load the unlabeled test set and discard the 'Unnamed: 0' column
# (presumably a row index written out with the CSV — confirm against the file).
df_te = pd.read_csv('../dataset/WiDS2021/UnlabeledWiDS2021.csv').drop(columns=['Unnamed: 0'])
df_te.shape

(10234, 179)

### 1.1. Drop Columns with high missing ratio

- check missing ratio for each column
- compare missing ratios between training set and test set
- Decide what to drop

In [41]:
def check_missing_data(df, threshold=0.5):
    """Summarise the missing values of every column in ``df``.

    Prints two views of the summary (columns with at least one missing value,
    then columns whose missing ratio exceeds ``threshold``) and returns the
    full summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.5
        Missing-ratio cut-off used for the second printed view only; it does
        not affect the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, sorted by missing count (descending),
        with columns ``column_name``, ``num_miss_rows`` and ``miss_ratio``
        (missing count divided by the number of rows in ``df``).
    """
    # Count NaNs per column, most-missing first; reset_index moves the
    # column names out of the index into a regular column.
    missing_df = pd.DataFrame(df.isna().sum().sort_values(ascending=False)).reset_index()
    missing_df.columns = ['column_name', 'num_miss_rows']
    missing_df['miss_ratio'] = missing_df.num_miss_rows / df.shape[0]

    # View 1: every column that has at least one missing value.
    print(missing_df.loc[missing_df.num_miss_rows > 0])

    # View 2: columns whose missing ratio is above the cut-off.
    print(missing_df.loc[missing_df.miss_ratio > threshold])
    return missing_df

In [42]:
# Per-column missing-value summary for the training set
# (check_missing_data also prints two views of the summary).
tr_missing = check_missing_data(df_tr)

          column_name  num_miss_rows  miss_ratio
0    h1_bilirubin_min         119861    0.920896
1    h1_bilirubin_max         119861    0.920896
2      h1_albumin_min         119005    0.914319
3      h1_albumin_max         119005    0.914319
4      h1_lactate_max         118467    0.910185
..                ...            ...         ...
155      d1_sysbp_max            271    0.002082
156  d1_heartrate_max            262    0.002013
157  d1_heartrate_min            262    0.002013
158  icu_admit_source            240    0.001844
159            gender             66    0.000507

[160 rows x 3 columns]
         column_name  num_miss_rows  miss_ratio
0   h1_bilirubin_min         119861    0.920896
1   h1_bilirubin_max         119861    0.920896
2     h1_albumin_min         119005    0.914319
3     h1_albumin_max         119005    0.914319
4     h1_lactate_max         118467    0.910185
..               ...            ...         ...
68  d1_bilirubin_max          76735    0.589557
69  

In [43]:
# Per-column missing-value summary for the test set
# (check_missing_data also prints two views of the summary).
te_missing = check_missing_data(df_te)

          column_name  num_miss_rows  miss_ratio
0      h1_lactate_max           9421    0.920559
1      h1_lactate_min           9421    0.920559
2    h1_bilirubin_max           9407    0.919191
3    h1_bilirubin_min           9407    0.919191
4      h1_albumin_min           9365    0.915087
..                ...            ...         ...
154     d1_diasbp_max             23    0.002247
155      d1_sysbp_min             23    0.002247
156      d1_sysbp_max             23    0.002247
157     d1_diasbp_min             23    0.002247
158            gender              5    0.000489

[159 rows x 3 columns]
           column_name  num_miss_rows  miss_ratio
0       h1_lactate_max           9421    0.920559
1       h1_lactate_min           9421    0.920559
2     h1_bilirubin_max           9407    0.919191
3     h1_bilirubin_min           9407    0.919191
4       h1_albumin_min           9365    0.915087
..                 ...            ...         ...
69    d1_bilirubin_min           5860 

In [44]:
# Do train and test agree on which columns are more than 50% missing?
# First: columns over the threshold in train but not in test.
set(tr_missing.loc[tr_missing.miss_ratio > .5].column_name).\
difference(set(te_missing.loc[te_missing.miss_ratio > .5].column_name))

# Second: columns over the threshold in test but not in train.
set(te_missing.loc[te_missing.miss_ratio > .5].column_name).\
difference(set(tr_missing.loc[tr_missing.miss_ratio > .5].column_name))

# Inspect urineoutput_apache in both summaries. Each summary is filtered
# with its own mask: the two frames are sorted independently, so a mask built
# from tr_missing only lines up with te_missing when the row positions happen
# to coincide.
tr_missing.loc[tr_missing.column_name=='urineoutput_apache']
te_missing.loc[te_missing.column_name=='urineoutput_apache']

set()

{'urineoutput_apache'}

Unnamed: 0,column_name,num_miss_rows,miss_ratio
73,urineoutput_apache,63167,0.485314


Unnamed: 0,column_name,num_miss_rows,miss_ratio
73,urineoutput_apache,5190,0.507133


In [45]:
# Train and test missing ratios are very similar, so every column that is
# more than 50% missing in the test set is dropped from both frames.
drop_columns = te_missing.loc[te_missing.miss_ratio > .5, 'column_name'].values
df_tr = df_tr.drop(columns=drop_columns)
df_te = df_te.drop(columns=drop_columns)

df_tr.shape
df_te.shape

(130157, 106)

(10234, 105)

### drop hospital_id

- due to distribution difference

In [46]:
# Remove hospital_id from both splits.
df_tr, df_te = [frame.drop(columns=['hospital_id']) for frame in (df_tr, df_te)]

### Readmission status has a single unique value across the whole dataset => drop

In [47]:
# Number of distinct readmission_status values in the training set.
df_tr.readmission_status.nunique()
# Missing-value summary row for the column (training set).
tr_missing.loc[tr_missing.column_name=='readmission_status']
# The distinct values themselves, for each split.
df_tr.readmission_status.unique()
df_te.readmission_status.unique()

1

Unnamed: 0,column_name,num_miss_rows,miss_ratio
167,readmission_status,0,0.0


array([0])

array([0])

In [48]:
# Remove the constant readmission_status column from both splits.
df_tr, df_te = [frame.drop(columns=['readmission_status']) for frame in (df_tr, df_te)]

## Combine training and test sets

In [49]:
# Columns present in train but absent from test ...
set(df_tr.columns) - set(df_te.columns)
# ... and columns present in test but absent from train.
set(df_te.columns) - set(df_tr.columns)

{'diabetes_mellitus'}

set()

In [50]:
# Tag every training row with its origin.
df_tr['split_type'] = 'train'
# The test frame has no diabetes_mellitus column; add it as NaN so both
# frames share the same columns, then tag the rows as test.
df_te['diabetes_mellitus'] = np.nan
df_te['split_type'] = 'test'

In [51]:
# Stack the training rows on top of the test rows into one frame.
df_t = pd.concat([df_tr, df_te])
# Quick look at the combined frame: column names, shape and first rows.
df_t.columns
df_t.shape
df_t.head()

Index(['encounter_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender',
       'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id',
       ...
       'd1_wbc_min', 'aids', 'cirrhosis', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'diabetes_mellitus', 'split_type'],
      dtype='object', length=105)

(140391, 105)

Unnamed: 0,encounter_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,weight,apache_2_diagnosis,...,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,split_type
0,214826,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,92,admit,CTICU,0.541667,73.9,113.0,...,4.0,3.4,136.0,134.0,14.1,14.1,0,0,0,0,0,0,0,1.0,train
1,246060,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,90,admit,Med-Surg ICU,0.927778,70.2,108.0,...,4.2,3.8,145.0,145.0,23.3,12.7,0,0,0,0,0,0,0,1.0,train
2,276985,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,93,admit,Med-Surg ICU,0.000694,95.3,122.0,...,,,,,,,0,0,0,0,0,0,0,0.0,train
3,262220,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,92,admit,CTICU,0.000694,61.7,203.0,...,5.0,3.5,,,9.0,8.0,0,0,0,0,0,0,0,0.0,train
4,201746,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,91,admit,Med-Surg ICU,0.073611,,119.0,...,,,,,,,0,0,0,0,0,0,0,0.0,train


## Categorical encoding
- Label Encoding : assign label to a unique integer
- OneHot Encoding : creating dummy variables

In [52]:
# Distinct dtypes present in the combined frame.
df_t.dtypes.unique()
# Object-typed columns of the combined frame. The mask is built from df_t
# itself so it always matches the Series it filters.
df_t.dtypes.loc[df_t.dtypes=='O']

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

ethnicity                object
gender                   object
hospital_admit_source    object
icu_admit_source         object
icu_stay_type            object
icu_type                 object
split_type               object
dtype: object

In [53]:
# Categorical columns: every object-typed column except the split_type
# bookkeeping tag, plus the two diagnosis-code columns.
cat_cols = [col for col in df_t.dtypes[df_t.dtypes == 'O'].index if col != 'split_type']
cat_cols.extend(['apache_2_diagnosis', 'apache_3j_diagnosis'])
print(cat_cols)

['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_2_diagnosis', 'apache_3j_diagnosis']


In [54]:
# One-hot encode every column in cat_cols, using each column's own name as
# the prefix of its dummy columns. Note: this overwrites df_t, so the cell
# cannot be re-run without rebuilding df_t first.
df_t = pd.get_dummies(df_t, prefix=cat_cols, columns=cat_cols)

In [55]:
# First rows of the frame after one-hot encoding.
df_t.head()

Unnamed: 0,encounter_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,weight,apache_post_operative,arf_apache,bun_apache,creatinine_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,...,apache_3j_diagnosis_1902.02,apache_3j_diagnosis_1902.03,apache_3j_diagnosis_1902.04,apache_3j_diagnosis_1902.05,apache_3j_diagnosis_1903.01,apache_3j_diagnosis_1903.02,apache_3j_diagnosis_1903.03,apache_3j_diagnosis_1904.01,apache_3j_diagnosis_2101.01,apache_3j_diagnosis_2101.03,apache_3j_diagnosis_2201.01,apache_3j_diagnosis_2201.02,apache_3j_diagnosis_2201.03,apache_3j_diagnosis_2201.04,apache_3j_diagnosis_2201.05
0,214826,68.0,22.732803,0,180.3,92,0.541667,73.9,0,0,31.0,2.51,3.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,246060,77.0,27.421875,0,160.0,90,0.927778,70.2,0,0,9.0,0.56,1.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,276985,25.0,31.952749,0,172.7,93,0.000694,95.3,0,0,,,3.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,262220,81.0,22.635548,1,165.1,92,0.000694,61.7,1,0,,,4.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,201746,19.0,,0,188.0,91,0.073611,,0,0,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [56]:
# Column names (and count) after one-hot encoding.
df_t.columns

Index(['encounter_id', 'age', 'bmi', 'elective_surgery', 'height', 'icu_id',
       'pre_icu_los_days', 'weight', 'apache_post_operative', 'arf_apache',
       ...
       'apache_3j_diagnosis_1903.02', 'apache_3j_diagnosis_1903.03',
       'apache_3j_diagnosis_1904.01', 'apache_3j_diagnosis_2101.01',
       'apache_3j_diagnosis_2101.03', 'apache_3j_diagnosis_2201.01',
       'apache_3j_diagnosis_2201.02', 'apache_3j_diagnosis_2201.03',
       'apache_3j_diagnosis_2201.04', 'apache_3j_diagnosis_2201.05'],
      dtype='object', length=582)

### 2. Data imputation

- Possible approaches : mean, KNN, soft_impute, MICE, iterative_SVD

In [57]:
# TODO apply normalized imputation?
# SimpleFill, SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler
def impute_data(df_t, impt_type='mice'):
    """Fill in missing values with the selected imputer.

    Parameters
    ----------
    df_t : array-like
        Data handed unchanged to the imputer's ``fit_transform``
        (assumed to be a fully numeric 2-D table — TODO confirm).
    impt_type : str, default 'mice'
        ``'mice'`` selects ``IterativeImputer``; ``'knn'`` selects ``KNN``.

    Returns
    -------
    The result of the imputer's ``fit_transform`` on ``df_t``.

    Raises
    ------
    ValueError
        If ``impt_type`` is not one of the supported names.
    """
    if impt_type == 'mice':
        imputer = IterativeImputer()
    elif impt_type == 'knn':
        imputer = KNN()
    else:
        # Without this branch an unknown name would leave `imputer` unbound
        # and fail with a confusing UnboundLocalError on the return line.
        raise ValueError(
            "Unknown impt_type %r; expected 'mice' or 'knn'" % (impt_type,)
        )
    return imputer.fit_transform(df_t)

In [58]:
# List the columns whose name contains 'split' (checks that split_type is
# still present in df_t).
list(filter(lambda x: x.find('split')>=0, df_t.columns))

['split_type']

In [None]:
# Feature columns to impute: everything except the target, the row id and
# the train/test tag.
impute_cols = [
    col for col in df_t.columns
    if col not in ('diabetes_mellitus', 'encounter_id', 'split_type')
]

impt_t = impute_data(df_t[impute_cols], 'mice')

### 3. Scaling

In [None]:
# TODO : different scalers?
def scale_data(mx_t, mx_te, scl_type='minmax'):
    """Fit a scaler on ``mx_t`` and apply it to both ``mx_t`` and ``mx_te``.

    Parameters
    ----------
    mx_t : array-like
        Data the scaler is fitted on (and transformed).
    mx_te : array-like
        Data transformed with the scaler fitted on ``mx_t``.
    scl_type : str, default 'minmax'
        Scaler to use; only ``'minmax'`` (``MinMaxScaler``) is supported.

    Returns
    -------
    tuple
        ``(scaled mx_t, scaled mx_te)``.

    Raises
    ------
    ValueError
        If ``scl_type`` is not a supported name.
    """
    if scl_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        # Without this branch an unknown name would leave `scaler` unbound
        # and fail with a confusing UnboundLocalError on the return line.
        raise ValueError(
            "Unknown scl_type %r; expected 'minmax'" % (scl_type,)
        )
    return scaler.fit_transform(mx_t), scaler.transform(mx_te)

In [None]:
# The imputation step produced one matrix (impt_t) for train and test together;
# split it back into the two sets before scaling. The positional mask assumes
# the imputer returned rows in the same order as df_t — TODO confirm for any
# new imputer type.
is_train_row = (df_t['split_type'] == 'train').values
impt_tr, impt_te = impt_t[is_train_row], impt_t[~is_train_row]

sc_impt_tr, sc_impt_te = scale_data(impt_tr, impt_te, 'minmax')

### 4. Save

In [None]:
# Training output: scaled features plus the row id and the target.
sc_tr = pd.DataFrame(sc_impt_tr, columns=impute_cols)
sc_tr['encounter_id'] = df_tr['encounter_id']
sc_tr['diabetes_mellitus'] = df_tr['diabetes_mellitus']

# Test output: scaled features plus the row id (no target available).
sc_te = pd.DataFrame(sc_impt_te, columns=impute_cols)
sc_te['encounter_id'] = df_te['encounter_id']

# Persist both frames as parquet files.
sc_tr.to_parquet('../dataset/train_.parquet')
sc_te.to_parquet('../dataset/test_.parquet')