In [None]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from fancyimpute import KNN, IterativeImputer, SimpleFill, SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 1
# Seed NumPy's global RNG and Python's built-in RNG with the same value.
np.random.seed(SEED)
random.seed(SEED)

## 1. Dataset
- training : TrainingWiDS2021.csv
- test : UnlabeledWiDS2021.csv
- descriptions : DataDictionaryWiDS2021.csv

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the CSV
# files read below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the training CSV and discard its 'Unnamed: 0' column
# (presumably a row index written out when the file was saved).
df_tr = (
    pd.read_csv('/content/drive/MyDrive/dataset/WiDS2021/TrainingWiDS2021.csv')
    .drop(columns=['Unnamed: 0'])
)
df_tr.shape

In [None]:
# Read the unlabeled test CSV and discard its 'Unnamed: 0' column
# (presumably a row index written out when the file was saved).
df_te = (
    pd.read_csv('/content/drive/MyDrive/dataset/WiDS2021/UnlabeledWiDS2021.csv')
    .drop(columns=['Unnamed: 0'])
)
df_te.shape

### 1.1. Drop Columns with high missing ratio

- check missing ratio for each column
- compare missing ratios between training set and test set
- Decide what to drop

In [None]:
def check_missing_data(df, threshold=0.5):
    """Summarise missing values per column and print two reports.

    The first report lists every column with at least one missing value; the
    second lists the columns whose missing ratio exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.5
        Columns with ``miss_ratio`` strictly greater than this value are
        printed in the second report.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``column_name``,
        ``num_miss_rows`` and ``miss_ratio``, ordered by missing count
        (largest first) with a fresh 0..n-1 index.
    """
    # Count NaNs per column, largest first, then flatten into a two-column frame.
    missing_df = pd.DataFrame(df.isna().sum().sort_values(ascending=False)).reset_index()
    missing_df.columns = ['column_name', 'num_miss_rows']
    missing_df['miss_ratio'] = missing_df.num_miss_rows / df.shape[0]

    # Report 1: every column that has any missing value.
    print(missing_df.loc[missing_df.num_miss_rows > 0])

    # Report 2: columns whose missing ratio is above the cut-off.
    print(missing_df.loc[missing_df.miss_ratio > threshold])
    return missing_df

In [None]:
# Per-column missing-value summary of the training set (the call also prints its reports).
tr_missing = check_missing_data(df_tr)

In [None]:
# Per-column missing-value summary of the test set (the call also prints its reports).
te_missing = check_missing_data(df_te)

In [None]:
# Do the training and test sets agree on which columns are more than 50% missing?
tr_high_missing = set(tr_missing.loc[tr_missing.miss_ratio > .5, 'column_name'])
te_high_missing = set(te_missing.loc[te_missing.miss_ratio > .5, 'column_name'])

# display() every result: a cell only renders its final bare expression.
display(tr_high_missing - te_high_missing)  # above 50% missing in train only
display(te_high_missing - tr_high_missing)  # above 50% missing in test only

# Spot-check one column in both summaries. Each summary is filtered with its
# own mask: the two frames are sorted independently by missing count, so a
# mask built from tr_missing would pick an unrelated row of te_missing.
display(tr_missing.loc[tr_missing.column_name == 'urineoutput_apache'])
display(te_missing.loc[te_missing.column_name == 'urineoutput_apache'])

In [None]:
# The missing ratios are very similar in both sets, so every column whose
# missing ratio in the test set is above 50% is dropped from both frames.
drop_columns = te_missing.loc[te_missing.miss_ratio > .5].column_name.values
df_tr = df_tr.drop(columns=drop_columns)
df_te = df_te.drop(columns=drop_columns)

# Show both shapes; a bare `df_tr.shape` before the last line would not be rendered.
display(df_tr.shape, df_te.shape)

### drop hospital_id

- due to distribution difference

In [None]:
# Remove hospital_id from both the training and the test frame.
df_tr, df_te = [frame.drop(columns=['hospital_id']) for frame in (df_tr, df_te)]

### readmission_status has a single unique value across both datasets => drop

In [None]:
# Inspect readmission_status before dropping it: distinct-value count, its row
# in the training missing-value summary, and the distinct values in each set.
# display() is used because only a cell's last bare expression is rendered.
display(df_tr.readmission_status.nunique())
display(tr_missing.loc[tr_missing.column_name == 'readmission_status'])
display(df_tr.readmission_status.unique())
display(df_te.readmission_status.unique())

In [None]:
# Remove readmission_status from both the training and the test frame.
df_tr, df_te = [frame.drop(columns=['readmission_status']) for frame in (df_tr, df_te)]

## Combine training and test sets

In [None]:
# Columns that exist in only one of the two frames (train-only, then test-only).
# display() both: a cell only renders its final bare expression.
display(set(df_tr.columns) - set(df_te.columns))
display(set(df_te.columns) - set(df_tr.columns))

In [None]:
# Give the test frame an all-NaN target column so both frames share a schema,
# and tag every row with the split it came from so the frames can be separated again.
df_te = df_te.assign(diabetes_mellitus=np.nan, split_type='test')
df_tr = df_tr.assign(split_type='train')

In [None]:
# Stack train on top of test. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both halves keep their own row labels, so the
# same label would refer to one training row and one test row.
df_t = pd.concat([df_tr, df_te], ignore_index=True)

# display() the intermediate results: only the last bare expression is rendered.
display(df_t.columns)
display(df_t.shape)
df_t.head()

## Categorical encoding
- Label Encoding : assign label to a unique integer
- OneHot Encoding : creating dummy variables

In [None]:
# Which dtypes occur in the combined frame, and which columns are object-typed?
# The mask is built from df_t itself (not df_tr) so it describes the frame
# being indexed; display() is needed for the non-final expression.
display(df_t.dtypes.unique())
df_t.dtypes.loc[df_t.dtypes == 'O']

In [None]:
# Categorical features: every object-dtype column plus the two diagnosis
# columns added explicitly below, minus the split_type bookkeeping column.
cat_cols = [name for name, dtype in df_t.dtypes.items() if dtype == 'O']
cat_cols += ['apache_2_diagnosis', 'apache_3j_diagnosis']
cat_cols.remove('split_type')
print(cat_cols)

In [None]:
# One-hot encode the categorical columns; with `columns=` given, get_dummies
# prefixes each dummy with its source column name by default.
df_t = pd.get_dummies(df_t, columns=cat_cols)

In [None]:
# Preview the first rows of the combined frame after one-hot encoding.
df_t.head()

In [None]:
# List the column names of the combined frame after one-hot encoding.
df_t.columns

### 2. Data imputation

- Possible approaches : mean, KNN, soft_impute, MICE, iterative_SVD

In [None]:
# TODO apply normalized imputation?
# SimpleFill, SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler
def impute_data(df_t, impt_type='mice'):
    """Fill missing values in ``df_t`` with the selected imputer.

    Parameters
    ----------
    df_t : pandas.DataFrame or array-like
        Data handed unchanged to the imputer's ``fit_transform``.
    impt_type : {'mice', 'knn', 'simple'}, default 'mice'
        Imputer to use: 'mice' -> ``IterativeImputer()``, 'knn' -> ``KNN()``,
        'simple' -> ``SimpleFill('mean')``.

    Returns
    -------
    Whatever the chosen imputer's ``fit_transform`` returns.

    Raises
    ------
    ValueError
        If ``impt_type`` is not one of the supported names. Previously an
        unknown name fell through every branch and failed later with an
        UnboundLocalError on ``imputer``.
    """
    if impt_type == 'mice':
        imputer = IterativeImputer()
    elif impt_type == 'knn':
        imputer = KNN()
    elif impt_type == 'simple':
        imputer = SimpleFill('mean')
    else:
        raise ValueError(
            "Unknown impt_type %r; expected one of 'mice', 'knn', 'simple'" % (impt_type,)
        )
    return imputer.fit_transform(df_t)

In [None]:
# Which column names contain 'split'?
[name for name in df_t.columns if 'split' in name]

In [None]:
# Impute every column except the target, the row identifier and the split marker.
impute_cols = df_t.columns.tolist()
for excluded_col in ('diabetes_mellitus', 'encounter_id', 'split_type'):
    impute_cols.remove(excluded_col)

impt_t = impute_data(df_t[impute_cols], impt_type='mice')

In [None]:
# Wrap the imputed matrix in a DataFrame labelled with the imputed column names.
df_impt_t = pd.DataFrame(impt_t, columns=impute_cols)
df_impt_t.head()

In [None]:
# Sanity check: the imputed frame's shape next to the number of imputed columns.
# display() shows both values; a bare `df_impt_t.shape` on a non-final line is not rendered.
display(df_impt_t.shape, len(impute_cols))

In [None]:
# Re-attach the columns that were held out of imputation. Raw arrays are used
# so the values are matched by row position rather than by index label.
df_impt_t = df_impt_t.assign(
    encounter_id=df_t['encounter_id'].to_numpy(),
    diabetes_mellitus=df_t['diabetes_mellitus'].to_numpy(),
    split_type=df_t['split_type'].to_numpy(),
).reset_index(drop=True)
df_impt_t.head()

In [None]:
# All columns of the imputed frame except the split marker.
cols = df_impt_t.columns.tolist()
cols.remove('split_type')

In [None]:
# Confirm that the imputed frame has a 'split_type' column.
[name for name in df_impt_t.columns if name == 'split_type']

In [None]:
# Separate the imputed frame back into its two splits.
train_rows = df_impt_t['split_type'] == 'train'
test_rows = df_impt_t['split_type'] == 'test'

tr = df_impt_t.loc[train_rows].drop(columns='split_type')
# The test rows only carry a NaN placeholder target, so that column is dropped as well.
te = df_impt_t.loc[test_rows].drop(columns=['split_type', 'diabetes_mellitus'])

In [None]:
# Write the imputed, not yet scaled train and test splits to Google Drive as parquet.
tr.to_parquet('/content/drive/MyDrive/dataset/dummy_noscale_train.parquet')
te.to_parquet('/content/drive/MyDrive/dataset/dummy_noscale_test.parquet')

### 3. Scaling

In [None]:
# TODO : different scalers?
def scale_data(mx_t, scl_type='minmax'):
    """Scale ``mx_t`` with the selected scaler.

    Parameters
    ----------
    mx_t : pandas.DataFrame or array-like
        Data handed unchanged to the scaler's ``fit_transform``.
    scl_type : {'minmax'}, default 'minmax'
        Scaler to use; 'minmax' -> ``MinMaxScaler()`` with default settings.

    Returns
    -------
    Whatever the chosen scaler's ``fit_transform`` returns.

    Raises
    ------
    ValueError
        If ``scl_type`` is not a supported name. Previously an unknown name
        skipped the only branch and failed later with an UnboundLocalError
        on ``scaler``.
    """
    if scl_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Unknown scl_type %r; expected 'minmax'" % (scl_type,))
    return scaler.fit_transform(mx_t)

In [None]:
# Work out which columns get scaled: start from every column, then remove any
# column whose name contains one of the markers below (target, identifier and
# the categorical features), and finally the split marker.
cols = list(df_impt_t.columns)
unscaled_markers = ['diabetes_mellitus', 'ethnicity', 'gender', 'hospital_admit_source',
                    'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_2_diagnosis',
                    'apache_3j_diagnosis', 'encounter_id']
for marker in unscaled_markers:
    # Substring match, so one marker also covers every column derived from it.
    matched = [name for name in cols if marker in name]
    print(matched)
    for name in matched:
        cols.remove(name)
cols.remove('split_type')
df_impt_t[cols]

In [None]:
# Scale the selected columns and write the scaled values back over the originals.
sc_impt_t = scale_data(df_impt_t[cols], scl_type='minmax')
df_impt_t[cols] = sc_impt_t

### 4. Save

In [None]:
# Separate the scaled frame back into its two splits and write them to parquet.
# NOTE(review): these relative '../dataset/' paths differ from the Google Drive
# paths used for the unscaled parquet files; confirm the directory exists in
# the runtime this notebook is executed in.
scaled_train_rows = df_impt_t['split_type'] == 'train'
scaled_test_rows = df_impt_t['split_type'] == 'test'

s_tr = df_impt_t.loc[scaled_train_rows].drop(columns='split_type')
# The test rows only carry a NaN placeholder target, so that column is dropped as well.
s_te = df_impt_t.loc[scaled_test_rows].drop(columns=['split_type', 'diabetes_mellitus'])

s_tr.to_parquet('../dataset/train_scale_.parquet')
s_te.to_parquet('../dataset/test_scale_.parquet')