## Read Data

In [45]:
import pandas as pd
from pathlib import Path
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import bisect
import numpy as np

In [57]:
input_filepath = '../data/interim/'
output_filepath = '../data/processed/'

# Resolve each directory against the current working directory once and
# derive every artefact path from it.
_INTERIM_DIR = Path.cwd().joinpath(input_filepath)
_PROCESSED_DIR = Path.cwd().joinpath(output_filepath)

# cols (inputs)
BINARY_COLS = _INTERIM_DIR / 'binary-cols.pickle'
CATEGORICAL_COLS = _INTERIM_DIR / 'categorical-cols.pickle'
CONTINUOUS_COLS = _INTERIM_DIR / 'continuous-cols.pickle'
TARGET_COL = _INTERIM_DIR / 'target-col.pickle'

# cols (outputs)
BINARY_COLS_OUT = _PROCESSED_DIR / 'binary-cols.pickle'
CATEGORICAL_COLS_OUT = _PROCESSED_DIR / 'categorical-cols.pickle'
CONTINUOUS_COLS_OUT = _PROCESSED_DIR / 'continuous-cols.pickle'
TARGET_COL_OUT = _PROCESSED_DIR / 'target-col.pickle'

# data (inputs)
TRAIN_CSV = _INTERIM_DIR / 'train.csv'
VAL_CSV = _INTERIM_DIR / 'val.csv'
TEST_CSV = _INTERIM_DIR / 'test.csv'

# data (outputs)
TRAIN_CSV_OUT = _PROCESSED_DIR / 'train.csv'
VAL_CSV_OUT = _PROCESSED_DIR / 'val.csv'
TEST_CSV_OUT = _PROCESSED_DIR / 'test.csv'

# metadata (encoders and scalers written alongside the processed data)
BINARY_ENCODERS = _PROCESSED_DIR / 'binary-encoders.pickle'
CATEGORICAL_ENCODERS = _PROCESSED_DIR / 'categorical-encoders.pickle'
TARGET_ENCODERS = _PROCESSED_DIR / 'target-encoders.pickle'
CONTINUOUS_SCALERS = _PROCESSED_DIR / 'continuous-scalers.pickle'

In [34]:
def read_obj(path):
    """Load and return the object pickled in the file at ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        The object that ``pickle.load`` reconstructs from the file.

    Raises
    ------
    OSError
        If the file cannot be opened; errors from ``open`` and
        ``pickle.load`` propagate unchanged.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only point
    # this at files written by a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [35]:
# Column metadata objects, read from the interim pickle files.
binary_cols, categorical_cols, continuous_cols, target_col = (
    read_obj(pickle_path)
    for pickle_path in (BINARY_COLS, CATEGORICAL_COLS, CONTINUOUS_COLS, TARGET_COL)
)

# The three data splits, read from the interim CSV files.
train, val, test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

## Typify

In [36]:
# Cast the continuous columns of every split to 32-bit floats.
for split in (train, val, test):
    split[continuous_cols] = split[continuous_cols].astype('float32')

In [37]:
# Cast the categorical columns of every split to strings, then to the
# pandas category dtype.
for split in (train, val, test):
    split[categorical_cols] = split[categorical_cols].astype('str').astype('category')

In [38]:
# Binary columns get the same string -> category cast in every split.
for split in (train, val, test):
    split[binary_cols] = split[binary_cols].astype('str').astype('category')

In [39]:
# The target column is cast to string -> category in every split as well.
for split in (train, val, test):
    split[target_col] = split[target_col].astype('str').astype('category')

## Fill Data

In [40]:
# Replace missing continuous values with 0 in every split.
for split in (train, val, test):
    split[continuous_cols] = split[continuous_cols].fillna(0)

## Normalize

In [41]:
def normalize(df, cols, scalers=None):
    """Standardise ``cols`` of ``df`` in place, fitting scalers where missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their scaled values.
    cols : iterable
        Names of the columns to scale.
    scalers : dict, optional
        Mapping of column name to an already fitted scaler. A column without
        an entry gets a new ``StandardScaler`` fitted on ``df`` and stored in
        this mapping. A new dict is created when ``None`` is passed.

    Returns
    -------
    tuple
        ``(df, scalers)`` -- the same frame object and the scaler mapping.
    """
    scalers = dict() if scalers is None else scalers

    for name in cols:
        # The scalers are fed a single-feature 2-D array.
        column_2d = df[name].values.reshape(-1, 1)

        if name not in scalers:
            scalers[name] = StandardScaler(with_mean=True, with_std=True)
            scalers[name].fit(column_2d)

        df[name] = scalers[name].transform(column_2d)

    return df, scalers

In [42]:
# The scalers are fitted on the training split only; val and test are
# transformed with those same fitted scalers.
train, scalers = normalize(train, continuous_cols)
val, _ = normalize(val, continuous_cols, scalers=scalers)
test, _ = normalize(test, continuous_cols, scalers=scalers)

In [49]:
# Preview the first five rows of the scaled continuous columns.
train[continuous_cols].head(n=5)

Unnamed: 0,encounter_id,hospital_id,patient_id,icu_id,gcs_eyes_apache,gcs_motor_apache,gcs_verbal_apache,age,height,pre_icu_los_days,...,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob
0,1.270796,0.626662,-1.00443,0.538333,-0.369069,0.429796,0.053745,0.932812,0.473337,2.871473,...,-0.429592,-0.429957,-0.448565,-0.448563,-0.363765,-0.365583,-0.330597,-0.32983,0.254966,-0.002326
1,0.017563,-0.868366,1.594456,0.826916,-0.369069,0.429796,0.661245,0.216715,0.796378,10.234639,...,-0.429592,-0.429957,-0.448565,-0.448563,-0.363765,-0.365583,-0.330597,-0.32983,1.179493,0.479394
2,1.287226,1.04018,0.665916,-0.458589,0.570437,0.429796,0.661245,-1.167739,-0.312439,-0.269217,...,-0.429592,-0.429957,-0.448565,-0.448563,-0.363765,-0.365583,-0.330597,-0.32983,-0.333369,-0.195014
3,-1.112161,1.405985,1.051252,0.37218,-0.369069,0.429796,0.053745,-0.117463,0.036795,-0.286198,...,-0.429592,-0.429957,-0.448565,-0.448563,-0.363765,-0.365583,-0.330597,-0.32983,-0.039201,-0.050498
4,-0.788018,-0.216279,1.345416,0.866268,0.570437,0.429796,0.661245,-1.597397,0.464606,-0.045956,...,-0.429592,-0.429957,-0.448565,-0.448563,-0.363765,-0.365583,-0.330597,-0.32983,-0.333369,-0.195014


## Label Encode

In [46]:
def labelencode(df, cols, encoders=None, unknown_value='UNK'):
    """Integer-encode ``cols`` of ``df`` in place, with a code for unseen values.

    For every column that has no entry in ``encoders``, a ``LabelEncoder`` is
    fitted on that column of ``df`` and ``unknown_value`` is added to its
    classes. Every column is then transformed with its encoder; values that
    are not among the encoder's classes are replaced by ``unknown_value``
    before transforming.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their integer codes.
    cols : iterable
        Names of the columns to encode.
    encoders : dict, optional
        Mapping of column name to an already fitted encoder. Newly fitted
        encoders are stored in this mapping. A new dict is created when
        ``None`` is passed.
    unknown_value : str, optional
        Sentinel class that stands in for values the encoder has not seen.

    Returns
    -------
    tuple
        ``(df, encoders)`` -- the same frame object and the encoder mapping.
    """
    if encoders is None:
        encoders = dict()

    for col in cols:
        if col not in encoders:
            le = LabelEncoder()
            le.fit(df[col].values)

            # Reserve a class for unknowns while keeping the list sorted.
            # Skip the insert when the data already contains the sentinel,
            # otherwise it would appear twice in ``classes_``.
            cats = le.classes_.tolist()
            if unknown_value not in cats:
                bisect.insort_left(cats, unknown_value)
            le.classes_ = np.asarray(cats)

            encoders[col] = le

        le = encoders[col]
        # Hash-based membership test; checking against the ndarray directly
        # scans every class for every looked-up value.
        known = set(le.classes_.tolist())
        cleaned = df[col].map(lambda x: x if x in known else unknown_value)
        df[col] = le.transform(np.asarray(cleaned))

    return df, encoders

In [47]:
# Encoders are fitted on the training split; val and test reuse them, so
# categories unseen in training map to the unknown class.
train, label_encoders = labelencode(train, categorical_cols)
val, _ = labelencode(val, categorical_cols, encoders=label_encoders)
test, _ = labelencode(test, categorical_cols, encoders=label_encoders)

In [48]:
# Preview the first five rows of the encoded categorical columns.
train[categorical_cols].head(n=5)

Unnamed: 0,bmi,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_2_diagnosis,apache_3j_diagnosis,apache_3j_bodysystem,apache_2_bodysystem
0,15514,0,1,12,2,1,3,43,226,6,10
1,931,2,1,16,1,1,4,37,34,0,0
2,18745,2,0,8,2,1,1,37,76,0,0
3,1304,0,0,3,0,1,6,36,296,7,4
4,30520,5,1,16,2,1,5,30,114,1,1


## One-Hot Encode

In [51]:
# TODO: implement one-hot encoding; until then the binary columns are label-encoded below.

In [52]:
# The binary columns go through the same label encoding: fitted on the
# training split, reused for val and test.
train, ohe_encoders = labelencode(train, binary_cols)
val, _ = labelencode(val, binary_cols, encoders=ohe_encoders)
test, _ = labelencode(test, binary_cols, encoders=ohe_encoders)

In [53]:
# Preview the first five rows of the encoded binary columns.
train[binary_cols].head(n=5)

Unnamed: 0,elective_surgery,readmission_status,apache_post_operative,arf_apache,gcs_unable_apache,intubated_apache,ventilated_apache,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0


## Label Encode Targets

In [54]:
# The target column is wrapped in a list because labelencode iterates over
# column names; the encoder is fitted on the training split only.
train, target_encoders = labelencode(train, [target_col])
val, _ = labelencode(val, [target_col], encoders=target_encoders)
test, _ = labelencode(test, [target_col], encoders=target_encoders)

In [55]:
# Preview the first five encoded target values.
train[target_col].head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: hospital_death, dtype: int64

## Persist Data and Metadata

In [59]:
def pickle_obj(path, obj):
    """Serialise ``obj`` with pickle into the file at ``path``.

    The file is opened in binary write mode, so existing content is
    replaced. Returns ``None``.
    """
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(obj)

In [61]:
# cols
for out_path, payload in (
    (BINARY_COLS_OUT, binary_cols),
    (CATEGORICAL_COLS_OUT, categorical_cols),
    (CONTINUOUS_COLS_OUT, continuous_cols),
    (TARGET_COL_OUT, target_col),
):
    pickle_obj(out_path, payload)

# metadata: the fitted encoders and scalers
for out_path, payload in (
    (BINARY_ENCODERS, ohe_encoders),
    (CATEGORICAL_ENCODERS, label_encoders),
    (TARGET_ENCODERS, target_encoders),
    (CONTINUOUS_SCALERS, scalers),
):
    pickle_obj(out_path, payload)

# data: the processed splits, written without the index column
for out_path, split in (
    (TRAIN_CSV_OUT, train),
    (VAL_CSV_OUT, val),
    (TEST_CSV_OUT, test),
):
    split.to_csv(out_path, index=False)