## Read Data

In [261]:
import pandas as pd
from pathlib import Path
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import bisect
import numpy as np
from itertools import combinations

In [262]:
input_filepath = '../data/interim/'
output_filepath = '../data/processed/'

# cols
BINARY_COLS = Path.cwd().joinpath(input_filepath).joinpath('binary-cols.pickle')
CATEGORICAL_COLS = Path.cwd().joinpath(input_filepath).joinpath('categorical-cols.pickle')
CONTINUOUS_COLS = Path.cwd().joinpath(input_filepath).joinpath('continuous-cols.pickle')
TARGET_COL = Path.cwd().joinpath(input_filepath).joinpath('target-col.pickle')

BINARY_COLS_OUT = Path.cwd().joinpath(output_filepath).joinpath('binary-cols.pickle')
CATEGORICAL_COLS_OUT = Path.cwd().joinpath(output_filepath).joinpath('categorical-cols.pickle')
CONTINUOUS_COLS_OUT = Path.cwd().joinpath(output_filepath).joinpath('continuous-cols.pickle')
TARGET_COL_OUT = Path.cwd().joinpath(output_filepath).joinpath('target-col.pickle')

# data
TRAIN_CSV = Path.cwd().joinpath(input_filepath).joinpath('train.csv')
VAL_CSV = Path.cwd().joinpath(input_filepath).joinpath('val.csv')
TEST_CSV = Path.cwd().joinpath(input_filepath).joinpath('test.csv')

TRAIN_CSV_OUT = Path.cwd().joinpath(output_filepath).joinpath('train.csv')
VAL_CSV_OUT = Path.cwd().joinpath(output_filepath).joinpath('val.csv')
TEST_CSV_OUT = Path.cwd().joinpath(output_filepath).joinpath('test.csv')

# metadata
BINARY_ENCODERS = Path.cwd().joinpath(output_filepath).joinpath('binary-encoders.pickle')
CATEGORICAL_ENCODERS = Path.cwd().joinpath(output_filepath).joinpath('categorical-encoders.pickle')
TARGET_ENCODERS = Path.cwd().joinpath(output_filepath).joinpath('target-encoders.pickle')
CONTINUOUS_SCALERS = Path.cwd().joinpath(output_filepath).joinpath('continuous-scalers.pickle')

In [263]:
def read_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    return None

In [264]:
binary_cols = read_obj(BINARY_COLS)
categorical_cols = read_obj(CATEGORICAL_COLS)
continuous_cols = read_obj(CONTINUOUS_COLS)
target_col = read_obj(TARGET_COL)

train = pd.read_csv(TRAIN_CSV)
val = pd.read_csv(VAL_CSV)
test = pd.read_csv(TEST_CSV)

## Label Engineering

In [265]:
pair_cols = ['ethnicity',
 'gender',
 'hospital_admit_source',
 'icu_admit_source',
 'icu_stay_type',
 'icu_type',
 'apache_3j_bodysystem',
 'apache_2_bodysystem']

In [266]:
cmbs = list(combinations(pair_cols, 2))

In [267]:
def concat_columns(df, columns):
    value = df[columns[0]].astype(str) + ' '
    for col in columns[1:]:
        value += df[col].astype(str) + ' '
    return value

In [268]:
len(cmbs)

28

In [269]:
combo_cols = list()
for cols in cmbs:
    col_name = f'paired_{"_".join(cols)}'
    combo_cols.append(col_name)
    train[col_name] = concat_columns(train, cols)
    val[col_name] = concat_columns(val, cols)
    test[col_name] = concat_columns(test, cols)
categorical_cols.extend(combo_cols)

In [270]:
# aggregate icu
train['hospital_admit_source_is_icu'] = train['hospital_admit_source'].apply(
        lambda x: 
            'True' if x in [
                'Other ICU', 
                'ICU to SDU',
                'ICU'
            ] else 'False')
val['hospital_admit_source_is_icu'] = val['hospital_admit_source'].apply(
        lambda x: 
            'True' if x in [
                'Other ICU', 
                'ICU to SDU',
                'ICU'
            ] else 'False')
test['hospital_admit_source_is_icu'] = test['hospital_admit_source'].apply(
        lambda x: 
            'True' if x in [
                'Other ICU', 
                'ICU to SDU',
                'ICU'
            ] else 'False')
categorical_cols.append('hospital_admit_source_is_icu')

In [271]:
# aggregate cardiac
common_cols = ['CTICU', 'CCU-CTICU', 'Cardiac ICU', 'CSICU']
train['hospital_admit_source_is_cardiac'] = train['icu_type'].apply(lambda x: True if x in common_cols else False)
val['hospital_admit_source_is_cardiac'] = val['icu_type'].apply(lambda x: True if x in common_cols else False)
test['hospital_admit_source_is_cardiac'] = test['icu_type'].apply(lambda x: True if x in common_cols else False)
categorical_cols.append('hospital_admit_source_is_cardiac')

## Typify

In [272]:
train[continuous_cols] = train[continuous_cols].astype('float32')
val[continuous_cols] = val[continuous_cols].astype('float32')
test[continuous_cols] = test[continuous_cols].astype('float32')

In [273]:
train[categorical_cols] = train[categorical_cols].astype('str').astype('category')
val[categorical_cols] = val[categorical_cols].astype('str').astype('category')
test[categorical_cols] = test[categorical_cols].astype('str').astype('category')

In [274]:
train[binary_cols] = train[binary_cols].astype('str').astype('category')
val[binary_cols] = val[binary_cols].astype('str').astype('category')
test[binary_cols] = test[binary_cols].astype('str').astype('category')

In [275]:
train[target_col] = train[target_col].astype('str').astype('category')
val[target_col] = val[target_col].astype('str').astype('category')
test[target_col] = test[target_col].astype('str').astype('category')

## Dropna

In [276]:
train = train.dropna(how='all')
val = val.dropna(how='all')
test = test.dropna(how='all')

In [277]:
confounding_cols = ['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']

# remove confounding vars - biases and undue variance
for x in confounding_cols:
    continuous_cols.remove(x)
    
train = train[[target_col] + continuous_cols + categorical_cols + binary_cols]
val = val[[target_col] + continuous_cols + categorical_cols + binary_cols]
test = test[[target_col] + continuous_cols + categorical_cols + binary_cols]

## Fill Data

In [294]:
def fill(df, cols, fillers=None):
    if None is fillers:
        fillers = dict()
        
        for col in cols:
            for hospital_id in df['hospital_id'].unique():

                subset = df[df['hospital_id'] == hospital_id]
                hospital_col_key = f'{col}_{hospital_id}'

                if hospital_col_key not in fillers:
                    fillers[hospital_col_key] = subset[col].dropna().median()
            
            fillers[col] = df[col].dropna().median()
    
    for col in cols:
        print(f'fillling {col}')
        for hospital_id in df['hospital_id'].unique():
            print(f'fillling {col} - {hospital_id}')
            subset = df[df['hospital_id'] == hospital_id]
            hospital_col_key = f'{col}_{hospital_id}'
            
            if hospital_col_key in fillers:
                df.loc[df['hospital_id'] == hospital_id, col] = subset[col].fillna(fillers[hospital_col_key])
            else:
                df.loc[df['hospital_id'] == hospital_id, col] = subset[col].fillna(fillers[col])
            
            df.loc[df['hospital_id'] == hospital_id, f'{col}_na'] = pd.isnull(subset[col])
    
    return df, fillers

In [293]:
train, fillers = fill(train, continuous_cols)
val, _ = fill(val, continuous_cols, fillers)
test, _ = fill(test, continuous_cols, fillers)

fillling encounter_id
fillling encounter_id - 118.0
fillling encounter_id - 81.0
fillling encounter_id - 33.0
fillling encounter_id - 83.0
fillling encounter_id - 77.0
fillling encounter_id - 149.0
fillling encounter_id - 31.0
fillling encounter_id - 151.0
fillling encounter_id - 69.0
fillling encounter_id - 46.0
fillling encounter_id - 63.0
fillling encounter_id - 113.0
fillling encounter_id - 137.0
fillling encounter_id - 89.0
fillling encounter_id - 168.0
fillling encounter_id - 150.0
fillling encounter_id - 140.0
fillling encounter_id - 34.0
fillling encounter_id - 74.0
fillling encounter_id - 102.0
fillling encounter_id - 91.0
fillling encounter_id - 198.0
fillling encounter_id - 129.0
fillling encounter_id - 68.0
fillling encounter_id - 138.0
fillling encounter_id - 174.0
fillling encounter_id - 134.0
fillling encounter_id - 169.0
fillling encounter_id - 57.0
fillling encounter_id - 177.0
fillling encounter_id - 155.0
fillling encounter_id - 85.0
fillling encounter_id - 78.0
fill

In [None]:
categorical_cols.extend([f'{x}_na' for x in continuous_cols])

## Normalize

In [None]:
def normalize(df, cols, scalers=None):
    if None is scalers:
        scalers = dict()
        
    for col in cols:
        if col not in scalers:
            scalers[col] = StandardScaler(with_mean=True, with_std=True)
            scalers[col].fit(df[col].values.reshape(-1,1))
        
        scaler = scalers[col]
        df[col] = scaler.transform(df[col].values.reshape(-1,1))
    return df, scalers

In [None]:
train, scalers = normalize(train, continuous_cols)
val, _ = normalize(val, continuous_cols, scalers)
test, _ = normalize(test, continuous_cols, scalers)

In [None]:
train[continuous_cols].head()

## Label Encode

In [None]:
train[categorical_cols] = train[categorical_cols].astype('str').astype('category')
val[categorical_cols] = val[categorical_cols].astype('str').astype('category')
test[categorical_cols] = test[categorical_cols].astype('str').astype('category')

In [None]:
def labelencode(df, cols, encoders=None, unknown_value='UNK'):
    if None is encoders:
        encoders = dict()
        
    for col in cols:
        if col not in encoders:
            le = LabelEncoder()
            le.fit(df[col].values)
            
            # add unknown val to cats
            cats = le.classes_.tolist()
            bisect.insort_left(cats, unknown_value)
            
            # redefine cats on le
            le.classes_ = np.asarray(cats)

            encoders[col] = le
        
        le = encoders[col]
        df[col] = df[col].map(lambda x: unknown_value if x not in le.classes_ else x)
        df[col] = le.transform(df[col].values)
        
    return df, encoders

In [None]:
train, label_encoders = labelencode(train, categorical_cols)
val, _ = labelencode(val, categorical_cols, label_encoders)
test, _ = labelencode(test, categorical_cols, label_encoders)

In [None]:
train[categorical_cols].head()

## One-Hot Encode

In [None]:
# todo - not necessary with CatBoost, plut CBoost will tune the cats which will become ohe

In [None]:
train, ohe_encoders = labelencode(train, binary_cols)
val, _ = labelencode(val, binary_cols, ohe_encoders)
test, _ = labelencode(test, binary_cols, ohe_encoders)

In [None]:
train[binary_cols].head()

## Label Encode Targets

In [None]:
train, target_encoders = labelencode(train, [target_col])
val, _ = labelencode(val, [target_col], target_encoders)
test, _ = labelencode(test, [target_col], target_encoders)

In [None]:
train[target_col].head()

## Persist Data and Metadata

In [None]:
def pickle_obj(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

In [None]:
# cols
pickle_obj(BINARY_COLS_OUT, binary_cols)
pickle_obj(CATEGORICAL_COLS_OUT, categorical_cols)
pickle_obj(CONTINUOUS_COLS_OUT, continuous_cols)
pickle_obj(TARGET_COL_OUT, target_col)

# metadata
pickle_obj(BINARY_ENCODERS, ohe_encoders)
pickle_obj(CATEGORICAL_ENCODERS, label_encoders)
pickle_obj(TARGET_ENCODERS, target_encoders)
pickle_obj(CONTINUOUS_SCALERS, scalers)

# data
train.to_csv(TRAIN_CSV_OUT, index=False)
val.to_csv(VAL_CSV_OUT, index=False)
test.to_csv(TEST_CSV_OUT, index=False)