In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OrdinalEncoder
import os
import lightgbm as lgb
import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
import joblib
import warnings

# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Seed NumPy's legacy global RNG.
np.random.seed(666)

# pandas display options. Always use the fully qualified option names: the
# abbreviated 'max_rows' key is resolved by pattern matching and becomes
# ambiguous on pandas releases that register more than one '*max_rows' option.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.5f}'.format)
pd.set_option('display.max_colwidth', None)

# Disable the chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None

In [2]:
# Name of the label column.
TARGET_COL = 'diabetes_mellitus'

# Columns treated as categorical features.
cat_cols = [
    'ethnicity',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'age_group',
    'demo_profile',
    'apache_3j_group',
    'a1c_bp_group',
    'apache_2_diagnosis',
    'icu_id1',
    'icu_id2',
]

# Module-level fold count and seed.
k, seed = 5, 8

def target_encode(var, dv, weight = 36):
    """Return a smoothed per-category mean of ``dv``, keyed by the values of ``var``.

    Each category's observed mean is shrunk towards the overall mean of ``dv``
    by adding pseudo-observations whose positive rate equals that overall mean.

    Parameters
    ----------
    var : pandas.Series
        Grouping values; missing values form their own group.
    dv : pandas.Series
        Target values, grouped by ``var``.
    weight : numeric, default 36
        Base number of pseudo-positives; scaled up by the odds
        ``mean / (1 - mean)`` when those odds exceed 1.

    Returns
    -------
    pandas.Series
        Smoothed mean for every distinct value of ``var``.
    """
    prior = dv.mean()
    odds = prior / (1 - prior)
    pseudo_positives = weight * max(1, odds)

    stats = dv.groupby(var, dropna=False).agg(['count', 'mean'])
    n_obs, observed_rate = stats['count'], stats['mean']

    numerator = n_obs * observed_rate + pseudo_positives
    # pseudo_positives / prior pseudo-rows carry a positive rate equal to the prior.
    denominator = n_obs + pseudo_positives / prior
    return numerator / denominator

def target_encode_train_test(var_train, dv_train, var_test, weight = 36):
    """Target-encode a train column and apply the same mapping to a test column.

    The mapping is fitted on ``var_train`` / ``dv_train`` only. Test values
    that have no entry in the mapping fall back to the mean of ``dv_train``.

    Returns
    -------
    tuple of pandas.Series
        ``(encoded_train, encoded_test)``; the test result is cast to float32
        before the fallback fill is applied.
    """
    lookup = target_encode(var_train, dv_train, weight)
    encoded_train = var_train.map(lookup)

    fallback = dv_train.mean()
    encoded_test = var_test.map(lookup).astype(np.float32).fillna(fallback)
    return encoded_train, encoded_test

def reduce_mem_usage(df: pd.DataFrame,
                     verbose: bool = True) -> pd.DataFrame:
    """Downcast wide numeric columns of ``df`` to 32-bit types where the values fit.

    Integer columns become ``int32`` and float columns become ``float32`` when
    the column's min and max lie strictly inside the 32-bit range. Columns that
    are already 32 bits wide or narrower are left untouched, so the function
    never widens a column (and therefore never increases memory usage).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the resulting size and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        if str(col_type)[:3] == 'int':
            target, limits = np.int32, np.iinfo(np.int32)
        else:
            target, limits = np.float32, np.finfo(np.float32)

        # Only ever narrow: casting an int16/float16 column to a 32-bit type
        # would grow the frame instead of shrinking it.
        if col_type.itemsize <= np.dtype(target).itemsize:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # NaN bounds (e.g. an all-NaN column) fail both comparisons, so such
        # columns keep their original dtype.
        if c_min > limits.min and c_max < limits.max:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-sized frame so the ratio is always a finite number.
    reduction = (start_mem - end_mem) / start_mem if start_mem else 0.0

    msg = f'Mem. usage decreased to {end_mem:5.2f} MB ({reduction * 100:.1f} % reduction)'
    if verbose:
        print(msg)

    return df

In [3]:
# Load the train and test CSV files from the relative Data/ directory.
train = pd.read_csv('Data/train_FE2_filter5.csv')
test = pd.read_csv('Data/test_FE2_filter5.csv')

In [4]:
# Tag each frame so the rows can be separated again after joint preprocessing.
# NOTE: this cell overwrites `train` and `test`, so it is not idempotent;
# re-run the CSV loading cell before executing it a second time.
train['dataset_label'] = 0
test['dataset_label'] = 1
# Placeholder target on the test rows so both frames carry the same columns.
test[TARGET_COL] = np.nan
combined = pd.concat([train, test], ignore_index=True)

# Split icu_id into two columns around the value 180; rows on the other side
# of the threshold get NaN in that column.
combined['icu_id1'] = np.where(combined.icu_id <= 180, combined.icu_id, np.nan)
combined['icu_id2'] = np.where(combined.icu_id > 180, combined.icu_id, np.nan)

# NOTE(review): these columns are stringified before the fillna below, so their
# missing values are presumably already text by then and get their own
# category rather than '' -- confirm this is intended.
str_cols = ['icu_id1','icu_id2','apache_2_diagnosis']
combined[str_cols] = combined[str_cols].astype(str)
combined[cat_cols] = combined[cat_cols].fillna('')

# Fit the encoder on train and test together so both share one code per category.
# `np.int` was only an alias of the builtin `int` and no longer exists in
# current NumPy releases, so the builtin is used directly.
cat_encoder = OrdinalEncoder(dtype=int).fit(combined[cat_cols])
combined[cat_cols] = cat_encoder.transform(combined[cat_cols])

# Split back into the two datasets and drop the helper columns.
train = combined[combined.dataset_label == 0].drop('dataset_label',axis=1)
test = combined[combined.dataset_label == 1].drop(['dataset_label',TARGET_COL],axis=1)

In [5]:
# Downcast both frames (train first, then test) and rebind the names to the results.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

Mem. usage decreased to 250.52 MB (49.1 % reduction)
Mem. usage decreased to 19.67 MB (49.1 % reduction)


In [6]:
# Columns removed from the model inputs.
features_to_ignore = ['encounter_id', 'hospital_id']
# Columns that receive a target encoding.
vars_to_encode = ['icu_id', 'apache_3j_diagnosis', 'demo_profile']

# Label vector and feature matrix built from the training frame.
trainY = train[TARGET_COL]
trainX = train.drop(columns=features_to_ignore + [TARGET_COL])

In [55]:
def objective(trial, k=5, repeats=5, seed=666, device='GPU'):
    """Optuna objective: mean validation AUC of a LightGBM model over repeated K-fold CV.

    Reads the notebook-level ``trainX``, ``trainY``, ``vars_to_encode`` and
    ``cat_cols``. Inside every fold the columns in ``vars_to_encode`` are
    target-encoded using the fold's training rows only, and ``icu_id1`` /
    ``icu_id2`` values that occur on just one side of the split are blanked.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to obtain ``lambda_l2``.
    k : int, default 5
        Number of folds per repeat.
    repeats : int, default 5
        Number of K-fold repetitions.
    seed : int, default 666
        ``random_state`` of the fold splitter.
    device : str, default 'GPU'
        Value passed to LightGBM as the ``device`` parameter.

    Returns
    -------
    float
        Mean ROC AUC over all ``k * repeats`` validation folds.
    """
    # Optuna returns the same value for a given name within one trial, so the
    # suggestion is loop-invariant and only needs to be requested once.
    # NOTE(review): newer Optuna releases prefer `suggest_float`; kept as-is to
    # match the version this notebook was run with -- confirm before upgrading.
    lambda_l2 = trial.suggest_uniform('lambda_l2',0,100)

    aucs = np.empty(k*repeats)
    kf = RepeatedKFold(n_splits=k, n_repeats=repeats, random_state=seed)
    for i, (train_index, valid_index) in enumerate(kf.split(trainX,trainY)):
        # The splitter yields positions, so both features and labels must be
        # selected positionally; label-based lookup on trainY is only correct
        # when its index happens to be 0..n-1.
        trainX_cv, trainY_cv = trainX.iloc[train_index,:], trainY.iloc[train_index]
        validX_cv, validY_cv = trainX.iloc[valid_index,:], trainY.iloc[valid_index]

        # Fit the encoding on the fold's training rows only.
        for col in vars_to_encode:
            trainX_cv[col+'_encoded'], validX_cv[col+'_encoded'] = target_encode_train_test(
                trainX_cv[col], trainY_cv, validX_cv[col])

        # Blank out IDs that appear on only one side of the split.
        for col in ['icu_id1','icu_id2']:
            train_only = list(set(trainX_cv[col].unique()) - set(validX_cv[col].unique()))
            valid_only = list(set(validX_cv[col].unique()) - set(trainX_cv[col].unique()))
            # Series.mask upcasts to float when it has to insert NaN; assigning
            # NaN into an integer column through .loc is a deprecated silent upcast.
            trainX_cv[col] = trainX_cv[col].mask(trainX_cv[col].isin(train_only))
            validX_cv[col] = validX_cv[col].mask(validX_cv[col].isin(valid_only))

        # The raw columns are replaced by their encoded versions.
        trainX_cv = trainX_cv.drop(columns=['apache_3j_diagnosis','icu_id'])
        validX_cv = validX_cv.drop(columns=['apache_3j_diagnosis','icu_id'])
        lgtrain = lgb.Dataset(trainX_cv, label=trainY_cv, categorical_feature=cat_cols)
        lgvalid = lgb.Dataset(validX_cv, label=validY_cv, categorical_feature=cat_cols)
        params = {
            'objective':'binary'
            ,'boosting': 'rf'
            ,'verbosity': -1
            ,'metric': 'AUC'
            ,'learning_rate': .04
            ,'early_stopping_rounds': 100
            ,'device': device
            ,'seed': 87707
            ,'bagging_freq': 1
            ,'bagging_fraction': .2
            ,'feature_fraction': .8
            ,'num_leaves': 191
            ,'min_data_in_leaf': 50
            ,'lambda_l2': lambda_l2
        }

        # NOTE(review): `verbose_eval` belongs to the LightGBM API this notebook
        # was run with; confirm it is still accepted before upgrading LightGBM.
        model_cv = lgb.train(params, lgtrain, num_boost_round=2000,
                             valid_sets=lgvalid, verbose_eval=-1)
        preds = model_cv.predict(validX_cv)
        aucs[i] = roc_auc_score(validY_cv, preds)
    return aucs.mean()

In [56]:
# Grid handed to the GridSampler; it currently holds a single lambda_l2 value.
search_space = {
    'lambda_l2': [.2]
}
# Reference score noted for the run with no extratrees: 0.8584292818953829


# Maximise the objective (mean CV AUC) over the grid defined above.
study = optuna.create_study(
    direction='maximize',sampler=optuna.samplers.GridSampler(search_space))

# Alternative samplers tried previously, kept for reference:
# study = optuna.create_study(
#     direction='maximize',sampler=optuna.samplers.RandomSampler())
# study = optuna.create_study(
#     direction='maximize',
#     sampler=optuna.samplers.CmaEsSampler(n_startup_trials=10, consider_pruned_trials=False),
#     pruner=optuna.pruners.HyperbandPruner()
# )
# 4-fold CV with a single repeat per trial; n_trials is an upper bound on the number of trials.
study.optimize(lambda trial: objective(trial, k=4, repeats=1, seed=87007, device='GPU'), n_trials=50)

[32m[I 2021-02-28 18:15:19,366][0m A new study created in memory with name: no-name-29c887e0-f93b-446c-8026-0577e58d0c3e[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[461]	valid_0's auc: 0.858654
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[266]	valid_0's auc: 0.85955
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[215]	valid_0's auc: 0.855642
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[261]	valid_0's auc: 0.859815


[32m[I 2021-02-28 18:18:03,773][0m Trial 0 finished with value: 0.858402167093137 and parameters: {'lambda_l2': 0.2}. Best is trial 0 with value: 0.858402167093137.[0m


In [None]:
# Persist the study object to a pickle file in the working directory.
joblib.dump(study, "lgbm_200iter3.pkl")

In [None]:
# Optuna slice plot for the study.
plot_slice(study)

In [None]:
# Optuna hyperparameter-importance plot for the study.
# NOTE(review): the recorded run has one trial and one parameter; confirm this call works for such a study.
plot_param_importances(study)

In [None]:
# Optuna contour plot for the study.
# NOTE(review): only lambda_l2 is tuned here; confirm a contour plot is meaningful with a single parameter.
plot_contour(study)