In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import random
import math
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import statistics
import datetime as dt

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

import lightgbm as lgb

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras import backend as K

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# Let pandas render up to 999 rows / columns before truncating displayed frames
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
# Sanity check: display the path of the Python interpreter this kernel is running under.
# It is expected to be the project's virtual environment; on the original author's
# machine that is '/Users/James/anaconda3/envs/mimic/bin/python'.
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Locate the project root (the parent of the current working directory) and the
# src/ tree beneath it, including one folder per pipeline stage.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')

src_preparation_folder, src_processing_folder, src_modeling_folder = [
    os.path.join(src_folder, stage)
    for stage in ('preparation', 'processing', 'modeling')
]

In [5]:
# # Import src functions
# sys.path.insert(0, src_preparation_folder)
# from import_data import get_table
# from import_data import get_data_simple
# from import_data import get_patient_admissions_diagnoses
# from import_data import get_admission_data
# from import_data import get_chartevents
# from import_data import get_labevents
# from extract_codes import find_ndc_codes

# sys.path.insert(0, src_processing_folder)
# from stats import plot_KDE
# from stats import plot_perc_bar_chart
# from stats import compare_groups
# from stats import graph_comparisons
# from patient_selection import select_test_groups
# from clean import replace_itemid_with_label
# from clean import find_populated_cols

# sys.path.insert(0, src_modeling_folder)
# from models import train_lgb

In [6]:
# Load the train / test splits from <project_root>/data; the first CSV column is the index
train = pd.read_csv(
    os.path.join(project_root, 'data', 'acute_respiratory_failure_train.csv'),
    index_col=0,
)
test = pd.read_csv(
    os.path.join(project_root, 'data', 'acute_respiratory_failure_test.csv'),
    index_col=0,
)

In [7]:
def final_cleaning(train, test, random_state=None):
    """Shuffle, split, impute and scale the train/test frames into model-ready arrays.

    Both frames must contain the columns 'subject_id', 'hadm_id' and 'target';
    every other column is treated as a feature. The imputer and the scaler are
    fitted on the training features only and then applied to both sets, so no
    test-set statistics leak into the transformation.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw training and test frames.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for both shuffles. The default
        (None) keeps the original unseeded behaviour; pass an integer to get
        the same row order -- and therefore the same downstream contiguous
        cross-validation folds -- on every run.

    Returns
    -------
    X_train, X_test
        Imputed feature matrices scaled to the [0, 1] range of the training data.
    y_train, y_test : numpy.ndarray
        1-D arrays holding the 'target' column of each frame, in shuffled order.

    Side effects
    ------------
    Prints the shape of each of the four returned arrays.
    """
    id_and_label_cols = ['subject_id', 'hadm_id', 'target']

    # Shuffle rows and discard the old index so positions are 0..n-1
    train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test = test.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split features and labels
    X_train = train.drop(columns=id_and_label_cols)
    y_train = np.array(train['target'].tolist())

    X_test = test.drop(columns=id_and_label_cols)
    y_test = np.array(test['target'].tolist())

    # Impute missing values with the per-feature median of the training set.
    # NOTE(review): `Imputer` comes from sklearn.preprocessing, which only ships it in
    # older scikit-learn releases (later replaced by sklearn.impute.SimpleImputer) --
    # confirm the pinned scikit-learn version before upgrading the environment.
    imputer = Imputer(strategy='median')
    imputer.fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)

    # Scale each feature to 0-1 using the training min/max; test values outside the
    # training range will therefore fall outside [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    return X_train, X_test, y_train, y_test

In [8]:
# Turn the raw frames into shuffled, imputed and scaled arrays (shapes are printed)
X_train, X_test, y_train, y_test = final_cleaning(train=train, test=test)

(22732, 42)
(5684, 42)
(22732,)
(5684,)




In [9]:
def train_lgb(X_train, y_train, n_folds, params, eval_metric, early_stopping_rounds):
    """Cross-validate a LightGBM classifier and report per-fold and overall scores.

    The rows are split into ``n_folds`` contiguous, unshuffled folds. For every
    fold a fresh ``lgb.LGBMClassifier(**params)`` is fitted with early stopping
    on the held-out part, and its positive-class probabilities for that part
    are stored as out-of-fold predictions.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature matrix; rows are selected by integer position.
    y_train : array-like
        Binary labels aligned with the rows of ``X_train``.
    n_folds : int
        Number of cross-validation folds.
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier``.
    eval_metric : str
        Metric passed to ``fit``; also the key used to read the best scores
        back from ``model.best_score_``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``fit``.

    Returns
    -------
    metrics : pandas.DataFrame
        One row per fold plus a final 'overall' row, with columns
        'fold', 'train' and 'valid'.
    train_auc : float
        Mean of the per-fold best training scores (measured with ``eval_metric``).
    valid_auc : float
        ROC AUC of the out-of-fold predictions over all rows. This is always
        ROC AUC, whatever ``eval_metric`` is.

    Side effects
    ------------
    Prints a start banner and the metrics frame; LightGBM logs every 500 rounds.
    """
    # Labels are selected by position below, which needs an ndarray (lists are accepted too)
    y_train = np.asarray(y_train)

    # Contiguous, unshuffled folds. No random_state is passed: it has no effect when
    # shuffle=False, and newer scikit-learn versions reject that combination.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Out-of-fold validation predictions, filled in fold by fold
    out_of_fold = np.zeros(X_train.shape[0])

    # Best validation / training score of each fold
    valid_scores = []
    train_scores = []

    print('LGB starting')

    for train_indices, valid_indices in k_fold.split(X_train):

        # Direct positional indexing replaces a per-element membership scan of the index array
        train_features = X_train[train_indices]
        train_labels = y_train[train_indices]
        valid_features = X_train[valid_indices]
        valid_labels = y_train[valid_indices]

        model = lgb.LGBMClassifier(**params)

        # Early stopping monitors the eval sets; 'valid' is listed first
        model.fit(train_features, train_labels, eval_metric=eval_metric,
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'], early_stopping_rounds=early_stopping_rounds, verbose=500)

        best_iteration = model.best_iteration_

        # Probability of the positive class, predicted with the best iteration only
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]

        valid_scores.append(model.best_score_['valid'][eval_metric])
        train_scores.append(model.best_score_['train'][eval_metric])

    # Overall validation score: ROC AUC over every out-of-fold prediction
    valid_auc = roc_auc_score(y_train, out_of_fold)

    # Overall training score: average of the per-fold training scores
    train_auc = np.mean(train_scores)

    # The final 'overall' row carries the aggregate scores
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    print(metrics)

    return metrics, train_auc, valid_auc

In [10]:
# Candidate values for the random hyperparameter search: each key is a LightGBM
# classifier argument and each value is the list one candidate is drawn from.
# np.linspace is used with its default of 50 evenly spaced points.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(10, 150)),
    'learning_rate': list(np.linspace(0.001, 0.5)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    # This key used to be listed twice; in a dict literal the later entry silently
    # wins, so only the effective value list is kept (same key position as before).
    'min_data_in_leaf': list(np.arange(1, 200, 3)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.001, 1)),
    'subsample': list(np.linspace(0.5, 1)),
    'is_unbalance': [True, False],
    'min_split_gain': list(np.linspace(0.001, 1)),
    'n_estimators': list(np.arange(100, 20100, 1000))
}

In [11]:
def tune_lgb(X_train, y_train, param_grid, runs, n_folds=4, early_stopping_rounds=250):
    """Random hyperparameter search for the LightGBM classifier.

    For each run, one value per hyperparameter is drawn at random from
    ``param_grid`` and the resulting configuration is cross-validated with
    ``train_lgb`` using 'auc' as the evaluation metric.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded unchanged to ``train_lgb``.
    param_grid : dict
        Maps each hyperparameter name to a sequence of candidate values.
    runs : int
        Number of random configurations to evaluate.
    n_folds : int, optional
        Cross-validation folds per configuration (default 4).
    early_stopping_rounds : int, optional
        Early-stopping patience per fold (default 250).

    Returns
    -------
    pandas.DataFrame
        One row per run, indexed 0..runs-1, with one column per hyperparameter
        followed by 'training_score' and 'valid_score'. Empty (columns only)
        when ``runs`` is 0.

    Side effects
    ------------
    Prints a banner for every run, on top of whatever ``train_lgb`` prints.
    """
    df_cols = list(param_grid.keys()) + ['training_score', 'valid_score']

    # Collect one record per run and build the frame once at the end, instead of
    # appending single-row frames (quadratic, and leaves every row with index 0).
    records = []

    for run in range(1, runs + 1):

        # Draw one candidate value for every hyperparameter
        random_params = {name: random.choice(candidates)
                         for name, candidates in param_grid.items()}

        print('=========')
        print('RUN IS ' + str(run))
        print('=========')

        # The per-fold metrics frame is already printed by train_lgb and not needed here
        _, train_score, valid_score = train_lgb(X_train=X_train,
                                                y_train=y_train,
                                                n_folds=n_folds,
                                                params=random_params,
                                                eval_metric='auc',
                                                early_stopping_rounds=early_stopping_rounds)

        record = dict(random_params)
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    # Passing `columns` fixes the column order and keeps the header when there are no runs
    return pd.DataFrame(records, columns=df_cols)

In [12]:
# Evaluate five random LightGBM configurations, then show them ranked from best
# to worst by their validation score
runs_df = tune_lgb(X_train=X_train, y_train=y_train, param_grid=param_grid, runs=5)
runs_df.sort_values('valid_score', ascending=False)

RUN IS 1
LGB starting
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[16]	valid's auc: 0.794721	valid's binary_logloss: 0.47985	train's auc: 0.93719	train's binary_logloss: 0.366081
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.793102	valid's binary_logloss: 0.463278	train's auc: 0.96019	train's binary_logloss: 0.316659
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.784947	valid's binary_logloss: 0.473502	train's auc: 0.962549	train's binary_logloss: 0.309117
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[15]	valid's auc: 0.798153	valid's binary_logloss: 0.478801	train's auc: 0.931497	train's binary_logloss: 0.376605
      fold     train     valid
0        0  0.937190  0.794721
1        1  0.960190  0.793102
2        2  0.962549  0.784947
3    

Unnamed: 0,boosting_type,num_leaves,learning_rate,subsample_for_bin,min_data_in_leaf,reg_alpha,reg_lambda,colsample_bytree,subsample,is_unbalance,min_split_gain,n_estimators,training_score,valid_score
0,gbdt,77,0.245408,140000,118,0.326531,0.346939,0.877673,0.918367,False,0.775735,9100,0.924634,0.801788
0,gbdt,56,0.428714,40000,88,0.755102,0.0816327,0.408755,0.581633,True,0.225265,18100,0.947857,0.791983
0,goss,117,0.398163,160000,145,0.489796,0.0612245,0.571857,0.693878,False,0.531082,14100,0.852365,0.788302
0,gbdt,40,0.001,20000,25,0.530612,0.469388,0.36798,0.765306,True,0.81651,3100,0.850276,0.723288
0,dart,107,0.5,120000,61,0.285714,0.918367,0.18449,0.959184,False,0.266041,18100,0.931227,0.648019
