In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import scipy
import random

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Widen pandas' display limits so large frames are shown in full rather than truncated
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 999)

In [3]:
# Sanity check: display the path of the Python interpreter running this kernel so the
# reader can confirm the intended virtual environment is active.
# On the original author's machine the expected value was
# '/Users/James/anaconda3/envs/mimic/bin/python'; the absolute path will differ on other
# machines (presumably it should still point into the `mimic` environment — confirm).
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Resolve project locations relative to the notebook's working directory:
# the project root is the parent of the current directory, and the local
# source modules live in its `src` sub-folder.
notebook_parent = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(notebook_parent)
src_folder = os.path.join(project_root, 'src')

In [5]:
# Import src functions
# Put the project's src folder first on the module search path so the local modules
# below can be imported (position 0 means they are found before any other module of
# the same name).
sys.path.insert(0, src_folder)
# NOTE(review): these wildcard imports hide where the helpers used later in this
# notebook (from_s3, final_cleaning, save_pickle) are defined — presumably in one of
# the four modules below; confirm and replace with explicit `from x import name` lines.
from modeling import *
from stats_and_visualisations import *
from s3_storage import *
from model_access import *

In [6]:
# Fetch the training table from S3; the first CSV column is used as the index
s3_source = {
    'bucket': 'mimic-jamesi',
    'filename': 'acute_kidney_failure_train.csv',
    'index_col': 0,
}
train = from_s3(**s3_source)
print('--> Importing done')

--> Importing done


In [7]:
# Split the raw training table into a feature matrix, a target vector and the
# feature names; the two identifier columns and the target column are named
# explicitly for the cleaning helper.
X_train, y_train, feature_names = final_cleaning(
    train=train,
    target='target',
    ids=['subject_id', 'hadm_id'],
)
print('--> Cleaning done')

--> Cleaning done


In [8]:
def train_lgb(X_train, y_train, n_folds, params, eval_metric, early_stopping_rounds):
    """Cross-validate a LightGBM classifier and collect per-fold and overall scores.

    A fresh ``lgb.LGBMClassifier(**params)`` is fitted on every K-fold split with
    early stopping monitored on the held-out fold. Out-of-fold probabilities of the
    positive class are gathered over all folds and scored with ROC AUC.

    Parameters
    ----------
    X_train : array-like
        Feature matrix. It is indexed as ``X_train[row_indices]`` and must expose
        ``.shape`` (assumes a NumPy array or similar — a pandas DataFrame would not
        support this positional indexing; TODO confirm what final_cleaning returns).
    y_train : sequence
        Binary target values aligned positionally with the rows of ``X_train``.
    n_folds : int
        Number of cross-validation folds.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    eval_metric : str
        Metric name passed to ``fit`` and used as the key into ``best_score_``.
    early_stopping_rounds : int
        Early-stopping patience forwarded to ``fit``.

    Returns
    -------
    metrics : pandas.DataFrame
        One row per fold plus a final ``'overall'`` row, with columns ``fold``,
        ``train`` and ``valid``.
    train_auc : float
        Mean of the per-fold best training scores (in units of ``eval_metric``).
    valid_auc : float
        ROC AUC of the out-of-fold predictions over the whole training set.
    """
    # Contiguous (unshuffled) folds, exactly as before. `random_state` is deliberately
    # not passed: it has no effect when shuffle=False, and scikit-learn >= 0.24 raises
    # a ValueError for that combination.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Positional view of the labels so each fold is a single vectorised lookup instead
    # of an `i in indices` scan per element.
    labels = np.asarray(y_train)

    # Out-of-fold validation predictions, filled in fold by fold
    out_of_fold = np.zeros(X_train.shape[0])

    # Best validation / training score of every fold
    valid_scores = []
    train_scores = []

    print('LGB starting')

    for train_indices, valid_indices in k_fold.split(X_train):

        # Training and validation data for this fold
        train_features = X_train[train_indices]
        train_labels = labels[train_indices]
        valid_features = X_train[valid_indices]
        valid_labels = labels[valid_indices]

        model = lgb.LGBMClassifier(**params)

        # The validation fold is listed first in eval_set; the training fold is
        # included so that a training score is recorded as well.
        # NOTE(review): `early_stopping_rounds` and `verbose` were removed from fit()
        # in LightGBM 4.0 in favour of callbacks; kept as-is to match the version this
        # notebook was run with — confirm before upgrading.
        model.fit(train_features, train_labels, eval_metric=eval_metric,
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'], early_stopping_rounds=early_stopping_rounds, verbose=0)

        # Predict the held-out fold with the best iteration found during fitting
        best_iteration = model.best_iteration_
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]

        # Record the best score of each evaluation set
        valid_scores.append(model.best_score_['valid'][eval_metric])
        train_scores.append(model.best_score_['train'][eval_metric])

    # Overall validation score: ROC AUC over all out-of-fold predictions
    valid_auc = roc_auc_score(y_train, out_of_fold)

    # Overall training score: average of the per-fold training scores
    train_auc = np.mean(train_scores)

    # Append the overall scores as the last row of the metrics table
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return metrics, train_auc, valid_auc

In [9]:
# define the grid search parameters
# Each key is an LGBMClassifier keyword argument and each value is the list of
# candidates from which one value per run is drawn at random.
# NOTE(review): LGBMClassifier's `subsample_freq` defaults to 0, which disables bagging,
# so the `subsample` candidates likely have no effect unless `subsample_freq` is also
# set — confirm against the LightGBM documentation.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(10, 150)),
    'learning_rate': list(np.linspace(0.001, 0.5)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    # This key used to be listed twice; in a dict literal the later entry silently
    # wins, so only the effective definition is kept (at the original key position,
    # which preserves the column order of the results table).
    'min_data_in_leaf': list(np.arange(1, 200, 3)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.001, 1)),
    'subsample': list(np.linspace(0.5, 1)),
    'is_unbalance': [True, False],
    'min_split_gain': list(np.linspace(0.001, 1)),
    'n_estimators': list(np.arange(100, 20100, 1000))
}

In [10]:
def tune_lgb(X_train, y_train, param_grid, runs, n_folds=5, eval_metric='auc',
             early_stopping_rounds=250, seed=None):
    """Random search over ``param_grid`` using cross-validated LightGBM runs.

    For each run one candidate value is drawn per hyperparameter, the resulting
    configuration is evaluated with ``train_lgb`` and the scores are recorded.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded unchanged to ``train_lgb``.
    param_grid : dict
        Maps each hyperparameter name to a sequence of candidate values.
    runs : int
        Number of random configurations to evaluate.
    n_folds : int, default 5
        Number of cross-validation folds used for every run.
    eval_metric : str, default 'auc'
        Evaluation metric forwarded to ``train_lgb``.
    early_stopping_rounds : int, default 250
        Early-stopping patience forwarded to ``train_lgb``.
    seed : int or None, default None
        When given, the hyperparameter draws come from a dedicated
        ``random.Random(seed)`` so the search is repeatable; when None the global
        ``random`` module state is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per run: a column per hyperparameter, then ``params`` (a one-element
        list wrapping the full parameter dict, so callers read it as
        ``row['params'][0]``), ``training_score`` and ``valid_score``.
    """
    # Output columns: the sampled hyperparameters followed by the bookkeeping columns
    df_cols = list(param_grid.keys()) + ['params', 'training_score', 'valid_score']

    # The random module itself offers the same `sample` API as a Random instance
    rng = random.Random(seed) if seed is not None else random

    # Rows are collected as plain dicts and turned into a DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and row-by-row growth is quadratic.
    records = []

    for run in range(1, runs + 1):

        # Select the random parameters
        random_params = {k: rng.sample(v, 1)[0] for k, v in param_grid.items()}

        _, train_score, valid_score = train_lgb(X_train=X_train,
                                                y_train=y_train,
                                                n_folds=n_folds,
                                                params=random_params,
                                                eval_metric=eval_metric,
                                                early_stopping_rounds=early_stopping_rounds)

        print("Run {},  Training score: {},  Valid score: {}".format(run, train_score, valid_score))

        record = dict(random_params)
        # Wrapped in a list to keep the established `row['params'][0]` access pattern
        record['params'] = [random_params]
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    return pd.DataFrame(records, columns=df_cols)

In [11]:
# Evaluate 25 random configurations and rank them best-first by validation score;
# the index is rebuilt so that row 0 is the best run.
runs_df = (
    tune_lgb(X_train, y_train, param_grid, runs=25)
    .sort_values(by='valid_score', ascending=False)
    .reset_index(drop=True)
)
runs_df

LGB starting
Run 1,  Training score: 0.9802229696528351,  Valid score: 0.874331634350829
LGB starting
Run 2,  Training score: 0.9018096964387459,  Valid score: 0.8774980453585041
LGB starting
Run 3,  Training score: 0.9445557661317936,  Valid score: 0.8854588605454601
LGB starting
Run 4,  Training score: 0.948492139528214,  Valid score: 0.882074828275614
LGB starting
Run 5,  Training score: 0.9813837827999,  Valid score: 0.861826216815057
LGB starting
Run 6,  Training score: 0.996043514546461,  Valid score: 0.8837741354262201
LGB starting
Run 7,  Training score: 0.9679029502670546,  Valid score: 0.886946011313925
LGB starting
Run 8,  Training score: 0.8699371648164664,  Valid score: 0.8595269361772352
LGB starting
Run 9,  Training score: 0.9791780338794347,  Valid score: 0.8752979171232007
LGB starting
Run 10,  Training score: 0.935416035998837,  Valid score: 0.8754484448087698
LGB starting
Run 11,  Training score: 0.9365125214852211,  Valid score: 0.8768169914482744
LGB starting
Run 1

Unnamed: 0,boosting_type,num_leaves,learning_rate,subsample_for_bin,min_data_in_leaf,reg_alpha,reg_lambda,colsample_bytree,subsample,is_unbalance,min_split_gain,n_estimators,params,training_score,valid_score
0,gbdt,44,0.031551,260000,64,0.102041,0.489796,0.510694,0.693878,False,0.347592,19100,"[{'boosting_type': 'gbdt', 'num_leaves': 44, '...",0.967903,0.886946
1,gbdt,38,0.163939,20000,199,0.122449,0.795918,0.36798,0.5,True,0.796122,4100,"[{'boosting_type': 'gbdt', 'num_leaves': 38, '...",0.944556,0.885459
2,dart,14,0.19449,140000,46,0.857143,0.510204,0.755347,0.612245,True,0.959224,1100,"[{'boosting_type': 'dart', 'num_leaves': 14, '...",0.939423,0.885305
3,gbdt,63,0.133388,260000,169,0.591837,0.122449,0.796122,0.897959,True,0.63302,12100,"[{'boosting_type': 'gbdt', 'num_leaves': 63, '...",0.958778,0.884906
4,gbdt,90,0.123204,260000,187,0.428571,0.285714,0.836898,0.765306,False,0.327204,17100,"[{'boosting_type': 'gbdt', 'num_leaves': 90, '...",0.945121,0.884758
5,dart,104,0.11302,180000,22,0.632653,0.204082,0.694184,0.857143,False,0.266041,15100,"[{'boosting_type': 'dart', 'num_leaves': 104, ...",0.996044,0.883774
6,dart,37,0.163939,20000,133,0.714286,0.0408163,0.225265,0.969388,False,0.245653,6100,"[{'boosting_type': 'dart', 'num_leaves': 37, '...",0.96168,0.883035
7,dart,13,0.438898,280000,79,0.816327,0.857143,0.164102,0.744898,True,0.531082,14100,"[{'boosting_type': 'dart', 'num_leaves': 13, '...",0.948492,0.882075
8,dart,90,0.30651,40000,187,0.122449,0.510204,0.266041,0.94898,False,0.694184,8100,"[{'boosting_type': 'dart', 'num_leaves': 90, '...",0.969175,0.881682
9,dart,74,0.337061,200000,40,0.428571,0.897959,0.327204,0.785714,False,0.714571,6100,"[{'boosting_type': 'dart', 'num_leaves': 74, '...",0.995032,0.879963


In [12]:
# The results table is sorted best-first, so row 0 holds the winning run. Its `params`
# cell is a one-element list wrapping the parameter dict, hence the trailing [0].
top_run_params = runs_df.loc[0, 'params']
best_params = top_run_params[0]

# Refit a classifier with the winning hyperparameters on the full training set.
# NOTE(review): this fit has no eval_set or early stopping, so the model trains for the
# full sampled `n_estimators`, whereas the cross-validated score was obtained with
# early stopping — confirm this is intended.
model = lgb.LGBMClassifier(**best_params)
model.fit(X=X_train, y=y_train, eval_metric='auc')

# Persist the fitted model under the name 'light_gbm'
save_pickle(model, 'light_gbm')