In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import scipy
import random

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Raise pandas' display limits so wide/long frames are rendered without truncation
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
# Sanity check of the active virtual environment: display the path of the Python
# interpreter running this kernel.
# Expected value: '/Users/James/anaconda3/envs/mimic/bin/python'
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Resolve the project layout relative to the current working directory:
# the project root is the parent of the cwd, and the source code lives in <root>/src
project_root = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')

In [5]:
# Import src functions
# Put the project's src folder at the front of the module search path so the
# project-local modules below can be imported by their bare names.
sys.path.insert(0, src_folder)
# NOTE(review): star imports hide where names come from. `final_cleaning` and
# `from_s3` (used in later cells) presumably come from these modules -- confirm
# and consider importing them explicitly.
from modeling import *
from stats_and_visualisations import *
from s3_storage import *

In [6]:
# Load the train and test CSVs from the project's S3 bucket (train first, then test),
# using the first column of each file as the index
train, test = (
    from_s3(bucket='mimic-jamesi', filename=csv_name, index_col=0)
    for csv_name in ('acute_kidney_failure_train.csv',
                     'acute_kidney_failure_test.csv')
)
print('--> Importing done')

--> Importing done


In [7]:
# Run the project's final cleaning step on both splits and unpack its five outputs:
# feature matrices, targets, and the list of feature names
(X_train, X_test,
 y_train, y_test,
 feature_names) = final_cleaning(
    ids=['subject_id', 'hadm_id'],
    target='target',
    train=train,
    test=test,
)
print('--> Cleaning done')

--> Cleaning done


In [8]:
def train_lgb(X_train, y_train, n_folds, params, eval_metric, early_stopping_rounds):
    """Cross-validate a LightGBM classifier and collect per-fold scores.

    Parameters
    ----------
    X_train : array-like
        Feature matrix. Rows are selected with ``X_train[indices]``, so it must
        support positional integer-array row indexing (e.g. a numpy array).
    y_train : sequence
        Labels, aligned by position with the rows of ``X_train``.
    n_folds : int
        Number of K-fold splits.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    eval_metric : str
        Metric name passed to ``fit`` and used to look up ``best_score_``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``fit`` (monitored on the fold's
        validation data, which is listed first in ``eval_set``).

    Returns
    -------
    metrics : pandas.DataFrame
        Columns ``fold``, ``train`` and ``valid``; one row per fold plus a final
        ``'overall'`` row.
    train_auc : float
        Mean of the per-fold best training scores for ``eval_metric``.
    valid_auc : float
        ROC AUC of the out-of-fold predictions over the whole training set.
    """
    # Contiguous, unshuffled folds. The original also passed random_state=50, which
    # has no effect when shuffle=False and makes newer scikit-learn raise ValueError.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Positional view of the labels so each fold is a single array lookup instead
    # of an O(n^2) membership scan over the index arrays.
    labels = np.asarray(y_train)

    # Out-of-fold validation predictions, filled fold by fold
    out_of_fold = np.zeros(X_train.shape[0])

    # Per-fold best validation / training scores
    valid_scores = []
    train_scores = []

    print('LGB starting')

    for train_indices, valid_indices in k_fold.split(X_train):

        # Training data for the fold
        train_features = X_train[train_indices]
        train_labels = labels[train_indices]
        # Validation data for the fold
        valid_features = X_train[valid_indices]
        valid_labels = labels[valid_indices]

        model = lgb.LGBMClassifier(**params)

        # The validation set is first in eval_set; the training set is included
        # so its score is recorded in best_score_ as well.
        model.fit(train_features, train_labels, eval_metric = eval_metric,
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], early_stopping_rounds = early_stopping_rounds, verbose=0)

        # Predict the held-out rows using the best iteration found for this fold
        best_iteration = model.best_iteration_
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best scores for this fold
        valid_scores.append(model.best_score_['valid'][eval_metric])
        train_scores.append(model.best_score_['train'][eval_metric])

    # Overall validation score: AUC over all out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Overall training score: mean of the per-fold training scores
    train_auc = np.mean(train_scores)

    # Append the overall scores as a final row after the per-fold rows
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return metrics, train_auc, valid_auc

In [9]:
# Define the random-search space: each key is an LGBMClassifier parameter and each
# value is the list of candidates one value is drawn from per run.
# 'min_data_in_leaf' used to be listed twice; the later entry silently replaced the
# earlier one, so only the effective candidate list is kept here (same key position,
# same values).
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` > 0; it is not set here, so tuning `subsample` may be a no-op -- confirm.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(10, 150)),
    'learning_rate': list(np.linspace(0.001, 0.5)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_data_in_leaf': list(np.arange(1, 200, 3)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.001, 1)),
    'subsample': list(np.linspace(0.5, 1)),
    'is_unbalance': [True, False],
    'min_split_gain': list(np.linspace(0.001, 1)),
    'n_estimators': list(np.arange(100, 20100, 1000))
}

In [10]:
def tune_lgb(X_train, y_train, param_grid, runs, seed=None):
    """Random search over ``param_grid`` using ``train_lgb`` for scoring.

    Each run draws one candidate per parameter, cross-validates it with
    ``train_lgb`` (5 folds, 'auc', early stopping after 250 rounds) and records
    the result.

    Parameters
    ----------
    X_train, y_train
        Training data, passed through to ``train_lgb`` unchanged.
    param_grid : dict
        Maps each parameter name to a list of candidate values.
    runs : int
        Number of random configurations to evaluate.
    seed : int, optional
        When given, parameter sampling uses a private ``random.Random(seed)`` so
        the drawn configurations are repeatable. When ``None`` (default) the
        module-level ``random`` generator is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per run: a column per parameter, then ``params`` (a one-element
        list wrapping the sampled dict), ``training_score`` and ``valid_score``.
        Rows are indexed 0..runs-1 in execution order.
    """
    df_cols = list(param_grid.keys()) + ['params', 'training_score', 'valid_score']

    sampler = random if seed is None else random.Random(seed)

    # Collect plain records and build the frame once at the end; DataFrame.append
    # was removed in pandas 2.0 and row-by-row growth is quadratic anyway.
    records = []

    for run in range(1, runs + 1):

        # Select the random parameters
        random_params = {k: sampler.choice(v) for k, v in param_grid.items()}

        _, train_score, valid_score = train_lgb(X_train=X_train,
                                                y_train=y_train,
                                                n_folds = 5,
                                                params = random_params,
                                                eval_metric = 'auc',
                                                early_stopping_rounds = 250)

        print("Run {},  Training score: {},  Valid score: {}".format(run, train_score, valid_score))

        record = dict(random_params)
        # Kept wrapped in a list: callers unwrap it with `...['params'][0]`
        record['params'] = [random_params]
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    return pd.DataFrame(records, columns=df_cols)

In [11]:
# Evaluate 25 random configurations, then rank them by validation score
# (best first) with a fresh 0..n-1 index
runs_df = (
    tune_lgb(X_train, y_train, param_grid, runs=25)
    .sort_values(by='valid_score', ascending=False)
    .reset_index(drop=True)
)
runs_df

LGB starting
Run 1,  Training score: 0.8953082649526627,  Valid score: 0.8702274616943652
LGB starting
Run 2,  Training score: 0.9295107951594029,  Valid score: 0.859947194116766
LGB starting
Run 3,  Training score: 0.9350825579510399,  Valid score: 0.8809309895418508
LGB starting
Run 4,  Training score: 0.9247505035412922,  Valid score: 0.8864511922618401
LGB starting
Run 5,  Training score: 0.9295282454380167,  Valid score: 0.8870675479868665
LGB starting
Run 6,  Training score: 0.9680055764738325,  Valid score: 0.8880608610307776
LGB starting
Run 7,  Training score: 0.9998685966695728,  Valid score: 0.8816989165595445
LGB starting
Run 8,  Training score: 0.9875593568041675,  Valid score: 0.8860167274877753
LGB starting
Run 9,  Training score: 0.904579218763465,  Valid score: 0.821681193463827
LGB starting
Run 10,  Training score: 0.9022213599322072,  Valid score: 0.8766355741225736
LGB starting
Run 11,  Training score: 0.9527079041291207,  Valid score: 0.8806072468681116
LGB startin

Unnamed: 0,boosting_type,num_leaves,learning_rate,subsample_for_bin,min_data_in_leaf,reg_alpha,reg_lambda,colsample_bytree,subsample,is_unbalance,min_split_gain,n_estimators,params,training_score,valid_score
0,gbdt,31,0.0111837,220000,85,0.0408163,0.836735,0.571857,0.530612,True,0.673796,12100,"[{'boosting_type': 'gbdt', 'num_leaves': 31, '...",0.959954,0.88942
1,dart,31,0.163939,260000,193,0.285714,0.122449,0.857286,0.795918,True,0.469918,12100,"[{'boosting_type': 'dart', 'num_leaves': 31, '...",0.968006,0.888061
2,dart,143,0.0417347,180000,58,0.122449,0.163265,0.490306,0.908163,True,0.388367,7100,"[{'boosting_type': 'dart', 'num_leaves': 143, ...",0.993739,0.887108
3,goss,127,0.0519184,20000,166,0.979592,0.530612,1.0,0.877551,False,0.714571,4100,"[{'boosting_type': 'goss', 'num_leaves': 127, ...",0.929528,0.887068
4,goss,13,0.0519184,40000,52,0.22449,0.44898,0.612633,0.571429,True,0.164102,6100,"[{'boosting_type': 'goss', 'num_leaves': 13, '...",0.924751,0.886451
5,gbdt,118,0.0824694,20000,73,0.122449,0.979592,0.81651,0.530612,False,0.959224,19100,"[{'boosting_type': 'gbdt', 'num_leaves': 118, ...",0.969808,0.886242
6,dart,99,0.0111837,40000,52,0.897959,0.44898,0.714571,0.918367,True,0.36798,19100,"[{'boosting_type': 'dart', 'num_leaves': 99, '...",0.987559,0.886017
7,gbdt,24,0.204673,240000,88,0.653061,0.244898,0.796122,0.632653,False,0.898061,19100,"[{'boosting_type': 'gbdt', 'num_leaves': 24, '...",0.944936,0.885028
8,dart,132,0.337061,60000,19,0.653061,1.0,0.531082,0.867347,False,0.143714,100,"[{'boosting_type': 'dart', 'num_leaves': 132, ...",0.999869,0.881699
9,dart,45,0.479633,220000,139,0.571429,0.591837,0.796122,0.734694,True,0.123327,17100,"[{'boosting_type': 'dart', 'num_leaves': 45, '...",0.935083,0.880931


In [12]:
# Take the `params` cell of the row labelled 0 and unwrap its first element
# (the cell holds a list wrapping the parameter dict, hence the trailing [0]).
best_params=runs_df.loc[0, 'params'][0]
best_params

{'boosting_type': 'gbdt',
 'num_leaves': 31,
 'learning_rate': 0.011183673469387756,
 'subsample_for_bin': 220000,
 'min_data_in_leaf': 85,
 'reg_alpha': 0.04081632653061224,
 'reg_lambda': 0.836734693877551,
 'colsample_bytree': 0.5718571428571428,
 'subsample': 0.5306122448979592,
 'is_unbalance': True,
 'min_split_gain': 0.673795918367347,
 'n_estimators': 12100}

In [13]:
# Create the model from the selected hyperparameters
model = lgb.LGBMClassifier(**best_params)

# Train on the full training set. No eval_set is supplied, so nothing is evaluated
# during training and no early stopping takes place. The former `verbose=500`
# argument therefore had nothing to print, and LightGBM >= 4.0 no longer accepts it.
# NOTE(review): the tuning runs used early stopping on a validation fold, while this
# fit has none -- the cross-validated scores may not describe this model exactly.
model.fit(X_train, y_train, eval_metric = 'auc')

# Record the best iteration. Without early stopping this does not point at an
# early-stopped iteration (presumably None on older LightGBM -- confirm).
best_iteration = model.best_iteration_

# Record feature importances
feature_importances = model.feature_importances_

# Predicted class probabilities on the training and test sets
train_predict = model.predict_proba(X_train, num_iteration = best_iteration)
test_predict = model.predict_proba(X_test, num_iteration = best_iteration)

# Get AUC from the second probability column
train_score = roc_auc_score(y_train, train_predict[:,1])
test_score = roc_auc_score(y_test, test_predict[:,1])

print('Train', train_score)
print('Test', test_score)

Train 0.9812623431787898
Test 0.8795316572656742


In [14]:
# Pair each feature name with its importance and list the most important first
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)
feature_importances_df

Unnamed: 0,feature,importance
10,Creatinine,2234
28,Phosphorus,1662
0,Admission weight,1659
29,Platelet Count,1638
13,HR,1604
27,PTT,1573
4,BP systolic,1548
26,PO2,1536
31,RDW,1502
3,BP mean,1483
