In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import scipy
import random

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Let pandas render up to 999 rows and 999 columns before truncating the display
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
# Sanity check: display the path of the Python interpreter running this kernel.
# On the original author's machine this was the 'mimic' conda environment
# ('/Users/James/anaconda3/envs/mimic/bin/python'); the path will differ elsewhere.
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Resolve the project root as the parent of the current working directory,
# then point at its 'src' sub-folder.
project_root = os.path.abspath(os.pardir)
src_folder = os.path.join(project_root, 'src')

In [5]:
# Import src functions
# Prepend the project's src folder to the module search path so the local
# modules below can be imported from this notebook.
sys.path.insert(0, src_folder)
# NOTE(review): star imports hide where names such as `from_s3` and
# `final_cleaning` (used in later cells) are defined -- presumably `s3_storage`
# and `modeling` respectively; confirm and switch to explicit imports.
from modeling import *
from stats_and_visualisations import *
from s3_storage import *

In [6]:
# Fetch the train and test CSVs (in that order) from the same S3 bucket,
# using the first column of each file as the index.
train, test = (
    from_s3(bucket='mimic-jamesi', filename=csv_name, index_col=0)
    for csv_name in ('acute_kidney_failure_train.csv',
                     'acute_kidney_failure_test.csv')
)
print('--> Importing done')

--> Importing done


In [7]:
# Run the project's final_cleaning step on both splits; it hands back the
# feature/label sets for train and test plus the list of feature names.
(X_train, X_test, y_train, y_test, feature_names) = final_cleaning(
    ids=['subject_id', 'hadm_id'],
    target='target',
    train=train,
    test=test,
)
print('--> Cleaning done')

--> Cleaning done


In [8]:
def train_lgb(X_train, y_train, n_folds, params, eval_metric, early_stopping_rounds):
    """Cross-validate a LightGBM classifier and report per-fold and overall scores.

    Parameters
    ----------
    X_train : array-like
        Training features. Rows are selected with integer index arrays
        (``X_train[indices]``), so this is presumably a NumPy array -- confirm
        against ``final_cleaning``.
    y_train : sequence
        Binary training labels, positionally aligned with the rows of ``X_train``.
    n_folds : int
        Number of cross-validation folds.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    eval_metric : str
        Metric name passed to ``fit`` and used to look up ``best_score_``.
    early_stopping_rounds : int
        Patience passed to ``fit``.

    Returns
    -------
    metrics : pandas.DataFrame
        One row per fold plus a final ``'overall'`` row, with columns
        ``fold``, ``train`` and ``valid``.
    train_auc : float
        Mean of the per-fold best training scores.
    valid_auc : float
        ROC AUC of the stitched out-of-fold predictions against ``y_train``.
    """
    # Folds are deliberately contiguous (shuffle=False). A random_state is not
    # passed: it has no effect without shuffling, and scikit-learn >= 0.24
    # raises ValueError for that combination.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Positional view of the labels so each fold is one vectorised lookup
    # instead of a Python-level membership scan per row.
    labels = np.asarray(y_train)

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(X_train.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    print('LGB starting')

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(X_train):

        # Training data for the fold
        train_features = X_train[train_indices]
        train_labels = labels[train_indices]
        # Validation data for the fold
        valid_features = X_train[valid_indices]
        valid_labels = labels[valid_indices]

        # Create the model
        model = lgb.LGBMClassifier(**params)

        # Train the model.
        # NOTE(review): the `early_stopping_rounds` and `verbose` fit arguments
        # were removed in LightGBM 4.0 in favour of callbacks; this call targets
        # the older API the notebook was run with.
        model.fit(train_features, train_labels, eval_metric=eval_metric,
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'], early_stopping_rounds=early_stopping_rounds,
                  verbose=500)

        # Record the best iteration
        best_iteration = model.best_iteration_

        # Record the out of fold predictions (probability of the positive class)
        out_of_fold[valid_indices] = model.predict_proba(valid_features,
                                                         num_iteration=best_iteration)[:, 1]

        # Record the best score on each evaluation set
        valid_scores.append(model.best_score_['valid'][eval_metric])
        train_scores.append(model.best_score_['train'][eval_metric])

    # Overall validation score, computed on the pooled out-of-fold predictions
    valid_auc = roc_auc_score(y_train, out_of_fold)

    # Overall training score (average across folds)
    train_auc = np.mean(train_scores)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)

    # Fold labels: 0..n_folds-1 followed by the summary row
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    print(metrics)

    return metrics, train_auc, valid_auc

In [9]:
# Define the random-search space: each key is an LGBMClassifier parameter and
# each value is the list of candidates one is drawn from per run.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(10, 150)),
    'learning_rate': list(np.linspace(0.001, 0.5)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    # This key used to be listed twice; Python silently kept only the later
    # value, so that value is stated once here (same key position as before).
    'min_data_in_leaf': list(np.arange(1, 200, 3)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.001, 1)),
    'subsample': list(np.linspace(0.5, 1)),
    'is_unbalance': [True, False],
    'min_split_gain': list(np.linspace(0.001, 1)),
    'n_estimators': list(np.arange(100, 20100, 1000))
}

In [10]:
def tune_lgb(X_train, y_train, param_grid, runs, n_folds=4, eval_metric='auc',
             early_stopping_rounds=250, seed=None):
    """Random search over ``param_grid`` using cross-validated LightGBM runs.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded unchanged to ``train_lgb``.
    param_grid : dict
        Maps each hyperparameter name to a list of candidate values; one value
        per name is drawn uniformly at random for every run.
    runs : int
        Number of random configurations to evaluate.
    n_folds : int, default 4
        Number of cross-validation folds used for each configuration.
    eval_metric : str, default 'auc'
        Metric passed to ``train_lgb``.
    early_stopping_rounds : int, default 250
        Early-stopping patience passed to ``train_lgb``.
    seed : int or None, default None
        When given, candidates are drawn from a private ``random.Random(seed)``
        so the search is reproducible. When None, the global ``random`` module
        state is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per run, with one column per hyperparameter followed by
        ``params`` (a one-element list wrapping the sampled dict),
        ``training_score`` and ``valid_score``.
    """
    # Output columns: the hyperparameters first, then the summary columns
    df_cols = list(param_grid.keys()) + ['params', 'training_score', 'valid_score']

    rng = random if seed is None else random.Random(seed)

    # Collect plain records and build the frame once at the end; this avoids
    # DataFrame.append (removed in pandas 2.0) and quadratic row-by-row growth.
    records = []

    for run in range(1, runs + 1):

        # Select the random parameters
        random_params = {k: rng.choice(v) for k, v in param_grid.items()}

        print('=========')
        print('RUN IS ' + str(run))
        print('=========')

        _, train_score, valid_score = train_lgb(X_train=X_train,
                                                y_train=y_train,
                                                n_folds=n_folds,
                                                params=random_params,
                                                eval_metric=eval_metric,
                                                early_stopping_rounds=early_stopping_rounds)

        record = dict(random_params)
        # Kept as a one-element list: downstream code reads it back via [...][0]
        record['params'] = [random_params]
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    return pd.DataFrame(records, columns=df_cols)

In [11]:
# Evaluate 25 random configurations and rank them by validation score, best first
runs_df = (
    tune_lgb(X_train, y_train, param_grid, runs=25)
    .sort_values(by='valid_score', ascending=False)
    .reset_index(drop=True)
)
runs_df

RUN IS 1
LGB starting
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[191]	valid's binary_logloss: 0.49262	valid's auc: 0.839585	train's binary_logloss: 0.334465	train's auc: 0.970375
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[131]	valid's binary_logloss: 0.509497	valid's auc: 0.824184	train's binary_logloss: 0.376151	train's auc: 0.95331
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[189]	valid's binary_logloss: 0.495745	valid's auc: 0.835994	train's binary_logloss: 0.335082	train's auc: 0.968413
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[183]	valid's binary_logloss: 0.510676	valid's auc: 0.825533	train's binary_logloss: 0.33757	train's auc: 0.967749
      fold     train     valid
0        0  0.970375  0.839585
1        1  0.953310  0.824184
2        2  0.968413  0.835994
3

Early stopping, best iteration is:
[595]	valid's binary_logloss: 0.487671	valid's auc: 0.838717	train's binary_logloss: 0.235707	train's auc: 0.995413
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.502645	valid's auc: 0.825485	train's binary_logloss: 0.246349	train's auc: 0.992224
Early stopping, best iteration is:
[452]	valid's binary_logloss: 0.50148	valid's auc: 0.82611	train's binary_logloss: 0.261931	train's auc: 0.98875
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.490283	valid's auc: 0.836128	train's binary_logloss: 0.252358	train's auc: 0.991365
Early stopping, best iteration is:
[707]	valid's binary_logloss: 0.487901	valid's auc: 0.837381	train's binary_logloss: 0.212971	train's auc: 0.9976
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.507467	valid's auc: 0.825518	train's binary_logloss: 0.24553	train's auc: 0.992417
Early stopping, 

Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.51798	valid's auc: 0.814983	train's binary_logloss: 0.419385	train's auc: 0.888002
Early stopping, best iteration is:
[367]	valid's binary_logloss: 0.516999	valid's auc: 0.815547	train's binary_logloss: 0.424027	train's auc: 0.885307
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.52553	valid's auc: 0.806714	train's binary_logloss: 0.418142	train's auc: 0.888231
Early stopping, best iteration is:
[334]	valid's binary_logloss: 0.523543	valid's auc: 0.807589	train's binary_logloss: 0.427094	train's auc: 0.883296
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.503286	valid's auc: 0.822028	train's binary_logloss: 0.424128	train's auc: 0.885631
Early stopping, best iteration is:
[318]	valid's binary_logloss: 0.501577	valid's auc: 0.822294	train's binary_logloss: 0.434379	train's auc: 0.879684
Training unt

Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[125]	valid's binary_logloss: 0.578546	valid's auc: 0.797527	train's binary_logloss: 0.552091	train's auc: 0.853097
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.513458	valid's auc: 0.818544	train's binary_logloss: 0.46748	train's auc: 0.877601
Early stopping, best iteration is:
[620]	valid's binary_logloss: 0.505285	valid's auc: 0.821482	train's binary_logloss: 0.449992	train's auc: 0.88353
Training until validation scores don't improve for 250 rounds.
[500]	valid's binary_logloss: 0.533509	valid's auc: 0.805243	train's binary_logloss: 0.472442	train's auc: 0.881103
Early stopping, best iteration is:
[427]	valid's binary_logloss: 0.531834	valid's auc: 0.810333	train's binary_logloss: 0.479959	train's auc: 0.875636
      fold     train     valid
0        0  0.881345  0.818691
1        1  0.853097  0.797527
2        2  0.883530  0.821482
3   

Unnamed: 0,boosting_type,num_leaves,learning_rate,subsample_for_bin,min_data_in_leaf,reg_alpha,reg_lambda,colsample_bytree,subsample,is_unbalance,min_split_gain,n_estimators,params,training_score,valid_score
0,dart,65,0.0519184,60000,58,0.591837,0.959184,0.877673,0.540816,True,0.306816,3100,"[{'boosting_type': 'dart', 'num_leaves': 65, '...",0.990635,0.832303
1,dart,103,0.102837,80000,88,0.0204082,0.612245,0.266041,0.632653,True,0.164102,16100,"[{'boosting_type': 'dart', 'num_leaves': 103, ...",0.964962,0.831029
2,dart,108,0.19449,280000,64,0.734694,0.816327,0.490306,0.795918,False,0.898061,14100,"[{'boosting_type': 'dart', 'num_leaves': 108, ...",0.983422,0.830866
3,dart,55,0.0824694,180000,10,0.571429,0.0,0.857286,0.897959,False,0.082551,15100,"[{'boosting_type': 'dart', 'num_leaves': 55, '...",0.992971,0.830845
4,goss,91,0.031551,20000,91,0.897959,0.755102,0.204878,0.5,False,0.714571,18100,"[{'boosting_type': 'goss', 'num_leaves': 91, '...",0.91352,0.830842
5,gbdt,104,0.102837,60000,175,0.632653,0.653061,0.347592,0.867347,True,0.0417755,12100,"[{'boosting_type': 'gbdt', 'num_leaves': 104, ...",0.925762,0.82993
6,dart,23,0.265776,240000,118,0.938776,0.428571,0.327204,0.989796,True,0.734959,100,"[{'boosting_type': 'dart', 'num_leaves': 23, '...",0.915758,0.828226
7,dart,52,0.245408,100000,13,0.959184,0.0612245,0.18449,0.581633,False,0.0417755,19100,"[{'boosting_type': 'dart', 'num_leaves': 52, '...",0.994423,0.825957
8,dart,120,0.326878,180000,181,0.408163,0.469388,0.877673,0.918367,False,0.63302,2100,"[{'boosting_type': 'dart', 'num_leaves': 120, ...",0.965113,0.82186
9,gbdt,39,0.38798,220000,154,0.428571,0.979592,0.286429,0.806122,True,0.449531,7100,"[{'boosting_type': 'gbdt', 'num_leaves': 39, '...",0.895957,0.821649


In [12]:
# Row 0 is the top-ranked run; its 'params' cell wraps the dict in a one-element list
best_params = runs_df.at[0, 'params'][0]
best_params

{'boosting_type': 'dart',
 'num_leaves': 65,
 'learning_rate': 0.051918367346938776,
 'subsample_for_bin': 60000,
 'min_data_in_leaf': 58,
 'reg_alpha': 0.5918367346938775,
 'reg_lambda': 0.9591836734693877,
 'colsample_bytree': 0.8776734693877551,
 'subsample': 0.5408163265306123,
 'is_unbalance': True,
 'min_split_gain': 0.30681632653061225,
 'n_estimators': 3100}

In [13]:
# Create the model from the best hyperparameters found by the random search
model = lgb.LGBMClassifier(**best_params)

# Train the model on the full training set.
# NOTE(review): no eval_set is passed, so there is no early stopping here and
# `eval_metric` presumably has nothing to evaluate against -- confirm. The
# cross-validation runs stopped after a few hundred rounds, whereas this fit is
# not capped the same way.
model.fit(X_train, y_train, eval_metric = 'auc', verbose=500)

# Record the best iteration
# NOTE(review): without early stopping this is presumably None (or 0), in which
# case the predictions below use every trained iteration -- confirm.
best_iteration = model.best_iteration_

# Record feature importances
feature_importances = model.feature_importances_

# Class-probability predictions: in-sample for the training set and held-out
# for the test set (these are not out-of-fold predictions).
train_predict = model.predict_proba(X_train, num_iteration = best_iteration)
test_predict = model.predict_proba(X_test, num_iteration = best_iteration)

# Get AUC from the positive-class probability column
train_score = roc_auc_score(y_train, train_predict[:,1])
test_score = roc_auc_score(y_test, test_predict[:,1])

# The recorded run printed train ~0.9998 vs test ~0.827, i.e. a large
# train/test gap.
print('Train', train_score)
print('Test', test_score)

Train 0.9997703372347809
Test 0.8271189231879423


In [14]:
# Pair every feature name with its importance from the fitted model, largest first
feature_importances_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)
feature_importances_df

Unnamed: 0,feature,importance
12,Glucose,7760
5,BUN,7494
24,Phosphorus,7293
23,PTT,7256
10,Creatinine,6982
27,RDW,6850
33,White blood cells,6745
0,Admission weight,6741
25,Platelet Count,6724
16,Lymphocytes,6681
