In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import statistics
import datetime as dt

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import lightgbm as lgb

import keras
from keras.models import Sequential
from keras.layers import Dense

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# Sanity check: display the Python interpreter this kernel is running on.
# It should be the project's `mimic` conda environment, i.e.
# '/Users/James/anaconda3/envs/mimic/bin/python' on the original author's machine.
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [3]:
# Set up paths
# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')

# One sub-folder of src/ per pipeline stage
src_preparation_folder, src_processing_folder, src_modeling_folder = (
    os.path.join(src_folder, stage)
    for stage in ('preparation', 'processing', 'modeling')
)

In [4]:
# Import src functions
# Each source folder is pushed to the front of sys.path immediately before the
# modules that live in it are imported, so the project's modules are found first.
sys.path.insert(0, src_preparation_folder)
from import_data import (
    get_table,
    get_patient_admissions_diagnoses,
    get_admission_data,
    get_chartevents,
    get_labevents,
)
from extract_codes import find_ndc_codes

sys.path.insert(0, src_processing_folder)
from stats import (
    plot_KDE,
    plot_perc_bar_chart,
    compare_groups,
    graph_comparisons,
)
from patient_selection import select_test_groups
from clean import replace_itemid_with_label, find_populated_cols

sys.path.insert(0, src_modeling_folder)
from models import train_lgb

  """)


In [5]:
# Hyperparameter grid
# Candidate values for the LightGBM random search: each tuning run draws one
# value per key. Values are plain lists so they can be sampled with
# random.sample. np.linspace is called without `num`, so each float range
# holds its default of 50 evenly spaced points.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(10, 150)),
    'learning_rate': list(np.linspace(0.001, 0.5)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    # Listed exactly once: a repeated key in a dict literal silently keeps only
    # the last value, so a second 'min_data_in_leaf' entry would be dead code.
    'min_data_in_leaf': list(np.arange(1, 200, 3)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.001, 1)),
    'subsample': list(np.linspace(0.5, 1)),
    'is_unbalance': [True, False],
    'min_split_gain': list(np.linspace(0.001, 1)),
}

In [6]:
def tune_lgb(diagnosis_name, param_grid, runs, seed=None):
    """Random-search LightGBM hyperparameters for one diagnosis and save the results.

    Reads ``../data/chart_events_<diagnosis_name>.csv`` (relative to the current
    working directory), one-hot encodes it, shuffles the rows, median-imputes
    and min-max scales the features, then calls ``train_lgb`` once per run with
    a parameter set drawn at random from ``param_grid``. The sampled parameters
    and the two scores returned by ``train_lgb`` are collected, sorted by
    ``valid_score`` (best first), printed (top rows) and written to
    ``../data/lgb_runs_<diagnosis_name>.csv``.

    Parameters
    ----------
    diagnosis_name : str
        Suffix used in both the input and the output CSV file names.
    param_grid : dict
        Maps each tunable parameter name to a list of candidate values; one
        value per key is sampled for every run.
    runs : int
        Number of random parameter sets to evaluate.
    seed : int, optional
        When given, seeds both the row shuffle and the parameter sampling so
        the same parameter sets are drawn on every call. Any randomness inside
        ``train_lgb`` is not controlled from here. With the default ``None``
        the global ``random`` / NumPy state is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per run (sampled parameters, ``training_score``,
        ``valid_score``), sorted by ``valid_score`` descending. The index is
        the 0-based run order; the same frame is what gets written to CSV.
    """
    print('=============')
    print(diagnosis_name)
    print('=============')

    # Input and output both live in <parent of cwd>/data
    data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data'))

    ## -- Import data
    df = pd.read_csv(
        os.path.join(data_dir, 'chart_events_{}.csv'.format(diagnosis_name)),
        index_col=0,
    )

    ## - Clean data ready for model

    # Dummy variables for age bucket and gender
    df = pd.get_dummies(df)

    # Shuffle
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Split features and labels
    features = df.drop(columns=['subject_id', 'hadm_id', 'target'])
    labels = np.array(df['target'].tolist())

    # NOTE(review): the imputer and scaler below are fitted on every row before
    # train_lgb is asked for 5 folds, so (assuming the folds are split inside
    # train_lgb) validation rows influence the preprocessing statistics and the
    # reported validation scores may be slightly optimistic.
    # NOTE(review): `Imputer` only exists in older scikit-learn releases; newer
    # ones provide `sklearn.impute.SimpleImputer` instead. Kept as-is to match
    # the notebook's import cell.

    # Impute missing values
    imputer = Imputer(strategy='median')
    features = imputer.fit_transform(features)

    # Scale each feature to 0-1
    scaler = MinMaxScaler(feature_range=(0, 1))
    features = scaler.fit_transform(features)

    ## -- Train a Light GBM
    param_names = list(param_grid.keys())
    df_cols = param_names + ['training_score', 'valid_score']

    # Use a private generator only when a seed is requested, so unseeded calls
    # keep drawing from the module-level `random` state exactly as before.
    rng = random if seed is None else random.Random(seed)

    # Rows are collected as plain dicts and turned into a DataFrame once at the
    # end; growing a DataFrame with .append() per run is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    for run in range(1, runs + 1):

        # Select the random parameters
        random_params = {k: rng.sample(v, 1)[0] for k, v in param_grid.items()}

        # Add constant params
        # NOTE(review): LightGBM documents num_threads / num_thread / nthread /
        # nthreads / n_jobs as the thread-count names; 'n_thread' is left
        # unchanged because how train_lgb consumes it is not visible here.
        random_params['n_thread'] = 1
        random_params['n_estimators'] = 10000
        random_params['metric'] = 'auc'

        print('=========')
        print('RUN IS ' + str(run))
        print('=========')

        _metrics, train_score, valid_score = train_lgb(features=features,
                                                       labels=labels,
                                                       n_folds=5,
                                                       params=random_params,
                                                       eval_metric='auc',
                                                       early_stopping_rounds=100)

        # Record only the tuned parameters (not the constants) plus the scores
        record = {name: random_params[name] for name in param_names}
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    runs_df = pd.DataFrame(records, columns=df_cols)
    runs_df = runs_df.sort_values(by='valid_score', ascending=False)

    print('-------------')
    print("FINAL " + diagnosis_name)
    print('-------------')
    print(runs_df.head())
    runs_df.to_csv(os.path.join(data_dir, 'lgb_runs_{}.csv'.format(diagnosis_name)))

    return runs_df

In [7]:
# Run the same 10-run random search for every diagnosis, one after another
diagnosis_names = [
    'acute_respiratory_failure',
    'hypertension',
    'acute_kidney_failure',
    'hyperlipidemia',
    'anemia',
    'pneumonia',
    'depression',
    'chronic_kidney_disease',
]

for diagnosis_name in diagnosis_names:
    tune_lgb(diagnosis_name, param_grid, 10)

acute_respiratory_failure




RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[44]	valid's auc: 0.782573	train's auc: 0.860509
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[46]	valid's auc: 0.796822	train's auc: 0.861717
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[38]	valid's auc: 0.812235	train's auc: 0.851505
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[33]	valid's auc: 0.789012	train's auc: 0.852269
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[70]	valid's auc: 0.798944	train's auc: 0.880275
      fold     train     valid
0        0  0.860509  0.782573
1        1  0.861717  0.796822
2        2  0.851505  0.812235
3        3  0.852269  0.789012
4        4  0.880275  0.798944
5  overall  0.861255  0.795296
RUN IS 2
LGB starting
Trai

Early stopping, best iteration is:
[46]	valid's auc: 0.803605	train's auc: 0.972536
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.811978	train's auc: 0.926496
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[29]	valid's auc: 0.803056	train's auc: 0.943083
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[31]	valid's auc: 0.795369	train's auc: 0.949016
      fold     train     valid
0        0  0.939661  0.778105
1        1  0.972536  0.803605
2        2  0.926496  0.811978
3        3  0.943083  0.803056
4        4  0.949016  0.795369
5  overall  0.946158  0.797985
RUN IS 10
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[264]	valid's auc: 0.800768	train's auc: 0.987137
Training until validation scores don't improve for 100 rounds.
Early stopping, best ite



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[55]	valid's auc: 0.669989	train's auc: 0.744478
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.656915	train's auc: 0.729816
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[32]	valid's auc: 0.65586	train's auc: 0.732323
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[72]	valid's auc: 0.653431	train's auc: 0.750353
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[43]	valid's auc: 0.658933	train's auc: 0.73342
      fold     train     valid
0        0  0.744478  0.669989
1        1  0.729816  0.656915
2        2  0.732323  0.655860
3        3  0.750353  0.653431
4        4  0.733420  0.658933
5  overall  0.738078  0.658052
RUN IS 2
LGB starting
Traini

Early stopping, best iteration is:
[30]	valid's auc: 0.659531	train's auc: 0.805201
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[70]	valid's auc: 0.661376	train's auc: 0.82816
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[31]	valid's auc: 0.652774	train's auc: 0.8036
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[210]	valid's auc: 0.658582	train's auc: 0.934658
      fold     train     valid
0        0  0.817776  0.673910
1        1  0.805201  0.659531
2        2  0.828160  0.661376
3        3  0.803600  0.652774
4        4  0.934658  0.658582
5  overall  0.837879  0.657032
RUN IS 10
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2]	valid's auc: 0.636429	train's auc: 0.711831
Training until validation scores don't improve for 100 rounds.
Early stopping, best iterati



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[249]	valid's auc: 0.851413	train's auc: 0.954656
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[145]	valid's auc: 0.831629	train's auc: 0.930377
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[198]	valid's auc: 0.849148	train's auc: 0.942252
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[216]	valid's auc: 0.848885	train's auc: 0.945012
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[255]	valid's auc: 0.836559	train's auc: 0.958527
      fold     train     valid
0        0  0.954656  0.851413
1        1  0.930377  0.831629
2        2  0.942252  0.849148
3        3  0.945012  0.848885
4        4  0.958527  0.836559
5  overall  0.946165  0.843359
RUN IS 2
LGB starting

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[28]	valid's auc: 0.817213	train's auc: 0.964152
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[24]	valid's auc: 0.833232	train's auc: 0.952947
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.834725	train's auc: 0.94959
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[18]	valid's auc: 0.819359	train's auc: 0.934644
      fold     train     valid
0        0  0.950791  0.833383
1        1  0.964152  0.817213
2        2  0.952947  0.833232
3        3  0.949590  0.834725
4        4  0.934644  0.819359
5  overall  0.950425  0.827430
RUN IS 10
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[113]	valid's auc: 0.796499	train's auc: 0.79656
Training until validation 



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[113]	valid's auc: 0.746474	train's auc: 0.996362
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[13]	valid's auc: 0.735468	train's auc: 0.875598
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[16]	valid's auc: 0.733838	train's auc: 0.895389
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[13]	valid's auc: 0.725396	train's auc: 0.875805
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[13]	valid's auc: 0.737518	train's auc: 0.875532
      fold     train     valid
0        0  0.996362  0.746474
1        1  0.875598  0.735468
2        2  0.895389  0.733838
3        3  0.875805  0.725396
4        4  0.875532  0.737518
5  overall  0.903737  0.732029
RUN IS 2
LGB starting
Tra

Early stopping, best iteration is:
[180]	valid's auc: 0.728982	train's auc: 0.999352
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[314]	valid's auc: 0.726447	train's auc: 0.999626
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[12]	valid's auc: 0.724905	train's auc: 0.910659
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[162]	valid's auc: 0.72529	train's auc: 0.999328
      fold     train     valid
0        0  0.999202  0.739701
1        1  0.999352  0.728982
2        2  0.999626  0.726447
3        3  0.910659  0.724905
4        4  0.999328  0.725290
5  overall  0.981633  0.719727
RUN IS 10
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[12]	valid's auc: 0.73703	train's auc: 0.866372
Training until validation scores don't improve for 100 rounds.
Early stopping, best ite



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.662929	train's auc: 0.777326
Early stopping, best iteration is:
[704]	valid's auc: 0.663697	train's auc: 0.80205
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.689911	train's auc: 0.772737
Early stopping, best iteration is:
[866]	valid's auc: 0.693281	train's auc: 0.817965
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.674572	train's auc: 0.776035
Early stopping, best iteration is:
[474]	valid's auc: 0.675282	train's auc: 0.771924
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[346]	valid's auc: 0.667552	train's auc: 0.754306
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[246]	valid's auc: 0.671204	train's auc: 0.737198
      fold     train     valid
0        0  0.802050  0.663697
1        1  0.817965  0.693

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[172]	valid's auc: 0.650773	train's auc: 0.988756
      fold     train     valid
0        0  0.988460  0.640075
1        1  0.986883  0.672993
2        2  0.989778  0.650738
3        3  0.989602  0.638154
4        4  0.988756  0.650773
5  overall  0.988696  0.651150
RUN IS 9
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[5]	valid's auc: 0.640296	train's auc: 0.817945
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[137]	valid's auc: 0.650398	train's auc: 0.998166
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3]	valid's auc: 0.633918	train's auc: 0.775644
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[8]	valid's auc: 0.641293	train's auc: 0.854189
Training until validation s



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[11]	valid's auc: 0.722157	train's auc: 0.815574
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[12]	valid's auc: 0.700512	train's auc: 0.820377
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[10]	valid's auc: 0.706387	train's auc: 0.811642
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[11]	valid's auc: 0.71933	train's auc: 0.81175
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[9]	valid's auc: 0.728899	train's auc: 0.803277
      fold     train     valid
0        0  0.815574  0.722157
1        1  0.820377  0.700512
2        2  0.811642  0.706387
3        3  0.811750  0.719330
4        4  0.803277  0.728899
5  overall  0.812524  0.715226
RUN IS 2
LGB starting
Trainin

Early stopping, best iteration is:
[9]	valid's auc: 0.717666	train's auc: 0.85364
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[10]	valid's auc: 0.725167	train's auc: 0.862836
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[16]	valid's auc: 0.728405	train's auc: 0.903231
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[10]	valid's auc: 0.732318	train's auc: 0.863538
      fold     train     valid
0        0  0.906674  0.712179
1        1  0.853640  0.717666
2        2  0.862836  0.725167
3        3  0.903231  0.728405
4        4  0.863538  0.732318
5  overall  0.877984  0.722750
RUN IS 10
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[98]	valid's auc: 0.74522	train's auc: 0.89198
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteratio



RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.672652	train's auc: 0.693206
[1000]	valid's auc: 0.68257	train's auc: 0.711164
[1500]	valid's auc: 0.688446	train's auc: 0.725146
[2000]	valid's auc: 0.69234	train's auc: 0.737807
[2500]	valid's auc: 0.694956	train's auc: 0.748093
[3000]	valid's auc: 0.697945	train's auc: 0.757421
[3500]	valid's auc: 0.700609	train's auc: 0.765556
[4000]	valid's auc: 0.702346	train's auc: 0.773155
[4500]	valid's auc: 0.703589	train's auc: 0.78025


KeyboardInterrupt: 

In [8]:
# Tune chronic_kidney_disease on its own (10 random-search runs)
tune_lgb(diagnosis_name='chronic_kidney_disease', param_grid=param_grid, runs=10)

chronic_kidney_disease




RUN IS 1
LGB starting
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	valid's auc: 0.872999	train's auc: 0.953253
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[35]	valid's auc: 0.872288	train's auc: 0.97595
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[75]	valid's auc: 0.879112	train's auc: 0.99837
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[15]	valid's auc: 0.884561	train's auc: 0.930524
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[17]	valid's auc: 0.868491	train's auc: 0.93868
      fold     train     valid
0        0  0.953253  0.872999
1        1  0.975950  0.872288
2        2  0.998370  0.879112
3        3  0.930524  0.884561
4        4  0.938680  0.868491
5  overall  0.959355  0.873667
RUN IS 2
LGB starting
Trainin

Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.875042	train's auc: 0.945838
[1000]	valid's auc: 0.877775	train's auc: 0.981723
Early stopping, best iteration is:
[937]	valid's auc: 0.877977	train's auc: 0.978789
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.880877	train's auc: 0.945142
[1000]	valid's auc: 0.884184	train's auc: 0.981167
Early stopping, best iteration is:
[1064]	valid's auc: 0.884518	train's auc: 0.983755
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.88104	train's auc: 0.945442
[1000]	valid's auc: 0.88588	train's auc: 0.981101
Early stopping, best iteration is:
[1115]	valid's auc: 0.886339	train's auc: 0.985523
Training until validation scores don't improve for 100 rounds.
[500]	valid's auc: 0.888374	train's auc: 0.944971
Early stopping, best iteration is:
[607]	valid's auc: 0.888948	train's auc: 0.955754
Training until validation scores don't improve for 100 