In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import statistics
import datetime as dt

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras import backend as K

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# Set some display options
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [3]:
# Check virtual environment: should be: '/Users/James/anaconda3/envs/mimic/bin/python'
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Set up paths
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')

src_preparation_folder = os.path.join(src_folder, 'preparation')
src_processing_folder = os.path.join(src_folder, 'processing')
src_modeling_folder = os.path.join(src_folder, 'modeling')

In [5]:
# Import src functions
sys.path.insert(0, src_preparation_folder)
from import_data import get_table
from import_data import get_data_simple
from import_data import get_patient_admissions_diagnoses
from import_data import get_admission_data
from import_data import get_chartevents
from import_data import get_labevents
from extract_codes import find_ndc_codes

sys.path.insert(0, src_processing_folder)
from stats import plot_KDE
from stats import plot_perc_bar_chart
from stats import compare_groups
from stats import graph_comparisons
from patient_selection import select_test_groups
from clean import replace_itemid_with_label
from clean import find_populated_cols

sys.path.insert(0, src_modeling_folder)
from models import train_lgb

  """)


In [6]:
df = pd.read_csv(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data', 'cleaned_respiratory_failure.csv')),index_col=0)

In [7]:
print(df.shape)
print(df.target.value_counts())
df.head()

(28416, 40)
0    23820
1     4596
Name: target, dtype: int64


Unnamed: 0,subject_id,hadm_id,target,gender,age_adm_bucket,Admission weight,Anion Gap,BP diastolic,BP mean,BP systolic,BUN,Basophils,Bicarbonate,Calcium (Total),Chloride,Creatinine,Eosinophils,Glucose,HR,Hematocrit,Hemoglobin,Lymphocytes,MCH,MCHC,MCV,Magnesium,Monocytes,Neutrophils,PTT,Phosphorus,Platelet Count,Potassium,RDW,Red Blood Cells,Respiratory rate,Sodium,Temperature F,Urea Nitrogen,White blood cells,pH
0,111,192123,1,F,3. 60-75,56.700001,12.0,57.0,76.0,135.0,25.0,0.0,30.0,8.7,104.0,0.7,0.0,119.0,70.0,37.5,12.8,6.0,30.4,34.8,87.0,2.5,4.0,86.0,44.1,3.7,291.0,3.4,12.3,4.23,25.0,143.0,96.800003,25.0,12.8,7.48
1,148,199488,1,F,4. 75-89,67.5,13.0,67.0,92.333298,143.0,43.0,0.0,24.0,7.9,104.0,0.2,2.0,94.0,96.0,24.2,8.2,16.0,29.6,33.9,87.0,1.6,7.0,39.0,28.3,2.3,39.0,3.9,14.8,2.78,20.0,137.0,98.199997,15.0,1.8,7.46
2,156,168847,1,M,3. 60-75,112.0,12.0,56.0,75.0,113.0,20.0,0.1,30.0,9.1,102.0,1.1,1.1,113.0,52.0,36.4,12.1,11.3,30.1,33.3,90.0,1.9,5.0,82.6,26.1,3.2,343.0,4.7,14.9,4.04,18.0,139.0,97.400002,20.0,11.9,7.49
3,157,107880,1,M,4. 75-89,63.0,17.0,86.0,105.667,160.0,7.0,0.0,22.0,8.2,108.0,0.5,0.0,100.0,94.0,32.0,10.8,5.0,32.1,33.8,95.0,2.4,14.0,72.0,29.3,3.5,144.0,3.7,17.6,3.37,21.0,143.0,97.199997,6.0,8.6,7.45
4,165,170252,1,M,4. 75-89,70.0,17.0,72.0,84.0,115.0,39.0,0.5,30.0,8.6,100.0,2.0,2.8,127.0,64.0,,14.3,11.0,30.4,32.8,92.0,2.4,4.8,80.9,26.5,3.6,126.0,4.3,13.5,4.7,17.0,143.0,97.7,50.0,12.2,7.35


In [8]:
# Pre Cleaning
# Dummy variables for age bucket and gender
df = pd.get_dummies(df)

# Shuffle
df = df.sample(frac=1).reset_index(drop=True)

# Split features and labels
X = df.drop(columns=['subject_id', 'hadm_id', 'target'])
Y = np.array(df.target.tolist())

# Impute missing values
imputer = Imputer(strategy = 'median')
imputer.fit(X)
X = imputer.transform(X)

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1)) 
scaler.fit(X)
X = scaler.transform(X)



In [9]:
print(X.shape)
print(Y.shape)
print(X.shape[1])

(28416, 42)
(28416,)
42


In [10]:
# define the grid search parameters
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, Y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 14.9min finished


NameError: name 'grid_result' is not defined

In [12]:
# summarize results
print("Best: %f using %s" % (rf_random.best_score_, rf_random.best_params_))
means = rf_random.cv_results_['mean_test_score']
stds = rf_random.cv_results_['std_test_score']
params = rf_random.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.171473 using {'n_estimators': 1350, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}
0.151004 (0.009468) with: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}
-0.124853 (0.009799) with: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 110, 'bootstrap': False}
0.171473 (0.007788) with: {'n_estimators': 1350, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}
0.169912 (0.006573) with: {'n_estimators': 266, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}
0.169521 (0.007932) with: {'n_estimators': 483, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
0.158971 (0.006988) with: {'n_estimators': 483, 'min_samples_

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
best_model = rf_random.best_estimator_
predictions = best_model.predict(X_test)
roc_auc_score(y_test, predictions)

0.9995950355677543