# Random forest baseline implementation
### Predicts MIMIC-III ICU patient mortality given the first 24 hours

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_curve, auc, brier_score_loss
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np

import sys
sys.path.append('/usr/local/lib/python2.7/dist-packages/')

from random import shuffle
np.random.seed(7)

In [3]:
import sys
sys.path.append('/usr/local/lib/python2.7/dist-packages/')

import dbconfig as cfg
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://{}:{}@{}:3306/mimic'.format(cfg.mysql['user'], cfg.mysql['password'],
                                                                cfg.mysql['host']), echo=False)

### Load data

In [40]:
mimic_df = pd.read_pickle('/home/andrea/data/mimic_nontimeseries_normalized')
# mimic_df = pd.get_dummies(mimic_df, columns=['GENDER', 'ADMISSION_LOCATION', 'ADMISSION_TYPE'])
label_col = mimic_df['HOSPITAL_EXPIRE_FLAG']
del mimic_df['HOSPITAL_EXPIRE_FLAG']
del mimic_df['AGE']
del mimic_df['GENDER']
del mimic_df['ADMISSION_LOCATION']
del mimic_df['ADMISSION_TYPE']
del mimic_df['glasgow_score']
del mimic_df['riker_sas']
del mimic_df['eye_open']
mimic_df['HOSPITAL_EXPIRE_FLAG'] = label_col
mimic_df.head()

Unnamed: 0_level_0,lab_hemoglobin,lab_monocyte,lactate_dh,lab_eosinophil,lab_glucose,lab_ck,lab_basophils,troponin_t,sodium_whole_blood,art_dia,resp_pattern,bp_dia,chart_temp,art_mean,bp_sys,art_sys,cvp,HOSPITAL_EXPIRE_FLAG
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3,0.0769231,0.0641026,-0.000482413,0,0.0322581,0.000836092,0,-0.000334448,0.781145,0.000171725,0,0,0.889908,0,0,0.0260664,0.22,0
4,0.0,0.025641,0.00406605,0,-0.0752688,-1.12479e-05,0,-0.000334448,-0.0213244,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,0
6,0.0692308,0.0128205,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,0.736251,0.000171725,0,0,0.889908,0,0,0.0260664,0.24,0
9,0.107692,0.0,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,0.753086,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,1
11,0.0,0.0,-0.000482413,0,-0.0752688,-1.12479e-05,0,-0.000334448,-0.0213244,0.000171725,0,0,0.899083,0,0,0.0260664,0.0,0


### Shuffle and split data into train and test sets

In [41]:
def shuffle_split(l, train=0.6, test=0.2, val=0.2):
    shuffle(l)
    if val == 0:
        train = 0.7
    train_index = int(len(l)*train)
    test_index = train_index + int(len(l)*test)
    if val == 0:
        return l[:train_index], l[train_index:]
    return l[:train_index], l[train_index:test_index], l[test_index:]

In [42]:
patient_list = mimic_df.index.values

features = mimic_df.columns[1:-1]
total_input = mimic_df[features].as_matrix()
total_labels = mimic_df['HOSPITAL_EXPIRE_FLAG'].values

# train, test = shuffle_split(patient_list, train=0.8, test=0.2, val=0)
# train, test = shuffle_split(patient_list, val=0)

# train_df = mimic_df.ix[train]
# test_df = mimic_df.ix[test]


# X_train = train_df.as_matrix()
# X_test = test_df.as_matrix()

# y_train = train_df['HOSPITAL_EXPIRE_FLAG'].values
# y_test = test_df['HOSPITAL_EXPIRE_FLAG'].values


# print('{} observations in the training data'.format(len(X_train)))
# print('{} observations in the test data'.format(len(X_test)))

### Initialize random forest classifier,  fit, and obtain predictions

In [43]:
k = 10
clf = RandomForestClassifier(n_estimators=50, random_state=7)

In [44]:
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=7)

pred_vals = []
true_labels = []
run_ids = []

fpr_vals = []
tpr_vals = []
thresholds = []
auc_vals = []

count = 1
for train, test in kfold.split(total_input, total_labels):
    proba = clf.fit(total_input[train], total_labels[train]).predict_proba(total_input[test])
    
    pred_vals.append(proba)
    true_labels.append(total_labels[test])
    run_ids.append([count]*len(proba))
    
#     print 
    
#     fpr, tpr, threshold = roc_curve(total_labels[test], proba)
#     fpr_vals.append(fpr)
#     tpr_vals.append(tpr)
#     thresholds.append(threshold)
    
    count += 1
#     accuracies.append(accuracy_score(y_pred=proba, y_true=total_labels[testcv]))
    

# scores = cross_validate(clf, total_input, total_labels, 
#                         scoring=scoring, cv=10, 
#                         return_train_score=False)

In [28]:
# print('Mean test accuracy: {}'.format(np.mean(scores['test_accuracy'])))
# print('Mean test f1 score: {}'.format(np.mean(scores['test_f1_macro'])))
# print('Mean test precision: {}'.format(np.mean(scores['test_precision_macro'])))
# print('Mean test recall: {}'.format(np.mean(scores['test_recall_macro'])))

In [29]:
def flatten(l):
    return [item for sublist in l for item in sublist]        

In [45]:
results_df = pd.DataFrame()
results_df['0_prob'] = [x[0] for x in flatten(pred_vals)]
results_df['1_prob'] = [x[1] for x in flatten(pred_vals)]
results_df['true_label'] = flatten(true_labels)
results_df['run_ids'] = flatten(run_ids)
results_df.head()

Unnamed: 0,0_prob,1_prob,true_label,run_ids
0,1.0,0.0,0,1
1,0.121192,0.878808,0,1
2,1.0,0.0,0,1
3,1.0,0.0,0,1
4,1.0,0.0,0,1


In [21]:
# zero_predictions = [x[0] for x in proba]
# rounded_predictions = []
# for pred in zero_predictions:
#     if pred < 0.5:
#         rounded_predictions.append(1)
#     elif pred >= 0.5:
#         rounded_predictions.append(0)

In [15]:
# y_true = pd.Series(np.squeeze(true_labels[-1]))
# y_pred = pd.Series(rounded_predictions)
# pd.crosstab(y_true, y_pred, rownames=['True value'], colnames=['Predicted value'])

In [16]:
# print('Precision: {:0.4f}'.format(precision_score(y_true=y_true, y_pred=y_pred)))
# print('Recall: {:0.4f}'.format(recall_score(y_true=y_true, y_pred=y_pred)))
# print('F1 score: {:0.4f}'.format(f1_score(y_true=y_true, y_pred=y_pred)))


### Write results to RDS 

In [47]:
results_df.to_sql(name='random_forest_cv_results', if_exists='replace',con=engine, index=False)

### Model evaluation

In [50]:
# results_df = pd.read_sql_table('random_forest_cv_results', con=engine)
n_folds = set(results_df.run_ids.values)

auc_vals = []
brier_scores = []

for fold in n_folds:
    temp_df = results_df.loc[results_df['run_ids']==fold]
    pred_vals = temp_df['1_prob'].values
    true_vals = temp_df.true_label.values
    fpr, tpr, threshold = roc_curve(true_vals, pred_vals)
    auc_vals.append(auc(fpr,tpr))
    brier_scores.append(brier_score_loss(true_vals,pred_vals))

print('avg AUC: {:.4f}, avg Brier score: {:.4f}'.format(np.mean(auc_vals), np.mean(brier_scores)))

avg AUC: 0.6607, avg Brier score: 0.1012


In [33]:
total_input

array([[0.064102564102564097, -0.0004824126144007057, 0.0, ..., 0, 0.22, 1],
       [0.02564102564102564, 0.0040660491785202337, 0.0, ..., 0, 0.0, 4],
       [0.01282051282051282, -0.0004824126144007057, 0.0, ..., 0,
        0.23999999999999999, 4],
       ..., 
       [0.0, -0.0004824126144007057, 0.0, ..., 0, 0.0, 0],
       [0.0, 0.0031012239497188224, 0.0, ..., 0, 0.0, 0],
       [0.0, -0.0004824126144007057, 0.0, ..., 0, 0.0, 0]], dtype=object)

### Top 5 feature importance

In [32]:
sorted(list(zip(total_input, clf.feature_importances_)), 
       key=lambda x :x[1], reverse=True)[:5]

KeyError: "Index(['glasgow_score'], dtype='object') not in index"

### Confusion matrix

In [24]:
clf.fit(train_df[features], y)
predictions = clf.predict(test_df[features])

pd.crosstab(y_test, 
            predictions, 
            rownames=['True expire_flag'], 
            colnames=['Predicted expire_flag'])

NameError: name 'y' is not defined