# RF: Random Forest  Model Training

We also implemented a Random forest (RF) model which provides a list of feature importance, relevant for model interpretation.   

We used the same list of features than the LR_RMH model (baseline plus additional features)


In [4]:
%c inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn import metrics


import sys 
import json
sys.path.append("../../src/features")
import build_features, vital_signs_features, age_features, RFV_features


%matplotlib inline

ERROR:root:Line magic function `%c` not found.


In [5]:
pd.options.mode.chained_assignment = None  # default='warn'

## Reading CDC 2009 file

In [6]:
#reading file
with open('../../fileConfig.json') as config_file:    
        fileConfig = json.load(config_file)
processedDirectory = fileConfig['dataDirectory'] + fileConfig['processedDirectory'] 
cdc_input = pd.read_csv(processedDirectory + 'ED_TOTAL_2009_2009.csv' )

## Feature Engineering

In [7]:
# same list of features than the LR_RMH model 
predictors, target = build_features.get_all_features (cdc_input )
list (predictors)

['Temp_Baseline',
 'Pulse_Baseline',
 'Sys_BP_Baseline',
 'Resp_Rate_Baseline',
 'Oxygen_Sat_Baseline',
 'Reason_Chest_Pain',
 'Reason_Abdominal_Pain',
 'Reason_Headache',
 'Reason_Shortness_of_Breath',
 'Reason_Back_Pain',
 'Reason_Cough',
 'Reason_Nausea_Vomiting',
 'Reason_Fever_Chills',
 'Reason_Syncope',
 'Reason_Dizziness',
 'Reason_Psychiatric_Complaint',
 'Reason_Nervous_System',
 'Reason_Cardiovascular_Other',
 'Reason_Ears_Eyes_Complaint',
 'Reason_Respiratory_Other',
 'Reason_Gastrointestinal_Other',
 'Reason_Genitourinary_Other',
 'Reason_Skin_Hair_Nails_Complaint',
 'Reason_Musculoskeletal_Other',
 'Reason_Injury_Poisoning',
 'Reason_Other',
 'Hypothermia',
 'Hyperthermia',
 'Bradycardia',
 'Mild_Tachycardia',
 'Moderate_Tachycardia',
 'Severe_Tachycardia',
 'Hypotension',
 'Hypertension',
 'Bradypnea',
 'Moderate_Tachypnea',
 'Severe_Tachypnea',
 'Mild_Hypoxia',
 'Severe_Hypoxia',
 'Age_18_30',
 'Age_31_40',
 'Age_41_50',
 'Age_51_60',
 'Age_61_70',
 'Age_71_80',
 'Age_81

## Tuning hyperparameters
(we are not including all the manual tuning in the notebook, just some)

In [8]:
def fit_RF(rf_model, X_train,y_train):
    clf.fit(X_train, y_train)
    predicted_prob = clf.predict_proba(X_dev)
    fpr, tpr, thresholds = metrics.roc_curve(y_dev,predicted_prob[:,1], pos_label = 1)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [9]:
X_train, X_dev, y_train, y_dev = train_test_split(predictors, target, test_size = 0.1)

In [10]:
clf = RandomForestClassifier(class_weight='balanced')
roc_auc = fit_RF(clf, X_train,y_train)
roc_auc

0.7700613960788818

In [11]:
depths = [20, 15,12, 10, 8, 5,3]
for depth in depths:
    clf = RandomForestClassifier(class_weight='balanced',max_depth=depth)
    roc_auc = fit_RF(clf, X_train,y_train)
    print ('depth = %3d, ROC_AUC = %1.4f ' % (depth, roc_auc))

depth =  20, ROC_AUC = 0.8003 
depth =  15, ROC_AUC = 0.8287 
depth =  12, ROC_AUC = 0.8454 
depth =  10, ROC_AUC = 0.8412 
depth =   8, ROC_AUC = 0.8506 
depth =   5, ROC_AUC = 0.8486 
depth =   3, ROC_AUC = 0.8484 


In [12]:
n_estimators = [50, 100,200, 300, 400]
for ne in n_estimators:
    clf = RandomForestClassifier(class_weight='balanced',max_depth=8, n_estimators=ne, random_state = 42)
    roc_auc = fit_RF(clf, X_train,y_train)
    print ('n_estimators = %3d, ROC_AUC = %1.4f ' % (ne, roc_auc))

n_estimators =  50, ROC_AUC = 0.8623 
n_estimators = 100, ROC_AUC = 0.8640 
n_estimators = 200, ROC_AUC = 0.8659 
n_estimators = 300, ROC_AUC = 0.8641 
n_estimators = 400, ROC_AUC = 0.8651 


In [13]:
min_splits = [ 4,8,10,12]
for ms in min_splits:
    clf = RandomForestClassifier(class_weight='balanced',max_depth=8, n_estimators=200, random_state = 42, 
                             min_samples_split =ms)
    roc_auc = fit_RF(clf, X_train,y_train)
    print ('ms = %3d, ROC_AUC = %1.4f ' % (ms, roc_auc))

ms =   4, ROC_AUC = 0.8662 
ms =   8, ROC_AUC = 0.8666 
ms =  10, ROC_AUC = 0.8670 
ms =  12, ROC_AUC = 0.8652 


## 10 fold cross validation

In [14]:
from sklearn.cross_validation  import StratifiedKFold
def RF_crossValidation (predictors, target, max_depth=8, n_estimators=100, random_state = 42, 
                             min_samples_split =8):
    seed = np.random.seed(0)
    kfold = StratifiedKFold(target, n_folds=10)
    cvscores = []
    #for train, test in kfold.split(predictors, target):
    for train, test in kfold:
        X_train_cv = np.array(predictors)[train]
        X_dev_cv = np.array(predictors)[test]
        y_train_cv = np.array (target)[train]
        y_dev_cv = np.array(target)[test]
        clf = RandomForestClassifier(class_weight='balanced',max_depth=max_depth, n_estimators=n_estimators, 
                                     random_state = random_state, min_samples_split =min_samples_split)
        clf.fit(X_train_cv, y_train_cv)
        predicted_prob = clf.predict_proba(X_dev_cv)
        fpr, tpr, thresholds = metrics.roc_curve(y_dev_cv,predicted_prob[:,1], pos_label = 1)
        roc_auc = metrics.auc(fpr, tpr) 
        print ('ROC_AUC = %1.4f ' % (roc_auc))
        cvscores.append(roc_auc)
    print("ROC AUC: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores)*100, np.std(cvscores)))

In [15]:
RF_crossValidation (predictors, target,
                   max_depth=8, n_estimators=100, random_state = 42, 
                             min_samples_split =8)

ROC_AUC = 0.8293 
ROC_AUC = 0.8239 
ROC_AUC = 0.8598 
ROC_AUC = 0.8365 
ROC_AUC = 0.8360 
ROC_AUC = 0.8461 
ROC_AUC = 0.8607 
ROC_AUC = 0.8425 
ROC_AUC = 0.8603 
ROC_AUC = 0.7998 
ROC AUC: 83.95% (+/- 0.02%)


In [19]:
lf = RandomForestClassifier(class_weight='balanced',max_depth=8, n_estimators=100, random_state = 42, 
                             min_samples_split =8)
clf.fit(X_train, y_train)
fi_2009 = clf.feature_importances_

In [20]:
feature_importances_2009 = pd.DataFrame(fi_2009,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances_2009.head(10)

Unnamed: 0,importance
Ambulance_Arrival,0.15129
Other_Arrival,0.125668
Age_18_30,0.075858
rfv1_2,0.065248
CHF,0.062469
Resp_Rate_Baseline,0.03949
Age_81_Above,0.038944
Oxygen_Sat_Baseline,0.037227
Reason_Shortness_of_Breath,0.036495
DIABETES_1,0.029102



### original run result during w210 project
From w210 run (not sure how 81.15 was the result,not replicable now)

```
clf = RandomForestClassifier(class_weight='balanced',max_depth=8, n_estimators=100, random_state = 42, 
                             min_samples_split =10)
ROC_AUC = 0.8349 
ROC_AUC = 0.8454 
ROC_AUC = 0.8233 
ROC_AUC = 0.8099 
ROC_AUC = 0.8461 
ROC_AUC = 0.8294 
ROC_AUC = 0.7979 
ROC_AUC = 0.6863 
ROC_AUC = 0.8176 
ROC_AUC = 0.8242 
ROC AUC: 81.15% (+/- 0.04%)
    ```