<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Train-Test-Split" data-toc-modified-id="Train-Test-Split-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Train Test Split</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span><ul class="toc-item"><li><span><a href="#OOB:-Out-of-Bag" data-toc-modified-id="OOB:-Out-of-Bag-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>OOB: Out of Bag</a></span></li><li><span><a href="#Results" data-toc-modified-id="Results-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Results</a></span></li></ul></li><li><span><a href="#Plotting-the-Tree" data-toc-modified-id="Plotting-the-Tree-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Plotting the Tree</a></span></li><li><span><a href="#Variable-Importance" data-toc-modified-id="Variable-Importance-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Variable Importance</a></span><ul class="toc-item"><li><span><a href="#Method-1" data-toc-modified-id="Method-1-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Method 1</a></span></li><li><span><a href="#Method-2" data-toc-modified-id="Method-2-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Method 2</a></span></li></ul></li><li><span><a href="#Tree-Depth" data-toc-modified-id="Tree-Depth-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Tree Depth</a></span></li><li><span><a href="#Cross-Validation" data-toc-modified-id="Cross-Validation-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Cross Validation</a></span></li></ul></div>

# Random Forest

*"Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction."*

Credit: 
- https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
- https://www.blopig.com/blog/2017/07/using-random-forests-in-python-with-scikit-learn/
- https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/
- https://towardsdatascience.com/the-random-forest-algorithm-d457d499ffcd

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

# Record which interpreter produced the outputs in this notebook
import sys

# Only the first 31 characters of the version banner are printed
python_version_banner = sys.version[:31]
print('python vers:', python_version_banner)

python vers: 3.6.5 |Anaconda custom (64-bit)


## Load Data

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Wrap the feature matrix in a DataFrame, one named column per measurement
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Turn the integer class codes into a Categorical carrying the species names
labels = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
labels[:10]

[setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa]
Categories (3, object): [setosa, versicolor, virginica]

In [4]:
# One 0/1 indicator column per species
labels_onehot = pd.get_dummies(data=labels)
labels_onehot.head()

Unnamed: 0,setosa,versicolor,virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


## Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing. The split is stratified on the
# integer class codes; the targets handed to the model are the one-hot labels.
x_train, x_test, y_train, y_test = train_test_split(
    df[iris.feature_names],
    labels_onehot,
    test_size=0.5,
    stratify=iris.target,
    random_state=0,
)

## Model 

In [6]:
from sklearn.ensemble import RandomForestClassifier

# A small forest of 10 trees with a fixed seed.
# y_train is the one-hot label frame, so the forest is fitted against one
# 0/1 target column per species rather than a single class column.
rf = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [7]:
from sklearn.metrics import accuracy_score

# Score the forest on the held-out half
predicted = rf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
# No out-of-bag estimate is printed here: this forest was fitted without oob_score=True
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.933


### OOB: Out of Bag

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Refit the same 10-tree forest, this time requesting an out-of-bag score.
# NOTE(review): the recorded run warns that some inputs have no OOB score;
# presumably 10 trees are too few for every sample to be left out of at least
# one bootstrap draw, so more estimators may be needed - TODO confirm.
rf = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
    oob_score=True,
)
rf.fit(x_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

In [24]:
from sklearn.metrics import accuracy_score

# Put the out-of-bag estimate next to the accuracy on the held-out half
predicted = rf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

Out-of-bag score estimate: 0.964
Mean accuracy score: 0.933


### Results

In [11]:
# First five test predictions: one 0/1 indicator per species column
test_predictions = rf.predict(x_test)
test_predictions[:5]

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [22]:
# With one-hot targets, predict_proba gives one array per label column.
# Index 0 picks the first column (setosa), not the first sample; each row of
# that array reads [P(not setosa), P(setosa)].
setosa_proba = rf.predict_proba(x_test)[0]
setosa_proba[:5]

array([[1. , 0. ],
       [0.9, 0.1],
       [1. , 0. ],
       [0. , 1. ],
       [1. , 0. ]])

In [28]:
# NOTE(review): disabled cell. Presumably left off because y_test and predicted
# are one-hot indicator matrices, which confusion_matrix may not accept as-is -
# TODO confirm, and decode both to class indices (e.g. argmax over the columns)
# before re-enabling.
# from sklearn.metrics import confusion_matrix
# cm = pd.DataFrame(confusion_matrix(y_test, predicted), columns=iris.target_names, index=iris.target_names)
# sns.heatmap(cm, annot=True)

## Plotting the Tree

In [27]:
# Visualization helpers: graphviz export from scikit-learn, pydot for rendering
from sklearn.tree import export_graphviz
import pydot

# Take a single estimator (index 5) out of the fitted forest
tree = rf.estimators_[5]

# Column names used to label the split nodes
feature_list = iris.feature_names

# Dump the tree structure to a graphviz .dot file
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Read the .dot file back; the unpacking expects exactly one graph in it
[graph] = pydot.graph_from_dot_file('tree.dot')

# Render that graph as a PNG image
graph.write_png('tree.png')


<img src="tree.png" alt="One decision tree taken from the fitted random forest" style="width:400px;">


## Variable Importance

### Method 1 

In [30]:
# Numerical importance of each feature, in the order of the training columns
importances = list(rf.feature_importances_)

# Pair every feature name with its importance, rounded to 2 decimals.
# The names are read straight from the dataset, so this cell does not rely on
# the `feature_list` variable created in the tree-plotting cell.
feature_importances = [
    (feature, round(importance, 2))
    for feature, importance in zip(iris.feature_names, importances)
]

# Most important feature first
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)

# Print one aligned line per feature. A plain loop is used because print is
# called only for its side effect; a list comprehension would build a
# throwaway list of None values.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

Variable: petal width (cm)     Importance: 0.54
Variable: petal length (cm)    Importance: 0.26
Variable: sepal length (cm)    Importance: 0.12
Variable: sepal width (cm)     Importance: 0.08


### Method 2 

In [8]:
# Same importances as a one-column DataFrame indexed by feature name
importance_frame = pd.DataFrame(
    data=rf.feature_importances_,
    index=x_train.columns,
    columns=['importance'],
)

# Largest importance first
feature_importances = importance_frame.sort_values(by='importance', ascending=False)

feature_importances

Unnamed: 0,importance
petal width (cm),0.538565
petal length (cm),0.264942
sepal length (cm),0.120978
sepal width (cm),0.075515


## Tree Depth

In [9]:
def dectree_max_depth(tree):
    """Return the number of levels on the longest root-to-leaf path of a tree.

    Parameters
    ----------
    tree : object with ``children_left`` and ``children_right``
        Indexable sequences giving, for every node id, the id of its left and
        right child. A node whose two entries are equal is treated as a leaf.
        Node 0 is taken as the root. In this notebook the ``tree_`` attribute
        of each estimator in the fitted forest is passed in.

    Returns
    -------
    int
        Depth counted in nodes, not edges: a tree that is a single leaf gives
        1, and a root with two leaf children gives 2.

    Notes
    -----
    The tree is walked with an explicit stack rather than recursion, so a very
    deep, chain-like tree cannot exceed Python's recursion limit.
    """
    children_left = tree.children_left
    children_right = tree.children_right

    deepest = 0
    # Each entry is (node id, level of that node); the root is node 0 at level 1.
    pending = [(0, 1)]
    while pending:
        node_id, level = pending.pop()
        left = children_left[node_id]
        right = children_right[node_id]
        if left != right:
            # Internal node: both children sit one level further down.
            pending.append((left, level + 1))
            pending.append((right, level + 1))
        else:
            # Leaf: remember the deepest level reached so far.
            deepest = max(deepest, level)
    return deepest

# Depth (in levels) of every tree in the fitted forest
[dectree_max_depth(estimator.tree_) for estimator in rf.estimators_]

[5, 5, 4, 5, 4, 4, 5, 5, 4, 5]

## Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

In [14]:
# 5-fold cross-validation of the forest on the training half (default scorer)
scores = cross_val_score(
    estimator=rf,
    X=x_train,
    y=y_train,
    cv=5,
)
scores

array([0.93333333, 1.        , 0.93333333, 0.93333333, 1.        ])

In [15]:
# Report the mean fold score with a band of two standard deviations either
# side (a rough 95% interval if the fold scores are about normally distributed)
mean_score = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_sigma))

Accuracy: 0.96 (+/- 0.07)


In [32]:
# Same 5-fold cross-validation, scored by ROC AUC instead of the default scorer
scores = cross_val_score(
    estimator=rf,
    X=x_train,
    y=y_train,
    cv=5,
    scoring='roc_auc',
)
scores

array([0.91049383, 1.        , 0.93333333, 1.        , 1.        ])