### Random Forest Algorithm Demo

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
iris = load_iris()

In [4]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
len(iris.data)

150

In [6]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

In [9]:
len(iris.target)

150

In [11]:
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

In [12]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [13]:
df['is_train'] = np.random.uniform(0,1, len(df)) < .75

In [17]:
df[-5:]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
145,6.7,3.0,5.2,2.3,virginica,True
146,6.3,2.5,5.0,1.9,virginica,True
147,6.5,3.0,5.2,2.0,virginica,False
148,6.2,3.4,5.4,2.3,virginica,True
149,5.9,3.0,5.1,1.8,virginica,False


In [20]:
train, test = df[df['is_train'] == True], df[df['is_train'] == False]

In [21]:
len(train)

118

In [22]:
len(test)

32

In [23]:
features = df.columns[:4]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [24]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [25]:
train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True
5,5.4,3.9,1.7,0.4,setosa,True
6,4.6,3.4,1.4,0.3,setosa,True
9,4.9,3.1,1.5,0.1,setosa,True
11,4.8,3.4,1.6,0.2,setosa,True
12,4.8,3.0,1.4,0.1,setosa,True


In [26]:
train['species']

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
5         setosa
6         setosa
9         setosa
11        setosa
12        setosa
14        setosa
15        setosa
16        setosa
22        setosa
24        setosa
25        setosa
26        setosa
28        setosa
29        setosa
30        setosa
32        setosa
33        setosa
34        setosa
35        setosa
36        setosa
37        setosa
39        setosa
40        setosa
41        setosa
42        setosa
         ...    
110    virginica
112    virginica
113    virginica
115    virginica
117    virginica
119    virginica
120    virginica
121    virginica
123    virginica
124    virginica
125    virginica
126    virginica
127    virginica
128    virginica
129    virginica
130    virginica
131    virginica
132    virginica
133    virginica
134    virginica
135    virginica
136    virginica
137    virginica
138    virginica
139    virginica
141    virginica
142    virginica
145    virgini

In [31]:
y = pd.factorize(train['species'])[0]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2], dtype=int64)

In [32]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)


In [33]:
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [34]:
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [35]:
clf.predict_proba(test[features])

array([[ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  0.5,  0.5],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.9,  0.1],
       [ 0. ,  0.2,  0.8],
       [ 0. ,  0.3,  0.7],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.8,  0.2],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ]])

In [37]:
clf.predict_log_proba(test[features])

  return np.log(proba)


array([[ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [-0.10536052, -2.30258509,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [ 0.        ,        -inf,        -inf],
       [-0.10536052, -2.30258509,        -inf],
       [ 0.        ,        -inf,        -inf],
       [       -inf, -0.69314718, -0.69314718],
       [       -inf,  0.        ,        -inf],
       [       -inf, -0.10536052, -2.30258509],
       [       -inf, -1.60943791, -0.22314355],
       [       -inf, -1.2039728 , -0.35667494],
       [       -inf,  0.        ,        -inf],
       [       -inf, -0.22314355, -1.60943791],
       [       -inf,        -inf,  0.   

In [38]:
test['species']

7          setosa
8          setosa
10         setosa
13         setosa
17         setosa
18         setosa
19         setosa
20         setosa
21         setosa
23         setosa
27         setosa
31         setosa
38         setosa
52     versicolor
66     versicolor
68     versicolor
70     versicolor
72     versicolor
89     versicolor
98     versicolor
103     virginica
109     virginica
111     virginica
114     virginica
116     virginica
118     virginica
122     virginica
140     virginica
143     virginica
144     virginica
147     virginica
149     virginica
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [39]:
preds = iris.target_names[clf.predict(test[features])]

In [40]:
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [42]:
pd.crosstab(test['species'], preds, rownames='Actual', colnames='predicted')

AssertionError: arrays and names must have the same length