## Ensemble methods with random forest

This is a classification problem in which we will predict the species label of iris flowers.

In [1]:
import numpy as np
import pandas as pd

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the iris dataset that ships with scikit-learn and split it into a
# feature frame (one named column per measurement) and a label frame.
iris = datasets.load_iris()

df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target)

In [5]:
# Name the single target column, then preview the first five rows of the
# features (plain-text print) and of the labels (rendered as the cell output).
y.columns = ['labels']
print(df.head(n=5))
y.head()

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0


The data set contains the following information for each flower:
- sepal length (cm)
- sepal width (cm)  
- petal length (cm)  
- petal width (cm)
- species type

In [7]:
# Report, per feature column, whether it contains any missing value.
# `.any()` already yields booleans, so comparing the result to True is redundant.
df.isna().any()

Unnamed: 0,0
sepal length (cm),False
sepal width (cm),False
petal length (cm),False
petal width (cm),False


In [8]:
# Count how many samples belong to each class label to check class balance.
print(y['labels'].value_counts())

labels
0    50
1    50
2    50
Name: count, dtype: int64


In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=17,
)

In [10]:
# Flatten the single-column training label frame to a 1-D array before fitting.
y_train_array = np.ravel(y_train)

# Build a 200-tree random forest with a fixed seed and fit it on the training
# rows (fit returns the estimator itself), then predict the held-out rows.
classifier = RandomForestClassifier(n_estimators=200, random_state=0).fit(
    x_train, y_train_array
)
y_pred = classifier.predict(x_test)

In [11]:
# classification_report returns a single multi-line string. Print it so the
# newlines render as an aligned table instead of escaped "\n" in the string repr.
print(metrics.classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00         7\n           1       0.92      1.00      0.96        11\n           2       1.00      0.92      0.96        12\n\n    accuracy                           0.97        30\n   macro avg       0.97      0.97      0.97        30\nweighted avg       0.97      0.97      0.97        30\n'

In [12]:
# Show the true labels of the held-out rows (indexed by their original row
# number), and keep a flattened 1-D copy of them in y_test_array.
print(y_test)
y_test_array = np.ravel(y_test)

     labels
16        0
78        1
145       2
99        1
126       2
127       2
84        1
117       2
80        1
113       2
144       2
9         0
55        1
11        0
110       2
19        0
38        0
116       2
100       2
132       2
133       2
28        0
149       2
69        1
66        1
85        1
53        1
73        1
0         0
92        1


In [13]:
# Predicted class labels for the held-out rows, in the same order as x_test.
print(y_pred)

[0 1 2 1 2 2 1 2 1 2 2 0 1 0 2 0 0 2 2 2 1 0 2 1 1 1 1 1 0 1]
