#### Ensemble methods

The goal of ensemble methods is to combine the predictions of

**several base estimators** built with a given learning algorithm

in order to

improve **generalizability** or **robustness** over a single estimator.

https://scikit-learn.org/stable/modules/ensemble.html

### Random Forests

A forest is comprised of trees. It is said that the more trees it has, the more robust a forest is. Random forests creates decision trees on randomly selected data samples, gets prediction from each tree and selects the best solution by means of voting. It also provides a pretty good indicator of the feature importance.

https://www.datacamp.com/community/tutorials/random-forests-classifier-python

In [None]:
!pip install scikit-learn

In [None]:
# check scikit-learn version
import sklearn
print(sklearn.__version__)

0.22.2.post1


We use sinthetic data, generated using scikit-learn

In [None]:
# Generate test classification dataset: Binary (2 classes)

from sklearn.datasets import make_classification

# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=77)

# summarize the dataset
print(X.shape, y.shape)

(1000, 20) (1000,)


In [None]:
print("Labels:", y[0:10])

Labels: [1 1 0 0 1 0 0 0 1 0]


Classification using Random Forests:

In [None]:
# evaluate random forest algorithm for classification

from numpy import mean
from numpy import std

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# define the model
model = RandomForestClassifier(n_estimators=200)

# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=77)

n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print('Accuracy: %.3f (SD: %.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.881 (SD: 0.040)


### Overall code to get PREDICTION from Random Forests
Classification Model with 200 Trees

In [None]:
# make predictions using random forest for classification
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=9)

# define the model
model = RandomForestClassifier(n_estimators=200)

# fit the model on the whole dataset
model.fit(X, y)

# make a single prediction
# row with 20 features
row = [[-8.52381793,5.24451077,-12.14967704,-2.92949242,0.99314133,0.67326595,-0.38657932,1.27955683,-0.60712621,3.20807316,0.60504151,-1.38706415,8.92444588,-7.43027595,-2.33653219,1.10358169,0.21547782,1.05057966,0.6975331,0.26076035]]

yhat = model.predict(row)

print('Predicted Class: %d' % yhat[0])

Predicted Class: 1


Regression using Random Forests:

In [None]:
# Generate test regression dataset
from sklearn.datasets import make_regression

# define dataset
X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=77)

# summarize the dataset
print(X.shape, y.shape)

(1000, 20) (1000,)


In [None]:
import pandas as pd

pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.00168,0.047861,0.042808,0.071955,-0.00082,0.039463,0.061421,-0.000722,-0.022747,-0.012499,-0.00701,-0.053479,-0.006119,-0.031879,-0.019487,-0.036208,-0.002341,-0.026358,-0.047083,0.000396
std,1.010132,1.00363,0.973507,0.97818,1.031268,1.010109,1.015928,1.009602,0.987956,1.011752,1.023893,0.986319,0.994881,1.001909,0.992683,0.965033,1.000694,1.038032,1.019816,1.033563
min,-3.685407,-3.66156,-3.186224,-3.078953,-3.431684,-3.480617,-3.6702,-3.069654,-2.983639,-3.71306,-2.98777,-3.456672,-3.403492,-3.223274,-3.102498,-3.241268,-3.067751,-3.45043,-3.331612,-2.742895
25%,-0.663656,-0.650605,-0.594045,-0.654148,-0.665691,-0.649503,-0.601936,-0.639096,-0.695635,-0.691588,-0.732475,-0.735897,-0.714814,-0.685395,-0.697103,-0.677779,-0.668926,-0.70887,-0.755766,-0.696134
50%,0.022089,0.047379,0.036484,0.080475,-0.048993,0.023088,0.064363,-0.02417,-0.030177,-0.029074,-0.020065,-0.051578,0.027672,-0.039845,-0.003084,-0.048329,-0.027257,-0.031991,-0.064566,0.008449
75%,0.708693,0.728104,0.661573,0.757107,0.683045,0.718467,0.748773,0.695651,0.636253,0.689483,0.670109,0.621787,0.638292,0.651914,0.678319,0.632027,0.676456,0.690135,0.638493,0.670833
max,3.438819,3.368433,2.934838,3.083221,3.66776,3.808858,3.561689,3.309026,3.080305,3.649842,2.90294,3.600465,2.857753,3.556902,2.914743,3.900589,4.073159,2.61322,3.06028,3.448056


In [None]:
print("Labels:", y[0:5])

Labels: [ 222.53747118   42.24564042  -23.47820771 -182.73759726    5.59466015]


In [None]:
# evaluate random forest ensemble for regression
from numpy import mean
from numpy import std

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

# define the model

model = RandomForestRegressor(n_estimators=200)

# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print('Mean Absolute Error: %.3f (SD: %.3f)' % (mean(n_scores), std(n_scores)))

Mean Absolute Error: -114.204 (SD: 7.973)


### Overall code to get PREDICTION from Random Forests
Regression Model with 200 Trees

In [None]:
# random forest for making predictions for regression
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# define dataset
X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=9)

# define the model
model = RandomForestRegressor(n_estimators=200)

# fit the model on the whole dataset
model.fit(X, y)

# make a single prediction
# row with 20 features

row = [[-0.89483109,-1.0670149,-0.25448694,-0.53850126,0.21082105,1.37435592,0.71203659,0.73093031,-1.25878104,-2.01656886,0.51906798,0.62767387,0.96250155,1.31410617,-1.25527295,-0.85079036,0.24129757,-0.17571721,-1.11454339,0.36268268]]

yhat = model.predict(row)

print('Prediction: %d' % yhat[0])

Prediction: 5


## Using Iris Dataset

In [None]:
from sklearn import datasets

iris = datasets.load_iris()

In [None]:
print("Feature Names:", iris.feature_names)
print("Label Names:", iris.target_names)

Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Label Names: ['setosa' 'versicolor' 'virginica']


In [None]:
# print the iris data (top 5 records)
print(iris.data[0:5][0])

[5.1 3.5 1.4 0.2]


In [None]:
# print the iris labels (0:setosa, 1:versicolor, 2:virginica)
print(iris.target[0:10])

[0 0 0 0 0 0 0 0 0 0]


In [None]:
# We can create a DataFrame of the iris dataset

import pandas as pd
data=pd.DataFrame({
    'sepal length':iris.data[:,0],
    'sepal width':iris.data[:,1],
    'petal length':iris.data[:,2],
    'petal width':iris.data[:,3],
    'species':iris.target
})
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
from sklearn.model_selection import train_test_split

X=data[['sepal length', 'sepal width', 'petal length', 'petal width']]  # Features
y=data['species']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
RFM =RandomForestClassifier(n_estimators=50)

#Train the model using the training sets y_pred=clf.predict(X_test)
RFM.fit(X_train,y_train)

y_pred=RFM.predict(X_test)

In [None]:
from sklearn import metrics
print("Accuracy: {:.4f}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy: 0.9556


In [None]:
print("Predicted class:", RFM.predict([[3, 5, 4, 2]])[0])

Predicted class: 2


In [None]:
pd.Series(RFM.feature_importances_,
         index=iris.feature_names).sort_values(ascending=False)

petal width (cm)     0.558139
petal length (cm)    0.331957
sepal length (cm)    0.087136
sepal width (cm)     0.022768
dtype: float64

As you can see,

**sepal length & sepal width**

are not so much important.

We can build ML model using only two important features.

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset into features and labels
X=data[['petal length', 'petal width',]]
y=data['species']

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5) # 70% training and 30% test

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
RFM =RandomForestClassifier(n_estimators=1000)

#Train the model using the training sets y_pred=clf.predict(X_test)
RFM.fit(X_train,y_train)

y_pred=RFM.predict(X_test)

In [None]:
from sklearn import metrics
print("Accuracy: {:.4f}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy: 0.9778


In [None]:
print("Predicted class:", RFM.predict([[3, 5]])[0])

Predicted class: 2
