# Ensemble Learning

Ensemble learning brings two or more methods together to accomplish a learning task

In [31]:
"""
Write a basic ensemble learning algorithm with iris data. Visualize the training results.
"""
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

log_clf = LogisticRegression(random_state=42)
svm_clf = SVC(probability=True, random_state=42)
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)
print('y_pred: ', y_pred)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)

y_pred:  [0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0
 1 0 1 0 0 0 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1
 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 0
 0 0 0 0 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1
 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 0
 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0]
Accuracy:  0.865


## Voting Classifiers

Dataset

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [32]:
voting_clf.fit(X_train, y_train)

In [33]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.77
RandomForestClassifier 0.92
SVC 0.905
VotingClassifier 0.865


Soft voting

In [34]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

In [35]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.77
RandomForestClassifier 0.92
SVC 0.905
VotingClassifier 0.91


## Random Forests Classifiers

Random forests are a type of ensemble learning method that employs a set of decision trees to make predictions by aggregating predictions from individual trees. It improves the precision and resilience of single decision trees. It can be used for both classification and regression tasks. 

In [36]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [37]:
np.sum(y_pred == y_pred_rf) / len(y_pred)  # very similar predictions

0.98

Feature evaluation

In [38]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


In [39]:
rnd_clf.feature_importances_

array([0.11249225, 0.02311929, 0.44103046, 0.423358  ])