In [1]:
# Ensemble Learning
# - aggregate the predictions of a group of predictors -> often gives better predictions than the single best predictor
# - an Ensemble Learning algorithm is called an Ensemble method

# E.g. can train a group of DT classifiers, each on a different random subset of the training set
# - to make a prediction - predicts the class that gets the most votes 
# -> such ensemble of DTs: Random Forest - very simple but one of the most powerful ML algorithms

In [2]:
# Voting Classifiers
# create multiple classifiers and aggregate the predictions of each classifier and predict the class that gets the most votes
# -> called hard voting classifier

# often achieves a higher accuracy than the best classifier in the ensemble
# even if each classifier is a weak learner (slightly better than random guessing) - the ensemble can still be a strong learner (given enough classifiers)
# works due to the law of large numbers

In [4]:
# Creating and training a hard-voting classifier in Scikit-Learn:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic "moons" dataset; both the data generation and the train/test
# split are seeded so every run works on exactly the same samples.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three different model families to combine. Each one is seeded (like the
# soft-voting cell further down) so that a fresh Restart & Run All yields
# the same fitted models; without a seed the random forest in particular
# can give different accuracies from run to run.
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

# Hard voting: the ensemble predicts the class that gets the most votes
# among the individual classifiers.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

In [5]:
# Test-set accuracy of each individual model and of the voting ensemble:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # fit() hands back the fitted estimator, so training and predicting chain.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.904


In [6]:
# The voting classifier slightly outperforms all individual classifiers

In [7]:
# Soft voting:
# - if all classifiers can estimate class probabilities (i.e. they have a predict_proba() method)
# - Scikit-Learn allows you to predict the class with the highest probability, averaged over all the individual classifiers (soft voting)
# often better than hard voting, since it gives more weight to highly confident votes - replace voting="hard" with voting="soft":

# Rebuild the three base models with explicit, seeded settings.
log_clf = LogisticRegression(
    solver="lbfgs",
    random_state=42,
)
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# probability=True is required here: soft voting needs class probabilities
# from every member of the ensemble.
svm_clf = SVC(
    gamma="scale",
    probability=True,
    random_state=42,
)

# Soft voting: predict the class with the highest probability, averaged
# over all the individual classifiers.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
)
voting_clf.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score

# Refit each model on the training split and report its test-set accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # fit() hands back the fitted estimator, so training and predicting chain.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


In [None]:
# The prediction accuracy jumped to 92%
# Make sure all classifiers can estimate class probabilities -> not the case for SVC by default
# -> set its probability hyperparameter to True