In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Noisy two-moons toy dataset; seeded so every run yields the same points.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)

# Hold out 20% of the samples as a test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners, each with a fixed seed.
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

# Hard-voting ensemble built from the three learners above.
voting_clf = VotingClassifier(
    estimators=[
        ("lc", log_clf),
        ("rf", rf_clf),
        ("svm", svm_clf),
    ],
    voting="hard",
)

In [4]:
# Fit on the training split only, so that X_test / y_test remain unseen
# hold-out data for this ensemble (fitting on the full X, y would leak it).
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lc', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svm', SVC(random_state=42))])

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Report the mean 3-fold cross-validated accuracy of each base learner
# and of the voting ensemble.
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    scores = cross_val_score(clf, X, y, scoring="accuracy", cv=3)
    print(type(clf).__name__, ":", scores.mean())

LogisticRegression : 0.8379866772479138
RandomForestClassifier : 0.891975085972633
SVC : 0.9219993747444869
VotingClassifier : 0.905995238438785


In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: each of the 500 trees is trained on 100 instances drawn
# with replacement (bootstrap=True) from the training set.
bg = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

# Accuracy of the bagged ensemble on the held-out test split.
y_pred = bg.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [7]:
# Single-tree baseline to compare against the ensembles.
# random_state is fixed like every other estimator in this notebook: a tree
# permutes the features at each split, so an unseeded fit (and therefore the
# reported accuracy) can change from run to run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Accuracy of the single tree on the held-out test split.
y_pred_tree = dtc.predict(X_test)
accuracy_score(y_test, y_pred_tree)

0.83

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rfc = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    random_state=42,
).fit(X_train, y_train)

y_pred_rfc = rfc.predict(X_test)

In [9]:
# Bagging ensemble of 500 trees configured to resemble the forest above:
# each tree uses max_features="sqrt" and at most 16 leaf nodes.
bag = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    random_state=42,
).fit(X_train, y_train)

y_pred_bag = bag.predict(X_test)

In [10]:
# True when the forest and the bagging ensemble agree on every test prediction.
not np.any(y_pred_rfc != y_pred_bag)

True

In [11]:
# Alternative check: count the positions where the two predictions disagree.
# (Summing the signed differences is not a valid equality test, because a +1
# and a -1 mismatch cancel each other out and the sum can be 0 anyway.)
np.count_nonzero(y_pred_rfc != y_pred_bag) == 0

True

## Gradient Boosting

In [12]:
import xgboost as xgb

# Synthetic regression task: y = 3 * x**2 plus small random noise,
# with x = rand - 0.5.
# NOTE: this cell rebinds X, y, X_train, y_train and y_pred, which earlier
# cells bound to the classification data and its predictions.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)

# 80/20 train/validation split (seeded).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the gradient-boosted regressor and predict on the validation split.
xgb_reg = xgb.XGBRegressor(random_state=42).fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

In [13]:
from sklearn.metrics import mean_squared_error
# Validation error of the regressor: mean squared error and its square root.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)
(mse, rmse)

(0.002392175446297006, 0.04890987064281612)