## bagging，boosting，stacking

## Voting classifiers

In [40]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Build a noisy two-moons toy dataset and hold part of it out for testing.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Shapes of every split, followed by the number of positive labels in each.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))
print(sum(y_train), sum(y_test))

(375, 2) (375,) (125, 2) (125,)
186 64


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners, each seeded for reproducibility.
lr_clf = LogisticRegression(random_state=42)
rfc_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(random_state=42)

# No `voting` argument is passed, so the ensemble uses the estimator's default mode.
base_estimators = [('lr', lr_clf), ('rfc', rfc_clf), ('svc', svc_clf)]
voting_clf = VotingClassifier(estimators=base_estimators)

from sklearn.metrics import accuracy_score

# Fit every base learner and then the ensemble itself (the loop below is what
# trains voting_clf), printing the test accuracy of each.
for clf in (lr_clf, rfc_clf, svc_clf, voting_clf):
    y_test_pred = clf.fit(X_train, y_train).predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_test_pred))



LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.896


  if diff:


In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Same three base learners, but the SVC is built with probability=True so that
# it can supply class probabilities to the soft-voting ensemble.
lr_clf = LogisticRegression(random_state=42)
rfc_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(probability=True, random_state=42)

named_estimators = [('lr', lr_clf), ('rfc', rfc_clf), ('svc', svc_clf)]
voting_clf = VotingClassifier(estimators=named_estimators, voting='soft')

from sklearn.metrics import accuracy_score

# Train each model (the ensemble is fitted last, inside this loop) and print
# its accuracy on the held-out test set.
for clf in (lr_clf, rfc_clf, svc_clf, voting_clf):
    y_test_pred = clf.fit(X_train, y_train).predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_test_pred))


LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912


  if diff:


### Bagging，Pasting ensembles

In [44]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees with max_samples=100 per tree.
bagging_options = dict(
    n_estimators=500,
    max_samples=100,
    bootstrap=True,    # set to False to get pasting instead of bagging
    n_jobs=-1,
    random_state=42,
    oob_score=True,    # request the out-of-bag evaluation
)
bag_clf = BaggingClassifier(DecisionTreeClassifier(), **bagging_options)
bag_clf.fit(X_train, y_train)

# Out-of-bag score first, then the accuracy on the held-out test set.
print(bag_clf.oob_score_)
# print(bag_clf.oob_decision_function_[:3])
y_test_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_test_pred))

0.9253333333333333
[[0.35849057 0.64150943]
 [0.43513514 0.56486486]
 [1.         0.        ]]
0.904


### 随机森林

In [47]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 500 trees, each capped at 16 leaf nodes.
forest_params = {
    'n_estimators': 500,
    'max_leaf_nodes': 16,
    'n_jobs': -1,
    'random_state': 42,
}
rfc_clf = RandomForestClassifier(**forest_params)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_rfc = rfc_clf.fit(X_train, y_train).predict(X_test)



### 极端随机树

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extremely randomized trees with the same size limits as the random-forest cell
# (500 trees, at most 16 leaf nodes each). Seeded so that reruns give the same model.
etc_clf = ExtraTreesClassifier(n_estimators=500,
                               max_leaf_nodes=16,
                               n_jobs=-1,
                               random_state=42)
etc_clf.fit(X_train, y_train)

# Keep these predictions under their own name: writing them to y_pred_rfc would
# silently overwrite the random-forest predictions.
y_pred_etc = etc_clf.predict(X_test)

### 特征重要性

In [34]:
from sklearn.datasets import load_iris
# Fit a random forest on the full iris dataset and report how much each feature
# contributes. The forest is seeded so the printed importances are reproducible.
iris = load_iris()

rfc_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rfc_clf.fit(iris.data, iris.target)

# feature_importances_ is aligned with the column order of iris.data.
for name, score in zip(iris.feature_names, rfc_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09393846613842086
sepal width (cm) 0.024471133109624225
petal length (cm) 0.41997543623329353
petal width (cm) 0.4616149645186617


# 提升（Boosting）：假设增强
## 按顺序训练分类器，每个后续分类器都尝试修正前一个分类器的错误
## AdaBoost, Gradient Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (depth-1 trees), with the learning rate set to 0.5.
# NOTE(review): recent scikit-learn releases deprecate algorithm='SAMME.R';
# drop the argument if the installed version rejects it.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=200,
                             algorithm='SAMME.R',
                             learning_rate=0.5)

ada_clf.fit(X_train, y_train)

#### 梯度提升

In [49]:
import numpy as np
# Synthetic regression data: 100 points with x in [-0.5, 0.5) and y = 3x plus
# Gaussian noise scaled by 0.05. The global seed makes the draw repeatable.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] + 0.05 * np.random.randn(100)

from sklearn.tree import DecisionTreeRegressor

# Stage 1 fits the targets directly; every later stage fits the residuals that
# the previous stage left behind.
tree_reg1 = DecisionTreeRegressor(max_depth=2).fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2).fit(X, y2)

y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2).fit(X, y3)

# The ensemble prediction is the sum of the three stage predictions.
X_new = np.array([[0.8]])
stages = (tree_reg1, tree_reg2, tree_reg3)
y_pred = sum(stage.predict(X_new) for stage in stages)
y_pred

array([1.36995856])

#### learning_rate 决定了每棵树的贡献；它越小，需要的树就越多

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Three boosting stages of depth-2 trees with learning_rate=1.0, i.e. the same
# configuration as the hand-built three-tree ensemble.
# The class name must match the import exactly: GradientBoostingRegressor.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

#### 使用早停技术，找到树的最优数量

In [53]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Hold out a validation set; the split is seeded so the selected number of
# trees is reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)

gbrt.fit(X_train, y_train)

# staged_predict yields the validation predictions after 1, 2, ..., 120 trees,
# so errors[i] is the validation MSE of the ensemble made of i + 1 trees.
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

# argmin returns a 0-based index; add 1 to turn it into a tree count. Without
# the shift the refit model is one tree short, and n_estimators would be 0
# (an invalid value) whenever the very first stage is the best one.
best_n_estimators = int(np.argmin(errors)) + 1
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=117, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [54]:
# Incremental early stopping: with warm_start=True the estimator keeps the trees
# it has already fitted, so each fit() call only has to add the new ones.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float('inf')
error_going_up = 0  # consecutive rounds without a new best validation error

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)

    if val_error < min_val_error:
        # New best score: record it and reset the patience counter.
        min_val_error, error_going_up = val_error, 0
        continue

    error_going_up += 1
    if error_going_up == 5:
        break  # stop early after 5 rounds in a row without improvement