In [3]:
import warnings
warnings.filterwarnings("ignore")

# 앙상블 학습과 랜덤 포레스트

## 투표 기반 분류기
각각 다른 알고리즘으로 학습시킨 독립적인 분류기의 결과를 종합하여 예측.
- hard voting : 예측의 다수결을 클래스로 예측하는 방식.
- soft voting : 각 분류기가 각 클래스에 해당하는 확률을 예측하고 이를 평균내어 확률이 가장 높은 클래스로 예측하는 방식.

In [26]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=1000, noise=0.30, random_state=44)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=44)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [30]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.88
RandomForestClassifier 0.916
SVC 0.924
VotingClassifier 0.928


In [28]:
svm_clf_prob = SVC(probability=True)

voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf_prob)],
    voting='soft'
)

In [29]:
for clf in (log_clf, rnd_clf, svm_clf_prob, voting_clf_soft):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.88
RandomForestClassifier 0.916
SVC 0.924
VotingClassifier 0.936


## 배깅과 페이스팅
- 같은 알고리즘으로 각각 훈련 데이터의 서브셋을 훈련하여 분류기를 다르게 학습시킨 후 결과를 종합하는 방식 
    - 이 때 서브셋을 구성하는 방식이 복원추출일 경우 배깅(Bagging; Bootstrap aggregating)
    - 비복원추출일 경우 페이스팅(Pasting)
- 각 예측기의 결과를 모을 때, 분류는 최빈값, 회귀는 평균값을 보통 사용한다.
- 학습과 예측 모두 병렬로 수행이 가능하기 때문에(=확장성) 인기가 많다.

In [31]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples = 100, bootstrap=True, n_jobs=-1
) # decision tree 500개를 각 100개의 데이터로 학습시킨 후 결과를 종합할 것임. 페이스팅의 경우 bootstrap=False. n_jobs : 사용할 CPU개수. -1일 경우 가용한 모든 코어.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [32]:
y_pred = bag_clf.predict(X_test)

In [33]:
accuracy_score(y_test, y_pred)

0.936

### oob 평가
- Bagging을 사용하면 중복을 허용하기 때문에 평균적으로 훈련 데이터의 63%만 샘플링되는데, 이 때 37%의 훈련 샘플을 oob샘플(out-of-bag)이라고 한다.
- 이 oob샘플을 검증 세트로 활용할 수 있다.

In [34]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, 
    oob_score=True
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.892

In [35]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [36]:
bag_clf.oob_decision_function_

array([[1.        , 0.        ],
       [0.97959184, 0.02040816],
       [0.90340909, 0.09659091],
       ...,
       [0.01197605, 0.98802395],
       [0.        , 1.        ],
       [0.59793814, 0.40206186]])

## 랜덤 패치와 랜덤 서브스페이스
BaggingClassifier는 특성 샘플링을 지원한다. 즉, 무작위로 특성 일부분을 선택하여 훈련시킬 수 있다.
- Random paches method : 특성과 훈련 데이터 둘 다 샘플링
- Random subspace method : 특성만 샘플링

## 랜덤 포레스트
Bagging + Decision tree

In [37]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [38]:
accuracy_score(y_test, y_pred_rf)

0.936

In [40]:
# Bagging + Decision tree
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="auto", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True
)

In [41]:
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred_bag)

0.936

### 익스트림 랜덤 트리(Extremely randomized trees) = 엑스트라 트리(Extra-trees)
랜덤 포레스트의 특성 일부분 선택 + 분할할 때 최적의 임계값을 찾는 대신 무작위로 분할한다.(속도 매우 빠름)

### 특성 중요도

In [42]:
from sklearn.datasets import load_iris
iris=load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09949692390743267
sepal width (cm) 0.021667504109461156
petal length (cm) 0.4564577091103672
petal width (cm) 0.42237786287273904


## 부스팅
약한 학습기를 여러 개 연결하여 강한 학습기를 만드는 앙상블 방법. 뒤의 모델은 앞의 모델의 오류를 특히 보완하면서 학습한다. 
- 배깅과 페이스팅과는 다르게 직렬적이므로 확장성은 높지 않다.

### AdaBoost(adaptive boosting)
- 이전 모델이 과소적합했던 훈련 샘플에 가중치를 높여 학습한다.

In [43]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5 #SAMME.R : 다중클래스 확률추정
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [44]:
y_pred_ada = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred_ada)

0.924

### Gradient Boosting
AdaBoosting과는 달리 반복마다 각 샘플의 가중치를 갱신하는 것이 아니라 이전 예측기가 만든 잔여오차(residual error)에 예측기를 학습
- Stochastic gradient boosting : subsample=0.25 옵션을 추가하면 무작위로 선택된 25%의 훈련 샘플로만 학습한다.

#### GBRT(gradient boosted regression tree)

In [46]:
import numpy as np
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [48]:
# gbrt 구현
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [50]:
y2 = y-tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [51]:
y3 = y2-tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [58]:
X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.66543447])

In [60]:
# GBRT 와 같은 gradient boosting
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X,y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [61]:
gbrt.predict(X_new)

array([0.66543447])

In [63]:
# 최적의 트리 수 찾기 - 일정 트리 수까지 훈련 시키본 뒤 검증 셋 에러가 낮은 트리 수를 선택
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators = bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=79,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [73]:
np.sqrt(np.min(errors))

0.05832821706688597

In [64]:
# 최적의 트리 수 찾기 - 조기종료
gbrt = GradientBoostingRegressor(max_depth=2, warm_start = True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else :
        error_going_up +=1
        if error_going_up == 5: # 일정 시간동안 검증 에러가 계속 증가할 경우
            break # 조기종료

In [65]:
n_estimators

84

In [72]:
np.sqrt(min_val_error)

0.05832821706688597

### XGBoost
최적화된 gradient boosting 구현 라이브러리.

In [71]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
np.sqrt(mean_squared_error(y_val, y_pred))

0.06448539460889843

In [70]:
# 조기 종료
xgb_reg.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.20223
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.15072
[2]	validation_0-rmse:0.11422
[3]	validation_0-rmse:0.09006
[4]	validation_0-rmse:0.07570
[5]	validation_0-rmse:0.06718
[6]	validation_0-rmse:0.06281
[7]	validation_0-rmse:0.06053
[8]	validation_0-rmse:0.05975
[9]	validation_0-rmse:0.05961
[10]	validation_0-rmse:0.05962
[11]	validation_0-rmse:0.05976
Stopping. Best iteration:
[9]	validation_0-rmse:0.05961



## 스태킹(Stacking; stacked generalization)
여러 예측기의 예측을 간단히 수집하는 대신 취합하는 모델을 훈련시키는 앙상블 방법.
- 마지막에 예측기를 취합하는 예측기를 블렌더blender 또는 meta learner라고 한다.
- 학습 방법(2개의 레이어로 구성할 때)
    + 훈련 데이터셋을 2개의 서브셋으로 나눈다. 
    + 첫번째 서브셋으로 여러 예측기를 훈련시킨다.
    + 훈련시킨 여러 예측기에 두번째 서브셋에 대한 예측값을 만든다.
    + 그 예측값을 입력으로 blender를 훈련시킨다.

### 연습문제

8.앙상블
- MNIST 데이터셋. train/val/test set으로 나누고(50000,10000,10000)
- 랜덤포레스트, 엑스트라 트리, SVM 여러 분류기 훈련
- 더 높은 성능을 내도록 간접/직접 투표 방법으로 앙상블 연결.
- 테스트세트로 성능이 얼마나 높아졌는가 확인

In [74]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

mnist = fetch_openml("mnist_784", version=1)

In [75]:
mnist.keys()

dict_keys(['data', 'url', 'feature_names', 'target_names', 'details', 'target', 'categories', 'frame', 'DESCR'])

In [76]:
X_train_val, X_test, y_train_val, y_test = train_test_split(mnist["data"], mnist["target"], test_size=10000, random_state=44)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=10000, random_state=44)

In [81]:
import collections
collections.Counter(mnist["target"])

Counter({'0': 6903,
         '1': 7877,
         '2': 6990,
         '3': 7141,
         '4': 6824,
         '5': 6313,
         '6': 6876,
         '7': 7293,
         '8': 6825,
         '9': 6958})

In [84]:
from sklearn.ensemble import RandomForestClassifier

rdf_clf = RandomForestClassifier()
rdf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
rdf_train_pred = rdf_clf.predict(X_train)
rdf_val_pred = rdf_clf.predict(X_val)
accuracy_score(y_train, rdf_train_pred), accuracy_score(y_val, rdf_val_pred)

(1.0, 0.9651)

In [86]:
from sklearn.ensemble import ExtraTreesClassifier

ext_clf = ExtraTreesClassifier()
ext_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [87]:
ext_train_pred = ext_clf.predict(X_train)
ext_val_pred = ext_clf.predict(X_val)
accuracy_score(y_train, ext_train_pred), accuracy_score(y_val, ext_val_pred)

(1.0, 0.9703)

In [108]:
from sklearn.svm import SVC

svm_clf = SVC(probability=True)
svm_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [109]:
svm_train_pred = svm_clf.predict(X_train)
svm_val_pred = svm_clf.predict(X_val)
accuracy_score(y_train, svm_train_pred), accuracy_score(y_val, svm_val_pred)

(0.98948, 0.9762)

In [110]:
## 간단하게 다시 
# estimators = [rdf_clf, ext_clf, svm_clf]
# for estimator in estimators:
#     estimator.fit(X_train, y_train)

In [111]:
from sklearn.ensemble import VotingClassifier

hard_voting_clf = VotingClassifier(
    estimators=[('randomforest', rdf_clf), ('extratree', ext_clf), ('svm', svm_clf)],
    voting="hard"
)
hard_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('randomforest',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0

In [112]:
hard_voting_clf.score(X_train, y_train), hard_voting_clf.score(X_val, y_val)

(1.0, 0.9714)

In [113]:
hard_voting_clf.score(X_test, y_test)

0.9731

In [114]:
#soft voting
hard_voting_clf.voting="soft"

In [115]:
hard_voting_clf.score(X_train, y_train), hard_voting_clf.score(X_val, y_val)

(1.0, 0.977)

In [116]:
hard_voting_clf.score(X_test, y_test)

0.9775

In [117]:
estimators = [rdf_clf, ext_clf, svm_clf]
[estimator.score(X_test, y_test) for estimator in estimators]

[0.9662, 0.9703, 0.9784]

9.스태킹
- 8에서 각 분류기에 검증 세트를 넣어 예측값 만들기
- 이 예측값으로 MNIST에 대한 블렌더 훈련
- 테스트 세트로 평가.

In [99]:
rdf_val_pred

array(['0', '0', '1', ..., '4', '7', '5'], dtype=object)

In [102]:
# 예측값  rdf_val_pred, ext_val_pred, svm_val_pred
X_val_pred = np.c_[rdf_val_pred, ext_val_pred, svm_val_pred]
X_val_pred = X_val_pred.astype(np.float32)

In [103]:
X_val_pred

array([[0., 0., 0.],
       [0., 0., 0.],
       [1., 1., 1.],
       ...,
       [4., 4., 4.],
       [7., 7., 7.],
       [5., 5., 5.]], dtype=float32)

In [106]:
# random forest blender
rnd_blender = RandomForestClassifier(oob_score=True, random_state=44)
rnd_blender.fit(X_val_pred, y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=44, verbose=0,
                       warm_start=False)

In [107]:
rnd_blender.oob_score_

0.9729

In [121]:
X_test_pred = np.c_[[estimator.predict(X_test) for estimator in estimators]]
X_test_pred = X_test_pred.astype(np.float32)
X_test_pred

array([[8., 4., 6., ..., 9., 3., 3.],
       [8., 4., 6., ..., 9., 3., 3.],
       [8., 4., 6., ..., 9., 3., 3.]], dtype=float32)

In [118]:
y_pred = rnd_blender.predict(X_test_pred)

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jo

In [None]:
accuracy_score(y_test, y_pred)