<a href="https://colab.research.google.com/github/huiesoo/esaa/blob/main/24-1/0318.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHAPTER 7 앙상블 학습과 랜덤 포레스트

## 7.1 투표 기반 분류기

In [2]:
import warnings
# Keep cell outputs readable by silencing library warnings.
# NOTE(review): this suppresses every warning category, deprecation notices included.
warnings.filterwarnings('ignore')

# import package
import numpy as np
import os

# Load the moons dataset introduced in chapter 5.
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Fixed seeds: without them the generated points and the train/test split
# change on every run, so none of the scores reported below could be reproduced.
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different kinds of base learners for the ensemble.
log_clf = LogisticRegression()
# Seed the random forest so its accuracy (and the ensemble's) is repeatable.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class picked by the majority of the
# base classifiers.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

In [5]:
from sklearn.metrics import accuracy_score

# Refit every base model and the voting ensemble on the training split, then
# print each model's class name next to its test-set accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8
RandomForestClassifier 0.8
SVC 0.8
VotingClassifier 0.8


## 7.2 배깅과 페이스팅

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each trained on 50 instances sampled with
# replacement (bootstrap=True); n_jobs=-1 uses every available core.
bagging_options = dict(n_estimators=500, max_samples=50, bootstrap=True, n_jobs=-1)
bag_clf = BaggingClassifier(DecisionTreeClassifier(), **bagging_options)
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [7]:
# Same bagging setup, but additionally request an out-of-bag evaluation.
# random_state pins the bootstrap samples so the OOB score shown below can be
# reproduced on a re-run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=42
)
bag_clf.fit(X_train, y_train)
# Accuracy estimated from the instances each tree did not see during training
bag_clf.oob_score_

0.925

In [8]:
from sklearn.metrics import accuracy_score
# Score the bagging ensemble on the held-out test split; the value is shown as
# the cell output.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8

In [9]:
# Out-of-bag decision function of the fitted ensemble: one row per training
# instance, one column per class (see the displayed array).
bag_clf.oob_decision_function_

array([[0.        , 1.        ],
       [0.71345029, 0.28654971],
       [0.92814371, 0.07185629],
       [0.85074627, 0.14925373],
       [0.66666667, 0.33333333],
       [0.97237569, 0.02762431],
       [0.9047619 , 0.0952381 ],
       [0.68926554, 0.31073446],
       [0.        , 1.        ],
       [0.99456522, 0.00543478],
       [0.99425287, 0.00574713],
       [1.        , 0.        ],
       [0.06698565, 0.93301435],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.94791667, 0.05208333],
       [0.01123596, 0.98876404],
       [0.01020408, 0.98979592],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.01875   , 0.98125   ],
       [0.00537634, 0.99462366],
       [0.01744186, 0.98255814],
       [0.97714286, 0.02285714],
       [0.        , 1.        ],
       [0.01595745, 0.98404255],
       [0.99415205, 0.00584795],
       [0.97777778, 0.02222222],
       [0.00549451, 0.99450549],
       [1.

## 7.3 랜덤 패치와 랜덤 서브스페이스

## 7.4 랜덤 포레스트

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)

# Fit on the training split and keep the test-set predictions.
y_pred_rf = rnd_clf.fit(X_train, y_train).predict(X_test)

In [11]:
# Bagging ensemble configured to roughly mirror the RandomForestClassifier
# settings (500 trees, 16 leaf nodes, full-size bootstrap samples).
# max_features="sqrt" is the explicit spelling of what "auto" meant for
# classifiers; newer scikit-learn releases no longer accept "auto".
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

In [12]:
from sklearn.datasets import load_iris
iris = load_iris()
# Fixed seed so the importance values printed below are reproducible.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
# Pair every feature name with the importance the fitted forest assigned to it.
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09587276296514066
sepal width (cm) 0.02410318438695025
petal length (cm) 0.43393571294396965
petal width (cm) 0.4460883397039394


## 7.5 부스팅



*   부스팅
    *   약한 학습기를 여러 개 연결하여 강한 학습기를 만드는 앙상블 방법
    *   앞의 모델을 보완해나가면서 일련의 예측기를 학습시키는 것



### 7.5.1 에이다부스트



*   이전 예측기를 보완하는 새로운 예측기를 만드는 방법은 이전 모델이 과소적합했던 훈련 샘플의 가중치를 더 높이는 것 -> 새로운 예측기는 학습하기 어려운 샘플에 점점 더 맞춰지게 됨



In [13]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (depth-1 trees) with learning_rate=0.5.
# The boosting variant is left at the library default: an explicit
# algorithm="SAMME.R" is rejected by recent scikit-learn releases, while the
# older releases that support it already use it by default, so omitting the
# argument runs on both.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.5)
ada_clf.fit(X_train, y_train)

### 7.5.2 그레이디언트 부스팅



*   앙상블에 이전까지의 오차를 보정하도록 예측기를 순차적으로 추가
*   이전 예측기가 만든 잔여 오차에 새로운 예측기를 학습시킴

*   그레이디언트 트리 부스팅, 그레이디언트 부스티드 회귀 트리



In [21]:
# Synthetic regression data: inputs drawn uniformly from [-0.5, 0.5) and
# targets 3*x^2 plus Gaussian noise (std 0.05). The seed fixes both arrays.
np.random.seed(42)
n_samples = 100
X = np.random.rand(n_samples, 1) - 0.5
noise = 0.05 * np.random.randn(n_samples)
y = 3 * X[:, 0] ** 2 + noise

In [22]:
from sklearn.tree import DecisionTreeRegressor

# Stage 1: fit a depth-2 regression tree directly on the targets y.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

In [23]:
# Stage 2: the new targets are the residual errors left by the first tree.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

In [24]:
# Stage 3: fit a third tree on what is still unexplained after the second tree.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

In [25]:
# Single query point, shaped (1, 1) to match the one-feature training matrix.
# NOTE(review): 0.8 lies outside the [-0.5, 0.5) range X was sampled from.
X_new = np.array([[0.8]])

In [26]:
# Ensemble prediction: the first tree's output plus the two residual
# corrections predicted by the later trees.
y_pred = (
    tree_reg1.predict(X_new)
    + tree_reg2.predict(X_new)
    + tree_reg3.predict(X_new)
)

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

# Library counterpart of the manual procedure: three depth-2 trees, each
# contributing at full weight (learning_rate=1.0).
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)



*   `learning_rate` 매개변수가 각 트리의 기여 정도를 조절. 이를 0.1처럼 낮게 설정하면 앙상블을 훈련 세트에 학습시키기 위해 더 많은 트리가 필요하지만, 일반적으로 예측 성능은 좋아짐 => 축소(shrinkage)라고 부르는 규제 방법



In [29]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a validation set. The seed keeps the split, and therefore the
# selected number of trees, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# staged_predict yields the validation predictions after each boosting stage,
# so a single 120-tree fit is enough to score every ensemble size.
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
# argmin is 0-based while tree counts start at 1; keep the count a plain int.
bst_n_estimators = int(np.argmin(errors)) + 1

# Retrain with exactly the number of trees that minimised the validation MSE.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

In [31]:
# Incremental early stopping: with warm_start=True the estimator is refitted
# with one more tree per iteration, and training stops once the validation
# MSE has failed to improve five times in a row.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if not val_error < min_val_error:
        # No new best: count the miss and stop at the fifth consecutive one.
        error_going_up += 1
        if error_going_up == 5:
            break
        continue
    # New best validation error: remember it and reset the miss counter.
    min_val_error = val_error
    error_going_up = 0



*   GradientBoostingRegressor: 각 트리가 훈련할 때 사용할 훈련 샘플의 비율을 지정할 수 있는 subsample 매개변수 지원 -> 편향이 높아지는 대신 분산이 낮아짐. 훈련 속도를 상당히 높임 => 확률적 그레이디언트 부스팅



In [32]:
import xgboost

# Baseline XGBoost regressor with library defaults: fit on the training split
# and predict the validation inputs.
xgb_reg = xgboost.XGBRegressor()
y_pred = xgb_reg.fit(X_train, y_train).predict(X_val)

In [33]:
# Early stopping is configured on the estimator: current xgboost releases no
# longer accept early_stopping_rounds as a fit() argument. Training halts once
# the validation metric has not improved for 2 consecutive rounds.
xgb_reg = xgboost.XGBRegressor(early_stopping_rounds=2)
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.20391
[1]	validation_0-rmse:0.15822
[2]	validation_0-rmse:0.12129
[3]	validation_0-rmse:0.09943
[4]	validation_0-rmse:0.08471
[5]	validation_0-rmse:0.07288
[6]	validation_0-rmse:0.06541
[7]	validation_0-rmse:0.06082
[8]	validation_0-rmse:0.05825
[9]	validation_0-rmse:0.05654
[10]	validation_0-rmse:0.05598
[11]	validation_0-rmse:0.05572
[12]	validation_0-rmse:0.05564
[13]	validation_0-rmse:0.05587


## 7.6 스태킹