### Bagging
#### 위스콘신 유방암 데이터

- from sklearn.ensemble import BaggingClassifier

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
cancer = datasets.load_breast_cancer()

In [3]:
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, train_size = 0.8, test_size = 0.2, random_state = 156)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(455, 30) (114, 30) (455,) (114,)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score

In [5]:
# Bagging은 분류기 하나만 사용
# Logistic Regression 분류기 단일로 사용할 경우
lr_clf = LogisticRegression(max_iter = 10000)
lr_clf.fit(x_train, y_train)

pred_lr = lr_clf.predict(x_test)

In [7]:
print(accuracy_score(y_test, pred_lr)) # accuracy 결과
print(mean_squared_error(y_test, pred_lr)) # MSE 결과(에러율)

0.9473684210526315
0.05263157894736842


In [8]:
# 생성된 LR 분류기를 Bagging 에 적합시키기
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(base_estimator = lr_clf,  # Logistic Regression 분류기 사용
                           n_estimators = 5,  # 5번 샘플링
                           verbose = 1)  # 학습 과정 표시 - 추가 안해도 무관

In [10]:
lr_clf_bag = bag_clf.fit(x_train, y_train)
pred_lr_bag = lr_clf_bag.predict(x_test) # 학습된 Bagging LR 으로 평가 데이터 예측

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [11]:
pred_lr_bag

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [12]:
print(accuracy_score(y_test, pred_lr_bag)) # accuracy 결과
print(mean_squared_error(y_test, pred_lr_bag)) # MSE 결과(에러율)

0.956140350877193
0.043859649122807015


In [14]:
# DT 분류기 단일로 사용할 경우
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
pred_dt = dt_clf.predict(x_test)

print(accuracy_score(y_test, pred_dt)) 
print(mean_squared_error(y_test, pred_dt)) 

0.9473684210526315
0.05263157894736842


In [15]:
# 생성된 DT 분류기를 Bagging 적합 시키기
bag_clf = BaggingClassifier(base_estimator = dt_clf,  # DT 분류기 사용
                           n_estimators = 5,  # 5번 샘플링
                           verbose = 1)  # 학습 과정 표시 - 추가 안해도 무관

In [16]:
dt_clf_bag = bag_clf.fit(x_train, y_train)
pred_dt_bag = dt_clf_bag.predict(x_test) # 학습된 Bagging DT 으로 평가 데이터 예측

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [17]:
print(accuracy_score(y_test, pred_dt_bag)) # accuracy 결과
print(mean_squared_error(y_test, pred_dt_bag)) # MSE 결과(에러율)

0.9649122807017544
0.03508771929824561


### RandomForest

- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators = 5,
                                max_depth = 3,
                                random_state = 103,
                                verbose = 1)

rf_clf.fit(x_train, y_train)
pred = rf_clf.predict(x_test)
print(accuracy_score(y_test, pred))

0.9298245614035088


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [19]:
rf_clf2 = RandomForestClassifier(n_estimators = 500,
                                max_depth = 3,
                                random_state = 103,
                                verbose = 1)

rf_clf2.fit(x_train, y_train)
pred2 = rf_clf2.predict(x_test)
print(accuracy_score(y_test, pred2))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9385964912280702


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [20]:
rf_clf3 = RandomForestClassifier(n_estimators = 500,
                                max_depth = 10,
                                random_state = 103,
                                verbose = 1)

rf_clf3.fit(x_train, y_train)
pred3 = rf_clf3.predict(x_test)
print(accuracy_score(y_test, pred3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9473684210526315


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [23]:
from sklearn.model_selection import GridSearchCV  # 모형의 최적 파라미터를 찾아줌

In [24]:
rf_clf4 = RandomForestClassifier()
rf_clf4

RandomForestClassifier()

In [29]:
# 여러 파라미터(params) 조건 중에서 최적의 조건을 찾기
params = { 'n_estimators' : [10, 100, 500, 1000],
         'max_depth' : [3, 5, 10, 15], }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf4 = RandomForestClassifier(random_state = 103,
                                n_jobs = -1,
                                verbose = 1) # RF 객체 생성

grid_cv = GridSearchCV(rf_clf4,
                      param_grid = params,
                      n_jobs = -1,
                      verbose = 1)
grid_cv.fit(x_train, y_train)

print("최적 하이퍼 파라미터 : ", grid_cv.best_params_)
print("최고 예측 정확도 : {:.4f}".format(grid_cv.best_score_) )

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.9s finished


최적 하이퍼 파라미터 :  {'max_depth': 10, 'n_estimators': 1000}
최고 예측 정확도 : 0.9670


In [30]:
rf_clf5 = RandomForestClassifier(n_estimators = 1000,
                                max_depth = 10,
                                random_state = 103,
                                verbose = 1)
rf_clf5.fit(x_train, y_train)
pred5 = rf_clf5.predict(x_test)
print(accuracy_score(y_test, pred5))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9473684210526315


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.0s finished
