# 스태킹 앙상블(Stacking Ensemble)
- 여러 알고리즘을 결합해 예측 결과를 도출 => 배깅, 부스팅과 비슷
- 개별 알고리즘으로 예측한 데이터를 기반으로 다시 예측을 수행한다는 점이 차이점
- 두 종류의 모델 필요
    - 개별적인 기반 모델(base model): 원본 학습 데이터로 각각 학습하는 모델
    - 최종 메타 모델(meta model): 개별 모델들의 예측 결과를 학습 데이터로 삼아 학습하는 모델

In [7]:
import numpy as np 
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Load the breast-cancer dataset and hold out 20% of the rows for testing.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
)

# Base classifiers plus the logistic regression later reused as the meta-model.
knn = KNeighborsClassifier(n_neighbors=4)
rf = RandomForestClassifier(n_estimators=100)
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier(n_estimators=100)
lr = LogisticRegression(C=10)

# Fit in a fixed order (knn, rf, dt, ada, lr) on the same training split.
for estimator in (knn, rf, dt, ada, lr):
    estimator.fit(X_train, y_train)

# Predict the held-out rows with every fitted model.
knn_pred, rf_pred, dt_pred, ada_pred, lr_pred = [
    estimator.predict(X_test) for estimator in (knn, rf, dt, ada, lr)
]

# Report each model's stand-alone accuracy on the test split.
for caption, y_hat in (
    ('Score of KNN: ', knn_pred),
    ('Score of Random Forest: ', rf_pred),
    ('Score of Decision Tree: ', dt_pred),
    ('Score of AdaBoostClassifier: ', ada_pred),
    ('Score of Logistic Regression: ', lr_pred),
):
    print(caption, accuracy_score(y_test, y_hat))

Score of KNN:  0.9385964912280702
Score of Random Forest:  0.9824561403508771
Score of Decision Tree:  0.9736842105263158
Score of AdaBoostClassifier:  0.9824561403508771
Score of Logistic Regression:  0.9736842105263158


## KNN, Random Forest, Decision Tree, AdaBoostClassifier를 개별적인 학습 모델로 사용

In [9]:
# One row per base model: collect the four test-set prediction vectors.
base_predictions = [knn_pred, rf_pred, dt_pred, ada_pred]
pred = np.array(base_predictions)
print(pred.shape)

# Flip to one row per sample and one column per base model,
# the layout the meta-model is fitted on.
pred = pred.T
print(pred.shape)

(4, 114)
(114, 4)


## Logistic Regression을 개별 모델의 예측 데이터를 학습 데이터로 학습하는 모델로 사용

In [10]:
# The meta-model (lr) is trained on the base models' test-set predictions.
# Scoring lr.predict(pred) against y_test after lr.fit(pred, y_test) would only
# report training accuracy, because the labels being scored were seen during
# fit. Score out-of-fold predictions instead: cross_val_predict predicts each
# row with a copy of lr that was fitted on the other folds only.
from sklearn.model_selection import cross_val_predict

pred_final = cross_val_predict(lr, pred, y_test, cv=5)
print('Score of Stacking Ensemble: ', accuracy_score(y_test, pred_final))

# Leave lr fitted on all stacked predictions, as the original cell did.
lr.fit(pred, y_test)

Score of Stacking Ensemble:  0.9912280701754386


### 이처럼 스태킹 앙상블은 높은 정확도를 보인다

## Cross Validation set 기반 스태킹
- 과적합(overfitting)을 개선하기 위해 교차 검증 기반으로 예측된 결과 데이터 세트를 이용

In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def _select_rows(data, positions):
    """Return the rows of *data* at the given integer positions."""
    # pandas objects treat ``obj[array]`` as a label/column lookup, so use the
    # positional accessor when it exists; arrays keep plain indexing.
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def get_stacking_base_datasets(model, X_train, y_train, X_test, n_folds):
    """Build cross-validated stacking features for one base model.

    The training rows are split into ``n_folds`` consecutive, unshuffled
    folds. For every fold the model is refitted on the remaining rows and
    used to predict (a) the held-out fold and (b) the whole test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place on every fold, so after the call it holds the
            fit from the last fold's training rows.
        X_train: Training features; needs ``.shape`` and either positional
            ``[]`` indexing (e.g. an ndarray) or ``.iloc`` (pandas).
        y_train: Training targets, row-aligned with ``X_train``.
        X_test: Test features passed to ``model.predict`` unchanged.
        n_folds: Number of K-fold splits.

    Returns:
        A ``(train_pred, test_mean)`` tuple:
        ``train_pred`` has shape ``(n_train, 1)`` and holds the out-of-fold
        prediction for every training row; ``test_mean`` has shape
        ``(n_test, 1)`` and holds the test predictions averaged over folds.
    """
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_pred = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test.shape[0], n_folds))

    for fold_index, (train_index, valid_index) in enumerate(kf.split(X_train)):
        X_tr = _select_rows(X_train, train_index)
        y_tr = _select_rows(y_train, train_index)
        X_val = _select_rows(X_train, valid_index)

        model.fit(X_tr, y_tr)

        # Each training row is predicted exactly once, by the model that did
        # not see it during fitting.
        train_pred[valid_index, :] = model.predict(X_val).reshape(-1, 1)
        # One column of test predictions per fold; averaged after the loop.
        test_pred[:, fold_index] = model.predict(X_test)

    test_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    return train_pred, test_mean

# Out-of-fold training predictions and fold-averaged test predictions for each
# base model, computed in the order knn, rf, dt, ada with 7 folds.
(
    (knn_train, knn_test),
    (rf_train, rf_test),
    (dt_train, dt_test),
    (ada_train, ada_test),
) = [
    get_stacking_base_datasets(base_model, X_train, y_train, X_test, 7)
    for base_model in (knn, rf, dt, ada)
]

# Place the per-model columns side by side: one column per base model.
X_train_final = np.hstack([knn_train, rf_train, dt_train, ada_train])
X_test_final = np.hstack([knn_test, rf_test, dt_test, ada_test])

# Original feature shapes first, then the stacked meta-feature shapes.
print(X_train.shape, X_test.shape)
print(X_train_final.shape, X_test_final.shape)

(455, 30) (114, 30)
(455, 4) (114, 4)


In [16]:
# Train the meta-model on the out-of-fold training features, then predict the
# test rows from their fold-averaged features (fit returns the estimator).
pred_final = lr.fit(X_train_final, y_train).predict(X_test_final)
cv_stacking_accuracy = accuracy_score(y_test, pred_final)
print('Score of Stacking Ensemble(Based on CVset): ', cv_stacking_accuracy)

Score of Stacking Ensemble(Based on CVset):  0.9912280701754386
