## 10. 스태킹 앙상블



In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

cancer_data = load_breast_cancer()

X_data = cancer_data.data
y_label = cancer_data.target

X_train, X_test, y_train, y_test = train_test_split(X_data, y_label, test_size = 0.2, random_state = 0)

# 개별모델
knn_clf = KNeighborsClassifier(n_neighbors= 4)
rf_clf = RandomForestClassifier(n_estimators = 100, random_state = 0)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators = 100)

lr_final = LogisticRegression(C = 10) # 예측을 학습 할 최종 모델

knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

print("knn:", accuracy_score(y_test, knn_pred))
print("rf:", accuracy_score(y_test, rf_pred))
print("dt:", accuracy_score(y_test, dt_pred))
print("ada:", accuracy_score(y_test, ada_pred))


knn: 0.9210526315789473
rf: 0.9649122807017544
dt: 0.9122807017543859
ada: 0.956140350877193


In [2]:
pred = np.array([knn_pred, rf_pred, dt_pred, ada_pred])
print(pred.shape)

pred = np.transpose(pred)
print(pred.shape)


(4, 114)
(114, 4)


In [3]:
lr_final.fit(pred, y_test)
final = lr_final.predict(pred)

print("final:", accuracy_score(y_test, final))

final: 0.9649122807017544


### CV 세트 기반의 스태킹
- 학습용 스태킹 데이터, 테스트용 스태킹 데이터 만들기


In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds_):
    kf = KFold(n_splits = n_folds_, shuffle = False, random_state = 0)

    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds_))
    print(model.__class__.__name__, " model 시작")

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print("\t fold set: ", folder_counter, " start")
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train[valid_index]

        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)
    
    test_pred_mean = np.mean(test_pred, axis = 1).reshape(-1, 1)
    return train_fold_pred, test_pred_mean

knn_train, knn_test = get_stacking_base_datasets(knn_clf, X_train, y_train, X_test, 7)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, 7)
dt_train, dt_test = get_stacking_base_datasets(dt_clf, X_train, y_train, X_test, 7)
ada_train, ada_test = get_stacking_base_datasets(ada_clf, X_train, y_train, X_test, 7)

Stack_final_X_train = np.concatenate([knn_train, rf_train, dt_train, ada_train], axis = 1)
Stack_final_X_test = np.concatenate([knn_test, rf_test, dt_test, ada_test], axis = 1)

print("원본 학습 피처 데이터 shape: ", X_train.shape, "원본 테스트 피처 shape: ", X_test.shape)
print("스태킹 학습 피처 shape: ", Stack_final_X_train.shape, "스태킹 테스트 피처 shape: ", Stack_final_X_test.shape)


KNeighborsClassifier  model 시작
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
RandomForestClassifier  model 시작
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
DecisionTreeClassifier  model 시작
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
AdaBoostClassifier  model 시작
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
원본 학습 피처 데이터 shape:  (455, 30) 원본 테스트 피처 shape:  (114, 30)
스태킹 학습 피처 shape:  (455, 4) 스태킹 테스트 피처 shape:  (114, 4)


In [5]:
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)
print("accuracy: ", accuracy_score(y_test, stack_final)) 

accuracy:  0.9736842105263158
