# Ensemble2
앙상블 기법 boosting(adaboost, gradient boosting, 확장된 gradient boosting), stacking에 대한 실습을 진행합니다.  
유방암 진단 데이터(`load_breast_cancer`)를 이용해 앙상블 분류기를 만드는 실습입니다.  

In [None]:
# ready
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
# sklearn 
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,GridSearchCV

In [None]:
# Load the breast-cancer dataset and wrap the feature matrix in a DataFrame
# (columns named after the features) so the first rows can be inspected.
cancer = load_breast_cancer()
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Feature matrix and labels. X and y (the full dataset) are reused by the
# cross-validation and grid-search cells; the 80/20 split below (fixed seed)
# is used by the XGBoost and stacking cells.
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1004)

In [None]:
# Peek at the first 20 training labels.
y_train[:20]

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0])

# 1. AdaBoost

* base_estimator : 부스팅에 사용할 기본(약한) 학습기 모델. scikit-learn 1.2부터는 `estimator`로 이름이 바뀌었습니다.
* n_estimators : 부스팅을 종료하는 최대 estimator 개수 (기본값 50)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost over depth-2 decision trees: up to 500 boosting rounds with a
# learning rate of 0.1 and a fixed seed.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before changing it
# (the grid-search keys prefixed `base_estimator__` would have to change too).
eclf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=500,
    learning_rate=0.1, random_state=318)

In [None]:
# Mean score over 5 cross-validation folds on the full dataset (X, y).
cross_val_score(eclf, X, y, cv=5).mean()

0.9701288619779538

In [None]:
# Grid search over the AdaBoost ensemble.
# Keys prefixed with `base_estimator__` are forwarded to the wrapped decision
# tree; `n_estimators` applies to the AdaBoost ensemble itself.
params = {"base_estimator__criterion" : ["gini", "entropy"],
          "base_estimator__max_features" : [7,8],
          "base_estimator__max_depth" : [1,2],
          "n_estimators": [23, 24, 25, 26, 27]}

# 5-fold CV for every parameter combination, run on all available cores.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [None]:
# Report the best mean CV score and the parameter combination that produced it.
print("best score: ",grid.best_score_)
print("best parameter: ",grid.best_params_)

best score:  0.9595870206489675
best parameter:  {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 2, 'base_estimator__max_features': 8, 'n_estimators': 26}


In [None]:
# Feature importances of the best estimator found by the grid search
# (one value per input feature).
grid.best_estimator_.feature_importances_ 

array([0.00679894, 0.08393732, 0.        , 0.00927962, 0.        ,
       0.        , 0.0501088 , 0.06826607, 0.0139775 , 0.        ,
       0.02404445, 0.        , 0.02201243, 0.07712993, 0.        ,
       0.01063741, 0.00188976, 0.        , 0.00477859, 0.00702932,
       0.06985344, 0.12016262, 0.0682317 , 0.11044393, 0.06600203,
       0.00497385, 0.07543759, 0.09794108, 0.00706362, 0.        ])

# 2. Gradient Boost

* n_estimators : 수행할 부스팅 단계의 수. Gradient boosting은 과적합(overfitting)에 비교적 강건하기 때문에 이 값이 크면 성능이 좋아지는 경우가 많습니다. 다만 learning_rate와 서로 영향을 주므로 함께 조정하는 것이 좋습니다.

In [None]:
# Gradient boosting classifier: 100 boosting stages, learning rate 0.1.
# Rebinds `eclf`, which previously held the AdaBoost model.
from sklearn.ensemble import GradientBoostingClassifier
eclf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)

In [None]:
# Mean score over 5 cross-validation folds on the full dataset (X, y).
cross_val_score(eclf,X,y,cv=5).mean()

0.9578636857630803

In [None]:
# Grid search over the number of boosting stages and the learning rate.
# The learning rates are 10 evenly spaced values from 0.1 to 1.0 inclusive.
params = {
    "n_estimators": [10, 20, 30, 50, 100, 200],
    "learning_rate": list(np.linspace(0.1, 1, 10)),
}

# 5-fold CV per candidate on 5 parallel workers; verbose=3 prints progress.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=5, verbose=3)
grid = grid.fit(X, y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    5.7s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:   21.7s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:   37.5s
[Parallel(n_jobs=5)]: Done 300 out of 300 | elapsed:   39.6s finished


In [None]:
# Report the best mean CV score and the parameter combination that produced it.
print("best score: ", grid.best_score_)
print("best_params: ", grid.best_params_)

best score:  0.9648812296227295
best_params:  {'learning_rate': 0.1, 'n_estimators': 200}


# 3. XGBoost

* 기본적인 xgboost 의 사용
* GBM의 경우 n_estimators에 지정된 횟수만큼 학습을 끝까지 수행하지만, XGB의 경우 조기 중단(`early_stopping_rounds`)을 설정하면 검증 데이터의 오류가 더 이상 개선되지 않을 때 수행을 중지할 수 있습니다.
* n_estimators 를 200으로 설정하고, 조기 중단 파라미터 값을 50으로 설정하면, 1부터 200회까지 부스팅을 반복하다가 50회를 반복하는 동안 검증 오류가 감소하지 않으면 더 이상 부스팅을 진행하지 않고 종료합니다.
* (가령 100회에서 검증 오류 값이 0.8인데 101~150회 반복하는 동안 검증 오류가 0.8보다 작은 값이 하나도 없으면 부스팅을 종료)
* 아래 실습 코드는 조기 중단 파라미터를 지정하지 않았으므로 지정한 횟수만큼 끝까지 학습합니다.

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
# XGBoost through its scikit-learn wrapper: 1000 depth-2 trees with a
# learning rate of 0.5. No early-stopping arguments are passed to fit(),
# so all 1000 rounds are trained before predicting on the test split.
model = XGBClassifier(n_estimators=1000, max_depth=2, learning_rate=0.5, nthread=7)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the XGBClassifier predictions on the held-out test split.
accuracy_score(y_test, y_pred)

0.9298245614035088

* 부스팅 라운드별 성능(AUC) 변화 눈으로 보기

In [None]:
# Wrap the train/test splits in DMatrix objects for xgboost's native training API.
dtrain = xgb.DMatrix(data=X_train, label = y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Booster parameters for the native xgboost training API.
# `verbosity: 0` replaces the removed `silent: 1` flag (both mean "no log output").
param = {
    'max_depth': 2,
    'eta': 0.5,
    'verbosity': 0,
    'objective': 'binary:logistic',
    'nthread': 7,
    'eval_metric': 'auc',
}
# Datasets whose metric is reported after each boosting round, as (DMatrix, name) pairs.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# Materialise the items as a real list: newer xgboost versions copy the params
# object, and a bare dict_items view has no .copy() method.
plst = list(param.items())

In [None]:
# Train for 50 boosting rounds, reporting the eval metric on every set in
# `evallist` after each round. No early-stopping argument is passed, so all
# 50 rounds run.
num_round = 50
bst = xgb.train(plst,dtrain,num_round,evallist)

[0]	eval-auc:0.907874	train-auc:0.9805
[1]	eval-auc:0.925615	train-auc:0.982296
[2]	eval-auc:0.94071	train-auc:0.990884
[3]	eval-auc:0.959384	train-auc:0.99456
[4]	eval-auc:0.967943	train-auc:0.997264
[5]	eval-auc:0.971366	train-auc:0.998331
[6]	eval-auc:0.972611	train-auc:0.998458
[7]	eval-auc:0.982104	train-auc:0.99907
[8]	eval-auc:0.980237	train-auc:0.999577
[9]	eval-auc:0.979303	train-auc:0.999641
[10]	eval-auc:0.97868	train-auc:0.999768
[11]	eval-auc:0.980237	train-auc:0.999831
[12]	eval-auc:0.983193	train-auc:0.999894
[13]	eval-auc:0.982882	train-auc:0.999958
[14]	eval-auc:0.98226	train-auc:0.999958
[15]	eval-auc:0.983193	train-auc:1
[16]	eval-auc:0.984127	train-auc:1
[17]	eval-auc:0.982571	train-auc:1
[18]	eval-auc:0.982882	train-auc:1
[19]	eval-auc:0.984438	train-auc:1
[20]	eval-auc:0.985683	train-auc:1
[21]	eval-auc:0.986617	train-auc:1
[22]	eval-auc:0.986306	train-auc:1
[23]	eval-auc:0.986928	train-auc:1
[24]	eval-auc:0.986928	train-auc:1
[25]	eval-auc:0.986928	train-auc:1
[2

In [None]:
# Predict on the test DMatrix, limited to the booster's recorded best tree count.
# NOTE(review): `ntree_limit` / `best_ntree_limit` are believed to be removed in
# newer xgboost releases (superseded by `iteration_range` / `best_iteration`) —
# confirm against the installed version.
ypred =  bst.predict(dtest,ntree_limit=bst.best_ntree_limit)

# 4. Stacking

In [None]:
! pip install xgboost
! pip install vecstack
! pip install lightgbm

Collecting vecstack
  Downloading https://files.pythonhosted.org/packages/d0/a1/b9a1e9e9e5a12078da1ab9788c7885e4c745358f7e57d5f94d9db6a4e898/vecstack-0.4.0.tar.gz
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-cp36-none-any.whl size=19880 sha256=2fca6bc588f1b2c09802e167d61dbb61f377a5b3b5d7aa0f6ebccdf2fe33347a
  Stored in directory: /root/.cache/pip/wheels/5f/bb/4e/f6488433d53bc0684673d6845e5bf11a25240577c8151c140e
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0


In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier 
from vecstack import stacking
from lightgbm import LGBMClassifier
import numpy as np

In [None]:
# Stacking needs a separate validation set, so split the training data again:
# 25% of the current training split becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1004)

* 개별 모델 준비 (베이스모델 2가지 이상)

In [None]:
# Base (level-0) models
svm = SVC(random_state = 0)
rf = RandomForestClassifier(n_estimators =200, random_state=0)
lr = LogisticRegression()

# Final (meta) model
lgbm = LGBMClassifier()

# Fit each base model on the training split first
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict the validation set with each base model; these predictions become
# the meta-model's training features.
svm_pred = svm.predict(X_val)
rf_pred = rf.predict(X_val)
lr_pred = lr.predict(X_val)

# Validation accuracy of each base model on its own.
print("svm : {0:.4f}, rf : {1:.4f}, lr : {2:.4f}".format(accuracy_score(y_val, svm_pred),accuracy_score(y_val, rf_pred),accuracy_score(y_val, lr_pred)))

svm : 0.8947, rf : 0.9649, lr : 0.9386


* stacking 개별 모델 합쳐주기

In [None]:
# Stack the three prediction vectors; at this point each ROW is one model.
new_data = np.array([svm_pred,rf_pred,lr_pred])
new_data.shape

(3, 114)

In [None]:
# Display the stacked predictions (one row per base model).
new_data

array([[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
        1, 1, 0, 1],
       [0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 0, 0],
       [0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,

* shape 변경  
합친 pred 값이 다시 훈련 데이터로 들어가기 때문에 shape 을 바꿔줍니다.  
row 수는 X_val(검증 데이터)의 샘플 수와 일치해야 하고, 이 114개의 예측값이 최종 모델의 훈련 데이터가 됩니다.
따라서 합친 데이터를 transpose해야 합니다.

In [None]:
# Transpose so that each row is a validation sample and each column is one
# base model's prediction.
new_data = np.transpose(new_data)
new_data.shape

(114, 3)

In [None]:
new_data[:10]        # check the first 10 rows

array([[0, 0, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

* 최종 모델 훈련, 예측

In [None]:
# Train the meta-model on the base models' validation predictions, with the
# validation labels as targets.
lgbm.fit(new_data,y_val)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Build the meta-model's test features: each base model's predictions on X_test
# (three stacked models; one row per model before transposing).
svm_pred_2 = svm.predict(X_test)
rf_pred_2 = rf.predict(X_test)
lr_pred_2 = lr.predict(X_test)
new_test = np.array([svm_pred_2,rf_pred_2,lr_pred_2])
new_test.shape

(3, 114)

In [None]:
# Transpose so that rows are test samples and columns are base models.
new_test = np.transpose(new_test)
new_test.shape

(114, 3)

In [None]:
lgbm_pred = lgbm.predict(new_test)
# The meta-model predicts the true labels from the base models' predictions.
print('정확도 : {0:.4f}'.format(accuracy_score(y_test, lgbm_pred)))

정확도 : 0.9298


이 stacking은 단순히 합쳐서 예측하는 basic stacking 입니다.

## CV 기반 Stacking

In [None]:
# Reload the dataset and make a fresh 80/20 train/test split for CV-based stacking.
# Note: this rebinds `data` (previously the preview DataFrame) and the
# X_train/X_test/y_train/y_test names from the earlier sections.
data = load_breast_cancer()

X_data = data.data
y_data = data.target

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 0)

In [None]:
# Fresh, unfitted base models for CV-based stacking.
svm = SVC(random_state = 0)
rf = RandomForestClassifier(n_estimators =200, random_state=0)
lr = LogisticRegression()

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):
    """Generate CV-based stacking features for one base model.

    The training rows are split into ``n_folds`` consecutive (unshuffled)
    folds. For each fold, ``model`` is fitted on the remaining folds and then
    used to predict (a) the held-out fold and (b) all of ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place on every fold, so after the call it holds the
            fit from the last fold.
        X_train: Training features; needs ``.shape`` and integer-array row
            indexing (e.g. a NumPy array).
        y_train: Training labels, indexable the same way as ``X_train``.
        X_test: Test features; predicted once per fold.
        n_folds: Number of cross-validation folds.

    Returns:
        Tuple ``(train_fold_predict, test_predict_mean)``.
        ``train_fold_predict`` has shape ``(X_train.shape[0], 1)`` and holds
        the out-of-fold predictions for every training row.
        ``test_predict_mean`` has shape ``(X_test.shape[0], 1)`` and holds the
        per-row mean of the ``n_folds`` test predictions, so it can be
        fractional even when ``predict`` returns class labels.
    """
    # No shuffling: folds are consecutive blocks of rows, exactly as before.
    # `random_state` is deliberately not passed; it only has an effect when
    # shuffle=True, and scikit-learn >= 0.24 raises ValueError when it is
    # combined with shuffle=False.
    kfold = KFold(n_splits=n_folds, shuffle=False)

    # Out-of-fold predictions: one value per training row, hence one column.
    train_fold_predict = np.zeros((X_train.shape[0], 1))
    # X_test is predicted once per fold; column `cnt` holds fold `cnt`'s
    # predictions. The columns are averaged after the loop.
    test_predict = np.zeros((X_test.shape[0], n_folds))
    print("model : ", model.__class__.__name__)

    for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
        # Split this fold's fitting rows from its held-out rows.
        X_train_ = X_train[train_index]
        y_train_ = y_train[train_index]
        X_validation = X_train[valid_index]

        # Fit on everything except the held-out fold.
        model.fit(X_train_, y_train_)

        # Store the held-out predictions at the rows that belong to this fold.
        train_fold_predict[valid_index, :] = model.predict(X_validation).reshape(-1, 1)

        # Predict the untouched test set with this fold's model.
        test_predict[:, cnt] = model.predict(X_test)

    # Collapse the per-fold test predictions into a single column.
    test_predict_mean = np.mean(test_predict, axis=1).reshape(-1, 1)

    return train_fold_predict, test_predict_mean

In [None]:
# Build out-of-fold training features and averaged test features for each base model.
svm_train, svm_test = get_stacking_data(svm, X_train, y_train, X_test)
rf_train, rf_test = get_stacking_data(rf, X_train, y_train, X_test)
lr_train, lr_test = get_stacking_data(lr, X_train, y_train, X_test)

model :  SVC
model :  RandomForestClassifier
model :  LogisticRegression


In [None]:
svm_train.shape    # one column per model

(455, 1)

In [None]:
# Put the per-model columns side by side: one row per sample, one column per base model.
new_X_train = np.concatenate((svm_train, rf_train, lr_train), axis = 1)
new_X_test = np.concatenate((svm_test, rf_test,lr_test), axis = 1)

In [None]:
# Compare the original feature shapes with the stacked meta-feature shapes.
print("원본 : ", X_train.shape, X_test.shape)
print("새로운 : ", new_X_train.shape, new_X_test.shape)

원본 :  (455, 30) (114, 30)
새로운 :  (455, 3) (114, 3)


In [None]:

# Use a fresh meta-model so this section does not depend on the instance
# already fitted in the basic-stacking section above.
lgbm = LGBMClassifier()
# Train on the out-of-fold predictions, then predict from the averaged
# test predictions of the base models.
lgbm.fit(new_X_train, y_train)
stack_pred = lgbm.predict(new_X_test)

# accuracy_score takes (y_true, y_pred), matching the order used in the other cells.
print("정확도 : {0:.4f}".format(accuracy_score(y_test, stack_pred)))

정확도 : 0.9649
