## Model Ensemble - Stacking

라이브러리

In [2]:
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier # !conda install py-xgboost

from vecstack import stacking # # !pip install vecstack==0.4.0

<br>

### 1. Functional API

#### Stage 0-0. 데이터 준비

In [3]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the original data as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 0,
)

#### Stage 0-1. 모델 준비

In [4]:
# First-level (base) estimators; their predictions are what `stacking` turns into
# the stacked features. Every estimator is seeded so repeated runs give the same results.
models = [
    ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators = 100, max_depth = 3),
    RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators = 100, max_depth = 3),
    # `random_state` is XGBClassifier's scikit-learn-style seed argument; it replaces
    # the legacy `seed` keyword and matches the two estimators above.
    XGBClassifier(random_state = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3)]

#### Stage 1. 

In [5]:
# Run the first-level models through vecstack's functional API.
# Inputs:  X_train is (120, 4), y_train is (120,), X_test is (30, 4)
#          (iris has 150 samples x 4 features; 20% of them were held out as the test set).
# Outputs: S_train / S_test contain one column of predictions per model.
S_train, S_test = stacking(models,
                           X_train, y_train, X_test,
                           regression = False,       # classification task
                           metric = accuracy_score,  # score reported for every fold
                           n_folds = 4, stratified = True, shuffle = True,
                           random_state = 0, verbose = 2)

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [ExtraTreesClassifier]
    fold  0:  [0.93548387]
    fold  1:  [0.93333333]
    fold  2:  [0.93333333]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95053763] + [0.02857060]
    FULL:     [0.95000000]

model  1:     [RandomForestClassifier]
    fold  0:  [0.96774194]
    fold  1:  [0.93333333]
    fold  2:  [0.90000000]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95026882] + [0.03739072]
    FULL:     [0.95000000]

model  2:     [XGBClassifier]
    fold  0:  [0.96774194]
    fold  1:  [0.93333333]
    fold  2:  [0.90000000]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95026882] + [0.03739072]
    FULL:     [0.95000000]



In [6]:
# Inspect the first rows of the stacked training features as a DataFrame
# (`pd` is imported together with the other libraries at the top of the notebook).
pd.DataFrame(S_train).head()

Unnamed: 0,0,1,2
0,2,2,2
1,1,1,1
2,0,0,0
3,2,2,2
4,2,2,2


In [7]:
# Show the first five rows of the stacked test features as a DataFrame.
pd.DataFrame(S_test).head(n = 5)

Unnamed: 0,0,1,2
0,2,2,2
1,1,1,1
2,0,0,0
3,2,2,2
4,0,0,0


#### Stage 2.

모델 생성

In [8]:
# Second-level (meta) model that will be trained on the stacked features.
# `random_state` is XGBClassifier's scikit-learn-style seed argument; it replaces the legacy `seed` keyword.
model = XGBClassifier(random_state = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3)

모델 학습

In [9]:
# Train the second-level model on the first-level predictions (S_train) and the training labels.
model = model.fit(S_train, y_train) 

예측

In [10]:
# Predict the test labels from the stacked test features.
y_pred = model.predict(S_test)

In [11]:
y_pred # predictions made from S_test, i.e. for the samples in X_test

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0])

In [12]:
y_test # ground-truth labels

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0])

#### Stage 3. 성능

In [13]:
# Accuracy of the second-level predictions against the held-out test labels.
print('Final prediction score: [%.8f]' % (accuracy_score(y_test, y_pred),))

Final prediction score: [0.96666667]


<br>

### 2. Scikit-learn API (권장)

라이브러리 추가

In [14]:
from vecstack import StackingTransformer

#### Stage 0-0. 데이터 준비

In [15]:
# Reload iris so this section can be run independently of the functional-API section.
iris = load_iris()
X = iris.data
y = iris.target

# 80% train / 20% test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 0,
)

#### Stage 0-1. 모델 준비

In [18]:
# First-level estimators as (name, estimator) pairs, the form passed to StackingTransformer.
# Every estimator is seeded so repeated runs give the same results.
estimators = [
    ('ExtraTrees', ExtraTreesClassifier(random_state = 0, n_jobs = -1, n_estimators = 100, max_depth = 3)),
    ('RandomForest', RandomForestClassifier(random_state = 0, n_jobs = -1, n_estimators = 100, max_depth = 3)),
    # `random_state` is XGBClassifier's scikit-learn-style seed argument; it replaces
    # the legacy `seed` keyword and matches the two estimators above.
    ('XGB', XGBClassifier(random_state = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3))]

#### Stage 1.

In [21]:
# Configure the transformer only. Unlike the functional `stacking` call, the data
# (X_train, y_train, X_test) is not passed here; it is supplied later through fit / transform.
stack = StackingTransformer(
    estimators,
    regression = False,
    metric = accuracy_score,
    n_folds = 4,
    stratified = True,
    shuffle = True,
    random_state = 0,
    verbose = 2,
)

In [22]:
# Fit the first-level estimators on the training data; with verbose = 2 the per-fold scores are printed.
stack = stack.fit(X_train, y_train)

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [3]

estimator  0: [ExtraTrees: ExtraTreesClassifier]
    fold  0:  [0.93548387]
    fold  1:  [0.93333333]
    fold  2:  [0.93333333]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95053763] + [0.02857060]

estimator  1: [RandomForest: RandomForestClassifier]
    fold  0:  [0.96774194]
    fold  1:  [0.93333333]
    fold  2:  [0.90000000]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95026882] + [0.03739072]

estimator  2: [XGB: XGBClassifier]
    fold  0:  [0.96774194]
    fold  1:  [0.93333333]
    fold  2:  [0.90000000]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.95026882] + [0.03739072]



In [23]:
# Build the stacked features for both splits with the fitted transformer.
# The train split is transformed first, then the test split; the verbose log
# reports "Train set was detected." for the first call.
S_train, S_test = stack.transform(X_train), stack.transform(X_test)

Train set was detected.
Transforming...

estimator  0: [ExtraTrees: ExtraTreesClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [RandomForest: RandomForestClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [XGB: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

Transforming...

estimator  0: [ExtraTrees: ExtraTreesClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [RandomForest: RandomForestClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [XGB: XGBClass

#### Stage 2.

In [24]:
# Second-level (meta) model, trained on the stacked features and the training labels.
# `random_state` is XGBClassifier's scikit-learn-style seed argument; it replaces the legacy `seed` keyword.
model = XGBClassifier(random_state = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3)
model = model.fit(S_train, y_train)

In [25]:
# Predict from the stacked test features, then report accuracy against the held-out labels.
y_pred = model.predict(S_test)
print('Final prediction score: [%.8f]' % (accuracy_score(y_test, y_pred),))

Final prediction score: [0.96666667]
