<a href="https://colab.research.google.com/github/jhl0580/ML_study/blob/main/Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets

In [5]:
# Load the iris dataset through sklearn.datasets; used by the classification stacking example.
iris = datasets.load_iris()

In [65]:
# Load the Boston housing data used by the regression section.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError. On such versions, rebuild the same arrays
# from the original StatLib file (the recipe given in scikit-learn's own
# deprecation notice; requires network access) and wrap them in a Bunch so
# `boston.data` / `boston.target` keep working for the cells below.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # The file stores each record on two physical lines after a 22-line header:
    # even rows hold the first 11 features, odd rows hold the last 2 features
    # followed by the target (MEDV).
    _boston_raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([_boston_raw.values[::2, :], _boston_raw.values[1::2, :2]]),
        target=_boston_raw.values[1::2, 2],
    )
    del _boston_raw

# Stacking

## Classification

In [15]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
# Feature matrix and class labels for the classification example.
X = iris.data
y = iris.target

In [62]:
# Base estimators whose predictions are used to build the level-1 data.
# Each one gets a fixed random_state so the stacked model is reproducible.
base_estimators = [
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated and
    # later removed from scikit-learn, so spell the behaviour out explicitly.
    ('rf', RandomForestClassifier(n_estimators = 10, max_depth = 3, max_features = 'sqrt', random_state = 0)),
    ('bag', BaggingClassifier(n_estimators = 10, random_state = 0)),
    # loss is left at its default (the log-loss formerly spelled 'deviance'),
    # which keeps the same model while also running on versions that no
    # longer accept the 'deviance' spelling.
    ('gbm', GradientBoostingClassifier(learning_rate = 0.01, n_estimators = 10, max_depth = 3, random_state = 0)),
]

In [63]:
# Stacked classifier:
#   estimators      - the models listed in base_estimators
#   final_estimator - logistic regression acting as the meta-learner
#   cv              - 5 folds for the cross-validated estimation that yields the level-1 data
clf_stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=5,
)
clf_stack.fit(X, y)

In [64]:
clf_stack.final_estimator_.coef_                                                # When the base estimators have a random_state set, the meta-learner always yields the same result.

array([[ 1.50266135, -0.76958667, -0.73312223,  1.55929598, -0.80142467,
        -0.75791887,  0.15269327, -0.07402085, -0.07871998],
       [-0.78394077,  1.14980643, -0.36570861, -0.74305025,  1.24008497,
        -0.49687766, -0.07999805,  0.15749057, -0.07733547],
       [-0.71872058, -0.38021976,  1.09883085, -0.81624573, -0.43866031,
         1.25479653, -0.07269523, -0.08346972,  0.15605545]])

## Regression

In [115]:
from sklearn.model_selection import train_test_split

# Features and target for the regression examples (rebinds X and y).
X, y = boston.data, boston.target

# Split features and target in a single call so rows and labels are shuffled
# together. Two separate calls only stay aligned as long as every argument
# (including random_state) happens to match; one call makes that structural.
# The resulting 80/20 partition is the same as before.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size = 0.2, shuffle = True, random_state = 0
)

In [156]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV

### Random Forest

In [120]:
# Random forest to tune: fixed seed, max_features given as a fraction (0.33).
rf = RandomForestRegressor(random_state = 0, max_features = 0.33)

# Hyper-parameter grid explored by the search.
parameters = {'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 128],
              'max_depth' : [3, 5, 7, 10]
              }

# Search over the configured `rf` rather than a fresh RandomForestRegressor():
# otherwise the grid is evaluated without the seed and with a different
# max_features than the model the best parameters are later applied to.
# GridSearchCV clones the estimator, so `rf` itself is left unfitted here.
rf_cv = GridSearchCV(estimator = rf, param_grid = parameters, cv = 10).fit(train_X, train_y)

In [121]:
rf_cv.best_params_

{'max_depth': 10, 'n_estimators': 64}

In [124]:
# Apply the best grid-search parameters to rf and fit it on the training split.
# set_params returns the estimator itself, so the two steps can be chained.
rf.set_params(**rf_cv.best_params_).fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features=0.33, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=64, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [128]:
# Mean squared error of the random forest on the held-out test split.
np.mean(np.square(test_y - rf.predict(test_X)))

26.35797003177714

### Bagging

In [143]:
# Bagging regressor to tune, with a fixed seed.
bag = BaggingRegressor(random_state = 0)

# Only the ensemble size is searched.
parameters = {'n_estimators' : [32, 64, 128, 256, 512]}

# Search over the seeded `bag` rather than a fresh BaggingRegressor(), so the
# cross-validation scores are reproducible and describe the same model that
# the best parameters are later applied to. GridSearchCV clones the estimator.
bag_cv = GridSearchCV(estimator = bag, param_grid = parameters, cv = 10).fit(train_X, train_y)

In [144]:
bag_cv.best_params_

{'n_estimators': 64}

In [145]:
# Apply the best grid-search parameters to bag and fit it on the training split.
# set_params returns the estimator itself, so the two steps can be chained.
bag.set_params(**bag_cv.best_params_).fit(train_X, train_y)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=64,
                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                 warm_start=False)

In [148]:
# Mean squared error of the bagging model on the held-out test split.
np.mean(np.square(test_y - bag.predict(test_X)))

19.67245919021906

### GradientBoosting

In [150]:
# Gradient-boosting regressor to tune, with a fixed seed.
# loss is left at its default (squared error, formerly spelled 'ls'): this is
# the same model, and it also runs on scikit-learn versions that no longer
# accept the 'ls' spelling.
gbm = GradientBoostingRegressor(random_state = 0)

# Hyper-parameter grid explored by the search.
parameters = {'learning_rate' : [0.1, 0.05, 0.01, 0.005, 0.001],
              'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 128],
              'max_depth' : [1, 2, 3, 4, 5]
              }

# Search over the seeded `gbm` rather than a fresh GradientBoostingRegressor(),
# so the cross-validation scores are reproducible and describe the same model
# that the best parameters are later applied to. GridSearchCV clones the estimator.
gbm_cv = GridSearchCV(estimator = gbm, param_grid = parameters, cv = 10).fit(train_X, train_y)

In [151]:
gbm_cv.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 128}

In [154]:
# Apply the best grid-search parameters to gbm and fit it on the training split.
# set_params returns the estimator itself, so the two steps can be chained.
gbm.set_params(**gbm_cv.best_params_).fit(train_X, train_y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=128,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=0, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [155]:
# Mean squared error of the gradient-boosting model on the held-out test split.
np.mean(np.square(test_y - gbm.predict(test_X)))

17.206733096736173

### Stacking

In [185]:
# Base learners for the regression stack: the bagging and gradient-boosting
# models from the sections above (the random forest is not included).
# Note: this rebinds the base_estimators name used in the classification section.
base_estimators = [('bag', bag), ('gbm', gbm)]

In [186]:
# Stacked regressor: base_estimators feed a linear-regression meta-learner,
# with cv = 5 passed to StackingRegressor.
reg_stack = StackingRegressor(
    estimators=base_estimators,
    final_estimator=LinearRegression(),
    cv=5,
)
reg_stack.fit(train_X, train_y)

StackingRegressor(cv=5,
                  estimators=[('bag',
                               BaggingRegressor(base_estimator=None,
                                                bootstrap=True,
                                                bootstrap_features=False,
                                                max_features=1.0,
                                                max_samples=1.0,
                                                n_estimators=64, n_jobs=None,
                                                oob_score=False, random_state=0,
                                                verbose=0, warm_start=False)),
                              ('gbm',
                               GradientBoostingRegressor(alpha=0.9,
                                                         ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                 

In [187]:
# Mean squared error of the stacked regressor on the held-out test split.
np.mean(np.square(test_y - reg_stack.predict(test_X)))

17.022056741921386