# 앙상블(Ensemble)

## Bagging meta-estimator

In [None]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_boston
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

### Bagging을 사용한 분류

```
hp = {
    "random_state": SEED,
    "base_estimator": base_model,  # base로 사용할 모델
    "n_estimators": 100,  # base_estimator 개수
    "max_features": 0.5,  # 각 base_estimator가 사용할 feature 비율
    "bootstrap_features": False,  # feature 중복추출 허용 여부
}
```

#### 데이터셋 불러오기

In [None]:
# Load the three classification datasets used throughout the classification section.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()

#### KNN + Bagging

##### 붓꽃 데이터

In [None]:
# Baseline: standardize the features, then classify with k-nearest neighbours
# using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline KNN pipeline on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.001348876953125
avg score time: 0.002493000030517578
avg test score: 0.96


In [None]:
# 5-fold cross-validation of the bagged KNN ensemble on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.01718592643737793
avg score time: 0.0055214881896972655
avg test score: 0.96


##### 와인 데이터

In [None]:
# 5-fold cross-validation of the baseline KNN pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0013541698455810547
avg score time: 0.002776956558227539
avg test score: 0.9493650793650794


In [None]:
# 5-fold cross-validation of the bagged KNN ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.017137765884399414
avg score time: 0.005670499801635742
avg test score: 0.9552380952380952


##### 유방암 데이터

In [None]:
# 5-fold cross-validation of the baseline KNN pipeline on the breast cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0014028072357177735
avg score time: 0.007619190216064453
avg test score: 0.9648501785437045


In [None]:
# 5-fold cross-validation of the bagged KNN ensemble on the breast cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.018460512161254883
avg score time: 0.010894680023193359
avg test score: 0.9596025461884802


#### SVC + Bagging

##### 붓꽃 데이터

In [None]:
# Baseline: standardize the features, then classify with a support vector
# classifier using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline SVC pipeline on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0015039443969726562
avg score time: 0.0004768848419189453
avg test score: 0.9666666666666666


In [None]:

# 5-fold cross-validation of the bagged SVC ensemble on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.024779844284057616
avg score time: 0.0034014701843261717
avg test score: 0.9466666666666667


##### 와인 데이터

In [None]:
# 5-fold cross-validation of the baseline SVC pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0021590232849121095
avg score time: 0.0005911350250244141
avg test score: 0.9833333333333334


In [None]:
# 5-fold cross-validation of the bagged SVC ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.027790164947509764
avg score time: 0.003362894058227539
avg test score: 0.9720634920634922


##### 유방암 데이터

In [None]:
# 5-fold cross-validation of the baseline SVC pipeline on the breast cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.005152463912963867
avg score time: 0.0016344070434570312
avg test score: 0.9736376339077782


In [None]:
# 5-fold cross-validation of the bagged SVC ensemble on the breast cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.05262150764465332
avg score time: 0.011774444580078125
avg test score: 0.9683744760130415


#### Decision Tree + Bagging

##### 붓꽃 데이터

In [None]:
# Baseline: standardize the features, then classify with a decision tree
# using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline decision-tree pipeline on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.001516866683959961
avg score time: 0.00042214393615722654
avg test score: 0.9600000000000002


In [None]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.020736026763916015
avg score time: 0.0020793914794921876
avg test score: 0.9199999999999999


##### 와인 데이터

In [None]:
# 5-fold cross-validation of the baseline decision-tree pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0017923831939697266
avg score time: 0.0005547046661376953
avg test score: 0.8709523809523809


In [None]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.024535942077636718
avg score time: 0.002945232391357422
avg test score: 0.9553968253968254


##### 유방암 데이터

In [None]:
# 5-fold cross-validation of the baseline decision-tree pipeline on the breast cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.00845632553100586
avg score time: 0.0005257129669189453
avg test score: 0.9121099208197483


In [None]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the breast cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.03265256881713867
avg score time: 0.002206897735595703
avg test score: 0.9473063188945815


### Bagging을 사용한 회귀

#### 데이터셋 불러오기

In [None]:
# Load the two regression datasets used in the regression section.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell (and its import) only runs on older releases -- confirm the
# pinned scikit-learn version before re-running.
boston, diabetes = load_boston(), load_diabetes()

#### KNN + Bagging

##### 보스턴 주택 가격 데이터

In [None]:
# Baseline: standardize the features, then regress with k-nearest neighbours
# using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline KNN regression pipeline on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0012506008148193359
avg score time: 0.0012449264526367188
avg test score: 0.47357748833823543


In [None]:
# 5-fold cross-validation of the bagged KNN regression ensemble on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.017142248153686524
avg score time: 0.007118797302246094
avg test score: 0.41554553179015946


##### 당뇨병 데이터

In [None]:
# 5-fold cross-validation of the baseline KNN regression pipeline on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0016741275787353516
avg score time: 0.0018115997314453124
avg test score: 0.3689720650295623


In [None]:
# 5-fold cross-validation of the bagged KNN regression ensemble on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.043489265441894534
avg score time: 0.01722292900085449
avg test score: 0.38983617296559014


#### SVR + Bagging

##### 보스턴 주택 가격 데이터

In [None]:
# Baseline: standardize the features, then regress with a support vector
# regressor using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), SVR())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline SVR pipeline on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.010682487487792968
avg score time: 0.0028667449951171875
avg test score: 0.17631266230186618


In [None]:
# 5-fold cross-validation of the bagged SVR ensemble on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.039025068283081055
avg score time: 0.01263585090637207
avg test score: 0.17664389528453914


##### 당뇨병 데이터

In [None]:
# 5-fold cross-validation of the baseline SVR pipeline on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.009217643737792968
avg score time: 0.0023069858551025392
avg test score: 0.14659936199629434


In [None]:
# 5-fold cross-validation of the bagged SVR ensemble on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.03450860977172852
avg score time: 0.010053062438964843
avg test score: 0.07743755187890902


#### Decision Tree + Bagging

##### 보스턴 주택 가격 데이터

In [None]:
# Baseline: standardize the features, then regress with a decision tree
# using the estimator's default settings.
base_model = make_pipeline(StandardScaler(), DecisionTreeRegressor())

# Bagged ensemble of 10 copies of the baseline; max_samples / max_features of
# 0.5 ask for half of the samples and half of the features per copy.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [None]:
# 5-fold cross-validation of the baseline decision-tree regression pipeline on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.004468727111816406
avg score time: 0.0006801128387451172
avg test score: 0.1302696789260276


In [None]:
# 5-fold cross-validation of the bagged decision-tree regression ensemble on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.0265533447265625
avg score time: 0.0020028114318847655
avg test score: 0.5527901564697586


##### 당뇨병 데이터

In [None]:
# 5-fold cross-validation of the baseline decision-tree regression pipeline on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.005020999908447265
avg score time: 0.000739288330078125
avg test score: -0.11725180620614524


In [None]:
# 5-fold cross-validation of the bagged decision-tree regression ensemble on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)

# Report fit time, score time and test score averaged over the folds.
print(
    f"avg fit time: {cross_val['fit_time'].mean()}",
    f"avg score time: {cross_val['score_time'].mean()}",
    f"avg test score: {cross_val['test_score'].mean()}",
    sep="\n",
)

avg fit time: 0.040183734893798825
avg score time: 0.003127288818359375
avg test score: 0.370778928744094
