# 앙상블(Ensemble)

일반화와 강건성(Robustness)을 향상시키기 위해 여러 모델의 예측 값을 결합하는 방법
앙상블에는 크게 두 가지 종류가 존재

1) 평균 방법
여러 개의 추정값을 독립적으로 구한 뒤 평균을 취함
결합 추정값은 분산이 줄어들기 때문에 단일 추정값보다 좋은 성능을 보임


2) 부스팅 방법
순차적으로 모델 생성
결합된 모델의 편향을 감소시키기 위해 노력
부스팅 방법의 목표는 여러 개의 약한 모델들을 결합해 하나의 강력한 앙상블 모델을 구축하는 것

# Bagging meta-estimator

Bagging은 bootstrap aggregating의 줄임말이다.
원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련
각각의 결과를 결합해 최종 결과를 생성
분산을 줄이고 과적합을 막음
강력하고 복잡한 모델에서 잘 동작함 
(다른 머신러닝 모델에 비교해서 굉장히 우수한 결과치를 산출함)

In [88]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer,load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate


In [17]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [62]:
# Load the bundled scikit-learn datasets used by the experiments in this file.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()
boston = load_boston()

In [63]:
# Baseline estimator: standardize the features, then a k-nearest-neighbours
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [64]:
# 5-fold cross-validation of base_model (no bagging) on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))

      

avg fit time:0.0009975433349609375 (+/-8.792442757885826e-07)
avg score time:0.0016016960144042969 (+/-0.000492911277893178)
avg test score:0.96 (+/-0.024944382578492935)


In [65]:
# 5-fold cross-validation of bagging_model on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.013963031768798827 (+/-0.0008924966121626716)
avg score time:0.0051859855651855465 (+/-0.00039927971899069716)
avg test score:0.9333333333333332 (+/-0.04714045207910316)


In [66]:
# Baseline estimator: standardize the features, then a k-nearest-neighbours
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [67]:
# 5-fold cross-validation of base_model (no bagging) on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.0010029792785644532 (+/-1.2066335787359773e-05)
avg score time:0.0013904094696044922 (+/-0.00048202051563972883)
avg test score:0.9493650793650794 (+/-0.037910929811115976)


In [68]:
# 5-fold cross-validation of bagging_model on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.014959621429443359 (+/-0.0017904007116525606)
avg score time:0.0053921699523925785 (+/-0.000495144743205177)
avg test score:0.9607936507936508 (+/-0.0417058772419794)


In [69]:
# Baseline estimator: standardize the features, then a k-nearest-neighbours
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [70]:
# 5-fold cross-validation of base_model (no bagging) on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.000797271728515625 (+/-0.00039864025615481304)
avg score time:0.0033844470977783202 (+/-0.0004823650282303032)
avg test score:0.9648501785437045 (+/-0.009609970350036127)


In [71]:
# 5-fold cross-validation of bagging_model on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.01555027961730957 (+/-0.0008097594175289366)
avg score time:0.010378789901733399 (+/-0.0004967370712338455)
avg test score:0.9613414066138798 (+/-0.013112602658190713)


In [72]:
# Baseline estimator: standardize the features, then a support vector
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [73]:
# 5-fold cross-validation of base_model (no bagging) on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.001201629638671875 (+/-0.0003968309478845958)
avg score time:0.000399017333984375 (+/-0.0004886945264472149)
avg test score:0.9666666666666666 (+/-0.02108185106778919)


In [74]:
# 5-fold cross-validation of bagging_model on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.020549535751342773 (+/-0.003369203921946657)
avg score time:0.0029885292053222655 (+/-0.0006310364369473714)
avg test score:0.9533333333333334 (+/-0.03399346342395189)


In [75]:
# Baseline estimator: standardize the features, then a support vector
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [76]:
# 5-fold cross-validation of bagging_model on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.019946861267089843 (+/-6.468133910298603e-07)
avg score time:0.0035901546478271486 (+/-0.0004886555308230838)
avg test score:0.943968253968254 (+/-0.024675265605019477)


In [77]:
# Second 5-fold cross-validation of bagging_model on the wine data.
# NOTE(review): presumably repeated to show run-to-run variation of the
# unseeded ensemble — confirm this duplicate run is intentional.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.018954896926879884 (+/-1.1143716013075516e-05)
avg score time:0.002997684478759766 (+/-1.2658567218759924e-05)
avg test score:0.9495238095238095 (+/-0.010969620662330795)


In [78]:
# Baseline estimator: standardize the features, then a support vector
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [79]:
# 5-fold cross-validation of bagging_model on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.01875028610229492 (+/-0.00039916153863064154)
avg score time:0.003396272659301758 (+/-0.0004849314336512508)
avg test score:0.9604761904761905 (+/-0.023097283496400105)


In [80]:
# 5-fold cross-validation of base_model (no bagging) on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.0015903472900390624 (+/-0.0004842333955352152)
avg score time:0.0006039142608642578 (+/-0.0004932000744148144)
avg test score:0.9833333333333334 (+/-0.022222222222222233)


In [81]:
# Baseline estimator: standardize the features, then a decision tree
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [82]:
# 5-fold cross-validation of base_model (no bagging) on the iris data.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.0011962890625 (+/-0.0003988566996135635)
avg score time:0.00019927024841308593 (+/-0.00039854049682617185)
avg test score:0.9533333333333334 (+/-0.03399346342395189)


In [83]:
# 5-fold cross-validation of bagging_model on the iris data.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.016544437408447264 (+/-0.000793025218513493)
avg score time:0.0012141704559326173 (+/-0.00040387444676499867)
avg test score:0.9466666666666667 (+/-0.03399346342395189)


In [84]:
# Baseline estimator: standardize the features, then a decision tree
# classifier with default settings.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [85]:
# 5-fold cross-validation of base_model (no bagging) on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.004986333847045899 (+/-0.0006311282032245942)
avg score time:0.00039920806884765627 (+/-0.0004889280349393074)
avg test score:0.9261450085390466 (+/-0.02344380114440703)


In [86]:
# 5-fold cross-validation of bagging_model on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.023142147064208984 (+/-0.0004069610690503441)
avg score time:0.0017961502075195313 (+/-0.00039787878236088826)
avg test score:0.9508306163639186 (+/-0.015225053650753124)


In [89]:
# Load the two regression datasets used with the regressor models.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
boston, diabetes = load_boston(), load_diabetes()

In [99]:
# Baseline estimator: standardize the features, then a k-nearest-neighbours
# regressor with default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [100]:
# 5-fold cross-validation of base_model (no bagging) on the boston data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.001396036148071289 (+/-0.0004884417044859074)
avg score time:0.001396512985229492 (+/-0.0004888311331523959)
avg test score:0.47357748833823543 (+/-0.13243123464477455)


In [102]:
# 5-fold cross-validation of bagging_model on the boston data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.014378547668457031 (+/-0.0013475716827844625)
avg score time:0.006967067718505859 (+/-1.8116047116707363e-05)
avg test score:0.468292264111414 (+/-0.14681648948227674)


In [103]:
# Baseline estimator: standardize the features, then a k-nearest-neighbours
# regressor with default settings.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Bagging ensemble of 10 copies of the baseline; max_samples / max_features
# of 0.5 give each copy half of the rows and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [105]:
# 5-fold cross-validation of base_model (no bagging) on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)

# Print the mean and standard deviation over the folds for each metric.
for line_format, metric_key in (
    ('avg fit time:{} (+/-{})', 'fit_time'),
    ('avg score time:{} (+/-{})', 'score_time'),
    ('avg test score:{} (+/-{})', 'test_score'),
):
    fold_values = cross_val[metric_key]
    print(line_format.format(fold_values.mean(), fold_values.std()))


avg fit time:0.0011901378631591797 (+/-0.00040179198173748084)
avg score time:0.0014026165008544922 (+/-0.0004955071632703129)
avg test score:0.3689720650295623 (+/-0.044659049060165365)
