<a href="https://colab.research.google.com/github/jisu-h/DALC-Study-AI/blob/main/AI_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 앙상블(Ensemble)

## Bagging meta-estimator
* bagging은 bootstrap aggregating의 줄임말
* 원래 훈련 데이터셋에서 무작위로 뽑은 일부를 사용해 여러 모델을 훈련
* 각각의 결과를 결합해 최종 결과를 생성
* 분산을 줄이고 과적합을 막음
* 강력하고 복잡한 모델에서 잘 동작

### 필요한 데이터 셋

In [1]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.datasets import load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

### 분류 모델

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

## Bagging을 사용한 분류

데이터셋 불러오기

In [4]:
# Load the three classification datasets used by the examples below.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()

## KNN
붓꽃 데이터

In [5]:
# Baseline model: standardize the features (zero mean, unit variance), then
# classify with k-nearest neighbours. KNN is distance-based, so features on
# different scales would otherwise dominate the distance computation.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# Bagging ensemble built from the baseline: 10 estimators, each fitted on a
# random draw of 50% of the samples and 50% of the features.
# random_state pins that random sampling so the scores are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

#### 분류기에 사용되는 매개변수
* n_estimators : 앙상블에 사용할 분류기의 수
* max_samples : 무작위로 뽑을 샘플의 수(0~1사이의 수로 지정하면 비율이 되어, 훈련세트에 곱한 값만큼 샘플링)
* max_features : 각 분류기를 학습할 때 무작위로 뽑을 feature의 수(0~1 사이의 실수로 지정하면 전체 feature 수에 대한 비율)


In [6]:
# Cross-validate the un-bagged base pipeline on the iris data (cv=5).
# X is the feature matrix and y the class labels; cross_validate performs the
# train/test splitting itself.
cross_val = cross_validate(base_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric
# (fit time, scoring time, test score).
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0013502120971679687 (+/- 0.0005265661245590182)
avg score time: 0.002078676223754883 (+/- 0.00020699674042754462)
avg test score: 0.96 (+/- 0.024944382578492935)


In [7]:
# Cross-validate the bagging ensemble on the iris data (cv=5).
cross_val = cross_validate(bagging_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.019859123229980468 (+/- 0.0027065462986493928)
avg score time: 0.008351850509643554 (+/- 0.001363205861150441)
avg test score: 0.9466666666666667 (+/- 0.026666666666666658)


와인 데이터

In [8]:
# Baseline: standardized features followed by a KNN classifier.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [9]:
# Cross-validate the un-bagged base pipeline on the wine data (cv=5).
cross_val = cross_validate(base_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.002848052978515625 (+/- 0.0033377243988514625)
avg score time: 0.002473783493041992 (+/- 0.0003037415418890169)
avg test score: 0.9493650793650794 (+/- 0.037910929811115976)


In [10]:
# Cross-validate the bagging ensemble on the wine data (cv=5).
cross_val = cross_validate(bagging_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.01857151985168457 (+/- 0.00064649290347817)
avg score time: 0.008067464828491211 (+/- 0.0005215080518076882)
avg test score: 0.9665079365079364 (+/- 0.020746948644437477)


유방암 데이터

In [11]:
# Baseline: standardized features followed by a KNN classifier.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [12]:
# Cross-validate the un-bagged base pipeline on the breast-cancer data (cv=5).
cross_val = cross_validate(base_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.002972269058227539 (+/- 0.0007875655427291602)
avg score time: 0.008790063858032226 (+/- 0.0022377124864383746)
avg test score: 0.9648501785437045 (+/- 0.009609970350036127)


In [13]:
# Cross-validate the bagging ensemble on the breast-cancer data (cv=5).
cross_val = cross_validate(bagging_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.025846338272094725 (+/- 0.003252785654878649)
avg score time: 0.017942047119140624 (+/- 0.0016588419074843991)
avg test score: 0.9666045645086166 (+/- 0.01164730071870122)


## SVC
붓꽃 데이터

In [15]:
# Baseline: standardized features followed by a support-vector classifier.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [16]:
# Cross-validate the un-bagged base pipeline on the iris data (cv=5).
cross_val = cross_validate(base_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0030764102935791015 (+/- 0.0018292621292118063)
avg score time: 0.0012019157409667968 (+/- 0.0005558966158756917)
avg test score: 0.9666666666666666 (+/- 0.02108185106778919)


In [19]:
# Cross-validate the bagging ensemble on the iris data (cv=5).
cross_val = cross_validate(bagging_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.026966428756713866 (+/- 0.002558697503380737)
avg score time: 0.0030735492706298827 (+/- 5.747254790725658e-05)
avg test score: 0.9466666666666667 (+/- 0.026666666666666658)


와인 데이터

In [18]:
# Baseline: standardized features followed by a support-vector classifier.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [20]:
# Cross-validate the un-bagged base pipeline on the wine data (cv=5).
cross_val = cross_validate(base_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.002951812744140625 (+/- 0.0017878162816989441)
avg score time: 0.0006565093994140625 (+/- 5.171575424557867e-05)
avg test score: 0.9833333333333334 (+/- 0.022222222222222233)


In [21]:
# Cross-validate the bagging ensemble on the wine data (cv=5).
cross_val = cross_validate(bagging_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.029575634002685546 (+/- 0.0021997565430258536)
avg score time: 0.003726339340209961 (+/- 0.00047114422839459393)
avg test score: 0.9555555555555555 (+/- 0.045133546692422026)


유방암 데이터

In [22]:
# Baseline: standardized features followed by a support-vector classifier.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [23]:
# Cross-validate the un-bagged base pipeline on the breast-cancer data (cv=5).
cross_val = cross_validate(base_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.007333040237426758 (+/- 0.0024419333030604277)
avg score time: 0.0015376091003417968 (+/- 9.471412044225089e-05)
avg test score: 0.9736376339077782 (+/- 0.014678541667933545)


In [24]:
# Cross-validate the bagging ensemble on the breast-cancer data (cv=5).
cross_val = cross_validate(bagging_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0390678882598877 (+/- 0.002422083488914657)
avg score time: 0.007060194015502929 (+/- 0.00017065518919427692)
avg test score: 0.9631113181183046 (+/- 0.014000421580213612)


## Decision Tree
붓꽃 데이터

In [25]:
# Baseline: standardized features followed by a decision-tree classifier.
# The tree gets its own random_state so that evaluating the baseline alone
# is reproducible as well.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [26]:
# Cross-validate the un-bagged base pipeline on the iris data (cv=5).
cross_val = cross_validate(base_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0017421722412109375 (+/- 0.0007440754002624946)
avg score time: 0.0006098270416259766 (+/- 0.0002075546993931556)
avg test score: 0.9666666666666668 (+/- 0.036514837167011066)


In [27]:
# Cross-validate the bagging ensemble on the iris data (cv=5).
cross_val = cross_validate(bagging_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.02585492134094238 (+/- 0.003972458371142051)
avg score time: 0.0023853302001953123 (+/- 8.648850765573532e-05)
avg test score: 0.9533333333333334 (+/- 0.03399346342395189)


와인 데이터

In [28]:
# Baseline: standardized features followed by a decision-tree classifier.
# The tree gets its own random_state so that evaluating the baseline alone
# is reproducible as well.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [29]:
# Cross-validate the un-bagged base pipeline on the wine data (cv=5).
cross_val = cross_validate(base_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0024817466735839845 (+/- 0.00035556667800557733)
avg score time: 0.0006777286529541015 (+/- 0.00017637043033085414)
avg test score: 0.8765079365079365 (+/- 0.04835046741937931)


In [30]:
# Cross-validate the bagging ensemble on the wine data (cv=5).
cross_val = cross_validate(bagging_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.029011297225952148 (+/- 0.0028353080192493104)
avg score time: 0.002565622329711914 (+/- 0.00010812412679661082)
avg test score: 0.9722222222222221 (+/- 0.03513641844631534)


유방암 데이터

In [31]:
# Baseline: standardized features followed by a decision-tree classifier.
# The tree gets its own random_state so that evaluating the baseline alone
# is reproducible as well.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [32]:
# Cross-validate the un-bagged base pipeline on the breast-cancer data (cv=5).
cross_val = cross_validate(base_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.009439373016357422 (+/- 0.0006323141671464819)
avg score time: 0.0007726669311523438 (+/- 0.00019204893441693361)
avg test score: 0.924406148113647 (+/- 0.01817041384858491)


In [33]:
# Cross-validate the bagging ensemble on the breast-cancer data (cv=5).
cross_val = cross_validate(bagging_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.04267277717590332 (+/- 0.004941013960016342)
avg score time: 0.0031805038452148438 (+/- 0.000494659293888585)
avg test score: 0.9437820214252446 (+/- 0.015232714390068411)


## Bagging을 사용한 회귀
### 데이터셋 불러오기

In [34]:
# Load the two regression datasets used by the regression examples below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably fails on current releases — confirm the
# scikit-learn version this notebook is expected to run against.
boston, diabetes = load_boston(), load_diabetes()

## KNN
### 보스턴 주택 가격 데이터

In [35]:
# Baseline: standardized features followed by a KNN regressor.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsRegressor())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [36]:
# Cross-validate the un-bagged base pipeline on the Boston housing data (cv=5).
cross_val = cross_validate(base_model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0015607357025146484 (+/- 0.0005878583281789182)
avg score time: 0.0019240379333496094 (+/- 0.00045915751543536943)
avg test score: 0.47357748833823543 (+/- 0.13243123464477455)


In [37]:
# Cross-validate the bagging ensemble on the Boston housing data (cv=5).
cross_val = cross_validate(bagging_model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.018162870407104494 (+/- 0.0012598155538742957)
avg score time: 0.009545040130615235 (+/- 0.00036645478211693686)
avg test score: 0.47161920985743067 (+/- 0.07468419582047187)


### 당뇨병 데이터

In [38]:
# Baseline: standardized features followed by a KNN regressor.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsRegressor())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [39]:
# Cross-validate the un-bagged base pipeline on the diabetes data (cv=5).
cross_val = cross_validate(base_model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.0015477180480957032 (+/- 0.0005874359872240771)
avg score time: 0.0030533313751220704 (+/- 0.0013680344684512565)
avg test score: 0.3689720650295623 (+/- 0.044659049060165365)


In [40]:
# Cross-validate the bagging ensemble on the diabetes data (cv=5).
cross_val = cross_validate(bagging_model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.02099599838256836 (+/- 0.004269524031052737)
avg score time: 0.009959697723388672 (+/- 0.001063542112890918)
avg test score: 0.3998192388231475 (+/- 0.04084869823667317)


## SVR
### 보스턴 주택 가격 데이터

In [42]:
# Baseline: standardized features followed by a support-vector regressor.
base_model = make_pipeline(StandardScaler(),
                           SVR())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [43]:
# Cross-validate the un-bagged base pipeline on the Boston housing data (cv=5).
cross_val = cross_validate(base_model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.016150140762329103 (+/- 0.0009696445987929509)
avg score time: 0.0028767108917236326 (+/- 0.00038172118720433847)
avg test score: 0.17631266230186618 (+/- 0.5224914915128981)


In [44]:
# Cross-validate the bagging ensemble on the Boston housing data (cv=5).
cross_val = cross_validate(bagging_model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.05386219024658203 (+/- 0.0020138100704559293)
avg score time: 0.008781194686889648 (+/- 0.00018202811181318364)
avg test score: 0.16255004942235268 (+/- 0.34109995020010647)


### 당뇨병 데이터

In [45]:
# Baseline: standardized features followed by a support-vector regressor.
base_model = make_pipeline(StandardScaler(),
                           SVR())
# Bagging ensemble of 10 baselines, each fitted on a random 50% of the samples
# and 50% of the features; random_state makes that sampling reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [46]:
# Cross-validate the un-bagged base pipeline on the diabetes data (cv=5).
cross_val = cross_validate(base_model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.012396144866943359 (+/- 0.003790306439626996)
avg score time: 0.0019643306732177734 (+/- 3.570456533984932e-05)
avg test score: 0.14659936199629434 (+/- 0.02190798003342928)


In [47]:
# Cross-validate the bagging ensemble on the diabetes data (cv=5).
cross_val = cross_validate(bagging_model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.044451332092285155 (+/- 0.003477360588648972)
avg score time: 0.007396793365478516 (+/- 0.00041732843113955354)
avg test score: 0.06556885239460948 (+/- 0.01982767568266837)


## Random Forest
* sklearn.ensemble 모듈에는 무작위 결정 트리를 기반으로 하는 두 개의 평균화 알고리즘이 존재
    + Random Forest
    + Extremely Randomized Trees (Extra-Trees)

In [48]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

## Random Forest 분류

In [50]:
# Random-forest classification pipeline: standardized features followed by a
# RandomForestClassifier. random_state fixes the forest's internal randomness
# so the cross-validation scores are reproducible.
base_model = make_pipeline(StandardScaler(),
                           RandomForestClassifier(random_state=42))

In [51]:
# Cross-validate the pipeline held in base_model on the iris data (cv=5).
cross_val = cross_validate(base_model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.13342638015747071 (+/- 0.007407309570588814)
avg score time: 0.008752202987670899 (+/- 0.0007586216127784869)
avg test score: 0.96 (+/- 0.024944382578492935)


In [52]:
# Cross-validate the pipeline held in base_model on the wine data (cv=5).
cross_val = cross_validate(base_model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.14841384887695314 (+/- 0.006769827564950689)
avg score time: 0.008529996871948243 (+/- 0.00045088541673817456)
avg test score: 0.9663492063492063 (+/- 0.020705617598882967)


In [53]:
# Cross-validate the pipeline held in base_model on the breast-cancer data (cv=5).
cross_val = cross_validate(base_model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.20457534790039061 (+/- 0.007742467535987192)
avg score time: 0.009429740905761718 (+/- 0.000529614241374169)
avg test score: 0.9631113181183046 (+/- 0.021749025602840658)


## Random Forest 회귀

In [54]:
# Random-forest regression pipeline: standardized features followed by a
# RandomForestRegressor. random_state fixes the forest's internal randomness
# so the cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=42))

In [55]:
# Cross-validate the pipeline held in model on the Boston housing data (cv=5).
cross_val = cross_validate(model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.3376143455505371 (+/- 0.010758009086337488)
avg score time: 0.008361196517944336 (+/- 0.00036281779516041487)
avg test score: 0.6309864884890652 (+/- 0.19934705126809335)


In [56]:
# Cross-validate the pipeline held in model on the diabetes data (cv=5).
cross_val = cross_validate(model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.2831474781036377 (+/- 0.006761520923714831)
avg score time: 0.008350896835327148 (+/- 0.0002531173357447817)
avg test score: 0.41369857747924355 (+/- 0.048793832069413776)


## AdaBoost
* 대표적인 부스팅 알고리즘
* 일련의 약한 모델들을 학습
* 수정된 버전의 데이터를 반복 학습(가중치가 적용된)
* 가중치 투표(또는 합)을 통해 각 모델의 예측 값을 결합
* 첫 단계에서는 원본 데이터를 학습하고 연속적인 반복마다 개별 샘플에 대한 가중치가 수정되고 다시 모델이 학습
    + 잘못 예측된 샘플은 가중치 증가, 올바르게 예측된 샘플은 가중치 감소
    + 각각의 약한 모델들은 예측하기 어려운 샘플에 집중하게 됨

![image.png](attachment:image.png)

In [57]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

## AdaBoost 분류

In [58]:
# AdaBoost classification pipeline: standardized features followed by an
# AdaBoostClassifier. random_state fixes the booster's internal randomness
# so the cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(random_state=42))

In [59]:
# Cross-validate the pipeline held in model on the iris data (cv=5).
cross_val = cross_validate(model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.07613921165466309 (+/- 0.0034110567359253788)
avg score time: 0.0068474292755126955 (+/- 0.0003156491056003601)
avg test score: 0.9466666666666667 (+/- 0.03399346342395189)


In [60]:
# Cross-validate the pipeline held in model on the wine data (cv=5).
cross_val = cross_validate(model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.088043212890625 (+/- 0.009530712009928811)
avg score time: 0.00805816650390625 (+/- 0.0014554672243635477)
avg test score: 0.8085714285714285 (+/- 0.16822356718459935)


In [61]:
# Cross-validate the pipeline held in model on the breast-cancer data (cv=5).
cross_val = cross_validate(model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.1733616828918457 (+/- 0.01066863336282834)
avg score time: 0.009555435180664063 (+/- 0.003126536919127269)
avg test score: 0.9718677224033534 (+/- 0.0195587047134823)


## AdaBoost 회귀

In [62]:
# AdaBoost regression pipeline: standardized features followed by an
# AdaBoostRegressor. random_state fixes the booster's internal randomness
# so the cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    AdaBoostRegressor(random_state=42))

In [63]:
# Cross-validate the pipeline held in model on the Boston housing data (cv=5).
cross_val = cross_validate(model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.09982953071594239 (+/- 0.0050125905692893335)
avg score time: 0.004819631576538086 (+/- 0.001186354463010263)
avg test score: 0.5776907491497967 (+/- 0.22205633587208704)


In [64]:
# Cross-validate the pipeline held in model on the diabetes data (cv=5).
cross_val = cross_validate(model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.09480133056640624 (+/- 0.008713858552899467)
avg score time: 0.004848766326904297 (+/- 0.00161999060861601)
avg test score: 0.3912994545433489 (+/- 0.04337337445407076)


## Gradient Tree Boosting
* 임의의 미분 가능한 손실 함수로 일반화한 부스팅 알고리즘
* 웹 검색, 분류 및 회귀 등 다양한 분야에서 모두 사용 가능

In [65]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

## Gradient Tree Boosting 분류

In [66]:
# Gradient-boosting classification pipeline: standardized features followed by
# a GradientBoostingClassifier. random_state fixes the booster's internal
# randomness so the cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=42))

In [67]:
# Cross-validate the pipeline held in model on the iris data (cv=5).
cross_val = cross_validate(model, X=iris.data, y=iris.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.19953312873840331 (+/- 0.017296368190782303)
avg score time: 0.0010908126831054687 (+/- 7.210172031868262e-05)
avg test score: 0.96 (+/- 0.024944382578492935)


In [68]:
# Cross-validate the pipeline held in model on the wine data (cv=5).
cross_val = cross_validate(model, X=wine.data, y=wine.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.2457146167755127 (+/- 0.010742910717906383)
avg score time: 0.0009769439697265626 (+/- 2.4645424354878377e-05)
avg test score: 0.9330158730158731 (+/- 0.03296317528191366)


In [69]:
# Cross-validate the pipeline held in model on the breast-cancer data (cv=5).
cross_val = cross_validate(model, X=cancer.data, y=cancer.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.4510918140411377 (+/- 0.00432529748313963)
avg score time: 0.0010649204254150391 (+/- 3.2347207330389314e-05)
avg test score: 0.9613724576929048 (+/- 0.022574828498321483)


## Gradient Tree Boosting 회귀

In [70]:
# Gradient-boosting regression pipeline: standardized features followed by a
# GradientBoostingRegressor. random_state fixes the booster's internal
# randomness so the cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(random_state=42))

In [71]:
# Cross-validate the pipeline held in model on the Boston housing data (cv=5).
cross_val = cross_validate(model, X=boston.data, y=boston.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.12670326232910156 (+/- 0.00220746474579185)
avg score time: 0.001115703582763672 (+/- 0.00016295044610214001)
avg test score: 0.6734907303555262 (+/- 0.16757067728454228)


In [72]:
# Cross-validate the pipeline held in model on the diabetes data (cv=5).
cross_val = cross_validate(model, X=diabetes.data, y=diabetes.target, cv=5)
# Report the mean and standard deviation of each per-fold metric.
for template, key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    print(template.format(cross_val[key].mean(), cross_val[key].std()))

avg fit time: 0.09927129745483398 (+/- 0.004682475476089631)
avg score time: 0.0011170864105224609 (+/- 3.921984901544104e-05)
avg test score: 0.40575807348942095 (+/- 0.0693021998770823)


## 투표 기반 모델(Voting Classifier)

* 서로 다른 모델들의 결과를 투표를 통해 결합
* 두가지 방법으로 투표 가능
    + 가장 많이 예측된 클래스를 정답으로 채택(hard voting)
    + 예측된 확률의 가중치 평균(soft voting)

In [73]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings(action='ignore') # 에러 메시지 안 뜨게 하기 위함

## Hard Voting

In [74]:
# Three classifiers built on different algorithms.
model1 = SVC()
model2 = GaussianNB()
# random_state fixes the forest's internal randomness so the reported
# accuracies are reproducible.
model3 = RandomForestClassifier(random_state=42)
# Hard-voting ensemble: `estimators` is a list of (name, estimator) pairs and
# voting='hard' selects majority voting on the predicted class labels
# (the alternative value is 'soft').
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='hard')


In [75]:
# Cross-validate each individual classifier and the voting ensemble on the
# iris data (cv=5) and print the mean score next to the class name.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ gives the bare class name directly, instead of
    # slicing it out of the "<class '...'>" repr string.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' %(scores.mean(), model_name))
     

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.97 [RandomForestClassifier]
Accuracy: 0.97 [VotingClassifier]


## Soft Voting

In [76]:
# Soft voting needs class probabilities, so the SVC is built with
# probability=True. Its probability calibration is randomized; random_state
# pins it so the results are reproducible.
model1 = SVC(probability=True, random_state=42)
model2 = GaussianNB()
# random_state fixes the forest's internal randomness as well.
model3 = RandomForestClassifier(random_state=42)
# Soft-voting ensemble: averages the predicted class probabilities, weighted
# per estimator (weights are matched to `estimators` by position).
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='soft',
    weights=[2, 1, 2])

In [77]:
# Cross-validate each individual classifier and the voting ensemble on the
# iris data (cv=5) and print the mean score next to the class name.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ gives the bare class name directly, instead of
    # slicing it out of the "<class '...'>" repr string.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' %(scores.mean(), model_name))

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.96 [RandomForestClassifier]
Accuracy: 0.96 [VotingClassifier]
