# Ensemble

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# 1 Ensemble
- 여러 개의 분류기(Classifier)를 생성하고 그 예측을 결합함으로써 보다 정확한 최종 예측을 수행
- Overfitting 방지

|구분 | 지도 학습 | 비고|
|:--- |:--- |:---|
|보팅(Voting) | 서로 다른 알고리즘이 같은 데이터 세트에 대해 학습하고 예측한 결과를 보팅 (Hard Voting / Soft Voting) | VotingClassifier|
|배깅(Bagging) | 단일 결정 트리로 데이터 샘플링을 서로 다르게 가져가면서 학습을 수행해 보팅 | 랜덤 포레스트|
|부스팅(Boosting) | 여러 개의 분류기가 순차적으로 학습하면서 앞에서 학습한 분류기가 틀린 데이터에 대해서는 가중치를 부여하면서 학습과 예측을 진행 | GBM, XGBoost|
|스태킹(Stacking) | 스태킹은 여러가지 다른 모델의 예측 결과값을 다시 학습데이터로 만들어 다른 모델로 재학습시켜 결과를 예측하는 방법 | -|

# 2 Random Forest

- 회귀 결과 예측 방법: 각 트리가 예측한 값(리프 노드의 값)들을 평균
- 분류 결과 예측 방법: 각 트리가 예측한 범주의 개수를 카운팅하여 다수결로 결정
  - Hard voting
  - Soft voting

### 2.1 Random Forest 모델 생성

In [None]:
# Visual Python: Machine Learning > Data Sets
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset, then build one frame holding the feature
# columns followed by the class label in a trailing 'target' column.
ldata = load_breast_cancer()
df_ldata = pd.DataFrame(ldata.data, columns=ldata.feature_names).assign(target=ldata.target)
df_ldata

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

# Features are every column except the label; this selects the same 30
# columns, in the same order, as listing them by name.
# random_state is fixed so the split -- and therefore every score reported
# further down -- is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df_ldata.drop(columns=['target']),
    df_ldata['target'],
    random_state=0,
)

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without a fixed random_state the fitted model differs on every run.
model = RandomForestClassifier(random_state=0)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Fit the random forest classifier on the training split
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Predict class labels for the held-out test split and show them as the cell output
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
from sklearn import metrics

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels.
# Both axes are named explicitly; otherwise the unnamed prediction array gets
# pandas' placeholder header 'col_0'. margins=True appends the 'All' totals.
pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report: text summary of precision, recall, F1 and support
# for each class, comparing the test labels with the predictions
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Evaluation
# Accuracy: fraction of test samples whose predicted label equals the true label
metrics.accuracy_score(y_test, pred)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Precision, computed per class and then averaged with each class weighted by
# its number of true samples (average='weighted')
metrics.precision_score(y_test, pred, average='weighted')

In [None]:
# Visual Python: Machine Learning > Evaluation
# Recall, computed per class and then averaged with each class weighted by
# its number of true samples (average='weighted')
metrics.recall_score(y_test, pred, average='weighted')

In [None]:
# Visual Python: Machine Learning > Evaluation
# F1-score, computed per class and then averaged with each class weighted by
# its number of true samples (average='weighted')
metrics.f1_score(y_test, pred, average='weighted')

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Tabulate a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Any object exposing a ``feature_importances_`` sequence.
    X_train : pandas.DataFrame, optional
        When a DataFrame is passed, its column labels become the row index of
        the result. Any other value (including ``None``) produces the generic
        labels ``X0``, ``X1``, ...
    sort : bool, default False
        When true, rows are ordered by importance, largest first.

    Returns
    -------
    pandas.DataFrame
        One row per feature with two columns, both rounded to two decimals:
        ``Feature_importance`` (the raw value) and ``Percentage`` (the value
        relative to the largest importance, scaled so the top feature is 100).
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        labels = ['X{}'.format(pos) for pos in range(len(importances))]

    table = pd.DataFrame(importances, index=labels, columns=['Feature_importance'])
    # Relative scale: each importance divided by the largest one, times 100
    strongest = table['Feature_importance'].max()
    table['Percentage'] = 100 * (table['Feature_importance'] / strongest)

    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)

    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of relative feature importances.

    Parameters
    ----------
    model, X_train, sort
        Forwarded unchanged to ``vp_create_feature_importances``.
    top_count : int, default 0
        Only consulted when ``sort`` is true: a positive value keeps just the
        ``top_count`` largest bars. With ``sort`` false every feature is
        plotted in its original order and ``top_count`` has no effect.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Ascending order puts the largest bar at the top of a 'barh' chart
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Bar chart of the 10 highest-ranked features of the random forest classifier
vp_plot_feature_importances(model, X_train, sort=True, top_count=10)

# 3 GBM - Gradient Boosting Machine

- AdaBoost

- GBM
  - 회귀: 잔차를 예측
  - 분류: log loss(로그 손실)를 줄이는 방향(음의 그래디언트)을 예측

#### GBM 적용

In [None]:
# Visual Python: Machine Learning > GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Grid search over learning_rate x max_depth for a gradient-boosting classifier.
# The estimator is seeded with random_state=0 (the seed the standalone GBM
# cells in this notebook use) so the search result is reproducible across runs.
gs = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    {'learning_rate': [0.01, 0.02], 'max_depth': [1, 3, 5]},
)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Run the grid search on the training split (one fit per parameter combination and CV fold)
gs.fit(X_train, y=y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Predict test-split labels through the fitted grid search and show them
pred = gs.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
from IPython.display import display, Markdown

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels.
# Both axes are named explicitly; otherwise the unnamed prediction array gets
# pandas' placeholder header 'col_0'. margins=True appends the 'All' totals.
display(Markdown('### Confusion Matrix'))
display(pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'], margins=True))

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report: text summary of precision, recall, F1 and support
# for each class, comparing the test labels with the grid-search predictions
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Model Info
# Estimator selected by the grid search (exposed as gs.best_estimator_), shown as the cell output
best_estimator = gs.best_estimator_
best_estimator

In [None]:
# Visual Python: Machine Learning > Model Info
# Parameter combination selected by the grid search (gs.best_params_)
best_params = gs.best_params_
best_params

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# GBM with the default tree depth; fit() returns the estimator, so construction
# and training are chained into one statement.
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Training accuracy first, then held-out accuracy, one per line
print(
    '훈련 세트 정확도: {:.3f}'.format(gbrt.score(X_train, y_train)),
    '테스트 세트 정확도: {:.3f}'.format(gbrt.score(X_test, y_test)),
    sep='\n',
)

#### max_depth = 1

In [None]:
# Same GBM restricted to depth-1 trees; fit() returns the estimator, so
# construction and training are chained into one statement.
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1).fit(X_train, y_train)

# Training accuracy first, then held-out accuracy, one per line
print(
    '훈련 세트 정확도: {:.3f}'.format(gbrt.score(X_train, y_train)),
    '테스트 세트 정확도: {:.3f}'.format(gbrt.score(X_test, y_test)),
    sep='\n',
)

# 4 XGBoost - eXtreme Gradient Boosting

In [None]:
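# The cells below need the xgboost package. To install it into this kernel's
# environment, uncomment and run the next line once:
# %pip install xgboost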

In [None]:
# Visual Python: Machine Learning > Classifier
from xgboost import XGBClassifier

# XGBoost classifier created with no arguments, i.e. library-default hyperparameters
xgb = XGBClassifier()

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Fit the XGBoost classifier on the training split
xgb.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Predict class labels for the test split with the XGBoost model and show them
pred = xgb.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
from IPython.display import display, Markdown

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels.
# Both axes are named explicitly; otherwise the unnamed prediction array gets
# pandas' placeholder header 'col_0'. margins=True appends the 'All' totals.
display(Markdown('### Confusion Matrix'))
display(pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'], margins=True))

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report: text summary of precision, recall, F1 and support
# for each class, comparing the test labels with the XGBoost predictions
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Model Info
# Bar chart of every feature's relative importance in the XGBoost model, sorted
# (top_count is left at its default of 0, so no features are cut off)
vp_plot_feature_importances(xgb, X_train, sort=True)

# 5 Ensemble - 수치 예측

In [None]:
# Load the data
# The path is relative to the notebook's working directory.
# NOTE(review): the split cell below reads 13 feature columns plus 'target'
# from this frame -- confirm the CSV actually has that schema.
df = pd.read_csv('data/boston.csv')
df

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

# Split the 13 feature columns and the 'target' column into train/test parts.
# random_state is fixed so the split -- and every regression metric computed
# from it -- is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']],
    df['target'],
    random_state=0,
)

In [None]:
# Visual Python: Machine Learning > Regressor
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap sampling is random, so without a fixed
# random_state the fitted regressor (and its scores) differ on every run.
model = RandomForestRegressor(random_state=0)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Fit the random forest regressor on the training split
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Predict target values for the held-out test split and show them
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
from IPython.display import display, Markdown

In [None]:
# Visual Python: Machine Learning > Evaluation
# R squared (coefficient of determination) of the predictions against y_test
print('R squared: {}'.format(metrics.r2_score(y_test, pred)))

In [None]:
# Visual Python: Machine Learning > Evaluation
# MAE (Mean Absolute Error): average absolute difference between y_test and pred
print('MAE: {}'.format(metrics.mean_absolute_error(y_test, pred)))

In [None]:
# Visual Python: Machine Learning > Evaluation
# RMSE (Root Mean Squared Error): square root (** 0.5) of the value returned by
# mean_squared_error, which puts the error back in the target's units
print('RMSE: {}'.format(metrics.mean_squared_error(y_test, pred)**0.5))

In [None]:
# Visual Python: Machine Learning > Evaluation
# Regression plot: predicted values against the actual test targets
display(Markdown('### Regression plot'))
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set_xlabel('y_test')
ax.set_ylabel('pred')
plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Bar chart of the 5 highest-ranked features of the random forest regressor
vp_plot_feature_importances(model, X_train, sort=True, top_count=5)

---

In [None]:
# End of file