# Ensemble

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mglearn

from matplotlib import rcParams

# Notebook-wide matplotlib defaults, applied in a single update.
rcParams.update({
    'figure.figsize': (10, 6),
    # presumably chosen so the Korean plot labels render; requires this font to be installed
    'font.family': 'New Gulim',
    'font.size': 10,
    # Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
    'axes.unicode_minus': False,
})

# 1 Ensemble
- 여러 개의 분류기(Classifier)를 생성하고 그 예측을 결합함으로써 보다 정확한 최종 예측을 수행
- Overfitting 방지

구분 | 지도 학습 | 비고
:--- |:--- |: ---
보팅(Voting) | 서로 다른 알고리즘이 같은 데이터 세트에 대해 학습하고 예측한 결과를 보팅 (Hard Voting / Soft Voting) | VotingClassifier
배깅(Bagging) | 단일 결정 트리로 데이터 샘플링을 서로 다르게 가져가면서 학습을 수행해 보팅 | 랜덤 포레스트
부스팅(Boosting) | 여러 개의 분류기가 순차적으로 학습하면서 앞에서 학습한 분류기가 틀린 데이터에 대해서는 가중치를 부여하면서 학습과 예측을 진행 | GBM, XGBoost
스태킹(Stacking) | 스태킹은 여러가지 다른 모델의 예측 결과값을 다시 학습데이터로 만들어 다른 모델로 재학습시켜 결과를 예측하는 방법 | -

# 2 Random Forest

[Random Forest](https://injo.tistory.com/30)

- 회귀 결과 예측 방법: 노드의 값들을 평균
- 분류 결과 예측 방법: 범주의 개수 카운팅
  - [Hard voting, Soft voting](https://devkor.tistory.com/entry/Soft-Voting-%EA%B3%BC-Hard-Voting)

### 2.1 Random Forest 모델 생성

In [None]:
# Create the toy dataset and split it into training and test sets.

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# 100 noisy samples from the "moons" generator, with a fixed seed.
X, y = make_moons(
    n_samples=100,
    noise=0.25,
    random_state=123,
)

# stratify=y is passed so the split is stratified on the labels.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, stratify=y, random_state=123)

In [None]:
# Build and train the model - Random Forest

from sklearn.ensemble import RandomForestClassifier

# A small forest of 5 trees with a fixed seed, fitted on the training split.
forest = RandomForestClassifier(n_estimators=5, random_state=123)
forest.fit(X_train, y_train)

### 2.2 Random Forest 결정 경계 시각화

In [None]:
# Draw each tree's partition of the feature space in its own panel and the
# forest's combined decision boundary in the last panel of a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))

# zip() stops at the shorter sequence, so panels are only filled while
# there are trees left in forest.estimators_.
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title(f'트리 {i}')
    mglearn.plots.plot_tree_partition(X, y, tree, ax=ax)

forest_ax = axes[-1, -1]
forest_ax.set_title('랜덤 포레스트')
mglearn.plots.plot_2d_separator(forest, X, fill=True, ax=forest_ax, alpha=0.4)

# Pass the target axes explicitly instead of relying on pyplot's implicit
# "current axes", so the points always land on the forest panel.
mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=forest_ax)
plt.show()

### 2.3 Feature Importance

In [None]:
# Load the data and split it into training and test sets.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# No test_size is given, so train_test_split uses its default proportions.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=123,
)

In [None]:
# Build and train the model - Random Forest

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, random_state=123)
forest.fit(X_train, y_train)

# Report the score on the training split first, then on the test split.
for template, features, labels in (
    ('훈련 세트 정확도: {:.3f}', X_train, y_train),
    ('테스트 세트 정확도: {:.3f}', X_test, y_test),
):
    print(template.format(forest.score(features, labels)))

In [None]:
def plot_feature_importances_cancer(model):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Reads the module-level ``cancer`` dataset for the number of features
    and the tick labels, and draws through ``plt`` on the current figure.

    Args:
        model: Object exposing a ``feature_importances_`` attribute.
    """
    feature_count = cancer.data.shape[1]
    positions = np.arange(feature_count)

    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, cancer.feature_names)
    plt.xlabel('특성 중요도')
    plt.ylabel('특성')
    # Leave one unit of padding below the first bar and above the last one.
    plt.ylim(-1, feature_count)

In [None]:
# Feature Importance

plot_feature_importances_cancer(forest)

# 3 GBM - Gradient Boosting Machine

[AdaBoost](https://dailyheumsi.tistory.com/115?category=877153)  
[GBM](https://dailyheumsi.tistory.com/116?category=877153)  
[GBM](https://yngie-c.github.io/machine%20learning/2021/03/21/gbm/)

- 회귀: 잔차를 예측
- 분류: log loss를 손실 함수로 사용하여, 그 음의 기울기(의사 잔차: 실제값 − 예측 확률)를 예측

#### GBM 적용

In [None]:
# Split the data (uses `cancer` and `train_test_split` from the earlier cells)

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults apart from the fixed seed.
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

# Report the score on the training split first, then on the test split.
for template, features, labels in (
    ('훈련 세트 정확도: {:.3f}', X_train, y_train),
    ('테스트 세트 정확도: {:.3f}', X_test, y_test),
):
    print(template.format(gbrt.score(features, labels)))

#### max_depth = 1

In [None]:
# Same model, but every tree is limited to depth 1.
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)

# Report the score on the training split first, then on the test split.
for template, features, labels in (
    ('훈련 세트 정확도: {:.3f}', X_train, y_train),
    ('테스트 세트 정확도: {:.3f}', X_test, y_test),
):
    print(template.format(gbrt.score(features, labels)))

#### 학습률 조정 - learning_rate = 0.01

In [None]:
# Same model, but with the learning rate set to 0.01.
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)

# Report the score on the training split first, then on the test split.
for template, features, labels in (
    ('훈련 세트 정확도: {:.3f}', X_train, y_train),
    ('테스트 세트 정확도: {:.3f}', X_test, y_test),
):
    print(template.format(gbrt.score(features, labels)))

#### Feature Importance

In [None]:
# Refit with depth-1 trees, then plot that model's feature importances.
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)

plot_feature_importances_cancer(gbrt)

# 4 XGBoost - eXtreme Gradient Boosting

In [None]:
#!pip install xgboost

In [None]:
import xgboost
print(xgboost.__version__)

In [None]:
# Load the data: Wisconsin Breast Cancer dataset

from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
# Feature values and labels, kept as separate arrays.
X = cancer.data
y = cancer.target

In [None]:
# Tabular view of the features, with the labels appended as a 'target' column.
df = pd.DataFrame(X, columns=cancer.feature_names).assign(target=y)
df

In [None]:
print(cancer.target_names)

In [None]:
df['target'].value_counts()

In [None]:
# Split the data: test_size=0.2 holds out 20% of the samples as the test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print(X_train.shape , X_test.shape)

In [None]:
# Build and train the XGBoost model

from xgboost import XGBClassifier

# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so metrics later computed on X_test may be optimistic; a separate
# validation split would avoid that.
evals = [(X_test, y_test)]

# eval_metric and early_stopping_rounds are estimator parameters in
# xgboost >= 1.6; passing them to fit() is deprecated there and rejected by
# xgboost 2.x. use_label_encoder is omitted because it is deprecated and has
# no effect in those versions.
# early_stopping_rounds equals n_estimators here, so early stopping cannot
# trigger.
xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
    early_stopping_rounds=400,
)

xgb.fit(X_train, y_train, eval_set=evals, verbose=True)

In [None]:
# Predict the results for the test set
pred = xgb.predict(X_test)

In [None]:
# Evaluation: confusion matrix of true labels vs. predictions
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, pred)

print('Confusion Matrix\n', confusion)

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [None]:
# Build and train the XGBoost model - early_stopping_rounds=100

from xgboost import XGBClassifier

# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so metrics later computed on X_test may be optimistic; a separate
# validation split would avoid that.
evals = [(X_test, y_test)]

# eval_metric and early_stopping_rounds are estimator parameters in
# xgboost >= 1.6; passing them to fit() is deprecated there and rejected by
# xgboost 2.x. use_label_encoder is omitted because it is deprecated and has
# no effect in those versions.
xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
    early_stopping_rounds=100,
)

xgb.fit(X_train, y_train, eval_set=evals, verbose=True)

In [None]:
# Predict the results for the test set
pred = xgb.predict(X_test)

In [None]:
# Evaluation: confusion matrix of true labels vs. predictions
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, pred)

print('Confusion Matrix\n', confusion)

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [None]:
# Build and train the XGBoost model - early_stopping_rounds=10

from xgboost import XGBClassifier

# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so metrics later computed on X_test may be optimistic; a separate
# validation split would avoid that.
evals = [(X_test, y_test)]

# eval_metric and early_stopping_rounds are estimator parameters in
# xgboost >= 1.6; passing them to fit() is deprecated there and rejected by
# xgboost 2.x. use_label_encoder is omitted because it is deprecated and has
# no effect in those versions.
xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
    early_stopping_rounds=10,
)

xgb.fit(X_train, y_train, eval_set=evals, verbose=True)

In [None]:
# Predict the results for the test set
pred = xgb.predict(X_test)

In [None]:
# Evaluation: confusion matrix of true labels vs. predictions
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, pred)

print('Confusion Matrix\n', confusion)

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [None]:
# Feature Importance

# NOTE(review): the LightGBM section later imports a function with the same
# name, which rebinds `plot_importance`; this import is what makes the call
# below use xgboost's version.
from xgboost import plot_importance

plot_importance(xgb)
plt.show()

# 5 LightGBM
- XGBoost: 균형 트리 분할(Level Wise)
- LightGBM: 리프 중심 트리 분할(Leaf Wise)

In [None]:
#!pip install lightgbm

In [None]:
import lightgbm
print(lightgbm.__version__)

In [None]:
# Build and train the LightGBM model

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so metrics later computed on X_test may be optimistic; a separate
# validation split would avoid that.
evals = [(X_test, y_test)]

lgbm = LGBMClassifier(n_estimators=400)
# The early_stopping_rounds / verbose arguments of fit() were removed in
# LightGBM 4.0; the callbacks below are the supported replacement
# (log_evaluation(period=1) logs every iteration, like verbose=True did).
lgbm.fit(
    X_train,
    y_train,
    eval_metric='logloss',
    eval_set=evals,
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=1)],
)

In [None]:
# Predict the results for the test set
pred = lgbm.predict(X_test)

In [None]:
# Evaluation: confusion matrix of true labels vs. predictions
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, pred)

print('Confusion Matrix\n', confusion)

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [None]:
# Feature Importance

# NOTE(review): this rebinds `plot_importance`, replacing the xgboost
# function of the same name imported in the XGBoost section.
from lightgbm import plot_importance

plot_importance(lgbm)
plt.show()

# 6 Ensemble - 수치 예측

In [None]:
# Load the data
df = pd.read_csv('data/boston.csv')

# 'target' is the value to predict; every other column is a feature.
y = df['target'].values
X = df.drop(columns='target').values
df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [None]:
# Split the data into training and test sets (fixed seed)
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y, random_state=123)

### 6.1 Random Forest 적용

In [None]:
%%time
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=1000, random_state=0)

# 5-fold cross-validation over the full X, y. The scorer returns negated MSE,
# so the sign is flipped before taking the square root to get per-fold RMSE.
neg_mse_scores = cross_val_score(
    rf, X, y,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('교차 검증의 개별 Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('교차 검증의 개별 RMSE scores : ', np.round(rmse_scores, 2))
print('교차 검증의 평균 RMSE : {0:.3f} '.format(avg_rmse))

### 6.2 DT, GBM, XGBoost, LightGBM

In [None]:
%%time
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# One regressor per algorithm; the ensembles all use 1000 estimators.
dt_reg = DecisionTreeRegressor(max_depth=4, random_state=0)
rf_reg = RandomForestRegressor(n_estimators=1000, random_state=0)
gb_reg = GradientBoostingRegressor(n_estimators=1000, random_state=0)
xgb_reg = XGBRegressor(n_estimators=1000)
lgb_reg = LGBMRegressor(n_estimators=1000)

# Evaluate each tree-based regression model in turn.
models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]

for model in models:
    # The scorer returns negated MSE; flip the sign to get per-fold RMSE.
    neg_mse_scores = cross_val_score(
        model, X, y,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    rmse_scores = np.sqrt(-neg_mse_scores)
    avg_rmse = rmse_scores.mean()
    print('##### ', type(model).__name__, ' #####')
    print('교차 검증의 평균 RMSE : {:.3f} '.format(avg_rmse))

### 6.3 Feature Importance

In [None]:
import seaborn as sns

# Fit on the full data set. random_state is fixed, as in the other cells,
# so the importances (and the plot) are reproducible between runs.
rf_reg = RandomForestRegressor(n_estimators=1000, random_state=0)
rf_reg.fit(X, y)

# Pair each importance with its column name (the non-target columns of df),
# then sort so the most important feature comes first.
sr = pd.Series(data=rf_reg.feature_importances_, index=df.drop('target', axis=1).columns)
sr = sr.sort_values(ascending=False)

sns.barplot(x=sr, y=sr.index)
plt.show()

In [None]:
# Scatter plot of the RM column against the target column.
plt.scatter(df['RM'] , df['target'])
plt.xlabel('RM')
plt.ylabel('target')
plt.show()

### 6.4 결과 예측 회귀선

In [None]:
from sklearn.linear_model import LinearRegression

# Create a linear regressor and two decision-tree regressors; the DecisionTreeRegressor max_depth values are 2 and 7.
# NOTE(review): despite the rf_ prefix, rf_reg2 and rf_reg7 are single decision trees, not random forests.
lr_reg = LinearRegression()
rf_reg2 = DecisionTreeRegressor(max_depth=2)
rf_reg7 = DecisionTreeRegressor(max_depth=7)

# Inputs to predict on: values from 4.5 up to (not including) 8.5 in steps of 0.04, as a single column.
# NOTE(review): this rebinds X_test, replacing the test split created earlier in the notebook.
X_test = np.arange(4.5, 8.5, 0.04).reshape(-1, 1)

# Use only the RM column as the feature; both arrays are reshaped to a single column.
X_feature = df['RM'].values.reshape(-1,1)
y_target  = df['target'].values.reshape(-1,1)

# Fit the three models, then predict on the generated inputs.
lr_reg.fit(X_feature, y_target)
rf_reg2.fit(X_feature, y_target)
rf_reg7.fit(X_feature, y_target)

pred_lr = lr_reg.predict(X_test)
pred_rf2 = rf_reg2.predict(X_test)
pred_rf7 = rf_reg7.predict(X_test)


In [None]:
# Three side-by-side panels: the same RM/target scatter in each, overlaid
# with one model's prediction line.
fig, (ax1, ax2, ax3) = plt.subplots(figsize=(14, 4), ncols=3)

# Prediction line of the linear regression model
ax1.set_title('Linear Regression')
ax1.scatter(df['RM'], df['target'], c='darkorange')
ax1.plot(X_test, pred_lr, label='linear', linewidth=2)

# Prediction line of the DecisionTreeRegressor with max_depth=2
# (the label previously read 'max_depth:3', which did not match the model)
ax2.set_title('Decision Tree Regression: \n max_depth=2')
ax2.scatter(df['RM'], df['target'], c='darkorange')
ax2.plot(X_test, pred_rf2, label='max_depth:2', linewidth=2)

# Prediction line of the DecisionTreeRegressor with max_depth=7
ax3.set_title('Decision Tree Regression: \n max_depth=7')
ax3.scatter(df['RM'], df['target'], c='darkorange')
ax3.plot(X_test, pred_rf7, label='max_depth:7', linewidth=2)

# The line labels are only visible once a legend is drawn.
for panel in (ax1, ax2, ax3):
    panel.legend()

plt.show()

---

In [None]:
# End of file