<a href="https://colab.research.google.com/github/leepopnamoo/SQL-Study/blob/main/Ch5_Ensemble_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Ch5_앙상블 모델 실습**

## **보팅(Voting) 분류기**

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### **데이터 불러오기**

In [None]:
import pandas as pd

# Fetch the breast-cancer dataset and expose its feature matrix as a
# DataFrame so the columns can be inspected by name.
cancer = load_breast_cancer()
data_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
data_df.head(n=3)

In [None]:
# Print the DataFrame summary (column names, non-null counts, dtypes).
data_df.info()

### **학습용, 테스트용 데이터 구분**

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=156,
)

### **로지스틱 회귀, 최근접 이웃 모델 생성**

In [None]:
# Base learners for the voting ensemble.
# max_iter is raised well above the default of 100: on these unscaled
# features the default solver stops before converging and emits a
# ConvergenceWarning, leaving the coefficients short of the optimum.
lr_clf = LogisticRegression(max_iter=10000)
knn_clf = KNeighborsClassifier(n_neighbors=8)

### **소프트 보팅 앙상블 모델 생성(로지스틱 회귀, 최근접 이웃)**

In [None]:
# Soft voting: combine the two base learners through their predicted
# class probabilities rather than their hard labels.
vo_clf = VotingClassifier(
    estimators=[
        ('LR', lr_clf),
        ('KNN', knn_clf),
    ],
    voting='soft',
)

### **소프트 보팅 앙상블 모델 학습/예측/평가**

In [None]:
# Train the ensemble, predict the held-out split and report its accuracy.
vo_clf.fit(X=X_train, y=y_train)
pred = vo_clf.predict(X=X_test)
print('Voting 분류기 정확도: {:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

### **개별 모델과 성능 비교**

In [None]:
# Fit each base learner on its own and print its test accuracy next to its
# class name, for comparison with the voting ensemble.
classifiers = [lr_clf, knn_clf]
for model in classifiers:
    model.fit(X=X_train, y=y_train)
    pred = model.predict(X=X_test)
    model_name = type(model).__name__
    print('{0} 정확도: {1:.4f}'.format(model_name, accuracy_score(y_true=y_test, y_pred=pred)))

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Bagging ensemble of 500 decision trees, each trained on a bootstrap sample
# of 100 training rows, built in parallel on all available cores.
# random_state pins the bootstrap draws and tree randomness so the reported
# accuracy is reproducible on a fresh Restart & Run All.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the bagging ensemble on the training split.
bag_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out split with the bagging ensemble.
y_pred = bag_clf.predict(X=X_test)

In [None]:
# Report the bagging ensemble's test accuracy.
print('{0} 정확도: {1:.4f}'.format('배깅', accuracy_score(y_true=y_test, y_pred=y_pred)))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the bagging predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

## **랜덤 포레스트**

In [None]:
from sklearn.ensemble import RandomForestClassifier

### **랜덤 포레스트 생성 및 학습**

In [None]:
# Random forest of 500 trees with a fixed seed, trained on the training split.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
)
rnd_clf.fit(X=X_train, y=y_train)

### **테스트 데이터 예측 및 성능 평가**

In [None]:
# Score the random forest on the held-out split.
pred_rnd_clf = rnd_clf.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred_rnd_clf)
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

## **LightGBM**

In [None]:
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

### **데이터 불러오기**

In [None]:
# Reload the breast-cancer dataset and keep handles on features and labels.
dataset = load_breast_cancer()
ftr, target = dataset.data, dataset.target

In [None]:
# Wrap the feature matrix in a DataFrame with named columns and preview it.
dataset_df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)
dataset_df.head(n=3)

In [None]:
# Append the class labels as a 'target' column and preview the result.
dataset_df["target"] = dataset.target
dataset_df.head(n=3)

### **학습용, 테스트용 데이터 구분**

In [None]:
# Features are every column except 'target' (the last column); hold out 20%
# of the rows for testing with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df.drop(columns="target"),
    dataset_df["target"],
    test_size=0.2,
    random_state=156,
)

### **LightGBM 분류기**

In [None]:
# LightGBM classifier with up to 200 boosting rounds; all other
# hyperparameters are left at the library defaults.
lgbm_wrapper = LGBMClassifier(n_estimators=200)

### **LightGBM 분류기 학습 설정**

In [None]:
from lightgbm import early_stopping, log_evaluation

# Early stopping and per-iteration logging are configured through callbacks:
# the `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0 and raise TypeError there.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the metrics reported afterwards are optimistic -- consider carving a
# validation split out of the training data instead.
evals = [(X_test, y_test)]
lgbm_wrapper.fit(
    X_train,
    y_train,
    eval_metric="logloss",
    eval_set=evals,
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=1),             # log the metric every round (was verbose=True)
    ],
)

### **모델 성능 평가**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

#### **교차표 확인**

In [None]:
# Predict the held-out split and tabulate true vs. predicted labels.
preds = lgbm_wrapper.predict(X=X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=preds)
confusion

In [None]:
# Fraction of test samples classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
print('정확도: {0:.4f}'.format(accuracy))

In [None]:
 # Precision of the LightGBM predictions on the test split.
 # NOTE(review): the lines of this cell carry a stray one-space indent; it is
 # kept as-is here, but it can be removed safely.
 precision = precision_score(y_true=y_test, y_pred=preds)
 print('정밀도: {0:.4f}'.format(precision))

In [None]:
 # Recall of the LightGBM predictions on the test split.
 # NOTE(review): the lines of this cell carry a stray one-space indent; it is
 # kept as-is here, but it can be removed safely.
 recall = recall_score(y_true=y_test, y_pred=preds)
 print('재현율: {0:.4f}'.format(recall))

In [None]:
# F1 score of the LightGBM predictions on the test split.
f1 = f1_score(y_true=y_test, y_pred=preds)
print('F1: {0:.4f}'.format(f1))

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be scored on the predicted probability of the positive class (column 1
# of predict_proba), not on the hard 0/1 labels -- labels collapse the ROC
# curve to a single operating point and understate the AUC.
roc_score = roc_auc_score(y_test, lgbm_wrapper.predict_proba(X_test)[:, 1])

In [None]:
# Report ROC AUC on its native 0-1 scale, like every other metric printed in
# this notebook (the previous `* 100` printed a percentage under an
# unchanged "ROC AUC" label).
print('ROC AUC: {0:.4f}'.format(roc_score))

### **입력 데이터 중요도 시각화**

In [None]:
from lightgbm import plot_importance
import matplotlib.pyplot as plt

In [None]:
import lightgbm as lgb

# Call plot_importance through the lightgbm module: the bare name
# `plot_importance` is re-imported from xgboost further down in this
# notebook, so re-running this cell afterwards would otherwise hand a
# LightGBM model to XGBoost's plotting function.
fig, ax = plt.subplots(figsize=(10, 12))
lgb.plot_importance(lgbm_wrapper, ax=ax)

## **XGBoost**

In [None]:
from xgboost import XGBClassifier

### **데이터 불러오기**

In [None]:
# Reload the dataset, keep raw feature/label arrays for training, and build a
# DataFrame (features plus a 'target' column) for a quick preview.
dataset = load_breast_cancer()
X_features, y_label = dataset.data, dataset.target

cancer_df = pd.DataFrame(X_features, columns=dataset.feature_names)
cancer_df["target"] = y_label
cancer_df.head(n=3)

### **학습용, 테스트용 데이터 구분**

In [None]:
# 80/20 train/test split with a fixed seed, then show both shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_label,
    test_size=0.2,
    random_state=156,
)
print(X_train.shape, X_test.shape)

### **XGBoost 분류기 학습**

In [None]:
# Early-stopping settings live on the estimator: passing
# `early_stopping_rounds=` / `eval_metric=` to fit() was deprecated in
# XGBoost 1.6 and removed in 2.0, where it raises TypeError.
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    early_stopping_rounds=100,  # stop after 100 rounds without improvement
    eval_metric="error",        # classification error rate on the eval set
)
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the metrics reported afterwards are optimistic -- consider a separate
# validation split.
evals = [(X_test, y_test)]
xgb_wrapper.fit(X_train, y_train, eval_set=evals, verbose=True)

# Hard labels and positive-class probabilities for the held-out split.
ws100_preds = xgb_wrapper.predict(X_test)
ws100_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

### **예측 성능 확인**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted hard labels. Required despite the ``None`` default,
            which is kept only for signature compatibility.
        pred_proba: Optional scores/probabilities of the positive class.
            When given, ROC AUC is computed from it and appended to the
            report; when ``None``, the AUC line is omitted instead of
            failing inside ``roc_auc_score``.

    Returns:
        None. The report is written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('오차 행렬')
    print(confusion)

    if pred_proba is None:
        # No scores supplied: report only the label-based metrics.
        print('- 정확도: {0:.4f}\n- 정밀도: {1:.4f}\n- 재현율: {2:.4f}\n- F1: {3:.4f}'.format(accuracy, precision, recall, f1))
        return

    # ROC AUC is computed from the positive-class scores.
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('- 정확도: {0:.4f}\n- 정밀도: {1:.4f}\n- 재현율: {2:.4f}\n- F1: {3:.4f}\n- AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [None]:
# Report the XGBoost model's metrics from its hard predictions and its
# positive-class probabilities on the test split.
get_clf_eval(y_test ,ws100_preds, ws100_pred_proba)

### **입력 데이터 중요도 시각화**

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Draw the fitted XGBoost model's feature importances on a tall figure.
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(booster=xgb_wrapper, ax=ax)

### **하이퍼 파라미터 튜닝**

In [None]:
from sklearn.model_selection import GridSearchCV

# n_estimators is lowered to 100 to keep the hyperparameter search fast.
# Early-stopping settings are set on the estimator because XGBoost 2.0
# removed the `early_stopping_rounds=` / `eval_metric=` arguments of fit();
# GridSearchCV clones the estimator, so every candidate inherits them.
xgb_clf = XGBClassifier(n_estimators=100, early_stopping_rounds=30, eval_metric="auc")

params = {'max_depth':[5, 7] , 'min_child_weight':[1,3] ,'colsample_bytree':[0.5, 0.75] }

# Use 3-fold cross-validation.
gridcv = GridSearchCV(xgb_clf, param_grid=params, cv=3)
# eval_set is forwarded unchanged to each underlying fit() call.
# NOTE(review): the eval_set contains the full training data and the test
# split, so early stopping inside each CV fold sees held-out rows -- this
# leaks information into the search; consider a dedicated validation split.
gridcv.fit(X_train, y_train,
           eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
# Show the best parameter combination, then score the refit best model on the
# test split using positive-class probabilities.
print('GridSearchCV 최적 파라미터:', gridcv.best_params_)
xgb_roc_score = roc_auc_score(
    y_test,
    gridcv.predict_proba(X_test)[:, 1],
    average='macro',
)
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))