#### Ensemble - RandomForest & ExtraTree
 - 배깅 방식의 앙상블 ==> 중복 허용한 랜덤 샐픔 + 동일 모델(DecisionTree)
   * 대표 알고리즘 : RandomForestC/R
 - 페이스트 방식의 앙상블 ==> 랜덤 샘플 + 동일 모델(DecisionTree)
   * 대표 알고리즘 : ExtraTreeC/R

- 와인분류 => 0과 1 2개 종류 분류

[1] 모듈 로딩 및 데이터 준비

In [64]:
#  모듈로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [65]:
# 데이터
wineDF = pd.read_csv(r'C:\Users\KDP-17\EX_PANDAS6\MachineLearning\data\wine.csv')
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [66]:
# 타겟/라벨의 클래스 분포
wineDF['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [67]:
wineDF.describe()

featureDF = wineDF[wineDF.columns[:-1]]
targetSR =  wineDF[wineDF.columns[-1]]

print(f'featureDF : {featureDF.shape}, targetSr : {targetSR.shape}')



featureDF : (6497, 3), targetSr : (6497,)


[2] 학습 준비

In [68]:
# 학습용 & 테스트용 데이터셋 분할
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(featureDF,targetSR,
                                                    test_size=0.2,
                                                    stratify=targetSR,
                                                    random_state=10)

In [70]:
print(f'X_train : {X_train.shape}, y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}, y_test : {y_test.shape}')

X_train : (5197, 3), y_train : (5197,)
X_test : (1300, 3), y_test : (1300,)


[3] 학습 진행

In [71]:
# 학습 방법 : 지도학습 > 분류
# 알고리즘 : 앙상블 > 배깅 - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [72]:
# 인스턴스 객체 생성 => 100개의 내부 DT 모델에서 사용할 데이터셋 생성
#                    random state 매개변수 설정으로 고정된 데이터셋 생성
#                    oob_score 매개변수 : 샘플 데이터셋 추출 후 남은 데이터셋 검증용으로 사용
rf_model = RandomForestClassifier(random_state=7,
                                  oob_score=True)

# 학습
rf_model.fit(X_train,y_train)

In [73]:
# 모델 파라미터
print(f'classes_ : {rf_model.classes_}')
print(f'n_classes_ : {rf_model.n_classes_}개')
print(f'feature_names_in_ : {rf_model.feature_names_in_}')
print(f'n_features_in_ : {rf_model.n_features_in_}개')
print(f'feature_importances_ : {rf_model.feature_importances_}')

classes_ : [0. 1.]
n_classes_ : 2개
feature_names_in_ : ['alcohol' 'sugar' 'pH']
n_features_in_ : 3개
feature_importances_ : [0.23396134 0.49681321 0.26922545]


In [74]:
# 모델 파라미터 
print(f'       : {rf_model.estimator_}')
for est in rf_model.estimators_: print(est)


       : DecisionTreeClassifier()
DecisionTreeClassifier(max_features='sqrt', random_state=327741615)
DecisionTreeClassifier(max_features='sqrt', random_state=976413892)
DecisionTreeClassifier(max_features='sqrt', random_state=1202242073)
DecisionTreeClassifier(max_features='sqrt', random_state=1369975286)
DecisionTreeClassifier(max_features='sqrt', random_state=1882953283)
DecisionTreeClassifier(max_features='sqrt', random_state=2053951699)
DecisionTreeClassifier(max_features='sqrt', random_state=959775639)
DecisionTreeClassifier(max_features='sqrt', random_state=1956722279)
DecisionTreeClassifier(max_features='sqrt', random_state=2052949340)
DecisionTreeClassifier(max_features='sqrt', random_state=1322904761)
DecisionTreeClassifier(max_features='sqrt', random_state=165338510)
DecisionTreeClassifier(max_features='sqrt', random_state=1133316631)
DecisionTreeClassifier(max_features='sqrt', random_state=4812360)
DecisionTreeClassifier(max_features='sqrt', random_state=372560217)
Decision

In [75]:
print(f'oob_score : {rf_model.oob_score_}')

oob_score : 0.8974408312487974


[4] 성능 평가

In [76]:
train_score = rf_model.score(X_train,y_train)
test_score = rf_model.score(X_test,y_test)

In [77]:
print(f'train_score : {train_score}, test_score : {test_score}')

train_score : 0.9976909755628247, test_score : 0.8930769230769231


[5] 튜닝
 - RandomizedSearchCV 하이퍼파라미터 최적화 클래스
   * 범위가 넓은 하이퍼파라미터 설정에 좋음
   * 지정된 범윙에서 지정된 횟수만큼 하이퍼파라미터를 추출하여 조합 진행

In [78]:
# 모듈로딩
from sklearn.model_selection import RandomizedSearchCV

In [79]:
# RandomForestClassifier 하이퍼파라미터 설정
params = {'max_depth':range(2,16),
          'min_samples_leaf':range(5,16)}
        

In [80]:
rf_model = RandomForestClassifier(random_state=7)

In [81]:
searchCV = RandomizedSearchCV(rf_model,param_distributions=params,
                              n_iter=50,
                              verbose=4)

In [82]:
searchCV.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ..max_depth=9, min_samples_leaf=13;, score=0.875 total time=   0.2s
[CV 2/5] END ..max_depth=9, min_samples_leaf=13;, score=0.864 total time=   0.2s
[CV 3/5] END ..max_depth=9, min_samples_leaf=13;, score=0.868 total time=   0.2s
[CV 4/5] END ..max_depth=9, min_samples_leaf=13;, score=0.863 total time=   0.2s
[CV 5/5] END ..max_depth=9, min_samples_leaf=13;, score=0.868 total time=   0.2s
[CV 1/5] END ..max_depth=8, min_samples_leaf=12;, score=0.877 total time=   0.1s
[CV 2/5] END ..max_depth=8, min_samples_leaf=12;, score=0.865 total time=   0.2s
[CV 3/5] END ..max_depth=8, min_samples_leaf=12;, score=0.868 total time=   0.2s
[CV 4/5] END ..max_depth=8, min_samples_leaf=12;, score=0.861 total time=   0.1s
[CV 5/5] END ..max_depth=8, min_samples_leaf=12;, score=0.871 total time=   0.2s
[CV 1/5] END ..max_depth=12, min_samples_leaf=9;, score=0.879 total time=   0.2s
[CV 2/5] END ..max_depth=12, min_samples_leaf=9

In [83]:
# 모델 파라미터
print(f'searchCV.best_score : {searchCV.best_score_}')
print(f'searchCV.best_params_ : {searchCV.best_params_}')
print(f'searchCV.best_estimator_ : {searchCV.best_estimator_}')

cv_resulDF = pd.DataFrame(searchCV.cv_results_)

cv_resulDF

searchCV.best_score : 0.8789675723698822
searchCV.best_params_ : {'min_samples_leaf': 6, 'max_depth': 15}
searchCV.best_estimator_ : RandomForestClassifier(max_depth=15, min_samples_leaf=6, random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.262262,0.021541,0.01513,0.001089,13,9,"{'min_samples_leaf': 13, 'max_depth': 9}",0.875,0.864423,0.868142,0.86333,0.868142,0.867808,0.004086,26
1,0.237405,0.003348,0.012916,0.002982,12,8,"{'min_samples_leaf': 12, 'max_depth': 8}",0.876923,0.865385,0.868142,0.861405,0.87103,0.868577,0.005244,20
2,0.278785,0.010487,0.01595,0.002238,9,12,"{'min_samples_leaf': 9, 'max_depth': 12}",0.878846,0.873077,0.87103,0.86333,0.880654,0.873388,0.006155,11
3,0.142632,0.004944,0.010467,0.005551,11,2,"{'min_samples_leaf': 11, 'max_depth': 2}",0.753846,0.755769,0.754572,0.770934,0.753609,0.757746,0.006637,48
4,0.249642,0.004628,0.017169,0.002763,12,12,"{'min_samples_leaf': 12, 'max_depth': 12}",0.870192,0.865385,0.87488,0.85948,0.87488,0.868963,0.005903,16
5,0.206455,0.011654,0.016031,0.006802,6,5,"{'min_samples_leaf': 6, 'max_depth': 5}",0.870192,0.851923,0.862368,0.858518,0.858518,0.860304,0.005978,38
6,0.256978,0.004203,0.015723,0.002892,13,15,"{'min_samples_leaf': 13, 'max_depth': 15}",0.869231,0.865385,0.870067,0.862368,0.872955,0.868001,0.003712,23
7,0.263313,0.014797,0.016604,0.003824,12,15,"{'min_samples_leaf': 12, 'max_depth': 15}",0.875962,0.863462,0.871992,0.860443,0.872955,0.868963,0.005949,18
8,0.285395,0.006519,0.02045,0.00554,6,14,"{'min_samples_leaf': 6, 'max_depth': 14}",0.880769,0.877885,0.875842,0.870067,0.884504,0.877814,0.004846,4
9,0.175148,0.003182,0.014701,0.001326,11,4,"{'min_samples_leaf': 11, 'max_depth': 4}",0.858654,0.838462,0.846006,0.841193,0.843118,0.845487,0.007029,40
