#### Ensemble - RandomForest & ExtraTrees
- 배깅 방식의 앙상블 ==> 중복 허용한 랜덤 샘플 + 동일 모델 (DT)
    * 대표 알고리즘 : RandomForest
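- 페이스팅(pasting) 방식의 앙상블 ==> 중복 허용하지 않은 랜덤 샘플 + 동일 모델 (DT)
    * 대표 알고리즘 : ExtraTreesClassifier/ExtraTreesRegressor (기본값 bootstrap=False ==> 전체 학습 데이터 사용 + 분할 임계값 무작위 선택)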

[목표] 와인 분류 => 0과 1 / 2개 종류 분류

[1] 모듈 로딩 및 데이터 준비

In [1]:
# 모듈 로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Data: relative path to the wine CSV file
datafile = '../data/wine.csv'

# Load the CSV into a DataFrame used by every later cell
wineDF = pd.read_csv(datafile)

In [3]:
# Inspect the data: column names, non-null counts, dtypes and memory usage
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [4]:
# Preview the first rows of the dataset
wineDF.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [5]:
# Class distribution of the target/label column
wineDF['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
wineDF.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


[2] 학습 준비

In [7]:
# 학습용 & 테스트용 데이터셋 분할
from sklearn.model_selection import train_test_split

In [8]:
# Separate the features (independent variables) from the target/label
# (dependent variable): the last column is the target, all others are features.
*feature_cols, target_col = wineDF.columns
featureDF = wineDF[feature_cols]
targetSR = wineDF[target_col]

print(f'featureDF : {featureDF.shape}, {featureDF.ndim}D    targetSR : {targetSR.shape}, {targetSR.ndim}D')

featureDF : (6497, 3), 2D    targetSR : (6497,), 1D


In [9]:
# Split into training and test sets: 20% is held out for testing, the split is
# stratified on the target, and random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, test_size=0.2, stratify=targetSR, random_state=1)

In [10]:
# Drop the row labels carried over from wineDF so that every split is
# re-indexed from 0.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [11]:
# Sanity-check the shapes of the training and test splits
print(f'X_train : {X_train.shape}   y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}   y_test : {y_test.shape}')

X_train : (5197, 3)   y_train : (5197,)
X_test : (1300, 3)   y_test : (1300,)


[3] 학습 진행

In [12]:
# 학습방법 : 지도학습 > 분류
# 알고리즘 : 앙상블 > 배깅 - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Create the instance => a forest of internal decision-tree models, each
#   trained on its own random sample of the data (100 trees per the original
#   note, i.e. the n_estimators default — TODO confirm for the installed version).
#   Setting random_state makes those random samples reproducible.
#   oob_score=True is required for rf_model.oob_score_, which is read in a later cell.
rf_model = RandomForestClassifier(random_state=7, oob_score=True)

# Train on the training split
rf_model.fit(X_train, y_train)

In [14]:
# Attributes of the fitted model: class labels, input feature names/count,
# and the importance assigned to each feature
print(f'classes_             : {rf_model.classes_}')
print(f'n_classes_           : {rf_model.n_classes_}개')
print()
print(f'feature_names_in_    : {rf_model.feature_names_in_}')
print(f'n_features_in_       : {rf_model.n_features_in_}개')
print(f'feature_importances_ : {rf_model.feature_importances_}')

classes_             : [0. 1.]
n_classes_           : 2개

feature_names_in_    : ['alcohol' 'sugar' 'pH']
n_features_in_       : 3개
feature_importances_ : [0.23572103 0.49995154 0.26432743]


In [15]:
# Inspect the ensemble's building blocks: the base-estimator template
# (estimator_) followed by every individual tree in the forest (estimators_).
# The label names the attribute actually printed; it previously read 'clasees'.
print(f'estimator_      : {rf_model.estimator_}')
for est in rf_model.estimators_:
    print(est)

clasees         : DecisionTreeClassifier()
DecisionTreeClassifier(max_features='sqrt', random_state=327741615)
DecisionTreeClassifier(max_features='sqrt', random_state=976413892)
DecisionTreeClassifier(max_features='sqrt', random_state=1202242073)
DecisionTreeClassifier(max_features='sqrt', random_state=1369975286)
DecisionTreeClassifier(max_features='sqrt', random_state=1882953283)
DecisionTreeClassifier(max_features='sqrt', random_state=2053951699)
DecisionTreeClassifier(max_features='sqrt', random_state=959775639)
DecisionTreeClassifier(max_features='sqrt', random_state=1956722279)
DecisionTreeClassifier(max_features='sqrt', random_state=2052949340)
DecisionTreeClassifier(max_features='sqrt', random_state=1322904761)
DecisionTreeClassifier(max_features='sqrt', random_state=165338510)
DecisionTreeClassifier(max_features='sqrt', random_state=1133316631)
DecisionTreeClassifier(max_features='sqrt', random_state=4812360)
DecisionTreeClassifier(max_features='sqrt', random_state=372560217)

In [18]:
# Out-of-bag score of the fitted forest
print(f'oob_score : {rf_model.oob_score_}')
# Requires oob_score=True to be set when the instance is created!

oob_score : 0.89532422551472


[4] 성능평가

In [None]:
# Score the fitted forest on both splits (score() reports mean accuracy for
# scikit-learn classifiers)
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)

In [None]:
# Compare train vs. test score; the recorded run shows ~0.997 vs 0.90, a gap
# that suggests overfitting and motivates the tuning section that follows.
print(f'train_score : {train_score}     test_score : {test_score}')

train_score : 0.9973061381566288     test_score : 0.9


[5] 튜닝

- RandomizedSearchCV 하이퍼파라미터 최적화 클래스
    * 범위가 넓은 하이퍼파라미터 설정에 좋음
    * 지정된 범위에서 지정된 횟수만큼 하이퍼파라미터를 추출하여 조합 진행

In [19]:
# 모듈 로딩
from sklearn.model_selection import RandomizedSearchCV

In [33]:
# RandomForestClassifier 하이퍼파라미터 설정
params = {'max_depth': range(2,16), 'min_samples_leaf':range(5,16), 'criterion':['gini', 'entropy', 'log_loss']}

In [34]:
# Fresh, unfitted forest used as the base estimator for the search.
# NOTE(review): this rebinds rf_model, replacing the model fitted in the
# training section (and it is created without oob_score=True), so re-running
# the earlier inspection cells after this one will not see the fitted model.
rf_model = RandomForestClassifier(random_state=7)

In [37]:
# Randomized hyperparameter search: samples n_iter=50 combinations from
# `params` and cross-validates each one (verbose=4 logs every fold).
# random_state pins which combinations are sampled, so the search is
# reproducible like the other seeded steps in this notebook.
searchCV = RandomizedSearchCV(rf_model, param_distributions=params, n_iter=50, verbose=4, random_state=7)

In [38]:
# Run the search on the training split; the recorded run reports
# 5 folds x 50 candidates = 250 fits.
searchCV.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=entropy, max_depth=6, min_samples_leaf=6;, score=0.859 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=6, min_samples_leaf=6;, score=0.832 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=6, min_samples_leaf=6;, score=0.868 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=6, min_samples_leaf=6;, score=0.878 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=6, min_samples_leaf=6;, score=0.859 total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=13, min_samples_leaf=6;, score=0.881 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=13, min_samples_leaf=6;, score=0.842 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=13, min_samples_leaf=6;, score=0.882 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=13, min_samples_leaf=6;, score=0.885 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=13, min_samples_lea

In [39]:
# Search results: best score, the hyperparameters that achieved it, and the
# corresponding estimator
print(f'best_score_ : {searchCV.best_score_}')
print(f'best_params_ : {searchCV.best_params_}')
print(f'best_estimator_ : {searchCV.best_estimator_}')

# Per-candidate results (timings, sampled parameters, per-fold scores, rank)
# as a DataFrame; the bare name on the last line displays it in the notebook.
cv_resultDF = pd.DataFrame(searchCV.cv_results_)
cv_resultDF

best_score_ : 0.8764707188865032
best_params_ : {'min_samples_leaf': 5, 'max_depth': 14, 'criterion': 'entropy'}
best_estimator_ : RandomForestClassifier(criterion='entropy', max_depth=14, min_samples_leaf=5,
                       random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.238802,0.02961,0.014988,0.00131,6,6,entropy,"{'min_samples_leaf': 6, 'max_depth': 6, 'crite...",0.858654,0.831731,0.868142,0.877767,0.858518,0.858962,0.015361,35
1,0.31318,0.005501,0.017628,0.001061,6,13,entropy,"{'min_samples_leaf': 6, 'max_depth': 13, 'crit...",0.880769,0.842308,0.881617,0.884504,0.87873,0.873586,0.015749,6
2,0.268512,0.005489,0.014746,0.000744,6,8,log_loss,"{'min_samples_leaf': 6, 'max_depth': 8, 'crite...",0.871154,0.838462,0.87873,0.879692,0.871992,0.868006,0.015167,21
3,0.165355,0.006646,0.010393,0.001175,6,3,log_loss,"{'min_samples_leaf': 6, 'max_depth': 3, 'crite...",0.799038,0.800962,0.810395,0.835419,0.818094,0.812782,0.013228,43
4,0.302163,0.00458,0.015934,0.000468,5,10,entropy,"{'min_samples_leaf': 5, 'max_depth': 10, 'crit...",0.874038,0.846154,0.880654,0.880654,0.87488,0.871276,0.012866,9
5,0.193393,0.023005,0.011313,0.000863,11,4,entropy,"{'min_samples_leaf': 11, 'max_depth': 4, 'crit...",0.834615,0.8375,0.853705,0.85948,0.841193,0.845299,0.009629,40
6,0.284615,0.024179,0.015858,0.00085,13,13,entropy,"{'min_samples_leaf': 13, 'max_depth': 13, 'cri...",0.874038,0.835577,0.877767,0.885467,0.872955,0.869161,0.017355,16
7,0.302185,0.008938,0.016617,0.001378,5,10,log_loss,"{'min_samples_leaf': 5, 'max_depth': 10, 'crit...",0.874038,0.846154,0.880654,0.880654,0.87488,0.871276,0.012866,9
8,0.262156,0.00377,0.01567,0.000374,12,14,gini,"{'min_samples_leaf': 12, 'max_depth': 14, 'cri...",0.875,0.832692,0.879692,0.882579,0.87488,0.868969,0.018371,17
9,0.14501,0.001909,0.010202,0.001578,12,2,entropy,"{'min_samples_leaf': 12, 'max_depth': 2, 'crit...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,48
