## Random Forest

> 📌 신용카드 가입해지 유무에 대한 데이터셋을 기반으로
랜덤 포레스트를 적용하고, 하이퍼 파라미터 튜닝을 구현합니다.


### dataset

In [6]:
import pandas as pd

# 가입해지 데이터
df = pd.read_csv("./churn.csv")
df = df.drop(columns=["RowNumber", "CustomerId", "Surname", "Gender"]) # drop unnecessary field
df

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.80,3,1,0,113931.57,1
3,699,France,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,39,5,0.00,2,1,0,96270.64,0
9996,516,France,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,42,3,75075.31,2,1,0,92888.52,1


In [7]:
# 상관계수 확인
corr = df.iloc[:, :-1].corr()
corr.style.background_gradient(vmin=-0.99, vmax=0.99, cmap='Accent_r')

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
CreditScore,1.0,-0.003965,0.000842,0.006268,0.012238,-0.005458,0.025651,-0.001384
Age,-0.003965,1.0,-0.009997,0.028308,-0.03068,-0.011721,0.085472,-0.007201
Tenure,0.000842,-0.009997,1.0,-0.012254,0.013444,0.022583,-0.028362,0.007784
Balance,0.006268,0.028308,-0.012254,1.0,-0.30418,-0.014858,-0.010084,0.012797
NumOfProducts,0.012238,-0.03068,0.013444,-0.30418,1.0,0.003183,0.009612,0.014204
HasCrCard,-0.005458,-0.011721,0.022583,-0.014858,0.003183,1.0,-0.011866,-0.009933
IsActiveMember,0.025651,0.085472,-0.028362,-0.010084,0.009612,-0.011866,1.0,-0.011421
EstimatedSalary,-0.001384,-0.007201,0.007784,0.012797,0.014204,-0.009933,-0.011421,1.0


### preprocessing

In [8]:
df.columns

Index(['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [9]:
# 불필요한 컬럼 제외
df = pd.read_csv("./churn.csv")
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Gender'])
df

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.80,3,1,0,113931.57,1
3,699,France,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,39,5,0.00,2,1,0,96270.64,0
9996,516,France,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,42,3,75075.31,2,1,0,92888.52,1


#### encoding & scaling

In [10]:
# encoder
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
enc = OrdinalEncoder()
df['Geography'] = enc.fit_transform(df[['Geography']])

# std scaler (트리 기반 모델은 scaler가 의미 없지만 regression모델을 고려하여 적용)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['CreditScore', 'Balance', 'EstimatedSalary']])
transformed = scaler.transform(df[['CreditScore', 'Balance', 'EstimatedSalary']])

# apply
df.loc[:, ['CreditScore', 'Balance', 'EstimatedSalary']] = transformed
df

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,-0.326221,0.0,42,2,-1.225848,1,1,1,0.021886,1
1,-0.440036,2.0,41,1,0.117350,1,0,1,0.216534,0
2,-1.536794,0.0,42,8,1.333053,3,1,0,0.240687,1
3,0.501521,0.0,39,1,-1.225848,2,0,0,-0.108918,0
4,2.063884,2.0,43,2,0.785728,1,1,1,-0.365276,0
...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,0.0,39,5,-1.225848,2,1,0,-0.066419,0
9996,-1.391939,0.0,35,10,-0.306379,1,1,1,0.027988,0
9997,0.604988,0.0,36,7,-1.225848,1,0,1,-1.008643,1
9998,1.256835,1.0,42,3,-0.022608,2,1,0,-0.125231,1


#### data split

In [11]:
# Data Split
X = df.iloc[:, :-1].values
y = df['Exited'].values

# split
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# check split
train_x.shape, train_y.shape

((8000, 9), (8000,))

#### model selection

In [13]:
# SVM, RandomForest 비교 및 두 모델 Voting

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

ranfo_model = RandomForestClassifier(random_state=42)
svm_model = SVC(C=10, gamma='auto', random_state=42)

voting_model = VotingClassifier(
    estimators=[('rf', ranfo_model), ('svm', svm_model)], 
    voting='hard'
    )

In [14]:
# 모델 성능 비교
from sklearn.metrics import recall_score, f1_score
# confusion matrix
from sklearn.metrics import confusion_matrix

for model in [ranfo_model, svm_model, voting_model]:
    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    recall = recall_score(test_y, pred, average='macro')
    
    print(model.__class__.__name__,":", recall)
    print(confusion_matrix(test_y, pred), end='\n\n')

RandomForestClassifier : 0.7148060885027496
[[1545   62]
 [ 209  184]]

SVC : 0.6892713335898447
[[1512   95]
 [ 221  172]]

VotingClassifier : 0.6790164214766503
[[1569   38]
 [ 243  150]]



#### check feature importances 

In [15]:
# 랜덤 포레스트
model = RandomForestClassifier(random_state=42)
model.fit(train_x, train_y)

In [16]:
# feature importances
col_names = list(df.columns[:-1])
importances = pd.DataFrame(ranfo_model.feature_importances_.reshape(ranfo_model.n_features_in_, -1)).T
importances.columns = col_names
importances.style.background_gradient(vmin=0, vmax=1, cmap='Accent_r')

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,0.146992,0.035797,0.240684,0.082813,0.144879,0.132991,0.018474,0.04388,0.15349


`Geography, Tenure, HasCrCard, IsActiveMember 필드 드랍 시도`  
-> 드랍 결과 성능이 오히려 떨어졌음

---

#### Set Pipeline 

In [17]:
df = pd.read_csv("./churn.csv")
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 수치형과 범주형 데이터 구분
num_attrs = [
    'RowNumber', 'CustomerId', 'CreditScore',
    'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary'
]

cat_attrs = ['Surname', 'Geography', 'Gender']


# Drop Features
drop_attrs = ['RowNumber', 'CustomerId', 'Surname', 'Gender']
df = df.drop(columns=drop_attrs)

# update attrs
num_attrs = list(set(num_attrs) - set(drop_attrs))
cat_attrs = list(set(cat_attrs) - set(drop_attrs))


full_pipeline = ColumnTransformer([
    ('num-preprocess', StandardScaler(), num_attrs), 
    ('enc-preprocess', OrdinalEncoder(), cat_attrs)
])

prepared = full_pipeline.fit_transform(df)
prepared, prepared.shape

(array([[ 0.64609167,  0.29351742,  0.02188649, ..., -0.32622142,
         -1.04175968,  0.        ],
        [-1.54776799,  0.19816383,  0.21653375, ..., -0.44003595,
         -1.38753759,  2.        ],
        [ 0.64609167,  0.29351742,  0.2406869 , ..., -1.53679418,
          1.03290776,  0.        ],
        ...,
        [-1.54776799, -0.27860412, -1.00864308, ...,  0.60498839,
          0.68712986,  0.        ],
        [ 0.64609167,  0.29351742, -0.12523071, ...,  1.25683526,
         -0.69598177,  1.        ],
        [ 0.64609167, -1.04143285, -1.07636976, ...,  1.46377078,
         -0.35020386,  0.        ]]),
 (10000, 9))

### Model Train & Test

In [19]:
# split
X = prepared
y = df['Exited'].values

# split
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# 랜덤 포레스트
model = RandomForestClassifier(random_state=42)
model.fit(train_x, train_y)

In [21]:
# predict
pred = model.predict(test_x)
pred

array([0, 0, 0, ..., 1, 0, 0])

In [22]:
# mean accuracy
model.score(test_x, test_y)

0.8665

In [23]:
# f1_score
f1_score(test_y, pred, average='macro')

0.7507271624768759

In [24]:
# feature importances
col_names = num_attrs + cat_attrs
importances = pd.DataFrame(model.feature_importances_.reshape(model.n_features_in_, -1)).T
importances.columns = col_names
importances.style.background_gradient(vmin=0, vmax=1, cmap='Accent_r')

Unnamed: 0,HasCrCard,Age,EstimatedSalary,IsActiveMember,Balance,NumOfProducts,CreditScore,Tenure,Geography
0,0.019071,0.242399,0.153857,0.044089,0.142829,0.131122,0.147351,0.082965,0.036317


#### cross-validation score

In [25]:
# Train dataset에 대한 학습(fit)과 CV에 해당함
# test dataset에 대한 평가와 비슷한 수치가 나오는 것을 확인할 수 있어
# train dataset에 크게 overfitting되지 않았음을 확인할 수 있음
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=42)
score = cross_val_score(model, train_x, train_y, scoring='f1_macro', cv=5)
score.mean()

0.7442863685224715

### hyper-parameter optimization

> grid search cv & random search cv

sklearn에서 제공하는 두 도구를 사용하면 편리하게 optimization 가능하다.  
하지만 모든 후보군 학습 과정에서의 디테일한 정보를 추적하는 것에는 한계가 있다.  

#### grid search cv

In [34]:
# 랜덤 포레스트의 하이퍼파라미터 default 설정값 지정
class Cfg:
    n_estimators=100,
    criterion='gini', # "gini", "entropy", "log_loss"
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt', # "sqrt", "log2"
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42, # random_state = 42
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,

    
# GridSearchCV 패키지
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(Cfg)


param_grid = {
    'max_depth': [50, 80, 100],
    'n_estimators': [65, 70, 75]
}

optimizer = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_macro', 
                         cv=5, verbose=2)

# train 데이터를 대상으로 진행하면 cross-validation도 함께 적용되어 train, validation 효과를 볼 수 있다.
optimizer.fit(train_x, train_y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ......................max_depth=50, n_estimators=65; total time=   0.7s
[CV] END ......................max_depth=50, n_estimators=65; total time=   0.7s
[CV] END ......................max_depth=50, n_estimators=65; total time=   0.6s
[CV] END ......................max_depth=50, n_estimators=65; total time=   0.5s
[CV] END ......................max_depth=50, n_estimators=65; total time=   0.5s
[CV] END ......................max_depth=50, n_estimators=70; total time=   0.5s
[CV] END ......................max_depth=50, n_estimators=70; total time=   0.5s
[CV] END ......................max_depth=50, n_estimators=70; total time=   0.6s
[CV] END ......................max_depth=50, n_estimators=70; total time=   0.6s
[CV] END ......................max_depth=50, n_estimators=70; total time=   0.6s
[CV] END ......................max_depth=50, n_estimators=75; total time=   0.6s
[CV] END ......................max_depth=50, n_es

In [35]:
# 가장 결과가 좋았던 parameter 조회
print(optimizer.best_score_)
optimizer.best_params_

0.7500013543253689


{'max_depth': 100, 'n_estimators': 75}

In [36]:
# best 모델에 대해서 test 데이터를 적용해 본다.
best_model = optimizer.best_estimator_
pred = best_model.predict(test_x)
f1_score(test_y, pred, average='macro')

0.7569168213559622

#### grid search 구현

`product 함수를 사용하면 각 리스트들에 대한 모든 조합을 출력할 수 있다.`

In [37]:
param_grid = {
    'max_depth': [50, 80, 100],
    'n_estimators': [65, 70, 75]
}

In [38]:
from itertools import product
# product 함수는 여러 튜플 입력 시 튜플 내 모든 조합을 인풋 순서에 따라 순차적으로 출력해준다.
# ex) (1, 2, 3), (4, 5, 6) -> (1, 4), (1, 5), (1, 6), (2, 4), ...

names = param_grid.keys()
value_cand = param_grid.values()


for value in product(*value_cand):
    params = {key:value for key, value in zip(names, value)}
    
    print(params)

{'max_depth': 50, 'n_estimators': 65}
{'max_depth': 50, 'n_estimators': 70}
{'max_depth': 50, 'n_estimators': 75}
{'max_depth': 80, 'n_estimators': 65}
{'max_depth': 80, 'n_estimators': 70}
{'max_depth': 80, 'n_estimators': 75}
{'max_depth': 100, 'n_estimators': 65}
{'max_depth': 100, 'n_estimators': 70}
{'max_depth': 100, 'n_estimators': 75}


> Logger 생성

In [27]:
import logging

def get_logger(name, dir_, stream=False):
    """log 데이터 파일 저장
    
    Args:
        name(str): 로그 이름 지정
        dir_(str): 로그 파일을 저장할 경로 지정
        stream(bool): 콘솔에 로그를 남길지에 대한 유무
    
    Returns: logging.RootLogger
        
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)  # logging all levels
    logger.handlers.clear()
    
    formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')
    stream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(os.path.join(dir_, f'{name}.log'))

    stream_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    if stream:
        logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger

In [40]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime, timezone, timedelta 
from tqdm.notebook import tqdm
from itertools import product
import os

# grid search 함수에 필요한 파라미터 설정
config = {
    'recorder_dir': './',
    'model': 'RandomForestClassifier',
    'param_grid': {
        'max_depth': [50, 80, 100],
        'n_estimators': [65, 70, 75]
    },
    'metric_info': ['f1_score', {'average': 'macro'}],
    'cv': 5
}


def grid_search_cv(X, y, model, param_grid, cv, metric_info, recorder_dir):
    """Grid Search CV를 실행합니다.
        
    Args:
        X(array): X data
        y(array): label data
        model(str): 모델함수명
        param_grid(dict): GridSearch에 적용할 파라미터 범위
            {'파라미터명': [값1, 값2, ...]}
        cv(int): cross_validation 개수
        metric_info(list): 평가지표 및 평가지표의 파라미터 설정
            ['평가지표함수', {'파라미터1': '값'...}]
        recorder_dir(str): log파일 저장 위치
    
    Returns:
        best_score(float)
        best_params(dict)
    
    """
    
    # train serial (튜닝 시작 시간을 따로 기록해둔다.)
    kst = timezone(timedelta(hours=9)) # 우리나라는 UTC 기준으로 9시간 빠름
    train_serial = datetime.now(tz=kst).strftime("%Y%m%d_%H%M%S")
    
    # get logger
    logger = get_logger(name=f'train_GSCV', dir_=recorder_dir, stream=False)
    logger.info(f"-----GRID SEARCH START-----")
    logger.info(f"Start Date: {train_serial}")
    logger.info(f"Configure {config}\n")
    
    score_list = []
    params_list = []
    
    names = param_grid.keys() # parameter names
    value_cand = param_grid.values() # value candidate

    # itertools의 product 함수는 여러 튜플 입력 시 튜플 내 모든 조합을 인풋 순서에 따라 순차적으로 출력해준다.
    # ex) (1, 2, 3), (4, 5, 6) -> (1, 4), (1, 5), (1, 6), (2, 4), ...
    
    # 모든 조합의 수
    candidates = 1
    for cand in value_cand:
        candidates *= len(cand)
    
    # log information
    logger.info(f"Fitting {cv} folds for each of {candidates} candidates, totalling {candidates*cv} fits")
    
    for i, value in zip(tqdm(range(candidates), leave=False), product(*value_cand)): # asterisk를 붙여야 리스트를 인식한다.
        params = {key:value for key, value in zip(names, value)}

        # k-fold CV
        avg_score = 0

        k_fold = KFold(n_splits=cv, shuffle=False)
        for train_idx, test_idx in k_fold.split(X):
            train_x, train_y = X[train_idx, :], y[train_idx]
            test_x, test_y = X[test_idx, :], y[test_idx]

            # train, predict
            model_ = eval(model)
            model_ = model_(**params)
            model_.fit(train_x, train_y)
            pred = model_.predict(test_x)

            # metric
            metric_, metric_params = eval(metric_info[0]), metric_info[1]
            score = metric_(test_y, pred, **metric_params)

            # update avg_score
            avg_score += score / cv
        
        # collect
        score_list.append(avg_score)
        params_list.append(params)

        logger.info(f"*****CANDIDATE_NO.{str(i+1).zfill(3)} RESULT*****")
        logger.info(f"Parameters: {params}")
        logger.info(f"{metric_.__name__}: {avg_score}")
    
    # Final Result
    best_score = max(score_list)
    best_params = params_list[score_list.index(best_score)]
    logger.info(f"*****ITERATION END*****")
    logger.info(f"Best Score: {best_score}  Best Params: {params_list[score_list.index(best_score)]}")
    logger.info(f"-----GRID SEARCH END-----\n")
    
    return best_score, best_params

# Run
best_score, best_params = grid_search_cv(train_x, train_y, **config)        

INFO:train_GSCV:-----GRID SEARCH START-----
INFO:train_GSCV:Start Date: 20230317_141853
INFO:train_GSCV:Configure {'recorder_dir': './', 'model': RandomForestClassifier(), 'param_grid': {'max_depth': [50, 80, 100], 'n_estimators': [65, 70, 75]}, 'metric_info': [<function f1_score at 0x7fb8cafef820>, {'average': 'macro'}], 'cv': 5}

INFO:train_GSCV:Fitting 5 folds for each of 9 candidates, totalling 45 fits


  0%|          | 0/9 [00:00<?, ?it/s]

INFO:train_GSCV:*****CANDIDATE_NO.001 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 50, 'n_estimators': 65}
INFO:train_GSCV:f1_score: 0.7456586849932658
INFO:train_GSCV:*****CANDIDATE_NO.002 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 50, 'n_estimators': 70}
INFO:train_GSCV:f1_score: 0.7440586800870628
INFO:train_GSCV:*****CANDIDATE_NO.003 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 50, 'n_estimators': 75}
INFO:train_GSCV:f1_score: 0.7465787516197527
INFO:train_GSCV:*****CANDIDATE_NO.004 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 80, 'n_estimators': 65}
INFO:train_GSCV:f1_score: 0.7447371889387937
INFO:train_GSCV:*****CANDIDATE_NO.005 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 80, 'n_estimators': 70}
INFO:train_GSCV:f1_score: 0.7420072956640134
INFO:train_GSCV:*****CANDIDATE_NO.006 RESULT*****
INFO:train_GSCV:Parameters: {'max_depth': 80, 'n_estimators': 75}
INFO:train_GSCV:f1_score: 0.7477835651652415
INFO:train_GSCV:*****CANDIDATE_NO.

In [50]:
best_params

{'max_depth': 100, 'n_estimators': 65}

In [51]:
# test 데이터로 평가
best_model = RandomForestClassifier(random_state=42, **best_params).fit(train_x, train_y)
pred = best_model.predict(test_x)
f1_score(test_y, pred, average='macro')

0.7636563195035495

#### random search

GridSearch로 어느정도 폭을 좁혔으므로 Rondom Search로 전환


> Random Search 구현

In [52]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime, timezone, timedelta 
from tqdm.notebook import tqdm
import os


# random search 함수에 필요한 파라미터 설정
config = {
    'recorder_dir': './',
    'model': 'RandomForestClassifier',
    'param_grid': {
        'max_depth': ([95, 105], 'int'),
        'n_estimators': ([60, 70], 'int'),
    },
    'metric_info': ['f1_score', {'average': 'macro'}],
    'num_iter': 20, 
    'cv': 5
}


def random_search_cv(X, y, num_iter, model, param_grid, cv, metric_info, recorder_dir):
    """Random Search CV를 실행합니다.
        
    Args:
        X(array): X data
        y(array): label data
        num_iter(int): iteration 횟수 지정
        model(str): 모델함수명
        param_grid(dict): RandomSearch에 적용할 파라미터 범위
            {'파라미터명': ([최소값: 최대값], 'type')...}
        cv(int): cross_validation 개수
        metric_info(list): 평가지표 및 평가지표의 파라미터 설정
            ['평가지표함수', {'파라미터1': '값'...}]
        recorder_dir(str): log파일 저장 위치
    
    Returns:
        best_score(float)
        best_params(defaultdict)
    
    """
    
    # train serial (튜닝 시작 시간을 따로 기록해둔다.)
    kst = timezone(timedelta(hours=9)) # 우리나라는 UTC 기준으로 9시간 빠름
    train_serial = datetime.now(tz=kst).strftime("%Y%m%d_%H%M%S")
    
    # get logger
    logger = get_logger(name=f'train_RSCV', dir_=recorder_dir, stream=False)
    logger.info(f"-----RANDOM SEARCH START-----")
    logger.info(f"Start Date: {train_serial}")
    logger.info(f"Configure {config}\n")
    
    score_list = []
    params_list = []
    
    for i in tqdm(range(num_iter), leave=False):
        # hyper-parameter random sampling
        params = dict()
        for name, value_info in param_grid.items():
            if len(value_info[0]) == 1:
                params[name] = value_info[0][0]
            elif value_info[1] == 'int':
                params[name] = np.random.randint(min(value_info[0]), max(value_info[0])+1)
            elif value_info[1] == 'float':
                params[name] = np.random.uniform(min(value_info[0]), max(value_info[0]))
            elif value_info[1] == 'str':
                params[name] = np.random.choice(value_info[0])

        # k-fold CV
        avg_score = 0

        k_fold = KFold(n_splits=cv, shuffle=False)
        for train_idx, test_idx in k_fold.split(X):
            train_x, train_y = X[train_idx, :], y[train_idx]
            test_x, test_y = X[test_idx, :], y[test_idx]


            # train, predict
            model_ = eval(model)
            model_ = model_(**params)
            model_.fit(train_x, train_y)
            pred = model_.predict(test_x)

            # metric
            metric_, metric_params = eval(metric_info[0]), metric_info[1]
            score = metric_(test_y, pred, **metric_params)

            # update avg_score
            avg_score += score / cv
        
        # collect
        score_list.append(avg_score)
        params_list.append(params)

        logger.info(f"*****ITERATION_{str(i+1).zfill(3)} RESULT*****")
        logger.info(f"Parameters: {params}")
        logger.info(f"{metric_.__name__}: {avg_score}")
    
    # Final Result
    best_score = max(score_list)
    best_params = params_list[score_list.index(best_score)]
    logger.info(f"*****ITERATION END*****")
    logger.info(f"Best Score: {best_score}  Best Params: {params_list[score_list.index(best_score)]}")
    logger.info(f"-----RANDOM SEARCH END-----\n")
    
    return best_score, best_params

# Run
best_score, best_params = random_search_cv(train_x, train_y, **config)        

INFO:train_RSCV:-----RANDOM SEARCH START-----
INFO:train_RSCV:Start Date: 20230317_143124
INFO:train_RSCV:Configure {'recorder_dir': './', 'model': RandomForestClassifier(), 'param_grid': {'max_depth': ([95, 105], 'int'), 'n_estimators': ([60, 70], 'int')}, 'metric_info': [<function f1_score at 0x7fb8cafef820>, {'average': 'macro'}], 'num_iter': 20, 'cv': 5}



  0%|          | 0/20 [00:00<?, ?it/s]

INFO:train_RSCV:*****ITERATION_001 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 103, 'n_estimators': 67}
INFO:train_RSCV:f1_score: 0.7445461481239134
INFO:train_RSCV:*****ITERATION_002 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 99, 'n_estimators': 66}
INFO:train_RSCV:f1_score: 0.7427767317460804
INFO:train_RSCV:*****ITERATION_003 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 103, 'n_estimators': 66}
INFO:train_RSCV:f1_score: 0.7448183209396008
INFO:train_RSCV:*****ITERATION_004 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 96, 'n_estimators': 65}
INFO:train_RSCV:f1_score: 0.7462728848153729
INFO:train_RSCV:*****ITERATION_005 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 98, 'n_estimators': 68}
INFO:train_RSCV:f1_score: 0.7440333982120889
INFO:train_RSCV:*****ITERATION_006 RESULT*****
INFO:train_RSCV:Parameters: {'max_depth': 97, 'n_estimators': 65}
INFO:train_RSCV:f1_score: 0.7483081926438949
INFO:train_RSCV:*****ITERATION_007 RESULT*****
INF

In [53]:
best_params

{'max_depth': 104, 'n_estimators': 67}

In [54]:
# test 데이터로 평가
best_model = RandomForestClassifier(random_state=42, **best_params).fit(train_x, train_y)
pred = best_model.predict(test_x)
f1_score(test_y, pred, average='macro')

0.7599778186397916

#### 로그데이터 조회

In [55]:
# read log file
with open("train_RSCV.log", "r", encoding='utf-8') as f:
    for line in f:
        print(line, end="")

2023-03-17 14:31:24,516 | train_RSCV | INFO | -----RANDOM SEARCH START-----
2023-03-17 14:31:24,517 | train_RSCV | INFO | Start Date: 20230317_143124
2023-03-17 14:31:24,519 | train_RSCV | INFO | Configure {'recorder_dir': './', 'model': RandomForestClassifier(), 'param_grid': {'max_depth': ([95, 105], 'int'), 'n_estimators': ([60, 70], 'int')}, 'metric_info': [<function f1_score at 0x7fb8cafef820>, {'average': 'macro'}], 'num_iter': 20, 'cv': 5}

2023-03-17 14:31:27,419 | train_RSCV | INFO | *****ITERATION_001 RESULT*****
2023-03-17 14:31:27,420 | train_RSCV | INFO | Parameters: {'max_depth': 103, 'n_estimators': 67}
2023-03-17 14:31:27,421 | train_RSCV | INFO | f1_score: 0.7445461481239134
2023-03-17 14:31:29,993 | train_RSCV | INFO | *****ITERATION_002 RESULT*****
2023-03-17 14:31:29,994 | train_RSCV | INFO | Parameters: {'max_depth': 99, 'n_estimators': 66}
2023-03-17 14:31:29,995 | train_RSCV | INFO | f1_score: 0.7427767317460804
2023-03-17 14:31:32,585 | train_RSCV | INFO | *****