In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
!python --version

Python 3.7.9


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. 데이터 준비
# 예시 데이터 생성 (실제 사용 시에는 데이터를 불러와야 함)
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. 모델 및 하이퍼파라미터 그리드 설정
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 3. GridSearchCV 객체 생성 및 학습
# cv=5는 5-fold cross-validation을 의미
# n_jobs=-1은 사용 가능한 모든 CPU 코어를 사용하라는 의미
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# 4. 결과 확인
print(f"최적의 하이퍼파라미터: {grid_search.best_params_}")
# {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
print(f"최고 교차 검증 점수: {grid_search.best_score_:.4f}") # 0.9387

# 5. 최적 모델로 예측 및 평가
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"테스트 세트 정확도: {accuracy_score(y_test, y_pred):.4f}") # 0.9250

# GridSearchCV 결과 전체 확인
cv_results_df = pd.DataFrame(grid_search.cv_results_)
print(cv_results_df[['param_n_estimators', 'param_max_depth', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head())

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100, total=   0.3s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100 
[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100 
[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100, total=   0.5s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100 
[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100, total=   0.4s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.7s
[CV] max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.7s
[CV] max_depth=10, min_samples_leaf=1, min_samples_s

[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.5s
[CV] max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.3s
[CV] max_depth=20, min_samples_leaf=1, min_samples_s

[CV]  max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   1.1s
[CV] max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.5s
[CV] max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.5s
[CV] max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.3s
[CV] max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.3s
[CV] max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.3s
[CV] max_depth=None, min_sampl

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.2min finished


최적의 하이퍼파라미터: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
최고 교차 검증 점수: 0.9387
테스트 세트 정확도: 0.9250
   param_n_estimators param_max_depth  mean_test_score  rank_test_score
17                200            None          0.93875                1
9                 200              20          0.93875                1
21                200            None          0.93750                3
13                200              20          0.93750                3
15                200              20          0.93250                5


In [7]:
import sys
import joblib

# sklearn.externals.joblib이 없어서 발생하는 문제 해결을 위해 
# 현재 설치된 joblib을 해당 경로로 매핑해줍니다.
sys.modules['sklearn.externals.joblib'] = joblib

from skopt import BayesSearchCV
# 이후 나머지 import 진행

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# 1. 데이터 준비 (GridSearchCV 예시와 동일)
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. 모델 및 하이퍼파라미터 탐색 공간(Search Space) 설정
model = RandomForestClassifier(random_state=42)
search_spaces = {
    'n_estimators': Integer(100, 500),
    'max_depth': Integer(10, 50),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 5),
    'max_features': Real(0.1, 1.0, prior='uniform') # 0.1에서 1.0 사이의 실수
}

# 3. BayesSearchCV 객체 생성 및 학습
# n_iter=50은 50개의 조합을 테스트하라는 의미
bayes_search = BayesSearchCV(estimator=model, search_spaces=search_spaces, n_iter=50,
                             cv=5, n_jobs=-1, verbose=2, scoring='accuracy', random_state=42)
bayes_search.fit(X_train, y_train)

# 4. 결과 확인
print(f"최적의 하이퍼파라미터: {bayes_search.best_params_}")
print(f"최고 교차 검증 점수: {bayes_search.best_score_:.4f}")

# 5. 최적 모델로 예측 및 평가
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"테스트 세트 정확도: {accuracy_score(y_test, y_pred):.4f}")

TypeError: __init__() got an unexpected keyword argument 'fit_params'