# ML Manual


### 실습(7)
하이퍼파라미터 튜닝


In [1]:
# * Grid search


import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# 데이터 로드
housing = fetch_california_housing()
target = housing.target
data = housing.data
df_X = pd.DataFrame(data, columns=housing.feature_names)
df_y = pd.DataFrame(target, columns=["Target"])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# 데이터 스케일링
S_scaler = StandardScaler()
X_train_scale = pd.DataFrame(S_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scale = pd.DataFrame(S_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# RandomForest 모델 정의 및 하이퍼파라미터 그리드 설정
RF = RandomForestRegressor()
params_Grid = {
    'n_estimators': np.arange(20, 40),
    'max_depth': np.arange(2, 4),
    'min_samples_split': np.arange(2, 4),
    'min_samples_leaf': np.arange(2, 4)
}

# GridSearchCV 실행
Grid_serach = GridSearchCV(
    RF,
    param_grid=params_Grid,
    cv=5,
    verbose=2,
    scoring='r2'
)

Grid_serach.fit(X_train_scale, y_train)

# 결과 출력
print(f"최적의 하이퍼파라미터는 : {Grid_serach.best_params_}\n 최적의 점수는 : {Grid_serach.best_score_}")


Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=21; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=21; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=21; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=21; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=2, min_

In [2]:
# * Random search
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# 데이터 로드
housing = fetch_california_housing()
target = housing.target
data = housing.data
df_X = pd.DataFrame(data, columns=housing.feature_names)
df_y = pd.DataFrame(target, columns=["Target"])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# 데이터 스케일링
S_scaler = StandardScaler()
X_train_scale = pd.DataFrame(S_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scale = pd.DataFrame(S_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# RandomForest 모델 정의 및 하이퍼파라미터 설정
RF = RandomForestRegressor()
params_random = {
    'n_estimators': np.arange(10, 200),
    'max_depth': np.arange(2, 6),
    'min_samples_split': np.arange(2, 6),
    'min_samples_leaf': np.arange(2, 6)
}

# RandomizedSearchCV 실행
Random_serach = RandomizedSearchCV(
    RF,
    param_distributions=params_random,
    cv=5,
    n_iter=100,
    verbose=1,
    scoring='r2'
)

Random_serach.fit(X_train_scale, y_train)

# 결과 출력
print(f"최적의 하이퍼파라미터는 : {Random_serach.best_params_}\n 최적의 점수는 : {Random_serach.best_score_}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
최적의 하이퍼파라미터는 : {'n_estimators': np.int64(179), 'min_samples_split': np.int64(3), 'min_samples_leaf': np.int64(5), 'max_depth': np.int64(5)}
 최적의 점수는 : 0.6676936886824468


In [2]:
# * Bayesian Optimization(optuna)

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna import Trial
from optuna.pruners import HyperbandPruner

# 데이터 로드
housing = fetch_california_housing()
target = housing.target
data = housing.data
df_X = pd.DataFrame(data, columns=housing.feature_names)
df_y = pd.DataFrame(target, columns=["Target"])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# 데이터 스케일링
S_scaler = StandardScaler()
X_train_scale = pd.DataFrame(S_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scale = pd.DataFrame(S_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Objective Function
def objective(trial: Trial):
    # 하이퍼파라미터 탐색 공간 정의
    n_estimators = trial.suggest_int('n_estimators', 30, 100)
    max_depth = trial.suggest_int('max_depth', 2, 8)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 8)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 8)

    # 모델 생성
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # KFold 교차 검증 설정 (5-fold)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    # 교차 검증
    for train_index, valid_index in kf.split(X_train_scale):
        X_fold_train, X_fold_valid = X_train_scale.iloc[train_index], X_train_scale.iloc[valid_index]
        y_fold_train, y_fold_valid = y_train[train_index], y_train[valid_index]

        model.fit(X_fold_train, y_fold_train)
        fold_predictions = model.predict(X_fold_valid)
        score = r2_score(y_fold_valid, fold_predictions)
        scores.append(score)

    # 교차 검증 평균 R² 반환
    return np.mean(scores)

# Hyperband 프루너 설정
pruner = HyperbandPruner(
    min_resource=1,
    max_resource=100,
    reduction_factor=2
)

# Optuna 스터디 생성 (랜덤 시드 설정)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=100)

# 최적의 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)


[I 2025-01-06 15:10:57,399] A new study created in memory with name: no-name-dd050735-a48e-4475-80bb-2ee5f8f48658


[I 2025-01-06 15:11:07,669] Trial 0 finished with value: 0.753877425672939 and parameters: {'n_estimators': 56, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.753877425672939.
[I 2025-01-06 15:11:10,999] Trial 1 finished with value: 0.5600023031409199 and parameters: {'n_estimators': 41, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.753877425672939.
[I 2025-01-06 15:11:21,130] Trial 2 finished with value: 0.7014189644125233 and parameters: {'n_estimators': 72, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.753877425672939.
[I 2025-01-06 15:11:27,393] Trial 3 finished with value: 0.5603919324460842 and parameters: {'n_estimators': 89, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.753877425672939.
[I 2025-01-06 15:11:33,244] Trial 4 finished with value: 0.664771461057996 and parameters: {'n_estimators

Best hyperparameters: {'n_estimators': 98, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 4}
Best score: 0.7547729624850008


In [5]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice

# 최적화 및 시각화
if __name__ == "__main__":
    # Study 이름 설정
    study_name = "quadratic-simple"
    storage_url = f"sqlite:///{study_name}.db"

    # 기존 스터디 불러오기 또는 새로 생성
    try:
        study = optuna.load_study(study_name=study_name, storage=storage_url)
    except KeyError:
        study = optuna.create_study(
            direction="maximize",
            storage=storage_url,
            study_name=study_name
        )
    
    # 최적화 실행
    study.optimize(objective, n_trials=100)
    
    # 최적의 하이퍼파라미터와 점수 출력
    print(f"Best value: {study.best_value} (params: {study.best_params})")
    
    # 1. 최적화 과정 히스토리 플롯
    fig1 = plot_optimization_history(study)
    fig1.show()

    # 2. 파라미터 중요도 플롯
    fig2 = plot_param_importances(study)
    fig2.show()

    # 3. 병렬 좌표 플롯
    fig3 = plot_parallel_coordinate(study)
    fig3.show()

    # 4. 슬라이스 플롯
    fig4 = plot_slice(study)
    fig4.show()


[I 2025-01-06 16:28:03,531] Trial 100 finished with value: 0.46380348717106334 and parameters: {'n_estimators': 66, 'max_depth': 2, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 86 with value: 0.46380348717106334.
[I 2025-01-06 16:28:09,598] Trial 101 finished with value: 0.4639012363690792 and parameters: {'n_estimators': 65, 'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 86 with value: 0.46380348717106334.
[I 2025-01-06 16:28:14,561] Trial 102 finished with value: 0.46380348717106334 and parameters: {'n_estimators': 66, 'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 86 with value: 0.46380348717106334.
[I 2025-01-06 16:28:18,882] Trial 103 finished with value: 0.46413127938848114 and parameters: {'n_estimators': 59, 'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 86 with value: 0.46380348717106334.
[I 2025-01-06 16:28:24,017] Trial 104 finished with value: 0.4639830925832591 and

Best value: 0.46380348717106334 (params: {'n_estimators': 66, 'max_depth': 2, 'min_samples_split': 8, 'min_samples_leaf': 3})


ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.