# 1. 필요한 라이브러리 로딩 및 데이터 불러오기, 전처리

In [83]:
import optuna
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

In [2]:
import pandas as pd
import warnings

# Silence all warnings from here on. This runs before the scikit-learn import
# below, so anything emitted while importing it is suppressed as well.
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split

# Read the feature and target tables (decoded as CP949) and remove the 'ID'
# column from each of them.
X = pd.read_csv('features.csv', encoding='cp949').drop(columns=['ID'])
y = pd.read_csv('target.csv', encoding='cp949').drop(columns=['ID'])

# Hold out 30% of the rows as the test split; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [28]:
# Number of rows in each partition, in the order
# (X_train, X_test, y_train, y_test).
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(11599, 4971, 11599, 4971)

# 2. 여기는 최적의 파라미터를 찾는 과정


In [88]:
def objective(trial):
    """Score one set of decision-tree hyperparameters for an Optuna study.

    Five hyperparameters are drawn from ``trial``; a
    ``DecisionTreeRegressor`` built from them is then evaluated with 5-fold
    cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``
            (an Optuna trial).

    Returns:
        The mean of the five ``neg_root_mean_squared_error`` fold scores.
        This is the *negated* RMSE, so a larger (closer to zero) value is
        better and the study should maximize it.
    """
    # Search space. The suggestion order is kept fixed so the sampler sees
    # the parameters in the same sequence on every trial.
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 800),
        'min_samples_split': trial.suggest_int('min_samples_split', 20, 10000),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 30),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0, step=0.001),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 800),
    }

    # Fixed seed so a given parameter set always yields the same tree.
    model = DecisionTreeRegressor(random_state=0, **params)

    # 5-fold cross-validation, using every available core.
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    return fold_scores.mean()

# X_train, X_test, y_train, y_test를 적절하게 설정

import json

# Run 200 trials; the objective returns negated RMSE, so the study maximizes.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Report the best trial found.
best_params = study.best_params
best_neg_rmse = study.best_value
print(f"Best score: {best_neg_rmse}")
print(f"Best parameters: {best_params}")

# Collect the best result together with the parameters, objective value and
# state of every trial in the study.
optuna_results = {
    "best_score": best_neg_rmse,
    "best_params": best_params,
    "all_trials": [
        {"params": t.params, "value": t.value, "state": t.state}
        for t in study.trials
    ],
}

# Persist everything as JSON.
with open("optuna_results.json", "w") as out_file:
    json.dump(optuna_results, out_file)

[I 2023-10-10 21:40:52,882] A new study created in memory with name: no-name-b650dd01-0bf8-4692-a92e-a66f25239c81
[I 2023-10-10 21:40:54,051] Trial 0 finished with value: -1131.278052771858 and parameters: {'max_depth': 552, 'min_samples_split': 3782, 'min_samples_leaf': 20, 'max_features': 0.10900000000000001, 'max_leaf_nodes': 698}. Best is trial 0 with value: -1131.278052771858.
[I 2023-10-10 21:40:54,936] Trial 1 finished with value: -1059.256363387344 and parameters: {'max_depth': 279, 'min_samples_split': 4576, 'min_samples_leaf': 10, 'max_features': 0.21400000000000002, 'max_leaf_nodes': 175}. Best is trial 1 with value: -1059.256363387344.
[I 2023-10-10 21:40:55,685] Trial 2 finished with value: -1171.9602689241674 and parameters: {'max_depth': 197, 'min_samples_split': 8721, 'min_samples_leaf': 26, 'max_features': 0.222, 'max_leaf_nodes': 644}. Best is trial 1 with value: -1059.256363387344.
[I 2023-10-10 21:40:56,567] Trial 3 finished with value: -1040.493234125849 and parame

Best score: -896.9219377138264
Best parameters: {'max_depth': 671, 'min_samples_split': 341, 'min_samples_leaf': 23, 'max_features': 0.741, 'max_leaf_nodes': 30}


In [89]:
# Refit every hyperparameter combination tried by the study on the training
# split and keep the one with the highest negated RMSE on the test split.
#
# NOTE(review): picking hyperparameters by their test-set score makes the
# printed figure an optimistic estimate of generalisation error -- confirm
# this is intended, or select on a separate validation split instead.
max_nrmse = -np.inf
max_params = None  # remains None when there are no trials to evaluate

for result in tqdm(optuna_results["all_trials"]):
    # Use the same seed as `objective`. The tree's feature sampling is random,
    # so without a fixed seed the refit model would differ from the one that
    # was tuned and the scores would change from run to run.
    # After the loop this name holds the LAST trial's model, not the best one.
    best_model = DecisionTreeRegressor(random_state=0, **result["params"])
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Negated RMSE, so that "larger is better" matches the study's direction.
    nrmse = -np.sqrt(mean_squared_error(y_test, y_pred))
    if nrmse > max_nrmse:
        max_nrmse = nrmse
        max_params = result["params"]
print(max_nrmse, max_params)

100%|█████████████████████████████████████████████████████████████████████████████████| 200/200 [07:14<00:00,  2.17s/it]

-908.8421267097376 {'max_depth': 51, 'min_samples_split': 393, 'min_samples_leaf': 28, 'max_features': 0.768, 'max_leaf_nodes': 93}





# -----------------------------------------------------