In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# ggplot style: draws a background grid so value ranges are easier to read.
plt.style.use('ggplot')

# Work around the minus sign being rendered as a broken glyph in plot text.
mpl.rcParams['axes.unicode_minus'] = False

In [None]:
# The file carries its saved row index as an unnamed first column; read it back
# as the index so it does not appear as a junk "Unnamed: 0" column in the frame.
df = pd.read_csv("/content/question1-train_cleaned.csv", index_col=0)
print(len(df))
df.head(20)

8892


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,hour,dayofweek
0,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,1,2011,1,5,5
1,2011-01-01 10:00:00,1,0,0,1,15.58,19.695,76,16.9979,36,2011,1,10,5
2,2011-01-01 11:00:00,1,0,0,1,14.76,16.665,81,19.0012,56,2011,1,11,5
3,2011-01-01 12:00:00,1,0,0,1,17.22,21.21,77,19.0012,84,2011,1,12,5
4,2011-01-01 13:00:00,1,0,0,2,18.86,22.725,72,19.9995,94,2011,1,13,5
5,2011-01-01 14:00:00,1,0,0,2,18.86,22.725,72,19.0012,106,2011,1,14,5
6,2011-01-01 15:00:00,1,0,0,2,18.04,21.97,77,19.9995,110,2011,1,15,5
7,2011-01-01 16:00:00,1,0,0,2,17.22,21.21,82,19.9995,93,2011,1,16,5
8,2011-01-01 17:00:00,1,0,0,2,18.04,21.97,82,19.0012,67,2011,1,17,5
9,2011-01-01 18:00:00,1,0,0,3,17.22,21.21,88,16.9979,35,2011,1,18,5


In [None]:
# Feature columns grouped by kind, plus the regression target.
categorical_col = ['workingday', 'weather', 'dayofweek', 'month', 'year', 'hour']
numerical_col = ['temp', 'humidity']
target_col = "count"

# Feature frame (categorical columns first, then numerical) and target series.
X = df[[*categorical_col, *numerical_col]]
y = df[target_col]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was executed with, and -q keeps the install log short.
%pip install -q optuna==4.0.0

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def objective(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: mean 5-fold cross-validated MSE of a random forest.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``RandomForestRegressor`` (``random_state=42``) with it and scores it with
    ``cross_val_score`` on the training data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features passed to cross-validation.
        X_test: Not used by this function.
        y_train: Training target passed to cross-validation.
        y_test: Not used by this function.

    Returns:
        The mean of the per-fold MSE values (sign-flipped from the
        ``neg_mean_squared_error`` scores, so lower is better).
    """
    # Sample the search space in a fixed order: integers first, then categoricals.
    sampled = {}
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 100, 1000, step=100)

    integer_ranges = (
        ('max_depth', 3, 9),
        ('min_samples_split', 2, 10),
        ('min_samples_leaf', 1, 4),
    )
    for name, low, high in integer_ranges:
        sampled[name] = trial.suggest_int(name, low, high)

    categorical_choices = (
        ('max_features', ['sqrt', 'log2']),
        ('bootstrap', [True, False]),
    )
    for name, choices in categorical_choices:
        sampled[name] = trial.suggest_categorical(name, choices)

    forest = RandomForestRegressor(random_state=42, **sampled)

    # scikit-learn reports negated MSE, so flip the sign of the fold average.
    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    return -fold_scores.mean()

def random_forest_optuna_search(X_train, X_test, y_train, y_test, n_trials=100):
    """Tune a RandomForestRegressor with Optuna and evaluate the best setting.

    Runs a minimizing Optuna study over the notebook's ``objective`` function,
    prints the best trial, refits a ``RandomForestRegressor`` with the best
    parameters on the training data, reports its MSE on the test data and
    finally displays the Optuna diagnostic plots.

    Args:
        X_train: Training features.
        X_test: Held-out features used for the final evaluation.
        y_train: Training target.
        y_test: Held-out target used for the final evaluation.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(best_params, test_mse, study)``.
    """
    study = optuna.create_study(direction='minimize')

    # Trials run sequentially (n_jobs=1) to avoid parallel-execution problems.
    study.optimize(
        lambda trial: objective(trial, X_train, X_test, y_train, y_test),
        n_trials=n_trials,
        n_jobs=1
    )

    print("\nBest trial:")
    trial = study.best_trial
    # The study minimizes the value returned by objective(), which is labelled
    # here as the cross-validated MSE rather than a negative score.
    print("  Value (CV MSE):", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Refit on the full training split with the best parameters.
    best_params = study.best_params
    best_model = RandomForestRegressor(**best_params, random_state=42)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    print(f"\nTest MSE with best parameters: {test_mse:.4f}")

    # The plot helpers only return figure objects; inside a function they must
    # be shown explicitly or nothing is rendered. Plotting is best-effort so a
    # missing plotting backend does not discard the finished search.
    try:
        optuna.visualization.plot_optimization_history(study).show()
        optuna.visualization.plot_param_importances(study).show()
    except ImportError:
        print("Plotly is not installed. Skipping visualizations.")

    return best_params, test_mse, study

# Run the random forest search.
best_params, test_mse, study = random_forest_optuna_search(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    n_trials=100,
)

# Helper for further analysis of a finished study.
def analyze_optuna_results(study):
    """Print a summary of an Optuna study and return its trials table.

    Prints the 10 trials with the lowest objective value, then, for every
    parameter in ``study.best_params``, how that parameter is distributed among
    the best quarter of the scored trials: mean/std for numeric parameters,
    value counts for everything else (booleans included).

    Args:
        study: Object exposing ``trials_dataframe()``, ``trials`` (items with
            ``value`` and ``params``) and ``best_params``.

    Returns:
        The DataFrame returned by ``study.trials_dataframe()``.
    """
    from collections import Counter

    print("\n=== Detailed Analysis ===")

    print("\nTop 10 trials:")
    trials_df = study.trials_dataframe()
    # Only sort when the 'value' column is present (an empty frame may lack it).
    if 'value' in trials_df.columns:
        print(trials_df.sort_values('value').head(10))
    else:
        print(trials_df.head(10))

    print("\nParameter distributions in top 25% trials:")
    # Trials without a value cannot be ranked; comparing None would raise.
    scored_trials = [t for t in study.trials if t.value is not None]
    if not scored_trials:
        print("No completed trials to analyze.")
        return trials_df

    scored_trials.sort(key=lambda t: t.value)
    # Keep at least one trial so studies with fewer than 4 trials still report.
    n_top_trials = max(1, len(scored_trials) // 4)
    top_trials = scored_trials[:n_top_trials]

    for param in study.best_params.keys():
        values = [t.params[param] for t in top_trials if param in t.params]
        if not values:
            continue
        # bool is a subclass of int; count True/False instead of averaging them.
        is_numeric = all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in values
        )
        if is_numeric:
            mean = np.mean(values)
            std = np.std(values)
            print(f"{param}: mean={mean:.4f}, std={std:.4f}")
        else:
            print(f"{param}: {dict(Counter(values))}")

    return trials_df

# Analyze the finished study.
trials_df = analyze_optuna_results(study=study)

[I 2024-11-07 19:37:51,752] A new study created in memory with name: no-name-c03bfd19-e5bc-456d-affa-19e5fc5d8250
[I 2024-11-07 19:38:19,198] Trial 0 finished with value: 4296.25348580106 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 4296.25348580106.
[I 2024-11-07 19:38:21,543] Trial 1 finished with value: 9295.062248703285 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 4296.25348580106.
[I 2024-11-07 19:38:35,197] Trial 2 finished with value: 11135.473068711815 and parameters: {'n_estimators': 1000, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 4296.25348580106.
[I 2024-11-07 19:38:51,790] Trial 3 finished with value: 8385.5420199111 and parameter


Best trial:
  Value (Negative MSE): 4119.2841351552115
  Params: 
    n_estimators: 100
    max_depth: 9
    min_samples_split: 9
    min_samples_leaf: 1
    max_features: log2
    bootstrap: True

Test MSE with best parameters: 4216.7118

=== Detailed Analysis ===

Top 10 trials:
    number        value             datetime_start          datetime_complete  \
81      81  4119.284135 2024-11-07 19:46:24.760786 2024-11-07 19:46:28.104815   
72      72  4119.284135 2024-11-07 19:45:54.731198 2024-11-07 19:45:58.055389   
73      73  4119.284135 2024-11-07 19:45:58.057287 2024-11-07 19:46:00.206767   
74      74  4119.284135 2024-11-07 19:46:00.208639 2024-11-07 19:46:02.284025   
75      75  4119.284135 2024-11-07 19:46:02.286053 2024-11-07 19:46:04.375696   
64      64  4119.284135 2024-11-07 19:45:35.484964 2024-11-07 19:45:37.628913   
63      63  4119.284135 2024-11-07 19:45:33.388820 2024-11-07 19:45:35.483186   
71      71  4119.284135 2024-11-07 19:45:51.672444 2024-11-07 19:45:5

In [None]:
# %pip installs into the running kernel's environment; -q keeps the log short.
# NOTE(review): pin the version (lightgbm==X.Y.Z) once the one in use is known.
%pip install -q lightgbm



In [None]:
import optuna
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

def objective(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: mean 5-fold cross-validated MSE of a LightGBM regressor.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``LGBMRegressor`` (``random_state=42``, ``verbose=-1``) with it and scores
    it with ``cross_val_score`` on the training data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features passed to cross-validation.
        X_test: Not used by this function.
        y_train: Training target passed to cross-validation.
        y_test: Not used by this function.

    Returns:
        The mean of the per-fold MSE values (sign-flipped from the
        ``neg_mean_squared_error`` scores, so lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 127),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5),

        # Additional LightGBM-specific parameters.
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 0.1, log=True),
        # Row/column subsampling is tuned once, under these names only.
        # 'subsample' and 'colsample_bytree' are documented by LightGBM as
        # aliases of 'bagging_fraction' and 'feature_fraction'; suggesting both
        # spellings only added redundant search dimensions.
        'feature_fraction': trial.suggest_float('feature_fraction', 0.8, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.8, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7)
    }

    # verbose=-1 suppresses LightGBM's training output.
    lgbm_model = LGBMRegressor(**params,
                              random_state=42,
                              verbose=-1)

    # Folds are evaluated sequentially (n_jobs=1) for stability.
    scores = cross_val_score(
        lgbm_model,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=1
    )

    # scikit-learn reports negated MSE, so flip the sign of the fold average.
    return -scores.mean()

def lightgbm_optuna_search(X_train, X_test, y_train, y_test, n_trials=100):
    """Tune an LGBMRegressor with Optuna and evaluate the best setting.

    Runs a minimizing Optuna study over the notebook's ``objective`` function,
    prints the best trial, refits an ``LGBMRegressor`` with the best parameters
    on the training data, reports its MSE on the test data and finally displays
    the Optuna diagnostic plots.

    Args:
        X_train: Training features.
        X_test: Held-out features used for the final evaluation.
        y_train: Training target.
        y_test: Held-out target used for the final evaluation.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(best_params, test_mse, study)``.
    """
    study = optuna.create_study(direction='minimize')

    # Trials run sequentially (n_jobs=1) for stability.
    study.optimize(
        lambda trial: objective(trial, X_train, X_test, y_train, y_test),
        n_trials=n_trials,
        n_jobs=1,
        show_progress_bar=True
    )

    print("\nBest trial:")
    trial = study.best_trial
    # The study minimizes the value returned by objective(), which is labelled
    # here as the cross-validated MSE rather than a negative score.
    print("  Value (CV MSE):", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Refit on the full training split with the best parameters.
    best_params = study.best_params
    best_model = LGBMRegressor(**best_params, random_state=42, verbose=-1)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    print(f"\nTest MSE with best parameters: {test_mse:.4f}")

    # The plot helpers only return figure objects; inside a function they must
    # be shown explicitly or nothing is rendered. The previously used
    # plot_param_correlations is not part of optuna.visualization and raised
    # AttributeError after the whole search had finished; the parallel
    # coordinate plot is used to inspect parameter interactions instead.
    try:
        optuna.visualization.plot_optimization_history(study).show()
        optuna.visualization.plot_param_importances(study).show()
        optuna.visualization.plot_parallel_coordinate(study).show()
    except ImportError:
        print("Plotly is not installed. Skipping visualizations.")

    return best_params, test_mse, study

def analyze_optuna_results(study):
    """Print a summary of an Optuna study and return its trials table.

    Prints the 10 trials with the lowest objective value, then, for every
    parameter in ``study.best_params``, how that parameter is distributed among
    the best quarter of the scored trials (mean/std for numeric parameters,
    value counts for everything else, booleans included), and finally the five
    highest parameter importances reported by
    ``optuna.importance.get_param_importances``.

    Args:
        study: Object exposing ``trials_dataframe()``, ``trials`` (items with
            ``value`` and ``params``) and ``best_params``.

    Returns:
        The DataFrame returned by ``study.trials_dataframe()``.
    """
    from collections import Counter

    print("\n=== Detailed Analysis ===")

    print("\nTop 10 trials:")
    trials_df = study.trials_dataframe()
    # Only sort when the 'value' column is present (an empty frame may lack it).
    if 'value' in trials_df.columns:
        print(trials_df.sort_values('value').head(10))
    else:
        print(trials_df.head(10))

    print("\nParameter distributions in top 25% trials:")
    # Trials without a value cannot be ranked; comparing None would raise.
    scored_trials = [t for t in study.trials if t.value is not None]
    if not scored_trials:
        print("No completed trials to analyze.")
        return trials_df

    scored_trials.sort(key=lambda t: t.value)
    # Keep at least one trial so studies with fewer than 4 trials still report.
    n_top_trials = max(1, len(scored_trials) // 4)
    top_trials = scored_trials[:n_top_trials]

    for param in study.best_params.keys():
        values = [t.params[param] for t in top_trials if param in t.params]
        if not values:
            continue
        # bool is a subclass of int; count True/False instead of averaging them.
        is_numeric = all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in values
        )
        if is_numeric:
            mean = np.mean(values)
            std = np.std(values)
            print(f"{param}: mean={mean:.4f}, std={std:.4f}")
        else:
            print(f"{param}: {dict(Counter(values))}")

    # Five most important parameters, highest importance first.
    importance = optuna.importance.get_param_importances(study)
    print("\nTop 5 most important parameters:")
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    for param, score in ranked[:5]:
        print(f"{param}: {score:.4f}")

    return trials_df

# Run the LightGBM search.
best_params, test_mse, study = lightgbm_optuna_search(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    n_trials=100,
)

# Analyze the finished study.
trials_df = analyze_optuna_results(study=study)



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.


[I 2024-11-07 19:51:16,691] A new study created in memory with name: no-name-c5fb1751-de1a-4ca8-8299-5342372b17a2


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-11-07 19:51:20,158] Trial 0 finished with value: 1321.8087946833775 and parameters: {'n_estimators': 600, 'max_depth': 6, 'learning_rate': 0.06618043933080732, 'num_leaves': 38, 'subsample': 0.8034373542542478, 'colsample_bytree': 0.9422146746966453, 'min_child_samples': 92, 'reg_alpha': 0.10935102072455993, 'reg_lambda': 0.40156019464268133, 'min_split_gain': 0.36981173708726317, 'min_child_weight': 0.0014596605333305115, 'feature_fraction': 0.9337831871261387, 'bagging_fraction': 0.8600016681100822, 'bagging_freq': 5}. Best is trial 0 with value: 1321.8087946833775.
[I 2024-11-07 19:51:24,949] Trial 1 finished with value: 1196.1231970145952 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.03947091018066664, 'num_leaves': 44, 'subsample': 0.9288190658949029, 'colsample_bytree': 0.8128705783744046, 'min_child_samples': 39, 'reg_alpha': 0.15939472304579466, 'reg_lambda': 0.27075041437394093, 'min_split_gain': 0.05529002639823577, 'min_child_weight': 0.004

AttributeError: module 'optuna.visualization' has no attribute 'plot_param_correlations'