In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import randint, uniform
import pandas as pd
import numpy as np

In [2]:

# 데이터 로드
data = pd.read_csv('merged_data.csv', encoding='utf-8')



In [3]:
# 필요한 열 선택
selected_features = ['stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday', 'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu', 'nearby_id', 'borrowed_date', 'borrowed_num']
data = data[selected_features]


In [4]:

# 범주형 데이터를 숫자로 변환 (Label Encoding)
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for feature in categorical_features:
    data[feature] = pd.factorize(data[feature])[0]


In [5]:

# 훈련-테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data.drop('borrowed_num', axis=1), data['borrowed_num'], test_size=0.2, random_state=42)

In [6]:

# LightGBM 모델 정의
lgb_model = LGBMRegressor()

# LightGBM의 하이퍼파라미터 분포 정의
lgb_param_dist = {
    'num_leaves': randint(31, 100),
    'learning_rate': uniform(0.01, 0.1),
    'feature_fraction': uniform(0.8, 1.0)
}


In [7]:
# 랜덤 서치 수행
lgb_random_search = RandomizedSearchCV(estimator=lgb_model, param_distributions=lgb_param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
lgb_random_search.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.780060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 5145300, number of used features: 10
[LightGBM] [Info] Start training from score 1.482044
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.707155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 5145300, number of used features: 10
[LightGBM] [Info] Start training from score 1.484790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.854184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1139
[LightGBM] [Info] Number of data points in the train

27 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    super().fit(
  File "c:\ProgramData\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 842, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\lightgbm\engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
              ^

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.774935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1139
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498


In [8]:

# 최적의 LightGBM 모델
best_lgb_model = lgb_random_search.best_estimator_


In [9]:

# XGBoost 모델 정의
xgb_model = XGBRegressor()

# XGBoost의 하이퍼파라미터 분포 정의
xgb_param_dist = {
    'max_depth': randint(5, 15),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.8, 1.0)
}


In [10]:

# 랜덤 서치 수행
xgb_random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
xgb_random_search.fit(X_train, y_train)



15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\anaconda3\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1086, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^

In [11]:

# 최적의 XGBoost 모델
best_xgb_model = xgb_random_search.best_estimator_


In [12]:

# 최적의 모델을 사용하여 예측 생성
y_pred_train_lgb = best_lgb_model.predict(X_train)
y_pred_test_lgb = best_lgb_model.predict(X_test)

y_pred_train_xgb = best_xgb_model.predict(X_train)
y_pred_test_xgb = best_xgb_model.predict(X_test)





In [13]:
# 앙상블 예측 (평균이나 가중 평균 등의 방법 사용 가능)
y_pred_train_ensemble = (y_pred_train_lgb + y_pred_train_xgb) / 2
y_pred_test_ensemble = (y_pred_test_lgb + y_pred_test_xgb) / 2



In [14]:
# 앙상블 예측 평가
ensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble, squared=False)
ensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble, squared=False)
ensemble_r2_train = r2_score(y_train, y_pred_train_ensemble)
ensemble_r2_test = r2_score(y_test, y_pred_test_ensemble)

print(f'앙상블 훈련 RMSE: {ensemble_rmse_train}')
print(f'앙상블 테스트 RMSE: {ensemble_rmse_test}')
print(f'앙상블 훈련 R-squared: {ensemble_r2_train}')
print(f'앙상블 테스트 R-squared: {ensemble_r2_test}')

앙상블 훈련 RMSE: 1.9548190665518157
앙상블 테스트 RMSE: 2.0104574055356643
앙상블 훈련 R-squared: 0.5432027493696077
앙상블 테스트 R-squared: 0.5165285810506997


In [15]:
# 최적의 LightGBM 모델 및 파라미터 출력
print("Best LightGBM Model:")
print(best_lgb_model)
print("Best LightGBM Parameters:")
print(lgb_random_search.best_params_)

# 최적의 XGBoost 모델 및 파라미터 출력
print("Best XGBoost Model:")
print(best_xgb_model)
print("Best XGBoost Parameters:")
print(xgb_random_search.best_params_)

Best LightGBM Model:
LGBMRegressor(feature_fraction=0.9705241236872916,
              learning_rate=0.01650515929852795, num_leaves=34)
Best LightGBM Parameters:
{'feature_fraction': 0.9705241236872916, 'learning_rate': 0.01650515929852795, 'num_leaves': 34}
Best XGBoost Model:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.10832308858067882,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=13, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_paral

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# 앙상블 예측 평가
ensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble, squared=False)
ensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble, squared=False)
ensemble_mae_train = mean_absolute_error(y_train, y_pred_train_ensemble)
ensemble_mae_test = mean_absolute_error(y_test, y_pred_test_ensemble)
ensemble_r2_train = r2_score(y_train, y_pred_train_ensemble)
ensemble_r2_test = r2_score(y_test, y_pred_test_ensemble)

print(f'앙상블 훈련 RMSE: {ensemble_rmse_train}')
print(f'앙상블 테스트 RMSE: {ensemble_rmse_test}')
print(f'앙상블 훈련 MAE: {ensemble_mae_train}')
print(f'앙상블 테스트 MAE: {ensemble_mae_test}')
print(f'앙상블 훈련 R-squared: {ensemble_r2_train}')
print(f'앙상블 테스트 R-squared: {ensemble_r2_test}')


앙상블 훈련 RMSE: 1.9548190665518157
앙상블 테스트 RMSE: 2.0104574055356643
앙상블 훈련 MAE: 1.1445154810349647
앙상블 테스트 MAE: 1.1633586807474188
앙상블 훈련 R-squared: 0.5432027493696077
앙상블 테스트 R-squared: 0.5165285810506997


In [18]:
lgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
lgb_r2_train = r2_score(y_train, y_pred_train_lgb)

lgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
lgb_r2_test = r2_score(y_test, y_pred_test_lgb)

print("LightGBM 모델 평가 지표:")
print("훈련 RMSE:", lgb_rmse_train)
print("훈련 R-squared:", lgb_r2_train)
print("테스트 RMSE:", lgb_rmse_test)
print("테스트 R-squared:", lgb_r2_test)

LightGBM 모델 평가 지표:
훈련 RMSE: 2.416895736563943
훈련 R-squared: 0.3017255042017086
테스트 RMSE: 2.4203839177096476
테스트 R-squared: 0.29927184164346776


In [19]:
xgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb))
xgb_r2_train = r2_score(y_train, y_pred_train_xgb)

xgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
xgb_r2_test = r2_score(y_test, y_pred_test_xgb)

print("\nXGBoost 모델 평가 지표:")
print("훈련 RMSE:", xgb_rmse_train)
print("훈련 R-squared:", xgb_r2_train)
print("테스트 RMSE:", xgb_rmse_test)
print("테스트 R-squared:", xgb_r2_test)


XGBoost 모델 평가 지표:
훈련 RMSE: 1.6884006751088472
훈련 R-squared: 0.6592299774904973
테스트 RMSE: 1.7939598961069139
테스트 R-squared: 0.615048059766362
