In [1]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

In [2]:
# 데이터 로드
data = pd.read_csv('merged_data.csv', encoding='utf-8')

In [3]:
# 필요한 열 선택
selected_features = ['stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday', 'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu', 'nearby_id', 'borrowed_date', 'borrowed_num']
data = data[selected_features]

In [4]:
# 범주형 데이터를 숫자로 변환 (Label Encoding)
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for feature in categorical_features:
    data[feature] = pd.factorize(data[feature])[0]

In [5]:
# 훈련-테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data.drop('borrowed_num', axis=1), data['borrowed_num'], test_size=0.2, random_state=42)

# LGBM Model

In [13]:
# LightGBM 모델
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 80,
    'learning_rate': 0.05,
    'feature_fraction': 1.0
}

In [14]:
train_data_lgb = lgb.Dataset(X_train, label=y_train)
test_data_lgb = lgb.Dataset(X_test, label=y_test, reference=train_data_lgb)

lgb_model = lgb.train(lgb_params, train_data_lgb, num_boost_round=10000, valid_sets=[test_data_lgb, train_data_lgb], callbacks=[
    lgb.early_stopping(stopping_rounds=3, verbose=100),
])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.766424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1139
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.85135	valid_0's rmse: 1.87321


# XGBoost Model

In [15]:
# XGBoost 모델
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'max_depth': 9,
    'subsample':0.8
}

In [16]:
train_data_xgb = xgb.DMatrix(X_train, label=y_train)
test_data_xgb = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(xgb_params, train_data_xgb, num_boost_round=10000, evals=[(test_data_xgb, 'eval')], early_stopping_rounds=3, verbose_eval=100)


[0]	eval-rmse:2.88335
[100]	eval-rmse:2.48297
[200]	eval-rmse:2.37125
[300]	eval-rmse:2.31309
[400]	eval-rmse:2.27163
[500]	eval-rmse:2.24540
[600]	eval-rmse:2.22477
[700]	eval-rmse:2.21164
[800]	eval-rmse:2.19677
[900]	eval-rmse:2.18071
[1000]	eval-rmse:2.16765
[1100]	eval-rmse:2.15537
[1200]	eval-rmse:2.14307
[1300]	eval-rmse:2.13086
[1400]	eval-rmse:2.12102
[1500]	eval-rmse:2.11131
[1600]	eval-rmse:2.10241
[1700]	eval-rmse:2.09459
[1800]	eval-rmse:2.08690
[1900]	eval-rmse:2.07844
[2000]	eval-rmse:2.07158
[2100]	eval-rmse:2.06480
[2200]	eval-rmse:2.05799
[2300]	eval-rmse:2.05173
[2400]	eval-rmse:2.04620
[2500]	eval-rmse:2.04202
[2600]	eval-rmse:2.03719
[2700]	eval-rmse:2.03292
[2800]	eval-rmse:2.02880
[2900]	eval-rmse:2.02493
[3000]	eval-rmse:2.02122
[3100]	eval-rmse:2.01692
[3200]	eval-rmse:2.01327
[3300]	eval-rmse:2.01012
[3400]	eval-rmse:2.00701
[3500]	eval-rmse:2.00384
[3600]	eval-rmse:2.00015
[3700]	eval-rmse:1.99765
[3800]	eval-rmse:1.99463
[3900]	eval-rmse:1.99207
[4000]	eval-

In [17]:
# 예측 생성
y_pred_train_lgb = lgb_model.predict(X_train, num_iteration=lgb_model.best_iteration)
y_pred_test_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

y_pred_train_xgb = xgb_model.predict(train_data_xgb)
y_pred_test_xgb = xgb_model.predict(test_data_xgb)

In [18]:
# 앙상블 예측 (평균이나 가중 평균 등의 방법 사용 가능)
y_pred_train_ensemble = (y_pred_train_lgb + y_pred_train_xgb) / 2
y_pred_test_ensemble = (y_pred_test_lgb + y_pred_test_xgb) / 2

In [19]:
# 앙상블 예측 평가
ensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble, squared=False)
ensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble, squared=False)
ensemble_r2_train = r2_score(y_train, y_pred_train_ensemble)
ensemble_r2_test = r2_score(y_test, y_pred_test_ensemble)

print(f'앙상블 훈련 RMSE: {ensemble_rmse_train}')
print(f'앙상블 테스트 RMSE: {ensemble_rmse_test}')
print(f'앙상블 훈련 R-squared: {ensemble_r2_train}')
print(f'앙상블 테스트 R-squared: {ensemble_r2_test}')

앙상블 훈련 RMSE: 1.8391774472384546
앙상블 테스트 RMSE: 1.8662563126165006
앙상블 훈련 R-squared: 0.5956498446814924
앙상블 테스트 R-squared: 0.5833958114812132
