In [1]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

In [2]:
# Load Data
data = pd.read_csv('merged_data.csv', encoding='utf-8')

In [3]:
# Selected Features
selected_features = ['stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday', 'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'nearby_id', 'borrowed_date', 'borrowed_num']
data = data[selected_features]

In [4]:
# Lable Encoding
categorical_features = ['stn_id', 'nearby_id']
for feature in categorical_features:
    data[feature] = pd.factorize(data[feature])[0]

In [5]:
# 훈련-테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data.drop('borrowed_num', axis=1), data['borrowed_num'], test_size=0.2, random_state=42)

# LGBM Model

In [6]:
# LightGBM 모델 - gbdt
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 80,
    'learning_rate': 0.05,
    'feature_fraction': 1.0,
    'device': 'gpu'
}

In [None]:
# LightGBM 모델 - dart
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'dart',
    'num_leaves': 80,
    'learning_rate': 0.05,
    'feature_fraction': 1.0,
    'device': 'gpu'
}

In [7]:
train_data_lgb = lgb.Dataset(X_train, label=y_train)
test_data_lgb = lgb.Dataset(X_test, label=y_test, reference=train_data_lgb)

lgb_model = lgb.train(lgb_params, train_data_lgb, num_boost_round=10000, valid_sets=[test_data_lgb, train_data_lgb], callbacks=[
    lgb.early_stopping(stopping_rounds=3, verbose=100),
])

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1112
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 9
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 6 dense feature groups (58.88 MB) transferred to GPU in 0.213486 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 1.483498




# XGBoost Model

In [8]:
# XGBoost 모델
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'learning_rate': 0.1,
    'max_depth':13,
    'subsample':0.8,
    'device':'gpu'
}

In [9]:
train_data_xgb = xgb.DMatrix(X_train, label=y_train)
test_data_xgb = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(xgb_params, train_data_xgb, num_boost_round=10000, evals=[(test_data_xgb, 'eval')], early_stopping_rounds=3, verbose_eval=100)


[0]	eval-rmse:2.79100
[100]	eval-rmse:1.97047
[200]	eval-rmse:1.91868
[250]	eval-rmse:1.91346


In [10]:
# 예측 생성
y_pred_train_lgb = lgb_model.predict(X_train, num_iteration=lgb_model.best_iteration)
y_pred_test_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

y_pred_train_xgb = xgb_model.predict(train_data_xgb)
y_pred_test_xgb = xgb_model.predict(test_data_xgb)

In [15]:
# 각 모델의 RMSE 계산
rmse_lgb = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
rmse_xgb = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb))

# RMSE를 이용하여 가중치 설정 (낮은 RMSE일수록 높은 가중치)
weight_lgb = 1 / (1 + rmse_lgb)
weight_xgb = 1 / (1 + rmse_xgb)

# 가중평균 계산
y_pred_train_ensemble_weighted = (weight_lgb * y_pred_train_lgb) + (weight_xgb * y_pred_train_xgb)
y_pred_test_ensemble_weighted = (weight_lgb * y_pred_test_lgb) + (weight_xgb * y_pred_test_xgb)

In [16]:
# 앙상블 예측 (평균이나 가중 평균 등의 방법 사용 가능)
y_pred_train_ensemble = (y_pred_train_lgb + y_pred_train_xgb) / 2
y_pred_test_ensemble = (y_pred_test_lgb + y_pred_test_xgb) / 2

In [17]:
# 앙상블 예측 평가
ensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble, squared=False)
ensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble, squared=False)
Wensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble_weighted, squared=False)
Wensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble_weighted, squared=False)
ensemble_r2_train = r2_score(y_train, y_pred_train_ensemble)
ensemble_r2_test = r2_score(y_test, y_pred_test_ensemble)
Wensemble_r2_train = r2_score(y_train, y_pred_train_ensemble_weighted)
Wensemble_r2_test = r2_score(y_test, y_pred_test_ensemble_weighted)

print(f'앙상블 훈련 RMSE: {ensemble_rmse_train}')
print(f'앙상블 테스트 RMSE: {ensemble_rmse_test}')
print(f'앙상블 훈련 R-squared: {ensemble_r2_train}')
print(f'앙상블 테스트 R-squared: {ensemble_r2_test}')
print(f'W 앙상블 훈련 RMSE: {Wensemble_rmse_train}')
print(f'W 앙상블 테스트 RMSE: {Wensemble_rmse_test}')
print(f'W 앙상블 훈련 R-squared: {Wensemble_r2_train}')
print(f'W 앙상블 테스트 R-squared: {Wensemble_r2_test}')

앙상블 훈련 RMSE: 1.855443532431063
앙상블 테스트 RMSE: 1.9584768969573612
앙상블 훈련 R-squared: 0.5884658949749182
앙상블 테스트 R-squared: 0.5412057586675216
W 앙상블 훈련 RMSE: 2.092519286401996
W 앙상블 테스트 RMSE: 2.164685021053375
W 앙상블 훈련 R-squared: 0.47658123586235335
W 앙상블 테스트 R-squared: 0.4395066342199305


In [18]:
lgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
lgb_r2_train = r2_score(y_train, y_pred_train_lgb)

lgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
lgb_r2_test = r2_score(y_test, y_pred_test_lgb)

print("LightGBM 모델 평가 지표:")
print("훈련 RMSE:", lgb_rmse_train)
print("훈련 R-squared:", lgb_r2_train)
print("테스트 RMSE:", lgb_rmse_test)
print("테스트 R-squared:", lgb_r2_test)

LightGBM 모델 평가 지표:
훈련 RMSE: 2.093150664612776
훈련 R-squared: 0.4762653247381252
테스트 RMSE: 2.103201584595604
테스트 R-squared: 0.47089379686391675


In [19]:
xgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb))
xgb_r2_train = r2_score(y_train, y_pred_train_xgb)

xgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
xgb_r2_test = r2_score(y_test, y_pred_test_xgb)

print("XGBoost 모델 평가 지표:")
print("훈련 RMSE:", xgb_rmse_train)
print("훈련 R-squared:", xgb_r2_train)
print("테스트 RMSE:", xgb_rmse_test)
print("테스트 R-squared:", xgb_r2_test)

XGBoost 모델 평가 지표:
훈련 RMSE: 1.7220080515194014
훈련 R-squared: 0.6455290051424385
테스트 RMSE: 1.9135301281431685
테스트 R-squared: 0.5620226399398447


In [24]:
# new_data 로드 및 전처리
new_data = pd.read_csv('data/new_data.csv', encoding='utf-8')
new_data = new_data[selected_features[:-1]]  # 'selected_features'는 모델 훈련 시 사용한 특성들과 동일하게 선택


In [25]:
# 범주형 데이터 레이블 인코딩
for feature in categorical_features:
    new_data[feature] = pd.factorize(new_data[feature])[0]

In [26]:
# lgb_model로 예측
y_pred_lgb = lgb_model.predict(new_data, num_iteration=lgb_model.best_iteration, predict_disable_shape_check=True)

# XGBoost 모델로 예측
y_pred_xgb = xgb_model.predict(xgb.DMatrix(new_data))


# 예측값을 'LGBM_Prediction' 열로 추가
new_data['LGBM_Prediction'] = y_pred_lgb

# 예측값을 'XGB_Prediction' 열로 추가
new_data['XGB_Prediction'] = y_pred_xgb

In [27]:
new_data['Ensemble_Prediction'] = (new_data['LGBM_Prediction'] + new_data['XGB_Prediction']) / 2

In [28]:
# 결과 출력 또는 저장
print(new_data[['LGBM_Prediction', 'XGB_Prediction', 'Ensemble_Prediction']])
# 또는
new_data.to_csv('predicted_results_dart.csv', index=False)

      LGBM_Prediction  XGB_Prediction  Ensemble_Prediction
0            2.019004        3.112292             2.565648
1            2.589972        3.369089             2.979531
2            1.786461        2.363587             2.075024
3            1.937376        3.177858             2.557617
4            2.331683        3.560756             2.946220
...               ...             ...                  ...
2239         0.876962        1.109296             0.993129
2240         1.016512        1.136709             1.076610
2241         1.366452        1.143836             1.255144
2242         1.016512        1.076689             1.046601
2243         1.168220        1.074451             1.121336

[2244 rows x 3 columns]
