In [29]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

In [None]:
# 데이터 로드
data = pd.read_csv('merged_data_.csv', encoding='utf-8')

In [None]:
data

In [None]:
# 필요한 열 선택
selected_features = ['stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday', 'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'nearby_id', 'borrowed_date', 'borrowed_num']
data = data[selected_features]

In [None]:
# 범주형 데이터를 숫자로 변환 (Label Encoding)
categorical_features = ['stn_id', 'nearby_id']
for feature in categorical_features:
    data[feature] = pd.factorize(data[feature])[0]

In [None]:
# 훈련-테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data.drop('borrowed_num', axis=1), data['borrowed_num'], test_size=0.2, random_state=42)

# LGBM Model

In [None]:
# LightGBM 모델
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 80,
    'learning_rate': 0.1,
    'feature_fraction': 1.0
}

In [8]:
train_data_lgb = lgb.Dataset(X_train, label=y_train)
test_data_lgb = lgb.Dataset(X_test, label=y_test, reference=train_data_lgb)

lgb_model = lgb.train(lgb_params, train_data_lgb, num_boost_round=100000, valid_sets=[test_data_lgb, train_data_lgb], callbacks=[
    lgb.early_stopping(stopping_rounds=3, verbose=100),
])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.741760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1112
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 9
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[1074]	training's rmse: 1.91712	valid_0's rmse: 1.96817


# XGBoost Model

In [9]:
# XGBoost 모델
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'learning_rate': 0.1,
    'max_depth': 13,
    'subsample':0.8
}

In [10]:
train_data_xgb = xgb.DMatrix(X_train, label=y_train)
test_data_xgb = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(xgb_params, train_data_xgb, num_boost_round=100000, evals=[(test_data_xgb, 'eval')], early_stopping_rounds=3, verbose_eval=100)


[0]	eval-rmse:2.79104
[100]	eval-rmse:1.97290
[200]	eval-rmse:1.92076
[207]	eval-rmse:1.91981


In [11]:
# 예측 생성
y_pred_train_lgb = lgb_model.predict(X_train, num_iteration=lgb_model.best_iteration)
y_pred_test_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

y_pred_train_xgb = xgb_model.predict(train_data_xgb)
y_pred_test_xgb = xgb_model.predict(test_data_xgb)

In [12]:
# 앙상블 예측 (평균이나 가중 평균 등의 방법 사용 가능)
y_pred_train_ensemble = (y_pred_train_lgb + y_pred_train_xgb) / 2
y_pred_test_ensemble = (y_pred_test_lgb + y_pred_test_xgb) / 2

In [13]:
# 앙상블 예측 평가
ensemble_rmse_train = mean_squared_error(y_train, y_pred_train_ensemble, squared=False)
ensemble_rmse_test = mean_squared_error(y_test, y_pred_test_ensemble, squared=False)
ensemble_r2_train = r2_score(y_train, y_pred_train_ensemble)
ensemble_r2_test = r2_score(y_test, y_pred_test_ensemble)

print(f'앙상블 훈련 RMSE: {ensemble_rmse_train}')
print(f'앙상블 테스트 RMSE: {ensemble_rmse_test}')
print(f'앙상블 훈련 R-squared: {ensemble_r2_train}')
print(f'앙상블 테스트 R-squared: {ensemble_r2_test}')

앙상블 훈련 RMSE: 1.8015424272477547
앙상블 테스트 RMSE: 1.907303486975216
앙상블 훈련 R-squared: 0.6120289344943435
앙상블 테스트 R-squared: 0.5648683655165041


In [14]:
lgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
lgb_r2_train = r2_score(y_train, y_pred_train_lgb)

lgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
lgb_r2_test = r2_score(y_test, y_pred_test_lgb)

print("LightGBM 모델 평가 지표:")
print("훈련 RMSE:", lgb_rmse_train)
print("훈련 R-squared:", lgb_r2_train)
print("테스트 RMSE:", lgb_rmse_test)
print("테스트 R-squared:", lgb_r2_test)

LightGBM 모델 평가 지표:
훈련 RMSE: 1.9171175656447816
훈련 R-squared: 0.5606528216289648
테스트 RMSE: 1.968168264202363
테스트 R-squared: 0.5366539105673866


In [15]:
xgb_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb))
xgb_r2_train = r2_score(y_train, y_pred_train_xgb)

xgb_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
xgb_r2_test = r2_score(y_test, y_pred_test_xgb)

print("XGBoost 모델 평가 지표:")
print("훈련 RMSE:", xgb_rmse_train)
print("훈련 R-squared:", xgb_r2_train)
print("테스트 RMSE:", xgb_rmse_test)
print("테스트 R-squared:", xgb_r2_test)

XGBoost 모델 평가 지표:
훈련 RMSE: 1.756080355051462
훈련 R-squared: 0.6313628347438108
테스트 RMSE: 1.9198051797029994
테스트 R-squared: 0.5591454060935321


In [17]:
# new_data 로드 및 전처리
new_data = pd.read_csv('realdata따릉이 5-6 일요일 ver2.csv', encoding='utf-8')
new_data = new_data[selected_features[:-1]]  # 'selected_features'는 모델 훈련 시 사용한 특성들과 동일하게 선택


In [18]:
# 범주형 데이터 레이블 인코딩
for feature in categorical_features:
    new_data[feature] = pd.factorize(new_data[feature])[0]

In [27]:
new_data['LGBM_Prediction'] = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration, predict_disable_shape_check=True)

ValueError: Length of values (1929488) does not match length of index (2244)

In [25]:
# 컬럼 이름 일치시키기
new_data.columns = X_train_dummy.columns

# XGB_Prediction 컬럼 제거
new_data = new_data.drop('XGB_Prediction', axis=1, errors='ignore')

# XGBoost 모델 예측
new_data['XGB_Prediction'] = xgb_model.predict(xgb.DMatrix(new_data))

# 더미 컬럼 제거
X_train_dummy.drop('XGB_Prediction', axis=1, inplace=True)

In [28]:
print(new_data)

      stn_id  borrowed_hour  borrowed_day  is_holiday  borrowed_num_nearby  \
0          0             17             7           1                  2.0   
1          1             17             7           1                  4.0   
2          2             17             7           1                  1.0   
3          3             17             7           1                  2.0   
4          4             17             7           1                  3.0   
...      ...            ...           ...         ...                  ...   
2239    2239             17             7           1                  1.0   
2240    2240             17             7           1                  2.0   
2241    2241             17             7           1                  4.0   
2242    2242             17             7           1                  2.0   
2243    2243             17             7           1                  3.0   

      강수량(mm)  wind_chill  nearby_id  borrowed_date  XGB_Predic

In [26]:
# 앙상블 예측 (평균이나 가중 평균 등의 방법 사용 가능)
new_data['Ensemble_Prediction'] = (new_data['LGBM_Prediction'] + new_data['XGB_Prediction']) / 2

# 예측 결과를 새로운 컬럼으로 저장
new_data['predicted_borrowed_num'] = new_data['Ensemble_Prediction']

# 결과 출력 또는 저장
print(new_data[['LGBM_Prediction', 'XGB_Prediction', 'Ensemble_Prediction', 'predicted_borrowed_num']])
# 또는
new_data.to_csv('predicted_results_ver2.csv', index=False)

KeyError: 'LGBM_Prediction'