In [75]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [76]:
# 훈련 데이터 로드
data = pd.read_csv('merged_data.csv', encoding='utf-8')

In [77]:
data

Unnamed: 0,stn_id,borrowed_hour,borrowed_day,is_holiday,borrowed_num_nearby,강수량(mm),wind_chill,stn_gu,nearby_id,borrowed_date,borrowed_num
0,ST-10,0,6,1,2,0.0,21.031237,마포구,ST-2167,20221001,2
1,ST-10,1,6,1,1,0.0,20.222173,마포구,ST-2167,20221001,2
2,ST-10,2,6,1,2,0.0,19.589061,마포구,ST-2167,20221001,1
3,ST-10,3,6,1,0,0.0,19.552158,마포구,ST-2167,20221001,0
4,ST-10,4,6,1,2,0.0,18.783459,마포구,ST-2167,20221001,0
...,...,...,...,...,...,...,...,...,...,...,...
9647433,ST-1445,20,2,0,1,0.0,7.781195,용산구,ST-1328,20230228,0
9647434,ST-1445,21,2,0,0,0.0,7.974197,용산구,ST-1328,20230228,3
9647435,ST-1445,22,2,0,0,0.0,8.213957,용산구,ST-1328,20230228,0
9647436,ST-1445,23,2,0,0,0.0,8.163898,용산구,ST-1328,20230228,1


In [78]:
# 필요한 열 선택
selected_features = ['stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday', 'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu', 'nearby_id', 'borrowed_date', 'borrowed_num']
data = data[selected_features]

In [79]:
# 범주형 데이터를 숫자로 변환 (Label Encoding)
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for feature in categorical_features:
    data[feature] = pd.factorize(data[feature])[0]

In [80]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(data.drop('borrowed_num', axis=1), data['borrowed_num'], test_size=0.2, random_state=42)

In [91]:
# LightGBM 모델 생성 및 훈련
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 70,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

In [92]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

model = lgb.train(params, train_data, num_boost_round=100000, valid_sets=[test_data], callbacks=[
        lgb.early_stopping(stopping_rounds=3, verbose=100),
    ])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.365077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[2691]	valid_0's rmse: 1.79101


In [93]:
# 훈련 데이터에서의 평가
y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print(f'Training MSE: {mse_train}')
print(f'Training MAE: {mae_train}')
print(f'Training RMSE: {rmse_train}')

# 검증 데이터에서의 평가
y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print(f'Test MSE: {mse_test}')
print(f'Test MAE: {mae_test}')
print(f'Test RMSE: {rmse_test}')

In [None]:
# R-squared 계산
# 훈련 데이터에서의 R-squared 계산
r2_train = r2_score(y_train, y_pred_train)

# 테스트 데이터에서의 R-squared 계산
r2_test = r2_score(y_test, y_pred_test)

print("R-squared for training data:", r2_train)
print("R-squared for test data:", r2_test)

R-squared for training data: 0.6078997735886429
R-squared for test data: 0.5947505725107973


In [69]:
# 새로운 데이터 예측
new_data = pd.read_csv('data/new_data.csv', encoding='utf-8')  # 'new_data.csv'는 새로운 데이터 파일명으로 대체해야 합니다.


In [70]:
new_data

Unnamed: 0,stn_id,borrowed_hour,borrowed_day,is_holiday,borrowed_num_nearby,강수량(mm),wind_chill,stn_gu,nearby_id,borrowed_date
0,ST-1442,21,6,1,2,0.0,2,중랑구,ST-1429,20231202
1,ST-2915,21,6,1,2,0.0,2,노원구,ST-5319,20231202
2,ST-3533,21,6,1,2,0.0,2,광진구,ST-3534,20231202
3,ST-2104,21,6,1,2,0.0,2,관악구,ST-2034,20231202
4,ST-2222,21,6,1,2,0.0,2,서초구,ST-2089,20231202


In [71]:
# Select necessary features
new_data = new_data[selected_features[:-1]]  # Exclude 'borrowed_num' since it's the target variable


In [72]:
# Convert categorical features to numeric using the same Label Encoding
for feature in categorical_features:
    new_data[feature] = pd.factorize(new_data[feature])[0]

In [73]:
# Use the trained LightGBM model to make predictions on the new data
predictions = model.predict(new_data, num_iteration=model.best_iteration)

In [74]:
# Display the predictions
print(predictions)

[2.05250218 1.52827561 1.52827561 0.89383863 1.65474476]


# Done!!!

In [121]:
used_features = X_train.columns.to_list()

In [122]:
new_data_selected = new_data[used_features]

In [123]:
for feature in categorical_features:
    # 새로운 데이터의 범주형 열을 label encoding할 때 object 타입을 피하기 위해 astype 사용
    new_data_selected[feature] = pd.factorize(new_data_selected[feature].astype(str))[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_selected[feature] = pd.factorize(new_data_selected[feature].astype(str))[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_selected[feature] = pd.factorize(new_data_selected[feature].astype(str))[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_selected[feature] = pd.fac

In [124]:
# 예측
y_pred_new = model.predict(new_data_selected, num_iteration=model.best_iteration)
print(f'Predictions for new data: {y_pred_new}')

Predictions for new data: [2.06048776 1.5087868  1.9599274  1.40895879 0.80777283]
