In [1]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Google Colab

In [2]:
# Mount Google Drive at /content/drive (Colab only; the "Local Data" section
# below is the alternative when the notebook is not running on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the merged dataset from the mounted Google Drive folder.
colab_csv_path = '/content/drive/MyDrive/23_BA/merged_data.csv'
data = pd.read_csv(colab_csv_path, encoding='utf-8')

# Local Data

In [2]:
# Load the merged dataset from the notebook's working directory.
local_csv_path = 'merged_data.csv'
data = pd.read_csv(local_csv_path, encoding='utf-8')

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,stn_id,borrowed_hour,borrowed_day,is_holiday,borrowed_num_nearby,강수량(mm),wind_chill,stn_gu,nearby_id,borrowed_date,borrowed_num
0,ST-10,0,6,1,2,0.0,21.031237,마포구,ST-2167,20221001,2
1,ST-10,1,6,1,1,0.0,20.222173,마포구,ST-2167,20221001,2
2,ST-10,2,6,1,2,0.0,19.589061,마포구,ST-2167,20221001,1
3,ST-10,3,6,1,0,0.0,19.552158,마포구,ST-2167,20221001,0
4,ST-10,4,6,1,2,0.0,18.783459,마포구,ST-2167,20221001,0


# Total Data

In [None]:
# Select the required columns (model inputs plus the borrowed_num target).
selected_features = [
    'stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday',
    'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu',
    'nearby_id', 'borrowed_date', 'borrowed_num',
]
# .copy() makes `data` an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
data = data[selected_features].copy()

In [None]:
# Convert the categorical columns to integer codes (label encoding).
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for column_name in categorical_features:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    codes, _uniques = pd.factorize(data[column_name])
    data[column_name] = codes

In [None]:
# Date preprocessing: parse borrowed_date and derive year / month / day features.
# borrowed_date holds YYYYMMDD values (e.g. 20221001). When the column is numeric,
# pd.to_datetime without a format treats the numbers as nanosecond offsets from
# the Unix epoch, so every row would land on 1970-01-01. Parsing the string form
# with an explicit format avoids that.
# NOTE(review): assumes every value is a complete YYYYMMDD date with no missing
# entries - confirm against the raw CSV.
# The dtype guard keeps the cell safe to re-run once the column is already datetime.
if not pd.api.types.is_datetime64_any_dtype(data['borrowed_date']):
    data['borrowed_date'] = pd.to_datetime(data['borrowed_date'].astype(str), format='%Y%m%d')
data['year'] = data['borrowed_date'].dt.year
data['month'] = data['borrowed_date'].dt.month
data['day'] = data['borrowed_date'].dt.day

In [None]:
# Separate the target (borrowed_num) from the explanatory variables.
# stn_id and the raw borrowed_date column are left out of the feature matrix.
excluded_columns = ['borrowed_num', 'stn_id', 'borrowed_date']
y = data['borrowed_num']
X = data.drop(columns=excluded_columns)

In [None]:
# Random 80/20 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## LightGBM (total)

In [None]:
# LightGBM hyperparameters for the full-data model (regression objective, RMSE metric).
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=80,
    learning_rate=0.03,
    feature_fraction=0.9,
)


## 교차검증

In [None]:
# Run 5-fold cross-validation (shuffled, fixed seed) over the full X / y.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = lgb.cv(
    params,
    lgb.Dataset(X, label=y),
    num_boost_round=1000,
    folds=kf,
    callbacks=[
        # `verbose` on early_stopping is an on/off flag for its messages,
        # not a logging interval.
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        # Print the aggregated validation metric every 50 boosting rounds.
        lgb.log_evaluation(period=50),
    ],
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.326699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 737
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 11
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.282034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 737
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 11
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.262550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 737
[LightGBM] [Info] Number of data 

In [None]:
# List the metric series returned by lgb.cv (mean and stdv of the validation RMSE).
cv_results.keys()

dict_keys(['valid rmse-mean', 'valid rmse-stdv'])

In [None]:
# Print the cross-validation results: number of recorded rounds and the last mean RMSE.
rmse_per_round = cv_results["valid rmse-mean"]
print(f'Best number of boosting rounds: {len(rmse_per_round)}')
print(f'Best RMSE: {rmse_per_round[-1]}')

Best number of boosting rounds: 8905
Best RMSE: 1.948405555438897


In [None]:
# Wrap both splits as LightGBM datasets; the hold-out set points at the
# training set through `reference`.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)


In [None]:
# Train for up to 1000 rounds, stopping after 3 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the later
# cells report test metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data, train_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=3, verbose=True),
        # Print the evaluation metrics every 100 boosting rounds.
        lgb.log_evaluation(period=100),
    ],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.329694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 735
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 11
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds


In [None]:
# Train for up to 100000 rounds, stopping after 10 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the later
# cells report test metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=100000,
    valid_sets=[test_data, train_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        # Print the evaluation metrics every 50 boosting rounds.
        lgb.log_evaluation(period=50),
    ],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.346981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 735
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 11
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2966]	training's rmse: 1.93316	valid_0's rmse: 1.98176


In [None]:
# Model evaluation: predict on both splits using the booster's best iteration.
best_round = model.best_iteration
y_pred_train = model.predict(X_train, num_iteration=best_round)
y_pred_test = model.predict(X_test, num_iteration=best_round)

In [None]:
# Metrics on the training data.
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mse_train)

print(f'Training MSE: {mse_train}')
print(f'Training MAE: {mae_train}')
print(f'Training RMSE: {rmse_train}')

# Metrics on the hold-out (test) data.
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f'Test MSE: {mse_test}')
print(f'Test MAE: {mae_test}')
print(f'Test RMSE: {rmse_test}')

Training MSE: 3.737101203294012
Training MAE: 1.1228728212762518
Training RMSE: 1.9331583492549211
Test MSE: 3.9273722893651293
Test MAE: 1.1337698955516267
Test RMSE: 1.9817598970019374


In [None]:
# Train-minus-test gap per metric; negative values mean the test error is larger.
mse_gap = mse_train - mse_test
mae_gap = mae_train - mae_test
rmse_gap = rmse_train - rmse_test
print(f'Training - Test MSE: {mse_gap}')
print(f'Training - Test MAE: {mae_gap}')
print(f'Training - Test RMSE: {rmse_gap}')

Training - Test MSE: -0.19027108607111742
Training - Test MAE: -0.010897074275374896
Training - Test RMSE: -0.048601547747016305


In [None]:
# Model evaluation on the hold-out split, using the booster's best iteration.
best_round = model.best_iteration
y_pred = model.predict(X_test, num_iteration=best_round)

mse, mae = mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE derived from the MSE computed above

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Sample Data

In [None]:
# Select the required columns (model inputs plus the borrowed_num target).
selected_features = [
    'stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday',
    'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu',
    'nearby_id', 'borrowed_date', 'borrowed_num',
]
# .copy() makes `data` an independent frame, so the column assignments in the
# following cell do not trigger pandas' SettingWithCopyWarning.
data = data[selected_features].copy()

In [None]:
# Convert the categorical columns to integer codes (label encoding).
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for column_name in categorical_features:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    codes, _uniques = pd.factorize(data[column_name])
    data[column_name] = codes

In [None]:
# Random 80/20 train-test split; every column except borrowed_num is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['borrowed_num']),
    data['borrowed_num'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# LightGBM hyperparameters for this run (regression objective, RMSE metric).
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=69,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [None]:
# Wrap both splits as LightGBM datasets; the hold-out set points at the
# training set through `reference`.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [None]:
# Train for up to 10000 rounds, stopping after 3 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the next
# cell reports metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=10000,
    valid_sets=[test_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=3, verbose=True),
        # Print the evaluation metric every 100 boosting rounds.
        lgb.log_evaluation(period=100),
    ],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.738167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds


In [None]:
# Model evaluation on the hold-out split, using the booster's best iteration.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Compute MSE
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# Compute MAE
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

# Compute RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Overfitting Check

In [4]:
# Select the required columns (model inputs plus the borrowed_num target).
selected_features = [
    'stn_id', 'borrowed_hour', 'borrowed_day', 'is_holiday',
    'borrowed_num_nearby', '강수량(mm)', 'wind_chill', 'stn_gu',
    'nearby_id', 'borrowed_date', 'borrowed_num',
]
# .copy() makes `data` an independent frame, so the column assignments in the
# following cell do not trigger pandas' SettingWithCopyWarning.
data = data[selected_features].copy()


In [5]:
# Convert the categorical columns to integer codes (label encoding).
categorical_features = ['stn_id', 'stn_gu', 'nearby_id']
for column_name in categorical_features:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    codes, _uniques = pd.factorize(data[column_name])
    data[column_name] = codes


In [18]:
# # Date preprocessing (commented out)
# data['borrowed_date'] = pd.to_datetime(data['borrowed_date'])
# data['year'] = data['borrowed_date'].dt.year
# data['month'] = data['borrowed_date'].dt.month
# data['day'] = data['borrowed_date'].dt.day

In [6]:
# Random 80/20 train-test split; every column except borrowed_num is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['borrowed_num']),
    data['borrowed_num'],
    test_size=0.2,
    random_state=42,
)


In [29]:
# LightGBM hyperparameters for the overfitting check (70 leaves, learning rate 0.01).
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=70,
    learning_rate=0.01,
    feature_fraction=0.9,
)


### Change Learning Rate: 0.01

In [None]:
# LightGBM hyperparameters: 50 leaves, learning rate 0.01.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.01,
    feature_fraction=0.9,
)

### Change Learning Rate: 0.001

In [None]:
# LightGBM hyperparameters: 50 leaves, learning rate 0.001.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.001,
    feature_fraction=0.9,
)

### Change Learning Rate: 0.05 + leaves: 90

In [13]:
# LightGBM hyperparameters: 90 leaves, learning rate 0.05.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=90,
    learning_rate=0.05,
    feature_fraction=0.9,
)

## Result

In [30]:
# Wrap both splits as LightGBM datasets; the hold-out set points at the
# training set through `reference`.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)


### num_boost_round = 1000

In [15]:
# Train for up to 1000 rounds, stopping after 3 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the later
# cells report test metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data, train_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=3, verbose=True),
        # Print the evaluation metrics every 100 boosting rounds.
        lgb.log_evaluation(period=100),
    ],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.395200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 1.82427	valid_0's rmse: 1.85182


In [16]:
# Model evaluation: predict on both splits using the booster's best iteration.
best_round = model.best_iteration
y_pred_train = model.predict(X_train, num_iteration=best_round)
y_pred_test = model.predict(X_test, num_iteration=best_round)

In [17]:
# Metrics on the training data.
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mse_train)

print(f'Training MSE: {mse_train}')
print(f'Training MAE: {mae_train}')
print(f'Training RMSE: {rmse_train}')

# Metrics on the hold-out (test) data.
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f'Test MSE: {mse_test}')
print(f'Test MAE: {mae_test}')
print(f'Test RMSE: {rmse_test}')

Training MSE: 3.3279611853227187
Training MAE: 1.0804852692346003
Training RMSE: 1.824270041776359
Test MSE: 3.429232670543812
Test MAE: 1.0871624492247818
Test RMSE: 1.8518187466768479


In [18]:
# R-squared on the training and test predictions.
r2_train, r2_test = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)

print("R-squared for training data:", r2_train)
print("R-squared for test data:", r2_test)

R-squared for training data: 0.602178178950332
R-squared for test data: 0.5898166712672139


### num_boost_round = 10000

In [31]:
# Train for up to 10000 rounds, stopping after 3 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the later
# cells report test metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=10000,
    valid_sets=[test_data, train_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=3, verbose=True),
        # Print the evaluation metrics every 100 boosting rounds.
        lgb.log_evaluation(period=100),
    ],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.064775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds


KeyboardInterrupt: 

In [None]:
# Model evaluation: predict on both splits using the booster's best iteration.
best_round = model.best_iteration
y_pred_train = model.predict(X_train, num_iteration=best_round)
y_pred_test = model.predict(X_test, num_iteration=best_round)


In [None]:
# Metrics on the training data.
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mse_train)

print(f'Training MSE: {mse_train}')
print(f'Training MAE: {mae_train}')
print(f'Training RMSE: {rmse_train}')

# Metrics on the hold-out (test) data.
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f'Test MSE: {mse_test}')
print(f'Test MAE: {mae_test}')
print(f'Test RMSE: {rmse_test}')

Training MSE: 3.212434090664998
Training MAE: 1.065591928778898
Training RMSE: 1.7923264464558342
Test MSE: 3.3345334027187716
Test MAE: 1.0738247402755428
Test RMSE: 1.8260704813119266


In [None]:
# R-squared on the training and test predictions.
r2_train, r2_test = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)

print("R-squared for training data:", r2_train)
print("R-squared for test data:", r2_test)

R-squared for training data: 0.6159881955394693
R-squared for test data: 0.6011440044163154


# MAE Parameter

In [23]:
# LightGBM hyperparameters for the MAE run: the objective stays 'regression',
# the evaluation metric is MAE, and training is requested on the GPU device.
params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    num_leaves=100,
    learning_rate=0.05,
    feature_fraction=0.9,
    device='gpu',
)

In [24]:
# Wrap both splits as LightGBM datasets; the hold-out set points at the
# training set through `reference`.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [25]:
# Train for up to 100000 rounds, stopping after 3 rounds without improvement.
# NOTE(review): test_data drives early stopping and is also the set the later
# cells report test metrics on, so those metrics are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=100000,
    valid_sets=[test_data, train_data],
    callbacks=[
        # `verbose` on early_stopping is an on/off flag, not a logging interval.
        lgb.early_stopping(stopping_rounds=3, verbose=True),
        # Print the evaluation metrics every 100 boosting rounds.
        lgb.log_evaluation(period=100),
    ],
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 7717950, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 7 dense feature groups (58.88 MB) transferred to GPU in 0.291820 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 1.483498
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 1.07528	valid_0's l1: 1.08238


In [26]:
# Model evaluation: predict on both splits using the booster's best iteration.
best_round = model.best_iteration
y_pred_train = model.predict(X_train, num_iteration=best_round)
y_pred_test = model.predict(X_test, num_iteration=best_round)

In [27]:
# Metrics on the training data.
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mse_train)

print(f'Training MSE: {mse_train}')
print(f'Training MAE: {mae_train}')
print(f'Training RMSE: {rmse_train}')

# Metrics on the hold-out (test) data.
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

print(f'Test MSE: {mse_test}')
print(f'Test MAE: {mae_test}')
print(f'Test RMSE: {rmse_test}')

Training MSE: 3.280097429239653
Training MAE: 1.0752783974256157
Training RMSE: 1.8111039255767885
Test MSE: 3.3879617774559487
Test MAE: 1.0823811374408754
Test RMSE: 1.8406416754642791


In [28]:
# R-squared on the training and test predictions.
r2_train, r2_test = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)

print("R-squared for training data:", r2_train)
print("R-squared for test data:", r2_test)

R-squared for training data: 0.60789977410932
R-squared for test data: 0.594753236946168
