In [1]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

import joblib

import numpy as np
import pandas as pd

In [144]:
# Load data
work_dir = "/Users/ho/Documents/lld/"
data_path = "order_info_202307111047.csv"
data = pd.read_csv(work_dir + data_path)

In [145]:
df = data[['store_id', 'order_div', 'order_dt', 'menu_name', 'sale_price', 'quentity']]

# 취소된 주문 삭제
idx = df[df['order_div'] == '취소'].index
df = df.drop(idx)
df = df.drop('order_div',axis=1)

# menu_name 필요 없는 데이터 삭제
df = df[df.menu_name != '추가배달료 결제 감사합니다']
df = df[df.menu_name != '코카콜라']
df = df[df.menu_name != '사이다']

# menu_name 숫자형으로 변환
mapping = {}
for i, j in enumerate(df['menu_name'].unique()):
    mapping[j] = i
    
df.loc[:,'menu_name'] = df.loc[:,'menu_name'].map(mapping)
df['menu_name'] = df['menu_name'].astype(int)

# 월 별로 묶음
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
df['year'] = df['order_dt'].dt.year
df['month'] = df['order_dt'].dt.month
df['year'] = df['year'].astype(int)
df['month'] = df['month'].astype(int)
df = df.drop('order_dt', axis=1)

# 계절
df['season'] = df['month'].apply(lambda x: 1 if x in [3, 4, 5] else (2 if x in [6, 7, 8] else (3 if x in [9, 10, 11] else 4)))

# 합산
df = df.groupby(['store_id', 'menu_name', 'sale_price', 'year', 'month', 'season']).sum().reset_index()

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
store_id,762.0,13750880.0,184129.182574,13573234.0,13573234.0,13694034.0,13994257.0,13994257.0
menu_name,762.0,2.384514,2.08314,0.0,0.0,2.0,4.0,7.0
sale_price,762.0,33552.89,14085.475816,10900.0,23000.0,33000.0,40000.0,106000.0
year,762.0,2023.0,0.0,2023.0,2023.0,2023.0,2023.0,2023.0
month,762.0,3.492126,1.758947,1.0,2.0,3.0,5.0,7.0
season,762.0,2.208661,1.352383,1.0,1.0,2.0,4.0,4.0
quentity,762.0,10.57218,7.502827,2.0,6.0,8.0,12.0,80.0


In [146]:
df.columns

Index(['store_id', 'menu_name', 'sale_price', 'year', 'month', 'season',
       'quentity'],
      dtype='object')

In [155]:
file_path = '/Users/ho/Documents/lld/df.csv' 
df.to_csv(file_path, index=False) 

In [147]:
X = df.drop('quentity', axis=1)
y = df['quentity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [148]:
from pycaret.datasets import get_data
from pycaret.regression import *

# 데이터 로드
data = get_data('diamond')

# PyCaret 환경 설정
exp1 = setup(df, target='quentity', session_id=123)

# 회귀 모델 비교
best_model = compare_models()

# 최적의 회귀 모델 선택
final_model = create_model(best_model)

# 테스트 데이터셋 예측
predictions = predict_model(final_model, data=df)

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171


Unnamed: 0,Description,Value
0,Session id,123
1,Target,quentity
2,Target type,Regression
3,Original data shape,"(762, 7)"
4,Transformed data shape,"(762, 7)"
5,Transformed train set shape,"(533, 7)"
6,Transformed test set shape,"(229, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,4.2455,38.4962,6.1315,0.1283,0.4459,0.4514,0.057
lightgbm,Light Gradient Boosting Machine,4.2733,39.8049,6.2391,0.0888,0.459,0.4563,0.186
br,Bayesian Ridge,4.668,43.7611,6.5125,0.0304,0.4721,0.5132,0.063
lr,Linear Regression,4.6717,43.6973,6.5108,0.03,0.473,0.5125,0.205
ridge,Ridge Regression,4.6716,43.6968,6.5107,0.03,0.473,0.5125,0.043
lar,Least Angle Regression,4.6717,43.6973,6.5108,0.03,0.473,0.5125,0.045
en,Elastic Net,4.6745,43.9419,6.5256,0.0266,0.4732,0.5147,0.043
lasso,Lasso Regression,4.6979,44.3929,6.558,0.017,0.476,0.5182,0.052
llar,Lasso Least Angle Regression,4.6979,44.3929,6.558,0.017,0.476,0.5182,0.06
omp,Orthogonal Matching Pursuit,4.7422,45.734,6.6524,-0.0101,0.4796,0.517,0.057


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.7534,27.5335,5.2472,0.0008,0.4265,0.4286
1,3.7188,22.922,4.7877,0.3176,0.3954,0.4214
2,5.1654,59.552,7.717,0.1419,0.5143,0.5347
3,4.2927,41.5029,6.4423,-0.0265,0.4659,0.4563
4,4.4027,30.0,5.4772,0.1049,0.4493,0.5427
5,4.228,41.9992,6.4807,0.1724,0.4527,0.4053
6,4.2049,42.0538,6.4849,0.0335,0.4574,0.3823
7,3.8141,27.7271,5.2657,0.1943,0.4486,0.5254
8,4.5345,58.3095,7.6361,0.3016,0.4267,0.4149
9,4.3407,33.3618,5.776,0.0422,0.4219,0.4029


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,3.6761,33.6765,5.8031,0.401,0.3821,0.3754


In [153]:
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test).reshape(-1, 1)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# R2 (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 5
[LightGBM] [Info] Start training from score 10.592775
MAE: 4.577303546310921
MSE: 53.394428289919375
RMSE: 7.30714912191611
R2: -0.003822970596894182


In [149]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
import numpy as np

class DemandForecastingBlendingModel:
    def __init__(self):
        self.lgbm = LGBMRegressor()
        self.xgb = XGBRegressor()
        self.rf = RandomForestRegressor()
        self.knn = KNeighborsRegressor()
        self.linear_reg = LinearRegression()

    def fit(self, X_train, y_train):
        self.lgbm.fit(X_train, y_train)
        self.xgb.fit(X_train, y_train)
        self.rf.fit(X_train, y_train)
        self.knn.fit(X_train, y_train)
        
        lgbm_pred = self.lgbm.predict(X_train).reshape(-1, 1)
        xgb_pred = self.xgb.predict(X_train).reshape(-1, 1)
        rf_pred = self.rf.predict(X_train).reshape(-1, 1)
        knn_pred = self.knn.predict(X_train).reshape(-1, 1)
        
        X_second_stage = np.concatenate([lgbm_pred, xgb_pred, rf_pred, knn_pred], axis=1)
        
        self.linear_reg.fit(X_second_stage, y_train)

    def predict(self, X_test):
        lgbm_pred_test = self.lgbm.predict(X_test).reshape(-1, 1)
        xgb_pred_test = self.xgb.predict(X_test).reshape(-1, 1)
        rf_pred_test = self.rf.predict(X_test).reshape(-1, 1)
        knn_pred_test = self.knn.predict(X_test).reshape(-1, 1)

        X_second_stage_test = np.concatenate([lgbm_pred_test, xgb_pred_test, rf_pred_test, knn_pred_test], axis=1)

        return self.linear_reg.predict(X_second_stage_test)

In [150]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, mean_absolute_percentage_error

my_model = DemandForecastingBlendingModel()
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# R2 (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

# MAPE (Mean Absolute Percentage Error)
def mean_absolute_percentage_error(y_test, y_pred):
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE:", mape)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 5
[LightGBM] [Info] Start training from score 10.592775
MAE: 4.664842051061055
MSE: 56.5724515339533
RMSE: 7.521466049511445
R2: -0.06357026700262036
MAPE: 51.198302365650086


In [151]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
import numpy as np

class DemandForecastingEnsemblingModel:
    def __init__(self):
        self.lgbm = LGBMRegressor()
        self.xgb = XGBRegressor()
        self.rf = RandomForestRegressor()
        self.knn = KNeighborsRegressor()
        self.mlp = MLPRegressor()
        self.voting_regressor = VotingRegressor([('lgbm', self.lgbm), ('xgb', self.xgb), ('rf', self.rf), ('knn', self.knn)])
        
    def fit(self, X_train, y_train):
        self.voting_regressor.fit(X_train, y_train)
        
    def predict(self, X_test):
        return self.voting_regressor.predict(X_test)

In [152]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, mean_absolute_percentage_error

my_model = DemandForecastingEnsemblingModel()
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# R2 (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

# MAPE (Mean Absolute Percentage Error)
def mean_absolute_percentage_error(y_test, y_pred):
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE:", mape)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 5
[LightGBM] [Info] Start training from score 10.592775
MAE: 4.219674795353407
MSE: 47.411966869470106
RMSE: 6.885634819642275
R2: 0.10864816893753704
MAPE: 45.2694825939059


In [159]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.neighbors import KNeighborsRegressor

class DemandForecastingModel:
    def __init__(self):
        self.models = [
            ('RandomForest', RandomForestRegressor()),
            ('XGBoost', XGBRegressor()),
            ('Gradient Boosting', GradientBoostingRegressor()),
            ('LightGBM', LGBMRegressor()),
            ('Huber Regressor', HuberRegressor()),
            ('K Neighbors Regressor', KNeighborsRegressor()),
            ('Linear Regression', LinearRegression())
        ]
        self.model_predictions = []
        self.second_model = LinearRegression()

    def fit(self, X_train, y_train):
        for name, model in self.models:
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            self.model_predictions.append(y_pred)
            
        self.second_data = pd.DataFrame({name: pred for name, pred in zip([name for name, _ in self.models], self.model_predictions)})
        self.second_model.fit(self.second_data, y_test)

    def predict(self, X_test):
        return self.second_model.predict(self.second_data)

In [160]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, mean_absolute_percentage_error

my_model = DemandForecastingModel()
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# R2 (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

# MAPE (Mean Absolute Percentage Error)
def mean_absolute_percentage_error(y_test, y_pred):
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE:", mape)

Training RandomForest...
Training XGBoost...
Training Gradient Boosting...
Training LightGBM...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 5
[LightGBM] [Info] Start training from score 10.592775
Training Huber Regressor...
Training K Neighbors Regressor...
Training Linear Regression...
MAE: 4.065300053439353
MSE: 41.47816032616671
RMSE: 6.440354052858174
R2: 0.22020458974803314
MAPE: 45.057576060430975
