In [180]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

import math
import numpy as np
import pandas as pd

In [123]:
# Load data
work_dir = "/Users/ho/Documents/lld/"
data_path = "order_info_202307111047.csv"
data = pd.read_csv(work_dir + data_path)

In [130]:
df = data[['store_id', 'order_div', 'order_dt', 'menu_name', 'sale_price', 'quentity']]

# 취소된 주문 삭제
idx = df[df['order_div'] == '취소'].index
df = df.drop(idx)
df = df.drop('order_div',axis=1)

# menu_name 필요 없는 데이터 삭제
df = df[df.menu_name != '추가배달료 결제 감사합니다']
df = df[df.menu_name != '코카콜라']
df = df[df.menu_name != '사이다']

# menu_name 숫자형으로 변환
mapping = {}
for i, j in enumerate(df['menu_name'].unique()):
    mapping[j] = i
    
df.loc[:,'menu_name'] = df.loc[:,'menu_name'].map(mapping)
df['menu_name'] = df['menu_name'].astype(int)


# 합산
df = df.groupby(['store_id', 'menu_name', 'sale_price', 'order_dt']).sum().reset_index()


# 월 별로 묶음
df['dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
df['year'] = df['dt'].dt.year
df['month'] = df['dt'].dt.month
df['year'] = df['year'].astype(int)
df['month'] = df['month'].astype(int)
df = df.drop('dt',axis=1)

# 계절 추가
spr_idx = df.loc[(df.month == 3) | (df.month == 4) | (df.month == 5)].index
sum_idx = df.loc[(df.month == 6) | (df.month == 7) | (df.month == 8)].index
aut_idx = df.loc[(df.month == 9) | (df.month == 10) | (df.month == 11)].index
win_idx = df.loc[(df.month == 12) | (df.month == 1) | (df.month == 2)].index
df.loc[spr_idx,'season'] = 0
df.loc[sum_idx,'season'] = 1
df.loc[aut_idx,'season'] = 2
df.loc[win_idx,'season'] = 3
df['season'] = df['season'].astype(int)


# Min value
f_min = lambda x: x.rolling(window=3, min_periods=1).min()
# Max value
f_max = lambda x: x.rolling(window=3, min_periods=1).max()
# Mean value
f_mean = lambda x: x.rolling(window=3, min_periods=1).mean()
# Standard deviation
f_std = lambda x: x.rolling(window=3, min_periods=1).std()

function_list = [f_min, f_max, f_mean, f_std]
function_name = ['min', 'max', 'mean', 'std']

for i in range(len(function_list)):
    df[('quentity_%s' % function_name[i])] = df.groupby(['store_id', 'menu_name'])['quentity'].apply(function_list[i]).reset_index(drop=True)

# Fill the empty std features with 0
df['quentity_std'].fillna(0, inplace=True)

df.to_csv(work_dir + "df.csv", mode='w')
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
store_id,1286.0,13736580.0,183763.360929,13573234.0,13573234.0,13694030.0,13994260.0,13994260.0
menu_name,1286.0,2.294712,2.010731,0.0,0.0,2.0,4.0,7.0
sale_price,1286.0,30280.48,13333.360638,10900.0,20000.0,31000.0,36000.0,106000.0
order_dt,1286.0,20230370.0,175.050136,20230101.0,20230212.0,20230330.0,20230520.0,20230700.0
quentity,1286.0,6.264386,2.460285,2.0,5.0,6.0,7.0,22.0
year,1286.0,2023.0,0.0,2023.0,2023.0,2023.0,2023.0,2023.0
month,1286.0,3.51944,1.750823,1.0,2.0,3.0,5.0,7.0
season,1286.0,1.192068,1.349322,0.0,0.0,1.0,3.0,3.0
quentity_min,1286.0,5.31182,1.57657,2.0,4.0,5.0,6.0,12.0
quentity_max,1286.0,7.419907,3.098293,2.0,5.0,7.0,9.0,22.0


In [132]:
X = df.drop('quentity', axis=1)
y = df['quentity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the individual regression models
lgbm = LGBMRegressor(max_depth=8, 
                     n_estimators=500, 
                     min_child_weight=1000,  
                     colsample_bytree=0.7, 
                     subsample=0.7, 
                     eta=0.3, 
                     seed=0, 
                     force_col_wise=True)  # Set force_col_wise=True to remove the overhead warning
xgb = XGBRegressor(max_depth=8, 
                   n_estimators=500, 
                   min_child_weight=1000,  
                   colsample_bytree=0.7, 
                   subsample=0.7, 
                   eta=0.3, 
                   seed=0)
rf = RandomForestRegressor(n_estimators=50, 
                           max_depth=7, 
                           random_state=0, 
                           n_jobs=-1)
knn = KNeighborsRegressor()
mlp = MLPRegressor()

# Fit the individual models on the training data
lgbm.fit(X_train, y_train)
xgb.fit(X_train, y_train)
rf.fit(X_train, y_train)
knn.fit(X_train, y_train)
mlp.fit(X_train, y_train)

# Get the predictions from the individual models
lgbm_pred = lgbm.predict(X_train).reshape(-1, 1)
xgb_pred = xgb.predict(X_train).reshape(-1, 1)
rf_pred = rf.predict(X_train).reshape(-1, 1)
knn_pred = knn.predict(X_train).reshape(-1, 1)
mlp_pred = mlp.predict(X_train).reshape(-1, 1)

[LightGBM] [Info] Total Bins 509
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 10
[LightGBM] [Info] Start training from score 6.211089






In [133]:
# Create the second-stage Linear Regression model
linear_reg = LinearRegression(n_jobs=-1)

# Concatenate the predictions from the individual models as input features
X_second_stage = np.concatenate([lgbm_pred, xgb_pred, rf_pred, knn_pred, mlp_pred], axis=1)

# Fit the second-stage model on the concatenated predictions
linear_reg.fit(X_second_stage, y_train)

# Get the predictions from the individual models on the test data
lgbm_pred_test = lgbm.predict(X_test).reshape(-1, 1)
xgb_pred_test = xgb.predict(X_test).reshape(-1, 1)
rf_pred_test = rf.predict(X_test).reshape(-1, 1)
knn_pred_test = knn.predict(X_test).reshape(-1, 1)
mlp_pred_test = mlp.predict(X_test).reshape(-1, 1)

# Concatenate the predictions from the individual models as input features for the test data
X_second_stage_test = np.concatenate([lgbm_pred_test, xgb_pred_test, rf_pred_test, knn_pred_test, mlp_pred_test], axis=1)

# Get the final predictions from the second-stage model
y_pred = linear_reg.predict(X_second_stage_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: ", rmse)

Root Mean Squared Error:  1.6299466838294392


In [197]:
result = {'y_test':y_test,'y_pred':y_pred}
result = pd.DataFrame(result).reset_index(drop=True)

# 예측값에 올림 적용
result['y_pred'] = result['y_pred'].apply(math.ceil)

result.to_csv(work_dir + "result.csv", mode='w')

In [198]:
# 결과와 50% 이상 틀린 비율
result['error'] = abs(result['y_test'] - result['y_pred']) / result['y_test']
print(result[result['error'] > 0.5]['error'].count()/ len(result))

0.05813953488372093
