In [1]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

import numpy as np
import pandas as pd

In [13]:
# Load data
work_dir = "/Users/ho/Documents/lld/"
data_path = "order_info_202307111047.csv"
data = pd.read_csv(work_dir + data_path)

In [12]:
df = data[['store_id', 'order_div', 'order_dt', 'menu_name', 'sale_price', 'quentity']]

# 취소된 주문 삭제
idx = df[df['order_div'] == '취소'].index
df = df.drop(idx)
df = df.drop('order_div',axis=1)

# menu_name 필요 없는 데이터 삭제
df = df[df.menu_name != '추가배달료 결제 감사합니다']
df = df[df.menu_name != '코카콜라']
df = df[df.menu_name != '사이다']

# menu_name 숫자형으로 변환
mapping = {}
for i, j in enumerate(df['menu_name'].unique()):
    mapping[j] = i
    
df.loc[:,'menu_name'] = df.loc[:,'menu_name'].map(mapping)
df['menu_name'] = df['menu_name'].astype(int)

# 월 별로 묶음
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
df['year'] = df['order_dt'].dt.year
df['month'] = df['order_dt'].dt.month
df['year'] = df['year'].astype(int)
df['month'] = df['month'].astype(int)
df = df.drop('order_dt', axis=1)

# 합산
df = df.groupby(['store_id', 'menu_name', 'sale_price', 'year', 'month']).sum().reset_index()

# Min value
f_min = lambda x: x.rolling(window=3, min_periods=1).min()
# Max value
f_max = lambda x: x.rolling(window=3, min_periods=1).max()
# Mean value
f_mean = lambda x: x.rolling(window=3, min_periods=1).mean()
# Standard deviation
f_std = lambda x: x.rolling(window=3, min_periods=1).std()

function_list = [f_min, f_max, f_mean, f_std]
function_name = ['min', 'max', 'mean', 'std']

for i in range(len(function_list)):
    df[('quentity_%s' % function_name[i])] = df.groupby(['store_id', 'menu_name'])['quentity'].apply(function_list[i]).reset_index(drop=True)

# Fill the empty std features with 0
df['quentity_std'].fillna(0, inplace=True)

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
store_id,762.0,13750880.0,184129.182574,13573234.0,13573234.0,13694030.0,13994260.0,13994260.0
menu_name,762.0,2.384514,2.08314,0.0,0.0,2.0,4.0,7.0
sale_price,762.0,33552.89,14085.475816,10900.0,23000.0,33000.0,40000.0,106000.0
year,762.0,2023.0,0.0,2023.0,2023.0,2023.0,2023.0,2023.0
month,762.0,3.492126,1.758947,1.0,2.0,3.0,5.0,7.0
quentity,762.0,10.57218,7.502827,2.0,6.0,8.0,12.0,80.0
quentity_min,762.0,7.30315,4.09977,2.0,5.0,6.0,8.0,50.0
quentity_max,762.0,14.83071,9.688524,4.0,8.0,12.0,18.0,80.0
quentity_mean,762.0,10.63801,5.787863,3.0,7.0,9.0,12.0,60.66667
quentity_std,762.0,4.078617,4.626223,0.0,1.0,2.857589,5.507571,41.42865


In [5]:
X = df.drop('quentity', axis=1)
y = df['quentity']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the individual regression models
lgbm = LGBMRegressor()
xgb = XGBRegressor()
rf = RandomForestRegressor()
knn = KNeighborsRegressor()
mlp = MLPRegressor()

# Create the voting regressor ensemble
voting_regressor = VotingRegressor([('lgbm', lgbm), ('xgb', xgb), ('rf', rf), ('knn', knn), ('mlp', mlp)])

# Fit the ensemble model on the training data
voting_regressor.fit(X_train, y_train)

# Make predictions using the ensemble model
y_pred = voting_regressor.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: ", rmse)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 8
[LightGBM] [Info] Start training from score 10.592775
Root Mean Squared Error:  35.94785036732912


