In [39]:
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [40]:
# 데이터 로드
df = pd.read_csv('./휴일전날_비_추가_train.csv', index_col=0)
lunch_menu = pd.read_csv('../yk/data/lunch_train.csv')
dinner_menu = pd.read_csv('../yk/data/dinner_train.csv')

In [41]:
# train 데이터 세팅
df = df.drop(columns='일자')
df = df.drop(columns='조식메뉴')
df = df.drop(columns='중식메뉴')
df = df.drop(columns='석식메뉴')

onehot_tmp = pd.get_dummies(df['요일'])
df = pd.concat([df.drop(columns='요일'), onehot_tmp], axis=1)

onehot_tmp = pd.get_dummies(df['휴일전날'])
df = pd.concat([df.drop(columns='휴일전날'), onehot_tmp], axis=1)

In [42]:
# 데이터 분리
lunch_X = pd.concat([df, lunch_menu], axis=1).drop(columns=['중식계', '석식계', '중식메뉴'])
lunch_y = df['중식계']
dinner_X = pd.concat([df, dinner_menu], axis=1).drop(columns=['중식계', '석식계', '석식메뉴'])
dinner_y = df['석식계']

In [43]:
lunch_X.columns

Index(['본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', '점심비',
       '저녁비', '금', '목', '수', '월', '화', '평일', '휴일전날', '샐러드류', '양념 및 장류', '나물류',
       '소스류', '두류', '찜류', '곡류', '죽류', '국탕류', '볶음류', '유제품류', '김치류', '밥류', '묵류',
       '만두류', '견과류', '비빔밥볶음밥류', '음료류', '회류', '부침류', '덥밥국밥류', '빵과자류', '채소류',
       '면류', '육류', '난류', '어패류', '디저트류', '무침류', '찌개류', '해조류', '과일류', '전류',
       '조림류', '튀김류', '장아찌류', '구이류', '스프류', '덮밥국밥류', '떡류'],
      dtype='object')

In [44]:
svc = SVC()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
ad = AdaBoostRegressor()
gb = GradientBoostingRegressor()
lgbm = LGBMRegressor()
xg = XGBRegressor()

In [45]:
scaler = MinMaxScaler()
models = [svc, ridge, lasso, rf, ad, gb, lgbm, xg]
kfold = KFold(n_splits=3, shuffle=True, random_state=23)

In [46]:
# lunch_X_train, lunch_X_test, lunch_y_train, lunch_y_test = model_selection.train_test_split(lunch_X, lunch_y, test_size=0.2)

In [47]:
for model in models:
    pipe = Pipeline([('scaler', scaler),
                 ('model', model)])
    param_grid = {'model__random_state' : [23]}
    grid_model = GridSearchCV(estimator=pipe,
                             param_grid=param_grid,
                             scoring = 'neg_mean_absolute_error',
                             cv=kfold,
                             iid=True,
                             n_jobs=-1).fit(lunch_X, lunch_y)
    print(model)
    print('교차검증 점수 : ', grid_model.best_score_)
    print('------------------------------------------')

SVC()
교차검증 점수 :  -125.4896265560166
------------------------------------------
Ridge()
교차검증 점수 :  -78.75999513478264
------------------------------------------
Lasso()
교차검증 점수 :  -78.28354623080679
------------------------------------------
RandomForestRegressor()
교차검증 점수 :  -74.72393360995851
------------------------------------------
AdaBoostRegressor()
교차검증 점수 :  -83.17446205231025
------------------------------------------
GradientBoostingRegressor()
교차검증 점수 :  -71.45852536892593
------------------------------------------
LGBMRegressor()
교차검증 점수 :  -72.29699406516667
------------------------------------------
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n

In [48]:
# dinner_X_train, dinner_X_test, dinner_y_train, dinner_y_test = model_selection.train_test_split(dinner_X, dinner_y, test_size=0.2)

In [49]:
for model in models:
    pipe = Pipeline([('scaler', scaler),
                 ('model', model)])
    param_grid = {'model__random_state' : [23]}
    grid_model = GridSearchCV(estimator=pipe,
                             param_grid=param_grid,
                             scoring = 'neg_mean_absolute_error',
                             cv=kfold,
                             iid=True,
                             n_jobs=-1).fit(dinner_X, dinner_y)
    print(model)
    print('교차검증 점수 : ', grid_model.best_score_)
    print('------------------------------------------')

SVC()
교차검증 점수 :  -79.60082987551867
------------------------------------------
Ridge()
교차검증 점수 :  -61.12693855201674
------------------------------------------
Lasso()
교차검증 점수 :  -67.10643479332661
------------------------------------------
RandomForestRegressor()
교차검증 점수 :  -52.40608298755187
------------------------------------------
AdaBoostRegressor()
교차검증 점수 :  -62.63314318392472
------------------------------------------
GradientBoostingRegressor()
교차검증 점수 :  -53.637428226733384
------------------------------------------
LGBMRegressor()
교차검증 점수 :  -53.10888691048014
------------------------------------------
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             

In [50]:
lgbm = LGBMRegressor()
kfold = KFold(n_splits=3, shuffle=True, random_state=23)

pipe = Pipeline([('scaler', scaler),
                 ('model', lgbm)])
param_grid = {'model__random_state' : [23]}
grid_model_1 = GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring = 'neg_mean_absolute_error',
                         cv=kfold,
                         iid=True,
                         n_jobs=-1).fit(lunch_X, lunch_y)
print('교차검증 점수 : ', grid_model_1.best_score_)

교차검증 점수 :  -72.29699406516667


In [51]:
grid_model_1.cv_results_

{'mean_fit_time': array([0.09065795]),
 'std_fit_time': array([0.03729882]),
 'mean_score_time': array([0.00484498]),
 'std_score_time': array([0.00053974]),
 'param_model__random_state': masked_array(data=[23],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'model__random_state': 23}],
 'split0_test_score': array([-73.22642388]),
 'split1_test_score': array([-73.0901709]),
 'split2_test_score': array([-70.57009163]),
 'mean_test_score': array([-72.29699407]),
 'std_test_score': array([1.2208536]),
 'rank_test_score': array([1], dtype=int32)}

In [52]:
lgbm = LGBMRegressor()
pipe = Pipeline([('scaler', scaler),
                 ('model', lgbm)])
param_grid = {'model__random_state' : [23]}
grid_model_2 = GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring = 'neg_mean_absolute_error',
                         cv=kfold,
                         iid=False,
                         n_jobs=-1).fit(dinner_X, dinner_y)
print('교차검증 점수 : ', grid_model_2.best_score_)

교차검증 점수 :  -53.10881505589496


In [53]:
grid_model_2.cv_results_

{'mean_fit_time': array([0.08164303]),
 'std_fit_time': array([0.05355871]),
 'mean_score_time': array([0.00529099]),
 'std_score_time': array([0.00115711]),
 'param_model__random_state': masked_array(data=[23],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'model__random_state': 23}],
 'split0_test_score': array([-55.16418404]),
 'split1_test_score': array([-51.14003085]),
 'split2_test_score': array([-53.02223028]),
 'mean_test_score': array([-53.10881506]),
 'std_test_score': array([1.6439941]),
 'rank_test_score': array([1], dtype=int32)}

### 메뉴 비율 반영

In [54]:
# 데이터 로드
df = pd.read_csv('./휴일전날_비_추가_train.csv', index_col=0)
lunch_menu = pd.read_csv('../yk/data/lunch_train.csv')
dinner_menu = pd.read_csv('../yk/data/dinner_train.csv')

In [55]:
# train 데이터 세팅
df = df.drop(columns='일자')
df = df.drop(columns='조식메뉴')
df = df.drop(columns='중식메뉴')
df = df.drop(columns='석식메뉴')

onehot_tmp = pd.get_dummies(df['요일'])
df = pd.concat([df.drop(columns='요일'), onehot_tmp], axis=1)

onehot_tmp = pd.get_dummies(df['휴일전날'])
df = pd.concat([df.drop(columns='휴일전날'), onehot_tmp], axis=1)

In [56]:
lunch_menu = lunch_menu.drop(columns=['중식메뉴']).div(lunch_menu.sum(axis=1),axis=0)
dinner_menu = dinner_menu.drop(columns=['석식메뉴']).div(dinner_menu.sum(axis=1),axis=0)

In [57]:
# 데이터 분리
lunch_X = pd.concat([df, lunch_menu], axis=1).drop(columns=['중식계', '석식계'])
lunch_y = df['중식계']
dinner_X = pd.concat([df, dinner_menu], axis=1).drop(columns=['중식계', '석식계'])
dinner_y = df['석식계']

In [58]:
lunch_X

Unnamed: 0,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,점심비,저녁비,금,목,수,...,해조류,과일류,전류,조림류,튀김류,장아찌류,구이류,스프류,덮밥국밥류,떡류
0,2601,50,150,238,0.0,0.0,0.0,0,0,0,...,0.000000,0.00,0.0000,0.000000,0.000000,0.0,0.062500,0.0,0.000000,0.0
1,2601,50,173,319,0.0,0.0,0.0,0,0,0,...,0.000000,0.00,0.0000,0.000000,0.062500,0.0,0.062500,0.0,0.000000,0.0
2,2601,56,180,111,0.0,0.0,0.0,0,0,1,...,0.000000,0.00,0.0000,0.071429,0.071429,0.0,0.000000,0.0,0.071429,0.0
3,2601,104,220,355,0.0,0.0,0.0,0,1,0,...,0.000000,0.00,0.0625,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
4,2601,278,181,34,0.0,0.0,0.0,1,0,0,...,0.000000,0.00,0.0000,0.000000,0.062500,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2983,75,198,4,391.0,0.0,0.0,0,0,1,...,0.000000,0.00,0.0000,0.000000,0.000000,0.0,0.050000,0.0,0.000000,0.0
1201,2983,92,231,462,351.0,0.0,0.0,0,1,0,...,0.000000,0.05,0.0500,0.050000,0.000000,0.0,0.000000,0.0,0.000000,0.0
1202,2983,255,248,1,303.0,0.0,0.0,1,0,0,...,0.050000,0.05,0.0000,0.000000,0.050000,0.0,0.000000,0.0,0.000000,0.0
1203,2983,107,153,616,327.0,0.0,0.0,0,0,0,...,0.080000,0.04,0.0000,0.000000,0.040000,0.0,0.040000,0.0,0.000000,0.0


In [59]:
lgbm = LGBMRegressor()
kfold = KFold(n_splits=3, shuffle=True, random_state=23)

pipe = Pipeline([('scaler', scaler),
                 ('model', lgbm)])
param_grid = {'model__random_state' : [23]}
grid_model_1 = GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring = 'neg_mean_absolute_error',
                         cv=kfold,
                         iid=True,
                         n_jobs=-1).fit(lunch_X, lunch_y)
print('교차검증 점수 : ', grid_model_1.best_score_)

교차검증 점수 :  -74.22247137154748


In [60]:
grid_model_1.cv_results_

{'mean_fit_time': array([0.08962838]),
 'std_fit_time': array([0.01188916]),
 'mean_score_time': array([0.00468818]),
 'std_score_time': array([0.00039931]),
 'param_model__random_state': masked_array(data=[23],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'model__random_state': 23}],
 'split0_test_score': array([-74.51132118]),
 'split1_test_score': array([-75.53397979]),
 'split2_test_score': array([-72.61812223]),
 'mean_test_score': array([-74.22247137]),
 'std_test_score': array([1.20756633]),
 'rank_test_score': array([1], dtype=int32)}

In [61]:
lgbm = LGBMRegressor()
pipe = Pipeline([('scaler', scaler),
                 ('model', lgbm)])
param_grid = {'model__random_state' : [23]}
grid_model_2 = GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring = 'neg_mean_absolute_error',
                         cv=kfold,
                         iid=False,
                         n_jobs=-1).fit(dinner_X, dinner_y)
print('교차검증 점수 : ', grid_model_2.best_score_)

교차검증 점수 :  -52.311447505580084


In [62]:
grid_model_2.cv_results_

{'mean_fit_time': array([0.11151401]),
 'std_fit_time': array([0.02322795]),
 'mean_score_time': array([0.00522248]),
 'std_score_time': array([6.98519873e-05]),
 'param_model__random_state': masked_array(data=[23],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'model__random_state': 23}],
 'split0_test_score': array([-52.7139128]),
 'split1_test_score': array([-51.28083403]),
 'split2_test_score': array([-52.93959569]),
 'mean_test_score': array([-52.31144751]),
 'std_test_score': array([0.73455488]),
 'rank_test_score': array([1], dtype=int32)}

In [20]:
pipe = Pipeline([('scaler', scaler),
                 ('model', lgbm)])

scores = cross_val_score(lgbm, dinner_X, dinner_y, cv=3, scoring='neg_mean_absolute_error')

In [21]:
scores

array([-64.74644166, -63.96315486, -93.30034348])

In [24]:
# 데이터 로드
df = pd.read_csv('./휴일전날_비_추가_test.csv', index_col=0)
lunch_menu = pd.read_csv('../yk/data/lunch_test.csv')
dinner_menu = pd.read_csv('../yk/data/dinner_test.csv')

# test 데이터 세팅
df = df.drop(columns='일자')
df = df.drop(columns='조식메뉴')
df = df.drop(columns='중식메뉴')
df = df.drop(columns='석식메뉴')

onehot_tmp = pd.get_dummies(df['요일'])
df = pd.concat([df.drop(columns='요일'), onehot_tmp], axis=1)

onehot_tmp = pd.get_dummies(df['휴일전날'])
df = pd.concat([df.drop(columns='휴일전날'), onehot_tmp], axis=1)

lunch_X_test_sub = pd.concat([df, lunch_menu], axis=1).drop(columns=['중식메뉴'])
# lunch_y = df['중식계']
dinner_X_test_sub = pd.concat([df, dinner_menu], axis=1).drop(columns=['석식메뉴'])
# dinner_y = df['석식계']
lunch_X_test_sub = lunch_X_test_sub[lunch_X.columns]
dinner_X_test_sub = dinner_X_test_sub[dinner_X.columns]

In [25]:
lunch_X_test_sub.columns == lunch_X.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [27]:
lunch_X

Unnamed: 0,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,점심비,저녁비,금,목,수,...,해조류,과일류,전류,조림류,튀김류,장아찌류,구이류,스프류,덮밥국밥류,떡류
0,2601,50,150,238,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2601,50,173,319,0.0,0.0,0.0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,2601,56,180,111,0.0,0.0,0.0,0,0,1,...,0,0,0,1,1,0,0,0,1,0
3,2601,104,220,355,0.0,0.0,0.0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,2601,278,181,34,0.0,0.0,0.0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2983,75,198,4,391.0,0.0,0.0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1201,2983,92,231,462,351.0,0.0,0.0,0,1,0,...,0,1,1,1,0,0,0,0,0,0
1202,2983,255,248,1,303.0,0.0,0.0,1,0,0,...,1,1,0,0,1,0,0,0,0,0
1203,2983,107,153,616,327.0,0.0,0.0,0,0,0,...,2,1,0,0,1,0,1,0,0,0


In [26]:
dinner_X_test_sub.columns == dinner_X.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [86]:
# lgbm_1.fit(lunch_X, lunch_y)
# lgbm_2.fit(dinner_X, dinner_y)
pred1 = grid_model_1.predict(lunch_X_test_sub)
pred2 = grid_model_2.predict(dinner_X_test_sub)

In [87]:
pred1

array([1058.4055526 ,  946.37264853,  531.6069045 , 1165.20365109,
        979.46495831,  973.06141005,  932.60286924,  622.88091326,
       1253.13505854,  999.89636052,  671.90865574, 1237.11977376,
       1040.81830703, 1039.92966651,  842.70527916,  609.40959683,
       1177.29394034,  945.23775272,  770.98818086,  769.2145686 ,
        539.01987771, 1129.80327596, 1043.45505378,  929.3394806 ,
        618.04586876, 1251.74130062, 1122.00555504, 1078.02368727,
        912.48343192,  684.99353799, 1264.98809602,  982.26825192,
       1000.69306306,  892.22444045,  563.82558619, 1263.58210697,
       1001.18117614,  841.81471408,  768.77694906,  510.76968239,
       1175.58844204,  996.08608522,  882.1524076 ,  760.21249044,
        515.66938169, 1222.37074921, 1023.34437489,  960.10354009,
        839.88994457,  563.1492482 ])

In [27]:
pred2

array([414.38572915, 419.97760352, 267.50303077, 521.48744328,
       391.33408209, 430.11134278, 461.7012528 , 383.1091044 ,
       575.1070354 , 493.70668473, 258.12463567, 741.75231281,
       633.51887539,  91.82756365, 457.79955403, 376.93582622,
       628.24184161, 594.30288163, 384.69993877, 422.80274549,
       243.62305989, 638.51891006, 450.36682661, 543.17252013,
       383.87131416, 613.37282453, 601.67295368, 440.45829718,
       448.80648858, 250.80530755, 690.3333161 , 523.39063939,
       444.90772422, 450.74147237,  36.40470113, 547.63598545,
       515.49706289, 384.98921531, 396.38238028, 263.32747161,
       550.53683497, 476.56197145, 414.10277646, 388.00034016,
       231.58936621, 550.60968955, 531.34201125, 429.91434282,
       384.10907398, 242.53970746])

In [20]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['중식계'] = pred1
submission['석식계'] = pred2

In [21]:
submission.to_csv('lgbm_0716.csv', index=False)