# 구내식당 식수 인원 예측 AI
- 데이콘 혼자해보기2
- 시각화도구(matplotlib, seaborn, plotly)
- 데이터분석도구(pandas, numpy)
- 머신러닝 도구(sklearn, LightGBM, Grid Search, k-fold 교차검증)
- LightGBM으로 했을때 71점 나왔습니다 ㅎㅎ

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
import lightgbm as lgb

import os

# Apply the seaborn theme BEFORE choosing the font: sns.set() rewrites
# matplotlib's rcParams (font.family included), so calling it afterwards
# would discard the Korean font selected below.
sns.set(style="whitegrid", font_scale=1)

# Korean-capable font so Hangul column names render in plots.
font_path = "C:/Windows/Fonts/HANDotum.TTF"
if os.path.exists(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    # The path only exists on Windows machines with this font installed;
    # keep the notebook runnable elsewhere instead of failing here.
    font = None
    print("Font file not found: " + font_path + " - Hangul labels may not render.")

import missingno as msno

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



- 초기설정하기

## 프로세스
- 1.데이터셋 확인
- 2.탐색적 데이터 분석
- 3.feature engineering
- 4.model 만들기
- 5.모델 학습 및 예측 
- 6.모델 평가

## 1.데이터셋 확인

In [None]:
def _add_derived_columns(df):
    """Add calendar and attendance columns to *df* in place and return it.

    The frame must contain the raw columns '일자', '요일', '본사정원수',
    '본사휴가자수', '본사출장자수' and '현본사소속재택근무자수'.
    New columns are appended in the order '년', '월', '일', '주', '출근'.
    """
    df['일자'] = pd.to_datetime(df['일자'])
    dates = df['일자']

    df['년'] = dates.dt.year
    df['월'] = dates.dt.month
    df['일'] = dates.dt.day
    # ISO week number taken from datetime.isocalendar(): Series.dt.week is
    # deprecated and no longer available in newer pandas releases, while this
    # form yields the same ISO week on old and new versions alike.
    df['주'] = dates.map(lambda day: day.isocalendar()[1])

    # Weekday names (Mon-Fri in Korean) become ordinal codes 1-5.
    df['요일'] = df['요일'].map({'월': 1, '화': 2, '수': 3, '목': 4, '금': 5})
    df['현본사소속재택근무자수'] = df['현본사소속재택근무자수'].astype('int')

    # Head count expected on site: total staff minus everyone who is on
    # leave, on a business trip, or working from home.
    absent = df['본사휴가자수'] + df['본사출장자수'] + df['현본사소속재택근무자수']
    df['출근'] = df['본사정원수'] - absent
    return df


# Train and test go through the identical transformation so their feature
# columns cannot drift apart.
train = _add_derived_columns(pd.read_csv('train.csv'))
test = _add_derived_columns(pd.read_csv('test.csv'))
submission = pd.read_csv('sample_submission.csv')

# The two meal-count targets exist only in the training data.
train[['중식계', '석식계']] = train[['중식계', '석식계']].astype('int')




- smartnavy님의 베이스라인 코드 공유 글에서 인용했습니다.

- 데이터불러오기

In [None]:
# Preview the first rows of the training frame, including the derived columns.
train.head()

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계,년,월,일,주,출근
0,2016-02-01,1,2601,50,150,238,0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039,331,2016,2,1,5,2401
1,2016-02-02,2,2601,50,173,319,0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867,560,2016,2,2,5,2378
2,2016-02-03,3,2601,56,180,111,0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",1017,573,2016,2,3,5,2365
3,2016-02-04,4,2601,104,220,355,0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",978,525,2016,2,4,5,2277
4,2016-02-05,5,2601,278,181,34,0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",925,330,2016,2,5,5,2142


- 데이터 확인

In [None]:
# Preview the first rows of the test frame (same columns minus the two targets).
test.head()

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,년,월,일,주,출근
0,2021-01-27,3,2983,88,182,5,358,모닝롤/연유버터베이글 우유/주스 계란후라이/찐계란 단호박죽/흑미밥 우거지국 고기완자...,쌀밥/흑미밥/찰현미밥 대구지리 매운돈갈비찜 오꼬노미계란말이 상추무침 포기김치 양상추...,흑미밥 얼큰순두부찌개 쇠고기우엉볶음 버섯햄볶음 (New)아삭이고추무절임 포기김치,2021,1,27,4,2355
1,2021-01-28,4,2983,104,212,409,348,모닝롤/대만샌드위치 우유/주스 계란후라이/찐계란 누룽지탕/흑미밥 황태국 시래기지짐 ...,쌀밥/보리밥/찰현미밥 우렁된장찌개 오리주물럭 청양부추전 수제삼색무쌈 겉절이김치 양상...,충무김밥 우동국물 오징어무침 꽃맛살샐러드 얼갈이쌈장무침 석박지,2021,1,28,4,2319
2,2021-01-29,5,2983,270,249,0,294,모닝롤/핫케익 우유/주스 계란후라이/찐계란 오곡죽/흑미밥 매생이굴국 고구마순볶음 양...,쌀밥/흑미밥/찰현미밥 팽이장국 수제돈까스*소스 가자미조림 동초나물무침 포기김치 양상...,흑미밥 물만둣국 카레찜닭 숯불양념꼬지어묵 꼬시래기무침 포기김치,2021,1,29,4,2170
3,2021-02-01,1,2924,108,154,538,322,모닝롤/촉촉한치즈케익 우유/주스 계란후라이/찐계란 누룽지탕/흑미밥 두부김칫국 새우완...,쌀밥/흑미밥/찰현미밥 배추들깨국 오리대패불고기 시금치프리타타 부추고추장무침 포기김치...,흑미밥 동태탕 돈육꽈리고추장조림 당면채소무침 모자반무침 포기김치,2021,2,1,5,2340
4,2021-02-02,2,2924,62,186,455,314,모닝롤/토마토샌드 우유/주스 계란후라이/찐계란 채소죽/흑미밥 호박맑은국 오이생채 양...,쌀밥/팥밥/찰현미밥 부대찌개 닭살데리야끼조림 버섯탕수 세발나물무침 알타리김치/사과푸...,흑미밥 바지락살국 쇠고기청경채볶음 두부구이*볶은김치 머위된장무침 백김치,2021,2,2,5,2362


- 데이터 확인

In [None]:
# Preview the submission template: one row per test date, targets zero-filled.
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,0,0
1,2021-01-28,0,0
2,2021-01-29,0,0
3,2021-02-01,0,0
4,2021-02-02,0,0


- 데이터 확인

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 17 columns):
일자                1205 non-null datetime64[ns]
요일                1205 non-null int64
본사정원수             1205 non-null int64
본사휴가자수            1205 non-null int64
본사출장자수            1205 non-null int64
본사시간외근무명령서승인건수    1205 non-null int64
현본사소속재택근무자수       1205 non-null int32
조식메뉴              1205 non-null object
중식메뉴              1205 non-null object
석식메뉴              1205 non-null object
중식계               1205 non-null int32
석식계               1205 non-null int32
년                 1205 non-null int64
월                 1205 non-null int64
일                 1205 non-null int64
주                 1205 non-null int64
출근                1205 non-null int64
dtypes: datetime64[ns](1), int32(3), int64(10), object(3)
memory usage: 146.0+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
train.describe()

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,중식계,석식계,년,월,일,주,출근
count,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0
mean,3.00332,2807.815768,157.913693,241.142739,274.117012,43.506224,890.33444,461.772614,2018.052282,6.512033,15.947718,26.570954,2365.253112
std,1.415384,171.264404,144.190572,43.532298,246.239651,109.9374,209.505057,139.179202,1.433958,3.453906,8.650452,15.074323,176.563062
min,1.0,2601.0,23.0,41.0,0.0,0.0,296.0,0.0,2016.0,1.0,1.0,1.0,1372.0
25%,2.0,2645.0,71.0,217.0,4.0,0.0,758.0,406.0,2017.0,4.0,9.0,13.0,2281.0
50%,3.0,2760.0,105.0,245.0,299.0,0.0,879.0,483.0,2018.0,7.0,16.0,27.0,2357.0
75%,4.0,2962.0,185.0,272.0,452.0,0.0,1032.0,545.0,2019.0,9.0,23.0,39.0,2461.0
max,5.0,3305.0,1224.0,378.0,1044.0,533.0,1459.0,905.0,2021.0,12.0,31.0,52.0,2921.0


- 이번 구내식당 식수 인원 예측 대회에서 feature는 본사정원수, 본사휴가자수, 본사출장자수, 본사시간외근무명령서승인건수, 현본사소속재택근무자수, 중식메뉴, 석식메뉴입니다.
- 예측하려는 target은 중식계, 석식계입니다.

### 1.1 Nulldata Check

In [None]:
# Count missing values per column in the training frame.
train.isnull().sum()

일자                0
요일                0
본사정원수             0
본사휴가자수            0
본사출장자수            0
본사시간외근무명령서승인건수    0
현본사소속재택근무자수       0
조식메뉴              0
중식메뉴              0
석식메뉴              0
중식계               0
석식계               0
년                 0
월                 0
일                 0
주                 0
출근                0
dtype: int64

- 결측치 없음

In [None]:
# Count missing values per column in the test frame.
test.isnull().sum()

일자                0
요일                0
본사정원수             0
본사휴가자수            0
본사출장자수            0
본사시간외근무명령서승인건수    0
현본사소속재택근무자수       0
조식메뉴              0
중식메뉴              0
석식메뉴              0
년                 0
월                 0
일                 0
주                 0
출근                0
dtype: int64

- 두 데이터 다 결측치가 없습니다.

### 2.1 EDA

In [None]:
# Pearson correlation of every numeric column with the two targets.
# The frame also holds datetime and free-text menu columns; selecting the
# numeric columns explicitly keeps corr() from failing on pandas versions
# that no longer drop non-numeric columns silently.
train.select_dtypes(include='number').corr()[['중식계', '석식계']]

Unnamed: 0,중식계,석식계
요일,-0.731563,-0.312112
본사정원수,-0.115529,-0.173852
본사휴가자수,-0.391975,-0.316894
본사출장자수,-0.51268,-0.188164
본사시간외근무명령서승인건수,0.535611,0.571168
현본사소속재택근무자수,0.076509,-0.057534
중식계,1.0,0.508287
석식계,0.508287,1.0
년,-0.078804,-0.194792
월,-0.154664,-0.127142


## 4.1 변수선택 및 모델 구축

In [None]:
# Predictor columns shared by every model in this notebook.
features = [
    '월',
    '일',
    '요일',
    '출근',
    '본사시간외근무명령서승인건수',
    '본사출장자수',
    '본사휴가자수',
]

# Targets are kept as one-element lists (lunch count, dinner count), so
# indexing a frame with them returns a single-column DataFrame.
target, target2 = ['중식계'], ['석식계']

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Random-forest regressors for the lunch and dinner targets, configured with
# the 'mae' split criterion.
# NOTE(review): neither estimator is fitted or used anywhere in the visible
# notebook; every submission below comes from the LightGBM models.
# NOTE(review): newer scikit-learn releases appear to spell this criterion
# 'absolute_error' - confirm against the installed version before fitting.
lunch_model = RandomForestRegressor(criterion='mae')
dinner_model = RandomForestRegressor(criterion='mae')

In [None]:
# Train and test share the same feature columns; the two target frames
# (lunch, dinner) come from the training data only.
X_train = train[features]
X_test = test[features]
y_train = train[target]
y_train2 = train[target2]

### LightGBM

In [None]:
# Both regressors use the same configuration; only the target differs.
lgbm_settings = dict(random_state=777, n_estimators=1000)

model = lgb.LGBMRegressor(**lgbm_settings)    # lunch count
model2 = lgb.LGBMRegressor(**lgbm_settings)   # dinner count

# Fit on the full training set (no hold-out at this stage).
model.fit(X_train, y_train)
model2.fit(X_train, y_train2)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=777, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Work on a copy so the sample-submission template itself stays untouched.
singleLGBM = submission.copy()

In [None]:
# Confirm the copy still holds the template's placeholder values.
singleLGBM.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,0,0
1,2021-01-28,0,0
2,2021-01-29,0,0
3,2021-02-01,0,0
4,2021-02-02,0,0


In [None]:
# Overwrite the placeholder columns with test-set predictions:
# `model` supplies the lunch column, `model2` the dinner column.
for column, estimator in (('중식계', model), ('석식계', model2)):
    singleLGBM[column] = estimator.predict(X_test)

In [None]:
# Write the single-model submission file; index=False leaves out the row index.
singleLGBM.to_csv('testLGBM.csv', index = False)

### k-fold 교차검증 (k-fold cross validation)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Shuffled 5-fold splitter with a fixed seed; later cells reuse it.
k_fold = KFold(n_splits=5, shuffle=True, random_state=777)

# Sanity check: print the train/validation sizes of the first fold only.
train_idx, val_idx = next(iter(k_fold.split(X_train)))
print(len(train_idx), len(val_idx))

964 241


In [None]:
# One fitted estimator per fold and per target (lunch -> models, dinner -> models2).
models = []
models2 = []

for train_idx, val_idx in k_fold.split(X_train):
    x_t, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    y_t2, y_val2 = y_train2.iloc[train_idx], y_train2.iloc[val_idx]

    # Build a fresh estimator for every fold. fit() returns the estimator
    # itself, so refitting a single shared instance would fill the lists with
    # references to ONE object - the "ensemble" would then be just the model
    # from the last fold repeated five times.
    model = lgb.LGBMRegressor(random_state=777, n_estimators=1000)
    model2 = lgb.LGBMRegressor(random_state=777, n_estimators=1000)

    # Each fold's validation split drives early stopping.
    # NOTE(review): newer LightGBM releases appear to expect early stopping and
    # logging to be passed as callbacks rather than fit() keywords - confirm
    # against the installed version.
    models.append(model.fit(x_t, y_t, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose = 100))
    models2.append(model2.fit(x_t, y_t2, eval_set=[(x_val, y_val2)], early_stopping_rounds=100, verbose = 100))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 10333.4
Early stopping, best iteration is:
[47]	valid_0's l2: 10119.3
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 7141.7
Early stopping, best iteration is:
[37]	valid_0's l2: 6514.12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 9328.71
Early stopping, best iteration is:
[72]	valid_0's l2: 9182.98
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 10394.4
Early stopping, best iteration is:
[38]	valid_0's l2: 10279.2
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 10114.5
Early stopping, best iteration is:
[26]	valid_0's l2: 9605.4
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 7472.57
Early stopping, best iteration is:
[37]	valid_0's l2: 7201.74
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 10787.2


In [None]:
# Inspect the list of lunch estimators collected during cross-validation.
models

[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=777, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=777, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
  

In [None]:
# Inspect the list of dinner estimators collected during cross-validation.
models2

[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=777, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=777, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
  

In [None]:
# Test-set lunch predictions: one array per entry stored in `models`.
preds = [fold_model.predict(X_test) for fold_model in models]
len(preds)

5

In [None]:
# Test-set dinner predictions: one array per entry stored in `models2`.
preds2 = [fold_model.predict(X_test) for fold_model in models2]
len(preds2)

5

In [None]:
# Fresh copy of the submission template for the k-fold ensemble results.
kfoldLightGBM = submission.copy()

In [None]:
# Average the per-fold prediction arrays element-wise (axis 0 runs over the
# folds), giving one value per test row for each target.
kfoldLightGBM['중식계'] = np.asarray(preds).mean(axis=0)
kfoldLightGBM['석식계'] = np.asarray(preds2).mean(axis=0)
kfoldLightGBM.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1009.863704,239.444086
1,2021-01-28,1004.614666,479.250276
2,2021-01-29,535.964711,242.109394
3,2021-02-01,1276.913346,621.375488
4,2021-02-02,1059.660346,593.391636


In [None]:
# Write the k-fold ensemble submission; index=False leaves out the row index.
kfoldLightGBM.to_csv('kfoldLightGBM.csv', index = False)

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimators for the two searches (lunch and dinner), identically configured.
base_settings = dict(random_state=777, n_estimators=1000)
model = lgb.LGBMRegressor(**base_settings)
model2 = lgb.LGBMRegressor(**base_settings)

# Candidate grid: 3 learning rates x 2 leaf-size minimums = 6 combinations.
params = {
    'learning_rate': [0.1, 0.01, 0.003],
    'min_child_samples': [20, 30],
}

# Both searches share the grid, the (negated) MSE score and the k_fold splitter.
search_settings = dict(
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)
gs = GridSearchCV(estimator=model, **search_settings)
gs2 = GridSearchCV(estimator=model2, **search_settings)

In [None]:
# Run both grid searches on the full training frame: gs tunes against the
# lunch target (y_train), gs2 against the dinner target (y_train2).
gs.fit(X_train, y_train)
gs2.fit(X_train, y_train2)

GridSearchCV(cv=KFold(n_splits=5, random_state=777, shuffle=True),
             error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=1000, n_jobs=-1,
                                     num_leaves=31, objective=None,
                                     random_state=777, reg_alpha=0.0,
                                     reg_lambda=0.0, silent=True, subsample=1.0,
                                     subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.01, 0.003],
          

In [None]:
# A notebook cell echoes only its final expression, so two bare expressions
# would hide the first result. Bundle both to see the best settings for the
# lunch search (gs) and the dinner search (gs2) side by side.
{'중식계': gs.best_params_, '석식계': gs2.best_params_}

{'learning_rate': 0.003, 'min_child_samples': 20}

In [None]:
# Reset BOTH containers so no estimator from an earlier cell leaks into the
# tuned ensemble.
models = []
models2 = []

for train_idx, val_idx in k_fold.split(X_train):
    x_t, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    y_t2, y_val2 = y_train2.iloc[train_idx], y_train2.iloc[val_idx]

    # Fresh estimators per fold (a shared instance would leave the lists full
    # of references to one object), each configured with the hyper-parameters
    # its own grid search selected instead of hand-copied values.
    model = lgb.LGBMRegressor(random_state=777, n_estimators=1000, **gs.best_params_)
    model2 = lgb.LGBMRegressor(random_state=777, n_estimators=1000, **gs2.best_params_)

    # Lunch target -> model, dinner target -> model2. Fitting the dinner
    # target on `model` would overwrite the lunch model just trained.
    # NOTE(review): newer LightGBM releases appear to expect early stopping and
    # logging to be passed as callbacks rather than fit() keywords - confirm
    # against the installed version.
    models.append(model.fit(x_t, y_t, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose = 100))
    models2.append(model2.fit(x_t, y_t2, eval_set=[(x_val, y_val2)], early_stopping_rounds=100, verbose = 100))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 30188.6
[200]	valid_0's l2: 22644.8
[300]	valid_0's l2: 18066.4
[400]	valid_0's l2: 15323.1
[500]	valid_0's l2: 13581.3
[600]	valid_0's l2: 12496
[700]	valid_0's l2: 11806.3
[800]	valid_0's l2: 11238
[900]	valid_0's l2: 10946.4
[1000]	valid_0's l2: 10768.3
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 10768.3
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 13719.3
[200]	valid_0's l2: 10946.5
[300]	valid_0's l2: 9209.88
[400]	valid_0's l2: 8153.23
[500]	valid_0's l2: 7560.39
[600]	valid_0's l2: 7150.8
[700]	valid_0's l2: 6913.41
[800]	valid_0's l2: 6716.41
[900]	valid_0's l2: 6615.66
[1000]	valid_0's l2: 6574.36
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 6574.36
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 31360.2
[200]	valid_0's l2: 22756
[300]	valid_0's l2: 17800.7
[400]	valid_0's l2: 1

In [None]:
# Keep the two targets in separate lists: lunch predictions come from
# `models`, dinner predictions from `models2`. Mixing them into one list
# would contaminate the lunch average and leave the dinner list empty.
preds = [fold_model.predict(X_test) for fold_model in models]
preds2 = [fold_model.predict(X_test) for fold_model in models2]

In [None]:
# Best cross-validated score of each search. Scoring is
# 'neg_mean_squared_error', so values are negative and closer to zero is
# better. Bundled in a dict because a cell echoes only its last expression.
{'중식계': gs.best_score_, '석식계': gs2.best_score_}

-7792.379806161187

In [None]:
# Fresh copy of the submission template for the grid-search-tuned results.
gslgbm = submission.copy()

In [None]:
# Element-wise mean over the collected prediction arrays (axis 0 runs over
# the list entries). The mean of an empty list is NaN, so both lists must
# have been populated before this cell runs.
gslgbm['중식계'] = np.asarray(preds).mean(axis=0)
gslgbm['석식계'] = np.asarray(preds2).mean(axis=0)

In [None]:
# Write the tuned-ensemble submission; index=False leaves out the row index.
gslgbm.to_csv('gslgbm.csv', index = False)