In [38]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
from konlpy.tag import Mecab
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline

In [39]:
train = pd.read_csv('train.csv')

In [40]:
test = pd.read_csv('test.csv')

#### 띄어쓰기 및 오타 수정

In [41]:
train.at[1142, '중식메뉴'] = '쌀밥/곤드레밥/찰현미밥 된장찌개 돼지고추장불고기 버섯잡채 삼색물만두무침 겉절이김치/양념장 견과류샐러드*요거트D'

In [42]:
train['중식메뉴'] = train['중식메뉴'].str.replace('삽겹', '삼겹')

***

In [43]:
train['일자'] = pd.to_datetime(train['일자'])
test['일자'] = pd.to_datetime(test['일자'])

In [44]:
train['요일'] = train['일자'].dt.day_name().str[:2].map({'Mo' : 5, 'Tu' : 4, 'We' : 3, 'Th' : 2, 'Fr' : 1})
test['요일'] = test['일자'].dt.day_name().str[:2].map({'Mo' : 5, 'Tu' : 4, 'We' : 3, 'Th' : 2, 'Fr' : 1})

In [45]:
train['month'] = train['일자'].dt.month
test['month'] = test['일자'].dt.month

In [46]:
train['corona'] = [1 if x > 0 else 0 for x in train['현본사소속재택근무자수']]
test['corona'] = [1 if x > 0 else 0 for x in test['현본사소속재택근무자수']]

#### 다음 출근날까지의 일수 차이
- ex) 오늘이 12월 31일 목요일인 경우 다음 출근날은 1월 4일이기 때문에 값이 4

In [47]:
train['shift_1day'] = train.일자.shift(-1)
test['shift_1day'] = test.일자.shift(-1)

In [48]:
train.at[1204, 'shift_1day'] = datetime(2021,1,27)
test.at[49, 'shift_1day'] = datetime(2021,4,12)

In [49]:
train['day_gap'] = (train.shift_1day - train.일자).astype(str)
test['day_gap'] = (test.shift_1day - test.일자).astype(str)

In [50]:
train.at[0, 'day_gap'] = '1 days'
test.at[0, 'day_gap'] = '1 days'

In [51]:
def get_holiday_score(x) :
    
    s = int(re.sub(r'[^0-9]', '', x))
    if s == 1 :
        return 0
    else :
        return 1

In [52]:
train['day_gap'] = train.day_gap.apply(get_holiday_score)
test['day_gap'] = test.day_gap.apply(get_holiday_score)

In [53]:
train['야근요일'] = train.요일.apply(lambda x : 1 if (x == 1) or (x == 3) else 0)
test['야근요일'] = test.요일.apply(lambda x : 1 if (x == 1) or (x == 3) else 0)

***

In [62]:
train['중식메뉴'] = train['중식메뉴'].str.split(' ')
train['석식메뉴'] = train['석식메뉴'].str.split(' ')

test['중식메뉴'] = test['중식메뉴'].str.split(' ')
test['석식메뉴'] = test['석식메뉴'].str.split(' ')

In [63]:
def get_token(data) :
    tokens = []
    for token in data :
        s_list = []
        for t in token :
            if t.startswith('(N') :
                s_list.append(t)
            elif (t.startswith('(') == False) & (len(t) > 1) :
                s_list.append(t)
            else :
                pass
        tokens.append(s_list)
    return tokens

In [64]:
train['중식_토큰'] = get_token(train['중식메뉴'])
train['석식_토큰'] = get_token(train['석식메뉴'])

In [65]:
test['중식_토큰'] = get_token(test['중식메뉴'])
test['석식_토큰'] = get_token(test['석식메뉴'])

In [67]:
train['중식메뉴수'] = train.중식_토큰.apply(len)
train['석식메뉴수'] = train.석식_토큰.apply(len)

test['중식메뉴수'] = test.중식_토큰.apply(len)
test['석식메뉴수'] = test.석식_토큰.apply(len)

### 식재료
- 중식에만 처리함 -> 메뉴가 중식에서 더 중요할 것으로 판단했기 때문

In [68]:
def get_ingredient(data) :
    
    ing_df = pd.DataFrame(np.zeros((data.shape[0], 7)), columns = ['해산물', '소', '돼지', '닭', '오리', '채소', '재료_기타'])

    for t in range(data.shape[0]) :
        token = data.중식_토큰.str[2][t]
        if '연어' in token or'골뱅이' in token or'열기' in token or'조기' in token or'탕수어' in token or'양장피' in token or'홍어' in token or'명태' in token or'적어' in token or'장어' in token or'동태' in token or'산슬' in token or'코다리' in token or'가자미' in token or'해물' in token or'생선' in token or'새우' in token or'꽁치' in token or'갈치' in token or'임연수' in token or'삼치' in token or'고등어' in token or'굴비' in token or'오징어' in token or'쭈꾸미' in token or'주꾸미' in token or'낙지' in token or'문어' in token :
            ing_df.at[t, '해산물'] = 1
        elif '왕갈비' in token or'소갈비' in token or'장조림' in token or'불고기' in token or'차돌' in token or'육전' in token or'너비아니' in token or'떡갈비' in token or(token.startswith('소') & (token.startswith('소세') == False)) or '함박' in token or'쇠고기' in token or'소고기' in token or'쇠' in token :
            ing_df.at[t, '소'] = 1
        elif '궁보계정' in token or'삼계탕' in token or'윙' in token or'유린기' in token or'깐풍'in token or'닭' in token or'치킨' in token or'후라이드' in token :
            ing_df.at[t, '돼지'] = 1
        elif '폭립' in token or'오향장육' in token or'동파육' in token or'히레카츠' in token or'순대' in token or'미트볼' in token or'등갈비' in token or'소세지' in token or'목살' in token or'탕수육' in token or'제육' in token or'돈' in token or'돼지' in token or'두루치기' in token or'삼겹' in token or'보쌈' in token or'족발' in token :
            ing_df.at[t, '닭'] = 1
        elif '오리' in token :
            ing_df.at[t, '오리'] = 1
        elif token.endswith('두부') or '꼬치산적' in token or '고추' in token or'양파' in token or'부추' in token or'고구마' in token or'감자' in token or'깻잎' in token or'샐러드' in token or'시금치' in token or'야채' in token :
            ing_df.at[t, '채소'] = 1
        else :
            ing_df.at[t, '재료_기타'] = 1
            
    return ing_df

In [69]:
train = pd.concat([train, get_ingredient(train)], axis = 1)
test = pd.concat([test, get_ingredient(test)], axis = 1)

### 조리법

In [70]:
def get_recipe(data, col) :
    tm = col[:2]
    cat = ['전', '무침','튀김', '찜', '볶음', '조림', '구이', '훈제', '조리_기타']
    recipe_df = pd.DataFrame(np.zeros((data.shape[0], 9)), columns = [f'{tm}_{x}' for x in cat])

    for t in range(data.shape[0]) :
        try :
            token = data[col][t]
            if '고추잡채' in token or '궁보계정' in token or '산슬' in token or token.endswith('잡채') or '마파두부' in token or '두루치기' in token or '닭갈비' in token or token.endswith('볶음') or '볶음' in token :
                recipe_df.at[t, f'{tm}_볶음'] = 1 
            elif token.endswith('데리야끼') or token.endswith('립') or '함박' in token or '그라탕' in token or token.endswith('갈비') or '주물럭' in token or '스테이크' in token or token.endswith('구이') or '불고기' in token or '구이' in token :
                recipe_df.at[t, f'{tm}_구이'] = 1
            elif '전병' in token or token.endswith('전') :
                recipe_df.at[t, f'{tm}_전'] = 1
            elif token.endswith('김치말이') or token.endswith('만두') or '보쌈' in token or '수육' in token or token.endswith('찜') or '찜' in token :
                recipe_df.at[t, f'{tm}_찜'] = 1
            elif '파채' in token or token.endswith('무침') or token.endswith('샐러드') or '양장피' in token :
                recipe_df.at[t, f'{tm}_무침'] = 1
            elif '오향장육' in token or '동파육' in token or token.endswith('조림') :
                recipe_df.at[t, f'{tm}_조림'] = 1
            elif '통닭' in token or token.endswith('새우') or '강정' in token or '미트볼' in token or '프리타타' in token or '카츠' in token or '깐풍' in token or '고로케' in token or '유린기' in token or '탕수' in token or token.endswith('닭') or token.endswith('치킨') or token.endswith('튀김') or '너겟' in token or token.endswith('강정') or '가스' in token or '까스' in token or '핑거' in token or '텐더' in token or '커틀렛' in token or '커틀릿' in token :
                recipe_df.at[t, f'{tm}_튀김'] = 1
            elif '훈제' in token :
                recipe_df.at[t, f'{tm}_훈제'] = 1
            else :
                recipe_df.at[t, f'{tm}_조리_기타'] = 1
        except :
            recipe_df.at[t, f'{tm}_조리_기타'] = 1
    return recipe_df

In [71]:
train['중식_메인요리'] = train.중식_토큰.str[2]
test['중식_메인요리'] = test.중식_토큰.str[2]

In [72]:
train['석식_메인요리'] = train.석식_토큰.str[2]
test['석식_메인요리'] = test.석식_토큰.str[2]

In [73]:
train = pd.concat([train, get_recipe(train, '중식_메인요리')], axis = 1)
test = pd.concat([test, get_recipe(test, '중식_메인요리')], axis = 1)

In [74]:
train = pd.concat([train, get_recipe(train, '석식_메인요리')], axis = 1)
test = pd.concat([test, get_recipe(test, '석식_메인요리')], axis = 1)

In [75]:
train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

In [76]:
test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

In [77]:
X1 = train[['요일', '야근요일', '출근', 'day_gap', '휴가비율', '출장비율', '야근비율', 'month',  '중식메뉴수', '해산물', '소', '돼지', '닭', '오리', '채소', '재료_기타', '중식_전', '중식_무침', '중식_튀김', '중식_찜', 
       '중식_볶음', '중식_조림', '중식_구이', '중식_훈제', '중식_조리_기타']]

In [41]:
target1 = test[X1.columns]

In [42]:
X2 = train[['corona', '석식메뉴수', 'month', '야근요일', 'day_gap', '요일', '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '석식_전', '석식_무침', 
       '석식_튀김', '석식_찜', '석식_볶음', '석식_조림', '석식_구이', '석식_훈제', '석식_조리_기타']]

In [43]:
target2 = test[X2.columns]

In [44]:
y1 = train.중식계
y2 = train.석식계

In [45]:
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from ngboost import NGBRegressor
from catboost import CatBoostRegressor, Pool

In [46]:
kf = KFold(n_splits = 15, random_state = 718, shuffle = True)

### CatBoostRegressor

In [47]:
cb = CatBoostRegressor(iterations = 20000, learning_rate = 0.01, depth = 4, eval_metric = 'MAE', silent = True, loss_function = 'MAE')

In [48]:
cb_pred_1 = np.zeros((target1.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X1):
    tr_x, val_x = X1.iloc[tr_idx], X1.iloc[val_idx]
    tr_y, val_y = y1.iloc[tr_idx], y1.iloc[val_idx]
    train_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    cb.fit(train_data, eval_set = val_data, early_stopping_rounds = 2000, use_best_model = True, verbose = 5000)
    best = cb.best_iteration_
    pred = cb.predict(val_x, ntree_end = best)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = cb.predict(target1, ntree_end = best) / 15
    cb_pred_1 += sub_pred
print(f'\n{cb.__class__.__name__} MAE = {np.mean(mae_list)}')

0:	learn: 165.7208086	test: 174.4951842	best: 174.4951842 (0)	total: 62.1ms	remaining: 20m 41s
5000:	learn: 44.7818178	test: 61.9085747	best: 61.8818070 (4932)	total: 5.61s	remaining: 16.8s
10000:	learn: 38.2252302	test: 61.4473138	best: 61.4084062 (9821)	total: 10.3s	remaining: 10.3s
15000:	learn: 35.0579376	test: 61.4200779	best: 61.3247083 (13580)	total: 14.9s	remaining: 4.97s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 61.32470828
bestIteration = 13580

Shrink model to first 13581 iterations.
FOLD MAE = 61.325879285875395
0:	learn: 166.7712980	test: 162.5986410	best: 162.5986410 (0)	total: 2.48ms	remaining: 49.5s
5000:	learn: 44.3015596	test: 73.9240744	best: 73.9210822 (4978)	total: 4.68s	remaining: 14s
10000:	learn: 38.2208748	test: 73.6280890	best: 73.6024269 (8660)	total: 9.4s	remaining: 9.4s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 73.43427955
bestIteration = 11274

Shrink model to first 11275 iterations.
FOLD MAE = 73.4411602

In [51]:
cb_pred_2 = np.zeros((target2.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X2):
    tr_x, val_x = X2.iloc[tr_idx], X2.iloc[val_idx]
    tr_y, val_y = y2.iloc[tr_idx], y2.iloc[val_idx]
    train_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    cb.fit(train_data, eval_set = val_data, early_stopping_rounds = 2000, use_best_model = True, verbose = 5000)
    best = cb.best_iteration_
    pred = cb.predict(val_x, ntree_end = best)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = cb.predict(target2, ntree_end = best) / 15
    cb_pred_2 += sub_pred
print(f'\n{cb.__class__.__name__} MAE = {np.mean(mae_list)}')

0:	learn: 99.0269474	test: 82.1235793	best: 82.1235793 (0)	total: 1.93ms	remaining: 38.7s
5000:	learn: 29.9611929	test: 41.2941278	best: 41.2782356 (4618)	total: 5.36s	remaining: 16.1s
10000:	learn: 26.0294826	test: 40.8838317	best: 40.8610027 (9930)	total: 10.4s	remaining: 10.4s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 40.73218658
bestIteration = 12930

Shrink model to first 12931 iterations.
FOLD MAE = 40.734237100833674
0:	learn: 99.1316182	test: 80.7050607	best: 80.7050607 (0)	total: 1.82ms	remaining: 36.4s
5000:	learn: 29.2048574	test: 55.9635529	best: 55.9380499 (4861)	total: 5.78s	remaining: 17.3s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 55.69152869
bestIteration = 6972

Shrink model to first 6973 iterations.
FOLD MAE = 55.693782736123964
0:	learn: 97.9807641	test: 95.7135793	best: 95.7135793 (0)	total: 1.73ms	remaining: 34.6s
5000:	learn: 29.9661490	test: 43.8533080	best: 43.8183328 (4000)	total: 5.77s	remaining: 17.3s
Stopp

### NGBRegressor

In [54]:
ngb = NGBRegressor(n_estimators = 15000, verbose = 0, random_state = 607)

In [55]:
ngb_pred_1 = np.zeros((target1.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X1):
    tr_x, val_x = X1.iloc[tr_idx], X1.iloc[val_idx]
    tr_y, val_y = y1.iloc[tr_idx], y1.iloc[val_idx]
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 2000)
    pred = ngb.predict(val_x)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = ngb.predict(target1) / 15
    ngb_pred_1 += sub_pred
print(f'\n{ngb.__class__.__name__} MAE = {np.mean(mae_list)}')

FOLD MAE = 68.93021314897271
FOLD MAE = 53.653937044849705
FOLD MAE = 46.80557565007274
FOLD MAE = 34.015911415767384
FOLD MAE = 37.20199168499946
FOLD MAE = 43.273489971316806
FOLD MAE = 36.701970631269454
FOLD MAE = 34.67354770644964
FOLD MAE = 40.711893648589694
FOLD MAE = 33.73674663410894
FOLD MAE = 38.51425002765768
FOLD MAE = 27.956151443367055
FOLD MAE = 26.57288686138337
FOLD MAE = 29.503053690826004
FOLD MAE = 27.322885552213812

NGBRegressor MAE = 38.63830034078964


In [58]:
ngb = NGBRegressor(n_estimators = 15000, verbose = 0, random_state = 607)

In [59]:
ngb_pred_2 = np.zeros((target2.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X2):
    tr_x, val_x = X2.iloc[tr_idx], X2.iloc[val_idx]
    tr_y, val_y = y2.iloc[tr_idx], y2.iloc[val_idx]
    ngb.fit(tr_x,
            tr_y, val_x, val_y, early_stopping_rounds = 2000)
    pred = ngb.predict(val_x)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = ngb.predict(target2) / 15
    ngb_pred_2 += sub_pred
print(f'\n{ngb.__class__.__name__} MAE = {np.mean(mae_list)}')

FOLD MAE = 44.171664919086936
FOLD MAE = 32.868913088772295
FOLD MAE = 25.220459787217816
FOLD MAE = 21.960386570669062
FOLD MAE = 20.040524201800302
FOLD MAE = 18.935727713350282
FOLD MAE = 18.9116412699365
FOLD MAE = 16.29463434135136
FOLD MAE = 16.74260426529482
FOLD MAE = 20.35581551304555
FOLD MAE = 18.07844819538711
FOLD MAE = 17.947413300843102
FOLD MAE = 17.985124950816843
FOLD MAE = 16.079115696458693
FOLD MAE = 14.079807129401638

NGBRegressor MAE = 21.31148539622882


### LGBMRegressor

In [62]:
lgbm = LGBMRegressor(random_state = 718, max_depth = 5, n_estimators = 20000, learning_rate = .02)

In [63]:
lgbm_pred_1 = np.zeros((target1.shape[0]))
mae_list = []
for tr_idX1, val_idX1 in kf.split(X1):
    tr_X1, val_X1 = X1.iloc[tr_idX1], X1.iloc[val_idX1]
    tr_y, val_y = y1.iloc[tr_idX1], y1.iloc[val_idX1]
    lgbm.fit(tr_X1, tr_y, eval_set = [(tr_X1, tr_y), (val_X1, val_y)], eval_metric = 'mean_absolute_error', early_stopping_rounds = 2000, verbose = 5000)
    pred = lgbm.predict(val_X1)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = lgbm.predict(target1) / 15
    lgbm_pred_1 += sub_pred
print(f'\n{lgbm.__class__.__name__} MAE = {np.mean(mae_list)}')

Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[345]	training's l1: 51.583	training's l2: 4678.24	valid_1's l1: 68.7999	valid_1's l2: 8329.61
FOLD MAE = 68.79993446563824
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[136]	training's l1: 64.893	training's l2: 7288.94	valid_1's l1: 73.6322	valid_1's l2: 10883.6
FOLD MAE = 73.63224306104472
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[565]	training's l1: 46.4971	training's l2: 3842.53	valid_1's l1: 72.9292	valid_1's l2: 8635.33
FOLD MAE = 72.92923548471887
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[2014]	training's l1: 30.9305	training's l2: 1698.42	valid_1's l1: 64.6994	valid_1's l2: 8491.1
FOLD MAE = 64.69940192011973
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[521]	traini

In [66]:
lgbm_pred_2 = np.zeros((target2.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(X2):
    tr_x, val_x = X2.iloc[tr_idx], X2.iloc[val_idx]
    tr_y, val_y = y2.iloc[tr_idx], y2.iloc[val_idx]
    
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], eval_metric = 'mean_absolute_error', early_stopping_rounds = 2000, verbose = 5000)
    pred = lgbm.predict(val_x)
    mae = mean_absolute_error(val_y, pred)
    print(f'FOLD MAE = {mae}')
    mae_list.append(mae)
    sub_pred = lgbm.predict(target2) / 15
    lgbm_pred_2 += sub_pred
print(f'\n{lgbm.__class__.__name__} MAE = {np.mean(mae_list)}')

Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[1421]	training's l1: 25.1178	training's l2: 1233.21	valid_1's l1: 40.5822	valid_1's l2: 2528.15
FOLD MAE = 40.582210737944386
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[522]	training's l1: 31.882	training's l2: 1998.65	valid_1's l1: 54.7419	valid_1's l2: 6554.52
FOLD MAE = 54.741861447963466
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[177]	training's l1: 41.0974	training's l2: 3291.3	valid_1's l1: 43.4276	valid_1's l2: 3314.58
FOLD MAE = 43.4275711272845
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[674]	training's l1: 30.7701	training's l2: 1826.08	valid_1's l1: 48.9874	valid_1's l2: 5498.3
FOLD MAE = 48.98741453258124
Training until validation scores don't improve for 2000 rounds
Early stopping, best iteration is:
[363]	train

In [69]:
submission = pd.read_csv('sample_submission.csv')

In [70]:
submission['중식계'] = (lgbm_pred_1 + ngb_pred_1 + cb_pred_1) / 3
submission['석식계'] = (lgbm_pred_2 + ngb_pred_2 + cb_pred_2) / 3

In [71]:
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1021.270296,367.276235
1,2021-01-28,955.712339,424.324562
2,2021-01-29,637.998207,236.101793
3,2021-02-01,1267.976615,554.4252
4,2021-02-02,1036.126351,494.879986


In [72]:
submission.중식계.sum(), submission.석식계.sum()

(49252.6861395934, 24129.579534888042)

In [73]:
submission.to_csv("0728.csv", index = False)