<a href="https://colab.research.google.com/github/herjh0405/DACON_Meal/blob/master/LH_Lunch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost
!pip install pycaret
!pip install kaggler
!pip install pendulum
!pip install flaml

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
np.random.seed(0)

from pycaret.regression import *
from kaggler.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

from tqdm.notebook import tqdm
import os, re
import glob
import calendar

from flaml import AutoML
import statsmodels.api as sm

In [None]:
# 한글 폰트 사용
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os

def change_matplotlib_font(font_download_url):
    """Download a zipped font, register it with matplotlib and make it the default.

    The archive at ``font_download_url`` is fetched with ``wget`` into
    ``MY_FONT.zip`` and extracted with ``unzip`` into the ``MY_FONT``
    directory (both via ``os.system``, so the two tools must be available).
    Every font file found there is added to matplotlib's font manager and one
    of them becomes the global ``font.family``.

    Args:
        font_download_url: URL of a zip archive containing font files.

    Raises:
        RuntimeError: if no font file is found after download/extraction.
    """
    FONT_PATH = 'MY_FONT'

    font_download_cmd = f"wget {font_download_url} -O {FONT_PATH}.zip"
    unzip_cmd = f"unzip -o {FONT_PATH}.zip -d {FONT_PATH}"
    os.system(font_download_cmd)
    os.system(unzip_cmd)

    font_files = fm.findSystemFonts(fontpaths=FONT_PATH)
    if not font_files:
        # os.system does not raise on failure, so a failed download/unzip
        # would otherwise only surface as an opaque IndexError below.
        raise RuntimeError(
            f"No font files found in '{FONT_PATH}'; "
            "the download or extraction probably failed."
        )
    for font_file in font_files:
        fm.fontManager.addfont(font_file)

    # The third discovered file is used, as before; fall back to the last
    # available file when the archive holds fewer than three fonts.
    chosen_font = font_files[min(2, len(font_files) - 1)]
    font_name = fm.FontProperties(fname=chosen_font).get_name()
    matplotlib.rc('font', family=font_name)
    print("font family: ", plt.rcParams['font.family'])

font_download_url = "https://fonts.google.com/download?family=Nanum%20Gothic"
change_matplotlib_font(font_download_url)
# 마이너스 폰트 깨짐 방지
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Load the raw competition tables and the auxiliary holiday / corona tables
# from the Google Drive folder.
path = '/content/drive/MyDrive/구내식당/water/'
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
holiday = pd.read_csv(path+'holidays.csv', index_col=0)
corona = pd.read_csv(path+'corona_data.csv')

# Stack train (without its last two columns) on top of test, keep those two
# trailing train columns separately in target_df, and give the stacked frame
# short column names (the list must match the 10 remaining columns in order).
df = pd.concat([train.iloc[:, :-2], test])
target_df = train.iloc[:, -2:]
df.columns = ['일자', '요일', '정원','휴가자', '출장자', '야근자',\
                 '재택근무자', '조식', '중식', '석식']

## 메뉴 관련 전처리

In [None]:
# Menu-extracting function
def extractMenu(array, keywords=None, not_in_keywords=None, comm_not_in=None):
  """Return the menu names in ``array`` that match at least one keyword.

  A menu name matches a keyword when it contains the keyword as a substring
  and contains none of the exclusion substrings that apply to that keyword.
  Keywords are tried in order; the menu is added once, on the first keyword
  that matches without being excluded.

  Args:
    array: iterable of menu-name strings.
    keywords: substrings that select a menu (default: none).
    not_in_keywords: dict mapping a keyword to the substrings that cancel a
      match on that keyword. A bare string value is treated as a single
      exclusion rather than being iterated character by character.
    comm_not_in: exclusion substrings applied to every keyword.

  Returns:
    List of matching menu names, in the iteration order of ``array``.
  """
  keywords = [] if keywords is None else keywords
  not_in_keywords = {} if not_in_keywords is None else not_in_keywords
  comm_not_in = [] if comm_not_in is None else comm_not_in
  if isinstance(comm_not_in, str):
    comm_not_in = [comm_not_in]

  extractedMenu = []
  for menu_nm in array:
    for kw in keywords:
      if kw not in menu_nm:
        continue
      exclusions = not_in_keywords.get(kw, [])
      if isinstance(exclusions, str):
        # e.g. {'마': '마카로니'}: one exclusion word, not four characters
        exclusions = [exclusions]
      has_not_in = (any(sub_kw in menu_nm for sub_kw in exclusions)
                    or any(sub_kw in menu_nm for sub_kw in comm_not_in))
      if not has_not_in:
        extractedMenu.append(menu_nm)
        break  # one hit is enough; do not add the same menu twice
  return extractedMenu

def extractMenu2(array, keywords=None):
  """Return the menu names in ``array`` having a word that ends with a keyword.

  Each menu name is split on non-word characters; the menu is selected when
  any resulting token ends with any of the keywords (e.g. keyword '국'
  selects '된장국' but not '국수').

  Compared with the previous version this
    * iterates the ``array`` argument instead of the global ``tot_menu_arr``,
    * uses ``str.endswith`` so tokens that do not contain the keyword can no
      longer match by accident (``find`` returning -1),
    * keeps checking the remaining keywords when an earlier keyword occurs in
      the name but not at the end of a token,
    * adds each menu at most once.

  Args:
    array: iterable of menu-name strings.
    keywords: suffixes to look for (default: none). Empty strings are ignored.

  Returns:
    List of matching menu names, in the iteration order of ``array``.
  """
  keywords = [] if keywords is None else keywords
  suffixes = [kw for kw in keywords if kw]

  extractedMenu = []
  for menu_nm in array:
    # drop the empty tokens produced by leading/trailing/adjacent separators
    tokens = [tok for tok in re.split(r'[^\w]', menu_nm) if tok]
    if any(tok.endswith(kw) for kw in suffixes for tok in tokens):
      extractedMenu.append(menu_nm)
  return extractedMenu

In [None]:
lunch_menu_data = df['중식']
dinner_menu_data = df['석식']

In [None]:
# Collect every individual dish name that appears in the lunch and dinner
# menu columns into tot_menu_arr (duplicates are still present here).
tot_menu_arr = []
# Matches a parenthesised part of a token (greedy: first '(' to last ')').
pattern = r"\(.*\)"
for menu_data in [lunch_menu_data, dinner_menu_data]:
  for daily_menu in menu_data:
    # one day's menu is a whitespace-separated string of dish names
    menu_list = daily_menu.strip().split()
    menu_list2 = []
    for i, menu_nm in enumerate(menu_list):
      # remove the parenthesised part of the token
      menu_nm = re.sub(pattern, '', menu_nm)
      # skip tokens that are empty or just a '*' marker after the removal
      if menu_nm.strip() in ['', '*']:
        continue
      # skip fragments of a parenthesised note that was split across tokens
      if menu_nm[0] == '(' or menu_nm[-1] == ')':
        continue
      menu_list2.append(menu_nm)
    tot_menu_arr += menu_list2

In [None]:
tot_menu_arr = set(tot_menu_arr)

In [None]:
len(tot_menu_arr)

In [None]:
# 육류 분류
# 소고기
# https://namu.wiki/w/%EC%87%A0%EA%B3%A0%EA%B8%B0
beef = ['소고기', '쇠고기', '불고기', '떡갈비', '갈비찜', '소갈비', '육사시미', '육회', '장조림', '와규', '야키니쿠', '규동', '스테이크', '햄버그 스테이크',
 '함박스테이크', '함바그스테이크', '함박 스테이크', '햄버거', '로스트 디너', '비프가스밀라네사', '웰링턴', '슈하스쿠', '아사도', '우육면',
 '육개장', '육포', '평양냉면', '비프 스트로가노프', '설렁탕', '소고기국', '소머리국밥', '곰탕', '너비아니', '보르챠', '소꼬리']
# 돼지고기
# https://namu.wiki/w/%EB%8F%BC%EC%A7%80%EA%B3%A0%EA%B8%B0
pig = ['돼지', '돼지머리', '머릿고기', '뒷고기', '관자살', '콧등살', '삼각살', '설중살', '설하살', '안중살', '뽈항정살',
 '볼살', '두항정', '돼지코', '항정살', '목살', '가브리살', '갈비', '앞다리살', '갈매기살', '등심', '안심',
 '삼겹살', '오겹살', '뒷다리살', '돈족', '내장', '오소리감투', '허파', '염통', '콩팥', '새끼보', '돈낭',
 '돈족', '돼지꼬리', '사태', '막창', '감자탕', '돈가스', '돼지갈비', '돼지국밥', '돼지불고기', '두루치기', '순대',
 '순댓', '족발', '보쌈', '수육', '편육', '제육', '탕수육', '삼겹', '맥적', '차슈', '향우구육', '꿔바로우', '훙사오러우',
 '회과육', '동파육', '라후테', '오향장육', '슈바인스학세', '소시지', '소세지', '포크 커틀릿', '함바그 스테이크', '함바그스테이크',
 '함박스테이크', '살스테이크','살 스테이크', '함박 스테이크', '베이컨', '햄', '스팸', '폭립', '폭찹', '돈지루', '부타동', '바쿠테', '팟 카파오 무 쌉', '비엔나', '소떡', '육']
# 닭고기
# https://namu.wiki/w/%EB%8B%AD%EA%B3%A0%EA%B8%B0
chicken = ['닭', '깐풍기', '꼬꼬면', '궁보계정', '간장닭', '기스면', '계', '도빙무시', '라조기', '백숙', '영계백숙',
 '불닭', '삼계탕', '삼계선', '오니시메', '옻닭', '연팔기', '유린기', '육회', '좌종당계', '찜닭', '초계밀면',
 '치킨', '도리텐', '지파이', '치짜', '취계', '카라아게', '가라아', '파닭', '양파닭', '케밥', '코코뱅', '탕수기',
 '포계', '프랑구 아사두']
# 양고기
# https://namu.wiki/w/%EC%96%91%EA%B3%A0%EA%B8%B0
sheep = ['양고기','훠궈', '양꼬치', '케밥', '샤슬릭', '징기스칸', '셰퍼드 파이', '허르헉', '양갈비']
# 오리고기
# https://namu.wiki/w/%EC%98%A4%EB%A6%AC%EA%B3%A0%EA%B8%B0
dug = ['오리']

web_keywords = beef + pig + chicken + sheep + dug
keywords = ['돈까스', '히레카츠', '히레까쓰', '히레가스', '포크', '부대찌개', '뒷다리', '앞다리', '돈', '순살',
                '소머리', '등뼈', '곱창', '도가니', '뼈해장국', '뼈다귀해장국', '목심', '채끝', '우둔', '양지', '설도', '만두', '만둣',
                '잡채', '류산슬', '유산슬', '고기', '고깃']
keywords += web_keywords

not_in_keywords = {'오리':['아오리', '오리엔탈'], '계':['계란', '계발', '계피'], '장조림':['계란', '메추리알'], '치킨':['치킨무'], '돈':['돈나물'], '만두':['당면계란'], '만둣':['당면계란']}
meat_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 돼지고기
keywords = ['돼지', '돼지머리', '머릿고기', '뒷고기', '관자살', '콧등살', '삼각살', '설중살', '설하살', '안중살', '뽈항정살',
 '볼살', '두항정', '돼지코', '항정살', '목살', '가브리살', '앞다리살', '갈매기살', '등심', '안심',
 '삼겹살', '오겹살', '앞다리살', '뒷다리살', '돈족', '내장', '오소리감투', '허파', '염통', '콩팥', '새끼보', '돈낭',
 '돈족', '돼지꼬리', '사태', '막창', '감자탕', '돈가스', '돼지갈비', '돼지국밥', '돼지불고기', '두루치기', '순대',
 '순댓', '족발', '보쌈', '수육', '편육', '제육', '탕수육', '삼겹', '맥적', '차슈', '향우구육', '꿔바로우', '훙사오러우',
 '회과육', '동파육', '라후테', '오향장육', '슈바인스학세', '소시지', '소세지', '포크 커틀릿',
 '목살스테이크','목살 스테이크', '베이컨', '햄', '스팸', '폭립', '폭찹', '돈지루', '부타동', '바쿠테', '팟 카파오 무 쌉', '비엔나', '소떡',
 '돈까스', '히레카츠', '히레까쓰', '히레가스', '포크', '돈', '등뼈', '뼈해장국', '뼈다귀해장국']
not_in_keywords = {'돈':['돈나물'], '만두':['당면계란'], '만둣':['당면계란']}
pig_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 소고기
keywords = ['소고기', '쇠고기', '소불고기', '소갈비', '육사시미', '육회', '와규', '야키니쿠', '규동', '소곱창',
            '로스트 디너', '비프가스밀라네사', '웰링턴', '슈하스쿠', '아사도', '우육면',
            '육개장', '육포', '평양냉면', '비프 스트로가노프', '설렁탕', '소고기국', '소머리국밥', '곰탕', '너비아니', '보르챠', '소꼬리', '소머리', '설도', '목심', '채끝', '우둔', '양지', '도가니']
beef_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 닭고기
keywords = ['닭', '깐풍기', '꼬꼬면', '궁보계정', '간장닭', '기스면', '계', '도빙무시', '라조기', '백숙', '영계백숙',
          '불닭', '삼계탕', '삼계선', '오니시메', '옻닭', '연팔기', '유린기', '육회', '좌종당계', '찜닭', '초계밀면',
          '치킨', '도리텐', '지파이', '치짜', '취계', '카라아게', '가라아', '파닭', '양파닭', '케밥', '코코뱅', '탕수기',
          '포계', '프랑구 아사두']
not_in_keywords = {'계':['계란', '계발', '계피'], '치킨':['치킨무']}

chicken_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 양고기 - 데이터 없어서 제외
keywords = ['양고기','훠궈', '양꼬치', '케밥', '샤슬릭', '징기스칸', '셰퍼드 파이', '허르헉', '양갈비']
sheep_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])
sheep_menus

In [None]:
# 오리고기
keywords = ['오리']
not_in_keywords = {'오리':['아오리', '오리엔탈']}
duck_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
#난류 (계란)
keywords = ['계란', '난', '란', '메추리알', '날치알', '동태알']
not_in_keywords = {"란":['토란'], '난':['커리', '카레']}
egg_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 죽류
keywords = ['죽', '누룽지']
juk_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 덮밥 및 국밥류
keywords = ['덮밥', '국밥']
gukbob_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 비빔밥 및 볶음밥류
keywords = ['비빔밥', '볶음밥']
bb_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 국탕류
keywords = ['국', '탕', '찌개', '국물']
soup_menus = extractMenu2(tot_menu_arr, keywords=keywords)

In [None]:
# 구이류
keywords = ['구이']
gui_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 전류
keywords = ['전', '부침개', '빈대떡']
jeon_menus = extractMenu2(tot_menu_arr, keywords=keywords)

In [None]:
# http://yaksik.net/detail.php?number=24904
# 튀김류
keywords = ['튀김', '까스', '카츠', '가츠', '까츠', '탕수', '덴뿌라', '덴푸라', '크로켓', '고로케', '맛탕', '치킨', '통닭', '부각', '강정', '김말이', '깐풍']
fry_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

## 메뉴 추가 특성 - Part1

In [None]:
# 곡물
keywords = ['현미', '밥', '쌀', '보리', '죽', '참깨', '들깨', '수수', '잡곡', '귀리', '퀴노아', '아마란스', '옥수수', '기장', '메밀', '모밀']
grain_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 콩류
keywords = ['콩', '녹두', '팥', '완두']
not_in_keywords = {'콩':['콩나물']}
bean_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 묵
keywords = ['묵']
not_in_keywords = {'묵':['어묵', '묵은지']}
kor_jelly_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 생선 및 조개류
# https://ko.wikipedia.org/wiki/%EC%83%9D%EC%84%A0
# https://namu.wiki/w/%EC%83%9D%EC%84%A0
# https://namu.wiki/w/%EC%A1%B0%EA%B0%9C
keywords = ['생선', '조개', '메기', '송어', '오징어', '굴', '멸치', '숭어', '성게', '고등어', '명태',
            '쏨뱅이', '연어', '틸라피아', '우럭', '이리치', '가재', '참바리', '상어', '돔',
            '삼치', '방어', '참치', '새우', '문어', '홍어', '농어', '붉평치', '청상아리', '황새치',
            '다랑어', '비막치어', '장어', '녹새치', '숭어', '굴비', '조기', '갈치', '꽁치',
            '전어', '명태', '노가리', '황태', '은어', '가물치', '쏘가리', '붕어', '잉어', '모래마주', '가자미',
            '간재미', '가오리', '박대', '양미리', '과메기', '청어', '생태',
            '개복치', '광어', '넙치', '기름치', '까나리', '날치','놀래미'
            ,'능성어','달고기','대구','도다리','도루묵','도미','독가시치'
            ,'만새기','망상어','문절망둑','물메기','미꾸라지','민어','방어'
            ,'추어탕','배스','밴댕이','뱅어','벵에돔','병어','보리멸'
            ,'복어','볼락','부세','부시리','붕장어','블루길'
            ,'빙어','산천어','서대','시샤모','쏘가리','쏠배감펭','쏨뱅이'
            ,'아귀','아구','임연수','전갱이','전복치','점성어','정어리'
            ,'준치','쥐치','청새치','청어','향어','홍어','황새치','매운탕'
            ,'루테피스크','게맛살','물회','회덮밥','부야베스','북엇국','세꼬시','수르스트뢰밍','식해','어묵','오뎅'
            ,'쥐포','추어탕','피시 앤드 칩스','피쉬 앤드 칩스','피시앤드칩스','피쉬앤드칩스','피시앤칩스','피쉬앤칩스','해물'
            ,'가리비', '개오지', '꼬막','대칭이','바지락','백합','홍합','소라', '골뱅이', '고둥','재첩'
            ,'전복','플라티케라무스', '봉골레', '클램차우더']
not_in_keywords = {'굴':['굴소스'], '새우':['새우젓']}
fish_shell_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# Vegetables
# https://namu.wiki/w/%EC%B1%84%EC%86%8C?from=%EC%95%BC%EC%B1%84
keywords = ['가지', '갓', '감자', '고구마', '고사리', '고추', '페페론치노', '냉이', '근대', '깻잎', '차조기'
            , '당근', '더덕', '도라지', '동아', '딸기', '마', '마늘', '멜론', '무', '무청'
            , '바나나', '배추', '버섯', '부추', '브로콜리', '상추', '생강', '쇠비름', '나물'
            , '쑥', '시금치', '수박', '시호', '아스파라거스', '야콘', '양파', '여주', '연근', '열무', '오이'
            , '우엉', '인삼', '죽순', '청경채', '참외', '칡', '풋콩', '토란', '토마토', '쪽파', '대파', '파인애플'
            , '파프리카', '피망', '케일', '고수', '로즈마리', '루타바가', '바질', '박하', '산마늘', '셀러리'
            , '아티초크', '타임', '파슬리', '호박', '피클', '파채', '파김치', '채소', '야채']
# Every exclusion value must be a list: a bare string such as '마카로니' is
# iterated character by character by extractMenu, so the single character
# '마' would cancel every match on the keyword '마'.
not_in_keywords = {'무':['무침'], '마':['마카로니'], '고추':['고추장']}
vegetable_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 해조류
# https://namu.wiki/w/%EC%A1%B0%EB%A5%98(%EC%88%98%EC%A4%91%EC%83%9D%EB%AC%BC)?from=%ED%95%B4%EC%A1%B0%EB%A5%98
keywords = ['김', '우뭇가사리', '한천', '매생이', '파래', '바다포도', '해캄', '클로렐라', '청각', '마리모모스볼', '다시마', '미역', '감태', '톳']
not_in_keywords = {'김':['김치', '튀김', '김칫']}
sea_alg_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 쌀케익 - 없어서 패스

In [None]:
# 발효된 콩 상품 -> 장류
# https://namu.wiki/w/%EC%9E%A5%EB%A5%98
keywords = ['된장', '간장', '쯔유', '노추', '미소', '고추장', '청국장', '담북장', '팥장', '두부장', '비지장', '어육장', '춘장', '마장', '낫토', '두반장', '해선장', '굴소스', '게장',
 '장조림', '양념장', '장국', '쌈장', '초장', '*장']
jang_menus  = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 김치
# https://namu.wiki/w/%EA%B9%80%EC%B9%98
keywords = ['김치', '깍두기', '석박지', '동치미', '겉절이', '묵은지', '소박이', '섞박지', '생채', '게국지', '김칫']
kimchi_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 만두
# https://namu.wiki/w/%EB%A7%8C%EB%91%90
keywords = ['만두', '춘권', '만쥬', '사모사', '만둣']
mandu_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords={}, comm_not_in=[])

In [None]:
# 곡물 가루(밀가루, 쌀가루 등 전분)
# https://namu.wiki/w/%EB%B0%80%EA%B0%80%EB%A3%A8
# 미숫가루
keywords = ["면", "수제비", "전", "부침개", "빵", "춘권", "튀김", "과자", "국수", "메밀", "모밀", "피자", "전병", "떡", "어묵", "오뎅", "소시지", "소세지", "햄", "김밥", 
            "부대찌개", "스콘", "만두", "파이", "빈대떡", "케이크", "케익", "쿠키", "핫도그", "파스타", "치킨", "라자냐", "팟타이", "나쵸", "팝콘", '스파게티', '짬뽕']
not_in_keywords = {'치킨':['치킨무'], '전':['전주식'], '짬뽕':['고기', '찌개', '국']}
powder_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=[])

In [None]:
# 과일
# https://namu.wiki/w/%EA%B3%BC%EC%9D%BC
# https://namu.wiki/w/%EC%88%98%EC%9E%85%20%EA%B3%BC%EC%9D%BC
keywords = ['구기자','매실','무화과','버찌','체리','복분자','복숭아','블랙베리','블루베리','딸기','살구','앵두','자두','포도'
            ,'감','다래','대추','머루','모과','무화과','배','사과','석류','으름','귤','유자','레드향','천혜향','한라봉'
            ,'과라나','구아바','구즈베리','토마토','나랑히야','노니','노팔','니파팜','두꾸','두리안','라임','람부탄'
            ,'레몬','애플','루비솔트부쉬','리치','여지','마랑','마룰라','마르멜로','마프랑','망고','블랙베리','아보카도'
            ,'아로니아','아사이베리','아사이 베리','양초열매','오렌지','올리브','용안','롱간','자몽','바나나','딸기','수박'
            ,'참외','멜론','메론','여주','파인애플','토마토','코코넛','크랜베리','타마린드','파파야','패션프루트','패션후르츠']
not_in_keywords = {'살구':['구이', '목살', '삼겹살', '가브리살', '갈비살', '항정살'], '감':['감자'], '배':['배추', '알배기', '소배기']}
comm_not_in = ['주스', '쥬스', '음료', 'D', '순']
fruit_menus = extractMenu(tot_menu_arr, keywords=keywords, not_in_keywords=not_in_keywords, comm_not_in=comm_not_in)

## 메뉴 추가 특성 - Part2

In [None]:
# 쌀
# https://namu.wiki/w/%EA%B3%A1%EB%AC%BC

keywords = ['쌀', '잡곡', '오곡', '현미', '흑미', '귀리', '차조', '렌틸콩', '강낭콩', '병아리콩', '완두콩', '기장', '보리', '수수', '호밀'] 
not_in_keywords = {'쌀':['쌀국수', '찹쌀'], '기장':['장조림'], '수수':['옥수수', '부꾸미']} # 찹쌀은 밥 메뉴명에 쓰이지 않아 삭제
comm_not_in = ['스프']
rice_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 김밥 및 초밥

keywords = ['김밥', '초밥'] 
not_in_keywords = {'김밥':['볶음밥']}
comm_not_in = []
gimbab_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 소금, 식초 등에 절인 해산물
# 해산물이 들어있지 않은 절임류도 포함시킴

keywords = ['절임', '젓'] 
not_in_keywords = {}
comm_not_in = []
saused_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 면류
# https://femiwiki.com/w/%EB%B6%84%EB%A5%98:%EC%A2%85%EB%A5%98/%EB%A9%B4%EC%9A%94%EB%A6%AC
keywords = ['국수', '면', '파스타', '스파게티', '짬뽕', '라면'] 
not_in_keywords = {'짬뽕':['고기', '찌개', '국']}
comm_not_in = []
noodle_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 스튜 - 조림과 찌개의 중간단계
# https://namu.wiki/w/%EC%8A%A4%ED%8A%9C
keywords = ['스튜', '조림'] 
not_in_keywords = {}
comm_not_in = []
stew_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 한국 전통 샐러드
keywords = ['나물', '무침'] 
not_in_keywords = {'나물':['콩나물', '밥']}
comm_not_in = []
namul_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 피클
keywords = ['피클'] 
not_in_keywords = {}
comm_not_in = []
pickle_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 뚝배기 - 없음 -> 제외
keywords = ['뚝배기', '돌솥'] 
not_in_keywords = {}
comm_not_in = []
dduk_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

dduk_menus

In [None]:
# 샐러드
keywords = ['샐러드'] 
not_in_keywords = {}
comm_not_in = []
salad_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 우유
# 우유가 들어간 식재료(크림, 요거트 등)종류로 변경
keywords = ['까르보나라', '크림', '요거트'] 
not_in_keywords = {}
comm_not_in = ['샐러드', 'D', '드레싱', '소스'] # 샐러드 드레싱, 디핑소스 제외
milk_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 빵, 쿠키
# https://ko.wikipedia.org/wiki/%EB%B9%B5_%EB%AA%A9%EB%A1%9D
keywords = ['와플', '케이크', '케잌', '바게트', '도넛', '도너츠', '핫도그', '도라야키', '베이글', '번', '비스킷', '스콘', '토스트', '브레드', '포카차', '피자', '호두과자', '쿠키'] 
not_in_keywords = {}
comm_not_in = []
bread_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
# 음료
# https://ko.wikipedia.org/wiki/%EB%B9%B5_%EB%AA%A9%EB%A1%9D
keywords = ['주스', '쥬스', '수정과', '식혜', '식초', '코코아', '칵테일', '스무디', '우유', '셰이크', '야쿠르트', '요구르트', '커피', '차', '탄산수', '음료'] 
not_in_keywords = {'차':['차돌']}
comm_not_in = []
drink_menus = extractMenu(tot_menu_arr, keywords, not_in_keywords, comm_not_in)

In [None]:
def get_food_one_hot(x, menu_array):
  """Return 1 if the daily menu string ``x`` contains a dish from ``menu_array``.

  ``x`` is split on whitespace and each token is cleaned the same way as when
  the menu vocabulary was built: the parenthesised part is removed using the
  module-level regex ``pattern``, and empty / '*' tokens and leftover
  parenthesis fragments are skipped.

  Args:
    x: whitespace-separated menu string for one meal.
    menu_array: container of dish names belonging to one category.

  Returns:
    1 when at least one cleaned token is in ``menu_array``, otherwise 0.
  """
  for menu_nm in x.strip().split():
    menu_nm = re.sub(pattern, '', menu_nm)
    if menu_nm.strip() in ['', '*']:
      continue
    if menu_nm[0] == '(' or menu_nm[-1] == ')':
      continue
    # Plain membership test instead of .index() wrapped in a catch-all
    # except: works for any container and no longer hides real errors.
    if menu_nm in menu_array:
      return 1
  return 0

In [None]:
# Merge: add one 0/1 column per (meal, menu category) to train and test.
# menu_col_nm[i] is the column suffix for the dish list menu_data_arr[i];
# the two lists must stay in the same order.
menu_col_nm = ['육류', '난류', '죽류', '덮밥_국밥류', '비빔밥_볶음밥류', '국탕류', '구이류', '전류', '튀김류', '곡물', '콩류',
               '묵', '생선_조개류', '채소류', '해조류', '장류', '김치', '만두', '곡물가루', '과일', '쌀', '김밥_초밥', '절임류',
               '면류', '스튜', '나물_무침류', '피클', '샐러드', '우유', '빵류', '음료', '돼지고기', '소고기', '닭고기', '오리고기']
menu_data_arr = [meat_menus, egg_menus, juk_menus, gukbob_menus, bb_menus, soup_menus, gui_menus, jeon_menus, fry_menus, grain_menus, bean_menus,
                 kor_jelly_menus, fish_shell_menus, vegetable_menus, sea_alg_menus, jang_menus, kimchi_menus, mandu_menus, powder_menus, fruit_menus, rice_menus, gimbab_menus, saused_menus,
                 noodle_menus, stew_menus, namul_menus, pickle_menus, salad_menus, milk_menus, bread_menus, drink_menus, pig_menus, beef_menus, chicken_menus, duck_menus]

for col_type in ['중식메뉴', '석식메뉴']:
  for i, menu_arr in enumerate(menu_data_arr):
    train[col_type + '_' + menu_col_nm[i]] = train[col_type].apply(lambda x: get_food_one_hot(x, menu_arr))
    # The test features must come from the test menus; previously they were
    # computed from train[col_type] and aligned onto test by row index.
    test[col_type + '_' + menu_col_nm[i]] = test[col_type].apply(lambda x: get_food_one_hot(x, menu_arr))

In [None]:
df = pd.concat([train, test], axis=0).reset_index(drop=True)
df = df.fillna(0)
df.columns = ['일자', '요일', '정원', '휴가자', '출장자', '야근자', '재택근무자', '조식', '중식', '석식',\
              '중식계', '석식계', '중식메뉴_육류',
       '중식메뉴_난류', '중식메뉴_죽류', '중식메뉴_덮밥_국밥류', '중식메뉴_비빔밥_볶음밥류', '중식메뉴_국탕류',
       '중식메뉴_구이류', '중식메뉴_전류', '중식메뉴_튀김류', '중식메뉴_곡물', '중식메뉴_콩류', '중식메뉴_묵',
       '중식메뉴_생선_조개류', '중식메뉴_채소류', '중식메뉴_해조류', '중식메뉴_장류', '중식메뉴_김치', '중식메뉴_만두',
       '중식메뉴_곡물가루', '중식메뉴_과일', '중식메뉴_쌀', '중식메뉴_김밥_초밥', '중식메뉴_절임류', '중식메뉴_면류',
       '중식메뉴_스튜', '중식메뉴_나물_무침류', '중식메뉴_피클', '중식메뉴_샐러드', '중식메뉴_우유', '중식메뉴_빵류',
       '중식메뉴_음료', '중식메뉴_돼지고기', '중식메뉴_소고기', '중식메뉴_닭고기', '중식메뉴_오리고기', '석식메뉴_육류',
       '석식메뉴_난류', '석식메뉴_죽류', '석식메뉴_덮밥_국밥류', '석식메뉴_비빔밥_볶음밥류', '석식메뉴_국탕류',
       '석식메뉴_구이류', '석식메뉴_전류', '석식메뉴_튀김류', '석식메뉴_곡물', '석식메뉴_콩류', '석식메뉴_묵',
       '석식메뉴_생선_조개류', '석식메뉴_채소류', '석식메뉴_해조류', '석식메뉴_장류', '석식메뉴_김치', '석식메뉴_만두',
       '석식메뉴_곡물가루', '석식메뉴_과일', '석식메뉴_쌀', '석식메뉴_김밥_초밥', '석식메뉴_절임류', '석식메뉴_면류',
       '석식메뉴_스튜', '석식메뉴_나물_무침류', '석식메뉴_피클', '석식메뉴_샐러드', '석식메뉴_우유', '석식메뉴_빵류',
       '석식메뉴_음료', '석식메뉴_돼지고기', '석식메뉴_소고기', '석식메뉴_닭고기', '석식메뉴_오리고기']
df.drop(columns=['조식', '중식', '석식'], inplace=True)

### 외부 데이터 추가

In [None]:
dust_dir = os.path.join(path, '미세먼지_일별')
wdata_dir = os.path.join(path, '날씨_시간별')

In [None]:
w_attrs = ['강수', '기온', '습도', '강수형태']
w_years = os.listdir(wdata_dir)

In [None]:
def get_wdata(data_path, dtype='num'):
  """Read one weather file and return its 12:00 and 18:00 readings.

  The file is read line by line until the first blank line. The first line and
  every later line without a comma carry the current year-month: the last
  whitespace-separated word of the last comma field, minus its final two
  characters. All other lines must be ``day, hour, value`` rows; only hours
  "1200" and "1800" are kept.

  Args:
    data_path: path of the file to read.
    dtype: 'num' keeps values as float; anything else stores
      ``str(round(float(value)))``.

  Returns:
    (dates, noon_values, evening_values) where ``dates`` holds one
    'YYYY-MM-DD' string per 12:00 row.
  """
  noon_dates, noon_values, evening_values = [], [], []
  month_tag = ''

  with open(data_path, 'r') as f:
    for line_no, raw_line in enumerate(f.readlines()):
      stripped = raw_line.strip()
      if stripped == '':
        break
      fields = [part.strip() for part in stripped.split(',')]

      # header line or month separator: remember the current year-month
      if line_no == 0 or len(fields) == 1:
        month_tag = fields[-1].split()[-1][:-2]
        continue

      day, hour, value = fields
      if hour not in ("1200", "1800"):  # lunch = 12:00, dinner = 18:00
        continue

      reading = float(value)
      if dtype != 'num':
        reading = str(round(reading))

      if hour == "1200":
        noon_dates.append(month_tag[:4] + '-' + month_tag[4:] + '-' + '%02d' % int(day))
        noon_values.append(reading)
      else:
        evening_values.append(reading)

  return noon_dates, noon_values, evening_values

In [None]:
# Precipitation, temperature, humidity and precipitation-type data.
# One accumulator per variable and time of day (_12 = 12:00, _18 = 18:00),
# plus the list of dates shared by all of them.
w_data_rain_12 = []
w_data_temp_12 = []
w_data_hum_12 = []
w_data_rtype_12 = []
w_data_rain_18 = []
w_data_temp_18 = []
w_data_hum_18 = []
w_data_rtype_18 = []
w_datetime = []

# Each entry of w_years is a sub-directory name of wdata_dir, used as the year.
for year in w_years:
  w_subdir = os.path.join(wdata_dir, year)
  file_names = os.listdir(w_subdir)  # not used afterwards in this cell
  file_name = ""
  # Files are named <year>01_<year>12.csv, except 2021 which ends at month 04.
  if year != '2021':
    file_name = f'{year}01_{year}12.csv'
  else:
    file_name = f'{year}01_{year}04.csv'
  file_path_rain = os.path.join(w_subdir, '충무공동_강수_'+file_name)
  file_path_temp = os.path.join(w_subdir, '충무공동_기온_'+file_name)
  file_path_hum = os.path.join(w_subdir, '충무공동_습도_'+file_name)
  file_path_rtype = os.path.join(w_subdir, '충무공동_강수형태_'+file_name)

  datetime_list_rain, value_list_rain_12, value_list_rain_18 = get_wdata(file_path_rain, dtype='num') # precipitation
  datetime_list_temp, value_list_temp_12, value_list_temp_18 = get_wdata(file_path_temp, dtype='num') # temperature
  datetime_list_hum, value_list_hum_12, value_list_hum_18 = get_wdata(file_path_hum, dtype='num') # humidity
  datetime_list_rtype, value_list_rtype_12, value_list_rtype_18 = get_wdata(file_path_rtype, dtype='cat') # precipitation type
  
  # Only the dates from the precipitation file are kept; the other three
  # value lists are appended in the same order.
  # NOTE(review): this assumes all four files list the same days - confirm.
  w_datetime   += datetime_list_rain
  w_data_rain_12  += value_list_rain_12
  w_data_temp_12  += value_list_temp_12
  w_data_hum_12   += value_list_hum_12
  w_data_rtype_12 += value_list_rtype_12
  w_data_rain_18  += value_list_rain_18
  w_data_temp_18  += value_list_temp_18
  w_data_hum_18   += value_list_hum_18
  w_data_rtype_18 += value_list_rtype_18

In [None]:
w_df = pd.DataFrame({'일자':pd.Series(w_datetime, dtype='datetime64[ns]'),
                   'rain_lunch':pd.Series(w_data_rain_12, dtype='float'),
                   'temp_lunch':pd.Series(w_data_temp_12, dtype='float'),
                   'hum_lunch':pd.Series(w_data_hum_12, dtype='float'),
                   'rain_type_lunch':pd.Series(w_data_rtype_12, dtype='str'),
                   'rain_dinner':pd.Series(w_data_rain_18, dtype='float'),
                   'temp_dinner':pd.Series(w_data_temp_18, dtype='float'),
                   'hum_dinner':pd.Series(w_data_hum_18, dtype='float'),
                   'rain_type_dinner':pd.Series(w_data_rtype_18, dtype='str')})

In [None]:
# 불쾌지수 컬럼 추가
# https://dacon.io/competitions/official/235736/codeshare/2753?page=1&dtype=recent
w_df['discomfort_index_lunch'] = 1.8*w_df['temp_lunch'] - 0.55*(1-w_df['hum_lunch']/100)*(1.8*w_df['temp_lunch']-26) + 32
w_df['discomfort_index_dinner'] = 1.8*w_df['temp_dinner'] - 0.55*(1-w_df['hum_dinner']/100)*(1.8*w_df['temp_dinner']-26) + 32

In [None]:
# Read the daily dust spreadsheets (one .xls per month, file name = yyyymm)
# into three parallel lists: date string, column 1 value, column 2 value.
dust_file_paths = glob.glob(os.path.join(dust_dir, '*.xls'))
d_datetime = []
d_value1 = []
d_value2 = []

# The hourly data has a fair number of missing dust measurements, so the
# daily files are used instead.
for file_path in dust_file_paths:
  date_yyyymm = os.path.splitext(os.path.basename(file_path))[0] # yyyymm
  date_year = date_yyyymm[:4]
  date_mon = date_yyyymm[4:]

  # The 2021 files are read with three leading rows skipped.
  if date_year == '2021':
    dust_df = pd.read_excel(file_path, header=[0, 1], skiprows=3)
  else:
    dust_df = pd.read_excel(file_path, header=[0, 1])
  cols = dust_df.columns
  date_col = cols[0]
  fine_dust_col = cols[1] # fine dust
  ufine_dust_col = cols[2] # ultra-fine dust

  # Number of days in this month
  days = calendar.monthrange(int(date_year), int(date_mon))[1]
  for day in range(1, days+1):
    curr_date = date_year + '-' + date_mon + '-' + '%02d' % day

    # One lookup per day serves both value columns. A day missing from the
    # sheet still raises IndexError here, as before.
    day_rows = dust_df[dust_df[date_col] == curr_date]

    d_datetime.append(curr_date)
    d_value1.append(day_rows[fine_dust_col].values[0])
    d_value2.append(day_rows[ufine_dust_col].values[0])

In [None]:
dust_df = pd.DataFrame({'일자':pd.Series(d_datetime, dtype='datetime64[ns]'),
                   'fine_dust':pd.Series(d_value1, dtype='float'),
                   'ultra_fine_dust':pd.Series(d_value2, dtype='float')})

In [None]:
# dust_df and w_df carry '일자' as datetime64; make the left key the same
# dtype so the merges do not fail on (or mis-handle) a string/datetime pair.
df['일자'] = pd.to_datetime(df['일자'])
df = pd.merge(df, dust_df, on='일자')
df = pd.merge(df, w_df, on='일자')

# Replace missing values with nearby observations.
# .loc with an explicit row list writes into df itself; the previous chained
# form df[col][a, b] = ... indexed a Series with a tuple.
df.loc[[564, 1129], 'fine_dust'] = [36, 23]
df.loc[[234, 235, 564, 654, 1129], 'ultra_fine_dust'] = [11, 31, 26, 5, 9]

# Drop the dinner columns
# df = df.drop(columns=['rain_dinner', 'temp_dinner', 'hum_dinner', 'rain_type_dinner', 'discomfort_index_dinner'])

# Dust level as a category (0-3). The class boundaries are contiguous so that
# fractional readings such as 30.5 no longer fall through to class 3;
# negative or NaN values still map to 3 as before.
df['fine_degree'] = df['fine_dust'].apply(lambda x : 0 if 0<=x<=30 else (1 if 30<x<=80 else (2 if 80<x<=150 else 3)))
df['ultra_fine_degree'] = df['ultra_fine_dust'].apply(lambda x : 0 if 0<=x<=15 else (1 if 15<x<=35 else (2 if 35<x<=75 else 3)))
# Overall level = the worse of the two
df['fine_degree'] = df[['fine_degree', 'ultra_fine_degree']].max(axis=1)

# Precipitation as a binary flag (1 when the reading is 2 or more)
df['rain_degree_lunch'] = df['rain_lunch'].apply(lambda x : 0 if x < 2 else 1)
df['rain_degree_dinner'] = df['rain_dinner'].apply(lambda x : 0 if x < 2 else 1)

# Discomfort index as a category (0-3) with cut points 68 / 75 / 80
df['discomfort_degree_lunch'] = df['discomfort_index_lunch'].apply(lambda x : 0 if x<68 else (1 if 68<=x<75 else (2 if 75<=x<80 else 3)))
df['discomfort_degree_dinner'] = df['discomfort_index_dinner'].apply(lambda x : 0 if x<68 else (1 if 68<=x<75 else (2 if 75<=x<80 else 3)))

# Drop the raw columns now that the categorical versions exist
df = df.drop(columns=['rain_lunch', 'temp_lunch', 'hum_lunch', 'rain_type_lunch', 'discomfort_index_lunch',
                      'rain_dinner', 'temp_dinner', 'hum_dinner', 'rain_type_dinner', 'discomfort_index_dinner',
                      'fine_dust', 'ultra_fine_dust', 'ultra_fine_degree',])

## 중간 변수 색출

### 파생변수

In [None]:
path = '/content/drive/MyDrive/구내식당/water/'
df = pd.read_csv(path+'df.csv')
train = pd.read_csv(path+'train.csv')
holiday = pd.read_csv(path+'holidays.csv', index_col=0)
corona = pd.read_csv(path+'corona_data.csv')

In [None]:
# 코로나 데이터 추가
corona = corona.drop_duplicates(['일자'])
check_corona = corona[['일자', '누적검사자']]

df = pd.merge(df,corona[['일자', '일일검사자']], on='일자', how='left')
df = df.fillna(0)

In [None]:
# Parse the date columns. pd.to_datetime replaces the unit-less
# astype('datetime64') cast, which newer pandas versions reject.
df['일자'] = pd.to_datetime(df['일자'])
corona['일자'] = pd.to_datetime(corona['일자'])

# Headcount features: capacity ('정원') minus various combinations of people
# on leave ('휴가자'), on business trips ('출장자') and working from home
# ('재택근무자'), optionally plus overtime workers ('야근자').
df['식사가능인원'] = df['정원']-(df['휴가자']+df['출장자'])+df['야근자']
df['휴가출장'] = df['정원']-(df['휴가자']+df['출장자'])
df['휴가출장재택'] = df['정원']-(df['휴가자']+df['출장자']+df['재택근무자'])
df['휴가출장재택야근'] = df['정원']-(df['휴가자']+df['출장자']+df['재택근무자'])+df['야근자']
df['휴가'] = df['정원']-(df['휴가자'])
df['휴가야근'] = df['정원']-(df['휴가자'])+df['야근자']
df['휴가재택야근'] = df['정원']-(df['휴가자']+df['재택근무자'])+df['야근자']
df['휴가재택'] = df['정원']-(df['휴가자']+df['재택근무자'])
# df['휴가비율'] = df['휴가자']/df['정원']
# df['출장비율'] = df['출장자']/df['정원']
# df['야근비율'] = df['야근자']/df['출근인원']
# df['재택비율'] = df['재택근무자']/df['정원']
# Total number of people away on leave or on a business trip
df['휴가_출장'] = df['휴가자']+df['출장자']

In [None]:
df['년'] = df['일자'].dt.year
df['월'] = df['일자'].dt.month
df['년월'] = df['년'].astype('str')+'_'+df['월'].astype('str')

In [None]:
first_dayofmonth = []
last_dayofmonth = []
for i in df['년월'].unique() :
    first_dayofmonth.append(df[df['년월']==i].iloc[0].name)
    last_dayofmonth.append(df[df['년월']==i].iloc[-1].name)

In [None]:
df['첫_출근일'] = df.apply(lambda x : 1 if x.name in first_dayofmonth else 0, axis=1)
df['마지막_출근일'] = df.apply(lambda x : 1 if x.name in last_dayofmonth else 0, axis=1)

In [None]:
# Calendar features derived from the date column.
df['년'] = df['일자'].dt.year
df['월'] = df['일자'].dt.month
df['일'] = df['일자'].dt.day
# Season code per month: 0 = Mar-May, 1 = Jun-Aug, 2 = Sep-Nov, 3 = Dec-Feb
month_to_season = {1: 3,2: 3,3:0,4:0,5:0,6:1,7:1,8:1,9:2,10:2,11:2,12: 3}
df['계절'] = df['월'].apply(lambda x : month_to_season[x])

# From here on '요일' is an integer weekday (Monday = 0 ... Sunday = 6).
df['요일'] = df['일자'].dt.weekday
df['is_monday'] = df['요일'].apply(lambda x : 1 if (x==0) else 0)
# Wednesday (2) / Friday (4) flag. The previous comparison against the
# strings '수' and '금' could never be true once '요일' had become an integer.
df['야근_가능'] = df['요일'].apply(lambda x : 1 if x in (2, 4) else 0)
df['정책_변화'] = df['일자'].apply(lambda x : 0 if x < pd.to_datetime('2019-01-04') else 1)

# ISO week of the year; dt.weekofyear is deprecated and removed in pandas 2.
df['연기준몇주째'] = df['일자'].dt.isocalendar().week.astype(int)
df['월마지막일여부'] =df['일자'].dt.is_month_end

# Split back into train/test by row position. Copies are taken so that
# columns can later be added to them without writing into a slice of df.
train = df.iloc[:train.shape[0], :].copy()
test = df.iloc[train.shape[0]:, :].copy()

In [None]:
import pendulum

# Work on real copies so the assignments below modify train/test themselves
# rather than a temporary slice of df.
train = train.copy()
test = test.copy()

# Week-of-month as computed by pendulum.
train['주차'] = train['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)
test['주차'] = test['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)

# Some dates come back with a negative week_of_month. For 2017 and 2021 those
# rows get the ISO week number of the date instead (isocalendar().week
# replaces the deprecated .dt.week).
repair_2017 = train.loc[(train['년']==2017)&(train['주차']<0), '일자'].dt.isocalendar().week.astype(int)
repair_2021 = train.loc[(train['년']==2021)&(train['주차']<0), '일자'].dt.isocalendar().week.astype(int)
test_repair = test.loc[(test['년']==2021)&(test['주차']<0), '일자'].dt.isocalendar().week.astype(int)

train.loc[repair_2017.index, '주차'] = repair_2017.values
train.loc[repair_2021.index, '주차'] = repair_2021.values
test.loc[test_repair.index, '주차'] = test_repair.values
# Remaining rows with the value -46 are set to week 6. A scalar is assigned
# so this works for any number of such rows, not only exactly three.
train.loc[train['주차']==-46, '주차'] = 6

df = pd.concat([train, test])
df['월_주차'] = df['년'].astype('str')+'_'+df['월'].astype('str')+'_'+df['주차'].astype('str')

train = df.iloc[:train.shape[0], :]
test = df.iloc[train.shape[0]:, :]

In [None]:
df['is_corona'] = df['일자'].apply(lambda x : 0 if x < pd.to_datetime('2020-01-06') else 1)

In [None]:
# Add public-holiday features: flag rows whose next / previous calendar day
# is listed in the holiday table.
holiday['date'] = pd.to_datetime(holiday['date'])
df['일자'] = pd.to_datetime(df['일자'])
# Vectorised membership test; the previous version rebuilt
# holiday['date'].tolist() for every row.
df['before_holiday'] = (df['일자'] + dt.timedelta(1)).isin(holiday['date']).astype(int)
df['after_holiday'] = (df['일자'] - dt.timedelta(1)).isin(holiday['date']).astype(int)

In [None]:
# # 이벤트 데이터 고민 - 복날 / 연말
event = pd.to_datetime(['2016-08-16', '2016-07-27', '2016-07-18', '2017-08-11', '2017-07-21','2017-07-12', '2018-08-16', '2018-07-27', '2018-07-17', '2019-08-12', '2019-07-22', '2019-07-12', '2020-07-27', '2020-07-16'])
df['복날'] = df['일자'].apply(lambda x : 1 if x in event else 0)

# end_year = df[(df['월']==12)&(df['일']>=21)].index
# df['연말'] = df.apply(lambda x : 1 if  x.name in end_year else 0, axis=1)

# 명절 전 영업일 여부 + 복날
event = pd.to_datetime(['2016-02-05', '2019-09-13', '2017-01-26','2017-09-29', '2018-02-14', '2018-09-21', '2019-02-01', '2019-09-11', '2020-01-23', '2020-09-30', '2021-02-10'])
df['명절_이전_영업일'] = df['일자'].apply(lambda x : 1 if x in event else 0)

In [None]:
df['휴식기간'] = df['일자'].diff().dt.days.fillna(0).astype('int')
df['휴식기간'] = df['휴식기간'].apply(lambda x: 1 if x>=4 else 0)

In [None]:
# Checked whether lunch/dinner counts are higher in weeks where the headcount
# changed - they were not.
# Day-over-day change in capacity; the first row has no predecessor, so its
# change is set to 0 (done on the Series before it is stored, instead of a
# chained df[col][0] = ... assignment).
staff_change = df['정원'].diff()
staff_change.iloc[0] = 0
df['인원변화'] = staff_change
# 2 = increase of more than 10, 1 = change within +/-10, 0 = drop of more than 10
df['변화'] = df['인원변화'].apply(lambda x : 2 if x>10 else (1 if -10<=x<=10 else 0))

# Mean lunch/dinner counts on the rows just before a large drop vs. all others
display(df[df.index.isin(df[df['변화']==0].index-1)][['중식계', '석식계']].mean())
display(df[~df.index.isin(df[df['변화']==0].index-1)][['중식계', '석식계']].mean())

df['출장자제외'] = df['정원'] - df['출장자']
df['재택근무제외'] = df['정원'] - df['재택근무자']
df['연기준몇일째']= df['일자'].dt.dayofyear
# ISO week of the year; dt.weekofyear is deprecated and removed in pandas 2.
df['연기준몇주째']= df['일자'].dt.isocalendar().week.astype(int)
df['월일수']= df['일자'].dt.days_in_month
df['윤년여부'] = df['일자'].dt.is_leap_year
df['월시작일여부'] = df['일자'].dt.is_month_start
df['월마지막일여부'] =df['일자'].dt.is_month_end
df['분기시작일여부'] =df['일자'].dt.is_quarter_start
df['분기마지막일여부'] =df['일자'].dt.is_quarter_end
df['연시작일여부'] =df['일자'].dt.is_year_start
df['연마지막일여부'] =df['일자'].dt.is_year_end

### 데이터 정규화

In [None]:
def get_one_hot(x, target_val):
    """Return 1 when ``x`` equals ``target_val``, otherwise 0."""
    return 1 if x == target_val else 0

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns to expand into 0/1 indicator columns on a copy of df.
onehot_col = ['년', '월', '요일', '계절']
df_tmp = df.copy()
# df = pd.concat([df[list((set(df.columns)-set(onehot_col)))],\
#                 pd.get_dummies(df[onehot_col])], axis=1)

# Category values to emit for each entry of onehot_col (same order).
sub_types = [list(range(2016, 2022)), list(range(1, 13)), list(range(5)), list(range(4))]

# One '<column>_<value>' indicator per listed value.
for col_type, class_values in zip(onehot_col, sub_types):
    for class_nm in class_values:
        df_tmp[f'{col_type}_{class_nm}'] = (df_tmp[col_type] == class_nm).astype(int)

In [None]:
# df[['년', '월']] = df[['년', '월']].astype('category')

In [None]:
# df.drop(columns=['정원', '휴가자', '출장자', '야근자', '년월'], inplace=True)

In [None]:
# from sklearn.preprocessing import LabelEncoder
# # lbe = LabelEncoder()
# # df[['년']] = lbe.fit_transform(df[['년']])
# onehot_col = ['년', '월','요일', '계절']
# df = pd.concat([df[list((set(df.columns)-set(onehot_col)))],\
#                 pd.get_dummies(df[onehot_col])], axis=1)

In [None]:
# Hand-maintained list of dates flagged in the '국정감사' column.
gukgam = pd.to_datetime([
    '2016-10-04', '2016-10-05', '2016-10-06', '2016-10-07',
    '2017-10-10', '2017-10-11', '2017-10-12', '2017-10-13',
    '2018-10-08', '2018-10-10', '2018-10-11', '2018-10-12',
    '2019-10-02', '2019-10-04',
    '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08',
])
# gukgam = ['2019-10-04', '2019-10-02', '2018-10-10']
# 1 for rows whose date is in the list above, otherwise 0.
df_tmp['국정감사'] = df_tmp['일자'].isin(gukgam).astype(int)

In [None]:
# train = df.iloc[:train.shape[0], :]
# test = df.iloc[train.shape[0]:, :]

# Split the combined frame back by position: the first len(train) rows are
# the training rows, the remainder the test rows.
n_train_rows = train.shape[0]
train, test = df_tmp.iloc[:n_train_rows, :], df_tmp.iloc[n_train_rows:, :]

In [None]:
# Remove these columns from the training frame (the year/month/weekday/season
# columns were one-hot expanded in an earlier cell).
cols_to_drop = ['일자','월_주차', '정원', '년', '월', '요일',  '계절']
train = train.drop(cols_to_drop, axis=1)

In [None]:
# Cast the calendar flag columns to integers.
flag_cols = ['월마지막일여부', '윤년여부', '월시작일여부', '분기시작일여부', '분기마지막일여부','연시작일여부', '연마지막일여부']
train[flag_cols] = train[flag_cols].astype(int)

In [None]:
# train_1: every column whose name mentions neither '석식' nor 'dinner'.
lunch_side_cols = [name for name in train.columns if not ('석식' in name or 'dinner' in name)]
train_1 = train[lunch_side_cols]
# train_2: every column whose name mentions neither '중식' nor 'lunch'.
dinner_side_cols = [name for name in train.columns if not ('중식' in name or 'lunch' in name)]
train_2 = train[dinner_side_cols]

## AutoML

In [None]:
# Outlier removal.
# Dinner frame: drop rows with a zero dinner total, then specific dates.
train_2 = train_2.loc[train_2['석식계'] != 0.0]
for yy, mm, dd in [(2016, 10, 5), (2019, 9, 11), (2019, 12, 23), (2019, 12, 30), (2020, 1, 23)]:
    on_that_date = (train_2[f'년_{yy}'] == 1) & (train_2[f'월_{mm}'] == 1) & (train_2['일'] == dd)
    train_2 = train_2.loc[~on_that_date]

# Lunch frame: drop specific dates.
for yy, mm, dd in [(2016, 10, 5), (2017, 12, 28), (2020, 12, 2), (2018, 9, 14), (2018, 12, 24)]:
    on_that_date = (train_1[f'년_{yy}'] == 1) & (train_1[f'월_{mm}'] == 1) & (train_1['일'] == dd)
    train_1 = train_1.loc[~on_that_date]

In [None]:
# Candidate feature list for the lunch model.  Entries that are commented out
# below are candidates that are currently disabled.
# NOTE(review): temp_list is not referenced by the cells visible here -
# confirm whether it is still needed.
temp_list = [ '년_2016',
 '년_2017',
 '년_2018',
 '년_2019',
 '년_2020',
 '년_2021',
 '월_1',
 '월_2',
 '월_3',
 '월_4',
 '월_5',
 '월_6',
 '월_7',
 '월_8',
 '월_9',
 '월_10',
 '월_11',
 '월_12',
 '요일_0',
 '요일_1',
 '요일_2',
 '요일_3',
 '요일_4',
 'is_corona',
 'fine_degree',
 'rain_degree_lunch',
 'discomfort_degree_lunch',
 '일',
 '식사가능인원',
 '재택근무자',
 '출장자',
 '휴가자',
 'after_holiday',
 '연기준몇주째',
 '월일수',
 '국정감사',
#   '중식메뉴_육류', '중식메뉴_난류', '중식메뉴_덮밥_국밥류', '중식메뉴_비빔밥_볶음밥류', '중식메뉴_국탕류', '중식메뉴_구이류', '중식메뉴_전류',
#    '중식메뉴_튀김류', '중식메뉴_콩류', '중식메뉴_묵', '중식메뉴_생선_조개류', '중식메뉴_채소류', '중식메뉴_해조류', '중식메뉴_장류', '중식메뉴_만두', '중식메뉴_곡물가루', '중식메뉴_과일', 
#   '중식메뉴_김밥_초밥', '중식메뉴_절임류', '중식메뉴_면류', '중식메뉴_스튜', '중식메뉴_샐러드', '중식메뉴_우유', '중식메뉴_빵류', '중식메뉴_돼지고기', '중식메뉴_소고기', '중식메뉴_닭고기', '중식메뉴_오리고기',
#  'is_monday',
#   '중식메뉴_육류',
#  '중식메뉴_난류',
#  '중식메뉴_죽류',
#  '중식메뉴_덮밥_국밥류',
#  '중식메뉴_비빔밥_볶음밥류',
#  '중식메뉴_국탕류',
#  '중식메뉴_구이류',
#  '중식메뉴_전류',
#  '중식메뉴_튀김류',
#  '중식메뉴_곡물',
#  '중식메뉴_콩류',
#  '중식메뉴_묵',
#  '중식메뉴_생선_조개류',
#  '중식메뉴_채소류',
#  '중식메뉴_해조류',
#  '중식메뉴_장류',
#  '중식메뉴_김치',
#  '중식메뉴_만두',
#  '중식메뉴_곡물가루',
#  '중식메뉴_과일',
#  '중식메뉴_쌀',
#  '중식메뉴_김밥_초밥',
#  '중식메뉴_절임류',
#  '중식메뉴_면류',
#  '중식메뉴_스튜',
#  '중식메뉴_나물_무침류',
#  '중식메뉴_피클',
#  '중식메뉴_샐러드',
#  '중식메뉴_우유',
#  '중식메뉴_빵류',
#  '중식메뉴_음료',
#  '중식메뉴_돼지고기',
#  '중식메뉴_소고기',
#  '중식메뉴_닭고기',
#  '중식메뉴_오리고기',
#  '주차'
#  '변화', '인원변화', '휴식기간', '명절_이전_영업일','복날',
#  'before_holiday', '주차', '정책_변화',
#  , '야근_가능' '일일검사자','첫_출근일',
#  '마지막_출근일',
 ]

* 의미가 큰 변수  
    * 날씨, 일, 
    * 식사가능인원 / 재택근무자 / 출장자 / 휴가자
    * 휴일 다음날 
* 의미가 없는 것
    * 계절, 첫_출근일, 마지막_출근일



* 60.6889

In [None]:
automl = AutoML()

# Target column for the lunch model.
y_train = train_1['중식계']

# Feature columns, assembled group by group; the resulting order is reused to
# select the test-side columns.
staffing_cols = ['휴가자', '출장자', '야근자', '재택근무자']
menu_kinds = [
    '육류', '난류', '덮밥_국밥류', '비빔밥_볶음밥류', '국탕류', '구이류', '전류',
    '튀김류', '콩류', '묵', '생선_조개류', '채소류', '해조류', '장류', '만두',
    '곡물가루', '과일', '김밥_초밥', '절임류', '면류', '스튜', '샐러드', '우유',
    '빵류', '돼지고기', '소고기', '닭고기', '오리고기',
]
menu_cols = ['중식메뉴_' + kind for kind in menu_kinds]
context_cols = [
    '식사가능인원', '일', '야근_가능', '연기준몇주째', '월마지막일여부', '주차',
    'is_corona', 'fine_degree', 'rain_degree_lunch', 'discomfort_degree_lunch',
    'before_holiday', 'after_holiday',
]
indicator_cols = (
    [f'년_{year}' for year in range(2016, 2022)]
    + [f'월_{month}' for month in range(1, 13)]
    + [f'요일_{weekday}' for weekday in range(5)]
    + [f'계절_{season}' for season in range(4)]
)
X_train = train_1[staffing_cols + menu_cols + context_cols + indicator_cols]

# AutoML goal and constraint: regression, MAE metric, 120-second budget.
automl_settings = dict(time_budget=120, metric='mae', task='regression')

# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
# Predict
# print(automl.predict(X_train))
# Show the best model found.
print(automl.model) # 62.8802
# 61.8124 - after outlier removal
# 61.8124 - 이상치 제거

In [None]:
# Select the test columns in the same order as the training features, then predict.
feature_names = X_train.columns.tolist()
X_test = test[feature_names]
pred1 = automl.predict(X_test)

In [None]:
# Show the column names of the dinner training frame.
train_2.columns.tolist()

* 42.8

In [None]:
# Candidate feature list for the dinner model.  Entries that are commented out
# below are candidates that are currently disabled.
# NOTE(review): temp_din is not referenced by the cells visible here -
# confirm whether it is still needed.
temp_din = [ '년_2016',
 '년_2017',
 '년_2018',
 '년_2019',
 '년_2020',
 '년_2021',
 '월_1',
 '월_2',
 '월_3',
 '월_4',
 '월_5',
 '월_6',
 '월_7',
 '월_8',
 '월_9',
 '월_10',
 '월_11',
 '월_12',
 '요일_0',
 '요일_1',
 '요일_2',
 '요일_3',
 '요일_4',
  '일',
 '식사가능인원',
 '휴가자',
 '야근자',
 '출장자',
 'is_corona',
 '명절_이전_영업일',
 '휴식기간',
  '계절_0',
 '계절_1',
 '계절_2',
 '계절_3',
#  '국정감사',
#  '월일수',
#  '변화',
#  'before_holiday',
# 'after_holiday',
#  '연기준몇주째',
#  '야근_가능',
#  '첫_출근일',
#  '마지막_출근일',
#  '재택근무자'
#    '일일검사자',
#    'fine_degree',
#  'rain_degree_dinner',
#  'discomfort_degree_dinner',
]

In [None]:
automl = AutoML()
# Target column for the dinner model.
y_train = train_2['석식계']

# Feature columns, assembled group by group; the resulting order is reused to
# select the test-side columns.
staffing_cols = ['휴가자', '출장자', '야근자', '재택근무자']
menu_kinds = [
    '육류', '난류', '덮밥_국밥류', '비빔밥_볶음밥류', '국탕류', '구이류', '전류',
    '튀김류', '콩류', '묵', '생선_조개류', '채소류', '해조류', '장류', '만두',
    '곡물가루', '과일', '김밥_초밥', '절임류', '면류', '스튜', '샐러드', '우유',
    '빵류', '돼지고기', '소고기', '닭고기', '오리고기',
]
menu_cols = ['석식메뉴_' + kind for kind in menu_kinds]
context_cols = [
    '식사가능인원', '일', '야근_가능', '연기준몇주째', '월마지막일여부', '주차',
    'is_corona', 'fine_degree', 'rain_degree_dinner', 'discomfort_degree_dinner',
    'before_holiday', 'after_holiday', '명절_이전_영업일',
]
indicator_cols = (
    [f'년_{year}' for year in range(2016, 2022)]
    + [f'월_{month}' for month in range(1, 13)]
    + [f'요일_{weekday}' for weekday in range(5)]
    + [f'계절_{season}' for season in range(4)]
)
X_train = train_2[staffing_cols + menu_cols + context_cols + indicator_cols]

# AutoML goal and constraint: regression, MAE metric, 120-second budget.
automl_settings = dict(time_budget=120, metric='mae', task='regression')

# Train with labeled input data.
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
# Predict
# print(automl.predict(X_train))
# Show the best model found.
print(automl.model) # 44.7386
# 43.4131 - with the pre-holiday business day feature added
# 42.7798 - with the pre-holiday business day feature added and outliers removed

In [None]:
# Select the test columns in the same order as the training features, then predict.
feature_names = X_train.columns.tolist()
X_test = test[feature_names]
pred2 = automl.predict(X_test)

In [None]:
# Load the submission template and work on a copy of it.
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
submission = sample_submission.copy()

In [None]:
# Write the lunch and dinner predictions into the submission frame.
for target_col, predictions in (('중식계', pred1), ('석식계', pred2)):
    submission[target_col] = predictions

In [None]:
from datetime import datetime
# Current local time, used as the submission file name.
run_started = datetime.today()
now_tm = run_started.strftime('%Y-%m-%d-%H:%M:%S')

sub_path = '/content/drive/MyDrive/DACON/Dacon_Industry_Meal/submit/'
# Write the submission without the index column.
submission.to_csv(f'{sub_path}{now_tm}.csv', index=False)

## Pycaret

feature_selection - 전진 단계별 선택법

In [None]:
import statsmodels.api as sm
## Stepwise selection (forward entry with backward removal) - lunch
# Candidate explanatory variables: every column except the target and the raw
# date.  Filtering instead of list.remove() avoids a ValueError when '일자'
# is no longer present in the training frame.
lunch_cols = [c for c in train_1.columns if c not in ('중식계', '일자')]
variables = lunch_cols ## explanatory variable candidates

y = train_1['중식계'] ## response variable
selected_variables = [] ## variables selected so far
sl_enter = 0.05   # a candidate enters when its p-value is below this
sl_remove = 0.05  # a selected variable is removed when its p-value reaches this

sv_per_step = [] ## variables selected after each step
adjusted_r_squared = [] ## adjusted R-squared after each step
steps = [] ## step numbers
step = 0
while len(variables) > 0:
    # Keep the candidates in column order; a set difference would make
    # tie-breaking between equal p-values depend on hash order.
    remainder = [v for v in variables if v not in selected_variables]
    if not remainder:
        break  # every candidate is already in the model
    # Explicit float dtype: an index-only Series does not default to a numeric
    # dtype on current pandas.
    pval = pd.Series(index=remainder, dtype=float) ## p-value of each candidate
    ## Fit one linear model per candidate: the variables selected so far
    ## plus that single candidate.
    for col in remainder:
        X = train_1[selected_variables+[col]]
        X = sm.add_constant(X)
        model = sm.OLS(y,X).fit()

        pval[col] = model.pvalues[col]

    min_pval = pval.min()
    if min_pval < sl_enter: ## the smallest p-value beats the entry threshold
        selected_variables.append(pval.idxmin())
        ## Among the selected variables, repeatedly remove the one with the
        ## largest p-value while it is at or above the removal threshold.
        while len(selected_variables) > 0:
            selected_X = train_1[selected_variables]
            selected_X = sm.add_constant(selected_X)
            # Exclude the intercept by label rather than by position:
            # add_constant skips adding one when the data already holds a
            # constant column, in which case [1:] would hide a real variable.
            selected_pval = sm.OLS(y,selected_X).fit().pvalues.drop('const', errors='ignore')
            max_pval = selected_pval.max()
            if max_pval >= sl_remove: ## largest p-value reaches the removal threshold
                remove_variable = selected_pval.idxmax()
                selected_variables.remove(remove_variable)
            else:
                break

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y,sm.add_constant(train_1[selected_variables])).fit().rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables.copy())
    else:
        break

In [None]:
# Adjusted R-squared after each stepwise step; every tick lists the variables
# selected at that step.
font_size = 15
tick_labels = [f'step {s}\n'+'\n'.join(chosen) for s, chosen in zip(steps, sv_per_step)]

fig = plt.figure(figsize=(10,10))
fig.set_facecolor('white')

plt.xticks(steps, tick_labels, fontsize=12)
plt.plot(steps, adjusted_r_squared, marker='o')

plt.ylabel('Adjusted R Squared', fontsize=font_size)
plt.grid(True)
plt.show()

In [None]:
# Keep only the selected predictors plus the target.  Both are taken in one
# indexing step and copied, so no column is assigned onto a slice of train_1
# (which triggers SettingWithCopyWarning).
train_1_tmp = train_1[selected_variables + ['중식계']].copy()
train_1 = train_1_tmp

In [None]:
## Stepwise selection (forward entry with backward removal) - dinner
# Candidate explanatory variables: every column except the target and the raw
# date.  Filtering instead of list.remove() avoids a ValueError when '일자'
# is no longer present in the training frame.
dinner_cols = [c for c in train_2.columns if c not in ('석식계', '일자')]
variables = dinner_cols ## explanatory variable candidates

y = train_2['석식계'] ## response variable
selected_variables_dinner = [] ## variables selected so far
sl_enter = 0.05   # a candidate enters when its p-value is below this
sl_remove = 0.05  # a selected variable is removed when its p-value reaches this

sv_per_step = [] ## variables selected after each step
adjusted_r_squared = [] ## adjusted R-squared after each step
steps = [] ## step numbers
step = 0
while len(variables) > 0:
    # Keep the candidates in column order; a set difference would make
    # tie-breaking between equal p-values depend on hash order.
    remainder = [v for v in variables if v not in selected_variables_dinner]
    if not remainder:
        break  # every candidate is already in the model
    # Explicit float dtype: an index-only Series does not default to a numeric
    # dtype on current pandas.
    pval = pd.Series(index=remainder, dtype=float) ## p-value of each candidate
    ## Fit one linear model per candidate: the variables selected so far
    ## plus that single candidate.
    for col in remainder:
        X = train_2[selected_variables_dinner+[col]]
        X = sm.add_constant(X)
        model = sm.OLS(y,X).fit()

        pval[col] = model.pvalues[col]

    min_pval = pval.min()
    if min_pval < sl_enter: ## the smallest p-value beats the entry threshold
        selected_variables_dinner.append(pval.idxmin())
        ## Among the selected variables, repeatedly remove the one with the
        ## largest p-value while it is at or above the removal threshold.
        while len(selected_variables_dinner) > 0:
            selected_X = train_2[selected_variables_dinner]
            selected_X = sm.add_constant(selected_X)
            # Exclude the intercept by label rather than by position:
            # add_constant skips adding one when the data already holds a
            # constant column, in which case [1:] would hide a real variable.
            selected_pval = sm.OLS(y,selected_X).fit().pvalues.drop('const', errors='ignore')
            max_pval = selected_pval.max()
            if max_pval >= sl_remove: ## largest p-value reaches the removal threshold
                remove_variable = selected_pval.idxmax()
                selected_variables_dinner.remove(remove_variable)
            else:
                break

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y,sm.add_constant(train_2[selected_variables_dinner])).fit().rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables_dinner.copy())
    else:
        break

In [None]:
# Adjusted R-squared after each stepwise step; every tick lists the variables
# selected at that step.
font_size = 15
tick_labels = [f'step {s}\n'+'\n'.join(chosen) for s, chosen in zip(steps, sv_per_step)]

fig = plt.figure(figsize=(10,10))
fig.set_facecolor('white')

plt.xticks(steps, tick_labels, fontsize=12)
plt.plot(steps, adjusted_r_squared, marker='o')

plt.ylabel('Adjusted R Squared', fontsize=font_size)
plt.grid(True)
plt.show()

In [None]:
# Keep only the selected predictors plus the target.  Both are taken in one
# indexing step and copied, so no column is assigned onto a slice of train_2
# (which triggers SettingWithCopyWarning).
train_2_tmp = train_2[selected_variables_dinner + ['석식계']].copy()
train_2 = train_2_tmp

In [None]:
# Test data restricted to the stepwise-selected columns.
cols_lunch, cols_dinner = selected_variables, selected_variables_dinner
test_lunch, test_dinner = test[cols_lunch], test[cols_dinner]

In [None]:
# Initialise the PyCaret regression experiment for the lunch target:
# mean imputation for numeric columns and feature normalisation enabled.
# NOTE(review): `silent=True` is a PyCaret 2.x option (it skips the
# interactive confirmation prompt) - confirm the installed version accepts it.
reg = setup(data=train_1,
            target='중식계',
            numeric_imputation = 'mean',
            normalize = True,
            silent= True)

In [None]:
# Rank the candidate regressors by MAE and keep the top five.
best_5 = compare_models(sort='MAE', n_select=5)
# Blend those five models (5 folds, MAE passed as the optimisation metric).
blended = blend_models(estimator_list= best_5, fold=5, optimize='MAE')
# Called without data, so PyCaret scores the blend on its own hold-out split.
pred_holdout = predict_model(blended)
# Finalise the blended model before predicting on unseen data.
final_model = finalize_model(blended)
# NOTE(review): the full `test` frame is passed here, not the `test_lunch`
# frame built in an earlier cell - confirm this is intended.
pred1 = predict_model(final_model, test)

In [None]:
# Load the submission template and work on a copy of it.
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
submission = sample_submission.copy()

In [None]:
# Renumber the prediction rows from 0 so they line up with the submission
# rows, then copy the 'Label' column over.
lunch_predictions = pred1.reset_index()
submission['중식계'] = lunch_predictions['Label']

In [None]:
# Initialise the PyCaret regression experiment for the dinner target:
# mean imputation for numeric columns and feature normalisation enabled.
# NOTE(review): `silent=True` is a PyCaret 2.x option (it skips the
# interactive confirmation prompt) - confirm the installed version accepts it.
reg = setup(data=train_2,
            target='석식계',
            numeric_imputation = 'mean',
            normalize = True,
            silent= True)

In [None]:
# Rank the candidate regressors by MAE and keep the top five.
best_5 = compare_models(sort='MAE', n_select=5)
# Blend those five models (5 folds, MAE passed as the optimisation metric).
blended = blend_models(estimator_list= best_5, fold=5, optimize='MAE')
# Called without data, so PyCaret scores the blend on its own hold-out split.
pred_holdout = predict_model(blended)
# Finalise the blended model before predicting on unseen data.
final_model = finalize_model(blended)
# NOTE(review): the full `test` frame is passed here, not the `test_dinner`
# frame built in an earlier cell - confirm this is intended.
pred2 = predict_model(final_model, test)

In [None]:
# Renumber the prediction rows from 0 so they line up with the submission
# rows, then copy the 'Label' column over.
dinner_predictions = pred2.reset_index()
submission['석식계'] = dinner_predictions['Label']

In [None]:
sub_path = '/content/drive/MyDrive/DACON/Dacon_Industry_Meal/submit/'
# Two earlier submission files; best_submit is the reference used by show_mae.
best_submit = pd.read_csv(f'{sub_path}20210605_01_79.csv')
df_82 = pd.read_csv(f'{sub_path}20210608_01_holiday_82.csv')
from sklearn.metrics import mean_absolute_error
def show_mae(data) :
    """Display the lunch MAE plus the dinner MAE of ``data`` against ``best_submit``.

    The comparison is against an earlier submission file, not ground truth.
    Returns whatever ``display`` returns.
    """
    meal_cols = ('중식계', '석식계')
    result = sum(mean_absolute_error(best_submit[col], data[col]) for col in meal_cols)
    return display(result)

show_mae(submission)

In [None]:
# Write the final submission without the index column.
final_submission_file = sub_path+'/20210619_02.csv'
submission.to_csv(final_submission_file, index=False)