<a href="https://colab.research.google.com/github/herjh0405/DACON_Meal/blob/master/New_Sight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pendulum
!pip install pycaret

In [None]:
import pandas as pd
from pycaret.regression import *

# 한글 폰트 사용
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os

def change_matplotlib_font(font_download_url):
    """Download a font archive and make it matplotlib's default font family.

    The archive is fetched with ``wget`` into ``MY_FONT.zip`` and unpacked
    with ``unzip`` into the ``MY_FONT`` directory (both relative to the
    current working directory). Every font file found in that directory is
    registered with matplotlib's font manager, and the family name of the
    first file found becomes the global ``font.family`` setting.

    Args:
        font_download_url: URL of a zip archive containing font files.

    Raises:
        ValueError: If ``font_download_url`` is empty.
        FileNotFoundError: If no font file is present in ``MY_FONT`` after
            the download/unpack attempt.
    """
    import subprocess

    if not font_download_url:
        raise ValueError("font_download_url must be a non-empty URL")

    FONT_PATH = 'MY_FONT'
    archive_path = f"{FONT_PATH}.zip"

    # Argument lists (no shell) keep characters such as '&', ';' or '?' in
    # the URL from being interpreted by a shell.
    commands = (
        ["wget", font_download_url, "-O", archive_path],
        ["unzip", "-o", archive_path, "-d", FONT_PATH],
    )
    for command in commands:
        # Stay best-effort: a failed download may still leave usable fonts
        # from an earlier run in FONT_PATH, so only report the failure here.
        try:
            status = subprocess.run(command, check=False).returncode
        except OSError as err:
            print(f"could not run {command[0]}: {err}")
            continue
        if status != 0:
            print(f"{command[0]} exited with status {status}")

    font_files = fm.findSystemFonts(fontpaths=FONT_PATH)
    if not font_files:
        raise FileNotFoundError(
            f"no font files found in '{FONT_PATH}' after downloading "
            f"{font_download_url}"
        )

    for font_file in font_files:
        fm.fontManager.addfont(font_file)

    # The first discovered file decides which family becomes the default.
    font_name = fm.FontProperties(fname=font_files[0]).get_name()
    matplotlib.rc('font', family=font_name)
    print("font family: ", plt.rcParams['font.family'])

# Install the Noto Sans KR family so Korean labels render in plots.
font_download_url = "https://fonts.google.com/download?family=Noto%20Sans%20KR"
change_matplotlib_font(font_download_url=font_download_url)

In [None]:
# Load the competition data from Google Drive.
path = '/content/drive/MyDrive/구내식당/water/'
train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')

# Exclude days whose dinner count (석식계) is 0 from training.
train = train.loc[train['석식계'] != 0].reset_index(drop=True)

# The last two train columns are split off (they are re-attached as targets
# later); the remaining columns are stacked on top of the test rows.
target_df = train.iloc[:, -2:]
df = pd.concat([train.iloc[:, :-2], test])
df.columns = [
    '일자', '요일', '정원', '휴가자', '출장자', '야근자',
    '재택근무자', '조식', '중식', '석식',
]

# Keep the three menu columns aside and remove them from the feature frame.
menu_cols = ['조식', '중식', '석식']
me_df = df[menu_cols]
df = df.drop(columns=menu_cols)

In [None]:
import datetime as dt
# Derive calendar features from the date column.
df['일자'] = pd.to_datetime(df['일자'])
df['년'] = df['일자'].dt.year
df['월'] = df['일자'].dt.month
df['일'] = df['일자'].dt.day
df['월일'] = df['일자'].dt.strftime('%m-%d')

# All ratios below use whole-column arithmetic instead of a row-wise
# df.apply: same values, no per-row Python call, and a zero denominator
# produces NaN/inf rather than aborting the cell.

# Head count able to eat on site: capacity minus vacation, business-trip
# and remote workers.
df['식사가능'] = df['정원'] - df['휴가자'] - df['출장자'] - df['재택근무자']
# Share of the on-site head count that works overtime.
df['야근비율'] = df['야근자'] / df['식사가능']
# Split of absentees between vacation and business trips.
absent = df['휴가자'] + df['출장자']
df['휴가비율'] = df['휴가자'] / absent
df['출장비율'] = df['출장자'] / absent
# NOTE: this overwrites the remote-worker count with its share of capacity,
# so it must stay after the 식사가능 computation above.
df['재택근무자'] = df['재택근무자'] / df['정원']

In [None]:
# 야근 가능 여부와, 코로나 전후 시점에 대한 표시
# Overtime flag (1 on Wednesdays and Fridays) and a marker for rows dated
# after 2020-02-02 (the COVID-19 period).
df['overtime'] = df['요일'].isin(['수', '금']).astype(int)
df['is_corona'] = 1 - (df['일자'] <= pd.to_datetime('2020-02-02')).astype(int)

month_to_season = {
    1: '겨울', 2: '겨울', 3: '봄', 4: '봄', 5: '봄', 6: '여름',
    7: '여름', 8: '여름', 9: '가을', 10: '가을', 11: '가을', 12: '겨울',
}
df['계절'] = df['월'].map(month_to_season)

import pendulum
# Split back into train/test. Independent copies are taken so the column
# assignments below modify real frames instead of slices of df.
n_train = train.shape[0]
train = df.iloc[:n_train, :].copy()
test = df.iloc[n_train:, :].copy()
train['주차'] = train['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)
test['주차'] = test['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)

# Rows of 2017 / 2021 whose week-of-month came out negative are replaced by
# their ISO week number. Series.dt.isocalendar() is used because
# Series.dt.week is deprecated.
repair_2017 = (
    train.loc[(train['년'] == 2017) & (train['주차'] < 0), '일자']
    .dt.isocalendar().week.astype(int)
)
repair_2021 = (
    train.loc[(train['년'] == 2021) & (train['주차'] < 0), '일자']
    .dt.isocalendar().week.astype(int)
)
test_repair = (
    test.loc[(test['년'] == 2021) & (test['주차'] < 0), '일자']
    .dt.isocalendar().week.astype(int)
)

# Write through .loc so the values land in the frames themselves; chained
# indexing (frame[col][rows] = ...) may update a temporary copy.
train.loc[repair_2017.index, '주차'] = repair_2017.values
train.loc[repair_2021.index, '주차'] = repair_2021.values
test.loc[test_repair.index, '주차'] = test_repair.values

# Remaining -46 values become week 6. A boolean-mask .loc assignment is
# required here: assigning into train[mask]['주차'] only changes a copy and
# leaves train untouched.
# NOTE(review): -46 presumably comes from late-December dates that belong to
# ISO week 1 of the following year - confirm against the data.
train.loc[train['주차'] == -46, '주차'] = 6

In [None]:
# Stack train and test again and keep only the columns used as features.
feature_cols = [
    '일자', '요일', '식사가능', '재택근무자', '년', '월',
    '야근비율', '휴가비율', '출장비율', 'overtime', 'is_corona',
    '계절', '주차',
]
df = pd.concat([train, test])[feature_cols]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the year as consecutive integers. LabelEncoder expects a 1-D input,
# so the column is passed as a Series rather than a one-column DataFrame.
lbe = LabelEncoder()
df['년'] = lbe.fit_transform(df['년'])

# One-hot encode weekday and season. The remaining columns are kept with
# DataFrame.drop so their order is preserved; building the column list from
# a set difference would make the order depend on string hashing and differ
# between interpreter sessions.
onehot_col = ['요일', '계절']
df = pd.concat(
    [df.drop(columns=onehot_col), pd.get_dummies(df[onehot_col])],
    axis=1,
)

In [None]:
# The first len(train) rows of df are the training rows; attach the target
# columns to them and treat everything after as the test set.
n_train_rows = train.shape[0]
train = pd.concat([df.iloc[:n_train_rows], target_df], axis=1)
test = df.iloc[n_train_rows:]

In [None]:
# train_1 is the frame modelled with target 중식계 (lunch): it leaves out the
# overtime features and the dinner target.
lunch_excluded = ['overtime', '야근비율', '석식계']
train_1 = train.drop(columns=lunch_excluded)

# train_2 is the frame modelled with target 석식계 (dinner): it leaves out
# the vacation / business-trip ratios and the lunch target.
dinner_excluded = ['출장비율', '휴가비율', '중식계']
train_2 = train.drop(columns=dinner_excluded)

In [None]:
# Read the submission template and work on a separate copy of it.
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
submission = sample_submission.copy(deep=True)

In [None]:
# Configure the PyCaret regression experiment for the lunch target (중식계):
# mean imputation for numeric features, normalization on, no interactive
# confirmation prompt.
lunch_setup_options = {
    'data': train_1,
    'target': '중식계',
    'numeric_imputation': 'mean',
    'normalize': True,
    'silent': True,
}
reg = setup(**lunch_setup_options)

In [None]:
# Rank the candidate regressors by MAE and keep the five best.
best_5 = compare_models(
    n_select=5,
    sort='MAE',
)

In [None]:
# Blend the five selected models (5-fold, optimized for MAE), evaluate the
# blend with predict_model, then finalize it and predict the test rows.
blended = blend_models(best_5, optimize='MAE', fold=5)
pred_holdout = predict_model(blended)
final_model = finalize_model(blended)
pred1 = predict_model(final_model, data=test)

In [None]:
# Recorded result (lunch) - Voting Regressor: MAE 70.3393, MSE 8841.6164, RMSE 94.0299, R2 0.781, RMSLE 0.117, MAPE 0.0857

In [None]:
# Store the lunch predictions; the index is reset so the values line up with
# the submission rows by position.
submission['중식계'] = pred1['Label'].reset_index(drop=True)

In [None]:
# Configure the PyCaret regression experiment for the dinner target (석식계):
# mean imputation for numeric features, normalization on, no interactive
# confirmation prompt.
dinner_setup_options = {
    'data': train_2,
    'target': '석식계',
    'numeric_imputation': 'mean',
    'normalize': True,
    'silent': True,
}
reg = setup(**dinner_setup_options)

In [None]:
# Rank the candidate regressors by MAE and keep the five best.
best_5 = compare_models(
    n_select=5,
    sort='MAE',
)

In [None]:
# Blend the five selected models (5-fold, optimized for MAE), evaluate the
# blend with predict_model, then finalize it and predict the test rows.
blended = blend_models(best_5, optimize='MAE', fold=5)
pred_holdout = predict_model(blended)
final_model = finalize_model(blended)
pred2 = predict_model(final_model, data=test)

In [None]:
# Recorded result (dinner) - Voting Regressor: MAE 41.4552, MSE 3000.7717, RMSE 54.7793, R2 0.7263, RMSLE 0.1413, MAPE 0.1027

In [None]:
# Store the dinner predictions. The index is reset first, as is done for the
# lunch column, so the values are matched to the submission rows by position;
# assigning the Series with its own index would silently yield NaN for any
# label that does not exist in submission.
submission['석식계'] = pred2['Label'].reset_index(drop=True)

In [None]:
# Write the submission file into the submit directory.
sub_path = '/content/drive/MyDrive/DACON/Dacon_Industry_Meal/submit'
# Plain string concatenation would drop the separator and produce
# ".../submit20210611_01.csv" next to the directory instead of inside it.
os.makedirs(sub_path, exist_ok=True)
submission.to_csv(os.path.join(sub_path, '20210611_01.csv'), index=False)