<a href="https://colab.research.google.com/github/herjh0405/DACON_Meal/blob/master/LH_Dinner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycaret
!pip install kaggler
!pip install pendulum

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
np.random.seed(0)

from pycaret.regression import *
from kaggler.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

from tqdm.notebook import tqdm
import os, re
import glob
import calendar

In [None]:
# 한글 폰트 사용
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os

def change_matplotlib_font(font_download_url):
    """Download a font archive and make it matplotlib's default font family.

    The archive at ``font_download_url`` is fetched with ``wget`` into
    ``MY_FONT.zip`` and extracted with ``unzip`` into ``MY_FONT/``.  Every
    font file found there is registered with matplotlib's font manager, and
    the family name of the first file found is set as ``font.family``.

    Args:
        font_download_url: URL of a zip archive that contains font files.

    Raises:
        FileNotFoundError: If ``wget`` or ``unzip`` is not installed.
        RuntimeError: If no font file is found after download/extraction.
    """
    import subprocess

    FONT_PATH = 'MY_FONT'

    # Argument lists (no shell) so that characters such as '?', '&' or '%'
    # in the URL reach wget verbatim instead of being interpreted by a shell.
    # Exit codes are not checked here; a failed download is reported below.
    subprocess.run(["wget", font_download_url, "-O", f"{FONT_PATH}.zip"])
    subprocess.run(["unzip", "-o", f"{FONT_PATH}.zip", "-d", FONT_PATH])

    font_files = fm.findSystemFonts(fontpaths=FONT_PATH)
    if not font_files:
        # Previously this surfaced as a bare IndexError on font_files[0].
        raise RuntimeError(
            f"no font files found in {FONT_PATH!r} after downloading "
            f"{font_download_url!r}"
        )
    for font_file in font_files:
        fm.fontManager.addfont(font_file)

    font_name = fm.FontProperties(fname=font_files[0]).get_name()
    matplotlib.rc('font', family=font_name)
    print("font family: ", plt.rcParams['font.family'])

# Install the Noto Sans KR archive from Google Fonts as the matplotlib font.
font_download_url = (
    "https://fonts.google.com/download"
    "?family=Noto%20Sans%20KR"
)
change_matplotlib_font(font_download_url)

In [None]:
# Read the input tables from the shared data directory.
path = '/content/drive/MyDrive/구내식당/water/'
train, test, corona = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'corona_data.csv')
)
holiday = pd.read_csv(path + 'holidays.csv', index_col=0)

# Stack train on top of test, replace every missing value with 0 and give
# the twelve columns their short names.
df = pd.concat([train, test]).fillna(0)
df.columns = [
    '일자', '요일', '정원', '휴가자', '출장자', '야근자',
    '재택근무자', '조식', '중식', '석식', '중식계', '석식계',
]

In [None]:
# Drop the 조식, 중식 and 중식계 columns; 석식 and 석식계 stay in df.
df = df.drop(columns=['조식', '중식', '중식계'])

In [None]:
# Sub-directories of `path` that hold the dust and weather source files.
dust_dir = os.path.join(path, '미세먼지_일별')
wdata_dir = os.path.join(path, '날씨_시간별')

In [None]:
# NOTE(review): w_attrs is not referenced again in the visible code.
w_attrs = ['강수', '기온', '습도', '강수형태']
# Every entry of wdata_dir is treated as a year directory by the loop that
# consumes w_years; os.listdir returns entries in arbitrary order.
w_years = os.listdir(wdata_dir)

In [None]:
def get_wdata(data_path, dtype='num'):
  """Read the 12:00 and 18:00 readings of one hourly weather file.

  File layout as parsed here: the first line and every later line without a
  comma are month markers whose last whitespace-separated token is
  ``yyyymmdd`` (its last two characters are dropped to obtain ``yyyymm``).
  Every other line is ``day,hour,value``.  Parsing stops at the first blank
  line.  Only rows whose hour is ``"1200"`` or ``"1800"`` are used.

  Args:
    data_path: Path of the text file to parse.
    dtype: ``'num'`` returns readings as ``float``; any other value returns
      them as the string of the rounded reading (e.g. ``'0'``, ``'1'``).

  Returns:
    ``(dates, values_12, values_18)``: three lists of equal length.
    ``dates`` holds ``'yyyy-mm-dd'`` strings in file order, one per day that
    has a 12:00 or 18:00 row.  A reading that is absent for a listed day is
    ``nan`` when ``dtype == 'num'`` and ``None`` otherwise, so the value
    lists always stay aligned with ``dates``.

  Raises:
    ValueError: If a data row does not have exactly three fields or its
      value/day cannot be converted to a number.
  """
  numeric = dtype == 'num'
  missing = float('nan') if numeric else None
  # date -> {hour: value}; dicts keep insertion order, i.e. file order.
  readings = {}
  curr_mon = ''

  with open(data_path, 'r') as f:
    for i, line in enumerate(f):
      line = line.strip()
      if line == '':
        break
      row_data = [elem.strip() for elem in line.split(',')]
      # Header line (i == 0) and comma-less lines announce a new month.
      if i == 0 or len(row_data) == 1:
        curr_mon = row_data[-1].split()[-1][:-2]
        continue
      r_day, r_hour, r_value = row_data
      if r_hour not in ("1200", "1800"):  # lunch at 12:00, dinner at 18:00
        continue
      value = float(r_value)
      if not numeric:
        value = str(round(value))
      date = f'{curr_mon[:4]}-{curr_mon[4:]}-{int(r_day):02d}'
      # Keying by date keeps both readings of a day together even when one
      # of the two rows is missing from the file.
      readings.setdefault(date, {})[r_hour] = value

  datetime_list = list(readings)
  value_list_12 = [by_hour.get("1200", missing) for by_hour in readings.values()]
  value_list_18 = [by_hour.get("1800", missing) for by_hour in readings.values()]
  return datetime_list, value_list_12, value_list_18

In [None]:
# Precipitation, temperature, humidity and precipitation-type readings.
# The *_12 lists collect the 12:00 values and the *_18 lists the 18:00 values
# returned by get_wdata; w_datetime collects the matching dates.
w_data_rain_12 = []
w_data_temp_12 = []
w_data_hum_12 = []
w_data_rtype_12 = []
w_data_rain_18 = []
w_data_temp_18 = []
w_data_hum_18 = []
w_data_rtype_18 = []
w_datetime = []

for year in w_years:
  w_subdir = os.path.join(wdata_dir, year)
  # The 2021 file name ends at month 04; every other year ends at month 12.
  last_month = '04' if year == '2021' else '12'
  file_name = f'{year}01_{year}{last_month}.csv'
  file_path_rain = os.path.join(w_subdir, '충무공동_강수_'+file_name)
  file_path_temp = os.path.join(w_subdir, '충무공동_기온_'+file_name)
  file_path_hum = os.path.join(w_subdir, '충무공동_습도_'+file_name)
  file_path_rtype = os.path.join(w_subdir, '충무공동_강수형태_'+file_name)

  datetime_list_rain, value_list_rain_12, value_list_rain_18 = get_wdata(file_path_rain, dtype='num') # precipitation
  datetime_list_temp, value_list_temp_12, value_list_temp_18 = get_wdata(file_path_temp, dtype='num') # temperature
  datetime_list_hum, value_list_hum_12, value_list_hum_18 = get_wdata(file_path_hum, dtype='num') # humidity
  datetime_list_rtype, value_list_rtype_12, value_list_rtype_18 = get_wdata(file_path_rtype, dtype='cat') # precipitation type

  # Only the precipitation file's dates are stored, so the other three files
  # must list exactly the same dates; otherwise their values would be paired
  # with the wrong days without any visible error.
  if not (datetime_list_rain == datetime_list_temp
          == datetime_list_hum == datetime_list_rtype):
    raise ValueError(f'weather files for {year} do not cover the same dates')

  w_datetime   += datetime_list_rain
  w_data_rain_12  += value_list_rain_12
  w_data_temp_12  += value_list_temp_12
  w_data_hum_12   += value_list_hum_12
  w_data_rtype_12 += value_list_rtype_12
  w_data_rain_18  += value_list_rain_18
  w_data_temp_18  += value_list_temp_18
  w_data_hum_18   += value_list_hum_18
  w_data_rtype_18 += value_list_rtype_18

In [None]:
# Assemble the per-day weather table: one typed Series per column, in the
# order date, lunch readings (12:00), dinner readings (18:00).
w_df = pd.DataFrame({
    column: pd.Series(values, dtype=kind)
    for column, values, kind in (
        ('일자', w_datetime, 'str'),
        ('rain_lunch', w_data_rain_12, 'float'),
        ('temp_lunch', w_data_temp_12, 'float'),
        ('hum_lunch', w_data_hum_12, 'float'),
        ('rain_type_lunch', w_data_rtype_12, 'str'),
        ('rain_dinner', w_data_rain_18, 'float'),
        ('temp_dinner', w_data_temp_18, 'float'),
        ('hum_dinner', w_data_hum_18, 'float'),
        ('rain_type_dinner', w_data_rtype_18, 'str'),
    )
})

In [None]:
# Add the discomfort-index columns.
# https://dacon.io/competitions/official/235736/codeshare/2753?page=1&dtype=recent
def _discomfort_index(temp, hum):
  """Return 1.8*temp - 0.55*(1 - hum/100)*(1.8*temp - 26) + 32."""
  scaled_temp = 1.8*temp
  return scaled_temp - 0.55*(1-hum/100)*(scaled_temp-26) + 32

w_df['discomfort_index_lunch'] = _discomfort_index(w_df['temp_lunch'], w_df['hum_lunch'])
w_df['discomfort_index_dinner'] = _discomfort_index(w_df['temp_dinner'], w_df['hum_dinner'])

In [None]:
# Collect one (date, fine dust, ultra-fine dust) triple per calendar day from
# the monthly .xls workbooks in dust_dir.
dust_file_paths = glob.glob(os.path.join(dust_dir, '*.xls'))
d_datetime = []
d_value1 = []
d_value2 = []

# Hourly data was left out because a fair number of its dust measurements
# are blank.
for file_path in dust_file_paths:
  date_yyyymm = os.path.splitext(os.path.basename(file_path))[0] # yyyymm
  date_year = date_yyyymm[:4]
  date_mon = date_yyyymm[4:]

  # 2021 workbooks are read with three rows skipped before the two-row header.
  if date_year == '2021':
    dust_df = pd.read_excel(file_path, header=[0, 1], skiprows=3)
  else:
    dust_df = pd.read_excel(file_path, header=[0, 1])
  cols = dust_df.columns
  date_col = cols[0]
  fine_dust_col = cols[1] # fine dust
  ufine_dust_col = cols[2] # ultra-fine dust

  # Number of days in this month.
  days = calendar.monthrange(int(date_year), int(date_mon))[1]
  for day in range(1, days+1):
    curr_date = f'{date_year}-{date_mon}-{day:02d}'

    # One lookup per day; the first matching row supplies both values.
    day_rows = dust_df[dust_df[date_col] == curr_date]
    if day_rows.empty:
      raise ValueError(f'{file_path}: no row for {curr_date}')

    d_datetime.append(curr_date)
    d_value1.append(day_rows[fine_dust_col].values[0])
    d_value2.append(day_rows[ufine_dust_col].values[0])

In [None]:
# Daily dust table built from the lists collected above; note that this
# rebinds the name dust_df, which the loading loop used for each workbook.
dust_df = pd.DataFrame({'일자':pd.Series(d_datetime, dtype='str'),
                   'fine_dust':pd.Series(d_value1, dtype='float'),
                   'ultra_fine_dust':pd.Series(d_value2, dtype='float')})

In [None]:
# Attach dust and weather columns by date.  pd.merge defaults to an inner
# join, so a date missing from dust_df or w_df removes that row from df.
# NOTE(review): later cells split train/test by row count - confirm no rows
# are lost here.
df = pd.merge(df, dust_df, on='일자')
df = pd.merge(df, w_df, on='일자')

In [None]:
# Hand-entered dust values for specific row labels (presumably days whose
# measurement was missing - confirm against the source workbooks).
# A single .loc call writes into df itself; the former chained form
# df[col][rows] = ... assigned through an intermediate Series and used a
# tuple key, neither of which pandas guarantees to update df.
df.loc[[564, 1129], 'fine_dust'] = [36, 23]
df.loc[[234, 235, 564, 654, 1129], 'ultra_fine_dust'] = [11, 31, 26, 5, 9]

In [None]:
# Remove every lunch-time (12:00) weather column; the dinner columns remain.
df = df.drop(columns=['rain_lunch', 'temp_lunch', 'hum_lunch', 'rain_type_lunch', 'discomfort_index_lunch'])

In [None]:
# Convert the date column to datetime and split out year, month and day.
df['일자'] = pd.to_datetime(df['일자'])
df['년'] = df['일자'].dt.year
df['월'] = df['일자'].dt.month
df['일'] = df['일자'].dt.day

In [None]:
# Flag days adjacent to a holiday: before_holiday is 1 when the next calendar
# day is in holiday['date'], after_holiday is 1 when the previous one is.
# Shifting the whole column and using isin avoids rebuilding the holiday list
# and scanning it once per row.
holiday['date'] = pd.to_datetime(holiday['date'])
df['before_holiday'] = (df['일자'] + pd.Timedelta(days=1)).isin(holiday['date']).astype(int)
df['after_holiday'] = (df['일자'] - pd.Timedelta(days=1)).isin(holiday['date']).astype(int)

In [None]:
# Weekday number taken from the date (Monday is 0).
df['요일'] = df['일자'].dt.weekday
# 1 on weekday 2 and weekday 4, 0 on every other day.
df['야근_가능'] = df['요일'].isin([2, 4]).astype(int)

# Head-count present = capacity minus (vacation + business trip + remote).
df['출근인원'] = df['정원']-(df['휴가자']+df['출장자']+df['재택근무자'])

# Shares of capacity; the overtime share is relative to the people present.
df['휴가비율'] = df['휴가자'].div(df['정원'])
df['출장비율'] = df['출장자'].div(df['정원'])
df['야근비율'] = df['야근자'].div(df['출근인원'])
df['재택비율'] = df['재택근무자'].div(df['정원'])

# Month -> season code: Mar-May 0, Jun-Aug 1, Sep-Nov 2, Dec-Feb 3.
month_to_season = {
    1: 3, 2: 3, 3: 0,
    4: 0, 5: 0, 6: 1,
    7: 1, 8: 1, 9: 2,
    10: 2, 11: 2, 12: 3,
}
df['계절'] = df['월'].map(month_to_season)

In [None]:
import pendulum

# Split df back into its train and test rows.  The copies make the column
# assignments below act on independent frames instead of views of df.
train = df.iloc[:train.shape[0],:].copy()
test = df.iloc[train.shape[0]:,:].copy()

# Week of the month as reported by pendulum.
train['주차'] = train['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)
test['주차'] = test['일자'].apply(lambda x: pendulum.parse(str(x)).week_of_month)

# Some rows come out with a negative week-of-month.  For 2017 and 2021 rows
# those values are replaced by the ISO week number of the date.
# (.dt.isocalendar().week replaces the deprecated .dt.week accessor.)
repair_2017 = train.loc[(train['년']==2017)&(train['주차']<0), '일자'].dt.isocalendar().week.astype(int)
repair_2021 = train.loc[(train['년']==2021)&(train['주차']<0), '일자'].dt.isocalendar().week.astype(int)
test_repair = test.loc[(test['년']==2021)&(test['주차']<0), '일자'].dt.isocalendar().week.astype(int)

# .loc writes straight into the frame; chained train['주차'][...] = ... goes
# through an intermediate Series and is not guaranteed to update it.
train.loc[repair_2017.index, '주차'] = repair_2017.values
train.loc[repair_2021.index, '주차'] = repair_2021.values
test.loc[test_repair.index, '주차'] = test_repair.values
# Remaining rows whose week is -46 are set to week 6.  A scalar assignment
# works for any number of such rows (the value was previously hard-coded as
# a three-element array).
train.loc[train['주차']==-46, '주차'] = 6

df = pd.concat([train, test])
df['출장자제외'] = df['정원'] - df['출장자']
df['재택근무제외'] = df['정원'] - df['재택근무자']
df['연기준몇일째']= df['일자'].dt.dayofyear
# ISO week of the year (replaces the deprecated .dt.weekofyear accessor).
df['연기준몇주째']= df['일자'].dt.isocalendar().week.astype(int)
df['월일수']= df['일자'].dt.days_in_month
df['윤년여부'] = df['일자'].dt.is_leap_year
df['월시작일여부'] = df['일자'].dt.is_month_start
df['월마지막일여부'] =df['일자'].dt.is_month_end
df['분기시작일여부'] =df['일자'].dt.is_quarter_start
df['분기마지막일여부'] =df['일자'].dt.is_quarter_end
df['연시작일여부'] =df['일자'].dt.is_year_start
df['연마지막일여부'] =df['일자'].dt.is_year_end

In [None]:
# Year-end flag: 1 for rows whose index label belongs to a row dated
# December 21st or later, 0 otherwise.  Index.isin performs the membership
# test for all rows at once instead of calling a Python lambda per row.
end_year = df[(df['월']==12)&(df['일']>=21)].index
df['연말'] = df.index.isin(end_year).astype(int)

In [None]:
# Parse the corona dates and keep the first row for each date so the left
# merge below cannot duplicate rows of df.
corona['일자'] = pd.to_datetime(corona['일자'])
corona = corona.drop_duplicates(['일자'])
# NOTE(review): check_corona is not referenced again in the visible code.
check_corona = corona[['일자', '누적검사자']]

In [None]:
# Left-join the daily test count; dates absent from corona get NaN, and the
# fillna(0) then replaces every NaN in df (all columns) with 0.
df = pd.merge(df,corona[['일자', '일일검사자']], on='일자', how='left')
df = df.fillna(0)

In [None]:
# Events: 1 on the four listed dates, 0 on every other day.
event = pd.to_datetime(['2017-07-12', '2018-07-17', '2019-07-12', '2020-07-16'])
df['이벤트'] = df['일자'].isin(event).astype(int)

In [None]:
# Drop the 석식 column; the 석식계 column used as the target later stays in df.
df = df.drop(columns=['석식'])

In [None]:
# This import rebinds LabelEncoder, which was first imported from
# kaggler.preprocessing at the top of the notebook.
from sklearn.preprocessing import LabelEncoder
lbe = LabelEncoder()
# LabelEncoder expects a 1-D input, so pass the column as a Series.
df['년'] = lbe.fit_transform(df['년'])

# One-hot encode weekday and season.  Both columns hold integers, and
# pd.get_dummies only converts object/string/category columns unless they
# are named through `columns=`, so they must be listed explicitly.  Passing
# the whole frame also keeps the remaining columns in their original order
# (the previous set difference produced an arbitrary column order).
onehot_col = ['요일', '계절']
df = pd.get_dummies(df, columns=onehot_col)

In [None]:
# Split the feature table by position: the first train.shape[0] rows are
# train, the rest are test.  train.shape[0] is read from the previous train
# frame, so this relies on df still having the same row order and count.
train = df.iloc[:train.shape[0], :]
test = df.iloc[train.shape[0]:, :]

In [None]:
# Keep only training rows whose target 석식계 is non-zero.
train = train[train['석식계']!=0]

In [None]:
# Configure the pycaret regression experiment on the training rows with
# 석식계 as target, mean imputation for numeric features and normalization on.
# NOTE(review): no session_id is passed - confirm whether runs need to be
# reproducible.
reg = setup(data=train,
            target='석식계',
            numeric_imputation = 'mean',
            normalize = True,
            silent= True)

In [None]:
# Compare candidate models sorted by MAE and keep five of them, blend those
# with 5-fold CV optimised for MAE, score the blend on the hold-out set,
# then finalize it and predict on the test rows.
best_5 = compare_models(sort='MAE', n_select=5)
blended = blend_models(estimator_list= best_5, fold=5, optimize='MAE')
pred_holdout = predict_model(blended)
final_model = finalize_model(blended)
pred1 = predict_model(final_model, test)

In [None]:
# Start the submission from a copy of the sample submission file.
sample_submission = pd.read_csv(path+'sample_submission.csv')
submission = sample_submission.copy()

In [None]:
# Fill the 석식계 column with the 'Label' column of the predictions; the
# index is reset so the values line up with the submission rows by position.
submission['석식계'] = pred1.reset_index()['Label']

In [None]:
# Earlier submission files used as a reference for comparing new predictions.
sub_path = '/content/drive/MyDrive/DACON/Dacon_Industry_Meal/submit/'
best_submit = pd.read_csv(sub_path+'20210605_01_79.csv')
# NOTE(review): df_82 is not referenced again in the visible code.
df_82 = pd.read_csv(sub_path+'20210610_02_82.csv')
from sklearn.metrics import mean_absolute_error
def show_mae(data) :
    """Display the summed MAE of `data` against the global `best_submit`.

    The mean absolute errors of the 중식계 and 석식계 columns are added and the
    sum is passed to `display`; whatever `display` returns is returned.
    """
    lunch_mae = mean_absolute_error(best_submit['중식계'], data['중식계'])
    dinner_mae = mean_absolute_error(best_submit['석식계'], data['석식계'])
    return display(lunch_mae + dinner_mae)

show_mae(submission)

In [None]:
# MAE of the new 석식계 predictions against the reference submission.
mean_absolute_error(best_submit['석식계'], submission['석식계'])

In [None]:
# Reload an existing submission file; only its 석식계 column is replaced below.
# sub_path already ends with '/', so the resulting path contains '//'.
submission = pd.read_csv(sub_path+'/20210614_01.csv')

In [None]:
submission

In [None]:
# Overwrite 석식계 with the current predictions, matched by position.
submission['석식계'] = pred1.reset_index()['Label']

In [None]:
# Write the updated submission back to the same file, without the index.
submission.to_csv(sub_path+'/20210614_01.csv', index=False)