<a href="https://colab.research.google.com/github/herjh0405/DACON_Meal/blob/master/LH_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install pycaret
# !pip install kaggler

In [None]:
# Mount Google Drive at /content/drive; the data path used by later cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
np.random.seed(0)

from pycaret.regression import *
from kaggler.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

from tqdm.notebook import tqdm
import os, re
import glob
import calendar

In [None]:
# 한글 폰트 사용
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os

def change_matplotlib_font(font_download_url):
    """Download a font archive and make its first font matplotlib's default family.

    The archive is fetched with ``wget`` into ``MY_FONT.zip`` and extracted with
    ``unzip -o`` into the ``MY_FONT`` directory (both relative to the current
    working directory). Every font file found there is registered with
    matplotlib's font manager, and the family name of the first file found is
    set as ``rcParams['font.family']``.

    Args:
        font_download_url: URL of a zip archive that contains font files.

    Returns:
        None. When no font file is available after the download step, a warning
        is issued and the current matplotlib font configuration is left as is.
    """
    import subprocess
    import warnings

    FONT_PATH = 'MY_FONT'
    archive_path = f"{FONT_PATH}.zip"

    # Argument lists (no shell involved) keep characters such as '?', '%' and
    # '&' in the URL from being interpreted by a shell.
    commands = [
        ["wget", font_download_url, "-O", archive_path],
        ["unzip", "-o", archive_path, "-d", FONT_PATH],
    ]
    for command in commands:
        try:
            completed = subprocess.run(command, check=False)
        except OSError as exc:
            # The tool itself is missing; fonts extracted by an earlier run
            # may still be usable, so keep going.
            warnings.warn(f"could not run {command[0]!r}: {exc}")
            continue
        if completed.returncode != 0:
            warnings.warn(f"{command[0]!r} exited with status {completed.returncode}")

    font_files = fm.findSystemFonts(fontpaths=FONT_PATH)
    if not font_files:
        warnings.warn(
            f"no font files found under {FONT_PATH!r}; matplotlib font left unchanged"
        )
        return

    for font_file in font_files:
        fm.fontManager.addfont(font_file)

    font_name = fm.FontProperties(fname=font_files[0]).get_name()
    matplotlib.rc('font', family=font_name)
    print("font family: ", plt.rcParams['font.family'])

# Fetch and activate Noto Sans KR so Korean text can be rendered in matplotlib figures.
font_download_url = "https://fonts.google.com/download?family=Noto%20Sans%20KR"
change_matplotlib_font(font_download_url)

In [None]:
# Root folder (on the mounted Drive) that holds every input file.
path = '/content/drive/MyDrive/구내식당/water/'

train = pd.read_csv(f'{path}train.csv')
test = pd.read_csv(f'{path}test.csv')
holiday = pd.read_csv(f'{path}holidays.csv', index_col=0)
corona = pd.read_csv(f'{path}corona_data.csv')

# The last two train columns are kept aside as targets; the remaining train
# columns are stacked on top of the test rows to form one feature frame.
target_df = train.iloc[:, -2:]
df = pd.concat([train.iloc[:, :-2], test])
df.columns = [
    '일자', '요일', '정원', '휴가자', '출장자', '야근자',
    '재택근무자', '조식', '중식', '석식',
]

In [None]:
# Sub-folders of `path`: one for the daily fine-dust files, one for the hourly weather files.
dust_dir = os.path.join(path, '미세먼지_일별')
wdata_dir = os.path.join(path, '날씨_시간별')

In [None]:
# Weather attribute names as they appear in the file names:
# precipitation, temperature, humidity, precipitation type.
w_attrs = ['강수', '기온', '습도', '강수형태']
# One entry per item inside the weather folder; the loading loop treats each entry as a year sub-folder.
w_years = os.listdir(wdata_dir)

In [None]:
def get_wdata(data_path, dtype='num'):
  """Extract the 12:00 and 18:00 readings of one weather file.

  The file is read line by line until the first blank line. The first line,
  and any later line without a comma, is treated as a month marker: its last
  whitespace-separated token minus the final two characters becomes the
  current ``yyyymm`` stamp. Every other line must hold exactly three
  comma-separated fields: day, hour and value.

  Args:
    data_path: path of the text file to parse.
    dtype: 'num' keeps values as floats; any other value stores each reading
      as the string of its rounded float.

  Returns:
    A tuple ``(dates, noon_values, evening_values)``. ``dates`` holds one
    'yyyy-mm-dd' string per 12:00 row; the two value lists hold the readings
    whose hour field is "1200" and "1800" respectively.
  """
  dates = []
  noon_values = []
  evening_values = []
  month_stamp = ''

  if dtype == 'num':
    convert = float
  else:
    convert = lambda raw: str(round(float(raw)))

  with open(data_path, 'r') as f:
    for line_no, raw_line in enumerate(f.readlines()):
      stripped = raw_line.strip()
      if not stripped:
        # a blank line ends the data section
        break

      fields = [part.strip() for part in stripped.split(',')]
      if line_no == 0 or len(fields) == 1:
        month_stamp = fields[-1].split()[-1][:-2]
        continue

      day, hour, value = fields
      # only the lunch (12:00) and dinner (18:00) readings are kept
      if hour == "1200":
        dates.append(f"{month_stamp[:4]}-{month_stamp[4:]}-{int(day):02d}")
        noon_values.append(convert(value))
      elif hour == "1800":
        evening_values.append(convert(value))

  return dates, noon_values, evening_values

In [None]:
# Precipitation, temperature, humidity and precipitation-type readings,
# accumulated over every year folder (suffix _12 = 12:00, _18 = 18:00).
w_data_rain_12 = []
w_data_temp_12 = []
w_data_hum_12 = []
w_data_rtype_12 = []
w_data_rain_18 = []
w_data_temp_18 = []
w_data_hum_18 = []
w_data_rtype_18 = []
w_datetime = []

for year in w_years:
  w_subdir = os.path.join(wdata_dir, year)
  # The 2021 files end in April; every other year runs January-December.
  last_month = '04' if year == '2021' else '12'
  file_name = f'{year}01_{year}{last_month}.csv'
  file_path_rain = os.path.join(w_subdir, '충무공동_강수_'+file_name)
  file_path_temp = os.path.join(w_subdir, '충무공동_기온_'+file_name)
  file_path_hum = os.path.join(w_subdir, '충무공동_습도_'+file_name)
  file_path_rtype = os.path.join(w_subdir, '충무공동_강수형태_'+file_name)

  datetime_list_rain, value_list_rain_12, value_list_rain_18 = get_wdata(file_path_rain, dtype='num') # precipitation
  datetime_list_temp, value_list_temp_12, value_list_temp_18 = get_wdata(file_path_temp, dtype='num') # temperature
  datetime_list_hum, value_list_hum_12, value_list_hum_18 = get_wdata(file_path_hum, dtype='num') # humidity
  datetime_list_rtype, value_list_rtype_12, value_list_rtype_18 = get_wdata(file_path_rtype, dtype='cat') # precipitation type

  # Only the rain file's dates are stored, and every value list is later
  # placed next to them by position. Refuse to continue if the four files
  # disagree on dates or if any reading is missing, because that would
  # silently shift values onto the wrong day.
  if not (datetime_list_rain == datetime_list_temp == datetime_list_hum == datetime_list_rtype):
    raise ValueError(f'{year}: the four weather files do not cover the same dates')
  n_days = len(datetime_list_rain)
  year_values = [value_list_rain_12, value_list_temp_12, value_list_hum_12, value_list_rtype_12,
                 value_list_rain_18, value_list_temp_18, value_list_hum_18, value_list_rtype_18]
  if any(len(values) != n_days for values in year_values):
    raise ValueError(f'{year}: a 12:00 or 18:00 reading is missing for at least one date')

  w_datetime   += datetime_list_rain
  w_data_rain_12  += value_list_rain_12
  w_data_temp_12  += value_list_temp_12
  w_data_hum_12   += value_list_hum_12
  w_data_rtype_12 += value_list_rtype_12
  w_data_rain_18  += value_list_rain_18
  w_data_temp_18  += value_list_temp_18
  w_data_hum_18   += value_list_hum_18
  w_data_rtype_18 += value_list_rtype_18

In [None]:
# One row per date: the 12:00 readings go into the *_lunch columns and the
# 18:00 readings into the *_dinner columns. Each list is wrapped in its own
# Series, so the columns are combined by position (0..n-1).
w_df = pd.DataFrame({'일자':pd.Series(w_datetime, dtype='str'),
                   'rain_lunch':pd.Series(w_data_rain_12, dtype='float'),
                   'temp_lunch':pd.Series(w_data_temp_12, dtype='float'),
                   'hum_lunch':pd.Series(w_data_hum_12, dtype='float'),
                   'rain_type_lunch':pd.Series(w_data_rtype_12, dtype='str'),
                   'rain_dinner':pd.Series(w_data_rain_18, dtype='float'),
                   'temp_dinner':pd.Series(w_data_temp_18, dtype='float'),
                   'hum_dinner':pd.Series(w_data_hum_18, dtype='float'),
                   'rain_type_dinner':pd.Series(w_data_rtype_18, dtype='str')})

In [None]:
# Add discomfort-index columns.
# https://dacon.io/competitions/official/235736/codeshare/2753?page=1&dtype=recent
def _discomfort_index(temp, hum):
  """Discomfort index from temperature and relative humidity.

  `hum` is divided by 100 before use, i.e. it is given in percent.
  NOTE(review): `temp` is presumably in degrees Celsius - confirm against the data source.
  """
  return 1.8*temp - 0.55*(1-hum/100)*(1.8*temp-26) + 32

w_df['discomfort_index_lunch'] = _discomfort_index(w_df['temp_lunch'], w_df['hum_lunch'])
w_df['discomfort_index_dinner'] = _discomfort_index(w_df['temp_dinner'], w_df['hum_dinner'])

In [None]:
# Sorted so the months are processed in a reproducible order.
dust_file_paths = sorted(glob.glob(os.path.join(dust_dir, '*.xls')))
d_datetime = []
d_value1 = []
d_value2 = []

# Daily files are used: the hourly data was left out because a fair number of
# its fine-dust measurements are empty.
for file_path in dust_file_paths:
  date_yyyymm = os.path.splitext(os.path.basename(file_path))[0] # yyyymm
  date_year = date_yyyymm[:4]
  date_mon = date_yyyymm[4:]

  # The 2021 workbooks need three leading rows skipped before the two-row header.
  if date_year == '2021':
    month_df = pd.read_excel(file_path, header=[0, 1], skiprows=3)
  else:
    month_df = pd.read_excel(file_path, header=[0, 1])
  cols = month_df.columns
  date_col = cols[0]
  fine_dust_col = cols[1] # fine dust
  ufine_dust_col = cols[2] # ultra-fine dust

  # Number of days in this month.
  days = calendar.monthrange(int(date_year), int(date_mon))[1]
  for day in range(1, days+1):
    curr_date = f'{date_year}-{date_mon}-{day:02d}'
    day_rows = month_df[month_df[date_col] == curr_date]

    d_datetime.append(curr_date)
    if day_rows.empty:
      # No row for this calendar day: record a missing value instead of
      # failing with an IndexError on `.values[0]`.
      d_value1.append(np.nan)
      d_value2.append(np.nan)
    else:
      d_value1.append(day_rows[fine_dust_col].values[0])
      d_value2.append(day_rows[ufine_dust_col].values[0])

In [None]:
# Daily dust table: one row per calendar date collected by the loading loop,
# with the fine-dust and ultra-fine-dust readings as float columns.
dust_df = pd.DataFrame({'일자':pd.Series(d_datetime, dtype='str'),
                   'fine_dust':pd.Series(d_value1, dtype='float'),
                   'ultra_fine_dust':pd.Series(d_value2, dtype='float')})

In [None]:
df['재택근무자'] = df['재택근무자'].astype('int')

# Attach the dust and weather columns by date.
# how='left' keeps every row of df in its original order, which the later
# positional train/test split (first train.shape[0] rows = train) relies on;
# the default inner join would silently drop any date missing on the right.
# validate='many_to_one' raises if a date appears twice on the right side,
# which would otherwise duplicate rows.
df = pd.merge(df, dust_df, on='일자', how='left', validate='many_to_one')
df = pd.merge(df, w_df, on='일자', how='left', validate='many_to_one')

# Calendar parts derived from the date column.
df['일자'] = pd.to_datetime(df['일자'])
df['년'] = df['일자'].dt.year
df['월'] = df['일자'].dt.month
df['일'] = df['일자'].dt.day
df['월일'] = df['일자'].dt.strftime('%m-%d')

In [None]:
holiday['date'] = pd.to_datetime(holiday['date'])

# A day counts as "before a holiday" when the following calendar day is a
# listed holiday or falls on Saturday/Sunday (weekday 5 or 6), and as "after a
# holiday" when the same holds for the preceding day. Computed column-wise so
# the holiday list is not rebuilt and scanned once per row.
next_day = df['일자'] + pd.Timedelta(days=1)
prev_day = df['일자'] - pd.Timedelta(days=1)
df['before_holiday'] = (next_day.isin(holiday['date']) | (next_day.dt.weekday >= 5)).astype(int)
df['after_holiday'] = (prev_day.isin(holiday['date']) | (prev_day.dt.weekday >= 5)).astype(int)

# Training rows (by position) with the two target columns appended.
train = pd.concat([df.iloc[:train.shape[0],:], target_df], axis=1)

In [None]:
# Weekday as an integer (Monday=0 ... Sunday=6), replacing the original column.
weekday = df['일자'].dt.weekday
df['요일'] = weekday
# Flag for weekday 2 or 4 (Wednesday / Friday).
df['야근_가능'] = weekday.isin([2, 4]).astype(int)

# Head count actually on site: capacity minus everyone on leave, on a business trip or working from home.
capacity = df['정원']
absent = df['휴가자'] + df['출장자'] + df['재택근무자']
df['출근인원'] = capacity - absent

# Ratios relative to capacity; the overtime ratio is relative to the on-site head count.
df['휴가비율'] = df['휴가자'] / capacity
df['출장비율'] = df['출장자'] / capacity
df['야근비율'] = df['야근자'] / df['출근인원']
df['재택비율'] = df['재택근무자'] / capacity

In [None]:
# Month -> season code: 0 for March-May, 1 for June-August,
# 2 for September-November, 3 for December-February.
month_to_season = {
    3: 0, 4: 0, 5: 0,
    6: 1, 7: 1, 8: 1,
    9: 2, 10: 2, 11: 2,
    12: 3, 1: 3, 2: 3,
}
df['계절'] = [month_to_season[month] for month in df['월']]

In [None]:
# ISO week number. `Series.dt.week` is deprecated and no longer exists in
# pandas 2.x; `dt.isocalendar().week` yields the same numbers.
iso_week = df['일자'].dt.isocalendar().week.astype('int64')
df['주'] = iso_week
df['월_주'] = iso_week % 4
# The date itself becomes an integer feature (the datetime's int64 value).
# After this line `df['일자']` no longer supports the `.dt` accessor.
df['일자'] = pd.to_numeric(df['일자'])

In [None]:
# Split the engineered frame back by position: the first train.shape[0] rows
# (plus the two target columns) form `train`, the remaining rows form `test`.
train = pd.concat([df.iloc[:train.shape[0], :], target_df], axis=1)
test = df.iloc[train.shape[0]:, :]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# NOTE: this rebinds `LabelEncoder` to scikit-learn's class, replacing the
# kaggler class of the same name imported at the top of the notebook.
from sklearn.preprocessing import LabelEncoder
normal_col = ['fine_dust', 'ultra_fine_dust', 'rain_lunch', 'temp_lunch', 'hum_lunch',
       'rain_type_lunch', 'discomfort_index_lunch', '출근인원', '주', '일자']

# Rescale the listed columns to [0, 1]; the scaler is fitted on all rows of df
# (train and test together).
scaler = MinMaxScaler(feature_range=(0, 1))
df[normal_col] = scaler.fit_transform(df[normal_col])

# Encode the year as 0..k-1. LabelEncoder works on a 1-D array of labels, so
# the column is passed (and assigned back) as a Series rather than as a
# one-column DataFrame.
lbe = LabelEncoder()
df['년'] = lbe.fit_transform(df['년'])

In [None]:
# Columns fed to the model. A wider candidate set was tried earlier (it also
# contained fine_dust, ultra_fine_dust, temp_lunch, hum_lunch, rain_type_lunch,
# before_holiday, 야근비율 and 재택비율); only the columns below are kept.
selected_columns = [
    '일자', '요일', '월', '년',
    '휴가비율', '출장비율',
    '계절', '주', '월_주',
    '출근인원', '야근_가능', 'after_holiday',
    'discomfort_index_lunch', 'rain_lunch',
]
train_1 = df[selected_columns]

In [None]:
# Target is the '중식계' column of train; the selected features are split by
# position into training rows (X) and the remaining rows (X_tst).
y = train['중식계']
X = train_1.iloc[:train.shape[0],:]
X_tst = train_1.iloc[train.shape[0]:,:]

In [None]:
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb

n_class = 1
n_fold = 5
n_best = 96  # number of boosting rounds per fold

# n_best and params are hard-coded. NOTE(review): they presumably come from a
# kaggler AutoLGB tuning run on the first fold (objective='regression',
# metric='mae') - confirm before changing them.
params = {
    'bagging_freq': 1, 'verbosity': -1, 'seed': 42, 'num_threads': -1,
    'feature_pre_filter': False, 'num_class': 1, 'objective': 'regression',
    'metric': 'l1', 'boosting': 'gbdt', 'bagging_fraction': 0.7000000000000001,
    'feature_fraction': 0.7000000000000001, 'lambda_l1': 10, 'lambda_l2': 0.1,
    'learning_rate': 0.09713575840441935, 'max_depth': 6,
    'min_child_samples': 10, 'num_leaves': 255,
}

# The target is a head count (regression), so plain K-fold is used:
# stratification is defined on class labels, not on a numeric target.
# Shuffled with a fixed seed so the folds are reproducible.
# NOTE(review): CV MAE figures recorded with the previous StratifiedKFold split
# are not directly comparable with results from this split.
cv = KFold(n_splits=n_fold, shuffle=True, random_state=params['seed'])

p = np.zeros((X.shape[0], n_class), dtype=float)          # out-of-fold predictions
p_tst = np.zeros((X_tst.shape[0], n_class), dtype=float)  # fold-averaged test predictions
features = X.columns

for i_cv, (i_trn, i_val) in enumerate(cv.split(X)):
    # The fold indices are positions, so both X and y are indexed with .iloc.
    train_data = lgb.Dataset(X[features].iloc[i_trn], label=y.iloc[i_trn])
    # No validation set is passed, so there is nothing to log per iteration;
    # the former `verbose_eval` argument is dropped (newer LightGBM releases
    # no longer accept it).
    clf = lgb.train(params, train_data, n_best)

    p[i_val] = clf.predict(X[features].iloc[i_val]).reshape(-1, 1)
    p_tst += clf.predict(X_tst[features]).reshape(-1, 1) / n_fold

In [None]:
# 기본 column = ['요일', '월', '년', '휴가비율', '출장비율', '계절', '주', '월_주']
# CV MAE: 68.747507 - 기본
# CV MAE: 65.913536 - +휴일 전후, 일자, 야근_가능
# CV MAE: 65.748532 - +휴일 후, 일자, 야근
# CV MAE: 65.873269 - +출근 인원
# CV MAE: 65.724711 - +불쾌 지수
# CV MAE: 65.289932 - +rain_lunch

In [None]:
# Mean absolute error of the out-of-fold predictions `p` against the target `y`.
from sklearn.metrics import mean_absolute_error
print(f'CV MAE: {mean_absolute_error(y, p):f}')

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: sequence of importance values, one per feature.
        names: sequence of feature names, in the same order as `importance`.
        model_type: text placed in front of ' Feature Importance' in the title.

    A new 10x8 figure is created on the current pyplot state; nothing is returned.
    """
    import seaborn as sns

    importance_values = np.array(importance)
    name_values = np.array(names)

    # Highest importance at the top of the chart.
    ranked = pd.DataFrame(
        {'feature_names': name_values, 'feature_importance': importance_values}
    ).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
# `clf` is the model left over from the last CV fold, so the chart reflects that single fold's model only.
plot_feature_importance(clf.feature_importance(), X.columns, 'LightGBM')