## ライブラリ読み込み

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import os, sys, gc, time, warnings, pickle, psutil, random
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from multiprocessing import Pool

warnings.filterwarnings('ignore')

In [2]:
from workalendar.usa import California
from workalendar.usa import Texas
from workalendar.usa import Wisconsin
from  datetime import datetime, timedelta
from datetime import date

## 各種パラメータ設定

In [3]:
def seed_everything(seed=0):
    """Seed the global random number generators used in this notebook.

    Args:
        seed: Value handed to ``random.seed`` and then to ``np.random.seed``.
    """
    # Same order as before: Python's stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
########################### Vars
#################################################################################
VER = 207                        # Model version
SEED = 1224                      # Seed both RNGs so that runs are
seed_everything(SEED)            # as deterministic as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


# LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913+28            # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False              # Use or not pretrained models (temporarily set to False)

# FEATURES to remove
## These features lead to overfit
## or values not present in test set
## (defined once here; an identical second definition further down was removed)
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std']

# PATHS for Features
ORIGINAL = '../data/'
BASE     = './grid_part_1_update.pkl'
PRICE    = './grid_part_2_update.pkl'
CALENDAR = './grid_part_3_update.pkl'
LAGS     = './lags_df_28_update_base.pkl'
MEAN_ENC = './mean_encoding_df_update.pkl'

# NOTE(review): EVALUATION points at the working directory while the other
# CSVs live under ORIGINAL -- confirm this is intentional.
EVALUATION   = './sales_train_evaluation.csv'
CALENDAR_CSV = ORIGINAL + 'calendar.csv'
PRICE_CSV    = ORIGINAL + 'sell_prices.csv'
SAMPLE_CSV   = ORIGINAL + 'sample_submission.csv'


# Three consecutive 28-day validation windows; the last one ends on END_TRAIN.
VALIDATION_START_1 = 1830+28
VALIDATION_END_1   = 1857+28
VALIDATION_START_2 = 1858+28
VALIDATION_END_2   = 1885+28
VALIDATION_START_3 = 1886+28
VALIDATION_END_3   = 1913+28

In [5]:
# LightGBM training parameters (key order kept as in the original literal).
lgb_params = dict(
    boosting_type='gbdt',
    objective='poisson',
    # NOTE(review): the LightGBM docs describe tweedie_variance_power as used
    # only by the tweedie objective; with 'poisson' it presumably has no
    # effect -- confirm which objective is intended.
    tweedie_variance_power=1.1,
    metric='rmse',
    # Row subsampling, re-drawn every iteration (subsample_freq=1).
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    # Tree size limits: 2047 leaves, at least 4095 rows per leaf.
    num_leaves=2**11-1,
    min_data_in_leaf=2**12-1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=3000,
    boost_from_average=False,
    verbose=-1,
)

In [6]:
# Validation folds as [start_day, end_day] pairs, in the order 1, 2, 3.
SPLIT_LIST = [
    [fold_start, fold_end]
    for fold_start, fold_end in (
        (VALIDATION_START_1, VALIDATION_END_1),
        (VALIDATION_START_2, VALIDATION_END_2),
        (VALIDATION_START_3, VALIDATION_END_3),
    )
]

In [7]:
# Display the fold boundaries to eyeball the resolved day numbers.
SPLIT_LIST

[[1858, 1885], [1886, 1913], [1914, 1941]]

## 休日系の情報を取得するための関数を定義

In [8]:
# One calendar instance per state, created lazily and reused across calls.
# Building a fresh calendar on every call throws away the holiday tables that
# a workalendar instance keeps for the years it has already computed.
_WORKING_DAY_CALENDARS = {}


def get_working_day(year, month, day, store):
    """Return 1 if the date is a working day in the store's state, else 0.

    Args:
        year: Year offset; 2011 is added to it before the date is built.
        month: Calendar month (1-12).
        day: Day of the month.
        store: Store id. "CA_1".."CA_4" use the California calendar,
            "TX_1".."TX_3" Texas and "WI_1".."WI_3" Wisconsin.

    Returns:
        1 when the state's calendar reports a working day, 0 otherwise.
        Any other store id yields 0 without the date being evaluated.
    """
    if store in ("CA_1", "CA_2", "CA_3", "CA_4"):
        state, calendar_cls = "CA", California
    elif store in ("TX_1", "TX_2", "TX_3"):
        state, calendar_cls = "TX", Texas
    elif store in ("WI_1", "WI_2", "WI_3"):
        state, calendar_cls = "WI", Wisconsin
    else:
        return 0

    calendar = _WORKING_DAY_CALENDARS.get(state)
    if calendar is None:
        calendar = _WORKING_DAY_CALENDARS[state] = calendar_cls()

    return 1 if calendar.is_working_day(date(year + 2011, month, day)) else 0

In [9]:
# One calendar instance per state, created lazily and reused across calls.
# Building a fresh calendar on every call throws away the holiday tables that
# a workalendar instance keeps for the years it has already computed.
_HOLIDAY_CALENDARS = {}


def get_holiday(year, month, day, store):
    """Return 1 if the date is a holiday in the store's state, else 0.

    Args:
        year: Year offset; 2011 is added to it before the date is built.
        month: Calendar month (1-12).
        day: Day of the month.
        store: Store id. "CA_1".."CA_4" use the California calendar,
            "TX_1".."TX_3" Texas and "WI_1".."WI_3" Wisconsin.

    Returns:
        1 when the state's calendar reports a holiday, 0 otherwise.
        Any other store id yields 0 without the date being evaluated.
    """
    if store in ("CA_1", "CA_2", "CA_3", "CA_4"):
        state, calendar_cls = "CA", California
    elif store in ("TX_1", "TX_2", "TX_3"):
        state, calendar_cls = "TX", Texas
    elif store in ("WI_1", "WI_2", "WI_3"):
        state, calendar_cls = "WI", Wisconsin
    else:
        return 0

    calendar = _HOLIDAY_CALENDARS.get(state)
    if calendar is None:
        calendar = _HOLIDAY_CALENDARS[state] = calendar_cls()

    return 1 if calendar.is_holiday(date(year + 2011, month, day)) else 0

In [10]:
def get_holiday_length(tmp_XX):
    """Label every row with the length of the non-working streak it belongs to.

    Args:
        tmp_XX: Frame whose "is_workingday" column holds 1 for working days
            and 0 for non-working days. Rows are presumably in chronological
            order -- the streaks are counted in row order.

    Returns:
        A list with 0 for each working day and, for each non-working day, the
        total number of consecutive non-working days in its streak.
    """
    flags = tmp_XX["is_workingday"].to_list()
    last_pos = len(flags) - 1

    lengths = []
    streak = 0  # non-working days seen since the last working day

    for pos, flag in enumerate(flags):
        if flag == 1:
            # A working day closes the pending streak: back-fill its length
            # for every day in it, then record the working day itself.
            lengths.extend([streak] * streak)
            streak = 0
            lengths.append(0)
        elif flag == 0:
            streak += 1
            # A streak running up to the final row has no closing working
            # day, so it is flushed here.
            if pos == last_pos:
                lengths.extend([streak] * streak)

    return lengths

In [15]:
# Load the frame pickled at the CALENDAR path.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by trusted steps of this pipeline.
ccc = pd.read_pickle(CALENDAR)

In [16]:
# Keep the first row for every distinct (tm_y, tm_m, tm_d) combination, then
# pull the three date parts out as plain Python lists.
tmp = ccc.drop_duplicates(subset=["tm_y", "tm_m", "tm_d"])
years, months, days = (
    tmp[part].to_list() for part in ("tm_y", "tm_m", "tm_d")
)

In [17]:
# Working-day flag (1/0) per calendar day and state. One store id per state is
# enough because get_working_day picks the calendar from the store's state.
CA_workingdays = [
    get_working_day(y, m, d, "CA_1") for y, m, d in zip(years, months, days)
]
TX_workingdays = [
    get_working_day(y, m, d, "TX_1") for y, m, d in zip(years, months, days)
]
WI_workingdays = [
    get_working_day(y, m, d, "WI_1") for y, m, d in zip(years, months, days)
]

In [18]:
# Holiday flag (1/0) per calendar day and state. One store id per state is
# enough because get_holiday picks the calendar from the store's state.
CA_holidays = [
    get_holiday(y, m, d, "CA_1") for y, m, d in zip(years, months, days)
]
TX_holidays = [
    get_holiday(y, m, d, "TX_1") for y, m, d in zip(years, months, days)
]
WI_holidays = [
    get_holiday(y, m, d, "WI_1") for y, m, d in zip(years, months, days)
]

In [19]:
# California: copy of the per-day frame tagged with the state and its flags.
tmp_CA = tmp.assign(
    state_id="CA",
    is_workingday=CA_workingdays,
    is_holiday=CA_holidays,
)
# Streak lengths are derived from the is_workingday column set just above.
tmp_CA["holiday_length"] = get_holiday_length(tmp_CA)

In [20]:
# Texas: copy of the per-day frame tagged with the state and its flags.
tmp_TX = tmp.assign(
    state_id="TX",
    is_workingday=TX_workingdays,
    is_holiday=TX_holidays,
)
# Streak lengths are derived from the is_workingday column set just above.
tmp_TX["holiday_length"] = get_holiday_length(tmp_TX)

In [21]:
# Wisconsin: copy of the per-day frame tagged with the state and its flags.
tmp_WI = tmp.assign(
    state_id="WI",
    is_workingday=WI_workingdays,
    is_holiday=WI_holidays,
)
# Streak lengths are derived from the is_workingday column set just above.
tmp_WI["holiday_length"] = get_holiday_length(tmp_WI)

In [22]:
# Replace each "d" label by the integer that follows its first two characters.
# Not idempotent: once "d" holds integers, re-running this cell fails on the slice.
for state_frame in (tmp_CA, tmp_TX, tmp_WI):
    state_frame["d"] = [int(label[2:]) for label in state_frame["d"]]

In [23]:
# Stack the three per-state frames row-wise.
# NOTE(review): this rebinds `tmp`, which the earlier cells use as the per-day
# frame; re-running those cells after this one would start from the stacked
# frame instead. A distinct name would keep the cells re-runnable.
tmp = pd.concat([tmp_CA, tmp_TX, tmp_WI])

In [24]:
# Sanity check on the stacked frame's (rows, columns).
tmp.shape

(5907, 20)

In [25]:
# Keep only day, state and the three holiday features (presumably the keys and
# payload for a later merge into the main grid).
for_merge = tmp.loc[
    :, ["d", "state_id", "is_workingday", "is_holiday", "holiday_length"]
]

In [26]:
# Display the selected columns for a visual check.
for_merge

Unnamed: 0,d,state_id,is_workingday,is_holiday,holiday_length
0,1,CA,0,0,2
10932,2,CA,0,0,2
21864,3,CA,1,0,0
32796,4,CA,1,0,0
43728,5,CA,1,0,0
...,...,...,...,...,...
47582947,1965,WI,1,0,0
47613437,1966,WI,1,0,0
47643927,1967,WI,1,0,0
47674417,1968,WI,0,0,2


In [58]:
# Persist the holiday features to the working directory.
# NOTE(review): the output path is hard-coded here rather than defined with the
# other paths in the config cell.
for_merge.to_pickle("holiday_workingday_holidayLength.pkl")