## ライブラリ読み込み

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import os, sys, gc, time, warnings, pickle, psutil, random
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from multiprocessing import Pool

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/alignment warnings from the
# libraries imported above -- consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

## 各種パラメータ設定

In [2]:
def seed_everything(seed=0):
    """Seed the global RNGs of ``random`` and NumPy with the same value.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

In [4]:
# Global experiment configuration. The RNGs are seeded right here, as soon as
# SEED is defined.
VER = 207                        # Model version tag
SEED = 1224                      # Seed applied to the RNGs so that runs
seed_everything(SEED)            # are as deterministic as possible

TARGET      = 'sales'            # Name of the target
START_TRAIN = 0                  # First training day; raise it to skip rows (NaNs / faster training)
END_TRAIN   = 1913+28            # Last day of the training set
P_HORIZON   = 28                 # Prediction horizon (presumably in days -- TODO confirm)

# Paths of the pkl files that have to be loaded
BASE     = './grid_part_1_update.pkl'
PRICE    = './grid_part_2_update.pkl'
CALENDAR = './grid_part_3_update.pkl'
MEAN_ENC = './mean_encoding_df_update_nonleak.pkl'

# Paths of the csv files that have to be loaded
# NOTE(review): EVALUATION points at the working directory while the other
# CSVs live under ../data -- confirm that this is intended.
EVALUATION   = './sales_train_evaluation.csv'
CALENDAR_CSV = '../data/calendar.csv'
PRICE_CSV    = '../data/sell_prices.csv'
SAMPLE_CSV   = '../data/sample_submission.csv'

# List of split boundaries used for cross-validation
# (each entry looks like a [first_day, last_day] pair -- TODO confirm)
SPLIT_LIST = [[1886, 1913]]

In [5]:
# LightGBM hyper-parameters used for training.
# NOTE(review): 'tweedie_variance_power' is set although the objective is
# 'poisson'. Per the LightGBM parameter docs that setting is only used by the
# 'tweedie' objective, so it appears to have no effect here -- confirm which
# objective is actually intended.
lgb_params = {
                'boosting_type': 'gbdt',
                'objective': 'poisson',
                'tweedie_variance_power': 1.1,
                'metric': 'rmse',
                'subsample': 0.5,
                'subsample_freq': 1,
                'learning_rate': 0.03,
                'num_leaves': 2**11-1,            # 2047
                'min_data_in_leaf': 2**12-1,      # 4095
                'feature_fraction': 0.5,
                'max_bin': 100,
                'n_estimators': 3000,
                'boost_from_average': False,
                'verbose': -1,
            } 

In [6]:
# Columns presumably handed to the model as categorical features.
# An earlier version of this list additionally contained "event_type_1",
# "event_name_2" and "event_type_2"; they were dropped in this version.
categorical_feat = [
    "event_name_1",
    "tm_wm",
    "tm_dw",
    "tm_w_end",
    "tm_y",
]

## 学習用関数定義

In [7]:
def merge_df(original_df, feat_path, feat_names=None, merge_key=None) -> pd.DataFrame:
    """
    Attach features stored in a pickle file to ``original_df``.

    Parameters
    ----------
    original_df : pd.DataFrame
        Frame the features are added to. It is not modified in place.
    feat_path : str
        Path of the pickled DataFrame that holds the features.
    feat_names : list of str, optional
        Columns to keep from the pickle. When ``merge_key`` is given the key
        columns must be part of this list. ``None`` keeps every column.
    merge_key : str or list of str, optional
        Column(s) used for a left join onto ``original_df``. ``None`` means
        the features are concatenated column-wise, aligned on the row index
        (feature rows whose index is absent from ``original_df`` are dropped).

    Returns
    -------
    pd.DataFrame
        ``original_df`` with the feature columns appended. The row index of
        ``original_df`` is kept in both modes as long as the join does not
        multiply rows.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    feats = pd.read_pickle(feat_path)

    if feat_names is not None:
        feats = feats[feat_names]

    if merge_key is None:
        # Index-aligned concat; drop feature rows that cannot match any row.
        feats = feats[feats.index.isin(original_df.index)]
        return pd.concat([original_df, feats], axis=1)

    merged = pd.merge(original_df, feats, on=merge_key, how="left")

    # pd.merge returns a fresh RangeIndex, which would break any later
    # index-aligned concat done through this same function. A left join keeps
    # the left row order and emits at least one row per left row, so an equal
    # length means a one-to-one row correspondence and the caller's index can
    # be restored safely. If duplicate keys in the feature table multiplied
    # rows, the pandas result is returned unchanged.
    if len(merged) == len(original_df):
        merged.index = original_df.index

    return merged

In [13]:
def get_data_by_store(store):
    """
    Build the training DataFrame for one store.

    The three base grid pickles (BASE, PRICE, CALENDAR) are concatenated
    column-wise, the rows are restricted to ``store_id == store`` and the
    additional feature pickles are then attached one after another through
    ``merge_df``.

    Returns a tuple ``(df, features)`` where ``features`` is ``df.columns``.
    """
    # Base grid. The first two columns of PRICE and CALENDAR are skipped
    # (presumably key columns already present in BASE -- TODO confirm).
    grid_parts = [
        pd.read_pickle(BASE),
        pd.read_pickle(PRICE).iloc[:, 2:],
        pd.read_pickle(CALENDAR).iloc[:, 2:],
    ]
    df = pd.concat(grid_parts, axis=1)

    # Keep only the rows that belong to the requested store
    is_requested_store = df['store_id'] == store
    df = df[is_requested_store]

    # Columns taken from the mean-encoding pickle (MEAN_ENC)
    mean_enc_cols = [
        'enc_timeseries_state_id_mean',
        'enc_timeseries_state_id_std',
        'enc_timeseries_store_id_mean',
        'enc_timeseries_store_id_std',
        'enc_timeseries_cat_id_mean',
        'enc_timeseries_cat_id_std',
        'enc_timeseries_dept_id_mean',
        'enc_timeseries_dept_id_std',
        'enc_timeseries_state_id_cat_id_mean',
        'enc_timeseries_state_id_cat_id_std',
        'enc_timeseries_state_id_dept_id_mean',
        'enc_timeseries_state_id_dept_id_std',
        'enc_timeseries_store_id_cat_id_mean',
        'enc_timeseries_store_id_cat_id_std',
        'enc_timeseries_store_id_dept_id_mean',
        'enc_timeseries_store_id_dept_id_std',
        'enc_timeseries_item_id_mean',
        'enc_timeseries_item_id_std',
        'enc_timeseries_item_id_state_id_mean',
        'enc_timeseries_item_id_state_id_std',
        'enc_timeseries_item_id_store_id_mean',
        'enc_timeseries_item_id_store_id_std',
    ]

    # Keys plus the columns describing the previous / next holiday
    holiday_cols = [
        'id', 'd', 'days_to_next_holiday', 'days_from_prev_holiday', 'monday_or_friday',
    ]

    # Feature sources, applied in this order (append new ones here).
    # Each entry: (pickle path, columns to keep or None for all,
    #              merge key or None for an index-aligned concat)
    #
    # NOTE(review): both Croston pickles contribute a column called
    # "Forecast"; with the plain pd.merge used by merge_df the clash is
    # resolved by pandas' default suffixes.
    # NOTE(review): the weighted-average source is concatenated by row index
    # after several key-based merges; that only lines up if merge_df hands
    # back frames that still carry the original row index -- verify.
    feature_sources = [
        # mean-encoded features
        (MEAN_ENC, mean_enc_cols, None),
        # Croston TSB forecast
        ("../20200602_追加データの確認/Croston_TSB_update_plus28.pkl", ["id", "d", "Forecast"], ["id", "d"]),
        # Croston forecast
        ("../20200602_追加データの確認/Croston_update_plus28.pkl", ["id", "d", "Forecast"], ["id", "d"]),
        # weekly exponentially smoothed moving average
        ("../20200602_追加データの確認/week_w_moving_average.pkl", None, ["id", "tm_y", "tm_w"]),
        # exponentially smoothed moving average of sales
        ("../20200602_追加データの確認/weighted_average_update.pkl", None, None),
        # holiday-related features
        ("../20200602_追加データの確認/holiday_workingday_holidayLength.pkl", None, ["d", "state_id"]),
        # last digit of the price
        ("../20200602_追加データの確認/sell_price_state.pkl", None, ["id", "d"]),
        # previous / next holiday features
        ("../20200602_追加データの確認/holiday_features.pkl", holiday_cols, ["id", "d"]),
    ]

    for source_path, keep_cols, join_key in feature_sources:
        df = merge_df(df, feat_path=source_path, feat_names=keep_cols, merge_key=join_key)

    return df, df.columns

In [9]:
# Unique store ids in order of first appearance.
# Only the 'store_id' column is used, so read_csv is told to parse just that
# column instead of loading the whole file.
STORES_IDS = pd.read_csv(EVALUATION, usecols=['store_id'])['store_id']
STORES_IDS = list(STORES_IDS.unique())

In [None]:
# Build the training frame of every store once and cache it on disk as
# ./grid_df_master_<store_id>.pkl
for store_id in STORES_IDS:
    print(store_id)  # progress indicator
    grid_df, _ = get_data_by_store(store_id)  # the returned column index is not needed here
    grid_df.to_pickle(f"./grid_df_master_{store_id}.pkl")

CA_1
