## ライブラリ読み込み

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import os, sys, gc, time, warnings, pickle, psutil, random
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from multiprocessing import Pool

warnings.filterwarnings('ignore')

## 各種パラメータ設定

In [2]:
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value passed unchanged to ``random.seed`` and
            ``np.random.seed`` (in that order).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [3]:
########################### Vars
#################################################################################
VER = 207                        # Model version
SEED = 1224                      # Seed for the random number generators
seed_everything(SEED)            # Seed them so runs are as deterministic as possible
N_CORES = psutil.cpu_count()     # CPU count as reported by psutil


# LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (NaNs / faster training)
END_TRAIN   = 1913+28            # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False              # Use pretrained models or not (temporarily set to False)

# FEATURES to remove
## These features lead to overfit
## or values not present in test set
# NOTE: these two lists are defined exactly once here; a second, identical
# definition further down this cell was removed so the copies cannot drift.
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std']

# PATHS for Features
ORIGINAL = '../../input_update/m5-forecasting-accuracy/'
BASE     = './grid_part_1_update.pkl'
PRICE    = './grid_part_2_update.pkl'
CALENDAR = './grid_part_3_update.pkl'
LAGS     = './lags_df_28_update_base.pkl'
MEAN_ENC = './mean_encoding_df_update.pkl'

# Raw CSV inputs
EVALUATION = './sales_train_evaluation.csv'
CALENDAR_CSV = '../data/calendar.csv'
PRICE_CSV    = '../data/sell_prices.csv'
SAMPLE_CSV   = '../data/sample_submission.csv'


# Three consecutive 28-day validation windows (inclusive day numbers);
# the last one ends on the same day as END_TRAIN.
VALIDATION_START_1 = 1830+28
VALIDATION_END_1   = 1857+28
VALIDATION_START_2 = 1858+28
VALIDATION_END_2   = 1885+28
VALIDATION_START_3 = 1886+28
VALIDATION_END_3   = 1913+28

In [4]:
# Hyper-parameters handed to LightGBM.
# NOTE(review): per the LightGBM parameter docs, 'tweedie_variance_power' is
# only used by the 'tweedie' objective; with 'poisson' it presumably has no
# effect -- confirm which objective is intended.
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "poisson",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    "subsample": 0.5,
    "subsample_freq": 1,
    "learning_rate": 0.03,
    "num_leaves": 2 ** 11 - 1,         # 2047
    "min_data_in_leaf": 2 ** 12 - 1,   # 4095
    "feature_fraction": 0.5,
    "max_bin": 100,
    "n_estimators": 3000,
    "boost_from_average": False,
    "verbose": -1,
}

In [5]:
# Validation folds as [first_day, last_day] pairs, one per validation window.
SPLIT_LIST = [
    [fold_start, fold_end]
    for fold_start, fold_end in zip(
        (VALIDATION_START_1, VALIDATION_START_2, VALIDATION_START_3),
        (VALIDATION_END_1, VALIDATION_END_2, VALIDATION_END_3),
    )
]

In [6]:
SPLIT_LIST

[[1858, 1885], [1886, 1913], [1914, 1941]]

## 学習用関数定義

In [7]:
def get_data_by_store(store):
    """Assemble the modelling frame from the pre-computed feature pickles.

    Reads the BASE, PRICE, CALENDAR, MEAN_ENC and LAGS pickles, joins them
    column-wise, adds ``sell_price_state`` via ``make_sell_price_state`` and
    drops the ``rolling_mean_tmp_*`` columns.

    NOTE: ``store`` is currently unused. The ``store_id`` filter below is
    commented out, so no per-store row filtering happens in this function.

    Args:
        store: Store identifier (ignored while the filter is disabled).

    Returns:
        tuple: ``(df, features)`` where ``features`` lists every column not
        named in ``remove_features`` and ``df`` holds ``'id'``, ``'d'``,
        ``TARGET`` followed by those feature columns, restricted to rows
        with ``d >= START_TRAIN`` and re-indexed from zero.
    """
    # Base grid plus the price and calendar feature sets; the first two
    # columns of the latter two are skipped.
    frames = [pd.read_pickle(BASE),
              pd.read_pickle(PRICE).iloc[:, 2:],
              pd.read_pickle(CALENDAR).iloc[:, 2:]]
    grid = pd.concat(frames, axis=1)
    del frames

    # Filtering by store_id is disabled:
    # grid = grid[grid['store_id']==store]

    # Mean-encoded features, limited to index labels present in the grid.
    mean_enc = pd.read_pickle(MEAN_ENC)[mean_features]
    mean_enc = mean_enc.loc[mean_enc.index.isin(grid.index)]

    # Lag features (first three columns skipped), limited the same way.
    lags = pd.read_pickle(LAGS).iloc[:, 3:]
    lags = lags.loc[lags.index.isin(grid.index)]

    grid = pd.concat([grid, mean_enc], axis=1)
    del mean_enc

    grid = pd.concat([grid, lags], axis=1)
    del lags

    grid = make_sell_price_state(grid)

    # Rolling-mean features that are removed from the frame.
    rolling_to_drop = [
        'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14',
        'rolling_mean_tmp_1_30', 'rolling_mean_tmp_1_60',
        'rolling_mean_tmp_7_7', 'rolling_mean_tmp_7_14',
        'rolling_mean_tmp_7_30', 'rolling_mean_tmp_7_60',
        'rolling_mean_tmp_14_7', 'rolling_mean_tmp_14_14',
        'rolling_mean_tmp_14_30', 'rolling_mean_tmp_14_60',
    ]
    grid = grid.drop(columns=rolling_to_drop)

    # Feature list: every remaining column not explicitly excluded.
    features = [name for name in grid.columns if name not in remove_features]
    grid = grid[['id', 'd', TARGET] + features]

    # Discard the days before START_TRAIN.
    keep_rows = grid['d'] >= START_TRAIN
    grid = grid.loc[keep_rows].reset_index(drop=True)

    return grid, features


def make_sell_price_state(df):
    """Add a categorical ``sell_price_state`` column to ``df``.

    The week id (``wm_yr_wk``) is looked up from the calendar CSV by day,
    the raw sell price is joined on ``store_id`` / ``item_id`` /
    ``wm_yr_wk``, and the state is the last character of the price's string
    form. The helper columns ``sell_price_raw`` and ``wm_yr_wk`` are removed
    again, and ``store_id`` / ``item_id`` are cast to ``category``.

    NOTE(review): rows without a matching price presumably stringify to
    ``'nan'`` and therefore get state ``'n'`` -- confirm this is intended.

    Args:
        df: Frame with at least ``d``, ``store_id`` and ``item_id`` columns.

    Returns:
        A new frame (the result of the merges) with ``sell_price_state``
        added.
    """
    # Calendar: strip the first two characters of the day label and cast
    # to int16 so it can be joined on ``d``.
    calendar = pd.read_csv(CALENDAR_CSV)
    calendar["d"] = calendar["d"].str[2:].astype(np.int16)
    df = df.merge(calendar[["d", "wm_yr_wk"]], how="left", on="d")
    del calendar

    # Prices, with the price column renamed to ``sell_price_raw``.
    prices = pd.read_csv(PRICE_CSV).rename(
        columns={"sell_price": "sell_price_raw"})
    df = df.merge(prices, how="left", on=["store_id", "item_id", "wm_yr_wk"])
    del prices

    # Last character of the stringified price becomes the state.
    price_text = df["sell_price_raw"].astype(str)
    df["sell_price_state"] = price_text.map(lambda text: text[-1])
    df["sell_price_state"] = df["sell_price_state"].astype('category')
    df = df.drop(columns=["sell_price_raw", "wm_yr_wk"])

    for key_column in ("store_id", "item_id"):
        df[key_column] = df[key_column].astype('category')

    return df

In [8]:
def get_separate_data(grid_df, feature_columns, val_start, val_end):
    """Split ``grid_df`` by day into LightGBM train and validation datasets.

    Training rows satisfy ``START_TRAIN <= d <= val_start - 1``; validation
    rows satisfy ``val_start - 1 < d <= val_end``. The shapes of both
    feature frames are printed.

    Args:
        grid_df: Frame containing ``d``, ``TARGET`` and the feature columns.
        feature_columns: Columns used as model inputs.
        val_start: First day of the validation window.
        val_end: Last day (inclusive) of the validation window.

    Returns:
        tuple: ``(train_data, valid_data)`` as ``lgb.Dataset`` objects.
    """
    day = grid_df['d']
    last_train_day = val_start - 1
    is_train = day.between(START_TRAIN, last_train_day)
    is_valid = (day > last_train_day) & (day <= val_end)

    x_train = grid_df.loc[is_train, feature_columns]
    x_valid = grid_df.loc[is_valid, feature_columns]

    # No sample weights are passed; an earlier variant supplied the
    # 'w_mean' column as ``weight=``.
    train_data = lgb.Dataset(x_train, label=grid_df.loc[is_train, TARGET])
    valid_data = lgb.Dataset(x_valid, label=grid_df.loc[is_valid, TARGET])

    for part in (x_train, x_valid):
        print(part.shape)

    return train_data, valid_data

In [9]:
def save_checkpoint(store_id, VER, period=100):
    """Build a training callback that pickles the model every ``period`` iterations.

    Args:
        store_id: Identifier embedded in the checkpoint file name.
        VER: Model version embedded in the checkpoint file name.
        period: Checkpoint interval in iterations. Values <= 0 disable
            checkpointing entirely.

    Returns:
        A callable taking a callback environment ``env``. It reads
        ``env.iteration`` and ``env.model`` and, when a checkpoint is due,
        writes ``lgb_model_<store_id>_v<VER>_<iteration>.bin`` to the
        current working directory.
    """
    def _callback(env):
        # Nothing to do when disabled, at iteration 0, or between checkpoints.
        # ``period <= 0`` is tested first so the modulo never divides by zero.
        if period <= 0 or env.iteration <= 0 or env.iteration % period != 0:
            return

        model_name = f'lgb_model_{store_id}_v{VER}_{env.iteration}.bin'
        print(f"saving model {model_name} ...")
        # The context manager closes the file deterministically; the
        # previous bare ``open(...)`` left the handle to the garbage collector.
        with open(model_name, 'wb') as checkpoint_file:
            pickle.dump(env.model, checkpoint_file)
        print("model save completed!")
    return _callback

In [10]:
def sum_weight(x):
    """Return ``1/12`` plus the sum of ``x["w_Level_2"]`` .. ``x["w_Level_12"]``.

    NOTE(review): the ``1/12`` starting value looks like a leftover from a
    variant that divided every level weight by 12 -- confirm it is intended
    now that the level weights are added undivided.

    Args:
        x: Mapping-like object (e.g. a row) indexable by the keys
            ``"w_Level_2"`` through ``"w_Level_12"``.

    Returns:
        The accumulated weight.
    """
    total = 1 / 12
    for level in range(2, 13):
        total = total + x[f"w_Level_{level}"]
    return total

## 値段の状態を保存する

In [11]:
# Build the feature frame and its feature list.
# NOTE(review): the "CA_1" argument is currently ignored because the store
# filter inside get_data_by_store is commented out.
www, f = get_data_by_store("CA_1")

In [12]:
# Keep only the row keys (id, d) and the price-state column.
www = www[["id", "d", "sell_price_state"]]

In [14]:
www.shape

(47735397, 3)

In [14]:
# Persist the reduced frame as a pickle in the current working directory.
www.to_pickle("sell_price_state.pkl")

In [15]:
www.shape

(47735397, 3)

In [15]:
www.tail()

Unnamed: 0,id,d,sell_price_state
47735392,FOODS_3_823_WI_3_evaluation,1969,8
47735393,FOODS_3_824_WI_3_evaluation,1969,8
47735394,FOODS_3_825_WI_3_evaluation,1969,8
47735395,FOODS_3_826_WI_3_evaluation,1969,8
47735396,FOODS_3_827_WI_3_evaluation,1969,0
