# v02000 Baseline

# モデルにもデータに問題がありそうなので根本からBaselineを作り直す必要がある

- [ ] Baseline を見直す
    - 予測変数を対数変換しない。
    - モデルを poission から regression に変更する
- [ ] WRMSSE の評価関数クラスを定義し、LightGBM の評価関数として使う。
- [ ] カテゴリ変数を指定する
- [ ] カテゴリごとの標準化でスコアが改善するか試す
- [ ] 特徴量の追加、Aggregated Sales Feature

In [1]:
SEED = 42
VERSION = 'v02000'

## Import Libraries

In [2]:
import os
import gc
import re
import json
import pickle
import datetime
from tqdm import tqdm
from typing import Union

import numpy as np
import pandas as pd
pd.options.display.max_columns = None

from typing import Union

import seaborn
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

import lightgbm as lgb

## Define Functions

In [3]:
def load_pickle(filepath):
    with open(filepath, 'rb') as file:
        return pickle.load(file)


def dump_pickle(data, filepath):
    with open(filepath, 'wb') as file:
        pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [5]:
class WRMSSEEvaluator(object):
    
    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self, 
                 train_df: pd.DataFrame, 
                 valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, 
                 prices: pd.DataFrame):
        '''
        intialize and calculate weights
        '''
        self.calendar = calendar
        self.prices = prices
        self.train_df = train_df
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        self.weight_columns = self.train_df.iloc[:, -28:].columns.tolist()

        self.train_df['all_id'] = "all"

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1, 
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df, 
                                                      self.train_target_columns, 
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df, 
                                                      self.valid_target_columns, 
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        scaling factor for each series ignoring starting zeros
        '''
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)
    
    def get_name(self, i):
        '''
        convert a str or list of strings to unique string 
        used for naming each of 42840 series
        '''
        if type(i) == str or type(i) == int:
            return str(i)
        else:
            return "--".join(i)
    
    def get_weight_df(self) -> pd.DataFrame:
        """
        returns weights for each of 42840 series in a dataFrame
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        weight_df = weight_df.loc[
            zip(self.train_df.item_id, self.train_df.store_id), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False)):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for i in range(len(lv_weight)):
                weights_map[self.get_name(lv_weight.index[i])] = np.array(
                    [lv_weight.iloc[i]]
                )
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        transform 30490 sries to all 42840 series
        '''
        series_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False, disable=dis)):
            tr = df.groupby(group_id)[cols].sum()
            for i in range(len(tr)):
                series_map[self.get_name(tr.index[i])] = tr.iloc[i].values
        return pd.DataFrame(series_map).T
    
    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        returns rmsse scores for all 42840 series
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        self.scale = np.where(self.scale != 0 , self.scale, 1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1, 
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds, 
                                                self.valid_target_columns, 
                                                self.group_ids, 
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        self.contributors = pd.concat([self.weights, self.rmsse], 
                                      axis=1, 
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

## Load Data

In [6]:
def read_data():
    files = ['calendar', 'sample_submission', 'sales_train_validation', 'sell_prices']

    if os.path.exists('/kaggle/input/m5-forecasting-accuracy'):
        data_dir_path = '/kaggle/input/m5-forecasting-accuracy'
        dst_data = {}
        for file in files:
            print(f'Reading {file} ....')
            dst_data[file] = pd.read_csv(data_dir_path + file + '.csv')
    else:
        data_dir_path = '../data/reduced/'
        dst_data = {}
        for file in files:
            print(f'Reading {file} ....')
            dst_data[file] = pd.read_pickle(data_dir_path + file + '.pkl')
    return dst_data.values()

## Transform

In [7]:
def encode_map(filename='encode_map', use_cache=True):
    filepath = f'features/{filename}.pkl'
    if use_cache and os.path.exists(filepath):
        print('Load cache of encode_map.')
        return load_pickle(filepath)
    
    print('Processing encode_map.')
    train = pd.read_pickle('../data/reduced/sales_train_validation.pkl')
    categorical_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    encode_map = {
        col: {label: i for i, label in enumerate(sorted(train[col].unique()))}
        for col in categorical_cols
    }

    dump_pickle(encode_map, filepath)
    return encode_map

In [8]:
# MEMO: store_id, item_id, ごとの sell_price のランクを特徴量
def parse_sell_price(filename='encoded_sell_price', use_cache=True):
    filepath = f'features/{filename}.pkl'
    if use_cache and os.path.exists(filepath):
        print('Laod cache of parse_sell_price.')
        return pd.read_pickle(filepath)
    # Load Data
    print('Processing parse_sell_price.')
    df = pd.read_pickle('../data/reduced/sell_prices.pkl')
    # Initial Processing And Feature Engineearing
    df['Log1p_sell_price'] = np.log1p(df['sell_price'])
    df['area_id'] = df['store_id'].str.extract('(\w+)_\d+')
    df['sell_price_rate_by_wm_yr_wk__item_id'] = df['sell_price'] / \
        df.groupby(['wm_yr_wk', 'item_id'])['sell_price'].transform('mean')
    df['sell_price_rate_by_wm_yr_wk__area__item_id'] = df['sell_price'] / \
        df.groupby(['wm_yr_wk', 'area_id', 'item_id'])['sell_price'].transform('mean')
    df['sell_price_momentum'] = df['sell_price'] / \
        df.groupby(['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))
    
    for g in ['store_id', 'item_id', ['store_id', 'item_id']]:
        g_str = g if type(g) == str else '--'.join(g)
        df[f'sell_price_rank_of_{g_str}'] = df.groupby(g)['sell_price'].transform(
            lambda x: x.rank(method='min')).replace([np.inf, -np.inf], -1)
        df[f'sell_price_pct_rank_of_{g_str}'] = df.groupby(g)['sell_price'].transform(
            lambda x: x.rank(method='min', pct=True)).replace([np.inf, -np.inf], -1)
    # Export DataFrame
    df.drop(['area_id'], axis=1, inplace=True)
    df = df.pipe(reduce_mem_usage)
    df.to_pickle(filepath)
    return df

In [9]:
def encode_calendar(filename='encoded_calendar', use_cache=True):
    filepath = f'features/{filename}.pkl'
    if use_cache and os.path.exists(filepath):
        print('Load cache of encode_calendar.')
        return pd.read_pickle(filepath)
    
    # Load Data
    print('Processing encode_calendar.')
    df = pd.read_pickle('../data/reduced/calendar.pkl')
    # Initial Processing And Feature Engineearing
    df['date'] = pd.to_datetime(df['date'])
    attrs = [
        "quarter",
        "month",
        "weekofyear",
        "day",
        "dayofweek",
        "is_year_end",
        "is_year_start",
        "is_quarter_end",
        "is_quarter_start",
        "is_month_end",
        "is_month_start",
    ]
    for attr in attrs:
        df[attr] = getattr(df['date'].dt, attr).astype(np.int8)

    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    # MEMO: N_Unique of event_name_1 == 31 and event_name_2 == 5.
    event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    df[event_cols] = df[event_cols].fillna('None')
    for c in event_cols:
        le = preprocessing.LabelEncoder()
        df[c] = le.fit_transform(df[c].values)

    # for c in event_cols:
    #     for diff in [1, 2]:
    #         df[f"{c}_lag_t{diff}"] = df[c].shift(diff)
    # Drop columns.
    cols_to_drop = ['weekday', 'wday']
    df.drop(cols_to_drop, axis=1, inplace=True)
    # Export DataFrame
    df = df.pipe(reduce_mem_usage)
    df.to_pickle(filepath)
    return df

In [10]:
def hstack_sales_colums(df, latest_d, stack_days=28):
    for i in range(latest_d, latest_d + stack_days):
        df[f'd_{str(i + 1)}'] = 0
    return df

In [11]:
def melt_data(filename='melted_train', use_cache=True):
    filepath = f'features/{filename}.pkl'
    # check is exist cached file.
    if use_cache and os.path.exists(filepath):
        print('Load Cache of melt_data.')
        return pd.read_pickle(filepath)
    # Load Data
    print('Processing melt_data.')
    latest_d = 1913  # latest_d of evaluation data is 1941
    df = pd.read_pickle('../data/reduced/sales_train_validation.pkl')
    df = hstack_sales_colums(df, latest_d)
    # Melt Main Data and Join Optinal Data.
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    df = pd.melt(df, id_vars=id_columns, var_name='d', value_name='sales')
    # Join calendar.
    calendar = pd.read_pickle('features/encoded_calendar.pkl')
    df = pd.merge(df, calendar, how='left', on='d')
    # Join sell_price.
    sell_price = pd.read_pickle('features/encoded_sell_price.pkl')
    df = pd.merge(df, sell_price, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
    # Label encoding main dataframe.
    with open('features/encode_map.pkl', 'rb') as file:
        encode_map = pickle.load(file)
    for label, encode_map in encode_map.items():
        df[label] = df[label].map(encode_map)
    # Drop Null Records.
    df.dropna(subset=['sell_price'], axis=0, inplace=True)
    # Cache DataFrame.
    df = df.pipe(reduce_mem_usage)
    df.to_pickle(filepath)
    return df

In [43]:
_ = encode_map(filename='encode_map', use_cache=True)
_ = parse_sell_price(filename='encoded_sell_price', use_cache=True)
_ = encode_calendar(filename='encoded_calendar', use_cache=True)

train = melt_data(filename='melted_train', use_cache=True)
print('\nTrain DataFrame:', train.shape)
print('Memory Usage:', train.memory_usage().sum() / 1024 ** 2, 'Mb')
# print(train.head())

Load cache of encode_map.
Laod cache of parse_sell_price.
Load cache of encode_calendar.
Processing melt_data.
Mem. usage decreased to 3800.34 Mb (32.0% reduction)

Train DataFrame: (46881677, 41)
Memory Usage: 3800.337357521057 Mb


In [44]:
print(train.shape)
train.head()

(46881677, 41)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,quarter,weekofyear,day,dayofweek,is_year_end,is_year_start,is_quarter_end,is_quarter_start,is_month_end,is_month_start,is_weekend,sell_price,Log1p_sell_price,sell_price_rate_by_wm_yr_wk__item_id,sell_price_rate_by_wm_yr_wk__area__item_id,sell_price_momentum,sell_price_rank_of_store_id,sell_price_pct_rank_of_store_id,sell_price_rank_of_item_id,sell_price_pct_rank_of_item_id,sell_price_rank_of_store_id--item_id,sell_price_pct_rank_of_store_id--item_id
7,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,d_1,12,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,0.459961,0.378418,0.983887,1.0,,4452.0,0.006374,71.0,0.025742,9.0,0.031921
8,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,d_1,2,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,1.55957,0.939941,0.993652,1.0,,-1.0,0.12561,10.0,0.003679,1.0,0.003546
9,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,d_1,0,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,3.169922,1.427734,1.0,1.0,,-1.0,0.471436,2750.0,0.975098,267.0,0.946777
11,HOBBIES_1_012_CA_1_validation,1448,3,1,0,0,d_1,0,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,5.980469,1.943359,0.983398,1.0,,-1.0,0.785156,1.0,0.000355,1.0,0.003546
14,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,d_1,4,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,0.700195,0.530762,1.0,1.0,,14336.0,0.020538,156.0,0.055389,15.0,0.053192


# ここまでは処理内容を担保済み

## Feature Engineering

In [45]:
class BaseFeature():
    def __init__(self, filename, use_cache=True):
        self.filepath = f'features/{filename}.pkl'
        self.use_cache = use_cache
        self.is_exist_cahce = False
        self.df = pd.DataFrame()

    def __enter__(self):
        if self.use_cache:
            self.check_exist_cahce()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.is_exist_cahce:
            with open(self.filepath, 'wb') as file:
                pickle.dump(self.df, file, protocol=pickle.HIGHEST_PROTOCOL)

    def check_exist_cahce(self):
        if os.path.exists(self.filepath):
            self.is_exist_cahce = True

    def get_feature(self, df):
        if self.is_exist_cahce:
            with open(self.filepath, 'rb') as file:
                self.df = pickle.load(file)
            return self.df
        else:
            self.df = self.create_feature(df)
            return self.df

    def create_feature(self, df):
        raise NotImplementedError

In [46]:
class AddBaseSalesFeature(BaseFeature):
    def create_feature(self, df):
        print('Create Sales Feature.')
        DAYS_PRED = 28
        col = 'sales'
        grouped_df = df.groupby(["id"])[col]

        for diff in [0, 1, 2, 4, 5, 6]:
            shift = DAYS_PRED + diff
            df[f"{col}_lag_t{shift}"] = grouped_df.transform(lambda x: x.shift(shift))

        for diff in [1, 2, 4, 5, 6, 7]:
            df[f"{col}_lag_t{shift}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).diff(diff))
            
        for diff in [1, 2, 4, 5, 6, 7]:
            df[f"{col}_lag_t{shift}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).pct_change(diff))

        for window in [7, 30, 60, 90, 180]:
            df[f"{col}_rolling_STD_t{window}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).rolling(window).std())

        for window in [7, 30, 60, 90, 180]:
            df[f"{col}_rolling_MEAN_t{window}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).rolling(window).mean())

        for window in [7, 30, 60]:
            df[f"{col}_rolling_MIN_t{window}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).rolling(window).min())

        for window in [7, 30, 60]:
            df[f"{col}_rolling_MAX_t{window}"] = grouped_df.transform(
                lambda x: x.shift(DAYS_PRED).rolling(window).max())

        for window in [7, 14, 30, 60]:
            df[f"{col}_rolling_ZeroRatio_t{window}"] = grouped_df.transform(
                lambda x: 1 - (x == 0).shift(DAYS_PRED).rolling(window).mean())
            df[f"{col}_rolling_ZeroCount_t{window}"] = grouped_df.transform(
                lambda x: (x == 0).shift(DAYS_PRED).rolling(window).sum())

        df[f"{col}_rolling_SKEW_t30"] = grouped_df.transform(
            lambda x: x.shift(DAYS_PRED).rolling(30).skew())
        df[f"{col}_rolling_KURT_t30"] = grouped_df.transform(
            lambda x: x.shift(DAYS_PRED).rolling(30).kurt())
        return df.pipe(reduce_mem_usage)

In [47]:
def drop_null_rows(df):
    print('Drop Null Rows.')
    check_cols = ['lag', 'rolling']
    cheked_regex = '|'.join(check_cols)
    target_cols = df.columns[df.columns.str.contains(cheked_regex)]
    is_contain_null_rows = df[target_cols].isnull().any(axis=1)
    return df.loc[~(is_contain_null_rows), :]

In [48]:
def simple_fe(data, target='sales'):
    # rolling demand features
    data['lag_t28'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28))
    data['lag_t29'] = data.groupby(['id'])[target].transform(lambda x: x.shift(29))
    data['lag_t30'] = data.groupby(['id'])[target].transform(lambda x: x.shift(30))
    data['rolling_mean_t7'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(7).mean())
    data['rolling_std_t7'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(7).std())
    data['rolling_mean_t30'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(30).mean())
    data['rolling_mean_t90'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(90).mean())
    data['rolling_mean_t180'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(180).mean())
    data['rolling_std_t30'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(30).std())
    data['rolling_skew_t30'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(30).skew())
    data['rolling_kurt_t30'] = data.groupby(['id'])[target].transform(lambda x: x.shift(28).rolling(30).kurt()) 
    # price features
    data['lag_price_t1'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1))
    data['price_change_t1'] = (data['lag_price_t1'] - data['sell_price']) / (data['lag_price_t1'])
    data['rolling_price_max_t365'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (data['rolling_price_max_t365'] - data['sell_price']) / (data['rolling_price_max_t365'])
    data['rolling_price_std_t7'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(30).std())
    data.drop(['rolling_price_max_t365', 'lag_price_t1'], inplace = True, axis = 1)
    # time features
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['week'] = data['date'].dt.week
    data['day'] = data['date'].dt.day
    data['dayofweek'] = data['date'].dt.dayofweek
    
    return data

In [49]:
def create_features(df, is_use_cache=True):
    '''
    TODO:
        - 当該月の特徴量
            - 月初・月末（１日）の売上
            - 15, 20日などのクレジットカードの締日のごとの統計量
        - 過去１ヶ月間の（特定item_idの売上 / スーパー全体の売上）
            - （特定item_idの売上個数 / スーパー全体の売上個数）
            - （特定item_idの売上個数 / スーパー全体の売上個数）
    '''

    with AddBaseSalesFeature(filename='add_sales_train', use_cache=is_use_cache) as feat:
        df = feat.get_feature(df)

    return df.reset_index(drop=True)

In [50]:
# train = create_features(train, is_use_cache=True)
train = simple_fe(train)
print('Train DataFrame:', train.shape)
print('Memory Usage:', train.memory_usage().sum() / 1024 ** 2, 'Mb')
# print(train.head())

Train DataFrame: (46881677, 73)
Memory Usage: 6572.348258018494 Mb


In [51]:
print(train.shape)
train.head()

(46881677, 73)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,quarter,weekofyear,day,dayofweek,is_year_end,is_year_start,is_quarter_end,is_quarter_start,is_month_end,is_month_start,is_weekend,sell_price,Log1p_sell_price,sell_price_rate_by_wm_yr_wk__item_id,sell_price_rate_by_wm_yr_wk__area__item_id,sell_price_momentum,sell_price_rank_of_store_id,sell_price_pct_rank_of_store_id,sell_price_rank_of_item_id,sell_price_pct_rank_of_item_id,sell_price_rank_of_store_id--item_id,sell_price_pct_rank_of_store_id--item_id,sales_lag_t28,sales_lag_t29,sales_lag_t30,sales_lag_t32,sales_lag_t33,sales_lag_t34,sales_rolling_STD_t7,sales_rolling_STD_t30,sales_rolling_STD_t60,sales_rolling_STD_t90,sales_rolling_STD_t180,sales_rolling_MEAN_t7,sales_rolling_MEAN_t30,sales_rolling_MEAN_t60,sales_rolling_MEAN_t90,sales_rolling_MEAN_t180,sales_rolling_MIN_t7,sales_rolling_MIN_t30,sales_rolling_MIN_t60,sales_rolling_MAX_t7,sales_rolling_MAX_t30,sales_rolling_MAX_t60,sales_rolling_ZeroRatio_t7,sales_rolling_ZeroCount_t7,sales_rolling_ZeroRatio_t14,sales_rolling_ZeroCount_t14,sales_rolling_ZeroRatio_t30,sales_rolling_ZeroCount_t30,sales_rolling_ZeroRatio_t60,sales_rolling_ZeroCount_t60,sales_rolling_SKEW_t30,sales_rolling_KURT_t30
0,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,d_1,12,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,0.459961,0.378418,0.983887,1.0,,4452.0,0.006374,71.0,0.025742,9.0,0.031921,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,d_1,2,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,1.55957,0.939941,0.993652,1.0,,-1.0,0.12561,10.0,0.003679,1.0,0.003546,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,d_1,0,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,3.169922,1.427734,1.0,1.0,,-1.0,0.471436,2750.0,0.975098,267.0,0.946777,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1448,3,1,0,0,d_1,0,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,5.980469,1.943359,0.983398,1.0,,-1.0,0.785156,1.0,0.000355,1.0,0.003546,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,d_1,4,2011-01-29,11101,1,2011,19,2,3,1,0,0,0,1,4,29,5,0,0,0,0,0,0,1,0.700195,0.530762,1.0,1.0,,14336.0,0.020538,156.0,0.055389,15.0,0.053192,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Train Model

In [52]:
def split_train_eval_submit(df, pred_interval=28):
    latest_date = df['date'].max()
    submit_date = latest_date - datetime.timedelta(days=pred_interval)
    submit_mask = (df["date"] > submit_date)

    eval_date = latest_date - datetime.timedelta(days=pred_interval * 2)
    eval_mask = ((df["date"] > eval_date) & (df["date"] <= submit_date))

    train_mask = ((~eval_mask) & (~submit_mask))
    return df[train_mask], df[eval_mask], df[submit_mask]

In [53]:
# # for LightGBM
# class WRMSSEForLightGBM(WRMSSEEvaluator):
#     def feval(self, preds, dtrain):
#         preds = preds.reshape(self.valid_df[self.valid_target_columns].shape)
#         score = self.score(preds)
#         return 'WRMSSE', score, False

In [54]:
class RondomSeed_LGBM_Model():
    def __init__(self, data, feature, target, n_fold, test_days, max_train_days, model_param, train_param, weight=None):
        self.n_fold = n_fold
        self.weight = weight
        train_dataset, valid_dataset = self.split_data(
            data, feature, target, test_days, max_train_days)
        self.models = self.fit(train_dataset, valid_dataset, model_param, train_param)

    def split_data(self, data, features, target, test_days, max_train_days):
        latest_date = data['date'].max()
        oldest_valid_date = latest_date - datetime.timedelta(days=test_days)
        valid_mask = (data["date"] > oldest_valid_date)
        oldest_train_date = oldest_valid_date - datetime.timedelta(days=max_train_days)
        train_mask = (data["date"] > oldest_train_date) & (data["date"] <= oldest_valid_date)

        train_X, train_y = data.loc[train_mask, features], data.loc[train_mask, target]
        valid_X, valid_y = data.loc[valid_mask, features], data.loc[valid_mask, target]

        train_dataset = lgb.Dataset(train_X, label=train_y)
        valid_dataset = lgb.Dataset(valid_X, label=valid_y, reference=train_dataset)

        if self.weight is not None:
            train_dataset.set_weight(self.weight.loc[train_mask])
            valid_dataset.set_weight(self.weight.loc[valid_mask])

        print('Train DataFrame Size:', train_mask.sum())
        print('Valid DataFrame Size:', valid_mask.sum())
        return train_dataset, valid_dataset

    def fit(self, train_dataset, valid_dataset, model_param, train_param,):
        models = []
        for n in range(self.n_fold):
            print(f"\n{n + 1} of {self.n_fold} Fold:\n")
            model_param['seed'] = model_param['seed'] + 1
            model = lgb.train(
                model_param,
                train_dataset,
                valid_sets=[train_dataset, valid_dataset],
                valid_names=["train", "valid"],
                **train_param
            )
            models.append(model)
        return models

    def get_models(self):
        return self.models

    def predict(self, data):
        models = self.get_models()
        preds = [m.predict(data, num_iteration=m.best_iteration) for m in models]
        avg_pred = np.mean(preds, axis=0)
        return avg_pred

    def save_importance(self, filepath, max_num_features=50, figsize=(20, 25), plot=False):
        models = self.get_models()
        # Define Feature Importance DataFrame.
        imp_df = pd.DataFrame(
            [m.feature_importance() for m in models],
            columns=models[0].feature_name()
        ).T
        imp_df['AVG_Importance'] = imp_df.iloc[:, :len(models)].mean(axis=1)
        imp_df['STD_Importance'] = imp_df.iloc[:, :len(models)].std(axis=1)
        imp_df.sort_values(by='AVG_Importance', inplace=True)
        # Plot Importance DataFrame.
        plt.figure(figsize=figsize)
        imp_df[-max_num_features:].plot(
            kind='barh', title='Feature importance', figsize=figsize,
            y='AVG_Importance', xerr='STD_Importance', align="center"
        )
        if plot:
            plt.show()
        plt.savefig(filepath)
        plt.close('all')

In [55]:
def train_model(df, feature, target):
    n_fold = 1
    max_train_days = 4 * 365
    test_days = 28

#     model_param = {
#         "boosting_type": "gbdt",
#         "metric": "rmse",
#         "objective": "regression",
#         "seed": SEED,
#         "learning_rate": 0.3,
#         "num_leaves": 2**6,
#         'min_data_in_leaf': 50,
#         "bagging_fraction": 0.8,
#         "bagging_freq": 1,
#         "feature_fraction": 1.0,
#         "lambda_l2": 0.1,
#         "verbosity": -1
#     }
    
    model_param = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.3,
        'bagging_fraction': 0.75,
        'bagging_freq': 10, 
        'colsample_bytree': 0.75
    }

    train_param = {
        "num_boost_round": 100000,
        "early_stopping_rounds": 50,
        "verbose_eval": 100
    }

#     print(f'n_fold: {n_fold}')
#     print(f'max_train_days: {max_train_days}')
#     print(f'test_days: {test_days}')
#     print(model_param)
#     print(train_param)

    df = drop_null_rows(df)
    lgbm_model = RondomSeed_LGBM_Model(
        df, feature, target, n_fold, test_days, max_train_days,
        model_param, train_param
    )
    lgbm_model.save_importance(filepath=f'result/importance/{VERSION}.png')
    dump_pickle(lgbm_model, f'result/model/{VERSION}.pkl')

In [56]:
target = 'sales'
cols_to_drop = ['id', 'wm_yr_wk', 'd', 'date'] + [target]
features = train.columns.tolist()
features = [f for f in features if f not in cols_to_drop]
# 関数の定義を変える。split_train_valid_eval_submit
train_data, eval_data, submit_data = split_train_eval_submit(train)

In [65]:
def split_train_valid_eval(df, pred_interval=28):
    latest_date = df['date'].max()
    eval_date = latest_date - datetime.timedelta(days=pred_interval)
    eval_mask = (df["date"] > eval_date)

    valid_date = latest_date - datetime.timedelta(days=pred_interval * 2)
    valid_mask = ((df["date"] > valid_date) & (df["date"] <= eval_date))

    train_mask = ((~valid_mask) & (~eval_mask))
    return df[train_mask], df[valid_mask], df[eval_mask]

def run_lgb(data, features, target='sales'):
    # going to evaluate with the last 28 days
    train_data, valid_data, eval_data = split_train_valid_eval(data.iloc[-25000000:])
    del data;gc.collect()

    # define random hyperparammeters
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.3,
        'bagging_fraction': 0.75,
        'bagging_freq': 10, 
        'colsample_bytree': 0.75
    }

    train_set = lgb.Dataset(train_data[features], train_data[target].values)
    val_set = lgb.Dataset(valid_data[features], valid_data[target].values)
    
    model = lgb.train(
        params, 
        train_set, 
        num_boost_round = 2500, 
        early_stopping_rounds = 50, 
        valid_sets = [train_set, val_set], 
        verbose_eval = 100
    )
    val_pred = model.predict(valid_data[features], num_iteration=model.best_iteration)
    val_score = mean_squared_error(val_pred, valid_data[target].values, squared=False)
    valid_data['predict'] = val_pred
    print(f'validation score is {val_score}')
    eval_data['predict'] = model.predict(eval_data[features], num_iteration=model.best_iteration)
    return valid_data, eval_data

In [66]:
validt_result, result = run_lgb(train_data, features, target='sales')

Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.22055	valid_1's rmse: 2.36614
[200]	training's rmse: 2.15739	valid_1's rmse: 2.36143
Early stopping, best iteration is:
[171]	training's rmse: 2.1713	valid_1's rmse: 2.36023
validation score is 2.3602285363386035


In [57]:
train_model(train_data, features, target)

Drop Null Rows.
Train DataFrame Size: 19240518
Valid DataFrame Size: 488396

1 of 1 Fold:

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 3.10975	valid's rmse: 2.70882
Early stopping, best iteration is:
[95]	train's rmse: 3.11524	valid's rmse: 2.70686


# 収束が思ったとおりになっていないのでここからやり直し

## Evaluation

In [58]:
def estimate_wrmsse(eval_data, preds, eval_days=28):
    def ordered_d_cols(df_cols):
        return sorted(df_cols, key=lambda x: int((re.search(r"\d+", x)).group(0)))
    # Processing train data.
    df = pd.read_pickle('../data/reduced/sales_train_validation.pkl')
    train_idx_labels = df.iloc[:, :-eval_days].copy(deep=True)
    # Processing eval label data.
    eval_labels = pd.pivot_table(
        eval_data, index='id', columns='d', values='sales', fill_value=0).reset_index()
    eval_labels = eval_labels[ordered_d_cols(eval_labels.drop('id', axis=1).columns.tolist())]
    # Processing eval predict data.
    eval_data['pred'] = preds
    pred_labels = pd.pivot_table(
        eval_data, index='id', columns='d', values='pred', fill_value=0).reset_index()
    pred_labels = pred_labels[ordered_d_cols(pred_labels.drop('id', axis=1).columns.tolist())]
    # Estimate WRMSSE score.
    calendar = pd.read_pickle('../data/reduced/calendar.pkl')
    sell_prices = pd.read_pickle('../data/reduced/sell_prices.pkl')
    e = WRMSSEEvaluator(train_idx_labels, eval_labels, calendar, sell_prices)
    score = e.score(pred_labels)
    return score


def evaluation_model(eval_data, feature):
    lgbm_model = load_pickle(f'result/model/{VERSION}.pkl')
    preds = lgbm_model.predict(eval_data[feature].values)
    metric_scores = {}
    metric_scores['RMSE'] = mean_squared_error(eval_data['sales'].values, preds, squared=False)
    metric_scores['WRMSSE'] = estimate_wrmsse(eval_data, preds)
    for metric, score in metric_scores.items():
        print(f'{metric}: {score}')
    # Dump metric_scores as Json file.
    with open(f'result/score/{VERSION}.json', 'w') as file:
        json.dump(metric_scores, file, indent=4)
    return metric_scores

In [59]:
metric_scores = evaluation_model(eval_data, features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_data['pred'] = preds
100%|██████████| 42840/42840 [00:04<00:00, 9339.92it/s] 


RMSE: 2.1240870807187533
WRMSSE: 0.5906028752682618


## Submission

In [60]:
def create_submission_file(submit_data, feature, score):
    submission = pd.read_pickle('../data/reduced/sample_submission.pkl')['id'].to_frame()
    lgbm_model = load_pickle(f'result/model/{VERSION}.pkl')

    submit_data['pred'] = lgbm_model.predict(submit_data[feature].values)
    sub_validation = pd.pivot(submit_data, index='id', columns='date', values='pred').reset_index()
    sub_validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    submission = pd.merge(submission, sub_validation, how='left', on='id')
    submission.fillna(0, inplace=True)
    submission.to_csv(f'submit/{VERSION}_{score:.04f}.csv.gz', index=False, compression='gzip')

    print('Submit DataFrame:', submission.shape)
    print(submission.head())

In [61]:
create_submission_file(submit_data, features, metric_scores['WRMSSE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_data['pred'] = lgbm_model.predict(submit_data[feature].values)


Submit DataFrame: (60980, 29)
                              id        F1        F2        F3        F4  \
0  HOBBIES_1_001_CA_1_validation  0.825113  0.722461  0.719059  0.753416   
1  HOBBIES_1_002_CA_1_validation  0.390858  0.376044  0.358181  0.323824   
2  HOBBIES_1_003_CA_1_validation  0.467663  0.423125  0.423125  0.423125   
3  HOBBIES_1_004_CA_1_validation  1.825545  1.637808  1.637808  1.520746   
4  HOBBIES_1_005_CA_1_validation  1.034260  0.852266  1.033663  1.044387   

         F5        F6        F7        F8        F9       F10       F11  \
0  0.797248  0.994578  1.052715  0.847650  0.906042  0.794916  1.017704   
1  0.402013  0.570770  0.594550  0.436204  0.402369  0.344073  0.262556   
2  0.466957  0.675235  0.703648  0.559714  0.513385  0.426311  0.315070   
3  1.794293  2.192388  2.293874  1.717844  1.731961  1.608565  1.485340   
4  1.095348  1.367307  1.750122  1.219289  1.160974  0.985625  0.933823   

        F12       F13       F14       F15       F16       F17 