In [6]:
import os
import gc
import re
import pickle
import datetime
from tqdm import tqdm

import numpy as np
import pandas as pd

from typing import Union

import seaborn
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old names were removed afterwards, so use whichever name this installation provides.
plt.style.use(
    'seaborn-darkgrid' if 'seaborn-darkgrid' in plt.style.available
    else 'seaborn-v0_8-darkgrid'
)

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

In [135]:
# Load the pickled feature table into `train`.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing it. Consider a path relative to a configurable data directory.
train = pd.read_pickle("/Users/rui/Documents/repositories/m5_forecasting_accuracy/v01000/features/add_weight.pkl")

KeyboardInterrupt: 

In [3]:
# Display the first rows of the loaded feature table as a sanity check.
train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,sales_rolling_MAX_t7,sales_rolling_MAX_t30,sales_rolling_MAX_t60,sales_rolling_SKEW_t30,sales_rolling_KURT_t30,sell_price_lag_t28,sell_price_price_change_t365,sell_price_rolling_price_std_t7,sell_price_rolling_price_std_t30,weight
700176,HOBBIES_1_008_CA_1_validation,1444,3,1,0,0,d_56,0,2011-03-25,11108,...,10.0,,,,,0.419922,,0.0,,0.678711
700177,HOBBIES_1_009_CA_1_validation,1445,3,1,0,0,d_56,0,2011-03-25,11108,...,5.0,,,,,1.55957,,0.0,,0.714355
700178,HOBBIES_1_010_CA_1_validation,1446,3,1,0,0,d_56,0,2011-03-25,11108,...,1.0,,,,,3.169922,,0.0,,0.142822
700179,HOBBIES_1_012_CA_1_validation,1448,3,1,0,0,d_56,1,2011-03-25,11108,...,3.0,,,,,5.980469,,0.0,,0.392822
700180,HOBBIES_1_015_CA_1_validation,1451,3,1,0,0,d_56,1,2011-03-25,11108,...,18.0,,,,,0.720215,,0.0,,0.5


In [4]:
def split_train_eval_submit(df, pred_interval=28):
    """Split `df` chronologically into train / eval / submit frames.

    The split is driven by the `date` column, counting back from its maximum:

    * submit: the most recent `pred_interval` days,
    * eval:   the `pred_interval` days immediately before the submit window,
    * train:  every remaining row.

    Args:
        df: DataFrame with a datetime-like `date` column.
        pred_interval: Length of each hold-out window in days.

    Returns:
        Tuple `(train_df, eval_df, submit_df)` of row subsets of `df`.
    """
    dates = df["date"]
    last_day = dates.max()

    # Exclusive lower bounds of the two hold-out windows.
    submit_start = last_day - datetime.timedelta(days=pred_interval)
    eval_start = last_day - datetime.timedelta(days=pred_interval * 2)

    in_submit = dates > submit_start
    in_eval = (dates > eval_start) & (dates <= submit_start)
    in_train = ~(in_eval | in_submit)

    return df[in_train], df[in_eval], df[in_submit]

In [16]:
# Identifier/date columns plus the target are excluded from the model inputs.
# NOTE(review): the `weight` column is not excluded here, so it is passed to the models
# as a feature; confirm this matches how the pickled models were trained.
cols_to_drop = ['id', 'wm_yr_wk', 'd', 'date'] + ['sales']
features = [col for col in train.columns.tolist() if col not in cols_to_drop]

In [7]:
# Chronological split using the default 28-day window for both the eval and submit sets.
train_data, eval_data, submit_data = split_train_eval_submit(train)

In [10]:
def load_pickle(filepath):
    """Read and return the object stored in the pickle file at `filepath`.

    Unpickling can execute arbitrary code, so only use this on files from a trusted source.
    """
    with open(filepath, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [12]:
# Load the fitted models saved under version tag v01003; the next cell iterates over
# `models` and averages their predictions. The path is relative to the notebook's
# working directory.
models = load_pickle(f'result/model/v01003.pkl')

In [18]:
# Ensemble prediction on the eval window: each model predicts with its own best
# iteration and the per-model predictions are averaged element-wise.
preds = np.mean(
    [
        booster.predict(eval_data[features].values, num_iteration=booster.best_iteration)
        for booster in models
    ],
    axis=0,
)

In [31]:
def rmsle(preds, actual, weight=None):
    """Root mean squared logarithmic error between `preds` and `actual`.

    Computes sqrt(mean((log1p(actual) - log1p(preds)) ** 2)), optionally weighting each
    sample. Implemented with numpy so the function does not depend on
    `mean_squared_log_error`, which this notebook never imports.

    Args:
        preds: Array-like of non-negative predictions.
        actual: Array-like of non-negative targets, same shape as `preds`.
        weight: Optional per-sample weights (one per row); None means uniform.

    Returns:
        The RMSLE as a numpy float.

    Raises:
        ValueError: If the inputs are empty, differ in shape, or contain negative values
            (the logarithmic error is undefined for them).
    """
    preds = np.asarray(preds, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if preds.shape != actual.shape:
        raise ValueError(
            f"preds and actual must have the same shape, got {preds.shape} and {actual.shape}"
        )
    if preds.size == 0:
        raise ValueError("rmsle requires at least one sample")
    if (preds < 0).any() or (actual < 0).any():
        raise ValueError("rmsle cannot be used when preds or actual contain negative values")

    squared_log_error = (np.log1p(actual) - np.log1p(preds)) ** 2
    # Weighted average over samples (axis 0); the outer mean averages over outputs when
    # the inputs are 2-D and is a no-op for 1-D inputs.
    return np.sqrt(np.mean(np.average(squared_log_error, axis=0, weights=weight)))

In [33]:
score = {}
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking the
# square root of the MSE explicitly gives the same RMSE on every version.
score['RMSE'] = np.sqrt(mean_squared_error(eval_data['sales'].values, preds))
score['RMSLE'] = rmsle(preds, eval_data['sales'].values)
# TODO: make it possible to compute the WRMSSE score here.
score['WRMSSE'] = 111111  # placeholder value; intended: e.score(valid_pred_df)
for key, val in score.items():
    print(f'{key}: {val}')

RMSE: 2.2041845139520113
RMSLE: 0.5234763492320385
WRMSSE: 111111


In [None]:
def estimate_wrmsse(train_data, eval_data, preds):
    """Compute the WRMSSE of `preds` over the evaluation window.

    The long-format frames are pivoted to one row per `id` and one column per day,
    the label-encoded id columns of the training table are decoded with
    'features/encode_map.pkl', and the result is scored with `WRMSSEEvaluator`.

    Args:
        train_data: Long-format training rows containing the columns `id`, `item_id`,
            `dept_id`, `cat_id`, `store_id`, `state_id`, `d` and `sales`.
        eval_data: Long-format evaluation rows containing `id`, `d` and `sales`.
            It is not modified (the predictions are attached to a small copy).
        preds: Predictions for `eval_data`, one value per row in the same row order.

    Returns:
        The WRMSSE score returned by `WRMSSEEvaluator.score`.

    Raises:
        ValueError: If the ids of the evaluation window do not match the ids of the
            training table row for row (the evaluator aligns rows by position).
    """
    def reverse_map(d):
        return {v: k for k, v in d.items()}

    def sort_day_columns(columns):
        # Order day labels by their numeric part so that e.g. d_10 comes after d_9.
        return sorted(columns, key=lambda x: int(re.search(r"\d+", x).group(0)))

    def pivot_by_day(frame, value_col):
        # One row per id (kept as the index), one column per day in chronological order.
        wide = pd.pivot_table(frame, index='id', columns='d', values=value_col, fill_value=0)
        return wide[sort_day_columns(wide.columns.tolist())]

    # Processing train data.
    idx_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    train_idx_labels = pd.pivot_table(
        train_data,
        index=idx_cols,
        columns='d',
        values='sales'
    ).reset_index()
    gc.collect()

    train_d_cols = sort_day_columns(train_idx_labels.drop(idx_cols, axis=1).columns.tolist())
    # Decode map: turn the label-encoded id columns back into their original values.
    with open('features/encode_map.pkl', 'rb') as file:
        encode_map = pickle.load(file)
    for label_col, label_map in encode_map.items():
        train_idx_labels[label_col] = train_idx_labels[label_col].map(reverse_map(label_map))
    del encode_map

    train_idx_labels = train_idx_labels[idx_cols + train_d_cols]

    # Processing eval label data.
    eval_labels = pivot_by_day(eval_data, 'sales')
    # Processing eval predict data. Only the needed columns are copied, so the caller's
    # `eval_data` is left untouched.
    pred_labels = pivot_by_day(eval_data[['id', 'd']].assign(pred=preds), 'pred')

    # The evaluator concatenates the id columns of the training table with these frames
    # by row position, so the row order must agree exactly.
    train_ids = train_idx_labels['id'].tolist()
    if eval_labels.index.tolist() != train_ids or pred_labels.index.tolist() != train_ids:
        raise ValueError('ids in eval_data do not line up with the ids in train_data')

    # Keep only the day columns: `WRMSSEEvaluator.score` asserts that the predictions
    # have exactly the shape of the validation day columns.
    eval_labels = eval_labels.reset_index(drop=True)
    pred_labels = pred_labels.reset_index(drop=True)

    # Estimate WRMSSE score.
    calendar = pd.read_pickle('../data/reduced/calendar.pkl')
    sell_prices = pd.read_pickle('../data/reduced/sell_prices.pkl')
    e = WRMSSEEvaluator(train_idx_labels, eval_labels, calendar, sell_prices)
    score = e.score(pred_labels)
    return score

# NOTE(review): estimate_wrmsse uses WRMSSEEvaluator, which is defined in a later cell;
# run that cell first (or move the class above this one) so a top-to-bottom run works.
print('WRMSSE:', estimate_wrmsse(train_data, eval_data, preds))

In [96]:
class WRMSSEEvaluator(object):
    """Weighted RMSSE evaluator over the aggregation levels listed in `group_ids`.

    The constructor aggregates the wide training and validation tables (one row per
    bottom-level series, one `d_*` column per day) into one series per group of every
    level in `group_ids`, and precomputes a weight and a scale for each aggregated
    series. `score` then evaluates a table of predictions against the validation data.
    """

    # Aggregation levels; a string groups by one column, a list by several columns.
    group_ids = ('all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
                 ['state_id', 'cat_id'], ['state_id', 'dept_id'], ['store_id', 'cat_id'],
                 ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self,
                 train_df: pd.DataFrame,
                 valid_df: pd.DataFrame,
                 calendar: pd.DataFrame,
                 prices: pd.DataFrame):
        '''
        Initialize the evaluator and calculate the weights and scales.

        Args:
            train_df: Wide training table with the id columns used in `group_ids`
                (except `all_id`, which is added here) and one `d_*` column per day.
            valid_df: Wide validation table with one `d_*` column per day, row-aligned
                with `train_df`. Missing id columns are copied over from `train_df`.
            calendar: Table with the columns `d` and `wm_yr_wk`.
            prices: Table with the columns `item_id`, `store_id`, `wm_yr_wk` and
                `sell_price`.
        '''
        self.calendar = calendar
        self.prices = prices
        # Shallow copy: the helper column 'all_id' is added to our own frame object, so
        # the caller's DataFrame does not gain a column and no data is duplicated.
        self.train_df = train_df.copy(deep=False)
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        # Weights are based on the last 28 day columns. Selecting from the day columns
        # (rather than the last 28 columns of the frame) keeps this correct even when
        # non-day columns are located after the day columns.
        self.weight_columns = self.train_target_columns[-28:]

        self.train_df['all_id'] = "all"

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            # Attach the id columns from the training table (aligned on the index).
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1,
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df,
                                                      self.train_target_columns,
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df,
                                                      self.valid_target_columns,
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        # The inputs below are only needed for the weights and scales; drop the references.
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        Scaling factor for each aggregated training series, ignoring leading zeros.

        Returns an array with, for every row of `self.train_series`, the mean squared
        difference between consecutive values from the first non-zero value onwards.
        '''
        scales = []
        for i in range(len(self.train_series)):
            series = self.train_series.iloc[i].values
            # argmax of the boolean mask is the position of the first non-zero value
            # (0 when the series is entirely zero).
            series = series[np.argmax(series != 0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)

    def get_name(self, i):
        '''
        Convert a group key to a unique string used to name an aggregated series.

        A scalar key (str or integer) is returned as its string form; a multi-column
        key (tuple/list) has its parts converted to strings and joined with "--".
        '''
        if isinstance(i, (str, int, np.integer)):
            return str(i)
        return "--".join(str(part) for part in i)

    def get_weight_df(self) -> pd.DataFrame:
        """
        Return the weight of every aggregated series as a one-column DataFrame.

        The weight of a series is its share of (sales * sell_price) summed over
        `self.weight_columns` within its aggregation level, divided by the number of
        levels so that all weights together sum to one.
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        # Long format: one row per (item_id, store_id, day).
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        # Units sold -> sales amount.
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        # Restore the row order of the training table; the keys are materialised as a
        # list because a bare zip iterator is not a dependable indexer.
        weight_df = weight_df.loc[
            list(zip(self.train_df.item_id, self.train_df.store_id)), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for group_id in self.group_ids:
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for row in range(len(lv_weight)):
                weights_map[self.get_name(lv_weight.index[row])] = np.array(
                    [lv_weight.iloc[row]]
                )
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        Aggregate the rows of `df` into one series per group of every level.

        Args:
            df: Wide table containing the id columns referenced by `group_ids` and `cols`.
            cols: Day columns to sum within each group.
            group_ids: Aggregation levels to build (column name or list of column names).
            dis: Unused; kept for backward compatibility.

        Returns:
            DataFrame with one row per aggregated series (indexed by `get_name` of the
            group key) and one column per entry of `cols`.
        '''
        series_map = {}
        for group_id in group_ids:
            tr = df.groupby(group_id)[cols].sum()
            for row in range(len(tr)):
                series_map[self.get_name(tr.index[row])] = tr.iloc[row].values
        return pd.DataFrame(series_map).T

    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        RMSSE of every aggregated series: sqrt(mean squared error / scale).
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        # A zero scale would divide by zero; use 1 for those series instead.
        self.scale = np.where(self.scale != 0, self.scale, 1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        '''
        Return the WRMSSE of `valid_preds`.

        Args:
            valid_preds: Predictions with exactly the shape of the validation day
                columns (rows aligned with the validation table). An ndarray is given
                the validation day column names.

        Returns:
            Sum over all aggregated series of weight * RMSSE.
        '''
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1,
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds,
                                                self.valid_target_columns,
                                                self.group_ids,
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        # Row-wise product of the weight and RMSSE columns (aligned on the series name).
        self.contributors = pd.concat([self.weights, self.rmsse],
                                      axis=1,
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)