# WRMSSE Evaluator with extra features

**Version 8** : Optimized version now takes 1.1GB as compaired to 2.7GB of previous one.  
**Version 9** : ignoring starting zeros for each series for the calculation of denominator in rmsse

In [8]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm import tqdm

class WRMSSEEvaluator(object):
    
    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self, 
                 train_df: pd.DataFrame, 
                 valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, 
                 prices: pd.DataFrame):
        '''
        intialize and calculate weights
        '''
        self.calendar = calendar
        self.prices = prices
        self.train_df = train_df
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        self.weight_columns = self.train_df.iloc[:, -28:].columns.tolist()

        self.train_df['all_id'] = "all"

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1, 
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df, 
                                                      self.train_target_columns, 
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df, 
                                                      self.valid_target_columns, 
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        scaling factor for each series ignoring starting zeros
        '''
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)
    
    def get_name(self, i):
        '''
        convert a str or list of strings to unique string 
        used for naming each of 42840 series
        '''
        if type(i) == str or type(i) == int:
            return str(i)
        else:
            return "--".join(i)
    
    def get_weight_df(self) -> pd.DataFrame:
        """
        returns weights for each of 42840 series in a dataFrame
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        weight_df = weight_df.loc[
            zip(self.train_df.item_id, self.train_df.store_id), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False)):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for i in range(len(lv_weight)):
                weights_map[self.get_name(lv_weight.index[i])] = np.array(
                    [lv_weight.iloc[i]]
                )
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        transform 30490 sries to all 42840 series
        '''
        series_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False, disable=dis)):
            tr = df.groupby(group_id)[cols].sum()
            for i in range(len(tr)):
                series_map[self.get_name(tr.index[i])] = tr.iloc[i].values
        return pd.DataFrame(series_map).T
    
    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        returns rmsse scores for all 42840 series
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        self.scale = np.where(self.scale != 0 , self.scale, 1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1, 
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds, 
                                                self.valid_target_columns, 
                                                self.group_ids, 
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        self.contributors = pd.concat([self.weights, self.rmsse], 
                                      axis=1, 
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

In [19]:
%%time

train_df = pd.read_csv('../data/raw/sales_train_validation.csv')
calendar = pd.read_csv('../data/raw/calendar.csv')
prices = pd.read_csv('../data/raw/sell_prices.csv')

valid_days = 50
train_fold_df = train_df.iloc[:, :-valid_days]
valid_fold_df = train_df.iloc[:, -valid_days:].copy()

e = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
# del train_fold_df, train_df, calendar, prices

100%|██████████| 42840/42840 [00:04<00:00, 8875.84it/s]


CPU times: user 39.3 s, sys: 6.3 s, total: 45.6 s
Wall time: 46 s


In [10]:
print(train_fold_df.shape)
display(train_fold_df.head())
print(valid_fold_df.shape)
display(valid_fold_df.head())

(30490, 1892)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,all_id
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,1,2,2,0,1,1,1,all
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,1,1,1,all
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,1,1,0,all
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,4,1,3,5,0,6,6,all
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,2,2,3,1,0,0,0,0,all


(30490, 28)


Unnamed: 0,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,1,0,0,0,0,0,1,0,4,2,...,1,3,0,1,1,1,3,0,1,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,3,1,2,1,3,1,...,1,0,5,4,1,0,1,3,7,2
4,1,0,4,4,0,1,4,0,1,0,...,2,1,1,0,1,1,2,2,2,4


In [11]:
valid_preds = np.random.randint(4, size=valid_fold_df.shape)
e.score(valid_preds)

2.512871673022179

In [12]:
# for LightGBM
class WRMSSEForLightGBM(WRMSSEEvaluator):
    def feval(self, preds, dtrain):
        preds = preds.reshape(self.valid_df[self.valid_target_columns].shape)
        score = self.score(preds)
        return 'WRMSSE', score, False
    
    
# Usage
# evaluator = WRMSSEForLightGBM(train_fold_df, valid_fold_df, calendar, prices)
# model = lgb.train(params, dtrain,
#                   num_boost_round=10000,
#                   valid_sets=dvalid,
#                   feval=evaluator.feval,
#                   early_stopping_rounds=200)

### Individual series contributions to final score which equal is sum them

In [13]:
# 最終スコアにおける各Seriesの貢献度
e.contributors.sort_values(ascending=False)

all                    0.111380
HOUSEHOLD_2            0.111219
HOUSEHOLD              0.100083
FOODS                  0.096173
HOBBIES                0.084383
                         ...   
HOBBIES_2_070--TX_3    0.000000
HOUSEHOLD_1_124--WI    0.000000
HOBBIES_2_071--CA_4    0.000000
HOBBIES_2_072--CA_1    0.000000
HOUSEHOLD_1_124--CA    0.000000
Length: 42840, dtype: float64

### Individual series rmsse

In [14]:
# 各SeriesごとのRMSSE
e.rmsse.sort_values(ascending=False)

HOUSEHOLD_1_032--TX_1    50.956214
HOBBIES_2                30.536456
HOUSEHOLD_1_020--CA_3    27.118786
CA_4--HOUSEHOLD_2        25.925846
CA_4--HOBBIES_2          25.247807
                           ...    
FOODS_3_541--CA_1         0.119438
FOODS_3_234--TX_3         0.111873
HOUSEHOLD_2_062--TX_1     0.097383
FOODS_3_752--CA_2         0.077518
FOODS_2_285--TX_1         0.077330
Length: 42840, dtype: float64

In [15]:
e.rmsse.sort_values(ascending=True)

FOODS_2_285--TX_1         0.077330
FOODS_3_752--CA_2         0.077518
HOUSEHOLD_2_062--TX_1     0.097383
FOODS_3_234--TX_3         0.111873
FOODS_3_541--CA_1         0.119438
                           ...    
CA_4--HOBBIES_2          25.247807
CA_4--HOUSEHOLD_2        25.925846
HOUSEHOLD_1_020--CA_3    27.118786
HOBBIES_2                30.536456
HOUSEHOLD_1_032--TX_1    50.956214
Length: 42840, dtype: float64

### Individual series weights

In [23]:
e.weights[0].sort_values(ascending=False)

all                      0.083333
FOODS                    0.046505
CA                       0.035406
FOODS_3                  0.027986
HOUSEHOLD                0.025860
                           ...   
HOUSEHOLD_1_297--WI      0.000000
FOODS_3_745--CA_2        0.000000
FOODS_3_745--CA_1        0.000000
HOUSEHOLD_2_123--CA_3    0.000000
HOUSEHOLD_2_116--CA_4    0.000000
Name: 0, Length: 42840, dtype: float64

## weights for public test set

In [25]:
%%time

train_df = pd.read_csv('../data/raw/sales_train_validation.csv')
calendar = pd.read_csv('../data/raw/calendar.csv')
prices = pd.read_csv('../data/raw/sell_prices.csv')
for i in range(1914, 1942):
    train_df[f"d_{i}"] = 0

train_fold_df = train_df.iloc[:, :-28]
valid_fold_df = train_df.iloc[:, -28:].copy()

e = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
del train_fold_df, train_df, calendar, prices

100%|██████████| 42840/42840 [00:04<00:00, 8842.52it/s]


CPU times: user 41.1 s, sys: 6.71 s, total: 47.8 s
Wall time: 49.1 s


In [26]:
weights = e.weights.copy() * 12
weights.columns = ["weight"]
weights['series'] = weights.index
weights = weights[['series', 'weight']].reset_index(drop=True)
weights.to_csv("weights.csv", index=None)
weights

Unnamed: 0,series,weight
0,all,1.000000
1,CA,0.442371
2,TX,0.269297
3,WI,0.288332
4,CA_1,0.110888
...,...,...
42835,HOUSEHOLD_2_516--TX_2,0.000013
42836,HOUSEHOLD_2_516--TX_3,0.000008
42837,HOUSEHOLD_2_516--WI_1,0.000002
42838,HOUSEHOLD_2_516--WI_2,0.000002


### These weights match to given weights [here](https://raw.githubusercontent.com/Mcompetitions/M5-methods/master/validation/weights_validation.csv)

### Original version takes 2.7Gb of memory

In [None]:
# from typing import Union

# import numpy as np
# import pandas as pd
# from tqdm.auto import tqdm as tqdm

# class WRMSSEEvaluator(object):
    
#     group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
#         ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
#         ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

#     def __init__(self, 
#                  train_df: pd.DataFrame, 
#                  valid_df: pd.DataFrame, 
#                  calendar: pd.DataFrame, 
#                  prices: pd.DataFrame):
#         '''
#         intialize and calculate weights
#         '''
#         self.calendar = calendar
#         self.prices = prices
#         self.train_df = train_df
#         self.valid_df = valid_df
#         self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
#         self.weight_columns = self.train_df.iloc[:, -28:].columns.tolist()

#         self.train_df['all_id'] = "all"

#         self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
#         self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

#         if not all([c in self.valid_df.columns for c in self.id_columns]):
#             self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
#                                       axis=1, 
#                                       sort=False)
#         self.train_series = self.trans_30490_to_42840(self.train_df, 
#                                                       self.train_target_columns, 
#                                                       self.group_ids)
#         self.valid_series = self.trans_30490_to_42840(self.valid_df, 
#                                                       self.valid_target_columns, 
#                                                       self.group_ids)
#         self.weights = self.get_weight_df()
    
#     def get_name(self, i):
#         '''
#         convert a str or list of strings to unique string 
#         used for naming each of 42840 series
#         '''
#         if type(i) == str or type(i) == int:
#             return str(i)
#         else:
#             return "--".join(i)
    
#     def get_weight_df(self) -> pd.DataFrame:
#         '''
#         returns weights for each of 42840 series in a dataFrame
#         '''
#         day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
#         weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
#         weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
#         weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)
#         weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
#         weight_df['value'] = weight_df['value'] * weight_df['sell_price']
#         weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
#         weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
#         weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
#         weights_map = {}
#         for i, group_id in enumerate(tqdm(self.group_ids, leave=False)):
#             lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
#             lv_weight = lv_weight / lv_weight.sum()
#             for i in range(len(lv_weight)):
#                     weights_map[self.get_name(lv_weight.index[i])] = np.array([lv_weight.iloc[i]])
#         weights = pd.DataFrame(weights_map).T / len(self.group_ids)
        
#         return weights

#     def trans_30490_to_42840(self, df, cols, group_ids):
#         '''
#         transform 30490 sries to all 42840 series
#         '''
#         series_map = {}
#         for i, group_id in enumerate(tqdm(self.group_ids, leave=False)):
#             tr = df.groupby(group_id)[cols].sum()
#             for i in range(len(tr)):
#                 series_map[self.get_name(tr.index[i])] = tr.iloc[i].values
#         return pd.DataFrame(series_map).T
    
#     def get_rmsse(self, valid_preds) -> pd.Series:
#         '''
#         returns rmsse scores for all 42840 series
#         '''
#         score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
#         scale = ((self.train_series.iloc[:, 1:].values - self.train_series.iloc[:, :-1].values) ** 2).mean(axis=1)
#         rmsse = (score / scale).map(np.sqrt)
#         return rmsse

#     def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
#         assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

#         if isinstance(valid_preds, np.ndarray):
#             valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

#         valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)
#         valid_preds = self.trans_30490_to_42840(valid_preds, self.valid_target_columns, self.group_ids)
#         self.rmsse = self.get_rmsse(valid_preds)
#         self.contributors = pd.concat([self.weights, self.rmsse], axis=1, sort=False).prod(axis=1)
#         return np.sum(self.contributors)