In [1]:
import os
import gc
import re
import json
import pickle
import datetime
from tqdm import tqdm
from typing import Union

import numpy as np
import pandas as pd
pd.options.display.max_columns = None

from typing import Union

import seaborn
import matplotlib.pyplot as plt
plt.style.use("seaborn-darkgrid")

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# custom funcs
from script import WRMSSEEvaluator
from script import cache_result
from script import reduce_mem_usage
from script import load_pickle, dump_pickle
from script import get_groups

In [35]:
# Columns needed for target encoding; 'sell_price' is only read to build the mask below.
use_cols = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'sales', 'sell_price',
]
df = pd.read_pickle('features/melted_and_merged_train.pkl')[use_cols]

# Blank out sales on rows that have no sell price so they do not enter the
# encoder statistics (presumably the item was not on sale then -- TODO confirm).
isnan_sell_price = df['sell_price'].isna().to_numpy()
df.loc[isnan_sell_price, 'sales'] = np.nan
df = df.drop(columns=['sell_price'])

In [36]:
# Sanity check: print the frame's (rows, columns) and display its first rows.
print(df.shape)
df.head()

(35093990, 7)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,0.0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1.0
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1.0


In [37]:
class GaussianTargetEncoder():
    """Target encoder that shrinks per-group target statistics towards a prior.

    For every group defined by ``group_cols`` the encoder stores the count
    (``n``), mean (``mu_mle``) and sample variance (``sig2_mle``) of the target,
    plus the group average of a prior mean (``mu_prior``). ``transform`` then
    combines them as::

        precision = prior_precision + n / sig2_mle
        mean      = (prior_precision * mu_prior + n / sig2_mle * mu_mle) / precision
        var       = 1 / precision

    Parameters
    ----------
    group_cols : str or list of str
        Column name(s) that define the groups.
    target_col : str, default "target"
        Column holding the target values.
    prior_cols : list of str or None, default None
        Columns whose row-wise mean is used as the prior mean. When ``None``
        the mean of ``target_col`` over the frame passed in is used instead.
    """

    def __init__(self, group_cols, target_col="target", prior_cols=None):
        self.group_cols = group_cols
        self.target_col = target_col
        self.prior_cols = prior_cols

    def _get_prior(self, df):
        """Return the per-row prior mean for ``df``.

        Returns a NumPy array filled with the mean of ``target_col`` when
        ``prior_cols`` is ``None``; otherwise a Series holding the row-wise
        mean of ``prior_cols``.
        """
        if self.prior_cols is None:
            return np.full(len(df), df[self.target_col].mean())
        return df[self.prior_cols].mean(axis=1)

    def fit(self, df):
        """Compute per-group statistics from ``df`` and store them in ``self.stats``.

        Returns
        -------
        GaussianTargetEncoder
            ``self``, so calls can be chained.
        """
        # String aliases instead of np.mean / np.var: pandas currently remaps the
        # numpy callables to the groupby implementations (sample variance,
        # ddof=1) but has deprecated that remapping. The aliases pin that
        # behaviour and avoid a silent switch to numpy's ddof=0 variance.
        self.stats = (
            df.assign(mu_prior=self._get_prior(df), y=df[self.target_col])
            .groupby(self.group_cols)
            .agg(
                n=("y", "count"),
                mu_mle=("y", "mean"),
                sig2_mle=("y", "var"),
                mu_prior=("mu_prior", "mean"),
            )
        )
        return self

    def transform(self, df, prior_precision=1000, stat_type="mean"):
        """Encode the rows of ``df`` using the statistics learned by ``fit``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing ``group_cols`` and whatever ``_get_prior`` reads
            (``prior_cols``, or ``target_col`` when ``prior_cols`` is ``None``).
        prior_precision : float, default 1000
            Weight of the prior; larger values shrink harder towards it.
        stat_type : {"mean", "var", "precision"}, default "mean"
            Which quantity to return.

        Returns
        -------
        numpy.ndarray
            One float per row of ``df``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        ValueError
            If ``stat_type`` is not recognised.
        """
        if not hasattr(self, "stats"):
            raise AttributeError(
                "GaussianTargetEncoder is not fitted yet; call fit() before transform()."
            )

        stats = self.stats
        data_precision = stats["n"] / stats["sig2_mle"]
        precision = prior_precision + data_precision

        if stat_type == "mean":
            encoded = (
                prior_precision * stats["mu_prior"] + data_precision * stats["mu_mle"]
            ) / precision
        elif stat_type == "var":
            encoded = 1.0 / precision
        elif stat_type == "precision":
            encoded = precision / 1.0
        else:
            raise ValueError(f"stat_type={stat_type} not recognized.")

        # Look the encoded value up per row; multi-column groups are keyed by tuple.
        mapper = dict(zip(stats.index, encoded))
        if isinstance(self.group_cols, str):
            keys = df[self.group_cols].values.tolist()
        elif len(self.group_cols) == 1:
            keys = df[self.group_cols[0]].values.tolist()
        else:
            keys = zip(*[df[x] for x in self.group_cols])

        values = np.array([mapper.get(k, np.nan) for k in keys], dtype=float)

        # Groups that were unseen, or whose statistic is NaN/inf (e.g. a single
        # observation gives an undefined sample variance), fall back to the prior.
        # NOTE(review): the same prior *mean* is used as fallback for the "var"
        # and "precision" stat types as well -- confirm that this is intended.
        prior = np.asarray(self._get_prior(df), dtype=float)
        needs_prior = ~np.isfinite(values)
        values[needs_prior] = prior[needs_prior]

        return values

    def fit_transform(self, df, *args, **kwargs):
        """Fit on ``df`` and return ``transform(df, *args, **kwargs)``."""
        return self.fit(df).transform(df, *args, **kwargs)

In [38]:
# Maps each tuple of grouping columns to the columns used as the encoder's prior.
# None means the encoder uses the mean of the target column as prior.
# Insertion order matters: the single encodings must be produced before the
# interactions, because the interactions read those gte_* columns as priors.
groups_and_priors = {
    # single encodings
    ("state_id",):    None,
    ("store_id",):    None,
    ("cat_id",):      None,
    ("dept_id",):     None,
    ("item_id",):     None,
    
    # second-order interactions
    ("state_id", "dept_id"): ["gte_state_id", "gte_dept_id"],
    ("state_id", "item_id"): ["gte_state_id", "gte_item_id"],
    ("store_id", "dept_id"): ["gte_store_id", "gte_dept_id"],
    ("store_id", "item_id"): ["gte_store_id", "gte_item_id"],
}

In [39]:
# Build one encoded column per group definition, once for the posterior mean
# and once for the posterior variance.
for agg_f in ['mean', 'var']:
    features = []
    for group_cols, prior_cols in groups_and_priors.items():
        # Columns are created under the un-suffixed name first, because the
        # interaction encoders look their priors up under those names.
        features.append(f"gte_{'_'.join(group_cols)}")
        print(f'Add {features[-1]}')

        gte = GaussianTargetEncoder(list(group_cols), "sales", prior_cols)
        df[features[-1]] = gte.fit_transform(df, prior_precision=100, stat_type=agg_f)

    # Suffix this pass's columns with the statistic name (e.g. gte_item_id_MEAN).
    rename_dict = {feat: f'{feat}_{agg_f.upper()}' for feat in features}

    # If this cell is re-run on an already-encoded frame, the suffixed columns
    # from the previous run still exist and the rename would create duplicate
    # column labels. Drop the stale ones first; nothing is dropped on a first run.
    stale_cols = [col for col in rename_dict.values() if col in df.columns]
    if stale_cols:
        df = df.drop(columns=stale_cols)

    df = df.rename(columns=rename_dict)

Add gte_state_id
Add gte_store_id
Add gte_cat_id
Add gte_dept_id
Add gte_item_id
Add gte_state_id_dept_id
Add gte_state_id_item_id
Add gte_store_id_dept_id
Add gte_store_id_item_id
Add gte_state_id
Add gte_store_id
Add gte_cat_id
Add gte_dept_id
Add gte_item_id
Add gte_state_id_dept_id
Add gte_state_id_item_id
Add gte_store_id_dept_id
Add gte_store_id_item_id


In [40]:
# Sanity check after encoding: print (rows, columns) and display the first rows.
print(df.shape)
df.head()

(35093990, 25)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales,gte_state_id_MEAN,gte_store_id_MEAN,gte_cat_id_MEAN,gte_dept_id_MEAN,gte_item_id_MEAN,gte_state_id_dept_id_MEAN,gte_state_id_item_id_MEAN,gte_store_id_dept_id_MEAN,gte_store_id_item_id_MEAN,gte_state_id_VAR,gte_store_id_VAR,gte_cat_id_VAR,gte_dept_id_VAR,gte_item_id_VAR,gte_state_id_dept_id_VAR,gte_state_id_item_id_VAR,gte_store_id_dept_id_VAR,gte_store_id_item_id_VAR
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,,1.413611,1.4886,0.644553,0.787821,0.404978,1.00168,0.624777,1.144049,0.613291,1e-06,5e-06,7.021895e-07,1e-06,5.2e-05,5e-06,0.000189,2.2e-05,0.000655
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,0.0,1.413611,1.4886,0.644553,0.787821,0.264319,1.00168,0.213723,1.144049,0.298373,1e-06,5e-06,7.021895e-07,1e-06,2.9e-05,5e-06,5.5e-05,2.2e-05,0.000298
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,,1.413611,1.4886,0.644553,0.787821,0.185004,1.00168,0.253441,1.144049,0.380935,1e-06,5e-06,7.021895e-07,1e-06,2.8e-05,5e-06,0.000103,2.2e-05,0.000535
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1.0,1.413611,1.4886,0.644553,0.787821,2.042447,1.00168,2.935063,1.144049,1.955045,1e-06,5e-06,7.021895e-07,1e-06,0.000626,5e-06,0.002073,2.2e-05,0.002983
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,1.0,1.413611,1.4886,0.644553,0.787821,0.74616,1.00168,1.035265,1.144049,1.033228,1e-06,5e-06,7.021895e-07,1e-06,0.0001,5e-06,0.000346,2.2e-05,0.001122


In [43]:
# Select every column whose name starts with 'gte' (the encoded features).
dst_cols = df.columns[df.columns.str.startswith('gte')]

In [44]:
# Preview only the selected 'gte*' columns.
df[dst_cols].head()

Unnamed: 0,gte_state_id_MEAN,gte_store_id_MEAN,gte_cat_id_MEAN,gte_dept_id_MEAN,gte_item_id_MEAN,gte_state_id_dept_id_MEAN,gte_state_id_item_id_MEAN,gte_store_id_dept_id_MEAN,gte_store_id_item_id_MEAN,gte_state_id_VAR,gte_store_id_VAR,gte_cat_id_VAR,gte_dept_id_VAR,gte_item_id_VAR,gte_state_id_dept_id_VAR,gte_state_id_item_id_VAR,gte_store_id_dept_id_VAR,gte_store_id_item_id_VAR
0,1.413611,1.4886,0.644553,0.787821,0.404978,1.00168,0.624777,1.144049,0.613291,1e-06,5e-06,7.021895e-07,1e-06,5.2e-05,5e-06,0.000189,2.2e-05,0.000655
1,1.413611,1.4886,0.644553,0.787821,0.264319,1.00168,0.213723,1.144049,0.298373,1e-06,5e-06,7.021895e-07,1e-06,2.9e-05,5e-06,5.5e-05,2.2e-05,0.000298
2,1.413611,1.4886,0.644553,0.787821,0.185004,1.00168,0.253441,1.144049,0.380935,1e-06,5e-06,7.021895e-07,1e-06,2.8e-05,5e-06,0.000103,2.2e-05,0.000535
3,1.413611,1.4886,0.644553,0.787821,2.042447,1.00168,2.935063,1.144049,1.955045,1e-06,5e-06,7.021895e-07,1e-06,0.000626,5e-06,0.002073,2.2e-05,0.002983
4,1.413611,1.4886,0.644553,0.787821,0.74616,1.00168,1.035265,1.144049,1.033228,1e-06,5e-06,7.021895e-07,1e-06,0.0001,5e-06,0.000346,2.2e-05,0.001122
