<a href="https://colab.research.google.com/github/ipejun-ai/m5-accuracy/blob/master/m5_custom_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
## In this kernel I would like to show: 
## 1. FE creation approaches
## 2. Sequential fe validation
## 3. Dimension reduction
## 4. FE validation by Permutation importance
## 5. Mean encodings
## 6. Parallelization for FE

In [11]:
import numpy as np 
import pandas as pd 
import os, sys, gc, warnings, psutil, random

warnings.filterwarnings('ignore')

In [12]:
#Load google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [13]:
DIRPATH="/content/gdrive/My Drive/kaggle/m5-forecasting-accuracy"
#DIRPATH="C:/Users/peiju/Documents/Study/kaggle/m5-forecasting-accuracy/"

In [14]:
########################### Load data
########################### Basic features were created here:
########################### https://www.kaggle.com/kyakovlev/m5-simple-fe
#################################################################################

# Read data
grid_df = pd.concat([pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl'),
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)

# Subsampling
# to make all calculations faster.
# Keep only 5% of original ids.
keep_id = np.array_split(list(grid_df['id'].unique()), 20)[0]
grid_df = grid_df[grid_df['id'].isin(keep_id)].reset_index(drop=True)

# Let's "inspect" our grid DataFrame
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3002725 entries, 0 to 3002724
Data columns (total 34 columns):
 #   Column            Dtype   
---  ------            -----   
 0   id                category
 1   item_id           category
 2   dept_id           category
 3   cat_id            category
 4   store_id          category
 5   state_id          category
 6   d                 int16   
 7   sales             float64 
 8   release           int16   
 9   sell_price        float16 
 10  price_max         float16 
 11  price_min         float16 
 12  price_std         float16 
 13  price_mean        float16 
 14  price_norm        float16 
 15  price_nunique     float16 
 16  item_nunique      int16   
 17  price_momentum    float16 
 18  price_momentum_m  float16 
 19  price_momentum_y  float16 
 20  event_name_1      category
 21  event_type_1      category
 22  event_name_2      category
 23  event_type_2      category
 24  snap_CA           category
 25  snap_TX           

In [15]:
########################### Baseline model
#################################################################################

# We will need some global VARS for future

SEED = 42             # Our random seed for everything
random.seed(SEED)     # to make all tests "deterministic"
np.random.seed(SEED)
N_CORES = psutil.cpu_count()     # Available CPU cores

TARGET = 'sales'      # Our Target
END_TRAIN = 1913+28      # And we will use last 28 days as validation

SHIFT_DAY=28
# Drop some items from "TEST" set part (1914...)
grid_df = grid_df[grid_df['d']<=END_TRAIN].reset_index(drop=True)

# Features that we want to exclude from training
remove_features = ['id','d',TARGET]

# Our baseline model serves
# to do fast checks of
# new features performance 

# We will use LightGBM for our tests
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',         # Standart boosting type
                    'objective': 'regression',       # Standart loss for RMSE
                    'metric': ['rmse'],              # as we will use rmse as metric "proxy"
                    'subsample': 0.8,                
                    'subsample_freq': 1,
                    'learning_rate': 0.05,           # 0.5 is "fast enough" for us
                    'num_leaves': 2**7-1,            # We will need model only for fast check
                    'min_data_in_leaf': 2**8-1,      # So we want it to train faster even with drop in generalization 
                    'feature_fraction': 0.8,
                    'n_estimators': 5000,            # We don't want to limit training (you can change 5000 to any big enough number)
                    'early_stopping_rounds': 30,     # We will stop training almost immediately (if it stops improving) 
                    'seed': SEED,
                    'verbose': -1,
                } 

## RMSE
def rmse(y, y_pred):
    return np.sqrt(np.mean(np.square(y - y_pred)))

# Small function to make fast features tests
# estimator = make_fast_test(grid_df)
# it will return lgb booster for future analisys
def make_fast_test(df):

    features_columns = [col for col in list(df) if col not in remove_features]

    tr_x, tr_y = df[df['d']<=(END_TRAIN-28)][features_columns], df[df['d']<=(END_TRAIN-28)][TARGET]              
    vl_x, v_y = df[df['d']>(END_TRAIN-28)][features_columns], df[df['d']>(END_TRAIN-28)][TARGET]
    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)
    
    estimator = lgb.train(
                            lgb_params,
                            train_data,
                            valid_sets = [train_data,valid_data],
                            verbose_eval = 500,
                        )
    
    return estimator

# Make baseline model
baseline_model = make_fast_test(grid_df)

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[379]	training's rmse: 2.79812	valid_1's rmse: 2.39787


In [16]:
########################### Lets test our normal Lags (7 days)
########################### Some more info about lags here:
########################### https://www.kaggle.com/kyakovlev/m5-lags-features
#################################################################################

# Small helper to make lags creation faster
from multiprocessing import Pool                # Multiprocess Runs

## Multiprocessing Run.
# :t_split - int of lags days                   # type: int
# :func - Function to apply on each split       # type: python function
# This function is NOT 'bulletproof', be carefull and pass only correct types of variables.
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

def make_normal_lag(lag_day):
    lag_df = grid_df[['id','d',TARGET]] # not good to use df from "global space"
    col_name = 'sales_lag_'+str(lag_day)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(lag_day)).astype(np.float16)
    return lag_df[[col_name]]

# Launch parallel lag creation
# and "append" to our grid
LAGS_SPLIT = [col for col in range(1,1+7)]
grid_df = pd.concat([grid_df, df_parallelize_run(make_normal_lag,LAGS_SPLIT)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[325]	training's rmse: 2.56457	valid_1's rmse: 2.26084


In [17]:
########################### Permutation importance Test
########################### https://www.kaggle.com/dansbecker/permutation-importance @dansbecker
#################################################################################

# Let's creat validation dataset and features
features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Make normal prediction with our model and save score
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)


# Now we are looping over all our numerical features
for col in features_columns:
    
    # We will make validation set copy to restore
    # features states on each run
    temp_df = validation_df.copy()
    
    # Error here appears if we have "categorical" features and can't 
    # do np.random.permutation without disrupt categories
    # so we need to check if feature is numerical
    if temp_df[col].dtypes.name != 'category':
        temp_df[col] = np.random.permutation(temp_df[col].values)
        temp_df['preds'] = test_model.predict(temp_df[features_columns])
        cur_score = rmse(temp_df[TARGET], temp_df['preds'])
        
        # If our current rmse score is less than base score
        # it means that feature most probably is a bad one
        # and our model is learning on noise
        print(col, np.round(cur_score - base_score, 4))

# Remove Temp data
del temp_df, validation_df

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]


# Results:
## Lags with 1 days shift (nearest past) are important
## Some other features are not important and probably just noise
## Better make several Permutation runs to confirm useless of the feature
## link again https://www.kaggle.com/dansbecker/permutation-importance @dansbecker

## price_nunique -0.002 : strong negative values are most probably noise
## price_max -0.0002 : values close to 0 need deeper investigation


Standart RMSE 2.260843574932562
release 0.0
sell_price 0.0038
price_max 0.0001
price_min 0.0009
price_std 0.0029
price_mean 0.0035
price_norm 0.0075
price_nunique -0.0001
item_nunique -0.0
price_momentum -0.0001
price_momentum_m 0.0034
price_momentum_y 0.0005
tm_d 0.0106
tm_w 0.0003
tm_m -0.0001
tm_y 0.0
tm_wm 0.0002
tm_dw 0.1403
tm_w_end 0.0087
sales_lag_1 0.5897
sales_lag_2 0.0465
sales_lag_3 0.0219
sales_lag_4 0.012
sales_lag_5 0.0183
sales_lag_6 0.019
sales_lag_7 0.0448


from eli5 documentation (seems it's perfect explanation)

The idea is the following: feature importance can be measured by looking at how much the score (accuracy, mse, rmse, mae, etc. - any score we’re interested in) decreases when a feature is not available.

To do that one can remove feature from the dataset, re-train the estimator and check the score. But it requires re-training an estimator for each feature, which can be computationally intensive. Also, it shows what may be important within a dataset, not what is important within a concrete trained model.

To avoid re-training the estimator we can remove a feature only from the test part of the dataset, and compute score without using this feature. It doesn’t work as-is, because estimators expect feature to be present. So instead of removing a feature we can **replace it with random noise** - feature column is still there, but it no longer contains useful information. This method works if noise is drawn from the **same distribution as original feature values** (as otherwise estimator may fail). The simplest way to get such noise is to shuffle values for a feature, i.e. use other examples’ feature values - this is how permutation importance is computed.

---

It's not good when feature remove (replaced by noise) but we have better score. Simple and easy. 

In [None]:
########################### Lets test far away Lags (7 days with 56 days shift)
########################### and check permutation importance
#################################################################################

LAGS_SPLIT = [col for col in range(56,56+7)]
grid_df = pd.concat([grid_df, df_parallelize_run(make_normal_lag,LAGS_SPLIT)], axis=1)
test_model = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)

for col in features_columns:
    temp_df = validation_df.copy()
    if temp_df[col].dtypes.name != 'category':
        temp_df[col] = np.random.permutation(temp_df[col].values)
        temp_df['preds'] = test_model.predict(temp_df[features_columns])
        cur_score = rmse(temp_df[TARGET], temp_df['preds'])
        print(col, np.round(cur_score - base_score, 4))

del temp_df, validation_df
        
# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]


# Results:
## Lags with 56 days shift (far away past) are not as important
## as nearest past lags
## and at some point will be just noise for our model

In [None]:
########################### PCA
#################################################################################

# The main question here - can we have 
# almost same rmse boost with less features
# less dimensionality?

# Lets try PCA and make 7->3 dimensionality reduction

# PCA is "unsupervised" learning
# and with shifted target we can be sure
# that we have no Target leakage
from sklearn.decomposition import PCA

def make_pca(df, pca_col, n_days):
    print('PCA:', pca_col, n_days)
    
    # We don't need any other columns to make pca
    pca_df = df[[pca_col,'d',TARGET]]
    
    # If we are doing pca for other series "levels" 
    # we need to agg first
    if pca_col != 'id':
        merge_base = pca_df[[pca_col,'d']]
        pca_df = pca_df.groupby([pca_col,'d'])[TARGET].agg(['sum']).reset_index()
        pca_df[TARGET] = pca_df['sum']
        del pca_df['sum']
    
    # Min/Max scaling
    pca_df[TARGET] = pca_df[TARGET]/pca_df[TARGET].max()
    
    # Making "lag" in old way (not parallel)
    LAG_DAYS = [col for col in range(1,n_days+1)]
    format_s = '{}_pca_'+pca_col+str(n_days)+'_{}'
    pca_df = pca_df.assign(**{
            format_s.format(col, l): pca_df.groupby([pca_col])[col].transform(lambda x: x.shift(l))
            for l in LAG_DAYS
            for col in [TARGET]
        })
    
    pca_columns = list(pca_df)[3:]
    pca_df[pca_columns] = pca_df[pca_columns].fillna(0)
    pca = PCA(random_state=SEED)
    
    # You can use fit_transform here
    pca.fit(pca_df[pca_columns])
    pca_df[pca_columns] = pca.transform(pca_df[pca_columns])
    
    print(pca.explained_variance_ratio_)
    
    # we will keep only 3 most "valuable" columns/dimensions 
    keep_cols = pca_columns[:3]
    print('Columns to keep:', keep_cols)
    
    # If we are doing pca for other series "levels"
    # we need merge back our results to merge_base df
    # and only than return resulted df
    # I'll skip that step here
    
    return pca_df[keep_cols]


# Make PCA
grid_df = pd.concat([grid_df, make_pca(grid_df,'id',7)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if '_pca_' not in col]
grid_df = grid_df[keep_cols]

In [None]:
########################### Mean/std target encoding
#################################################################################

# We will use these three columns for test
# (in combination with store_id)
icols = ['item_id','cat_id','dept_id']

# But we can use any other column or even multiple groups
# like these ones
#            'state_id',
#            'store_id',
#            'cat_id',
#            'dept_id',
#            ['state_id', 'cat_id'],
#            ['state_id', 'dept_id'],
#            ['store_id', 'cat_id'],
#            ['store_id', 'dept_id'],
#            'item_id',
#            ['item_id', 'state_id'],
#            ['item_id', 'store_id']

# There are several ways to do "mean" encoding
## K-fold scheme
## LOO (leave one out)
## Smoothed/regularized 
## Expanding mean
## etc 

# You can test as many options as you want
# and decide what to use
# Because of memory issues you can't 
# use many features.

# We will use simple target encoding
# by std and mean agg
for col in icols:
    print('Encoding', col)
    temp_df = grid_df[grid_df['d']<=(1913-28)] # to be sure we don't have leakage in our validation set
    
    temp_df = temp_df.groupby([col,'store_id']).agg({TARGET: ['std','mean']})
    joiner = '_'+col+'_encoding_'
    temp_df.columns = [joiner.join(col).strip() for col in temp_df.columns.values]
    temp_df = temp_df.reset_index()
    grid_df = grid_df.merge(temp_df, on=[col,'store_id'], how='left')
    del temp_df

# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
keep_cols = [col for col in list(grid_df) if '_encoding_' not in col]
grid_df = grid_df[keep_cols]

# Bad thing that for some items  
# we are using past and future values.
# But we are looking for "categorical" similiarity
# on a "long run". So future here is not a big problem.

In [None]:
########################### Last non O sale
#################################################################################

def find_last_sale(df,n_day):
    
    # Limit initial df
    ls_df = df[['id','d',TARGET]]
    
    # Convert target to binary
    ls_df['non_zero'] = (ls_df[TARGET]>0).astype(np.int8)
    
    # Make lags to prevent any leakage
    ls_df['non_zero_lag'] = ls_df.groupby(['id'])['non_zero'].transform(lambda x: x.shift(n_day).rolling(2000,1).sum()).fillna(-1)

    temp_df = ls_df[['id','d','non_zero_lag']].drop_duplicates(subset=['id','non_zero_lag'])
    temp_df.columns = ['id','d_min','non_zero_lag']

    ls_df = ls_df.merge(temp_df, on=['id','non_zero_lag'], how='left')
    ls_df['last_sale'] = ls_df['d'] - ls_df['d_min']

    return ls_df[['last_sale']]


# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = pd.concat([grid_df, find_last_sale(grid_df,1)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
keep_cols = [col for col in list(grid_df) if 'last_sale' not in col]
grid_df = grid_df[keep_cols]

In [None]:
#######snap
def sum_snap(df):
  df["snap"]=0
  df["snap"].where(df['state_id'] != 'CA',df["snap_CA"],inplace=True)
  df["snap"].where(df['state_id'] != 'TX',df["snap_TX"],inplace=True)
  df["snap"].where(df['state_id'] != 'WI',df["snap_WI"],inplace=True)
  df.drop(["snap_CA","snap_TX","snap_WI"],axis=1)
  return df

grid_df1=sum_snap(grid_df)

 # Make features test
test_model = make_fast_test(grid_df1) 


In [None]:
def change_event(df):
  event_type = pd.read_pickle("/content/gdrive/My Drive/kaggle/df_event_type.pickle.gz")
  event_type_columns=event_type.columns
  event_type_columns=event_type_columns.drop("d")

  event_type[event_type_columns]=event_type[event_type_columns].astype("category")

  event_type["d"]=event_type["d"].str.replace("d_","").astype("int16")
  df=pd.merge(df,event_type,left_on="d",right_on="d",how="left")
  return df

grid_df=change_event(grid_df)

# Make features test
test_model = make_fast_test(grid_df) 

In [None]:
def add_event(df):
  event_value = pd.read_pickle("/content/gdrive/My Drive/kaggle/output/event_importance/event_value.pkl")
  
  event_value_columns=event_value.columns
  event_value_columns=event_value_columns.drop("d")

  event_value_columns=["Event_total"]

  event_value[event_value_columns]=event_value[event_value_columns].astype("float16")

  event_value["d"]=event_value["d"].str.replace("d_","").astype("int16")
  #cols=["d", 'Sports', 'Religious', 'National', 'Cultural']
  cols=["d","Event_total"]
  df=pd.merge(df,event_value[cols],left_on="d",right_on="d",how="left")
  return df

grid_df1=add_event(grid_df)

# Make features test
test_model = make_fast_test(grid_df1) 


features_columns = [col for col in list(grid_df1) if col not in remove_features]
validation_df = grid_df1[grid_df1['d']>(END_TRAIN-28)].reset_index(drop=True)
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)

for col in features_columns:
    temp_df = validation_df.copy()
    if temp_df[col].dtypes.name != 'category':
        temp_df[col] = np.random.permutation(temp_df[col].values)
        temp_df['preds'] = test_model.predict(temp_df[features_columns])
        cur_score = rmse(temp_df[TARGET], temp_df['preds'])
        print(col, np.round(cur_score - base_score, 4))

del temp_df, validation_df

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[321]	training's rmse: 2.81841	valid_1's rmse: 2.38378
Standart RMSE 2.383784659504859
release 0.0
sell_price 0.0152
price_max 0.0347
price_min 0.0111
price_std 0.0298
price_mean 0.0111
price_norm 0.0123
price_nunique 0.0081
item_nunique 0.0013
price_momentum 0.0004
price_momentum_m 0.022
price_momentum_y 0.0122
tm_d 0.0081
tm_w 0.0035
tm_m 0.0018
tm_y 0.0
tm_wm 0.0003
tm_dw 0.1746
tm_w_end 0.0081
Event_total 0.0018


In [None]:
grid_df[grid_df["id"]=="HOBBIES_1_008_CA_1_validation"]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.968750,0.949219,,,,,0,0,0,29,4,1,0,5,5,1
1525,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,2,15.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.968750,0.949219,,,,,0,0,0,30,4,1,0,5,6,1
3050,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,3,0.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.968750,0.949219,,,,,0,0,0,31,5,1,0,5,0,0
4575,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,4,0.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.968750,0.949219,,,,,1,1,0,1,5,2,0,1,1,0
6100,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,5,0.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.968750,0.949219,,,,,1,0,1,2,5,2,0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2909700,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1909,4.0,0,0.479980,0.5,0.419922,0.01976,0.476318,0.959961,4.0,17,1.0,1.014648,1.000000,,,,,0,0,0,20,16,4,5,3,2,0
2911225,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1910,6.0,0,0.479980,0.5,0.419922,0.01976,0.476318,0.959961,4.0,17,1.0,1.014648,1.000000,,,,,0,0,0,21,16,4,5,3,3,0
2912750,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1911,3.0,0,0.479980,0.5,0.419922,0.01976,0.476318,0.959961,4.0,17,1.0,1.014648,1.000000,,,,,0,0,0,22,16,4,5,4,4,0
2914275,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1912,2.0,0,0.479980,0.5,0.419922,0.01976,0.476318,0.959961,4.0,17,1.0,1.014648,1.000000,,,,,0,0,0,23,16,4,5,4,5,1


In [None]:
def add_holiday(df):
  df["holiday"]=0
  #df["holiday"].where(df["tm_dw"]<=4,1,inplace=True)
  df.loc[df["tm_dw"]>=5,"holiday"]=1
  df.loc[df["event_type_1"]=="National","holiday"]=1

  #holidayの売上
  df["h_sales"]=0
  df.loc[df["holiday"]==1,"h_sales"]=df["sales"]

  #non-holidayの売上
  df["w_sales"]=0
  df.loc[df["holiday"]==0,"w_sales"]=df["sales"]

  for i in [7,14,30,60,180]: #,14,30,60,180
    print('Rolling period:', i)
    df["holiday_sum"] = df.groupby(['id'])["h_sales"].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).sum())
    df["holiday_count"] = df.groupby(['id'])["holiday"].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).sum())
    
    df["weekday_sum"] = df.groupby(['id'])["w_sales"].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).sum())
    df["weekday_count"] = i - df["holiday_count"]

    df.loc[df["holiday"]==1,"rolling_holiday_mean_"+str(i)]=(df["holiday_sum"]/df["holiday_count"]).astype(np.float16)
    df.loc[df["holiday"]==0,"rolling_holiday_mean_"+str(i)]=(df["weekday_sum"]/df["weekday_count"]).astype(np.float16)
    
  df=df.drop(["h_sales","w_sales","holiday_sum","holiday_count","weekday_sum","weekday_count"],axis=1)
  


  return df
grid_df1=add_holiday(grid_df)
grid_df1[grid_df["id"]=="HOBBIES_1_008_CA_1_validation"].tail(20)
grid_df1.loc[grid_df1["event_type_1"]=="National","holiday"]

Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180


35075      1
35076      1
35077      1
35078      1
35079      1
          ..
2812095    1
2812096    1
2812097    1
2812098    1
2812099    1
Name: holiday, Length: 77775, dtype: int64

In [None]:
grid_df1[grid_df["id"]=="HOBBIES_1_008_CA_1_validation"].to_csv("test.csv")

In [18]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl')
grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    grid_df['enc'+col_name+'mean'] = grid_df.groupby(col)[TARGET].transform('mean').astype(np.float16)
    grid_df['enc'+col_name+'std'] = grid_df.groupby(col)[TARGET].transform('std').astype(np.float16)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']


In [19]:
#################################################################################
print('Save Mean/Std encoding')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/mean_encoding_df.pkl')

Save Mean/Std encoding


In [None]:
########################### Final list of new features
#################################################################################
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 24 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   id                         category
 1   d                          int16   
 2   enc_state_id_mean          float16 
 3   enc_state_id_std           float16 
 4   enc_store_id_mean          float16 
 5   enc_store_id_std           float16 
 6   enc_cat_id_mean            float16 
 7   enc_cat_id_std             float16 
 8   enc_dept_id_mean           float16 
 9   enc_dept_id_std            float16 
 10  enc_state_id_cat_id_mean   float16 
 11  enc_state_id_cat_id_std    float16 
 12  enc_state_id_dept_id_mean  float16 
 13  enc_state_id_dept_id_std   float16 
 14  enc_store_id_cat_id_mean   float16 
 15  enc_store_id_cat_id_std    float16 
 16  enc_store_id_dept_id_mean  float16 
 17  enc_store_id_dept_id_std   float16 
 18  enc_item_id_mean           float16 
 19  enc_item_id_std    

In [None]:
def add_nonzero(df):
  #0count
  df["is_zero"] = np.where(df["value"] == 0, 1, 0)
  df['rolling_7_zero_count'] = df.groupby(['id'])["is_zero"].shift(28).rolling(7).sum().fillna(0)
  df['rolling_28_zero_count'] = df.groupby(['id'])["is_zero"].shift(28).rolling(28).sum().fillna(0)
  
  return df






In [None]:
"""MOON PHASE CALCULATION
credits to: https://gist.github.com/miklb/ed145757971096565723
moonphase.py - Calculate Lunar Phase
Author: Sean B. Palmer, inamidst.com
Cf. http://en.wikipedia.org/wiki/Lunar_phase#Lunar_phase_calculation
"""
import math, decimal
dec = decimal.Decimal

def get_moon_phase(d):  # 0=new, 4=full; 4 days/phase
    diff = d - datetime(2001, 1, 1)
    days = dec(diff.days) + (dec(diff.seconds) / dec(86400))
    lunations = dec("0.20439731") + (days * dec("0.03386319269"))
    phase_index = math.floor((lunations % dec(1) * dec(8)) + dec('0.5'))
    return int(phase_index) & 7



  

In [None]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl')
grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = pd.concat([grid_df, find_last_sale(grid_df,1)], axis=1)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

In [None]:
#################################################################################
print('Save final sales')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/finalsales_df.pkl')

Save final sales


In [None]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 3 columns):
 #   Column     Dtype   
---  ------     -----   
 0   id         category
 1   d          int16   
 2   last_sale  int16   
dtypes: category(1), int16(2)
memory usage: 269.7 MB


In [None]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.concat([pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl'),
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)


grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df =sum_snap(grid_df)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

In [None]:
#################################################################################
print('Save snap')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/snap_df.pkl')

Save snap


In [None]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 3 columns):
 #   Column  Dtype   
---  ------  -----   
 0   id      category
 1   d       int16   
 2   snap    int64   
dtypes: category(1), int16(1), int64(1)
memory usage: 538.0 MB


In [None]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.concat([pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl'),
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)


grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = change_event(grid_df)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

NameError: ignored

In [None]:
#################################################################################
print('Save event type')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/event_type.pkl')

In [None]:
grid_df.info()

In [None]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.concat([pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl'),
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)


grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = add_event(grid_df)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

In [None]:
#################################################################################
print('Save event value')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/event_value.pkl')

Save event value


In [None]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46881677 entries, 0 to 46881676
Data columns (total 3 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           category
 1   d            int16   
 2   Event_total  float16 
dtypes: category(1), float16(1), int16(1)
memory usage: 627.4 MB


In [None]:
########################### Apply on grid_df
#################################################################################
# lets read grid from 
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.concat([pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_1.pkl'),
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(DIRPATH+'/output/m5-simple-fe/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)


grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)

# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = add_holiday(grid_df)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180


In [None]:
#################################################################################
print('Save event value')
grid_df.to_pickle(DIRPATH+'/output/m5-custom-features/holiday_mean.pkl')

Save event value
