## Please enter the path of your top-level project folder
Folder name: SUBMISSION MODEL

In [1]:
# Root of the project tree. The sub-folder paths below are built by plain
# string concatenation onto this value, so it must end with a '/'.
dir_ = 'C:/Users/jekim/OneDrive/바탕 화면/m5-forecasting-accuracy/' # input only here

#### Set the other directories

In [2]:
# Sub-folders of the project root; the numeric prefixes are part of the
# folder names on disk.
raw_data_dir = dir_ + '2. data/'
processed_data_dir = raw_data_dir + 'processed/'
log_dir = dir_ + '4. logs/'
model_dir = dir_ + '5. models/'
submission_dir = dir_ + '6. submissions/'

In [3]:
####################################################################################
########################### 1-1. recursive model by store ##########################
####################################################################################

In [11]:
# Run configuration.
# `KKK` is the number of 28-day blocks subtracted from day 1941 when
# END_TRAIN is computed further down.
# `ver` is presumably a run label; it is not read by the visible code.
ver = 'priv'
KKK = 0

# Store identifiers (not referenced elsewhere in the visible code).
STORES_IDS = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]

# Cluster ids 0..13; one test pickle and one model file exist per id.
CLUSTERS_IDS = list(range(14))

In [5]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

from multiprocessing import Pool

warnings.filterwarnings('ignore')

In [6]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed the global RNGs of ``random`` and ``numpy.random`` with ``seed``."""
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Run ``func`` over ``t_split`` in worker processes and join the results.

    Args:
        func: callable applied to each element of ``t_split``; each call
            must return a pandas object that ``pd.concat`` accepts.
        t_split: sequence of arguments, one per call.

    Returns:
        The per-element results concatenated column-wise (``axis=1``), in
        the order of ``t_split``.
    """
    # Never start more workers than there are tasks.
    num_cores = min(N_CORES, len(t_split))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Always release the workers, even when a task or the concat fails.
        pool.close()
        pool.join()
    return df

In [7]:
########################### Helper to load data by cluster ID
#################################################################################
# Read data
def get_data_by_cluster(cluster):
    """Build the feature frame for a single cluster.

    The pickles at BASE, PRICE and CALENDAR are concatenated column-wise
    (the first two columns of PRICE and CALENDAR are skipped). Rows are then
    restricted to ``d >= START_TRAIN`` and ``tskm_10 == cluster``, and the
    ``mean_features`` columns of MEAN_ENC plus the columns of LAGS from the
    fourth one onward are appended for the surviving index labels.

    Returns:
        A ``(df, features)`` tuple. ``features`` lists every column that is
        not in ``remove_features``; ``df`` holds ``id``, ``d``, TARGET and
        those feature columns, with its index reset to 0..n-1.
    """
    # Read and concat the basic features
    df = pd.concat(
        [
            pd.read_pickle(BASE),
            pd.read_pickle(PRICE).iloc[:, 2:],
            pd.read_pickle(CALENDAR).iloc[:, 2:],
        ],
        axis=1,
    )

    # Training window first, then the requested cluster
    in_window = df['d'] >= START_TRAIN
    df = df[in_window]
    in_cluster = df['tskm_10'] == cluster
    df = df[in_cluster]

    # Only index labels that survived the filters are kept from the extras
    kept_labels = df.index

    encodings = pd.read_pickle(MEAN_ENC)[mean_features]
    encodings = encodings[encodings.index.isin(kept_labels)]

    lags = pd.read_pickle(LAGS).iloc[:, 3:]
    lags = lags[lags.index.isin(kept_labels)]

    df = pd.concat([df, encodings], axis=1)
    del encodings

    df = pd.concat([df, lags], axis=1)
    del lags

    features = [name for name in df.columns if name not in remove_features]
    df = df[['id', 'd', TARGET] + features].reset_index(drop=True)

    return df, features

# Recombine Test set after training
def get_base_test():
    """Load every per-cluster test pickle and stack them into one frame.

    For each id in CLUSTERS_IDS the file
    ``test_item_cluster14_<id>.pkl`` under ``processed_data_dir`` is read
    and tagged with a ``cluster_id`` column.

    Returns:
        All cluster frames stacked row-wise with a fresh 0..n-1 index, or an
        empty DataFrame when CLUSTERS_IDS is empty.
    """
    frames = []
    for cluster_id in CLUSTERS_IDS:
        temp_df = pd.read_pickle(processed_data_dir+'test_item_cluster14_'+str(cluster_id)+'.pkl')
        temp_df['cluster_id'] = cluster_id
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the growing frame on every
    # iteration; this also avoids concatenating onto an empty DataFrame.
    return pd.concat(frames).reset_index(drop=True)


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Return a one-column frame with TARGET lagged by ``LAG_DAY`` rows per id.

    Reads the module-level ``base_test``. The column is named
    ``sales_lag_<LAG_DAY>``, is stored as float16 and shares the index of
    ``base_test``.
    NOTE(review): the shift is positional within each id, so rows are
    presumably already ordered by day inside every id - confirm upstream.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # The built-in grouped shift gives the same values as
    # transform(lambda x: x.shift(...)) without a Python call per group, and
    # building the frame directly avoids assigning onto a slice of base_test.
    shifted = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return shifted.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Return a one-column frame with a shifted rolling mean of TARGET per id.

    Args:
        LAG_DAY: pair ``[shift_day, roll_wind]``; within each id TARGET is
            shifted by ``shift_day`` rows and then averaged over a rolling
            window of ``roll_wind`` rows.

    Returns:
        A DataFrame indexed like the module-level ``base_test`` with the
        single column ``rolling_mean_tmp_<shift_day>_<roll_wind>``.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute the series straight from base_test: no intermediate column
    # copy and no assignment onto a slice.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)

In [8]:
########################### Model params
#################################################################################
import lightgbm as lgb
# LightGBM hyper-parameters. A 'seed' entry is added to this dict later on.
# NOTE(review): this notebook only loads pickled estimators, so these values
# presumably mirror the training notebook - confirm they are still in sync.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.015,
    'num_leaves': 2 ** 11 - 1,          # 2047
    'min_data_in_leaf': 2 ** 12 - 1,    # 4095
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 3000,               # the author used 3 for quick trial runs
    'boost_from_average': False,
    'verbose': -1,
}


In [9]:
########################### Vars
#################################################################################
VER = 1                        # goes into the '_v<VER>' suffix of the model file names
SEED = 42                      # one seed for random, numpy and LightGBM
seed_everything(SEED)
lgb_params['seed'] = SEED
N_CORES = psutil.cpu_count()   # cap on worker processes in df_parallelize_run


# LIMITS and const
TARGET = 'sales'               # name of the column being predicted
START_TRAIN = 0                # rows with d below this are dropped when loading data
END_TRAIN = 1941 - 28 * KKK    # predictions are written for days END_TRAIN+1 onward
P_HORIZON = 28                 # presumably the forecast horizon in days
USE_AUX = False                # True prefixes model paths with AUX_MODELS (not defined in the visible code)

# Columns that are excluded when the model feature list is built
remove_features = [
    'id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
    'date', 'wm_yr_wk', 'd', TARGET,
]

# Columns taken from the mean-encoding pickle; the cat_id / dept_id
# encodings are switched off.
mean_features = [
    # 'enc_cat_id_mean', 'enc_cat_id_std',
    # 'enc_dept_id_mean', 'enc_dept_id_std',
    'enc_item_id_mean', 'enc_item_id_std',
]

# Input locations
ORIGINAL = raw_data_dir
BASE = processed_data_dir + 'grid_part_1.pkl'
PRICE = processed_data_dir + 'grid_part_2.pkl'
CALENDAR = processed_data_dir + 'grid_part_3.pkl'
LAGS = processed_data_dir + 'lags_df_28.pkl'
MEAN_ENC = processed_data_dir + 'mean_encoding_df.pkl'


#SPLITS for lags creation
SHIFT_DAY = 28
N_LAGS = 15
# Consecutive lag offsets SHIFT_DAY .. SHIFT_DAY + N_LAGS - 1
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY + N_LAGS))
# Every [shift, window] pair consumed by make_lag_roll
ROLS_SPLIT = [
    [shift, window]
    for shift in [1, 7, 14]
    for window in [7, 14, 30, 60]
]

In [12]:
# Only the feature list is needed; the loaded frame is dropped immediately
# and its memory is reclaimed by the explicit collection below.
MODEL_FEATURES = get_data_by_cluster(CLUSTERS_IDS[-1])[1]
gc.collect()

34

In [14]:
########################### Predict
#################################################################################

# Recursive day-by-day prediction: the predictions for each day are written
# back into base_test[TARGET], so the rolling features rebuilt for the next
# day are computed on top of them.
all_preds = pd.DataFrame()

# Join back the test dataset with
# a small part of the training data
# to make recursive features
base_test = get_base_test()

main_time = time.time()

for PREDICT_DAY in range(1, P_HORIZON + 1):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    grid_df = base_test.copy()

    # slow for loop version
    # (make_lag_roll reads the global base_test, which already contains the
    # predictions of the previous days)
    temp = pd.concat([make_lag_roll(a) for a in ROLS_SPLIT], axis=1)
    grid_df = pd.concat([grid_df, temp], axis=1)
    del temp; gc.collect()
    ###

    # fast multiprocessing version
    #     grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
    ###

    # Same for every cluster and still needed after the cluster loop, so it
    # is computed once per day, outside that loop.
    day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)

    for cluster_id in CLUSTERS_IDS:

        model_path = model_dir+'lgb_model_item_cluster14'+str(cluster_id)+'_v'+str(VER)+'.bin'
        if USE_AUX:
            # NOTE(review): AUX_MODELS is not defined in the visible code, so
            # enabling USE_AUX raises NameError unless it is set elsewhere.
            model_path = AUX_MODELS + model_path

        # NOTE: unpickling executes arbitrary code; only load trusted models.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        mask = day_mask & (base_test['cluster_id']==cluster_id)
        if not mask.any():
            # Nothing to predict for this cluster on this day.
            continue

        # A single .loc write: chained `base_test[TARGET][mask] = ...` is not
        # guaranteed to modify base_test (it never does under Copy-on-Write),
        # which would silently break the recursion.
        base_test.loc[mask, TARGET] = estimator.predict(grid_df.loc[mask, MODEL_FEATURES])

    temp_df = base_test.loc[day_mask, ['id', TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()

    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))

    del temp_df

all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  3.71 min round |  3.71 min total |  40251.61 day sales |
Predict | Day: 2
##########  3.67 min round |  7.37 min total |  37213.26 day sales |
Predict | Day: 3
##########  3.84 min round |  11.21 min total |  36819.71 day sales |
Predict | Day: 4
##########  3.69 min round |  14.90 min total |  37058.27 day sales |
Predict | Day: 5
##########  3.58 min round |  18.47 min total |  42192.48 day sales |
Predict | Day: 6
##########  3.66 min round |  22.13 min total |  49960.40 day sales |
Predict | Day: 7
##########  3.78 min round |  25.92 min total |  50882.98 day sales |
Predict | Day: 8
##########  3.28 min round |  29.19 min total |  45061.70 day sales |
Predict | Day: 9
##########  3.65 min round |  32.84 min total |  39040.61 day sales |
Predict | Day: 10
##########  3.93 min round |  36.77 min total |  44216.85 day sales |
Predict | Day: 11
##########  3.82 min round |  40.59 min total |  44641.84 day sales |
Predict | Day: 12
##########  3.51 min roun

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.967713,0.818604,0.874251,0.904125,0.934961,1.338867,1.258636,1.139069,0.912391,...,1.031942,1.456437,1.242612,0.963609,0.830769,0.891368,1.049745,1.186898,1.243376,1.120867
1,HOBBIES_1_001_CA_2_evaluation,0.691250,0.579076,0.572282,0.582017,0.716532,0.782619,0.839415,0.774008,0.651236,...,0.740027,0.820774,0.866144,0.760896,0.716741,0.682587,0.688319,0.738994,0.820492,0.720872
2,HOBBIES_1_001_CA_3_evaluation,0.985815,0.861391,0.829535,0.867200,0.899118,1.309659,1.226873,1.085777,0.774185,...,1.031270,1.356682,1.228735,1.027036,0.931328,1.002184,0.964970,0.922761,1.369050,1.024901
3,HOBBIES_1_001_CA_4_evaluation,0.777852,0.639976,0.696111,0.722072,0.951828,1.023256,1.112587,0.946275,0.763726,...,0.897788,0.916131,1.161901,0.923988,0.779097,0.755304,0.691481,0.776141,1.016303,0.955351
4,HOBBIES_1_001_TX_1_evaluation,0.287374,0.296396,0.262546,0.347808,0.357846,0.375284,0.357714,0.398494,0.358148,...,0.348955,0.429051,0.400368,0.462634,0.356719,0.411714,0.347748,0.396991,0.415792,0.455678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_825_TX_2_evaluation,0.707154,0.674172,0.660900,0.696764,0.731537,0.804405,0.831355,0.762137,0.713292,...,0.962501,1.068530,1.180935,1.022062,0.964575,0.959903,0.842162,0.854548,1.046834,0.972161
30486,FOODS_3_825_TX_3_evaluation,1.024016,0.835609,0.773829,0.942506,1.070744,1.220946,1.251242,1.057901,0.932939,...,1.192481,1.544519,1.672412,1.205691,1.099351,1.137346,1.021354,1.090434,1.498410,1.388516
30487,FOODS_3_825_WI_1_evaluation,1.674315,1.476276,1.458393,1.435960,1.441749,1.742903,1.783490,1.473520,1.180304,...,1.516912,1.932270,2.065084,1.617330,1.435544,1.495843,1.315440,1.502139,1.793887,1.805864
30488,FOODS_3_825_WI_2_evaluation,1.591133,1.495511,1.454866,1.523339,1.849849,1.881223,2.095278,1.605501,1.282031,...,2.021367,2.425738,2.412771,1.732986,1.603814,1.652202,1.580789,1.751337,2.090334,2.213467


In [15]:
########################### Export
#################################################################################
# Align the predictions with the ids of the sample submission; ids that have
# no row in all_preds get 0 in every F column.
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)

# Create the output folder if needed so a missing directory cannot discard
# the result of the long prediction run above.
export_dir = submission_dir+'before_ensemble/'
os.makedirs(export_dir, exist_ok=True)
submission.to_csv(export_dir+'submission_kaggle_recursive_store.csv', index=False)