Copyright 2020 Konstantin Yakovlev, Matthias Anderer

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)


In [3]:
LOSS_MULTIPLIER = 0.97 # Set multiplier according to desired under-/overshooting

In [4]:
# define custom loss function
def custom_asymmetric_train(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual < 0, -2 * residual, -2 * residual * LOSS_MULTIPLIER)
    hess = np.where(residual < 0, 2, 2 * LOSS_MULTIPLIER)
    return grad, hess

# define custom evaluation metric
def custom_asymmetric_valid(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual ** 2) , (residual ** 2) * LOSS_MULTIPLIER) 
    return "custom_asymmetric_eval", np.mean(loss), False

In [5]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]
    
    ############
    
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


In [6]:
########################### Model params
#################################################################################
import lightgbm as lgb
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric':'rmse',
        'n_jobs': -1,
        'seed': 42,
        'learning_rate': 0.2,
        'bagging_fraction': 0.85,
        'bagging_freq': 1, 
        'colsample_bytree': 0.85,
        'colsample_bynode': 0.85,
        #'min_data_per_leaf': 25,
        #'num_leaves': 200,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5
}



In [7]:
########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913+28            # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False               # Use or not pretrained models

#FEATURES to remove
## These features lead to overfit
## or values not present in test set
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]

#PATHS for Features
ORIGINAL = 'C://Users//nkyam//Desktop//m5_forecast//'
BASE     = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_1.pkl'
PRICE    = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_2.pkl'
CALENDAR = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_3.pkl'

#STORES ids
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())
STORES_IDS 


['CA_1',
 'CA_2',
 'CA_3',
 'CA_4',
 'TX_1',
 'TX_2',
 'TX_3',
 'WI_1',
 'WI_2',
 'WI_3']

In [8]:
########################### Train Models
#################################################################################
for store_id in STORES_IDS:
    print('Train', store_id)
    
    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)
    
    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validatio set)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')
    
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          num_boost_round = 3600, 
                          early_stopping_rounds = 50, 
                          valid_sets = [train_data, valid_data],
                          verbose_eval = 100,
                          fobj = custom_asymmetric_train

                          )
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
[LightGBM] [Info] Saving data to binary file train_data.bin
[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 2.22551
[200]	training's rmse: 2.47065	valid_1's rmse: 2.16504
[300]	training's rmse: 2.40963	valid_1's rmse: 2.10288
[400]	training's rmse: 2.36775	valid_1's rmse: 2.08315
[500]	training's rmse: 2.33403	valid_1's rmse: 2.06588
[600]	training's rmse: 2.30514	valid_1's rmse: 2.04442
[700]	training's rmse: 2.2817	valid_1's rmse: 2.03211
[800]	training's rmse: 2.25977	valid_1's rmse: 2.02493
[900]	training's rmse: 2.24008	valid_1's rmse: 2.01078
[1000]	training's rmse: 2.22229	valid_1's rmse: 1.99393
[1100]	train

'rm' is not recognized as an internal or external command,
operable program or batch file.


Train CA_2
[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 2.42152
Early stopping, best iteration is:
[79]	training's rmse: 2.59884	valid_1's rmse: 2.40633
Train CA_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 3.39523
Early stopping, best iteration is:
[113]	training's rmse: 2.55309	valid_1's rmse: 3.38983
Train CA_4


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's rmse: 3.55279	valid_1's rmse: 1.63163
Train TX_1


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	training's rmse: 3.05777	valid_1's rmse: 2.35158
Train TX_2


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 2.68452
Early stopping, best iteration is:
[79]	training's rmse: 2.59884	valid_1's rmse: 2.67096
Train TX_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 2.7333
Early stopping, best iteration is:
[79]	training's rmse: 2.59884	valid_1's rmse: 2.72858
Train WI_1


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's rmse: 3.36378	valid_1's rmse: 2.30818
Train WI_2


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	training's rmse: 3.05777	valid_1's rmse: 4.54745
Train WI_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.56784	valid_1's rmse: 2.86372
Early stopping, best iteration is:
[83]	training's rmse: 2.59171	valid_1's rmse: 2.86089


'rm' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
########################### Predict
#################################################################################

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.03 min round |  0.03 min total |  36431.78 day sales |
Predict | Day: 2
##########  0.03 min round |  0.07 min total |  33594.00 day sales |
Predict | Day: 3
##########  0.03 min round |  0.10 min total |  33540.71 day sales |
Predict | Day: 4
##########  0.03 min round |  0.14 min total |  33548.35 day sales |
Predict | Day: 5
##########  0.03 min round |  0.17 min total |  37788.57 day sales |
Predict | Day: 6
##########  0.03 min round |  0.20 min total |  46347.21 day sales |
Predict | Day: 7
##########  0.03 min round |  0.24 min total |  47015.21 day sales |
Predict | Day: 8
##########  0.03 min round |  0.27 min total |  38936.11 day sales |
Predict | Day: 9
##########  0.03 min round |  0.30 min total |  34022.81 day sales |
Predict | Day: 10
##########  0.03 min round |  0.34 min total |  36522.60 day sales |
Predict | Day: 11
##########  0.03 min round |  0.37 min total |  35502.60 day sales |
Predict | Day: 12
##########  0.03 min round |  0.41

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.839895,0.770471,0.776066,0.764469,0.709859,0.814695,0.804282,0.907845,0.649785,...,0.958874,1.031036,0.984997,0.828873,0.766669,0.764254,0.767072,0.897533,1.006754,0.998743
1,HOBBIES_1_002_CA_1_evaluation,0.268222,0.198797,0.204393,0.192796,0.295693,0.396288,0.385875,0.341451,0.261971,...,0.355360,0.423283,0.386442,0.267709,0.205505,0.203089,0.205907,0.310015,0.414997,0.397889
2,HOBBIES_1_003_CA_1_evaluation,0.370123,0.298645,0.304240,0.292644,0.395540,0.485437,0.475025,0.435846,0.361819,...,0.455208,0.512432,0.475592,0.369644,0.305353,0.302937,0.305755,0.409863,0.504146,0.414109
3,HOBBIES_1_004_CA_1_evaluation,1.380381,1.268805,1.274400,1.484349,1.808317,2.756200,2.745788,1.644015,1.331978,...,1.866848,3.061138,3.021723,1.369359,1.265003,1.262587,1.486951,1.812130,3.043479,1.420453
4,HOBBIES_1_005_CA_1_evaluation,1.089588,1.018110,1.023705,1.012109,1.138675,1.398029,1.387617,1.151425,1.081284,...,0.948675,1.297531,1.354270,1.067748,0.993143,0.990727,0.993545,1.121323,1.385063,1.192002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.962400,0.795768,0.795768,0.795768,0.923516,1.266077,1.287696,1.020187,0.795768,...,1.077906,1.371018,1.348690,0.962400,0.839836,0.839836,0.839836,0.967584,1.310145,1.331764
30486,FOODS_3_824_WI_3_evaluation,0.792059,0.644525,0.644525,0.644525,0.758873,1.119714,1.141332,0.849846,0.644525,...,0.897818,1.209209,1.202327,0.792059,0.688593,0.688593,0.688593,0.802941,1.163782,1.185400
30487,FOODS_3_825_WI_3_evaluation,1.038057,0.871425,0.871425,0.871425,1.033437,1.381427,1.403046,1.095844,0.871425,...,1.187828,1.486368,1.464040,1.038057,0.915493,0.915493,0.915493,1.077505,1.425495,1.447114
30488,FOODS_3_826_WI_3_evaluation,1.309850,1.143219,1.143219,1.143219,1.373694,1.949529,1.971148,1.367638,1.143219,...,1.528084,2.054469,2.032142,1.309850,1.187287,1.187287,1.187287,1.417762,1.993597,2.015216


In [10]:
########################### Export
#################################################################################
# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v'+str(VER)+'.csv', index=False)