Copyright 2020 Konstantin Yakovlev, Matthias Anderer

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)


In [3]:
LOSS_MULTIPLIER = 0.93 # Set multiplier according to desired under-/overshooting

In [4]:
# define custom loss function
def custom_asymmetric_train(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual < 0, -2 * residual, -2 * residual * LOSS_MULTIPLIER)
    hess = np.where(residual < 0, 2, 2 * LOSS_MULTIPLIER)
    return grad, hess

# define custom evaluation metric
def custom_asymmetric_valid(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual ** 2) , (residual ** 2) * LOSS_MULTIPLIER) 
    return "custom_asymmetric_eval", np.mean(loss), False

In [5]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]
    
    ############
    
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


In [6]:
########################### Model params
#################################################################################
import lightgbm as lgb
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric':'rmse',
        'n_jobs': -1,
        'seed': 42,
        'learning_rate': 0.2,
        'bagging_fraction': 0.85,
        'bagging_freq': 1, 
        'colsample_bytree': 0.85,
        'colsample_bynode': 0.85,
        #'min_data_per_leaf': 25,
        #'num_leaves': 200,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5
}



In [7]:
########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913+28            # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False               # Use or not pretrained models

#FEATURES to remove
## These features lead to overfit
## or values not present in test set
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]

#PATHS for Features
ORIGINAL = 'C://Users//nkyam//Desktop//m5_forecast//'
BASE     = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_1.pkl'
PRICE    = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_2.pkl'
CALENDAR = 'C://Users//nkyam//Desktop//m5_forecast//grid_part_3.pkl'

#STORES ids
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())
STORES_IDS 


['CA_1',
 'CA_2',
 'CA_3',
 'CA_4',
 'TX_1',
 'TX_2',
 'TX_3',
 'WI_1',
 'WI_2',
 'WI_3']

In [8]:
########################### Train Models
#################################################################################
for store_id in STORES_IDS:
    print('Train', store_id)
    
    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)
    
    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validatio set)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')
    
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          num_boost_round = 3600, 
                          early_stopping_rounds = 50, 
                          valid_sets = [train_data, valid_data],
                          verbose_eval = 100,
                          fobj = custom_asymmetric_train

                          )
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
[LightGBM] [Info] Saving data to binary file train_data.bin
[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.5754	valid_1's rmse: 2.21635
[200]	training's rmse: 2.47206	valid_1's rmse: 2.13312
[300]	training's rmse: 2.41198	valid_1's rmse: 2.09382
[400]	training's rmse: 2.36891	valid_1's rmse: 2.06743
[500]	training's rmse: 2.33539	valid_1's rmse: 2.04906
[600]	training's rmse: 2.30626	valid_1's rmse: 2.03047
[700]	training's rmse: 2.28403	valid_1's rmse: 2.01754
[800]	training's rmse: 2.26329	valid_1's rmse: 2.00448
[900]	training's rmse: 2.24395	valid_1's rmse: 1.99136
[1000]	training's rmse: 2.22739	valid_1's rmse: 1.98447
[1100]	train

'rm' is not recognized as an internal or external command,
operable program or batch file.


Train CA_2
[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.5754	valid_1's rmse: 2.41518
Early stopping, best iteration is:
[67]	training's rmse: 2.62896	valid_1's rmse: 2.41163
Train CA_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.5754	valid_1's rmse: 3.46767
Early stopping, best iteration is:
[126]	training's rmse: 2.54411	valid_1's rmse: 3.45801
Train CA_4


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's rmse: 3.56241	valid_1's rmse: 1.63107
Train TX_1


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	training's rmse: 3.24021	valid_1's rmse: 2.40318
Train TX_2


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.5754	valid_1's rmse: 2.69935
Early stopping, best iteration is:
[83]	training's rmse: 2.60455	valid_1's rmse: 2.69122
Train TX_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.5754	valid_1's rmse: 2.76713
Early stopping, best iteration is:
[64]	training's rmse: 2.63533	valid_1's rmse: 2.76167
Train WI_1


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's rmse: 3.37303	valid_1's rmse: 2.30753
Train WI_2


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	training's rmse: 2.96848	valid_1's rmse: 4.53691
Train WI_3


'rm' is not recognized as an internal or external command,
operable program or batch file.


[LightGBM] [Info] Load from binary file train_data.bin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5643
[LightGBM] [Info] Number of data points in the train set: 4751349, number of used features: 29
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	training's rmse: 2.70234	valid_1's rmse: 2.89617


'rm' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
########################### Predict
#################################################################################

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.03 min round |  0.03 min total |  35575.66 day sales |
Predict | Day: 2
##########  0.03 min round |  0.07 min total |  32757.20 day sales |
Predict | Day: 3
##########  0.03 min round |  0.10 min total |  32717.46 day sales |
Predict | Day: 4
##########  0.03 min round |  0.14 min total |  32657.08 day sales |
Predict | Day: 5
##########  0.03 min round |  0.17 min total |  37024.53 day sales |
Predict | Day: 6
##########  0.03 min round |  0.20 min total |  44988.65 day sales |
Predict | Day: 7
##########  0.03 min round |  0.24 min total |  45641.62 day sales |
Predict | Day: 8
##########  0.03 min round |  0.27 min total |  37945.55 day sales |
Predict | Day: 9
##########  0.03 min round |  0.30 min total |  33180.25 day sales |
Predict | Day: 10
##########  0.03 min round |  0.34 min total |  35920.18 day sales |
Predict | Day: 11
##########  0.03 min round |  0.37 min total |  34844.80 day sales |
Predict | Day: 12
##########  0.03 min round |  0.40

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.772412,0.703821,0.699928,0.706671,0.813339,0.913220,0.885118,0.910792,0.774975,...,0.879216,0.917042,0.879716,0.776660,0.716527,0.714263,0.722627,0.827460,0.927341,0.928871
1,HOBBIES_1_002_CA_1_evaluation,0.292529,0.217544,0.213651,0.220394,0.323918,0.393746,0.375387,0.332409,0.288699,...,0.363094,0.394037,0.366456,0.293247,0.226721,0.224457,0.236350,0.338039,0.407866,0.232575
2,HOBBIES_1_003_CA_1_evaluation,0.373054,0.298069,0.294176,0.300919,0.404443,0.474271,0.455912,0.439501,0.369224,...,0.443619,0.474562,0.446981,0.373772,0.307247,0.304982,0.316875,0.418564,0.488392,0.339668
3,HOBBIES_1_004_CA_1_evaluation,1.610278,1.488754,1.484860,1.519716,1.942678,2.809188,3.354914,1.672799,1.559908,...,1.796715,2.624341,3.340223,1.605236,1.492170,1.489906,1.526382,1.947509,2.814019,3.108231
4,HOBBIES_1_005_CA_1_evaluation,0.986632,0.886754,0.853662,0.860406,0.977321,1.273883,1.352672,1.008370,0.935312,...,0.844929,1.079436,1.265367,0.855638,0.789113,0.786848,0.795212,0.910292,1.206854,1.110915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.856481,0.744237,0.744237,0.744237,0.878757,1.148707,1.178396,0.879034,0.744237,...,0.953110,1.210992,1.225624,0.865196,0.769621,0.769621,0.769621,0.904141,1.195935,1.225624
30486,FOODS_3_824_WI_3_evaluation,0.759313,0.647069,0.647069,0.647069,0.781588,1.072429,1.102118,0.781866,0.647069,...,0.884865,1.140466,1.155098,0.773779,0.678205,0.678205,0.678205,0.812724,1.125409,1.155098
30487,FOODS_3_825_WI_3_evaluation,0.875091,0.762847,0.762847,0.762847,0.897367,1.197371,1.227060,0.897644,0.762847,...,1.018849,1.283614,1.298246,0.907764,0.812189,0.812189,0.812189,0.946708,1.268557,1.298246
30488,FOODS_3_826_WI_3_evaluation,1.308865,1.196622,1.196622,1.196622,1.356631,1.742954,1.718346,1.331418,1.196622,...,1.436736,1.810992,1.771326,1.323332,1.227758,1.227758,1.227758,1.387767,1.795935,1.771326


In [10]:
########################### Export
#################################################################################
# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v'+str(VER)+'.csv', index=False)