# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm
import importlib

from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
########## SETTINGS
# Notebook-wide configuration: every statement below only changes global
# interpreter / library state and produces no output of its own.

# silence every warning raised from this point on
warnings.filterwarnings('ignore')
# do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# draw all figures with matplotlib's 'dark_background' style sheet
plt.style.use('dark_background')
# render figures inside the notebook output
# NOTE(review): on some IPython versions activating the inline backend may
# reset a few rcParams set by the style sheet above -- confirm the ordering
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# DATA IMPORT

In [None]:
# load the prepared modelling table (gzip-compressed CSV) and report its size
df = pd.read_csv(
    '../data/prepared/df_v1.csv',
    compression = 'gzip',
)
print(df.shape)

# preview the first rows
df.head()

In [None]:
# split the table into predictors (X) and response (y)
X = df.drop(columns = 'target')
y = df['target']

# the combined frame is not needed any more; drop the name to free memory
del df
print(X.shape, y.shape)

In [None]:
# load the prepared item table (gzip-compressed CSV)
items = pd.read_csv(
    '../data/prepared/items_v1.csv',
    compression = 'gzip',
)
print(items.shape)

# keep only the items whose itemID also occurs in the modelling table X
items = items.loc[lambda frame: frame['itemID'].isin(X['itemID'].unique())]
print(items.shape)

# MODELING

In [None]:
##### IMPORT FUNCTIONS

# profit function: re-import the local module on every run of this cell so
# that edits to func_profit are picked up without restarting the kernel
import func_profit
func_profit = importlib.reload(func_profit)
profit = func_profit.profit

In [None]:
##### LIST RELEVANT FEATURES

# columns of X that must not be used as predictors
drop_feats = [
    'itemID',
    'day_of_year',
    'promotion_0',
    'promotion_1',
    'promotion_2',
    # further candidates for exclusion (currently kept as features):
    #'order_sum_last_7', 'order_count_last_7', 'order_sum_last_14', 'order_count_last_14',
    #'order_sum_last_21', 'order_count_last_21', 'order_sum_last_28', 'order_count_last_28'
]

# every remaining column of X, in its original order
features = list(filter(lambda col: col not in drop_feats, X.columns))
print(len(features), ':', features)

In [None]:
##### MODEL PARAMETERS
#
# Global knobs shared by the modeling cells plus the keyword arguments that
# are unpacked into lgb.LGBMRegressor.

# settings
cores = 4   # threads handed to LightGBM via 'nthread'
seed  = 23  # random seed handed to LightGBM via 'random_state'

# rounds and verbose
stop_rounds = 10  # passed as early_stopping_rounds when fitting
verbose     = 10  # passed as verbose when fitting

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'regression',
    'metrics':           'rmse',
    'learning_rate':     0.1,
    'n_estimators':      100,
    'bagging_fraction':  0.8,
    # LightGBM ignores bagging_fraction while bagging_freq is 0 (its default);
    # re-sample the rows at every iteration so the 0.8 above takes effect
    'bagging_freq':      1,
    'feature_fraction':  0.8,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    # NOTE(review): 'silent' is no longer a recognised argument in recent
    # LightGBM releases ('verbosity' covers it) -- confirm before upgrading
    'silent':            True,
    'verbosity':         -1,
    'nthread' :          cores,
    'random_state':      seed,
}

In [None]:
##### PARTITIONING
#
# Builds num_folds time-based train/validation splits on X['day_of_year'].
# Each fold validates on a single day (v_start == v_end); that day moves one
# day back per fold, and the training window ends test_days + 1 days before it.
# train_idx[fold] / valid_idx[fold] hold *row positions* (suitable for .iloc).

# parameters
num_folds = 7  # no. CV folds
test_days = 14 # no. days in the test set

# placeholders: one array of row positions per fold
train_idx = []
valid_idx = []

# computations (hoisted out of the loop: they do not depend on the fold)
day_values = X['day_of_year'].to_numpy()
day_min    = day_values.min()
day_max    = day_values.max()
train_days = day_max - day_min - num_folds - 1  # nominal no. days in the train set
# NOTE(review): with these values t_start = day_min - 2 * test_days + num_folds - fold,
# i.e. before the first observed day whenever num_folds < 2 * test_days, so every
# fold effectively trains on all rows up to t_end -- confirm this is intended

# partitioning loop
for fold in range(num_folds):

    # validation dates: start test_days + 1 days before the last day and
    # step back one day per fold
    v_end   = day_max - (test_days + 1) - fold
    v_start = v_end

    # training dates
    t_end   = v_start - (test_days + 1)
    t_start = t_end   - (train_days - 1)

    # extract row positions; positions (not index labels) are stored because
    # the modeling loop selects the rows with .iloc
    train_idx.append(np.flatnonzero((day_values >= t_start) & (day_values <= t_end)))
    valid_idx.append(np.flatnonzero((day_values >= v_start) & (day_values <= v_end)))

    # print information
    print('-' * 40)
    print('FOLD {}/{}'.format(fold + 1, num_folds))
    print('-' * 40)
    print('- train period: {} -- {} (n = {})'.format(t_start, t_end, len(train_idx[fold])))
    print('- valid period: {} -- {} (n = {})'.format(v_start, v_end, len(valid_idx[fold])))
    print('-' * 40)
    print('')

# train_idx / valid_idx deliberately stay plain lists of 1-D arrays: the folds
# can differ in length, and stacking ragged sequences with np.asarray raises
# a ValueError on recent NumPy versions

In [None]:
##### CROSS-VALIDATION LOOP
#
# For every fold: fit the models on the training rows, predict the validation
# rows, and record RMSE, profit, the oracle profit (profit of predicting the
# true target) and the LightGBM feature importances.
# The row selection relies on train_idx / valid_idx holding row positions,
# and preds_oof assumes every validation fold has exactly items.shape[0] rows.

# placeholders
fold_importances = []  # one importance frame per fold, concatenated after the loop
preds_oof     = np.zeros((num_folds, items.shape[0]))
preds_test    = np.zeros(items.shape[0])
oof_rmse      = []
oof_profit    = []
oracle_profit = []
clfs          = []

# objects
num_cv_folds = 5  # not referenced in this cell; kept in case other cells read it
time_start   = time.time()

# modeling loop
for fold in range(num_folds):

    # extract samples and fill missings; fillna returns new frames, so no
    # in-place modification of an intermediate slice of X is needed
    X_train = X.iloc[train_idx[fold]][features].fillna(0)
    y_train = y.iloc[train_idx[fold]]
    X_valid = X.iloc[valid_idx[fold]][features].fillna(0)
    y_valid = y.iloc[valid_idx[fold]]

    # no separate test set is available here: the validation rows are reused
    X_test = X_valid

    # prices of the validation rows, needed by both profit computations
    valid_price = X.iloc[valid_idx[fold]]['simulationPrice'].values

    # training
    # NOTE(review): the linear model is fitted and stored in clfs but never used
    # for prediction or evaluation below -- confirm whether it is still needed
    clfs.append(LinearRegression().fit(X_train, y_train))

    # NOTE(review): the early_stopping_rounds / verbose fit arguments were removed
    # in LightGBM 4.0 (replaced by callbacks) -- adapt when upgrading
    clf = lgb.LGBMRegressor(**lgb_params)
    clf = clf.fit(X_train, y_train,
                  eval_set              = [(X_train, y_train), (X_valid, y_valid)],
                  eval_metric           = 'rmse',
                  early_stopping_rounds = stop_rounds,
                  verbose               = verbose)
    clfs.append(clf)

    # inference
    preds_oof[fold, :] = clf.predict(X_valid)
    preds_test += clf.predict(X_test) / num_folds

    # evaluation
    oof_rmse.append(np.sqrt(mean_squared_error(y_valid, preds_oof[fold, :])))
    oof_profit.append(profit(y_valid, preds_oof[fold, :], price = valid_price))
    oracle_profit.append(profit(y_valid, y_valid,         price = valid_price))

    # feature importance of the LightGBM model for this fold
    fold_importances.append(pd.DataFrame({
        'Feature':    features,
        'Importance': clf.feature_importances_,
        'Fold':       fold + 1,
    }))

    # information
    print('-' * 55)
    print('FOLD {:d}/{:d}: RMSE = {:.2f}, PROFIT = {:.0f}'.format(fold + 1,
                                                                  num_folds,
                                                                  oof_rmse[fold],
                                                                  oof_profit[fold]))
    print('-' * 55)
    print('')


# stack the per-fold importances once instead of growing a frame inside the loop
importances = pd.concat(fold_importances, axis = 0) if fold_importances else pd.DataFrame()

# print performance
# NOTE(review): the line labelled 'Total profit' prints the *mean* profit over
# the folds; the percentage relative to the oracle is the same either way
print('')
print('Average RMSE: {:.2f}'.format(np.mean(oof_rmse)))
print('Total profit: {:.0f} ({:.2f}%)'.format(np.mean(oof_profit), 100 * np.mean(oof_profit) / np.mean(oracle_profit)))
print('Done in:      {:.2f} minutes'.format((time.time() - time_start) / 60))

In [None]:
##### EVALUATION
#
# Two-panel figure: predicted vs. actual demand for the last CV fold (left)
# and the LightGBM feature importances across folds (right); saved as PDF.

fig, (ax_scatter, ax_importance) = plt.subplots(1, 2, figsize = (14, 7))

# select the last fold explicitly instead of relying on the loop variables
# (fold, y_valid) that the cross-validation cell happens to leave behind
last_fold    = num_folds - 1
y_valid_last = y.iloc[valid_idx[last_fold]]

# residual plot
# NOTE(review): the panel shows predicted vs. actual values rather than
# residuals; the title is kept unchanged
ax_scatter.scatter(y_valid_last, preds_oof[last_fold, :])
ax_scatter.set_title('Residual Plot')
ax_scatter.set_ylabel('Predicted demand')
ax_scatter.set_xlabel('Actual demand')

# feature importance: keep the top_feats features by mean importance over folds
top_feats = 100
cols = importances[['Feature', 'Importance']].groupby('Feature').mean().sort_values(by = 'Importance', ascending = False)[0:top_feats].index
importance = importances.loc[importances.Feature.isin(cols)]
sns.barplot(x = 'Importance', y = 'Feature', data = importance.sort_values(by = 'Importance', ascending = False), ci = 0, ax = ax_importance)
ax_importance.set_title('Feature Importance')
fig.tight_layout()

# export
fig.savefig('../model_performance.pdf')