# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm
import importlib

from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
########## HELPER FUNCTIONS

!pip install --upgrade dptools
from dptools import *

In [None]:
########## SETTINGS
# Notebook-wide runtime configuration: warning filter, pandas display,
# matplotlib style/backend and garbage collection.

# silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# dark theme for all matplotlib figures created afterwards
plt.style.use('dark_background')
# render figures inside the notebook output
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# DATA IMPORT

In [None]:
# load the prepared train / test tables (gzip-compressed CSV files)
df_train, df_test = (
    pd.read_csv(csv_path, compression = 'gzip')
    for csv_path in ('../data/prepared/df_v3.csv', '../data/prepared/df_test_v3.csv')
)

# report the dimensions of both tables, one per line
print(df_train.shape, df_test.shape, sep = '\n')

# preview the first training rows
df_train.head()

In [None]:
# split the training table into the feature matrix and the label vector
X = df_train.drop(columns = ['target'])
y = df_train['target']
del df_train  # drop the reference to the combined table
print(X.shape, y.shape)

# the test table carries a 'target' column as well; remove it
X_test = df_test.drop(columns = ['target'])
del df_test
print(X_test.shape)

In [None]:
# load the item table (gzip-compressed CSV)
items = pd.read_csv(
    '../data/prepared/items_v1.csv',
    compression = 'gzip',
)
print(items.shape)

# restrict the table to items whose itemID occurs in the training features
items = items.loc[items['itemID'].isin(X['itemID'].unique())]
print(items.shape)

# MODELING

In [None]:
##### IMPORT EVALUATION FUNCTIONS

# profit function
import functions
importlib.reload(functions)
from functions import profit, asymmetric_mse, postprocess_preds

In [None]:
##### LIST RELEVANT FEATURES

# columns that are kept in X for bookkeeping but never fed to the model
drop_feats = ['itemID', 'day_of_year']

# every remaining column of X, in its original order, is a model feature
features = [col for col in X.columns if col not in drop_feats]

print(len(features), ':', features)

In [None]:
##### MODELING PARAMETERS

# partitioning
num_folds, test_days = 7, 14  # no. CV folds, no. days in the test set

# settings
cores, seed = 4, 23  # LightGBM threads, random seed

# rounds and verbose
stop_rounds, verbose = 10, 20  # early-stopping patience, log frequency passed to fit()

# LGB parameters
# NOTE(review): per the LightGBM parameter docs, bagging is only active when
# bagging_freq is non-zero; none is set here, so bagging_fraction presumably
# has no effect -- confirm before relying on it.
lgb_params = dict(
    boosting_type    = 'gbdt',
    objective        = 'regression',
    metrics          = 'rmse',
    learning_rate    = 0.1,
    n_estimators     = 1000,
    bagging_fraction = 0.8,
    feature_fraction = 0.8,
    lambda_l1        = 0.1,
    lambda_l2        = 0.1,
    silent           = True,
    verbosity        = -1,
    nthread          = cores,
    random_state     = seed,
)

In [None]:
##### CROSS-VALIDATION LOOP
#
# Time-based validation. Every fold validates on a single day (v_start == v_end)
# that moves one day back per fold; the training window spans `train_days` days
# and ends `test_days + 1` days before the validation day. One LightGBM model is
# fitted per fold and the test prediction is the average over all fold models.

# placeholders
importances      = pd.DataFrame()
fold_importances = []  # per-fold importance tables, concatenated once after the loop
preds_oof        = np.zeros((num_folds, items.shape[0]))
reals_oof        = np.zeros((num_folds, items.shape[0]))
preds_test       = np.zeros(items.shape[0])
oof_rmse         = []
oof_profit       = []
oracle_profit    = []
clfs             = []
train_idx        = []
valid_idx        = []

# objects
first_day  = X['day_of_year'].min()
last_day   = X['day_of_year'].max()
train_days = last_day - 2*test_days - num_folds - first_day  # no. days in the train set
time_start = time.time()

# keep only the model features in the test set (loop-invariant, so done once)
X_test = X_test[features]

# modeling loop
for fold in range(num_folds):

    ##### PARTITIONING

    # validation dates: a single day, shifted one day back with every fold
    v_end   = last_day - (test_days + 1) - fold
    v_start = v_end

    # training dates
    t_end   = v_start - (test_days + 1)
    t_start = t_end   - (train_days - 1)

    # extract row POSITIONS (not index labels), because the samples are
    # selected with .iloc below; this stays correct for any index on X
    train_idx.append(np.flatnonzero(X['day_of_year'].between(t_start, t_end).values).tolist())
    valid_idx.append(np.flatnonzero(X['day_of_year'].between(v_start, v_end).values).tolist())

    # preds_oof / reals_oof hold exactly one value per item and fold; fail
    # early instead of letting a mismatching row count broadcast or crash later
    if len(valid_idx[fold]) != items.shape[0]:
        raise ValueError('fold {}: validation day {} has {} rows, expected {} (one per item)'.format(
            fold + 1, v_start, len(valid_idx[fold]), items.shape[0]))

    # extract samples
    X_train, y_train = X.iloc[train_idx[fold]][features], y.iloc[train_idx[fold]]
    X_valid, y_valid = X.iloc[valid_idx[fold]][features], y.iloc[valid_idx[fold]]
    price_valid      = X.iloc[valid_idx[fold]]['simulationPrice'].values

    # information
    print('-' * 65)
    print('- train period days: {} -- {} (n = {})'.format(t_start, t_end, len(train_idx[fold])))
    print('- valid period days: {} -- {} (n = {})'.format(v_start, v_end, len(valid_idx[fold])))
    print('-' * 65)


    ##### MODELING

    # training
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments are
    # the older LightGBM sklearn API; newer releases expect callbacks instead --
    # confirm against the installed version before upgrading.
    clf = lgb.LGBMRegressor(**lgb_params)
    clf = clf.fit(X_train, y_train,
                  eval_set              = [(X_train, y_train), (X_valid, y_valid)],
                  eval_metric           = 'rmse',
                  early_stopping_rounds = stop_rounds,
                  verbose               = verbose)
    clfs.append(clf)

    # inference
    preds_oof[fold, :] = postprocess_preds(clf.predict(X_valid))
    reals_oof[fold, :] = y_valid
    preds_test += postprocess_preds(clf.predict(X_test)) / num_folds

    # evaluation (oracle profit = profit obtained when predicting the true demand)
    oof_rmse.append(np.sqrt(mean_squared_error(y_valid, preds_oof[fold, :])))
    oof_profit.append(profit(y_valid, preds_oof[fold, :], price = price_valid))
    oracle_profit.append(profit(y_valid, y_valid,         price = price_valid))

    # feature importance
    fold_importance_df = pd.DataFrame({'Feature':    features,
                                       'Importance': clf.feature_importances_,
                                       'Fold':       fold + 1})
    fold_importances.append(fold_importance_df)

    # information
    print('-' * 65)
    print('FOLD {:d}/{:d}: RMSE = {:.2f}, PROFIT = {:.0f}'.format(fold + 1,
                                                                  num_folds,
                                                                  oof_rmse[fold],
                                                                  oof_profit[fold]))
    print('-' * 65)
    print('')


# stack the per-fold importance tables once (stays an empty frame if no fold ran)
if fold_importances:
    importances = pd.concat(fold_importances, axis = 0)

# print performance
print('')
print('-' * 65)
print('- AVERAGE RMSE: {:.2f}'.format(np.mean(oof_rmse)))
print('- TOTAL PROFIT: {:.0f} ({:.2f}%)'.format(np.mean(oof_profit), 100 * np.mean(oof_profit) / np.mean(oracle_profit)))
print('- RUNNING TIME: {:.2f} minutes'.format((time.time() - time_start) / 60))
print('-' * 65)

In [None]:
##### EVALUATION
#
# Two-panel diagnostic figure built from the cross-validation results:
# left  - predicted vs. actual demand for all folds, with the identity line;
# right - mean feature importance across folds.
# The figure is written to '../model_performance.pdf'.

fig = plt.figure(figsize = (12, 5))

# residual plot
plt.subplot(1, 2, 1)
plt.scatter(reals_oof.reshape(-1), preds_oof.reshape(-1))
axis_lim = np.max([reals_oof.max(), preds_oof.max()])  # common upper limit for both axes
plt.ylim(top   = 1.02*axis_lim)
plt.xlim(right = 1.02*axis_lim)
plt.plot((0, axis_lim), (0, axis_lim), 'r--')  # identity line = perfect prediction
plt.title('Residual Plot')
plt.ylabel('Predicted demand')
plt.xlabel('Actual demand')

# feature importance
plt.subplot(1, 2, 2)
top_feats = 100
# features ranked by their mean importance over the folds (highest first)
cols = (importances[['Feature', 'Importance']]
        .groupby('Feature')
        .mean()
        .sort_values(by = 'Importance', ascending = False)
        .head(top_feats)
        .index)
importance = importances.loc[importances.Feature.isin(cols)]
# pass the ranking explicitly: the bars show the per-feature mean, so they must
# be ordered by that mean rather than by the order in which rows happen to appear
sns.barplot(x = 'Importance', y = 'Feature', data = importance, order = list(cols), ci = 0)
plt.title('Feature Importance')
plt.tight_layout()

# export
plt.savefig('../model_performance.pdf')

# SUBMISSION

In [None]:
##### LOGS

# model (RMSE, profit): description
#
# lgb_v1_df_v1 (102.49, -65.72%): lgb with 17 features
# lgb_v2_df_v2 (78.08, 20.69%):   lgb with 18 features
# lgb_v3_df_v3 (73.24, 20.22%):   lgb with 18 features
# lgb_v4_df_v3 (73.37, 21.09%):   lgb with 18 features, train_days = 101

In [None]:
##### SUBMISSION

# model name
# NOTE(review): the log cell uses the pattern lgb_v<model>_df_v<data> and this
# notebook loads df_v3 -- confirm that 'lgb_v1_df_v4' is not a transposed tag.
name = 'lgb_v1_df_v4'

# file stem = model name + mean out-of-fold profit rounded to an integer
sub_name = ''.join([name, '_profit_', str(int(np.round(np.mean(oof_profit))))])

# save OOF preds: predictions and actuals stacked along a new leading axis
oof = np.stack([preds_oof, reals_oof])
np.save('../oof_preds/' + sub_name + '.npy', oof)
print(oof.shape)

# save submission: fill the sample file with the post-processed test predictions
sub = pd.read_csv('../submissions/sample_submission.csv', sep = '|').assign(
    demandPrediction = postprocess_preds(preds_test)
)
sub.to_csv('../submissions/sub_' + sub_name + '.csv', sep = '|', index = False)
print(sub.shape)
sub.head()