# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm
import importlib

from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [None]:
########## SETTINGS

# silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')
# do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# dark plotting theme for every figure created below
plt.style.use('dark_background')
# IPython magic (not plain Python): render figures inside the notebook
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# DATA IMPORT

In [None]:
# load the prepared training and test tables (gzip-compressed CSV files)
df_train = pd.read_csv('../data/prepared/df_v11.csv', compression = 'gzip')
df_test = pd.read_csv('../data/prepared/df_test_v11.csv', compression = 'gzip')

# report both shapes (one per line), then preview the first training rows
print(df_train.shape, df_test.shape, sep = '\n')
df_train.head()

In [None]:
# split the training table into feature matrix and target vector,
# then release the combined frame
X = df_train.drop(columns = 'target')
y = df_train['target']
del df_train
print(X.shape, y.shape)

# the test table only loses its 'target' column; the original frame is released too
X_test = df_test.drop(columns = 'target')
del df_test
print(X_test.shape)

In [None]:
# load the item table (gzip-compressed CSV) and report its size
items = pd.read_csv('../data/prepared/items_v1.csv', compression = 'gzip')
print(items.shape)

# restrict it to the item IDs that occur in the training features,
# then report the reduced size
seen_in_train = items['itemID'].isin(X['itemID'].unique())
items = items[seen_in_train]
print(items.shape)

# PARAMETER TUNING

In [None]:
##### IMPORT EVALUATION FUNCTIONS

# asymmetric_mse, profit and postprocess_preds come from the project-local
# `functions` module; it is reloaded first so that edits to that module are
# picked up without restarting the kernel
import functions
importlib.reload(functions)
from functions import asymmetric_mse, profit, postprocess_preds

In [None]:
##### LIST RELEVANT FEATURES

# columns of X that are excluded from the model inputs
# (an additional exclusion of the columns matching '_all_' was left disabled)
drop_feats = ['itemID', 'day_of_year', 'category1', 'category2', 'category3']

# every remaining column is a feature, in the original column order
features = X.columns[~X.columns.isin(drop_feats)].tolist()
print(len(features), 'features')
features

In [None]:
########## MODELING PARAMETERS

### TRAINING OPTIONS

# when True the models are fit on sqrt(target) and predictions are squared back
target_transform = True

# when True only rows with order_sum_last_28 > 0 or promo_in_test > 0 are used for training
train_on_positive = False


### TUNING PARAMETERS

# number of hyperopt evaluations
tuning_trials = 1000


### CLASSIFIER PARAMETERS

# candidate boosting types (hp.choice reports the selected one as an index into this list)
boost_types = ['gbdt', 'goss']

# LightGBM parameters: hp.* entries are searched by hyperopt, plain values stay fixed
lgb_reg_params = dict(
    boosting_type    = hp.choice('boosting_type', boost_types),
    objective        = 'rmse',
    metrics          = 'rmse',
    n_estimators     = 10000,
    learning_rate    = hp.uniform('learning_rate', 0.0001, 0.3),
    max_depth        = hp.quniform('max_depth', 1, 16, 1),
    num_leaves       = hp.quniform('num_leaves', 10, 64, 1),
    bagging_fraction = hp.uniform('bagging_fraction', 0.3, 1),
    feature_fraction = hp.uniform('feature_fraction', 0.3, 1),
    lambda_l1        = hp.uniform('lambda_l1', 0, 1),
    lambda_l2        = hp.uniform('lambda_l2', 0, 1),
    silent           = True,
    verbosity        = -1,
    nthread          = 4,
    random_state     = 77,
)

# keyword arguments forwarded to LGBMRegressor.fit()
lgb_fit_params = dict(
    eval_metric           = 'rmse',
    early_stopping_rounds = 100,
    verbose               = False,
)

# search space handed to hyperopt: model parameters plus fit arguments
lgb_space = {
    'reg_params': lgb_reg_params,
    'fit_params': lgb_fit_params,
}

In [None]:
##### HYPEROPT OBJECT

class HPOpt(object):
    """Hyperopt driver that tunes a LightGBM regressor on one fixed train/validation split.

    The object stores the split; `process` runs the search and `lgb_reg` is the
    objective evaluated for every sampled parameter set.

    Relies on notebook-level names: `fmin`, `STATUS_OK`, `lgb`, `profit`,
    `postprocess_preds`, `target_transform` and `train_on_positive`.
    """

    # INIT
    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the training and validation features / targets.

        Both feature frames must contain a 'simulationPrice' column: it is used
        as sample weight during fitting and as price in the profit computation.
        """
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    # optimization process
    def process(self, fn_name, space, trials, algo, max_evals):
        """Run `fmin` over `space` using the method called `fn_name` as objective.

        Parameters
        ----------
        fn_name   : name of the objective method on this object (e.g. 'lgb_reg')
        space     : hyperopt search space passed to the objective
        trials    : hyperopt Trials object that records every evaluation
        algo      : hyperopt suggestion algorithm (e.g. tpe.suggest)
        max_evals : number of evaluations to run

        Returns
        -------
        (best, trials) : whatever `fmin` returns, together with the trials object.

        Raises
        ------
        Any exception raised by `fmin` or by the objective. Previously such
        errors were swallowed and a status dict was returned in place of the
        tuple, which made callers indexing the result with [0] fail later with
        an unrelated KeyError and hid the real cause.
        """
        fn = getattr(self, fn_name)
        result = fmin(fn        = fn,
                      space     = space,
                      algo      = algo,
                      max_evals = max_evals,
                      trials    = trials)
        return result, trials


    # LGBM INITIALIZATION
    def lgb_reg(self, para):
        """Objective: build an LGBMRegressor from the sampled parameters and evaluate it.

        `para` is a dict with the keys 'reg_params' (constructor arguments) and
        'fit_params' (arguments for fit). Returns the dict produced by `train_reg`.
        """
        # work on a copy so the caller's sampled dict is left untouched
        reg_params = dict(para['reg_params'])
        # hp.quniform yields floats, the regressor gets integers
        reg_params['max_depth']  = int(reg_params['max_depth'])
        reg_params['num_leaves'] = int(reg_params['num_leaves'])
        reg = lgb.LGBMRegressor(**reg_params)
        return self.train_reg(reg, para)


    # TRAINING AND INFERENCE
    def train_reg(self, reg, para):
        """Fit `reg`, predict the validation set and return the hyperopt result dict.

        The loss is the rounded negative of `profit(...)`, so minimising it
        maximises profit. Returns {'loss': loss, 'status': STATUS_OK}.
        """

        # prices double as sample weights (train) and as profit prices (validation)
        w_train = self.x_train['simulationPrice'].values
        w_valid = self.x_test['simulationPrice'].values

        # fit LGB, monitoring both the training and the validation set
        reg.fit(self.x_train, self.y_train,
                eval_set              = [(self.x_train, self.y_train), (self.x_test, self.y_test)],
                sample_weight         = w_train,
                eval_sample_weight    = [w_train, w_valid],
                **para['fit_params'])

        # inference; with target_transform both predictions and targets are squared
        # before scoring (the targets are expected to be sqrt-transformed in that case)
        raw_preds = reg.predict(self.x_test)
        if target_transform:
            preds = postprocess_preds(raw_preds**2)
            reals = self.y_test**2
        else:
            preds = postprocess_preds(raw_preds)
            reals = self.y_test

        # impute zeros for rows the positive-only training set never covers
        if train_on_positive:
            preds[(self.x_test['order_sum_last_28'] == 0) & (self.x_test['promo_in_test'] == 0)] = 0

        # compute loss [negative profit]
        loss = np.round(-profit(reals, preds, price = w_valid))

        return {'loss': loss, 'status': STATUS_OK}

In [None]:
##### DATA PARTITIONING

# validation dates
v_end   = 158          # 1 day before last validation fold in code_03_modeling
v_start = v_end        # same as v_end: the validation period is a single day

# training dates
t_start = 28           # first day in the data
t_end   = v_start - 15 # 15 days before the validation day (leaves a two-week gap)

# boolean row masks for both periods (bounds are inclusive)
train_mask = X['day_of_year'].between(t_start, t_end)
valid_mask = X['day_of_year'].between(v_start, v_end)

# index labels of the selected rows (used for reporting below)
train_idx = list(X.index[train_mask])
valid_idx = list(X.index[valid_mask])

# extract samples; selecting through the masks with .loc is label-based and
# therefore correct for any index, whereas feeding index labels to the
# positional .iloc only works for a default 0..n-1 index
X_train, y_train = X.loc[train_mask, features], y.loc[train_mask]
X_valid, y_valid = X.loc[valid_mask, features], y.loc[valid_mask]

# keep positive cases: recent sales or a promotion
if train_on_positive:
    positive_mask = (X_train['order_sum_last_28'] > 0) | (X_train['promo_in_test'] > 0)
    X_train, y_train = X_train.loc[positive_mask], y_train.loc[positive_mask]

# target transformation (predictions are squared back during evaluation)
if target_transform:
    y_train = np.sqrt(y_train)
    y_valid = np.sqrt(y_valid)

# information
print('-' * 65)
print('- train period days: {} -- {} (n = {})'.format(t_start, t_end, len(train_idx)))
print('- valid period days: {} -- {} (n = {})'.format(v_start, v_end, len(valid_idx)))
print('-' * 65)

In [None]:
##### PARAMETER TUNING

# wrap the train / validation split and create the container that logs every trial
hpo_obj = HPOpt(x_train = X_train, x_test = X_valid, y_train = y_train, y_test = y_valid)
trials  = Trials()

# run the search: objective 'lgb_reg', TPE suggestions, `tuning_trials` evaluations
lgb_opt_params = hpo_obj.process('lgb_reg', lgb_space, trials, tpe.suggest, tuning_trials)

In [None]:
# merge best params to fixed params

# a failed search may be reported as a status dict instead of the (best, trials)
# tuple; stop here with the recorded reason rather than with a KeyError on [0]
if isinstance(lgb_opt_params, dict):
    raise RuntimeError('Parameter tuning failed: {}'.format(lgb_opt_params.get('exception')))
best_params = lgb_opt_params[0]

# build a NEW dict and rebind the name: the original dict object is still
# referenced by lgb_space['reg_params'], so updating it in place would replace
# the hp.* search expressions with constants and break any re-run of the tuning
lgb_reg_params = {**lgb_reg_params, **best_params}

# postprocess: hp.choice reports an index into boost_types, hp.quniform reports floats
lgb_reg_params['boosting_type'] = boost_types[lgb_reg_params['boosting_type']]
lgb_reg_params['max_depth']     = int(lgb_reg_params['max_depth'])
lgb_reg_params['num_leaves']    = int(lgb_reg_params['num_leaves'])

# RESULTS

In [None]:
# print best params
# (the bare dict on the last line is rendered as the cell output)
print('Best meta-parameters:')
lgb_reg_params

In [None]:
##### LOSS DYNAMICS

# profit per trial (the recorded loss is the negative profit); kept under its own
# name so the target Series `y` created in the data-import section is not
# overwritten, which would break a re-run of the partitioning cell
trial_profits = [-res['loss'] for res in trials.results]

# plot profit against the trial number
fig, ax = plt.subplots(figsize = (15, 6))
ax.plot(range(1, len(trial_profits) + 1), trial_profits)
ax.set_ylabel('Profit')
ax.set_xlabel('Iteration')
fig.savefig('../lgb_meta_params_loss.pdf')

In [None]:
##### PARAMETER PLOTS

# profit per trial is identical for every panel, so it is computed once; it gets
# its own name so the target Series `y` from the data-import section survives
trial_profits = [-res['loss'] for res in trials.results]

# one scatter panel per tuned parameter, two panels per row; at least four rows
# (the original layout), more if additional parameters were tuned
meta_params = list(trials.vals.keys())
n_cols = 2
n_rows = max(4, (len(meta_params) + n_cols - 1) // n_cols)

# plot relationships
fig = plt.figure(figsize = (15, 15))
for i, param_name in enumerate(meta_params):

    # sampled values of this parameter against the achieved profit
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    ax.scatter(trials.vals[param_name], trial_profits)
    ax.set_xlabel(param_name)

    # label the y-axis on the left-hand column only
    if i % n_cols == 0:
        ax.set_ylabel('Profit')

# export PDF
fig.savefig('../lgb_meta_params_plots.pdf')

In [None]:
# export dictionary
# the context manager closes the file even when pickling raises
with open('../lgb_meta_params.pkl', 'wb') as par_file:
    pickle.dump(lgb_reg_params, par_file)