# Hyper Parameter Tuning for predicting Total Discount

Having identified that GBR model works well in previous section based on K fold cross validation scores and Training mean squared error , in this notebook we try to tune the model for better performance

We use bayesSearchCV to find the best parameters for our GBR model

In [1]:
#### IMPORTS

import pandas as pd
import numpy as np
from types import FunctionType
from sklearn.model_selection import train_test_split
import math
import matplotlib.pyplot as plt
# Importing sklearn methods
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn import tree

import skopt
import joblib

In [2]:
# data2.xlsx is the data obtained after running the feature engineering code
# data3.csv : converted data2.xlsx to data3.csv because of easiness of use of csv files in webapps
path = r"C:\Users\NISARG\Desktop\mech\Finance\Maverick\CODE"   #change the path to your local path
df = pd.read_csv(path + "\data3.csv")


count = 0
index_test = list()
index_train = list()
for i in range(len(df['upper_limit'])):
    if(df['Discount_Total'][i]>df['upper_limit'][i]):
        count+=1
        index_test.append(i)
    else:
        index_train.append(i)
print(count/len(df['upper_limit']))

0.3232645073885446


In [4]:

'''
Here we get the train and test datasets based on the logic described above
'''

df_test = df.iloc[index_test]
df_test = df_test.reset_index()
df_train = df.iloc[index_train]
df_train = df_train.reset_index()     

In [9]:
'''
Encoding categorical variables here
'''


from sklearn.preprocessing import LabelEncoder
def encode(highGTOData):
    lb_make = LabelEncoder()
    highGTOData['sdfc_Tier'] = lb_make.fit_transform(highGTOData['sdfc_Tier'])
    for i in range(len(highGTOData['GTO_2019'])):
        if(highGTOData['poc_image'][i]==0):
            highGTOData['poc_image'][i] = "Mainstream"
    highGTOData['poc_image'] = lb_make.fit_transform(highGTOData['poc_image'])
    highGTOData['segment'] = lb_make.fit_transform(highGTOData['segment'])
    highGTOData['sub_segment'] = lb_make.fit_transform(highGTOData['sub_segment'])
    highGTOData['Product Set'] = lb_make.fit_transform(highGTOData['Product Set'])
    highGTOData['Brand'] = lb_make.fit_transform(highGTOData['Brand'])
    highGTOData['Sub-Brand'] = lb_make.fit_transform(highGTOData['Sub-Brand'])
    highGTOData['Pack_Type'] = lb_make.fit_transform(highGTOData['Pack_Type'])
    highGTOData['Returnalility'] = lb_make.fit_transform(highGTOData['Returnalility'])
    highGTOData['province'] = lb_make.fit_transform(highGTOData['province'])
    highGTOData['GTO_growth'] = highGTOData['Expected_GTO'] - highGTOData['GTO_2019']
    return highGTOData



In [10]:
df_test = encode(df_test)
df_train = encode(df_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [19]:



lowGTOData_train = df_train[df_train.GTO_2019<10000]
lowGTOData_test = df_test[df_test.GTO_2019<10000]

midGTOData_train = df_train[(df_train.GTO_2019>10000)&(df_train.GTO_2019<50000)]
midGTOData_test = df_test[(df_test.GTO_2019>10000)&(df_test.GTO_2019<50000)]

highGTOData_train = df_train[df_train.GTO_2019>50000]
highGTOData_test = df_test[df_test.GTO_2019>50000]

In [15]:
'''
Keeping the essential columns as seen from the Exploratory Data Analysis
'''



lowGTOData_train = lowGTOData_train.reset_index()
lowGTOData_test = lowGTOData_test.reset_index()
target_train = lowGTOData_train['Discount_Total']
target_test = lowGTOData_test['Discount_Total']
colsToKeep = ['Volume_2019' , 'Volume_2018'  , 'Expected_GTO'  , 'Expected_product_volume', 'profitability_indicator' , 'upper_limit'  ,'sdfc_Tier'  , 'loyalty_index' , 'Returnalility', 'market_cap' ]
features_train = lowGTOData_train[colsToKeep]
features_test = lowGTOData_test[colsToKeep]

In [51]:
from skopt import BayesSearchCV

# Parameters to focus on in Gradient Boosting Regressor

    1) loss :
        a. ls : least square regression
        b. lad : least absolute deviation
        c. huber : Combination of ls & lad
        d. quantile : Used in quantile regression
        
    2) learning_rate : Learning rate is used to shrink the contribution of each tree in GBR , we try values of learning rate from 0.05 to 0.3 with step of 0.05
    
    3) n_estimators : No of boostings performed . Larger the number , better is the fit , but it can lead to overfitting as well
    
    4) subsample : The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. subsample interacts with the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.
    
    5) criterion : function to measure quality of split
        a. friedman_mse : mse with improvement by friedman
        b. mse : mean squared error
        
    6) max_depth : Maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. The best value depends on the interaction of the input variables. We try max_depth from 2 to 8 with step of 1
    
    7) max_features : auto , sqrt , log2 
        a. auto : max_features = n_features
        b. sqrt : max_features = sqrt(n_features)
        c. log2 : max_features = log2(n_features)

In [52]:
params = dict()
params['loss'] = ['ls', 'lad', 'huber', 'quantile']
params['learning_rate'] = [0.05,0.1,0.15,0.2,0.25,0.3]
params['n_estimators'] = [50,100,150,200,250,300,350,400]
params['subsample'] = [0.6,0.8,1]
params['criterion'] = ['friedman_mse', 'mse']
params['max_depth'] = [2,3,4,5,6,7,8]
params['max_features'] = ['auto', 'sqrt', 'log2']



In [53]:
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
# define the search
search = BayesSearchCV(estimator=GradientBoostingRegressor(), search_spaces=params, n_jobs=-1, cv=kfold)

In [54]:
search.fit(features_train,target_train)
# report the best result
print(search.best_score_)
print(search.best_params_)

0.7860501247020013
OrderedDict([('criterion', 'friedman_mse'), ('learning_rate', 0.05), ('loss', 'huber'), ('max_depth', 4), ('max_features', 'sqrt'), ('n_estimators', 400), ('subsample', 0.8)])


# Best Parameters for lowGTO Data
    1) Criteria = friedman_mse

    2) learning_rate = 0.05

    3) loss = huber

    4) max_depth = 4

    5) max_features = sqrt

    6) n_estimators = 400

    7) subsample = 0.8

# Running ML Model with best parameters and saving it

In [16]:
import joblib


gbr = GradientBoostingRegressor(criterion = 'friedman_mse' , n_estimators=400, learning_rate=0.05, max_depth=4, max_features = 'sqrt', loss='huber' , subsample = 0.8)

gbr.fit(features_train,target_train)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=34234)
cross_val_scores = cross_val_score(gbr, features_train, target_train, cv=kfold )
mse_train = math.sqrt(mean_squared_error(target_train,gbr.predict(features_train)))
mse = math.sqrt(mean_squared_error(target_test, gbr.predict(features_test)))
print(cross_val_scores)
print(np.mean(cross_val_scores))
print(mse_train)
print(mse)


filename = 'lowGTOModel_TotalDiscount.sav'
joblib.dump(gbr, open(filename, 'wb'))




[0.72989008 0.68806548 0.88944078 0.66727193 0.78155321 0.89647984
 0.93695627 0.71685247 0.72706194 0.71846468]
0.7752036673578562
134.90748179073378
822.7794579889197


In [20]:
'''
Keeping the features as seen from EXPLORATORY DATA ANALYSIS
'''

midGTOData_train = midGTOData_train.reset_index()
midGTOData_test = midGTOData_test.reset_index()
target_train = midGTOData_train['Discount_Total']
target_test = midGTOData_test['Discount_Total']
colsToKeep = ['Volume_2019' , 'Volume_2018' ,'Volume_2019 Product' ,'Expected_GTO','Expected_product_volume' , 'profitability_indicator' , 'upper_limit'  ,'sdfc_Tier'  , 'loyalty_index' , 'Returnalility',  'inventory_lingering_factor', 'market_cap',
       'order_size']
features_train = midGTOData_train[colsToKeep]
features_test = midGTOData_test[colsToKeep]

In [27]:
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
# define the search
search = BayesSearchCV(estimator=GradientBoostingRegressor(), search_spaces=params, n_jobs=-1, cv=kfold)

In [28]:
search.fit(features_train,target_train)
# report the best result
print(search.best_score_)
print(search.best_params_)

0.49342351243431615
OrderedDict([('criterion', 'mse'), ('learning_rate', 0.1), ('loss', 'ls'), ('max_depth', 3), ('max_features', 'auto'), ('n_estimators', 100), ('subsample', 0.8)])


# Best Parameters for Mid GTO Data
    1) criterion : mse

    2) learning_rate : 0.1

    3) loss : ls

    4) max_depth = 3

    5) n_estimators = 100

    6) subsample = 0.8
    
    7) max_features = auto

# Train Model on best parameters and save it

In [21]:
gbr = GradientBoostingRegressor(criterion = 'mse' , n_estimators=100, learning_rate=0.1, max_depth=3, max_features = 'auto', loss='ls' , subsample = 0.8)

In [23]:
gbr.fit(features_train,target_train)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=34234)
cross_val_scores = cross_val_score(gbr, features_train, target_train, cv=kfold )
mse_train = math.sqrt(mean_squared_error(target_train,gbr.predict(features_train)))
mse = math.sqrt(mean_squared_error(target_test, gbr.predict(features_test)))
print(cross_val_scores)
print(np.mean(cross_val_scores))
print(mse_train)
print(mse)


filename = 'midGTOModel_TotalDiscount.sav'
joblib.dump(gbr, open(filename, 'wb'))

[0.63638215 0.26322742 0.53102371 0.28744227 0.40290535 0.65835448
 0.57073938 0.39541201 0.48698544 0.54880762]
0.47812798331009193
1458.267350065082
6292.354418738532


In [24]:
'''
Keeping the features as seen from EXPLORATORY DATA ANALYSIS
'''


highGTOData_train = highGTOData_train.reset_index()
highGTOData_test = highGTOData_test.reset_index()
target_train = highGTOData_train['Discount_Total']
target_test = highGTOData_test['Discount_Total']
colsToKeep = ['Volume_2019' , 'Volume_2018' ,'Volume_2019 Product' ,'Expected_GTO','Expected_product_volume' , 'profitability_indicator' , 'upper_limit'  ,  'inventory_lingering_factor',
       'order_size']
features_train = highGTOData_train[colsToKeep]
features_test = highGTOData_test[colsToKeep]

In [44]:
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=34234)
# define the search
search = BayesSearchCV(estimator=GradientBoostingRegressor(), search_spaces=params, n_jobs=-1, cv=kfold)

In [45]:
search.fit(features_train,target_train)
# report the best result
print(search.best_score_)
print(search.best_params_)

0.2789369206161165
OrderedDict([('criterion', 'mse'), ('learning_rate', 0.3), ('loss', 'huber'), ('max_depth', 2), ('max_features', 'auto'), ('n_estimators', 200), ('subsample', 0.8)])


# Best Parameters for High GTO Data
    1) criterion - mse

    2) learning_rate = 0.3

    3) loss : huber

    4) max_depth : 2

    5) max_features - auto

    6) n_estimators = 200

    7) subsample = 0.8


# Train model based on best parameters and save it

In [25]:
gbr = GradientBoostingRegressor(criterion = 'mse' , n_estimators=200, learning_rate=0.3, max_depth=2, max_features = 'auto', loss='huber' , subsample = 0.8)

In [29]:
gbr.fit(features_train,target_train)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=34234)
cross_val_scores = cross_val_score(gbr, features_train, target_train, cv=kfold )
mse_train = math.sqrt(mean_squared_error(target_train,gbr.predict(features_train)))
mse = math.sqrt(mean_squared_error(target_test, gbr.predict(features_test)))
print(cross_val_scores)
print(np.mean(cross_val_scores))
print(mse_train)
print(mse)


filename = 'HighGTOModel_TotalDiscount.sav'
joblib.dump(gbr, open(filename, 'wb'))

[-5.10615611  0.41143118  0.94439123  0.63191923  0.63289689 -0.69701788
  0.48614344 -0.34885166  0.8316522   0.88129612]
-0.1332295360623002
776.6290346486898
181182.47494710097
