In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from graphviz import Source
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics


In [2]:
#import the dataset from GitHub
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)


In [3]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (421570, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [4]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [5]:
#create separate features for Week, Month, and Year
df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  category      
 1   Date          421570 non-null  datetime64[ns]
 2   Temperature   421570 non-null  float64       
 3   Fuel_Price    421570 non-null  float64       
 4   MarkDown1     421570 non-null  float64       
 5   MarkDown2     421570 non-null  float64       
 6   MarkDown3     421570 non-null  float64       
 7   MarkDown4     421570 non-null  float64       
 8   MarkDown5     421570 non-null  float64       
 9   CPI           421570 non-null  float64       
 10  Unemployment  421570 non-null  float64       
 11  IsHoliday     421570 non-null  bool          
 12  Dept          421570 non-null  category      
 13  Type          421570 non-null  category      
 14  Size          421570 non-null  int64         
 15  Weekly_Sales  421

In [7]:
df.drop('Date', axis=1, inplace=True)

In [8]:
#create dummy variables
df_dummies = pd.get_dummies(df)

In [9]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values.reshape(-1, 1)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(316177, 207)
(105393, 207)


In [11]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))
print(y_train.shape)
print(y_test.shape)

(316177,)
(105393,)


We will try three types of boosting algorithms: AdaBoost, GradientBoost, and XGBoost

1. AdaBoost

In [18]:
abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4), n_estimators=10, random_state=0, learning_rate=0.1)

In [19]:
abr.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4),
                  learning_rate=0.1, n_estimators=10, random_state=0)

In [20]:
y_pred = abr.predict(X_test)

print('R2 with 10 estimators: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 10 estimators: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 estimators: 0.3896691620124285
RMSE with 10 estimators: 17853.477226748226


In [None]:
n_estimator_values = [5, 10, 15, 20]

for i in n_estimator_values:
    abr = AdaBoostRegressor(n_estimators=i, learning_rate=0.01, random_state=0)
    abr.fit(X_train, y_train)
    y_pred = abr.predict(X_test)
    print('R2 with {} estimators: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with {} estimators: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

2. GradientBoost

In [145]:
max_depth_values = [3, 4, 5, 6]

for i in max_depth_values:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with max depth of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with max depth of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with max depth of 3: 0.3357317643814335
RMSE with max depth of 3: 18625.669812873115
R2 with max depth of 4: 0.3996314420296533
RMSE with max depth of 4: 17707.16877431389
R2 with max depth of 5: 0.4585046436679423
RMSE with max depth of 5: 16816.574098081528
R2 with max depth of 6: 0.5026368492808133
RMSE with max depth of 6: 16116.73109002628


In [147]:
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for i in eta_vals:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=6, learning_rate=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with learning rate of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with learning rate of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with learning rate of 0.001: 0.010303460157773503
RMSE with learning rate of 0.001: 22734.80234753104
R2 with learning rate of 0.01: 0.09462403804377062
RMSE with learning rate of 0.01: 21744.760905853786
R2 with learning rate of 0.1: 0.5026368492808133
RMSE with learning rate of 0.1: 16116.73109002628
R2 with learning rate of 1.0: 0.8467946551651134
RMSE with learning rate of 1.0: 8944.933480653564


3. XGBoost

Note that XGBoost can be implemented using scikit-learn's API as well XGBoost's learning API. We will use the learning API below, which uses different syntax and requires that our data be organized as a DMatrix.

In [12]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
print(type(DM_train))

<class 'xgboost.core.DMatrix'>


In [13]:
#set parameters
params = {'objective':'reg:squarederror'}

In [14]:
#train model using same parameters as above
xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)

In [15]:
#predict
y_pred = xgb_model.predict(DM_test)

print('R2 with 10 boost rounds: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 10 boost rounds: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 boost rounds: 0.7340838894839687
RMSE with 10 boost rounds: 11784.535848132255


In [36]:
#loop through several possible max_depth values

max_depth_values = [1, 2, 3, 4, 5, 6]

for i in max_depth_values:
    params['max_depth'] = i
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 10 boost rounds, max_depth of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 10 boost rounds, max_depth of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 boost rounds, max_depth of 1: 0.26683632220655873
RMSE with 10 boost rounds, max_depth of 1: 19567.738121296723
R2 with 10 boost rounds, max_depth of 2: 0.44483847988096226
RMSE with 10 boost rounds, max_depth of 2: 17027.458679634525
R2 with 10 boost rounds, max_depth of 3: 0.5476795525662517
RMSE with 10 boost rounds, max_depth of 3: 15369.624855814327
R2 with 10 boost rounds, max_depth of 4: 0.6203987433331946
RMSE with 10 boost rounds, max_depth of 4: 14080.042887870432
R2 with 10 boost rounds, max_depth of 5: 0.672092743939215
RMSE with 10 boost rounds, max_depth of 5: 13086.263789189634
R2 with 10 boost rounds, max_depth of 6: 0.7340838894839687
RMSE with 10 boost rounds, max_depth of 6: 11784.535848132255


In [141]:
#use 5-fold cross validation, max_depth of 6, 10 boosting rounds, and store the results as a dataframe

DM = xgb.DMatrix(data=X, label=y)
params = params = {'objective':'reg:squarederror', 'max_depth': 6}

cv_results = xgb.cv(params=params, dtrain=DM, nfold=5, num_boost_round=12, metrics='rmse', as_pandas=True)



In [142]:
display(cv_results[['test-rmse-mean', 'test-rmse-std']])

Unnamed: 0,test-rmse-mean,test-rmse-std
0,22464.298438,80.663643
1,19171.614453,91.393754
2,17037.104688,72.344611
3,15594.716211,86.886594
4,14618.105664,96.880185
5,13802.791406,79.111094
6,13152.750391,70.53313
7,12665.818555,72.605114
8,12214.922656,79.460325
9,11847.260742,106.886765


In [25]:
#get feature importances
feature_importances = xgb_model.get_score(importance_type='gain')
feature_importances

{'f129': 4327690203150.0,
 'f132': 4017447990000.0,
 'f92': 2585492756000.0,
 'f116': 2264361020833.3335,
 'f10': 461122758045.2778,
 'f94': 1694485097833.3333,
 'f127': 1947790763428.5715,
 'f198': 398482735250.61536,
 'f13': 135494408425.0,
 'f36': 161137404333.33334,
 'f7': 22470859888.44898,
 'f37': 145777230000.0,
 'f8': 30596358271.185184,
 'f20': 294939458458.3333,
 'f53': 37952574466.25,
 'f30': 87912097520.0,
 'f139': 128989143438.22223,
 'f27': 60197909320.0,
 'f25': 48022306833.333336,
 'f2': 176866848.0,
 'f19': 33679512920.0,
 'f52': 24930382995.714287,
 'f50': 34175377420.0,
 'f21': 131944753783.33333,
 'f24': 82212880316.66667,
 'f57': 1553796540033.3333,
 'f137': 73027124711.5,
 'f0': 17827292823.916668,
 'f14': 81127186475.0,
 'f1': 2286803019.8461537,
 'f46': 30302126066.666668,
 'f4': 19032424446.0,
 'f131': 858487919000.0,
 'f138': 653449429037.5,
 'f54': 20675547135.0,
 'f128': 1218644600400.0,
 'f11': 104946194085.71428,
 'f202': 249131198604.0,
 'f18': 3353948980

In [44]:
list_of_importances = []
list_of_indices = []

for key, val in feature_importances.items():
    list_of_importances.append(val)
    list_of_indices.append(int(key.strip('f')))

In [48]:
#get names of important columns
important_col_names = df_dummies.iloc[:, list_of_indices].columns

In [71]:
feature_importance_df = pd.DataFrame({'feature':important_col_names, 'importance':list_of_importances}).sort_values(by='importance', ascending=False)
feature_importance_df.head(25)

Unnamed: 0,feature,importance
0,Dept_91,4327690000000.0
1,Dept_94,4017448000000.0
2,Dept_37,2585493000000.0
3,Dept_71,2264361000000.0
6,Dept_87,1947791000000.0
5,Dept_39,1694485000000.0
25,Dept_1,1553797000000.0
35,Dept_90,1218645000000.0
45,Dept_7,860079300000.0
32,Dept_93,858487900000.0


Departments are the most important contributor to the XGBRegressor model, along with Size, Type_A, and a few stores and weeks.

Let's reduce our data set to only the 80 most important features and then run grid search cv to find the best parameters

In [130]:
columns_to_keep = feature_importance_df.iloc[:81, 0].to_list() #Weekly Sales is in the list to keep
df_dummies_reduced = df_dummies[columns_to_keep]
df_dummies_reduced.shape

(421570, 80)

In [131]:
X = df_dummies_reduced.drop('Weekly_Sales', axis=1).values
y = df_dummies_reduced['Weekly_Sales'].values

In [132]:
y = y.reshape(len(y))
print(y.shape)

(421570,)


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(316177, 79)
(105393, 79)


In [134]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

<class 'xgboost.core.DMatrix'>


In [135]:
params = params = {'objective':'reg:squarederror', 'max_depth': 6}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=12)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 10 boost rounds, learning rate of {}: {}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 10 boost rounds, learning rate of {}: {}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 boost rounds, learning rate of 0.001: -0.47067893969896124
RMSE with 10 boost rounds, learning rate of 0.001: 27713.992403301538
R2 with 10 boost rounds, learning rate of 0.01: -0.2827836773156196
RMSE with 10 boost rounds, learning rate of 0.01: 25883.134522487748
R2 with 10 boost rounds, learning rate of 0.1: 0.30908409833833494
RMSE with 10 boost rounds, learning rate of 0.1: 18995.58841780906
R2 with 10 boost rounds, learning rate of 1.0: 0.42009269587657394
RMSE with 10 boost rounds, learning rate of 1.0: 17402.81271193626


Interestingly, using only half the features greatly reduces model performance. Let's loop through the same possible learning rates with all columns, and compare the performance.

In [136]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(316177, 207)
(105393, 207)


In [138]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [143]:
params = params = {'objective':'reg:squarederror', 'max_depth': 6}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 12 boost rounds, learning rate of {}: {}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 12 boost rounds, learning rate of {}: {}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 12 boost rounds, learning rate of 0.001: -0.47471063669404256
RMSE with 12 boost rounds, learning rate of 0.001: 27751.95376417429
R2 with 12 boost rounds, learning rate of 0.01: -0.31091276588973393
RMSE with 12 boost rounds, learning rate of 0.01: 26165.38041591253
R2 with 12 boost rounds, learning rate of 0.1: 0.43873843462267026
RMSE with 12 boost rounds, learning rate of 0.1: 17120.750900733157
R2 with 12 boost rounds, learning rate of 1.0: 0.8507973718665154
RMSE with 12 boost rounds, learning rate of 1.0: 8827.310303715436
