In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from graphviz import Source
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics


In [49]:
#import the dataset from GitHub
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)

In [50]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (418660, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [51]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [52]:
#create separate features for Week, Month, and Year
df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418660 entries, 0 to 418659
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         418660 non-null  category      
 1   Date          418660 non-null  datetime64[ns]
 2   Temperature   418660 non-null  float64       
 3   Fuel_Price    418660 non-null  float64       
 4   MarkDown1     418660 non-null  float64       
 5   MarkDown2     418660 non-null  float64       
 6   MarkDown3     418660 non-null  float64       
 7   MarkDown4     418660 non-null  float64       
 8   MarkDown5     418660 non-null  float64       
 9   CPI           418660 non-null  float64       
 10  Unemployment  418660 non-null  float64       
 11  IsHoliday     418660 non-null  bool          
 12  Dept          418660 non-null  category      
 13  Type          418660 non-null  category      
 14  Size          418660 non-null  int64         
 15  Weekly_Sales  418

In [54]:
df.drop('Date', axis=1, inplace=True)

In [55]:
#create dummy variables
df_dummies = pd.get_dummies(df)

In [56]:
#split into train test based on year
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 207)
(125514, 207)


In [57]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [58]:
df_dummies.columns

Index(['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday',
       ...
       'Week_46', 'Week_47', 'Week_48', 'Week_49', 'Week_50', 'Week_51',
       'Week_52', 'Year_2010', 'Year_2011', 'Year_2012'],
      dtype='object', length=208)

We will try three types of boosting algorithms: AdaBoost, GradientBoost, and XGBoost

1. AdaBoost

In [59]:
abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4), n_estimators=10, random_state=0, learning_rate=0.1)

In [60]:
abr.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4),
                  learning_rate=0.1, n_estimators=10, random_state=0)

In [61]:
y_pred = abr.predict(X_test)

print('R2 with 10 estimators: {:.2f}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 10 estimators: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 estimators: 0.42
RMSE with 10 estimators: 16893.17


In [62]:
n_estimator_values = [5, 10, 15, 20]

for i in n_estimator_values:
    abr = AdaBoostRegressor(n_estimators=i, learning_rate=0.01, random_state=0)
    abr.fit(X_train, y_train)
    y_pred = abr.predict(X_test)
    print('R2 with {} estimators: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with {} estimators: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 5 estimators: 0.38
RMSE with 5 estimators: 17372.51
R2 with 10 estimators: 0.38
RMSE with 10 estimators: 17368.38
R2 with 15 estimators: 0.38
RMSE with 15 estimators: 17372.88
R2 with 20 estimators: 0.38
RMSE with 20 estimators: 17373.64


2. GradientBoost

In [63]:
max_depth_values = [3, 4, 5, 6]

for i in max_depth_values:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with max depth of {}: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with max depth of {}: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with max depth of 3: 0.36
RMSE with max depth of 3: 17727.94
R2 with max depth of 4: 0.41
RMSE with max depth of 4: 16916.22
R2 with max depth of 5: 0.47
RMSE with max depth of 5: 16124.25
R2 with max depth of 6: 0.52
RMSE with max depth of 6: 15355.86


In [64]:
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for i in eta_vals:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=6, learning_rate=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with learning rate of {}: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with learning rate of {}: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with learning rate of 0.001: 0.01
RMSE with learning rate of 0.001: 22004.82
R2 with learning rate of 0.01: 0.10
RMSE with learning rate of 0.01: 21030.68
R2 with learning rate of 0.1: 0.52
RMSE with learning rate of 0.1: 15355.86
R2 with learning rate of 1.0: 0.86
RMSE with learning rate of 1.0: 8383.66


3. XGBoost

Note that XGBoost can be implemented using scikit-learn's API as well XGBoost's learning API. We will use the learning API below, which uses different syntax and requires that our data be organized as a DMatrix.

In [65]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
print(type(DM_train))

<class 'xgboost.core.DMatrix'>


In [66]:
#set parameters
params = {'objective':'reg:squarederror'}

In [67]:
#train model using same parameters as above
xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)

In [68]:
#predict
y_pred = xgb_model.predict(DM_test)

print('R2 with 100 boost rounds: {:.2f}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 100 boost rounds: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds: 0.92
RMSE with 100 boost rounds: 6293.47


In [69]:
#loop through several possible max_depth values

max_depth_values = [1, 2, 3, 4, 5, 6]

for i in max_depth_values:
    params['max_depth'] = i
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, max_depth of {}: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, max_depth of {}: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, max_depth of 1: 0.29
RMSE with 100 boost rounds, max_depth of 1: 18629.34
R2 with 100 boost rounds, max_depth of 2: 0.47
RMSE with 100 boost rounds, max_depth of 2: 16028.03
R2 with 100 boost rounds, max_depth of 3: 0.59
RMSE with 100 boost rounds, max_depth of 3: 14218.24
R2 with 100 boost rounds, max_depth of 4: 0.64
RMSE with 100 boost rounds, max_depth of 4: 13248.16
R2 with 100 boost rounds, max_depth of 5: 0.69
RMSE with 100 boost rounds, max_depth of 5: 12267.21
R2 with 100 boost rounds, max_depth of 6: 0.74
RMSE with 100 boost rounds, max_depth of 6: 11372.58


In [70]:
#get feature importances
feature_importances = xgb_model.get_score(importance_type='gain')
feature_importances

{'f129': 3430640239000.0,
 'f132': 3356609328625.0,
 'f92': 2675688902142.857,
 'f116': 2193922297714.2856,
 'f10': 415425447570.93915,
 'f94': 1589142651166.6667,
 'f127': 1862358840833.3333,
 'f198': 589180198788.0,
 'f13': 204251834800.0,
 'f36': 151971583166.66666,
 'f42': 94451269650.0,
 'f37': 71976222733.33333,
 'f8': 35260063502.13793,
 'f20': 208044001675.0,
 'f53': 39132522662.72727,
 'f30': 87454468457.14285,
 'f139': 133515785080.0,
 'f7': 20810766821.636364,
 'f27': 39484990800.0,
 'f25': 40368807950.0,
 'f46': 25700254520.0,
 'f52': 30072442289.142857,
 'f24': 109737522928.57143,
 'f21': 109249875280.0,
 'f57': 1741582723400.0,
 'f137': 295845395114.2857,
 'f138': 319811702855.38464,
 'f43': 43545395200.0,
 'f0': 7287576034.181818,
 'f35': 10160640520.0,
 'f18': 34830920366.666664,
 'f41': 35578511400.0,
 'f131': 777690848400.0,
 'f203': 324764768.0,
 'f17': 17437343700.0,
 'f47': 8587673830.0,
 'f183': 24563200.0,
 'f6': 4190448458.6666665,
 'f1': 2343601004.2,
 'f176': 

In [71]:
list_of_importances = []
list_of_indices = []

for key, val in feature_importances.items():
    list_of_importances.append(val)
    list_of_indices.append(int(key.strip('f')))

In [72]:
#get names of important columns
important_col_names = df_dummies.iloc[:, list_of_indices].columns

In [73]:
feature_importance_df = pd.DataFrame({'feature':important_col_names, 'importance':list_of_importances}).sort_values(by='importance', ascending=False)
feature_importance_df.head(25)

Unnamed: 0,feature,importance
0,Dept_91,3430640000000.0
1,Dept_94,3356609000000.0
2,Dept_37,2675689000000.0
3,Dept_71,2193922000000.0
6,Dept_87,1862359000000.0
24,Dept_1,1741583000000.0
5,Dept_39,1589143000000.0
46,Dept_90,1195452000000.0
42,Dept_12,896342800000.0
56,Dept_6,797033800000.0


Departments are the most important contributor to the XGBRegressor model, along with Size, Type_A, and a few stores and weeks.

Let's reduce our data set to only the 80 most important features and then run grid search cv to find the best parameters

In [74]:
columns_to_keep = feature_importance_df.iloc[:81, 0].to_list() #Weekly Sales is in the list to keep
df_dummies_reduced = df_dummies[columns_to_keep]
df_dummies_reduced.shape

(418660, 75)

In [75]:
X_train = df_dummies_reduced.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_reduced.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_reduced.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_reduced.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)


print(X_train.shape)
print(X_test.shape)

(293146, 74)
(125514, 74)


In [76]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [77]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [78]:
params = params = {'objective':'reg:squarederror', 'max_depth': 5}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of {}, half the features: {:.2f}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of {}, half the features: {:.2f}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.001, half the features: -0.35
RMSE with 100 boost rounds, learning rate of 0.001, half the features: 25731.15
R2 with 100 boost rounds, learning rate of 0.01, half the features: 0.22
RMSE with 100 boost rounds, learning rate of 0.01, half the features: 19502.69
R2 with 100 boost rounds, learning rate of 0.1, half the features: 0.41
RMSE with 100 boost rounds, learning rate of 0.1, half the features: 17039.60
R2 with 100 boost rounds, learning rate of 1.0, half the features: 0.40
RMSE with 100 boost rounds, learning rate of 1.0, half the features: 17116.60


Interestingly, using only half the features greatly reduces model performance. Let's loop through the same possible learning rates with all columns, and compare the performance.

In [79]:
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 207)
(125514, 207)


In [80]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [81]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [82]:
params = {'objective':'reg:squarederror', 'max_depth': 5}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of {}: {:.2f}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of {}: {:.2f}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.001: -0.33
RMSE with 100 boost rounds, learning rate of 0.001: 25504.59
R2 with 100 boost rounds, learning rate of 0.01: 0.40
RMSE with 100 boost rounds, learning rate of 0.01: 17179.55
R2 with 100 boost rounds, learning rate of 0.1: 0.83
RMSE with 100 boost rounds, learning rate of 0.1: 9192.74
R2 with 100 boost rounds, learning rate of 1.0: 0.88
RMSE with 100 boost rounds, learning rate of 1.0: 7541.65


Finally, let's experiment with changing the subsample of rows and columns for each tree in the ensemble. 

In [83]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1}
    
#loop through several possible values of subsamples (rows)
subsample_vals= [0.5, 0.6, 0.7, 0.8, 0.9]

for val in subsample_vals:
    params['subsample'] = val
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of 0.1, subsample of {}: {:.2f}'.format(val, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of 0.1, subsample of {}: {:.2f}'.format(val, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.5: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.5: 9121.35
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.6: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.6: 9067.61
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.7: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.7: 9095.72
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.8: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.8: 9090.98
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.9: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.9: 9070.51


In [84]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1, 'subsample':0.8}
    
#loop through several possible values of subsamples (rows)
colsample_vals= [0.7, 0.8, 0.9]

for val in colsample_vals:
    params['colsample_bytree'] = val
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of {}: {:.2f}'.format(val, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of {}: {:.2f}'.format(val, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.7: 0.82
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.7: 9266.87
R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.8: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.8: 9175.80
R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.9: 0.83
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.9: 9101.74


Using the best parameters, we can build the final model:

In [85]:
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 207)
(125514, 207)


In [86]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [87]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [88]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1, 'subsample':0.8, 'colsample_bytree':0.8}

xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)

In [89]:
#predict
y_pred = xgb_model.predict(DM_test)

print('R2 with 100 boost rounds: {:.2f}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 100 boost rounds: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds: 0.83
RMSE with 100 boost rounds: 9175.80


We can inspect the first 10 predictions and true values, to get a sense of the performance.

In [90]:
pd.DataFrame({'Predicted values':y_pred[:10], 'True values':y_test[:10]})

Unnamed: 0,Predicted values,True values
0,17458.4375,16567.69
1,40650.484375,44481.38
2,10802.493164,13926.04
3,24189.796875,40925.76
4,20084.144531,23105.81
5,10026.289062,3665.25
6,21727.472656,12377.76
7,29098.935547,36797.0
8,17012.357422,20705.21
9,15917.623047,29927.33


In [91]:
#save the model
xgb_model.save_model('xgb.model')

In [92]:
#load it back in and check if it works
loaded_model = xgb.Booster(model_file='xgb.model')
print(type(xgb_model))
print(type(loaded_model))


y_pred_2 = loaded_model.predict(DM_test)

print('R2 with 100 boost rounds: {:.2f}'.format(metrics.r2_score(y_test, y_pred_2)))
print('RMSE with 100 boost rounds: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred_2))))

<class 'xgboost.core.Booster'>
<class 'xgboost.core.Booster'>
R2 with 100 boost rounds: 0.83
RMSE with 100 boost rounds: 9175.80
