In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from graphviz import Source
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics


In [6]:
#import the dataset from GitHub
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)

In [7]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (418660, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [8]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [9]:
#create separate features for Week, Month, and Year
df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418660 entries, 0 to 418659
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         418660 non-null  category      
 1   Date          418660 non-null  datetime64[ns]
 2   Temperature   418660 non-null  float64       
 3   Fuel_Price    418660 non-null  float64       
 4   MarkDown1     418660 non-null  float64       
 5   MarkDown2     418660 non-null  float64       
 6   MarkDown3     418660 non-null  float64       
 7   MarkDown4     418660 non-null  float64       
 8   MarkDown5     418660 non-null  float64       
 9   CPI           418660 non-null  float64       
 10  Unemployment  418660 non-null  float64       
 11  IsHoliday     418660 non-null  bool          
 12  Dept          418660 non-null  category      
 13  Type          418660 non-null  category      
 14  Size          418660 non-null  int64         
 15  Weekly_Sales  418

In [11]:
df.drop('Date', axis=1, inplace=True)

In [12]:
#create dummy variables
df_dummies = pd.get_dummies(df)

In [13]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values.reshape(-1, 1)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(313995, 207)
(104665, 207)


In [15]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))
print(y_train.shape)
print(y_test.shape)

(313995,)
(104665,)


In [18]:
df_dummies.columns

Index(['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday',
       ...
       'Week_46', 'Week_47', 'Week_48', 'Week_49', 'Week_50', 'Week_51',
       'Week_52', 'Year_2010', 'Year_2011', 'Year_2012'],
      dtype='object', length=208)

We will try three types of boosting algorithms: AdaBoost, GradientBoost, and XGBoost

1. AdaBoost

In [73]:
abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4), n_estimators=10, random_state=0, learning_rate=0.1)

In [74]:
abr.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=4),
                  learning_rate=0.1, n_estimators=10, random_state=0)

In [75]:
y_pred = abr.predict(X_test)

print('R2 with 10 estimators: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 10 estimators: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 10 estimators: 0.39179278832176523
RMSE with 10 estimators: 17603.551747595495


In [76]:
n_estimator_values = [5, 10, 15, 20]

for i in n_estimator_values:
    abr = AdaBoostRegressor(n_estimators=i, learning_rate=0.01, random_state=0)
    abr.fit(X_train, y_train)
    y_pred = abr.predict(X_test)
    print('R2 with {} estimators: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with {} estimators: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 5 estimators: 0.3426874481677449
RMSE with 5 estimators: 18300.395740910837
R2 with 10 estimators: 0.34267880822370445
RMSE with 10 estimators: 18300.516013888362
R2 with 15 estimators: 0.342620361314925
RMSE with 15 estimators: 18301.329607629115
R2 with 20 estimators: 0.34262800337426025
RMSE with 20 estimators: 18301.2232305488


2. GradientBoost

In [77]:
max_depth_values = [3, 4, 5, 6]

for i in max_depth_values:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with max depth of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with max depth of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with max depth of 3: 0.33604396768098754
RMSE with max depth of 3: 18392.644599766452
R2 with max depth of 4: 0.3960089002031397
RMSE with max depth of 4: 17542.43144901541
R2 with max depth of 5: 0.45229403711463245
RMSE with max depth of 5: 16705.068204278625
R2 with max depth of 6: 0.5032054179027944
RMSE with max depth of 6: 15909.734931455883


In [78]:
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for i in eta_vals:
    gbm = GradientBoostingRegressor(n_estimators=10, max_depth=6, learning_rate=i, random_state=0)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)
    print('R2 with learning rate of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with learning rate of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with learning rate of 0.001: 0.010260595304912634
RMSE with learning rate of 0.001: 22456.131831064627
R2 with learning rate of 0.01: 0.09462274139142202
RMSE with learning rate of 0.01: 21477.776036869196
R2 with learning rate of 0.1: 0.5032054179027944
RMSE with learning rate of 0.1: 15909.734931455883
R2 with learning rate of 1.0: 0.8649576186076552
RMSE with learning rate of 1.0: 8294.869589483635


3. XGBoost

Note that XGBoost can be implemented using scikit-learn's API as well XGBoost's learning API. We will use the learning API below, which uses different syntax and requires that our data be organized as a DMatrix.

In [79]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
print(type(DM_train))

<class 'xgboost.core.DMatrix'>


In [80]:
#set parameters
params = {'objective':'reg:squarederror'}

In [81]:
#train model using same parameters as above
xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)

In [82]:
#predict
y_pred = xgb_model.predict(DM_test)

print('R2 with 100 boost rounds: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 100 boost rounds: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds: 0.9372716765592579
RMSE with 100 boost rounds: 5653.356314832314


In [83]:
#loop through several possible max_depth values

max_depth_values = [1, 2, 3, 4, 5, 6]

for i in max_depth_values:
    params['max_depth'] = i
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, max_depth of {}: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, max_depth of {}: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, max_depth of 1: 0.27155655239117515
RMSE with 100 boost rounds, max_depth of 1: 19265.15196668453
R2 with 100 boost rounds, max_depth of 2: 0.457405903852219
RMSE with 100 boost rounds, max_depth of 2: 16626.929317465718
R2 with 100 boost rounds, max_depth of 3: 0.5588187596767284
RMSE with 100 boost rounds, max_depth of 3: 14992.809996777418
R2 with 100 boost rounds, max_depth of 4: 0.6289070816855715
RMSE with 100 boost rounds, max_depth of 4: 13750.41646057653
R2 with 100 boost rounds, max_depth of 5: 0.6766331698036949
RMSE with 100 boost rounds, max_depth of 5: 12835.779658975487
R2 with 100 boost rounds, max_depth of 6: 0.7200978097742802
RMSE with 100 boost rounds, max_depth of 6: 11942.01650227534


In [84]:
#use 5-fold cross validation, max_depth of 6, 10 boosting rounds, and store the results as a dataframe

DM = xgb.DMatrix(data=X, label=y)
params = params = {'objective':'reg:squarederror', 'max_depth': 5}

cv_results = xgb.cv(params=params, dtrain=DM, nfold=5, num_boost_round=100, metrics='rmse', as_pandas=True)



In [85]:
display(cv_results[['test-rmse-mean', 'test-rmse-std']])

Unnamed: 0,test-rmse-mean,test-rmse-std
0,22912.976563,108.873873
1,19785.493359,109.328974
2,17774.450000,129.346151
3,16446.737695,115.554514
4,15504.715625,138.793441
...,...,...
95,6757.690430,178.896338
96,6741.648731,173.894482
97,6711.356543,176.533544
98,6690.538476,175.866592


In [86]:
#get feature importances
feature_importances = xgb_model.get_score(importance_type='gain')
feature_importances

{'f129': 4291276158000.0,
 'f132': 4771613394000.0,
 'f92': 2916370159428.5713,
 'f116': 2266829117666.6665,
 'f127': 1926685115000.0,
 'f94': 1671905930666.6667,
 'f10': 523206089917.9802,
 'f198': 754108810939.4286,
 'f13': 174116664433.33334,
 'f36': 180494663500.0,
 'f7': 19327349062.416668,
 'f37': 84127809400.0,
 'f8': 42956110348.55556,
 'f20': 195583122534.6154,
 'f53': 49724807806.25,
 'f30': 85841114163.63637,
 'f139': 171216315192.8,
 'f2': 915016076.2857143,
 'f52': 28840528748.57143,
 'f50': 49110144366.666664,
 'f21': 126533391700.0,
 'f24': 80258802842.85715,
 'f57': 1690811499333.3333,
 'f43': 104806443310.0,
 'f46': 72003264400.0,
 'f128': 1310997581600.0,
 'f14': 75405712300.0,
 'f0': 3678566772.5238094,
 'f203': 2594306200.285714,
 'f27': 31543465476.333332,
 'f25': 34207509120.0,
 'f35': 20822708200.0,
 'f6': 2870531538.0,
 'f1': 489368917.5,
 'f26': 32297320400.0,
 'f49': 35073441800.0,
 'f32': 15662776300.0,
 'f54': 13455294450.0,
 'f48': 11485184000.0,
 'f3': 188

In [87]:
list_of_importances = []
list_of_indices = []

for key, val in feature_importances.items():
    list_of_importances.append(val)
    list_of_indices.append(int(key.strip('f')))

In [88]:
#get names of important columns
important_col_names = df_dummies.iloc[:, list_of_indices].columns

In [89]:
feature_importance_df = pd.DataFrame({'feature':important_col_names, 'importance':list_of_importances}).sort_values(by='importance', ascending=False)
feature_importance_df.head(25)

Unnamed: 0,feature,importance
1,Dept_94,4771613000000.0
0,Dept_91,4291276000000.0
2,Dept_37,2916370000000.0
3,Dept_71,2266829000000.0
4,Dept_87,1926685000000.0
22,Dept_1,1690811000000.0
5,Dept_39,1671906000000.0
25,Dept_90,1310998000000.0
53,Dept_12,979814100000.0
45,Dept_93,910519100000.0


Departments are the most important contributor to the XGBRegressor model, along with Size, Type_A, and a few stores and weeks.

Let's reduce our data set to only the 80 most important features and then run grid search cv to find the best parameters

In [90]:
columns_to_keep = feature_importance_df.iloc[:81, 0].to_list() #Weekly Sales is in the list to keep
df_dummies_reduced = df_dummies[columns_to_keep]
df_dummies_reduced.shape

(418660, 81)

In [91]:
X = df_dummies_reduced.drop('Weekly_Sales', axis=1).values
y = df_dummies_reduced['Weekly_Sales'].values

In [92]:
y = y.reshape(len(y))
print(y.shape)

(418660,)


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(313995, 80)
(104665, 80)


In [94]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [95]:
params = params = {'objective':'reg:squarederror', 'max_depth': 5}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of {}, half the features: {}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of {}, half the features: {}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.001, half the features: -0.3567395642547124
RMSE with 100 boost rounds, learning rate of 0.001, half the features: 26291.94792681328
R2 with 100 boost rounds, learning rate of 0.01, half the features: 0.21015789555841935
RMSE with 100 boost rounds, learning rate of 0.01, half the features: 20060.63421782422
R2 with 100 boost rounds, learning rate of 0.1, half the features: 0.4268409224623614
RMSE with 100 boost rounds, learning rate of 0.1, half the features: 17088.82121197079
R2 with 100 boost rounds, learning rate of 1.0, half the features: 0.4450608991966025
RMSE with 100 boost rounds, learning rate of 1.0, half the features: 16815.01200191259


Interestingly, using only half the features greatly reduces model performance. Let's loop through the same possible learning rates with all columns, and compare the performance.

In [96]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape)
print(X_test.shape)

(313995, 207)
(104665, 207)


In [98]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [99]:
params = {'objective':'reg:squarederror', 'max_depth': 5}
    
#loop through several possible eta values
eta_vals = [0.001, 0.01, 0.1, 1.0]

for eta in eta_vals:
    params['eta'] = eta
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of {}: {}'.format(eta, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of {}: {}'.format(eta, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.001: -0.3321114417015132
RMSE with 100 boost rounds, learning rate of 0.001: 26052.223652667246
R2 with 100 boost rounds, learning rate of 0.01: 0.3752884339923692
RMSE with 100 boost rounds, learning rate of 0.01: 17840.798652132773
R2 with 100 boost rounds, learning rate of 0.1: 0.8270665887996942
RMSE with 100 boost rounds, learning rate of 0.1: 9386.722363871955
R2 with 100 boost rounds, learning rate of 1.0: 0.9473626132296197
RMSE with 100 boost rounds, learning rate of 1.0: 5178.711035259626


Finally, let's experiment with changing the subsample of rows and columns for each tree in the ensemble. 

In [100]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1}
    
#loop through several possible values of subsamples (rows)
subsample_vals= [0.5, 0.6, 0.7, 0.8, 0.9]

for val in subsample_vals:
    params['subsample'] = val
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of 0.1, subsample of {}: {}'.format(val, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of 0.1, subsample of {}: {}'.format(val, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.5: 0.8328582436190664
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.5: 9228.200022765943
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.6: 0.8332156652370921
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.6: 9218.327793727245
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.7: 0.8326269163772739
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.7: 9234.583813349313
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.8: 0.8295511162889528
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.8: 9319.049084935039
R2 with 100 boost rounds, learning rate of 0.1, subsample of 0.9: 0.8295972843926698
RMSE with 100 boost rounds, learning rate of 0.1, subsample of 0.9: 9317.786911920557


In [102]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1, 'subsample':0.8}
    
#loop through several possible values of subsamples (rows)
colsample_vals= [0.7, 0.8, 0.9]

for val in colsample_vals:
    params['colsample_bytree'] = val
    xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)
    y_pred = xgb_model.predict(DM_test)

    print('R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of {}: {}'.format(val, metrics.r2_score(y_test, y_pred)))
    print('RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of {}: {}'.format(val, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.7: 0.8246080254427479
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.7: 9453.211557576047
R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.8: 0.8299892004799574
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.8: 9307.065566550747
R2 with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.9: 0.8302195054984023
RMSE with 100 boost rounds, learning rate of 0.1, colsample_bytree of 0.9: 9300.759525065103


Now let's use our XGBoost model with training and test data split by year

In [19]:
#split into train test based on year
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 207)
(125514, 207)


In [20]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))
print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [21]:
#convert data into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [22]:
params = {'objective':'reg:squarederror', 'max_depth': 5, 'eta':0.1, 'subsample':0.8, 'colsample_bytree':0.8}

xgb_model = xgb.train(params=params, dtrain=DM_train, num_boost_round=100)

In [23]:
#predict
y_pred = xgb_model.predict(DM_test)

print('R2 with 100 boost rounds: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE with 100 boost rounds: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 100 boost rounds: 0.8278773653134514
RMSE with 100 boost rounds: 9175.79579142783


The results of an XGBoostRegressor are just as good when we divide our training and test data by year. We can inspect the first 10 predictions and true values, to get a sense of the performance.

In [24]:
pd.DataFrame({'Predicted values':y_pred[:10], 'True values':y_test[:10]})

Unnamed: 0,Predicted values,True values
0,17458.4375,16567.69
1,40650.484375,44481.38
2,10802.493164,13926.04
3,24189.796875,40925.76
4,20084.144531,23105.81
5,10026.289062,3665.25
6,21727.472656,12377.76
7,29098.935547,36797.0
8,17012.357422,20705.21
9,15917.623047,29927.33


In [17]:
#save the model
xgb_model.save_model('xgb.model')

In [27]:
#load it back in and check if it works
loaded_model = xgb.Booster(model_file='xgb.model')
print(type(xgb_model))
print(type(loaded_model))


y_pred_2 = loaded_model.predict(DM_test)

print('R2 with 100 boost rounds: {}'.format(metrics.r2_score(y_test, y_pred_2)))
print('RMSE with 100 boost rounds: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred_2))))

<class 'xgboost.core.Booster'>
<class 'xgboost.core.Booster'>
R2 with 100 boost rounds: 0.8278773653134514
RMSE with 100 boost rounds: 9175.79579142783
