In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.tree import export_graphviz
from graphviz import Source
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

In [24]:
#import the dataset from GitHub
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)

In [25]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (418660, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [26]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [27]:
#create separate features for Week, Month, and Year
df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418660 entries, 0 to 418659
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         418660 non-null  category      
 1   Date          418660 non-null  datetime64[ns]
 2   Temperature   418660 non-null  float64       
 3   Fuel_Price    418660 non-null  float64       
 4   MarkDown1     418660 non-null  float64       
 5   MarkDown2     418660 non-null  float64       
 6   MarkDown3     418660 non-null  float64       
 7   MarkDown4     418660 non-null  float64       
 8   MarkDown5     418660 non-null  float64       
 9   CPI           418660 non-null  float64       
 10  Unemployment  418660 non-null  float64       
 11  IsHoliday     418660 non-null  bool          
 12  Dept          418660 non-null  category      
 13  Type          418660 non-null  category      
 14  Size          418660 non-null  int64         
 15  Weekly_Sales  418

In [29]:
df.drop('Date', axis=1, inplace=True)

In [30]:
#create dummy variables
df_dummies = pd.get_dummies(df)

In [31]:
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 207)
(125514, 207)


In [32]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [33]:
#loop through different values of max_depth

max_depths = [5, 10, 15, 25, 30, None]

for i in max_depths:
    rf = RandomForestRegressor(n_estimators=10, max_depth=i, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print('R2 for max_depth of {}, 10 estimators: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE for max_depth of {}, 10 estimators: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 for max_depth of 5, 10 estimators: 0.46
RMSE for max_depth of 5, 10 estimators: 16198.77
R2 for max_depth of 10, 10 estimators: 0.71
RMSE for max_depth of 10, 10 estimators: 11907.45
R2 for max_depth of 15, 10 estimators: 0.81
RMSE for max_depth of 15, 10 estimators: 9753.95
R2 for max_depth of 25, 10 estimators: 0.87
RMSE for max_depth of 25, 10 estimators: 7855.84
R2 for max_depth of 30, 10 estimators: 0.90
RMSE for max_depth of 30, 10 estimators: 7166.53
R2 for max_depth of None, 10 estimators: 0.93
RMSE for max_depth of None, 10 estimators: 5938.21


In [34]:
#loop through different values of n_estimators

n_estimators = [5, 10, 25, 50, 100]

for i in n_estimators:
    rf = RandomForestRegressor(n_estimators=i, max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print('R2 with {} estimators, max_depth of 10: {:.2f}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with {} estimators, max_depth of 10: {:.2f}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 5 estimators, max_depth of 10: 0.71
RMSE with 5 estimators, max_depth of 10: 11935.32
R2 with 10 estimators, max_depth of 10: 0.71
RMSE with 10 estimators, max_depth of 10: 11907.45
R2 with 25 estimators, max_depth of 10: 0.71
RMSE with 25 estimators, max_depth of 10: 11893.63
R2 with 50 estimators, max_depth of 10: 0.71
RMSE with 50 estimators, max_depth of 10: 11881.47
R2 with 100 estimators, max_depth of 10: 0.71
RMSE with 100 estimators, max_depth of 10: 11881.29


The results for a random forest are similar to the results for a single decision tree, with performance improving as the tree is allowed to grow deeper. Interestingly, the number of estimators (trees) has a minimal effect on the performance of the random forest. 

We'll reduce the number of features like we did earlier and see if we can maintain similar performance with fewer features.

In [35]:
features = list(df_dummies.drop('Weekly_Sales', axis=1).columns)

In [36]:
#calculate feature importances
print('Total features: {}'.format(len(rf.feature_importances_)))
feature_importances = pd.DataFrame({'Feature': features, 'Feature Importance':rf.feature_importances_}).sort_values(by='Feature Importance', ascending=False)
display(feature_importances.iloc[:50, :])

Total features: 207


Unnamed: 0,Feature,Feature Importance
10,Size,0.227658
129,Dept_92,0.142576
132,Dept_95,0.124899
92,Dept_38,0.090861
116,Dept_72,0.063871
127,Dept_90,0.059034
94,Dept_40,0.048352
57,Dept_2,0.046879
128,Dept_91,0.030628
198,Week_47,0.029853


In [37]:
#drop all features except the top 50 and years
features_to_drop = feature_importances.iloc[50:, 0]
features_to_drop = features_to_drop[~features_to_drop.str.contains('Year')]

In [38]:
df_dummies_top_features = df_dummies.drop(features_to_drop, axis=1)
df_dummies_top_features.shape

(418660, 54)

In [39]:
X_train = df_dummies_top_features.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_top_features.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_top_features.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_top_features.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(293146, 53)
(125514, 53)


In [40]:
y_train = y_train.reshape(len(y_train))
y_test = y_test.reshape(len(y_test))

print(y_train.shape)
print(y_test.shape)

(293146,)
(125514,)


In [41]:
#train a ranadom forest with 10 trees and max_depth of 10 
rf = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=0)

In [42]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, n_estimators=10, random_state=0)

In [43]:
y_pred = rf.predict(X_test)

print('R2: {:.2f}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: 0.71
RMSE: 11920.49
