In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import metrics




In [3]:
#again, the next few cells will be swapped out once I get internet access and figure out the ipynb import
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)

In [4]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (421570, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Type,Size
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,1,24924.5,A,151315
1,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,2,50605.27,A,151315
2,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,3,13740.12,A,151315
3,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,4,39954.04,A,151315
4,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,5,32229.38,A,151315


In [None]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [1]:
#create separate features for Week, Month, and Year
df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

df.info()

NameError: name 'df' is not defined

In [9]:
df.drop('Date', axis=1, inplace=True)

In [10]:
mapping = {True:1, False:0}
df['IsHoliday'] = df['IsHoliday'].map(mapping)
df['IsHoliday'] = df['IsHoliday'].astype('category')

In [12]:
df_dummies = pd.get_dummies(df)

In [13]:
df_dummies.columns[120:]

Index(['Dept_77', 'Dept_78', 'Dept_79', 'Dept_80', 'Dept_81', 'Dept_82',
       'Dept_83', 'Dept_85', 'Dept_87', 'Dept_90', 'Dept_91', 'Dept_92',
       'Dept_93', 'Dept_94', 'Dept_95', 'Dept_96', 'Dept_97', 'Dept_98',
       'Dept_99', 'Type_A', 'Type_B', 'Type_C', 'Month_1', 'Month_2',
       'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
       'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Week_1', 'Week_2',
       'Week_3', 'Week_4', 'Week_5', 'Week_6', 'Week_7', 'Week_8', 'Week_9',
       'Week_10', 'Week_11', 'Week_12', 'Week_13', 'Week_14', 'Week_15',
       'Week_16', 'Week_17', 'Week_18', 'Week_19', 'Week_20', 'Week_21',
       'Week_22', 'Week_23', 'Week_24', 'Week_25', 'Week_26', 'Week_27',
       'Week_28', 'Week_29', 'Week_30', 'Week_31', 'Week_32', 'Week_33',
       'Week_34', 'Week_35', 'Week_36', 'Week_37', 'Week_38', 'Week_39',
       'Week_40', 'Week_41', 'Week_42', 'Week_43', 'Week_44', 'Week_45',
       'Week_46', 'Week_47', 'Week_48', 'Week_49'

In [14]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values.reshape(-1, 1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [16]:
reg = LinearRegression()

In [17]:
reg.fit(X_train, y_train)

LinearRegression()

In [18]:
print('R2 on test data: {}'.format(reg.score(X_test, y_test)))
print('R2 on training data: {}'.format(reg.score(X_train, y_train)))

R2 on test data: 0.6601009145977024
R2 on training data: 0.6635431984048396


In [19]:
y_pred = reg.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

177513059.73104158
13323.402708431566


In [20]:
#see if scaling the data improves model performance
df.head()

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales,Month,Week,Year
0,1,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,1,A,151315,24924.5,5,17,2010
1,1,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,2,A,151315,50605.27,5,17,2010
2,1,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,3,A,151315,13740.12,5,17,2010
3,1,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,4,A,151315,39954.04,5,17,2010
4,1,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,5,A,151315,32229.38,5,17,2010


In [21]:
df[['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size', 'Weekly_Sales']] = StandardScaler().fit_transform(df[['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size', 'Weekly_Sales']])

In [22]:
df.head()

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales,Month,Week,Year
0,1,-0.963798,-1.720834,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,1.018774,0.078201,0,1,A,0.239209,0.393782,5,17,2010
1,1,-0.963798,-1.720834,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,1.018774,0.078201,0,2,A,0.239209,1.524538,5,17,2010
2,1,-0.963798,-1.720834,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,1.018774,0.078201,0,3,A,0.239209,-0.09868,5,17,2010
3,1,-0.963798,-1.720834,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,1.018774,0.078201,0,4,A,0.239209,1.055551,5,17,2010
4,1,-0.963798,-1.720834,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,1.018774,0.078201,0,5,A,0.239209,0.715425,5,17,2010


In [23]:
df_dummies = pd.get_dummies(df)

In [55]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values.reshape(-1, 1)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [26]:
reg = LinearRegression()

In [27]:
reg.fit(X_train, y_train)

LinearRegression()

In [28]:
print('R2 on scaled test data: {}'.format(reg.score(X_test, y_test)))
print('R2 on scaled training data: {}'.format(reg.score(X_train, y_train)))

R2 on scaled test data: 0.6601014298047991
R2 on scaled training data: 0.6635432201443301


It turns out that scaling has a minimal effect on model performance. In addition, the performance on training and test data is similar, which suggests that our model is probably not overfitting. Nonetheless, we can check use L1 and L2 regularization to prevent overfitting, just in case.

In [29]:
lasso = Lasso(alpha=0.1)

In [30]:
lasso.fit(X_train, y_train)

Lasso(alpha=0.1)

In [31]:
print('R2 using L1 regularization on scaled test data: {}'.format(lasso.score(X_test, y_test)))
print('R2 using L1 regularization on scaled training data: {}'.format(lasso.score(X_train, y_train)))

R2 using L1 regularization on scaled test data: 0.04913331266023324
R2 using L1 regularization on scaled training data: 0.04943574573086773


In [32]:
lasso.coef_.shape

(208,)

In [33]:
df_dummies.drop('Weekly_Sales', axis=1).columns

Index(['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size',
       ...
       'Week_46', 'Week_47', 'Week_48', 'Week_49', 'Week_50', 'Week_51',
       'Week_52', 'Year_2010', 'Year_2011', 'Year_2012'],
      dtype='object', length=208)

In [34]:
pd.DataFrame({'col1':lasso.coef_, 'col2':df_dummies.drop('Weekly_Sales', axis=1).columns}).sort_values(by='col1', ascending=False)

Unnamed: 0,col1,col2
9,0.143401,Size
0,0.000000,Temperature
143,-0.000000,Month_3
133,0.000000,Dept_95
134,-0.000000,Dept_96
...,...,...
73,-0.000000,Dept_18
74,-0.000000,Dept_19
75,-0.000000,Dept_20
76,-0.000000,Dept_21


Lasso reduces all coefficients to 0 except for size, which as we saw before, had the highest correlation to the target

In [35]:
ridge = Ridge(alpha=0.1)

In [36]:
ridge.fit(X_train, y_train)

Ridge(alpha=0.1)

In [37]:
print('R2 using L2 regularization on scaled test data: {}'.format(ridge.score(X_test, y_test)))
print('R2 using L2 regularization on scaled training data: {}'.format(ridge.score(X_train, y_train)))

R2 using L2 regularization on scaled test data: 0.6601003598981615
R2 using L2 regularization on scaled training data: 0.6635431910979379


In [38]:
pd.DataFrame({'col1':ridge.coef_.reshape(208,), 'col2':df_dummies.drop('Weekly_Sales', axis=1).columns}).sort_values(by='col1', ascending=False)

Unnamed: 0,col1,col2
130,2.749361,Dept_92
133,2.482668,Dept_95
93,2.129144,Dept_38
117,1.635704,Dept_72
114,1.514765,Dept_65
...,...,...
105,-0.739285,Dept_50
103,-0.764441,Dept_48
119,-0.807547,Dept_77
137,-0.846780,Dept_99


With ridge, the most significant columns are all department numbers