In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics


In [2]:
#import the data
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/final_df.csv'
df = pd.read_csv(url)

In [3]:
print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (418660, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418660 entries, 0 to 418659
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         418660 non-null  int64  
 1   Date          418660 non-null  object 
 2   Temperature   418660 non-null  float64
 3   Fuel_Price    418660 non-null  float64
 4   MarkDown1     418660 non-null  float64
 5   MarkDown2     418660 non-null  float64
 6   MarkDown3     418660 non-null  float64
 7   MarkDown4     418660 non-null  float64
 8   MarkDown5     418660 non-null  float64
 9   CPI           418660 non-null  float64
 10  Unemployment  418660 non-null  float64
 11  IsHoliday     418660 non-null  bool   
 12  Dept          418660 non-null  int64  
 13  Type          418660 non-null  object 
 14  Size          418660 non-null  int64  
 15  Weekly_Sales  418660 non-null  float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 48.3+ MB
None


In [5]:
#convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

#convert Store, Dept, and Type columns to category
df['Store'] = df['Store'].astype('category')

df['Dept'] = df['Dept'].astype('category')

df['Type'] = df['Type'].astype('category')

In [6]:
#create separate features for Week, Month, and Year
df['Week'] = df['Date'].dt.week
df['Week'] = df['Week'].astype('category')

df['Month'] = df['Date'].dt.month
df['Month'] = df['Month'].astype('category')

df['Year'] = df['Date'].dt.year
df['Year'] = df['Year'].astype('category')

In [7]:
df.drop('Date', axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418660 entries, 0 to 418659
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Store         418660 non-null  category
 1   Temperature   418660 non-null  float64 
 2   Fuel_Price    418660 non-null  float64 
 3   MarkDown1     418660 non-null  float64 
 4   MarkDown2     418660 non-null  float64 
 5   MarkDown3     418660 non-null  float64 
 6   MarkDown4     418660 non-null  float64 
 7   MarkDown5     418660 non-null  float64 
 8   CPI           418660 non-null  float64 
 9   Unemployment  418660 non-null  float64 
 10  IsHoliday     418660 non-null  bool    
 11  Dept          418660 non-null  category
 12  Type          418660 non-null  category
 13  Size          418660 non-null  int64   
 14  Weekly_Sales  418660 non-null  float64 
 15  Week          418660 non-null  category
 16  Month         418660 non-null  category
 17  Year          418660 non-null

We'll test the following KNN models:


1. KNN trained on all data

2. KNN with weeks discarded

3. KNN with scaled X

4. KNN with weeks discarded and most departments discarded

5. KNN models with different n_neighbors

1. KNN trained on all data

In [9]:
df_dummies = pd.get_dummies(df, drop_first=True)


In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print('Shape: {}'.format(df_dummies.shape))
pd.DataFrame(df_dummies.columns, columns=['Feature Name'])

Shape: (418660, 202)


Unnamed: 0,Feature Name
0,Temperature
1,Fuel_Price
2,MarkDown1
3,MarkDown2
4,MarkDown3
5,MarkDown4
6,MarkDown5
7,CPI
8,Unemployment
9,IsHoliday


In [12]:
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(313995, 201)
(104665, 201)


In [13]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [14]:
y_pred = knn.predict(X_test)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: -0.07929968366481854
RMSE: 23450.145594684906


A default KNN model performs poorly and trains slowly. We can see if reducing the dimensionality improves either.

2. KNN with weeks discarded

In [15]:
#remove week columns, so that the only date information is month and year
list_of_week_columns = list(df_dummies.columns[df_dummies.columns.str.contains('Week_')])
df_dummies_no_weeks = df_dummies.drop(list_of_week_columns, axis=1)
print('Shape without weeks: {}'.format(df_dummies_no_weeks.shape))

Shape without weeks: (418660, 151)


In [17]:
X_train = df_dummies_no_weeks.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_no_weeks.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_no_weeks.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_no_weeks.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(313995, 150)
(104665, 150)


In [18]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [19]:
y_pred = knn.predict(X_test)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: -0.020470092925467087
RMSE: 22802.090195390494


3. KNN with scaled X

In [20]:
#use standard scaler on numeric columns
df[['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size']] = StandardScaler().fit_transform(df[['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size']])

In [21]:
df_dummies_scaled = pd.get_dummies(df, drop_first=True)

In [23]:
X_train = df_dummies_scaled.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_scaled.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_scaled.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_scaled.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

In [24]:
knn = KNeighborsRegressor()
knn.fit(X_train_scaled, y_train)

KNeighborsRegressor()

In [25]:
y_pred = knn.predict(X_test_scaled)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: -0.020470092925467087
RMSE: 22802.090195390494


Scaling does not affect KNNeighborsRegressor, so we'll do without it

4. KNN with most departments discarded

In [26]:
corr = pd.Series(df_dummies.corr()['Weekly_Sales']).sort_values()

In [27]:
#find departments with the strongest correlation to the target
corr[corr.index.str.contains('Dept')][np.abs(corr) >= 0.08]

Dept_28   -0.082442
Dept_59   -0.081597
Dept_60   -0.081386
Dept_13    0.080210
Dept_94    0.091251
Dept_91    0.096899
Dept_2     0.151179
Dept_40    0.158312
Dept_90    0.160273
Dept_72    0.184587
Dept_38    0.247211
Dept_95    0.295073
Dept_92    0.324672
Name: Weekly_Sales, dtype: float64

In [28]:
#drop all other departments
dept_to_drop = corr[corr.index.str.contains('Dept')][np.abs(corr) <= 0.08].index
df_dummies_few_dept = df_dummies.drop(dept_to_drop, axis=1)
df_dummies_few_dept.shape

(418660, 135)

In [30]:
X_train = df_dummies_few_dept.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_few_dept.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_few_dept.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_few_dept.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(313995, 134)
(104665, 134)


In [31]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [32]:
y_pred = knn.predict(X_test)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: -0.021860560557133812
RMSE: 22817.619692832068


Let's also drop the weeks

In [33]:
df_dummies_few_dept_no_weeks = df_dummies_few_dept.drop(list_of_week_columns, axis=1)

In [35]:
X_train = df_dummies_few_dept_no_weeks.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies_few_dept_no_weeks.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies_few_dept_no_weeks.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies_few_dept_no_weeks.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

print(X_train.shape)
print(X_test.shape)

(313995, 83)
(104665, 83)


In [36]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [37]:
y_pred = knn.predict(X_test)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: -0.0017854934564953684
RMSE: 22592.3749871973


In [38]:
k_range = [3, 4, 5, 6, 7]

for i in k_range:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('R2 with {} neighbors: {}'.format(i, metrics.r2_score(y_test, y_pred)))
    print('RMSE with {} neighbors: {}'.format(i, np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2 with 3 neighbors: -0.02159765119610446
RMSE with 3 neighbors: 22814.684188735035
R2 with 4 neighbors: -0.008334149589836803
RMSE with 4 neighbors: 22666.097703707976
R2 with 5 neighbors: -0.0017854934564953684
RMSE with 5 neighbors: 22592.3749871973
R2 with 6 neighbors: 0.004365445458384687
RMSE with 6 neighbors: 22522.909874069486
R2 with 7 neighbors: 0.007779291642015629
RMSE with 7 neighbors: 22484.263277796523


Increasing the number of neighbors improves performance, but it takes much longer to train. Let's reduce the feature space to only 10% of the original dataset and redo this process

In [39]:
print(df.shape)
df_dummies = pd.get_dummies(df, drop_first=True)

print(df_dummies.shape)

(418660, 18)
(418660, 202)


In [40]:
X = df_dummies.drop('Weekly_Sales', axis=1).values
y = df_dummies['Weekly_Sales'].values.reshape(-1, 1)

print(X.shape)
print(y.shape)

(418660, 201)
(418660, 1)


In [43]:
X_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), :].drop('Weekly_Sales', axis=1).values
X_test = df_dummies.loc[df['Year']==2012, :].drop('Weekly_Sales', axis=1).values
y_train = df_dummies.loc[(df['Year']==2010) | (df['Year']==2011), 'Weekly_Sales'].values.reshape(-1, 1)
y_test = df_dummies.loc[df['Year']==2012, 'Weekly_Sales'].values.reshape(-1, 1)

In [44]:
#select the top 25% most correlated features
from sklearn.feature_selection import SelectPercentile

selection = SelectPercentile(percentile=25)# score_func=f_classif by default
selection.fit(X_train, y_train)
X_train_selected = selection.transform(X_train)

print(X_train.shape)
print(X_train_selected.shape)

  return f(**kwargs)
  f = msb / msw


(293146, 201)
(293146, 50)


In [45]:
knn = KNeighborsRegressor(n_jobs=-1)
knn.fit(X_train_selected, y_train)

KNeighborsRegressor(n_jobs=-1)

In [46]:
X_test_selected = selection.transform(X_test)

y_pred = knn.predict(X_test_selected)

print('R2: {}'.format(metrics.r2_score(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

R2: 0.48366306067525966
RMSE: 15892.467981988304


Reducing the dataset to only 50 features greatly improved the result. We can see what features were included, which will give us some insight as we explore additional models.

In [47]:
mask = selection.get_support()
print(df_dummies.drop('Weekly_Sales', axis=1).columns[mask])

Index(['MarkDown3', 'MarkDown4', 'Store_2', 'Store_11', 'Store_14', 'Dept_2',
       'Dept_3', 'Dept_4', 'Dept_5', 'Dept_7', 'Dept_8', 'Dept_9', 'Dept_10',
       'Dept_11', 'Dept_13', 'Dept_14', 'Dept_16', 'Dept_17', 'Dept_21',
       'Dept_22', 'Dept_23', 'Dept_24', 'Dept_26', 'Dept_33', 'Dept_34',
       'Dept_37', 'Dept_38', 'Dept_40', 'Dept_44', 'Dept_46', 'Dept_49',
       'Dept_55', 'Dept_60', 'Dept_65', 'Dept_67', 'Dept_71', 'Dept_72',
       'Dept_74', 'Dept_79', 'Dept_81', 'Dept_82', 'Dept_87', 'Dept_90',
       'Dept_91', 'Dept_92', 'Dept_93', 'Dept_94', 'Dept_95', 'Dept_96',
       'Dept_97'],
      dtype='object')


The most important features, in terms of correlation, for the KNN model are departments and stores