# Machine Learning Notebook - Matt

In [1]:
import HelperFunctions as hf
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

## Load Data

In [2]:
# Read the three raw input tables (calendar, sell prices, sales) through the
# HelperFunctions loaders; the calls run in the order listed.
calendar_df, prices_df, sales_df = (
    hf.load_calendar_data(),
    hf.load_sell_price_data(),
    hf.load_sales_train_validation_data(),
)

## Clean Data

In [3]:
# Combine the raw sales, calendar and price tables into one cleaned frame.
# NOTE(review): days=800 / items=100 presumably limit how much data is kept,
# and dropNAPrices=True presumably removes rows without a price -- confirm
# against HelperFunctions.rawToClean.
df = hf.rawToClean(
    sales_df,
    calendar_df,
    prices_df,
    days=800,
    items=100,
    dropNAPrices=True,
)

## Feature Engineering

In [4]:
# Rolling-mean demand features: mean of the previous week (7 days) and of the
# previous 28 days, each computed with shift=1.
for _window in (7, 28):
    df = hf.rollingMeanDemandFeature(df, windowSize=_window, shift=1)

In [5]:
# Lagged 'sold' features: amount sold the day before, a week before and
# 28 days before (added in that order).
for _lag in (1, 7, 28):
    df = hf.lagFeature(df, var='sold', lag=_lag)

In [6]:
# Remove every row containing at least one NA; these come from the lagged
# features added in the cells above.
df.dropna(axis="index", how="any", inplace=True)

In [7]:
# Drop the id column.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: once 'id' is gone, a second execution is a no-op rather than
# raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [8]:
# Sanity check: count of missing values per column (all should be 0 after the dropna above).
df.isna().sum()

d                    0
sold                 0
wday                 0
month                0
year                 0
snap_CA              0
snap_TX              0
snap_WI              0
sell_price           0
Cultural             0
National             0
Religious            0
Sporting             0
NoEvent              0
Christmas            0
weekend              0
midweek              0
monfri               0
FOODS_1              0
FOODS_2              0
FOODS_3              0
HOBBIES_1            0
HOBBIES_2            0
HOUSEHOLD_1          0
HOUSEHOLD_2          0
FOODS                0
HOBBIES              0
HOUSEHOLD            0
CA_1                 0
CA_2                 0
CA_3                 0
CA_4                 0
TX_1                 0
TX_2                 0
TX_3                 0
WI_1                 0
WI_2                 0
WI_3                 0
CA                   0
TX                   0
WI                   0
rolling_mean_7_1     0
rolling_mean_28_1    0
sold_lag_1 

In [9]:
# Preview the first ten rows of the engineered feature table.
df.head(n=10)

Unnamed: 0,d,sold,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price,Cultural,...,WI_2,WI_3,CA,TX,WI,rolling_mean_7_1,rolling_mean_28_1,sold_lag_1,sold_lag_7,sold_lag_28
2800,29,0,1,2,2011,0,0,0,1.84,0,...,0,0,0,1,0,0.428571,0.857143,0.0,0.0,1.0
2803,29,7,1,2,2011,0,0,0,2.98,0,...,0,0,1,0,0,1.571429,2.142857,0.0,3.0,0.0
2804,29,0,1,2,2011,0,0,0,0.57,0,...,0,0,1,0,0,3.428571,2.178571,0.0,24.0,13.0
2806,29,63,1,2,2011,0,0,0,0.42,0,...,0,0,1,0,0,5.0,5.035714,8.0,3.0,3.0
2808,29,0,1,2,2011,0,0,0,1.87,0,...,0,0,1,0,0,0.571429,0.321429,0.0,1.0,0.0
2813,29,0,1,2,2011,0,0,0,9.42,0,...,0,0,1,0,0,0.428571,0.178571,0.0,1.0,1.0
2817,29,0,1,2,2011,0,0,0,6.97,0,...,0,0,0,1,0,0.285714,0.214286,0.0,2.0,0.0
2819,29,12,1,2,2011,0,0,0,6.98,0,...,1,0,0,0,1,10.0,16.571429,9.0,9.0,11.0
2821,29,0,1,2,2011,0,0,0,3.67,0,...,0,0,0,1,0,0.142857,0.107143,1.0,0.0,0.0
2823,29,25,1,2,2011,0,0,0,1.97,0,...,0,0,0,1,0,15.428571,12.321429,18.0,11.0,5.0


In [10]:
# (rows, columns) of the feature table after NA removal and dropping 'id'.
df.shape

(43967, 46)

In [11]:
# List every column name: the target 'sold' plus all candidate features.
df.columns

Index(['d', 'sold', 'wday', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'Cultural', 'National', 'Religious', 'Sporting',
       'NoEvent', 'Christmas', 'weekend', 'midweek', 'monfri', 'FOODS_1',
       'FOODS_2', 'FOODS_3', 'HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1',
       'HOUSEHOLD_2', 'FOODS', 'HOBBIES', 'HOUSEHOLD', 'CA_1', 'CA_2', 'CA_3',
       'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3', 'CA', 'TX',
       'WI', 'rolling_mean_7_1', 'rolling_mean_28_1', 'sold_lag_1',
       'sold_lag_7', 'sold_lag_28'],
      dtype='object')

# Machine Learning

## Train/Test Split
We standardise the continuous features, isolate the target variable (`sold`) from the features, and then split the data into training and test sets.

In [12]:
#Standardise features
cols_to_norm = ['d','wday','month','year','sell_price','rolling_mean_7_1','rolling_mean_28_1','sold_lag_1',
                'sold_lag_7','sold_lag_28']

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out test rows does not leak into the scaling. The row positions are drawn
# with the same test_size and random_state as the train/test split cell below;
# train_test_split's shuffle depends only on the number of rows and the seed,
# so these positions are exactly the rows that end up in X_train.
train_pos, _test_pos = train_test_split(np.arange(len(df)), test_size=0.20, random_state=42)
scaler = StandardScaler().fit(df[cols_to_norm].iloc[train_pos])

# Apply the train-fitted scaling to every row (train and test alike).
df[cols_to_norm] = scaler.transform(df[cols_to_norm])

In [13]:
# Separate the target ('sold') from the predictors.
# X_df keeps the column labels (used later for the OLS summary); X and y are
# the plain array versions handed to scikit-learn.
X_df = df.drop(columns=['sold'])
X = X_df.values
y = df['sold'].values

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows -- confirm this is intended given the
# lagged / rolling-mean features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [15]:
# Report the split sizes, one per line: train features, train target,
# test features, test target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(35173, 45)
(35173,)
(8794, 45)
(8794,)


## Linear Regression
Fitting a linear regression model to the data and analysing the features' p-values reveals some interesting findings.
* Amongst date features, d, month and year are all insignificant features. 
* Amongst the event-related features, only Sporting, Religious and Christmas are significant. As expected, Christmas is the most influential feature of them all. Interestingly, this model suggests that there is a significant decrease in sales on a sporting event day, but an increase in sales on a religious event day.
* The LR also confirms our findings in analysis that weekend/monfri/midweek are all significant features for predicting sales volume. 
* Interestingly, among the product categories, HOBBIES_2 is the only insignificant predictor. 
* CA_4 is the only store that is an insignificant predictor.
* The mean number of items sold over the preceding week (`rolling_mean_7_1`) is the second most influential feature.
* snap_CA is insignificant, unlike the other snap features - are fewer Californians drawn in by offers, perhaps?
* More expensive items sell in lower quantities

In [16]:
# Ordinary least squares of 'sold' on every feature column.
# The fit uses all rows of df (not only the training split), and no explicit
# intercept column is added before fitting.
mod = sm.OLS(endog=df['sold'], exog=df.loc[:, X_df.columns])
res = mod.fit()

# Print the full coefficient / p-value table discussed in the notes above.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   sold   R-squared:                       0.629
Model:                            OLS   Adj. R-squared:                  0.628
Method:                 Least Squares   F-statistic:                     2065.
Date:                Thu, 25 Feb 2021   Prob (F-statistic):               0.00
Time:                        18:27:03   Log-Likelihood:                -97493.
No. Observations:               43967   AIC:                         1.951e+05
Df Residuals:                   43930   BIC:                         1.954e+05
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
d                    -0.5048      0.54

In [None]:
# TODO list for this notebook:
#   - Train test split
#   - Normalise features
#   - More Data Analysis
#   - Feature Extraction - average of weekday - exponential averages - Eve Events
#   - Modelling
#   - Feature selection - Linear Regression/RandomForests/Lasso/Correlation plots