# Machine Learning Notebook - Matt

In [35]:
import HelperFunctions as hf
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

## Load Data

In [2]:
# Load the raw calendar, sell-price and sales datasets through the project helper module.
# NOTE(review): file locations and schemas live inside HelperFunctions and are not visible here.
calendar_df = hf.load_calendar_data()
prices_df = hf.load_sell_price_data()
sales_df = hf.load_sales_train_validation_data()

## Clean Data

In [3]:
# Build the cleaned modelling frame from the three raw datasets.
# NOTE(review): presumably `days`/`items` restrict the sample to 600 days x 100 items and
# `dropNAPrices` discards rows without a sell price — confirm against HelperFunctions.rawToClean.
df = hf.rawToClean(
    sales_df,
    calendar_df,
    prices_df,
    days=600,
    items=100,
    dropNAPrices=True,
)

## Feature Engineering

In [4]:
# Rolling-mean demand features, shifted by one day: previous week (7) and previous 28 days.
for window in (7, 28):
    df = hf.rollingMeanDemandFeature(df, windowSize=window, shift=1)

In [5]:
# Lagged copies of `sold`: the day before, a week before and 28 days before.
for lag_days in (1, 7, 28):
    df = hf.lagFeature(df, var='sold', lag=lag_days)

In [6]:
# Drop rows with NAs (a result of the lagged / rolling features).
# Rebind instead of mutating in place: the frame returned by the helper functions may
# alias other data, and a fresh frame avoids hidden-state surprises on re-run.
df = df.dropna()

In [7]:
# Drop the id column. `errors='ignore'` keeps this cell re-runnable: the original in-place
# drop raised KeyError on a second execution because 'id' was already gone.
df = df.drop(columns=['id'], errors='ignore')

In [8]:
# Count missing values per column to confirm none remain.
df.isna().sum()

d                    0
sold                 0
wday                 0
month                0
year                 0
snap_CA              0
snap_TX              0
snap_WI              0
sell_price           0
sporting             0
cultural             0
national             0
religious            0
christmas            0
weekend              0
midweek              0
monfri               0
FOODS_1              0
FOODS_2              0
FOODS_3              0
HOBBIES_1            0
HOBBIES_2            0
HOUSEHOLD_1          0
HOUSEHOLD_2          0
FOODS                0
HOBBIES              0
HOUSEHOLD            0
CA_1                 0
CA_2                 0
CA_3                 0
CA_4                 0
TX_1                 0
TX_2                 0
TX_3                 0
WI_1                 0
WI_2                 0
WI_3                 0
CA                   0
TX                   0
WI                   0
rolling_mean_7_1     0
rolling_mean_28_1    0
sold_lag_1           0
sold_lag_7 

In [10]:
# Preview the first ten rows of the engineered frame.
df.head(10)

Unnamed: 0,d,sold,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price,sporting,...,WI_2,WI_3,CA,TX,WI,rolling_mean_7_1,rolling_mean_28_1,sold_lag_1,sold_lag_7,sold_lag_28
2800,29,2,1,2,2011,0,0,0,5.47,0,...,0,0,1,0,0,1.714286,1.642857,0.0,3.0,3.0
2801,29,2,1,2,2011,0,0,0,5.48,0,...,0,0,1,0,0,1.0,0.75,0.0,2.0,1.0
2808,29,0,1,2,2011,0,0,0,1.0,0,...,0,0,1,0,0,0.0,140.357143,0.0,0.0,360.0
2812,29,0,1,2,2011,0,0,0,3.98,0,...,0,0,0,1,0,1.0,0.928571,0.0,4.0,3.0
2813,29,16,1,2,2011,0,0,0,1.56,0,...,0,0,0,0,1,1.857143,3.285714,0.0,5.0,17.0
2815,29,0,1,2,2011,0,0,0,4.98,0,...,0,0,1,0,0,0.0,0.892857,0.0,0.0,3.0
2816,29,3,1,2,2011,0,0,0,1.78,0,...,0,0,1,0,0,1.0,1.535714,3.0,1.0,1.0
2817,29,4,1,2,2011,0,0,0,1.48,0,...,0,1,0,0,1,3.857143,3.321429,6.0,5.0,3.0
2818,29,12,1,2,2011,0,0,0,0.88,0,...,0,0,0,1,0,5.571429,5.285714,6.0,8.0,4.0
2820,29,0,1,2,2011,0,0,0,3.68,0,...,0,0,1,0,0,1.0,0.607143,1.0,0.0,1.0


In [11]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(30124, 45)

In [17]:
# Inspect the weekday column.
df['wday']

2800     1
2801     1
2808     1
2812     1
2813     1
        ..
59994    5
59995    5
59996    5
59997    5
59999    5
Name: wday, Length: 30124, dtype: int64

# Machine Learning

## Train/Test Split
We standardise the numeric features, isolate the target variable (`sold`) from the features, and then split the data into training and test sets.

In [18]:
# Standardise the listed feature columns (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fit on every row of `df` before the train/test split further
# down, so test-set statistics leak into the scaling. Consider fitting on the training
# rows only — that would also change the inputs to the OLS cell, so it is not altered here.
cols_to_norm = [
    'd', 'wday', 'month', 'year', 'sell_price',
    'rolling_mean_7_1', 'rolling_mean_28_1',
    'sold_lag_1', 'sold_lag_7', 'sold_lag_28',
]

scaler = StandardScaler()
df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])

In [42]:
# Separate the target (`sold`) from the features; X_df is kept so column names stay available.
X_df = df.drop(columns=['sold'])
X = X_df.to_numpy()
y = df['sold'].to_numpy()

In [32]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default. The rows are indexed by day (`d`)
# and carry lag/rolling features, so a random split may let the model see "future" rows —
# consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [34]:
# Sanity-check the split: print each array's shape in the order train X, train y, test X, test y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(24099, 44)
(24099,)
(6025, 44)
(6025,)


## Linear Regression
Fitting a linear regression model on the data and analysing the features' p-values raises some interesting findings.
* Amongst date features, d, month and year are all insignificant features. 
* Amongst the event-related features, only sporting, religious and christmas are significant. As expected, christmas is the most influential feature of them all. Interestingly, this model suggests that there is a significant decrease in sales on a sporting event day, but an increase in sales on a religious event day.
* The linear regression also confirms the finding from our analysis that weekend/monfri/midweek are all significant features for predicting sales volume.
* Interestingly, among the product categories, HOBBIES_2 is the only insignificant predictor. 
* CA_4 is the only store that is an insignificant predictor.
* The mean of sold items in the week leading up is the second most influential feature.
* snap_CA is insignificant, unlike the other snap features – are fewer Californians drawn in by offers, perhaps?
* More expensive items sell in lower quantities

In [44]:
# Fit an ordinary least squares model of `sold` on every feature column and print the summary.
# NOTE(review): the model is fit on the full `df`, not on the X_train/y_train split.
# NOTE(review): no explicit intercept column is added, and the summary reports Df Model = 35
# for 44 regressors, which suggests the dummy columns are linearly dependent (e.g. category
# and department flags) — individual coefficients and p-values should be read with care.
feature_names = X_df.columns
mod = sm.OLS(endog=df['sold'], exog=df[feature_names])
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   sold   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     2843.
Date:                Wed, 24 Feb 2021   Prob (F-statistic):               0.00
Time:                        15:32:11   Log-Likelihood:                -73610.
No. Observations:               30124   AIC:                         1.473e+05
Df Residuals:                   30088   BIC:                         1.476e+05
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
d                    -0.8247      0.62

In [None]:
# Outline of planned steps (appears to be a working checklist for this notebook):
#   - Train/test split
#   - Normalise features
#   - More data analysis
#   - Feature extraction: average of weekday, exponential averages, eve events
#   - Modelling
#   - Feature selection: linear regression / random forests / Lasso / correlation plots