# grocery_sales_forecasting_ml

This baseline model is non-parametric. This model is simply average of history sale (from 2013 to 2016 of the same day.)

In [1]:
# Import libraries necessary for this project
import os.path
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from IPython.display import display
import matplotlib.pyplot as plt
import datetime as dt

types = {'id': 'int32', 'item_nbr': 'int32', 'store_nbr': 'int16', 'onpromotion': bool}

%matplotlib inline

## Approach 1: Median of the same item in all stores from 2017/01/01 to 2017/08/15

In [2]:
train_data = pd.read_csv('input/train.csv', usecols=[1,2,3,4], 
                        parse_dates=['date'], dtype=types, 
                        converters={'unit_sales':lambda u: float(u) if float(u)>0 else 0},
                        skiprows=range(1,124035460))
print("Favorita grocery sales forecasting training data has {} samples with {} features each.".format(*train_data.shape))
display(train_data.head(5))
unit_sale_log_transformed = train_data['unit_sales'].apply(pd.np.log1p).values.reshape(-1, 1)
train_data = train_data.drop('unit_sales', axis = 1)

Favorita grocery sales forecasting training data has 1461581 samples with 4 features each.


Unnamed: 0,date,store_nbr,item_nbr,unit_sales
0,2017-08-02,1,96995,1.0
1,2017-08-02,1,103520,1.0
2,2017-08-02,1,103665,2.0
3,2017-08-02,1,105574,8.0
4,2017-08-02,1,105575,8.0


In [3]:
unit_sale_log_transformed.shape

(1461581, 1)

In [4]:
items = pd.read_csv("input/items.csv")
train_data= pd.merge(train_data,items, right_on='item_nbr',left_on='item_nbr',how='left')
display(train_data.tail(5))

Unnamed: 0,date,store_nbr,item_nbr,family,class,perishable
1461576,2017-08-15,54,2089339,GROCERY I,1006,0
1461577,2017-08-15,54,2106464,BEVERAGES,1148,0
1461578,2017-08-15,54,2110456,BEVERAGES,1120,0
1461579,2017-08-15,54,2113914,CLEANING,3040,0
1461580,2017-08-15,54,2116416,GROCERY I,1060,0


In [5]:
stores = pd.read_csv("input/stores.csv")
train_data= pd.merge(train_data,stores, right_on='store_nbr',left_on='store_nbr',how='left')
display(train_data.tail(5))

Unnamed: 0,date,store_nbr,item_nbr,family,class,perishable,city,state,type,cluster
1461576,2017-08-15,54,2089339,GROCERY I,1006,0,El Carmen,Manabi,C,3
1461577,2017-08-15,54,2106464,BEVERAGES,1148,0,El Carmen,Manabi,C,3
1461578,2017-08-15,54,2110456,BEVERAGES,1120,0,El Carmen,Manabi,C,3
1461579,2017-08-15,54,2113914,CLEANING,3040,0,El Carmen,Manabi,C,3
1461580,2017-08-15,54,2116416,GROCERY I,1060,0,El Carmen,Manabi,C,3


In [6]:
# holidays_events = pd.read_csv("input/holidays_events.csv")
# train_data= pd.merge(train_data,stores, right_on='date',left_on='date',how='left')
# display(train_data.tail(5))

In [7]:
train_data['day_of_week'] = train_data['date'].dt.weekday_name
features_raw = train_data.drop(['date', 'city', 'state'], axis = 1)
features = pd.get_dummies(features_raw, columns = ['store_nbr', 'item_nbr', 'day_of_week', 'family', 'class', 'type', 'cluster'])
display(features.head(5))

Unnamed: 0,perishable,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,store_nbr_6,store_nbr_7,store_nbr_8,store_nbr_9,...,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
from sklearn.metrics import make_scorer
def nwrmsle(ground_truth, predictions, w):
    nominator = 0
    batch = 50000
    n_batches = int(len(w)//batch)
    for index in range(n_batches):
        nominator += np.sum(w[index*batch:(index+1)*batch]*np.square(np.log1p(predictions[index*batch:(index+1)*batch]) - np.log1p(ground_truth[index*batch:(index+1)*batch])))
    if len(w) < n_batches * batch:
        nominator += np.sum(w[n_batches * batch:]*np.square(np.log1p(predictions[n_batches * batch:]) - np.log1p(ground_truth[n_batches * batch:])))
    return np.sqrt(nominator)/np.sum(w)

score = make_scorer(nwrmsle, greater_is_better=False)


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, unit_sale_log_transformed, test_size = 0.2, random_state = 42)

print("Training set has {} samples with {} variables.".format(*X_train.shape))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 1169264 samples with 4289 variables.
Testing set has 292317 samples.


In [10]:
from time import time
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    results = {}
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = time() - start

    predictions = learner.predict(X_test).reshape(-1, 1)
    result['score'] = score(y_test, predictions, X_test['perishable'])
    return results

In [13]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state = 36, max_depth = 50)
sample_size = 200000
regressor.fit(X_train[:sample_size], y_train[:sample_size])
predictions = list(regressor.predict(X_test))
print()
# result = train_predict(regressor, 5000, X_train, y_train, X_test, y_test)

result = nwrmsle(list(y_test)[:sample_size], predictions[:sample_size], list(X_test['perishable'])[:sample_size])
print("The score is {}".format(result))


The score is 0.39095606062152205
