# grocery_sales_forecasting_ml

This baseline model is non-parametric. This model is simply average of history sale (from 2013 to 2016 of the same day.)

In [1]:
# Import libraries necessary for this project
import os.path
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from IPython.display import display
import matplotlib.pyplot as plt
import datetime as dt
from time import time
import sys

types = {'id': 'int32', 'item_nbr': 'int32', 'store_nbr': 'int8', 'onpromotion': bool}

%matplotlib inline

## import data

In [2]:
train_data_raw = pd.read_csv('input/train.csv', usecols = [1,2,3,4,5],
                             parse_dates=['date'], dtype=types, 
                             infer_datetime_format = True,
                             converters={'unit_sales':lambda u: float(u) if float(u)>0 else 0},
                             skiprows=range(1,124035460))  # 124035460:2017/08/01 # 101688779: 2017/01/01 #117477877:2017/06/01
train_data_raw['onpromotion'].fillna(False, inplace = True)
train_data_raw['onpromotion'] = train_data_raw['onpromotion'].map({False : 0, True : 1})
train_data_raw['onpromotion'] = train_data_raw['onpromotion'].astype('int8')
print("Favorita grocery sales forecasting training data has {} samples with {} features each.".format(*train_data_raw.shape))
display(train_data_raw.head(5))

Favorita grocery sales forecasting training data has 1461581 samples with 5 features each.


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2017-08-02,1,96995,1.0,0
1,2017-08-02,1,103520,1.0,0
2,2017-08-02,1,103665,2.0,0
3,2017-08-02,1,105574,8.0,0
4,2017-08-02,1,105575,8.0,0


In [3]:
test_data_raw = pd.read_csv('input/test.csv', usecols=[1,2,3,4],
                            parse_dates=['date'], dtype=types)
test_data_raw['onpromotion'].fillna(False, inplace = True)
test_data_raw['onpromotion'] = test_data_raw['onpromotion'].map({False : 0, True : 1})
test_data_raw['onpromotion'] = test_data_raw['onpromotion'].astype('int8')

print("Favorita grocery sales forecasting testing data has {} samples with {} features each.".format(*test_data_raw.shape))
print(test_data_raw.dtypes)

Favorita grocery sales forecasting testing data has 3370464 samples with 4 features each.
date           datetime64[ns]
store_nbr                int8
item_nbr                int32
onpromotion              int8
dtype: object


In [4]:
uitem_train = set(train_data_raw.item_nbr.unique())
uitem_test = set(test_data_raw.item_nbr.unique())
items = uitem_train.intersection(uitem_test)

There are 19 items appear in training data, but not in testing data; 68 items appear only testing data, not in training data.

In [5]:
train_data = train_data_raw.loc[train_data_raw['item_nbr'].isin(items)].copy()
# train_data.reset_index(inplace=True)
print("There are {} samples after droping unuseful items.".format(train_data.shape[0]))
test_data = test_data_raw.loc[test_data_raw['item_nbr'].isin(items)].copy()
# test_data.reset_index(inplace=True)
print("There are {} testing data after droping unuseful items.".format(test_data.shape[0]))

There are 1460827 samples after droping unuseful items.
There are 3311712 testing data after droping unuseful items.


In [6]:
# from sklearn.preprocessing import OneHotEncoder
def pre_process(feature_raw):

    items = pd.read_csv("input/items.csv", dtype={'perishable': np.dtype('int8')})
    feature_raw= pd.merge(feature_raw,items, right_on='item_nbr',left_on='item_nbr',how='left')
    
    stores = pd.read_csv("input/stores.csv")
    stores.drop(['city', 'state'], axis = 1, inplace=True)
    feature_raw= pd.merge(feature_raw,stores, right_on='store_nbr',left_on='store_nbr',how='left')

    # holidays_events = pd.read_csv("input/holidays_events.csv")
    # train_data= pd.merge(train_data,stores, right_on='date',left_on='date',how='left')
    # display(train_data.tail(5))
    
    feature_raw['day_of_week'] = feature_raw['date'].dt.weekday_name

    if feature_raw.isnull().values.any():
        print("There is NaN before one-hot-encoding!")
    else:
        print("There is no NaN before one-hot-encoding")
    feature_raw.drop(['date'], axis = 1, inplace = True)
    feature_one_hot_coded = pd.get_dummies(feature_raw, columns = ['item_nbr', 'family', 'class', 'store_nbr', 'day_of_week', 'type', 'cluster'])
    if feature_one_hot_coded.isnull().values.any():
        print("There is NaN after one-hot-encoding!")
    else:
        print("There is no NaN after one-hot-encoding")
        
    return feature_one_hot_coded

In [7]:
train_processed = pre_process(train_data_raw)

unit_sale = train_processed['unit_sales'].values.reshape(-1,1)
train_processed.drop('unit_sales', axis = 1, inplace=True)

There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding


Evaluation metric: Normalized Weighted Root Mean Squared Logarithmic Error (NWRMSLE)

In [8]:
from sklearn.metrics import make_scorer
def nwrmsle(ground_truth, predictions, w):
    nominator = 0
    batch = 50000
    n_batches = int(len(w)//batch)
    for index in range(n_batches):
        istart = index*batch; iend = (index+1)*batch
        nominator += np.sum(w[istart:iend]*np.square(np.log1p(predictions[istart:iend]) - np.log1p(ground_truth[istart:iend])))
    if n_batches * batch < len(w):
        istart = n_batches * batch
        nominator += np.sum(w[istart:]*np.square(np.log1p(predictions[istart:]) - np.log1p(ground_truth[istart:])))
    return np.sqrt(nominator)/np.sum(w)

nwrmsle_scorer = make_scorer(score_func=nwrmsle, greater_is_better=False, w = [])


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_processed, unit_sale, test_size = 0.1, random_state = 42)

print("Training set has {} samples with {} variables.".format(*X_train.shape))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 1315422 samples with 4290 variables.
Testing set has 146159 samples.


In [10]:
def train_predict(learner, X_train, y_train, X_test, y_test,  sample_size=X_train.shape[0]):
    results = dict()
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = time() - start
    
    start = time()
    predictions_temp = learner.predict(X_test)
    predictions = [x if x>=0 else 0 for x in predictions_temp]
    results['test_time'] = time() - start
    results['learner'] = learner
    
    w = X_test['perishable'].apply(lambda x: 1.25 if x else 1).values.ravel()
    results['score'] = nwrmsle(y_test.flatten(), predictions, w = w)
    return results

## DecisionTreeRegressor

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor

samples_1 = int(len(y_train)*0.01)
samples_10 = int(len(y_train)*0.1)
samples_100 = len(y_train)

regressor_A = DecisionTreeRegressor(random_state = 36, max_depth = 50)
regressor_B = linear_model.LinearRegression()
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split':2, 'learning_rate': 0.01}
regressor_C = GradientBoostingRegressor(**params)


results = {}
for regressor in [regressor_B]:
    reg_name = regressor.__class__.__name__
    results[reg_name] = {}
    for i, samples in enumerate([samples_100]):
        results[reg_name][i] = train_predict(regressor, X_train, y_train, X_test, y_test, samples)
print(results.items())

model = results['DecisionTreeRegressor'][0]['learner']
filename = model.__class__.__name__ + '_20171111.sav'
pickle.dump(model, open(filename, 'wb'))

dict_items([('DecisionTreeRegressor', {0: {'train_time': 1218.2657670974731, 'test_time': 0.8543581962585449, 'learner': DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=36, splitter='best'), 'score': 0.0018926331758773868}})])


In [12]:
filename = model.__class__.__name__ + '_20171111.sav'
model = pickle.load(open(filename, 'rb'))
predictions = []
batch = 1000000
n_items = test_data.shape[0]
n_batches = int(n_items//batch) if n_items%batch==0 else int(n_items//batch)+1
start = time()
for index in range(n_batches):
    istart = index*batch
    iend = (index+1)*batch if (index+1)*batch < n_items else n_items
    
    test_one_hot_encoded = pre_process(test_data[istart:iend])
    
    test_processed = pd.DataFrame(0, index=range(istart-istart,iend-istart), columns=train_processed.columns, dtype=np.dtype('int8'))
    test_processed[test_one_hot_encoded.columns] = test_one_hot_encoded
    
    prediction_part = model.predict(test_processed)
    predictions += list(prediction_part.ravel())
    print('{} batch is complete within {} seconds.'.format(index, time()-start))
    

There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
False
False
0 batch is complete within 2163.1334824562073 seconds.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
False
False
1 batch is complete within 4341.765029907227 seconds.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
False
False
2 batch is complete within 6532.146698236465 seconds.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
False
False
3 batch is complete within 7117.059661149979 seconds.


In [17]:
display(test_data_raw.head(5))

Unnamed: 0,date,store_nbr,item_nbr,onpromotion
0,2017-08-16,1,96995,0
1,2017-08-16,1,99197,0
2,2017-08-16,1,103501,0
3,2017-08-16,1,103520,0
4,2017-08-16,1,103665,0


In [18]:
test_data_raw = pd.read_csv('input/test.csv', usecols=[0,1,2,3,4],parse_dates=['date'], dtype=types)
test_data['unit_sales'] = [p if p >= 0 else 0 for p in predictions]
test_result= pd.merge(test_data_raw, test_data, left_on=['date','store_nbr','item_nbr', 'onpromotion'],
                      right_on=['date','store_nbr','item_nbr', 'onpromotion'],how='left')
test_result.fillna(0)[['id', 'unit_sales']].to_csv(model.__class__.__name__ +'.csv', index=False, float_format='%.3f') 