# grocery_sales_forecasting_ml

This baseline model is non-parametric. This model is simply average of history sale (from 2013 to 2016 of the same day.)

In [1]:
# Import libraries necessary for this project
import os.path
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from IPython.display import display
import matplotlib.pyplot as plt
import datetime as dt
from time import time
import sys

types = {'id': 'int32', 'item_nbr': 'int32', 'store_nbr': np.dtype('int8')}

%matplotlib inline

## import data

In [3]:
train_data_raw = pd.read_csv('input/train.csv', usecols=[0,1,2,3,4,5], index_col=0,
                             parse_dates=['date'], dtype=types, 
                             converters={'unit_sales':lambda u: float(u) if float(u)>0 else 0},
                             skiprows=range(1,124035460))  # 124035460:2017/08/01 # 101688779: 2017/01/01 #117477877:2017/06/01
train_data_raw['onpromotion'] = [1 if p  else 0 for p in train_data_raw['onpromotion']]
print("Favorita grocery sales forecasting training data has {} samples with {} features each.".format(*train_data_raw.shape))
display(train_data_raw.head(5))

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


Favorita grocery sales forecasting training data has 1461722 samples with 5 features each.


Unnamed: 0_level_0,date,store_nbr,item_nbr,unit_sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4716030,2013-04-24,46,749421,5.0,1
7467838,2013-06-26,1,888630,2.0,1
7861246,2013-07-04,38,627887,1.0,1
8777150,2013-07-25,12,357961,2.0,1
8908286,2013-07-28,8,793345,2.0,1


In [None]:
test_data_raw = pd.read_csv('input/test.csv', usecols=[0, 1,2,3,4], index_col=0,
                            parse_dates=['date'], dtype=types)
test_data_raw['onpromotion'] = [1 if p  else 0 for p in test_data_raw['onpromotion']]

print("Favorita grocery sales forecasting testing data has {} samples with {} features each.".format(*test_data_raw.shape))
print(test_data_raw.dtypes)

In [None]:
uitem_train = set(train_data_raw.item_nbr.unique())
uitem_test = set(test_data_raw.item_nbr.unique())
items = uitem_train.intersection(uitem_test)

There are 19 items appear in training data, but not in testing data; 68 items appear only testing data, not in training data.

In [None]:
train_data = train_data_raw.loc[train_data_raw['item_nbr'].isin(items)].copy()
train_data.reset_index(inplace=True)
print("There are {} samples after droping unuseful items.".format(train_data.shape[0]))
test_data = test_data_raw.loc[test_data_raw['item_nbr'].isin(items)].copy()
test_data.reset_index(inplace=True)
print("There are {} testing data after droping unuseful items.".format(test_data.shape[0]))

In [None]:
# from sklearn.preprocessing import OneHotEncoder
def pre_process(feature_raw):

    items = pd.read_csv("input/items.csv", dtype={'perishable': np.dtype('int8')})
    feature_raw= pd.merge(feature_raw,items, right_on='item_nbr',left_on='item_nbr',how='left')
    
    stores = pd.read_csv("input/stores.csv")
    stores.drop(['city', 'state'], axis = 1)
    feature_raw= pd.merge(feature_raw,stores, right_on='store_nbr',left_on='store_nbr',how='left')

    # holidays_events = pd.read_csv("input/holidays_events.csv")
    # train_data= pd.merge(train_data,stores, right_on='date',left_on='date',how='left')
    # display(train_data.tail(5))
    
    feature_raw['day_of_week'] = feature_raw['date'].dt.weekday_name

    if feature_raw.isnull().values.any():
        print("There is NaN before one-hot-encoding!")
    else:
        print("There is no NaN before one-hot-encoding")
    feature_one_hot_coded = pd.get_dummies(feature_raw, columns = ['item_nbr'])
    if feature_one_hot_coded.isnull().values.any():
        print("There is NaN after one-hot-encoding!")
    else:
        print("There is no NaN after one-hot-encoding")
    
#     enc = OneHotEncoder()
#     feature_one_hot_coded_sklearn = enc.fit_transform(feature_raw)
#     if feature_one_hot_coded_sklearn.isnull().values.any():
#         print("There is NaN after one-hot-encoding!")
#     else:
#         print("There is no NaN after one-hot-encoding")
        
    return feature_one_hot_coded

In [None]:
# train_data_raw = pd.read_csv('input/train.csv', skiprows=range(1,125035437)) #124899960
# print(sys.getsizeof(train_data_raw))
print(train_data_raw.shape)
display(train_data_raw.head(5))

train_processed = pre_process(train_data_raw)

unit_sale = train_processed['unit_sales'].values.reshape(-1,1)
train_processed = train_processed.drop('unit_sales', axis = 1)

Evaluation metric: Normalized Weighted Root Mean Squared Logarithmic Error (NWRMSLE)

no           yes
125035430    125035460
125035430    125035445
125035434    125035438
125035437

In [None]:
from sklearn.metrics import make_scorer
def nwrmsle(ground_truth, predictions, w):
    nominator = 0
    batch = 50000
    n_batches = int(len(w)//batch)
    for index in range(n_batches):
        istart = index*batch; iend = (index+1)*batch
        nominator += np.sum(w[istart:iend]*np.square(np.log1p(predictions[istart:iend]) - np.log1p(ground_truth[istart:iend])))
    if n_batches * batch < len(w):
        istart = n_batches * batch
        nominator += np.sum(w[istart:]*np.square(np.log1p(predictions[istart:]) - np.log1p(ground_truth[istart:])))
    return np.sqrt(nominator)/np.sum(w)

nwrmsle_scorer = make_scorer(score_func=nwrmsle, greater_is_better=False, w = [])


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_processed, unit_sale, test_size = 0.1, random_state = 42)

print("Training set has {} samples with {} variables.".format(*X_train.shape))
print("Testing set has {} samples.".format(X_test.shape[0]))

In [None]:
def train_predict(learner, X_train, y_train, X_test, y_test,  sample_size=X_train.shape[0]):
    results = dict()
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = time() - start
    
    start = time()
    predictions_temp = learner.predict(X_test)
    predictions = [x if x>=0 else 0 for x in predictions_temp]
    results['test_time'] = time() - start
    results['learner'] = learner
    
    w = X_test['perishable'].apply(lambda x: 1.25 if x else 1).values.ravel()
    results['score'] = nwrmsle(y_test.flatten(), predictions, w = w)
    return results

## DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor

samples_1 = int(len(y_train)*0.01)
samples_10 = int(len(y_train)*0.1)
samples_100 = len(y_train)

regressor_A = DecisionTreeRegressor(random_state = 36, max_depth = 50)
regressor_B = linear_model.LinearRegression()
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split':2, 'learning_rage': 0.01}
regressor_C = GradientBoostingRegressor(**params)


results = {}
for regressor in [regressor_A]:
    reg_name = regressor.__class__.__name__
    results[reg_name] = {}
    for i, samples in enumerate([samples_100]):
        results[reg_name][i] = train_predict(regressor, X_train, y_train, X_test, y_test, samples)
print(results.items())

model = results['DecisionTreeRegressor'][0]['learner']
filename = 'DecisionTreeRegressor.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
filename = 'DecisionTreeRegressor.sav'
model = pickle.load(open(filename, 'rb'))
predictions = []
batch = 1000000
n_items = test_data.shape[0]
n_batches = int(n_items//batch) if n_items%batch==0 else int(n_items//batch)+1
start = time()
for index in range(n_batches):
    istart = index*batch
    iend = (index+1)*batch if (index+1)*batch < n_items else n_items
    
    test_one_hot_encoded = pre_process(test_data[istart:iend])
    print(test_one_hot_encoded.isnull().values.any())
    
    test_processed = pd.DataFrame(0, index=range(istart-istart,iend-istart), columns=train_processed.columns, dtype=np.dtype('int8'))
    test_processed[test_one_hot_encoded.columns] = test_one_hot_encoded
    print(test_processed.isnull().values.any())
    
    prediction_part = model.predict(test_processed)
    predictions += list(prediction_part.ravel())
    print('{} batch is complete within {} seconds.'.format(index, time()-start))
    

In [None]:
test_data['unit_sales'] = predictions
test_result= pd.merge(test_data_raw, test_data, left_on=['date','store_nbr','item_nbr', 'onpromotion'],
                      right_on=['date','store_nbr','item_nbr', 'onpromotion'],how='left')
test_result.fillna(0)[['id', 'unit_sales']].to_csv(model.__class__.__name__ +'.csv', index=False, float_format='%.3f') 