# grocery_sales_forecasting_storewise

This baseline model is non-parametric: it is simply the average of historical sales on the same day (from 2013 to 2016).

In [1]:
# Import libraries necessary for this project
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from IPython.display import display
import matplotlib.pyplot as plt
import datetime as dt
from time import time
import sys

# Explicit column dtypes handed to pd.read_csv for the per-store train/test files.
types = dict(
    id='int32',
    item_nbr='int32',
    store_nbr='int8',
    onpromotion=bool,
)

%matplotlib inline

## import data

There are 19 items that appear in the training data but not in the testing data; 68 items appear only in the testing data, not in the training data.

In [2]:
# from sklearn.preprocessing import OneHotEncoder
def pre_process(feature_raw):
    """Turn a raw per-store frame into a one-hot encoded feature matrix.

    Steps:
      1. Left-join the item metadata read from ``input/items.csv`` on ``item_nbr``.
      2. Encode ``onpromotion`` as int8 0/1, treating missing values as "not on promotion".
      3. Derive ``day_of_week`` (English day name) from the ``date`` column.
      4. Drop ``date`` and ``store_nbr``.
      5. One-hot encode ``item_nbr``, ``family``, ``class`` and ``day_of_week``.

    A short NaN report is printed before and after the encoding step.

    Parameters
    ----------
    feature_raw : pandas.DataFrame
        Must contain ``date`` (datetime64), ``store_nbr``, ``item_nbr`` and
        ``onpromotion``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded features. ``family`` and ``class`` are expected to come from
        ``items.csv``; the result has a fresh 0..n-1 index because of the merge.
    """
    items = pd.read_csv("input/items.csv", dtype={'perishable': np.dtype('int8')})
    features = pd.merge(feature_raw, items, on='item_nbr', how='left')

    # Assign the result instead of a chained ``fillna(..., inplace=True)``:
    # the chained form is not guaranteed to write back into the frame.
    features['onpromotion'] = (
        features['onpromotion']
        .fillna(False)
        .map({False: 0, True: 1})
        .astype('int8')
    )

    # ``Series.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` yields
    # the same English day names.
    features['day_of_week'] = features['date'].dt.day_name()

    if features.isnull().values.any():
        print("There is NaN before one-hot-encoding!")
    else:
        print("There is no NaN before one-hot-encoding")
    features = features.drop(['date', 'store_nbr'], axis=1)

    feature_one_hot_coded = pd.get_dummies(
        features, columns=['item_nbr', 'family', 'class', 'day_of_week'])

    if feature_one_hot_coded.isnull().values.any():
        print("There is NaN after one-hot-encoding!")
    else:
        print("There is no NaN after one-hot-encoding")

    return feature_one_hot_coded

In [3]:
def load_data(store):
    """Load one store's training data and return model-ready features.

    Reads ``input_each_store/train_store_<store>.csv`` and the matching
    ``test_store_<store>.csv``, keeps only the training rows whose item also
    occurs in that store's test file, log1p-transforms the target and runs the
    remaining columns through ``pre_process``.

    Parameters
    ----------
    store : int or str
        Store identifier used to build both file names.

    Returns
    -------
    list
        ``[train_processed, unit_sales, items_trained]`` where
        ``train_processed`` is the encoded feature frame, ``unit_sales`` is a
        NumPy array of ``log1p`` targets and ``items_trained`` is the set of
        item numbers present in both files.
    """
    data_dir = 'input_each_store/'
    store_tag = str(store)

    # Training file: negative or zero sales are clamped to 0 while parsing.
    train_path = data_dir + ('train_store_' + store_tag + '.csv')
    train_data_raw = pd.read_csv(
        train_path,
        usecols=[1, 2, 3, 4, 5],
        parse_dates=['date'],
        dtype=types,
        infer_datetime_format=True,
        converters={'unit_sales': lambda u: float(u) if float(u) > 0 else 0},
    )
    print("Raw training data has {} samples with {} features each.".format(*train_data_raw.shape))

    # Test file: only needed here to learn which items must be predicted.
    test_path = data_dir + ('test_store_' + store_tag + '.csv')
    test_data_raw = pd.read_csv(test_path, usecols=[1, 2, 3, 4], parse_dates=['date'], dtype=types)
    print("Public testing data has {} samples with {} features each.".format(*test_data_raw.shape))

    # Items present in both the training and the test file.
    items_trained = set(train_data_raw.item_nbr.unique()) & set(test_data_raw.item_nbr.unique())

    keep_rows = train_data_raw['item_nbr'].isin(items_trained)
    train_data = train_data_raw.loc[keep_rows].copy()
    print("Training data has {} samples after droping unuseful items.".format(train_data.shape[0]))

    # pop() returns the target column and removes it from the feature frame.
    unit_sales = train_data.pop('unit_sales').apply(np.log1p).values
    train_processed = pre_process(train_data)

    return [train_processed, unit_sales, items_trained]

Evaluation metric: Normalized Weighted Root Mean Squared Logarithmic Error (NWRMSLE)

In [4]:
from sklearn import metrics
def nwrmsle(ground_truth, predictions, w):
    """Weighted root-mean-squared error between two target vectors.

    The square root of ``metrics.mean_squared_error`` with ``w`` as
    ``sample_weight``. Callers in this notebook pass log1p-scaled targets and
    predictions, which is what makes the result a log-error.

    Parameters
    ----------
    ground_truth : array-like
        Reference target values.
    predictions : array-like
        Predicted target values, same length as ``ground_truth``.
    w : array-like
        Per-sample weights.

    Returns
    -------
    float
        The weighted RMSE.
    """
    weighted_mse = metrics.mean_squared_error(ground_truth, predictions, sample_weight=w)
    return weighted_mse ** 0.5
# metrics.nwrmsle_scorer = make_scorer(score_func=nwrmsle, greater_is_better=False, w = [])


In [5]:
def train_predict(learner, X_train, y_train, X_test, y_test):
    """Fit ``learner`` on the training split and score it on the held-out split.

    Parameters
    ----------
    learner : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and targets.
    X_test : pandas.DataFrame
        Held-out features; must contain a ``perishable`` column, which drives
        the sample weights (1.25 where truthy, 1 otherwise).
    y_test : array-like
        Held-out targets. NumPy arrays, pandas Series and lists are accepted.

    Returns
    -------
    dict
        ``train_time`` and ``test_time`` in seconds, the fitted ``learner`` and
        the weighted RMSE ``score`` computed by ``nwrmsle``.
    """
    results = dict()

    start = time()
    learner.fit(X_train, y_train)
    results['train_time'] = time() - start

    start = time()
    raw_predictions = np.asarray(learner.predict(X_test))
    # Clamp negatives to 0; NaN fails the comparison and is also mapped to 0,
    # matching the element-wise ``x if x >= 0 else 0`` rule.
    predictions = np.where(raw_predictions >= 0, raw_predictions, 0)
    results['test_time'] = time() - start
    results['learner'] = learner

    w = np.where(X_test['perishable'].astype(bool).values, 1.25, 1.0)
    # np.ravel works for Series and lists too, unlike ndarray.flatten().
    results['score'] = nwrmsle(np.ravel(y_test), predictions, w=w)
    return results

## DecisionTreeRegressor

In [6]:
def predict_submission(model, columns, store, items_trained):
    """Predict unit sales for one store's test file.

    Reads ``input_each_store/test_store_<store>.csv``, predicts the rows whose
    item is in ``items_trained`` and left-joins those predictions back onto the
    full test frame, so rows for other items end up with a missing
    ``unit_sales``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. The notebook trains it on ``log1p(unit_sales)``,
        so its output is mapped back with ``expm1`` before being stored.
    columns : sequence of str
        Feature columns, in order, that ``model`` was trained on.
    store : int or str
        Store identifier used to build the file name.
    items_trained : collection
        Item numbers the model has seen during training.

    Returns
    -------
    pandas.DataFrame
        The raw test rows (including ``id``) merged with the predicted
        ``unit_sales`` on ``date``, ``item_nbr`` and ``onpromotion``.
    """
    filename_test = 'test_store_' + str(store) + '.csv'
    test_data_raw = pd.read_csv('input_each_store/' + filename_test, usecols=[0,1,2,3,4],
                                parse_dates=['date'], dtype=types)

    known_item = test_data_raw['item_nbr'].isin(items_trained)
    test_data = test_data_raw.loc[known_item].copy()
    test_data = test_data.drop('id', axis=1)

    if test_data.empty:
        # Nothing to predict; most estimators reject a zero-row input.
        test_data['unit_sales'] = np.array([], dtype=float)
    else:
        test_one_hot_encoded = pre_process(test_data)

        # Start from an all-zero matrix with exactly the training columns and
        # copy over only the columns the model knows. Copying every test column
        # would append unseen ones and change the feature layout.
        test_processed = pd.DataFrame(0, index=range(test_data.shape[0]),
                                      columns=columns, dtype=np.dtype('int8'))
        known_columns = set(columns)
        shared = [c for c in test_one_hot_encoded.columns if c in known_columns]
        test_processed[shared] = test_one_hot_encoded[shared]

        predictions = np.asarray(model.predict(test_processed), dtype=float)
        # Negative (or NaN) predictions are clamped to 0 in log space ...
        predictions = np.where(predictions >= 0, predictions, 0.0)
        # ... then mapped back from log1p space to real unit sales.
        test_data['unit_sales'] = np.expm1(predictions)

    test_store = pd.merge(test_data_raw, test_data,
                          left_on=['date', 'item_nbr', 'onpromotion'],
                          right_on=['date', 'item_nbr', 'onpromotion'], how='left')
    return test_store

In [None]:
# prediction_store = predict_submission(results[store]['learner'], train_processed.columns.values, store, items_trained)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import clone

# Per-store training results, keyed by store number.
results = {}
store_list = range(1,55)

# Candidate models; only the one bound to `regressor` is trained.
regressor_A = DecisionTreeRegressor(random_state = 36, max_depth = 50)
regressor_B = linear_model.LinearRegression()
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split':2, 'learning_rate': 0.01}
regressor_C = GradientBoostingRegressor(**params)

regressor = regressor_A
reg_name = regressor.__class__.__name__

# Output files are named after the model class and today's date.
run_date = dt.datetime.today().strftime("%Y-%m-%d")
file_submission_temp = reg_name + run_date + '_temp.csv'
file_submission = reg_name + run_date + '.csv'

# Collect per-store frames and concatenate once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
store_predictions = []

for position, store in enumerate(store_list):
    print("Store {}:".format(store))
    train_processed, unit_sales, items_trained = load_data(store)
    print("Processed training set has {} samples with {} variables.".format(*train_processed.shape))
    X_train, X_test, y_train, y_test = train_test_split(train_processed, unit_sales, test_size = 0.1, random_state = 42)

    # Fit a fresh, unfitted copy per store. Reusing one estimator object would
    # make every results[store]['learner'] point at the model of the last store.
    results[store] = train_predict(clone(regressor), X_train, y_train, X_test, y_test)
    prediction_store = predict_submission(results[store]['learner'], train_processed.columns.values, store, items_trained)
    store_predictions.append(prediction_store[['id', 'unit_sales']])

    # The first store starts the temp file from scratch (with header), so
    # re-running this cell does not append to a stale file from an earlier run.
    is_first_store = position == 0
    prediction_store.to_csv(file_submission_temp,
                            mode='w' if is_first_store else 'a',
                            header=is_first_store,
                            index=False, float_format='%.3f')

# Rows for items never seen in training have no prediction; submit 0 for them.
predictions = pd.concat(store_predictions, axis = 0, ignore_index=True).fillna(0)
predictions.set_index('id').to_csv(file_submission)

Store 1:
Raw training data has 1702008 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 1639533 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Processed training set has 1639533 samples with 3717 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 2:
Raw training data has 1991925 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 1919599 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is NaN after one-hot-encoding!
Processed training set has 1919599 samples with 3743 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 3:
Raw training data has 2289721 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 2201865 samples after dro

There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 21:
Raw training data has 1373419 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 1326405 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Processed training set has 1326405 samples with 3300 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 22:
Raw training data has 923483 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 895107 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Processed training set has 895107 samples with 2828 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 23:
Raw training data has 1556095 samples with 5 features each.
Public testing data ha

There is no NaN after one-hot-encoding
Processed training set has 1393792 samples with 2845 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 41:
Raw training data has 1700624 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 1638204 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is NaN after one-hot-encoding!
Processed training set has 1638204 samples with 3695 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Store 42:
Raw training data has 1461683 samples with 5 features each.
Public testing data has 62416 samples with 4 features each.
Training data has 1405391 samples after droping unuseful items.
There is no NaN before one-hot-encoding
There is no NaN after one-hot-encoding
Processed training set has 1405391 samples with 3742 variables.
There is no NaN before one-hot-encoding
There is no NaN after one-h