# grocery_sales_forecasting_MovingAverage

This baseline model is non-parametric: the forecast is simply the average of historical sales (from 2013 to 2016, for the same day).

In [1]:
# Import libraries necessary for this project
import os.path
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from IPython.display import display
import matplotlib.pyplot as plt

# Column dtypes handed to pd.read_csv (via `dtype=`) when loading the train and test CSVs.
types = {'id': 'int32', 'item_nbr': 'int32', 'store_nbr': 'int16', 'onpromotion': bool}

%matplotlib inline

## Approach 1: Median of moving averages for each item in each store, using sales from 2017/01/01 to 2017/08/15

In [2]:
# Load only the tail of train.csv: rows 1..121688778 are skipped, row 0 (the header) is kept.
train_data = pd.read_csv('input/train.csv', usecols=[1,2,3,4],
                        parse_dates=['date'], dtype=types,
                        # Non-positive unit_sales values are replaced by 0 while parsing.
                        converters={'unit_sales':lambda u: float(u) if float(u)>0 else 0},
                        skiprows=range(1,121688779))
print("Favorita grocery sales forecasting training data has {} samples with {} features each.".format(*train_data.shape))
# Work in log1p space. Call numpy directly: the `pd.np` alias was removed in pandas 2.0,
# and the vectorised ufunc avoids a Python-level `.apply`.
train_data['unit_sales'] = np.log1p(train_data['unit_sales'])

# Expand to the full (date, store, item) grid so that combinations with no
# recorded row appear explicitly (as NaN, filled with 0 below).
u_dates = train_data.date.unique()
u_stores = train_data.store_nbr.unique()
u_items = train_data.item_nbr.unique()
full_index = pd.MultiIndex.from_product([u_dates, u_stores, u_items],
                                        names=['date', 'store_nbr', 'item_nbr'])
train_data = train_data.set_index(['date', 'store_nbr', 'item_nbr']).reindex(full_index)

del u_dates, u_stores, u_items, full_index

# Assign the filled column back instead of the chained
# `train_data.unit_sales.fillna(0, inplace=True)`, which operates on a temporary
# Series and leaves the frame unchanged under pandas Copy-on-Write.
train_data['unit_sales'] = train_data['unit_sales'].fillna(0)
train_data = train_data.reset_index()
# Take the latest date explicitly rather than trusting that the last row holds it.
lastdate = train_data['date'].max()
print(lastdate)

Favorita grocery sales forecasting training data has 3808262 samples with 4 features each.
2017-08-15 00:00:00


In [3]:
# Load three columns of test.csv (selected by position) and key the frame on
# (item_nbr, store_nbr) so per-pair statistics can be joined onto it.
test_data = pd.read_csv('input/test.csv', dtype=types, usecols=[0,2,3])
test_data = test_data.set_index(['item_nbr', 'store_nbr'])

In [4]:
# Simple moving averages: unweighted mean of unit_sales (in log1p space) over the
# last n days, computed per (item_nbr, store_nbr) and joined onto test_data.
MA_WINDOWS = [1, 2, 3, 4, 5]  # window lengths in days, counted back from lastdate
ma_cols = ['MA' + str(n) for n in MA_WINDOWS]

# Make the cell safe to re-run: remove columns produced by an earlier execution
# of this cell (MA*) or of the forecast cell (unit_sales). Without this, the
# join below raises on overlapping column names.
test_data = test_data.drop(columns=ma_cols + ['unit_sales'], errors='ignore')

for n, col in zip(MA_WINDOWS, ma_cols):
    # Rows strictly after (lastdate - n days), i.e. the n most recent dates.
    recent = train_data[train_data.date > lastdate - pd.Timedelta(days=n)]
    window_mean = recent.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().to_frame(col)
    # Left join keeps every test row; pairs absent from train_data get NaN.
    test_data = test_data.join(window_mean, how='left')
display(test_data.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,id,MA1,MA2,MA3,MA4,MA5
item_nbr,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
96995,1,125497040,0.0,0.0,0.0,0.0,0.0
96995,1,125707694,0.0,0.0,0.0,0.0,0.0
96995,1,125918348,0.0,0.0,0.0,0.0,0.0
96995,1,126129002,0.0,0.0,0.0,0.0,0.0
96995,1,126339656,0.0,0.0,0.0,0.0,0.0


In [5]:
# Forecast = median of the moving-average columns, mapped back from log1p space.
# Select the MA columns by name instead of `iloc[:, 1:]`: by position, a second
# run of this cell would fold the existing `unit_sales` column into the median.
ma_cols = test_data.filter(regex=r'^MA\d+$').columns
log_forecast = test_data[ma_cols].median(axis=1).fillna(0)  # rows with no MA value -> 0
# np.expm1 inverts the log1p applied to the training data (`pd.np` was removed in pandas 2.0).
test_data['unit_sales'] = np.expm1(log_forecast)
display(test_data.head(5))
test_data[['id', 'unit_sales']].to_csv('median_ma5.csv', index=False, float_format='%.2f')

Unnamed: 0_level_0,Unnamed: 1_level_0,id,MA1,MA2,MA3,MA4,MA5,unit_sales
item_nbr,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
96995,1,125497040,0.0,0.0,0.0,0.0,0.0,0.0
96995,1,125707694,0.0,0.0,0.0,0.0,0.0,0.0
96995,1,125918348,0.0,0.0,0.0,0.0,0.0,0.0
96995,1,126129002,0.0,0.0,0.0,0.0,0.0,0.0
96995,1,126339656,0.0,0.0,0.0,0.0,0.0,0.0
