In [12]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
#import sweetviz as sv
import HelperFunctions as hf


# Overview

[M5 forecasting challenge](https://www.kaggle.com/c/m5-forecasting-accuracy/data)


## Data Description

- `calendar.csv` - Contains information about the dates on which the products are sold.

- `sales_train_validation.csv` - Contains the historical daily unit sales data per product and store `[d_1 - d_1913]`

- `sell_prices.csv` - Contains information about the price of the products sold per store and date.

- `sample_submission.csv` - The correct format for submissions. Reference the [Evaluation](https://www.kaggle.com/c/m5-forecasting-accuracy/overview/evaluation) tab for more info.

- `sales_train_evaluation.csv` - Includes sales `[d_1 - d_1941]` (labels used for the Public leaderboard)


## Task

- Forecast daily sales for the next 28 days



# 1. Download the Data

In [5]:
%%bash
# Download the M5 competition archive into ./data.
# Presumably requires the Kaggle CLI to be installed and configured with API
# credentials, and the competition rules to be accepted — confirm for your setup.
# --quiet suppresses the download progress bar, which otherwise floods the
# saved notebook output with hundreds of progress fragments.
kaggle competitions download m5-forecasting-accuracy -p data --quiet


Downloading m5-forecasting-accuracy.zip to data



  0%|          | 0.00/45.8M [00:00<?, ?B/s]  2%|▏         | 1.00M/45.8M [00:00<00:13, 3.52MB/s]  4%|▍         | 2.00M/45.8M [00:00<00:12, 3.74MB/s]  7%|▋         | 3.00M/45.8M [00:00<00:13, 3.38MB/s]  9%|▊         | 4.00M/45.8M [00:01<00:12, 3.39MB/s] 11%|█         | 5.00M/45.8M [00:01<00:12, 3.56MB/s] 13%|█▎        | 6.00M/45.8M [00:01<00:10, 4.08MB/s] 15%|█▌        | 7.00M/45.8M [00:01<00:09, 4.51MB/s] 17%|█▋        | 8.00M/45.8M [00:01<00:08, 4.91MB/s] 20%|█▉        | 9.00M/45.8M [00:02<00:07, 5.05MB/s] 22%|██▏       | 10.0M/45.8M [00:02<00:07, 5.22MB/s] 24%|██▍       | 11.0M/45.8M [00:02<00:07, 4.96MB/s] 26%|██▌       | 12.0M/45.8M [00:02<00:07, 5.02MB/s] 28%|██▊       | 13.0M/45.8M [00:02<00:06, 5.44MB/s] 31%|███       | 14.0M/45.8M [00:03<00:06, 5.23MB/s] 33%|███▎      | 15.0M/45.8M [00:03<00:06, 5.31MB/s] 35%|███▍      | 16.0M/45.8M [00:03<00:06, 5.19MB/s] 37%|███▋      | 17.0M/45.8M [00:03<00:05, 5.27MB/s] 39%|███▉      | 18.0M/45.8M [00:04<00:05, 5.08MB/s] 

In [6]:
import os 
import tarfile
import zipfile

def fetch_walmat_data(zip_path=os.path.join('data', 'm5-forecasting-accuracy.zip'),
                      extract_dir='data'):
    """Extract the downloaded M5 archive and print the names of its members.

    Parameters
    ----------
    zip_path : str, optional
        Location of the competition zip file. Defaults to
        ``data/m5-forecasting-accuracy.zip``.
    extract_dir : str, optional
        Directory the archive members are extracted into. Defaults to ``data``.

    Returns
    -------
    None
        The member names are printed, not returned.

    Raises
    ------
    FileNotFoundError
        If ``zip_path`` does not exist (raised by ``zipfile.ZipFile``).
    zipfile.BadZipFile
        If ``zip_path`` is not a valid zip archive.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
        # Collect the member list while the archive is still open instead of
        # querying the handle after the context manager has closed it.
        member_names = zip_ref.namelist()

    print(member_names)

fetch_walmat_data()


['calendar.csv', 'sales_train_evaluation.csv', 'sales_train_validation.csv', 'sample_submission.csv', 'sell_prices.csv']


# 2. Read the Data

In [7]:
def _read_data_csv(file_name, data_dir='data'):
    """Read ``file_name`` from ``data_dir`` into a DataFrame using pandas defaults."""
    return pd.read_csv(os.path.join(data_dir, file_name))


def load_calendar_data(data_dir='data'):
    """Load ``calendar.csv`` from ``data_dir`` (default ``'data'``) as a DataFrame."""
    return _read_data_csv('calendar.csv', data_dir)


def load_sell_price_data(data_dir='data'):
    """Load ``sell_prices.csv`` from ``data_dir`` (default ``'data'``) as a DataFrame."""
    return _read_data_csv('sell_prices.csv', data_dir)


def load_sales_train_validation_data(data_dir='data'):
    """Load ``sales_train_validation.csv`` from ``data_dir`` (default ``'data'``) as a DataFrame."""
    return _read_data_csv('sales_train_validation.csv', data_dir)


def load_sales_train_evaluation_data(data_dir='data'):
    """Load ``sales_train_evaluation.csv`` from ``data_dir`` (default ``'data'``) as a DataFrame."""
    return _read_data_csv('sales_train_evaluation.csv', data_dir)


## 2.1 Read Calendar

In [8]:
# Load the calendar table into memory and preview its first rows.
calendar = load_calendar_data()
calendar.head()


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [9]:
# Report the (rows, columns) shape, then the per-column non-null counts and dtypes.
print(f'shape: {calendar.shape}\n')

# DataFrame.info() prints its summary itself, so no print() wrapper is needed.
calendar.info()

shape: (1969, 14)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
date            1969 non-null object
wm_yr_wk        1969 non-null int64
weekday         1969 non-null object
wday            1969 non-null int64
month           1969 non-null int64
year            1969 non-null int64
d               1969 non-null object
event_name_1    162 non-null object
event_type_1    162 non-null object
event_name_2    5 non-null object
event_type_2    5 non-null object
snap_CA         1969 non-null int64
snap_TX         1969 non-null int64
snap_WI         1969 non-null int64
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


### Sweetviz

In [22]:
# Quick automated EDA of the calendar table with Sweetviz.
# The top-level `import sweetviz as sv` is commented out, so `sv` would be
# undefined on a fresh kernel. Import it here and skip the report when the
# package is unavailable, so Restart & Run All still completes.
try:
    import sweetviz as sv
except ImportError:
    print('sweetviz is not installed; skipping the Calendar.html report.')
else:
    calendar_report = sv.analyze(calendar)

    # Write the report to Calendar.html and display it.
    calendar_report.show_html('Calendar.html')


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=15.0), HTML(value='')), layout=Layout(dis…


Report Calendar.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [10]:
# Read the remaining data files: the sell prices and the evaluation sales history
# (the evaluation file, not the validation one, is the one loaded here).
price_df = load_sell_price_data()
sales_df = load_sales_train_evaluation_data()

In [26]:
# Create the melted dataframe.
# NOTE(review): hf.meltM5 lives in HelperFunctions and is not visible here.
# Presumably `days` and `items` cap the melt to 800 days x 10000 item series,
# which matches the 8,000,000-row shape shown after the join. TODO confirm how
# the days/items are selected.
df = hf.meltM5(sales_df, days = 800, items = 10000)

In [27]:
# Join with calendar and price data.
# dropPriceNA=False is passed to the helper; presumably it keeps rows whose
# sell_price is missing (the later describe() count for sell_price is below the
# row count) — TODO confirm.
# NOTE(review): `df` is both the input and the output, so re-running this cell on
# its own passes the already-joined frame back into hf.joinDataSets. Re-run the
# melt cell first.
df = hf.joinDataSets(df, calendar, price_df, dropPriceNA=False)

Initial inspection of the melted and joined data

In [28]:
# Sanity check on size: (rows, columns) of the joined frame.
df.shape

(8000000, 22)

In [29]:
# Preview the first rows of the joined frame.
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOUSEHOLD_1_032_CA_2_evaluation,HOUSEHOLD_1_032,HOUSEHOLD_1,HOUSEHOLD,CA_2,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
1,HOBBIES_2_123_TX_1_evaluation,HOBBIES_2_123,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
2,HOBBIES_2_099_CA_4_evaluation,HOBBIES_2_099,HOBBIES_2,HOBBIES,CA_4,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
3,HOUSEHOLD_1_493_TX_3_evaluation,HOUSEHOLD_1_493,HOUSEHOLD_1,HOUSEHOLD,TX_3,TX,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
4,FOODS_2_359_CA_1_evaluation,FOODS_2_359,FOODS_2,FOODS,CA_1,CA,d_1,2,2011-01-29,11101,...,1,2011,,,,,0,0,0,2.23


In [30]:
# Summary statistics; by default describe() covers only the numeric columns.
# A `count` lower than the row count for a column indicates missing values in it.
df.describe()

Unnamed: 0,sold,wm_yr_wk,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price
count,8000000.0,8000000.0,8000000.0,8000000.0,8000000.0,8000000.0,8000000.0,8000000.0,4741225.0
mean,1.009291,11188.12,3.99375,6.1825,2011.7,0.33375,0.33125,0.33,4.320895
std,4.302489,62.02339,2.001552,3.481982,0.6726812,0.4715517,0.4706628,0.4702127,3.195661
min,0.0,11101.0,1.0,1.0,2011.0,0.0,0.0,0.0,0.05
25%,0.0,11129.0,2.0,3.0,2011.0,0.0,0.0,0.0,2.08
50%,0.0,11206.0,4.0,6.0,2012.0,0.0,0.0,0.0,3.34
75%,1.0,11234.0,6.0,9.0,2012.0,1.0,1.0,1.0,5.84
max,693.0,11311.0,7.0,12.0,2013.0,1.0,1.0,1.0,30.98


In [31]:
# Column dtypes and the approximate memory footprint of the joined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000000 entries, 0 to 7999999
Data columns (total 22 columns):
id              object
item_id         object
dept_id         object
cat_id          object
store_id        object
state_id        object
d               object
sold            int64
date            object
wm_yr_wk        int64
weekday         object
wday            int64
month           int64
year            int64
event_name_1    object
event_type_1    object
event_name_2    object
event_type_2    object
snap_CA         int64
snap_TX         int64
snap_WI         int64
sell_price      float64
dtypes: float64(1), int64(8), object(13)
memory usage: 1.4+ GB


In [37]:
# Share of rows whose sell_price is missing. This counts rows of the melted
# frame, not distinct products, so the label says "rows".
# Hypothesis (unverified): the missing prices are items not stocked at that time,
# e.g. seasonal products such as xmas trees or brussel sprouts.
missing_price_pct = df['sell_price'].isna().mean() * 100
print("Percentage of rows with no sell price: ", missing_price_pct)

Percentage of products that are not on sale:  40.7346875
