In [21]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv


# Overview

[M5 forecasting challenge](https://www.kaggle.com/c/m5-forecasting-accuracy/data)


## Data Description

- `calendar.csv` - Contains information about the dates on which the products are sold.

- `sales_train_validation.csv` - Contains the historical daily unit sales data per product and store `[d_1 - d_1913]`

- `sell_prices.csv` - Contains information about the price of the products sold per store and date.

- `sample_submission.csv` - The correct format for submissions. Reference the [Evaluation](https://www.kaggle.com/c/m5-forecasting-accuracy/overview/evaluation) tab for more info.

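- `sales_train_evaluation.csv` - Contains the historical daily unit sales data per product and store `[d_1 - d_1941]`; the additional days `[d_1914 - d_1941]` are the labels used for the Public leaderboard.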


## Task

- Forecast the daily unit sales of each product in each store for the next 28 days.



# 1. Download the Data

In [6]:
%%bash

# Download the M5 competition archive into the local `data` directory via the Kaggle CLI.
# NOTE(review): presumably requires the `kaggle` CLI to be installed and API credentials configured — confirm.
kaggle competitions download m5-forecasting-accuracy -p data


Downloading m5-forecasting-accuracy.zip to data



  0%|          | 0.00/45.8M [00:00<?, ?B/s]  4%|▍         | 2.00M/45.8M [00:00<00:02, 20.5MB/s] 11%|█         | 5.00M/45.8M [00:00<00:02, 21.2MB/s] 17%|█▋        | 8.00M/45.8M [00:00<00:01, 23.3MB/s] 24%|██▍       | 11.0M/45.8M [00:00<00:01, 24.7MB/s] 33%|███▎      | 15.0M/45.8M [00:00<00:01, 26.4MB/s] 39%|███▉      | 18.0M/45.8M [00:00<00:01, 27.0MB/s] 46%|████▌     | 21.0M/45.8M [00:00<00:01, 24.8MB/s] 52%|█████▏    | 24.0M/45.8M [00:00<00:00, 25.4MB/s] 59%|█████▉    | 27.0M/45.8M [00:01<00:00, 25.1MB/s] 66%|██████▌   | 30.0M/45.8M [00:01<00:00, 25.1MB/s] 72%|███████▏  | 33.0M/45.8M [00:01<00:00, 26.0MB/s] 79%|███████▊  | 36.0M/45.8M [00:01<00:00, 23.1MB/s] 85%|████████▌ | 39.0M/45.8M [00:01<00:00, 19.9MB/s] 92%|█████████▏| 42.0M/45.8M [00:01<00:00, 18.7MB/s] 98%|█████████▊| 45.0M/45.8M [00:02<00:00, 20.6MB/s]100%|██████████| 45.8M/45.8M [00:02<00:00, 23.3MB/s]


In [13]:
import os 
import tarfile
import zipfile

def fetch_walmat_data(data_dir='data'):
    """Extract the downloaded M5 competition archive and print its contents.

    Opens ``<data_dir>/m5-forecasting-accuracy.zip``, extracts every member
    into ``data_dir`` and prints the list of member names.

    Parameters
    ----------
    data_dir : str, default 'data'
        Directory that holds the archive and receives the extracted files.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist in ``data_dir``.
    """
    zip_path = os.path.join(data_dir, 'm5-forecasting-accuracy.zip')

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
        # List the members while the archive is still open rather than
        # touching the handle after the ``with`` block has closed it.
        print(zip_ref.namelist())

# Unpack the downloaded archive into `data` and print the names of its members.
fetch_walmat_data()


['calendar.csv', 'sales_train_evaluation.csv', 'sales_train_validation.csv', 'sample_submission.csv', 'sell_prices.csv']


# 2. Read the Data

In [14]:
def _load_data_csv(filename):
    """Read ``data/<filename>`` into a pandas DataFrame with default parsing."""
    return pd.read_csv(os.path.join('data', filename))


def load_calendar_data():
    """Load ``data/calendar.csv`` as a DataFrame."""
    return _load_data_csv('calendar.csv')


def load_sell_price_data():
    """Load ``data/sell_prices.csv`` as a DataFrame."""
    return _load_data_csv('sell_prices.csv')


def load_sales_train_validation_data():
    """Load ``data/sales_train_validation.csv`` as a DataFrame."""
    return _load_data_csv('sales_train_validation.csv')


def load_sales_train_evaluation_data():
    """Load ``data/sales_train_evaluation.csv`` as a DataFrame."""
    return _load_data_csv('sales_train_evaluation.csv')


## 2.1 Read Calendar

In [15]:
# Load the calendar table and preview its first rows.
# NOTE(review): the name `calendar` would shadow the stdlib `calendar` module if that were ever imported in this notebook.
calendar = load_calendar_data()
calendar.head()


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [20]:
# Print the (rows, columns) shape of the calendar frame, followed by a blank line.
print(f'shape: {calendar.shape}\n')

# Summarise each column's non-null count and dtype.
calendar.info()

shape: (1969, 14)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


### Sweetviz

In [22]:
# Quick EDA with Sweetviz: build an analysis report for the calendar frame.
calendar_report = sv.analyze(calendar)

# Save the report as Calendar.html (Sweetviz may also try to open it in a web browser).
calendar_report.show_html('Calendar.html')


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=15.0), HTML(value='')), layout=Layout(dis…


Report Calendar.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
