# M5 Forecasting - Exploratory Data Analysis

This notebook explores the M5 Forecasting dataset to understand:
- Data structure and dimensions
- Temporal patterns
- Product hierarchies
- Missing values and data quality

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the notebook's parent directory importable so that `from src...`
# imports resolve. The path is resolved to an absolute one so the entry stays
# valid if the working directory changes later, and the membership check keeps
# re-runs of this cell from appending duplicate entries.
project_root = Path('..').resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.data.loader import M5DataLoader

# Notebook-wide plotting defaults: seaborn's white-grid style and a 12x6
# default figure size for figures that do not set their own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Point the project loader at the raw data directory and unpack the three
# tables returned by `load_all()` into notebook-level names.
RAW_DATA_DIR = '../data/raw'
data_loader = M5DataLoader(data_path=RAW_DATA_DIR)
calendar, sales, prices = data_loader.load_all()

In [None]:
# Calendar table overview: dimensions, span of the 'date' column, and the
# list of column names.
print("Calendar Data:")
print(f"Shape: {calendar.shape}")

date_column = calendar['date']
first_date, last_date = date_column.min(), date_column.max()
print(f"Date range: {first_date} to {last_date}")

print("\nColumns:", calendar.columns.tolist())

# Kept as the cell's last expression so the notebook renders the first rows.
calendar.head()

In [None]:
# Sales table overview: dimensions, number of distinct `id` values, and the
# number of distinct values at each hierarchy level.
print("Sales Data:")
print(f"Shape: {sales.shape}")
# NOTE(review): this counts distinct `id` values; if `id` identifies an
# item/store pair rather than a product, the "products" label is misleading.
# Confirm against the loader's schema.
print(f"Number of unique products: {sales['id'].nunique()}")

# (printed label, column name) pairs, in the order they are reported.
hierarchy_levels = (
    ('Categories', 'cat_id'),
    ('Departments', 'dept_id'),
    ('Stores', 'store_id'),
    ('States', 'state_id'),
)
print("\nHierarchy:")
for level_label, level_column in hierarchy_levels:
    print(f"  {level_label}: {sales[level_column].nunique()}")

# Kept as the cell's last expression so the notebook renders the first rows.
sales.head()

In [None]:
# Build the base dataset through the loader and report its dimensions.
# Later cells read `df` and expect 'id', 'date' and 'demand' columns in it.
df = data_loader.create_base_dataset()
combined_shape = df.shape
print(f"Combined dataset shape: {combined_shape}")

# Kept as the cell's last expression so the notebook renders the first rows.
df.head()

In [None]:
# Sample time series visualization: one stacked subplot per series for the
# first five distinct ids (in order of first appearance in `df`).
sample_ids = df['id'].unique()[:5]

# `len(...)` rather than truthiness: `sample_ids` is array-like, and the truth
# value of a multi-element array is ambiguous.
if len(sample_ids) == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("No series available to plot.")
else:
    # Filter the full frame once; the per-series selections below then only
    # scan this much smaller subset instead of the whole dataset each time.
    sample_df = df[df['id'].isin(sample_ids)]

    # squeeze=False always returns a 2-D array of Axes, so a single series
    # needs no special-casing; flatten it to one Axes per series.
    fig, axes_grid = plt.subplots(
        len(sample_ids), 1,
        figsize=(15, 3 * len(sample_ids)),
        squeeze=False,
    )
    axes = axes_grid.ravel()

    for ax, sample_id in zip(axes, sample_ids):
        # Sort by date so the line is drawn in chronological order.
        sample_data = sample_df[sample_df['id'] == sample_id].sort_values('date')
        ax.plot(sample_data['date'], sample_data['demand'])
        ax.set_title(f'Demand Over Time: {sample_id}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Demand')
        ax.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Per-category demand summary (mean, std, min, max), printed as a table and
# drawn as a grouped bar chart. Skipped when `df` has no 'cat_id' column.
if 'cat_id' in df.columns:
    summary_functions = ['mean', 'std', 'min', 'max']
    demand_by_category = df.groupby('cat_id')['demand']
    category_stats = demand_by_category.agg(summary_functions)

    print("Demand Statistics by Category:")
    print(category_stats)

    # pandas creates the figure and returns its Axes; label it directly.
    stats_ax = category_stats.plot(kind='bar', figsize=(12, 6))
    stats_ax.set_title('Demand Statistics by Category')
    stats_ax.set_ylabel('Demand')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()