### Objective: Set up demand forecasting pipelines focused on the 87 high-impact third-level categories.
#### Key Steps:
- Filtered dataset to only include relevant categories.
- Calculated total demand coverage.
- Prepared modeling granularity plan to balance performance with scalability.

In [1]:
from datasets import load_dataset

dataset = load_dataset("Dingdong-Inc/FreshRetailNet-50K", split="train")
train_df = dataset.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Calculate top N categories globally
top_n = 20

category_counts = train_df['third_category_id'].value_counts().reset_index()
category_counts.columns = ['third_category_id', 'count']

top_global_categories = category_counts.head(top_n)['third_category_id'].tolist()
print(f"Top {top_n} third-level categories globally: {top_global_categories}")

Top 20 third-level categories globally: [60, 113, 81, 103, 77, 123, 154, 168, 65, 94, 105, 104, 82, 181, 179, 101, 172, 167, 98, 149]


In [3]:
# Step 2: Get each store's top 5 third_category_id
store_category_rank = (
    train_df.groupby(['store_id', 'third_category_id'])
    .size()
    .reset_index(name='count')
)

# Rank categories within each store
store_category_rank['rank'] = store_category_rank.groupby('store_id')['count'].rank(method='first', ascending=False)

# Filter top 5 per store
store_top5_categories = store_category_rank[store_category_rank['rank'] <= 5]

# Store as a dictionary for fast lookup later
store_top5_dict = store_top5_categories.groupby('store_id')['third_category_id'].apply(list).to_dict()

# Preview
for store, cats in list(store_top5_dict.items())[:5]:
    print(f"Store {store}: Top categories -> {cats}")

Store 0: Top categories -> [1, 58, 59, 60, 81]
Store 1: Top categories -> [10, 16, 81, 103, 113]
Store 2: Top categories -> [10, 77, 81, 113, 167]
Store 3: Top categories -> [60, 65, 77, 112, 113]
Store 4: Top categories -> [60, 65, 77, 81, 103]


In [6]:
store_top5_categories

Unnamed: 0,store_id,third_category_id,count,rank
0,0,1,180,3.0
8,0,58,180,4.0
9,0,59,180,5.0
10,0,60,270,1.0
14,0,81,270,2.0
...,...,...,...,...
38712,897,60,270,1.0
38713,897,65,180,2.0
38716,897,81,180,3.0
38722,897,101,180,4.0


In [7]:
unique_top_categories = store_top5_categories['third_category_id'].unique()
print(f"🔹 Total unique third-level categories in top 5s across all stores: {len(unique_top_categories)}")

🔹 Total unique third-level categories in top 5s across all stores: 87


In [8]:
from collections import defaultdict

category_to_stores = defaultdict(list)

for _, row in store_top5_categories.iterrows():
    category_to_stores[row['third_category_id']].append(row['store_id'])

# Optional preview
for cat, stores in list(category_to_stores.items())[:5]:
    print(f"Category {cat} appears in {len(stores)} stores → Sample: {stores[:5]}")

Category 1.0 appears in 175 stores → Sample: [np.float64(0.0), np.float64(10.0), np.float64(12.0), np.float64(50.0), np.float64(52.0)]
Category 58.0 appears in 109 stores → Sample: [np.float64(0.0), np.float64(8.0), np.float64(35.0), np.float64(57.0), np.float64(67.0)]
Category 59.0 appears in 23 stores → Sample: [np.float64(0.0), np.float64(8.0), np.float64(59.0), np.float64(104.0), np.float64(115.0)]
Category 60.0 appears in 673 stores → Sample: [np.float64(0.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0)]
Category 81.0 appears in 377 stores → Sample: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(4.0), np.float64(8.0)]


In [17]:
import numpy as np

# 1. Flatten the hours_sale column to a 1D array and sum
total_demand = np.sum([np.sum(hours) for hours in train_df['hours_sale']])

# 2. Filter for top 87 third-level categories
unique_top_categories = store_top5_categories['third_category_id'].unique()
top_cat_df = train_df[train_df['third_category_id'].isin(unique_top_categories)]

# 3. Flatten and sum demand for top categories
top_cat_demand = np.sum([np.sum(hours) for hours in top_cat_df['hours_sale']])

# 4. Coverage
coverage_percent = (top_cat_demand / total_demand) * 100

# 5. Print results
print(f"🔹 Total demand (all categories): {total_demand:,.0f} units")
print(f"🔹 Demand from top 87 third-level categories: {top_cat_demand:,.0f} units")
print(f"🔹 Coverage of total demand: {coverage_percent:.2f}%")

🔹 Total demand (all categories): 4,493,661 units
🔹 Demand from top 87 third-level categories: 3,734,714 units
🔹 Coverage of total demand: 83.11%
