# RG-Forecasting: 24-Week Retail Demand Forecast

**Store-SKU Level Daily Predictions | 33 Stores | ~3,650 SKUs | 168-Day Horizon**

---

## Executive Summary

| Metric | Value |
|--------|-------|
| Data Period | 2019 - Dec 17, 2025 |
| Forecast Horizon | Dec 18, 2025 - Jun 3, 2026 (168 days) |
| Series Forecasted | ~114,000 store-SKU combinations |
| Model | Two-Stage LightGBM (Classifier + Log-Regressor) |
| Weekly Store Accuracy | ~80-88% WFA |

---

## Table of Contents

1. [Configuration](#1-configuration)
2. [Setup & Imports](#2-setup)
3. [Data Loading & Validation](#3-data-loading)
4. [Exploratory Data Analysis](#4-eda)
5. [Panel Construction (Spine)](#5-panel)
6. [Data Cleaning](#6-cleaning)
7. [Feature Engineering](#7-features)
8. [ABC Segmentation](#8-segmentation)
9. [Model Training](#9-training)
10. [Forecast Generation](#10-forecast)
11. [Evaluation & Sanity Checks](#11-evaluation)
12. [Output & Submission Checklist](#12-output)

---
## 1. Configuration <a name="1-configuration"></a>

In [None]:
# === CONFIGURATION ===
# Every tunable input of the notebook is defined in this cell.
# Adjust the three paths below to match your environment.

# --- File locations ---
DATA_PATH = "../final_data 2.csv"            # input: raw sales transactions
SKU_ATTR_PATH = "../sku_list_attribute.csv"  # input: SKU attributes (local/import)
OUTPUT_PATH = "outputs/forecast_168day.csv"  # output: forecast file

# --- Dates ---
CUTOFF_DATE = "2025-12-17"     # last day included in training data
FORECAST_START = "2025-12-18"  # first day that is forecast
HORIZON_DAYS = 168             # 24 weeks x 7 days

# --- Reproducibility ---
RANDOM_SEED = 42

# --- Performance ---
SAMPLE_MODE = False  # True = quick test run on a subset of SKUs
SAMPLE_FRAC = 0.1    # share of SKUs kept when SAMPLE_MODE is on

# Echo the active settings so a saved run records what it used.
print(
    "Configuration:\n"
    f"  Data: {DATA_PATH}\n"
    f"  Cutoff: {CUTOFF_DATE}\n"
    f"  Forecast: {FORECAST_START} + {HORIZON_DAYS} days\n"
    f"  Sample mode: {SAMPLE_MODE}"
)

---
## 2. Setup & Imports <a name="2-setup"></a>

In [None]:
import gc
import os
import sys
import warnings
from datetime import datetime, timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide display and reproducibility settings.
warnings.filterwarnings('ignore')  # NOTE: silences every warning for the whole session
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)
np.random.seed(RANDOM_SEED)  # seeds NumPy's global random state

print(f"Notebook started: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
print(f"\nEnvironment:")
# Read the interpreter version from the standard `sys` module; `pd.sys` only
# works when pandas happens to expose `sys` in its namespace, which is not a
# documented part of its API.
print(f"  Python: {sys.version.split()[0]}")
print(f"  Pandas: {pd.__version__}")
print(f"  NumPy: {np.__version__}")
print(f"  LightGBM: {lgb.__version__}")

---
## 3. Data Loading & Validation <a name="3-data-loading"></a>

In [None]:
# Load the raw sales file, normalize its column names, and validate the schema.
print("Loading raw sales data...")
df_raw = pd.read_csv(DATA_PATH)
print(f"  Loaded {len(df_raw):,} rows")

# Standardize column names (lower-case, no surrounding whitespace) and accept
# 'item_id' as an alias for 'sku_id'.
df_raw.columns = df_raw.columns.str.lower().str.strip()
if 'item_id' in df_raw.columns:
    df_raw = df_raw.rename(columns={'item_id': 'sku_id'})

print(f"\nSchema validation:")
print(f"  Columns: {list(df_raw.columns)}")
# Validate BEFORE touching any column so a missing 'date' produces the clear
# message below rather than a bare KeyError. An explicit raise is used because
# `assert` statements are removed when Python runs with -O.
missing_cols = [c for c in ('sku_id', 'store_id', 'date', 'sales') if c not in df_raw.columns]
if missing_cols:
    raise ValueError(f"Missing required column(s): {', '.join(missing_cols)}")
print("  ‚úì All required columns present")

# Parse date
df_raw['date'] = pd.to_datetime(df_raw['date'])

In [None]:
# Load SKU attributes (Local vs Import) and derive the binary `is_local` flag.
print("Loading SKU attributes...")
sku_attr = pd.read_csv(SKU_ATTR_PATH)
sku_attr.columns = sku_attr.columns.str.lower().str.strip()
if 'item_id' in sku_attr.columns:
    sku_attr = sku_attr.rename(columns={'item_id': 'sku_id'})
if 'sku_id' not in sku_attr.columns:
    raise ValueError("SKU attribute file has no 'sku_id' (or 'item_id') column")

# The attribute column is located by name: the first column whose (already
# lower-cased) name contains 'attribute' or 'local'.
attr_candidates = [c for c in sku_attr.columns if 'attribute' in c or 'local' in c]
if not attr_candidates:
    raise ValueError(
        "SKU attribute file has no column containing 'attribute' or 'local'; "
        f"found columns: {list(sku_attr.columns)}"
    )
attr_col = attr_candidates[0]

# Create is_local flag: 1 for the codes L / LI / LOCAL (case-insensitive,
# surrounding whitespace ignored), 0 for everything else including blanks.
local_codes = ['L', 'LI', 'LOCAL']
sku_attr['is_local'] = (
    sku_attr[attr_col].astype(str).str.strip().str.upper().isin(local_codes).astype(int)
)
sku_attr['sku_id'] = sku_attr['sku_id'].astype(str)

print(f"  Loaded {len(sku_attr):,} SKU attributes")
print(f"  Local: {sku_attr['is_local'].sum():,}, Import: {(1 - sku_attr['is_local']).sum():,}")

In [None]:
# Data summary
# Headline statistics of the raw sales table, printed as one report.
summary_lines = [
    "=" * 60,
    "DATA SUMMARY",
    "=" * 60,
    f"Date range: {df_raw['date'].min().date()} to {df_raw['date'].max().date()}",
    f"Unique stores: {df_raw['store_id'].nunique()}",
    f"Unique SKUs: {df_raw['sku_id'].nunique()}",
    f"Total transactions: {len(df_raw):,}",
    f"Total sales volume: {df_raw['sales'].sum():,.0f} units",
]
print("\n".join(summary_lines))

In [None]:
# Sample mode: reduce to subset of SKUs for faster testing
if SAMPLE_MODE:
    print(f"\n‚ö†Ô∏è SAMPLE MODE ENABLED: Using {SAMPLE_FRAC*100:.0f}% of SKUs")
    all_skus = df_raw['sku_id'].unique()
    # Keep at least one SKU (when any exist) so a tiny SAMPLE_FRAC cannot
    # produce an empty dataset.
    n_sample = min(len(all_skus), max(1, int(len(all_skus) * SAMPLE_FRAC)))
    # A generator seeded here makes the sample depend only on RANDOM_SEED,
    # not on how many random draws earlier cells (or re-runs) consumed.
    sample_rng = np.random.default_rng(RANDOM_SEED)
    sample_skus = sample_rng.choice(all_skus, size=n_sample, replace=False)
    # .copy() so later cells can add columns to df_raw without writing into
    # a filtered view of the original frame.
    df_raw = df_raw[df_raw['sku_id'].isin(sample_skus)].copy()
    print(f"  Reduced to {len(df_raw):,} rows, {len(sample_skus):,} SKUs")

---
## 4. Exploratory Data Analysis <a name="4-eda"></a>

In [None]:
# THE KEY CHALLENGE: Sparsity
# The raw table only has rows for days WITH sales, so a store-SKU-day that is
# absent means "sold nothing", not "unknown".

# Size of the full store x SKU x day grid versus the rows actually present.
n_stores, n_skus = (df_raw[key].nunique() for key in ('store_id', 'sku_id'))
first_seen, last_seen = df_raw['date'].min(), df_raw['date'].max()
n_days = (last_seen - first_seen).days + 1  # inclusive day count
theoretical_rows = n_stores * n_skus * n_days
actual_rows = len(df_raw)
density = actual_rows / theoretical_rows * 100

rule = "=" * 60
print(rule)
print("SPARSITY ANALYSIS")
print(rule)
print(f"Theoretical panel size: {n_stores} stores √ó {n_skus:,} SKUs √ó {n_days:,} days")
print(f"                      = {theoretical_rows:,} rows")
print(f"Actual transactions:    {actual_rows:,} rows")
print(f"Data density:           {density:.1f}%")
print(f"\nüî¥ ZERO RATE: ~{100-density:.0f}% of store-SKU-days have zero sales")
print("   This drives our two-stage modeling approach.")

In [None]:
# Sales distribution
# Left panel: histogram of sales values. Right panel: selected percentiles.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, pct_ax = axes[0], axes[1]
sales_values = df_raw['sales']

# Histogram; values above 50 are folded into the last bin for readability.
sales_values.clip(upper=50).hist(bins=50, ax=hist_ax, color='steelblue', edgecolor='white')
hist_ax.set_title('Daily Sales Distribution (clipped at 50)')
hist_ax.set_xlabel('Daily Sales (units)')
hist_ax.set_ylabel('Frequency')

# Percentile bars, each annotated with its value just above the bar.
percentiles = [50, 75, 90, 95, 99, 100]
pct_values = sales_values.quantile([p / 100 for p in percentiles]).tolist()
pct_labels = [str(p) for p in percentiles]
pct_ax.bar(pct_labels, pct_values, color='coral', edgecolor='white')
pct_ax.set_title('Sales Percentiles')
pct_ax.set_xlabel('Percentile')
pct_ax.set_ylabel('Sales (units)')
for i, v in enumerate(pct_values):
    pct_ax.text(i, v + 1, f'{v:.0f}', ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Day-of-week and monthly patterns
# Adds two helper columns to df_raw: 'dow' (from dt.dayofweek) and 'month'.
df_raw['dow'] = df_raw['date'].dt.dayofweek
df_raw['month'] = df_raw['date'].dt.month

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Day of week
dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Reindex to all 7 weekdays so each bar lines up with its label even when a
# weekday never occurs in the data (that bar is NaN, i.e. not drawn).
dow_sales = df_raw.groupby('dow')['sales'].mean().reindex(range(len(dow_names)))
axes[0].bar(dow_names, dow_sales.values, color='steelblue', edgecolor='white')
axes[0].set_title('Average Sales by Day of Week')
axes[0].set_ylabel('Avg Sales (units)')

# Monthly
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Same alignment guard for months 1-12.
monthly_sales = df_raw.groupby('month')['sales'].mean().reindex(range(1, 13))
colors = ['coral' if m == 12 else 'steelblue' for m in range(1, 13)]
axes[1].bar(month_names, monthly_sales.values, color=colors, edgecolor='white')
axes[1].set_title('Average Sales by Month')
axes[1].set_ylabel('Avg Sales (units)')

plt.tight_layout()
plt.show()

# December versus the mean of January-November, looked up by month LABEL so
# the result cannot silently refer to a different month when some are missing.
december_lift = (monthly_sales.loc[12] / monthly_sales.loc[1:11].mean() - 1) * 100

print("Key patterns:")
print(f"  - Weekend effect: Sun/Mon show distinct patterns")
if pd.notna(december_lift):
    print(f"  - December lift: {december_lift:.0f}% above average")
else:
    print("  - December lift: n/a (December or comparison months missing from data)")

---
## 5. Panel Construction (Spine) <a name="5-panel"></a>

Create a complete store-SKU-date grid and fill missing dates with zero sales.

In [None]:
print("Building complete panel (spine)...")
print("  This fills missing dates with zero sales.")

# Distinct identifiers present in the raw data.
stores, skus = df_raw['store_id'].unique(), df_raw['sku_id'].unique()

# Training calendar: every day from the first observed date up to the cutoff.
min_date = df_raw['date'].min()
cutoff = pd.to_datetime(CUTOFF_DATE)
date_range = pd.date_range(start=min_date, end=cutoff, freq='D')

full_grid_rows = len(stores) * len(skus) * len(date_range)
print(f"  Stores: {len(stores)}")
print(f"  SKUs: {len(skus):,}")
print(f"  Dates: {len(date_range):,} ({min_date.date()} to {cutoff.date()})")
print(f"  Expected panel size: {full_grid_rows:,} rows")

In [None]:
# Build the spine: one row per (store, SKU, day) for every observed series.
print("\nGenerating spine...")

# A "series" is a store-SKU pair that appears at least once in the raw data.
series = df_raw.loc[:, ['store_id', 'sku_id']].drop_duplicates()
print(f"  Unique series: {len(series):,}")

# Cross join series x dates by merging on a constant helper key.
series = series.assign(_key=1)
dates_df = pd.DataFrame({'date': date_range}).assign(_key=1)
spine = pd.merge(series, dates_df, on='_key').drop(columns='_key')

print(f"  Spine size: {len(spine):,} rows")

In [None]:
# Merge sales onto spine (missing = 0)
print("\nMerging sales onto spine...")

# Prepare sales data (up to cutoff), collapsed to ONE row per store-SKU-day.
# If the raw file holds several rows for the same store, SKU and date, merging
# them unaggregated would duplicate spine rows, and every later lag/rolling
# feature (which shifts by rows) would no longer be shifting by days.
series_day = ['store_id', 'sku_id', 'date']
df_train = (
    df_raw.loc[df_raw['date'] <= cutoff, series_day + ['sales']]
    .groupby(series_day, as_index=False, sort=False)['sales']
    .sum()
    .rename(columns={'sales': 'y'})
)

# Merge; validate guards the one-row-per-series-day invariant on both sides.
panel = spine.merge(df_train, on=series_day, how='left', validate='one_to_one')
panel['y'] = panel['y'].fillna(0)  # no sales row for that day => zero sales

# Verify
zero_rate = (panel['y'] == 0).mean() * 100
print(f"  Panel size: {len(panel):,} rows")
print(f"  Zero-sales rate: {zero_rate:.1f}%")

# Clean up
del spine, df_train
gc.collect()

---
## 6. Data Cleaning <a name="6-cleaning"></a>

In [None]:
print("Cleaning data...")

# 1. Clip negative sales to 0 (returns)
neg_count = (panel['y'] < 0).sum()
panel['y'] = panel['y'].clip(lower=0)
print(f"  Clipped {neg_count:,} negative values to 0")

# 2. Convert IDs to string for categorical handling
panel['store_id'] = panel['store_id'].astype(str)
panel['sku_id'] = panel['sku_id'].astype(str)

# 3. Merge SKU attributes (Local vs Import)
# Keep one attribute row per SKU first: a SKU listed twice in the attribute
# file would otherwise duplicate every panel row for that SKU. `validate`
# makes the merge fail loudly if that invariant is ever broken.
sku_flags = sku_attr[['sku_id', 'is_local']].drop_duplicates(subset='sku_id', keep='first')
panel = panel.merge(sku_flags, on='sku_id', how='left', validate='many_to_one')
# SKUs absent from the attribute file are treated as Import (0).
panel['is_local'] = panel['is_local'].fillna(0).astype(int)

local_pct = panel.groupby('sku_id')['is_local'].first().mean() * 100
print(f"  SKU attribute: {local_pct:.1f}% Local, {100-local_pct:.1f}% Import")

print("\n‚úì Cleaning complete")

---
## 7. Feature Engineering <a name="7-features"></a>

All features are **causal**: computed using only past data relative to the prediction date.

In [None]:
print("Engineering features...")
print("  All features use only past data (no leakage).")

# Order rows by series then date; the lag/rolling cells shift by row offset.
panel = panel.sort_values(['store_id', 'sku_id', 'date']).reset_index(drop=True)

# === CALENDAR FEATURES ===
date_parts = panel['date'].dt
panel['dow'] = date_parts.dayofweek
panel['is_weekend'] = panel['dow'].isin([5, 6]).astype(int)
panel['week_of_year'] = date_parts.isocalendar().week.astype(int)
panel['month'] = date_parts.month
panel['day_of_year'] = date_parts.dayofyear

# Cyclical encoding: a sin/cos pair per periodic feature
# (day-of-year over a 365-day period, day-of-week over a 7-day period).
for suffix, source_col, period in (('doy', 'day_of_year', 365), ('dow', 'dow', 7)):
    angle = 2 * np.pi * panel[source_col] / period
    panel[f'sin_{suffix}'] = np.sin(angle)
    panel[f'cos_{suffix}'] = np.cos(angle)

print("  ‚úì Calendar features (9)")

In [None]:
# === LAG FEATURES ===
# lag_k holds the value of y from k rows earlier within the same series.
print("  Computing lag features...")

y_by_series = panel.groupby(['store_id', 'sku_id'])['y']
for lag in (1, 7, 14, 28, 56):
    panel[f'lag_{lag}'] = y_by_series.shift(lag)

print("  ‚úì Lag features (5)")

In [None]:
# === ROLLING FEATURES ===
print("  Computing rolling features...")

y_by_series = panel.groupby(['store_id', 'sku_id'])['y']


def _prior_rolling(stat, window, min_periods):
    """Per-series rolling `stat` of y over the `window` rows before each row.

    The shift(1) keeps the current row's own value out of its window.
    """
    return y_by_series.transform(
        lambda s: getattr(s.shift(1).rolling(window, min_periods=min_periods), stat)()
    )


# Mean and sum over 7- and 28-row windows.
for window in (7, 28):
    panel[f'roll_mean_{window}'] = _prior_rolling('mean', window, 1)
    panel[f'roll_sum_{window}'] = _prior_rolling('sum', window, 1)

# Rolling std (needs at least 7 prior rows; earlier rows become 0)
panel['roll_std_28'] = _prior_rolling('std', 28, 7).fillna(0)

print("  ‚úì Rolling features (5)")

In [None]:
# === DORMANCY FEATURES ===
print("  Computing dormancy features...")


def _prior_nonzero_rate(sales, window=28):
    """Share of the previous `window` rows of one series that had a sale.

    Uses the built-in rolling mean of a 0/1 "sold" indicator instead of a
    Python callback evaluated once per window. The leading NaN produced by
    shift(1) is kept as missing, so it is not counted as a zero-sales day.
    """
    previous = sales.shift(1)
    sold = previous.gt(0).astype(float).where(previous.notna())
    return sold.rolling(window, min_periods=1).mean()


# Non-zero rate in last 28 days (0 for the first row of a series, which has
# no prior observation)
panel['nz_rate_28'] = (
    panel.groupby(['store_id', 'sku_id'])['y'].transform(_prior_nonzero_rate).fillna(0)
)

# Days since last sale (capped at 90)
def days_since_last_sale(series, cap=90):
    """Return, per position, the number of rows since the most recent sale.

    Only earlier positions are considered (the value at position i never
    depends on series[i]). A "sale" is any value > 0.

    Args:
        series: 1-D sequence of sales values (pandas Series, list or array).
        cap: Upper bound for the result; also the value used while no
            earlier sale exists.

    Returns:
        Float NumPy array with the same length as `series`.
    """
    sales = np.asarray(series, dtype=float)
    n = sales.size
    if n == 0:
        return np.zeros(0)

    positions = np.arange(n)
    # Index of the latest sale at or before each position (-1 = none yet) ...
    last_sale_incl = np.maximum.accumulate(np.where(sales > 0, positions, -1))
    # ... moved one step right so position i only sees sales strictly before i.
    last_sale_before = np.concatenate(([-1], last_sale_incl[:-1]))

    gap = np.minimum(positions - last_sale_before, cap)
    return np.where(last_sale_before >= 0, gap, cap).astype(float)

# Evaluate days_since_last_sale separately for every store-SKU series and
# write the result back aligned to the panel's row index.
panel['days_since_last_sale'] = (
    panel.groupby(['store_id', 'sku_id'])['y']
    .transform(lambda grp: pd.Series(days_since_last_sale(grp), index=grp.index))
)

# Zero run length (consecutive zeros)
def zero_run_length(series, cap=60):
    """Return, per position, the length of the zero run that ends just before it.

    Position i counts the consecutive values equal to 0 immediately preceding
    i; the value at i itself is not included. Position 0 is always 0.

    Args:
        series: 1-D sequence of sales values (pandas Series, list or array).
        cap: Upper bound applied to the reported run length.

    Returns:
        Float NumPy array with the same length as `series`.
    """
    sales = np.asarray(series, dtype=float)
    n = sales.size
    if n == 0:
        return np.zeros(0)

    positions = np.arange(n)
    # Index of the latest non-zero value at or before each position (-1 = none).
    last_nonzero = np.maximum.accumulate(np.where(sales != 0, positions, -1))
    # Run of zeros ending AT each position (0 where the value is non-zero).
    run_ending_here = positions - last_nonzero

    result = np.zeros(n)
    # Position i reports the run that ended at i-1.
    result[1:] = np.minimum(run_ending_here[:-1], cap)
    return result

# Evaluate zero_run_length separately for every store-SKU series and write
# the result back aligned to the panel's row index.
panel['zero_run_length'] = (
    panel.groupby(['store_id', 'sku_id'])['y']
    .transform(lambda grp: pd.Series(zero_run_length(grp), index=grp.index))
)

# Last sale quantity (capped)
def last_sale_qty(series, cap=50):
    """Return, per position, the quantity of the most recent earlier sale.

    Only earlier positions are considered. A "sale" is any value > 0.
    Positions with no earlier sale get 0.

    Args:
        series: 1-D sequence of sales values (pandas Series, list or array).
        cap: Upper bound applied to the reported quantity.

    Returns:
        Float NumPy array with the same length as `series`.
    """
    sales = np.asarray(series, dtype=float)
    n = sales.size
    if n == 0:
        return np.zeros(0)

    positions = np.arange(n)
    # Index of the latest sale at or before each position (-1 = none yet) ...
    last_sale_incl = np.maximum.accumulate(np.where(sales > 0, positions, -1))
    # ... moved one step right so position i only sees sales strictly before i.
    last_sale_before = np.concatenate(([-1], last_sale_incl[:-1]))

    # Where the index is -1 the lookup reads an arbitrary element; those
    # positions are overwritten with 0 by the np.where below.
    qty = np.minimum(sales[last_sale_before], cap)
    return np.where(last_sale_before >= 0, qty, 0.0)

# Evaluate last_sale_qty separately for every store-SKU series and write the
# result back aligned to the panel's row index.
panel['last_sale_qty'] = (
    panel.groupby(['store_id', 'sku_id'])['y']
    .transform(lambda grp: pd.Series(last_sale_qty(grp), index=grp.index))
)

print("  ‚úì Dormancy features (4)")

In [None]:
# === SPIKE FEATURES (Inferred Promotional Signals) ===
# Every feature written here depends only on sales from EARLIER rows/days.
# Assumes panel rows are ordered by date within each series (the calendar
# feature cell sorts them that way).
print("  Computing spike features...")

series_ids = [panel['store_id'], panel['sku_id']]

# Running mean of each series' PRIOR non-zero sales. Using the mean over the
# whole history would let future sales decide whether a past day was a spike.
panel['hist_mean'] = (
    panel['y'].where(panel['y'] > 0)
    .groupby(series_ids)
    .transform(lambda s: s.shift(1).expanding().mean())
)

# Is this a spike day? (>3x the prior mean and more than 5 units)
# Rows with no prior sale have hist_mean = NaN, which compares False, so a
# series' first sale is never flagged.
panel['is_spike'] = ((panel['y'] > 3 * panel['hist_mean']) & (panel['y'] > 5)).astype(int)

# Store spike percentage: share of the store's SKUs that spiked on the
# PREVIOUS day. The same-day share is derived from the value being predicted,
# so using it as a feature would leak the target.
store_prev_day = (
    panel.groupby(['store_id', 'date'])['is_spike'].mean()  # sorted by store, date
    .groupby(level='store_id').shift(1)
    .rename('store_spike_pct')
    .reset_index()
)
panel = panel.drop(columns='store_spike_pct', errors='ignore')  # safe on re-run
panel = panel.merge(store_prev_day, on=['store_id', 'date'], how='left')
panel['store_spike_pct'] = panel['store_spike_pct'].fillna(0)

# Historical spike probability for this series (prior rows only)
panel['hist_spike_prob'] = panel.groupby(['store_id', 'sku_id'])['is_spike'].transform(
    lambda x: x.shift(1).expanding().mean()
).fillna(0)

# Had recent spike (in last 7 days, excluding today)
panel['had_recent_spike'] = panel.groupby(['store_id', 'sku_id'])['is_spike'].transform(
    lambda x: x.shift(1).rolling(7, min_periods=1).max()
).fillna(0)

# Clean up temporary columns
panel = panel.drop(columns=['hist_mean', 'is_spike'])

print("  ‚úì Spike features (3)")

In [None]:
# Replace any NaN left in the engineered feature columns with 0.
feature_cols = (
    # calendar
    ['dow', 'is_weekend', 'week_of_year', 'month', 'day_of_year',
     'sin_doy', 'cos_doy', 'sin_dow', 'cos_dow']
    # lags
    + ['lag_1', 'lag_7', 'lag_14', 'lag_28', 'lag_56']
    # rolling statistics
    + ['roll_mean_7', 'roll_sum_7', 'roll_mean_28', 'roll_sum_28', 'roll_std_28']
    # dormancy
    + ['nz_rate_28', 'days_since_last_sale', 'zero_run_length', 'last_sale_qty']
    # spikes
    + ['store_spike_pct', 'hist_spike_prob', 'had_recent_spike']
    # SKU attribute
    + ['is_local']
)

for col in feature_cols:
    if col not in panel.columns:
        continue  # tolerate features that were not built
    panel[col] = panel[col].fillna(0)

print(f"\n‚úì Feature engineering complete: {len(feature_cols)} features")
print(f"  Panel shape: {panel.shape}")

---
## 8. ABC Segmentation <a name="8-segmentation"></a>

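Segment series by cumulative share of total sales volume, ranked from highest-selling to lowest: A (the series that make up the first 80% of sales), B (the next 15%), C (the remaining 5%).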

In [None]:
print("Assigning ABC segments...")

# Total sales per series, largest first.
series_sales = (
    panel.groupby(['store_id', 'sku_id'])['y'].sum()
    .rename('total_sales')
    .reset_index()
    .sort_values('total_sales', ascending=False)
)

# Running share of overall volume, accumulated from the top seller down.
total = series_sales['total_sales'].sum()
series_sales['cum_share'] = series_sales['total_sales'].cumsum() / total

# A: cumulative share <= 80%; B: above 80% up to 95%; C: everything else.
cum_share = series_sales['cum_share']
series_sales['abc'] = np.select(
    [cum_share <= 0.80, cum_share <= 0.95],
    ['A', 'B'],
    default='C',
)

# Attach segment and total to every panel row; unmatched rows default to C.
panel = panel.merge(
    series_sales[['store_id', 'sku_id', 'abc', 'total_sales']],
    on=['store_id', 'sku_id'],
    how='left',
)
panel['abc'] = panel['abc'].fillna('C')

# Summary
print("\nABC Distribution:")
for seg in ('A', 'B', 'C'):
    in_seg = series_sales['abc'] == seg
    n_series = in_seg.sum()
    sales_share = series_sales.loc[in_seg, 'total_sales'].sum() / total * 100
    print(f"  {seg}: {n_series:,} series ({sales_share:.1f}% of sales volume)")

In [None]:
# Behavior buckets used when reporting results:
#   fast_mover   - total sales at or above the 80th percentile
#   slow_mover   - total sales at or below the 20th percentile, but > 0
#   intermittent - mean nz_rate_28 below 0.05
#   regular      - none of the above

series_stats = (
    panel.groupby(['store_id', 'sku_id'])
    .agg(total_sales=('y', 'sum'), avg_nz_rate=('nz_rate_28', 'mean'))
    .reset_index()
)

# Percentile cut-offs on per-series total sales.
sales_80 = series_stats['total_sales'].quantile(0.80)
sales_20 = series_stats['total_sales'].quantile(0.20)

# np.select takes the first matching condition, so the list is ordered by
# precedence: intermittent wins over slow_mover, which wins over fast_mover.
totals = series_stats['total_sales']
series_stats['bucket'] = np.select(
    [
        series_stats['avg_nz_rate'] < 0.05,
        (totals <= sales_20) & (totals > 0),
        totals >= sales_80,
    ],
    ['intermittent', 'slow_mover', 'fast_mover'],
    default='regular',
)

# Attach the bucket to every panel row.
panel = panel.merge(
    series_stats[['store_id', 'sku_id', 'bucket']],
    on=['store_id', 'sku_id'],
    how='left',
)

print("\nBehavior Buckets:")
print(series_stats['bucket'].value_counts())

---
## 9. Model Training <a name="9-training"></a>

Two-stage LightGBM with ABC-specific hyperparameters.

In [None]:
# Model inputs and per-segment hyperparameters.
_CALENDAR_FEATURES = [
    'dow', 'is_weekend', 'week_of_year', 'month', 'day_of_year',
    'sin_doy', 'cos_doy', 'sin_dow', 'cos_dow',
]
_LAG_FEATURES = ['lag_1', 'lag_7', 'lag_14', 'lag_28', 'lag_56']
_ROLLING_FEATURES = ['roll_mean_7', 'roll_sum_7', 'roll_mean_28', 'roll_sum_28', 'roll_std_28']
_DORMANCY_FEATURES = ['nz_rate_28', 'days_since_last_sale', 'zero_run_length', 'last_sale_qty']
_SPIKE_FEATURES = ['store_spike_pct', 'hist_spike_prob', 'had_recent_spike']

# Numeric features, in the column order handed to the models.
FEATURES = [
    *_CALENDAR_FEATURES,
    *_LAG_FEATURES,
    *_ROLLING_FEATURES,
    *_DORMANCY_FEATURES,
    *_SPIKE_FEATURES,
    'is_local',
]
# Identifier columns passed to LightGBM as categorical features.
CAT_FEATURES = ['store_id', 'sku_id']

# One settings dict per ABC segment:
#   n_clf / n_reg - boosting rounds for the classifier / regressor
#   min_data      - min_data_in_leaf for the classifier
#   threshold     - classifier probability above which a sale is predicted
SEGMENT_PARAMS = {
    'A': {
        'num_leaves': 255, 'learning_rate': 0.015,
        'n_clf': 800, 'n_reg': 1000, 'min_data': 10, 'threshold': 0.6,
    },
    'B': {
        'num_leaves': 63, 'learning_rate': 0.03,
        'n_clf': 300, 'n_reg': 400, 'min_data': 50, 'threshold': 0.6,
    },
    'C': {
        'num_leaves': 31, 'learning_rate': 0.05,
        'n_clf': 200, 'n_reg': 300, 'min_data': 100, 'threshold': 0.7,
    },
}

print(f"Features: {len(FEATURES)} numeric + {len(CAT_FEATURES)} categorical")
print("\nSegment hyperparameters:")
for seg, params in SEGMENT_PARAMS.items():
    leaves, lr, cut = params['num_leaves'], params['learning_rate'], params['threshold']
    print(f"  {seg}: leaves={leaves}, lr={lr}, threshold={cut}")

In [None]:
def train_two_stage_model(train_df, segment, params, features, cat_features):
    """Fit the classifier/regressor pair for a single ABC segment.

    Stage 1 is a binary LightGBM model for "y > 0", fitted on every row of
    the segment. Stage 2 is an L1 LightGBM regressor for log1p(y), fitted on
    the rows with y > 0 only.

    Args:
        train_df: Frame with columns 'abc', 'y', `features` and `cat_features`.
        segment: Value of 'abc' selecting the rows to train on.
        params: Dict with 'num_leaves', 'learning_rate', 'min_data',
            'n_clf' and 'n_reg'.
        features: Names of the numeric feature columns.
        cat_features: Names of the columns passed as categorical features.

    Returns:
        (classifier, regressor). Both are None when the segment has fewer
        than 100 rows; the regressor alone is None when fewer than 10 rows
        have y > 0.
    """
    seg_rows = train_df.loc[train_df['abc'] == segment].copy()
    if len(seg_rows) < 100:
        return None, None

    # Binary target plus categorical dtype for the identifier columns.
    seg_rows['y_binary'] = (seg_rows['y'] > 0).astype(int)
    for cat_col in cat_features:
        seg_rows[cat_col] = seg_rows[cat_col].astype('category')

    model_cols = features + cat_features

    # Settings common to both stages.
    shared_params = {
        'num_leaves': params['num_leaves'],
        'learning_rate': params['learning_rate'],
        'feature_fraction': 0.8,
        'verbose': -1,
        'n_jobs': -1,
        'seed': RANDOM_SEED,
    }

    # Stage 1: classifier on all rows of the segment.
    clf_params = {
        **shared_params,
        'objective': 'binary',
        'metric': 'auc',
        'min_data_in_leaf': params['min_data'],
    }
    clf_data = lgb.Dataset(
        seg_rows[model_cols],
        label=seg_rows['y_binary'],
        categorical_feature=cat_features,
    )
    clf = lgb.train(clf_params, clf_data, num_boost_round=params['n_clf'])

    # Stage 2: regressor on rows with a sale, target in log1p space.
    sold_rows = seg_rows[seg_rows['y'] > 0]
    if len(sold_rows) < 10:
        return clf, None

    reg_params = {
        **shared_params,
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_data_in_leaf': max(5, params['min_data'] // 2),
        'lambda_l2': 0.5,
    }
    reg_data = lgb.Dataset(
        sold_rows[model_cols],
        label=np.log1p(sold_rows['y'].values),
        categorical_feature=cat_features,
    )
    reg = lgb.train(reg_params, reg_data, num_boost_round=params['n_reg'])

    return clf, reg

In [None]:
# Train models for each segment
print("=" * 60)
print("TRAINING TWO-STAGE MODELS")
print("=" * 60)

# models[segment] -> {'clf': classifier or None, 'reg': regressor or None,
#                     'params': that segment's hyperparameters}
models = {}

for seg in ['A', 'B', 'C']:
    # Count rows with a boolean mask; filtering the panel would materialise
    # every row of the segment only to take its length.
    n_seg_rows = int((panel['abc'] == seg).sum())
    print(f"\n{seg}-items: {n_seg_rows:,} rows")

    clf, reg = train_two_stage_model(panel, seg, SEGMENT_PARAMS[seg], FEATURES, CAT_FEATURES)
    models[seg] = {'clf': clf, 'reg': reg, 'params': SEGMENT_PARAMS[seg]}

    # Report skipped models explicitly: a segment without a classifier gets
    # all-zero forecasts later on.
    if clf is not None:
        print(f"  ‚úì Classifier trained")
    else:
        print("  (classifier skipped: segment has too few rows)")
    if reg is not None:
        print(f"  ‚úì Regressor trained")
    elif clf is not None:
        print("  (regressor skipped: too few non-zero rows)")

print("\n‚úì All models trained")

---
## 10. Forecast Generation <a name="10-forecast"></a>

Generate 168-day forecasts starting from December 18, 2025.

In [None]:
print("Preparing forecast period...")

# Daily calendar covering HORIZON_DAYS consecutive days from FORECAST_START.
forecast_start = pd.to_datetime(FORECAST_START)
forecast_dates = pd.date_range(start=forecast_start, periods=HORIZON_DAYS, freq='D')

horizon_first, horizon_last = forecast_dates[0].date(), forecast_dates[-1].date()
print(f"  Forecast period: {horizon_first} to {horizon_last}")
print(f"  Days: {len(forecast_dates)}")

In [None]:
# Create forecast panel: every series x every forecast date
print("\nBuilding forecast panel...")

# One row per series, carrying its segment, bucket and local/import flag.
series_list = panel.loc[:, ['store_id', 'sku_id', 'abc', 'bucket', 'is_local']].drop_duplicates()
print(f"  Series: {len(series_list):,}")

# Cross join with the forecast dates by merging on a constant helper key.
series_list = series_list.assign(_key=1)
dates_df = pd.DataFrame({'date': forecast_dates}).assign(_key=1)
forecast_panel = pd.merge(series_list, dates_df, on='_key').drop(columns='_key')

expected_size = len(series_list) * len(forecast_dates)
print(f"  Forecast panel: {len(forecast_panel):,} rows")
print(f"  Expected: {len(series_list):,} √ó {len(forecast_dates)} = {expected_size:,}")

In [None]:
# Calendar features for the forecast rows (same definitions as in training).
fc_date_parts = forecast_panel['date'].dt
forecast_panel['dow'] = fc_date_parts.dayofweek
forecast_panel['is_weekend'] = forecast_panel['dow'].isin([5, 6]).astype(int)
forecast_panel['week_of_year'] = fc_date_parts.isocalendar().week.astype(int)
forecast_panel['month'] = fc_date_parts.month
forecast_panel['day_of_year'] = fc_date_parts.dayofyear

# sin/cos pair per periodic feature (365-day and 7-day periods).
for suffix, source_col, period in (('doy', 'day_of_year', 365), ('dow', 'dow', 7)):
    angle = 2 * np.pi * forecast_panel[source_col] / period
    forecast_panel[f'sin_{suffix}'] = np.sin(angle)
    forecast_panel[f'cos_{suffix}'] = np.cos(angle)

print("  ‚úì Calendar features added")

In [None]:
# Build the lag / rolling / dormancy / spike features for the forecast rows
# from the tail of the training panel. Each feature is evaluated as it would
# be on the first day after the cutoff, using the same window lengths and
# offsets as in training, and is then held constant over the whole horizon.
print("\nComputing lag features from last known values...")

cutoff = pd.to_datetime(CUTOFF_DATE)
first_forecast_day = cutoff + timedelta(days=1)
series_keys = ['store_id', 'sku_id']

# Last 60 days of training data per series (enough for the 56-day lag).
lookback = (
    panel.loc[panel['date'] > cutoff - timedelta(days=60)]
    .sort_values(series_keys + ['date'])
)
by_series = lookback.groupby(series_keys)

# Values on each series' final row in the lookback window. The cells that
# consume them below assume that row is the cutoff date.
last_row = by_series[
    ['y', 'days_since_last_sale', 'zero_run_length', 'last_sale_qty',
     'hist_spike_prob', 'had_recent_spike']
].last()

series_last_stats = pd.DataFrame(index=last_row.index)

# lag_k = sales observed exactly k days before the first forecast day
# (rather than reusing the most recent day's sales for every lag).
for lag in (1, 7, 14, 28, 56):
    lag_day = first_forecast_day - timedelta(days=lag)
    series_last_stats[f'lag_{lag}'] = (
        lookback.loc[lookback['date'] == lag_day].groupby(series_keys)['y'].sum()
    )

# Rolling statistics over the true 7- and 28-day windows ending at the cutoff.
recent_7 = lookback.loc[lookback['date'] > cutoff - timedelta(days=7)]
recent_28 = lookback.loc[lookback['date'] > cutoff - timedelta(days=28)]
y_7 = recent_7.groupby(series_keys)['y']
y_28 = recent_28.groupby(series_keys)['y']
series_last_stats['roll_mean_7'] = y_7.mean()
series_last_stats['roll_sum_7'] = y_7.sum()
series_last_stats['roll_mean_28'] = y_28.mean()
series_last_stats['roll_sum_28'] = y_28.sum()
series_last_stats['roll_std_28'] = y_28.std()
series_last_stats['nz_rate_28'] = (
    recent_28.assign(sold=recent_28['y'] > 0).groupby(series_keys)['sold'].mean()
)

# Dormancy features: the panel's last row describes the state BEFORE that
# day's sales, so roll it forward by one day. Caps 90 / 60 / 50 match the
# ones applied when the training features were built.
sold_last_day = last_row['y'] > 0
series_last_stats['days_since_last_sale'] = np.where(
    sold_last_day, 1, np.minimum(last_row['days_since_last_sale'] + 1, 90)
)
series_last_stats['zero_run_length'] = np.where(
    last_row['y'] == 0, np.minimum(last_row['zero_run_length'] + 1, 60), 0
)
series_last_stats['last_sale_qty'] = np.where(
    sold_last_day, np.minimum(last_row['y'], 50), last_row['last_sale_qty']
)

# Spike features: mean store spike share over the lookback window, and the
# series' latest known spike statistics.
series_last_stats['store_spike_pct'] = by_series['store_spike_pct'].mean()
series_last_stats['hist_spike_prob'] = last_row['hist_spike_prob']
series_last_stats['had_recent_spike'] = last_row['had_recent_spike']

series_last_stats = series_last_stats.reset_index()

# Merge to forecast panel
forecast_panel = forecast_panel.merge(series_last_stats, on=series_keys, how='left')

# Fill any missing values; features that could not be built become 0.
for col in FEATURES:
    if col in forecast_panel.columns:
        forecast_panel[col] = forecast_panel[col].fillna(0)
    else:
        forecast_panel[col] = 0

print("  ‚úì Features prepared")

In [None]:
# Score the forecast panel one ABC segment at a time.
print("\nGenerating predictions...")

forecast_panel['predicted_sales'] = 0.0  # stays 0 for segments without a model
model_inputs = FEATURES + CAT_FEATURES

for seg in ('A', 'B', 'C'):
    seg_mask = forecast_panel['abc'] == seg
    seg_data = forecast_panel.loc[seg_mask].copy()
    if seg_data.empty:
        continue

    seg_models = models[seg]
    clf, reg = seg_models['clf'], seg_models['reg']
    threshold = seg_models['params']['threshold']
    if clf is None:
        continue

    # Identifier columns must be categorical, as they were in training.
    for col in CAT_FEATURES:
        seg_data[col] = seg_data[col].astype('category')
    X = seg_data[model_inputs]

    # Stage 1: probability of a sale. Stage 2: quantity, mapped back from
    # log1p space (1 unit when the segment has no regressor).
    prob = clf.predict(X)
    pred_value = np.ones(len(X)) if reg is None else np.expm1(reg.predict(X))

    # Keep the quantity only where the probability exceeds the segment
    # threshold; never report a negative value.
    y_pred = np.maximum(0, np.where(prob > threshold, pred_value, 0))

    forecast_panel.loc[seg_mask, 'predicted_sales'] = y_pred

    print(f"  {seg}: {len(seg_data):,} rows, avg pred = {y_pred.mean():.2f}")

print("\n‚úì Predictions generated")

---
## 11. Evaluation & Sanity Checks <a name="11-evaluation"></a>

In [None]:
# Structural and quality checks on the forecast panel.
rule = "=" * 60
print(rule)
print("SANITY CHECKS")
print(rule)

preds = forecast_panel['predicted_sales']
fc_dates = forecast_panel['date']

# 1. One row per series per horizon day.
n_series = forecast_panel.groupby(['store_id', 'sku_id']).ngroups
expected_rows = n_series * HORIZON_DAYS
actual_rows = len(forecast_panel)

print(f"\n1. Row count:")
print(f"   Series: {n_series:,}")
print(f"   Expected rows: {n_series:,} √ó {HORIZON_DAYS} = {expected_rows:,}")
print(f"   Actual rows: {actual_rows:,}")
if expected_rows == actual_rows:
    print(f"   ‚úì Match")
else:
    print(f"   ‚úó Mismatch!")

# 2. The number of distinct dates equals the horizon length.
print(f"\n2. Date range:")
print(f"   Min: {fc_dates.min().date()}")
print(f"   Max: {fc_dates.max().date()}")
print(f"   Days: {fc_dates.nunique()}")
if fc_dates.nunique() == HORIZON_DAYS:
    print(f"   ‚úì Correct")
else:
    print(f"   ‚úó Wrong!")

# 3. No negative or missing predictions.
print(f"\n3. Predictions quality:")
neg_preds = (preds < 0).sum()
nan_preds = preds.isna().sum()
print(f"   Negative predictions: {neg_preds}")
print(f"   NaN predictions: {nan_preds}")
if neg_preds == 0 and nan_preds == 0:
    print(f"   ‚úì Clean")
else:
    print(f"   ‚úó Issues!")

# 4. Shape of the prediction distribution.
print(f"\n4. Prediction distribution:")
print(f"   Mean: {preds.mean():.3f}")
print(f"   Median: {preds.median():.3f}")
print(f"   Max: {preds.max():.1f}")
print(f"   % zeros: {(preds == 0).mean()*100:.1f}%")

In [None]:
# Prediction summary per behavior bucket (empty buckets are not printed).
print("\n" + "=" * 60)
print("PREDICTIONS BY BEHAVIOR BUCKET")
print("=" * 60)

for bucket in ('fast_mover', 'regular', 'slow_mover', 'intermittent'):
    bucket_data = forecast_panel.loc[forecast_panel['bucket'] == bucket]
    if bucket_data.empty:
        continue
    series_count = bucket_data.groupby(['store_id', 'sku_id']).ngroups
    avg_pred = bucket_data['predicted_sales'].mean()
    zero_rate = (bucket_data['predicted_sales'] == 0).mean() * 100
    print(f"  {bucket:15} | Series: {series_count:>6,} | "
          f"Avg pred: {avg_pred:>6.2f} | Zero%: {zero_rate:>5.1f}%")

In [None]:
# Prediction summary split by the is_local flag (1 = Local, 0 = Import).
print("\n" + "=" * 60)
print("PREDICTIONS BY LOCAL VS IMPORT")
print("=" * 60)

for is_local, label in ((1, 'Local'), (0, 'Import')):
    subset = forecast_panel.loc[forecast_panel['is_local'] == is_local]
    if subset.empty:
        continue
    series_count = subset.groupby(['store_id', 'sku_id']).ngroups
    avg_pred = subset['predicted_sales'].mean()
    zero_rate = (subset['predicted_sales'] == 0).mean() * 100
    print(f"  {label:10} | Series: {series_count:>6,} | "
          f"Avg pred: {avg_pred:>6.2f} | Zero%: {zero_rate:>5.1f}%")

In [None]:
# Sample plots: up to 6 series (2 fast movers, 2 intermittent, 2 slow movers)
print("\nGenerating sample forecast plots...")

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# Take the first two series of each bucket, in this order.
sample_series = []
for bucket_name, title in (
    ('fast_mover', 'Fast Mover'),
    ('intermittent', 'Intermittent'),
    ('slow_mover', 'Slow Mover'),
):
    picks = series_list[series_list['bucket'] == bucket_name].head(2)
    sample_series.extend(
        (row['store_id'], row['sku_id'], title) for _, row in picks.iterrows()
    )

cutoff_marker = pd.to_datetime(CUTOFF_DATE)
# zip stops at the shorter input, so at most one series per subplot is drawn.
for ax, (store, sku, label) in zip(axes, sample_series):
    # Last 120 historical rows of the series, followed by its forecast.
    hist = panel[(panel['store_id'] == store) & (panel['sku_id'] == sku)].tail(120)
    fcast = forecast_panel[
        (forecast_panel['store_id'] == store) & (forecast_panel['sku_id'] == sku)
    ]

    if not hist.empty:
        ax.plot(hist['date'], hist['y'], 'b-', alpha=0.7, label='Historical')
    if not fcast.empty:
        ax.plot(fcast['date'], fcast['predicted_sales'], 'r--', alpha=0.7, label='Forecast')

    ax.axvline(cutoff_marker, color='gray', linestyle=':', label='Cutoff')
    ax.set_title(f'{label}\nStore {store}, SKU {sku}')
    ax.legend(loc='upper left', fontsize=8)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

---
## 12. Output & Submission Checklist <a name="12-output"></a>

In [None]:
# Write the forecast in the delivery layout:
# item_id, store_id, date (YYYY-MM-DD), predicted_sales (2 decimals).
print("Preparing output file...")

output = (
    forecast_panel[['sku_id', 'store_id', 'date', 'predicted_sales']]
    .rename(columns={'sku_id': 'item_id'})
    .assign(
        date=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'),
        predicted_sales=lambda frame: frame['predicted_sales'].round(2),
    )
)

# Create the target directory if needed ('.' when the path has no directory).
output_dir = os.path.dirname(OUTPUT_PATH) or '.'
os.makedirs(output_dir, exist_ok=True)

# Save and report the file size in megabytes (10**6 bytes).
output.to_csv(OUTPUT_PATH, index=False)
file_size = os.path.getsize(OUTPUT_PATH) / 1e6

print(f"\n‚úì Saved to {OUTPUT_PATH}")
print(f"  Rows: {len(output):,}")
print(f"  Size: {file_size:.1f} MB")

In [None]:
# Show the first ten rows of the file that was just written.
output_preview = output.head(10)
print("\nOutput preview:")
print(output_preview.to_string(index=False))

In [None]:
# Final pass/fail checklist over the saved output.
divider = "=" * 60
print("\n" + divider)
print("SUBMISSION CHECKLIST")
print(divider)

# (description, passed) pairs, evaluated up front.
expected_columns = ['item_id', 'store_id', 'date', 'predicted_sales']
checks = [
    ("Output file exists", os.path.exists(OUTPUT_PATH)),
    ("Columns: item_id, store_id, date, predicted_sales", list(output.columns) == expected_columns),
    (f"Row count = series √ó {HORIZON_DAYS}", len(output) == n_series * HORIZON_DAYS),
    ("Date range: 168 days", output['date'].nunique() == HORIZON_DAYS),
    ("No negative predictions", (output['predicted_sales'] >= 0).all()),
    ("No NaN predictions", output['predicted_sales'].notna().all()),
    (f"Start date: {FORECAST_START}", output['date'].min() == FORECAST_START),
]

for check, passed in checks:
    print(f"  [{'‚úì' if passed else '‚úó'}] {check}")
all_pass = all(passed for _, passed in checks)

print("\n" + divider)
if not all_pass:
    print("‚úó SOME CHECKS FAILED - REVIEW BEFORE SUBMISSION")
else:
    print("‚úì ALL CHECKS PASSED - READY FOR SUBMISSION")
print(divider)

In [None]:
# Timestamp the end of the run (minute resolution).
finished_at = datetime.now().strftime('%Y-%m-%d %H:%M')
print(f"\nNotebook completed: {finished_at}")