In [1]:
import pandas as pd
import numpy as np

# Read the raw receival and purchase-order tables from the project data folder.
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
purchase_orders = pd.read_csv('./Project_materials/data/kernel/purchase_orders.csv')


def _to_naive_utc(column):
    """Parse a column as UTC timestamps, then strip the timezone information."""
    return pd.to_datetime(column, utc=True).dt.tz_localize(None)


# Normalise both date columns to timezone-naive timestamps so they can be
# compared directly against the naive window bounds used further down.
receivals['date_arrival'] = _to_naive_utc(receivals['date_arrival'])
purchase_orders['delivery_date'] = _to_naive_utc(purchase_orders['delivery_date'])

# Keep only purchase orders whose quantity is strictly positive.
has_positive_qty = purchase_orders['quantity'] > 0
purchase_orders = purchase_orders[has_positive_qty]

# Title banner for the report printed by the rest of the cell.
rule = "=" * 80
print(rule)
print("TRAINING vs TEST PO DISTRIBUTION ANALYSIS")
print(rule)

# ============================================================================
# 1. WHAT POs ARE IN TRAINING? (2024 horizons)
# ============================================================================
print("\n[1] POs SEEN DURING TRAINING (2024)")
print("-" * 80)

# Training anchor dates run from 2024-01-01 to 2024-11-01. With horizons of up
# to 150 days, the latest PO delivery date training can touch is
# 2024-11-01 + 150 days = 2025-03-31.
train_start = pd.Timestamp('2024-01-01')
train_end_max = pd.Timestamp('2024-11-01') + pd.Timedelta(days=150)

# Both bounds are inclusive. The upper bound is a midnight timestamp, so
# delivery timestamps later on that final day fall outside the window.
in_train_window = purchase_orders['delivery_date'].between(train_start, train_end_max)
pos_train = purchase_orders[in_train_window]

# Summary statistics of the PO quantities inside the training window.
train_qty = pos_train['quantity']
print(f"Training period PO dates: {train_start.date()} to {train_end_max.date()}")
print(f"Number of POs: {len(pos_train)}")
print(f"Total quantity: {train_qty.sum():,.0f} kg")
print(f"Mean quantity: {train_qty.mean():,.0f} kg")
print(f"Median quantity: {train_qty.median():,.0f} kg")
print(f"Max quantity: {train_qty.max():,.0f} kg")
print(f"95th percentile quantity: {train_qty.quantile(0.95):,.0f} kg")
print(f"Unique products: {pos_train['product_id'].nunique()}")

# How many POs fall under each status value.
train_status_counts = pos_train['status'].value_counts()
print("\nStatus distribution (training):")
print(train_status_counts)

# ============================================================================
# 2. WHAT POs ARE IN TEST? (2025 Jan-May)
# ============================================================================
print("\n[2] POs SEEN IN TEST (2025 Jan-May)")
print("-" * 80)

# Test window bounds (both inclusive in the filter below).
test_start = pd.Timestamp('2025-01-01')
test_end = pd.Timestamp('2025-05-31')

# NOTE(review): test_end is a midnight timestamp, so delivery timestamps later
# on 2025-05-31 are excluded by the inclusive comparison - confirm intended.
in_test_window = purchase_orders['delivery_date'].between(test_start, test_end)
pos_test = purchase_orders[in_test_window]

# Summary statistics of the PO quantities inside the test window.
test_qty = pos_test['quantity']
print(f"Test period PO dates: {test_start.date()} to {test_end.date()}")
print(f"Number of POs: {len(pos_test)}")
print(f"Total quantity: {test_qty.sum():,.0f} kg")
print(f"Mean quantity: {test_qty.mean():,.0f} kg")
print(f"Median quantity: {test_qty.median():,.0f} kg")
print(f"Max quantity: {test_qty.max():,.0f} kg")
print(f"95th percentile quantity: {test_qty.quantile(0.95):,.0f} kg")
print(f"Unique products: {pos_test['product_id'].nunique()}")

# How many POs fall under each status value.
test_status_counts = pos_test['status'].value_counts()
print("\nStatus distribution (test):")
print(test_status_counts)

# ============================================================================
# 3. DISTRIBUTION COMPARISON
# ============================================================================
print("\n[3] DISTRIBUTION MISMATCH ANALYSIS")
print("-" * 80)

# Test/train ratios of the headline quantity statistics.
qty_train = pos_train['quantity']
qty_test = pos_test['quantity']
mean_ratio = qty_test.mean() / qty_train.mean()
median_ratio = qty_test.median() / qty_train.median()
max_ratio = qty_test.max() / qty_train.max()

print(f"Mean quantity ratio (test/train): {mean_ratio:.2f}x")
print(f"Median quantity ratio (test/train): {median_ratio:.2f}x")
print(f"Max quantity ratio (test/train): {max_ratio:.2f}x")

# Product overlap: which product ids appear in one window, the other, or both.
products_train = set(pos_train['product_id'].unique())
products_test = set(pos_test['product_id'].unique())
train_only = products_train.difference(products_test)
test_only = products_test.difference(products_train)
shared = products_train.intersection(products_test)

print("\nProduct overlap:")
print(f"  Products only in training: {len(train_only)}")
print(f"  Products only in test: {len(test_only)}")
print(f"  Products in both: {len(shared)}")

# ============================================================================
# 4. CHECK SPECIFIC PROBLEMATIC PRODUCTS
# ============================================================================
print("\n[4] CHECKING PRODUCT 91900143 (The problem child from diagnostics)")
print("-" * 80)

# Restrict each window to the POs of product 91900143.
product_91900143_train = pos_train[pos_train['product_id'].eq(91900143.0)]
product_91900143_test = pos_test[pos_test['product_id'].eq(91900143.0)]
n_train_pos = len(product_91900143_train)
n_test_pos = len(product_91900143_test)

print("Product 91900143 in TRAINING:")
print(f"  Number of POs: {n_train_pos}")
if n_train_pos > 0:
    # Quantity statistics are only printed when the product has POs here.
    train_product_qty = product_91900143_train['quantity']
    print(f"  Mean quantity: {train_product_qty.mean():,.0f} kg")
    print(f"  Total quantity: {train_product_qty.sum():,.0f} kg")

print("\nProduct 91900143 in TEST:")
print(f"  Number of POs: {n_test_pos}")
if n_test_pos > 0:
    test_product_qty = product_91900143_test['quantity']
    print(f"  Mean quantity: {test_product_qty.mean():,.0f} kg")
    print(f"  Total quantity: {test_product_qty.sum():,.0f} kg")

# The ratio compares TOTAL ordered quantity (test window over training window)
# and is only reported when the product has POs in both windows.
if n_train_pos > 0 and n_test_pos > 0:
    total_qty_ratio = product_91900143_test['quantity'].sum() / product_91900143_train['quantity'].sum()
    print(f"\n⚠️  QUANTITY RATIO (test/train): {total_qty_ratio:.2f}x")

# ============================================================================
# 5. OUT OF DISTRIBUTION CHECK
# ============================================================================
print("\n[5] OUT-OF-DISTRIBUTION VALUES IN TEST")
print("-"*80)

# Are there test PO quantities that are way outside training range?
# Test POs whose quantity exceeds the largest quantity in the training window.
train_max_qty = pos_train['quantity'].max()
test_above_train_max = pos_test[pos_test['quantity'] > train_max_qty]

print(f"Training max PO quantity: {train_max_qty:,.0f} kg")
print(f"Test POs ABOVE training max: {len(test_above_train_max)}")
if len(test_above_train_max) > 0:
    print(f"  These test POs are: {test_above_train_max['quantity'].tolist()}")
    print("  ⚠️  MODEL NEVER SAW VALUES THIS HIGH!")

# Check percentiles: test POs above the 99th percentile of training quantities.
train_p99 = pos_train['quantity'].quantile(0.99)
test_above_train_p99 = pos_test[pos_test['quantity'] > train_p99]

# Guard the percentage against an empty test window; the unguarded division
# raised ZeroDivisionError whenever no PO fell inside the test period.
n_test_pos = len(pos_test)
pct_above_train_p99 = len(test_above_train_p99) / n_test_pos * 100 if n_test_pos else 0.0

print(f"\nTraining 99th percentile: {train_p99:,.0f} kg")
print(f"Test POs above training 99th percentile: {len(test_above_train_p99)} ({pct_above_train_p99:.1f}%)")

print("\n" + "="*80)

TRAINING vs TEST PO DISTRIBUTION ANALYSIS

[1] POs SEEN DURING TRAINING (2024)
--------------------------------------------------------------------------------
Training period PO dates: 2024-01-01 to 2025-03-31
Number of POs: 1178
Total quantity: 129,460,070 kg
Mean quantity: 109,898 kg
Median quantity: 50,000 kg
Max quantity: 1,600,000 kg
95th percentile quantity: 400,000 kg
Unique products: 37

Status distribution (training):
status
Closed     1128
Deleted      32
Open         18
Name: count, dtype: int64

[2] POs SEEN IN TEST (2025 Jan-May)
--------------------------------------------------------------------------------
Test period PO dates: 2025-01-01 to 2025-05-31
Number of POs: 147
Total quantity: 19,148,974 kg
Mean quantity: 130,265 kg
Median quantity: 50,000 kg
Max quantity: 1,350,000 kg
95th percentile quantity: 500,000 kg
Unique products: 17

Status distribution (test):
status
Closed     144
Open         2
Deleted      1
Name: count, dtype: int64

[3] DISTRIBUTION MISMATCH AN