In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("CRITICAL ISSUE ANALYSIS: LEAKAGE & PATTERN VERIFICATION")
print("=" * 80)

# ============================================================================
# LOAD DATA
# ============================================================================
# Reads the raw receivals and the prediction mapping, then reduces receivals
# to rows usable for the analysis: positive net_weight, a known rm_id, and
# chronological order.
print("\n[1] LOADING DATA")
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
prediction_mapping = pd.read_csv('./Project_materials/data/prediction_mapping.csv')

# Parse arrival timestamps as UTC, then drop the timezone so they compare
# cleanly against the naive train dates used further down.
# NOTE(review): if the source timestamps carry local offsets, converting to
# UTC can move a delivery across a day boundary - confirm this is intended.
arrival_utc = pd.to_datetime(receivals['date_arrival'], utc=True)
receivals['date_arrival'] = arrival_utc.dt.tz_localize(None)

# Keep only rows with a positive weight and a non-missing rm_id.
is_usable = (receivals['net_weight'] > 0) & receivals['rm_id'].notna()
receivals = receivals[is_usable].sort_values('date_arrival')

print(f"Total receivals: {len(receivals)}")

# ============================================================================
# RECREATE TRAINING DATA
# ============================================================================
print("\n[2] RECREATING TRAINING DATA")
# One snapshot per month start (2024-01 .. 2024-11). Every snapshot is expanded
# into one row per (rm_id, forecast horizon); horizons are in days.
train_dates = pd.date_range(start='2024-01-01', end='2024-11-30', freq='MS')
forecast_horizons = [7, 30, 60, 90, 150]


def _trailing_window(hist, train_date, days):
    """Return (rows, total net_weight, row count) for the last `days` days.

    `hist` must already be limited to deliveries strictly before `train_date`.
    Total and count are the integer 0 when the window holds no deliveries.
    """
    window = hist[hist['date_arrival'] >= train_date - timedelta(days=days)]
    if len(window) == 0:
        return window, 0, 0
    return window, window['net_weight'].sum(), len(window)


training_data = []
active_rm_ids = receivals[receivals['date_arrival'] >= '2024-01-01']['rm_id'].unique()

# Slice receivals once per rm_id. The loops below then only filter these small
# per-material frames instead of re-scanning the full table for every
# (train_date, rm_id) pair and again for every horizon.
receivals_by_rm = {
    rm_id: receivals[receivals['rm_id'] == rm_id] for rm_id in active_rm_ids
}

for train_date in train_dates:
    for rm_id in active_rm_ids:
        rm_receivals = receivals_by_rm[rm_id]

        # Features may only see deliveries strictly before the snapshot date.
        hist = rm_receivals[rm_receivals['date_arrival'] < train_date]
        if len(hist) == 0:
            continue

        recent_365, total_365, count_365 = _trailing_window(hist, train_date, 365)
        recent_180, total_180, count_180 = _trailing_window(hist, train_date, 180)
        recent_90, total_90, count_90 = _trailing_window(hist, train_date, 90)
        recent_30, total_30, count_30 = _trailing_window(hist, train_date, 30)

        # NOTE(review): with no delivery in the last 365 days this is 0, the
        # same value as "delivered today". Kept as-is because this section
        # recreates the existing training data; consider a large sentinel.
        if count_365 > 0:
            days_since = (train_date - recent_365['date_arrival'].max()).days
        else:
            days_since = 0

        rate_30 = total_30 / 30 if count_30 > 0 else 0
        rate_90 = total_90 / 90 if total_90 > 0 else 0

        if count_90 > 0:
            # Weight each delivery by 1 / (age in days + 1): newer counts more.
            days_ago = (train_date - recent_90['date_arrival']).dt.days
            weights = 1.0 / (days_ago + 1)
            recency_weighted = (recent_90['net_weight'] * weights).sum()
            # Share of the 90 calendar days that saw at least one delivery.
            active_days_90 = recent_90['date_arrival'].dt.date.nunique()
            active_ratio_90 = active_days_90 / 90
        else:
            recency_weighted = 0
            active_ratio_90 = 0

        # Targets only see deliveries on or after the snapshot date, so they
        # never overlap the feature window above.
        # NOTE(review): for late snapshots the longer horizons may reach past
        # the last date present in receivals, which would truncate the target
        # - confirm against the data's end date.
        future = rm_receivals[rm_receivals['date_arrival'] >= train_date]

        for horizon in forecast_horizons:
            # Window is [train_date, train_date + horizon], both ends inclusive.
            forecast_end = train_date + timedelta(days=horizon)
            in_window = future['date_arrival'] <= forecast_end
            target = future.loc[in_window, 'net_weight'].sum()

            training_data.append({
                'rm_id': rm_id,
                'train_date': train_date,
                'forecast_horizon': horizon,
                'total_weight_365d': total_365,
                'count_365d': count_365,
                'days_since_last': days_since,
                'total_weight_90d': total_90,
                'count_90d': count_90,
                'rate_90': rate_90,
                'total_weight_180d': total_180,
                'count_180d': count_180,
                'total_30': total_30,
                'count_30': count_30,
                'rate_30': rate_30,
                'recency_weighted': recency_weighted,
                'active_ratio_90': active_ratio_90,
                'target': target
            })

train_df = pd.DataFrame(training_data)
print(f"Total training samples: {len(train_df)}")

# ============================================================================
# ANALYSIS 1: CUMULATIVE PATTERN CHECK (EXPECTED BEHAVIOR)
# ============================================================================
# For each (rm_id, train_date) the targets of all horizons start at the same
# date, so a longer horizon can never have a smaller target. This section
# verifies that and reports how often every horizon has a non-zero target.
print("\n" + "="*80)
print("ANALYSIS 1: CUMULATIVE PATTERN CHECK")
print("="*80)
print("NOTE: Targets SHOULD be cumulative by design - this is expected!")
print("-"*80)

leakage_check = []
# value_counts() counts rows of train_df, so this selects the 20 rm_ids with
# the most training rows (not the most deliveries).
sample_rm_ids = train_df['rm_id'].value_counts().head(20).index

# Column labels follow forecast_horizons instead of a hard-coded list, so the
# check keeps working if horizons are added or removed. Sorted because the
# targets below are ordered by ascending horizon.
horizon_cols = [f'horizon_{h}' for h in sorted(forecast_horizons)]

for rm_id in sample_rm_ids:
    rm_data = train_df[train_df['rm_id'] == rm_id].sort_values(['train_date', 'forecast_horizon'])
    for date in rm_data['train_date'].unique():
        date_data = rm_data[rm_data['train_date'] == date].sort_values('forecast_horizon')
        targets = date_data['target'].values

        # Only snapshots that carry every horizon can be compared.
        if len(targets) != len(horizon_cols):
            continue

        record = {'rm_id': rm_id, 'train_date': date}
        record.update(zip(horizon_cols, targets))
        # Cumulative means non-decreasing from the shortest to the longest horizon.
        record['is_cumulative'] = bool(np.all(np.diff(targets) >= 0))
        record['has_deliveries_all_horizons'] = bool((targets > 0).all())
        leakage_check.append(record)

leakage_df = pd.DataFrame(leakage_check)
print(f"\nTotal samples checked: {len(leakage_df)}")
print(f"Percentage of cumulative patterns: {leakage_df['is_cumulative'].mean() * 100:.1f}%")
print(f"Percentage with deliveries in ALL horizons: {leakage_df['has_deliveries_all_horizons'].mean() * 100:.1f}%")

# Show examples of non-cumulative patterns (these are the interesting ones)
non_cumulative = leakage_df[~leakage_df['is_cumulative']]
if len(non_cumulative) > 0:
    print(f"\nNon-cumulative patterns found: {len(non_cumulative)}")
    print("\nExample non-cumulative patterns (first 5):")
    print(non_cumulative.head())
else:
    print("\nAll patterns are cumulative (as expected by design)")

# ============================================================================
# ANALYSIS 2: DOES FORECAST_HORIZON DOMINATE PREDICTIONS?
# ============================================================================
# Fits three random forests on the same temporal split - all features, all
# features except forecast_horizon, and forecast_horizon alone - and compares
# their validation R² to see how much of the fit comes from the horizon.
print("\n" + "="*80)
print("ANALYSIS 2: FORECAST_HORIZON DOMINANCE TEST")
print("="*80)
print("Testing if model relies too heavily on forecast_horizon vs actual features")
print("-"*80)

# Prepare data for modeling
feature_cols = ['forecast_horizon', 'total_weight_365d', 'rate_90', 'days_since_last',
                'total_weight_90d', 'count_365d', 'recency_weighted', 'active_ratio_90']

X = train_df[feature_cols].copy()
y = train_df['target'].copy()

# Train/val split (temporal): snapshots before split_date train, the rest validate
split_date = pd.to_datetime('2024-09-01')
train_mask = train_df['train_date'] < split_date
val_mask = train_df['train_date'] >= split_date

X_train = X[train_mask]
y_train = y[train_mask]
X_val = X[val_mask]
y_val = y[val_mask]

print(f"Train samples: {len(X_train)}, Val samples: {len(X_val)}")

# Test 1: Model with ALL features
print("\n--- Test 1: Full Model (with forecast_horizon) ---")
rf_full = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1)
rf_full.fit(X_train, y_train)

train_score_full = rf_full.score(X_train, y_train)
val_score_full = rf_full.score(X_val, y_val)
print(f"Train R²: {train_score_full:.4f}")
print(f"Val R²: {val_score_full:.4f}")

# Feature importance
importances_full = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_full.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importances (Full Model):")
print(importances_full)

# Test 2: Model WITHOUT forecast_horizon
print("\n--- Test 2: Model WITHOUT forecast_horizon ---")
feature_cols_no_horizon = [f for f in feature_cols if f != 'forecast_horizon']
X_train_no_horizon = X_train[feature_cols_no_horizon]
X_val_no_horizon = X_val[feature_cols_no_horizon]

rf_no_horizon = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1)
rf_no_horizon.fit(X_train_no_horizon, y_train)

train_score_no_horizon = rf_no_horizon.score(X_train_no_horizon, y_train)
val_score_no_horizon = rf_no_horizon.score(X_val_no_horizon, y_val)
print(f"Train R²: {train_score_no_horizon:.4f}")
print(f"Val R²: {val_score_no_horizon:.4f}")

importances_no_horizon = pd.DataFrame({
    'feature': feature_cols_no_horizon,
    'importance': rf_no_horizon.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importances (No Horizon):")
print(importances_no_horizon)

# Test 3: Model with ONLY forecast_horizon
print("\n--- Test 3: Model with ONLY forecast_horizon ---")
X_train_only_horizon = X_train[['forecast_horizon']]
X_val_only_horizon = X_val[['forecast_horizon']]

rf_only_horizon = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1)
rf_only_horizon.fit(X_train_only_horizon, y_train)

train_score_only_horizon = rf_only_horizon.score(X_train_only_horizon, y_train)
val_score_only_horizon = rf_only_horizon.score(X_val_only_horizon, y_val)
print(f"Train R²: {train_score_only_horizon:.4f}")
print(f"Val R²: {val_score_only_horizon:.4f}")

# Summary
print("\n" + "-"*80)
print("HORIZON DOMINANCE SUMMARY:")
print("-"*80)

# Both relative metrics divide by the full model's validation R². R² can be
# zero or negative, in which case the ratios are undefined or sign-flipped, so
# they are only computed for a positive baseline and are NaN otherwise.
if val_score_full > 0:
    performance_drop = (val_score_full - val_score_no_horizon) / val_score_full * 100
    horizon_only_performance = val_score_only_horizon / val_score_full * 100
    print(f"Performance drop without horizon: {performance_drop:.1f}%")
    print(f"Horizon-only model achieves: {horizon_only_performance:.1f}% of full model performance")
else:
    performance_drop = float('nan')
    horizon_only_performance = float('nan')
    print(f"Full model Val R² is {val_score_full:.4f} (<= 0) - relative comparisons are undefined")

is_horizon_row = importances_full['feature'] == 'forecast_horizon'
horizon_importance_full = importances_full.loc[is_horizon_row, 'importance'].values[0]
print(f"Forecast_horizon importance: {horizon_importance_full:.3f}")

if np.isnan(performance_drop):
    print("⚠️  Cannot assess horizon dependence: full model does not beat a constant baseline on validation")
elif performance_drop > 50:
    print("⚠️  CRITICAL: Model is HEAVILY dependent on forecast_horizon!")
elif performance_drop > 25:
    print("⚠️  WARNING: Model has SIGNIFICANT dependence on forecast_horizon")
else:
    print("✓ Model shows reasonable balance between horizon and other features")

# ============================================================================
# ANALYSIS 3: SAME FEATURES, DIFFERENT HORIZONS - VARIANCE CHECK
# ============================================================================
# Rows of one (rm_id, train_date) snapshot share every feature except
# forecast_horizon. This compares how actuals and predictions grow from the
# shortest (index 0) to the longest (index 4) horizon of such a snapshot.
print("\n" + "="*80)
print("ANALYSIS 3: PREDICTION CONSISTENCY CHECK")
print("="*80)
print("For same rm_id, date, and features - do predictions scale reasonably with horizon?")
print("-"*80)

# Pick a few examples and check predictions: up to 10 complete (5-row)
# snapshots from the first five sampled rm_ids.
# NOTE(review): head(10) takes the first snapshots in train_df order; if those
# fall before split_date the predictions below are in-sample - confirm.
from_top_rm_ids = train_df['rm_id'].isin(sample_rm_ids[:5])
sample_examples = (
    train_df[from_top_rm_ids]
    .groupby(['rm_id', 'train_date'])
    .filter(lambda grp: len(grp) == 5)
)
unique_combos = sample_examples[['rm_id', 'train_date']].drop_duplicates().head(10)

consistency_check = []
for combo_rm_id, combo_date in zip(unique_combos['rm_id'], unique_combos['train_date']):
    same_snapshot = (train_df['rm_id'] == combo_rm_id) & (train_df['train_date'] == combo_date)
    subset = train_df[same_snapshot].sort_values('forecast_horizon')

    if len(subset) != 5:
        continue

    X_subset = subset[feature_cols]
    pred_full = rf_full.predict(X_subset)
    pred_no_horizon = rf_no_horizon.predict(X_subset[feature_cols_no_horizon])
    actual = subset['target'].values

    # Ratios add 1 to the denominator so a zero 7-day value cannot divide by zero.
    consistency_check.append({
        'rm_id': combo_rm_id,
        'train_date': combo_date,
        'actual_7': actual[0],
        'actual_150': actual[4],
        'pred_full_7': pred_full[0],
        'pred_full_150': pred_full[4],
        'pred_no_horizon_7': pred_no_horizon[0],
        'pred_no_horizon_150': pred_no_horizon[4],
        'actual_ratio': actual[4] / (actual[0] + 1),
        'pred_full_ratio': pred_full[4] / (pred_full[0] + 1),
        'pred_no_horizon_ratio': pred_no_horizon[4] / (pred_no_horizon[0] + 1)
    })

consistency_df = pd.DataFrame(consistency_check)
print("\nSample predictions for same (rm_id, date) with different horizons:")
shown_cols = ['rm_id', 'train_date', 'actual_7', 'actual_150', 'pred_full_7', 'pred_full_150']
print(consistency_df[shown_cols].head(10))

print("\nRatio Analysis (150-day / 7-day):")
print(f"Actual ratio (mean): {consistency_df['actual_ratio'].mean():.2f}")
print(f"Full model ratio (mean): {consistency_df['pred_full_ratio'].mean():.2f}")
print(f"No-horizon model ratio (mean): {consistency_df['pred_no_horizon_ratio'].mean():.2f}")

# ============================================================================
# ANALYSIS 4: PER-RM_ID DELIVERY PATTERN ANALYSIS
# ============================================================================
# Summarises delivery size and spacing for each sampled rm_id, then reports
# how much those per-material averages vary (coefficient of variation).
print("\n" + "="*80)
print("ANALYSIS 4: RM_ID DELIVERY PATTERN DIVERSITY")
print("="*80)
print("Do different RM_IDs have different delivery patterns?")
print("-"*80)

# Calculate per-RM_ID delivery statistics
rm_patterns = []
for rm_id in sample_rm_ids:
    rm_deliveries = receivals[receivals['rm_id'] == rm_id]
    if len(rm_deliveries) == 0:
        continue

    # Whole days between consecutive arrivals; the first arrival has no
    # predecessor and is dropped.
    arrivals_in_order = rm_deliveries['date_arrival'].sort_values()
    inter_arrival = arrivals_in_order.diff().dt.days.dropna()
    has_gaps = len(inter_arrival) > 0

    delivery_weights = rm_deliveries['net_weight']
    rm_patterns.append({
        'rm_id': rm_id,
        'total_deliveries': len(rm_deliveries),
        'total_weight': delivery_weights.sum(),
        'avg_delivery_size': delivery_weights.mean(),
        'std_delivery_size': delivery_weights.std(),
        'avg_inter_arrival_days': inter_arrival.mean() if has_gaps else np.nan,
        'std_inter_arrival_days': inter_arrival.std() if has_gaps else np.nan
    })

patterns_df = pd.DataFrame(rm_patterns)
print("\nDelivery pattern summary:")
print(patterns_df.describe())

# Coefficient of variation = std / mean of the per-rm_id averages.
avg_sizes = patterns_df['avg_delivery_size']
avg_gaps = patterns_df['avg_inter_arrival_days']
print("\nPattern diversity metrics:")
print(f"CV of avg_delivery_size: {avg_sizes.std() / avg_sizes.mean():.3f}")
print(f"CV of avg_inter_arrival_days: {avg_gaps.std() / avg_gaps.mean():.3f}")

# ============================================================================
# VISUALIZATION
# ============================================================================
# Draws a 2x2 summary figure of the analyses above and writes it to
# leakage_analysis.png in the working directory.
print("\n[5] GENERATING VISUALIZATIONS")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_importance, ax_scores), (ax_ratios, ax_patterns) = axes

# Plot 1: Feature importance comparison. forecast_horizon has no value in the
# 'No Horizon' column because that model was trained without it.
comparison_df = pd.DataFrame({
    'Full Model': importances_full.set_index('feature')['importance'],
    'No Horizon': importances_no_horizon.set_index('feature')['importance']
})
comparison_df.plot(kind='barh', ax=ax_importance)
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Feature Importance: Full vs No Horizon')
ax_importance.legend()

# Plot 2: R² comparison - train and validation bars side by side per model
r2_comparison = pd.DataFrame({
    'Model': ['Full Model', 'No Horizon', 'Only Horizon'],
    'Train R²': [train_score_full, train_score_no_horizon, train_score_only_horizon],
    'Val R²': [val_score_full, val_score_no_horizon, val_score_only_horizon]
})
x = np.arange(len(r2_comparison))
width = 0.35
half_width = width / 2
ax_scores.bar(x - half_width, r2_comparison['Train R²'], width, label='Train R²')
ax_scores.bar(x + half_width, r2_comparison['Val R²'], width, label='Val R²')
ax_scores.set_xlabel('Model Type')
ax_scores.set_ylabel('R² Score')
ax_scores.set_title('Model Performance Comparison')
ax_scores.set_xticks(x)
ax_scores.set_xticklabels(r2_comparison['Model'], rotation=45)
ax_scores.legend()
ax_scores.grid(axis='y', alpha=0.3)

# Plot 3: Actual vs predicted ratios, with the y = x line as reference
actual_ratios = consistency_df['actual_ratio']
diagonal = [0, actual_ratios.max()]
ax_ratios.scatter(actual_ratios, consistency_df['pred_full_ratio'], alpha=0.6, label='Full Model')
ax_ratios.scatter(actual_ratios, consistency_df['pred_no_horizon_ratio'], alpha=0.6, label='No Horizon')
ax_ratios.plot(diagonal, diagonal, 'r--', label='Perfect')
ax_ratios.set_xlabel('Actual Ratio (150d/7d)')
ax_ratios.set_ylabel('Predicted Ratio (150d/7d)')
ax_ratios.set_title('Horizon Ratio Prediction Consistency')
ax_ratios.legend()
ax_ratios.grid(alpha=0.3)

# Plot 4: Delivery pattern diversity - marker area scales with delivery count
marker_sizes = patterns_df['total_deliveries'] * 2
ax_patterns.scatter(
    patterns_df['avg_inter_arrival_days'],
    patterns_df['avg_delivery_size'],
    s=marker_sizes,
    alpha=0.6,
)
ax_patterns.set_xlabel('Avg Inter-arrival Days')
ax_patterns.set_ylabel('Avg Delivery Size (kg)')
ax_patterns.set_title('RM_ID Delivery Pattern Diversity\n(size = total deliveries)')
ax_patterns.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('leakage_analysis.png', dpi=150, bbox_inches='tight')
print("✅ Saved: leakage_analysis.png")
plt.close()

# ============================================================================
# FINAL VERDICT
# ============================================================================
# Condenses the four analyses into pass/warn verdicts. A metric that is NaN or
# infinite (zero denominator, too few samples) is reported as not assessable:
# comparing NaN with a threshold is always False, which would otherwise land
# in the reassuring branch.
print("\n" + "="*80)
print("FINAL VERDICT")
print("="*80)

print("\n1. CUMULATIVE TARGETS:")
cumulative_pct = leakage_df['is_cumulative'].mean() * 100
print(f"   {cumulative_pct:.1f}% of patterns are cumulative")
print("   ✓ This is EXPECTED by design - not a problem")

print("\n2. FORECAST_HORIZON DOMINANCE:")
horizon_importance = importances_full[importances_full['feature']=='forecast_horizon']['importance'].values[0]
print(f"   Horizon feature importance: {horizon_importance:.3f}")
print(f"   Performance drop without horizon: {performance_drop:.1f}%")
# The drop is relative to the full model's validation R²; it is only usable
# when it is a finite number.
drop_is_usable = bool(np.isfinite(performance_drop))
if not drop_is_usable:
    print("   ⚠️  Performance drop is undefined - verdict uses feature importance only")
if horizon_importance > 0.5 or (drop_is_usable and performance_drop > 50):
    print("   ⚠️  CRITICAL ISSUE: Model is over-reliant on forecast_horizon")
    print("   This suggests the model is learning the cumulative pattern, not delivery dynamics")
elif horizon_importance > 0.3 or (drop_is_usable and performance_drop > 25):
    print("   ⚠️  WARNING: Moderate dependence on forecast_horizon")
else:
    print("   ✓ Reasonable balance between horizon and other features")

print("\n3. PREDICTION CONSISTENCY:")
mean_actual_ratio = consistency_df['actual_ratio'].mean()
mean_pred_ratio = consistency_df['pred_full_ratio'].mean()
# The error is relative to the mean actual ratio, which is 0 when none of the
# sampled snapshots had deliveries within the longest horizon.
if mean_actual_ratio > 0:
    ratio_error = abs(mean_pred_ratio - mean_actual_ratio) / mean_actual_ratio * 100
    print(f"   Ratio prediction error: {ratio_error:.1f}%")
    if ratio_error > 50:
        print("   ⚠️  Poor consistency - model not capturing delivery patterns well")
    else:
        print("   ✓ Reasonable ratio consistency")
else:
    ratio_error = float('nan')
    print("   ⚠️  Cannot assess - mean actual ratio is zero or undefined")

print("\n4. PATTERN DIVERSITY:")
cv_delivery = patterns_df['avg_delivery_size'].std() / patterns_df['avg_delivery_size'].mean()
if np.isnan(cv_delivery):
    # std() is NaN with fewer than two rm_ids in the sample.
    print("   ⚠️  Cannot assess - too few RM_IDs to measure variation")
else:
    print(f"   Coefficient of variation in delivery sizes: {cv_delivery:.3f}")
    # NOTE(review): 2.0 is a heuristic cut-off - confirm it suits this data.
    if cv_delivery > 2.0:
        print("   ✓ High diversity - RM_IDs have distinct delivery patterns")
    else:
        print("   ⚠️  Low diversity - RM_IDs are similar")

print("\n" + "="*80)
print("RECOMMENDATIONS:")
print("="*80)
if horizon_importance > 0.5:
    print("• CRITICAL: Redesign features to capture actual delivery dynamics")
    print("• Consider: delivery frequency, seasonality, supplier patterns")
    print("• Avoid: Using horizon as a direct feature - use it only for target calculation")
if drop_is_usable and performance_drop > 50:
    print("• Model is learning shortcuts - need more informative features")
if patterns_df['avg_inter_arrival_days'].std() > 100:
    print("• High variability in delivery patterns - consider RM_ID-specific models")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

CRITICAL ISSUE ANALYSIS: LEAKAGE & PATTERN VERIFICATION

[1] LOADING DATA
Total receivals: 122383

[2] RECREATING TRAINING DATA
Total training samples: 2725

ANALYSIS 1: CUMULATIVE PATTERN CHECK
NOTE: Targets SHOULD be cumulative by design - this is expected!
--------------------------------------------------------------------------------

Total samples checked: 220
Percentage of cumulative patterns: 100.0%
Percentage with deliveries in ALL horizons: 23.2%

All patterns are cumulative (as expected by design)

ANALYSIS 2: FORECAST_HORIZON DOMINANCE TEST
Testing if model relies too heavily on forecast_horizon vs actual features
--------------------------------------------------------------------------------
Train samples: 1895, Val samples: 830

--- Test 1: Full Model (with forecast_horizon) ---
Train R²: 0.9836
Val R²: 0.5891

Feature Importances (Full Model):
             feature  importance
0   forecast_horizon    0.332059
5         count_365d    0.327771
1  total_weight_365d    0.201