In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

# Report banner.
_RULE = "=" * 80
print(_RULE)
print("COMPREHENSIVE DIAGNOSTIC ANALYSIS")
print(_RULE)

# ============================================================================
# LOAD DATA
# ============================================================================
print("\n[1] LOADING DATA")
# NOTE: purchase_orders is loaded here but not referenced by the code visible
# in this cell.
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
purchase_orders = pd.read_csv('./Project_materials/data/kernel/purchase_orders.csv')
prediction_mapping = pd.read_csv('./Project_materials/data/prediction_mapping.csv')

# Parse arrival timestamps as UTC, then drop the timezone so they can be
# compared with the naive timestamps used in the rest of the script.
arrival_utc = pd.to_datetime(receivals['date_arrival'], utc=True)
receivals['date_arrival'] = arrival_utc.dt.tz_localize(None)

# Keep only rows with a positive net weight and a known rm_id, ordered by
# arrival time.
usable_rows = (receivals['net_weight'] > 0) & receivals['rm_id'].notna()
receivals = receivals[usable_rows].sort_values('date_arrival')

first_arrival = receivals['date_arrival'].min()
last_arrival = receivals['date_arrival'].max()
print(f"Total receivals: {len(receivals)}")
print(f"Date range: {first_arrival} to {last_arrival}")
print(f"Unique rm_ids: {receivals['rm_id'].nunique()}")

# ============================================================================
# RECREATE TRAINING DATA (YOUR EXACT PROCESS)
# ============================================================================
print("\n[2] RECREATING TRAINING DATA")
print("-"*80)

# One snapshot date per month start in 2024; every snapshot is paired with
# each forecast horizon (in days).
train_dates = pd.date_range(start='2024-01-01', end='2024-11-30', freq='MS')
forecast_horizons = [7, 30, 60, 90, 150]

training_data = []
active_rm_ids = receivals[receivals['date_arrival'] >= '2024-01-01']['rm_id'].unique()

# Split receivals by rm_id once instead of re-masking the full frame for every
# (snapshot, rm_id, horizon) combination. Row order inside each group is the
# same as in `receivals`, so every sum below sees the same rows as before.
receivals_by_rm = {rm_id: rows for rm_id, rows in receivals.groupby('rm_id')}

# Latest arrival present in the data. A target window that ends after this
# point can only be summed over the part of the window that has data.
data_end = receivals['date_arrival'].max()
truncated_windows = 0

for train_date in train_dates:
    # Look-back cutoffs depend only on the snapshot date.
    cutoff_365 = train_date - timedelta(days=365)
    cutoff_180 = train_date - timedelta(days=180)
    cutoff_90 = train_date - timedelta(days=90)
    cutoff_30 = train_date - timedelta(days=30)

    for rm_id in active_rm_ids:
        rm_rows = receivals_by_rm.get(rm_id)
        if rm_rows is None:
            continue
        rm_arrivals = rm_rows['date_arrival']

        # Features only use deliveries strictly before the snapshot date.
        hist = rm_rows[rm_arrivals < train_date]
        if len(hist) == 0:
            continue

        recent_365 = hist[hist['date_arrival'] >= cutoff_365]
        recent_180 = hist[hist['date_arrival'] >= cutoff_180]
        recent_90 = hist[hist['date_arrival'] >= cutoff_90]
        recent_30 = hist[hist['date_arrival'] >= cutoff_30]

        if len(recent_365) > 0:
            total_365 = recent_365['net_weight'].sum()
            count_365 = len(recent_365)
            days_since = (train_date - recent_365['date_arrival'].max()).days
        else:
            # NOTE(review): with no delivery in the last 365 days,
            # days_since_last is 0, the same value as a delivery on the
            # snapshot day. Kept as-is because this section reproduces the
            # existing feature process; confirm this encoding is intended.
            total_365 = count_365 = days_since = 0

        if len(recent_180) > 0:
            total_180 = recent_180['net_weight'].sum()
            count_180 = len(recent_180)
        else:
            total_180 = count_180 = 0

        if len(recent_90) > 0:
            total_90 = recent_90['net_weight'].sum()
            count_90 = len(recent_90)
            # Weight each delivery by 1 / (age in days + 1).
            days_ago = (train_date - recent_90['date_arrival']).dt.days
            weights = 1.0 / (days_ago + 1)
            recency_weighted = (recent_90['net_weight'] * weights).sum()
            # Share of the 90-day window with at least one delivery.
            active_days_90 = recent_90['date_arrival'].dt.date.nunique()
            active_ratio_90 = active_days_90 / 90
        else:
            total_90 = count_90 = 0
            recency_weighted = 0
            active_ratio_90 = 0

        if len(recent_30) > 0:
            total_30 = recent_30['net_weight'].sum()
            count_30 = len(recent_30)
            rate_30 = total_30 / 30
        else:
            total_30 = count_30 = rate_30 = 0

        rate_90 = total_90 / 90 if total_90 > 0 else 0

        for horizon in forecast_horizons:
            forecast_end = train_date + timedelta(days=horizon)

            # Target: net weight arriving in [train_date, forecast_end].
            in_window = (rm_arrivals >= train_date) & (rm_arrivals <= forecast_end)
            target = rm_rows.loc[in_window, 'net_weight'].sum()

            if forecast_end > data_end:
                truncated_windows += 1

            training_data.append({
                'rm_id': rm_id,
                'train_date': train_date,
                'forecast_horizon': horizon,
                'total_weight_365d': total_365,
                'count_365d': count_365,
                'days_since_last': days_since,
                'total_weight_90d': total_90,
                'count_90d': count_90,
                'rate_90': rate_90,
                'total_weight_180d': total_180,
                'count_180d': count_180,
                'total_30': total_30,
                'count_30': count_30,
                'rate_30': rate_30,
                'recency_weighted': recency_weighted,
                'active_ratio_90': active_ratio_90,
                'target': target
            })

train_df = pd.DataFrame(training_data)
print(f"Total training samples: {len(train_df)}")
if truncated_windows:
    # These samples are kept (the process is reproduced unchanged), but their
    # targets are sums over an incomplete window.
    print(f"WARNING: {truncated_windows} samples have a target window ending after "
          f"the last receival ({data_end}); their targets cover a truncated window")

# ============================================================================
# ANALYSIS 1: TARGET DISTRIBUTION
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 1: TARGET DISTRIBUTION")
print("="*80)

all_targets = train_df['target']

print("\nTarget Statistics:")
print(all_targets.describe())

# Share of samples whose target is exactly zero, in percent.
zero_pct = all_targets.eq(0).mean() * 100
print(f"\nPercentage of ZERO targets: {zero_pct:.1f}%")
print(f"Percentage of NON-ZERO targets: {100-zero_pct:.1f}%")

non_zero_targets = all_targets[all_targets > 0]
print("\nNon-zero target statistics:")
print(non_zero_targets.describe())

# Percentiles
print("\nTarget percentiles:")
for pct in (10, 25, 50, 75, 90, 95, 99):
    print(f"  {pct}th percentile: {all_targets.quantile(pct/100):,.0f} kg")


def _draw_target_hist(ax, values, title):
    """Draw a 100-bin histogram of `values` on `ax` with a dashed mean marker."""
    ax.hist(values, bins=100, edgecolor='black')
    ax.set_xlabel('Target (kg)')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.axvline(values.mean(), color='red', linestyle='--', label=f'Mean: {values.mean():,.0f}')
    ax.legend()


# Plot: all targets on the left, non-zero targets on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

_draw_target_hist(
    axes[0], all_targets,
    f'All Targets (n={len(train_df)})\nZeros: {zero_pct:.1f}%',
)

# The right panel is left empty when every target is zero.
if len(non_zero_targets) > 0:
    _draw_target_hist(
        axes[1], non_zero_targets,
        f'Non-Zero Targets Only (n={len(non_zero_targets)})',
    )

plt.tight_layout()
plt.savefig('01_target_distribution.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 01_target_distribution.png")
plt.close()

# ============================================================================
# ANALYSIS 2: TARGET BY FORECAST HORIZON
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 2: TARGET BY FORECAST HORIZON")
print("="*80)

# Per-horizon target summary. Named aggregation labels each column directly,
# so no positional column rename is needed.
horizon_analysis = train_df.groupby('forecast_horizon').agg(
    count=('target', 'count'),
    mean=('target', 'mean'),
    std=('target', 'std'),
    median=('target', 'median'),
    zero_pct=('target', lambda s: (s == 0).mean() * 100),
)
print("\nTarget statistics by horizon:")
print(horizon_analysis)

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bars are placed at evenly spaced positions and labelled with the horizon.
# Using the horizon values themselves as x coordinates would draw default-width
# (0.8) bars on an axis spanning the full horizon range, leaving them
# hairline-thin and unevenly spaced.
bar_positions = np.arange(len(horizon_analysis))

axes[0].bar(bar_positions, horizon_analysis['mean'])
axes[0].set_xlabel('Forecast Horizon (days)')
axes[0].set_ylabel('Mean Target (kg)')
axes[0].set_title('Mean Target by Horizon')
axes[0].set_xticks(bar_positions)
axes[0].set_xticklabels(horizon_analysis.index)
axes[0].grid(axis='y', alpha=0.3)

axes[1].bar(bar_positions, horizon_analysis['zero_pct'], color='orange')
axes[1].set_xlabel('Forecast Horizon (days)')
axes[1].set_ylabel('Percentage Zero (%)')
axes[1].set_title('Percentage of Zero Targets by Horizon')
axes[1].set_xticks(bar_positions)
axes[1].set_xticklabels(horizon_analysis.index)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('02_target_by_horizon.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 02_target_by_horizon.png")
plt.close()

# ============================================================================
# ANALYSIS 3: TARGET BY RM_ID
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 3: TARGET BY RM_ID")
print("="*80)

# Per-rm_id target summary, most frequently sampled rm_ids first.
rm_analysis = train_df.groupby('rm_id').agg(
    count=('target', 'count'),
    mean=('target', 'mean'),
    std=('target', 'std'),
    zero_pct=('target', lambda s: (s == 0).mean() * 100),
    total=('target', 'sum'),
)
rm_analysis = rm_analysis.sort_values('count', ascending=False)

print("\nTop 20 rm_ids by frequency:")
print(rm_analysis.head(20))

print("\nTop 20 rm_ids by total weight:")
print(rm_analysis.sort_values('total', ascending=False).head(20))

rm_zero_share = rm_analysis['zero_pct']
print(f"\nRM_IDs that are 100% zeros: {rm_zero_share.eq(100).sum()}")
print(f"RM_IDs that are <50% zeros: {rm_zero_share.lt(50).sum()}")


def _barh_top_rm(ax, top_rows, column, xlabel, title):
    """Horizontal bar chart of `column` for `top_rows`, first row at the top."""
    slots = range(len(top_rows))
    ax.barh(slots, top_rows[column])
    ax.set_yticks(slots)
    ax.set_yticklabels([f"rm_{int(rm)}" for rm in top_rows.index])
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.invert_yaxis()


# Plot
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Count by rm_id (rm_analysis is already ordered by count).
_barh_top_rm(axes[0, 0], rm_analysis.head(20), 'count',
             'Number of Samples', 'Top 20 RM_IDs by Sample Count')

# Mean by rm_id
_barh_top_rm(axes[0, 1], rm_analysis.sort_values('mean', ascending=False).head(20), 'mean',
             'Mean Target (kg)', 'Top 20 RM_IDs by Mean Target')

# Zero percentage distribution
zero_ax = axes[1, 0]
zero_ax.hist(rm_zero_share, bins=50, edgecolor='black')
zero_ax.set_xlabel('Zero Percentage (%)')
zero_ax.set_ylabel('Number of RM_IDs')
zero_ax.set_title('Distribution of Zero Percentage Across RM_IDs')
zero_ax.axvline(rm_zero_share.median(), color='red', linestyle='--',
                label=f'Median: {rm_zero_share.median():.1f}%')
zero_ax.legend()

# Sample count distribution
count_ax = axes[1, 1]
count_ax.hist(rm_analysis['count'], bins=50, edgecolor='black')
count_ax.set_xlabel('Number of Samples')
count_ax.set_ylabel('Number of RM_IDs')
count_ax.set_title('Distribution of Sample Counts Across RM_IDs')

plt.tight_layout()
plt.savefig('03_target_by_rm_id.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 03_target_by_rm_id.png")
plt.close()

# ============================================================================
# ANALYSIS 4: TEMPORAL PATTERNS
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 4: TEMPORAL PATTERNS")
print("="*80)

# Target summary per calendar month of the snapshot date.
snapshot_month = train_df['train_date'].dt.to_period('M')
monthly_analysis = train_df.groupby(snapshot_month).agg(
    count=('target', 'count'),
    mean=('target', 'mean'),
    median=('target', 'median'),
    zero_pct=('target', lambda s: (s == 0).mean() * 100),
)
print("\nTarget statistics by month:")
print(monthly_analysis)

# Plot: one line chart per statistic, laid out row by row on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, extra line kwargs, y-axis label, title) for each panel.
monthly_panels = [
    ('mean', {}, 'Mean Target (kg)', 'Mean Target by Month'),
    ('zero_pct', {'color': 'orange'}, 'Zero Percentage (%)', 'Percentage of Zero Targets by Month'),
    ('count', {'color': 'green'}, 'Sample Count', 'Number of Samples by Month'),
    ('median', {'color': 'purple'}, 'Median Target (kg)', 'Median Target by Month'),
]

for panel_ax, (stat, line_kwargs, y_label, panel_title) in zip(axes.flat, monthly_panels):
    monthly_analysis[stat].plot(ax=panel_ax, marker='o', **line_kwargs)
    panel_ax.set_xlabel('Month')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('04_temporal_patterns.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 04_temporal_patterns.png")
plt.close()

# ============================================================================
# ANALYSIS 5: FEATURE-TARGET RELATIONSHIPS
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 5: FEATURE-TARGET RELATIONSHIPS")
print("="*80)

# NOTE(review): rm_id is in this list, so its Pearson correlation below treats
# the identifier as a numeric quantity; confirm that is intended.
features = ['forecast_horizon', 'total_weight_365d', 'rate_90', 'days_since_last',
            'total_weight_90d', 'count_365d', 'recency_weighted', 'active_ratio_90', 'rm_id']

# Correlations
correlations = train_df[features + ['target']].corr()['target'].sort_values(ascending=False)
print("\nFeature correlations with target:")
print(correlations)

# Plot scatter plots: three panels per row, with as many rows as the feature
# list needs (3x3 for the nine features above).
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
panel_axes = axes.ravel()

# Draw the plotting subsample once, with a fixed seed, so every panel shows the
# same rows and the figure is reproducible between runs.
sample = train_df.sample(min(5000, len(train_df)), random_state=42)

for ax, col in zip(panel_axes, features):
    ax.scatter(sample[col], sample['target'], alpha=0.3, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel('Target (kg)')
    ax.set_title(f'{col} vs Target\nCorr: {correlations[col]:.3f}')
    ax.grid(alpha=0.3)

# Hide any grid cells left over when the feature count is not a multiple of 3.
for ax in panel_axes[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('05_feature_target_relationships.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 05_feature_target_relationships.png")
plt.close()

# ============================================================================
# ANALYSIS 6: TRAIN/VAL SPLIT COMPARISON
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 6: TRAIN/VAL SPLIT COMPARISON")
print("="*80)

# Time-based split on the snapshot date: earlier snapshots train, later validate.
split_date = pd.to_datetime('2024-09-01')
train_mask = train_df['train_date'] < split_date
val_mask = train_df['train_date'] >= split_date

train_split = train_df[train_mask]
val_split = train_df[val_mask]

print(f"\nTraining set (before {split_date.date()}):")
print(f"  Samples: {len(train_split)}")
print(f"  Mean target: {train_split['target'].mean():,.0f} kg")
print(f"  Median target: {train_split['target'].median():,.0f} kg")
print(f"  Zero percentage: {(train_split['target'] == 0).mean() * 100:.1f}%")

print(f"\nValidation set (>= {split_date.date()}):")
print(f"  Samples: {len(val_split)}")
print(f"  Mean target: {val_split['target'].mean():,.0f} kg")
print(f"  Median target: {val_split['target'].median():,.0f} kg")
print(f"  Zero percentage: {(val_split['target'] == 0).mean() * 100:.1f}%")

# Feature distributions comparison
print("\nFeature distribution comparison (train vs val):")
for feat in features:
    feat_train_mean = train_split[feat].mean()
    feat_val_mean = val_split[feat].mean()
    # The small epsilon keeps the ratio finite when the train mean is 0.
    diff_pct = ((feat_val_mean - feat_train_mean) / (feat_train_mean + 1e-10)) * 100
    print(f"  {feat:20s}: Train={feat_train_mean:12.2f}, Val={feat_val_mean:12.2f}, Diff={diff_pct:+6.1f}%")

# Plot
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Target distributions
axes[0, 0].hist(train_split['target'], bins=100, alpha=0.5, label='Train', edgecolor='black')
axes[0, 0].hist(val_split['target'], bins=100, alpha=0.5, label='Val', edgecolor='black')
axes[0, 0].set_xlabel('Target (kg)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Target Distribution: Train vs Val')
axes[0, 0].legend()

# Non-zero target distributions
train_nonzero = train_split[train_split['target'] > 0]['target']
val_nonzero = val_split[val_split['target'] > 0]['target']
axes[0, 1].hist(train_nonzero, bins=100, alpha=0.5, label='Train', edgecolor='black')
axes[0, 1].hist(val_nonzero, bins=100, alpha=0.5, label='Val', edgecolor='black')
axes[0, 1].set_xlabel('Target (kg)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Non-Zero Target Distribution: Train vs Val')
axes[0, 1].legend()

# Zero percentage by horizon. The validation series is reindexed to the
# training horizons so each pair of bars refers to the same horizon even if a
# horizon is missing from one split (a missing value draws no bar).
train_zero_pct = train_split['target'].eq(0).groupby(train_split['forecast_horizon']).mean() * 100
val_zero_pct = (
    val_split['target'].eq(0).groupby(val_split['forecast_horizon']).mean() * 100
).reindex(train_zero_pct.index)
x = np.arange(len(train_zero_pct))
width = 0.35
axes[1, 0].bar(x - width/2, train_zero_pct.values, width, label='Train')
axes[1, 0].bar(x + width/2, val_zero_pct.values, width, label='Val')
axes[1, 0].set_xlabel('Forecast Horizon (days)')
axes[1, 0].set_ylabel('Zero Percentage (%)')
axes[1, 0].set_title('Zero Percentage by Horizon: Train vs Val')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(train_zero_pct.index)
axes[1, 0].legend()
axes[1, 0].grid(axis='y', alpha=0.3)

# Mean target by horizon, aligned to the same horizons as the panel above.
train_horizon_mean = train_split.groupby('forecast_horizon')['target'].mean().reindex(train_zero_pct.index)
val_horizon_mean = val_split.groupby('forecast_horizon')['target'].mean().reindex(train_zero_pct.index)
axes[1, 1].bar(x - width/2, train_horizon_mean.values, width, label='Train')
axes[1, 1].bar(x + width/2, val_horizon_mean.values, width, label='Val')
axes[1, 1].set_xlabel('Forecast Horizon (days)')
axes[1, 1].set_ylabel('Mean Target (kg)')
axes[1, 1].set_title('Mean Target by Horizon: Train vs Val')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(train_horizon_mean.index)
axes[1, 1].legend()
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('06_train_val_comparison.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 06_train_val_comparison.png")
plt.close()

# ============================================================================
# ANALYSIS 7: TEST SET (2025) CHARACTERISTICS
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 7: TEST SET (2025) CHARACTERISTICS")
print("="*80)

forecast_start = pd.to_datetime('2025-01-01')


def _test_rm_summary(rm_id):
    """Summarise the receival history of `rm_id` before `forecast_start`."""
    past = receivals[
        (receivals['rm_id'] == rm_id) &
        (receivals['date_arrival'] < forecast_start)
    ]

    if len(past) == 0:
        # No deliveries at all: days_since_last gets the sentinel 9999.
        return {
            'rm_id': rm_id,
            'has_history': False,
            'days_since_last': 9999,
            'total_weight_365d': 0,
            'count_365d': 0
        }

    window_start = forecast_start - timedelta(days=365)
    last_year = past[past['date_arrival'] >= window_start]
    delivered_last_year = len(last_year) > 0

    return {
        'rm_id': rm_id,
        'has_history': True,
        # Measured from the most recent delivery in the whole history.
        'days_since_last': (forecast_start - past['date_arrival'].max()).days,
        'total_weight_365d': last_year['net_weight'].sum() if delivered_last_year else 0,
        'count_365d': len(last_year)
    }


# Compute features for test set
test_features = [_test_rm_summary(rm_id) for rm_id in prediction_mapping['rm_id'].unique()]
test_features_df = pd.DataFrame(test_features)

known_mask = test_features_df['has_history']
idle_days = test_features_df['days_since_last']

print(f"\nTest set RM_IDs: {len(test_features_df)}")
print(f"RM_IDs with no history: {(~known_mask).sum()}")
print(f"RM_IDs with history: {known_mask.sum()}")

print("\nDays since last delivery (for RM_IDs with history):")
days_since_stats = test_features_df[known_mask]['days_since_last'].describe()
print(days_since_stats)

# These tallies run over all test RM_IDs, so RM_IDs without history (sentinel
# 9999) are counted in each of them.
print(f"\nRM_IDs inactive >365 days: {(idle_days > 365).sum()}")
print(f"RM_IDs inactive >180 days: {(idle_days > 180).sum()}")
print(f"RM_IDs inactive >90 days: {(idle_days > 90).sum()}")

# Plot
fig, (gap_ax, weight_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Days since last (RM_IDs with history only)
gap_ax.hist(test_features_df[known_mask]['days_since_last'], bins=50, edgecolor='black')
gap_ax.set_xlabel('Days Since Last Delivery')
gap_ax.set_ylabel('Number of RM_IDs')
gap_ax.set_title('Test Set: Days Since Last Delivery')
gap_ax.axvline(365, color='red', linestyle='--', label='365 days')
gap_ax.axvline(180, color='orange', linestyle='--', label='180 days')
gap_ax.legend()
gap_ax.grid(axis='y', alpha=0.3)

# Total weight 365d (RM_IDs with a positive total only)
positive_weight = test_features_df['total_weight_365d'] > 0
weight_ax.hist(test_features_df[positive_weight]['total_weight_365d'],
               bins=50, edgecolor='black')
weight_ax.set_xlabel('Total Weight in Last 365 Days (kg)')
weight_ax.set_ylabel('Number of RM_IDs')
weight_ax.set_title('Test Set: Historical Weight (Last 365 Days)')
weight_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('07_test_set_characteristics.png', dpi=150, bbox_inches='tight')
print("\n✅ Saved: 07_test_set_characteristics.png")
plt.close()

# ============================================================================
# ANALYSIS 8: DATA LEAKAGE CHECK
# ============================================================================
print("\n" + "="*80)
print("ANALYSIS 8: DATA LEAKAGE CHECK")
print("="*80)

# Check if forecast_horizon dominance makes sense: summarise the target per
# horizon, then report the overall Pearson correlation.
print("\nChecking forecast_horizon relationship:")
target_by_horizon = train_df.groupby('forecast_horizon')['target']
print(target_by_horizon.agg(['count', 'mean', 'std']))

horizon_target_corr = train_df['forecast_horizon'].corr(train_df['target'])
print("\nCorrelation between forecast_horizon and target:")
print(f"  Pearson: {horizon_target_corr:.3f}")

# Print the first rows for a manual look at the feature and target values.
leakage_columns = ['rm_id', 'train_date', 'forecast_horizon', 'total_weight_365d',
                   'days_since_last', 'target']
print("\nSample of data to check for leakage:")
print(train_df[leakage_columns].head(20))

# ============================================================================
# SUMMARY AND RECOMMENDATIONS
# ============================================================================
def _grade_percentage(value, labels):
    """Return labels[0] above 50, labels[1] above 30, otherwise labels[2]."""
    if value > 50:
        return labels[0]
    if value > 30:
        return labels[1]
    return labels[2]


print("\n" + "="*80)
print("SUMMARY AND KEY FINDINGS")
print("="*80)

zero_grade = _grade_percentage(zero_pct, ('SEVERE', 'MODERATE', 'MILD'))
print("\n1. ZERO INFLATION:")
print(f"   - {zero_pct:.1f}% of training targets are exactly zero")
print(f"   - This is {zero_grade}")

overall_mean = train_df['target'].mean()
overall_median = train_df['target'].median()
# Labelled "skew" when the mean is more than twice the median.
spread_word = 'skew' if overall_mean > 2 * overall_median else 'variance'
print("\n2. TARGET DISTRIBUTION:")
print(f"   - Mean: {overall_mean:,.0f} kg")
print(f"   - Median: {overall_median:,.0f} kg")
print(f"   - High {spread_word}")

print("\n3. TEMPORAL STABILITY:")
train_mean = train_split['target'].mean()
val_mean = val_split['target'].mean()
# Relative gap between validation and training mean target, in percent.
temporal_diff = abs(val_mean - train_mean) / train_mean * 100
stability = 'UNSTABLE' if temporal_diff > 20 else 'STABLE'
print(f"   - Train mean: {train_mean:,.0f} kg")
print(f"   - Val mean: {val_mean:,.0f} kg")
print(f"   - Difference: {temporal_diff:.1f}% ({stability})")

print("\n4. TEST SET CHARACTERISTICS:")
long_idle = test_features_df['days_since_last'] > 365
inactive_pct = long_idle.sum() / len(test_features_df) * 100
inactive_grade = _grade_percentage(inactive_pct, ('VERY HIGH', 'HIGH', 'MODERATE'))
print(f"   - {inactive_pct:.1f}% of test RM_IDs inactive >365 days")
print(f"   - This is {inactive_grade}")

horizon_corr = correlations['forecast_horizon']
horizon_verdict = 'EXPECTED' if abs(horizon_corr) > 0.5 else 'SUSPICIOUS'
print("\n5. FEATURE IMPORTANCE:")
print(f"   - forecast_horizon correlation: {horizon_corr:.3f}")
print(f"   - This is {horizon_verdict}")

print("\n" + "="*80)
print("DIAGNOSTIC COMPLETE")
print("="*80)
print("\nAll plots saved:")
for plot_file in (
    "01_target_distribution.png",
    "02_target_by_horizon.png",
    "03_target_by_rm_id.png",
    "04_temporal_patterns.png",
    "05_feature_target_relationships.png",
    "06_train_val_comparison.png",
    "07_test_set_characteristics.png",
):
    print(f"  - {plot_file}")

COMPREHENSIVE DIAGNOSTIC ANALYSIS

[1] LOADING DATA
Total receivals: 122383
Date range: 2004-06-15 11:34:00 to 2024-12-19 13:36:00
Unique rm_ids: 203

[2] RECREATING TRAINING DATA
--------------------------------------------------------------------------------
Total training samples: 2725

ANALYSIS 1: TARGET DISTRIBUTION

Target Statistics:
count    2.725000e+03
mean     3.048785e+05
std      7.993253e+05
min      0.000000e+00
25%      0.000000e+00
50%      2.556000e+04
75%      1.681800e+05
max      8.900645e+06
Name: target, dtype: float64

Percentage of ZERO targets: 33.4%
Percentage of NON-ZERO targets: 66.6%

Non-zero target statistics:
count    1.816000e+03
mean     4.574856e+05
std      9.428974e+05
min      1.370000e+02
25%      2.556000e+04
50%      9.659700e+04
75%      4.062238e+05
max      8.900645e+06
Name: target, dtype: float64

Target percentiles:
  10th percentile: 0 kg
  25th percentile: 0 kg
  50th percentile: 25,560 kg
  75th percentile: 168,180 kg
  90th percentile