In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("DELIVERY TIMING PATTERN ANALYSIS")
print("="*80)

# ============================================================================
# LOAD DATA
# ============================================================================
print("\n[1] LOADING DATA")
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')

# Convert dates
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], utc=True).dt.tz_localize(None)
receivals = receivals[receivals['net_weight'] > 0]
receivals = receivals[receivals['rm_id'].notna()]
receivals = receivals.sort_values('date_arrival')

print(f"Total receivals: {len(receivals)}")

# ============================================================================
# DELIVERY INTERVAL ANALYSIS
# ============================================================================
print("\n[2] ANALYZING DELIVERY INTERVALS")
print("-"*80)

pattern_analysis = []

# Get RM_IDs that were active in 2024 (these are what the model trains on)
active_2024 = receivals[receivals['date_arrival'] >= '2024-01-01']['rm_id'].unique()
print(f"RM_IDs active in 2024: {len(active_2024)}")

for rm_id in active_2024:
    rm_hist = receivals[receivals['rm_id'] == rm_id].sort_values('date_arrival')
    
    if len(rm_hist) > 1:
        # Calculate inter-arrival times
        rm_hist = rm_hist.copy()
        rm_hist['days_between'] = rm_hist['date_arrival'].diff().dt.days
        
        intervals = rm_hist['days_between'].iloc[1:]  # Skip first NaN
        
        # Calculate regularity metrics
        if len(intervals) > 0 and intervals.mean() > 0:
            cv = intervals.std() / intervals.mean()
        else:
            cv = 999
        
        # Check for seasonality/patterns
        # Weekly pattern: many intervals are multiples of 7
        if len(intervals[intervals > 0]) > 0:
            weekly_pattern = (intervals % 7 == 0).sum() / len(intervals[intervals > 0])
            monthly_pattern = ((intervals >= 25) & (intervals <= 35)).sum() / len(intervals[intervals > 0])
        else:
            weekly_pattern = 0
            monthly_pattern = 0
        
        pattern_analysis.append({
            'rm_id': rm_id,
            'total_deliveries': len(rm_hist),
            'avg_days_between': intervals.mean(),
            'median_days_between': intervals.median(),
            'std_days_between': intervals.std(),
            'min_days_between': intervals.min(),
            'max_days_between': intervals.max(),
            'cv_days_between': cv,
            'avg_delivery_size': rm_hist['net_weight'].mean(),
            'total_weight': rm_hist['net_weight'].sum(),
            'weekly_pattern_pct': weekly_pattern * 100,
            'monthly_pattern_pct': monthly_pattern * 100
        })

pattern_df = pd.DataFrame(pattern_analysis)

print("\n=== OVERALL TIMING STATISTICS ===")
print(f"Total RM_IDs analyzed: {len(pattern_df)}")
print(f"\nDays between deliveries:")
print(f"  Mean: {pattern_df['avg_days_between'].mean():.1f} days")
print(f"  Median: {pattern_df['median_days_between'].median():.1f} days")
print(f"  Std: {pattern_df['std_days_between'].mean():.1f} days")

print(f"\nDelivery regularity (Coefficient of Variation):")
regular_patterns = pattern_df[pattern_df['cv_days_between'] < 100]
print(f"  Mean CV: {regular_patterns['cv_days_between'].mean():.2f}")
print(f"  Median CV: {regular_patterns['cv_days_between'].median():.2f}")

print(f"\nPattern classification:")
very_regular = (pattern_df['cv_days_between'] < 0.5).sum()
regular = ((pattern_df['cv_days_between'] >= 0.5) & (pattern_df['cv_days_between'] < 1.0)).sum()
irregular = ((pattern_df['cv_days_between'] >= 1.0) & (pattern_df['cv_days_between'] < 2.0)).sum()
very_irregular = (pattern_df['cv_days_between'] >= 2.0).sum()

print(f"  Very Regular (CV < 0.5): {very_regular} ({very_regular/len(pattern_df)*100:.1f}%)")
print(f"  Regular (0.5 ≤ CV < 1.0): {regular} ({regular/len(pattern_df)*100:.1f}%)")
print(f"  Irregular (1.0 ≤ CV < 2.0): {irregular} ({irregular/len(pattern_df)*100:.1f}%)")
print(f"  Very Irregular (CV ≥ 2.0): {very_irregular} ({very_irregular/len(pattern_df)*100:.1f}%)")

print(f"\nPeriodicity patterns:")
print(f"  Weekly pattern (≥25% intervals divisible by 7): {(pattern_df['weekly_pattern_pct'] >= 25).sum()} RM_IDs")
print(f"  Monthly pattern (≥25% intervals 25-35 days): {(pattern_df['monthly_pattern_pct'] >= 25).sum()} RM_IDs")

# ============================================================================
# DELIVERY FREQUENCY DISTRIBUTION
# ============================================================================
print("\n[3] DELIVERY FREQUENCY DISTRIBUTION")
print("-"*80)

# Classify RM_IDs by delivery frequency
def classify_frequency(avg_days):
    if avg_days < 7:
        return 'Very Frequent (<7d)'
    elif avg_days < 30:
        return 'Frequent (7-30d)'
    elif avg_days < 90:
        return 'Moderate (30-90d)'
    elif avg_days < 180:
        return 'Infrequent (90-180d)'
    else:
        return 'Rare (>180d)'

pattern_df['frequency_class'] = pattern_df['avg_days_between'].apply(classify_frequency)
freq_dist = pattern_df['frequency_class'].value_counts()
print("\nFrequency distribution:")
print(freq_dist)
print("\nFrequency distribution (%):")
print((freq_dist / len(pattern_df) * 100).round(1))

# ============================================================================
# CRITICAL INSIGHT: WHAT HAPPENS IN 7 vs 150 DAYS?
# ============================================================================
print("\n[4] EXPECTED DELIVERIES IN DIFFERENT HORIZONS")
print("-"*80)

# For each RM_ID, calculate expected number of deliveries in 7, 30, 60, 90, 150 days
pattern_df['expected_deliveries_7d'] = 7 / pattern_df['avg_days_between']
pattern_df['expected_deliveries_30d'] = 30 / pattern_df['avg_days_between']
pattern_df['expected_deliveries_60d'] = 60 / pattern_df['avg_days_between']
pattern_df['expected_deliveries_90d'] = 90 / pattern_df['avg_days_between']
pattern_df['expected_deliveries_150d'] = 150 / pattern_df['avg_days_between']

print("\nExpected deliveries by horizon (mean across RM_IDs):")
print(f"  7 days: {pattern_df['expected_deliveries_7d'].mean():.2f} deliveries")
print(f"  30 days: {pattern_df['expected_deliveries_30d'].mean():.2f} deliveries")
print(f"  60 days: {pattern_df['expected_deliveries_60d'].mean():.2f} deliveries")
print(f"  90 days: {pattern_df['expected_deliveries_90d'].mean():.2f} deliveries")
print(f"  150 days: {pattern_df['expected_deliveries_150d'].mean():.2f} deliveries")

# Calculate expected ratios
pattern_df['ratio_150_to_7'] = pattern_df['expected_deliveries_150d'] / (pattern_df['expected_deliveries_7d'] + 0.1)
print(f"\nExpected ratio (150d / 7d): {pattern_df['ratio_150_to_7'].mean():.2f}")
print(f"Median ratio: {pattern_df['ratio_150_to_7'].median():.2f}")
print(f"Std ratio: {pattern_df['ratio_150_to_7'].std():.2f}")

# ============================================================================
# PROBABILITY OF ZERO DELIVERIES
# ============================================================================
print("\n[5] PROBABILITY OF ZERO DELIVERIES BY HORIZON")
print("-"*80)

# For a Poisson-like process, probability of 0 events = exp(-lambda)
# where lambda = expected number of events
pattern_df['prob_zero_7d'] = np.exp(-pattern_df['expected_deliveries_7d'])
pattern_df['prob_zero_30d'] = np.exp(-pattern_df['expected_deliveries_30d'])
pattern_df['prob_zero_90d'] = np.exp(-pattern_df['expected_deliveries_90d'])
pattern_df['prob_zero_150d'] = np.exp(-pattern_df['expected_deliveries_150d'])

print("\nExpected % of RM_IDs with ZERO deliveries:")
print(f"  7 days: {pattern_df['prob_zero_7d'].mean() * 100:.1f}%")
print(f"  30 days: {pattern_df['prob_zero_30d'].mean() * 100:.1f}%")
print(f"  90 days: {pattern_df['prob_zero_90d'].mean() * 100:.1f}%")
print(f"  150 days: {pattern_df['prob_zero_150d'].mean() * 100:.1f}%")

# ============================================================================
# EXAMPLES OF DIFFERENT PATTERNS
# ============================================================================
print("\n[6] EXAMPLE RM_IDs WITH DIFFERENT PATTERNS")
print("-"*80)

print("\n--- Very Regular Deliveries ---")
very_regular_rms = pattern_df.nsmallest(5, 'cv_days_between')
print(very_regular_rms[['rm_id', 'total_deliveries', 'avg_days_between', 'cv_days_between', 
                         'expected_deliveries_7d', 'expected_deliveries_150d']])

print("\n--- Very Irregular Deliveries ---")
very_irregular_rms = pattern_df[pattern_df['cv_days_between'] < 100].nlargest(5, 'cv_days_between')
print(very_irregular_rms[['rm_id', 'total_deliveries', 'avg_days_between', 'cv_days_between',
                           'expected_deliveries_7d', 'expected_deliveries_150d']])

print("\n--- Frequent Deliverers ---")
frequent_rms = pattern_df.nsmallest(5, 'avg_days_between')
print(frequent_rms[['rm_id', 'total_deliveries', 'avg_days_between', 'cv_days_between',
                     'expected_deliveries_7d', 'expected_deliveries_150d']])

print("\n--- Rare Deliverers ---")
rare_rms = pattern_df.nlargest(5, 'avg_days_between')
print(rare_rms[['rm_id', 'total_deliveries', 'avg_days_between', 'cv_days_between',
                'expected_deliveries_7d', 'expected_deliveries_150d']])

# ============================================================================
# VISUALIZATIONS
# ============================================================================
print("\n[7] GENERATING VISUALIZATIONS")

fig, axes = plt.subplots(3, 2, figsize=(14, 15))

# Plot 1: Distribution of average days between deliveries
ax = axes[0, 0]
ax.hist(pattern_df['avg_days_between'], bins=50, edgecolor='black')
ax.axvline(pattern_df['avg_days_between'].mean(), color='red', linestyle='--', 
           label=f'Mean: {pattern_df["avg_days_between"].mean():.1f}d')
ax.axvline(pattern_df['avg_days_between'].median(), color='orange', linestyle='--',
           label=f'Median: {pattern_df["avg_days_between"].median():.1f}d')
ax.set_xlabel('Average Days Between Deliveries')
ax.set_ylabel('Number of RM_IDs')
ax.set_title('Distribution of Delivery Intervals')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Plot 2: CV distribution
ax = axes[0, 1]
cv_plot = pattern_df[pattern_df['cv_days_between'] < 5]  # Remove extreme outliers for viz
ax.hist(cv_plot['cv_days_between'], bins=50, edgecolor='black')
ax.axvline(1.0, color='red', linestyle='--', label='CV = 1.0 (irregular)')
ax.axvline(0.5, color='orange', linestyle='--', label='CV = 0.5 (regular)')
ax.set_xlabel('Coefficient of Variation')
ax.set_ylabel('Number of RM_IDs')
ax.set_title('Distribution of Delivery Regularity')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Plot 3: Expected deliveries by horizon
ax = axes[1, 0]
horizon_means = [
    pattern_df['expected_deliveries_7d'].mean(),
    pattern_df['expected_deliveries_30d'].mean(),
    pattern_df['expected_deliveries_60d'].mean(),
    pattern_df['expected_deliveries_90d'].mean(),
    pattern_df['expected_deliveries_150d'].mean()
]
horizons = [7, 30, 60, 90, 150]
ax.bar(horizons, horizon_means)
ax.set_xlabel('Forecast Horizon (days)')
ax.set_ylabel('Expected Number of Deliveries')
ax.set_title('Expected Deliveries by Horizon')
ax.grid(axis='y', alpha=0.3)

# Plot 4: Probability of zero deliveries
ax = axes[1, 1]
zero_probs = [
    pattern_df['prob_zero_7d'].mean() * 100,
    pattern_df['prob_zero_30d'].mean() * 100,
    pattern_df['prob_zero_90d'].mean() * 100,
    pattern_df['prob_zero_150d'].mean() * 100
]
horizons_zero = [7, 30, 90, 150]
ax.bar(horizons_zero, zero_probs, color='orange')
ax.set_xlabel('Forecast Horizon (days)')
ax.set_ylabel('Expected % with Zero Deliveries')
ax.set_title('Probability of Zero Deliveries by Horizon')
ax.grid(axis='y', alpha=0.3)

# Plot 5: Frequency classes
ax = axes[2, 0]
freq_dist.plot(kind='bar', ax=ax, color='steelblue')
ax.set_xlabel('Delivery Frequency Class')
ax.set_ylabel('Number of RM_IDs')
ax.set_title('RM_ID Distribution by Delivery Frequency')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

# Plot 6: Avg interval vs CV (regularity)
ax = axes[2, 1]
scatter_df = pattern_df[pattern_df['cv_days_between'] < 5]
scatter = ax.scatter(scatter_df['avg_days_between'], scatter_df['cv_days_between'],
                     s=scatter_df['total_deliveries']*2, alpha=0.6,
                     c=scatter_df['total_deliveries'], cmap='viridis')
ax.set_xlabel('Average Days Between Deliveries')
ax.set_ylabel('Coefficient of Variation')
ax.set_title('Delivery Pattern Map\n(size & color = total deliveries)')
ax.axhline(1.0, color='red', linestyle='--', alpha=0.5, label='CV = 1.0')
ax.axhline(0.5, color='orange', linestyle='--', alpha=0.5, label='CV = 0.5')
ax.legend()
ax.grid(alpha=0.3)
plt.colorbar(scatter, ax=ax, label='Total Deliveries')

plt.tight_layout()
plt.savefig('delivery_timing_patterns.png', dpi=150, bbox_inches='tight')
print("✅ Saved: delivery_timing_patterns.png")
plt.close()

# ============================================================================
# FINAL RECOMMENDATIONS
# ============================================================================
print("\n" + "="*80)
print("CRITICAL INSIGHTS FOR FEATURE ENGINEERING")
print("="*80)

avg_cv = regular_patterns['cv_days_between'].mean()
pct_regular = ((pattern_df['cv_days_between'] < 1.0).sum() / len(pattern_df) * 100)

print("\n1. DELIVERY TIMING PATTERNS:")
if avg_cv > 1.5:
    print("   ⚠️  HIGHLY IRREGULAR: Most RM_IDs have unpredictable delivery timing")
elif avg_cv > 1.0:
    print("   ⚠️  IRREGULAR: Delivery timing is somewhat unpredictable")
else:
    print("   ✓ REGULAR: Many RM_IDs have predictable delivery patterns")

print(f"   Average CV: {avg_cv:.2f}")
print(f"   Regular patterns (<CV 1.0): {pct_regular:.1f}%")

print("\n2. HORIZON-SPECIFIC INSIGHTS:")
avg_expected_7d = pattern_df['expected_deliveries_7d'].mean()
avg_expected_150d = pattern_df['expected_deliveries_150d'].mean()
print(f"   7-day horizon: {avg_expected_7d:.2f} expected deliveries → {pattern_df['prob_zero_7d'].mean()*100:.1f}% zeros")
print(f"   150-day horizon: {avg_expected_150d:.2f} expected deliveries → {pattern_df['prob_zero_150d'].mean()*100:.1f}% zeros")

print("\n3. FEATURES YOU SHOULD ADD:")
print("   ✓ avg_days_between_deliveries (per RM_ID)")
print("   ✓ cv_days_between_deliveries (regularity metric)")
print("   ✓ expected_deliveries_in_horizon = horizon / avg_days_between")
print("   ✓ days_since_last / avg_days_between (how 'overdue' is next delivery)")
print("   ✓ weekly/monthly pattern indicators")

print("\n4. MODELING APPROACH:")
if pattern_df['prob_zero_7d'].mean() > 0.3:
    print("   → TWO-STAGE MODEL recommended (high zero probability)")
    print("     Stage 1: Classify will_deliver (yes/no)")
    print("     Stage 2: Predict amount if yes")
else:
    print("   → SINGLE-STAGE with zero-inflated features may work")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

DELIVERY TIMING PATTERN ANALYSIS

[1] LOADING DATA
Total receivals: 122383

[2] ANALYZING DELIVERY INTERVALS
--------------------------------------------------------------------------------
RM_IDs active in 2024: 60

=== OVERALL TIMING STATISTICS ===
Total RM_IDs analyzed: 54

Days between deliveries:
  Mean: 35.9 days
  Median: 2.5 days
  Std: 43.0 days

Delivery regularity (Coefficient of Variation):
  Mean CV: 2.75
  Median CV: 1.90

Pattern classification:
  Very Regular (CV < 0.5): 0 (0.0%)
  Regular (0.5 ≤ CV < 1.0): 11 (20.4%)
  Irregular (1.0 ≤ CV < 2.0): 17 (31.5%)
  Very Irregular (CV ≥ 2.0): 21 (38.9%)

Periodicity patterns:
  Weekly pattern (≥25% intervals divisible by 7): 43 RM_IDs
  Monthly pattern (≥25% intervals 25-35 days): 4 RM_IDs

[3] DELIVERY FREQUENCY DISTRIBUTION
--------------------------------------------------------------------------------

Frequency distribution:
frequency_class
Very Frequent (<7d)     28
Frequent (7-30d)        10
Moderate (30-90d)        9
