In [5]:
"""
Comprehensive EDA to test breakthrough hypotheses for breaking the 6,000 barrier
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv', parse_dates=['date_arrival'])
receivals = receivals.sort_values(['rm_id', 'date_arrival'])

# Focus on 2024 data
receivals_2024 = receivals[receivals['date_arrival'].dt.year == 2024].copy()
active_rms = receivals_2024['rm_id'].unique()
print(f"Active RMs in 2024: {len(active_rms)}")

# =============================================================================
# HYPOTHESIS 1: DELIVERY MOMENTUM PATTERNS
# =============================================================================
print("\n" + "="*80)
print("HYPOTHESIS 1: DELIVERY MOMENTUM PATTERNS")
print("="*80)

def analyze_delivery_bursts(df):
    """Analyze burst patterns in deliveries"""
    results = []
    
    for rm_id in active_rms[:20]:  # Sample for speed
        rm_data = df[df['rm_id'] == rm_id].copy()
        if len(rm_data) < 5:
            continue
            
        # Calculate gaps between deliveries
        rm_data = rm_data.sort_values('date_arrival')
        gaps = rm_data['date_arrival'].diff().dt.days.dropna()
        
        # Define burst as gap < 7 days
        burst_threshold = 7
        
        # Find bursts
        bursts = []
        current_burst = []
        
        for i, gap in enumerate(gaps):
            if gap <= burst_threshold:
                if not current_burst:
                    current_burst.append(i)
                current_burst.append(i+1)
            else:
                if len(current_burst) > 1:
                    bursts.append(len(current_burst))
                current_burst = []
        
        if len(current_burst) > 1:
            bursts.append(len(current_burst))
        
        results.append({
            'rm_id': rm_id,
            'total_deliveries': len(rm_data),
            'num_bursts': len(bursts),
            'avg_burst_length': np.mean(bursts) if bursts else 0,
            'max_burst_length': max(bursts) if bursts else 0,
            'avg_gap_days': gaps.mean(),
            'median_gap_days': gaps.median(),
            'gap_std': gaps.std(),
            'pct_in_bursts': sum(bursts) / len(rm_data) * 100 if bursts else 0
        })
    
    return pd.DataFrame(results)

burst_analysis = analyze_delivery_bursts(receivals_2024)
print("\nBurst Analysis Summary:")
print(burst_analysis.describe())
print("\nKey Findings:")
print(f"- Average % of deliveries in bursts: {burst_analysis['pct_in_bursts'].mean():.1f}%")
print(f"- Average burst length: {burst_analysis['avg_burst_length'].mean():.1f} deliveries")
print(f"- Median gap between deliveries: {burst_analysis['median_gap_days'].median():.1f} days")

# Momentum Analysis
print("\nMomentum Analysis:")
for rm_id in active_rms[:10]:
    rm_data = receivals_2024[receivals_2024['rm_id'] == rm_id].copy()
    if len(rm_data) < 10:
        continue
    
    # Calculate rolling momentum (acceleration of delivery rate)
    rm_data = rm_data.sort_values('date_arrival')
    rm_data['days_since_start'] = (rm_data['date_arrival'] - rm_data['date_arrival'].min()).dt.days
    
    # 30-day rolling sum
    rm_data.set_index('date_arrival', inplace=True)
    rm_data['rolling_30d'] = rm_data['net_weight'].rolling('30D').sum()
    rm_data['rolling_7d'] = rm_data['net_weight'].rolling('7D').sum()
    
    # Momentum = recent/longer-term
    rm_data['momentum'] = rm_data['rolling_7d'] / (rm_data['rolling_30d'] + 1)
    
    # Check if momentum predicts next period
    rm_data['next_7d'] = rm_data['net_weight'].shift(-1).rolling('7D').sum()
    
    correlation = rm_data[['momentum', 'next_7d']].corr().iloc[0, 1]
    if not np.isnan(correlation):
        print(f"RM {rm_id}: Momentum-Future correlation = {correlation:.3f}")

# =============================================================================
# HYPOTHESIS 2: RM INTERDEPENDENCIES
# =============================================================================
print("\n" + "="*80)
print("HYPOTHESIS 2: RM INTERDEPENDENCIES")
print("="*80)

# Create daily delivery matrix
daily_deliveries = receivals_2024.groupby(['date_arrival', 'rm_id'])['net_weight'].sum().unstack(fill_value=0)
daily_binary = (daily_deliveries > 0).astype(int)

# Co-occurrence analysis
print("\nCo-occurrence Analysis (sampling 20 RMs):")
sample_rms = active_rms[:20]
cooccurrence = pd.DataFrame(index=sample_rms, columns=sample_rms, dtype=float)

for rm1 in sample_rms:
    for rm2 in sample_rms:
        if rm1 != rm2 and rm1 in daily_binary.columns and rm2 in daily_binary.columns:
            # Count days both delivered
            both_delivered = ((daily_binary[rm1] == 1) & (daily_binary[rm2] == 1)).sum()
            rm1_delivered = (daily_binary[rm1] == 1).sum()
            if rm1_delivered > 0:
                cooccurrence.loc[rm1, rm2] = both_delivered / rm1_delivered

# Find strong co-occurrences
strong_pairs = []
for i, rm1 in enumerate(sample_rms):
    for j, rm2 in enumerate(sample_rms):
        if i < j and not pd.isna(cooccurrence.loc[rm1, rm2]):
            score = cooccurrence.loc[rm1, rm2]
            if score > 0.3:  # 30% co-occurrence
                strong_pairs.append((rm1, rm2, score))

strong_pairs.sort(key=lambda x: x[2], reverse=True)
print(f"\nFound {len(strong_pairs)} strong co-occurrence pairs (>30%)")
if strong_pairs:
    print("Top 5 co-occurring pairs:")
    for rm1, rm2, score in strong_pairs[:5]:
        print(f"  RM {rm1} - RM {rm2}: {score:.1%} co-occurrence")

# Supplier analysis
print("\nSupplier Clustering:")
supplier_groups = receivals_2024.groupby('supplier_id')['rm_id'].unique()
multi_rm_suppliers = supplier_groups[supplier_groups.apply(len) > 1]
print(f"Suppliers with multiple RMs: {len(multi_rm_suppliers)}")

if len(multi_rm_suppliers) > 0:
    print("\nTop suppliers by RM diversity:")
    for supplier_id in multi_rm_suppliers.head().index:
        rms = supplier_groups[supplier_id]
        print(f"  Supplier {supplier_id}: {len(rms)} RMs")
        
        # Check if these RMs deliver together
        supplier_deliveries = receivals_2024[receivals_2024['supplier_id'] == supplier_id]
        dates_by_rm = supplier_deliveries.groupby('rm_id')['date_arrival'].apply(set)
        
        if len(dates_by_rm) > 1:
            overlap = len(set.intersection(*dates_by_rm.values))
            total = len(set.union(*dates_by_rm.values))
            print(f"    Date overlap: {overlap}/{total} = {overlap/total:.1%}")

# =============================================================================
# HYPOTHESIS 3: VOLATILITY-BASED QUANTILE ADJUSTMENT
# =============================================================================
print("\n" + "="*80)
print("HYPOTHESIS 3: VOLATILITY-BASED QUANTILE ADJUSTMENT")
print("="*80)

volatility_analysis = []

for rm_id in active_rms[:30]:  # Sample
    rm_data = receivals_2024[receivals_2024['rm_id'] == rm_id].copy()
    
    if len(rm_data) < 5:
        continue
    
    # Monthly aggregation
    rm_monthly = rm_data.set_index('date_arrival').resample('M')['net_weight'].sum()
    
    if len(rm_monthly) > 2:
        cv = rm_monthly.std() / rm_monthly.mean() if rm_monthly.mean() > 0 else np.inf
        
        # Calculate quantiles of actual deliveries
        quantiles = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
        q_values = rm_monthly.quantile(quantiles).values
        
        volatility_analysis.append({
            'rm_id': rm_id,
            'cv': cv,
            'mean_monthly': rm_monthly.mean(),
            'std_monthly': rm_monthly.std(),
            'q05': q_values[0] if len(q_values) > 0 else 0,
            'q10': q_values[1] if len(q_values) > 1 else 0,
            'q15': q_values[2] if len(q_values) > 2 else 0,
            'q20': q_values[3] if len(q_values) > 3 else 0,
            'q25': q_values[4] if len(q_values) > 4 else 0,
            'q30': q_values[5] if len(q_values) > 5 else 0,
            'zero_months': (rm_monthly == 0).sum(),
            'total_months': len(rm_monthly)
        })

vol_df = pd.DataFrame(volatility_analysis)

print("\nVolatility Distribution:")
print(vol_df['cv'].describe())

# Cluster by volatility
vol_df['volatility_cluster'] = pd.qcut(vol_df['cv'], q=3, labels=['Low', 'Medium', 'High'])

print("\nOptimal Quantiles by Volatility Cluster:")
for cluster in ['Low', 'Medium', 'High']:
    cluster_data = vol_df[vol_df['volatility_cluster'] == cluster]
    if len(cluster_data) > 0:
        # Find which quantile best approximates 20% of mean
        target = cluster_data['mean_monthly'].mean() * 0.2
        
        best_q = None
        best_diff = np.inf
        for q_col in ['q05', 'q10', 'q15', 'q20', 'q25', 'q30']:
            diff = abs(cluster_data[q_col].mean() - target)
            if diff < best_diff:
                best_diff = diff
                best_q = q_col
        
        print(f"  {cluster} volatility (CV={cluster_data['cv'].mean():.2f}): Best quantile = {best_q}")
        print(f"    Zero-month rate: {cluster_data['zero_months'].mean()/cluster_data['total_months'].mean():.1%}")

# =============================================================================
# HYPOTHESIS 4: ZERO-INFLATED DISTRIBUTION
# =============================================================================
print("\n" + "="*80)
print("HYPOTHESIS 4: ZERO-INFLATED DISTRIBUTION MODELING")
print("="*80)

from scipy import stats

print("\nAnalyzing delivery amount distributions (non-zero only):")

distribution_fits = []

for rm_id in active_rms[:20]:
    rm_data = receivals_2024[receivals_2024['rm_id'] == rm_id]
    weights = rm_data['net_weight'].values
    weights_nonzero = weights[weights > 0]
    
    if len(weights_nonzero) < 10:
        continue
    
    # Fit various distributions
    results = {
        'rm_id': rm_id,
        'zero_pct': 0,  # No zeros in delivery data
        'mean': weights_nonzero.mean(),
        'std': weights_nonzero.std(),
        'skew': stats.skew(weights_nonzero),
        'kurtosis': stats.kurtosis(weights_nonzero)
    }
    
    # Test log-normal
    log_weights = np.log(weights_nonzero + 1)
    _, p_lognormal = stats.normaltest(log_weights)
    results['lognormal_p'] = p_lognormal
    
    # Test exponential
    _, p_exp = stats.kstest(weights_nonzero, 'expon', 
                            args=(weights_nonzero.min(), 
                                  weights_nonzero.mean() - weights_nonzero.min()))
    results['exponential_p'] = p_exp
    
    distribution_fits.append(results)

dist_df = pd.DataFrame(distribution_fits)

print("\nDistribution Characteristics:")
print(f"Average skewness: {dist_df['skew'].mean():.2f}")
print(f"Average kurtosis: {dist_df['kurtosis'].mean():.2f}")
print(f"Log-normal fits (p>0.05): {(dist_df['lognormal_p'] > 0.05).sum()}/{len(dist_df)}")
print(f"Exponential fits (p>0.05): {(dist_df['exponential_p'] > 0.05).sum()}/{len(dist_df)}")

# Analyze zero patterns at different aggregation levels
print("\nZero-inflation at different time aggregations:")
for window in [7, 14, 30]:
    zero_rates = []
    
    for rm_id in active_rms[:30]:
        rm_data = receivals_2024[receivals_2024['rm_id'] == rm_id].copy()
        
        # Create complete date range
        date_range = pd.date_range(start='2024-01-01', end='2024-12-31', freq='D')
        daily = rm_data.set_index('date_arrival').reindex(date_range, fill_value=0)['net_weight']
        
        # Rolling sum
        rolling = daily.rolling(window).sum().dropna()
        zero_rate = (rolling == 0).mean()
        zero_rates.append(zero_rate)
    
    print(f"  {window}-day windows: {np.mean(zero_rates):.1%} zero rate")

# =============================================================================
# HYPOTHESIS 5: MICRO-SEASONALITY PATTERNS
# =============================================================================
print("\n" + "="*80)
print("HYPOTHESIS 5: MICRO-SEASONALITY PATTERNS")
print("="*80)

receivals_2024['day_of_week'] = receivals_2024['date_arrival'].dt.dayofweek
receivals_2024['day_of_month'] = receivals_2024['date_arrival'].dt.day
receivals_2024['week_of_month'] = (receivals_2024['day_of_month'] - 1) // 7

print("\nDay-of-Week Effects:")
dow_stats = receivals_2024.groupby('day_of_week').agg({
    'net_weight': ['sum', 'count', 'mean'],
    'rm_id': 'nunique'
})
dow_stats.columns = ['total_weight', 'num_deliveries', 'avg_weight', 'unique_rms']
dow_stats['pct_of_total'] = dow_stats['total_weight'] / dow_stats['total_weight'].sum() * 100
dow_stats.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
print(dow_stats)

print("\nDay-of-Month Effects (grouped):")
receivals_2024['month_period'] = pd.cut(receivals_2024['day_of_month'], 
                                       bins=[0, 10, 20, 31], 
                                       labels=['Early', 'Mid', 'Late'])
dom_stats = receivals_2024.groupby('month_period').agg({
    'net_weight': ['sum', 'count', 'mean']
})
dom_stats.columns = ['total_weight', 'num_deliveries', 'avg_weight']
dom_stats['pct_of_total'] = dom_stats['total_weight'] / dom_stats['total_weight'].sum() * 100
print(dom_stats)

print("\nWeek-of-Month Effects:")
wom_stats = receivals_2024.groupby('week_of_month').agg({
    'net_weight': ['sum', 'count']
})
wom_stats.columns = ['total_weight', 'num_deliveries']
wom_stats['pct_of_total'] = wom_stats['total_weight'] / wom_stats['total_weight'].sum() * 100
wom_stats.index = ['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5']
print(wom_stats)

# Monthly patterns
print("\nMonthly Patterns in 2024:")
monthly_stats = receivals_2024.groupby(receivals_2024['date_arrival'].dt.month).agg({
    'net_weight': 'sum',
    'rm_id': 'nunique'
})
monthly_stats.columns = ['total_weight', 'active_rms']
monthly_stats['weight_per_rm'] = monthly_stats['total_weight'] / monthly_stats['active_rms']
monthly_stats.index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
print(monthly_stats)

# =============================================================================
# ADDITIONAL INSIGHTS
# =============================================================================
print("\n" + "="*80)
print("ADDITIONAL INSIGHTS FOR MODEL DESIGN")
print("="*80)

# RM-specific patterns
print("\nRM-Specific Delivery Patterns (Top 10 by volume):")
top_rms = receivals_2024.groupby('rm_id')['net_weight'].sum().nlargest(10).index

for rm_id in top_rms:
    rm_data = receivals_2024[receivals_2024['rm_id'] == rm_id]
    
    # Delivery frequency
    days_active = (rm_data['date_arrival'].max() - rm_data['date_arrival'].min()).days + 1
    delivery_frequency = len(rm_data) / days_active * 30  # Deliveries per 30 days
    
    # Typical delivery size
    p20 = rm_data['net_weight'].quantile(0.2)
    p50 = rm_data['net_weight'].quantile(0.5)
    p80 = rm_data['net_weight'].quantile(0.8)
    
    print(f"\nRM {rm_id}:")
    print(f"  Deliveries/month: {delivery_frequency:.1f}")
    print(f"  Size quantiles - P20: {p20:,.0f}, P50: {p50:,.0f}, P80: {p80:,.0f}")
    print(f"  P80/P20 ratio: {p80/p20:.1f}x")

# Test stationarity
print("\n" + "="*80)
print("STATIONARITY CHECK")
print("="*80)

# Compare first half vs second half of 2024
first_half = receivals_2024[receivals_2024['date_arrival'] < '2024-07-01']
second_half = receivals_2024[receivals_2024['date_arrival'] >= '2024-07-01']

print("\nFirst Half vs Second Half 2024:")
print(f"First half total: {first_half['net_weight'].sum()/1e6:.1f}M kg")
print(f"Second half total: {second_half['net_weight'].sum()/1e6:.1f}M kg")
print(f"Ratio: {second_half['net_weight'].sum()/first_half['net_weight'].sum():.3f}")

# Per RM stationarity
stationary_rms = []
non_stationary_rms = []

for rm_id in active_rms:
    rm_first = first_half[first_half['rm_id'] == rm_id]['net_weight'].sum()
    rm_second = second_half[second_half['rm_id'] == rm_id]['net_weight'].sum()
    
    if rm_first > 0 and rm_second > 0:
        ratio = rm_second / rm_first
        if 0.5 < ratio < 2.0:
            stationary_rms.append(rm_id)
        else:
            non_stationary_rms.append((rm_id, ratio))

print(f"\nStationary RMs (0.5 < ratio < 2.0): {len(stationary_rms)}/{len(active_rms)}")
print(f"Non-stationary RMs: {len(non_stationary_rms)}/{len(active_rms)}")

if non_stationary_rms:
    non_stationary_rms.sort(key=lambda x: abs(x[1] - 1), reverse=True)
    print("\nMost non-stationary RMs:")
    for rm_id, ratio in non_stationary_rms[:5]:
        print(f"  RM {rm_id}: {ratio:.2f}x change")

print("\n" + "="*80)
print("EDA COMPLETE - Ready for hypothesis evaluation")
print("="*80)

Loading data...


AttributeError: Can only use .dt accessor with datetimelike values