In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta

# Read the raw receivals and purchase-order tables from the project data folder
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
purchase_orders = pd.read_csv('./Project_materials/data/kernel/purchase_orders.csv')

# Parse the timestamps as UTC, then drop the timezone so the columns are naive
arrival_utc = pd.to_datetime(receivals['date_arrival'], utc=True)
receivals['date_arrival'] = arrival_utc.dt.tz_localize(None)
delivery_utc = pd.to_datetime(purchase_orders['delivery_date'], utc=True)
purchase_orders['delivery_date'] = delivery_utc.dt.tz_localize(None)

banner = "=" * 80
print(banner)
print("DEBUG: NaN CORRELATION ISSUE")
print(banner)

# Keep only rows with a positive net weight and a known rm_id,
# ordered by arrival time
keep_mask = (receivals['net_weight'] > 0) & receivals['rm_id'].notna()
receivals = receivals[keep_mask].sort_values('date_arrival')

first_arrival = receivals['date_arrival'].min()
last_arrival = receivals['date_arrival'].max()
print(f"\nTotal receivals after cleaning: {len(receivals)}")
print(f"Date range: {first_arrival} to {last_arrival}")

# ============================================================================
# REPLICATE THE EXACT SAMPLING LOGIC FROM DIAGNOSTIC EDA
# ============================================================================
# For every (weekly train_date, rm_id) pair this builds one sample made of
# trailing-window features computed from receivals strictly before train_date,
# plus a target equal to the net weight received from train_date through
# train_date + 30 days (both bounds inclusive).
print("\n[1] REPLICATING DIAGNOSTIC EDA SAMPLING")
print("-"*80)

test_samples = []

# Declared up front so sample_df keeps these columns even when no sample is
# generated; a column-less frame would raise KeyError in any later
# sample_df[col] lookup.
sample_columns = [
    'rm_id', 'train_date', 'total_365', 'total_90',
    'count_365', 'daily_rate', 'target',
]

# Use same parameters as diagnostic
sample_dates = pd.date_range('2024-09-01', '2024-11-30', freq='W')
# First 30 distinct rm_ids, in their order of appearance in receivals
sample_rm_ids = receivals['rm_id'].unique()[:30]

print(f"Sampling from {len(sample_dates)} dates")
print(f"Using {len(sample_rm_ids)} rm_ids: {sample_rm_ids[:5]}...")

# The rm_id filter does not depend on train_date, so it is applied once per
# rm_id here rather than rescanning all receivals for every (date, rm_id) pair.
rows_per_rm_id = [
    (rm_id, receivals[receivals['rm_id'] == rm_id]) for rm_id in sample_rm_ids
]

for i, train_date in enumerate(sample_dates):
    print(f"\n--- Processing date {i+1}/{len(sample_dates)}: {train_date.date()} ---")

    samples_this_date = 0

    # Window bounds depend only on train_date, so compute them once per date
    cutoff_365 = train_date - timedelta(days=365)
    cutoff_90 = train_date - timedelta(days=90)
    forecast_end = train_date + timedelta(days=30)

    for rm_id, rm_rows in rows_per_rm_id:
        # History = everything for this rm_id strictly before the train date
        hist = rm_rows[rm_rows['date_arrival'] < train_date]

        if len(hist) == 0:
            continue

        # Compute features (EXACT REPLICA)
        recent_365 = hist[hist['date_arrival'] >= cutoff_365]
        recent_90 = hist[hist['date_arrival'] >= cutoff_90]

        total_365 = recent_365['net_weight'].sum() if len(recent_365) > 0 else 0
        total_90 = recent_90['net_weight'].sum() if len(recent_90) > 0 else 0
        count_365 = len(recent_365)

        # Get actual delivery in next 30 days
        actual = rm_rows[(rm_rows['date_arrival'] >= train_date) &
                         (rm_rows['date_arrival'] <= forecast_end)]
        target = actual['net_weight'].sum()

        test_samples.append({
            'rm_id': rm_id,
            'train_date': train_date,
            'total_365': total_365,
            'total_90': total_90,
            'count_365': count_365,
            'daily_rate': total_365 / 365 if total_365 > 0 else 0,
            'target': target
        })

        samples_this_date += 1

    print(f"  Generated {samples_this_date} samples for this date")

print(f"\n{'='*80}")
print(f"Total samples generated: {len(test_samples)}")

sample_df = pd.DataFrame(test_samples, columns=sample_columns)

# ============================================================================
# DIAGNOSTIC 1: Check for all-zero or all-constant columns
# ============================================================================
# Prints, for each feature and the target, the number of distinct values, the
# share of zero / strictly positive entries and basic summary statistics.
print("\n[2] CHECKING FOR CONSTANT/ZERO COLUMNS")
print("-"*80)

n_rows = len(sample_df)

for col in ('total_365', 'total_90', 'count_365', 'daily_rate', 'target'):
    col_values = sample_df[col]

    print(f"\n{col}:")
    print(f"  Unique values: {col_values.nunique()}")

    n_zero = (col_values == 0).sum()
    n_positive = (col_values > 0).sum()
    for label, count in (("Zero count", n_zero), ("Non-zero count", n_positive)):
        print(f"  {label}: {count}/{n_rows} ({count / n_rows * 100:.1f}%)")

    # Each statistic is evaluated right before it is printed
    summary_stats = (
        ("Min", col_values.min),
        ("Max", col_values.max),
        ("Mean", col_values.mean),
        ("Std", col_values.std),
    )
    for label, compute in summary_stats:
        print(f"  {label}: {compute():.2f}")

# ============================================================================
# DIAGNOSTIC 2: Check variance
# ============================================================================
# A column can only take part in a correlation when its sample variance is a
# finite, non-zero number. Both failure modes are flagged below.
print("\n[3] CHECKING VARIANCE (NaN correlation needs variance > 0)")
print("-"*80)

for col in ['total_365', 'total_90', 'count_365', 'daily_rate', 'target']:
    variance = sample_df[col].var()
    print(f"{col:20s}: variance = {variance:.6f}")
    if pd.isna(variance):
        # Series.var() yields NaN when fewer than two valid values exist;
        # `nan == 0` is False, so this case needs its own check.
        print("  ⚠️  UNDEFINED VARIANCE (fewer than 2 valid samples) - This will cause NaN correlation!")
    elif variance == 0:
        print("  ⚠️  ZERO VARIANCE - This will cause NaN correlation!")

# ============================================================================
# DIAGNOSTIC 3: Compute correlations with detailed output
# ============================================================================
print("\n[4] COMPUTING CORRELATIONS")
print("-"*80)

corr_columns = ['total_365', 'total_90', 'count_365', 'daily_rate', 'target']


def _print_target_correlations(frame):
    """Print the correlation of every feature in *frame* with 'target'.

    Returns the Series of correlations indexed by feature name, or None when
    the computation raised. Errors are reported and swallowed so the
    remaining diagnostics still run.
    """
    try:
        correlations = frame[corr_columns].corr()['target'].drop('target')
        for feat, corr in correlations.items():
            print(f"{feat:20s}: {corr:+.4f}")
    except Exception as e:
        print(f"Error computing correlation: {e}")
        return None
    return correlations


# Filter to only samples with non-zero target
nonzero_target_df = sample_df[sample_df['target'] > 0].copy()
total_samples = len(sample_df)
# Guard the percentage so an empty sample set does not raise ZeroDivisionError
nonzero_share = len(nonzero_target_df) / total_samples * 100 if total_samples else 0.0
print(f"Samples with target > 0: {len(nonzero_target_df)}/{total_samples} ({nonzero_share:.1f}%)")

# The all-samples correlation depends only on the total sample count, so it
# is gated on that instead of on the number of non-zero targets.
if total_samples > 1:
    print("\n--- Correlation on ALL samples ---")
    corr_all = _print_target_correlations(sample_df)
else:
    print("⚠️  NOT ENOUGH SAMPLES TO COMPUTE CORRELATION")

if len(nonzero_target_df) > 1:
    print("\n--- Correlation on samples with target > 0 ---")
    corr_nonzero = _print_target_correlations(nonzero_target_df)
else:
    print("⚠️  NOT ENOUGH NON-ZERO TARGETS TO COMPUTE CORRELATION")

# ============================================================================
# DIAGNOSTIC 4: Detailed inspection of problem cases
# ============================================================================
print("\n[5] INSPECTING PROBLEM CASES")
print("-"*80)

# Boolean masks shared by the three breakdowns below
no_weight_365 = sample_df['total_365'] == 0
some_weight_365 = sample_df['total_365'] > 0
target_is_zero = sample_df['target'] == 0
target_is_positive = sample_df['target'] > 0

# Samples whose 365-day total, 90-day total and 365-day count are all zero
all_zero_mask = (
    no_weight_365
    & (sample_df['total_90'] == 0)
    & (sample_df['count_365'] == 0)
)
all_zero_features = sample_df[all_zero_mask]
print(f"\nSamples with ALL features = 0: {len(all_zero_features)}")

if not all_zero_features.empty:
    shown_cols = ['rm_id', 'train_date', 'total_365', 'total_90', 'count_365', 'target']
    print("\nFirst few examples:")
    print(all_zero_features[shown_cols].head(10).to_string(index=False))

    # Look at the complete receival history of the first such rm_id
    problem_rm_id = all_zero_features.iloc[0]['rm_id']
    print(f"\n--- Checking history for rm_id={problem_rm_id} ---")
    hist_for_problem = receivals[receivals['rm_id'] == problem_rm_id]
    print(f"Total receivals for rm_id={problem_rm_id}: {len(hist_for_problem)}")
    if not hist_for_problem.empty:
        earliest = hist_for_problem['date_arrival'].min()
        latest = hist_for_problem['date_arrival'].max()
        print(f"Date range: {earliest} to {latest}")
        print(f"Total weight: {hist_for_problem['net_weight'].sum():.2f}")

# Positive 365-day total but nothing delivered in the forecast window
features_positive_target_zero = sample_df[some_weight_365 & target_is_zero]
print(f"\nSamples with features > 0 but target = 0: {len(features_positive_target_zero)}")

# Zero 365-day total but something delivered in the forecast window
features_zero_target_positive = sample_df[no_weight_365 & target_is_positive]
print(f"Samples with features = 0 but target > 0: {len(features_zero_target_positive)}")

if not features_zero_target_positive.empty:
    shown_cols = ['rm_id', 'train_date', 'total_365', 'count_365', 'target']
    print("\nFirst few examples:")
    print(features_zero_target_positive[shown_cols].head(5).to_string(index=False))

# ============================================================================
# DIAGNOSTIC 5: Check the actual rm_ids being sampled
# ============================================================================
print("\n[6] ANALYZING RM_ID SELECTION")
print("-"*80)

print(f"\nFirst 30 rm_ids (as used in diagnostic): {sample_rm_ids}")

# Check how many of these rm_ids have data in the sampling window.
# The window is half-open [2024-09-01, 2025-01-01): an inclusive upper bound
# of '2024-12-31' is midnight and would drop arrivals later on 31 December.
window_start = pd.Timestamp('2024-09-01')
window_end = pd.Timestamp('2025-01-01')

# One pass over receivals instead of one full scan per rm_id
in_window = receivals[
    receivals['rm_id'].isin(sample_rm_ids)
    & (receivals['date_arrival'] >= window_start)
    & (receivals['date_arrival'] < window_end)
]
rm_ids_in_window = set(in_window['rm_id'].unique())
# Preserve the order of sample_rm_ids in the resulting list
active_in_window = [rm_id for rm_id in sample_rm_ids if rm_id in rm_ids_in_window]

print(f"\nOut of {len(sample_rm_ids)} sampled rm_ids:")
print(f"  Active in Sep-Dec 2024: {len(active_in_window)}")
print(f"  Inactive in Sep-Dec 2024: {len(sample_rm_ids) - len(active_in_window)}")

# Show activity pattern
print("\n--- Activity patterns for first 10 rm_ids ---")
for rm_id in sample_rm_ids[:10]:
    rm_data = receivals[receivals['rm_id'] == rm_id]
    last_delivery = rm_data['date_arrival'].max() if len(rm_data) > 0 else None
    total_deliveries = len(rm_data)

    print(f"rm_id={rm_id}: {total_deliveries} total deliveries, last on {last_delivery}")

# ============================================================================
# DIAGNOSTIC 6: Compare to what SHOULD work
# ============================================================================
# Repeats the 365-day-total vs. 30-day-target sampling for the ten rm_ids
# with the most receival rows, then checks that their correlation is not NaN.
print("\n[7] TESTING WITH GUARANTEED NON-CONSTANT DATA")
print("-"*80)

# Rows collected for the sanity-check frame
simple_test = []

# The ten rm_ids with the highest number of receival rows
very_active_rm_ids = receivals['rm_id'].value_counts().head(10).index.tolist()
print(f"Using top 10 most active rm_ids: {very_active_rm_ids}")

arrival_times = receivals['date_arrival']

for train_date in pd.date_range('2024-09-01', '2024-11-30', freq='W'):
    cutoff_365 = train_date - timedelta(days=365)
    forecast_end = train_date + timedelta(days=30)

    for rm_id in very_active_rm_ids:
        is_this_rm = receivals['rm_id'] == rm_id

        hist = receivals[is_this_rm & (arrival_times < train_date)]
        if hist.empty:
            continue

        recent_365 = hist[hist['date_arrival'] >= cutoff_365]
        total_365 = recent_365['net_weight'].sum() if len(recent_365) > 0 else 0

        # Deliveries from train_date through forecast_end, both inclusive
        actual = receivals[is_this_rm & arrival_times.between(train_date, forecast_end)]
        target = actual['net_weight'].sum()

        # Skip pairs with neither trailing history nor upcoming deliveries
        if not (total_365 > 0 or target > 0):
            continue

        simple_test.append({'total_365': total_365, 'target': target})

simple_df = pd.DataFrame(simple_test)
print(f"\nGenerated {len(simple_df)} samples from active rm_ids")

if len(simple_df) > 1:
    positive_targets = (simple_df['target'] > 0).sum()
    positive_totals = (simple_df['total_365'] > 0).sum()
    print(f"Samples with target > 0: {positive_targets}")
    print(f"Samples with total_365 > 0: {positive_totals}")

    try:
        corr = simple_df['total_365'].corr(simple_df['target'])
        print(f"\nCorrelation (should NOT be NaN): {corr:+.4f}")

        verdict = (
            "⚠️  STILL NaN! Deep issue with correlation computation."
            if pd.isna(corr)
            else "✅ Correlation works with active rm_ids!"
        )
        print(verdict)
    except Exception as e:
        print(f"Error: {e}")

closing_rule = "=" * 80
print("\n" + closing_rule)
print("DEBUG COMPLETE - Check output above for root cause")
print(closing_rule)

DEBUG: NaN CORRELATION ISSUE

Total receivals after cleaning: 122383
Date range: 2004-06-15 11:34:00 to 2024-12-19 13:36:00

[1] REPLICATING DIAGNOSTIC EDA SAMPLING
--------------------------------------------------------------------------------
Sampling from 13 dates
Using 30 rm_ids: [365. 379. 389. 369. 366.]...

--- Processing date 1/13: 2024-09-01 ---
  Generated 30 samples for this date

--- Processing date 2/13: 2024-09-08 ---
  Generated 30 samples for this date

--- Processing date 3/13: 2024-09-15 ---
  Generated 30 samples for this date

--- Processing date 4/13: 2024-09-22 ---
  Generated 30 samples for this date

--- Processing date 5/13: 2024-09-29 ---
  Generated 30 samples for this date

--- Processing date 6/13: 2024-10-06 ---
  Generated 30 samples for this date

--- Processing date 7/13: 2024-10-13 ---
  Generated 30 samples for this date

--- Processing date 8/13: 2024-10-20 ---
  Generated 30 samples for this date

--- Processing date 9/13: 2024-10-27 ---
  Generate