In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from datetime import timedelta

# Raw inputs: historical receivals, purchase orders, the list of requested
# forecasts and the submission template.
receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
purchase_orders = pd.read_csv('./Project_materials/data/kernel/purchase_orders.csv')
prediction_mapping = pd.read_csv('./Project_materials/data/prediction_mapping.csv')
sample_submission = pd.read_csv('./Project_materials/data/sample_submission.csv')


def _to_naive_utc(values):
    """Parse timestamps as UTC and drop the timezone, leaving naive datetimes."""
    return pd.to_datetime(values, utc=True).dt.tz_localize(None)


# Convert dates
receivals['date_arrival'] = _to_naive_utc(receivals['date_arrival'])
for _po_col in ('delivery_date', 'created_date_time'):
    purchase_orders[_po_col] = _to_naive_utc(purchase_orders[_po_col])
# The forecast window bounds are parsed as-is (no UTC conversion).
for _map_col in ('forecast_start_date', 'forecast_end_date'):
    prediction_mapping[_map_col] = pd.to_datetime(prediction_mapping[_map_col])

_BANNER = "=" * 80
print(_BANNER)
print("LightGBM STEP 5: REMOVE CALIBRATION")
print(_BANNER)

# ============================================================================
# DATA CLEANING
# ============================================================================
print("\n[1] DATA CLEANING")
# Keep only rows with a strictly positive net weight and a known rm_id,
# then order them chronologically by arrival.
_usable_rows = (receivals['net_weight'] > 0) & receivals['rm_id'].notna()
receivals = receivals[_usable_rows].sort_values('date_arrival')
print(f"Clean receivals: {len(receivals)}")

# ============================================================================
# TRAINING DATA GENERATION (STEP 1 FEATURES)
# ============================================================================
# Builds one training row per (snapshot date, rm_id, horizon):
#   * features summarise the rm_id's receivals strictly before the snapshot
#     date over trailing 365/180/90/30-day windows;
#   * the target is the net weight received in [snapshot, snapshot + horizon].
print("\n[2] CREATING TRAINING DATA WITH STEP 1 FEATURES")
print("-"*80)

train_dates = pd.date_range(start='2024-01-01', end='2024-11-30', freq='MS')
forecast_horizons = [7, 30, 60, 90, 150]

print(f"Using {len(train_dates)} training dates x {len(forecast_horizons)} horizons")

training_data = []
active_rm_ids = receivals[receivals['date_arrival'] >= '2024-01-01']['rm_id'].unique()
print(f"Active rm_ids in 2024: {len(active_rm_ids)}")

# Split the receivals by rm_id once so the loops below only scan the rows of
# the material at hand instead of re-filtering the whole table every time.
# Row order inside each group is the (date-sorted) order of `receivals`.
receivals_by_rm = {key: rows for key, rows in receivals.groupby('rm_id')}

for i, train_date in enumerate(train_dates):
    print(f"Processing date {i+1}/{len(train_dates)}: {train_date.date()}...")

    # The window cutoffs depend only on the snapshot date.
    cutoff_365 = train_date - timedelta(days=365)
    cutoff_180 = train_date - timedelta(days=180)
    cutoff_90 = train_date - timedelta(days=90)
    cutoff_30 = train_date - timedelta(days=30)

    for rm_id in active_rm_ids:
        rm_rows = receivals_by_rm[rm_id]
        is_past = rm_rows['date_arrival'] < train_date
        hist = rm_rows[is_past]

        # No history before the snapshot: nothing to build features from.
        if hist.empty:
            continue

        recent_365 = hist[hist['date_arrival'] >= cutoff_365]
        recent_180 = hist[hist['date_arrival'] >= cutoff_180]
        recent_90 = hist[hist['date_arrival'] >= cutoff_90]
        recent_30 = hist[hist['date_arrival'] >= cutoff_30]

        # Basic aggregations (sum/len of an empty window are already 0).
        total_365 = recent_365['net_weight'].sum()
        count_365 = len(recent_365)
        total_180 = recent_180['net_weight'].sum()
        count_180 = len(recent_180)
        total_90 = recent_90['net_weight'].sum()
        count_90 = len(recent_90)
        total_30 = recent_30['net_weight'].sum()
        count_30 = len(recent_30)

        # Measured against the full history: a material with no arrival in the
        # last 365 days gets its real gap instead of 0, which would be
        # indistinguishable from "arrived on the snapshot date".
        days_since = (train_date - hist['date_arrival'].max()).days

        # Rates
        rate_90 = total_90 / 90
        rate_30 = total_30 / 30

        # Recency-weighted sum: each arrival weighted by 1 / (age in days + 1).
        days_ago = (train_date - recent_90['date_arrival']).dt.days
        weights = 1.0 / (days_ago + 1)
        recency_weighted = (recent_90['net_weight'] * weights).sum()

        # Active days ratio: share of the last 90 days with at least one arrival.
        active_days_90 = recent_90['date_arrival'].dt.date.nunique()
        active_ratio_90 = active_days_90 / 90

        # Rows on/after the snapshot, shared by every horizon of this rm_id.
        upcoming = rm_rows[~is_past]

        for horizon in forecast_horizons:
            forecast_end = train_date + timedelta(days=horizon)

            # NOTE(review): when forecast_end lies beyond the last date present
            # in `receivals` (e.g. 2024-11-01 + 150 days ends in 2025-03), the
            # target only covers the observed part of the window - confirm how
            # far the data extends before trusting those rows.
            in_window = upcoming['date_arrival'] <= forecast_end
            target = upcoming.loc[in_window, 'net_weight'].sum()

            training_data.append({
                'rm_id': rm_id,
                'train_date': train_date,
                'forecast_horizon': horizon,
                'total_weight_365d': total_365,
                'count_365d': count_365,
                'days_since_last': days_since,
                'total_weight_90d': total_90,
                'count_90d': count_90,
                'rate_90': rate_90,
                'total_weight_180d': total_180,
                'count_180d': count_180,
                'total_30': total_30,
                'count_30': count_30,
                'rate_30': rate_30,
                'recency_weighted': recency_weighted,
                'active_ratio_90': active_ratio_90,
                'target': target
            })

print(f"\nGenerated {len(training_data)} training samples")
train_df = pd.DataFrame(training_data)

print(f"Samples with target > 0: {(train_df['target'] > 0).sum()} ({(train_df['target'] > 0).sum() / len(train_df) * 100:.1f}%)")

# ============================================================================
# TIME-BASED TRAIN/VAL SPLIT
# ============================================================================
print("\n[3] TIME-BASED TRAIN/VAL SPLIT")
print("-"*80)

# Snapshots before the split date train the models; the rest validate them.
split_date = pd.to_datetime('2024-09-01')

train_mask = train_df['train_date'] < split_date
val_mask = train_df['train_date'] >= split_date

# Everything except the label and the snapshot date is a model input.
_non_feature_cols = {'target', 'train_date'}
feature_cols = [col for col in train_df.columns if col not in _non_feature_cols]

X_train = train_df.loc[train_mask, feature_cols]
y_train = train_df.loc[train_mask, 'target']
X_val = train_df.loc[val_mask, feature_cols]
y_val = train_df.loc[val_mask, 'target']

print(f"Training samples (before {split_date.date()}): {len(X_train)}")
print(f"Validation samples (>= {split_date.date()}): {len(X_val)}")

# Compare the average target on both sides of the split.
train_mean = y_train.mean()
val_mean = y_val.mean()
_relative_shift = (val_mean - train_mean) / train_mean * 100
print(f"\nTarget statistics:")
print(f"  Training mean: {train_mean:,.0f} kg")
print(f"  Validation mean: {val_mean:,.0f} kg")
print(f"  Difference: {_relative_shift:+.1f}%")

print(f"\nNumber of features: {len(feature_cols)}")

# ============================================================================
# TRAIN LightGBM MODELS (TWO-STAGE WITH ALPHA=0.10)
# ============================================================================
print("\n[4] TRAINING LightGBM MODELS")
print("-"*80)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

# Hyper-parameters shared by both stages.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0 (default 0), so row bagging is presumably inactive
# here - confirm before relying on it. Left unchanged to keep runs comparable.
_shared_params = dict(
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,
)
_log_every_100 = lgb.log_evaluation(period=100)

# Stage 1 - classifier: will there be any receival at all in the window?
y_train_bin = y_train.gt(0).astype(int)
y_val_bin = y_val.gt(0).astype(int)

clf = lgb.LGBMClassifier(
    n_estimators=400,
    class_weight='balanced',
    **_shared_params
)
print("Training LightGBM Classifier...")
clf.fit(
    X_train,
    y_train_bin,
    eval_set=[(X_val, y_val_bin)],
    callbacks=[_log_every_100],
)

# Stage 2 - quantile regressor with alpha=0.10 on the received weight.
model = lgb.LGBMRegressor(
    objective='quantile',
    alpha=0.10,
    n_estimators=300,
    **_shared_params
)

print(f"Training LightGBM Regressor (quantile={model.alpha})...")
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[_log_every_100],
)

# ============================================================================
# NO CALIBRATION (REMOVED!)
# ============================================================================
print("\n[4c] CALIBRATION: REMOVED")
print("-"*80)
for _notice in (
    "⚠️  Skipping per-horizon calibration entirely",
    "   Let the model's forecast_horizon feature handle horizon effects naturally",
    "   Hypothesis: Sep-Nov calibration doesn't generalize to 2025",
):
    print(_notice)

# ============================================================================
# MAKE PREDICTIONS
# ============================================================================
print("\n[5] MAKING PREDICTIONS")
print("-"*80)

forecast_start = pd.to_datetime('2025-01-01')

# Trailing-window cutoffs, all relative to the forecast start.
cutoff_365 = forecast_start - timedelta(days=365)
cutoff_180 = forecast_start - timedelta(days=180)
cutoff_90 = forecast_start - timedelta(days=90)
cutoff_30 = forecast_start - timedelta(days=30)

# Pre-compute features for all rm_ids
# (one feature dict per rm_id requested in prediction_mapping, built from the
# receivals strictly before forecast_start).
rm_features = {}

for rm_id in prediction_mapping['rm_id'].unique():
    hist = receivals[
        (receivals['rm_id'] == rm_id) &
        (receivals['date_arrival'] < forecast_start)
    ]

    if hist.empty:
        # Never received before the forecast start: all-zero activity and a
        # sentinel 999 days since the last arrival.
        rm_features[rm_id] = {
            'total_weight_365d': 0,
            'count_365d': 0,
            'days_since_last': 999,
            'total_weight_90d': 0,
            'count_90d': 0,
            'rate_90': 0,
            'total_weight_180d': 0,
            'count_180d': 0,
            'total_30': 0,
            'count_30': 0,
            'rate_30': 0,
            'recency_weighted': 0,
            'active_ratio_90': 0
        }
        continue

    recent_365 = hist[hist['date_arrival'] >= cutoff_365]
    recent_180 = hist[hist['date_arrival'] >= cutoff_180]
    recent_90 = hist[hist['date_arrival'] >= cutoff_90]
    recent_30 = hist[hist['date_arrival'] >= cutoff_30]

    # Window totals and counts (sum/len of an empty window are already 0).
    total_365 = recent_365['net_weight'].sum()
    count_365 = len(recent_365)
    total_180 = recent_180['net_weight'].sum()
    count_180 = len(recent_180)
    total_90 = recent_90['net_weight'].sum()
    count_90 = len(recent_90)
    total_30 = recent_30['net_weight'].sum()
    count_30 = len(recent_30)
    rate_90 = total_90 / 90
    rate_30 = total_30 / 30

    # Measured against the full history: a material with no arrival in the
    # last 365 days gets its real gap instead of 0, so it no longer looks
    # freshly delivered and the ">365 days inactive" guardrail can see it.
    days_since = (forecast_start - hist['date_arrival'].max()).days

    # Recency-weighted: each arrival weighted by 1 / (age in days + 1).
    days_ago = (forecast_start - recent_90['date_arrival']).dt.days
    weights = 1.0 / (days_ago + 1)
    recency_weighted = (recent_90['net_weight'] * weights).sum()

    # Active ratio: share of the last 90 days with at least one arrival.
    active_days_90 = recent_90['date_arrival'].dt.date.nunique()
    active_ratio_90 = active_days_90 / 90

    rm_features[rm_id] = {
        'total_weight_365d': total_365,
        'count_365d': count_365,
        'days_since_last': days_since,
        'total_weight_90d': total_90,
        'count_90d': count_90,
        'rate_90': rate_90,
        'total_weight_180d': total_180,
        'count_180d': count_180,
        'total_30': total_30,
        'count_30': count_30,
        'rate_30': rate_30,
        'recency_weighted': recency_weighted,
        'active_ratio_90': active_ratio_90
    }

print(f"Pre-computed features for {len(rm_features)} rm_ids")

# Make predictions
# Score every requested (rm_id, forecast_end_date) row in one batch instead of
# building a one-row DataFrame and calling both models once per row.

# One row of pre-computed features per rm_id; cast to float so mixed
# int/float values cannot leave an object-dtype column behind.
feature_table = pd.DataFrame.from_dict(rm_features, orient='index').astype('float64')

scoring = prediction_mapping[['ID', 'rm_id', 'forecast_end_date']].reset_index(drop=True)
# Horizon counts the end date itself, hence the + 1.
scoring['forecast_horizon'] = (scoring['forecast_end_date'] - forecast_start).dt.days + 1
scoring = scoring.join(feature_table, on='rm_id')

# Same column order the models were trained with.
X_pred = scoring[feature_cols]

# Two-stage prediction: non-negative quantile estimate scaled by the
# probability that anything is received at all.
reg_pred = np.maximum(model.predict(X_pred), 0)
prob_pos = clf.predict_proba(X_pred)[:, 1]
pred_weight = reg_pred * prob_pos

# ========================================================================
# NO CALIBRATION - the per-horizon calibration factor is intentionally
# not applied in this step.
# ========================================================================

# Guardrails (original Step 1 version)
days_inactive = scoring['days_since_last'].to_numpy()
weight_365 = scoring['total_weight_365d'].to_numpy()
horizon_days = scoring['forecast_horizon'].to_numpy()

# Never exceed 1.5x the trailing-year daily average over the horizon.
cap_upper = (weight_365 / 365.0) * horizon_days * 1.5

# Dormant for more than a year: predict nothing.
pred_weight = np.where(days_inactive > 365, 0.0, pred_weight)
# Dormant for 180-365 days: at most 8% of the trailing-year total.
is_cold = (days_inactive > 180) & (days_inactive <= 365)
pred_weight = np.where(is_cold, np.minimum(pred_weight, 0.08 * weight_365), pred_weight)

pred_weight = np.maximum(0.0, np.minimum(pred_weight, cap_upper))

predictions_df = pd.DataFrame({
    'ID': scoring['ID'].to_numpy(),
    'predicted_weight': pred_weight,
})
# Kept as a list of {'ID', 'predicted_weight'} records for any code that
# still reads the original `predictions` list.
predictions = predictions_df.to_dict('records')

print("\n[6] PREDICTION STATISTICS")
print("-"*80)
print(f"Predictions mean: {predictions_df['predicted_weight'].mean():,.0f} kg")
print("\nPrediction statistics:")
print(predictions_df['predicted_weight'].describe())
print(f"Predictions > 0: {(predictions_df['predicted_weight'] > 0).sum()}")

# ============================================================================
# CREATE SUBMISSION
# ============================================================================
print("\n[7] CREATING SUBMISSION")
print("-"*80)

submission = sample_submission.copy()
if 'ID' in submission.columns and predictions_df['ID'].is_unique:
    # Align on ID rather than on row position, so the result stays correct
    # even if the template and the prediction mapping are ordered differently.
    weight_by_id = predictions_df.set_index('ID')['predicted_weight']
    unmatched = ~submission['ID'].isin(weight_by_id.index)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} submission IDs have no prediction"
        )
    submission['predicted_weight'] = submission['ID'].map(weight_by_id).values
else:
    # No usable ID key: fall back to positional assignment, which requires
    # both tables to have the same length and row order.
    submission['predicted_weight'] = predictions_df['predicted_weight'].values
submission.to_csv('lightgbm_step5_no_calibration.csv', index=False)
print("Saved to 'lightgbm_step5_no_calibration.csv'")

print("\n" + "="*80)
print("COMPLETE - STEP 5: NO CALIBRATION")
print("="*80)
print("\nChanges from Step 4 (6,140):")
print("  ✅ Kept all Step 1 features and two-stage model")
print("  ✅ Kept alpha=0.10 from Step 4")
print("  ✅ Kept original guardrails from Step 1")
print("  ❌ REMOVED: Per-horizon calibration entirely")
print("  ✅ Hypothesis: Calibration overfits to Sep-Nov 2024, hurts 2025 generalization")
print("  ✅ Let forecast_horizon feature handle horizon effects naturally")
print("  ✅ Expected: Unknown direction, but could improve if calibration was harmful")
print("\n" + "="*80)

AttributeError: module 'matplotlib' has no attribute 'get_data_path'