# 简化版区间估计基准测试 (Simplified Benchmark)

## 核心指标
1.  **预测区间命中率 (Prediction Interval Hit Rate)**: 全局覆盖率。
2.  **不同宽度区间命中率 (Hit Rate by Width)**: 考察在不同锐度下的可靠性。
3.  **分桶回归命中率 (Bin Regression Accuracy)**: [NEW] 以预测区间的中点作为点预测，统计点预测与真实值落入同一个桶的样本比例。
    - **Bin Width = 0.2 (2万美元)**: 将 [0, 5.2] 等分为 26 个桶。
    - **Bin Width = 0.4 (4万美元)**: 将 [0, 5.2] 等分为 13 个桶。

## 基线模型
- CatBoost (MultiQuantile)
- RealMLP (Quantile Regression)

In [8]:
import numpy as np
import pandas as pd
import torch
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from typing import Dict, List

# Optional dependency guard: HAS_REALMLP records whether pytabkit's RealMLP
# regressor could be imported, so code that needs it can check the flag first.
HAS_REALMLP = False
try:
    from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Regressor
    HAS_REALMLP = True
except ImportError:
    # pytabkit is not installed: keep the flag False instead of failing at import time.
    pass

class SimplifiedMetrics:
    """Coverage and bin-accuracy metrics for prediction intervals.

    Attributes:
        alpha: Nominal miscoverage level the intervals were built for.
        target_coverage: ``1 - alpha``; stored for reference only, it is not
            used by :meth:`compute`.
        bin_edges_02: Target bin edges 0.0, 0.2, ..., 5.2 (26 bins).
        bin_edges_04: Target bin edges 0.0, 0.4, ..., 5.2 (13 bins).
    """

    def __init__(self, alpha: float = 0.1):
        self.alpha = alpha
        self.target_coverage = 1.0 - alpha
        # Target bins of width 0.2 and 0.4 over [0, 5.2].
        self.bin_edges_02 = self._decimal_edges(2)
        self.bin_edges_04 = self._decimal_edges(4)

    @staticmethod
    def _decimal_edges(step_tenths: int, stop_tenths: int = 52) -> np.ndarray:
        """Return bin edges 0, step, ..., stop, given in tenths, as floats.

        The edges are built from integers and divided once, so every edge is
        the double closest to its decimal value.  ``np.arange`` with a float
        step yields e.g. ``0.6000000000000001``, which pushes a target of
        exactly 0.6 into the bin below.
        """
        return np.arange(0, stop_tenths + 1, step_tenths) / 10.0

    @staticmethod
    def _bin_accuracy(y_true: np.ndarray,
                      y_point: np.ndarray,
                      edges: np.ndarray) -> float:
        """Return the share of samples whose target and point prediction share a bin."""
        same_bin = np.digitize(y_true, edges) == np.digitize(y_point, edges)
        return np.mean(same_bin)

    def compute(self,
                y_true: np.ndarray,
                y_lower: np.ndarray,
                y_upper: np.ndarray,
                model_name: str = "Model") -> Dict:
        """Evaluate one model's prediction intervals.

        Args:
            y_true: Observed targets.
            y_lower: Lower interval bounds, one per target.
            y_upper: Upper interval bounds, one per target.
            model_name: Label stored under the ``'Model'`` key.

        Returns:
            A dict with the model name, the global interval hit rate, the hit
            rate per interval-width stratum (formatted strings keyed by the
            stratum label), and the bin accuracy for bin widths 0.2 and 0.4.

        Raises:
            ValueError: If the three inputs cannot be aligned element-wise.
        """
        # Flatten everything to 1-D first: an (n, 1) bound compared against an
        # (n,) target would otherwise broadcast to an (n, n) matrix and
        # silently corrupt every rate below.  Length-1 inputs still broadcast.
        y_true, y_lower, y_upper = np.broadcast_arrays(
            *(np.asarray(a, dtype=float).ravel()
              for a in (y_true, y_lower, y_upper))
        )

        # 1. Coverage metrics
        covered = (y_true >= y_lower) & (y_true <= y_upper)
        global_picp = np.mean(covered)
        widths = y_upper - y_lower

        # Hit rate stratified by interval width.  Widths below 0 (crossed
        # bounds) or >= 100 fall outside every stratum and are not reported.
        fixed_bins = [0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 100.0]
        bin_indices = np.digitize(widths, fixed_bins) - 1
        stratified_w = {}
        for i, (left, right) in enumerate(zip(fixed_bins[:-1], fixed_bins[1:])):
            mask = bin_indices == i
            count = int(np.sum(mask))
            if count > 0:
                local_cov = np.mean(covered[mask])
                range_str = f"Width [{left:.1f}, {right:.1f})"
                stratified_w[range_str] = f"{local_cov:.4f} (n={count})"

        # 2. Bin classification accuracy, using the interval midpoint as the
        # point prediction.
        y_pred_point = (y_lower + y_upper) / 2.0
        bin_acc_02 = self._bin_accuracy(y_true, y_pred_point, self.bin_edges_02)
        bin_acc_04 = self._bin_accuracy(y_true, y_pred_point, self.bin_edges_04)

        return {
            'Model': model_name,
            '预测区间命中率': float(round(global_picp, 4)),
            '不同宽度区间命中率': stratified_w,
            '分桶回归命中率_0.2': float(round(bin_acc_02, 4)),
            '分桶回归命中率_0.4': float(round(bin_acc_04, 4))
        }

def eval_catboost(X_train, y_train, X_val, y_val, alpha=0.1):
    """Fit a CatBoost multi-quantile model and return its interval bounds on X_val.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_val: Features passed as ``eval_set`` and used for the returned predictions.
        y_val: Targets passed as ``eval_set``.
        alpha: Miscoverage level; the model is fit for the ``alpha / 2`` and
            ``1 - alpha / 2`` quantiles (0.05 and 0.95 for the default 0.1).

    Returns:
        A ``(lower, upper)`` tuple: the first and second prediction columns.
    """
    print("Running CatBoost...")
    # Derive the quantile levels from alpha rather than hard-coding 0.05/0.95,
    # so a non-default alpha is honoured.
    q_lower = alpha / 2.0
    q_upper = 1.0 - (alpha / 2.0)

    model = CatBoostRegressor(
        iterations=500,
        loss_function=f'MultiQuantile:alpha={q_lower},{q_upper}',
        verbose=0,
        random_seed=42
    )
    # NOTE(review): the eval_set is the same data the predictions are returned
    # for; if CatBoost uses it for model selection the reported metrics are
    # optimistic — confirm and consider a separate held-out test split.
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    preds = model.predict(X_val)
    return preds[:, 0], preds[:, 1]

def eval_realmlp(X_train, y_train, X_val, y_val, alpha=0.1):
    """Fit a RealMLP quantile model and return its interval bounds on X_val.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_val: Features passed to ``fit`` as validation data and used for the
            returned predictions.
        y_val: Targets passed to ``fit`` as validation data.
        alpha: Miscoverage level; the model is trained on the ``alpha / 2``
            and ``1 - alpha / 2`` quantiles.

    Returns:
        A ``(lower, upper)`` tuple holding the first and second prediction
        columns, or ``(None, None)`` when RealMLP is unavailable.
    """
    if HAS_REALMLP:
        print("Running RealMLP (Quantile)...")

        # Symmetric quantile pair around the median.
        tail = alpha / 2.0
        lo_q, hi_q = tail, 1.0 - tail

        regressor = RealMLP_TD_Regressor(
            train_metric_name=f'multi_pinball({lo_q},{hi_q})',
            random_state=42,
            device='cpu',
        )
        regressor.fit(X_train, y_train, X_val=X_val, y_val=y_val)

        quantile_preds = regressor.predict(X_val)
        lower, upper = quantile_preds[:, 0], quantile_preds[:, 1]
        return lower, upper

    # Optional dependency missing: signal "no result" to the caller.
    return None, None

# Main execution: load the data, train each baseline, and print the comparison.
# Load California housing and hold out 20% as the validation split.
data = fetch_california_housing()
X, y = data.data, data.target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# alpha = 0.1; one result dict per model is collected in results_list.
metrics = SimplifiedMetrics(0.1)
results_list = []

# 1. CatBoost baseline
cb_low, cb_high = eval_catboost(X_train, y_train, X_val, y_val)
results_list.append(metrics.compute(y_val, cb_low, cb_high, "CatBoost"))

# 2. RealMLP baseline (best effort: a failure is printed and the run continues)
try:
    rm_low, rm_high = eval_realmlp(X_train, y_train, X_val, y_val)
    # eval_realmlp returns (None, None) when RealMLP is unavailable.
    if rm_low is not None:
        results_list.append(metrics.compute(y_val, rm_low, rm_high, "RealMLP"))
except Exception as e:
    print(f"RealMLP Failed: {e}")

# Report: print every metric for each evaluated model.
print("\n=== 对比结果 ===")
for res in results_list:
    print(f"\n模型: {res['Model']}")
    print(f"   分桶回归命中率 (0.2): {res['分桶回归命中率_0.2']}")
    print(f"   分桶回归命中率 (0.4): {res['分桶回归命中率_0.4']}")
    print(f"   预测区间命中率: {res['预测区间命中率']}")
    print(f"   不同宽度区间命中率:")
    # One line per interval-width stratum.
    for k, v in res['不同宽度区间命中率'].items():
        print(f"     {k}: {v}")


Running CatBoost...


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores


Running RealMLP (Quantile)...


/Users/caipengxiang/miniconda3/envs/huawei_tabular/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:175: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
`Trainer.fit` stopped: `max_epochs=256` reached.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores



=== 对比结果 ===

模型: CatBoost
   分桶回归命中率 (0.2): 0.1797
   分桶回归命中率 (0.4): 0.3413
   预测区间命中率: 0.8711
   不同宽度区间命中率:
     Width [0.0, 0.4): 0.7745 (n=204)
     Width [0.4, 0.8): 0.8545 (n=605)
     Width [0.8, 1.2): 0.8807 (n=872)
     Width [1.2, 1.6): 0.8840 (n=836)
     Width [1.6, 2.0): 0.8944 (n=720)
     Width [2.0, 100.0): 0.8652 (n=890)

模型: RealMLP
   分桶回归命中率 (0.2): 0.2464
   分桶回归命中率 (0.4): 0.4312
   预测区间命中率: 0.7597
   不同宽度区间命中率:
     Width [0.0, 0.4): 0.7566 (n=530)
     Width [0.4, 0.8): 0.7819 (n=1683)
     Width [0.8, 1.2): 0.7398 (n=1076)
     Width [1.2, 1.6): 0.7541 (n=488)
     Width [1.6, 2.0): 0.7330 (n=206)
     Width [2.0, 100.0): 0.7591 (n=137)


/Users/caipengxiang/miniconda3/envs/huawei_tabular/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:175: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
