**Kevin Xu**

In [144]:
import polars as pl
import pandas as pd
import pyarrow
from tsfresh.feature_selection.selection import select_features
from sklearn.preprocessing import StandardScaler
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.linear_model import LogisticRegression  
from sklearn.model_selection import GridSearchCV  
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFdr, f_classif 

# Load data with proper decimal handling (for European-style CSV)
features = pl.read_csv("data/features_processed1.csv")
patient_info = pl.read_csv("data/patient_info_processed.csv", ignore_errors=True)

# 打印两个文件的 ID 差异
features_ids = features["ID"].unique().to_list()
patient_info_ids = patient_info["ID"].unique().to_list()
print("仅存在于 features 的 ID:", set(features_ids) - set(patient_info_ids))
print("仅存在于 patient_info 的 ID:", set(patient_info_ids) - set(features_ids))

# 直接筛选两个 DataFrame 的交集 ID
dataX = features.filter(pl.col("ID").is_in(patient_info["ID"]))
dataY = patient_info.filter(pl.col("ID").is_in(features["ID"]))

# 通过 Join 对齐 ID
merged = features.join(patient_info, on="ID", how="inner")

仅存在于 features 的 ID: set()
仅存在于 patient_info 的 ID: {66, 99, 4, 69, 100, 103, 40, 106, 12, 76, 16, 17, 80, 54, 25, 28, 29, 62}


**Processing Data Seperately**

In [None]:
# 筛选 HRV=0 的数据并确保 ID 唯一性（统一 ID 类型为字符串）
data_hrv0 = (
    merged.filter(pl.col("HRV") == 0)
    .with_columns(pl.col("ID").cast(pl.Utf8))  # 关键修复：统一类型
    .unique(subset=["ID"], keep="first")
    .sort("ID")
)

# 分割特征和目标变量（保留 ACC_TIME）
dataX_hrv0 = data_hrv0.select(pl.exclude(["ADHD", "HRV_TIME", "HRV"]))
dataY_hrv0 = data_hrv0.select(["ID", "ADHD"])

# 时间编码函数（保持不变）
def cyclical_time_encoding_polars(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.with_columns(
            pl.col("ACC_TIME").str.split(":").list.eval(
                pl.element().cast(pl.UInt32).fill_null(0)
            ).alias("time_parts"),
        )
        .with_columns(
            (pl.col("time_parts").list.get(0)*3600 
             + pl.col("time_parts").list.get(1)*60 
             + pl.col("time_parts").list.get(2)
            ).alias("total_seconds")
        )
        .with_columns(
            (2 * np.pi * pl.col("total_seconds") / 86400).alias("radians")
        )
        .with_columns(
            pl.col("radians").sin().alias("ACC_TIME_SIN"),
            pl.col("radians").cos().alias("ACC_TIME_COS")
        )
        .drop(["time_parts", "total_seconds", "radians", "ACC_TIME"])
    )

dataX_hrv0 = cyclical_time_encoding_polars(dataX_hrv0)

# 索引对齐函数（确保类型一致性）
def align_polars_data(X: pl.DataFrame, y: pl.DataFrame) -> tuple[pl.DataFrame, pl.DataFrame]:
    """强制 ID 类型一致后对齐"""
    # 统一类型
    X = X.with_columns(pl.col("ID").cast(pl.Utf8))
    y = y.with_columns(pl.col("ID").cast(pl.Utf8))
    
    # 获取共同 ID
    common_ids = (
        X.select("ID")
        .join(y.select("ID"), on="ID", how="semi")
        .unique()
        .sort("ID")
    )
    return (
        X.join(common_ids, on="ID").sort("ID"),
        y.join(common_ids, on="ID").sort("ID")
    )

dataX_hrv0, dataY_hrv0 = align_polars_data(dataX_hrv0, dataY_hrv0)

# 类型转换（确保 ADHD 为 Float32）
dataY_hrv0 = dataY_hrv0.with_columns(
    pl.col("ADHD").cast(pl.Float32).fill_null(-1)
)

# 最终验证
assert dataX_hrv0["ID"].equals(dataY_hrv0["ID"]), f"""
索引未对齐详情：
- X 类型: {dataX_hrv0['ID'].dtype}, Y 类型: {dataY_hrv0['ID'].dtype}
- 前5个ID对比：
  X: {dataX_hrv0["ID"].head(5).to_list()}
  Y: {dataY_hrv0["ID"].head(5).to_list()}
"""

In [146]:
dataX_hrv0

ID,ACC__variance_larger_than_standard_deviation,ACC__has_duplicate_max,ACC__has_duplicate_min,ACC__has_duplicate,ACC__sum_values,ACC__abs_energy,ACC__mean_abs_change,ACC__mean_change,ACC__mean_second_derivative_central,ACC__median,ACC__mean,ACC__length,ACC__standard_deviation,ACC__variation_coefficient,ACC__variance,ACC__skewness,ACC__kurtosis,ACC__root_mean_square,ACC__absolute_sum_of_changes,ACC__longest_strike_below_mean,ACC__longest_strike_above_mean,ACC__count_above_mean,ACC__count_below_mean,ACC__last_location_of_maximum,ACC__first_location_of_maximum,ACC__last_location_of_minimum,ACC__first_location_of_minimum,ACC__percentage_of_reoccurring_values_to_all_values,ACC__percentage_of_reoccurring_datapoints_to_all_datapoints,ACC__sum_of_reoccurring_values,ACC__sum_of_reoccurring_data_points,ACC__ratio_value_number_to_time_series_length,ACC__sample_entropy,ACC__maximum,ACC__minimum,ACC__benford_correlation,…,ACC__permutation_entropy__dimension_7__tau_1,"ACC__matrix_profile__feature_""""""""min""""""""__threshold_0.98","ACC__matrix_profile__feature_""""""""max""""""""__threshold_0.98","ACC__matrix_profile__feature_""""""""mean""""""""__threshold_0.98","ACC__matrix_profile__feature_""""""""median""""""""__threshold_0.98","ACC__matrix_profile__feature_""""""""25""""""""__threshold_0.98","ACC__matrix_profile__feature_""""""""75""""""""__threshold_0.98",SEX,AGE,ACC,ACC_DAYS,HRV_HOURS,CPT_II,ADD,BIPOLAR,UNIPOLAR,ANXIETY,SUBSTANCE,OTHER,CT,MDQ_POS,WURS,ASRS,MADRS,HADS_A,HADS_D,MED,MED_Antidepr,MED_Moodstab,MED_Antipsych,MED_Anxiety_Benzo,MED_Sleep,MED_Analgesics_Opioids,MED_Stimulants,filter_$,ACC_TIME_SIN,ACC_TIME_COS
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64
"""10""",1.0,0.0,1.0,1.0,887574.0,2.2537724e8,42.327399,0.0,-0.000655,9.0,68.401202,12976.0,112.65013,1.646903,12690.051737,3.395866,22.100605,131.790653,549198.0,365.0,133.0,4256.0,8720.0,0.926942,0.926865,1.0,0.0,0.901639,0.999075,28141.0,873258.0,0.009402,0.18662,1726.0,0.0,0.933277,…,5.878335,3.534709,17.137102,12.516565,12.927879,11.304366,14.140833,1,3,1,9.0,0,0,0,0,0,1,1,1,0,0,42,38,16,10,5,0,0,0,0,0,0,0,0,1,0.551937,-0.833886
"""108""",1.0,0.0,1.0,1.0,3.354992e6,3.1757e9,171.380145,0.0,-0.04343,143.0,353.157053,9500.0,457.778304,1.296246,209560.975545,1.549465,2.273884,578.170286,1.62794e6,517.0,176.0,3297.0,6203.0,0.940526,0.940421,1.0,0.0,0.960526,0.999368,78885.0,3.336332e6,0.016,0.312297,3637.0,0.0,0.963084,…,6.904089,2.329597,8.991218,6.440952,6.5034,5.759727,7.189762,1,3,1,6.6,0,1,0,0,1,0,1,0,1,1,65,51,0,0,0,9,0,0,0,0,0,0,0,1,0.442289,-0.896873
"""13""",1.0,0.0,1.0,1.0,3.378861e6,2.8651e9,177.520324,0.0,0.0,166.0,325.422421,10383.0,412.357136,1.267144,170038.407711,1.702196,3.571307,525.298163,1.843016e6,270.0,50.0,3913.0,6470.0,0.449196,0.449099,1.0,0.0,0.954248,0.999326,79353.0,3.359021e6,0.014736,0.263236,3418.0,0.0,0.986672,…,6.499379,2.852002,15.577571,12.445792,12.941399,11.538077,13.859342,0,3,1,7.2,0,1,0,0,0,0,0,1,0,0,48,0,5,3,2,1,0,0,0,0,0,0,1,1,1.2246e-16,-1.0
"""14""",1.0,0.0,1.0,1.0,1.743156e6,1.1998e9,117.746384,0.0,-0.003372,31.0,189.555894,9196.0,307.462115,1.622013,94532.9524,2.13723,4.825765,361.198546,1.082678e6,327.0,100.0,2692.0,6504.0,0.015115,0.015007,1.0,0.0,0.970803,0.999565,59300.0,1.7356e6,0.014898,0.275212,2150.0,0.0,0.987569,…,6.48523,2.631817,11.868038,9.305237,9.703475,8.756306,10.405446,0,4,1,6.4,0,1,0,0,0,0,1,1,0,0,52,49,9,9,0,0,0,0,0,0,0,0,0,1,-0.5,-0.866025
"""18""",1.0,0.0,1.0,1.0,2.495675e6,1.7377e9,122.902955,0.0,-0.029895,57.0,213.744005,11676.0,321.149272,1.502495,103136.85473,2.163544,5.967227,385.776301,1.434892e6,153.0,123.0,3812.0,7864.0,0.147482,0.147396,1.0,0.0,0.974843,0.999657,70419.0,2.485211e6,0.013618,0.293919,2990.0,0.0,0.97324,…,6.763202,2.702909,11.395531,8.474024,8.641447,7.758865,9.348692,0,2,1,8.1,0,1,0,1,0,1,0,0,1,1,43,52,37,11,13,1,1,0,0,0,0,0,0,1,0.5,-0.866025
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""72""",1.0,0.0,1.0,1.0,2.329983e6,1.6699e9,127.529888,0.000273,-0.021201,79.0,211.970797,10992.0,327.085048,1.543067,106984.628605,2.750711,11.220433,389.764349,1.401681e6,213.0,82.0,3542.0,7450.0,0.603439,0.603348,0.983715,0.0,0.97351,0.999636,82702.0,2.317964e6,0.013737,0.356675,3991.0,0.0,0.99403,…,6.785565,1.907301,10.360375,7.423277,7.638297,6.437911,8.625025,1,2,1,7.6,0,1,0,0,0,0,1,0,0,0,25,28,10,5,5,1,1,0,0,0,0,0,0,1,0.707107,-0.707107
"""74""",1.0,0.0,1.0,1.0,2.561778e6,2.3386e9,136.330167,0.0,-0.021813,42.0,248.306484,10317.0,406.226791,1.635989,165020.205379,2.479431,8.666597,476.105362,1.406382e6,396.0,64.0,3114.0,7203.0,0.57759,0.577493,1.0,0.0,0.967105,0.999515,93835.0,2.544984e6,0.014733,0.21202,4259.0,0.0,0.988354,…,6.137602,2.156442,11.156693,8.252393,8.562514,7.561757,9.325718,0,2,1,7.2,0,1,1,0,1,0,0,0,0,0,40,42,9,3,2,0,0,0,0,0,0,0,0,1,0.5,-0.866025
"""8""",1.0,1.0,1.0,1.0,1.114027e6,7.19229773e8,110.91615,0.000653,-0.016071,47.0,181.703963,6131.0,290.334329,1.597843,84294.022459,2.497339,7.776272,342.505989,679916.0,366.0,65.0,1893.0,4238.0,0.967868,0.936552,0.998858,0.0,0.973856,0.999348,58544.0,1.106366e6,0.024955,0.292656,2263.0,0.0,0.983961,…,6.42847,10.554882,20.791346,17.935688,18.167298,16.951096,18.887614,0,2,1,4.3,0,1,0,1,0,1,1,1,1,1,70,63,19,17,4,1,0,0,0,1,0,0,0,1,1.2246e-16,-1.0
"""92""",1.0,0.0,1.0,1.0,2.530825e6,2.2593e9,152.551494,0.0,-0.001302,59.0,263.517805,9604.0,407.196382,1.545233,165808.89379,2.22725,6.228762,485.026316,1.464952e6,598.0,186.0,3051.0,6553.0,0.880779,0.880675,1.0,0.0,0.980392,0.999688,87561.0,2.52212e6,0.015931,0.233238,3512.0,0.0,0.989473,…,6.351625,5.073497,29.160531,22.889814,22.918779,21.093454,25.42621,1,2,1,6.7,0,1,0,1,0,1,0,0,1,1,60,54,9,12,0,0,0,0,0,0,0,0,0,1,-0.932008,-0.362438


In [147]:
# ===================== 基础依赖导入 =====================
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_X_y
from sklearn.exceptions import NotFittedError
import pandas as pd
import numpy as np
import shap

# ===================== 修复特征选择器 =====================
class EnhancedFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, fdr_levels = 0.05, min_features=5, alpha=0.01, max_iter=20000):
        self.fdr_levels = fdr_levels
        self.min_features = max(min_features, 3)
        self.alpha = alpha
        self.max_iter = max_iter
        self.primary_cols_ = []    # 首次选择的列名
        self.primary_idx_ = []     # 首次选择的列索引
        self.secondary_mask_ = []  # 二次选择的布尔掩码
        self.scaler_ = None        # 需要初始化scaler_

    def fit(self, X, y):
        # 初始特征选择
        self._primary_selection(X, y)  # 修正方法名
        
        # 准备二次选择数据
        X_primary = self._get_primary_features(X)
        
        # 标准化处理
        self.scaler_ = StandardScaler().fit(X_primary)
        X_scaled = self.scaler_.transform(X_primary)
        
        # 二次特征选择
        self._secondary_selection(X_scaled, y)
        return self

    def _primary_selection(self, X, y):
        """初次特征选择（FDR/方差）"""
        # 统一使用正确的属性名
        self.primary_cols_ = []
        
        if hasattr(X, 'columns'):
            df_X = X
        else:
            df_X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])

        # FDR选择逻辑
        fdr = self.fdr_levels
        try:
            X_selected = select_features(df_X, y, fdr_level=fdr)
            print(X_selected)
            if len(X_selected.columns) >= self.min_features:
                    self.primary_cols_ = X_selected.columns.tolist()
        except Exception as e:
                print(f"FDR {fdr} 失败: {str(e)}")
        
        # 保底策略
        if not self.primary_cols_:
            print("启用方差保底选择")
            variances = np.var(df_X, axis=0)
            selected_idx = np.argsort(variances)[-self.min_features:]
            self.primary_cols_ = df_X.columns[selected_idx].tolist()
        
        # 记录列索引
        self.primary_idx_ = [df_X.columns.get_loc(col) for col in self.primary_cols_]

    def _secondary_selection(self, X, y):
        """二次特征选择（模型筛选）"""
        try:
            en = LassoCV(
                alphas=[self.alpha],
                max_iter=self.max_iter,
                cv=3,
                random_state=42
            )
            en.fit(X, y)
            self.secondary_mask_ = en.coef_ != 0
            
            # 保底机制
            if np.sum(self.secondary_mask_) < self.min_features:
                print(f"二次选择特征不足({np.sum(self.secondary_mask_)}个)，启用重要性排序")
                top_idx = np.argsort(np.abs(en.coef_))[::-1][:self.min_features]
                self.secondary_mask_ = np.zeros_like(en.coef_, dtype=bool)
                self.secondary_mask_[top_idx] = True
                
        except Exception as e:
            print(f"模型选择失败: {str(e)}, 使用全部初选特征")
            self.secondary_mask_ = np.ones(X.shape[1], dtype=bool)

    def transform(self, X):
        # 获取首次选择特征
        X_primary = self._get_primary_features(X)
        
        # 标准化
        if self.scaler_ is None:
            raise NotFittedError("需要先调用fit方法")
        X_scaled = self.scaler_.transform(X_primary)
        
        # 应用二次选择
        X_final = X_scaled[:, self.secondary_mask_]
        
        # 最终维度验证
        if X_final.shape[1] == 0:
            raise ValueError("最终特征数量为0，请检查选择参数")
        
        assert X_final.shape[1] == len(self.get_feature_names()), "特征维度不匹配"

            
        return X_final

    def _get_primary_features(self, X):
        """统一获取首次选择特征"""
        if isinstance(X, pd.DataFrame):
            return X[self.primary_cols_].values
        else:
            return X[:, self.primary_idx_]

    def get_params(self, deep=True):
        return {'fdr_levels': self.fdr_levels,
                'min_features': self.min_features,
                'alpha': self.alpha}

    def set_params(self, **params):
        if 'fdr_levels' in params:
            params['fdr_levels'] = tuple(params['fdr_levels']) if isinstance(params['fdr_levels'], list) else params['fdr_levels']
        for key, value in params.items():
            setattr(self, key, value)
        return self
    
    def get_feature_names(self):
        """获取最终选择的特征名称"""
        return [col for col, mask in zip(self.primary_cols_, self.secondary_mask_) if mask]

# ===================== 数据准备 ===================== 
# 转换时保留原始数据副本
raw_dataX = dataX_hrv0.to_pandas().copy()
raw_dataY = dataY_hrv0.to_pandas()["ADHD"].copy()

# 索引对齐增强版
def safe_align_index(X, y):
    """安全对齐索引的三重校验"""
    # 第一层校验：索引完全匹配
    if X.index.equals(y.index):
        return X, y
    
    # 第二层校验：ID列匹配
    if 'ID' in X.columns and 'ID' in y.columns:
        common_ids = np.intersect1d(X['ID'], y['ID'])
        X = X[X['ID'].isin(common_ids)].set_index('ID')
        y = y[y['ID'].isin(common_ids)].set_index('ID')
    else:
        # 第三层校验：强制重置索引
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        print("警告：无法对齐索引，已重置索引")
    
    return X, y


dataX_pd, dataY_pd = safe_align_index(raw_dataX, raw_dataY)
dataX_pd = dataX_pd.apply(pd.to_numeric, errors='coerce')

# 增强型特征去重（处理大小写和空格）
dataX_pd.columns = dataX_pd.columns.str.strip().str.lower()
dataX_pd = dataX_pd.loc[:, ~dataX_pd.columns.duplicated(keep='first')]

# 动态缺失值处理（保留原始数据）
missing_threshold = 0.3
missing_cols = dataX_pd.columns[dataX_pd.isna().mean() > missing_threshold]
if len(missing_cols) > 0:
    print(f"删除高缺失率列: {missing_cols.tolist()}")
    dataX_pd = dataX_pd.drop(columns=missing_cols)

# 数据分割（先分割再填充）
X_temp, X_test_raw, y_temp, y_test = train_test_split(
    dataX_pd, dataY_pd,
    test_size=0.2,
    stratify=dataY_pd if dataY_pd.nunique() > 1 else None,
    random_state=6
)

# 安全填充（用训练集中位数填充）
train_median = X_temp.median()
X_train_raw = X_temp.fillna(train_median)
X_test_raw = X_test_raw.fillna(train_median)  # 使用训练集统计量

# 最终数据校验
print(f"[数据报告] 总样本: {len(dataX_pd)} | 特征数: {X_train_raw.shape[1]}")
print(f"训练集正样本比例: {y_temp.mean():.1%} | 测试集正样本比例: {y_test.mean():.1%}")

# 特征类型转换（保留列名）
X_train_raw = X_train_raw.astype(np.float64)
X_test_raw = X_test_raw.astype(np.float64)


X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    dataX_pd, dataY_pd,
    test_size=0.2,
    stratify=dataY_pd,
    random_state=6
)

# ===================== 动态参数配置 =====================
n_positive = sum(y_train == 1)
safe_n_neighbors = max(1, min(3, (n_positive - 1) // 2))  # 修正1：避免除零问题

# 交叉验证策略优化
if n_positive < 5:
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True)
elif 5 <= n_positive < 20:
    cv_strategy = StratifiedKFold(n_splits=5, shuffle=True)
else:
    cv_strategy = StratifiedKFold(n_splits=10, shuffle=True)

# 模型参数配置修正
base_params = {
    'random_state': 42,
    'categorical_features': None,
    'monotonic_cst': None,
    'scoring': 'balanced_accuracy'  # 修正2：使用原生支持的正则化方式
}

if n_positive < 5:
    scoring = make_scorer(roc_auc_score, needs_proba=True)  # 修正3：需要概率预测
    model_config = {
        **base_params,
        'max_depth': 2,              # 限制树深
        'min_samples_leaf': 20       # 防止过拟合
    }
else:
    scoring = make_scorer(balanced_accuracy_score)
    model_config = {
        **base_params,
        'learning_rate': 0.05,
        'max_depth': 3,
        'class_weight': None,        # 修正4：HistGradientBoosting无此参数
        'l2_regularization': 0.1     # 改用正确的正则化参数
    }

# 安全邻居数最终校验（修正5）
safe_n_neighbors = min(safe_n_neighbors, n_positive - 1) if n_positive > 1 else 0

# ===================== 管道构建 =====================
def create_SMOTE_pipeline():
    return Pipeline([
    ('feature_selector', EnhancedFeatureSelector()),
    ('smote', SMOTE(
        sampling_strategy=0.5,  # 将少数类扩至多数类的50%
        k_neighbors=3,          # 降低k值适应小样本
        random_state=42
    )),
    ('classifier', HistGradientBoostingClassifier())
])

# ===================== 模型列表 =====================
models = [
    create_SMOTE_pipeline()
]

# ===================== 验证流程 =====================


results = []
for model in models:
    try:
        # 数据准备增强
        X_array = X_train_raw.astype(np.float64).values
        y_array = y_train.astype(np.int32).values  # 修改点1：使用int32节省内存
        
        # 动态调整交叉验证
        cv_strategy = StratifiedKFold(
            n_splits=min(5, np.bincount(y_array).min()),  # 根据最少类别样本数调整
            shuffle=True,
            random_state=42
        )
        
        # 交叉验证流程
        scores = cross_val_score(
            clone(model),
            X_array,
            y_array,
            cv=cv_strategy,  # 修改点2：动态交叉验证
            scoring=scoring,
            n_jobs=-1,       # 修改点3：启用并行
            error_score='raise'
        )
        
        # 全量训练
        final_model = clone(model).fit(X_array, y_array)
        
        # 特征重要性获取优化
        classifier = final_model.named_steps['classifier']
        selector = final_model.named_steps['feature_selector']
        
        # 修改点4：统一特征重要性获取方式
        if hasattr(classifier, 'feature_importances_'):
            importances = classifier.feature_importances_
        elif hasattr(classifier, 'coef_'):
            importances = np.abs(classifier.coef_[0])
        else:
            explainer = shap.TreeExplainer(classifier)
            shap_values = explainer.shap_values(X_array)
            importances = np.abs(shap_values).mean(axis=0)
        
        # 获取实际选中的特征名称
        try:
            selected_features = selector.get_feature_names()  # 修改点5
            top_features = selected_features[:3]
        except Exception as e:
            print(f"特征名称获取失败: {str(e)}")
            top_features = []
        
        results.append({
            'Model': name,
            'CV Score': f"{np.mean(scores):.3f} ± {np.std(scores):.3f}",
            'Features': len(importances),
            'Top Features': top_features,
            'CV Folds': cv_strategy.n_splits  # 新增指标
        })
        
    except Exception as e:
        error_msg = f"{name} 失败: {type(e).__name__} - {str(e)}"
        print(error_msg)
        continue

# ===================== 结果展示 =====================
if results:
    result_df = pd.DataFrame(results)
    # 添加排序逻辑
    result_df['Mean Score'] = result_df['CV Score'].str.extract(r'(\d+\.\d+)').astype(float)
    print(result_df.sort_values('Mean Score', ascending=False)
                  .drop('Mean Score', axis=1)
                  .to_markdown(index=False))
else:
    print("所有模型运行失败，请检查数据")


model = create_SMOTE_pipeline()

 # 全量训练
model.fit(X_train_raw, y_train)
    
    # 测试评估
test_proba = model.predict_proba(X_test_raw)[:, 1]
test_auc = roc_auc_score(y_test, test_proba)
    
    # 结果展示
print(f"""
    [模型报告]
    测试集AUC: {test_auc:.3f}
    SMOTE参数: k_neighbors={model.named_steps['smote'].k_neighbors}
    """)

[数据报告] 总样本: 47 | 特征数: 812
训练集正样本比例: 24.3% | 测试集正样本比例: 20.0%
    x791  x787  x793  x790  x794  x789  x792  x809  x797  x788  ...  \
0    2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
1    2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
2    0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0  52.0   0.0  ...   
3    1.0   1.0   0.0   1.0   1.0   0.0   0.0   1.0  15.0   0.0  ...   
4    2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
5    0.0   1.0   0.0   0.0   0.0   0.0   0.0   1.0  38.0   1.0  ...   
6    0.0   1.0   0.0   0.0   0.0   0.0   1.0   1.0  28.0   0.0  ...   
7    2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
8    0.0   1.0   1.0   1.0   1.0   0.0   0.0   1.0  62.0   0.0  ...   
9    2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
10   2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
11   2.0   0.0   2.0   2.0   2.0   2.0   2.0   0.0   0.0   2.0  ...   
12   2.0   0.0   

In [148]:
import joblib

# 保存训练参数
joblib.dump({
    'model': model,
    'train_median': X_train_raw.median(),
    'feature_columns': X_train_raw.columns
}, 'trained_model.pkl')

['trained_model.pkl']