In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [3]:
from pathlib import Path
import polars as pl

BASE = Path("/kaggle/input/playground-series-s5e8")
train, test, sub = (pl.read_csv(BASE / f) for f in ("train.csv", "test.csv", "sample_submission.csv"))

In [4]:
train

id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
i64,i64,str,str,str,str,i64,str,str,str,i64,str,i64,i64,i64,i64,str,i64
0,42,"""technician""","""married""","""secondary""","""no""",7,"""no""","""no""","""cellular""",25,"""aug""",117,3,-1,0,"""unknown""",0
1,38,"""blue-collar""","""married""","""secondary""","""no""",514,"""no""","""no""","""unknown""",18,"""jun""",185,1,-1,0,"""unknown""",0
2,36,"""blue-collar""","""married""","""secondary""","""no""",602,"""yes""","""no""","""unknown""",14,"""may""",111,2,-1,0,"""unknown""",0
3,27,"""student""","""single""","""secondary""","""no""",34,"""yes""","""no""","""unknown""",28,"""may""",10,2,-1,0,"""unknown""",0
4,26,"""technician""","""married""","""secondary""","""no""",889,"""yes""","""no""","""cellular""",3,"""feb""",902,1,-1,0,"""unknown""",1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
749995,29,"""services""","""single""","""secondary""","""no""",1282,"""no""","""yes""","""unknown""",4,"""jul""",1006,2,-1,0,"""unknown""",1
749996,69,"""retired""","""divorced""","""tertiary""","""no""",631,"""no""","""no""","""cellular""",19,"""aug""",87,1,-1,0,"""unknown""",0
749997,50,"""blue-collar""","""married""","""secondary""","""no""",217,"""yes""","""no""","""cellular""",17,"""apr""",113,1,-1,0,"""unknown""",0
749998,32,"""technician""","""married""","""secondary""","""no""",-274,"""no""","""no""","""cellular""",26,"""aug""",108,6,-1,0,"""unknown""",0


In [8]:
import polars as pl
from collections import Counter
from itertools import combinations
import math

def eda_bank_pl(df: pl.DataFrame, top_corr: int = 10):
    ID, TARGET = "id", "y"
    CATS = [c for c in ("job","marital","education","default","housing","loan","contact","month","poutcome") if c in df.columns]
    NUMS = [c for c in ("age","balance","day","duration","campaign","pdays","previous") if c in df.columns]  # duration 可能泄漏

    # 1) 基本信息
    dtype_counts = dict(Counter(map(str, df.dtypes)))
    print("▶ SHAPE / MEM / DTYPES")
    print({"shape": (df.height, df.width),
           "mem_MB": round(df.estimated_size() / 2**20, 2),
           "dtypes_counts": dtype_counts})
    print("\n▶ HEAD(3)")
    print(df.head(3))

    # 2) 缺失率 & 基数
    miss = (
        df.null_count()
          .transpose(include_header=True, header_name="col", column_names=["n_miss"])
          .with_columns(
              pl.col("n_miss").cast(pl.UInt64),
              (pl.col("n_miss") / df.height * 100).round(2).alias("miss_%")
          )
          .filter(pl.col("n_miss") > 0)
          .sort("miss_%", descending=True)
    )
    if miss.height:
        print("\n▶ MISSING % (non-zero)")
        print(miss)

    nunique = (
        df.select(pl.all().n_unique())
          .transpose(include_header=True, header_name="col", column_names=["nunique"])
          .sort("nunique", descending=True)
    )
    print("\n▶ NUNIQUE (all cols)")
    print(nunique)

    dup = df.height - df.unique().height
    print("\n▶ DUPLICATES (rows):", int(dup))

    # 3) 数值特征概要 & 相关性
    if NUMS:
        print("\n▶ NUMERIC SUMMARY")
        print(df.select(NUMS).describe())

        if len(NUMS) >= 2:
            pairs = []
            num_df = df.select(NUMS)
            for a, b in combinations(NUMS, 2):
                r = num_df.select(pl.corr(a, b)).to_series().item()
                r = 0.0 if r is None or (isinstance(r, float) and math.isnan(r)) else float(r)
                pairs.append({"A": a, "B": b, "|r|": abs(r), "r": r})
            top = pl.DataFrame(pairs).sort("|r|", descending=True).head(top_corr)
            print(f"\n▶ TOP {top_corr} |corr| PAIRS (numeric)")
            print(top)

    # 4) 类别特征：基数与与目标的关联
    if CATS:
        card = (
            df.select(CATS).select(pl.all().n_unique())
              .transpose(include_header=True, header_name="col", column_names=["nunique"])
              .sort("nunique")
        )
        print("\n▶ CATEGORICAL CARDINALITY (low→high)")
        print(card)

        if TARGET in df.columns:
            for c in CATS:
                out = (
                    df.group_by(c)
                      .agg(n=pl.len(), rate=pl.col(TARGET).mean())
                      .sort(["rate","n"], descending=[True, True])
                      .head(6)
                )
                print(f"\n▶ {c} vs {TARGET} (top by rate)")
                print(out)

    # 5) 目标分布
    if TARGET in df.columns:
        counts = df.group_by(TARGET).len().sort(TARGET)
        ctr = float(df.select(pl.col(TARGET).mean()).to_series().item())
        print(f"\n▶ TARGET '{TARGET}' counts")
        print(counts)
        print(f"CTR (mean of {TARGET}): {ctr:.4f}")


eda_bank_pl(train)


▶ SHAPE / MEM / DTYPES
{'shape': (750000, 18), 'mem_MB': 86.64, 'dtypes_counts': {'Int64': 9, 'String': 9}}

▶ HEAD(3)
shape: (3, 18)
┌─────┬─────┬─────────────┬─────────┬───┬───────┬──────────┬──────────┬─────┐
│ id  ┆ age ┆ job         ┆ marital ┆ … ┆ pdays ┆ previous ┆ poutcome ┆ y   │
│ --- ┆ --- ┆ ---         ┆ ---     ┆   ┆ ---   ┆ ---      ┆ ---      ┆ --- │
│ i64 ┆ i64 ┆ str         ┆ str     ┆   ┆ i64   ┆ i64      ┆ str      ┆ i64 │
╞═════╪═════╪═════════════╪═════════╪═══╪═══════╪══════════╪══════════╪═════╡
│ 0   ┆ 42  ┆ technician  ┆ married ┆ … ┆ -1    ┆ 0        ┆ unknown  ┆ 0   │
│ 1   ┆ 38  ┆ blue-collar ┆ married ┆ … ┆ -1    ┆ 0        ┆ unknown  ┆ 0   │
│ 2   ┆ 36  ┆ blue-collar ┆ married ┆ … ┆ -1    ┆ 0        ┆ unknown  ┆ 0   │
└─────┴─────┴─────────────┴─────────┴───┴───────┴──────────┴──────────┴─────┘

▶ NUNIQUE (all cols)
shape: (18, 2)
┌──────────┬─────────┐
│ col      ┆ nunique │
│ ---      ┆ ---     │
│ str      ┆ u32     │
╞══════════╪═════════╡
│ id       ┆

In [1]:
# ===== Decision Tree x3 (gini / entropy / log_loss) =====
from pathlib import Path
import polars as pl
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

# --- config ---
BASE = Path("/kaggle/input/playground-series-s5e8")
TARGET, ID = "y", "id"
DROP_DURATION = True          # 是否丢弃潜在泄漏特征 'duration'
SUBMIT_PROBA  = True          # True=提交概率；False=提交0/1
RANDOM_STATE  = 42
CRITERIA = ["gini", "entropy", "log_loss"]

# --- load with polars (fast), then to pandas for sklearn ---
train_pl = pl.read_csv(BASE / "train.csv")
test_pl  = pl.read_csv(BASE / "test.csv")
sub_pd   = pd.read_csv(BASE / "sample_submission.csv")

if DROP_DURATION and "duration" in train_pl.columns:
    train_pl = train_pl.drop("duration")
    if "duration" in test_pl.columns:
        test_pl = test_pl.drop("duration")

train = train_pl.to_pandas()
test  = test_pl.to_pandas()

# --- split X/y ---
y = train[TARGET].astype(int).values
X = train.drop(columns=[TARGET, ID])
X_test = test.drop(columns=[ID], errors="ignore")

# --- columns ---
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# --- preprocessor: OneHot(dense) + passthrough numeric ---
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

def run_one(criterion: str):
    print(f"\n=== Criterion: {criterion} ===")
    clf = DecisionTreeClassifier(
        criterion=criterion,
        random_state=RANDOM_STATE,
        max_depth=8,
        min_samples_leaf=100,
        class_weight="balanced",
    )
    pipe = Pipeline([("prep", pre), ("clf", clf)])

    # 5-fold OOF
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros(len(y), dtype=float)
    for tr_idx, va_idx in skf.split(X, y):
        pipe.fit(X.iloc[tr_idx], y[tr_idx])
        oof[va_idx] = pipe.predict_proba(X.iloc[va_idx])[:, 1]

    auc = roc_auc_score(y, oof)
    f1  = f1_score(y, (oof >= 0.5).astype(int))
    print(f"[CV] AUC={auc:.4f} | F1@0.5={f1:.4f}")

    # fit full & predict test
    pipe.fit(X, y)
    test_proba = pipe.predict_proba(X_test)[:, 1]
    test_pred  = (test_proba >= 0.5).astype(int)

    out = sub_pd.copy()
    out["y"] = test_proba if SUBMIT_PROBA else test_pred
    out_path = f"submission_{criterion}.csv"
    out.to_csv(out_path, index=False)
    print(f"Saved -> {out_path}")
    return {"criterion": criterion, "auc": auc, "f1@0.5": f1, "file": out_path}

# --- run all three ---
results = [run_one(c) for c in CRITERIA]

# --- summary ---
print("\n=== Summary ===")
for r in results:
    print(f"{r['criterion']:<8}  AUC={r['auc']:.4f}  F1@0.5={r['f1@0.5']:.4f}  -> {r['file']}")



=== Criterion: gini ===
[CV] AUC=0.8103 | F1@0.5=0.4092
Saved -> submission_gini.csv

=== Criterion: entropy ===
[CV] AUC=0.8111 | F1@0.5=0.4088
Saved -> submission_entropy.csv

=== Criterion: log_loss ===
[CV] AUC=0.8111 | F1@0.5=0.4088
Saved -> submission_log_loss.csv

=== Summary ===
gini      AUC=0.8103  F1@0.5=0.4092  -> submission_gini.csv
entropy   AUC=0.8111  F1@0.5=0.4088  -> submission_entropy.csv
log_loss  AUC=0.8111  F1@0.5=0.4088  -> submission_log_loss.csv


In [5]:
# ===== ID3Fast (vectorized, multiway, information gain) with CV + submission =====
from pathlib import Path
import polars as pl
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from joblib import Parallel, delayed

# --- config ---
BASE = Path("/kaggle/input/playground-series-s5e8")
TARGET, ID = "y", "id"
DROP_DURATION = True          # 是否丢弃潜在泄漏特征 'duration'
RANDOM_STATE  = 42
N_SPLITS      = 5
SUBMIT_PROBA  = True          # 提交概率（比赛常用）
N_JOBS        = -1            # 并行折数

# ID3 超参
MAX_DEPTH         = 8
MIN_SAMPLES_LEAF  = 100       # 子节点最少样本
N_BINS_NUMERIC    = 10        # 数值分箱数（等分位）
SPECIAL_PDAYS     = True      # pdays == -1 单独成类
MIN_GAIN          = 1e-6      # 最小信息增益阈值（过小则不分裂，省时）

# ---------- 工具 ----------
def entropy_from_pos_neg(pos: np.ndarray, neg: np.ndarray) -> np.ndarray:
    n = pos + neg
    with np.errstate(divide="ignore", invalid="ignore"):
        p = np.where(n > 0, pos / n, 0.0)
        q = 1.0 - p
        ent = -(np.where(p > 0, p * np.log2(p), 0.0) + np.where(q > 0, q * np.log2(q), 0.0))
    return ent

def parent_entropy(y: np.ndarray) -> float:
    pos = y.sum()
    neg = len(y) - pos
    return float(entropy_from_pos_neg(np.array([pos]), np.array([neg]))[0])

def calc_info_gain(y: np.ndarray, codes: np.ndarray, K: int, min_leaf: int) -> tuple[float, np.ndarray]:
    """返回 (gain, big_idx)；big_idx 是满足 min_leaf 的组索引。
       仅基于 codes 有效的样本（codes>=0）计算父熵与子熵，避免权重偏移。
    """
    mask = codes >= 0
    if not mask.any():
        return 0.0, np.array([], dtype=int)

    y_eff = y[mask]
    g = codes[mask]

    # 每组总数/正数
    cnt  = np.bincount(g, minlength=K)
    posf = np.bincount(g, weights=y_eff, minlength=K)
    pos  = posf.astype(np.int64)

    big  = np.where(cnt >= min_leaf)[0]
    if big.size <= 1:
        return 0.0, np.array([], dtype=int)

    n_eff = int(mask.sum())
    ent_parent = parent_entropy(y_eff)
    ent_child  = entropy_from_pos_neg(pos[big], (cnt[big] - pos[big]))
    w = cnt[big] / n_eff
    gain = ent_parent - float(np.sum(w * ent_child))
    return gain, big

# ---------- ID3 快速版 ----------
class ID3Fast:
    """整数编码 + 向量化信息增益 + 批量预测"""
    def __init__(self, max_depth=8, min_samples_leaf=100, n_bins_numeric=10,
                 random_state=42, special_pdays=True, min_gain=1e-6):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.n_bins_numeric = n_bins_numeric
        self.random_state = random_state
        self.special_pdays = special_pdays
        self.min_gain = min_gain

        # 拟合得到的元信息
        self.num_edges_ = {}        # 数值列 -> edges (np.ndarray)
        self.cat_categories_ = {}   # 类别列 -> pandas Index（训练出现过的类别）
        self.col_names_ = []
        self.col_is_num_ = []       # bool list
        self.col_K_ = []            # 每列的类别数（编码后 0..K-1，未知为 -1）
        self.tree_ = None
        self.major_proba_ = 0.0     # 根节点正类概率（未知回退）

    # ---- 编码器：拟合 ----
    def _fit_encoders(self, X: pd.DataFrame):
        self.col_names_ = X.columns.tolist()
        self.col_is_num_.clear()
        self.num_edges_.clear()
        self.cat_categories_.clear()
        self.col_K_.clear()

        for c in self.col_names_:
            if pd.api.types.is_numeric_dtype(X[c]):
                self.col_is_num_.append(True)
                col = X[c].to_numpy()
                if self.special_pdays and c == "pdays":
                    mask = (col != -1)
                    edges = self._quantile_edges(col[mask], self.n_bins_numeric) if mask.any() else np.array([])
                    self.num_edges_[c] = edges
                    K = max(len(edges) - 1, 0) + 1  # +1 for "pdays=-1"
                else:
                    edges = self._quantile_edges(col, self.n_bins_numeric)
                    self.num_edges_[c] = edges
                    K = max(len(edges) - 1, 0)
                self.col_K_.append(K)
            else:
                self.col_is_num_.append(False)
                cats = pd.Index(pd.Series(X[c], dtype="string").dropna().unique())
                self.cat_categories_[c] = cats
                self.col_K_.append(len(cats))

    @staticmethod
    def _quantile_edges(x: np.ndarray, n_bins: int) -> np.ndarray:
        x = x[~np.isnan(x)]
        if x.size == 0:
            return np.array([])
        qs = np.unique(np.quantile(x, np.linspace(0, 1, n_bins + 1)))
        if qs.size <= 2:
            lo, hi = np.min(x), np.max(x)
            qs = np.unique(np.linspace(lo, hi, min(n_bins + 1, int(len(np.unique(x))) + 1)))
        return qs if qs.size > 1 else np.array([])

    # ---- 编码器：应用（DataFrame -> int 矩阵）----
    def _encode_df(self, X: pd.DataFrame) -> np.ndarray:
        Xc = np.empty((len(X), len(self.col_names_)), dtype=np.int32)
        for j, c in enumerate(self.col_names_):
            if self.col_is_num_[j]:
                vals = X[c].to_numpy()
                if self.special_pdays and c == "pdays":
                    mask = (vals == -1)
                    edges = self.num_edges_.get(c, np.array([]))
                    if edges.size < 2:
                        codes = np.full(len(vals), -1, dtype=np.int32)
                    else:
                        codes = np.digitize(vals, edges[1:-1], right=False).astype(np.int32)
                    K_base = max(len(edges) - 1, 0)
                    codes[mask] = K_base  # 特殊类
                else:
                    edges = self.num_edges_.get(c, np.array([]))
                    if edges.size < 2:
                        codes = np.full(len(vals), -1, dtype=np.int32)
                    else:
                        codes = np.digitize(vals, edges[1:-1], right=False).astype(np.int32)
                Xc[:, j] = codes
            else:
                cats = self.cat_categories_.get(c, pd.Index([]))
                codes = pd.Categorical(pd.Series(X[c], dtype="string"), categories=cats).codes.astype(np.int32)
                Xc[:, j] = codes  # 未见过的类别为 -1
        return Xc

    # ---- 训练 ----
    def fit(self, X: pd.DataFrame, y: np.ndarray):
        rng = np.random.RandomState(self.random_state)
        y = y.astype(np.int8, copy=False)
        self._fit_encoders(X)
        X_codes = self._encode_df(X)

        # 根节点概率（未知回退）
        self.major_proba_ = float(y.mean()) if len(y) else 0.0

        feats = list(range(X_codes.shape[1]))
        rng.shuffle(feats)
        self.tree_ = self._build(X_codes, y, depth=0, features=feats)
        return self

    def _build(self, Xc: np.ndarray, y: np.ndarray, depth: int, features: list):
        # ——护栏：features 必须是全局列索引，Xc 不做列切片——
        assert Xc.ndim == 2 and all(0 <= f < Xc.shape[1] for f in features), "Feature index out of range."

        node = {
            "is_leaf": False,
            "n": int(len(y)),
            "proba": float(y.mean()) if len(y) else 0.0,
        }
        if (depth >= self.max_depth or
            len(y) < 2 * self.min_samples_leaf or
            y.min() == y.max() or
            len(features) == 0):
            node["is_leaf"] = True
            return node

        best_feat, best_gain, best_big_idx = None, 0.0, None
        for f in features:
            K = self.col_K_[f]
            if K <= 1:
                continue
            gain, big_idx = calc_info_gain(y, Xc[:, f], K, self.min_samples_leaf)
            if gain > best_gain:
                best_feat, best_gain, best_big_idx = f, gain, big_idx

        if best_feat is None or best_gain <= self.min_gain or best_big_idx is None or best_big_idx.size == 0:
            node["is_leaf"] = True
            return node

        node["feat_idx"] = best_feat
        node["children"] = {}
        remaining = [f for f in features if f != best_feat]

        col = Xc[:, best_feat]
        for val in best_big_idx:
            idx = (col == val)
            # 仅切“行”，不切“列” —— 关键修复点
            child = self._build(Xc[idx], y[idx], depth + 1, remaining)
            node["children"][int(val)] = child
        return node

    # ---- 预测（批量路由）----
    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        Xc = self._encode_df(X)
        n = Xc.shape[0]
        out = np.empty(n, dtype=float)

        def apply_node(node, idxs):
            if len(idxs) == 0:
                return
            if node.get("is_leaf", False) or "feat_idx" not in node:
                out[idxs] = node.get("proba", self.major_proba_)
                return
            f = node["feat_idx"]
            col = Xc[idxs, f]

            # 先给默认（未知或被过滤小类）概率
            out[idxs] = node.get("proba", self.major_proba_)
            # 针对每个子值下钻
            for val, child in node["children"].items():
                m = (col == val)
                if m.any():
                    sub = idxs[m]
                    apply_node(child, sub)

        apply_node(self.tree_, np.arange(n))
        out = np.clip(out, 1e-9, 1 - 1e-9)
        return np.vstack([1 - out, out]).T

# ---------- 数据加载 ----------
train_pl = pl.read_csv(BASE / "train.csv")
test_pl  = pl.read_csv(BASE / "test.csv")
sub_pd   = pd.read_csv(BASE / "sample_submission.csv")

if DROP_DURATION and "duration" in train_pl.columns:
    train_pl = train_pl.drop("duration")
    if "duration" in test_pl.columns:
        test_pl = test_pl.drop("duration")

train = train_pl.to_pandas()
test  = test_pl.to_pandas()

y = train[TARGET].astype(int).values
X = train.drop(columns=[TARGET, ID])
X_test = test.drop(columns=[ID], errors="ignore")

# ---------- 5 折 OOF（并行） ----------
def run_fold(tr_idx, va_idx):
    clf = ID3Fast(
        max_depth=MAX_DEPTH,
        min_samples_leaf=MIN_SAMPLES_LEAF,
        n_bins_numeric=N_BINS_NUMERIC,
        random_state=RANDOM_STATE,
        special_pdays=SPECIAL_PDAYS,
        min_gain=MIN_GAIN,
    )
    clf.fit(X.iloc[tr_idx], y[tr_idx])
    proba_va = clf.predict_proba(X.iloc[va_idx])[:, 1]
    return va_idx, proba_va, clf

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
folds = list(skf.split(X, y))
results = Parallel(n_jobs=N_JOBS, prefer="threads")(delayed(run_fold)(tr, va) for tr, va in folds)

oof = np.zeros(len(y), dtype=float)
for va_idx, proba_va, _ in results:
    oof[va_idx] = proba_va

auc = roc_auc_score(y, oof)
f1  = f1_score(y, (oof >= 0.5).astype(int))
print(f"[ID3Fast][CV] AUC={auc:.4f} | F1@0.5={f1:.4f}")

# ---------- 全量训练 + 测试提交 ----------
final_clf = ID3Fast(
    max_depth=MAX_DEPTH,
    min_samples_leaf=MIN_SAMPLES_LEAF,
    n_bins_numeric=N_BINS_NUMERIC,
    random_state=RANDOM_STATE,
    special_pdays=SPECIAL_PDAYS,
    min_gain=MIN_GAIN,
)
final_clf.fit(X, y)
test_proba = final_clf.predict_proba(X_test)[:, 1]
test_pred  = (test_proba >= 0.5).astype(int)

out = sub_pd.copy()
out["y"] = test_proba if SUBMIT_PROBA else test_pred
out.to_csv("submission_id3_fast.csv", index=False)
print("Saved -> submission_id3_fast.csv")
print(out.head())


[ID3Fast][CV] AUC=0.8095 | F1@0.5=0.3657
Saved -> submission_id3_fast.csv
       id         y
0  750000  0.036364
1  750001  0.194175
2  750002  0.102473
3  750003  0.003040
4  750004  0.244898


In [7]:
# ===== C4.5 (gain ratio; numeric=binary split, categorical=multiway) — fixed =====

# --- gain ratio 计算（分类：多叉；数值：二分阈值） ---
def calc_gain_ratio_multi(y: np.ndarray, codes: np.ndarray, K: int, min_leaf: int) -> tuple[float, np.ndarray]:
    mask = codes >= 0
    if not mask.any():
        return 0.0, np.array([], dtype=int)

    y_eff = y[mask]
    g = codes[mask]

    cnt  = np.bincount(g, minlength=K)
    posf = np.bincount(g, weights=y_eff, minlength=K)
    pos  = posf.astype(np.int64)

    big = np.where(cnt >= min_leaf)[0]
    if big.size <= 1:
        return 0.0, np.array([], dtype=int)

    n_eff = int(mask.sum())
    ent_parent = parent_entropy(y_eff)
    ent_child  = entropy_from_pos_neg(pos[big], (cnt[big] - pos[big]))
    w = cnt[big] / n_eff
    ig = ent_parent - float(np.sum(w * ent_child))
    split_info = -np.sum(np.where(w > 0, w * np.log2(w), 0.0))
    if split_info <= 1e-12:
        return 0.0, big
    gr = ig / split_info
    return float(gr), big

def calc_gain_ratio_threshold(y: np.ndarray, codes: np.ndarray, K: int, min_leaf: int) -> tuple[float, int]:
    """数值列：在编码后的 bin 边界 0..K-2 上枚举二分阈值，返回 (best_gr, best_thr_code)。"""
    mask = codes >= 0
    if not mask.any() or K <= 1:
        return 0.0, -1

    y_eff = y[mask].astype(np.int32, copy=False)
    g     = codes[mask].astype(np.int32, copy=False)

    cnt  = np.bincount(g, minlength=K).astype(np.int64)
    posf = np.bincount(g, weights=y_eff, minlength=K)
    pos  = posf.astype(np.int64)

    cnt_cum = np.cumsum(cnt)
    pos_cum = np.cumsum(pos)
    n_eff = int(cnt_cum[-1])

    best_gr, best_t = 0.0, -1
    ent_parent = parent_entropy(y_eff)

    for t in range(0, K - 1):
        lc = int(cnt_cum[t]); rc = int(n_eff - lc)
        if lc < min_leaf or rc < min_leaf:
            continue
        lp = int(pos_cum[t]);  rp = int(pos_cum[-1] - lp)

        ent_left  = entropy_from_pos_neg(np.array([lp]), np.array([lc - lp]))[0]
        ent_right = entropy_from_pos_neg(np.array([rp]), np.array([rc - rp]))[0]

        wl = lc / n_eff; wr = rc / n_eff
        ig = ent_parent - (wl * ent_left + wr * ent_right)
        split_info = - ( (wl * np.log2(wl) if wl > 0 else 0.0) + (wr * np.log2(wr) if wr > 0 else 0.0) )
        if split_info <= 1e-12:
            continue
        gr = ig / split_info
        if gr > best_gr:
            best_gr, best_t = float(gr), int(t)

    return best_gr, best_t

# --- C4.5 快速版：继承 ID3Fast 的编码器，使用增益率；数值列保存阈值并正确路由 ---
class C45Fast(ID3Fast):
    def _build(self, Xc: np.ndarray, y: np.ndarray, depth: int, features: list):
        node = {"is_leaf": False, "n": int(len(y)), "proba": float(y.mean()) if len(y) else 0.0}
        if (depth >= self.max_depth or len(y) < 2 * self.min_samples_leaf or y.min() == y.max() or len(features) == 0):
            node["is_leaf"] = True
            return node

        best_feat, best_score, best_payload = None, 0.0, None
        for f in features:
            K = self.col_K_[f]
            if K <= 1:
                continue
            col = Xc[:, f]
            if self.col_is_num_[f]:
                score, thr = calc_gain_ratio_threshold(y, col, K, self.min_samples_leaf)
                if score > best_score:
                    best_feat, best_score, best_payload = f, score, ("num", int(thr))
            else:
                score, big_idx = calc_gain_ratio_multi(y, col, K, self.min_samples_leaf)
                if score > best_score:
                    best_feat, best_score, best_payload = f, score, ("cat", big_idx)

        if best_feat is None or best_score <= self.min_gain or best_payload is None:
            node["is_leaf"] = True
            return node

        kind, info = best_payload
        node["feat_idx"] = best_feat
        node["split_type"] = kind
        node["children"] = {}
        remaining = [f for f in features if f != best_feat]
        col = Xc[:, best_feat]

        if kind == "num":
            thr = info
            if thr < 0:
                node["is_leaf"] = True
                return node
            node["thr"] = int(thr)   # ✅ 保存阈值（编码后的 bin 索引）
            m_le  = (col >= 0) & (col <= thr)
            m_gt  = (col >  thr)
            if m_le.sum() >= self.min_samples_leaf:
                node["children"]["le"] = self._build(Xc[m_le], y[m_le], depth + 1, remaining)
            if m_gt.sum() >= self.min_samples_leaf:
                node["children"]["gt"] = self._build(Xc[m_gt], y[m_gt], depth + 1, remaining)
        else:
            big_idx = info
            for val in big_idx:
                m = (col == val)
                if m.sum() >= self.min_samples_leaf:
                    node["children"][int(val)] = self._build(Xc[m], y[m], depth + 1, remaining)
        return node

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        Xc = self._encode_df(X)
        n = Xc.shape[0]
        out = np.empty(n, dtype=float)

        def apply_node(node, idxs):
            if len(idxs) == 0:
                return
            if node.get("is_leaf", False) or "feat_idx" not in node:
                out[idxs] = node.get("proba", self.major_proba_)
                return

            f = node["feat_idx"]
            col = Xc[idxs, f]
            out[idxs] = node.get("proba", self.major_proba_)

            if node.get("split_type") == "num":
                thr = node.get("thr", None)  # ✅ 读取保存的阈值
                if thr is None:
                    return
                m_le = (col >= 0) & (col <= thr)
                m_gt = (col >  thr)
                if "le" in node["children"] and m_le.any():
                    apply_node(node["children"]["le"], idxs[m_le])
                if "gt" in node["children"] and m_gt.any():
                    apply_node(node["children"]["gt"], idxs[m_gt])
            else:
                for val, child in node["children"].items():
                    m = (col == int(val))
                    if m.any():
                        apply_node(child, idxs[m])

        apply_node(self.tree_, np.arange(n))
        out = np.clip(out, 1e-9, 1 - 1e-9)
        return np.vstack([1 - out, out]).T

# --- 运行 C4.5（沿用现有 folds 并行） ---
def run_fold_c45(tr_idx, va_idx):
    clf = C45Fast(
        max_depth=MAX_DEPTH,
        min_samples_leaf=MIN_SAMPLES_LEAF,
        n_bins_numeric=N_BINS_NUMERIC,
        random_state=RANDOM_STATE,
        special_pdays=SPECIAL_PDAYS,
        min_gain=MIN_GAIN,
    )
    clf.fit(X.iloc[tr_idx], y[tr_idx])
    proba_va = clf.predict_proba(X.iloc[va_idx])[:, 1]
    return va_idx, proba_va, clf

results_c45 = Parallel(n_jobs=N_JOBS, prefer="threads")(delayed(run_fold_c45)(tr, va) for tr, va in folds)

oof_c45 = np.zeros(len(y), dtype=float)
for va_idx, proba_va, _ in results_c45:
    oof_c45[va_idx] = proba_va

auc_c45 = roc_auc_score(y, oof_c45)
f1_c45  = f1_score(y, (oof_c45 >= 0.5).astype(int))
print(f"[C4.5][CV] AUC={auc_c45:.4f} | F1@0.5={f1_c45:.4f}")

# --- 全量训练 + 提交 ---
final_c45 = C45Fast(
    max_depth=MAX_DEPTH,
    min_samples_leaf=MIN_SAMPLES_LEAF,
    n_bins_numeric=N_BINS_NUMERIC,
    random_state=RANDOM_STATE,
    special_pdays=SPECIAL_PDAYS,
    min_gain=MIN_GAIN,
)
final_c45.fit(X, y)
test_proba_c45 = final_c45.predict_proba(X_test)[:, 1]
test_pred_c45  = (test_proba_c45 >= 0.5).astype(int)

out_c45 = sub_pd.copy()
out_c45["y"] = test_proba_c45 if SUBMIT_PROBA else test_pred_c45
out_c45.to_csv("submission_c45_fast.csv", index=False)
print("Saved -> submission_c45_fast.csv")
print(out_c45.head())


[C4.5][CV] AUC=0.8165 | F1@0.5=0.3836
Saved -> submission_c45_fast.csv
       id         y
0  750000  0.045688
1  750001  0.100774
2  750002  0.126440
3  750003  0.003601
4  750004  0.168831
