In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [2]:
from pathlib import Path
import polars as pl

BASE = Path("/kaggle/input/playground-series-s5e8")
train, test, sub = (pl.read_csv(BASE / f) for f in ("train.csv", "test.csv", "sample_submission.csv"))

In [5]:
# ===== XGBoost (Faster, Fixed) — CV + EarlyStopping(in constructor) + CPU/GPU auto =====
from pathlib import Path
import os, warnings, subprocess
import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import __version__ as sklearn_version
from packaging import version
from joblib import Parallel, delayed

import xgboost as xgb

# ----------------------- Config -----------------------
BASE = Path("/kaggle/input/playground-series-s5e8")
TARGET, ID = "y", "id"
DROP_DURATION = True          # 现实部署建议 True（duration 有潜在泄漏）
FOLDS = 5
SEED = 42
EARLY_STOP = 200
VERBOSE = False

# —— 极速模式（可能略降分；提速明显）——
ULTRA_FAST = False

# 外层并行（多核时 2 折并行较稳；核少时自动关）
CPU = os.cpu_count() or 8
OUTER_JOBS = 2 if CPU >= 16 else 1
INNER_JOBS = max(1, CPU // OUTER_JOBS)

# ----------------------- 环境/版本探测 -----------------------
def has_gpu() -> bool:
    try:
        out = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True)
        return out.returncode == 0 and "GPU" in out.stdout
    except Exception:
        return False

XGB_VER = version.parse(xgb.__version__)
HAS_GPU = has_gpu()

# XGBoost 2.0+：用 device；老版本：用 tree_method
if XGB_VER >= version.parse("2.0.0"):
    DEVICE_KW = dict(tree_method="hist", device=("cuda" if HAS_GPU else "cpu"))
else:
    DEVICE_KW = dict(tree_method=("gpu_hist" if HAS_GPU else "hist"))

# sklearn 1.2+ 的 OneHotEncoder 使用 sparse_output
HAS_SPARSE_OUTPUT = version.parse(sklearn_version) >= version.parse("1.2")

# XGB 原生类别支持（1.6+）
HAS_NATIVE_CAT = XGB_VER >= version.parse("1.6.0")

# ----------------------- Load -------------------------
train_pl = pl.read_csv(BASE / "train.csv")
test_pl  = pl.read_csv(BASE / "test.csv")
sub      = pd.read_csv(BASE / "sample_submission.csv")

if DROP_DURATION and "duration" in train_pl.columns:
    train_pl = train_pl.drop("duration")
    if "duration" in test_pl.columns:
        test_pl = test_pl.drop("duration")

train = train_pl.to_pandas()
test  = test_pl.to_pandas()

y = train[TARGET].astype(int).values
X = train.drop(columns=[TARGET, ID])
X_test = test.drop(columns=[ID], errors="ignore")

# ----------------------- 轻量特征工程 & 同步到测试集 -----------------------
# 1) pdays：-1 表示从未联系过
if "pdays" in X.columns:
    X["was_prev_contacted"] = (X["pdays"] != -1).astype(np.int8)
    X_test["was_prev_contacted"] = (X_test["pdays"] != -1).astype(np.int8)
    X["pdays_pos"] = X["pdays"].where(X["pdays"] >= 0, other=np.nan)
    X_test["pdays_pos"] = X_test["pdays"].where(X_test["pdays"] >= 0, other=np.nan)

# 2) month 周期（同步创建到测试集）
month_map = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,
             "jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
if "month" in X.columns:
    X["month_num"] = pd.Series(X["month"]).map(month_map).fillna(0).astype(int)
    X_test["month_num"] = pd.Series(X_test["month"]).map(month_map).fillna(0).astype(int)
    X["month_sin"] = np.sin(2*np.pi*X["month_num"]/12)
    X["month_cos"] = np.cos(2*np.pi*X["month_num"]/12)
    X_test["month_sin"] = np.sin(2*np.pi*X_test["month_num"]/12)
    X_test["month_cos"] = np.cos(2*np.pi*X_test["month_num"]/12)

# 3) 温和去极值
for col in ["balance", "campaign", "previous"]:
    if col in X.columns:
        lo, hi = X[col].quantile([0.001, 0.999])
        X[col] = X[col].clip(lo, hi)
        if col in X_test.columns:
            X_test[col] = X_test[col].clip(lo, hi)

# —— 统一降精度（先对齐列，防止测试集缺列报 KeyError）——
for c in X.columns:
    if c not in X_test.columns:
        X_test[c] = np.nan

for c in X.columns:
    if str(X[c].dtype) == "category":
        continue
    if pd.api.types.is_integer_dtype(X[c]):
        X[c] = X[c].astype(np.int32)
        X_test[c] = X_test[c].astype(np.int32, errors="ignore")
    elif pd.api.types.is_float_dtype(X[c]) or pd.api.types.is_bool_dtype(X[c]):
        X[c] = X[c].astype(np.float32)
        X_test[c] = X_test[c].astype(np.float32, errors="ignore")

# —— 将 object 转为 category（原生类别更快/更省内存；One-Hot 也可用）——
for c in X.columns:
    if X[c].dtype == "object":
        X[c] = X[c].astype("category")
for c in X.columns:
    if str(X[c].dtype) == "category" and c in X_test.columns:
        X_test[c] = X_test[c].astype("category")

# 最终对齐列顺序
X_test = X_test.reindex(columns=X.columns)

# 列分组
def is_cat(s: pd.Series) -> bool:
    return (s.dtype == "object") or (str(s.dtype) == "category")
cat_cols = [c for c in X.columns if is_cat(X[c])]
num_cols = [c for c in X.columns if c not in cat_cols]

# ----------------------- Params -----------------------
pos = y.sum(); neg = len(y) - pos
scale_pos_weight = neg / max(1, pos)

base_params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.03,
    n_estimators=2000,            # 会被 early stopping 截断
    max_depth=6,
    min_child_weight=4,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bynode=0.85,
    reg_alpha=0.0,
    reg_lambda=1.0,
    max_bin=512,
    random_state=SEED,
    n_jobs=INNER_JOBS,            # 外层并行时限制内层线程数
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=EARLY_STOP,   # 在构造器里传
    **DEVICE_KW,                        # 正确设定 CPU/GPU
)

# 更快的直方图设置（基本不掉分）
base_params.update(dict(
    max_bin=256,
    colsample_bytree=0.75,
    colsample_bynode=0.75,
    subsample=0.80,
    min_child_weight=6,
))
if XGB_VER >= version.parse("1.6.0"):
    base_params["sampling_method"] = "gradient_based"

# ✅ 仅对 XGBoost < 2.0 且有 GPU 时设置 predictor，避免 2.x 的“unused parameter”警告
if XGB_VER < version.parse("2.0.0") and HAS_GPU:
    base_params["predictor"] = "gpu_predictor"

# 极速模式（可能略降分）
if ULTRA_FAST:
    FOLDS = 3
    EARLY_STOP = 100
    base_params.update(dict(
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=8,
        subsample=0.75,
        colsample_bytree=0.70,
        colsample_bynode=0.70,
        n_estimators=1500,
        max_bin=256,
    ))
    base_params["early_stopping_rounds"] = EARLY_STOP

# ----------------------- Encoders (One-Hot fallback prefit) -----------------------
pre = None
if not HAS_NATIVE_CAT and len(cat_cols) > 0:
    if HAS_SPARSE_OUTPUT:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True, dtype=np.float32)
    else:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True, dtype=np.float32)
    pre = ColumnTransformer([("cat", ohe, cat_cols)], remainder="passthrough")
    pre.fit(X)  # 预先全量拟合（不依赖 y），各折复用

# ----------------------- Helpers -----------------------
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

def best_iter_of(model: xgb.XGBClassifier) -> int:
    if hasattr(model, "best_iteration_") and model.best_iteration_ is not None:
        return int(model.best_iteration_)
    try:
        return int(model.get_booster().best_ntree_limit)
    except Exception:
        return int(getattr(model, "n_estimators", 2000))

def run_fold(tr_idx, va_idx):
    if HAS_NATIVE_CAT:
        params = base_params | dict(enable_categorical=True)
        model = xgb.XGBClassifier(**params)
        model.fit(X.iloc[tr_idx], y[tr_idx], eval_set=[(X.iloc[va_idx], y[va_idx])], verbose=VERBOSE)
        proba_va = model.predict_proba(X.iloc[va_idx])[:, 1]
    else:
        Xtr = pre.transform(X.iloc[tr_idx]) if pre is not None else X.iloc[tr_idx]
        Xva = pre.transform(X.iloc[va_idx]) if pre is not None else X.iloc[va_idx]
        model = xgb.XGBClassifier(**base_params)
        model.fit(Xtr, y[tr_idx], eval_set=[(Xva, y[va_idx])], verbose=VERBOSE)
        proba_va = model.predict_proba(Xva)[:, 1]
    return va_idx, proba_va, model

# ----------------------- OOF CV（可外层并行） -----------------------
folds = list(skf.split(X, y))
if OUTER_JOBS > 1:
    results = Parallel(n_jobs=OUTER_JOBS, prefer="threads")(delayed(run_fold)(tr, va) for tr, va in folds)
else:
    results = [run_fold(tr, va) for tr, va in folds]

oof = np.zeros(len(y), dtype=float)
best_iters = []
for va_idx, proba_va, model in results:
    oof[va_idx] = proba_va
    bi = best_iter_of(model)
    best_iters.append(bi)
    print(f"[Fold] best_iter={bi}  AUC={roc_auc_score(y[va_idx], oof[va_idx]):.4f}")

auc = roc_auc_score(y, oof)
f1_05 = f1_score(y, (oof >= 0.5).astype(int))
prec, rec, thr = precision_recall_curve(y, oof)
f1s = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
best_ix = int(np.nanargmax(f1s))
best_thr = float(thr[best_ix])
print(f"[OOF] AUC={auc:.4f} | F1@0.5={f1_05:.4f} | BestF1={f1s[best_ix]:.4f} @thr={best_thr:.4f}")

# ----------------------- Full fit with tuned n_estimators -----------------------
best_round = max(100, int(np.median(best_iters)))
print(f"[Full] training with n_estimators={best_round} (median of CV best iters)")

if HAS_NATIVE_CAT:
    params_full = base_params | dict(enable_categorical=True, n_estimators=best_round)
    final = xgb.XGBClassifier(**params_full)
    # 用随机 5% 作为早停验证（更快；若不稳定可改回 10%）
    rng = np.random.RandomState(SEED)
    idx = rng.permutation(len(X))
    cut = int(0.95 * len(X))
    tr_idx, va_idx = idx[:cut], idx[cut:]
    final.fit(X.iloc[tr_idx], y[tr_idx], eval_set=[(X.iloc[va_idx], y[va_idx])], verbose=VERBOSE)
    test_proba = final.predict_proba(X_test)[:, 1]
else:
    if pre is None:
        Xt_all, Xt_tst = X, X_test
    else:
        Xt_all = pre.transform(X)
        Xt_tst = pre.transform(X_test)
    params_full = base_params | dict(n_estimators=best_round)
    final = xgb.XGBClassifier(**params_full)

    rng = np.random.RandomState(SEED)
    idx = rng.permutation(len(X))
    cut = int(0.95 * len(X))
    tr_idx, va_idx = idx[:cut], idx[cut:]

    X_tr, y_tr = Xt_all[tr_idx], y[tr_idx]
    X_va, y_va = Xt_all[va_idx], y[va_idx]
    final.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=VERBOSE)
    test_proba = final.predict_proba(Xt_tst)[:, 1]

# ----------------------- Submit -----------------------
out = sub.copy()
out["y"] = test_proba
out.to_csv("submission_xgb_sota.csv", index=False)
print("Saved -> submission_xgb_sota.csv")
print(out.head())


[Fold] best_iter=2000  AUC=0.8598
[Fold] best_iter=2000  AUC=0.8553
[Fold] best_iter=2000  AUC=0.8550
[Fold] best_iter=2000  AUC=0.8570
[Fold] best_iter=2000  AUC=0.8544
[OOF] AUC=0.8563 | F1@0.5=0.4713 | BestF1=0.5310 @thr=0.7031
[Full] training with n_estimators=2000 (median of CV best iters)
Saved -> submission_xgb_sota.csv
       id         y
0  750000  0.231733
1  750001  0.349984
2  750002  0.356784
3  750003  0.001271
4  750004  0.618832


In [6]:
# ===== LightGBM (Fast, Fixed) — CV + EarlyStopping + CPU/GPU auto =====
from pathlib import Path
import os, warnings, subprocess
import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
from joblib import Parallel, delayed

import lightgbm as lgb
from packaging import version

warnings.filterwarnings("ignore")

# ----------------------- Config -----------------------
BASE = Path("/kaggle/input/playground-series-s5e8")
TARGET, ID = "y", "id"
DROP_DURATION = True          # 现实部署建议 True（duration 有潜在泄漏）
FOLDS = 5
SEED = 42
EARLY_STOP = 200
VERBOSE = False               # 控制训练日志

# —— 极速模式（可能略降分；提速明显）——
ULTRA_FAST = False

# 外层并行（多核时 2 折并行较稳；核少时自动关）
CPU = os.cpu_count() or 8
OUTER_JOBS = 2 if CPU >= 16 else 1
INNER_JOBS = max(1, CPU // OUTER_JOBS)

# ----------------------- 环境/版本探测 -----------------------
def has_gpu() -> bool:
    try:
        out = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True)
        return out.returncode == 0 and "GPU" in out.stdout
    except Exception:
        return False

HAS_GPU = has_gpu()
LGB_VER = version.parse(lgb.__version__)

# LightGBM 设备参数
DEVICE_KW = dict(device_type=("gpu" if HAS_GPU else "cpu"))

# ----------------------- Load -------------------------
train_pl = pl.read_csv(BASE / "train.csv")
test_pl  = pl.read_csv(BASE / "test.csv")
sub      = pd.read_csv(BASE / "sample_submission.csv")

if DROP_DURATION and "duration" in train_pl.columns:
    train_pl = train_pl.drop("duration")
    if "duration" in test_pl.columns:
        test_pl = test_pl.drop("duration")

train = train_pl.to_pandas()
test  = test_pl.to_pandas()

y = train[TARGET].astype(int).values
X = train.drop(columns=[TARGET, ID])
X_test = test.drop(columns=[ID], errors="ignore")

# ----------------------- 轻量特征工程 & 同步到测试集 -----------------------
# 1) pdays：-1 表示从未联系过
if "pdays" in X.columns:
    X["was_prev_contacted"] = (X["pdays"] != -1).astype(np.int8)
    X_test["was_prev_contacted"] = (X_test["pdays"] != -1).astype(np.int8)
    X["pdays_pos"] = X["pdays"].where(X["pdays"] >= 0, other=np.nan)
    X_test["pdays_pos"] = X_test["pdays"].where(X_test["pdays"] >= 0, other=np.nan)

# 2) month 周期
month_map = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,
             "jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
if "month" in X.columns:
    X["month_num"] = pd.Series(X["month"]).map(month_map).fillna(0).astype(int)
    X_test["month_num"] = pd.Series(X_test["month"]).map(month_map).fillna(0).astype(int)
    X["month_sin"] = np.sin(2*np.pi*X["month_num"]/12)
    X["month_cos"] = np.cos(2*np.pi*X["month_num"]/12)
    X_test["month_sin"] = np.sin(2*np.pi*X_test["month_num"]/12)
    X_test["month_cos"] = np.cos(2*np.pi*X_test["month_num"]/12)

# 3) 温和去极值
for col in ["balance", "campaign", "previous"]:
    if col in X.columns:
        lo, hi = X[col].quantile([0.001, 0.999])
        X[col] = X[col].clip(lo, hi)
        if col in X_test.columns:
            X_test[col] = X_test[col].clip(lo, hi)

# —— 统一降精度（先对齐列，防止测试集缺列报错）——
for c in X.columns:
    if c not in X_test.columns:
        X_test[c] = np.nan

for c in X.columns:
    if str(X[c].dtype) == "category":
        continue
    if pd.api.types.is_integer_dtype(X[c]):
        X[c] = X[c].astype(np.int32)
        X_test[c] = X_test[c].astype(np.int32, errors="ignore")
    elif pd.api.types.is_float_dtype(X[c]) or pd.api.types.is_bool_dtype(X[c]):
        X[c] = X[c].astype(np.float32)
        X_test[c] = X_test[c].astype(np.float32, errors="ignore")

# —— 将 object 转为 category（LightGBM 原生类别支持，速度/内存友好）——
for c in X.columns:
    if X[c].dtype == "object":
        X[c] = X[c].astype("category")
for c in X.columns:
    if str(X[c].dtype) == "category" and c in X_test.columns:
        X_test[c] = X_test[c].astype("category")

# 最终对齐列顺序
X_test = X_test.reindex(columns=X.columns)

# 类别列/数值列
def is_cat(s: pd.Series) -> bool:
    return (s.dtype == "object") or (str(s.dtype) == "category")

cat_cols = [c for c in X.columns if is_cat(X[c])]
num_cols = [c for c in X.columns if c not in cat_cols]

# ----------------------- Params -----------------------
pos = y.sum(); neg = len(y) - pos
scale_pos_weight = neg / max(1, pos)

base_params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.03,
    n_estimators=2000,            # 会被 early stopping 截断
    num_leaves=63,                # ~ 2^6-1（对应 max_depth≈6）
    max_depth=-1,                 # 由 num_leaves 控制复杂度
    min_data_in_leaf=40,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    lambda_l1=0.0,
    lambda_l2=1.0,
    max_bin=255,
    random_state=SEED,
    n_jobs=INNER_JOBS,            # ✅ 外层并行时限制内层线程数，避免超订
    scale_pos_weight=scale_pos_weight,
    force_col_wise=True,          # ✅ CPU 上更稳更省内存
    **DEVICE_KW,                  # ✅ 正确设定 CPU/GPU
    verbosity=-1,
)

# 稍更“竞赛友好”的直方图/抽样设置（速度与稳健折中）
base_params.update(dict(
    feature_fraction=0.75,
    bagging_fraction=0.80,
    min_data_in_leaf=60,
))

# 极速模式（可能略降分）
if ULTRA_FAST:
    FOLDS = 3
    EARLY_STOP = 100
    base_params.update(dict(
        learning_rate=0.05,
        num_leaves=48,
        min_data_in_leaf=80,
        feature_fraction=0.70,
        bagging_fraction=0.75,
        n_estimators=1500,
        max_bin=255,
    ))

# ----------------------- Helpers -----------------------
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

def best_iter_of(model: lgb.LGBMClassifier) -> int:
    bi = getattr(model, "best_iteration_", None)
    if bi is not None and bi > 0:
        return int(bi)
    return int(getattr(model, "n_estimators", 2000))

def fit_with_es(model, X_tr, y_tr, X_va, y_va, cat_cols, early_stopping_rounds, verbose):
    """兼容不同 LightGBM 版本的早停写法（优先用 callbacks）。"""
    try:
        # 新写法：callbacks
        return model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="auc",
            categorical_feature=cat_cols if len(cat_cols) > 0 else "auto",
            callbacks=[
                lgb.early_stopping(early_stopping_rounds, verbose=verbose),
                lgb.log_evaluation(period=50 if verbose else 0),
            ]
        )
    except TypeError:
        # 旧版本兜底
        return model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="auc",
            categorical_feature=cat_cols if len(cat_cols) > 0 else "auto",
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose
        )

def run_fold(tr_idx, va_idx):
    model = lgb.LGBMClassifier(**base_params)
    model = fit_with_es(model,
                        X.iloc[tr_idx], y[tr_idx],
                        X.iloc[va_idx], y[va_idx],
                        cat_cols=cat_cols,
                        early_stopping_rounds=EARLY_STOP,
                        verbose=VERBOSE)
    proba_va = model.predict_proba(X.iloc[va_idx])[:, 1]
    return va_idx, proba_va, model

# ----------------------- OOF CV（可外层并行） -----------------------
folds = list(skf.split(X, y))
if OUTER_JOBS > 1:
    results = Parallel(n_jobs=OUTER_JOBS, prefer="threads")(delayed(run_fold)(tr, va) for tr, va in folds)
else:
    results = [run_fold(tr, va) for tr, va in folds]

oof = np.zeros(len(y), dtype=float)
best_iters = []
for va_idx, proba_va, model in results:
    oof[va_idx] = proba_va
    bi = best_iter_of(model)
    best_iters.append(bi)
    print(f"[Fold] best_iter={bi}  AUC={roc_auc_score(y[va_idx], oof[va_idx]):.4f}")

auc = roc_auc_score(y, oof)
f1_05 = f1_score(y, (oof >= 0.5).astype(int))
prec, rec, thr = precision_recall_curve(y, oof)
f1s = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
best_ix = int(np.nanargmax(f1s))
best_thr = float(thr[best_ix])
print(f"[OOF] AUC={auc:.4f} | F1@0.5={f1_05:.4f} | BestF1={f1s[best_ix]:.4f} @thr={best_thr:.4f}")

# ----------------------- Full fit with tuned n_estimators -----------------------
best_round = max(100, int(np.median(best_iters)))
print(f"[Full] training with n_estimators={best_round} (median of CV best iters)")

params_full = dict(base_params)
params_full["n_estimators"] = best_round

final = lgb.LGBMClassifier(**params_full)

# 用随机 5% 作为早停验证（更快；若不稳定可改 10%）
rng = np.random.RandomState(SEED)
idx = rng.permutation(len(X))
cut = int(0.95 * len(X))
tr_idx, va_idx = idx[:cut], idx[cut:]

final = fit_with_es(final,
                    X.iloc[tr_idx], y[tr_idx],
                    X.iloc[va_idx], y[va_idx],
                    cat_cols=cat_cols,
                    early_stopping_rounds=EARLY_STOP,
                    verbose=VERBOSE)

test_proba = final.predict_proba(X_test)[:, 1]

# ----------------------- Submit -----------------------
out = sub.copy()
out["y"] = test_proba
out.to_csv("submission_lgbm_sota.csv", index=False)
print("Saved -> submission_lgbm_sota.csv")
print(out.head())




[Fold] best_iter=1925  AUC=0.8605
[Fold] best_iter=1535  AUC=0.8566
[Fold] best_iter=1229  AUC=0.8563
[Fold] best_iter=1827  AUC=0.8579
[Fold] best_iter=1363  AUC=0.8558
[OOF] AUC=0.8574 | F1@0.5=0.4749 | BestF1=0.5328 @thr=0.6971
[Full] training with n_estimators=1535 (median of CV best iters)
Saved -> submission_lgbm_sota.csv
       id         y
0  750000  0.216924
1  750001  0.334075
2  750002  0.348150
3  750003  0.002067
4  750004  0.659697
