# パス定義

In [1]:
#すべての実行を行うディレクトリを指定してください（各モデルのtrain、testの予測確率を保存するために使います）
ALL_FILE_DIR = r"C:\Users\imasu\OneDrive\デスクトップ\コンペ"
#train.csvのpathを入れてください
TRAIN_CSV = r"C:\Users\imasu\OneDrive\デスクトップ\三菱UFJデータ分析コンペ\train.csv"
#test.csvのpathを入れてください
TEST_CSV = r"C:\Users\imasu\OneDrive\デスクトップ\三菱UFJデータ分析コンペ\test.csv"

# モデル学習

## 〇CatBoost

### 前処理

In [8]:
# ==== ① 前処理 & ユーティリティ 定義セル ====
import os, json, re
from datetime import datetime

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from catboost import CatBoostError
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

# =========================
# 列定義
# =========================
TEXT_COLS = ["name", "desc", "keywords_text"]
CAT_COLS  = ["country", "currency"]

NUM_COLS  = [
    # ----- 既存 -----
    "goal", "goal_log1p", "disable_communication",
    "name_len", "name_wc", "desc_len", "desc_wc", "keywords_count",
    "dur_launch_to_deadline_days", "dur_create_to_launch_days", "dur_launch_to_statechange_days",
    "launched_dow", "launched_hour", "launched_month",
    "deadline_dow", "deadline_hour", "deadline_month",

    # ----- 新規: テキスト統計 -----
    "name_unique_wc", "name_type_token_ratio", "name_avg_word_len", "name_max_word_len",
    "name_exclam_cnt", "name_quest_cnt", "name_digit_cnt", "name_upper_ratio", "name_char_diversity",
    "desc_unique_wc", "desc_type_token_ratio", "desc_avg_word_len", "desc_max_word_len",
    "desc_exclam_cnt", "desc_quest_cnt", "desc_digit_cnt", "desc_upper_ratio", "desc_char_diversity",
    "keywords_unique_count", "keywords_avg_len",

    # ----- 新規: テキスト相互関係 -----
    "overlap_nk_cnt", "overlap_nk_jac",
    "overlap_nd_cnt", "overlap_nd_jac",
    "overlap_kd_cnt", "overlap_kd_jac",

    # ----- 新規: 期間/比率 -----
    "dur_create_to_deadline_days", "dur_create_to_statechange_days",
    "ratio_launch_deadline_over_create_launch",

    # ----- 新規: 暦特徴 -----
    "launched_dayofyear", "launched_weekofyear", "launched_quarter",
    "launched_is_weekend", "launched_is_month_start", "launched_is_month_end",
    "deadline_dayofyear", "deadline_weekofyear", "deadline_quarter",
    "deadline_is_weekend", "deadline_is_month_start", "deadline_is_month_end",

    # ----- 新規: 周期エンコード（sin/cos）-----
    "launched_hour_sin", "launched_hour_cos",
    "launched_dow_sin", "launched_dow_cos",
    "launched_month_sin", "launched_month_cos",
    "deadline_hour_sin", "deadline_hour_cos",
    "deadline_dow_sin", "deadline_dow_cos",
    "deadline_month_sin", "deadline_month_cos",

    # ----- 新規: 目標と日数の組み合わせ -----
    "goal_sqrt", "goal_per_day", "goal_per_day_log1p",

    # ----- 新規: フラグ/欠損 -----
    "flag_deadline_before_launch", "flag_statechange_before_launch",
    "flag_missing_deadline", "flag_missing_launched", "flag_missing_created", "flag_missing_state_changed",
    "flag_name_empty", "flag_desc_empty", "flag_keywords_empty",
    "flag_has_number_in_name", "flag_has_number_in_desc", "flag_has_number_in_keywords",
    "flag_has_currency_sign_in_name", "flag_has_currency_sign_in_desc",
]

FEATURE_COLS = TEXT_COLS + CAT_COLS + NUM_COLS

# =========================
# 小ヘルパー
# =========================
def _safe_div(a, b, eps=1e-6):
    return a / (b + eps)

def _to_str(x):
    if isinstance(x, str):
        return x
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    try:
        return str(x)
    except Exception:
        return ""

def _split_tokens_space(s):
    s = _to_str(s)
    s = re.sub(r"\s+", " ", s.strip())
    return s.split(" ") if s else []

def _count_upper_ratio(s):
    s = _to_str(s)
    if not s:
        return 0.0
    letters = [ch for ch in s if ch.isalpha()]
    if not letters:
        return 0.0
    upp = sum(ch.isupper() for ch in letters)
    return _safe_div(upp, len(letters))

def _char_diversity(s):
    s = _to_str(s)
    if not s:
        return 0.0
    return _safe_div(len(set(s)), len(s))

def _avg_word_len(tokens):
    if not tokens:
        return 0.0
    return float(np.mean([len(_to_str(t)) for t in tokens]))

def _max_word_len(tokens):
    if not tokens:
        return 0
    return max(len(_to_str(t)) for t in tokens)

def _digit_count(s):
    s = _to_str(s)
    return sum(ch.isdigit() for ch in s)

def _has_number(s):
    s = _to_str(s)
    return int(any(ch.isdigit() for ch in s))

def _has_currency_sign(s):
    s = _to_str(s)
    return int(any(sign in s for sign in ["$", "€", "£", "¥", "₩", "₹"]))

# =========================
# 特徴量生成
# =========================
def _build_common_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # --- テキスト ---
    for col in ["name", "desc", "keywords"]:
        out[col] = out[col].fillna("").astype(str)

    # keywords: ハイフンをスペースへ
    out["keywords_text"] = out["keywords"].str.replace("-", " ", regex=False)

    # 文字/単語の基本
    out["name_len"] = out["name"].map(len)
    out["name_wc"]  = out["name"].apply(lambda s: len(_split_tokens_space(s)))
    out["desc_len"] = out["desc"].map(len)
    out["desc_wc"]  = out["desc"].apply(lambda s: len(_split_tokens_space(s)))

    # keywords 統計
    def _kw_count(s: str) -> int:
        s = s.strip()
        if s == "":
            return 0
        return s.count("-") + 1
    out["keywords_count"] = out["keywords"].apply(_kw_count)

    # --- 新規テキスト統計（name / desc）---
    for base in ["name", "desc"]:
        toks = out[base].apply(_split_tokens_space)
        out[f"{base}_unique_wc"]       = toks.apply(lambda t: len(set(t)))
        out[f"{base}_type_token_ratio"]= _safe_div(out[f"{base}_unique_wc"], out[f"{base}_wc"])
        out[f"{base}_avg_word_len"]    = toks.apply(_avg_word_len)
        out[f"{base}_max_word_len"]    = toks.apply(_max_word_len)
        out[f"{base}_exclam_cnt"]      = out[base].str.count("!")
        out[f"{base}_quest_cnt"]       = out[base].str.count(r"\?")
        out[f"{base}_digit_cnt"]       = out[base].apply(_digit_count)
        out[f"{base}_upper_ratio"]     = out[base].apply(_count_upper_ratio)
        out[f"{base}_char_diversity"]  = out[base].apply(_char_diversity)

    # keywords の追加統計
    kw_toks = out["keywords_text"].apply(_split_tokens_space)
    out["keywords_unique_count"] = kw_toks.apply(lambda t: len(set(t)))
    out["keywords_avg_len"]      = kw_toks.apply(_avg_word_len)

    # --- テキスト相互関係 ---
    name_set = out["name"].apply(lambda s: set(_split_tokens_space(s)))
    desc_set = out["desc"].apply(lambda s: set(_split_tokens_space(s)))
    kw_set   = out["keywords_text"].apply(lambda s: set(_split_tokens_space(s)))

    def _overlap_stats(a_set, b_set):
        inter = a_set & b_set
        union = a_set | b_set
        cnt = len(inter)
        jac = _safe_div(cnt, len(union) if union else 1.0)
        return [cnt, jac]

    nk_df = pd.DataFrame(
        [_overlap_stats(a, b) for a, b in zip(name_set, kw_set)],
        columns=["overlap_nk_cnt", "overlap_nk_jac"],
        index=out.index
    )
    nd_df = pd.DataFrame(
        [_overlap_stats(a, b) for a, b in zip(name_set, desc_set)],
        columns=["overlap_nd_cnt", "overlap_nd_jac"],
        index=out.index
    )
    kd_df = pd.DataFrame(
        [_overlap_stats(a, b) for a, b in zip(kw_set,  desc_set)],
        columns=["overlap_kd_cnt", "overlap_kd_jac"],
        index=out.index
    )

    out = pd.concat([out, nk_df, nd_df, kd_df], axis=1)

    # --- 数値/論理 ---
    out["goal"] = pd.to_numeric(out["goal"], errors="coerce").fillna(0).clip(lower=0)
    out["goal_log1p"] = np.log1p(out["goal"])
    out["goal_sqrt"]  = np.sqrt(out["goal"])
    out["disable_communication"] = (
        pd.Series(out["disable_communication"]).astype("boolean").fillna(False).astype("int8")
    )

    # --- カテゴリ ---
    for col in CAT_COLS:
        out[col] = out[col].astype(str).fillna("Unknown").replace({"nan": "Unknown"})

    # --- 時刻（UNIX秒） ---
    to_dt = lambda c: pd.to_datetime(out[c], unit="s", utc=True, errors="coerce")
    dl  = to_dt("deadline")
    sc  = to_dt("state_changed_at")
    cr  = to_dt("created_at")
    lau = to_dt("launched_at")

    def _days(a, b):
        sec = (a - b).dt.total_seconds()
        return pd.Series(sec).replace([np.inf, -np.inf], np.nan).astype(float) / 86400.0

    # 期間
    out["dur_launch_to_deadline_days"]     = _days(dl, lau)
    out["dur_create_to_launch_days"]       = _days(lau, cr)
    out["dur_launch_to_statechange_days"]  = _days(sc, lau)
    out["dur_create_to_deadline_days"]     = _days(dl, cr)
    out["dur_create_to_statechange_days"]  = _days(sc, cr)

    out["ratio_launch_deadline_over_create_launch"] = _safe_div(
        out["dur_launch_to_deadline_days"], out["dur_create_to_launch_days"]
    )

    # 募集日数を用いたgoalの正規化
    out["goal_per_day"]       = _safe_div(out["goal"], out["dur_launch_to_deadline_days"])
    out["goal_per_day_log1p"] = np.log1p(out["goal_per_day"].replace([np.inf, -np.inf], np.nan))

    # 暦情報
    out["launched_dow"]   = lau.dt.dayofweek
    out["launched_hour"]  = lau.dt.hour
    out["launched_month"] = lau.dt.month
    out["deadline_dow"]   = dl.dt.dayofweek
    out["deadline_hour"]  = dl.dt.hour
    out["deadline_month"] = dl.dt.month

    # 追加の暦特徴
    out["launched_dayofyear"] = lau.dt.dayofyear
    out["deadline_dayofyear"] = dl.dt.dayofyear

    try:
        out["launched_weekofyear"] = lau.dt.isocalendar().week.astype(float)
        out["deadline_weekofyear"] = dl.dt.isocalendar().week.astype(float)
    except Exception:
        out["launched_weekofyear"] = np.nan
        out["deadline_weekofyear"] = np.nan

    out["launched_quarter"] = lau.dt.quarter
    out["deadline_quarter"] = dl.dt.quarter

    out["launched_is_weekend"] = (out["launched_dow"] >= 5).astype(float)
    out["deadline_is_weekend"] = (out["deadline_dow"] >= 5).astype(float)

    out["launched_is_month_start"] = lau.dt.is_month_start.astype(float)
    out["launched_is_month_end"]   = lau.dt.is_month_end.astype(float)
    out["deadline_is_month_start"] = dl.dt.is_month_start.astype(float)
    out["deadline_is_month_end"]   = dl.dt.is_month_end.astype(float)

    # 周期エンコード（sin/cos）
    def _sin_cyc(v, T):
        v = pd.to_numeric(v, errors="coerce")
        return np.sin(2 * np.pi * v / T)
    def _cos_cyc(v, T):
        v = pd.to_numeric(v, errors="coerce")
        return np.cos(2 * np.pi * v / T)

    out["launched_hour_sin"]  = _sin_cyc(out["launched_hour"], 24)
    out["launched_hour_cos"]  = _cos_cyc(out["launched_hour"], 24)
    out["launched_dow_sin"]   = _sin_cyc(out["launched_dow"], 7)
    out["launched_dow_cos"]   = _cos_cyc(out["launched_dow"], 7)
    out["launched_month_sin"] = _sin_cyc(out["launched_month"], 12)
    out["launched_month_cos"] = _cos_cyc(out["launched_month"], 12)

    out["deadline_hour_sin"]  = _sin_cyc(out["deadline_hour"], 24)
    out["deadline_hour_cos"]  = _cos_cyc(out["deadline_hour"], 24)
    out["deadline_dow_sin"]   = _sin_cyc(out["deadline_dow"], 7)
    out["deadline_dow_cos"]   = _cos_cyc(out["deadline_dow"], 7)
    out["deadline_month_sin"] = _sin_cyc(out["deadline_month"], 12)
    out["deadline_month_cos"] = _cos_cyc(out["deadline_month"], 12)

    # 整合性/欠損フラグ
    out["flag_deadline_before_launch"]    = (out["dur_launch_to_deadline_days"] < 0).astype(int)
    out["flag_statechange_before_launch"] = (out["dur_launch_to_statechange_days"] < 0).astype(int)

    out["flag_missing_deadline"]      = dl.isna().astype(int)
    out["flag_missing_launched"]      = lau.isna().astype(int)
    out["flag_missing_created"]       = cr.isna().astype(int)
    out["flag_missing_state_changed"] = sc.isna().astype(int)

    out["flag_name_empty"]     = (out["name_len"] == 0).astype(int)
    out["flag_desc_empty"]     = (out["desc_len"] == 0).astype(int)
    out["flag_keywords_empty"] = (out["keywords"].str.strip() == "").astype(int)

    out["flag_has_number_in_name"]     = out["name"].apply(_has_number)
    out["flag_has_number_in_desc"]     = out["desc"].apply(_has_number)
    out["flag_has_number_in_keywords"] = out["keywords"].apply(_has_number)

    out["flag_has_currency_sign_in_name"] = out["name"].apply(_has_currency_sign)
    out["flag_has_currency_sign_in_desc"] = out["desc"].apply(_has_currency_sign)

    # 欠損の埋め（数値は -1）
    for col in NUM_COLS:
        out[col] = (
            pd.to_numeric(out[col], errors="coerce")
            .replace([np.inf, -np.inf], np.nan)
            .fillna(-1)
        )

    return out[FEATURE_COLS]

def make_train_features(train_df: pd.DataFrame):
    X = _build_common_features(train_df)
    y = train_df["final_status"].astype(int)
    return X, y

def make_test_features(test_df: pd.DataFrame):
    X_test = _build_common_features(test_df)
    return X_test

def _make_pool(X: pd.DataFrame, y=None, used_text_cols=None, used_cat_cols=None):
    if used_text_cols is None:
        used_text_cols = TEXT_COLS
    if used_cat_cols is None:
        used_cat_cols = CAT_COLS
    used_cols = list(used_text_cols) + list(used_cat_cols) + NUM_COLS
    X_use = X[used_cols].copy()
    kwargs = {}
    if used_cat_cols:
        kwargs["cat_features"] = used_cat_cols
    if used_text_cols:
        kwargs["text_features"] = used_text_cols
    return Pool(X_use, y, **kwargs), used_cols

def predict_with_model(model: CatBoostClassifier, X_test: pd.DataFrame) -> np.ndarray:
    used_cols = getattr(model, "_feature_cols_used", FEATURE_COLS)
    used_text = getattr(model, "_text_cols_used", TEXT_COLS)
    used_cat  = getattr(model, "_cat_cols_used", CAT_COLS)
    X_use = X_test[used_cols].copy()
    kwargs = {}
    if used_cat:
        kwargs["cat_features"] = used_cat
    if used_text:
        kwargs["text_features"] = used_text
    test_pool = Pool(X_use, **kwargs)
    proba = model.predict_proba(test_pool)[:, 1]
    return proba

def search_best_threshold(y_true, y_proba, metric: str = "f1", grid=None):
    if grid is None:
        grid = np.linspace(0.01, 0.99, 99)
    best_thr, best_score = 0.5, -1.0
    for thr in grid:
        y_hat = (y_proba >= thr).astype(int)
        if metric == "accuracy":
            score = accuracy_score(y_true, y_hat)
        else:
            score = f1_score(y_true, y_hat, zero_division=0)
        if score > best_score:
            best_score, best_thr = score, thr
    return best_thr, best_score

def _train_catboost_on_split(
    X_tr: pd.DataFrame,
    y_tr: pd.Series,
    X_va: pd.DataFrame,
    y_va: pd.Series,
    random_state: int = 42,
):
    # テキスト有効性チェック
    token_like_count = 0
    for c in ["name_wc", "desc_wc", "keywords_count"]:
        if c in X_tr.columns:
            token_like_count += X_tr[c].fillna(0).sum()
    use_text = token_like_count > 0

    # クラス重み
    pos = int(y_tr.sum()); neg = int(len(y_tr) - pos)
    class_weights = [1.0, neg / max(pos, 1)] if pos > 0 else [1.0, 1.0]

    used_text_cols = TEXT_COLS if use_text else []
    train_pool, used_cols = _make_pool(X_tr, y_tr, used_text_cols, CAT_COLS)
    valid_pool, _         = _make_pool(X_va, y_va, used_text_cols, CAT_COLS)

    def _new_model():
        return CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            depth=8,
            learning_rate=0.05,
            iterations=2500,
            l2_leaf_reg=3.0,
            random_seed=random_state,
            verbose=200,
            od_type="Iter",
            od_wait=150,
            class_weights=class_weights,
        )

    model = _new_model()
    try:
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    except CatBoostError as e:
        if "Dictionary size is 0" in str(e) or "text_feature_estimators" in str(e):
            print("[WARN] テキスト辞書サイズが0のため、テキスト列を除外して再学習します。")
            used_text_cols = []
            train_pool, used_cols = _make_pool(X_tr, y_tr, used_text_cols, CAT_COLS)
            valid_pool, _         = _make_pool(X_va, y_va, used_text_cols, CAT_COLS)
            model = _new_model()
            model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
        else:
            raise

    setattr(model, "_feature_cols_used", used_cols)
    setattr(model, "_text_cols_used", used_text_cols)
    setattr(model, "_cat_cols_used", CAT_COLS)
    return model

def train_catboost_kfold(
    train_df: pd.DataFrame,
    n_splits: int = 5,
    metric: str = "f1",
    random_state: int = 42,
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    y_all = train_df["final_status"].astype(int).values
    oof_proba = np.zeros(len(train_df), dtype=float)
    oof_pred  = np.zeros(len(train_df), dtype=int)
    models = []; fold_thresholds = []; fold_scores = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(np.zeros(len(train_df)), y_all), 1):
        print(f"\n========== Fold {fold}/{n_splits} ==========")
        df_tr = train_df.iloc[tr_idx].copy()
        df_va = train_df.iloc[va_idx].copy()
        X_tr, y_tr = make_train_features(df_tr)
        X_va, y_va = make_train_features(df_va)

        model = _train_catboost_on_split(X_tr, y_tr, X_va, y_va, random_state=random_state)
        models.append(model)

        proba_va = predict_with_model(model, X_va)
        thr, _ = search_best_threshold(y_va.values, proba_va, metric=metric)
        fold_thresholds.append(thr)

        oof_proba[va_idx] = proba_va
        oof_pred[va_idx]  = (proba_va >= thr).astype(int)

        try:
            auc = roc_auc_score(y_va, proba_va)
        except ValueError:
            auc = np.nan
        f1  = f1_score(y_va, (proba_va >= thr).astype(int), zero_division=0)
        acc = accuracy_score(y_va, (proba_va >= thr).astype(int))
        fold_scores.append({"auc": float(auc), "f1": float(f1), "acc": float(acc)})
        print(f"[Fold {fold}] AUC={auc:.4f}  F1={f1:.4f}  ACC={acc:.4f}  thr={thr:.3f}")

    glob_thr, _ = search_best_threshold(y_all, oof_proba, metric=metric)
    try:
        oof_auc = roc_auc_score(y_all, oof_proba)
    except ValueError:
        oof_auc = np.nan
    oof_f1  = f1_score(y_all, (oof_proba >= glob_thr).astype(int), zero_division=0)
    oof_acc = accuracy_score(y_all, (oof_proba >= glob_thr).astype(int))
    oof_scores = {"auc": float(oof_auc), "f1@glob": float(oof_f1), "acc@glob": float(oof_acc)}
    print(f"\n[OOF] AUC={oof_auc:.4f}  F1@glob={oof_f1:.4f}  ACC@glob={oof_acc:.4f}  glob_thr={glob_thr:.3f}")

    return {
        "models": models,
        "oof_proba": oof_proba,
        "oof_pred": oof_pred,
        "fold_thresholds": fold_thresholds,
        "global_best_threshold": float(glob_thr),
        "fold_scores": fold_scores,
        "oof_scores": oof_scores,
    }

def predict_test_with_models(models, X_test: pd.DataFrame) -> np.ndarray:
    preds = []
    for i, m in enumerate(models, 1):
        proba = predict_with_model(m, X_test)
        preds.append(proba)
    return np.mean(np.column_stack(preds), axis=1)


### 学習

In [10]:
# ==== ② 学習（K-Fold） & OOF保存 セル ====
# ここでは、ALL_FILE_DIR, TRAIN_CSV が事前定義済みであることを前提にします。

# 保存先ディレクトリの用意
cat_dir = os.path.join(ALL_FILE_DIR, "CatBoost")
os.makedirs(cat_dir, exist_ok=True)

# 学習用データの読み込み
train_df = pd.read_csv(TRAIN_CSV)

# K-Fold 学習
N_SPLITS = 5
METRIC = "f1"
RANDOM_STATE = 42

res = train_catboost_kfold(
    train_df=train_df,
    n_splits=N_SPLITS,
    metric=METRIC,
    random_state=RANDOM_STATE,
)

# OOFファイルの保存（id, 確率, 正解ラベル）
oof_df = pd.DataFrame({
    "id": train_df["id"].values,
    "proba": res["oof_proba"],
    "label": train_df["final_status"].astype(int).values,
})
oof_path = os.path.join(cat_dir, "oof.csv")
oof_df.to_csv(oof_path, index=False)
print(f"✅ OOFを保存しました: {oof_path}")

# 後続③で使うためにモデル群と（必要なら）閾値を保持
models = res["models"]
glob_thr = res["global_best_threshold"]
print(f"参考: OOFスコア = {res['oof_scores']}")


### 予測

In [None]:
# ==== ③ テスト推論 & 保存 セル ====
# ここでは、ALL_FILE_DIR, TEST_CSV が事前定義済み、かつ ②の models がメモリにある前提です。

# 保存先（②と同じ）
cat_dir = os.path.join(ALL_FILE_DIR, "CatBoost")
os.makedirs(cat_dir, exist_ok=True)

# テストデータ読み込み & 特徴量
test_df = pd.read_csv(TEST_CSV)
X_test = make_test_features(test_df)

# 推論（fold平均の確率）
test_proba = predict_test_with_models(models, X_test)

# 保存（id, 予測確率）
test_pred_df = pd.DataFrame({
    "id": test_df["id"].values,
    "proba": test_proba,
})
test_pred_path = os.path.join(cat_dir, "test_pred.csv")
test_pred_df.to_csv(test_pred_path, index=False)
print(f"✅ テスト予測を保存しました: {test_pred_path}")


## 〇XGBoost

### 前処理

In [5]:
# 必要ライブラリ
import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, log_loss, roc_curve, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

from xgboost import XGBClassifier


# ---------- 派生特徴（pandas） ----------
def _base_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # 文字列欠損を埋める
    for col in ["name", "desc", "keywords", "country", "currency"]:
        if col in X.columns:
            X[col] = X[col].fillna("")
        else:
            X[col] = ""

    # keywords 正規化（ハイフン→スペース）
    X["keywords_norm"] = X["keywords"].str.replace("-", " ", regex=False)

    # 結合テキスト
    X["text"] = (X["name"].astype(str) + " " +
                 X["desc"].astype(str) + " " +
                 X["keywords_norm"].astype(str)).str.strip()

    # 長さ系/語数系
    X["name_len"] = X["name"].str.len().fillna(0)
    X["desc_len"] = X["desc"].str.len().fillna(0)
    X["name_wc"]  = X["name"].str.split().apply(len).fillna(0)
    X["desc_wc"]  = X["desc"].str.split().apply(len).fillna(0)
    X["keywords_count"] = X["keywords"].apply(
        lambda s: 0 if pd.isna(s) or s=="" else len(str(s).split("-"))
    )

    # テキスト品質系
    def _uniq_ratio(s):
        toks = str(s).split()
        return (len(set(toks)) / max(len(toks), 1)) if toks else 0.0
    def _avg_wlen(s):
        toks = str(s).split()
        return (np.mean([len(w) for w in toks]) if toks else 0.0)

    X["name_unique_ratio"] = X["name"].apply(_uniq_ratio)
    X["desc_unique_ratio"] = X["desc"].apply(_uniq_ratio)
    X["avg_word_len_name"] = X["name"].apply(_avg_wlen)
    X["avg_word_len_desc"] = X["desc"].apply(_avg_wlen)
    X["exclam_count"] = X["text"].str.count("!").fillna(0)
    X["question_count"] = X["text"].str.count(r"\?").fillna(0)
    X["has_desc"] = (X["desc_len"] > 0).astype(int)
    X["has_keywords"] = (X["keywords_count"] > 0).astype(int)

    # bool → int
    if "disable_communication" in X.columns:
        X["disable_communication"] = X["disable_communication"].astype(float).fillna(0).astype(int)
    else:
        X["disable_communication"] = 0

    # UNIX秒 → datetime（UTC）
    for c in ["deadline", "state_changed_at", "created_at", "launched_at"]:
        if c in X.columns:
            X[c + "_dt"] = pd.to_datetime(X[c], unit="s", utc=True, errors="coerce")
        else:
            X[c + "_dt"] = pd.NaT

    # 期間（日）
    X["days_campaign"] = (X["deadline"] - X["launched_at"]) / 86400.0
    X["prep_days"]     = (X["launched_at"] - X["created_at"]) / 86400.0
    X["decision_days"] = (X["state_changed_at"] - X["launched_at"]) / 86400.0

    # launched_atのカレンダ特徴
    X["launched_year"]  = X["launched_at_dt"].dt.year.fillna(0).astype(int)
    X["launched_month"] = X["launched_at_dt"].dt.month.fillna(0).astype(int)
    X["launched_dow"]   = X["launched_at_dt"].dt.dayofweek.fillna(0).astype(int)
    X["launched_hour"]  = X["launched_at_dt"].dt.hour.fillna(0).astype(int)
    try:
        X["launched_week"] = X["launched_at_dt"].dt.isocalendar().week.astype("Int64").fillna(0).astype(int)
    except Exception:
        X["launched_week"] = 0
    X["is_weekend"] = (X["launched_dow"] >= 5).astype(int)

    # 周期表現（sin/cos）
    X["month_sin"] = np.sin(2*np.pi * X["launched_month"]/12.0)
    X["month_cos"] = np.cos(2*np.pi * X["launched_month"]/12.0)
    X["dow_sin"]   = np.sin(2*np.pi * X["launched_dow"]/7.0)
    X["dow_cos"]   = np.cos(2*np.pi * X["launched_dow"]/7.0)
    X["hour_sin"]  = np.sin(2*np.pi * X["launched_hour"]/24.0)
    X["hour_cos"]  = np.cos(2*np.pi * X["launched_hour"]/24.0)

    # 金額系
    X["goal"] = pd.to_numeric(X.get("goal", 0.0), errors="coerce").fillna(0.0)
    X["log_goal"] = np.log1p(X["goal"])
    X["goal_per_day"] = X["log_goal"] / (X["days_campaign"].replace(0, np.nan) + 1.0)
    X["goal_per_day"] = X["goal_per_day"].fillna(0.0)
    X["goal_x_days"] = X["log_goal"] * (X["days_campaign"] + 1.0)
    X["goal_div_prep"] = X["log_goal"] / (X["prep_days"].replace(0, np.nan) + 1.0)
    X["goal_div_prep"] = X["goal_div_prep"].fillna(0.0)

    # カテゴリ結合
    X["country_currency"] = (X["country"].astype(str) + "_" + X["currency"].astype(str)).str.strip()

    # 数値欠損埋め
    for numcol in [
        "goal","log_goal","goal_per_day","goal_x_days","goal_div_prep",
        "disable_communication",
        "name_len","desc_len","name_wc","desc_wc","keywords_count",
        "name_unique_ratio","desc_unique_ratio","avg_word_len_name","avg_word_len_desc",
        "exclam_count","question_count","has_desc","has_keywords",
        "days_campaign","prep_days","decision_days",
        "launched_year","launched_month","launched_dow","launched_hour","launched_week","is_weekend",
        "month_sin","month_cos","dow_sin","dow_cos","hour_sin","hour_cos",
    ]:
        X[numcol] = pd.to_numeric(X.get(numcol, 0.0), errors="coerce").fillna(0.0)

    return X


def _get_feature_groups():
    text_col = "text"
    cat_cols  = ["country", "currency", "country_currency"]
    num_cols  = [
        "goal","log_goal","goal_per_day","goal_x_days","goal_div_prep",
        "disable_communication",
        "name_len","desc_len","name_wc","desc_wc","keywords_count",
        "name_unique_ratio","desc_unique_ratio","avg_word_len_name","avg_word_len_desc",
        "exclam_count","question_count","has_desc","has_keywords",
        "days_campaign","prep_days","decision_days",
        "launched_year","launched_month","launched_dow","launched_hour","launched_week","is_weekend",
        "month_sin","month_cos","dow_sin","dow_cos","hour_sin","hour_cos",
    ]
    return text_col, cat_cols, num_cols


def _build_preprocess():
    text_col, cat_cols, num_cols = _get_feature_groups()

    preprocess = ColumnTransformer(
        transformers=[
            ("txt_word", TfidfVectorizer(max_features=15000, ngram_range=(1,2), min_df=3), text_col),
            ("txt_char", TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=3, max_features=8000), text_col),
            ("kw_tfidf", TfidfVectorizer(max_features=3000, ngram_range=(1,2), min_df=2), "keywords_norm"),
            ("txt_lsa",
             Pipeline([
                 ("tf",  TfidfVectorizer(max_features=12000, ngram_range=(1,2), min_df=3)),
                 ("svd", TruncatedSVD(n_components=64, random_state=42)),
             ]),
             text_col),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
            ("num", StandardScaler(with_mean=False), num_cols),
            ("bin", KBinsDiscretizer(n_bins=5, encode="onehot", strategy="quantile"),
             ["log_goal","days_campaign","prep_days"]),
        ],
        sparse_threshold=0.3,
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return preprocess


# ---------- 学習用/推論用の変換 ----------
def make_features_train(train: pd.DataFrame):
    Xp = _base_feature_engineering(train)
    preprocess = _build_preprocess()
    X_trans = preprocess.fit_transform(Xp)

    if "final_status" not in train.columns:
        raise ValueError("trainに final_status カラムが必要です。")
    y = train["final_status"].astype(int).values
    return X_trans, y, preprocess


def make_features_test(test: pd.DataFrame, preprocess: ColumnTransformer):
    Xp = _base_feature_engineering(test)
    X_test = preprocess.transform(Xp)
    return X_test


# ---------- モデリング ----------
def make_sample_weight_binary(y: np.ndarray) -> np.ndarray:
    classes = np.array([0, 1])
    cw = compute_class_weight(class_weight="balanced", classes=classes, y=y)
    w0, w1 = float(cw[0]), float(cw[1])
    return np.where(y == 1, w1, w0).astype(float)


def train_model(X, y, sample_weight=None):
    model = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
    )
    model.fit(X, y, sample_weight=sample_weight)
    return model


def find_best_threshold(y_true, proba, metric: str = "f1"):
    thresholds = np.linspace(0.0, 1.0, 1001)[1:-1]
    best_th = 0.5
    best_score = -np.inf

    if metric == "f1":
        for th in thresholds:
            pred = (proba >= th).astype(int)
            score = f1_score(y_true, pred)
            if score > best_score:
                best_score, best_th = score, th
        return float(best_th), {"f1": float(best_score)}

    elif metric == "accuracy":
        for th in thresholds:
            pred = (proba >= th).astype(int)
            score = accuracy_score(y_true, pred)
            if score > best_score:
                best_score, best_th = score, th
        return float(best_th), {"accuracy": float(best_score)}

    elif metric == "youden":
        fpr, tpr, ths = roc_curve(y_true, proba)
        j = tpr - fpr
        i = int(np.argmax(j))
        return float(ths[i]), {"youden": float(j[i]), "tpr": float(tpr[i]), "fpr": float(fpr[i])}

    else:
        raise ValueError("metric must be 'f1', 'youden', or 'accuracy'")


def kfold_train_find_threshold(train_df: pd.DataFrame,
                               n_splits: int = 5,
                               metric: str = "f1",
                               random_state: int = 42):
    if "final_status" not in train_df.columns:
        raise ValueError("trainに final_status カラムが必要です。")
    y_all = train_df["final_status"].astype(int).values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_proba = np.zeros(len(train_df), dtype=float)
    fold_scores = []

    for fold, (trn_idx, val_idx) in enumerate(skf.split(train_df, y_all), 1):
        trn_df = train_df.iloc[trn_idx].reset_index(drop=True)
        val_df = train_df.iloc[val_idx].reset_index(drop=True)

        X_tr, y_tr, preprocess_tr = make_features_train(trn_df)
        w_tr = make_sample_weight_binary(y_tr)
        model_tr = train_model(X_tr, y_tr, w_tr)

        X_val = make_features_test(val_df, preprocess_tr)
        proba_val = model_tr.predict_proba(X_val)[:, 1]
        oof_proba[val_idx] = proba_val

        try:
            auc = roc_auc_score(val_df["final_status"].values, proba_val)
        except ValueError:
            auc = np.nan
        ll = log_loss(val_df["final_status"].values, np.clip(proba_val, 1e-6, 1-1e-6))
        fold_scores.append({"fold": fold, "auc": float(auc), "logloss": float(ll)})

    best_th, th_info = find_best_threshold(y_all, oof_proba, metric=metric)

    X_full, y_full, preprocess_full = make_features_train(train_df.reset_index(drop=True))
    w_full = make_sample_weight_binary(y_full)
    model_full = train_model(X_full, y_full, w_full)

    summary = {
        "best_threshold": float(best_th),
        "threshold_metric": metric,
        "threshold_info": th_info,
        "fold_scores": fold_scores,
        "oof_auc": float(roc_auc_score(y_all, oof_proba)),
        "oof_logloss": float(log_loss(y_all, np.clip(oof_proba, 1e-6, 1-1e-6))),
    }
    return model_full, preprocess_full, best_th, summary, oof_proba


### 学習

In [6]:
# ==== データ読込 ====
train = pd.read_csv(TRAIN_CSV)

# ==== KFold学習（OOF作成 & 最終モデル確定）====
model, preprocess, best_th, cvsum, oof_proba = kfold_train_find_threshold(
    train_df=train,
    n_splits=5,
    metric="f1",
    random_state=42
)

# ==== OOF保存（LightGBMディレクトリに保存）====
lightgbm_dir = os.path.join(ALL_FILE_DIR, "LightGBM")
os.makedirs(lightgbm_dir, exist_ok=True)

if "id" not in train.columns:
    raise ValueError("train に 'id' 列が必要です。")
if "final_status" not in train.columns:
    raise ValueError("train に 'final_status' 列が必要です。")

oof_df = pd.DataFrame({
    "id": train["id"].values,
    "proba": oof_proba,
    "final_status": train["final_status"].astype(int).values
})
oof_path = os.path.join(lightgbm_dir, "oof.csv")
oof_df.to_csv(oof_path, index=False)
print(f"✅ OOFを保存しました: {oof_path}")




✅ OOFを保存しました: C:\Users\imasu\OneDrive\デスクトップ\コンペ\LightGBM\oof.csv


### 予測

In [7]:
# ==== test読込 ====
test = pd.read_csv(TEST_CSV)

# ==== 予測確率 ====
X_test = make_features_test(test, preprocess)   # ← ②で作った preprocess を使用
test_proba = model.predict_proba(X_test)[:, 1]  # ← ②で作った model を使用

# ==== 保存（XGBoostディレクトリに保存）====
xgboost_dir = os.path.join(ALL_FILE_DIR, "XGBoost")
os.makedirs(xgboost_dir, exist_ok=True)

if "id" not in test.columns:
    raise ValueError("test に 'id' 列が必要です。")

test_pred_df = pd.DataFrame({
    "id": test["id"].values,
    "proba": test_proba
})
test_pred_path = os.path.join(xgboost_dir, "test_pred.csv")
test_pred_df.to_csv(test_pred_path, index=False)
print(f"✅ test予測を保存しました: {test_pred_path}")


✅ test予測を保存しました: C:\Users\imasu\OneDrive\デスクトップ\コンペ\XGBoost\test_pred.csv


## 〇LightGBM

### 前処理

In [2]:
# ========== ① 前処理：ここだけ実行して関数を定義しておく ==========

import os, re, json
import numpy as np
import pandas as pd
from scipy import sparse
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

# =========================
# 設定（次元数の主なノブ）
# =========================
BOW_MAX_FEATURES = {"name": 3000, "desc": 6000, "keywords": 2000, "all_scripts": 5000}
TFIDF_MAX_FEATURES = {"name": 500, "desc": 1500, "keywords": 500, "all_scripts": 1000}
ALPHAS = [chr(i) for i in range(ord('a'), ord('z')+1)]
TZ = "Asia/Tokyo"
POS_WORDS = set("good great excellent success happy love amazing awesome best positive win growth benefit advantage improve safe secure".split())
NEG_WORDS = set("bad poor terrible fail sad hate awful worst negative lose loss risk disadvantage decline unsafe insecure".split())

# =========================
# ユーティリティ
# =========================
def _prep_text_series(s: pd.Series, kind: str) -> pd.Series:
    s = s.fillna("").astype(str).str.lower()
    if kind == "keywords":
        s = s.str.replace(r"[-_]+", " ", regex=True)
    s = s.str.replace(r"[^a-z0-9!? ]", " ", regex=True)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s

def _word_tokens(text: str):
    return [] if not text else text.split()

def _letter_counts(text: str):
    c = dict.fromkeys(ALPHAS, 0)
    for ch in text:
        if 'a' <= ch <= 'z':
            c[ch] += 1
    return [c[a] for a in ALPHAS]

def _text_basic_stats(s: pd.Series, prefix: str):
    char_len = s.str.len().astype(np.int32)
    word_cnt = s.apply(lambda x: len(_word_tokens(x))).astype(np.int32)
    avg_wlen = (char_len / (word_cnt.replace(0, np.nan))).fillna(0).astype(np.float32)
    q_cnt = s.str.count(r"\?").fillna(0).astype(np.int32)
    e_cnt = s.str.count(r"!").fillna(0).astype(np.int32)
    alpha_mat = np.vstack([_letter_counts(t) for t in s.tolist()]).astype(np.int32)
    alpha_cols = [f"{prefix}_alpha_{a}" for a in ALPHAS]
    df = pd.DataFrame({
        f"{prefix}_char_len": char_len,
        f"{prefix}_word_cnt": word_cnt,
        f"{prefix}_avg_wlen": avg_wlen,
        f"{prefix}_q_cnt": q_cnt,
        f"{prefix}_e_cnt": e_cnt,
    })
    df_alpha = pd.DataFrame(alpha_mat, columns=alpha_cols, index=df.index)
    return pd.concat([df, df_alpha], axis=1)

def _sentiment_counts(s: pd.Series, prefix: str):
    pos = s.apply(lambda x: sum(1 for w in _word_tokens(x) if w in POS_WORDS)).astype(np.int32)
    neg = s.apply(lambda x: sum(1 for w in _word_tokens(x) if w in NEG_WORDS)).astype(np.int32)
    words = s.apply(lambda x: len(_word_tokens(x))).replace(0, np.nan)
    score = ((pos - neg) / words).fillna(0).astype(np.float32)
    return pd.DataFrame({f"{prefix}_pos": pos, f"{prefix}_neg": neg, f"{prefix}_sent_score": score})

def _to_dt(s: pd.Series):
    return pd.to_datetime(s, unit="s", utc=True, errors="coerce").dt.tz_convert(TZ)

def _time_features(dt: pd.Series, prefix: str):
    weekday = dt.dt.weekday.fillna(0).astype(int)
    return pd.DataFrame({
        f"{prefix}_year": dt.dt.year.fillna(0).astype(int),
        f"{prefix}_month": dt.dt.month.fillna(0).astype(int),
        f"{prefix}_day": dt.dt.day.fillna(0).astype(int),
        f"{prefix}_weekday": weekday,
        f"{prefix}_is_weekend": weekday.isin([5, 6]).astype(np.int8),
        f"{prefix}_hour": dt.dt.hour.fillna(0).astype(int),
    })

def _days_diff(a: pd.Series, b: pd.Series):
    return ((b - a).dt.total_seconds() / (3600*24)).astype(np.float32)

def _ohe_fit(df: pd.DataFrame, cols):
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True, dtype=np.float32)
    X = ohe.fit_transform(df[cols].astype(str))
    names = ohe.get_feature_names_out(cols)
    return X, names, ohe

def _ohe_transform(df: pd.DataFrame, cols, ohe):
    X = ohe.transform(df[cols].astype(str))
    names = ohe.get_feature_names_out(cols)
    return X, names

def _ohe_fit_from_ints(df: pd.DataFrame, cols):
    tmp = df[cols].astype("Int64").astype(str).replace("<NA>", "NA")
    return _ohe_fit(tmp, cols)

def _ohe_transform_from_ints(df: pd.DataFrame, cols, ohe: OneHotEncoder):
    tmp = df[cols].astype("Int64").astype(str).replace("<NA>", "NA")
    return _ohe_transform(tmp, cols, ohe)

def _make_interactions(ohe_mat, ohe_names, num_col: np.ndarray, num_name: str):
    num_col = num_col.reshape(-1, 1).astype(np.float32)
    scaled = ohe_mat.multiply(num_col)
    names = [f"{name}*{num_name}" for name in ohe_names]
    return scaled, names

def _sparse_block_to_df(X_csr, names):
    out = {}
    X_csr = X_csr.tocsr()
    for j, name in enumerate(names):
        col = X_csr.getcol(j)
        s = pd.arrays.SparseArray(col.toarray().ravel().astype(np.float32), dtype=pd.SparseDtype("float32", 0.0))
        out[name] = s
    return pd.DataFrame(out)

def _cyc(x_int_series: pd.Series, T: int):
    x = x_int_series.fillna(0).astype(np.float32)
    return np.sin(2*np.pi*x/T).astype(np.float32), np.cos(2*np.pi*x/T).astype(np.float32)

def _jaccard(a: str, b: str):
    A, B = set(_word_tokens(a)), set(_word_tokens(b))
    if not A and not B:
        return 0.0
    return len(A & B) / max(1, len(A | B))

def _rule_flags(raw_s: pd.Series, prefix: str):
    return pd.DataFrame({
        f"{prefix}_has_url": raw_s.str.contains(r"http[s]?://", case=False, regex=True).astype(np.int8),
        f"{prefix}_has_email": raw_s.str.contains(r"[^@\s]+@[^@\s]+\.[^@\s]+", regex=True).astype(np.int8),
        f"{prefix}_num_cnt": raw_s.str.count(r"\d+").fillna(0).astype(np.int16),
        f"{prefix}_upper_ratio": (raw_s.apply(lambda x: sum(c.isupper() for c in x)) / (raw_s.str.len().replace(0, np.nan))).fillna(0).astype(np.float32),
        f"{prefix}_money_symbol": raw_s.str.count(r"[$€£¥]").fillna(0).astype(np.int8),
        f"{prefix}_stopword_ratio": (raw_s.str.lower().str.split().apply(lambda w: 0 if not w else sum(t in {"the","a","to","for","and","of","in"} for t in w)/len(w))).astype(np.float32),
    })

# =========================
# 学習時：fit + transform
# =========================
def fit_features(train: pd.DataFrame):
    df = train.copy()

    raw_name = df["name"].fillna("").astype(str)
    raw_desc = df["desc"].fillna("").astype(str)
    raw_keywords = df["keywords"].fillna("").astype(str)

    df["name"] = raw_name
    df["desc"] = raw_desc
    df["keywords"] = raw_keywords
    df["all_scripts"] = df["name"] + " " + df["desc"] + " " + df["keywords"]

    name_s = _prep_text_series(df["name"], "name")
    desc_s = _prep_text_series(df["desc"], "desc")
    keyw_s = _prep_text_series(df["keywords"], "keywords")
    all_s  = _prep_text_series(df["all_scripts"], "all_scripts")

    created = _to_dt(df["created_at"])
    launched = _to_dt(df["launched_at"])
    deadline = _to_dt(df["deadline"])
    changed  = _to_dt(df["state_changed_at"])
    t_created, t_launched = _time_features(created, "created"), _time_features(launched, "launched")
    t_deadline, t_changed = _time_features(deadline, "deadline"), _time_features(changed, "changed")

    diff_df = pd.DataFrame({
        "d_created_launched": _days_diff(created, launched),
        "d_launched_deadline": _days_diff(launched, deadline),
        "d_created_deadline": _days_diff(created, deadline),
        "d_created_changed": _days_diff(created, changed),
        "d_launched_changed": _days_diff(launched, changed),
        "d_deadline_changed": _days_diff(deadline, changed),
    })

    geo_mat, geo_names, geo_ohe = _ohe_fit(df, ["country", "currency"])

    comm = df["disable_communication"].fillna(False).astype(int)
    goal = df["goal"].fillna(0).astype(np.float32).values
    goal_digits = df["goal"].fillna(0).astype(np.int64).astype(str)\
                    .str.replace(r"[^0-9]", "", regex=True)\
                    .apply(lambda x: len(x.lstrip("0")) if len(x.lstrip("0"))>0 else 1)\
                    .astype(np.int32).values
    proj_df = pd.DataFrame({
        "disable_communication": comm.astype(np.int8),
        "goal": goal.astype(np.float32),
        "goal_log1p": np.log1p(np.clip(goal, 0, None)).astype(np.float32),
        "goal_digits": goal_digits.astype(np.int32),
    })

    stats_name, stats_desc = _text_basic_stats(name_s, "name"), _text_basic_stats(desc_s, "desc")
    stats_keyw, stats_all  = _text_basic_stats(keyw_s, "keywords"), _text_basic_stats(all_s, "all")
    sent_name, sent_desc   = _sentiment_counts(name_s, "name"), _sentiment_counts(desc_s, "desc")
    sent_keyw, sent_all    = _sentiment_counts(keyw_s, "keywords"), _sentiment_counts(all_s, "all")

    def _fit_vec(vec_cls, series, field, max_feats, **kw):
        vec = vec_cls(max_features=max_feats[field], token_pattern=r"(?u)\b\w+\b", min_df=2, **kw)
        X = vec.fit_transform(series)
        names = [f"{vec_cls.__name__.lower()}_{field}__{v}".replace("tfidfvectorizer", "tfidf").replace("countvectorizer", "bow")
                 for v in vec.get_feature_names_out()]
        return X, names, vec

    bow_name_X, bow_name_names, bow_name_vec = _fit_vec(CountVectorizer, name_s, "name", BOW_MAX_FEATURES)
    bow_desc_X, bow_desc_names, bow_desc_vec = _fit_vec(CountVectorizer, desc_s, "desc", BOW_MAX_FEATURES)
    bow_keyw_X, bow_keyw_names, bow_keyw_vec = _fit_vec(CountVectorizer, keyw_s, "keywords", BOW_MAX_FEATURES)
    bow_all_X,  bow_all_names,  bow_all_vec  = _fit_vec(CountVectorizer, all_s,  "all_scripts", BOW_MAX_FEATURES)

    tfidf_name_X, tfidf_name_names, tfidf_name_vec = _fit_vec(TfidfVectorizer, name_s, "name", TFIDF_MAX_FEATURES, sublinear_tf=True)
    tfidf_desc_X, tfidf_desc_names, tfidf_desc_vec = _fit_vec(TfidfVectorizer, desc_s, "desc", TFIDF_MAX_FEATURES, sublinear_tf=True)
    tfidf_keyw_X, tfidf_keyw_names, tfidf_keyw_vec = _fit_vec(TfidfVectorizer, keyw_s, "keywords", TFIDF_MAX_FEATURES, sublinear_tf=True)
    tfidf_all_X,  tfidf_all_names,  tfidf_all_vec  = _fit_vec(TfidfVectorizer, all_s,  "all_scripts", TFIDF_MAX_FEATURES, sublinear_tf=True)

    tfidf_char = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=8000, sublinear_tf=True, min_df=2)
    X_char = tfidf_char.fit_transform(all_s)
    tfidf_char_names = [f"tfidf_char__{v}" for v in tfidf_char.get_feature_names_out()]

    tfidf_charwb = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), max_features=6000, sublinear_tf=True, min_df=2)
    X_charwb = tfidf_charwb.fit_transform(all_s)
    tfidf_charwb_names = [f"tfidf_charwb__{v}" for v in tfidf_charwb.get_feature_names_out()]

    tfidf_desc_bi = TfidfVectorizer(ngram_range=(1,2), max_features=3000,
                                    token_pattern=r"(?u)\b\w+\b", sublinear_tf=True, min_df=2)
    X_tfidf_desc_bi = tfidf_desc_bi.fit_transform(desc_s)
    names_desc_bi = [f"tfidf_desc_bi__{v}" for v in tfidf_desc_bi.get_feature_names_out()]

    token_pattern = tfidf_desc_vec.get_params().get("token_pattern", r"(?u)\b\w+\b")
    stop_words   = tfidf_desc_vec.get_params().get("stop_words", None)

    count_vec = CountVectorizer(
        vocabulary=tfidf_desc_vec.vocabulary_,
        ngram_range=tfidf_desc_vec.ngram_range,
        lowercase=getattr(tfidf_desc_vec, "lowercase", True),
        token_pattern=token_pattern,
        stop_words=stop_words,
    )
    X_counts = count_vec.transform(desc_s)

    n_topics = 30
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method="batch",
        max_iter=20,
        random_state=42,
        evaluate_every=0
    )
    doc_topic = lda.fit_transform(X_counts)
    topic_cols = [f"lda_topic_{i}" for i in range(n_topics)]
    topic_df = pd.DataFrame(doc_topic, columns=topic_cols, index=desc_s.index)

    tfidf_all_for_lsa = TfidfVectorizer(max_features=8000, sublinear_tf=True, min_df=2)
    X_tfidf_all = tfidf_all_for_lsa.fit_transform(all_s)

    svd = TruncatedSVD(n_components=120, random_state=42)
    X_lsa = svd.fit_transform(X_tfidf_all)
    lsa_df = pd.DataFrame(X_lsa, columns=[f"lsa_{i}" for i in range(X_lsa.shape[1])], index=all_s.index)

    nmf = NMF(n_components=60, init="nndsvd", random_state=42, max_iter=400)
    X_nmf = nmf.fit_transform(X_tfidf_all)
    nmf_df = pd.DataFrame(X_nmf, columns=[f"nmf_{i}" for i in range(X_nmf.shape[1])], index=all_s.index)

    tcat_df = pd.concat([_time_features(created, "created"),
                         _time_features(launched, "launched"),
                         _time_features(deadline, "deadline"),
                         _time_features(changed, "changed")], axis=1)
    ohe_cols = [
        "created_year","created_month","created_weekday","created_is_weekend","created_hour",
        "launched_year","launched_month","launched_weekday","launched_is_weekend","launched_hour",
        "deadline_year","deadline_month","deadline_weekday","deadline_is_weekend","deadline_hour",
        "changed_year","changed_month","changed_weekday","changed_is_weekend","changed_hour",
    ]
    time_mat, time_names, time_ohe = _ohe_fit_from_ints(tcat_df, ohe_cols)

    cyc_parts = []
    for p, tdf in [("created", _time_features(created,"created")),
                   ("launched", _time_features(launched,"launched")),
                   ("deadline", _time_features(deadline,"deadline")),
                   ("changed", _time_features(changed,"changed"))]:
        s_hour = tdf[f"{p}_hour"].astype(np.int32)
        s_wd   = tdf[f"{p}_weekday"].astype(np.int32)
        sh, ch = _cyc(s_hour, 24)
        sw, cw = _cyc(s_wd, 7)
        cyc_parts.append(pd.DataFrame({
            f"{p}_hour_sin": sh, f"{p}_hour_cos": ch,
            f"{p}_wd_sin": sw,   f"{p}_wd_cos": cw,
        }, index=tdf.index))
    cyc_df = pd.concat(cyc_parts, axis=1)

    dense_df = pd.concat([
        pd.DataFrame({
            "disable_communication": comm.astype(np.int8),
            "goal": goal.astype(np.float32),
            "goal_log1p": np.log1p(np.clip(goal, 0, None)).astype(np.float32),
            "goal_digits": goal_digits.astype(np.int32),
        }),
        _time_features(created, "created"),
        _time_features(launched, "launched"),
        _time_features(deadline, "deadline"),
        _time_features(changed, "changed"),
        diff_df,
        stats_name, stats_desc, stats_keyw, stats_all,
        sent_name, sent_desc, sent_keyw, sent_all,
        cyc_df,
    ], axis=1).fillna(0)

    overlap_df = pd.DataFrame({
        "name_desc_jacc": [ _jaccard(n,d) for n,d in zip(name_s, desc_s) ],
        "desc_keyw_jacc": [ _jaccard(d,k) for d,k in zip(desc_s, keyw_s) ],
        "name_keyw_jacc": [ _jaccard(n,k) for n,k in zip(name_s, keyw_s) ],
    }, index=df.index).astype(np.float32)

    cv_for_cos = CountVectorizer(min_df=2, token_pattern=r"(?u)\b\w+\b")
    both = pd.concat([name_s, desc_s], ignore_index=True)
    X_nd = cv_for_cos.fit_transform(both)
    X_name = X_nd[:len(name_s)]
    X_desc = X_nd[len(name_s):]
    num = (X_name.multiply(X_desc)).sum(axis=1).A.ravel()
    den = np.sqrt(X_name.multiply(X_name).sum(axis=1).A.ravel() * X_desc.multiply(X_desc).sum(axis=1).A.ravel()) + 1e-9
    cos_df = pd.DataFrame({"name_desc_cos": (num/den).astype(np.float32)}, index=df.index)

    bin_cols = ["goal","d_created_launched","d_launched_deadline","d_created_deadline"]
    bin_artifacts = {}
    bin_dfs = []
    for col in bin_cols:
        x = dense_df[col].values.reshape(-1,1)
        kbd = KBinsDiscretizer(n_bins=8, encode="onehot-dense", strategy="quantile")
        Xb = kbd.fit_transform(x)
        bin_dfs.append(pd.DataFrame(Xb, columns=[f"{col}_bin_{i}" for i in range(Xb.shape[1])], index=dense_df.index))
        bin_artifacts[col] = kbd
    bin_df = pd.concat(bin_dfs, axis=1) if bin_dfs else pd.DataFrame(index=dense_df.index)

    rule_df = pd.concat([
        _rule_flags(raw_name, "raw_name"),
        _rule_flags(raw_desc, "raw_desc")
    ], axis=1)

    inter_geo_goal_X, inter_geo_goal_names = _make_interactions(geo_mat,  geo_names, goal, "goal")
    inter_geo_dig_X,  inter_geo_dig_names  = _make_interactions(geo_mat,  geo_names, goal_digits.astype(np.float32), "goal_digits")
    inter_time_goal_X, inter_time_goal_names = _make_interactions(time_mat, time_names, goal, "goal")
    inter_time_dig_X,  inter_time_dig_names  = _make_interactions(time_mat, time_names, goal_digits.astype(np.float32), "goal_digits")

    def _select_scalar_cols(df_stats, prefixes):
        cols = []
        for p in prefixes:
            cols += [f"{p}_pos", f"{p}_neg", f"{p}_sent_score", f"{p}_q_cnt", f"{p}_e_cnt", f"{p}_char_len", f"{p}_word_cnt", f"{p}_avg_wlen"]
            cols += [c for c in df_stats.columns if c.startswith(f"{p}_alpha_")]
        return cols

    scalar_cols = list(dict.fromkeys(
        _select_scalar_cols(pd.concat([stats_name, sent_name], axis=1), ["name"]) +
        _select_scalar_cols(pd.concat([stats_desc, sent_desc], axis=1), ["desc"]) +
        _select_scalar_cols(pd.concat([stats_keyw, sent_keyw], axis=1), ["keywords"]) +
        _select_scalar_cols(pd.concat([stats_all,  sent_all ], axis=1), ["all"])
    ))
    scalars = dense_df[scalar_cols].values.astype(np.float32)
    X_text_goal = sparse.csr_matrix(scalars * goal.reshape(-1, 1))
    X_text_dig  = sparse.csr_matrix(scalars * goal_digits.reshape(-1, 1).astype(np.float32))
    text_goal_names = [f"{c}*goal" for c in scalar_cols]
    text_dig_names  = [f"{c}*goal_digits" for c in scalar_cols]

    cats = pd.DataFrame({
        "country": df["country"].astype(str),
        "currency": df["currency"].astype(str),
        "c_wd":    _time_features(created,"created")["created_weekday"].astype(str),
    })
    raw_for_hasher = [
        [f"country={c}", f"currency={cur}", f"c_wd={wd}"]
        for c, cur, wd in zip(cats["country"], cats["currency"], cats["c_wd"])
    ]
    fh = FeatureHasher(n_features=2**14, input_type="string")
    X_hash = fh.transform(raw_for_hasher)
    hash_names = [f"hash_{i}" for i in range(X_hash.shape[1])]

    parts = [
        dense_df.reset_index(drop=True),
        _sparse_block_to_df(geo_mat, geo_names),
        _sparse_block_to_df(time_mat, time_names),
        _sparse_block_to_df(bow_name_X, bow_name_names),
        _sparse_block_to_df(bow_desc_X, bow_desc_names),
        _sparse_block_to_df(bow_keyw_X, bow_keyw_names),
        _sparse_block_to_df(bow_all_X,  bow_all_names),
        _sparse_block_to_df(tfidf_name_X, tfidf_name_names),
        _sparse_block_to_df(tfidf_desc_X, tfidf_desc_names),
        _sparse_block_to_df(tfidf_keyw_X, tfidf_keyw_names),
        _sparse_block_to_df(tfidf_all_X,  tfidf_all_names),
        _sparse_block_to_df(X_char, tfidf_char_names),
        _sparse_block_to_df(X_charwb, tfidf_charwb_names),
        _sparse_block_to_df(X_tfidf_desc_bi, names_desc_bi),
        _sparse_block_to_df(inter_geo_goal_X, inter_geo_goal_names),
        _sparse_block_to_df(inter_geo_dig_X,  inter_geo_dig_names),
        _sparse_block_to_df(inter_time_goal_X, inter_time_goal_names),
        _sparse_block_to_df(inter_time_dig_X,  inter_time_dig_names),
        _sparse_block_to_df(X_text_goal, text_goal_names),
        _sparse_block_to_df(X_text_dig,  text_dig_names),
        topic_df,
        lsa_df,
        nmf_df,
        overlap_df,
        cos_df,
        bin_df,
        rule_df,
        _sparse_block_to_df(X_hash.tocsr(), hash_names),
        df['final_status']  # 目的変数
    ]

    feat_train = pd.concat(parts, axis=1)

    artifacts = {
        "geo_ohe": geo_ohe,
        "time_ohe": time_ohe,
        "ohe_time_cols": ohe_cols,
        "bow_vecs": {"name": bow_name_vec, "desc": bow_desc_vec, "keywords": bow_keyw_vec, "all_scripts": bow_all_vec},
        "tfidf_vecs": {"name": tfidf_name_vec, "desc": tfidf_desc_vec, "keywords": tfidf_keyw_vec, "all_scripts": tfidf_all_vec},
        "tfidf_char": tfidf_char,
        "tfidf_charwb": tfidf_charwb,
        "tfidf_desc_bi": tfidf_desc_bi,
        "lda_desc": {"count_vec": count_vec, "model": lda, "n_topics": n_topics, "topic_cols": topic_cols},
        "lsa": {"tfidf_all": tfidf_all_for_lsa, "svd": svd, "n_components": svd.n_components},
        "nmf": {"tfidf_all": tfidf_all_for_lsa, "model": nmf, "n_components": nmf.n_components},
        "feature_hasher": {"hasher": fh, "combo_spec": ["country", "currency", "created_weekday"], "n_features": 2**14},
        "binning": bin_artifacts,
        "scalar_cols": scalar_cols,
        "cv_for_cos": cv_for_cos,
    }
    return feat_train, artifacts

# =========================
# 推論時：train の artifacts で transform
# =========================
def transform_features(test: pd.DataFrame, artifacts: dict, status: str = "train"):
    df = test.copy()
    if status == "train":
        y = df['final_status']

    raw_name = df["name"].fillna("").astype(str)
    raw_desc = df["desc"].fillna("").astype(str)
    raw_keywords = df["keywords"].fillna("").astype(str)

    df["name"] = raw_name
    df["desc"] = raw_desc
    df["keywords"] = raw_keywords
    df["all_scripts"] = df["name"] + " " + df["desc"] + " " + df["keywords"]

    name_s = _prep_text_series(df["name"], "name")
    desc_s = _prep_text_series(df["desc"], "desc")
    keyw_s = _prep_text_series(df["keywords"], "keywords")
    all_s  = _prep_text_series(df["all_scripts"], "all_scripts")

    created = _to_dt(df["created_at"])
    launched = _to_dt(df["launched_at"])
    deadline = _to_dt(df["deadline"])
    changed  = _to_dt(df["state_changed_at"])
    t_created, t_launched = _time_features(created, "created"), _time_features(launched, "launched")
    t_deadline, t_changed = _time_features(deadline, "deadline"), _time_features(changed, "changed")

    diff_df = pd.DataFrame({
        "d_created_launched": _days_diff(created, launched),
        "d_launched_deadline": _days_diff(launched, deadline),
        "d_created_deadline": _days_diff(created, deadline),
        "d_created_changed": _days_diff(created, changed),
        "d_launched_changed": _days_diff(launched, changed),
        "d_deadline_changed": _days_diff(deadline, changed),
    })

    geo_mat, geo_names = _ohe_transform(df, ["country", "currency"], artifacts["geo_ohe"])

    comm = df["disable_communication"].fillna(False).astype(int)
    goal = df["goal"].fillna(0).astype(np.float32).values
    goal_digits = df["goal"].fillna(0).astype(np.int64).astype(str)\
                    .str.replace(r"[^0-9]", "", regex=True)\
                    .apply(lambda x: len(x.lstrip("0")) if len(x.lstrip("0"))>0 else 1)\
                    .astype(np.int32).values
    proj_df = pd.DataFrame({
        "disable_communication": comm.astype(np.int8),
        "goal": goal.astype(np.float32),
        "goal_log1p": np.log1p(np.clip(goal, 0, None)).astype(np.float32),
        "goal_digits": goal_digits.astype(np.int32),
    })

    stats_name, stats_desc = _text_basic_stats(name_s, "name"), _text_basic_stats(desc_s, "desc")
    stats_keyw, stats_all  = _text_basic_stats(keyw_s, "keywords"), _text_basic_stats(all_s, "all")
    sent_name, sent_desc   = _sentiment_counts(name_s, "name"), _sentiment_counts(desc_s, "desc")
    sent_keyw, sent_all    = _sentiment_counts(keyw_s, "keywords"), _sentiment_counts(all_s, "all")

    def _xform(vec, series, field, prefix):
        X = vec.transform(series)
        names = [f"{prefix}_{field}__{v}" for v in vec.get_feature_names_out()]
        return X, names

    bow_name_X, bow_name_names = _xform(artifacts["bow_vecs"]["name"], name_s, "name", "bow")
    bow_desc_X, bow_desc_names = _xform(artifacts["bow_vecs"]["desc"], desc_s, "desc", "bow")
    bow_keyw_X, bow_keyw_names = _xform(artifacts["bow_vecs"]["keywords"], keyw_s, "keywords", "bow")
    bow_all_X,  bow_all_names  = _xform(artifacts["bow_vecs"]["all_scripts"], all_s, "all_scripts", "bow")

    tfidf_name_X, tfidf_name_names = _xform(artifacts["tfidf_vecs"]["name"], name_s, "name", "tfidf")
    tfidf_desc_X, tfidf_desc_names = _xform(artifacts["tfidf_vecs"]["desc"], desc_s, "desc", "tfidf")
    tfidf_keyw_X, tfidf_keyw_names = _xform(artifacts["tfidf_vecs"]["keywords"], keyw_s, "keywords", "tfidf")
    tfidf_all_X,  tfidf_all_names  = _xform(artifacts["tfidf_vecs"]["all_scripts"], all_s, "all_scripts", "tfidf")

    tfidf_char = artifacts.get("tfidf_char")
    tfidf_charwb = artifacts.get("tfidf_charwb")
    tfidf_desc_bi = artifacts.get("tfidf_desc_bi")

    X_char, tfidf_char_names = (tfidf_char.transform(all_s), [f"tfidf_char__{v}" for v in tfidf_char.get_feature_names_out()]) if tfidf_char else (None, [])
    X_charwb, tfidf_charwb_names = (tfidf_charwb.transform(all_s), [f"tfidf_charwb__{v}" for v in tfidf_charwb.get_feature_names_out()]) if tfidf_charwb else (None, [])
    X_tfidf_desc_bi, names_desc_bi = (tfidf_desc_bi.transform(desc_s), [f"tfidf_desc_bi__{v}" for v in tfidf_desc_bi.get_feature_names_out()]) if tfidf_desc_bi else (None, [])

    lda_topic_df = None
    if "lda_desc" in artifacts and artifacts["lda_desc"] is not None:
        lda_pack = artifacts["lda_desc"]
        X_counts_desc = lda_pack["count_vec"].transform(desc_s)
        doc_topic = lda_pack["model"].transform(X_counts_desc)
        lda_topic_df = pd.DataFrame(doc_topic.astype(np.float32), columns=lda_pack["topic_cols"], index=df.index)

    lsa_df = pd.DataFrame(index=df.index)
    nmf_df = pd.DataFrame(index=df.index)
    if "lsa" in artifacts and artifacts["lsa"]:
        lsa_pack = artifacts["lsa"]
        X_tfidf_all = lsa_pack["tfidf_all"].transform(all_s)
        X_lsa = lsa_pack["svd"].transform(X_tfidf_all)
        lsa_df = pd.DataFrame(X_lsa, columns=[f"lsa_{i}" for i in range(X_lsa.shape[1])], index=df.index)
    if "nmf" in artifacts and artifacts["nmf"]:
        nmf_pack = artifacts["nmf"]
        X_tfidf_all_nmf = nmf_pack["tfidf_all"].transform(all_s)
        X_nmf = nmf_pack["model"].transform(X_tfidf_all_nmf)
        nmf_df = pd.DataFrame(X_nmf, columns=[f"nmf_{i}" for i in range(X_nmf.shape[1])], index=df.index)

    tcat_df = pd.concat([t_created, t_launched, t_deadline, t_changed], axis=1)
    ohe_cols = artifacts["ohe_time_cols"]
    time_mat, time_names = _ohe_transform_from_ints(tcat_df, ohe_cols, artifacts["time_ohe"])

    cyc_parts = []
    for p, tdf in [("created", t_created), ("launched", t_launched), ("deadline", t_deadline), ("changed", t_changed)]:
        s_hour = tdf[f"{p}_hour"].astype(np.int32)
        s_wd   = tdf[f"{p}_weekday"].astype(np.int32)
        sh = np.sin(2*np.pi*s_hour/24).astype(np.float32)
        ch = np.cos(2*np.pi*s_hour/24).astype(np.float32)
        sw = np.sin(2*np.pi*s_wd/7).astype(np.float32)
        cw = np.cos(2*np.pi*s_wd/7).astype(np.float32)
        cyc_parts.append(pd.DataFrame({
            f"{p}_hour_sin": sh, f"{p}_hour_cos": ch,
            f"{p}_wd_sin": sw,   f"{p}_wd_cos": cw,
        }, index=tdf.index))
    cyc_df = pd.concat(cyc_parts, axis=1)

    dense_df = pd.concat([
        proj_df,
        t_created, t_launched, t_deadline, t_changed,
        diff_df,
        stats_name, stats_desc, stats_keyw, stats_all,
        sent_name, sent_desc, sent_keyw, sent_all,
        cyc_df,
    ], axis=1).fillna(0)

    overlap_df = pd.DataFrame({
        "name_desc_jacc": [len(set(_word_tokens(n)) & set(_word_tokens(d))) / max(1, len(set(_word_tokens(n)) | set(_word_tokens(d)))) for n, d in zip(name_s, desc_s)],
        "desc_keyw_jacc": [len(set(_word_tokens(d)) & set(_word_tokens(k))) / max(1, len(set(_word_tokens(d)) | set(_word_tokens(k)))) for d, k in zip(desc_s, keyw_s)],
        "name_keyw_jacc": [len(set(_word_tokens(n)) & set(_word_tokens(k))) / max(1, len(set(_word_tokens(n)) | set(_word_tokens(k)))) for n, k in zip(name_s, keyw_s)],
    }, index=df.index).astype(np.float32)

    cv_for_cos = artifacts.get("cv_for_cos")
    if cv_for_cos is not None:
        both = pd.concat([name_s, desc_s], ignore_index=True)
        X_nd = cv_for_cos.transform(both)
        X_name = X_nd[:len(name_s)]
        X_desc = X_nd[len(name_s):]
        num = (X_name.multiply(X_desc)).sum(axis=1).A.ravel()
        den = np.sqrt(X_name.multiply(X_name).sum(axis=1).A.ravel() * X_desc.multiply(X_desc).sum(axis=1).A.ravel()) + 1e-9
        cos_df = pd.DataFrame({"name_desc_cos": (num/den).astype(np.float32)}, index=df.index)
    else:
        cos_df = pd.DataFrame({"name_desc_cos": np.zeros(len(df), dtype=np.float32)}, index=df.index)

    bin_artifacts = artifacts.get("binning", {})
    bin_cols = ["goal","d_created_launched","d_launched_deadline","d_created_deadline"]
    bin_dfs = []
    for col in bin_cols:
        kbd = bin_artifacts.get(col)
        if kbd is not None:
            x = dense_df[col].values.reshape(-1,1)
            Xb = kbd.transform(x)
            bin_dfs.append(pd.DataFrame(Xb, columns=[f"{col}_bin_{i}" for i in range(Xb.shape[1])], index=dense_df.index))
    bin_df = pd.concat(bin_dfs, axis=1) if bin_dfs else pd.DataFrame(index=dense_df.index)

    rule_df = pd.concat([
        _rule_flags(raw_name, "raw_name"),
        _rule_flags(raw_desc, "raw_desc")
    ], axis=1)

    inter_geo_goal_X, inter_geo_goal_names = _make_interactions(geo_mat,  geo_names, goal, "goal")
    inter_geo_dig_X,  inter_geo_dig_names  = _make_interactions(geo_mat,  geo_names, goal_digits.astype(np.float32), "goal_digits")
    inter_time_goal_X, inter_time_goal_names = _make_interactions(time_mat, time_names, goal, "goal")
    inter_time_dig_X,  inter_time_dig_names  = _make_interactions(time_mat, time_names, goal_digits.astype(np.float32), "goal_digits")

    scalar_cols = artifacts["scalar_cols"]
    scalars = dense_df[scalar_cols].values.astype(np.float32)
    X_text_goal = sparse.csr_matrix(scalars * goal.reshape(-1, 1))
    X_text_dig  = sparse.csr_matrix(scalars * goal_digits.reshape(-1, 1).astype(np.float32))
    text_goal_names = [f"{c}*goal" for c in scalar_cols]
    text_dig_names  = [f"{c}*goal_digits" for c in scalar_cols]

    hash_df = pd.DataFrame(index=df.index)
    if "feature_hasher" in artifacts and artifacts["feature_hasher"]:
        fh_pack = artifacts["feature_hasher"]
        fh = fh_pack["hasher"]
        created_wd = t_created["created_weekday"].astype(str)
        cats = pd.DataFrame({
            "country": df["country"].astype(str),
            "currency": df["currency"].astype(str),
            "c_wd":    created_wd,
        })
        raw_for_hasher = [
            [f"country={c}", f"currency={cur}", f"c_wd={wd}"]
            for c, cur, wd in zip(cats["country"], cats["currency"], cats["c_wd"])
        ]
        X_hash = fh.transform(raw_for_hasher).tocsr()
        hash_names = [f"hash_{i}" for i in range(X_hash.shape[1])]
        hash_df = _sparse_block_to_df(X_hash, hash_names)

    parts = [
        dense_df.reset_index(drop=True),
        _sparse_block_to_df(geo_mat, geo_names),
        _sparse_block_to_df(time_mat, time_names),
        _sparse_block_to_df(bow_name_X, bow_name_names),
        _sparse_block_to_df(bow_desc_X, bow_desc_names),
        _sparse_block_to_df(bow_keyw_X, bow_keyw_names),
        _sparse_block_to_df(bow_all_X,  bow_all_names),
        _sparse_block_to_df(tfidf_name_X, tfidf_name_names),
        _sparse_block_to_df(tfidf_desc_X, tfidf_desc_names),
        _sparse_block_to_df(tfidf_keyw_X, tfidf_keyw_names),
        _sparse_block_to_df(tfidf_all_X,  tfidf_all_names),
    ]
    if X_char is not None:
        parts.append(_sparse_block_to_df(X_char, tfidf_char_names))
    if X_charwb is not None:
        parts.append(_sparse_block_to_df(X_charwb, tfidf_charwb_names))
    if X_tfidf_desc_bi is not None:
        parts.append(_sparse_block_to_df(X_tfidf_desc_bi, names_desc_bi))

    parts.extend([
        _sparse_block_to_df(inter_geo_goal_X, inter_geo_goal_names),
        _sparse_block_to_df(inter_geo_dig_X,  inter_geo_dig_names),
        _sparse_block_to_df(inter_time_goal_X, inter_time_goal_names),
        _sparse_block_to_df(inter_time_dig_X,  inter_time_dig_names),
        _sparse_block_to_df(X_text_goal, text_goal_names),
        _sparse_block_to_df(X_text_dig,  text_dig_names),
        lda_topic_df if lda_topic_df is not None else pd.DataFrame(index=df.index),
        lsa_df,
        nmf_df,
        overlap_df,
        cos_df,
        bin_df,
        rule_df,
        hash_df,
    ])

    if status == "train":
        parts.append(y)

    feat_test = pd.concat(parts, axis=1)
    return feat_test


### 学習

In [3]:
# ========== ② 交差検証 → OOFと学習物を ALL_FILE_DIR/LightGBM に保存 ==========

import os, json
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import dump
from datetime import datetime

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_recall_curve
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# ---- 事前に ① を実行して fit_features / transform_features を定義済みにしてください ----

# === 出力先 ===
OUT_DIR = Path(ALL_FILE_DIR) / "LightGBM"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# === データ読み込み ===
df_raw = pd.read_csv(TRAIN_CSV)
y_all = df_raw["final_status"].astype(int).values

# === CV 設定 ===
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_prob = np.zeros(len(df_raw), dtype=float)
oof_pred = np.zeros(len(df_raw), dtype=int)
fold_logs = []
models = []
fold_artifacts = []
fold_cols = []

def best_threshold_for_f1(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1 = 2 * precision * recall / (precision + recall + 1e-15)
    f1 = f1[:-1]  # thresholdsに対応
    if len(f1) == 0:
        return 0.5, 0.0, 0.0, 0.0
    idx = int(np.argmax(f1))
    return float(thresholds[idx]), float(f1[idx]), float(precision[idx]), float(recall[idx])

for fold, (tr_idx, va_idx) in enumerate(skf.split(df_raw, y_all), 1):
    df_tr_raw = df_raw.iloc[tr_idx].reset_index(drop=True)
    df_va_raw = df_raw.iloc[va_idx].reset_index(drop=True)

    # --- 特徴量：trainでfit → valをtransform（リーク防止） ---
    feat_tr, artifacts = fit_features(df_tr_raw)
    feat_va = transform_features(df_va_raw, artifacts)

    # クラス不均衡に応じた重み
    pos = int((feat_tr['final_status'] == 1).sum())
    neg = int((feat_tr['final_status'] == 0).sum())
    spw = float(neg / max(pos, 1))

    X_tr = feat_tr.drop(columns=['final_status'])
    y_tr = feat_tr['final_status'].astype(int).values
    X_va = feat_va.drop(columns=['final_status'])
    y_va = feat_va['final_status'].astype(int).values

    clf = lgb.LGBMClassifier(
        n_estimators=3000, learning_rate=0.02,
        num_leaves=63, max_depth=9,
        min_child_samples=150, min_split_gain=0.1,
        subsample=0.9, subsample_freq=1,
        colsample_bytree=0.15,
        reg_lambda=4.0, reg_alpha=0.3,
        random_state=42, n_jobs=-1,
        scale_pos_weight=spw
    )

    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        callbacks=[early_stopping(50), log_evaluation(50)]
    )

    va_prob = clf.predict_proba(X_va, num_iteration=getattr(clf, "best_iteration_", None))[:, 1]

    best_t, f1_best, p_best, r_best = best_threshold_for_f1(y_va, va_prob)
    va_pred = (va_prob >= best_t).astype(int)

    oof_prob[va_idx] = va_prob
    oof_pred[va_idx] = va_pred

    fold_logs.append({
        "fold": fold,
        "acc": float(accuracy_score(y_va, va_pred)),
        "auc": float(roc_auc_score(y_va, va_prob)),
        "f1": float(f1_score(y_va, va_pred)),
        "best_iter": int(getattr(clf, "best_iteration_", 0) or 0),
        "best_threshold": float(best_t),
        "precision@best": float(p_best),
        "recall@best": float(r_best),
    })

    # 保存用に
    models.append(clf)
    fold_artifacts.append(artifacts)
    fold_cols.append(list(X_tr.columns))

# === OOF 全体の指標 & 最適閾値 ===
def _best_threshold_from_oof(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f1 = 2 * precision * recall / (precision + recall + 1e-15)
    f1 = f1[:-1]
    if len(f1) == 0:
        return 0.5
    return float(thresholds[int(np.argmax(f1))])

best_threshold_oof = _best_threshold_from_oof(y_all, oof_prob)

print(pd.DataFrame(fold_logs))
print(f"OOF Accuracy: {accuracy_score(y_all, oof_pred):.4f}")
print(f"OOF ROC-AUC : {roc_auc_score(y_all, oof_prob):.4f}")
print(f"OOF F1-score: {f1_score(y_all, (oof_prob>=best_threshold_oof).astype(int)):.4f}")

# === OOF を保存（id, proba, y）===
oof_df = pd.DataFrame({
    "id": df_raw["id"].to_numpy(),
    "proba": oof_prob.astype(float),
    "y": y_all.astype(int),
})
oof_path = OUT_DIR / "oof.csv"
oof_df.to_csv(oof_path, index=False)
print(f"✅ OOFを保存: {oof_path}")

# === 各 fold のモデル・artifacts・列順を保存 ===
for k, (m, art, cols) in enumerate(zip(models, fold_artifacts, fold_cols), 1):
    dump(m,   OUT_DIR / f"model_fold{k}.joblib")
    dump(art, OUT_DIR / f"artifacts_fold{k}.joblib")
    with open(OUT_DIR / f"fold{k}_cols.json", "w", encoding="utf-8") as f:
        json.dump(cols, f, ensure_ascii=False)

# === メタ情報 & foldログ ===
manifest = {
    "created_at": datetime.now().isoformat(),
    "n_folds": N_SPLITS,
    "best_threshold": float(best_threshold_oof),
    "aggregator": "logit",  # ③での確率集約方式
}
with open(OUT_DIR / "manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

with open(OUT_DIR / "fold_logs.json", "w", encoding="utf-8") as f:
    json.dump(fold_logs, f, ensure_ascii=False, indent=2)

print(f"✅ 学習物を保存しました: {OUT_DIR}")




[LightGBM] [Info] Number of positive: 19348, number of negative: 41204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.131550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3929108
[LightGBM] [Info] Number of data points in the train set: 60552, number of used features: 21948
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319527 -> initscore=-0.755946
[LightGBM] [Info] Start training from score -0.755946
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.745241	valid_0's binary_logloss: 0.586715
[100]	valid_0's auc: 0.756018	valid_0's binary_logloss: 0.582288
[150]	valid_0's auc: 0.764842	valid_0's binary_logloss: 0.577707
[200]	valid_0's auc: 0.770304	valid_0's binary_logloss: 0.572047
[250]	valid_0's auc: 0.774086	valid_0's binary_logloss: 0.566876
[300]	valid_0's auc: 0.776936	valid_0's binary_logloss: 0.562546
[350]	valid_0's auc: 0.778717	valid_0's binary_log



[LightGBM] [Info] Number of positive: 19348, number of negative: 41204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.995438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3927686
[LightGBM] [Info] Number of data points in the train set: 60552, number of used features: 21958
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319527 -> initscore=-0.755946
[LightGBM] [Info] Start training from score -0.755946
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.743515	valid_0's binary_logloss: 0.585759
[100]	valid_0's auc: 0.757293	valid_0's binary_logloss: 0.580408
[150]	valid_0's auc: 0.765372	valid_0's binary_logloss: 0.575282
[200]	valid_0's auc: 0.771269	valid_0's binary_logloss: 0.568659
[250]	valid_0's auc: 0.774664	valid_0's binary_logloss: 0.56305
[300]	valid_0's auc: 0.777164	valid_0's binary_loglo



[LightGBM] [Info] Number of positive: 19348, number of negative: 41204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.649609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3927578
[LightGBM] [Info] Number of data points in the train set: 60552, number of used features: 21940
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319527 -> initscore=-0.755946
[LightGBM] [Info] Start training from score -0.755946
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.744931	valid_0's binary_logloss: 0.585814
[100]	valid_0's auc: 0.758816	valid_0's binary_logloss: 0.581417
[150]	valid_0's auc: 0.767867	valid_0's binary_logloss: 0.575758
[200]	valid_0's auc: 0.772962	valid_0's binary_logloss: 0.570076
[250]	valid_0's auc: 0.776286	valid_0's binary_logloss: 0.564896
[300]	valid_0's auc: 0.778921	valid_0's binary_logloss: 0.560378
[350]	valid_0's auc: 0.780712	valid_0's binary_log



[LightGBM] [Info] Number of positive: 19348, number of negative: 41204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.064028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3926480
[LightGBM] [Info] Number of data points in the train set: 60552, number of used features: 21903
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319527 -> initscore=-0.755946
[LightGBM] [Info] Start training from score -0.755946
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.737596	valid_0's binary_logloss: 0.587466
[100]	valid_0's auc: 0.751752	valid_0's binary_logloss: 0.582796
[150]	valid_0's auc: 0.760591	valid_0's binary_logloss: 0.577421
[200]	valid_0's auc: 0.766058	valid_0's binary_logloss: 0.571911
[250]	valid_0's auc: 0.769601	valid_0's binary_logloss: 0.566992
[300]	valid_0's auc: 0.772492	valid_0's binary_logloss: 0.562583
[350]	valid_0's auc: 0.774372	valid_0's binary_log



[LightGBM] [Info] Number of positive: 19348, number of negative: 41204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.171060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3926679
[LightGBM] [Info] Number of data points in the train set: 60552, number of used features: 21960
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319527 -> initscore=-0.755946
[LightGBM] [Info] Start training from score -0.755946
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.741627	valid_0's binary_logloss: 0.586104
[100]	valid_0's auc: 0.755821	valid_0's binary_logloss: 0.580939
[150]	valid_0's auc: 0.765989	valid_0's binary_logloss: 0.574739
[200]	valid_0's auc: 0.771955	valid_0's binary_logloss: 0.56845
[250]	valid_0's auc: 0.77607	valid_0's binary_logloss: 0.563259
[300]	valid_0's auc: 0.77845	valid_0's binary_logloss: 0.559317
[350]	valid_0's auc: 0.780627	valid_0's binary_loglos

### 予測

In [4]:
# ========== ③ テスト推論 → test_pred.csv（id, proba）を保存 ==========

import os, json
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import load

# ---- 事前に ① を実行して transform_features が使える状態にしてください --

# === 入出力先 ===
OUT_DIR = Path(ALL_FILE_DIR) / "LightGBM"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# === 読み込み ===
test = pd.read_csv(TEST_CSV)

with open(OUT_DIR / "manifest.json", "r", encoding="utf-8") as f:
    manifest = json.load(f)
n_folds = int(manifest.get("n_folds", 5))
aggregator = manifest.get("aggregator", "logit")

# === foldごとに確率を出す ===
fold_probs = []
for k in range(1, n_folds+1):
    artifacts = load(OUT_DIR / f"artifacts_fold{k}.joblib")
    model     = load(OUT_DIR / f"model_fold{k}.joblib")
    with open(OUT_DIR / f"fold{k}_cols.json", "r", encoding="utf-8") as f:
        cols = json.load(f)

    feat_test = transform_features(test.copy(), artifacts, status="test")
    X_test = feat_test.reindex(columns=cols, fill_value=0).astype(np.float32)
    num_iter = getattr(model, "best_iteration_", None)
    prob = model.predict_proba(X_test, num_iteration=num_iter)[:, 1]
    fold_probs.append(prob)

P = np.column_stack(fold_probs)  # shape=(n_test, n_folds)

def aggregate_probs(P, method="logit", weights=None):
    if weights is None:
        w = np.ones(P.shape[1]) / P.shape[1]
    else:
        w = np.asarray(weights, dtype=np.float64)
        w = w / w.sum()
    if method == "mean":
        return (P * w).sum(axis=1)
    elif method == "logit":
        eps = 1e-6
        P_clip = np.clip(P, eps, 1 - eps)
        logits = np.log(P_clip / (1 - P_clip))
        return 1 / (1 + np.exp(-(logits * w).sum(axis=1)))
    elif method == "rank":
        n = P.shape[0]
        ranks = np.empty_like(P, dtype=np.float64)
        for j in range(P.shape[1]):
            r = pd.Series(P[:, j]).rank(method="average").to_numpy()
            ranks[:, j] = (r - 1) / (n - 1 + 1e-12)
        return (ranks * w).sum(axis=1)
    else:
        raise ValueError("unknown aggregator:", method)

y_test_prob = aggregate_probs(P, method=aggregator)

# === 保存（id, proba）===
test_pred = pd.DataFrame({
    "id": test["id"].to_numpy(),
    "proba": y_test_prob.astype(float),
})
out_path = OUT_DIR / "test_pred.csv"
test_pred.to_csv(out_path, index=False)
print(f"✅ test_pred.csv を保存しました: {out_path}")


✅ test_pred.csv を保存しました: C:\Users\imasu\OneDrive\デスクトップ\コンペ\LightGBM\test_pred.csv


## 〇ExtraTrees

### 前処理

In [None]:
# ========= ここから Part ① =========
import numpy as np
import pandas as pd
from datetime import datetime, timezone
from pathlib import Path
import json, re, joblib
import sklearn
from scipy import sparse
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score


# ===== ユーティリティ =====
def _safe_str(x):
    if pd.isna(x):
        return ""
    return str(x)

def _split_space(text):
    text = _safe_str(text)
    if not text:
        return []
    return text.strip().split()

def _split_hyphen(text):
    text = _safe_str(text)
    if not text:
        return []
    parts = [p for p in text.strip().split('-') if p != ""]
    return parts

def _unix_to_dt(series):
    s = pd.to_numeric(series, errors="coerce")
    s = s.where(s >= 0, np.nan)
    return pd.to_datetime(s, unit="s", utc=True)

def _text_basic_features(series):
    """
    スペース区切りテキストの基本統計＋拡張。
    追加: alnum_ratio, char_diversity
    """
    s = series.fillna("").astype(str)

    len_chars = s.str.len().astype(np.float32)
    words = s.apply(_split_space)
    len_words = words.apply(len).astype(np.float32)

    def avg_len(ws):
        if len(ws) == 0: return 0.0
        ls = [len(w) for w in ws if len(w) > 0]
        return float(np.mean(ls)) if ls else 0.0
    avg_word_len = words.apply(avg_len).astype(np.float32)

    def uniq_ratio(ws):
        n = len(ws)
        return float(len(set(ws))/n) if n > 0 else 0.0
    uniq_word_ratio = words.apply(uniq_ratio).astype(np.float32)

    def digit_ratio(ws):
        n = len(ws)
        if n == 0: return 0.0
        d = sum(1 for w in ws if any(ch.isdigit() for ch in w))
        return float(d)/n
    digit_token_ratio = words.apply(digit_ratio).astype(np.float32)

    def upper_ratio_func(t):
        if len(t) == 0: return 0.0
        upp = sum(1 for ch in t if ch.isupper())
        return float(upp)/len(t)
    upper_ratio = s.apply(upper_ratio_func).astype(np.float32)

    # 新規: 英数字比
    alnum_chars = s.str.count(r"[A-Za-z0-9]").astype(np.float32)
    alnum_ratio = (alnum_chars / (len_chars.replace(0, np.nan))).fillna(0).astype(np.float32)

    # 新規: 文字多様性
    def char_div(t):
        n = len(t)
        return float(len(set(t)))/n if n > 0 else 0.0
    char_diversity = s.apply(char_div).astype(np.float32)

    # 軽量フラグ/カウント
    has_digit = s.str.contains(r"\d").fillna(False).astype(np.int8)
    exclaim_cnt = s.str.count("!").fillna(0).astype(np.int16)
    question_cnt = s.str.count(r"\?").fillna(0).astype(np.int16)

    return pd.DataFrame({
        "len_chars": len_chars,
        "len_words": len_words,
        "avg_word_len": avg_word_len,
        "uniq_word_ratio": uniq_word_ratio,
        "digit_token_ratio": digit_token_ratio,
        "upper_ratio": upper_ratio,
        "alnum_ratio": alnum_ratio,
        "char_diversity": char_diversity,
        "has_digit": has_digit,
        "exclaim_cnt": exclaim_cnt,
        "question_cnt": question_cnt,
    })

def _keywords_features(series):
    """keywords（ハイフン区切り）から簡易統計。"""
    s = series.fillna("").astype(str)
    kws = s.apply(_split_hyphen)

    kw_count = kws.apply(len).astype(np.float32)

    def avg_len(ws):
        if len(ws) == 0:
            return 0.0
        ls = [len(w) for w in ws if len(w) > 0]
        return float(np.mean(ls)) if ls else 0.0
    kw_avg_len = kws.apply(avg_len).astype(np.float32)

    def uniq_ratio(ws):
        n = len(ws)
        return float(len(set(ws))/n) if n > 0 else 0.0
    kw_uniq_ratio = kws.apply(uniq_ratio).astype(np.float32)

    return pd.DataFrame({
        "kw_count": kw_count,
        "kw_avg_len": kw_avg_len,
        "kw_uniq_ratio": kw_uniq_ratio,
    })

def _cyclical_enc(df_dt, prefix):
    """dt列から month/dow/hour の sin/cos"""
    out = {}
    month = df_dt.dt.month.fillna(0).astype(float)
    month_norm = (month - 1.0) / 12.0
    out[f"{prefix}_month_sin"] = np.sin(2*np.pi*month_norm).astype(np.float32)
    out[f"{prefix}_month_cos"] = np.cos(2*np.pi*month_norm).astype(np.float32)

    dow = df_dt.dt.dayofweek.fillna(0).astype(float)
    dow_norm = dow / 7.0
    out[f"{prefix}_dow_sin"] = np.sin(2*np.pi*dow_norm).astype(np.float32)
    out[f"{prefix}_dow_cos"] = np.cos(2*np.pi*dow_norm).astype(np.float32)

    hour = df_dt.dt.hour.fillna(0).astype(float)
    hour_norm = hour / 24.0
    out[f"{prefix}_hour_sin"] = np.sin(2*np.pi*hour_norm).astype(np.float32)
    out[f"{prefix}_hour_cos"] = np.cos(2*np.pi*hour_norm).astype(np.float32)

    return pd.DataFrame(out)

def _dt_flag_features(dt_series, prefix):
    """月初/月末/週末/四半期などのフラグを作る"""
    return pd.DataFrame({
        f"{prefix}_is_weekend": dt_series.dt.dayofweek.isin([5, 6]).fillna(False).astype(np.int8),
        f"{prefix}_is_month_start": dt_series.dt.is_month_start.fillna(False).astype(np.int8),
        f"{prefix}_is_month_end": dt_series.dt.is_month_end.fillna(False).astype(np.int8),
        f"{prefix}_quarter": dt_series.dt.quarter.fillna(0).astype(np.int8),
    })

def _tokenize(text):
    return set(re.findall(r"(?u)\b\w+\b", _safe_str(text).lower()))

def _overlap_features(name_s, desc_s, kw_s):
    """name/desc/keywords のトークン集合オーバーラップ"""
    rows = []
    for n, d, k in zip(name_s.fillna(""), desc_s.fillna(""), kw_s.fillna("")):
        ns = _tokenize(n)
        ds = _tokenize(d)
        ks = _tokenize(str(k).replace("-", " "))
        def jac(a,b):
            u = a|b
            return len(a&b)/len(u) if len(u)>0 else 0.0
        def contain(a,b):
            return len(a&b)/len(a) if len(a)>0 else 0.0
        rows.append((
            jac(ns, ds), jac(ns, ks),
            contain(ns, ds), contain(ks, ns)
        ))
    return pd.DataFrame(rows, columns=[
        "jac_name_desc","jac_name_kw","contain_name_in_desc","contain_kw_in_name"
    ]).astype(np.float32)

def _target_mean_maps(df, col, y, k=20.0):
    """m-推定のターゲット平均エンコード辞書"""
    s = df[col].astype(str).fillna("")
    y = pd.Series(y).astype(int)
    global_mean = float(y.mean())
    g = pd.DataFrame({"cat": s, "y": y}).groupby("cat")["y"].agg(["mean","count"])
    m = ((g["mean"]*g["count"] + k*global_mean) / (g["count"] + k)).to_dict()
    return m, global_mean


# ===== 特徴量ビルダー（train用） =====
def build_features_from_train(train: pd.DataFrame):
    df = train.copy()
    y = df["final_status"].astype(int).values

    df["disable_communication"] = df["disable_communication"].fillna(False).astype(int)

    deadline_dt = _unix_to_dt(df["deadline"])
    state_changed_at_dt = _unix_to_dt(df["state_changed_at"])
    created_at_dt = _unix_to_dt(df["created_at"])
    launched_at_dt = _unix_to_dt(df["launched_at"])

    def days_diff(a, b):
        delta = (a - b).dt.total_seconds()/86400.0
        return pd.to_numeric(delta, errors="coerce").fillna(0.0).astype(np.float32)

    # 日・時間の両方
    dur_launch_to_deadline = days_diff(deadline_dt, launched_at_dt)
    dur_create_to_launch = days_diff(launched_at_dt, created_at_dt)
    dur_launch_to_statechg = days_diff(state_changed_at_dt, launched_at_dt)
    dur_total_create_to_deadline = days_diff(deadline_dt, created_at_dt)
    dur_launch_to_deadline_h = (dur_launch_to_deadline * 24).astype(np.float32)
    dur_create_to_launch_h   = (dur_create_to_launch * 24).astype(np.float32)

    def dt_parts(dt_series, prefix):
        out = pd.DataFrame({
            f"{prefix}_year": dt_series.dt.year.fillna(0).astype(np.int16),
            f"{prefix}_month": dt_series.dt.month.fillna(0).astype(np.int8),
            f"{prefix}_day": dt_series.dt.day.fillna(0).astype(np.int8),
            f"{prefix}_dow": dt_series.dt.dayofweek.fillna(0).astype(np.int8),
            f"{prefix}_hour": dt_series.dt.hour.fillna(0).astype(np.int8),
        })
        return out
    launch_parts = dt_parts(launched_at_dt, "launch")
    deadline_parts = dt_parts(deadline_dt, "deadline")
    launch_cyc = _cyclical_enc(launched_at_dt, "launch")
    deadline_cyc = _cyclical_enc(deadline_dt, "deadline")
    launch_flags = _dt_flag_features(launched_at_dt, "launch")
    deadline_flags = _dt_flag_features(deadline_dt, "deadline")

    # テキスト統計
    name_stats = _text_basic_features(df["name"]).add_prefix("name_")
    desc_stats = _text_basic_features(df["desc"]).add_prefix("desc_")
    kw_stats   = _keywords_features(df["keywords"])
    ov_stats   = _overlap_features(df["name"], df["desc"], df["keywords"])

    # 派生比率など
    goal = pd.to_numeric(df["goal"], errors="coerce").fillna(0).astype(np.float32)
    goal_log1p = np.log1p(goal).astype(np.float32)
    gpd_launch_deadline = (goal / np.where(dur_launch_to_deadline > 0, dur_launch_to_deadline, 1)).astype(np.float32)
    gpd_launch_deadline = gpd_launch_deadline.where(dur_launch_to_deadline > 0, 0)
    gpd_create_deadline = (goal / np.where(dur_total_create_to_deadline > 0, dur_total_create_to_deadline, 1)).astype(np.float32)
    gpd_create_deadline = gpd_create_deadline.where(dur_total_create_to_deadline > 0, 0)
    name_desc_word_ratio = (name_stats["name_len_words"] / (desc_stats["desc_len_words"].replace(0, np.nan))).fillna(0).astype(np.float32)
    kw_to_name_ratio     = (kw_stats["kw_count"] / (name_stats["name_len_words"].replace(0, np.nan))).fillna(0).astype(np.float32)

    # カテゴリ頻度 & ターゲット平均
    cat_df = df[["country", "currency"]].copy()
    for c in cat_df.columns:
        cat_df[c] = cat_df[c].astype("category").astype(str).fillna("")
    country_freq = cat_df["country"].map(cat_df["country"].value_counts(normalize=True)).astype(np.float32).fillna(0)
    currency_freq = cat_df["currency"].map(cat_df["currency"].value_counts(normalize=True)).astype(np.float32).fillna(0)
    te_country_map, te_global = _target_mean_maps(df, "country", y, k=20.0)
    te_currency_map, _        = _target_mean_maps(df, "currency", y, k=20.0)
    country_te = cat_df["country"].map(te_country_map).fillna(te_global).astype(np.float32)
    currency_te = cat_df["currency"].map(te_currency_map).fillna(te_global).astype(np.float32)

    # 数値まとめ
    num_df = pd.concat([
        pd.DataFrame({
            "disable_communication": df["disable_communication"].astype(np.int8),
            "goal": goal,
            "goal_log1p": goal_log1p,
            "dur_launch_to_deadline_days": dur_launch_to_deadline,
            "dur_create_to_launch_days": dur_create_to_launch,
            "dur_launch_to_statechg_days": dur_launch_to_statechg,
            "dur_total_create_to_deadline_days": dur_total_create_to_deadline,
            "dur_launch_to_deadline_hours": dur_launch_to_deadline_h,
            "dur_create_to_launch_hours": dur_create_to_launch_h,
            "goal_per_day_launch_to_deadline": gpd_launch_deadline,
            "goal_per_day_create_to_deadline": gpd_create_deadline,
            "name_desc_word_ratio": name_desc_word_ratio,
            "kw_to_name_ratio": kw_to_name_ratio,
            "country_freq": country_freq,
            "currency_freq": currency_freq,
            "country_te": country_te,
            "currency_te": currency_te,
        }),
        launch_parts, deadline_parts,
        launch_cyc, deadline_cyc,
        launch_flags, deadline_flags,
        name_stats, desc_stats, kw_stats,
        ov_stats
    ], axis=1)

    # OneHotEncoder
    if sklearn.__version__ >= "1.2":
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    else:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    X_cat = ohe.fit_transform(cat_df)

    # TF-IDF（word）
    tfv_name = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), max_features=8000,
                               lowercase=True, token_pattern=r"(?u)\b\w+\b")
    X_name = tfv_name.fit_transform(df["name"].fillna("").astype(str))
    tfv_desc = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), max_features=20000,
                               lowercase=True, token_pattern=r"(?u)\b\w+\b")
    X_desc = tfv_desc.fit_transform(df["desc"].fillna("").astype(str))
    tfv_kw = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), max_features=3000,
                             lowercase=True, token_pattern=r"(?u)\b\w+\b")
    kw_text = df["keywords"].fillna("").astype(str).str.replace("-", " ")
    X_kw = tfv_kw.fit_transform(kw_text)

    # TF-IDF（char）
    tfv_name_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=2000)
    X_name_char = tfv_name_char.fit_transform(df["name"].fillna("").astype(str))
    tfv_desc_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=5000)
    X_desc_char = tfv_desc_char.fit_transform(df["desc"].fillna("").astype(str))

    X_num = sparse.csr_matrix(num_df.fillna(0.0).values.astype(np.float32))
    X = sparse.hstack([X_num, X_cat, X_name, X_desc, X_kw, X_name_char, X_desc_char], format="csr")

    fe_obj = {
        "ohe": ohe,
        "num_columns": list(num_df.columns),
        "cat_columns": list(cat_df.columns),
        "cat_feature_names": list(ohe.get_feature_names_out(cat_df.columns)),
        "tfv_name": tfv_name, "tfv_desc": tfv_desc, "tfv_kw": tfv_kw,
        "tfv_name_char": tfv_name_char,
        "tfv_desc_char": tfv_desc_char,
        "te_maps": {
            "country": te_country_map,
            "currency": te_currency_map,
            "global_mean": te_global
        }
    }
    return X, y, fe_obj


# ===== 特徴量ビルダー（test用） =====
def build_features_from_test(test: pd.DataFrame, fe_obj):
    df = test.copy()
    df["disable_communication"] = df["disable_communication"].fillna(False).astype(int)

    deadline_dt = _unix_to_dt(df["deadline"])
    state_changed_at_dt = _unix_to_dt(df["state_changed_at"])
    created_at_dt = _unix_to_dt(df["created_at"])
    launched_at_dt = _unix_to_dt(df["launched_at"])

    def days_diff(a, b):
        delta = (a - b).dt.total_seconds()/86400.0
        return pd.to_numeric(delta, errors="coerce").fillna(0.0).astype(np.float32)

    dur_launch_to_deadline = days_diff(deadline_dt, launched_at_dt)
    dur_create_to_launch = days_diff(launched_at_dt, created_at_dt)
    dur_launch_to_statechg = days_diff(state_changed_at_dt, launched_at_dt)
    dur_total_create_to_deadline = days_diff(deadline_dt, created_at_dt)
    dur_launch_to_deadline_h = (dur_launch_to_deadline * 24).astype(np.float32)
    dur_create_to_launch_h   = (dur_create_to_launch * 24).astype(np.float32)

    def dt_parts(dt_series, prefix):
        out = pd.DataFrame({
            f"{prefix}_year": dt_series.dt.year.fillna(0).astype(np.int16),
            f"{prefix}_month": dt_series.dt.month.fillna(0).astype(np.int8),
            f"{prefix}_day": dt_series.dt.day.fillna(0).astype(np.int8),
            f"{prefix}_dow": dt_series.dt.dayofweek.fillna(0).astype(np.int8),
            f"{prefix}_hour": dt_series.dt.hour.fillna(0).astype(np.int8),
        })
        return out
    launch_parts = dt_parts(launched_at_dt, "launch")
    deadline_parts = dt_parts(deadline_dt, "deadline")
    launch_cyc = _cyclical_enc(launched_at_dt, "launch")
    deadline_cyc = _cyclical_enc(deadline_dt, "deadline")
    launch_flags = _dt_flag_features(launched_at_dt, "launch")
    deadline_flags = _dt_flag_features(deadline_dt, "deadline")

    name_stats = _text_basic_features(df["name"]).add_prefix("name_")
    desc_stats = _text_basic_features(df["desc"]).add_prefix("desc_")
    kw_stats   = _keywords_features(df["keywords"])
    ov_stats   = _overlap_features(df["name"], df["desc"], df["keywords"])

    goal = pd.to_numeric(df["goal"], errors="coerce").fillna(0).astype(np.float32)
    goal_log1p = np.log1p(goal).astype(np.float32)
    gpd_launch_deadline = (goal / np.where(dur_launch_to_deadline > 0, dur_launch_to_deadline, 1)).astype(np.float32)
    gpd_launch_deadline = gpd_launch_deadline.where(dur_launch_to_deadline > 0, 0)
    gpd_create_deadline = (goal / np.where(dur_total_create_to_deadline > 0, dur_total_create_to_deadline, 1)).astype(np.float32)
    gpd_create_deadline = gpd_create_deadline.where(dur_total_create_to_deadline > 0, 0)
    name_desc_word_ratio = (name_stats["name_len_words"] / (desc_stats["desc_len_words"].replace(0, np.nan))).fillna(0).astype(np.float32)
    kw_to_name_ratio     = (kw_stats["kw_count"] / (name_stats["name_len_words"].replace(0, np.nan))).fillna(0).astype(np.float32)

    # カテゴリ
    cat_df = df[fe_obj["cat_columns"]].copy()
    for c in cat_df.columns:
        cat_df[c] = cat_df[c].astype("category").astype(str).fillna("")
    ohe = fe_obj["ohe"]
    X_cat = ohe.transform(cat_df)

    # 頻度/TE（trainのmapを適用）
    te_maps = fe_obj["te_maps"]
    te_global = te_maps["global_mean"]
    country_freq = cat_df["country"].map(cat_df["country"].value_counts(normalize=True)).astype(np.float32).fillna(0)
    currency_freq = cat_df["currency"].map(cat_df["currency"].value_counts(normalize=True)).astype(np.float32).fillna(0)
    country_te = cat_df["country"].map(te_maps["country"]).fillna(te_global).astype(np.float32)
    currency_te = cat_df["currency"].map(te_maps["currency"]).fillna(te_global).astype(np.float32)

    num_df = pd.concat([
        pd.DataFrame({
            "disable_communication": df["disable_communication"].astype(np.int8),
            "goal": goal,
            "goal_log1p": goal_log1p,
            "dur_launch_to_deadline_days": dur_launch_to_deadline,
            "dur_create_to_launch_days": dur_create_to_launch,
            "dur_launch_to_statechg_days": dur_launch_to_statechg,
            "dur_total_create_to_deadline_days": dur_total_create_to_deadline,
            "dur_launch_to_deadline_hours": dur_launch_to_deadline_h,
            "dur_create_to_launch_hours": dur_create_to_launch_h,
            "goal_per_day_launch_to_deadline": gpd_launch_deadline,
            "goal_per_day_create_to_deadline": gpd_create_deadline,
            "name_desc_word_ratio": name_desc_word_ratio,
            "kw_to_name_ratio": kw_to_name_ratio,
            "country_freq": country_freq,
            "currency_freq": currency_freq,
            "country_te": country_te,
            "currency_te": currency_te,
        }),
        launch_parts, deadline_parts,
        launch_cyc, deadline_cyc,
        launch_flags, deadline_flags,
        name_stats, desc_stats, kw_stats,
        ov_stats
    ], axis=1)

    # 欠落列を補完＆順序合わせ
    for col in fe_obj["num_columns"]:
        if col not in num_df.columns:
            num_df[col] = 0.0
    num_df = num_df[fe_obj["num_columns"]].fillna(0.0).astype(np.float32)

    # TF-IDF（word/char）
    X_name = fe_obj["tfv_name"].transform(df["name"].fillna("").astype(str))
    X_desc = fe_obj["tfv_desc"].transform(df["desc"].fillna("").astype(str))
    kw_text = df["keywords"].fillna("").astype(str).str.replace("-", " ")
    X_kw = fe_obj["tfv_kw"].transform(kw_text)
    X_name_char = fe_obj["tfv_name_char"].transform(df["name"].fillna("").astype(str))
    X_desc_char = fe_obj["tfv_desc_char"].transform(df["desc"].fillna("").astype(str))

    X_num = sparse.csr_matrix(num_df.values.astype(np.float32))
    X_test = sparse.hstack([X_num, X_cat, X_name, X_desc, X_kw, X_name_char, X_desc_char], format="csr")
    return X_test


# ===== 学習・推論関連 =====
def train_extratrees(X, y, random_state=42):
    """ExtraTreesで学習（balanced_subsample）。"""
    model = ExtraTreesClassifier(
        n_estimators=800,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        bootstrap=False,
        n_jobs=-1,
        random_state=random_state,
        class_weight="balanced_subsample",
        verbose=0
    )
    model.fit(X, y)
    return model

def predict_with_model(model, X_test, threshold=0.5):
    """予測確率とラベルを返す。"""
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    else:
        proba = model.decision_function(X_test)
        proba = (proba - proba.min()) / (proba.max() - proba.min() + 1e-8)
    pred = (proba >= threshold).astype(int)
    return proba, pred

def find_best_threshold(y_true, proba, metric="f1"):
    """検証データ上でF1最大の閾値を返す。"""
    y_true = np.asarray(y_true).astype(int)
    proba = np.asarray(proba).astype(float)
    if metric.lower() == "f1":
        precisions, recalls, thresholds = precision_recall_curve(y_true, proba)
        precisions = precisions[:-1]
        recalls = recalls[:-1]
        f1s = 2 * precisions * recalls / (precisions + recalls + 1e-12)
        idx = int(np.argmax(f1s))
        best_thr = float(thresholds[idx])
        best_score = float(f1s[idx])
        return best_thr, best_score
    raise ValueError("Unsupported metric. Use metric='f1'.")

def kfold_train_and_threshold(train: pd.DataFrame, n_splits=5, random_state=42, metric="f1", save_dir=None):
    """
    Stratified K-Foldで学習・検証し、OOF確率とベスト閾値を計算。
    save_dirを指定すると oof.csv / meta.json / foldモデル等を保存。
    """
    y_all = train["final_status"].astype(int).values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_proba = np.zeros(len(train), dtype=float)
    fold_artifacts = []
    fold_thresholds, fold_f1s, fold_auc = [], [], []

    out_dir = None
    if save_dir is not None:
        out_dir = Path(save_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(np.zeros(len(train)), y_all), 1):
        tr_df = train.iloc[tr_idx].reset_index(drop=True)
        va_df = train.iloc[va_idx].reset_index(drop=True)

        X_tr, y_tr, fe_obj = build_features_from_train(tr_df)
        X_va = build_features_from_test(va_df, fe_obj)

        model = train_extratrees(X_tr, y_tr, random_state=random_state + fold)

        va_proba = model.predict_proba(X_va)[:, 1]
        oof_proba[va_idx] = va_proba

        best_thr, best_f1 = find_best_threshold(va_df["final_status"].values, va_proba, metric=metric)
        auc = roc_auc_score(va_df["final_status"].values, va_proba)
        fold_thresholds.append(best_thr); fold_f1s.append(best_f1); fold_auc.append(auc)

        fold_artifacts.append({
            "model": model, "fe_obj": fe_obj, "valid_idx": va_idx,
            "best_thr": best_thr, "best_f1": best_f1,
        })
        print(f"[Fold {fold}] best_thr={best_thr:.4f}, best_f1={best_f1:.4f}, va_auc={auc:.4f}")

        if out_dir is not None:
            joblib.dump(model, out_dir / f"model_fold{fold}.pkl")
            joblib.dump(fe_obj, out_dir / f"fe_obj_fold{fold}.pkl")
            np.save(out_dir / f"oof_proba_fold{fold}.npy", va_proba)
            np.save(out_dir / f"valid_idx_fold{fold}.npy", va_idx)

    # OOF全体
    global_best_thr, global_best_f1 = find_best_threshold(y_all, oof_proba, metric=metric)
    oof_auc = roc_auc_score(y_all, oof_proba)
    oof_pred = (oof_proba >= global_best_thr).astype(int)
    oof_acc = accuracy_score(y_all, oof_pred)

    result = {
        "oof_proba": oof_proba,
        "oof_pred": oof_pred,
        "oof_scores": {"auc": oof_auc, "f1@best": global_best_f1, "acc@best": oof_acc},
        "fold_artifacts": fold_artifacts,
        "fold_thresholds": fold_thresholds,
        "global_best_thr": global_best_thr
    }
    print(f"[OOF] AUC={oof_auc:.4f}, best_thr={global_best_thr:.4f}, F1@best={global_best_f1:.4f}, ACC@best={oof_acc:.4f}")

    # 保存：oof.csv（id, 確率, 正解ラベル）とmeta.json
    if out_dir is not None:
        id_col = "id" if "id" in train.columns else None
        oof_df = pd.DataFrame({"row": np.arange(len(train)), "oof_proba": oof_proba, "final_status": y_all})
        if id_col:
            oof_df.insert(0, "id", train[id_col].values)
        oof_df.to_csv(out_dir / "oof.csv", index=False, encoding="utf-8")

        meta = {
            "n_splits": n_splits,
            "fold_thresholds": [float(x) for x in fold_thresholds],
            "fold_auc": [float(x) for x in fold_auc],
            "global_best_thr": float(global_best_thr),
            "oof_auc": float(oof_auc),
            "oof_f1": float(global_best_f1),
            "oof_acc": float(oof_acc),
        }
        with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)

    return result

def predict_test_with_fold_models(test: pd.DataFrame, fold_artifacts, final_threshold=None):
    """
    foldごとにfe_objで変換→各モデルの確率を平均。
    final_threshold を与えた場合は (proba, pred) を返す。
    """
    proba_list = []
    for i, art in enumerate(fold_artifacts, 1):
        fe_obj = art["fe_obj"]
        model = art["model"]
        X_te = build_features_from_test(test, fe_obj)
        p = model.predict_proba(X_te)[:, 1]
        proba_list.append(p)
        print(f"[Test] Fold {i} done.")

    proba_mean = np.mean(np.vstack(proba_list), axis=0)
    if final_threshold is None:
        return proba_mean
    pred = (proba_mean >= final_threshold).astype(int)
    return proba_mean, pred
# ========= ここまで Part ① =========


### 学習

In [None]:
# ========= ここから Part ② =========
import pandas as pd
from pathlib import Path

# 事前定義済みのパス変数を利用
# ALL_FILE_DIR, TRAIN_CSV は既に定義されている前提
EXTRA_DIR = Path(ALL_FILE_DIR) / "ExtraTrees"
EXTRA_DIR.mkdir(parents=True, exist_ok=True)

# 学習データの読み込み
train = pd.read_csv(TRAIN_CSV)

# 交差検証（OOF作成＆保存）
cv = kfold_train_and_threshold(
    train=train,
    n_splits=5,
    random_state=42,
    metric="f1",
    save_dir=EXTRA_DIR  # ← ALL_FILE_DIR/ExtraTrees に保存
)
# 保存物：
# - {ALL_FILE_DIR}/ExtraTrees/oof.csv  （列: id, oof_proba, final_status）
# - {ALL_FILE_DIR}/ExtraTrees/meta.json
# - 各foldの model/fe_obj / oof断片 など
# ========= ここまで Part ② =========


### 予測

In [None]:
# ========= ここから Part ③ =========
import pandas as pd
from pathlib import Path
import joblib
import numpy as np

EXTRA_DIR = Path(ALL_FILE_DIR) / "ExtraTrees"
EXTRA_DIR.mkdir(parents=True, exist_ok=True)

# テストデータの読み込み
test = pd.read_csv(TEST_CSV)

# Part②で作ったcv（fold_artifacts, global_best_thr）を利用して予測
# ※同一セッションで順に実行していれば cv がメモリにあります
proba_test = predict_test_with_fold_models(test, cv["fold_artifacts"])  # 確率のみ

# 提出用/保存用のCSV（id, 予測確率）
test_pred_df = pd.DataFrame({
    "id": test["id"].values,
    "pred_proba": proba_test
})
test_pred_df.to_csv(EXTRA_DIR / "test_pred.csv", index=False, encoding="utf-8")

print("Saved:", EXTRA_DIR / "test_pred.csv")
# ========= ここまで Part ③ =========


## 〇ロジスティック回帰

### 前処理

In [None]:
# ========= ① 前処理ユーティリティ定義 =========
import numpy as np
import pandas as pd
import re

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.impute import SimpleImputer

class CategoryCountEncoder(BaseEstimator, TransformerMixin):
    """
    指定カテゴリ列の出現頻度（件数 or 比率）を数値化するエンコーダ（目的変数は使用しない）。
    """
    def __init__(self, cols, normalize=True, min_count=1):
        self.cols = cols
        self.normalize = normalize
        self.min_count = min_count

    def _to_frame(self, X):
        if isinstance(X, pd.DataFrame):
            return X
        cols = list(self.cols) if self.cols is not None else [f"col_{i}" for i in range(X.shape[1])]
        return pd.DataFrame(X, columns=cols)

    def fit(self, X, y=None):
        X = self._to_frame(X).astype(str)
        n = len(X)
        self.freq_maps_ = {}
        for c in list(self.cols):
            vc = X[c].value_counts()
            if self.min_count > 1:
                vc = vc[vc >= self.min_count]
            if self.normalize and n > 0:
                vc = vc / n
            self.freq_maps_[c] = vc.to_dict()
        self.feature_names_out_ = np.array([f"{c}_freq" for c in list(self.cols)])
        return self

    def transform(self, X):
        X = self._to_frame(X).astype(str)
        arrs = []
        for c in list(self.cols):
            mp = self.freq_maps_.get(c, {})
            s = X[c].map(mp).fillna(0.0).astype(float).values.reshape(-1, 1)
            arrs.append(s)
        return np.hstack(arrs)

    def get_feature_names_out(self, input_features=None):
        if hasattr(self, "feature_names_out_"):
            return self.feature_names_out_
        return np.array([f"{c}_freq" for c in list(self.cols)])


def _prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()

    # --- テキスト列の用意 ---
    for c in ["name", "desc", "keywords"]:
        if c not in d.columns:
            d[c] = ""
    d["name"] = d["name"].fillna("").astype(str)
    d["desc"] = d["desc"].fillna("").astype(str)
    d["keywords"] = d["keywords"].fillna("").astype(str)
    d["keywords_str"] = d["keywords"].str.replace("-", " ", regex=False)

    # --- ブール/数値 ---
    if "disable_communication" in d.columns:
        d["disable_communication"] = d["disable_communication"].fillna(False).astype(int)
    else:
        d["disable_communication"] = 0

    d["goal"] = pd.to_numeric(d.get("goal", np.nan), errors="coerce")
    d["goal_log"] = np.log1p(d["goal"])

    # --- 時刻（UNIX秒想定） ---
    def to_dt(col):
        return pd.to_datetime(d.get(col, np.nan), unit="s", utc=True, errors="coerce")
    dt_deadline = to_dt("deadline")
    dt_created  = to_dt("created_at")
    dt_launch   = to_dt("launched_at")

    # 期間
    d["dur_launch_to_deadline_days"] = (dt_deadline - dt_launch).dt.total_seconds() / 86400.0
    d["dur_create_to_launch_hours"]  = (dt_launch - dt_created).dt.total_seconds() / 3600.0
    d["dur_create_to_deadline_days"] = (dt_deadline - dt_created).dt.total_seconds() / 86400.0

    # 期間の派生
    d["invalid_launch_to_deadline"] = (d["dur_launch_to_deadline_days"] <= 0).astype(int)
    d["dur_launch_to_deadline_log"] = np.log1p(d["dur_launch_to_deadline_days"].clip(lower=0))
    d["dur_create_to_launch_log"]   = np.log1p((d["dur_create_to_launch_hours"] / 24.0).clip(lower=0))
    d["dur_create_to_deadline_log"] = np.log1p(d["dur_create_to_deadline_days"].clip(lower=0))

    # 比率/密度
    dur_days_pos = d["dur_launch_to_deadline_days"].where(d["dur_launch_to_deadline_days"] > 0, np.nan)
    desc_wc_safe = d["desc"].str.split().apply(len).replace(0, np.nan)
    d["goal_per_day_log"]        = np.log1p(d["goal"] / dur_days_pos)
    d["goal_per_word_desc_log"]  = np.log1p(d["goal"] / desc_wc_safe)

    # カレンダー
    d["launched_month"] = dt_launch.dt.month
    d["launched_dow"]   = dt_launch.dt.weekday
    d["launched_hour"]  = dt_launch.dt.hour
    d["deadline_month"] = dt_deadline.dt.month
    d["deadline_dow"]   = dt_deadline.dt.weekday
    d["deadline_hour"]  = dt_deadline.dt.hour

    # 周期性（sin/cos）
    def cyc_sin(x, period, offset=0): return np.sin(2 * np.pi * (x - offset) / period)
    def cyc_cos(x, period, offset=0): return np.cos(2 * np.pi * (x - offset) / period)
    d["launched_month_sin"] = cyc_sin(d["launched_month"], 12, offset=1)
    d["launched_month_cos"] = cyc_cos(d["launched_month"], 12, offset=1)
    d["deadline_month_sin"] = cyc_sin(d["deadline_month"], 12, offset=1)
    d["deadline_month_cos"] = cyc_cos(d["deadline_month"], 12, offset=1)
    d["launched_dow_sin"] = cyc_sin(d["launched_dow"], 7)
    d["launched_dow_cos"] = cyc_cos(d["launched_dow"], 7)
    d["deadline_dow_sin"] = cyc_sin(d["deadline_dow"], 7)
    d["deadline_dow_cos"] = cyc_cos(d["deadline_dow"], 7)
    d["launched_hour_sin"] = cyc_sin(d["launched_hour"], 24)
    d["launched_hour_cos"] = cyc_cos(d["launched_hour"], 24)
    d["deadline_hour_sin"] = cyc_sin(d["deadline_hour"], 24)
    d["deadline_hour_cos"] = cyc_cos(d["deadline_hour"], 24)

    # 追加の時間フラグ
    d["launched_is_weekend"]     = ((dt_launch.dt.weekday >= 5).astype(float))
    d["deadline_is_weekend"]     = ((dt_deadline.dt.weekday >= 5).astype(float))
    d["launched_is_month_start"] = (dt_launch.dt.is_month_start.astype(float))
    d["deadline_is_month_end"]   = (dt_deadline.dt.is_month_end.astype(float))

    # テキスト統計
    d["name_len"] = d["name"].str.len()
    d["desc_len"] = d["desc"].str.len()
    d["name_wc"]  = d["name"].str.split().apply(len)
    d["desc_wc"]  = d["desc"].str.split().apply(len)
    d["kw_count"] = d["keywords"].apply(lambda s: 0 if s == "" else s.count("-") + 1)

    # 追加のテキスト統計
    d["name_avg_word_len"] = d["name_len"] / d["name_wc"].replace(0, np.nan)
    d["desc_avg_word_len"] = d["desc_len"] / d["desc_wc"].replace(0, np.nan)
    d["name_digit_ratio"]  = d["name"].str.count(r"\d") / d["name_len"].replace(0, np.nan)
    d["desc_digit_ratio"]  = d["desc"].str.count(r"\d") / d["desc_len"].replace(0, np.nan)
    d["name_upper_ratio"]  = d["name"].str.count(r"[A-Z]") / d["name"].str.count(r"[A-Za-z]").replace(0, np.nan)
    d["desc_upper_ratio"]  = d["desc"].str.count(r"[A-Z]") / d["desc"].str.count(r"[A-Za-z]").replace(0, np.nan)
    d["name_excl_count"]   = d["name"].str.count("!")
    d["desc_excl_count"]   = d["desc"].str.count("!")
    d["name_qmark_count"]  = d["name"].str.count(r"\?")
    d["desc_qmark_count"]  = d["desc"].str.count(r"\?")
    d["name_ellipsis_count"]= d["name"].str.count(r"\.\.\.")
    d["desc_ellipsis_count"]= d["desc"].str.count(r"\.\.\.")
    d["has_url"]           = d["desc"].str.contains(r"https?://", case=False, regex=True).fillna(False).astype(int)
    d["has_currency_symbol"]= d[["name","desc"]].apply(lambda s: int(bool(re.search(r"[$€£¥]", " ".join(s.values)))), axis=1)

    # ASCII比率
    def ascii_ratio(s):
        n = len(s)
        if n == 0: return np.nan
        return sum(ord(ch) < 128 for ch in s) / n
    d["name_ascii_ratio"] = d["name"].apply(ascii_ratio)
    d["desc_ascii_ratio"] = d["desc"].apply(ascii_ratio)

    # キーワード多様性・平均長
    kw_lists = d["keywords"].apply(lambda s: [] if s == "" else s.split("-"))
    d["kw_unique_ratio"] = kw_lists.apply(lambda xs: (len(set(xs)) / len(xs)) if len(xs) > 0 else 0.0)
    d["kw_avg_len"]      = kw_lists.apply(lambda xs: (np.mean([len(x) for x in xs]) if len(xs) > 0 else 0.0))

    # name と desc のJaccard
    name_tokens = d["name"].str.lower().str.split()
    desc_tokens = d["desc"].str.lower().str.split()
    def jaccard(a, b):
        sa, sb = set(a), set(b)
        u = len(sa | sb)
        return len(sa & sb) / u if u > 0 else 0.0
    d["name_desc_jaccard"] = [jaccard(a, b) for a, b in zip(name_tokens, desc_tokens)]

    # desc の語彙多様性など
    desc_words_list = d["desc"].str.lower().str.findall(r"\b\w+\b")
    d["desc_ttr"] = desc_words_list.apply(lambda ws: (len(set(ws)) / len(ws)) if len(ws) > 0 else np.nan)
    d["desc_stopword_ratio"] = desc_words_list.apply(
        lambda ws: (sum(w in ENGLISH_STOP_WORDS for w in ws) / len(ws)) if len(ws) > 0 else np.nan
    )
    d["desc_sent_count"] = d["desc"].apply(lambda s: max(1, len(re.findall(r"[.!?]+", s))))
    d["desc_avg_words_per_sent"] = d["desc_wc"] / d["desc_sent_count"].replace(0, np.nan)

    # keywords と name/desc の整合性
    kw_words_list = d["keywords_str"].str.lower().str.split()
    def cover_ratio(a_list, b_list):
        sa, sb = set(a_list), set(b_list)
        return (len(sa & sb) / len(sa)) if len(sa) > 0 else 0.0
    d["kw_in_name_ratio"] = [cover_ratio(kw, nm) for kw, nm in zip(kw_words_list, name_tokens)]
    d["kw_in_desc_ratio"] = [cover_ratio(kw, ds) for kw, ds in zip(kw_words_list, desc_tokens)]
    d["name_kw_jaccard"]  = [jaccard(nm, kw) for nm, kw in zip(name_tokens, kw_words_list)]
    d["desc_kw_jaccard"]  = [jaccard(ds, kw) for ds, kw in zip(desc_tokens, kw_words_list)]

    # カテゴリ欠損埋め + 交互カテゴリ
    for c in ["country", "currency"]:
        if c in d.columns:
            d[c] = d[c].fillna("UNK").astype(str)
        else:
            d[c] = "UNK"
    d["country_currency"] = (d["country"].astype(str) + "_" + d["currency"].astype(str))

    return d


def _build_preprocessor():
    numeric_features = [
        "goal_log",
        "dur_launch_to_deadline_days", "dur_create_to_launch_hours", "dur_create_to_deadline_days",
        "dur_launch_to_deadline_log", "dur_create_to_launch_log", "dur_create_to_deadline_log",
        "invalid_launch_to_deadline",
        "goal_per_day_log", "goal_per_word_desc_log",
        "name_len", "desc_len", "name_wc", "desc_wc",
        "name_avg_word_len", "desc_avg_word_len",
        "name_digit_ratio", "desc_digit_ratio",
        "name_upper_ratio", "desc_upper_ratio",
        "name_excl_count", "desc_excl_count",
        "name_qmark_count", "desc_qmark_count",
        "name_ellipsis_count", "desc_ellipsis_count",
        "name_ascii_ratio", "desc_ascii_ratio",
        "desc_ttr", "desc_stopword_ratio",
        "desc_sent_count", "desc_avg_words_per_sent",
        "kw_count", "kw_unique_ratio", "kw_avg_len",
        "name_desc_jaccard", "kw_in_name_ratio", "kw_in_desc_ratio",
        "name_kw_jaccard", "desc_kw_jaccard",
        "disable_communication", "has_url", "has_currency_symbol",
        "launched_month_sin", "launched_month_cos",
        "deadline_month_sin", "deadline_month_cos",
        "launched_dow_sin", "launched_dow_cos",
        "deadline_dow_sin", "deadline_dow_cos",
        "launched_hour_sin", "launched_hour_cos",
        "deadline_hour_sin", "deadline_hour_cos",
        "launched_is_weekend", "deadline_is_weekend",
        "launched_is_month_start", "deadline_is_month_end",
    ]
    categorical_features = [
        "country", "currency", "country_currency",
        "launched_month", "launched_dow", "launched_hour",
        "deadline_month", "deadline_dow", "deadline_hour",
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num",
             Pipeline([
                 ("imputer", SimpleImputer(strategy="median")),
                 ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
                 ("scaler", StandardScaler(with_mean=False)),
             ]),
             numeric_features),
            ("cat_oh", OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ("cat_freq", CategoryCountEncoder(cols=["country", "currency", "country_currency"], normalize=True, min_count=1),
             ["country", "currency", "country_currency"]),
            ("name_tfidf", TfidfVectorizer(max_features=1200, ngram_range=(1, 2), min_df=1, sublinear_tf=True), "name"),
            ("desc_tfidf", TfidfVectorizer(max_features=6000, ngram_range=(1, 2), min_df=2, sublinear_tf=True), "desc"),
            ("kw_tfidf",   TfidfVectorizer(max_features=1200, ngram_range=(1, 2), min_df=1, sublinear_tf=True), "keywords_str"),
            ("name_char_tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 5), max_features=2500, min_df=2, sublinear_tf=True), "name"),
            ("desc_char_tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 5), max_features=5000, min_df=2, sublinear_tf=True), "desc"),
            ("kw_char_tfidf",   TfidfVectorizer(analyzer="char", ngram_range=(3, 5), max_features=1200, min_df=1, sublinear_tf=True), "keywords_str"),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )
    return preprocessor


def make_features_train(train: pd.DataFrame):
    """学習用X, y, preprocessorを返す（このセルではfit/保存はしない）。"""
    d = _prepare_dataframe(train)
    if "final_status" not in d.columns:
        raise ValueError("trainにfinal_statusがありません。")
    y = d["final_status"].astype(int).values
    preprocessor = _build_preprocessor()
    X = preprocessor.fit_transform(d)
    return X, y, preprocessor


def make_features_test(test: pd.DataFrame, preprocessor):
    """学習済みpreprocessorでテスト特徴量X_testを返す（idも返す）。"""
    d = _prepare_dataframe(test)
    X_test = preprocessor.transform(d)
    ids = d["id"].values if "id" in d.columns else np.arange(len(d))
    return X_test, ids


### 学習

In [None]:
# ========= ② CVしてOOF保存 =========
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, precision_recall_curve, accuracy_score, roc_curve
from sklearn.linear_model import LogisticRegression

# --- 閾値探索 ---
def find_best_threshold(y_true, proba, metric="f1"):
    y_true = np.asarray(y_true).astype(int)
    proba = np.asarray(proba).astype(float)
    if metric == "f1":
        P, R, th = precision_recall_curve(y_true, proba)
        f1s = (2 * P[:-1] * R[:-1]) / np.clip(P[:-1] + R[:-1], 1e-12, None)
        idx = np.nanargmax(f1s)
        return float(th[idx]), {"f1": float(f1s[idx])}
    elif metric == "accuracy":
        best_acc, best_th = -1.0, 0.5
        for t in np.linspace(0.0, 1.0, 1001):
            pred = (proba >= t).astype(int)
            acc = accuracy_score(y_true, pred)
            if acc > best_acc:
                best_acc, best_th = acc, t
        return float(best_th), {"accuracy": float(best_acc)}
    elif metric == "youden":
        fpr, tpr, th = roc_curve(y_true, proba)
        j = tpr - fpr
        idx = np.nanargmax(j)
        return float(th[idx]), {"youden": float(j[idx])}
    else:
        raise ValueError("metric は 'f1' | 'accuracy' | 'youden'")

# --- 学習器（ロジスティック回帰） ---
def train_model(X, y, *, C=1.0, class_weight="balanced", random_state=42, max_iter=1000):
    model = LogisticRegression(
        solver="saga", penalty="l2", C=C,
        class_weight=class_weight, random_state=random_state, max_iter=max_iter
    )
    model.fit(X, y)
    return model

# --- 交差検証でOOF作成 ---
def cross_validate_oof(train_df: pd.DataFrame,
                       n_splits: int = 5,
                       random_state: int = 42,
                       C: float = 1.0,
                       class_weight="balanced",
                       threshold_metric: str = "f1"):
    if "final_status" not in train_df.columns:
        raise ValueError("trainにfinal_statusがありません。")

    y = train_df["final_status"].astype(int).values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_proba = np.zeros(len(train_df), dtype=float)
    oof_fold  = np.full(len(train_df), -1, dtype=int)
    fold_metrics = []
    models = []
    preprocessors = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_df, y), start=1):
        d_tr = _prepare_dataframe(train_df.iloc[tr_idx].copy())
        d_va = _prepare_dataframe(train_df.iloc[va_idx].copy())
        y_tr = y[tr_idx]; y_va = y[va_idx]

        preproc = _build_preprocessor()
        X_tr = preproc.fit_transform(d_tr)
        X_va = preproc.transform(d_va)

        model = train_model(X_tr, y_tr, C=C, class_weight=class_weight, random_state=random_state)

        proba_va = model.predict_proba(X_va)[:, 1]
        oof_proba[va_idx] = proba_va
        oof_fold[va_idx] = fold

        try:
            auc = roc_auc_score(y_va, proba_va) if len(np.unique(y_va)) > 1 else np.nan
        except ValueError:
            auc = np.nan
        ll = log_loss(y_va, np.clip(proba_va, 1e-6, 1-1e-6))
        fold_metrics.append({"fold": fold, "auc": float(auc), "logloss": float(ll)})

        models.append(model); preprocessors.append(preproc)

    oof_auc = roc_auc_score(y, oof_proba) if len(np.unique(y)) > 1 else np.nan
    oof_ll  = log_loss(y, np.clip(oof_proba, 1e-6, 1-1e-6))
    best_th, best_scores = find_best_threshold(y, oof_proba, metric=threshold_metric)
    oof_pred = (oof_proba >= best_th).astype(int)

    oof_df = pd.DataFrame({
        "row": np.arange(len(train_df)),
        "id": train_df["id"].values if "id" in train_df.columns else np.arange(len(train_df)),
        "y": y,
        "oof_proba": oof_proba,
        "oof_pred": oof_pred,
        "fold": oof_fold
    })

    return {
        "oof_proba": oof_proba,
        "oof_pred": oof_pred,
        "best_threshold": best_th,
        "best_scores": best_scores,
        "oof_df": oof_df,
        "fold_metrics": fold_metrics,
        "oof_auc": float(oof_auc),
        "oof_logloss": float(oof_ll),
        "models": models,
        "preprocessors": preprocessors,
        "kf": skf
    }

# --- データ読み込み & CV ---
train = pd.read_csv(TRAIN_CSV)
cv = cross_validate_oof(
    train_df=train,
    n_splits=5,
    random_state=42,
    C=1.0,
    class_weight="balanced",
    threshold_metric="f1"
)
print("OOF AUC:", cv["oof_auc"])
print("OOF Logloss:", cv["oof_logloss"])
print("Best threshold (OOF):", cv["best_threshold"], "scores:", cv["best_scores"])

# --- OOFの保存（id, 確率, 正解ラベルのみ） ---
SAVE_DIR = os.path.join(ALL_FILE_DIR, "ロジスティック回帰")
os.makedirs(SAVE_DIR, exist_ok=True)

oof_save = cv["oof_df"][["id", "oof_proba", "y"]].rename(
    columns={"oof_proba": "prob_1", "y": "final_status"}
)
oof_path = os.path.join(SAVE_DIR, "oof.csv")
oof_save.to_csv(oof_path, index=False)
print(f"Saved OOF -> {oof_path}  shape={oof_save.shape}")


### 予測

In [None]:
# ========= ③ 全学習→test予測→保存 =========
import os
import numpy as np
import pandas as pd

def fit_full_model_and_predict_test(train_df: pd.DataFrame,
                                    test_df: pd.DataFrame,
                                    C: float = 1.0,
                                    class_weight="balanced",
                                    random_state: int = 42):
    """全学習で前処理→学習→test確率を返す。"""
    if "final_status" not in train_df.columns:
        raise ValueError("trainにfinal_statusがありません。")
    y = train_df["final_status"].astype(int).values

    d_full = _prepare_dataframe(train_df.copy())
    preproc = _build_preprocessor()
    X_full = preproc.fit_transform(d_full)
    model = train_model(X_full, y, C=C, class_weight=class_weight, random_state=random_state)

    d_test = _prepare_dataframe(test_df.copy())
    X_test = preproc.transform(d_test)
    proba_test = model.predict_proba(X_test)[:, 1]
    test_ids = test_df["id"].values if "id" in test_df.columns else np.arange(len(test_df))
    return model, preproc, proba_test, test_ids

# --- データ読み込み ---
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

# --- 全学習→test予測 ---
model_full, preproc_full, proba_test, test_ids = fit_full_model_and_predict_test(
    train_df=train, test_df=test, C=1.0, class_weight="balanced", random_state=42
)

# --- 保存（id, 予測確率のみ） ---
SAVE_DIR = os.path.join(ALL_FILE_DIR, "ロジスティック回帰")
os.makedirs(SAVE_DIR, exist_ok=True)

test_pred = pd.DataFrame({"id": test_ids, "prob_1": proba_test})
test_path = os.path.join(SAVE_DIR, "test_pred.csv")
test_pred.to_csv(test_path, index=False)
print(f"Saved Test Pred -> {test_path}  shape={test_pred.shape}")
print(test_pred.head())


## 〇線形SVM

### 前処理

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score, roc_auc_score

# ========= ヘルパー =========
def _safe_ratio(num, den):
    den = np.where(den == 0, 1.0, den)
    return num / den

def _bin_with_edges(x: pd.Series, edges: np.ndarray) -> np.ndarray:
    inner = edges[1:-1]
    return np.digitize(x.to_numpy(), inner, right=True).astype(int)

# ========= 共通の下ごしらえ =========
def _prepare_tabular(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()

    name_raw = d.get("name", "").fillna("").astype(str)
    desc_raw = d.get("desc", "").fillna("").astype(str)
    kw_raw   = d.get("keywords", "").fillna("").astype(str)

    d["name"] = name_raw.str.lower()
    d["desc"] = desc_raw.str.lower()
    d["keywords"] = kw_raw.str.lower()
    d["disable_communication"] = d.get("disable_communication", False).fillna(False).astype(int)
    d["country"]  = d.get("country", "UNK").fillna("UNK").astype(str)
    d["currency"] = d.get("currency", "UNK").fillna("UNK").astype(str)
    d["goal"] = pd.to_numeric(d.get("goal", 0), errors="coerce").fillna(0)

    for col_raw, col_lc, prefix in [(name_raw, d["name"], "name"), (desc_raw, d["desc"], "desc")]:
        d[f"{prefix}_len"]       = col_lc.str.split().str.len().fillna(0).astype(int)
        d[f"{prefix}_char_len"]  = col_raw.str.len().fillna(0).astype(int)
        d[f"{prefix}_avg_tok_len"] = _safe_ratio(
            d[f"{prefix}_char_len"].astype(float),
            d[f"{prefix}_len"].replace(0, 1)
        )
        d[f"{prefix}_num_digits"] = col_raw.str.count(r"\d").fillna(0).astype(int)
        d[f"{prefix}_num_punct"]  = col_raw.str.count(r"[.,]").fillna(0).astype(int)
        d[f"{prefix}_num_bang"]   = col_raw.str.count(r"!").fillna(0).astype(int)
        d[f"{prefix}_num_qmark"]  = col_raw.str.count(r"\?").fillna(0).astype(int)
        upper_cnt = col_raw.str.count(r"[A-Z]").fillna(0).astype(int)
        d[f"{prefix}_upper_ratio"] = _safe_ratio(upper_cnt, d[f"{prefix}_char_len"].replace(0, 1))

        uniq = col_lc.str.split().apply(lambda xs: len(set(xs)) if isinstance(xs, list) else 0).astype(int)
        d[f"{prefix}_uniq"] = uniq
        d[f"{prefix}_ttr"]  = _safe_ratio(uniq.astype(float), d[f"{prefix}_len"].replace(0, 1))

    d["keywords_len"]  = d["keywords"].apply(lambda s: 0 if s == "" else len(s.split("-"))).astype(int)
    d["keywords_uniq"] = d["keywords"].apply(lambda s: 0 if s == "" else len(set(s.split("-")))).astype(int)
    d["keywords_dup_rate"] = _safe_ratio(d["keywords_len"] - d["keywords_uniq"],
                                         d["keywords_len"].replace(0, 1))

    def _overlap_rate(row):
        ns = set(row["name"].split()) if row["name"] else set()
        ks = set(row["keywords"].split("-")) if row["keywords"] else set()
        inter = len(ns & ks)
        return inter / (len(ks) if len(ks) > 0 else 1)
    d["overlap_name_keywords"] = d.apply(_overlap_rate, axis=1)

    SEC_DAY = 86400.0
    for col in ["deadline", "state_changed_at", "created_at", "launched_at"]:
        d[col] = pd.to_numeric(d.get(col, np.nan), errors="coerce")

    d["dur_launch_to_deadline"] = (d["deadline"] - d["launched_at"]) / SEC_DAY
    d["dur_create_to_launch"]   = (d["launched_at"] - d["created_at"]) / SEC_DAY
    d["dur_launch_to_statechg"] = (d["state_changed_at"] - d["launched_at"]) / SEC_DAY
    d["dur_create_to_deadline"] = (d["deadline"] - d["created_at"]) / SEC_DAY

    d[["dur_launch_to_deadline", "dur_create_to_launch", "dur_launch_to_statechg", "dur_create_to_deadline"]] = (
        d[["dur_launch_to_deadline", "dur_create_to_launch", "dur_launch_to_statechg", "dur_create_to_deadline"]]
        .clip(-10, 3650).fillna(0)
    )

    launched_dt = pd.to_datetime(d["launched_at"], unit="s", errors="coerce", utc=True)
    d["launched_month"]    = launched_dt.dt.month.fillna(0).astype(int)
    d["launched_dow"]      = launched_dt.dt.dayofweek.fillna(0).astype(int)
    d["launched_hour"]     = launched_dt.dt.hour.fillna(0).astype(int)
    d["launched_week"]     = launched_dt.dt.isocalendar().week.astype("Int64").fillna(0).astype(int)
    d["launched_quarter"]  = launched_dt.dt.quarter.fillna(0).astype(int)
    d["launched_year"]     = launched_dt.dt.year.fillna(0).astype(int)
    d["launched_is_weekend"] = launched_dt.dt.dayofweek.isin([5, 6]).fillna(False).astype(int)

    d["goal_log"] = np.log1p(d["goal"].clip(lower=0))
    d["goal_per_day"] = np.where(d["dur_launch_to_deadline"] > 0,
                                 d["goal"] / d["dur_launch_to_deadline"].replace(0, 1e-6),
                                 0.0)
    d["goal_per_day_log"] = np.log1p(np.maximum(d["goal_per_day"], 0))
    d["name_over_desc"] = d["name_len"] / (1.0 + d["desc_len"])
    d["kw_over_name"]   = d["keywords_len"] / (1.0 + d["name_len"])
    d["goal_times_duration"] = d["goal_log"] * d["dur_launch_to_deadline"]

    return d

# ========= train から特徴量 =========
def make_features_from_train(
    train_df: pd.DataFrame,
    *,
    max_feat_name_word: int = 10000,
    max_feat_desc_word: int = 20000,
    max_feat_name_char: int = 3000,
    max_feat_desc_char: int = 6000,
    max_feat_kw_tfidf: int = 5000,
    min_df_word: int = 2,
    min_df_char: int = 5
):
    d = _prepare_tabular(train_df)
    y = d["final_status"].astype(int).values

    vec_name_word = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False,
                                    ngram_range=(1, 2), min_df=min_df_word, max_features=max_feat_name_word)
    vec_desc_word = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False,
                                    ngram_range=(1, 2), min_df=min_df_word, max_features=max_feat_desc_word)
    X_name_w = vec_name_word.fit_transform(d["name"])
    X_desc_w = vec_desc_word.fit_transform(d["desc"])

    vec_name_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=min_df_char, max_features=max_feat_name_char)
    vec_desc_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=min_df_char, max_features=max_feat_desc_char)
    X_name_c = vec_name_char.fit_transform(d["name"])
    X_desc_c = vec_desc_char.fit_transform(d["desc"])

    vec_kw_bow = CountVectorizer(tokenizer=lambda s: s.split("-"), token_pattern=None, lowercase=False, min_df=2, binary=True)
    vec_kw_tfidf = TfidfVectorizer(tokenizer=lambda s: s.split("-"), token_pattern=None, lowercase=False,
                                   ngram_range=(1, 2), min_df=2, max_features=max_feat_kw_tfidf)
    X_kw_bow   = vec_kw_bow.fit_transform(d["keywords"])
    X_kw_tfidf = vec_kw_tfidf.fit_transform(d["keywords"])

    q = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    goal_edges = np.unique(np.quantile(d["goal_log"].to_numpy(), q))
    dur_edges  = np.unique(np.quantile(d["dur_launch_to_deadline"].to_numpy(), q))
    d["goal_log_bin"] = _bin_with_edges(d["goal_log"], goal_edges)
    d["dur_l2d_bin"]  = _bin_with_edges(d["dur_launch_to_deadline"], dur_edges)

    cat_cols = [
        "country", "currency",
        "launched_month", "launched_dow", "launched_hour",
        "launched_week", "launched_quarter", "launched_year",
        "goal_log_bin", "dur_l2d_bin"
    ]
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    X_cat = ohe.fit_transform(d[cat_cols])

    freq_maps = {}
    for col in ["country", "currency"]:
        vc = d[col].value_counts(normalize=True)
        freq_maps[col] = vc.to_dict()
        d[f"{col}_freq"] = d[col].map(freq_maps[col]).fillna(0.0).astype(float)

    num_cols = [
        "goal_log", "goal_per_day", "goal_per_day_log", "goal_times_duration",
        "disable_communication",
        "name_len", "name_char_len", "name_avg_tok_len", "name_num_digits", "name_num_punct", "name_num_bang", "name_num_qmark", "name_upper_ratio", "name_uniq", "name_ttr",
        "desc_len", "desc_char_len", "desc_avg_tok_len", "desc_num_digits", "desc_num_punct", "desc_num_bang", "desc_num_qmark", "desc_upper_ratio", "desc_uniq", "desc_ttr",
        "keywords_len", "keywords_uniq", "keywords_dup_rate", "overlap_name_keywords",
        "dur_launch_to_deadline", "dur_create_to_launch", "dur_launch_to_statechg", "dur_create_to_deadline",
        "launched_is_weekend",
        "name_over_desc", "kw_over_name",
        "country_freq", "currency_freq"
    ]
    scaler = StandardScaler(with_mean=False)
    X_num = csr_matrix(scaler.fit_transform(d[num_cols].astype("float32")))

    X_train = hstack([X_name_w, X_desc_w, X_name_c, X_desc_c, X_kw_bow, X_kw_tfidf, X_cat, X_num], format="csr")

    preprocessors = {
        "vec_name_word": vec_name_word, "vec_desc_word": vec_desc_word,
        "vec_name_char": vec_name_char, "vec_desc_char": vec_desc_char,
        "vec_kw_bow": vec_kw_bow, "vec_kw_tfidf": vec_kw_tfidf,
        "ohe": ohe, "scaler": scaler,
        "cat_cols": cat_cols, "num_cols": num_cols,
        "freq_maps": freq_maps,
        "goal_edges": goal_edges, "dur_edges": dur_edges,
    }
    return X_train, y, preprocessors

# ========= test から特徴量 =========
def make_features_from_test(test_df: pd.DataFrame, preprocessors: dict):
    d = _prepare_tabular(test_df)

    vec_name_word = preprocessors["vec_name_word"]
    vec_desc_word = preprocessors["vec_desc_word"]
    vec_name_char = preprocessors["vec_name_char"]
    vec_desc_char = preprocessors["vec_desc_char"]
    vec_kw_bow    = preprocessors["vec_kw_bow"]
    vec_kw_tfidf  = preprocessors["vec_kw_tfidf"]

    goal_edges = preprocessors["goal_edges"]
    dur_edges  = preprocessors["dur_edges"]
    d["goal_log"] = np.log1p(d["goal"].clip(lower=0))
    d["goal_log_bin"] = _bin_with_edges(d["goal_log"], goal_edges)
    d["dur_l2d_bin"]  = _bin_with_edges(d["dur_launch_to_deadline"], dur_edges)

    ohe = preprocessors["ohe"]
    cat_cols = preprocessors["cat_cols"]
    X_cat = ohe.transform(d[cat_cols])

    freq_maps = preprocessors["freq_maps"]
    for col in ["country", "currency"]:
        d[f"{col}_freq"] = d[col].map(freq_maps[col]).fillna(0.0).astype(float)

    num_cols = preprocessors["num_cols"]
    scaler = preprocessors["scaler"]
    X_num = csr_matrix(scaler.transform(d[num_cols].astype("float32")))

    X_name_w = vec_name_word.transform(d["name"])
    X_desc_w = vec_desc_word.transform(d["desc"])
    X_name_c = vec_name_char.transform(d["name"])
    X_desc_c = vec_desc_char.transform(d["desc"])
    X_kw_bow   = vec_kw_bow.transform(d["keywords"])
    X_kw_tfidf = vec_kw_tfidf.transform(d["keywords"])

    X_test = hstack([X_name_w, X_desc_w, X_name_c, X_desc_c, X_kw_bow, X_kw_tfidf, X_cat, X_num], format="csr")
    return X_test

# ========= 学習器 =========
def train_linear_svm(
    X_train, y,
    C: float = 1.0,
    class_weight="balanced",
    random_state: int = 42,
    calibrate: bool = True,
    method: str = "sigmoid",
    cv: int = 5
):
    base = LinearSVC(C=C, class_weight=class_weight, random_state=random_state, max_iter=5000)
    if calibrate:
        model = CalibratedClassifierCV(base, method=method, cv=cv)
        model.fit(X_train, y)
        return model
    else:
        return base.fit(X_train, y)

def predict_with_trained_model(model, X_test, return_proba: bool = True):
    y_pred = model.predict(X_test)
    if return_proba and hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
        return y_pred, proba
    return y_pred

# ========= CVと閾値探索 =========
def find_best_threshold(y_true: np.ndarray, proba: np.ndarray, metric: str = "f1"):
    y_true = np.asarray(y_true).astype(int)
    proba = np.asarray(proba).astype(float)
    grid = np.linspace(0.0, 1.0, 1001)
    best_thr, best_score = 0.5, -1.0
    for t in grid:
        pred = (proba >= t).astype(int)
        if metric == "f1":
            score = f1_score(y_true, pred, zero_division=0)
        elif metric == "accuracy":
            score = accuracy_score(y_true, pred)
        elif metric == "balanced_accuracy":
            score = balanced_accuracy_score(y_true, pred)
        else:
            raise ValueError("metric must be one of {'f1','accuracy','balanced_accuracy'}")
        if score > best_score:
            best_score, best_thr = score, t
    return float(best_thr), float(best_score)

def run_cv_train_predict(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    n_splits: int = 5,
    random_state: int = 42,
    threshold_metric: str = "f1",
    svm_C: float = 1.0,
    calibrate: bool = True,
    calib_method: str = "sigmoid",
    calib_cv: int = 3
):
    y_all = train_df["final_status"].astype(int).to_numpy()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_proba = np.zeros(len(train_df), dtype=float)
    test_proba_folds = []
    fold_scores = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(np.zeros(len(y_all)), y_all), start=1):
        tr_df = train_df.iloc[tr_idx].reset_index(drop=True)
        va_df = train_df.iloc[va_idx].reset_index(drop=True)

        X_tr, y_tr, preproc = make_features_from_train(tr_df)
        X_va = make_features_from_test(va_df, preproc)
        y_va = va_df["final_status"].astype(int).to_numpy()

        model = train_linear_svm(
            X_tr, y_tr,
            C=svm_C,
            class_weight="balanced",
            random_state=random_state + fold,
            calibrate=calibrate,
            method=calib_method,
            cv=calib_cv
        )

        if hasattr(model, "predict_proba"):
            va_proba = model.predict_proba(X_va)[:, 1]
        else:
            margins = model.decision_function(X_va)
            va_proba = 1.0 / (1.0 + np.exp(-margins))
        oof_proba[va_idx] = va_proba

        try:
            fold_auc = roc_auc_score(y_va, va_proba)
        except ValueError:
            fold_auc = np.nan
        fold_scores.append({"fold": fold, "val_auc": fold_auc, "n_tr": len(tr_idx), "n_va": len(va_idx)})

        X_te_fold = make_features_from_test(test_df, preproc)
        if hasattr(model, "predict_proba"):
            te_proba_fold = model.predict_proba(X_te_fold)[:, 1]
        else:
            te_marg = model.decision_function(X_te_fold)
            te_proba_fold = 1.0 / (1.0 + np.exp(-te_marg))
        test_proba_folds.append(te_proba_fold)

        del X_tr, X_va, X_te_fold

    best_thr, best_thr_score = find_best_threshold(y_all, oof_proba, metric=threshold_metric)
    oof_pred = (oof_proba >= best_thr).astype(int)
    try:
        oof_auc = roc_auc_score(y_all, oof_proba)
    except ValueError:
        oof_auc = np.nan

    test_proba = np.mean(np.vstack(test_proba_folds), axis=0)
    test_pred = (test_proba >= best_thr).astype(int)

    result = {
        "oof_proba": oof_proba,
        "oof_pred": oof_pred,
        "best_threshold": best_thr,
        "best_threshold_score": best_thr_score,
        "oof_auc": oof_auc,
        "test_proba": test_proba,
        "test_pred": test_pred,
        "fold_scores": fold_scores,
    }
    return result


### 学習

In [None]:
# 出力ディレクトリの用意
SVM_DIR = os.path.join(ALL_FILE_DIR, "線形SVM")
os.makedirs(SVM_DIR, exist_ok=True)

# データ読み込み
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

# CV実行（確率が必要なので calibrate=True 推奨）
cv_out = run_cv_train_predict(
    train_df=train,
    test_df=test,
    n_splits=5,
    random_state=42,
    threshold_metric="f1",   # "accuracy" / "balanced_accuracy" でもOK
    svm_C=1.0,
    calibrate=True,
    calib_method="sigmoid",
    calib_cv=3
)

# OOF列を追加（参照用）
train["oof_proba"] = cv_out["oof_proba"]
train["oof_pred"]  = cv_out["oof_pred"]

# 保存用に必要列だけ抽出：id・確率・正解ラベル（= final_status）
train_oof = train[["id", "oof_proba", "final_status"]].copy()
train_oof.to_csv(os.path.join(SVM_DIR, "oof.csv"), index=False)

# 参考出力
print("Best threshold:", cv_out["best_threshold"])
print("Best CV metric score:", cv_out["best_threshold_score"])
print("OOF ROC-AUC:", cv_out["oof_auc"])
print("Fold AUCs:", [f["val_auc"] for f in cv_out["fold_scores"]])
print(train_oof.head())


### 予測

In [None]:
# ②で得た cv_out と test を利用して保存
test_pred = pd.DataFrame({
    "id": test["id"],
    "final_status_proba": cv_out["test_proba"]
})
test_pred.to_csv(os.path.join(SVM_DIR, "test_pred.csv"), index=False)
print(test_pred.head())
print("保存先:", SVM_DIR)


# アンサンブル

## oof.csv読み込み

In [45]:
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve

BASE = Path(ALL_FILE_DIR)

# 使用する各モデルのサブフォルダ名
SUBDIRS = {
    "lgb": "LightGBM",
    "xgb": "XGBoost",
    "cat": "CatBoost",
    "ext": "ExtraTrees",
    "log": "ロジスティック回帰",
    "svm": "線形SVM",
}

FEATURE_ORDER = ["lgb_oof", "xgb_oof", "cat_oof", "ext_oof", "log_oof", "svm_oof"]

# =========================
# ユーティリティ
# =========================
def best_thr_f1(y_true: np.ndarray, prob: np.ndarray) -> Tuple[float, float]:
    pr, rc, th = precision_recall_curve(y_true, prob)
    f1 = 2 * pr * rc / (pr + rc + 1e-15)
    f1 = f1[:-1]  # thresholds と長さ合わせ
    idx = int(np.argmax(f1))
    return float(th[idx]), float(f1[idx])


def _read_csv(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"ファイルが見つかりません: {path}")
    return pd.read_csv(path)

def step1_read_oof() -> pd.DataFrame:
    def read_oof_one(key: str, subdir: str) -> pd.Series:
        df = _read_csv(BASE / subdir / "oof.csv")
        # 期待カラム: 'proba' と 'final_status'（final_status は1回だけ採用）
        if "proba" not in df.columns:
            raise KeyError(f"{subdir}/oof.csv に 'proba' 列がありません。")
        return df["proba"]

    # 予測確率の結合
    prob_cols = []
    for key in ["lgb", "xgb", "cat", "ext", "log", "svm"]:
        s = read_oof_one(key, SUBDIRS[key]).astype(float).rename(FEATURE_ORDER[len(prob_cols)])
        prob_cols.append(s)

    # ラベル（final_status）はどれか1つから取得
    any_oof = _read_csv(BASE / SUBDIRS["lgb"] / "oof.csv")
    if "final_status" not in any_oof.columns:
        raise KeyError("oof.csv に 'final_status' 列が見つかりません。")

    oof_all = pd.concat(prob_cols + [any_oof["final_status"].astype(int)], axis=1)
    oof_all.columns = FEATURE_ORDER + ["final_status"]
    print(f"[STEP1] OOF 読込完了: shape={oof_all.shape}, cols={list(oof_all.columns)}")
    return oof_all

oof_all = step1_read_oof()

[STEP1] OOF 読込完了: shape=(75690, 7), cols=['lgb_oof', 'xgb_oof', 'cat_oof', 'ext_oof', 'log_oof', 'svm_oof', 'final_status']


## メタモデル学習

In [46]:
def step2_train_meta_from_oof(
    oof_df: pd.DataFrame,
    feature_cols: List[str] = FEATURE_ORDER,
    n_splits: int = 6,
    random_state: int = 42,
) -> Dict:
    X = oof_df[feature_cols].to_numpy(dtype=float)
    y = oof_df["final_status"].astype(int).to_numpy()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_meta = np.zeros(len(y), dtype=float)
    coefs, intercepts = [], []

    for tr, va in skf.split(X, y):
        clf = LogisticRegression(
            penalty="l2", solver="liblinear", C=1.0,
            class_weight="balanced", max_iter=1000, random_state=random_state
        )
        clf.fit(X[tr], y[tr])
        oof_meta[va] = clf.predict_proba(X[va])[:, 1]
        coefs.append(clf.coef_.ravel())
        intercepts.append(clf.intercept_[0])

    # OOF評価 & 最適閾値（F1最大）
    auc = roc_auc_score(y, oof_meta)
    thr, f1b = best_thr_f1(y, oof_meta)
    acc = accuracy_score(y, (oof_meta >= thr).astype(int))
    print(f"[META] OOF AUC={auc:.4f}  F1@best={f1b:.4f}  ACC@best={acc:.4f}  thr={thr:.3f}")

    # 全データで最終モデル
    final_clf = LogisticRegression(
        penalty="l2", solver="liblinear", C=1.0,
        class_weight="balanced", max_iter=1000, random_state=random_state
    )
    final_clf.fit(X, y)

    meta = {
        "model": final_clf,
        "feature_cols": feature_cols,
        "oof_meta_proba": oof_meta,
        "best_threshold": float(thr),
        "metrics_oof": {"auc": float(auc), "f1@best": float(f1b), "acc@best": float(acc)},
        "avg_coef": np.mean(coefs, axis=0).tolist(),
        "avg_intercept": float(np.mean(intercepts)),
    }
    print("[STEP2] メタ学習完了")
    return meta

meta = step2_train_meta_from_oof(oof_all, feature_cols=FEATURE_ORDER)

[META] OOF AUC=0.7937  F1@best=0.6199  ACC@best=0.6982  thr=0.473
[STEP2] メタ学習完了


## test_pred.csv読み込み

In [47]:
def step3_read_test_pred() -> Tuple[pd.DataFrame, pd.Series]:
    def read_pred_one(key: str, subdir: str) -> pd.DataFrame:
        df = _read_csv(BASE / subdir / "test_pred.csv")
        if "proba" not in df.columns:
            raise KeyError(f"{subdir}/test_pred.csv に 'proba' 列がありません。")
        return df

    # 各モデルの確率
    preds: Dict[str, pd.DataFrame] = {k: read_pred_one(k, v) for k, v in SUBDIRS.items()}
    # test_meta_df 構築
    meta_df = pd.DataFrame({
        "lgb_oof": preds["lgb"]["proba"].astype(float),
        "xgb_oof": preds["xgb"]["proba"].astype(float),
        "cat_oof": preds["cat"]["proba"].astype(float),
        "ext_oof": preds["ext"]["proba"].astype(float),
        "log_oof": preds["log"]["proba"].astype(float),
        "svm_oof": preds["svm"]["proba"].astype(float),
    })

    # id を取得
    id_series: Optional[pd.Series] = None
    test_csv = BASE / "test.csv"
    if test_csv.exists():
        test_df = pd.read_csv(test_csv)
        if "id" in test_df.columns and len(test_df) == len(meta_df):
            id_series = test_df["id"]
            print("[STEP3] id: test.csv から取得")
    if id_series is None:
        # test_pred 内の id を探す（最初に見つかったものを使う）
        for k in ["lgb", "xgb", "cat", "ext", "log", "svm"]:
            if "id" in preds[k].columns and len(preds[k]) == len(meta_df):
                id_series = preds[k]["id"]
                print(f"[STEP3] id: {SUBDIRS[k]}/test_pred.csv から取得")
                break
    if id_series is None:
        # 最後の手段：連番
        id_series = pd.Series(np.arange(len(meta_df)), name="id")
        print("[STEP3] id: 該当が無かったため 0..N-1 の連番を採用")

    print(f"[STEP3] test_meta_df 読込完了: shape={meta_df.shape}")
    return meta_df, id_series

test_meta_df, id_series = step3_read_test_pred()

[STEP3] id: LightGBM/test_pred.csv から取得
[STEP3] test_meta_df 読込完了: shape=(32439, 6)


## 最終予測

In [48]:
def step4_predict_and_save(meta: Dict, test_meta_df: pd.DataFrame, id_series: pd.Series) -> Path:
    Xtest = test_meta_df[meta["feature_cols"]].to_numpy(dtype=float)
    prob = meta["model"].predict_proba(Xtest)[:, 1]
    thr = meta["best_threshold"]
    pred = (prob >= thr).astype(int)

    submit_df = pd.DataFrame({"id": id_series.to_numpy(), "pred": pred.astype(int)})
    out_path = BASE / "submission.csv"

    # ★ ヘッダー無しで保存（index=False, header=False）
    submit_df.to_csv(out_path, index=False, header=False)
    print(f"[STEP4] submission.csv 保存完了（ヘッダー無し）: {out_path}")
    print({"best_threshold_used": thr, "oof_metrics": meta["metrics_oof"]})
    return out_path

step4_predict_and_save(meta, test_meta_df, id_series)

[STEP4] submission.csv 保存完了（ヘッダー無し）: C:\Users\imasu\OneDrive\デスクトップ\コンペ\submission.csv
{'best_threshold_used': 0.4725619415164519, 'oof_metrics': {'auc': 0.7937287264662982, 'f1@best': 0.6198978318385271, 'acc@best': 0.6982031972519488}}


WindowsPath('C:/Users/imasu/OneDrive/デスクトップ/コンペ/submission.csv')