In [43]:
SEED = 42

# === 時短フラグ ===
FAST_TUNE = True
TUNE_FRAC = 0.60
N_SPLITS_TUNE = 3

# === イテレーション/試行数 ===
EARLY_STOP_TUNE = 100
EARLY_STOP_FULL = 200
N_TRIALS_TUNE = 20
N_TRIALS_REFINE = 10

OPTUNA_TIMEOUT_SEC = 1800

DATA_DIR = r"G:\マイドライブ\MUFJ_competition_2025\data"
OUT_DIR  = r"C:\Users\koshihiramatsu\projects\MUFJ_competition_2025\model-proposal_A_v2"

# しきい値の固定（Noneで自動に戻す）
SUBMIT_THRESHOLD_OVERRIDE = 0.315


In [10]:
import os, re, json, math, warnings, itertools, textwrap
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils import check_random_state

from catboost import CatBoostClassifier, Pool
import optuna


In [11]:
from typing import Optional, Tuple, List

def detect_submit_sep(sample_submit_path: str) -> str:
    # カンマ/タブ/空白の順で試す。列数=2なら採用。
    for sep in [",", "\t", r"\s+"]:
        try:
            df = pd.read_csv(sample_submit_path, header=None, sep=sep, engine="python")
            if df.shape[1] == 2:
                return sep
        except Exception:
            pass
    # デフォルト: カンマ
    return ","

def is_binary(col: pd.Series) -> bool:
    vals = pd.unique(col.dropna())
    return set(vals).issubset({0,1})

def detect_columns(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[str, str]:
    # 目的変数: train にのみ存在し、かつ {0,1} のどれか
    only_in_train = [c for c in train.columns if c not in test.columns]
    candid_tgt = [c for c in only_in_train if is_binary(train[c])]
    if len(candid_tgt) == 1:
        target_col = candid_tgt[0]
    else:
        # フォールバック: 名前に label/target/default が入っていて2値
        name_hits = [c for c in train.columns if any(k in c.lower() for k in ["label", "target", "default", "loanstatus"])]
        name_hits = [c for c in name_hits if c in train.columns and is_binary(train[c])]
        if len(name_hits) >= 1:
            target_col = name_hits[0]
        else:
            raise ValueError("目的変数を自動検出できない。TARGET_COL を手動指定して。")

    # ID列: train&test 共通 かつ 一意/整数っぽい/名前に id を含む を優先
    common = [c for c in test.columns if c in train.columns]
    # 1) 名前に 'id'
    id_like = [c for c in common if 'id' in c.lower()]
    def unique_int_like(df, c):
        s = df[c]
        nunique = s.nunique(dropna=True)
        return (nunique == len(s)) and (np.issubdtype(s.dropna().dtype, np.integer) or np.issubdtype(s.dropna().dtype, np.number))
    for c in id_like + common:
        if unique_int_like(test, c):
            id_col = c
            break
    else:
        # だめなら test の最左列
        id_col = test.columns[0]

    return target_col, id_col

def next_version_number(out_dir: str) -> int:
    os.makedirs(out_dir, exist_ok=True)
    pattern = re.compile(r"submission_A_v(\d+)\.csv$")
    ns = []
    for f in os.listdir(out_dir):
        m = pattern.match(f)
        if m:
            ns.append(int(m.group(1)))
    return (max(ns) + 1) if ns else 1


In [12]:
train_path = os.path.join(DATA_DIR, "train.csv")
test_path  = os.path.join(DATA_DIR, "test.csv")
sample_path= os.path.join(DATA_DIR, "sample_submit.csv")

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

SUBMIT_SEP = detect_submit_sep(sample_path)

TARGET_COL, ID_COL = detect_columns(train, test)

print("TARGET_COL:", TARGET_COL)
print("ID_COL:", ID_COL)
print("train shape:", train.shape, "test shape:", test.shape)
print("target dist:", train[TARGET_COL].value_counts(normalize=True).to_dict())

# 目的変数・ID の存在確認
assert TARGET_COL in train.columns
assert ID_COL in test.columns and ID_COL in train.columns


TARGET_COL: LoanStatus
ID_COL: id
train shape: (7552, 16) test shape: (7552, 15)
target dist: {0: 0.8723516949152542, 1: 0.12764830508474576}


In [13]:
# 説明変数
features = [c for c in train.columns if c not in [TARGET_COL]]
# CatBoostはカテゴリ・数値を混在でそのまま扱えるが、objectは文字列化して欠損は 'MISSING' に寄せる
cat_cols = [c for c in features if train[c].dtype == 'object' or pd.api.types.is_categorical_dtype(train[c])]

def prep_df(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in cat_cols:
        out[c] = out[c].astype(str).fillna("MISSING")
    return out

X_train = prep_df(train[features])
y_train = train[TARGET_COL].astype(int).values
X_test  = prep_df(test[features])

cat_features_idx = [X_train.columns.get_loc(c) for c in cat_cols]

print("n_features:", len(features), "| n_categoricals:", len(cat_cols))


n_features: 15 | n_categoricals: 6


In [14]:
from sklearn.model_selection import StratifiedKFold

skf_full = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
skf_tune = StratifiedKFold(n_splits=N_SPLITS_TUNE, shuffle=True, random_state=SEED)


In [15]:
# クラス比を保ってサブセットを作る（FAST_TUNE時のみ）
if FAST_TUNE:
    # 層化抽出
    from sklearn.model_selection import train_test_split
    idx_all = np.arange(len(X_train))
    idx_tune, _ = train_test_split(
        idx_all, train_size=TUNE_FRAC, stratify=y_train, random_state=SEED
    )
    X_tune = X_train.iloc[idx_tune].reset_index(drop=True)
    y_tune = y_train[idx_tune]
    print(f"TUNE SUBSET: {len(X_tune)} rows ({TUNE_FRAC*100:.0f}%)")
else:
    X_tune, y_tune = X_train, y_train


TUNE SUBSET: 4531 rows (60%)


In [16]:
# 再利用できるよう、foldごとにPoolを前計算しておく（作成コストと前処理のばらつきを削減）
def build_pools(X, y, skf, cat_idx):
    pools = []
    for tr_idx, va_idx in skf.split(X, y):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        pools.append((
            Pool(X_tr, y_tr, cat_features=cat_idx),
            Pool(X_va, y_va, cat_features=cat_idx),
            va_idx
        ))
    return pools

pools_tune = build_pools(X_tune, y_tune, skf_tune, cat_features_idx)
pools_full = build_pools(X_train, y_train, skf_full, cat_features_idx)


In [17]:
def eval_oof_f1(probs, y_true):
    thresholds = np.linspace(0.05, 0.95, 181)
    f1s = [f1_score(y_true, (probs >= t).astype(int)) for t in thresholds]
    j = int(np.argmax(f1s))
    return f1s[j], float(thresholds[j])

def make_objective(pools, y_all, early_stop, iterations):
    def objective(trial):
        # 探索空間をやや狭くして収束を早める
        params = {
            "iterations": iterations,
            "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.15, log=True),
            "depth": trial.suggest_int("depth", 4, 8),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e1, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
            "random_strength": trial.suggest_float("random_strength", 0.0, 2.0),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.7, 3.0),
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "random_seed": SEED,
            "verbose": False,
            "thread_count": -1,
            "use_best_model": True,
            "allow_writing_files": False,
            # GPUが使えるなら次行を有効化（失敗したらコメントアウト）
            # "task_type": "GPU", "devices": "0",
        }

        oof = np.zeros(len(y_all), dtype=float)
        for train_pool, valid_pool, va_idx in pools:
            model = CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=early_stop)
            oof[va_idx] = model.predict_proba(valid_pool)[:, 1]

        f1, th = eval_oof_f1(oof[oof > 0], y_all[oof > 0])  # チューニングsubsetのOOF部分のみで算出
        trial.set_user_attr("best_threshold", th)
        return f1
    return objective


In [18]:
sampler = optuna.samplers.TPESampler(seed=SEED, n_startup_trials=8, multivariate=True, group=True)
pruner  = optuna.pruners.SuccessiveHalvingPruner(min_resource=50, reduction_factor=3, min_early_stopping_rate=0)

study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)
obj = make_objective(
    pools=pools_tune,
    y_all=y_tune,
    early_stop=EARLY_STOP_TUNE,
    iterations=3000,  # チューニング時は小さめ
)
study.optimize(obj, n_trials=N_TRIALS_TUNE, timeout=OPTUNA_TIMEOUT_SEC, show_progress_bar=True)

# 余力があれば絞り込み（top数点の近傍を探索）
best_params = study.best_trial.params.copy()
best_th = study.best_trial.user_attrs["best_threshold"]
best_score = study.best_value
print("FAST TUNE  best_f1:", round(best_score, 6), "th:", round(best_th, 4), "params:", best_params)


[I 2025-08-10 14:58:29,482] A new study created in memory with name: no-name-cca92d46-4424-47ab-ba80-b9b2f27f4808
Best trial: 0. Best value: 0.591433:   5%|▌         | 1/20 [00:31<10:05, 31.85s/it, 31.85/1800 seconds]

[I 2025-08-10 14:59:01,331] Trial 0 finished with value: 0.5914332784184514 and parameters: {'learning_rate': 0.054816785328198704, 'depth': 8, 'l2_leaf_reg': 1.5702970884055387, 'bagging_temperature': 2.993292420985183, 'random_strength': 0.31203728088487304, 'subsample': 0.7467983561008608, 'scale_pos_weight': 0.8335923079868587}. Best is trial 0 with value: 0.5914332784184514.


Best trial: 1. Best value: 0.611848:  10%|█         | 2/20 [00:53<07:46, 25.93s/it, 53.64/1800 seconds]

[I 2025-08-10 14:59:23,123] Trial 1 finished with value: 0.6118479221927497 and parameters: {'learning_rate': 0.12093510864110771, 'depth': 7, 'l2_leaf_reg': 1.3311216080736887, 'bagging_temperature': 0.10292247147901223, 'random_strength': 1.9398197043239886, 'subsample': 0.9497327922401265, 'scale_pos_weight': 1.188379954560035}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  15%|█▌        | 3/20 [01:20<07:31, 26.54s/it, 80.91/1800 seconds]

[I 2025-08-10 14:59:50,389] Trial 2 finished with value: 0.6038910505836576 and parameters: {'learning_rate': 0.040198648566549054, 'depth': 4, 'l2_leaf_reg': 0.08179499475211674, 'bagging_temperature': 2.6237821581611893, 'random_strength': 0.8638900372842315, 'subsample': 0.7873687420594125, 'scale_pos_weight': 2.1072616578614727}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  20%|██        | 4/20 [01:53<07:43, 28.99s/it, 113.66/1800 seconds]

[I 2025-08-10 15:00:23,143] Trial 3 finished with value: 0.605833956619297 and parameters: {'learning_rate': 0.03755115322842063, 'depth': 5, 'l2_leaf_reg': 0.1256277350380703, 'bagging_temperature': 2.28034992108518, 'random_strength': 1.5703519227860272, 'subsample': 0.7599021346475079, 'scale_pos_weight': 1.8827392083513066}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  25%|██▌       | 5/20 [02:07<05:53, 23.57s/it, 127.61/1800 seconds]

[I 2025-08-10 15:00:37,090] Trial 4 finished with value: 0.6080691642651297 and parameters: {'learning_rate': 0.07783972061633886, 'depth': 4, 'l2_leaf_reg': 0.6647135865318028, 'bagging_temperature': 0.8526206184364576, 'random_strength': 0.13010318597055903, 'subsample': 0.984665661176, 'scale_pos_weight': 2.9209536760714867}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  30%|███       | 6/20 [02:19<04:36, 19.72s/it, 139.86/1800 seconds]

[I 2025-08-10 15:00:49,340] Trial 5 finished with value: 0.5984375 and parameters: {'learning_rate': 0.1101962340138614, 'depth': 5, 'l2_leaf_reg': 0.019634341572933336, 'bagging_temperature': 3.4211651325607844, 'random_strength': 0.8803049874792026, 'subsample': 0.7366114704534336, 'scale_pos_weight': 1.8389068932559214}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  35%|███▌      | 7/20 [02:42<04:28, 20.64s/it, 162.38/1800 seconds]

[I 2025-08-10 15:01:11,866] Trial 6 finished with value: 0.5945165945165946 and parameters: {'learning_rate': 0.031707193242141093, 'depth': 8, 'l2_leaf_reg': 0.059750279999602945, 'bagging_temperature': 3.31261142176991, 'random_strength': 0.6234221521788219, 'subsample': 0.8560204063533432, 'scale_pos_weight': 1.957433642489543}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  40%|████      | 8/20 [03:21<05:19, 26.59s/it, 201.71/1800 seconds]

[I 2025-08-10 15:01:51,196] Trial 7 finished with value: 0.6022641509433962 and parameters: {'learning_rate': 0.04039512666823938, 'depth': 8, 'l2_leaf_reg': 2.1154290797261215, 'bagging_temperature': 4.697494707820946, 'random_strength': 1.7896547008552977, 'subsample': 0.8793699936433255, 'scale_pos_weight': 2.820310740553168}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  45%|████▌     | 9/20 [03:39<04:20, 23.72s/it, 219.14/1800 seconds]

[I 2025-08-10 15:02:08,620] Trial 8 finished with value: 0.5969275786393563 and parameters: {'learning_rate': 0.1405200625953881, 'depth': 6, 'l2_leaf_reg': 5.095071061056762, 'bagging_temperature': 0.26696131796423683, 'random_strength': 1.549124519811738, 'subsample': 0.8594249939696944, 'scale_pos_weight': 1.246329998629404}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  50%|█████     | 10/20 [04:02<03:54, 23.48s/it, 242.09/1800 seconds]

[I 2025-08-10 15:02:31,570] Trial 9 finished with value: 0.6068965517241379 and parameters: {'learning_rate': 0.11734022068569182, 'depth': 8, 'l2_leaf_reg': 5.1729540340743325, 'bagging_temperature': 1.3735538453643197, 'random_strength': 1.438297245481995, 'subsample': 0.9714503920763212, 'scale_pos_weight': 1.686738972090934}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 1. Best value: 0.611848:  55%|█████▌    | 11/20 [04:22<03:23, 22.59s/it, 262.66/1800 seconds]

[I 2025-08-10 15:02:52,137] Trial 10 finished with value: 0.5849189570119803 and parameters: {'learning_rate': 0.11449573866978238, 'depth': 6, 'l2_leaf_reg': 0.04863208345581101, 'bagging_temperature': 0.7853497842533179, 'random_strength': 1.8553822991238997, 'subsample': 0.8909556519845516, 'scale_pos_weight': 1.4658602071694027}. Best is trial 1 with value: 0.6118479221927497.


Best trial: 11. Best value: 0.616333:  60%|██████    | 12/20 [04:49<03:10, 23.77s/it, 289.14/1800 seconds]

[I 2025-08-10 15:03:18,616] Trial 11 finished with value: 0.6163328197226502 and parameters: {'learning_rate': 0.047940254989892885, 'depth': 4, 'l2_leaf_reg': 0.8469509872099742, 'bagging_temperature': 0.33124353469809986, 'random_strength': 0.41895376324721256, 'subsample': 0.9896831195949556, 'scale_pos_weight': 2.6498824785986583}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  65%|██████▌   | 13/20 [05:16<02:53, 24.82s/it, 316.37/1800 seconds]

[I 2025-08-10 15:03:45,851] Trial 12 finished with value: 0.6074766355140186 and parameters: {'learning_rate': 0.039555204952676566, 'depth': 4, 'l2_leaf_reg': 0.17457152885399302, 'bagging_temperature': 2.4494337488959634, 'random_strength': 0.6339656121983112, 'subsample': 0.9645791201416001, 'scale_pos_weight': 2.4965312247595093}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  70%|███████   | 14/20 [05:52<02:50, 28.34s/it, 352.83/1800 seconds]

[I 2025-08-10 15:04:22,313] Trial 13 finished with value: 0.6070878274268104 and parameters: {'learning_rate': 0.03207080982450353, 'depth': 6, 'l2_leaf_reg': 3.7201370765695967, 'bagging_temperature': 0.35443417875650557, 'random_strength': 0.26230591077722676, 'subsample': 0.9113573205044627, 'scale_pos_weight': 2.8678346005919453}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  75%|███████▌  | 15/20 [06:26<02:29, 29.82s/it, 386.07/1800 seconds]

[I 2025-08-10 15:04:55,557] Trial 14 finished with value: 0.6050129645635264 and parameters: {'learning_rate': 0.04624264979307338, 'depth': 5, 'l2_leaf_reg': 4.594445376407278, 'bagging_temperature': 0.589516842634676, 'random_strength': 0.8940804297855756, 'subsample': 0.87191472925697, 'scale_pos_weight': 1.3893551953095746}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  80%|████████  | 16/20 [06:49<01:51, 27.93s/it, 409.64/1800 seconds]

[I 2025-08-10 15:05:19,118] Trial 15 finished with value: 0.6120826709062003 and parameters: {'learning_rate': 0.07611076429158616, 'depth': 6, 'l2_leaf_reg': 1.0564190915518583, 'bagging_temperature': 0.8668908588310074, 'random_strength': 1.8499535502022153, 'subsample': 0.983213595873115, 'scale_pos_weight': 0.8122240972441206}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  85%|████████▌ | 17/20 [07:07<01:14, 24.94s/it, 427.63/1800 seconds]

[I 2025-08-10 15:05:37,109] Trial 16 finished with value: 0.6049204052098408 and parameters: {'learning_rate': 0.09658987375968167, 'depth': 4, 'l2_leaf_reg': 0.06957754858123973, 'bagging_temperature': 2.439820083435086, 'random_strength': 1.856421032641976, 'subsample': 0.965971693905709, 'scale_pos_weight': 0.8225022394639633}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  90%|█████████ | 18/20 [07:28<00:47, 23.64s/it, 448.24/1800 seconds]

[I 2025-08-10 15:05:57,725] Trial 17 finished with value: 0.6080760095011877 and parameters: {'learning_rate': 0.04826986509482155, 'depth': 4, 'l2_leaf_reg': 1.1940730890525697, 'bagging_temperature': 0.892188275865698, 'random_strength': 0.05852979307125716, 'subsample': 0.9388311774030238, 'scale_pos_weight': 1.8122195117886022}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 11. Best value: 0.616333:  95%|█████████▌| 19/20 [07:56<00:25, 25.10s/it, 476.73/1800 seconds]

[I 2025-08-10 15:06:26,217] Trial 18 finished with value: 0.602291325695581 and parameters: {'learning_rate': 0.05042332206553957, 'depth': 7, 'l2_leaf_reg': 0.14954784869817256, 'bagging_temperature': 1.4798870805924518, 'random_strength': 1.611553124939411, 'subsample': 0.9981357296646711, 'scale_pos_weight': 1.130447511416614}. Best is trial 11 with value: 0.6163328197226502.


Best trial: 19. Best value: 0.617021: 100%|██████████| 20/20 [08:25<00:00, 25.25s/it, 505.10/1800 seconds]

[I 2025-08-10 15:06:54,579] Trial 19 finished with value: 0.6170212765957447 and parameters: {'learning_rate': 0.06116108646095842, 'depth': 5, 'l2_leaf_reg': 5.478690083944246, 'bagging_temperature': 0.8884344994647464, 'random_strength': 1.865589408671679, 'subsample': 0.9516049519127788, 'scale_pos_weight': 1.1386783078556455}. Best is trial 19 with value: 0.6170212765957447.
FAST TUNE  best_f1: 0.617021 th: 0.345 params: {'learning_rate': 0.06116108646095842, 'depth': 5, 'l2_leaf_reg': 5.478690083944246, 'bagging_temperature': 0.8884344994647464, 'random_strength': 1.865589408671679, 'subsample': 0.9516049519127788, 'scale_pos_weight': 1.1386783078556455}





In [None]:
# ==== 9

params = dict(best_params)
params.update({
    "iterations": 10000,
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": SEED,
    "verbose": False,
    "thread_count": -1,
    "use_best_model": True,
    "allow_writing_files": False,
    # "task_type": "GPU", "devices": "0",  # 使えるなら
})

oof = np.zeros(len(X_train), dtype=float)
test_prob = np.zeros(len(X_test), dtype=float)
fold_f1s = []

for fold, (train_pool, valid_pool, va_idx) in enumerate(pools_full, 1):
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=EARLY_STOP_FULL)
    oof[va_idx] = model.predict_proba(valid_pool)[:,1]
    test_prob  += model.predict_proba(Pool(X_test, cat_features=cat_features_idx))[:,1] / skf_full.n_splits

oof_f1, best_th_full = eval_oof_f1(oof, y_train)
print("OOF F1:", round(oof_f1, 6), "| th(full):", round(best_th_full, 4))


OOF F1: 0.634523 | th(full): 0.36


In [32]:
# ==== 9b: CatBoost seed-bagging ====
SEED_BAG = [42, 2025, 777]

params_cb = dict(best_params)
params_cb.update({
    "iterations": 10000,
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "verbose": False,
    "thread_count": -1,
    "use_best_model": True,
    "allow_writing_files": False,
})

oof_cb = np.zeros(len(X_train), dtype=float)
test_cb = np.zeros(len(X_test), dtype=float)

for fold, (tr_pool, va_pool, va_idx) in enumerate(pools_full, 1):
    fold_prob = np.zeros(len(va_idx))
    fold_test = np.zeros(len(X_test))
    for sd in SEED_BAG:
        p = dict(params_cb); p["random_seed"] = sd
        m = CatBoostClassifier(**p)
        m.fit(tr_pool, eval_set=va_pool, early_stopping_rounds=EARLY_STOP_FULL)
        fold_prob += m.predict_proba(va_pool)[:,1] / len(SEED_BAG)
        fold_test += m.predict_proba(Pool(X_test, cat_features=cat_features_idx))[:,1] / len(SEED_BAG)
    oof_cb[va_idx] = fold_prob
    test_cb += fold_test / skf_full.n_splits

f1_cb, th_cb = eval_oof_f1(oof_cb, y_train)
print("CB-bag  OOF F1:", round(f1_cb,6), "| th:", round(th_cb,4))


CB-bag  OOF F1: 0.642419 | th: 0.34


In [25]:
# ==== 9c: LightGBM baseline (5-fold) ====
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# バージョン確認（任意）
print("LightGBM version:", lgb.__version__)

# LGBMは pandas の category dtype を自動認識
X_train_lgb = X_train.copy()
X_test_lgb  = X_test.copy()
for c in cat_cols:
    X_train_lgb[c] = X_train_lgb[c].astype("category")
    X_test_lgb[c]  = X_test_lgb[c].astype("category")

params_lgb = {
    "objective": "binary",
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_child_samples": 50,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 5.0,
    "n_estimators": 10000,        # 早停前提で大きめ
    "random_state": SEED,
    "n_jobs": -1,
    "verbose": -1,
    "scale_pos_weight": 1.2,
}

oof_lgb = np.zeros(len(X_train), dtype=float)
test_lgb = np.zeros(len(X_test), dtype=float)

for fold, (tr_idx, va_idx) in enumerate(skf_full.split(X_train_lgb, y_train), 1):
    X_tr, X_va = X_train_lgb.iloc[tr_idx], X_train_lgb.iloc[va_idx]
    y_tr, y_va = y_train[tr_idx], y_train[va_idx]

    m = LGBMClassifier(**params_lgb)
    m.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="binary_logloss",
        callbacks=[
            early_stopping(stopping_rounds=200, verbose=False),
            log_evaluation(period=0),
        ],
    )

    oof_lgb[va_idx] = m.predict_proba(X_va)[:, 1]
    test_lgb += m.predict_proba(X_test_lgb)[:, 1] / skf_full.n_splits

f1_lgb, th_lgb = eval_oof_f1(oof_lgb, y_train)
print("LGBM     OOF F1:", round(f1_lgb, 6), "| th:", round(th_lgb, 4))


LightGBM version: 4.6.0
LGBM     OOF F1: 0.628571 | th: 0.32


In [26]:
# ==== 9d: Soft ensemble of CB-bag & LGBM ====
weights = np.linspace(0.0, 1.0, 21)  # 0,0.05, …,1.0
best = (-1, None, None)  # (f1, w, th)

for w in weights:
    oof_ens = w*oof_cb + (1-w)*oof_lgb
    f1, th = eval_oof_f1(oof_ens, y_train)
    if f1 > best[0]:
        best = (f1, w, th)

best_f1, best_w, best_th = best
print(f"Ensemble OOF F1: {best_f1:.6f} | w(CB)={best_w:.2f} | th={best_th:.4f}")

# テスト側の確率も同じ重みで合成
test_ens = best_w*test_cb + (1-best_w)*test_lgb

# 以降の提出セル（11）で使われる変数名に載せ替え
oof = w*oof_cb + (1-best_w)*oof_lgb   # タイポ回避で再計算
oof = best_w*oof_cb + (1-best_w)*oof_lgb
test_prob = test_ens
best_th_full = best_th


Ensemble OOF F1: 0.644037 | w(CB)=0.45 | th=0.3150


In [33]:
# ==== 9d': Soft ensemble (finer grid) ====
weights = np.linspace(0.0, 1.0, 101)  # 0.00〜1.00 を0.01刻み
best = (-1, None, None)
for w in weights:
    oof_ens = w*oof_cb + (1-w)*oof_lgb
    f1, th = eval_oof_f1(oof_ens, y_train)
    if f1 > best[0]:
        best = (f1, w, th)
best_f1, best_w, best_th = best
print(f"Ensemble OOF F1: {best_f1:.6f} | w(CB)={best_w:.2f} | th={best_th:.4f}")

test_ens = best_w*test_cb + (1-best_w)*test_lgb
oof = best_w*oof_cb + (1-best_w)*oof_lgb
test_prob = test_ens
best_th_full = best_th


Ensemble OOF F1: 0.644332 | w(CB)=0.46 | th=0.3150


In [42]:
# STEP2-2: 提出しきい値を固定
SUBMIT_THRESHOLD_OVERRIDE = 0.315
print("SUBMIT_THRESHOLD_OVERRIDE =", SUBMIT_THRESHOLD_OVERRIDE)


SUBMIT_THRESHOLD_OVERRIDE = 0.315


In [20]:
TARGET_F1 = 0.64
print("TARGET:", TARGET_F1, " | CURRENT:", round(oof_f1, 6))


TARGET: 0.64  | CURRENT: 0.634523


In [44]:
# ==== セル11（常にfold再構築＋アンサンブル情報を記録） ====
import os, json, numpy as np, pandas as pd
from sklearn.metrics import f1_score, confusion_matrix, classification_report

assert 'oof' in locals() and 'test_prob' in locals(), "先にセル9/9dまで実行してoof/test_probを作ってから実行"

# 提出閾値の決定（優先: override → best_th_full → best_th → 0.5）
threshold_for_submit = locals().get("SUBMIT_THRESHOLD_OVERRIDE", None)
if threshold_for_submit is None:
    threshold_for_submit = locals().get("best_th_full", None)
if threshold_for_submit is None:
    threshold_for_submit = locals().get("best_th", 0.5)

threshold_source = (
    "override" if locals().get("SUBMIT_THRESHOLD_OVERRIDE", None) is not None
    else "best_th_full" if locals().get("best_th_full", None) is not None
    else "best_th" if locals().get("best_th", None) is not None
    else "default_0.5"
)


# ---- foldごとの指標を毎回作り直す ----
fold_reports = []
fold_f1s = []
if 'pools_full' in locals():
    for fold, (_tr_pool, _va_pool, va_idx) in enumerate(pools_full, 1):
        y_va = y_train[va_idx]
        y_pred_va = (oof[va_idx] >= threshold_for_submit).astype(int)
        f1v = f1_score(y_va, y_pred_va)
        cm  = confusion_matrix(y_va, y_pred_va)
        rep = classification_report(y_va, y_pred_va, digits=4)
        fold_f1s.append(f1v)
        fold_reports.append((f"FOLD {fold}", f1v, cm, rep))
else:
    # 予備（fold境界がないとき）
    y_pred = (oof >= threshold_for_submit).astype(int)
    f1v = f1_score(y_train, y_pred)
    cm  = confusion_matrix(y_train, y_pred)
    rep = classification_report(y_train, y_pred, digits=4)
    fold_f1s = [f1v]
    fold_reports = [("GLOBAL", f1v, cm, rep)]

# ---- 提出予測 ----
test_pred = (test_prob >= threshold_for_submit).astype(int)
assert len(test_pred) == len(test)
assert set(np.unique(test_pred)).issubset({0,1})

# 自動ナンバリング
OUT_DIR  = r"C:\Users\koshihiramatsu\projects\MUFJ_competition_2025\model-proposal_A_v2"
os.makedirs(OUT_DIR, exist_ok=True)
n = next_version_number(OUT_DIR)
sub_name = f"submission_A_v{n}.csv"
log_name = f"run_A2_v{n}.txt"

# 出力（sample_submitの区切りに合わせる）
sep = locals().get("SUBMIT_SEP", ",")
submit_df = pd.DataFrame({ID_COL: test[ID_COL].values, "pred": test_pred})
if sep == r"\s+":
    with open(os.path.join(OUT_DIR, sub_name), "w", encoding="utf-8") as f:
        for i, p in submit_df[[ID_COL, "pred"]].itertuples(index=False):
            f.write(f"{i} {p}\n")
else:
    submit_df.to_csv(os.path.join(OUT_DIR, sub_name), header=False, index=False, sep=sep)

print("Saved:", os.path.join(OUT_DIR, sub_name))

# ---- ログ ----
def safe(x): 
    return float(x) if isinstance(x, (np.floating, np.float64, np.float32)) else x

oof_f1_global_best, _ = eval_oof_f1(oof, y_train)
oof_f1_at_submit = f1_score(y_train, (oof >= threshold_for_submit).astype(int))

log_lines = [
    f"version: {n}",
    f"seed: {SEED}",
    f"n_splits: {skf_full.n_splits if 'skf_full' in locals() else N_SPLITS}",
    f"target_col: {TARGET_COL}",
    f"id_col: {ID_COL}",
    f"n_features: {len(features)}",
    f"n_categoricals: {len(cat_cols)}",
    f"train_shape: {train.shape}",
    f"test_shape: {test.shape}",
    f"target_pos_ratio: {train[TARGET_COL].mean():.6f}",
    "",
    f"best_oof_f1_from_study: {locals().get('best_score', float('nan')):.6f}",
    f"oof_f1_global_best: {oof_f1_global_best:.6f}",
    f"oof_f1_at_submit_th: {oof_f1_at_submit:.6f}",
    f"threshold_source: {threshold_source}",
    f"submit_threshold: {float(threshold_for_submit):.6f}",
    f"fold_f1s: {[round(safe(x), 6) for x in fold_f1s]}",
    "",
    # アンサンブル情報（存在する場合のみ）
    f"oof_f1_cb: {locals().get('f1_cb', float('nan')):.6f}",
    f"oof_f1_lgb: {locals().get('f1_lgb', float('nan')):.6f}",
    f"ensemble_w_cb: {locals().get('best_w', float('nan'))}",
    "",
    "best_params_cb:",
    json.dumps(locals().get('best_params', {}), indent=2),
    "params_lgb:",
    json.dumps(locals().get('params_lgb', {}), indent=2),
    "",
]
for title, f1v, cm, rep in fold_reports:
    log_lines += [title, f"F1@submit_th: {f1v:.6f}", "confusion_matrix:", str(cm), "report:", rep, "-"*40]

with open(os.path.join(OUT_DIR, log_name), "w", encoding="utf-8") as f:
    f.write("\n".join([str(x) for x in log_lines]))

print("Saved:", os.path.join(OUT_DIR, log_name))


Saved: C:\Users\koshihiramatsu\projects\MUFJ_competition_2025\model-proposal_A_v2\submission_A_v10.csv
Saved: C:\Users\koshihiramatsu\projects\MUFJ_competition_2025\model-proposal_A_v2\run_A2_v10.txt
