# Ensemble
---

In [7]:
# =============================
# import libraries
# =============================
import os, re, gc, copy, pickle, yaml
from pathlib import Path
import sys
# import warnings
# warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
pd.set_option("display.max_columns", 3000)
pd.set_option("display.max_rows", 3000)


# original
sys.path.append(r"../src")
import utils
from utils.data import *

In [8]:
# =============================
# ensemble utils
# =============================
def find_infer_dir(root_infer_dir: str, infer_name: str = "300_infer") -> str:
    """Pick the inference directory to read from under an experiment.

    Args:
        root_infer_dir: Path of the form ``.../experiment/xxx/inference``.
        infer_name: Name of the preferred entry directly under
            ``root_infer_dir``.

    Returns:
        ``root_infer_dir/infer_name`` as a string when that path exists;
        otherwise the first sub-directory of ``root_infer_dir`` in sorted
        order.

    Raises:
        FileNotFoundError: If the preferred path does not exist and
            ``root_infer_dir`` has no sub-directories.
    """
    root = Path(root_infer_dir)

    preferred = root / infer_name
    if preferred.exists():
        return str(preferred)

    # Fallback: the first directory (sorted by path) directly under the root.
    subdirs = sorted(child for child in root.glob("*") if child.is_dir())
    if subdirs:
        return str(subdirs[0])
    raise FileNotFoundError(f"no inference dir found under: {root_infer_dir}")


def load_sims_and_threshold(infer_dir: str):
    """Load the artifacts written by one inference run.

    ``sims_mean.npy`` is required; ``threshold_table_ens.csv`` and
    ``submission.csv`` are read only when they exist.

    Args:
        infer_dir: Directory holding the inference outputs.

    Returns:
        A 4-tuple ``(sims, thr_sim_table, thr_m_table, sub)``:

        * ``sims``: the array stored in ``sims_mean.npy``
          (expected to be (N, C) per the original author's note).
        * ``thr_sim_table`` / ``thr_m_table``: the ``thr_sim`` and
          ``thr_margin`` columns of the threshold CSV as float32 arrays,
          ordered by ``class_id``; both ``None`` when the CSV is absent.
        * ``sub``: ``submission.csv`` as a DataFrame, or ``None`` when absent.

    Raises:
        FileNotFoundError: If ``sims_mean.npy`` is missing.
    """
    base = Path(infer_dir)

    sims_file = base / "sims_mean.npy"
    if not sims_file.exists():
        raise FileNotFoundError(f"sims_mean.npy not found: {sims_file}")
    sims = np.load(sims_file)

    # Optional per-class thresholds (columns: class_id, thr_sim, thr_margin).
    thr_sim_table = thr_m_table = None
    thr_file = base / "threshold_table_ens.csv"
    if thr_file.exists():
        by_class = pd.read_csv(thr_file).sort_values("class_id")
        thr_sim_table = by_class["thr_sim"].to_numpy(np.float32)
        thr_m_table = by_class["thr_margin"].to_numpy(np.float32)

    # Optional submission produced by the same run.
    sub = None
    sub_file = base / "submission.csv"
    if sub_file.exists():
        sub = pd.read_csv(sub_file)

    return sims, thr_sim_table, thr_m_table, sub


def top2_from_sims(sims: np.ndarray):
    """Return the best and second-best similarity of every row.

    Args:
        sims: 2-D array with at least two columns; each row is searched
            for its two largest values.

    Returns:
        A 4-tuple ``(pred, top1, top2, margin)`` of 1-D arrays, one entry
        per row of ``sims``:

        * ``pred``: int32 column index of the largest value.
        * ``top1``: float32 largest value.
        * ``top2``: float32 second-largest value.
        * ``margin``: float32 ``top1 - top2``.
    """
    n_rows, _ = sims.shape
    row = np.arange(n_rows)[:, None]

    # Partial selection: the two largest entries of each row land in the
    # first two columns, but not necessarily in descending order.
    cand_idx = np.argpartition(-sims, kth=1, axis=1)[:, :2]
    cand_val = np.take_along_axis(sims, cand_idx, axis=1)

    # Put the larger of the two candidates in column 0.
    rank = np.argsort(-cand_val, axis=1)
    ranked_val = cand_val[row, rank]
    ranked_idx = cand_idx[row, rank]

    best = ranked_val[:, 0]
    runner_up = ranked_val[:, 1]
    pred = ranked_idx[:, 0].astype(np.int32)
    margin = (best - runner_up).astype(np.float32)
    return pred, best.astype(np.float32), runner_up.astype(np.float32), margin


def apply_unknown_rule(
    sims_ens: np.ndarray,
    thr_sim_table: np.ndarray | None,
    thr_m_table: np.ndarray | None,
    thr_sim_fallback: float = 0.79,
    thr_m_fallback: float = 0.0,
    combine_mode: str = "or",
    margin_gate_add: float = 0.0,
    margin_gate_sim: float | None = None,
):
    """Turn ensemble similarities into predictions, marking some as unknown (-1).

    A sample is flagged by the similarity rule when its top similarity is
    below the similarity threshold, and by the margin rule when it passes
    the margin gate and its top1-top2 margin is below the margin threshold.
    Thresholds are looked up per sample using the raw predicted class.

    Args:
        sims_ens: 2-D similarity array; columns are classes.
        thr_sim_table: Per-class similarity thresholds (one entry per
            column of ``sims_ens``), or ``None`` to use ``thr_sim_fallback``
            for every sample.
        thr_m_table: Per-class margin thresholds (one entry per column of
            ``sims_ens``), or ``None`` to use ``thr_m_fallback``.
        thr_sim_fallback: Scalar similarity threshold used without a table.
        thr_m_fallback: Scalar margin threshold used without a table.
        combine_mode: ``"and"`` requires both rules to fire; any other
            value (including the default ``"or"``) flags a sample when
            either rule fires.
        margin_gate_add: When positive (and ``margin_gate_sim`` is None),
            the margin rule only applies where
            ``top1 < thr_sim_used + margin_gate_add``.
        margin_gate_sim: When given, the margin rule only applies where
            ``top1 < margin_gate_sim``; takes precedence over
            ``margin_gate_add``. With neither gate set the margin rule
            applies to every sample.

    Returns:
        ``(pred_out, dbg)``: ``pred_out`` is an int32 array holding the
        predicted class or -1 for unknown; ``dbg`` is a DataFrame with the
        intermediate quantities, one row per sample.

    Raises:
        ValueError: If a threshold table's length differs from the number
            of classes in ``sims_ens``.
    """
    pred, top1, top2, margin = top2_from_sims(sims_ens)
    n_classes = sims_ens.shape[1]

    def _per_sample_threshold(table, fallback, label):
        """Expand a per-class table (or scalar fallback) to one value per sample."""
        if table is None:
            return np.full_like(top1, fallback, dtype=np.float32)
        table = np.asarray(table)
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if len(table) != n_classes:
            raise ValueError(
                f"{label} has {len(table)} entries but sims_ens has "
                f"{n_classes} classes"
            )
        return table[pred]

    thr_sim_used = _per_sample_threshold(thr_sim_table, thr_sim_fallback, "thr_sim_table")
    thr_m_used = _per_sample_threshold(thr_m_table, thr_m_fallback, "thr_m_table")

    unknown_by_sim = top1 < thr_sim_used

    # Margin gate: restricts which samples the margin rule may flag.
    if margin_gate_sim is not None:
        gate_mask = top1 < float(margin_gate_sim)
    elif margin_gate_add > 0:
        gate_mask = top1 < (thr_sim_used + float(margin_gate_add))
    else:
        gate_mask = np.ones_like(top1, dtype=bool)

    unknown_by_margin = gate_mask & (margin < thr_m_used)

    if combine_mode == "and":
        unknown = unknown_by_sim & unknown_by_margin
    else:
        unknown = unknown_by_sim | unknown_by_margin

    pred_out = np.where(unknown, -1, pred).astype(np.int32)

    dbg = pd.DataFrame({
        "pred_raw": pred,
        "max_sim": top1,
        "second_sim": top2,
        "margin": margin,
        "thr_sim_used": thr_sim_used,
        "thr_margin_used": thr_m_used,
        "gate_mask": gate_mask.astype(np.int8),
        "unknown_by_sim": unknown_by_sim.astype(np.int8),
        "unknown_by_margin": unknown_by_margin.astype(np.int8),
        "unknown": unknown.astype(np.int8),
        "pred": pred_out,
    })
    return pred_out, dbg

In [None]:
class CFG:
    # Debug switch; when True the experiment name below becomes the debug one.
    DEBUG = False
    # Experiment name; PATHS.OUTPUT_DIR is derived from it.
    EXP = "900_Ensemble_DEBUG" if DEBUG else "900_Ensemble_EXP000"
    # SEED and n_trials are not referenced in the visible part of this notebook.
    SEED = 1129
    n_trials = 10

class PATHS:
    # Filesystem locations used by this notebook.
    # Root of the competition data.
    DATA_DIR = "/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/data"

    # `inference` directories of the experiments available for blending.
    # NOTE(review): the trailing numbers are presumably each experiment's
    # leaderboard score — confirm with the author.
    INPUT_DIR0 = "/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp006_ens/inference" # 0.8892
    INPUT_DIR1 = "/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp008_ens/inference" # 0.8795
    INPUT_DIR2 = "/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp011_ens/inference" # 0.8572
    INPUT_DIR3 = "/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp012_ens/inference" # 0.8669
    # INPUT_DIR4 = "../output/200_multimodalNN_exp010" # 0.2035

    # Output directory of this run; its last component is CFG.EXP.
    OUTPUT_DIR = f"../output/{CFG.EXP}"


# Experiments fed into the ensemble (INPUT_DIR4 is commented out in PATHS,
# so it is not listed here).
use_paths = [
    PATHS.INPUT_DIR0,
    PATHS.INPUT_DIR1,
    PATHS.INPUT_DIR2,
    PATHS.INPUT_DIR3,
]
os.makedirs(PATHS.OUTPUT_DIR, exist_ok=True)

In [10]:
# =============================
# main ensemble
# =============================
# Experiments to blend; list only the ones that actually exist on disk.
use_paths = [PATHS.INPUT_DIR0, PATHS.INPUT_DIR1, PATHS.INPUT_DIR2, PATHS.INPUT_DIR3]

# Per-experiment blend weights, in the same order as use_paths. Equal weights
# are fine when LB scores are close; tilt them a little when the scores differ.
# Here exp006_ens is weighted more heavily.
weights = np.array([0.60, 0.20, 0.10, 0.10], dtype=np.float32)
# zip() in the blend loop would silently drop the extras on a length mismatch
# while the normalisation below still counted them, so fail loudly instead.
if len(weights) != len(use_paths):
    raise ValueError(
        f"got {len(weights)} weights for {len(use_paths)} experiments"
    )
weights = weights / weights.sum()

infer_dirs = [find_infer_dir(p, infer_name="300_infer") for p in use_paths]
print("infer_dirs:\n", "\n".join(infer_dirs))

sims_list = []
thr_sim_tables = []
thr_m_tables = []

for d in infer_dirs:
    # The per-experiment submission is not needed for blending.
    sims, thr_sim_table, thr_m_table, _ = load_sims_and_threshold(d)
    sims_list.append(sims.astype(np.float32))
    thr_sim_tables.append(thr_sim_table)
    thr_m_tables.append(thr_m_table)
    print(f"[loaded] {d} sims={sims.shape} thr_table={'yes' if thr_sim_table is not None else 'no'}")

# Every experiment must score the same samples against the same classes.
N, C = sims_list[0].shape
if any(x.shape != (N, C) for x in sims_list):
    raise ValueError("sims shape mismatch between experiments")

# Weighted mean of the similarity matrices.
sims_ens = np.zeros((N, C), dtype=np.float32)
for w, s in zip(weights, sims_list):
    sims_ens += w * s

# Threshold tables: the per-class median across experiments is used as a safe
# choice that damps extreme values. If any experiment lacks a table, the
# scalar fallbacks passed to apply_unknown_rule are used instead.
thr_sim_table_ens = None
thr_m_table_ens = None
if all(t is not None for t in thr_sim_tables):
    thr_sim_table_ens = np.median(np.stack(thr_sim_tables, axis=0), axis=0).astype(np.float32)
if all(t is not None for t in thr_m_tables):
    thr_m_table_ens = np.median(np.stack(thr_m_tables, axis=0), axis=0).astype(np.float32)

# Unknown rule, using the author's recent settings.
pred_out, dbg = apply_unknown_rule(
    sims_ens,
    thr_sim_table=thr_sim_table_ens,
    thr_m_table=thr_m_table_ens,
    thr_sim_fallback=0.79,   # safety net when no table is available
    thr_m_fallback=0.0,
    combine_mode="or",
    margin_gate_add=0.10,    # the usual value
    margin_gate_sim=None,
)

# Save the submission, the debug table and the blended similarities.
outdir = Path(PATHS.OUTPUT_DIR)
outdir.mkdir(parents=True, exist_ok=True)

sub = pd.DataFrame({"label_id": pred_out})
sub.to_csv(outdir / "submission.csv", index=False)
dbg.to_csv(outdir / "test_pred_debug.csv", index=False)
np.save(outdir / "sims_ens.npy", sims_ens)

print("saved:", outdir / "submission.csv")
print("pred distribution:\n", pd.Series(pred_out).value_counts().sort_index())
print("unknown rate:", float((pred_out == -1).mean()))

infer_dirs:
 /mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp006_ens/inference/300_infer
/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp008_ens/inference/300_infer
/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp011_ens/inference/300_infer
/mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp012_ens/inference/300_infer
[loaded] /mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp006_ens/inference/300_infer sims=(9223, 11) thr_table=yes
[loaded] /mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp008_ens/inference/300_infer sims=(9223, 11) thr_table=yes
[loaded] /mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/experiment/100_train_arcface_exp011_ens/inference/300_infer sims=(9223, 11) thr_table=yes
[loaded] /mnt/nfs/home/hidebu/study/atmaCup-22-CA-x-atmaCup-3rd/e