In [5]:
import pandas as pd

# Load the raw training CSV into the working frame `df` used by the cells below.
TRAIN_CSV_PATH = "/work/a06/tsuda/2025_master_Tsuda/master_research/keiba/train_raw.csv"
df = pd.read_csv(TRAIN_CSV_PATH)


# 着順が1位なら1、それ以外なら0に設定（目的変数の設定）

In [6]:
import pandas as pd

# Convert the finishing position to a number; entries that are not numeric
# become NaN (the original note attributes these to runners that were
# stopped / excluded — not verifiable from this cell).
df["着 順_num"] = pd.to_numeric(df["着 順"], errors="coerce")

# Binary target: 1 when the horse finished 1st, 0 otherwise.
# NaN positions compare unequal to 1, so they become 0 without extra handling.
# NOTE: the column is called "y_place" because later cells read that name,
# but it encodes a WIN (1st place only), not a top-3 finish.
df["y_place"] = df["着 順_num"].eq(1).astype(int)


In [7]:
# ===== Step 3: features (only what is known before the race) =====

# Fixed, minimal feature set; the last two columns are handled as categoricals.
features = ["枠 番", "馬 番", "斤量", "馬体重", "人 気", "jockey_id", "調教師"]
cat_features = ["jockey_id", "調教師"]

# Stop right here if the frame lacks any column this cell depends on.
missing = [
    col
    for col in ["race_id", *features, "y_place"]
    if col not in df.columns
]
if missing:
    raise ValueError(f"CSVに必要な列がありません: {missing}")

# Feature matrix, target vector and the per-row race id used as grouping key.
X = df[features].copy()
y = df["y_place"].copy()
groups = df["race_id"].copy()

# Categorical columns: missing values become "UNKNOWN", then category dtype.
for c in cat_features:
    X[c] = X[c].fillna("UNKNOWN").astype("category")

# Every other column: coerce to numeric, with unparseable / missing values set to -1.
for c in features:
    if c not in cat_features:
        X[c] = pd.to_numeric(X[c], errors="coerce").fillna(-1)

print("X shape:", X.shape)
print("y mean:", y.mean())
print("n_races:", groups.nunique())


X shape: (47122, 7)
y mean: 0.0723229064980264
n_races: 3402


# 保存

In [10]:
# Train one LightGBM booster per split produced by `gkf`, save each booster to
# disk, and write its predictions for the held-out rows into `oof_pred`.
# `lgb`, `gkf`, `params` and a pre-allocated `oof_pred` come from cells that
# are not part of this one.
models = []
saved_paths = []

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups)):
    # `y` is a pandas Series after the feature cell but a NumPy array after the
    # histogram cell has run, so support both ways of positional indexing.
    y_tr = y.iloc[tr_idx] if hasattr(y, "iloc") else y[tr_idx]
    y_va = y.iloc[va_idx] if hasattr(y, "iloc") else y[va_idx]

    tr_data = lgb.Dataset(
        X.iloc[tr_idx],
        y_tr,
        categorical_feature=cat_features
    )
    va_data = lgb.Dataset(
        X.iloc[va_idx],
        y_va,
        categorical_feature=cat_features
    )

    # Up to 2000 rounds; stop once the validation metric has not improved for
    # 100 rounds, logging every 200 rounds.
    model = lgb.train(
        params,
        tr_data,
        valid_sets=[va_data],
        num_boost_round=2000,
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)]
    )

    model_path = f"Tansyo_lgb_place_fold{fold}.txt"
    model.save_model(model_path)
    saved_paths.append(model_path)
    models.append(model)

    # Out-of-fold prediction: each row is scored by the model that did not see it.
    oof_pred[va_idx] = model.predict(X.iloc[va_idx])

# Report the files that were actually written instead of assuming five folds.
if saved_paths:
    print(f"saved fold models: {saved_paths[0]} ... {saved_paths[-1]}")
else:
    print("saved fold models: (none)")


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.21281
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.218741
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.216027
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's binary_logloss: 0.216661
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's binary_logloss: 0.215436
saved fold models: Tansyo_lgb_place_fold0.txt ... Tansyo_lgb_place_fold4.txt


In [None]:
# Save the predictions together with the ground truth (the bare minimum output).

# The prediction column has to be on the frame, otherwise the existence filter
# below would silently drop it and the file would contain no predictions.
# `oof_pred` is filled by position from X = df[features] in the training cell,
# so it lines up with df's rows as long as df has not been reordered since.
if "pred_place" not in df.columns:
    df["pred_place"] = oof_pred

# Column names must match the frame exactly: earlier cells read "着 順", "枠 番",
# "馬 番" and "人 気" WITH a space inside, so the space-less spellings never match.
cols_to_save = [
    "race_id", "馬名", "着 順", "y_place", "pred_place",
    "人 気", "枠 番", "馬 番", "斤量", "馬体重", "jockey_id", "調教師"
]

# Keep only the columns that exist, but report what was left out so a
# misspelled name does not go unnoticed.
skipped = [c for c in cols_to_save if c not in df.columns]
cols_to_save = [c for c in cols_to_save if c in df.columns]
if skipped:
    print("skipped (not in df):", skipped)

out_path = "oof_predictions_Tansyo.csv"
# utf-8-sig writes a BOM at the start of the file.
df[cols_to_save].to_csv(out_path, index=False, encoding="utf-8-sig")

print("saved:", out_path)


# 評価

In [13]:
import matplotlib.pyplot as plt
from matplotlib import font_manager, rcParams

# Candidate Japanese-capable font families, in order of preference.
candidates = [
    "IPAexGothic", "IPAPGothic",
    "Noto Sans CJK JP", "Noto Sans JP",
    "TakaoGothic", "Hiragino Sans",
    "Yu Gothic", "MS Gothic"
]

# Family name -> font file path for every font matplotlib has registered.
available = {f.name: f.fname for f in font_manager.fontManager.ttflist}

# Take the first candidate that is installed; abort when none is.
font_name = next((cand for cand in candidates if cand in available), None)
font_path = None if font_name is None else available[font_name]

if font_path is None:
    raise RuntimeError("日本語フォントが見つかりません")
print("Using font:", font_name)

# Point matplotlib at the chosen font explicitly.
jp_font = font_manager.FontProperties(fname=font_path)
rcParams["font.family"] = jp_font.get_name()
rcParams["axes.unicode_minus"] = False


Using font: Noto Sans CJK JP


In [2]:
import matplotlib.pyplot as plt

# Distribution of the predicted probability, split by actual outcome.
# `p` (predictions) and `y` (labels) are plain NumPy arrays; the calibration
# cell further down reads both names, so they are kept as they are.
p = df["pred_place"].values
y = df["y_place"].values

# Give both histograms the same value range so that their 50 bins share
# identical edges. With per-call automatic ranges the two sets of bars have
# different widths and their counts cannot be compared.
shared_range = (p.min(), p.max())

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(p[y == 1], bins=50, range=shared_range, alpha=0.6, label="1着以内 (y=1)")
ax.hist(p[y == 0], bins=50, range=shared_range, alpha=0.6, label="着外 (y=0)")
ax.set_xlabel("予測確率 P(1着以内)")
ax.set_ylabel("件数")
ax.legend()
ax.set_title("予測確率の分布（来た/来なかった）")
# Optional: a log y-axis makes the much smaller y=1 group easier to see.
# ax.set_yscale("log")
plt.show()


NameError: name 'df' is not defined

In [1]:
import numpy as np

def calibration_curve_manual(y_true, y_prob, n_bins=20):
    """Compute points of a reliability (calibration) curve with equal-width bins.

    The interval [0, 1] is cut into ``n_bins`` equal-width bins. For every bin
    that contains at least one sample, the mean predicted probability and the
    mean of ``y_true`` in that bin are returned. Empty bins are skipped.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes (0/1), same length as ``y_prob``.
    y_prob : array-like
        Predicted probabilities.
    n_bins : int, default 20
        Number of equal-width bins on [0, 1].

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Mean predicted probability per non-empty bin, and the matching mean of
        ``y_true``.
    """
    # Accept lists / Series as well as arrays; boolean-mask indexing needs ndarrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    bins = np.linspace(0, 1, n_bins + 1)
    bin_ids = np.digitize(y_prob, bins) - 1
    # digitize uses half-open bins [a, b), so a probability of exactly 1.0 lands
    # in index n_bins, which the loop below never visits. Fold it into the last
    # bin so those samples are not silently dropped.
    bin_ids = np.where(y_prob == 1.0, n_bins - 1, bin_ids)

    xs, ys = [], []
    for i in range(n_bins):
        mask = bin_ids == i
        if not mask.any():
            continue
        xs.append(y_prob[mask].mean())
        ys.append(y_true[mask].mean())
    return np.array(xs), np.array(ys)

# Reliability diagram: mean predicted probability per bin against the observed
# rate of y == 1 in that bin, with the diagonal as the perfectly calibrated case.
# Uses `y` and `p` as defined by the histogram cell.
xs, ys = calibration_curve_manual(y, p, n_bins=20)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(xs, ys, marker="o", label="モデル")
ax.plot([0, 1], [0, 1], "--", label="理想（完全校正）")
ax.set_xlabel("予測確率")
ax.set_ylabel("実際の1着率")
ax.legend()
ax.set_title("キャリブレーション（確率の正直さ）")
plt.show()


NameError: name 'y' is not defined