# L01: 競馬予想モデル学習

LightGBMを使用した競馬予想モデルの構築と評価

**実行環境**: ローカル（M4 MacBook Air）

---
## 1. 環境設定

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Project root, given relative to the current working directory; its src/
# folder is prepended to sys.path so project modules can be imported.
PROJECT_ROOT = Path('../../')
sys.path.insert(0, str(PROJECT_ROOT.joinpath('src')))

# Input parquet files and the output directory for trained models.
RACES_PARQUET, RESULTS_PARQUET = (
    PROJECT_ROOT / 'data' / 'processed' / parquet_name
    for parquet_name in ('races.parquet', 'results.parquet')
)
MODEL_DIR = PROJECT_ROOT.joinpath('models')
# Only the leaf directory is created; PROJECT_ROOT itself must already exist.
MODEL_DIR.mkdir(exist_ok=True)

print(f"データパス: {RACES_PARQUET}")

In [None]:
# Load the processed race table and the results table (one row per runner,
# judging by the count label printed below).
races_df = pd.read_parquet(RACES_PARQUET)
results_df = pd.read_parquet(RESULTS_PARQUET)

# Row counts as a quick sanity check on the load.
print(f"レース数: {races_df.shape[0]:,}")
print(f"出走馬データ数: {results_df.shape[0]:,}")

---
## 2. データ分割

- **Train**: 2010〜2024年
- **Test**: 2025年〜

In [None]:
# Parse race_date as datetime and derive the calendar year used for the split.
results_df['race_date'] = pd.to_datetime(results_df['race_date'])
results_df['year'] = results_df['race_date'].dt.year

# Chronological split: years up to and including TRAIN_END_YEAR are training
# data, every later year is held out for testing.
TRAIN_END_YEAR = 2024

train_df = results_df.loc[results_df['year'].le(TRAIN_END_YEAR)].copy()
test_df = results_df.loc[results_df['year'].gt(TRAIN_END_YEAR)].copy()

print(f"Train: {train_df.shape[0]:,} ({train_df['year'].min()}〜{train_df['year'].max()})")
print(f"Test: {test_df.shape[0]:,} ({test_df['year'].min()}〜{test_df['year'].max()})")

---
## 3. 特徴量エンジニアリング

In [None]:
def calculate_historical_features(df, history_df):
    """
    Attach aggregate past-performance features for horse, jockey and trainer.

    Statistics are computed over *every* row of ``history_df`` (there is no
    per-row date cutoff), then left-joined onto ``df``. ``history_df`` must
    therefore contain only races that precede the rows of ``df``; if it
    overlaps ``df``, each row's own result leaks into its features.

    Args:
        df: Rows to enrich. Needs ``horse_id``, ``jockey_id``, ``trainer_id``.
        history_df: Past results. Needs the three id columns above plus
            ``finish_position`` and ``race_id``.

    Returns:
        A new DataFrame with the rows of ``df`` (left-merge order) plus:
        ``horse_race_count`` (non-null finish positions), ``horse_avg_finish``,
        ``horse_total_races`` (non-null race ids), ``horse_win_rate``,
        ``horse_place_rate``, and ``{jockey,trainer}_{race_count,win_rate,
        place_rate}``. Ids absent from ``history_df`` get NaN features.
    """
    # Per-horse counts and mean finish position. Missing finish positions are
    # excluded from count/mean but still count in the rate denominators below.
    horse_stats = (
        history_df.groupby('horse_id')
        .agg(
            horse_race_count=('finish_position', 'count'),
            horse_avg_finish=('finish_position', 'mean'),
            horse_total_races=('race_id', 'count'),
        )
        .reset_index()
    )
    horse_stats = horse_stats.merge(
        _rate_stats(history_df, 'horse_id', 'horse', include_count=False),
        on='horse_id',
        how='left',
    )

    jockey_stats = _rate_stats(history_df, 'jockey_id', 'jockey', include_count=True)
    trainer_stats = _rate_stats(history_df, 'trainer_id', 'trainer', include_count=True)

    # Left joins keep every row of df even when an id has no history.
    result = df.merge(horse_stats, on='horse_id', how='left')
    result = result.merge(jockey_stats, on='jockey_id', how='left')
    result = result.merge(trainer_stats, on='trainer_id', how='left')

    return result


def _rate_stats(history_df, key, prefix, include_count):
    """Return per-``key`` win rate (1st) and place rate (top 3), optionally with a row count."""
    finish = history_df['finish_position']
    win_col = f'{prefix}_win_rate'
    place_col = f'{prefix}_place_rate'

    # A missing finish position compares as "not a win / not placed" but the
    # row still counts in the denominator, so the rate is the mean of a 0/1 flag.
    flags = pd.DataFrame({
        key: history_df[key].to_numpy(),
        win_col: finish.eq(1).fillna(False).astype('float64').to_numpy(),
        place_col: finish.le(3).fillna(False).astype('float64').to_numpy(),
    })
    grouped = flags.groupby(key)
    stats = grouped.mean()
    if include_count:
        # Kept as float64 so the column dtype does not depend on whether the
        # later left merge introduces NaN.
        stats.insert(0, f'{prefix}_race_count', grouped.size().astype('float64'))
    return stats.reset_index()


print("特徴量計算関数を定義")

In [None]:
# Historical features must only see races that precede the row they are
# attached to. Aggregating the whole training period and joining it back onto
# the training rows would feed each row's own finish position into its
# win/place-rate features (target leakage), so training features are built
# season by season from strictly earlier seasons.
# NOTE: the cutoff is at year granularity; results from earlier in the same
# season are not used. A per-race cutoff would be more precise.
print("特徴量を計算中...")

# Everything up to TRAIN_END_YEAR is legitimate history for the test period.
all_history = results_df[results_df['year'] <= TRAIN_END_YEAR].copy()

train_parts = []
for season, season_df in train_df.groupby('year'):
    prior_history = all_history[all_history['year'] < season]
    if prior_history.empty:
        # The earliest season has no history: keep its rows as they are. The
        # feature columns become NaN for them once the parts are concatenated.
        train_parts.append(season_df)
        continue
    train_parts.append(calculate_historical_features(season_df, prior_history))
train_features = pd.concat(train_parts, ignore_index=True)

# Test rows are all later than TRAIN_END_YEAR, so the full history is safe.
test_features = calculate_historical_features(test_df, all_history)

print(f"Train features: {train_features.shape}")
print(f"Test features: {test_features.shape}")

In [None]:
# Derived columns
def add_features(df):
    """
    Return a copy of ``df`` with the encoded surface and the binary target.

    Adds:
        surface_encoded: 0 for '芝', 1 for 'ダート', -1 for any other or
            missing surface value.
        target: 1 when ``finish_position`` is 3 or better, otherwise 0. A
            missing finish position compares False and therefore yields 0.
    """
    surface_codes = {'芝': 0, 'ダート': 1}
    return df.assign(
        surface_encoded=df['surface'].map(surface_codes).fillna(-1),
        target=(df['finish_position'] <= 3).astype(int),
    )

# Apply the same derived columns to both splits.
train_features, test_features = map(add_features, (train_features, test_features))

print("特徴量追加完了")

---
## 4. モデル学習（LightGBM）

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# Model input columns, grouped by where they come from.
FEATURE_COLS = (
    # Race information
    [
        'distance', 'surface_encoded', 'level_score',
        'horse_number', 'gate_number', 'impost',
        'popularity', 'odds',
    ]
    # Horse past performance
    + [
        'horse_race_count', 'horse_avg_finish',
        'horse_win_rate', 'horse_place_rate',
    ]
    # Jockey record
    + ['jockey_race_count', 'jockey_win_rate', 'jockey_place_rate']
    # Trainer record
    + ['trainer_race_count', 'trainer_win_rate', 'trainer_place_rate']
)

# Drop every row with a missing value in the target or in any feature.
train_clean = train_features.dropna(subset=['target', *FEATURE_COLS])
test_clean = test_features.dropna(subset=['target', *FEATURE_COLS])

print(f"Train (clean): {train_clean.shape[0]:,}")
print(f"Test (clean): {test_clean.shape[0]:,}")

# Feature matrices and label vectors for both splits.
X_train, y_train = train_clean[FEATURE_COLS], train_clean['target']
X_test, y_test = test_clean[FEATURE_COLS], test_clean['target']

In [None]:
# LightGBM training: binary classifier evaluated with AUC.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
}

# Chronological validation split: hold out the most recent training year.
# A random row-wise split would place runners of the same race (and races
# later than the validation rows) on both sides of the boundary, which makes
# the validation AUC optimistic and early stopping unreliable.
valid_year = train_clean['year'].max()
is_valid = train_clean['year'] == valid_year
if is_valid.all():
    raise ValueError(
        "Need at least two distinct years in the training data to hold out "
        "the latest one for validation"
    )

# X_train / y_train are column selections of train_clean, so they share its
# index and can be filtered with the same boolean mask.
X_tr, y_tr = X_train[~is_valid], y_train[~is_valid]
X_val, y_val = X_train[is_valid], y_train[is_valid]

train_data = lgb.Dataset(X_tr, label=y_tr)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

print("モデル学習開始...")
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    callbacks=[
        # Stop once the validation metric has not improved for 50 rounds.
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

print(f"\n学習完了! Best iteration: {model.best_iteration}")

---
## 5. モデル評価

In [None]:
# Score the held-out test set; with the binary objective the predictions are
# probabilities of the positive class.
y_pred_proba = model.predict(X_test)

# Threshold-independent ranking quality.
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"Test AUC: {auc:.4f}")

# Hard labels at a 0.5 cut-off, then plain accuracy.
y_pred = (y_pred_proba >= 0.5).astype(int)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Feature importance
import matplotlib.pyplot as plt

# Pair each feature name with the model's importance score (default
# importance type), highest first.
importance = (
    pd.DataFrame({'feature': FEATURE_COLS})
    .assign(importance=model.feature_importance())
    .sort_values(by='importance', ascending=False)
)

print("=== 特徴量重要度 ===")
print(importance.to_string())

# Horizontal bar chart; the y axis is inverted so the first (largest) row
# is drawn at the top.
plt.figure(figsize=(10, 6))
plt.barh(importance['feature'], importance['importance'])
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

---
## 6. レースごとの予測精度

In [None]:
# Per-race hit rate of the top-3 predictions: start from a copy of the cleaned
# test rows with the predicted probability attached.
test_clean_with_pred = test_clean.assign(pred_proba=y_pred_proba)

def calculate_hit_rate(group, top_n=3):
    """
    Tell whether any of the ``top_n`` highest-probability runners placed.

    Despite the name, the value for a single race is a boolean: True when at
    least one of the ``top_n`` rows with the largest ``pred_proba`` has a
    ``finish_position`` of 3 or better. Averaging it over races gives the
    hit rate.
    """
    picks = group.nlargest(top_n, 'pred_proba')
    return picks['finish_position'].le(3).any()

# One boolean per race, then the share of races with at least one hit.
race_hit = test_clean_with_pred.groupby('race_id').apply(calculate_hit_rate)
hit_rate = race_hit.mean()

print("=== レースごとの的中率 ===")
print(f"上位3頭予測の3着内的中率: {hit_rate:.1%}")
print(f"対象レース数: {race_hit.size:,}")

In [None]:
# Return-rate simulation (a single win bet per race)
def simulate_returns(group):
    """
    Payout of a 100-yen win bet on the runner with the highest ``pred_proba``.

    Returns ``odds * 100`` when that runner finished first, otherwise 0.
    """
    best = group.nlargest(1, 'pred_proba').iloc[0]
    won = best['finish_position'] == 1
    return best['odds'] * 100 if won else 0

# Payout per race, with a fixed 100-yen stake on every race.
returns = test_clean_with_pred.groupby('race_id').apply(simulate_returns)
total_bet = returns.size * 100
total_return = returns.sum()
# Return rate as a percentage of the total stake.
return_rate = total_return / total_bet * 100

print("=== 単勝回収率シミュレーション ===")
print(f"総賭け金: ¥{total_bet:,.0f}")
print(f"総払戻金: ¥{total_return:,.0f}")
print(f"回収率: {return_rate:.1f}%")

---
## 7. モデル保存

In [None]:
import pickle

# Serialise the trained model with pickle into the models directory.
model_path = MODEL_DIR.joinpath('lightgbm_model.pkl')
model_path.write_bytes(pickle.dumps(model))

print(f"モデル保存完了: {model_path}")

In [None]:
# Also persist the feature list and the training cutoff year used for this
# model, next to the model file.
config = dict(
    feature_cols=FEATURE_COLS,
    train_end_year=TRAIN_END_YEAR,
)

config_path = MODEL_DIR.joinpath('model_config.pkl')
config_path.write_bytes(pickle.dumps(config))

print(f"設定保存完了: {config_path}")