# preparation

In [1]:
import pandas as pd
import numpy as np

#モデル
import lightgbm as lgb

#パラメータ探索
import optuna

#クロスバリデーション
from sklearn.model_selection import KFold

#エヴァリュエーション
from sklearn.metrics import precision_score, recall_score, mean_squared_error

#可視化
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#保存
import pickle

In [2]:
#データを読み込む
j_uma_race_df = pd.read_pickle('../datasets/traindata/j_uma_race.pkl')
j_payout_df = pd.read_pickle('../datasets/traindata/j_payout.pkl')
n_payout_df = pd.read_pickle('../datasets/traindata/n_payout.pkl')

In [3]:
#データを結合する
payout_df = pd.concat([j_payout_df, n_payout_df], axis=0).reset_index(drop=True)

### データの削除

In [4]:
#海外のレースを削除
j_uma_race_df['keibajo_code'] = pd.to_numeric(j_uma_race_df['keibajo_code'], errors='coerce')
j_uma_race_df = j_uma_race_df.dropna(subset=['keibajo_code'])
j_uma_race_df['keibajo_code']  = j_uma_race_df['keibajo_code'] .astype(int)

#レースに参加できていない競走馬を削除
j_uma_race_df= j_uma_race_df[j_uma_race_df['kakutei_chakujun'] != 0]

# LightGBM

## グループクエリ用のカラムを作成

In [5]:
# 新しいグループを作成
j_uma_race_df['group'] = j_uma_race_df['kaisai_nen'].astype(str) +"-"+ j_uma_race_df['kaisai_tsukihi'].astype(str) +"-"+  j_uma_race_df['keibajo_code'].astype(str) +"-"+  j_uma_race_df['race_bango'].astype(str)
j_payout_df['group'] = j_payout_df['kaisai_nen'].astype(str) +"-"+ j_payout_df['kaisai_tsukihi'].astype(str) +"-"+  j_payout_df['keibajo_code'].astype(str) +"-"+  j_payout_df['race_bango'].astype(str)

## LightGBM

### テスト用と学習用のデータに分け、モデルを作成

In [6]:
# 2010年から2019年のデータを学習データとして取得
train_data = j_uma_race_df[j_uma_race_df['kaisai_nen'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]

# ランキング学習のために必要な特徴量とターゲットを設定
features = [
            'kaisai_nen',
            'kaisai_tsukihi',
            'keibajo_code',
            # 'kaisai_kai',
            # 'kaisai_nichime',
            'race_bango',
            'wakuban',
            'umaban',
            'ketto_toroku_bango',
            # 'bamei',
            'umakigo_code',
            'seibetsu_code',
            'hinshu_code',
            'moshoku_code',
            'barei',
            'tozai_shozoku_code',
            'chokyoshi_code',
            'banushi_code',
            # 'banushimei',
            'futan_juryo',
            'blinker_shiyo_kubun',
            'kishu_code',
            # 'kishumei_ryakusho',
            'kishu_minarai_code',
            'bataiju',
            'zogen_fugo',
            'zogen_sa',
            'ijo_kubun_code',
            # 'nyusen_juni',
            ## 'kakutei_chakujun',
            # 'dochaku_kubun',
            # 'dochaku_tosu',
            # 'soha_time',
            # 'chakusa_code_1',
            # 'chakusa_code_2',
            # 'chakusa_code_3',
            # 'corner_1',
            # 'corner_2',
            # 'corner_3',
            # 'corner_4',
            # 'tansho_odds',
            # 'tansho_ninkijun',
            # 'kakutoku_honshokin',
            # 'kakutoku_fukashokin',
            # 'kohan_4f',
            # 'kohan_3f',
            # 'aiteuma_joho_1',
            # 'aiteuma_joho_2',
            # 'aiteuma_joho_3',
            # 'time_sa',
            # 'record_koshin_kubun',
            'kyakushitsu_hantei',
            ]


target = 'kakutei_chakujun'

# LightGBM用のデータセットを作成
train_dataset = lgb.Dataset(train_data[features], label=train_data[target])

# LightGBMのパラメータを設定
params = {
    'objective': 'regression',
    'metric': 'rmse', 
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# k=5のFOLDでデータを分割
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 各FOLDでのモデルを保存するためのリスト
models = []

# LightGBMモデルを訓練
for train_index, valid_index in kf.split(train_data):
    train_fold_data = train_data.iloc[train_index]
    valid_fold_data = train_data.iloc[valid_index]
    
    train_dataset = lgb.Dataset(train_fold_data[features], label=train_fold_data[target])
    valid_dataset = lgb.Dataset(valid_fold_data[features], label=valid_fold_data[target])
    
    model = lgb.train(params, train_dataset, num_boost_round=1000, valid_sets=[valid_dataset], early_stopping_rounds=100, verbose_eval=100)
    models.append(model)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1643
[LightGBM] [Info] Number of data points in the train set: 635508, number of used features: 24
[LightGBM] [Info] Start training from score 7.055376
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 3.41611
[200]	valid_0's rmse: 3.37426
[300]	valid_0's rmse: 3.35935
[400]	valid_0's rmse: 3.35126
[500]	valid_0's rmse: 3.34691
[600]	valid_0's rmse: 3.3418
[700]	valid_0's rmse: 3.33792
[800]	valid_0's rmse: 3.33438
[900]	valid_0's rmse: 3.33144
[1000]	valid_0's rmse: 3.32931
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 3.32931
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 635508, number of used features: 24
[LightG

### テストデータで予測

In [7]:
# 2020年から2022年のテストデータを取得
test_data_2020 = j_uma_race_df[j_uma_race_df['kaisai_nen'] == 2020].copy()
test_data_2021 = j_uma_race_df[j_uma_race_df['kaisai_nen'] == 2021].copy()
test_data_2022 = j_uma_race_df[j_uma_race_df['kaisai_nen'] == 2022].copy()

# 2021年と2022年のデータに対して予測を行い、平均を取る
test_data_2020.loc[:, 'y_pred'] = sum([model.predict(test_data_2020[features], num_iteration=model.best_iteration) for model in models]) / len(models)
test_data_2021.loc[:, 'y_pred'] = sum([model.predict(test_data_2021[features], num_iteration=model.best_iteration) for model in models]) / len(models)
test_data_2022.loc[:, 'y_pred'] = sum([model.predict(test_data_2022[features], num_iteration=model.best_iteration) for model in models]) / len(models)

# 予測されたランクをグループごとに計算
test_data_2020.loc[:, 'predicted_rank'] = test_data_2020.groupby('group')['y_pred'].rank(method='min')
test_data_2021.loc[:, 'predicted_rank'] = test_data_2021.groupby('group')['y_pred'].rank(method='min')
test_data_2022.loc[:, 'predicted_rank'] = test_data_2022.groupby('group')['y_pred'].rank(method='min')

# 結果を表示
test_data_2021[['group', 'predicted_rank', 'kakutei_chakujun']]

Unnamed: 0,group,predicted_rank,kakutei_chakujun
883724,2021-101-45-10,3.0,3
883725,2021-101-45-10,6.0,6
883726,2021-101-45-10,4.0,5
883727,2021-101-45-10,5.0,4
883728,2021-101-45-10,2.0,1
...,...,...,...
1070762,2021-923-42-11,1.0,1
1070772,2021-1203-43-11,2.0,3
1070776,2021-1020-42-11,1.0,9
1070784,2021-908-44-11,3.0,10


## モデル評価

### RMSE

In [8]:
# RMSEを計算
rmse_2020 = np.sqrt(mean_squared_error(test_data_2020['predicted_rank'], test_data_2020['kakutei_chakujun']))
print(f"2021 RMSE: {rmse_2020:.3%}")
rmse_2021 = np.sqrt(mean_squared_error(test_data_2021['predicted_rank'], test_data_2021['kakutei_chakujun']))
print(f"2021 RMSE: {rmse_2021:.3%}")
rmse_2022 = np.sqrt(mean_squared_error(test_data_2022['predicted_rank'], test_data_2022['kakutei_chakujun']))
print(f"2022 RMSE: {rmse_2022:.3%}")
rmse_mean = np.mean([rmse_2020, rmse_2021, rmse_2022])
print(f"Mean RMSE: {rmse_mean:.3%}")

2021 RMSE: 392.235%
2021 RMSE: 394.915%
2022 RMSE: 389.893%
Mean RMSE: 392.348%


### precision@5&recall@5

In [9]:
#precision@5とrecall@5を計算
def precision_at_5(predictions, actual):
    top_5_predictions = predictions.argsort()[-5:][::-1]  # 予測の上位5頭のインデックス
    top_3_actual = actual.argsort()[-3:][::-1]  # 実際の上位3頭のインデックス
    common_elements = np.intersect1d(top_5_predictions, top_3_actual)  # 共通の要素を抽出
    precision = len(common_elements) / 5
    return precision

def recall_at_5(predictions, actual):
    top_5_predictions = predictions.argsort()[-5:][::-1]  # 予測の上位5頭のインデックス
    top_3_actual = actual.argsort()[-3:][::-1]  # 実際の上位3頭のインデックス
    common_elements = np.intersect1d(top_5_predictions, top_3_actual)  # 共通の要素を抽出
    recall = len(common_elements) / 3
    return recall

In [10]:
group_ids_2020 = test_data_2020['group'].unique()
precisions = []
recalls = []

#各レースごとでPrecision@5とRecall@5を計算
for group_id in group_ids_2020:
    test_data = test_data_2020[test_data_2020['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2020 = np.mean(precisions)
recall_5_2020 = np.mean(recalls)

print(f"2020 Precision@5: {precision_5_2020:.3%}")
print(f"2020 Recall@5: {recall_5_2020:.3%}")


group_ids_2021 = test_data_2021['group'].unique()
precisions = []
recalls = []

#各レースごとでPrecision@5とRecall@5を計算
for group_id in group_ids_2021:
    test_data = test_data_2021[test_data_2021['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2021 = np.mean(precisions)
recall_5_2021 = np.mean(recalls)

print(f"2021 Precision@5: {precision_5_2021:.3%}")
print(f"2021 Recall@5: {recall_5_2021:.3%}")


roup_ids_2022 = test_data_2022['group'].unique()
precisions = []
recalls = []

for group_id in roup_ids_2022:
    test_data = test_data_2022[test_data_2022['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2022 = np.mean(precisions)
recall_5_2022 = np.mean(recalls)

print(f"2022 Precision@5: {precision_5_2022:.3%}")
print(f"2022 Recall@5: {recall_5_2022:.3%}")


precision_5_mean = np.mean([precision_5_2020, precision_5_2021, precision_5_2022])
recall_5_mean = np.mean([recall_5_2020, recall_5_2021, recall_5_2022])

print(f"Mean Precision@5: {precision_5_mean:.3%}")
print(f"Mean Recall@5: {recall_5_mean:.3%}")

2020 Precision@5: 42.172%
2020 Recall@5: 70.286%
2021 Precision@5: 41.731%
2021 Recall@5: 69.552%
2022 Precision@5: 41.809%
2022 Recall@5: 69.682%
Mean Precision@5: 41.904%
Mean Recall@5: 69.840%


### 収支(三連複五頭ボックス)

In [11]:
# 各レースごとに予測ランキング上位5頭のうち、実際の上位3頭がすべて入っているかを確認する関数
def check_top3_in_top5_predictions(group):
    predicted_top5 = group.nsmallest(5, 'y_pred').index.tolist()
    actual_top3 = group.nsmallest(3, 'kakutei_chakujun').index.tolist()
    return all([horse in predicted_top5 for horse in actual_top3])

In [12]:
# 各レースで上記の関数を適用
results_2020 = test_data_2020.groupby('group').apply(check_top3_in_top5_predictions)
results_2021 = test_data_2021.groupby('group').apply(check_top3_in_top5_predictions)
results_2022 = test_data_2022.groupby('group').apply(check_top3_in_top5_predictions)

# 予測モデルが成功したレースのインデックスを取得する
successful_groups_2020 = results_2020[results_2020].index
successful_groups_2021 = results_2021[results_2021].index
successful_groups_2022 = results_2022[results_2022].index

# そのインデックスを使用して、harai_dfから対応する支払倍率を取得する
successful_payout_2020 = j_payout_df[j_payout_df['group'].isin(successful_groups_2020)]
successful_payout_2021 = j_payout_df[j_payout_df['group'].isin(successful_groups_2021)]
successful_payout_2022 = j_payout_df[j_payout_df['group'].isin(successful_groups_2022)]

payouts_2020 = successful_payout_2020['haraimodoshi_sanrenpuku_1b']
payouts_2021 = successful_payout_2021['haraimodoshi_sanrenpuku_1b']
payouts_2022 = successful_payout_2022['haraimodoshi_sanrenpuku_1b']

# すべての成功したレースに対して、100円をかけた場合の支払いを計算する
total_payout_2020 = (payouts_2020).sum()
total_payout_2021 = (payouts_2021).sum()
total_payout_2022 = (payouts_2022).sum()

# 合計の支払いから、すべてのレースにかけた合計金額を引くことで、純利益を計算する
total_investment_2020 = test_data_2020['group'].nunique() * 1000
total_investment_2021 = test_data_2021['group'].nunique() * 1000
total_investment_2022 = test_data_2021['group'].nunique() * 1000

net_profit_2020 = total_payout_2020 - total_investment_2020
net_profit_2021 = total_payout_2021 - total_investment_2021
net_profit_2022 = total_payout_2022 - total_investment_2022

average_net_profit_2020 = net_profit_2020/len(test_data_2020)
average_net_profit_2021 = net_profit_2021/len(test_data_2021)
average_net_profit_2022 = net_profit_2020/len(test_data_2022)

print(f"2020 Average Net Profit: {average_net_profit_2020:.3f} yen")
print(f"2021 Average Net Profit: {average_net_profit_2021:.3f} yen")
print(f"2022 Average Net Profit: {average_net_profit_2022:.3f} yen")

mean_average_net_profit = np.mean([average_net_profit_2020, average_net_profit_2021, average_net_profit_2022])
print(f"Mean Average Net Profit: {mean_average_net_profit:.3f} yen")

2020 Average Net Profit: 40.975 yen
2021 Average Net Profit: 41.447 yen
2022 Average Net Profit: 41.728 yen
Mean Average Net Profit: 41.383 yen


In [13]:
# 1. 各レースごとに予測ランキング上位5頭のうち、実際の上位3頭がすべて入っているかを確認する関数
def check_top3_in_top5_predictions(group):
    predicted_top5 = group.nsmallest(5, 'y_pred').index.tolist()
    actual_top3 = group.nsmallest(3, 'kakutei_chakujun').index.tolist()
    return all([horse in predicted_top5 for horse in actual_top3])

# 2. 各グループの利益を計算する関数
def calculate_group_profit(group):
    if check_top3_in_top5_predictions(group):
        payout_value = group['haraimodoshi_sanrenpuku_1b'].iloc[0]
        return payout_value - 1000
    else:
        return -1000

# 3. 各テストデータに対して利益を計算し、新しいカラムとして追加
for test_data in [test_data_2020, test_data_2021, test_data_2022]:
    # test_dataとpayout_dfをマージ
    merged_data = pd.merge(test_data, payout_df[['group', 'haraimodoshi_sanrenpuku_1b']], on='group', how='left')
    profits = merged_data.groupby('group').apply(calculate_group_profit).reset_index()
    profits.columns = ['group', 'profit']
    test_data = pd.merge(test_data, profits, on='group', how='left')

KeyError: "['group'] not in index"

### 予測を見る

In [None]:
# # ```
# # すべての予測を見たい時用（処理時間かかる）
# # ```

# #グループごとの予測値と実際値を見てみる
# for group_id in group_ids_2021:
#     test_data = test_data_2021[test_data_2021['group'] == group_id]
#     sorted_test_data = test_data.sort_values(by='kakutei_chakujun')
#     print(sorted_test_data[['group', 'predicted_rank', 'kakutei_chakujun']])

# モデル保存

In [None]:
# モデルを保存
with open('bestmodels/benchmark.pkl', 'wb') as f:
    pickle.dump(models, f)

# 分析

In [None]:
# models[0]の特徴量の重要度を取得
feature_importance = models[0].feature_importance(importance_type='gain')

# 特徴量の名前と重要度をDataFrameに変換
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
})

# 重要度の降順にソート
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# プロット
plt.figure(figsize=(10, 8))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# 予測ランキングの上位5頭を1、それ以外を0とする
test_data_2021.loc[:, 'predicted_top5'] = test_data_2021.groupby('group')['y_pred'].rank(ascending=True).apply(lambda x: 1 if x <= 5 else 0)

# 実際のランキングの上位3頭を1、それ以外を0とする
test_data_2021.loc[:, 'actual_top3'] = test_data_2021['kakutei_chakujun'].apply(lambda x: 1 if x <= 3 else 0)

# Confusion Matrixを計算
cm = confusion_matrix(test_data_2021['actual_top3'], test_data_2021['predicted_top5'])

# Confusion Matrixを表示
x_label = ['Not Top 5', 'Top 5']
y_label = ['Not Top 3', 'Top 3']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=x_label, yticklabels=y_label)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
#正解したレースの特徴を調べる

In [None]:
successful_groups = test_data_2021.groupby('group').apply(check_top3_in_top5_predictions)
successful_races = test_data_2021[test_data_2021['group'].isin(successful_groups[successful_groups].index)]

In [None]:
successful_races

In [None]:
# 全体のJyoCDの分布を取得
total_counts = test_data_2021['keibajo_code'].value_counts()

# successful_racesの中のJyoCDの分布を取得
success_counts = successful_races['keibajo_code'].value_counts()

# 正解したレースの割合を計算
success_ratio = success_counts / total_counts

# データを双方向の棒グラフで可視化
labels = success_ratio.index
x = np.arange(len(labels))
width = 0.35

fig, ax1 = plt.subplots(figsize=(15, 7))

color = 'tab:blue'
ax1.set_xlabel('keibajo_code')
ax1.set_ylabel('Success Ratio', color=color)
ax1.bar(x - width/2, success_ratio, width, color=color, label='Success Ratio')
ax1.tick_params(axis='y', labelcolor=color)
ax1.set_xticks(x)
ax1.set_xticklabels(labels, rotation=45)

ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Total Races', color=color)
ax2.bar(x + width/2, total_counts, width, color=color, label='Total Races')
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.title('Success Ratio and Total Races by keibajo_code')
plt.show()

In [None]:
# 各競馬場での成功したレースの支払いを合計
total_payouts = successful_payout_2021.groupby('keibajo_code')['haraimodoshi_sanrenpuku_1b'].sum()

# 各競馬場での投資額を計算
total_investments = test_data_2021['keibajo_code'].value_counts() * 1000

# 各競馬場での純利益を計算
net_profits = total_payouts - total_investments 

# データを棒グラフで可視化
net_profits.sort_values(ascending=False).plot(kind='bar', figsize=(12, 6), color='lightcoral')
plt.title('Net Profit by keibajo_code')
plt.xlabel('keibajo_code')
plt.ylabel('Net Profit')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
total_payouts

In [None]:
total_investments

In [None]:
net_profits

In [None]:
def objective(trial):
    # Optunaで調整するパラメータを設定
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
    }
    
    # k=5のFOLDでデータを分割
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # 各FOLDでの評価値を保存するためのリスト
    rmses = []
    
    # LightGBMモデルを訓練
    for train_index, valid_index in kf.split(train_data):
        train_fold_data = train_data.iloc[train_index]
        valid_fold_data = train_data.iloc[valid_index]
        
        train_dataset = lgb.Dataset(train_fold_data[features], label=train_fold_data[target])
        valid_dataset = lgb.Dataset(valid_fold_data[features], label=valid_fold_data[target])
        
        model = lgb.train(params, train_dataset, num_boost_round=1000, valid_sets=[valid_dataset], early_stopping_rounds=100, verbose_eval=100)
        
        # RMSEを計算して保存
        y_pred = model.predict(valid_fold_data[features])
        rmse = np.sqrt(mean_squared_error(valid_fold_data[target], y_pred))
        rmses.append(rmse)
    
    # 5-FOLDの平均RMSEを返す
    return np.mean(rmses)

# Optunaでのパラメータ探索を実行
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# 最適なパラメータ
best_params = study.best_params
best_params['metric'] = 'rmse'  # ここでmetricを追加

# KFoldでのモデル訓練
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []

for train_index, valid_index in kf.split(train_data):
    train_fold_data = train_data.iloc[train_index]
    valid_fold_data = train_data.iloc[valid_index]
    
    train_dataset = lgb.Dataset(train_fold_data[features], label=train_fold_data[target])
    valid_dataset = lgb.Dataset(valid_fold_data[features], label=valid_fold_data[target])
    
    model = lgb.train(best_params, train_dataset, num_boost_round=500, valid_sets=[valid_dataset], early_stopping_rounds=20, verbose_eval=100)  # ここでearly_stopping_roundsを50に変更
    models.append(model)

In [None]:
# 2020年から2022年のテストデータを取得
test_data_2020 = uma_race_df[uma_race_df['kaisai_nen'] == 2020].copy()
test_data_2021 = uma_race_df[uma_race_df['kaisai_nen'] == 2021].copy()
test_data_2022 = uma_race_df[uma_race_df['kaisai_nen'] == 2022].copy()

# 2021年と2022年のデータに対して予測を行い、平均を取る
test_data_2020.loc[:, 'y_pred'] = sum([model.predict(test_data_2020[features], num_iteration=model.best_iteration) for model in models]) / len(models)
test_data_2021.loc[:, 'y_pred'] = sum([model.predict(test_data_2021[features], num_iteration=model.best_iteration) for model in models]) / len(models)
test_data_2022.loc[:, 'y_pred'] = sum([model.predict(test_data_2022[features], num_iteration=model.best_iteration) for model in models]) / len(models)

# 予測されたランクをグループごとに計算
test_data_2020.loc[:, 'predicted_rank'] = test_data_2020.groupby('group')['y_pred'].rank(method='min')
test_data_2021.loc[:, 'predicted_rank'] = test_data_2021.groupby('group')['y_pred'].rank(method='min')
test_data_2022.loc[:, 'predicted_rank'] = test_data_2022.groupby('group')['y_pred'].rank(method='min')

# 結果を表示
test_data_2021[['group', 'predicted_rank', 'kakutei_chakujun']]

In [None]:
# RMSEを計算
rmse_2020 = np.sqrt(mean_squared_error(test_data_2020['predicted_rank'], test_data_2020['kakutei_chakujun']))
print(f"2021 RMSE: {rmse_2020:.3%}")
rmse_2021 = np.sqrt(mean_squared_error(test_data_2021['predicted_rank'], test_data_2021['kakutei_chakujun']))
print(f"2021 RMSE: {rmse_2021:.3%}")
rmse_2022 = np.sqrt(mean_squared_error(test_data_2022['predicted_rank'], test_data_2022['kakutei_chakujun']))
print(f"2022 RMSE: {rmse_2022:.3%}")
rmse_mean = np.mean([rmse_2020, rmse_2021, rmse_2022])
print(f"Mean RMSE: {rmse_mean:.3%}")

group_ids_2020 = test_data_2020['group'].unique()
precisions = []
recalls = []

#各レースごとでPrecision@5とRecall@5を計算
for group_id in group_ids_2020:
    test_data = test_data_2020[test_data_2020['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2020 = np.mean(precisions)
recall_5_2020 = np.mean(recalls)

print(f"2020 Precision@5: {precision_5_2020:.3%}")
print(f"2020 Recall@5: {recall_5_2020:.3%}")


group_ids_2021 = test_data_2021['group'].unique()
precisions = []
recalls = []

#各レースごとでPrecision@5とRecall@5を計算
for group_id in group_ids_2021:
    test_data = test_data_2021[test_data_2021['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2021 = np.mean(precisions)
recall_5_2021 = np.mean(recalls)

print(f"2021 Precision@5: {precision_5_2021:.3%}")
print(f"2021 Recall@5: {recall_5_2021:.3%}")


roup_ids_2022 = test_data_2022['group'].unique()
precisions = []
recalls = []

for group_id in roup_ids_2022:
    test_data = test_data_2022[test_data_2022['group'] == group_id]
    p = precision_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    r = recall_at_5(test_data['predicted_rank'].values, test_data['kakutei_chakujun'].values)
    precisions.append(p)
    recalls.append(r)

precision_5_2022 = np.mean(precisions)
recall_5_2022 = np.mean(recalls)

print(f"2022 Precision@5: {precision_5_2022:.3%}")
print(f"2022 Recall@5: {recall_5_2022:.3%}")


precision_5_mean = np.mean([precision_5_2020, precision_5_2021, precision_5_2022])
recall_5_mean = np.mean([recall_5_2020, recall_5_2021, recall_5_2022])

print(f"Mean Precision@5: {precision_5_mean:.3%}")
print(f"Mean Recall@5: {recall_5_mean:.3%}")

# 各レースで上記の関数を適用
results_2020 = test_data_2020.groupby('group').apply(check_top3_in_top5_predictions)
results_2021 = test_data_2021.groupby('group').apply(check_top3_in_top5_predictions)
results_2022 = test_data_2022.groupby('group').apply(check_top3_in_top5_predictions)

# 予測モデルが成功したレースのインデックスを取得する
successful_groups_2020 = results_2020[results_2020].index
successful_groups_2021 = results_2021[results_2021].index
successful_groups_2022 = results_2022[results_2022].index

# そのインデックスを使用して、harai_dfから対応する支払倍率を取得する
successful_payout_2020 = j_payout_df[j_payout_df['group'].isin(successful_groups_2020)]
successful_payout_2021 = j_payout_df[j_payout_df['group'].isin(successful_groups_2021)]
successful_payout_2022 = j_payout_df[j_payout_df['group'].isin(successful_groups_2022)]

payouts_2020 = successful_payout_2020['haraimodoshi_sanrenpuku_1b']
payouts_2021 = successful_payout_2021['haraimodoshi_sanrenpuku_1b']
payouts_2022 = successful_payout_2022['haraimodoshi_sanrenpuku_1b']

# すべての成功したレースに対して、100円をかけた場合の支払いを計算する
total_payout_2020 = (payouts_2020).sum()
total_payout_2021 = (payouts_2021).sum()
total_payout_2022 = (payouts_2022).sum()

# 合計の支払いから、すべてのレースにかけた合計金額を引くことで、純利益を計算する
total_investment_2020 = test_data_2020['group'].nunique() * 1000
total_investment_2021 = test_data_2021['group'].nunique() * 1000
total_investment_2022 = test_data_2021['group'].nunique() * 1000

net_profit_2020 = total_payout_2020 - total_investment_2020
net_profit_2021 = total_payout_2021 - total_investment_2021
net_profit_2022 = total_payout_2022 - total_investment_2022

average_net_profit_2020 = net_profit_2020/len(test_data_2020)
average_net_profit_2021 = net_profit_2021/len(test_data_2021)
average_net_profit_2022 = net_profit_2020/len(test_data_2022)

print(f"2020 Average Net Profit: {average_net_profit_2020:.3f} yen")
print(f"2021 Average Net Profit: {average_net_profit_2021:.3f} yen")
print(f"2022 Average Net Profit: {average_net_profit_2022:.3f} yen")

mean_average_net_profit = np.mean([average_net_profit_2020, average_net_profit_2021, average_net_profit_2022])
print(f"Mean Average Net Profit: {mean_average_net_profit:.3f} yen")

In [None]:
# モデルを保存
with open('bestmodels/benchmark_optunaed.pkl', 'wb') as f:
    pickle.dump(models, f)