# インポート

In [1]:
import preprocessing
from feature_engineering import FeatureCreator
#from feature_engineering import FeatureCreator, PredictionFeatureCreator
from train import Trainer
import prediction
from evaluation import Evaluator
%load_ext autoreload

In [2]:
%autoreload

# データ加工

In [3]:
# Preprocess the race results table.
results_preprocessed = preprocessing.process_results()

In [4]:
# Duplicate check: each (race_id, horse_id) pair must appear at most once.
# The assert makes "Restart & Run All" stop here on a violation instead of
# relying on someone reading the displayed count.
n_duplicate_results = results_preprocessed.duplicated(subset=["race_id", "horse_id"]).sum()
assert n_duplicate_results == 0, (
    f"results_preprocessed has {n_duplicate_results} duplicated (race_id, horse_id) rows"
)
n_duplicate_results

np.int64(0)

In [5]:
# Missing-value check: count the null entries in every column.
results_preprocessed.isna().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansho_odds    0
popularity     0
impost         0
sex            0
age            0
weight         0
weight_diff    0
dtype: int64

In [6]:
# Preprocess the horses' past-results table.
horse_results_preprocessed = preprocessing.process_horse_results()

In [7]:
# Missing-value check: count the null entries in every column.
horse_results_preprocessed.isna().sum()

horse_id            0
date                0
rank                0
prize               0
rank_diff        3344
weather           551
race_type           0
course_len          0
ground_state      290
race_class      94091
time             3266
win                 0
rentai              0
show                0
place               0
n_horses            0
dtype: int64

In [8]:
# Duplicate check: each (horse_id, date) pair must appear at most once.
# The assert makes "Restart & Run All" stop here on a violation instead of
# relying on someone reading the displayed count.
n_duplicate_horse_results = horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).sum()
assert n_duplicate_horse_results == 0, (
    f"horse_results_preprocessed has {n_duplicate_horse_results} duplicated (horse_id, date) rows"
)
n_duplicate_horse_results

np.int64(0)

In [9]:
# Preprocess the race information table.
race_info_preprocessed = preprocessing.process_race_info()

In [10]:
# Display the preprocessed race information table for a visual sanity check.
race_info_preprocessed

Unnamed: 0,race_id,date,race_type,around,course_len,weather,ground_state,race_class,place,month,sin_date,cos_date
0,202301010101,2023-07-22,1,0.0,1200,0,0,1,1,7,0.656633,0.060799
1,202301010102,2023-07-22,0,0.0,1000,0,0,1,1,7,0.656633,0.060799
2,202301010103,2023-07-22,0,0.0,1700,0,0,1,1,7,0.656633,0.060799
3,202301010104,2023-07-22,1,0.0,1500,0,0,1,1,7,0.656633,0.060799
4,202301010105,2023-07-22,0,0.0,1700,0,0,0,1,7,0.656633,0.060799
...,...,...,...,...,...,...,...,...,...,...,...,...
3451,202310030808,2023-09-03,0,0.0,1000,0,0,2,10,9,0.113279,0.537695
3452,202310030809,2023-09-03,1,0.0,1200,0,0,3,10,9,0.113279,0.537695
3453,202310030810,2023-09-03,0,0.0,1700,0,0,4,10,9,0.113279,0.537695
3454,202310030811,2023-09-03,1,0.0,1200,0,0,7,10,9,0.113279,0.537695


In [11]:
# Preprocess the payout (return) tables.
return_tables_preprocessed = preprocessing.process_return_tables()

In [12]:
# Display the preprocessed payout tables for a visual sanity check.
return_tables_preprocessed

Unnamed: 0_level_0,bet_type,win_umaban,return
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
202301010101,単勝,[5],120
202301010101,複勝,[5],100
202301010101,複勝,[8],110
202301010101,複勝,[6],310
202301010101,馬連,"[5, 8]",170
...,...,...,...
202310030812,ワイド,"[6, 8]",410
202310030812,ワイド,"[6, 11]",470
202310030812,馬単,"[8, 11]",1580
202310030812,三連複,"[6, 8, 11]",1590


# 特徴量の作成

In [13]:
# Build the feature table with FeatureCreator; `features` is validated in the
# next cell and later passed to prediction.predict.
fc = FeatureCreator()
features = fc.create_features()

In [14]:
# Duplicate check: each (race_id, horse_id) pair must appear at most once.
# The assert makes "Restart & Run All" stop here on a violation instead of
# relying on someone reading the displayed count.
n_duplicate_features = features.duplicated(subset=["race_id", "horse_id"]).sum()
assert n_duplicate_features == 0, (
    f"features has {n_duplicate_features} duplicated (race_id, horse_id) rows"
)
n_duplicate_features

np.int64(0)

# 学習

In [16]:
# Train the model. The date below is forwarded as `test_start_date`;
# presumably races on or after it form the hold-out set — TODO confirm in train.py.
TEST_START_DATE = "2023-10-01"
trainer = Trainer()
evaluation_df = trainer.run(test_start_date=TEST_START_DATE)

[100]	training's binary_logloss: 0.153821	valid_1's binary_logloss: 0.20604


# 精度評価

In [17]:
# Evaluation summary for n=1: model vs. popularity hit rate and return rate per
# bet type. Judging by `save_filename`, the table is also written to disk.
evaluator = Evaluator()
evaluator.summarize_box_top_n(
    n=1,
    save_filename="box_summary_top1.csv",
)

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
単勝,0.327546,0.331019,0.81169,0.798727
複勝,0.633102,0.641204,0.841551,0.838194


In [18]:
# Evaluation summary for n=2: model vs. popularity hit rate and return rate per
# bet type. Judging by `save_filename`, the table is also written to disk.
evaluator.summarize_box_top_n(
    n=2,
    save_filename="box_summary_top2.csv",
)

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ワイド,0.326389,0.324074,0.880671,0.809606
単勝,0.539352,0.543981,0.877199,0.836285
複勝,0.820602,0.842593,0.839583,0.83941
馬単,0.165509,0.170139,0.883565,0.811748
馬連,0.165509,0.170139,0.883218,0.829977


In [19]:
# Evaluation summary for n=3: model vs. popularity hit rate and return rate per
# bet type. Judging by `save_filename`, the table is also written to disk.
evaluator.summarize_box_top_n(
    n=3,
    save_filename="box_summary_top3.csv",
)

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ワイド,0.543981,0.559028,0.828549,0.802353
三連単,0.09375,0.09375,0.736979,0.60787
三連複,0.09375,0.09375,0.789005,0.731713
単勝,0.663194,0.663194,0.851543,0.794676
複勝,0.922454,0.924769,0.83669,0.824807
馬単,0.31713,0.322917,0.793326,0.713117
馬連,0.31713,0.322917,0.811304,0.733719


# 予測

当日出走馬が確定した時点で実行できる

In [20]:
# Preprocess the past-results table of the horses entered on race day.
# The result gets its own name so it does not overwrite the training-time
# `horse_results_preprocessed` from the data-preparation section; otherwise,
# re-running the missing-value / duplicate checks after this cell would silently
# inspect the prediction data instead.
# NOTE(review): input and output file names are identical — presumably they are
# resolved against different directories inside `preprocessing`; confirm that
# the raw input file is not overwritten.
horse_results_prediction_preprocessed = preprocessing.process_horse_results(
    input_filename="horse_results_prediction.csv",
    output_filename="horse_results_prediction.csv",
)

In [21]:
# Prediction.
# NOTE(review): `features` is the table built by FeatureCreator in the
# feature-creation section, not one derived from the race-day data preprocessed
# in the previous cell — confirm this is intended (the PredictionFeatureCreator
# import at the top of the notebook is commented out).
prediction.predict(features)

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
11808,202305010109,3,1.5,1,0.917490
10376,202304030704,5,1.5,1,0.862048
19402,202306010109,9,1.5,1,0.860638
12322,202305010409,8,1.5,1,0.857335
19258,202305050812,2,1.3,1,0.853936
...,...,...,...,...,...
16578,202305040103,14,630.9,16,0.000197
14728,202305021007,9,667.7,16,0.000183
39079,202309020606,3,354.9,13,0.000179
15326,202305030112,10,420.7,16,0.000173
