# インポート

In [1]:
# Pipeline modules used by the sections below (preprocessing, feature creation,
# training, prediction, evaluation).
import preprocessing
from feature_engineering import FeatureCreator
# NOTE(review): the import that also pulls in PredictionFeatureCreator is
# disabled; confirm whether the prediction section is meant to use it.
#from feature_engineering import FeatureCreator, PredictionFeatureCreator
from train import Trainer
import prediction
from evaluation import Evaluator
# Load the IPython autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [2]:
%autoreload

# データ加工

In [3]:
# Preprocess the race results table.
results_preprocessed = preprocessing.process_results()

In [4]:
# Duplicate check: each (race_id, horse_id) pair must appear at most once.
# Enforced with an assert so that a full re-run stops here on bad data instead
# of relying on someone reading the displayed count.
n_duplicated_results = results_preprocessed.duplicated(subset=["race_id", "horse_id"]).sum()
assert n_duplicated_results == 0, (
    f"{n_duplicated_results} duplicated (race_id, horse_id) rows in results_preprocessed"
)
# Still display the count as the cell output.
n_duplicated_results

np.int64(0)

In [5]:
# Missing-value check: number of nulls per column of the preprocessed results.
results_preprocessed.isnull().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansho_odds    0
popularity     0
impost         0
sex            0
age            0
weight         0
weight_diff    0
dtype: int64

In [6]:
# Process the horses' past-results table.
horse_results_preprocessed = preprocessing.process_horse_results()

In [7]:
# Missing-value check: number of nulls per column of the preprocessed past results.
horse_results_preprocessed.isnull().sum()

horse_id            0
date                0
rank                0
prize               0
rank_diff        3344
weather           551
race_type           0
course_len          0
ground_state      290
race_class      94091
time             3266
win                 0
rentai              0
show                0
place               0
n_horses            0
dtype: int64

In [8]:
# Duplicate check: each (horse_id, date) pair must appear at most once.
# Enforced with an assert so that a full re-run stops here on bad data instead
# of relying on someone reading the displayed count.
n_duplicated_horse_results = horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).sum()
assert n_duplicated_horse_results == 0, (
    f"{n_duplicated_horse_results} duplicated (horse_id, date) rows in horse_results_preprocessed"
)
# Still display the count as the cell output.
n_duplicated_horse_results

np.int64(0)

In [9]:
# Preprocess the race-info table.
race_info_preprocessed = preprocessing.process_race_info()

In [10]:
# Display the preprocessed race-info table for a visual sanity check.
race_info_preprocessed

Unnamed: 0,race_id,date,race_type,around,course_len,weather,ground_state,race_class,place,month,sin_date,cos_date
0,202301010101,2023-07-22,1,0.0,1200,0,0,1,1,7,0.656633,0.060799
1,202301010102,2023-07-22,0,0.0,1000,0,0,1,1,7,0.656633,0.060799
2,202301010103,2023-07-22,0,0.0,1700,0,0,1,1,7,0.656633,0.060799
3,202301010104,2023-07-22,1,0.0,1500,0,0,1,1,7,0.656633,0.060799
4,202301010105,2023-07-22,0,0.0,1700,0,0,0,1,7,0.656633,0.060799
...,...,...,...,...,...,...,...,...,...,...,...,...
3451,202310030808,2023-09-03,0,0.0,1000,0,0,2,10,9,0.113279,0.537695
3452,202310030809,2023-09-03,1,0.0,1200,0,0,3,10,9,0.113279,0.537695
3453,202310030810,2023-09-03,0,0.0,1700,0,0,4,10,9,0.113279,0.537695
3454,202310030811,2023-09-03,1,0.0,1200,0,0,7,10,9,0.113279,0.537695


In [11]:
# Preprocess the payout (return) table.
return_tables_preprocessed = preprocessing.process_return_tables()

In [12]:
# Display the preprocessed payout table for a visual sanity check.
return_tables_preprocessed

Unnamed: 0_level_0,bet_type,win_umaban,return
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
202301010101,単勝,[5],120
202301010101,複勝,[5],100
202301010101,複勝,[8],110
202301010101,複勝,[6],310
202301010101,馬連,"[5, 8]",170
...,...,...,...
202310030812,ワイド,"[6, 8]",410
202310030812,ワイド,"[6, 11]",470
202310030812,馬単,"[8, 11]",1580
202310030812,三連複,"[6, 8, 11]",1590


# 特徴量の作成

In [8]:
# Build the feature table.
# NOTE(review): FeatureCreator() takes no arguments, so it does not receive the
# preprocessed frames held in this notebook; presumably it loads them from
# files written by the preprocessing step - confirm.
fc = FeatureCreator()
features = fc.create_features()

In [9]:
# Duplicate check: each (race_id, horse_id) pair must appear at most once in
# the feature table. Enforced with an assert so that a full re-run stops here
# on bad data instead of relying on someone reading the displayed count.
n_duplicated_features = features.duplicated(subset=["race_id", "horse_id"]).sum()
assert n_duplicated_features == 0, (
    f"{n_duplicated_features} duplicated (race_id, horse_id) rows in features"
)
# Still display the count as the cell output.
n_duplicated_features

np.int64(0)

# 学習

In [34]:
# Train using the settings file config.yaml and keep what run() returns.
# NOTE(review): the two split dates are hard-coded here; presumably they mark
# where the validation and test periods begin - confirm against Trainer.run.
trainer = Trainer(config_filepath="config.yaml")
evaluation_df = trainer.run(valid_start_date="2023-11-01", test_start_date="2023-12-01")

Training until validation scores don't improve for 100 rounds


[100]	valid_0's binary_logloss: 0.209462
[200]	valid_0's binary_logloss: 0.20042
[300]	valid_0's binary_logloss: 0.19804
[400]	valid_0's binary_logloss: 0.197291
[500]	valid_0's binary_logloss: 0.197089
[600]	valid_0's binary_logloss: 0.196931
[700]	valid_0's binary_logloss: 0.196518
[800]	valid_0's binary_logloss: 0.196652
Early stopping, best iteration is:
[721]	valid_0's binary_logloss: 0.196501


# 精度評価

In [35]:
# Summarise hit rate and return rate per bet type for n=1 and display the table.
# NOTE(review): Evaluator() is built without evaluation_df, so it presumably
# reads the training run's results from disk; the *_pop columns look like a
# popularity-based baseline - confirm both.
evaluator = Evaluator()
evaluator.summarize_box_top_n(n=1, save_filename="box_summary_top1.csv")

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
単勝,0.326389,0.315972,0.814583,0.751042
複勝,0.638889,0.631944,0.840278,0.823264


In [36]:
# Summarise hit rate and return rate per bet type for n=2 and display the table;
# the save_filename argument names the CSV for this summary.
evaluator.summarize_box_top_n(n=2, save_filename="box_summary_top2.csv")

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ワイド,0.298611,0.309028,0.747569,0.759722
単勝,0.541667,0.53125,0.871181,0.817708
複勝,0.854167,0.857639,0.843576,0.851042
馬単,0.170139,0.184028,0.812153,0.873264
馬連,0.170139,0.184028,0.854167,0.901389


In [37]:
# Summarise hit rate and return rate per bet type for n=3 and display the table;
# the save_filename argument names the CSV for this summary.
evaluator.summarize_box_top_n(n=3, save_filename="box_summary_top3.csv")

Unnamed: 0_level_0,hitrate_model,hitrate_pop,returnrate_model,returnrate_pop
bet_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ワイド,0.604167,0.583333,0.882292,0.835069
三連単,0.083333,0.076389,0.594734,0.535937
三連複,0.083333,0.076389,0.689931,0.680208
単勝,0.680556,0.652778,0.839468,0.777546
複勝,0.934028,0.920139,0.88125,0.859259
馬単,0.364583,0.329861,0.864525,0.743171
馬連,0.364583,0.329861,0.913079,0.758912


# 予測

当日の出走馬が確定した時点で実行できます（当日出走馬の過去成績 `horse_results_prediction.csv` が必要です）。

In [38]:
# Preprocess the past-results table of the horses running on the prediction day.
# Stored under its own name so the training-history frame held in
# `horse_results_preprocessed` is not silently replaced when cells are re-run
# out of order.
# NOTE(review): input and output use the same filename; confirm they resolve to
# different directories so the raw file is not overwritten.
horse_results_prediction_preprocessed = preprocessing.process_horse_results(
    input_filename="horse_results_prediction.csv",
    output_filename="horse_results_prediction.csv",
)

In [39]:
# Run prediction and display the result.
# NOTE(review): `features` is the frame built by FeatureCreator in the feature
# creation section, and the displayed rows span race_ids from across 2023;
# confirm whether features for the prediction-day runners should be passed here.
prediction.predict(features)

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
11808,202305010109,3,1.5,1,0.819407
19402,202306010109,9,1.5,1,0.792035
12322,202305010409,8,1.5,1,0.778701
35590,202308030408,12,1.2,1,0.771080
31977,202308010206,3,1.2,1,0.766301
...,...,...,...,...,...
1936,202301020708,11,356.1,14,0.000957
20888,202306020203,10,720.6,16,0.000948
30802,202307040106,11,453.3,16,0.000946
9196,202304020407,6,463.8,16,0.000907
