# Overview

Use Recursive Feature Elimination (RFE) to select the most important features. First optimize hyperparameters on the full features, then use the hyperparameters to run RFE. Use SMOTE to balance the data.

In [1]:
import os
import re
import tempfile
import warnings

import japanize_matplotlib
import lightgbm as lgb
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from hyperopt import STATUS_OK, SparkTrials, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema
from pyspark.sql import SparkSession
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from JapanHorseRaceAnalytics.utilities.base import get_base_dir, read_hive_table
from JapanHorseRaceAnalytics.utilities.metrics import (
    calculate_binary_classifier_statistics,
)
from JapanHorseRaceAnalytics.utilities.mlflow import get_colspecs
from JapanHorseRaceAnalytics.utilities.structured_logger import logger

# Set pandas display options
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 200)

# Set seed for reproducibility
os.environ["PYTHONHASHSEED"] = str(42)
np.random.seed(42)
tf.random.set_seed(42)
random_state = 42

# Prepare the data

In [2]:
warehouse_dir = f"{get_base_dir()}/spark-warehouse"
postgres_driver_path = f"{get_base_dir()}/jars/postgresql-42.7.1.jar"

spark = (
    SparkSession.builder.appName("20240211_competitors")
    .config("spark.driver.memory", "20g")
    .config("spark.sql.warehouse.dir", warehouse_dir)
    .config("spark.jars", postgres_driver_path)
    .config("spark.executor.extraClassPath", postgres_driver_path)
    .config("spark.driver.extraClassPath", postgres_driver_path)
    .enableHiveSupport()
    .getOrCreate()
)

24/02/25 15:13:24 WARN Utils: Your hostname, Hanks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.40.105 instead (on interface en0)
24/02/25 15:13:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/02/25 15:13:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/25 15:13:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
data = read_hive_table(
    table_name="features_20240217_v1",
    schema="jhra_curated",
    spark_session=spark,
    # use_cache=False,
)

# drop from data where cat_トラック種別 == "障害"
data = data[(data["cat_トラック種別"] != "障害")]

# This would mess up the number of horses in the race
# No, it wouldn't because the number of horses in the race is already calculated
data = data[(data["meta_int_race_horses_異常区分"] == "0")]

data = data.reset_index(drop=True)
data.head()

{"event": "Read from hive jhra_curated.features_20240217_v1", "level": "info", "timestamp": "2024-02-25T06:13:25.300347Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
24/02/25 15:13:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/02/25 15:13:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/02/25 15:13:26 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
{"event": "Write to parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_20240217_v1.snappy.parquet", "level": "info", "timestamp": "2024-02-25T06:13:27.719752Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
24/02/25 15:13:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
{"event": "Read from parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_2024021

Unnamed: 0,meta_レースキー,meta_馬番,meta_着順,meta_本賞金,meta_単勝的中,meta_単勝払戻金,meta_複勝的中,meta_複勝払戻金,meta_int_races_レースキー,meta_発走日時,meta_場コード,cat_四半期,cat_距離,cat_事前_馬場状態コード,cat_事前_レース条件_トラック情報_右左,cat_事前_レース条件_トラック情報_内外,cat_事前_レース条件_種別,cat_事前_レース条件_条件,cat_事前_レース条件_記号,cat_事前_レース条件_重量,cat_事前_レース条件_グレード,num_事前_馬場差,num_頭数,cat_トラック種別,cat_馬場状態内,cat_馬場状態中,cat_馬場状態外,num_直線馬場差最内,num_直線馬場差内,num_直線馬場差中,num_直線馬場差外,num_直線馬場差大外,cat_芝種類,cat_草丈,cat_転圧,cat_凍結防止剤,num_中間降水量,meta_int_race_horses_レースキー,meta_int_race_horses_馬番,meta_int_race_horses_血統登録番号,meta_int_race_horses_発走日時,meta_int_race_horses_異常区分,num_事前ＩＤＭ,cat_事前脚質,num_事前単勝オッズ,num_事前複勝オッズ,cat_事前馬体,cat_事前気配コード,cat_事前上昇度,cat_事前クラスコード,num_事前テン指数,num_事前ペース指数,num_事前上がり指数,num_負担重量,num_馬体重,num_馬体重増減,cat_性別,cat_トラック種別瞬発戦好走馬,cat_トラック種別消耗戦好走馬,num_一走前不利,num_二走前不利,num_三走前不利,num_一走前着順,num_二走前着順,num_三走前着順,num_四走前着順,num_五走前着順,num_六走前着順,num_1走前上昇度,num_2走前上昇度,num_3走前上昇度,num_4走前上昇度,num_5走前上昇度,num_騎手指数,num_情報指数,num_オッズ指数,num_パドック指数,num_総合指数,cat_馬具変更情報,cat_脚元情報,cat_見習い区分,cat_オッズ印,cat_パドック印,cat_直前総合印,cat_距離適性,num_ローテーション,num_基準オッズ,num_基準人気順位,num_基準複勝オッズ,num_基準複勝人気順位,num_特定情報◎,num_特定情報○,num_特定情報▲,num_特定情報△,num_特定情報×,num_総合情報◎,num_総合情報○,num_総合情報▲,num_総合情報△,num_総合情報×,...,num_競争相手平均調教師1位完走,num_競争相手調教師1位完走標準偏差,num_競争相手最高調教師トップ3完走,num_競争相手最低調教師トップ3完走,num_競争相手平均調教師トップ3完走,num_競争相手調教師トップ3完走標準偏差,num_競争相手最高調教師1位完走率,num_競争相手最低調教師1位完走率,num_競争相手平均調教師1位完走率,num_競争相手調教師1位完走率標準偏差,num_競争相手最高調教師トップ3完走率,num_競争相手最低調教師トップ3完走率,num_競争相手平均調教師トップ3完走率,num_競争相手調教師トップ3完走率標準偏差,num_競争相手最高調教師場所レース数,num_競争相手最低調教師場所レース数,num_競争相手平均調教師場所レース数,num_競争相手調教師場所レース数標準偏差,num_競争相手最高調教師場所1位完走,num_競争相手最低調教師場所1位完走,num_競争相手平均調教師場所1位完走,num_競争相手調教師場所1位完走標準偏差,num_競争相手最高調教師場所トップ3完走,num_競争相手最低調教師場所トップ3完走,num_競争相手平均調教師場所トップ3完走,num_競争相手調教師場所トップ3完走標準偏差,num_競争相手最高調教師場所1位完走率,num_競争相手最低調教師場所1位完走率,num_競争相手平均調教師場所1位完走率,num_競争相手調教師場所1位完走率標準偏差,num_競争相手最高調教師場所トップ3完走率,num_競争相手最低調教師場所トップ3完走率,num_競争相手平均調教師場所トップ3完走率,num_競争相手調教師場所トップ3完走率標準偏差,num_競争相手最高調教師本賞金累計,num_競争相手最低調教師本賞金累計,num_競争相手平均調教師本賞金累計,num_競争相手調教師本賞金累計標準偏差,num_競争相手最高調教師1位完走平均賞金,num_競争相手最低調教師1位完走平均賞金,num_競争相手平均調教師1位完走平均賞金,num_競争相手調教師1位完走平均賞金標準偏差,num_競争相手最高調教師レース数平均賞金,num_競争相手最低調教師レース数平均賞金,num_競争相手平均調教師レース数平均賞金,num_競争相手調教師レース数平均賞金標準偏差,num_競争相手平均調教師レース数差,num_競争相手平均調教師1位完走差,num_競争相手平均調教師トップ3完走差,num_競争相手平均調教師1位完走率差,num_競争相手平均調教師トップ3完走率差,num_競争相手平均調教師場所レース数差,num_競争相手平均調教師場所1位完走差,num_競争相手平均調教師場所トップ3完走差,num_競争相手平均調教師場所1位完走率差,num_競争相手平均調教師場所トップ3完走率差,num_競争相手平均調教師本賞金累計差,num_競争相手平均調教師1位完走平均賞金差,num_競争相手平均調教師レース数平均賞金差,meta_int_combinations_レースキー,meta_int_combinations_馬番,num_馬騎手レース数,num_馬騎手1位完走,num_馬騎手1位完走率,num_馬騎手トップ3完走,num_馬騎手トップ3完走率,num_馬騎手初二走,num_馬騎手同騎手,num_馬騎手場所レース数,num_馬騎手場所1位完走,num_馬騎手場所1位完走率,num_馬騎手場所トップ3完走,num_馬騎手場所トップ3完走率,num_馬調教師レース数,num_馬調教師1位完走,num_馬調教師1位完走率,num_馬調教師トップ3完走,num_馬調教師トップ3完走率,num_馬調教師初二走,num_馬調教師同調教師,num_馬調教師場所レース数,num_馬調教師場所1位完走,num_馬調教師場所1位完走率,num_馬調教師場所トップ3完走,num_馬調教師場所トップ3完走率,meta_int_race_weather_レースキー,num_temperature,num_precipitation,num_snowfall,num_snow_depth,num_wind_speed,cat_wind_direction,num_solar_radiation,num_local_air_pressure,num_sea_level_air_pressure,num_relative_humidity,num_vapor_pressure,num_dew_point_temperature,cat_weather,num_visibility
0,1011103,4,6.0,0.0,False,0,False,0,1011103,2001-08-04 01:45:00,1,3,1200,20,1,1,12,A3,102,3,,,16,芝,1,1,1,1,1,0,0,0,,,False,False,,1011103,4,98102049,2001-08-04 01:45:00,0,36.0,好位差し,11.5,2.9,,,3,18.0,-12.4,-21.1,-10.9,550,476.0,14,牡,True,False,0.0,0.0,0.0,7.0,2.0,7.0,,,,3.0,3.0,3.0,,,0.4,0.4,0.0,1.8,38.6,0,0,0,,4.0,4.0,5,4.0,16.8,6,3.4,6,0,0,0,10,0,3,6,8,87,0,...,34.133333,18.575492,204,32,98.066667,44.395896,0.162376,0.027668,0.075334,0.036725,0.40396,0.120735,0.213261,0.076365,47,3,18.733333,12.390677,8,0,2.066667,2.112397,15,0,4.733333,4.464178,0.333333,0.0,0.09418,0.088041,0.5,0.0,0.207743,0.136352,231606.0,14187.0,65859.266667,53242.331542,1687.439024,567.857143,913.419366,273.792817,458.625743,49.167979,141.408061,101.993179,73.0,-3.133333,-5.066667,-0.016287,-0.036119,-6.733333,-1.066667,-0.733333,-0.010847,0.12559,-18662.266667,-159.225818,-51.509013,1011103,4,2,0,0.0,0,0.0,False,False,0,0,0.0,0,0.0,9,0,0.0,1,0.111111,False,True,0,0,0.0,0,0.0,1011103,22.8,0.0,,0.0,3.9,北西,2.93,1010.95,1013.95,60.75,16.875,14.85,,
1,1011103,9,2.0,200.0,False,0,True,120,1011103,2001-08-04 01:45:00,1,3,1200,20,1,1,12,A3,102,3,,,16,芝,1,1,1,1,1,0,0,0,,,False,False,,1011103,9,98102902,2001-08-04 01:45:00,0,38.0,先行,4.4,1.6,,,3,16.0,-10.6,-23.6,-5.1,550,482.0,0,牡,True,False,0.0,0.0,,3.0,2.0,,,,,3.0,3.0,,,,1.6,2.5,2.5,2.0,46.6,0,2,0,3.0,3.0,3.0,5,3.0,4.2,2,1.5,2,5,2,4,0,0,30,45,33,27,0,...,34.6,18.402174,204,32,99.0,44.131621,0.162376,0.027668,0.076291,0.036095,0.40396,0.120735,0.215263,0.074985,47,3,18.933333,12.25543,8,0,2.066667,2.112397,15,0,4.8,4.445222,0.333333,0.0,0.092329,0.087958,0.5,0.0,0.207743,0.136352,231606.0,14187.0,66490.8,52960.294858,1687.439024,567.857143,916.365603,271.818667,458.625743,49.167979,142.718028,101.193884,85.8,-10.6,-20.0,-0.031598,-0.06815,-9.933333,-1.066667,-1.8,0.018783,0.12559,-28766.8,-206.365603,-72.468494,1011103,9,3,0,0.0,2,0.666667,False,True,0,0,0.0,0,0.0,8,0,0.0,3,0.375,False,True,0,0,0.0,0,0.0,1011103,22.8,0.0,,0.0,3.9,北西,2.93,1010.95,1013.95,60.75,16.875,14.85,,
2,1011204,14,6.0,0.0,False,0,False,0,1011204,2001-08-05 02:15:00,1,3,1800,10,1,1,12,A3,102,3,,,14,芝,1,1,1,1,1,0,0,0,,,False,False,,1011204,14,98110058,2001-08-05 02:15:00,0,33.4,差し,7.6,2.3,,,3,38.0,-16.2,-9.6,-23.2,550,470.0,6,牡,False,False,0.0,,,8.0,,,,,,3.0,,,,,1.3,0.0,1.5,3.0,39.2,0,0,0,5.0,2.0,6.0,5,6.0,8.6,6,2.7,6,0,1,1,7,0,11,7,9,71,0,...,41.384615,23.470238,231,21,120.846154,58.455566,0.150621,0.033766,0.083075,0.030124,0.358696,0.119481,0.248023,0.072792,65,1,25.230769,17.129165,5,0,2.153846,1.511299,20,0,8.076923,5.980244,0.2,0.0,0.081879,0.051502,0.6,0.0,0.289074,0.151223,164902.0,12493.0,74442.5,47183.126061,1498.970588,626.153846,964.732308,274.777458,266.831715,48.657143,150.209357,65.950213,47.076923,13.615385,35.153846,0.021887,0.049687,15.769231,6.846154,9.923077,0.137634,0.14995,49377.5,316.358601,86.088353,1011204,14,4,0,0.0,0,0.0,False,False,0,0,0.0,0,0.0,10,0,0.0,1,0.1,False,True,0,0,0.0,0,0.0,1011204,22.3,0.0,,0.0,4.625,北北西,3.14,1010.325,1013.325,64.0,17.225,15.15,1.0,30.0
3,1011303,6,3.0,130.0,False,0,True,1090,1011303,2001-08-11 01:45:00,1,3,1700,10,1,1,12,A3,2,3,,,13,ダート,1,1,1,1,1,0,0,0,,,False,False,,1011303,6,98103267,2001-08-11 01:45:00,0,17.0,差し,50.8,8.7,,,3,18.0,-8.4,-24.3,-28.8,550,436.0,-4,牡,False,False,1.0,0.0,0.0,12.0,13.0,7.0,,,,3.0,3.0,3.0,,,0.3,-1.0,0.0,0.0,16.3,0,0,0,,,,5,0.0,89.5,13,14.5,13,0,0,0,0,0,0,0,0,1,0,...,38.25,16.573699,180,50,122.25,44.358624,0.096825,0.029126,0.064903,0.022876,0.294828,0.121359,0.208923,0.055122,65,2,35.25,19.472737,10,0,2.666667,2.838231,20,1,8.083333,7.193265,0.153846,0.0,0.053959,0.048886,0.5,0.039216,0.256171,0.157059,172151.0,24692.0,77010.125,41784.766816,1655.294118,716.296296,972.226859,235.029654,296.812069,59.932039,130.679572,63.348222,-41.333333,13.75,21.75,0.033582,0.063804,7.75,1.333333,7.916667,0.039064,0.115922,-1223.125,-199.919166,12.856413,1011303,6,1,0,0.0,0,0.0,True,True,1,0,0.0,0,0.0,5,0,0.0,0,0.0,False,True,1,0,0.0,0,0.0,1011303,23.475,0.0,,0.0,0.825,北,1.435,1009.925,1012.925,65.0,18.725,16.475,,
4,1011304,7,1.0,510.0,True,230,True,120,1011304,2001-08-11 02:15:00,1,3,2000,10,1,1,12,A3,102,3,,,16,芝,1,1,1,1,1,0,0,0,,,False,False,,1011304,7,98101610,2001-08-11 02:15:00,0,42.8,追込,2.2,1.0,,,2,,-21.6,-33.9,7.1,550,502.0,0,牡,False,False,,,,,,,,,,,,,,,2.9,4.1,3.5,1.8,55.1,0,3,0,1.0,4.0,1.0,2,43.0,2.8,1,1.3,1,8,3,0,0,0,47,55,18,11,0,...,47.866667,20.603775,232,66,137.733333,48.078362,0.151235,0.02729,0.085412,0.02996,0.358025,0.128655,0.246196,0.064278,66,7,29.933333,19.739864,7,0,2.466667,2.499778,20,0,7.666667,6.559133,0.269231,0.0,0.070152,0.073384,0.5,0.0,0.229625,0.133773,208944.0,27159.0,93025.866667,46309.291859,1294.605263,567.857143,950.1511,181.940753,343.657895,52.94152,164.958116,69.914071,139.066667,97.133333,153.266667,0.125038,0.176155,45.066667,15.533333,30.333333,0.169848,0.277041,230843.133333,516.538555,305.098488,1011304,7,1,0,0.0,1,1.0,True,True,0,0,0.0,0,0.0,1,0,0.0,1,1.0,True,True,0,0,0.0,0,0.0,1011304,24.125,0.0,,0.0,1.3,北北西,1.7175,1009.85,1012.825,61.5,18.425,16.225,2.0,30.0


In [4]:
spark.stop()

In [4]:
# num_cols = [col for col in data.columns if col.startswith("num_")]
usecols = pd.read_csv("step3.csv").query("rfe_support").reset_index(drop=True)["Unnamed: 0"].tolist()
usecols = [col[5:] for col in usecols]
# remove cat_ from usecols
usecols = [col for col in usecols if not col.startswith("cat_")]
data[usecols].isna().sum().to_frame()

Unnamed: 0,0
num_競争相手平均騎手レース数,0
num_調教師場所トップ3完走,0
num_位置指数順位,1173
num_競争相手最低事前単勝オッズ,0
num_競争相手調教師レース数標準偏差,0
num_競争相手平均トップ3完走,0
num_騎手指数変動率,0
num_騎手期待単勝率,66113
num_距離トップ3完走率,0
num_競争相手平均三走前不利,104475


# Train/test split

In [62]:
X = data
y = data["meta_複勝的中"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

X_train: (857061, 1008)
X_test: (214266, 1008)
y_train: (857061,)
y_test: (214266,)


In [63]:
assert len(X_train.filter(regex="実績", axis=1).columns.tolist()) == 0

In [64]:
num_columns = X_train.filter(regex="num_", axis=1).columns.tolist()
cat_columns = X_train.filter(regex="cat_", axis=1).columns.tolist()
ord_columns = X_train.filter(regex="ord_", axis=1).columns.tolist()
meta_columns = X_train.filter(regex="meta_", axis=1).columns.tolist()

print(f"num_columns: {len(num_columns)}, cat_columns: {len(cat_columns)}, ord_columns: {len(ord_columns)}, meta_columns: {len(meta_columns)}")
print(f"Total columns: {len(num_columns) + len(cat_columns) + len(ord_columns) + len(meta_columns)}, X_train.shape[1]: {X_train.shape[1]}")

num_columns: 913, cat_columns: 70, ord_columns: 0, meta_columns: 25
Total columns: 1008, X_train.shape[1]: 1008


In [65]:
from imblearn.over_sampling import SMOTE

numeric_features = X_train.filter(regex="num_", axis=1).columns.tolist()
categorical_features = list(set(X_train.filter(regex="cat_", axis=1).columns.tolist()) - {"cat_距離"})
preprocessor = ColumnTransformer(
    transformers=[
        ("ord", OrdinalEncoder(categories=[[1000, 1150, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 3000, 3200, 3400, 3600]]), ["cat_距離"]),
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="if_binary"), categorical_features),
    ],
)

pipeline = ImblearnPipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=random_state, k_neighbors=5)),
        ("classifier", lgb.LGBMClassifier(**params)),
    ]
)

X_train_p = preprocessor.fit_transform(X_train)

# smote = 
SMOTE(random_state=random_state, k_neighbors=5).fit_resample(X_train_p, y_train)
# X_sm, y_sm = smote.fit_resample(X_train, y_train)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Define objective function

In [8]:
def create_objective_fn(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    df_payout: pd.DataFrame,
    experiment_name: str,
):
    """
    df_payout should have the same index as *_test and have the following columns:
    * レースキー
    * 馬番
    * 距離
    * 発走日時
    * 年齢
    * 場コード
    * payout - amount won if betting 100 yen.
    """

    def train(params):
        def profit_loss(row, payout_column_name, bet_amount=100):
            if row["pred"] and row["actual"]:
                payout = row[payout_column_name] * (bet_amount / 100)
                return payout - bet_amount
            elif row["pred"] and not row["actual"]:
                return -bet_amount
            else:
                return 0

        mlflow.set_experiment(experiment_name=experiment_name)
        with mlflow.start_run():
            mlflow.log_params(params)

            numeric_features = X_train.filter(regex="num_", axis=1).columns.tolist()
            categorical_features = list(set(X_train.filter(regex="cat_", axis=1).columns.tolist()) - {"cat_距離"})
            preprocessor = ColumnTransformer(
                transformers=[
                    ("ord", OrdinalEncoder(categories=[[1000, 1150, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 3000, 3200, 3400, 3600]]), ["cat_距離"]),
                    ("num", StandardScaler(), numeric_features),
                    ("cat", OneHotEncoder(handle_unknown="ignore", drop="if_binary"), categorical_features),
                ],
            )
            model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", lgb.LGBMClassifier(**params)),
                ]
            )

            model.fit(X_train, y_train)
            mlflow.sklearn.log_model(sk_model=model, artifact_path="model")

            y_pred_proba = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)

            metrics = {
                "loss": log_loss(y_test, y_pred_proba),
                "accuracy": accuracy_score(y_test, y_pred),
                "precision": precision_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "roc_auc": roc_auc_score(y_test, y_pred),
            }
            mlflow.log_metrics(metrics)

            # Calculate payout rates by group
            results = pd.concat(
                [
                    df_payout,
                    pd.DataFrame(
                        np.c_[y_test, y_pred, y_pred_proba],
                        columns=["actual", "pred", "pred_proba_true"],
                    ),
                ],
                axis=1,
            )
            payout_all = calculate_binary_classifier_statistics(
                results, group_by=None, payout_column_name="payout"
            )
            payout_month = calculate_binary_classifier_statistics(
                results,
                group_by=results["発走日時"].dt.month,
                payout_column_name="payout",
            )
            payout_distance = calculate_binary_classifier_statistics(
                results,
                group_by=pd.cut(x=results["距離"], bins=[0, 1400, 1800, 10000]),
                payout_column_name="payout",
            )
            payout_season = calculate_binary_classifier_statistics(
                results,
                group_by=results["発走日時"].dt.month % 12 // 3,
                payout_column_name="payout",
            )
            payout_year = calculate_binary_classifier_statistics(
                results,
                group_by=results["発走日時"].dt.year,
                payout_column_name="payout",
            )
            payout_age = calculate_binary_classifier_statistics(
                results,
                group_by=pd.cut(results["年齢"], bins=[0, 3, 6, 100]),
                payout_column_name="payout",
            )
            payout_racetrack = calculate_binary_classifier_statistics(
                results, group_by=results["場コード"], payout_column_name="payout"
            )
            payout = (
                pd.concat(
                    [
                        pd.DataFrame(payout_all).T.assign(group="all"),
                        pd.DataFrame(payout_month).T.assign(group="month"),
                        pd.DataFrame(payout_distance).T.assign(group="distance"),
                        pd.DataFrame(payout_season).T.assign(group="season"),
                        pd.DataFrame(payout_year).T.assign(group="year"),
                        pd.DataFrame(payout_age).T.assign(group="horse_age"),
                        pd.DataFrame(payout_racetrack).T.assign(group="racetrack"),
                    ],
                    axis=0,
                )
                .rename_axis(index="part")
                .reset_index()
            )
            # Move "group" and "part" columns to the first position in this dataframe
            payout = payout[
                ["group", "part"]
                + [c for c in payout.columns if c not in ["group", "part"]]
            ]

            # Save payout rates as csv
            with tempfile.NamedTemporaryFile(prefix="payout_rate_", suffix=".csv") as f:
                payout.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name)

            # Log payout rates as metrics
            payout_metrics = {}
            for group_name, group in payout.groupby("group"):
                for i, row in group.iterrows():
                    key = re.sub(r"\W", "_", f"payout_rate_{group_name}_{row['part']}")
                    payout_metrics[key] = row["payout_rate"]
            mlflow.log_metrics(payout_metrics)

            # Suppress UserWarning messages from matplotlib
            warnings.filterwarnings("ignore", category=UserWarning)

            # Plot payout rates by group
            sns.set_theme(style="whitegrid")
            fig, axes = plt.subplots(2, 4, figsize=(20, 10))
            for (group, df), ax in zip(payout.groupby("group"), axes.flatten()):
                sns.barplot(x="part", y="payout_rate", data=df, ax=ax)
                ax.set_title(group)
                ax.set_ylim(0, 150)
                ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
                ax.set_ylabel("payout rate")
                ax.set_xlabel("")
                ax.yaxis.set_major_formatter(ticker.PercentFormatter())
            with tempfile.NamedTemporaryFile(prefix="payout_rate_", suffix=".png") as f:
                plt.tight_layout()
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Plot bank balance over time
            results["profit_loss"] = results.apply(
                profit_loss, args=("payout", 100), axis=1
            )
            daily_profit_loss = results.groupby("発走日時")["profit_loss"].sum()
            bank_balance = daily_profit_loss.cumsum()
            plt.figure(figsize=(10, 10))
            ax = plt.subplot(1, 1, 1)
            ax.plot(bank_balance.index, bank_balance.values)
            ax.set_title("Bank Balance")
            ax.set_xlabel("Date")
            ax.set_ylabel("Bank Balance")
            ax.grid(True)
            ax.yaxis.set_major_formatter(ticker.StrMethodFormatter("{x:,.0f}"))
            with tempfile.NamedTemporaryFile(
                prefix="bank_balance_", suffix=".png"
            ) as f:
                plt.tight_layout()
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Confusion Matrix
            conf_matrix = confusion_matrix(y_test, y_pred)
            _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
            sns.heatmap(conf_matrix, annot=True, fmt="g", cmap="Blues", ax=ax1)
            ax1.set_xlabel("Predicted")
            ax1.set_ylabel("Actual")
            ax1.set_title("Confusion Matrix")
            sns.heatmap(
                conf_matrix / conf_matrix.sum(axis=1)[:, None],
                annot=True,
                fmt=".2%",
                cmap="Blues",
                ax=ax2,
            )
            ax2.set_xlabel("Predicted")
            ax2.set_ylabel("Actual")
            ax2.set_title("Normalized Confusion Matrix")
            with tempfile.NamedTemporaryFile(prefix="confusion_matrix_", suffix=".png") as f:
                plt.tight_layout()
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # ROC Curve
            fpr, tpr, _ = roc_curve(y_test, y_pred)
            roc_auc = auc(fpr, tpr)
            _, ax = plt.subplots(figsize=(10, 10))
            ax.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
            ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.0])
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.set_title("Receiver Operating Characteristic")
            ax.legend(loc="lower right")
            with tempfile.NamedTemporaryFile(prefix="roc_curve_", suffix=".png") as f:
                plt.tight_layout()
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Feature Importances
            feature_importances = zip(
                model.named_steps["preprocessor"].get_feature_names_out(),
                model.named_steps["classifier"].feature_importances_,
            )
            feature_importances_df = (
                pd.DataFrame(
                    data=feature_importances, columns=["feature", "importance"]
                )
                .sort_values("importance", ascending=False)
                .reset_index(drop=True)
            )
            with tempfile.NamedTemporaryFile(
                prefix="feature_importance_", suffix=".csv"
            ) as f:
                feature_importances_df.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name)
            font_properties = fm.FontProperties(
                fname="/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc"
            )

            sns.set_theme(style="whitegrid")
            plt.figure(figsize=(10, 12))
            ax = sns.barplot(x="importance", y="feature", data=feature_importances_df.iloc[:50])
            ax.set_title("Feature Importances (Top 50)", fontproperties=font_properties)
            ax.set_xlabel("Importance", fontproperties=font_properties)
            ax.set_ylabel("Features", fontproperties=font_properties)
            for label in ax.get_yticklabels():
                label.set_fontproperties(font_properties)
            with tempfile.NamedTemporaryFile(prefix="feature_importance_", suffix=".png") as f:
                plt.tight_layout()
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            return {"status": STATUS_OK, "params": params, "model": model, **metrics}

    return train

# Optimize hyperparameters

In [9]:
space = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    "learning_rate": hp.loguniform("learning_rate", -5, 0),  # between e^-5 and 1
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
    "max_depth": scope.int(hp.quniform("max_depth", 3, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 20, 150, 1)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 20, 500, 1)),
    "feature_fraction": hp.uniform("feature_fraction", 0.5, 1.0),
    "lambda_l1": hp.uniform("lambda_l1", 0, 5),
    "lambda_l2": hp.uniform("lambda_l2", 0, 5),
    "min_split_gain": hp.uniform("min_split_gain", 0, 1),
    "min_child_weight": hp.uniform("min_child_weight", 0.001, 10),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
    "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
    "objective": "binary",
    "class_weight": "balanced",
    "verbose": -1,
    "seed": 80,
}

In [10]:
df_payout_renamed_columns = {
    "meta_レースキー": "レースキー",
    "meta_馬番": "馬番",
    "cat_距離": "距離",
    "meta_発走日時": "発走日時",
    "meta_複勝払戻金": "payout",
    "num_年齢": "年齢",
    "meta_場コード": "場コード",
}

df_payout = (
    data.iloc[X_test.index]
    .reset_index(drop=True)
    .rename(columns=df_payout_renamed_columns)[df_payout_renamed_columns.values()]
)

experiment_name = "20240223_rfe_full_features_lgbm"
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

fn = create_objective_fn(
    X_train,
    y_train,
    X_test,
    y_test,
    df_payout=df_payout,
    experiment_name=experiment_name,
)

In [11]:
# serial
trials = Trials()
fmin(
    fn=fn,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    trials_save_file="trials_20240223_rfe_full_features_lgbm.pkl",
)

# parallel (runs out of memory)
# trials = SparkTrials(parallelism=2, spark_session=spark)
# fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=60, trials=trials)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

build_posterior_wrapper took 0.002956 seconds
TPE using 0 trials



Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type

  2%|▏         | 1/50 [08:07<6:37:44, 487.03s/trial, best loss: 0.5113984276080934]

build_posterior_wrapper took 0.001754 seconds
TPE using 1/1 trials with best loss 0.511398
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

  4%|▍         | 2/50 [12:24<4:41:31, 351.91s/trial, best loss: 0.5113984276080934]

build_posterior_wrapper took 0.001740 seconds
TPE using 2/2 trials with best loss 0.511398
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

  6%|▌         | 3/50 [15:12<3:29:46, 267.80s/trial, best loss: 0.5113984276080934]

build_posterior_wrapper took 0.001757 seconds
TPE using 3/3 trials with best loss 0.511398
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

  8%|▊         | 4/50 [33:59<7:45:35, 607.29s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.002102 seconds
TPE using 4/4 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 10%|█         | 5/50 [44:29<7:41:27, 615.28s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001711 seconds
TPE using 5/5 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 12%|█▏        | 6/50 [48:15<5:54:19, 483.18s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001671 seconds
TPE using 6/6 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 14%|█▍        | 7/50 [54:02<5:14:15, 438.50s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.002016 seconds
TPE using 7/7 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 16%|█▌        | 8/50 [58:46<4:32:36, 389.44s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001652 seconds
TPE using 8/8 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 18%|█▊        | 9/50 [1:07:00<4:48:23, 422.03s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001654 seconds
TPE using 9/9 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to th

 20%|██        | 10/50 [1:10:38<3:59:19, 358.98s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001914 seconds
TPE using 10/10 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 22%|██▏       | 11/50 [1:21:55<4:56:37, 456.35s/trial, best loss: 0.5071860626822343]

build_posterior_wrapper took 0.001644 seconds
TPE using 11/11 trials with best loss 0.507186
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 24%|██▍       | 12/50 [1:27:04<4:20:34, 411.44s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001939 seconds
TPE using 12/12 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 26%|██▌       | 13/50 [1:29:44<3:26:53, 335.50s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.008561 seconds
TPE using 13/13 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 28%|██▊       | 14/50 [1:32:37<2:51:41, 286.16s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001616 seconds
TPE using 14/14 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 30%|███       | 15/50 [1:37:16<2:45:40, 284.03s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.002180 seconds
TPE using 15/15 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 32%|███▏      | 16/50 [1:40:46<2:28:25, 261.92s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001694 seconds
TPE using 16/16 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 34%|███▍      | 17/50 [1:46:02<2:32:55, 278.04s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001662 seconds
TPE using 17/17 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 36%|███▌      | 18/50 [1:51:14<2:33:49, 288.43s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.009582 seconds
TPE using 18/18 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 38%|███▊      | 19/50 [2:00:20<3:08:57, 365.71s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001817 seconds
TPE using 19/19 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 40%|████      | 20/50 [2:10:27<3:38:59, 437.99s/trial, best loss: 0.5062139356555542]

build_posterior_wrapper took 0.001667 seconds
TPE using 20/20 trials with best loss 0.506214
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 42%|████▏     | 21/50 [2:23:07<4:18:32, 534.90s/trial, best loss: 0.4964474755163549]

build_posterior_wrapper took 0.007674 seconds
TPE using 21/21 trials with best loss 0.496447
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 44%|████▍     | 22/50 [2:29:46<3:50:34, 494.10s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001855 seconds
TPE using 22/22 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 46%|████▌     | 23/50 [2:41:33<4:11:01, 557.84s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.014499 seconds
TPE using 23/23 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 48%|████▊     | 24/50 [2:48:16<3:41:37, 511.43s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001706 seconds
TPE using 24/24 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 50%|█████     | 25/50 [2:55:20<3:22:09, 485.16s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001642 seconds
TPE using 25/25 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 52%|█████▏    | 26/50 [3:00:42<2:54:26, 436.11s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.007726 seconds
TPE using 26/26 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 54%|█████▍    | 27/50 [3:07:26<2:43:33, 426.67s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001629 seconds
TPE using 27/27 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 56%|█████▌    | 28/50 [3:13:24<2:28:48, 405.85s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.008897 seconds
TPE using 28/28 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 58%|█████▊    | 29/50 [3:18:20<2:10:36, 373.15s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001734 seconds
TPE using 29/29 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 60%|██████    | 30/50 [3:23:48<1:59:47, 359.39s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.007846 seconds
TPE using 30/30 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 62%|██████▏   | 31/50 [3:28:30<1:46:28, 336.22s/trial, best loss: 0.48049928720261825]

build_posterior_wrapper took 0.001912 seconds
TPE using 31/31 trials with best loss 0.480499
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 64%|██████▍   | 32/50 [3:35:25<1:48:00, 360.01s/trial, best loss: 0.4775099212400654] 

build_posterior_wrapper took 0.008042 seconds
TPE using 32/32 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 66%|██████▌   | 33/50 [3:39:23<1:31:36, 323.31s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.009096 seconds
TPE using 33/33 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 68%|██████▊   | 34/50 [3:46:27<1:34:17, 353.60s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001653 seconds
TPE using 34/34 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 70%|███████   | 35/50 [3:51:30<1:24:33, 338.24s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.008468 seconds
TPE using 35/35 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 72%|███████▏  | 36/50 [3:57:49<1:21:47, 350.57s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001782 seconds
TPE using 36/36 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 74%|███████▍  | 37/50 [4:02:27<1:11:13, 328.75s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.007866 seconds
TPE using 37/37 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 76%|███████▌  | 38/50 [4:10:03<1:13:21, 366.79s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001651 seconds
TPE using 38/38 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 78%|███████▊  | 39/50 [4:13:54<59:46, 326.09s/trial, best loss: 0.4775099212400654]  

build_posterior_wrapper took 0.008293 seconds
TPE using 39/39 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 80%|████████  | 40/50 [4:20:46<58:40, 352.08s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.002469 seconds
TPE using 40/40 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 82%|████████▏ | 41/50 [4:24:34<47:13, 314.83s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001837 seconds
TPE using 41/41 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 84%|████████▍ | 42/50 [4:28:43<39:20, 295.10s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.007074 seconds
TPE using 42/42 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 86%|████████▌ | 43/50 [4:36:16<39:55, 342.25s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001978 seconds
TPE using 43/43 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 88%|████████▊ | 44/50 [4:43:34<37:05, 370.97s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.006549 seconds
TPE using 44/44 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 90%|█████████ | 45/50 [4:49:51<31:04, 372.94s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.004897 seconds
TPE using 45/45 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 92%|█████████▏| 46/50 [4:53:52<22:13, 333.31s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001828 seconds
TPE using 46/46 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 94%|█████████▍| 47/50 [4:57:32<14:57, 299.22s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.008241 seconds
TPE using 47/47 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 96%|█████████▌| 48/50 [5:03:13<10:23, 311.95s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.004994 seconds
TPE using 48/48 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

 98%|█████████▊| 49/50 [5:10:19<05:45, 345.99s/trial, best loss: 0.4775099212400654]

build_posterior_wrapper took 0.001621 seconds
TPE using 49/49 trials with best loss 0.477510
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to 

100%|██████████| 50/50 [5:17:47<00:00, 381.35s/trial, best loss: 0.4775099212400654]


{'boosting_type': 0,
 'colsample_bytree': 0.6378156304978417,
 'feature_fraction': 0.9336946785708267,
 'lambda_l1': 2.577098853592653,
 'lambda_l2': 4.954790152997694,
 'learning_rate': 0.1801994182561799,
 'max_depth': 10.0,
 'min_child_samples': 420.0,
 'min_child_weight': 1.9789716420914754,
 'min_split_gain': 0.7210712578836012,
 'n_estimators': 665.0,
 'num_leaves': 140.0,
 'reg_alpha': 0.8021502939893334,
 'reg_lambda': 0.34804973445991183,
 'subsample': 0.6360877718163862}

In [11]:
# load trials from trial.pkl
import pickle
with open("trials_20240223_rfe_full_features_lgbm.pkl", "rb") as f:
    trials = pickle.load(f)

# RFE

In [13]:
best_params = trials.best_trial["result"]["params"]

numeric_features = X_train.filter(regex="num_", axis=1).columns.tolist()
categorical_features = list(set(X_train.filter(regex="cat_", axis=1).columns.tolist()) - {"cat_距離"})

preprocessor = ColumnTransformer(
    transformers=[
        ("ord", OrdinalEncoder(categories=[[1000, 1150, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 3000, 3200, 3400, 3600]]), ["cat_距離"]),
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="if_binary"), categorical_features),
    ]
)

X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

## Initial Reduction

Use RFE with a step size of 50-100 to quickly reduce the feature space from 1300 to a few hundred features.

### Round 1

In [20]:
n_features = X_train_prep.shape[1]
print(f"n_features: {n_features}")

n_features: 1386


In [21]:
rfe = RFE(
    estimator=lgb.LGBMClassifier(**best_params),
    step=100,
    verbose=1,
    n_features_to_select=0.5,
)
rfe.fit(X_train_prep, y_train)

Fitting estimator with 1386 features.
Fitting estimator with 1336 features.
Fitting estimator with 1286 features.
Fitting estimator with 1236 features.
Fitting estimator with 1186 features.
Fitting estimator with 1136 features.
Fitting estimator with 1086 features.
Fitting estimator with 1036 features.
Fitting estimator with 986 features.
Fitting estimator with 936 features.
Fitting estimator with 886 features.
Fitting estimator with 836 features.
Fitting estimator with 786 features.
Fitting estimator with 736 features.


In [26]:
y_pred_proba_1 = rfe.predict_proba(X_test_prep)[:, 1]
y_pred_1 = rfe.predict(X_test_prep)
metrics_1 = {
    "loss": log_loss(y_test, y_pred_proba_1),
    "accuracy": accuracy_score(y_test, y_pred_1),
    "precision": precision_score(y_test, y_pred_1),
    "recall": recall_score(y_test, y_pred_1),
    "f1": f1_score(y_test, y_pred_1),
    "roc_auc": roc_auc_score(y_test, y_pred_1),
}
print(metrics_1)

{'loss': 0.4792115048751, 'accuracy': 0.7585757889725855, 'precision': 0.44834491337722626, 'recall': 0.6579398472806608, 'f1': 0.5332876205599214, 'roc_auc': 0.7216045732060151}


In [33]:
step1 = (
    pd.DataFrame(
        rfe.ranking_, index=preprocessor.get_feature_names_out(), columns=["ranking"]
    )
    .assign(rfe_support=rfe.support_)
    .sort_values("ranking")
)
step1.to_csv("step1.csv", index=True)
step1

Unnamed: 0,ranking,rfe_support
ord__cat_距離,1,True
num__num_競争相手平均基準複勝オッズ差,1,True
num__num_競争相手平均人気指数差,1,True
num__num_競争相手平均調教指数差,1,True
num__num_競争相手平均厩舎指数差,1,True
...,...,...
cat__cat_事前_レース条件_記号_303,15,False
cat__cat_事前_レース条件_記号_400,15,False
cat__cat_事前_レース条件_記号_401,15,False
cat__cat_事前_レース条件_記号_500,15,False


### Round 2

In [34]:
n_features_2 = step1.query("rfe_support == True").shape[0]
print(f"n_features: {n_features_2}")

n_features: 693


In [35]:
X_train_prep_2 = X_train_prep[:, rfe.support_]
X_test_prep_2 = X_test_prep[:, rfe.support_]

In [37]:
rfe_2 = RFE(
    estimator=lgb.LGBMClassifier(**best_params),
    step=50,
    verbose=1,
    n_features_to_select=0.5,
)
rfe_2.fit(X_train_prep_2, y_train)

Fitting estimator with 693 features.
Fitting estimator with 678 features.
Fitting estimator with 663 features.
Fitting estimator with 648 features.
Fitting estimator with 633 features.
Fitting estimator with 618 features.
Fitting estimator with 603 features.
Fitting estimator with 588 features.
Fitting estimator with 573 features.
Fitting estimator with 558 features.
Fitting estimator with 543 features.
Fitting estimator with 528 features.
Fitting estimator with 513 features.
Fitting estimator with 498 features.
Fitting estimator with 483 features.
Fitting estimator with 468 features.
Fitting estimator with 453 features.
Fitting estimator with 438 features.
Fitting estimator with 423 features.
Fitting estimator with 408 features.
Fitting estimator with 393 features.
Fitting estimator with 378 features.
Fitting estimator with 363 features.
Fitting estimator with 348 features.


In [40]:
y_pred_proba_2 = rfe_2.predict_proba(X_test_prep_2)[:, 1]
y_pred_2 = rfe_2.predict(X_test_prep_2)
metrics_2 = {
    "loss": log_loss(y_test, y_pred_proba_2),
    "accuracy": accuracy_score(y_test, y_pred_2),
    "precision": precision_score(y_test, y_pred_2),
    "recall": recall_score(y_test, y_pred_2),
    "f1": f1_score(y_test, y_pred_2),
    "roc_auc": roc_auc_score(y_test, y_pred_2),
}
print(metrics_2)

{'loss': 0.4795363134554545, 'accuracy': 0.7567556215171796, 'precision': 0.4459215863001352, 'recall': 0.66085620784078, 'f1': 0.5325189032101822, 'roc_auc': 0.7215244918103437}


In [45]:
step2 = (
    pd.DataFrame(rfe_2.ranking_, index=step1[step1.rfe_support].index, columns=["ranking"])
    .assign(rfe_support=rfe_2.support_)
    .sort_values("ranking")
)
step2.to_csv("step2.csv", index=True)
step2

Unnamed: 0,ranking,rfe_support
num__num_競争相手最高入厩何日前,1,True
num__num_基準オッズ,1,True
num__num_競争相手騎手場所レース数標準偏差,1,True
num__num_競争相手平均騎手場所レース数,1,True
num__num_競争相手騎手過去5走トップ3完走率標準偏差,1,True
...,...,...
num__num_競争相手平均1着単勝オッズ変動率差,25,False
num__num_競争相手平均一走前着順,25,False
num__num_1着平均馬体重差,25,False
num__num_競争相手最小騎手場所トップ3完走,25,False


## Fine-tuning
Once you have a reduced set of features (e.g., 200-300), use a smaller step size (10-20) for more detailed feature elimination, allowing the model to assess the importance of features more accurately.

### Round 3

In [46]:
n_features_3 = step2.query("rfe_support == True").shape[0]
print(f"n_features: {n_features_3}")

n_features: 346


In [47]:
X_train_prep_3 = X_train_prep_2[:, rfe_2.support_]
X_test_prep_3 = X_test_prep_2[:, rfe_2.support_]

In [49]:
rfe_3 = RFE(
    estimator=lgb.LGBMClassifier(**best_params),
    step=15,
    verbose=1,
    n_features_to_select=130,
)
rfe_3.fit(X_train_prep_3, y_train)

Fitting estimator with 346 features.
Fitting estimator with 331 features.
Fitting estimator with 316 features.
Fitting estimator with 301 features.
Fitting estimator with 286 features.
Fitting estimator with 271 features.
Fitting estimator with 256 features.
Fitting estimator with 241 features.
Fitting estimator with 226 features.
Fitting estimator with 211 features.
Fitting estimator with 196 features.
Fitting estimator with 181 features.
Fitting estimator with 166 features.
Fitting estimator with 151 features.
Fitting estimator with 136 features.


In [50]:
y_pred_proba_3 = rfe_3.predict_proba(X_test_prep_3)[:, 1]
y_pred_3 = rfe_3.predict(X_test_prep_3)
metrics_3 = {
    "loss": log_loss(y_test, y_pred_proba_3),
    "accuracy": accuracy_score(y_test, y_pred_3),
    "precision": precision_score(y_test, y_pred_3),
    "recall": recall_score(y_test, y_pred_3),
    "f1": f1_score(y_test, y_pred_3),
    "roc_auc": roc_auc_score(y_test, y_pred_3),
}
print(metrics_3)

{'loss': 0.488040630920827, 'accuracy': 0.7499789980678222, 'precision': 0.4388672206206546, 'recall': 0.6913778133974487, 'f1': 0.5369155364228106, 'roc_auc': 0.7284503373706583}


In [51]:
step3 = (
    pd.DataFrame(rfe_3.ranking_, index=step2[step2.rfe_support].index, columns=["ranking"])
    .assign(rfe_support=rfe_3.support_)
    .sort_values("ranking")
)
step3.to_csv("step3.csv", index=True)
step3

Unnamed: 0,ranking,rfe_support
num__num_競争相手平均騎手レース数,1,True
num__num_調教師場所トップ3完走,1,True
num__num_位置指数順位,1,True
num__num_競争相手最低事前単勝オッズ,1,True
num__num_競争相手調教師レース数標準偏差,1,True
...,...,...
num__num_競争相手平均総合指数,16,False
num__num_競争相手最高パドック指数,16,False
num__num_競争相手平均本賞金累計,16,False
num__num_競争相手入厩何日前標準偏差,16,False


### Round 4

In [52]:
n_features_4 = step3.query("rfe_support == True").shape[0]
print(f"n_features: {n_features_4}")

n_features: 130


In [53]:
X_train_prep_4 = X_train_prep_3[:, rfe_3.support_]
X_test_prep_4 = X_test_prep_3[:, rfe_3.support_]

In [57]:
rfe_4 = RFECV(
    estimator=lgb.LGBMClassifier(**best_params),
    step=5,
    min_features_to_select=20,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rfe_4.fit(X_train_prep_4, y_train)

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 125 features.
Fitting estimator with 125 features.
Fitting estimator with 125 features.
Fitting estimator with 125 features.
Fitting estimator with 125 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 120 features.
Fitting estimator with 115 features.
Fitting estimator with 115 features.
Fitting estimator with 115 features.
Fitting estimator with 115 features.
Fitting estimator with 115 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 110 features.
Fitting estimator with 105 features.
Fitting estimator with 105 features.
F

KeyboardInterrupt: 

In [None]:
y_pred_proba_4 = rfe_4.predict_proba(X_test_prep_4)[:, 1]
y_pred_4 = rfe_4.predict(X_test_prep_4)
metrics_4 = {
    "loss": log_loss(y_test, y_pred_proba_4),
    "accuracy": accuracy_score(y_test, y_pred_4),
    "precision": precision_score(y_test, y_pred_4),
    "recall": recall_score(y_test, y_pred_4),
    "f1": f1_score(y_test, y_pred_4),
    "roc_auc": roc_auc_score(y_test, y_pred_4),
}
print(metrics_4)

{'loss': 0.5221410055247936, 'accuracy': 0.7247066730139173, 'precision': 0.41374701085290333, 'recall': 0.7511075491440148, 'f1': 0.5335747722672065, 'roc_auc': 0.7344057176238478}


## Final Model Training
After selecting the final set of features, perform a thorough hyperparameter optimization to fine-tune your model.