# Prepare the data

In [1]:
from pyspark.sql import SparkSession


spark = (
    SparkSession.builder
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.jars", "postgresql-42.7.1.jar")
    .getOrCreate()
)

23/12/28 15:05:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
df = spark.read.format("jdbc").options(
    url="jdbc:postgresql://localhost:5432/jrdb",
    dbtable="jrdb_curated.features",
    user="admin",
    password="admin",
    driver="org.postgresql.Driver",
).load()

In [3]:
feature_dtypes = {
    # "レースキー",
    # "馬番",
    "枠番": "category",
    "場名": "category",
    # "年月日",
    # "頭数",
    "四半期": "category",
    # "単勝的中",
    # "単勝払戻金",
    # "複勝的中",
    # "複勝払戻金",
    # "本賞金",
    # "収得賞金",
    # "血統登録番号",
    "瞬発戦好走馬_芝": "category",
    "消耗戦好走馬_芝": "category",
    "瞬発戦好走馬_ダート": "category",
    "消耗戦好走馬_ダート": "category",
    "瞬発戦好走馬_総合": "category",
    "消耗戦好走馬_総合": "category",
    "性別": "category",
    "トラック種別": "category",
    "前日_芝馬場差": "float",
    "前日_ダ馬場差": "float",
    "前日_ＩＤＭ": "float",
    "前日_脚質": "category",
    "前日_単勝オッズ": "float",
    "前日_複勝オッズ": "float",
    "直前_ＩＤＭ": "float",
    "直前_騎手指数": "float",
    "直前_情報指数": "float",
    "直前_オッズ指数": "float",
    "直前_パドック指数": "float",
    "直前_脚元情報": "category",
    "直前_天候": "category",
    "直前_単勝オッズ": "float",
    "直前_複勝オッズ": "float",
    "前走トップ3": "category",
    "前走枠番": "category",
    "入厩何日前": "float",
    "入厩15日未満": "category",
    "入厩35日以上": "category",
    "馬体重": "float",
    "馬体重増減": "float",
    "距離": "float",
    "前走距離差": "float",
    # "馬具コード",
    # "年齢",
    "4歳以下": "category",
    "4歳以下頭数": "float",
    "4歳以下割合": "float",
    "レース数": "float",
    "1位完走": "float",
    "トップ3完走": "float",
    "1位完走率": "float",
    "トップ3完走率": "float",
    "場所レース数": "float",
    "場所1位完走": "float",
    "場所トップ3完走": "float",
    "場所1位完走率": "float",
    "場所トップ3完走率": "float",
    "トラック種別レース数": "float",
    "トラック種別1位完走": "float",
    "トラック種別トップ3完走": "float",
    "トラック種別1位完走率": "float",
    "トラック種別トップ3完走率": "float",
    "馬場状態レース数": "float",
    "馬場状態1位完走": "float",
    "馬場状態トップ3完走": "float",
    "馬場状態1位完走率": "float",
    "馬場状態トップ3完走率": "float",
    "距離レース数": "float",
    "距離1位完走": "float",
    "距離トップ3完走": "float",
    "距離1位完走率": "float",
    "距離トップ3完走率": "float",
    "四半期レース数": "float",
    "四半期1位完走": "float",
    "四半期トップ3完走": "float",
    "四半期1位完走率": "float",
    "四半期トップ3完走率": "float",
    "騎手レース数": "float",
    "騎手1位完走": "float",
    "騎手トップ3完走": "float",
    "騎手1位完走率": "float",
    "騎手トップ3完走率": "float",
    "騎手場所レース数": "float",
    "騎手場所1位完走": "float",
    "騎手場所トップ3完走": "float",
    "騎手場所1位完走率": "float",
    "騎手場所トップ3完走率": "float",
    "騎手距離レース数": "float",
    "騎手距離1位完走": "float",
    "騎手距離トップ3完走": "float",
    "騎手距離1位完走率": "float",
    "騎手距離トップ3完走率": "float",
    "調教師レース数": "float",
    "調教師1位完走": "float",
    "調教師トップ3完走": "float",
    "調教師1位完走率": "float",
    "調教師トップ3完走率": "float",
    "調教師場所レース数": "float",
    "調教師場所1位完走": "float",
    "調教師場所トップ3完走": "float",
    "調教師場所1位完走率": "float",
    "調教師場所トップ3完走率": "float",
    "過去3走順位平方和": "float",
    "過去5走勝率": "float",
    "過去5走トップ3完走率": "float",
    "騎手過去5走勝率": "float",
    "騎手過去5走トップ3完走率": "float",
}

numeric_features = [
    field for field, dtype in feature_dtypes.items() if dtype in ["float"]
]

categorical_features = [
    field for field, dtype in feature_dtypes.items() if dtype in ["category"]
]

label = "複勝的中"

In [4]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load a sample dataset
data = df.toPandas().astype(feature_dtypes)
X = data[numeric_features + categorical_features]
y = data[label]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

23/12/28 15:05:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

# Define objective function

In [5]:
from hyperopt import hp, STATUS_OK
from sklearn.metrics import accuracy_score

def objective(params):
    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
    categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder())])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", lgb.LGBMClassifier(**params))])
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    # We aim to maximize accuracy, hence negative sign
    return {'loss': -accuracy, 'status': STATUS_OK}


# Define hyperparameter space

In [6]:
from hyperopt import hp

# Define the hyperparameter space
space = {
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),  # Gradient Boosting Decision Tree, Dropouts meet Multiple Additive Regression Trees, Gradient-based One-Side Sampling
    'num_leaves': hp.choice('num_leaves', range(20, 151)),  # Number of leaves in one tree
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),  # Learning rate or shrinkage rate
    'n_estimators': hp.choice('n_estimators', range(100, 1001)),  # Number of boosted trees to fit
    'max_depth': hp.choice('max_depth', range(5, 31)),  # Maximum tree depth
    'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),  # Minimum sum of instance weight (hessian) needed in a child (leaf)
    'subsample': hp.uniform('subsample', 0.5, 1),  # Subsample ratio of the training instance
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),  # Subsample ratio of columns when constructing each tree
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),  # L1 regularization term on weights
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),  # L2 regularization term on weights
    "verbose": -1,
    "seed": 80
}


# space = {
#     "metric": "binary_error",
#     # "metric": "multi_error",
#     "boosting": "gbdt",
#     "min_data_in_leaf": 99,
#     "n_estimators": 300,
#     "objective": "binary",
#     # "objective": "multiclass",
#     # "num_class": 14,
#     "seed": 80,
#     "learning_rate": 0.03,  # hp.choice('learning_rate', [0.05, .1, .3]),
#     "max_depth": hp.choice("max_depth", range(6, 25)),
#     "num_leaves": hp.choice("num_leaves", range(20, 100, 10)),
#     "subsample": hp.uniform("subsample", 0.5, 1),
#     "feature_fraction": hp.uniform("feature_fraction", 0.5, 1),  # colsample_bytree
#     "reg_alpha": hp.uniform("reg_alpha", 0, 1),
#     "reg_lambda": hp.uniform("reg_lambda", 0, 1),
#     "min_child_samples": hp.choice(
#         "min_child_samples", range(10, 100, 10)
#     ),
#     "verbose": -1,
# }

# Run optimization

In [11]:
spark.stop()

In [12]:
from hyperopt import fmin, tpe, Trials, SparkTrials

spark = (
    SparkSession.builder
    .config("spark.executor.memory", "5g")
    .config("spark.driver.memory", "5g")
    .config("spark.jars", "postgresql-42.7.1.jar")
    .getOrCreate()
)

# trials = Trials()
# Example for using SparkTrials
trials = SparkTrials(parallelism=5, spark_session=spark)
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best hyperparameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

                                                                                

  1%|          | 1/100 [01:35<2:37:40, 95.56s/trial, best loss: -0.9599567232624758]

[Stage 0:>    (0 + 1) / 1][Stage 2:>    (0 + 1) / 1][Stage 3:>    (0 + 1) / 1]

# Train final model

In [None]:
best_params = space.copy()
best_params.update(best)

final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train, y_train)

{'feature_fraction': 0.7203846782924843, 'max_depth': 9, 'min_child_samples': 3, 'num_leaves': 4, 'reg_alpha': 0.9084974075640193, 'reg_lambda': 0.06472230215029007, 'subsample': 0.8682758592597637}


# Evaluate model

In [None]:
final_pred = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_pred)
print("Final model accuracy:", final_accuracy)

Final model accuracy: 0.9292449930535919


In [None]:
X_test.assign(final_pred=final_pred)

Unnamed: 0,前日_芝馬場差,前日_ダ馬場差,前日_ＩＤＭ,前日_単勝オッズ,前日_複勝オッズ,直前_ＩＤＭ,直前_騎手指数,直前_情報指数,直前_オッズ指数,直前_パドック指数,...,トラック種別,前日_脚質,直前_脚元情報,直前_天候,前走トップ3,前走枠番,入厩15日未満,入厩35日以上,4歳以下,final_pred
118382,-8.0,-8.0,34.0,22.4,4.3,34.0,0.4,3.7,0.0,0.0,...,ダート,先行,0,晴,False,2,False,True,True,False
724411,,,54.0,,,53.0,0.8,1.2,0.0,0.0,...,芝,先行,0,雨,True,5,,,True,False
636861,-6.0,-8.0,43.0,5.8,1.5,43.0,1.6,2.4,2.0,2.0,...,芝,先行,0,小雨,False,8,False,False,True,False
931865,,,34.0,38.7,6.8,34.0,0.3,-1.0,0.0,0.0,...,芝,好位差し,0,晴,False,5,,,True,True
473644,-20.0,-8.0,52.0,5.5,1.9,52.0,1.5,0.8,3.0,0.0,...,ダート,差し,2,晴,False,6,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541001,-10.0,-15.0,10.0,105.7,21.0,10.0,0.0,0.0,0.0,0.0,...,芝,追込,0,雨,False,8,False,False,True,False
1157787,-6.0,-12.0,24.0,40.7,7.3,24.0,0.2,0.0,0.0,2.4,...,芝,追込,0,曇,False,5,,,True,True
513018,-16.0,-12.0,44.0,27.7,4.8,44.0,0.4,1.2,2.5,0.0,...,芝,差し,0,晴,False,8,False,True,True,False
1148413,-8.0,-10.0,23.7,13.4,3.4,23.7,0.7,0.8,0.0,0.0,...,ダート,逃げ,0,晴,False,3,False,True,True,False


In [None]:
data

Unnamed: 0,レースキー,馬番,枠番,場名,年月日,頭数,四半期,単勝的中,単勝払戻金,複勝的中,...,調教師場所レース数,調教師場所1位完走,調教師場所トップ3完走,調教師場所1位完走率,調教師場所トップ3完走率,過去3走順位平方和,過去5走勝率,過去5走トップ3完走率,騎手過去5走勝率,騎手過去5走トップ3完走率
0,04093407,05,3,新潟,2009-08-23,18,3,False,0,False,...,351.0,26.0,83.0,0.074074,0.236467,173.0,0.0,0.0,0.00,0.25
1,07094109,17,8,中京,2009-12-12,18,4,False,0,False,...,157.0,7.0,34.0,0.044586,0.216561,77.0,0.0,0.0,0.00,0.00
2,10091804,07,4,小倉,2009-03-01,18,1,True,410,True,...,204.0,34.0,94.0,0.166667,0.460784,0.0,,,0.00,0.00
3,09092208,09,6,阪神,2009-03-29,13,1,True,520,True,...,680.0,102.0,261.0,0.150000,0.383824,0.0,,,-0.25,-0.25
4,09092608,09,5,阪神,2009-04-12,16,2,False,0,False,...,693.0,103.0,264.0,0.148629,0.380952,0.0,2.0,2.0,0.25,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220049,05091503,01,1,東京,2009-02-14,16,1,False,0,False,...,509.0,34.0,82.0,0.066798,0.161100,0.0,,,0.00,0.25
1220050,06093305,15,8,中山,2009-04-04,16,2,False,0,False,...,571.0,52.0,136.0,0.091068,0.238179,0.0,0.0,0.0,0.25,0.25
1220051,05093106,16,8,東京,2009-05-23,18,2,False,0,False,...,527.0,34.0,86.0,0.064516,0.163188,325.0,0.0,0.0,0.00,0.00
1220052,03092604,07,4,福島,2009-07-05,16,3,False,0,False,...,332.0,18.0,82.0,0.054217,0.246988,410.0,0.0,0.0,0.00,0.50


In [None]:
# (
#     X_test
#     .assign(final_pred=final_pred)
#     .join(data[["レースキー", "馬番"]], how="left")
#     .assign(的中=lambda df: df["final_pred"] == df["複勝的中"])
# )

118382     False
724411     False
636861     False
931865      True
473644     False
           ...  
541001     False
1157787     True
513018      True
1148413    False
299480     False
Name: 複勝的中, Length: 244011, dtype: bool