# Hypothesis: 上がり3Fが速い馬は成績がいい

* Created weighted average 3F time rank feature for past 3 races. No predictive power found.
* 重み can be calculated by 1/days_since_last_race+1e6 which is easier to interpret

In [1]:
import pandas as pd

from JapanHorseRaceAnalytics.utilities.base import get_spark_session, read_hive_table
from JapanHorseRaceAnalytics.utilities.structured_logger import logger

In [2]:
spark = get_spark_session()

24/03/16 11:53:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
data = read_hive_table(
    table_name="features_20240304_v1",
    schema="jhra_curated",
    spark_session=spark,
    # use_cache=False,
    parse_dates=["meta_発走日時"],
)

rows_before = data.shape[0]
logger.info(f"Original data length: {rows_before}")

# Drop from data where cat_トラック種別 == "障害"
# Keep only horses that have 3 races
# Keep only data from 2000 onwards
data = data[
    (data["cat_トラック種別"] != "障害")
    & (data["meta_異常区分"] == "0")
    & (data["num_1走前着順"].notnull())
    & (data["num_2走前着順"].notnull())
    & (data["num_3走前着順"].notnull())
    & (data["meta_発走日時"] >= "2000-01-01")
]

rows_after = data.shape[0]
logger.info(
    f"Data length after filtering: {rows_after} (dropped {rows_before - rows_after} rows, {100 * (rows_before - rows_after) / rows_before:.2f}%)"
)

# Interpolate missing values for num_馬体重 (20 instances from 1999 ~ 2017)
data["num_馬体重"] = (
    data.sort_values("meta_発走日時")
    .groupby("meta_血統登録番号")["num_馬体重"]
    .transform(lambda x: x.interpolate(method="linear", limit_direction="both"))
)

data.reset_index(drop=True, inplace=True)
data.head()

{"event": "Read from parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_20240304_v1.snappy.parquet to pandas", "level": "info", "timestamp": "2024-03-16T02:53:49.155007Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
{"event": "Original data length: 1217019", "level": "info", "timestamp": "2024-03-16T02:53:49.639385Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
{"event": "Data length after filtering: 808861 (dropped 408158 rows, 33.54%)", "level": "info", "timestamp": "2024-03-16T02:53:49.986265Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}


Unnamed: 0,meta_単勝払戻金,meta_複勝払戻金,meta_レースキー,meta_馬番,meta_血統登録番号,meta_発走日時,meta_単勝的中,meta_複勝的中,meta_複勝オッズ,meta_着順,...,cat_6走前休養理由分類コード,num_6走前3着タイム差,cat_トラック種別,num_距離,num_過去3走重み付き着順成績,num_入厩何日前逆数,cat_堅実な馬,cat_過去3走中1走訳あり凡走,cat_過去3走中2走好走,cat_過去3走繋がりあり
0,0,240,7023401,10,100184,2002-12-08 10:00:00+09:00,0,1,1.7,2.0,...,,,ダート,1700,0.290165,1.0,False,False,False,False
1,0,0,7032804,12,100184,2003-06-15 11:25:00+09:00,0,0,2.6,8.0,...,,,ダート,1700,0.136197,1.0,False,False,False,False
2,0,0,9033503,3,100184,2003-07-05 10:55:00+09:00,0,0,1.5,7.0,...,,,ダート,1800,0.184235,1.0,False,False,False,False
3,270,150,10032202,2,100184,2003-07-20 10:30:00+09:00,1,1,1.2,1.0,...,,3.7,ダート,1700,0.197533,1.0,False,False,False,False
4,0,0,9034208,12,100184,2003-09-14 14:00:00+09:00,0,0,7.8,12.0,...,,1.8,ダート,1800,0.108824,1.0,False,False,False,False


In [4]:
data["meta_レースキー"].unique().shape

(71593,)

In [5]:
data[["meta_レースキー", "meta_血統登録番号", "meta_発走日時", "meta_着順", "meta_後３Ｆタイム"]].head()

Unnamed: 0,meta_レースキー,meta_血統登録番号,meta_発走日時,meta_着順,meta_後３Ｆタイム
0,7023401,100184,2002-12-08 10:00:00+09:00,2.0,39.9
1,7032804,100184,2003-06-15 11:25:00+09:00,8.0,38.7
2,9033503,100184,2003-07-05 10:55:00+09:00,7.0,39.8
3,10032202,100184,2003-07-20 10:30:00+09:00,1.0,40.5
4,9034208,100184,2003-09-14 14:00:00+09:00,12.0,43.5


In [6]:
data[["meta_レースキー", "meta_血統登録番号", "meta_発走日時", "meta_着順", "meta_後３Ｆタイム"]].to_csv("data.csv.gz", index=False, compression="gzip")

In [7]:
# Preprocessing the data

# Convert 'meta_着順' to a numeric type, forcing errors to NaN (e.g., non-numeric values become NaN)
data['meta_着順'] = pd.to_numeric(data['meta_着順'], errors='coerce')
data['meta_後３Ｆタイム'] = pd.to_numeric(data['meta_後３Ｆタイム'], errors='coerce')

# Check for missing values in 'meta_着順' and 'meta_後３Ｆタイム'
missing_values = data[['meta_着順', 'meta_後３Ｆタイム']].isnull().sum()

# Dropping rows where 'meta_着順' or 'meta_後３Ｆタイム' is missing
cleaned_data = data.dropna(subset=['meta_着順', 'meta_後３Ｆタイム'])

# Check how many rows were dropped
rows_dropped = data.shape[0] - cleaned_data.shape[0]

missing_values, rows_dropped

(meta_着順        0
 meta_後３Ｆタイム    3
 dtype: int64,
 3)

In [8]:
# Adding a rank column for the last 3F times within each race
cleaned_data['3F_time_rank'] = cleaned_data.groupby('meta_レースキー')['meta_後３Ｆタイム'].rank() / cleaned_data.groupby('meta_レースキー')['meta_後３Ｆタイム'].transform('count')

# Displaying the adjusted dataframe to verify the ranking
cleaned_data[['meta_レースキー', 'meta_血統登録番号', 'meta_着順', 'meta_後３Ｆタイム', '3F_time_rank']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['3F_time_rank'] = cleaned_data.groupby('meta_レースキー')['meta_後３Ｆタイム'].rank() / cleaned_data.groupby('meta_レースキー')['meta_後３Ｆタイム'].transform('count')


Unnamed: 0,meta_レースキー,meta_血統登録番号,meta_着順,meta_後３Ｆタイム,3F_time_rank
0,7023401,100184,2.0,39.9,0.444444
1,7032804,100184,8.0,38.7,0.65
2,9033503,100184,7.0,39.8,0.75
3,10032202,100184,1.0,40.5,0.111111
4,9034208,100184,12.0,43.5,1.0


In [9]:
# Calculate the Spearman correlation between the 3F time rank and the final positions
correlation = cleaned_data[['meta_着順', '3F_time_rank']].corr(method='spearman')

correlation

Unnamed: 0,meta_着順,3F_time_rank
meta_着順,1.0,0.665796
3F_time_rank,0.665796,1.0


In [10]:
# Ensure the race time is in datetime format for correct sorting
cleaned_data['meta_発走日時'] = pd.to_datetime(cleaned_data['meta_発走日時'])

# Sort the data by Horse ID and Race Time to track performance over time
sorted_data = cleaned_data.sort_values(by=['meta_血統登録番号', 'meta_発走日時'])

# Add columns to identify the next race for each horse and the change in final position
sorted_data['next_race_time'] = sorted_data.groupby('meta_血統登録番号')['meta_発走日時'].shift(-1)
sorted_data['next_final_position'] = sorted_data.groupby('meta_血統登録番号')['meta_着順'].shift(-1)

# For analysis, we only need rows where both the current and next races are known
analysis_data = sorted_data.dropna(subset=['next_race_time', 'next_final_position'])

# Display the first few rows to verify the new structure
analysis_data[['meta_血統登録番号', 'meta_発走日時', 'meta_着順', 'meta_後３Ｆタイム', 'next_race_time', 'next_final_position']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['meta_発走日時'] = pd.to_datetime(cleaned_data['meta_発走日時'])


Unnamed: 0,meta_血統登録番号,meta_発走日時,meta_着順,meta_後３Ｆタイム,next_race_time,next_final_position
161249,100002,2003-04-06 15:10:00+09:00,5.0,35.8,2003-04-12 15:40:00+09:00,1.0
161250,100002,2003-04-12 15:40:00+09:00,1.0,34.6,2003-05-11 15:40:00+09:00,2.0
161251,100002,2003-05-11 15:40:00+09:00,2.0,34.7,2003-11-30 14:40:00+09:00,2.0
161252,100002,2003-11-30 14:40:00+09:00,2.0,37.3,2004-01-05 15:45:00+09:00,5.0
161253,100002,2004-01-05 15:45:00+09:00,5.0,35.4,2004-10-30 15:45:00+09:00,8.0


In [11]:
# # Identify the top quartile of 3F time performers within each race
# # Quartile calculation within each race
# analysis_data['3F_time_quartile'] = analysis_data.groupby('meta_レースキー')['meta_後３Ｆタイム'].transform(
#     lambda x: pd.qcut(x, 4, labels=False, duplicates='drop')
# )

# # For simplification, consider 'fast' as being in the top quartile (0th quartile after labels=False)
# # Analyze the change in final position for these 'fast' horses
# # A negative change means improvement (a lower final position number is better)
# analysis_data['position_change'] = analysis_data['meta_着順'] - analysis_data['next_final_position']

# # Repeating the necessary step due to the execution environment issue
# # Calculate the average change in position for fast horses to see if there's a general improvement

# # Filter the analysis_data for fast horses in the top quartile within their race for the 3F time
# fast_horses_data = analysis_data[analysis_data['3F_time_quartile'] == 0]

# # Calculate the average change in position for these fast horses (next race position - current race position)
# average_position_change_fast_horses = fast_horses_data['position_change'].mean()

# This was something like -2.3, which means that the average change in position for fast horses is -2.3
# average_position_change_fast_horses

In [12]:
analysis_data[analysis_data["meta_レースキー"] == "10023205"].sort_values("meta_着順")

Unnamed: 0,meta_単勝払戻金,meta_複勝払戻金,meta_レースキー,meta_馬番,meta_血統登録番号,meta_発走日時,meta_単勝的中,meta_複勝的中,meta_複勝オッズ,meta_着順,...,num_距離,num_過去3走重み付き着順成績,num_入厩何日前逆数,cat_堅実な馬,cat_過去3走中1走訳あり凡走,cat_過去3走中2走好走,cat_過去3走繋がりあり,3F_time_rank,next_race_time,next_final_position


In [13]:
sorted_data = cleaned_data.sort_values(by=['meta_血統登録番号', 'meta_発走日時'])

# Adjusting the calculation of inverse weights to avoid division by zero without introducing lookahead
# The subtraction by the min might not be necessary unless you're trying to normalize the weights
sorted_data['inverse_days_weight'] = 1 / sorted_data['num_1走前経過日数']

def calculate_weighted_features(group):
    # Calculate rolling sums of weights for the past 3 races
    roll_sum_weights = group['inverse_days_weight'].rolling(window=3).sum()
    
    # Weighted 3F time rank using past 3 races data
    weighted_3F_time_rank = (group['3F_time_rank'] * group['inverse_days_weight']).rolling(window=3).sum() / roll_sum_weights
    
    # Calculate position changes based on past data only
    # Removing shift(-1) to avoid lookahead
    position_changes = group['meta_着順'].diff()  # This now correctly calculates the change based on previous race
    weighted_position_change = (position_changes * group['inverse_days_weight']).rolling(window=3).sum() / roll_sum_weights
    
    # Shift the entire calculation by 1 to use it for predictions without including the current race's data
    return pd.DataFrame({
        'weighted_avg_3F_time_rank': weighted_3F_time_rank.shift(1),  
        'weighted_avg_position_change': weighted_position_change.shift(1)  
    })

# Apply the function to each group
weighted_features = sorted_data.groupby('meta_血統登録番号').apply(calculate_weighted_features).reset_index(level=0, drop=True)

# Join the calculated weighted features back to the sorted_data DataFrame
sorted_data = sorted_data.join(weighted_features)

# Drop NaN values resulting from rolling calculations and shifting
sorted_data.dropna(subset=['weighted_avg_3F_time_rank', 'weighted_avg_position_change'], inplace=True)

# Display to verify the new columns
sorted_data[['meta_血統登録番号', 'meta_発走日時', 'weighted_avg_3F_time_rank', 'weighted_avg_position_change']].head()

Unnamed: 0,meta_血統登録番号,meta_発走日時,weighted_avg_3F_time_rank,weighted_avg_position_change
161253,100002,2004-01-05 15:45:00+09:00,0.169534,-3.067729
161254,100002,2004-10-30 15:45:00+09:00,0.283744,1.753564
161255,100002,2004-11-27 15:25:00+09:00,0.45348,2.590042
161256,100002,2004-12-11 15:35:00+09:00,0.377156,-0.740469
161257,100002,2005-01-15 15:35:00+09:00,0.36104,2.03027


In [14]:
# Calculate the Spearman correlation between the 3F time rank and the final positions
sorted_data[['meta_着順', "meta_複勝的中", 'weighted_avg_3F_time_rank', "weighted_avg_position_change"]].corr(method='spearman')

Unnamed: 0,meta_着順,meta_複勝的中,weighted_avg_3F_time_rank,weighted_avg_position_change
meta_着順,1.0,-0.716781,0.230324,0.073013
meta_複勝的中,-0.716781,1.0,-0.148397,-0.048977
weighted_avg_3F_time_rank,0.230324,-0.148397,1.0,0.26293
weighted_avg_position_change,0.073013,-0.048977,0.26293,1.0


In [15]:
from sklearn.model_selection import train_test_split


df = sorted_data.copy()

X = df
y = df["meta_複勝的中"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

X_train: (426862, 112)
X_test: (106716, 112)
y_train: (426862,)
y_test: (106716,)


In [16]:
import mlflow
import tempfile
import re
import japanize_matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import warnings
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from hyperopt import STATUS_OK
from JapanHorseRaceAnalytics.utilities.metrics import (
    calculate_payout_rate,
    kelly_criterion
)

from JapanHorseRaceAnalytics.utilities.plot import (
    plot_confusion_matrix,
    plot_roc_curve,
    plot_feature_importances,
    plot_shap_interaction_values,
    plot_correlation_matrix,
)



def create_objective_fn(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    payouts: pd.DataFrame,
    payout_column_name: str,
    mlflow_experiment_name: str,
):
    """
    payouts should have the same index as *_test and have the following columns:
    * 発走日時
    * odds
    * payout
    """

    def train(params):
        mlflow.set_experiment(experiment_name=mlflow_experiment_name)
        with mlflow.start_run():
            ########################################################################################
            # decay_rate = params["features__decay_rate"]

            for df in [X_train, X_test]:
                # 1. Weighted average position
                df["num_1走前標準化着順"] = (df["num_1走前着順"] - 1) / (df["num_1走前頭数"] - 1)
                df["num_2走前標準化着順"] = (df["num_2走前着順"] - 1) / (df["num_2走前頭数"] - 1)
                df["num_3走前標準化着順"] = (df["num_3走前着順"] - 1) / (df["num_3走前頭数"] - 1)
                # We want our factor (num_1走前経過日数) to start from 0
                # so subtract the minimum value for "days since last race" across all horses
                # Add a small number to avoid division by zero
                df["num_1走前重み"] = 1 / (df["num_1走前経過日数"] - df["num_1走前経過日数"].min() + 1e-6)
                df["num_2走前重み"] = 1 / (df["num_2走前経過日数"] - df["num_1走前経過日数"].min() + 1e-6)
                df["num_3走前重み"] = 1 / (df["num_3走前経過日数"] - df["num_1走前経過日数"].min() + 1e-6)
                # df["num_2走前重み"] = np.exp(-decay_rate * (df["num_2走前経過日数"] - df["num_1走前経過日数"].min()))
                # df["num_3走前重み"] = np.exp(-decay_rate * (df["num_3走前経過日数"] - df["num_1走前経過日数"].min()))
                # Calculate weighted average of the feature
                # weighted_feature_values = np.average([value for _, value in races], weights=weights)
                df["num_過去3走重み付き標準化着順"] = (
                    (df["num_1走前標準化着順"] * df["num_1走前重み"])
                    + (df["num_2走前標準化着順"] * df["num_2走前重み"])
                    + (df["num_3走前標準化着順"] * df["num_3走前重み"])
                ) / (df["num_1走前重み"] + df["num_2走前重み"] + df["num_3走前重み"])

                # 2. Weighted average time difference between the horse and the 3 horses behind it
                df["num_1走前後続馬平均タイム差"] = (
                    df[["num_1走前後続馬1タイム差", "num_1走前後続馬2タイム差", "num_1走前後続馬3タイム差"]]
                    .mean(axis=1)
                    .fillna(0)
                )
                df["num_2走前後続馬平均タイム差"] = (
                    df[["num_2走前後続馬1タイム差", "num_2走前後続馬2タイム差", "num_2走前後続馬3タイム差"]]
                    .mean(axis=1)
                    .fillna(0)
                )
                df["num_3走前後続馬平均タイム差"] = (
                    df[["num_3走前後続馬1タイム差", "num_3走前後続馬2タイム差", "num_3走前後続馬3タイム差"]]
                    .mean(axis=1)
                    .fillna(0)
                )
                df["num_過去3走重み付き後続馬平均タイム差"] = (
                    (df["num_1走前後続馬平均タイム差"] * df["num_1走前重み"])
                    + (df["num_2走前後続馬平均タイム差"] * df["num_2走前重み"])
                    + (df["num_3走前後続馬平均タイム差"] * df["num_3走前重み"])
                ) / (df["num_1走前重み"] + df["num_2走前重み"] + df["num_3走前重み"])
            ########################################################################################
            preprocessor = ColumnTransformer(
                transformers=[
                    # (
                    #     "cat",
                    #     OneHotEncoder(drop="if_binary", handle_unknown="error"),
                    #     ["cat_性別", "cat_場コード", "cat_トラック種別"],
                    # ),
                    (
                        "num",
                        StandardScaler(),
                        [
                            "num_複勝率",
                            "num_1走前経過日数",
                            "num_過去3走重み付き標準化着順",
                            "num_過去3走重み付き後続馬平均タイム差",
                            "weighted_avg_3F_time_rank",
                            "weighted_avg_position_change",
                        ],
                    ),
                ],
                remainder="drop",
            )

            # Get all keys from params where the key starts with "smote__" and remove "smote__" from the key
            smote_params = {k.split("__")[1]: v for k, v in params.items() if k.startswith("smote__")}

            # do the same for classifier
            classifier_params = {k.split("__")[1]: v for k, v in params.items() if k.startswith("classifier__")}

            model = ImblearnPipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("smote", SMOTE(**smote_params)),
                    ("classifier", LGBMClassifier(**classifier_params)),
                ]
            )

            model.fit(X_train, y_train)
            y_proba = model.predict_proba(X_test)
            y_pred = model.predict(X_test)

            metrics = {
                "loss": -precision_score(y_test, y_pred),
                "log_loss": log_loss(y_test, y_proba[:, 1]),
                "accuracy": accuracy_score(y_test, y_pred),
                "precision": precision_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "roc_auc": roc_auc_score(y_test, y_pred),
            }

            mlflow.log_params(classifier_params)
            # mlflow.log_param("decay_rate", decay_rate)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(sk_model=model, artifact_path="model")

            payout = calculate_payout_rate(
                payouts=payouts,
                y_test=y_test,
                y_proba_true=y_proba[:, 1],
                groupby=[
                    ("all", None),
                    ("month", payouts["発走日時"].dt.month),
                    ("season", payouts["発走日時"].dt.month % 12 // 3),
                    ("year", payouts["発走日時"].dt.year),
                ],
                payout_column_name=payout_column_name,
            )

            # Save payout rates as csv
            with tempfile.NamedTemporaryFile(prefix="payout_rate_", suffix=".csv") as f:
                payout.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name)

            # Log payout rates as metrics
            payout_metrics = {}
            for group_name, group in payout.groupby("group"):
                for i, row in group.iterrows():
                    key = re.sub(r"\W", "_", f"payout_rate_{group_name}_{row['part']}")
                    payout_metrics[key] = row["payout_rate"]
            mlflow.log_metrics(payout_metrics)

            # Suppress UserWarning messages from matplotlib
            warnings.filterwarnings("ignore", category=UserWarning)

            # Confusion Matrix
            fig, axes = plot_confusion_matrix(y_test, y_pred)
            with tempfile.NamedTemporaryFile(
                prefix="confusion_matrix_", suffix=".png"
            ) as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # ROC Curve
            fig, ax = plot_roc_curve(y_test, y_proba[:, 1])
            with tempfile.NamedTemporaryFile(prefix="roc_curve_", suffix=".png") as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Feature Importances Plot
            fig, ax = plot_feature_importances(
                preprocessor.get_feature_names_out(),
                model.named_steps["classifier"].feature_importances_,
                top_n=50,
            )
            with tempfile.NamedTemporaryFile(
                prefix="feature_importance_", suffix=".png"
            ) as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Feature Importances Data
            feature_importances = zip(
                preprocessor.get_feature_names_out(),
                model.named_steps["classifier"].feature_importances_,
            )
            feature_importances_df = (
                pd.DataFrame(feature_importances, columns=["feature", "importance"])
                .sort_values("importance", ascending=False)
                .reset_index(drop=True)
            )
            with tempfile.NamedTemporaryFile(prefix="feature_importance_", suffix=".csv") as f:
                feature_importances_df.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name)
            
            # SHAP values
            X_test_sample = X_test.sample(n=5000, random_state=42)
            X_test_sample_prep = preprocessor.transform(X_test_sample)
            explainer = shap.TreeExplainer(
                model=model.named_steps["classifier"],
                feature_names=preprocessor.get_feature_names_out(),
            )
            # Because we are working with a binary classifier, we only need the SHAP values for the positive class.
            # E.g., if you change 1->0 the waterfall plot flips backwards only.
            shap_values = explainer(X_test_sample_prep)[:, :, 1]
            shap_interaction_values = explainer.shap_interaction_values(X_test_sample_prep)

            # SHAP beeswarm plot
            shap.plots.beeswarm(shap_values, show=False)
            plt.tight_layout()
            with tempfile.NamedTemporaryFile(prefix="shap_beeswarm_", suffix=".png") as f:
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # SHAP interaction values heatmap
            fig, ax = plot_shap_interaction_values(shap_interaction_values, preprocessor.get_feature_names_out())
            with tempfile.NamedTemporaryFile(prefix="shap_interactions_", suffix=".png") as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # SHAP bar plot
            shap.plots.bar(shap_values, show=False)
            plt.tight_layout()
            with tempfile.NamedTemporaryFile(prefix="shap_bar_", suffix=".png") as f:
                plt.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Correlation matrix
            fig, ax = plot_correlation_matrix(data=preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())
            with tempfile.NamedTemporaryFile(prefix="correlation_matrix_", suffix=".png") as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Bankroll over time
            results = (
                pd.concat(
                    [
                        payouts,
                        pd.Series(y_pred.astype(bool)).rename("pred"),
                        pd.Series(y_proba[:, 1]).rename("proba_true"),
                        y_test.astype(bool).reset_index(drop=True).rename("actual"),
                    ],
                    axis=1,
                )
                .set_index("発走日時")
                .sort_index()
                .dropna()
            )
            b = results["odds"] - 1
            p = results["proba_true"]
            q = 1 - p
            japanize_matplotlib.japanize()
            for confidence in [0.5, 0.65, 0.8]:
                fig, ax = plt.subplots(figsize=(15, 5))
                for multiplier in [0.1, 0.2, 0.3]:
                    results[f"kelly_{multiplier}"] = kelly_criterion(b, p, q).clip(lower=0) * multiplier
                    bankroll = 10_000
                    history = []
                    for i, row in results.iterrows():
                        # bet in 100 yen increments
                        bet_amount = round(row[f"kelly_{multiplier}"] * bankroll / 100) * 100
                        bet = bet_amount > 0 and row["proba_true"] >= confidence
                        if bet and row["actual"] is True:
                            bankroll += (row["odds"] - 1) * bet_amount
                        elif bet and row["actual"] is False:
                            bankroll -= bet_amount
                        history.append(bankroll)
                    results["bankroll"] = history
                    results["bankroll"].plot(ax=ax, label=f"Kelly Criterion x {multiplier}")
                ax.set_title("Bankroll over time (Confidence: {confidence})")
                ax.set_ylabel("Bankroll")
                ax.set_xlabel("Date")
                ax.legend()
                plt.tight_layout()
                plt.grid()
                with tempfile.NamedTemporaryFile(prefix=f"bets_{confidence}_confidence_", suffix=".png") as f:
                    fig.savefig(f.name)
                    plt.close()
                    mlflow.log_artifact(f.name)
            return {"status": STATUS_OK, "params": params, "model": model, **metrics}

    return train

In [17]:
random_state = 42
from hyperopt import Trials, fmin, hp, tpe, SparkTrials
from hyperopt.pyll.base import scope

space = {
    "smote__random_state": random_state,
    # "features__decay_rate": hp.uniform("decay_rate", 0.001, 0.1),
    "classifier__boosting_type": "gbdt",
    "classifier__learning_rate": hp.loguniform("learning_rate", -5, 0),  # between e^-5 and 1
    "classifier__n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
    "classifier__max_depth": scope.int(hp.quniform("max_depth", 3, 10, 1)),
    "classifier__num_leaves": scope.int(hp.quniform("num_leaves", 20, 150, 1)),
    "classifier__min_child_samples": scope.int(hp.quniform("min_child_samples", 20, 500, 1)),
    "classifier__feature_fraction": hp.uniform("feature_fraction", 0.5, 1.0),
    "classifier__lambda_l1": hp.uniform("lambda_l1", 0, 5),
    "classifier__lambda_l2": hp.uniform("lambda_l2", 0, 5),
    "classifier__min_split_gain": hp.uniform("min_split_gain", 0, 1),
    "classifier__min_child_weight": hp.uniform("min_child_weight", 0.001, 10),
    "classifier__subsample": hp.uniform("subsample", 0.5, 1),
    "classifier__colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "classifier__reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
    "classifier__reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
    "classifier__objective": "binary",
    "classifier__verbose": -1,
    "classifier__random_state": random_state,
}

In [18]:
payouts = (
    X_test[["meta_発走日時", "meta_複勝払戻金", "meta_複勝オッズ"]]
    .reset_index(drop=True)
    .rename(
        columns={
            "meta_発走日時": "発走日時",
            "meta_複勝払戻金": "payout",
            "meta_複勝オッズ": "odds",
        }
    )
    .assign(発走日時=lambda x: pd.to_datetime(x["発走日時"]))
)

experiment_name = "20240316-eda1"
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

fn = create_objective_fn(
    X_train,
    y_train,
    X_test,
    y_test,
    payouts=payouts,
    payout_column_name="payout",
    mlflow_experiment_name=experiment_name,
)

In [19]:
# early_stop_fn = no_progress_loss(iteration_stop_count=10, absolute_increase=0.001)
# trials = Trials()
# fmin(
#     fn=fn,
#     space=space,
#     algo=tpe.suggest,
#     max_evals=1,
#     trials=trials,
# )

trials = SparkTrials(parallelism=3, spark_session=spark)
fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=30, trials=trials)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

build_posterior_wrapper took 0.002515 seconds
TPE using 0 trials
build_posterior_wrapper took 0.001690 seconds
TPE using 1/1 trials with best loss inf
build_posterior_wrapper took 0.001564 seconds
TPE using 2/2 trials with best loss inf
build_posterior_wrapper took 0.002073 seconds                       (0 + 1) / 1]
TPE using 3/3 trials with best loss inf
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 2:>    (0 + 1) / 1]1]
Setuptools is replacing distutils.
Setuptools is replacing distutils.
Closing down clientserver connection                                            


  3%|▎         | 1/30 [00:58<28:10, 58.29s/trial, best loss: -0.38905658569306695]

build_posterior_wrapper took 0.001901 secondse 2:>                  (0 + 1) / 1]
TPE using 4/4 trials with best loss -0.389057
Closing down clientserver connection                                            
[Stage 2:>                                                          (0 + 1) / 1]

  7%|▋         | 2/30 [01:00<11:45, 25.19s/trial, best loss: -0.38905658569306695]

build_posterior_wrapper took 0.001604 seconds
TPE using 5/5 trials with best loss -0.389057
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 4:>    (0 + 1) / 1]1]
Closing down clientserver connection                                            


 10%|█         | 3/30 [01:20<10:17, 22.86s/trial, best loss: -0.38905658569306695]

build_posterior_wrapper took 0.001671 secondse 4:>                  (0 + 1) / 1]
TPE using 6/6 trials with best loss -0.389057
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 5:>    (0 + 1) / 1]
Setuptools is replacing distutils.
Closing down clientserver connection                                            


 13%|█▎        | 4/30 [02:24<16:59, 39.22s/trial, best loss: -0.38905658569306695]

build_posterior_wrapper took 0.001632 secondse 5:>                  (0 + 1) / 1]
TPE using 7/7 trials with best loss -0.389057
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 6:>    (0 + 1) / 1]
Closing down clientserver connection                                            
[Stage 4:>                  (0 + 1) / 1][Stage 6:>                  (0 + 1) / 1]

 17%|█▋        | 5/30 [03:14<17:59, 43.18s/trial, best loss: -0.3938738138838022] 

build_posterior_wrapper took 0.001835 seconds
TPE using 8/8 trials with best loss -0.393874
Closing down clientserver connection                                            


 20%|██        | 6/30 [03:27<13:10, 32.95s/trial, best loss: -0.3938738138838022]

Closing down clientserver connection                                            


 23%|██▎       | 7/30 [03:28<08:37, 22.50s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001676 seconds
TPE using 9/9 trials with best loss -0.393874
build_posterior_wrapper took 0.002738 seconds                       (0 + 1) / 1]
TPE using 10/10 trials with best loss -0.393874
Setuptools is replacing distutils.) / 1][Stage 8:>                  (0 + 1) / 1]
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 9:>    (0 + 1) / 1]
Setuptools is replacing distutils.
Closing down clientserver connection                                            
[Stage 7:>                  (0 + 1) / 1][Stage 9:>                  (0 + 1) / 1]

 27%|██▋       | 8/30 [04:30<12:46, 34.84s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001584 seconds
TPE using 11/11 trials with best loss -0.393874
Setuptools is replacing distutils.:>    (0 + 1) / 1][Stage 10:>   (0 + 1) / 1]
Closing down clientserver connection                                            


 30%|███       | 9/30 [04:46<10:08, 28.97s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001737 secondse 10:>                 (0 + 1) / 1]
TPE using 12/12 trials with best loss -0.393874
Closing down clientserver connection                                            


 33%|███▎      | 10/30 [04:48<06:52, 20.65s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001677 secondse 11:>                 (0 + 1) / 1]
TPE using 13/13 trials with best loss -0.393874
Setuptools is replacing distutils.1:>   (0 + 1) / 1][Stage 12:>   (0 + 1) / 1]
Setuptools is replacing distutils.
Closing down clientserver connection                                            


 37%|███▋      | 11/30 [05:31<08:43, 27.54s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.005236 seconds
TPE using 14/14 trials with best loss -0.393874
Closing down clientserver connection                                            


 40%|████      | 12/30 [05:33<05:56, 19.78s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001631 secondse 13:>                 (0 + 1) / 1]
TPE using 15/15 trials with best loss -0.393874
Closing down clientserver connection                                            


 43%|████▎     | 13/30 [05:43<04:46, 16.83s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001933 secondse 14:>                 (0 + 1) / 1]
TPE using 16/16 trials with best loss -0.393874
Setuptools is replacing distutils.4:>   (0 + 1) / 1][Stage 15:>   (0 + 1) / 1]
Setuptools is replacing distutils.
Setuptools is replacing distutils.
Closing down clientserver connection                                            


 47%|████▋     | 14/30 [06:32<07:05, 26.62s/trial, best loss: -0.3938738138838022]

build_posterior_wrapper took 0.001613 secondse 14:>                 (0 + 1) / 1]
TPE using 17/17 trials with best loss -0.393874
Setuptools is replacing distutils.4:>   (0 + 1) / 1][Stage 16:>   (0 + 1) / 1]
Closing down clientserver connection                                            


 50%|█████     | 15/30 [06:57<06:32, 26.16s/trial, best loss: -0.3951001254978993]

build_posterior_wrapper took 0.001659 secondse 16:>                 (0 + 1) / 1]
TPE using 18/18 trials with best loss -0.395100
Setuptools is replacing distutils.6:>   (0 + 1) / 1][Stage 17:>   (0 + 1) / 1]
Closing down clientserver connection                                            


 53%|█████▎    | 16/30 [07:15<05:32, 23.73s/trial, best loss: -0.3951001254978993]

build_posterior_wrapper took 0.001654 secondse 17:>                 (0 + 1) / 1]
TPE using 19/19 trials with best loss -0.395100
Setuptools is replacing distutils.7:>   (0 + 1) / 1][Stage 18:>   (0 + 1) / 1]
Closing down clientserver connection                                            


 57%|█████▋    | 17/30 [08:49<09:40, 44.68s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001701 secondse 18:>                 (0 + 1) / 1]
TPE using 20/20 trials with best loss -0.396149
Closing down clientserver connection                                            


 60%|██████    | 18/30 [08:53<06:29, 32.46s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001649 secondse 19:>                 (0 + 1) / 1]
TPE using 21/21 trials with best loss -0.396149
Setuptools is replacing distutils.9:>   (0 + 1) / 1][Stage 20:>   (0 + 1) / 1]
Setuptools is replacing distutils.
Closing down clientserver connection                                            


 63%|██████▎   | 19/30 [09:38<06:39, 36.29s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001621 secondse 19:>                 (0 + 1) / 1]
TPE using 22/22 trials with best loss -0.396149
Closing down clientserver connection                                            
[Stage 19:>                 (0 + 1) / 1][Stage 21:>                 (0 + 1) / 1]

 67%|██████▋   | 20/30 [09:44<04:32, 27.21s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001717 seconds
TPE using 23/23 trials with best loss -0.396149
Setuptools is replacing distutils.1:>   (0 + 1) / 1][Stage 22:>   (0 + 1) / 1]
Setuptools is replacing distutils.
Closing down clientserver connection                                            


 70%|███████   | 21/30 [10:43<05:31, 36.83s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001689 secondse 21:>                 (0 + 1) / 1]
TPE using 24/24 trials with best loss -0.396149
Closing down clientserver connection                                            


 73%|███████▎  | 22/30 [10:45<03:31, 26.38s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001654 seconds                       (0 + 1) / 1]
TPE using 25/25 trials with best loss -0.396149
Closing down clientserver connection                                            


 77%|███████▋  | 23/30 [10:48<02:15, 19.38s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001722 secondse 24:>                 (0 + 1) / 1]
TPE using 26/26 trials with best loss -0.396149
Setuptools is replacing distutils.4:>   (0 + 1) / 1][Stage 25:>   (0 + 1) / 1]
Setuptools is replacing distutils.
Setuptools is replacing distutils.
Closing down clientserver connection                                            
[Stage 24:>                 (0 + 1) / 1][Stage 25:>                 (0 + 1) / 1]

 80%|████████  | 24/30 [12:21<04:07, 41.28s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001652 seconds
TPE using 27/27 trials with best loss -0.396149
Setuptools is replacing distutils.5:>   (0 + 1) / 1][Stage 26:>   (0 + 1) / 1]
Closing down clientserver connection                                            
[Stage 24:>                 (0 + 1) / 1][Stage 25:>                 (0 + 1) / 1]

 83%|████████▎ | 25/30 [13:01<03:24, 40.95s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001626 seconds
TPE using 28/28 trials with best loss -0.396149
Setuptools is replacing distutils.5:>   (0 + 1) / 1][Stage 27:>   (0 + 1) / 1]
Closing down clientserver connection                                            
[Stage 25:>                 (0 + 1) / 1][Stage 27:>                 (0 + 1) / 1]

 87%|████████▋ | 26/30 [14:28<03:39, 54.89s/trial, best loss: -0.39614876536937305]

build_posterior_wrapper took 0.001751 seconds
TPE using 29/29 trials with best loss -0.396149
Setuptools is replacing distutils.7:>   (0 + 1) / 1][Stage 28:>   (0 + 1) / 1]
Closing down clientserver connection                                            
[Stage 25:>                 (0 + 1) / 1][Stage 28:>                 (0 + 1) / 1]

 90%|█████████ | 27/30 [15:01<02:24, 48.07s/trial, best loss: -0.39614876536937305]

Setuptools is replacing distutils.8:>   (0 + 1) / 1][Stage 29:>   (0 + 1) / 1]
Closing down clientserver connection                                            


 93%|█████████▎| 28/30 [15:56<01:40, 50.22s/trial, best loss: -0.39614876536937305]

Closing down clientserver connection                                            


 97%|█████████▋| 29/30 [16:21<00:42, 42.68s/trial, best loss: -0.39614876536937305]

Closing down clientserver connection                                            


100%|██████████| 30/30 [17:01<00:00, 34.05s/trial, best loss: -0.39614876536937305]

Queue empty, exiting run.





Closing down clientserver connection
Total Trials: 30: 30 succeeded, 0 failed, 0 cancelled.


{'colsample_bytree': 0.9714517337924691,
 'feature_fraction': 0.9895358123133442,
 'lambda_l1': 1.2789820470995301,
 'lambda_l2': 2.8427382476927328,
 'learning_rate': 0.031200627683390197,
 'max_depth': 8.0,
 'min_child_samples': 185.0,
 'min_child_weight': 6.3447195744828955,
 'min_split_gain': 0.618233497310962,
 'n_estimators': 345.0,
 'num_leaves': 67.0,
 'reg_alpha': 0.6743049381465438,
 'reg_lambda': 0.6569291767377803,
 'subsample': 0.5329008714579213}

24/03/16 19:23:15 WARN TransportChannelHandler: Exception in connection from /192.168.40.105:57621
java.io.IOException: Operation timed out
	at java.base/sun.nio.ch.SocketDispatcher.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:47)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:339)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:293)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:268)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:425)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:254)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:357)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOpti