# Hypothesis 1: A horse whose body weight is closer to its average placing weight (its mean weight over the races in which it finished in the top 3) is more likely to place in the top 3.

There is a slight trend, but it's not statistically significant at the conventional threshold.

# Hypothesis 2: A horse closer to its average weight is more likely to perform better.

Supported: the relationship is statistically significant.

https://chat.openai.com/g/g-HMNcP6w7d-data-analyst/c/78350b74-65b7-41cb-bf8d-5eade31e0075

In [1]:
import re
import tempfile
import warnings

import japanize_matplotlib
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import shap
import trueskill
from hyperopt import STATUS_OK, SparkTrials, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from JapanHorseRaceAnalytics.utilities.base import (
    get_random_seed,
    get_spark_session,
    read_hive_table,
)
from JapanHorseRaceAnalytics.utilities.metrics import (
    calculate_payout_rate,
    kelly_criterion,
)
from JapanHorseRaceAnalytics.utilities.plot import (
    plot_confusion_matrix,
    plot_correlation_matrix,
    plot_feature_importances,
    plot_roc_curve,
    plot_shap_interaction_values,
)
from JapanHorseRaceAnalytics.utilities.structured_logger import logger

# Apply japanize_matplotlib's matplotlib setup. Presumably this registers a
# Japanese-capable font so the Japanese column names render in plots — TODO confirm.
japanize_matplotlib.japanize()

In [2]:
# Obtain the Spark session from the project helper; it is handed to
# read_hive_table when the feature table is loaded.
spark = get_spark_session()

24/03/20 14:39:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/20 14:39:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/20 14:39:51 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load the curated feature table through the project helper.
data = read_hive_table(
    table_name="features_20240304_v1",
    schema="jhra_curated",
    spark_session=spark,
    # use_cache=False,
    parse_dates=["meta_発走日時"],
)

rows_before = data.shape[0]
logger.info(f"Original data length: {rows_before}")

# Only ONE filter is currently active: keep rows that have a finishing
# position (meta_着順 is not null).
#
# The following candidate filters are intentionally disabled and kept here
# for reference only:
#   data["cat_トラック種別"] != "障害"
#   data["meta_異常区分"] == "0"
#   data["num_1走前着順"].notnull()
#   data["num_2走前着順"].notnull()
#   data["num_3走前着順"].notnull()
#   data["meta_発走日時"] >= "2000-01-01"
#
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignment below does not raise SettingWithCopyWarning.
data = data[data["meta_着順"].notna()].copy()

rows_after = data.shape[0]
rows_dropped = rows_before - rows_after
# Guard the percentage against an empty source table (division by zero).
dropped_pct = 100 * rows_dropped / rows_before if rows_before else 0.0
logger.info(
    f"Data length after filtering: {rows_after} (dropped {rows_dropped} rows, {dropped_pct:.2f}%)"
)

# Interpolate missing values for num_馬体重 (20 instances from 1999 ~ 2017).
# Rows are ordered by start time within each horse so the linear interpolation
# runs along that horse's race history; transform() returns values aligned to
# the original index, so the assignment lines up row-for-row despite the sort.
data["num_馬体重"] = (
    data.sort_values("meta_発走日時")
    .groupby("meta_血統登録番号")["num_馬体重"]
    .transform(lambda x: x.interpolate(method="linear", limit_direction="both"))
)

# Renumber rows 0..n-1 after filtering (reassignment instead of inplace=True).
data = data.reset_index(drop=True)
data.head()

{"event": "Read from hive jhra_curated.features_20240304_v1", "level": "info", "timestamp": "2024-03-20T05:40:04.904042Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
24/03/20 14:40:05 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/20 14:40:05 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/03/20 14:40:06 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
{"event": "Write to parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_20240304_v1.snappy.parquet", "level": "info", "timestamp": "2024-03-20T05:40:07.048704Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
24/03/20 14:40:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
{"event": "Read from parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_2024030

Unnamed: 0,meta_単勝払戻金,meta_複勝払戻金,meta_レースキー,meta_馬番,meta_血統登録番号,meta_発走日時,meta_単勝的中,meta_単勝オッズ,meta_複勝的中,meta_複勝オッズ,...,num_6走前3着タイム差,cat_トラック種別,num_距離,cat_距離区分,num_過去3走重み付き着順成績,num_入厩何日前逆数,cat_堅実な馬,cat_過去3走中1走訳あり凡走,cat_過去3走中2走好走,cat_過去3走繋がりあり
0,0,0,10022106,12,100005,2002-07-13 12:55:00+09:00,0,106.1,0,15.5,...,,芝,1000,短距離,,1.0,False,False,False,False
1,0,0,10022706,10,100005,2002-08-03 12:55:00+09:00,0,162.2,0,17.8,...,,芝,1200,短距離,,1.0,False,False,False,False
2,0,380,10023401,5,100005,2002-08-18 10:05:00+09:00,0,52.1,1,3.8,...,,芝,1200,短距離,,1.0,False,False,False,False
3,0,0,9024401,16,100005,2002-09-15 10:05:00+09:00,0,18.1,0,3.0,...,,芝,1200,短距離,0.189141,1.0,False,False,False,False
4,0,0,9024701,7,100005,2002-09-28 10:05:00+09:00,0,5.4,0,2.6,...,,ダート,1200,短距離,0.169981,1.0,False,False,False,False


In [15]:
# num_3着内馬体重: the horse's num_馬体重 on rows where meta_着順 <= 3, NaN otherwise.
# Series.where is the vectorized equivalent of the previous row-wise
# DataFrame.apply(axis=1) and yields the same values.
data["num_3着内馬体重"] = data["num_馬体重"].where(data["meta_着順"] <= 3)

# num_3着内平均馬体重: per-horse (meta_血統登録番号) mean of num_3着内馬体重, broadcast
# back to every row of that horse. NaN rows are skipped by the mean, so a horse
# with no top-3 finish gets NaN.
# NOTE(review): the mean is taken over ALL of the horse's rows in `data`,
# including the current race and later races, so this feature uses information
# that is not available before the race — confirm this is acceptable for the
# hypothesis being tested.
data["num_3着内平均馬体重"] = data.groupby("meta_血統登録番号")["num_3着内馬体重"].transform("mean")

# num_3着内平均馬体重差: signed difference between the race-day weight and the
# horse's average placing weight.
data["num_3着内平均馬体重差"] = data["num_馬体重"] - data["num_3着内平均馬体重"]

In [53]:
# Export the rows of 10,000 randomly chosen horses to data.csv.
# NOTE(review): .sample() is called without random_state, so a different set
# of horses is drawn on every run.
all_horse_ids = pd.Series(data["meta_血統登録番号"].unique())
sampled_horse_ids = all_horse_ids.sample(10_000)
is_sampled_horse = data["meta_血統登録番号"].isin(sampled_horse_ids)
weight_export_columns = [
    "meta_レースキー",
    "meta_血統登録番号",
    "meta_発走日時",
    "meta_着順",
    "num_馬体重",
]
data.loc[is_sampled_horse, weight_export_columns].to_csv("data.csv", index=False)

In [46]:
# Shape of the array of distinct meta_血統登録番号 values, i.e. (number of distinct ids,).
pd.unique(data["meta_血統登録番号"]).shape

(118561,)

In [60]:
# Keep only rows where all three num_N走前ＩＤＭ columns (N = 1, 2, 3) are present.
idm_columns = ["num_1走前ＩＤＭ", "num_2走前ＩＤＭ", "num_3走前ＩＤＭ"]
has_all_idm = data[idm_columns].notna().all(axis=1)
df = data[has_all_idm]

# Draw 10,000 horses at random from the filtered frame and export their rows.
# NOTE(review): .sample() is called without random_state, so the selection
# changes from run to run.
df_horse_ids = pd.Series(df["meta_血統登録番号"].unique())
df_sampled_ids = df_horse_ids.sample(10_000)
df_row_selected = df["meta_血統登録番号"].isin(df_sampled_ids)

age_export_columns = [
    "meta_レースキー",
    "meta_血統登録番号",
    "meta_発走日時",
    "meta_着順",
    "num_年齢",
    # Currently excluded from the export:
    # "num_1走前経過日数",
    # "num_2走前経過日数",
    # "num_3走前経過日数",
    # "num_1走前ＩＤＭ",
    # "num_2走前ＩＤＭ",
    # "num_3走前ＩＤＭ",
]

df.loc[df_row_selected, age_export_columns].to_csv("data4.csv", index=False)

24/03/20 16:23:32 WARN HikariPool: HikariPool-2 - Retrograde clock change detected (housekeeper delta=25s714ms), soft-evicting connections from pool.
24/03/20 16:23:32 WARN HikariPool: HikariPool-1 - Retrograde clock change detected (housekeeper delta=25s716ms), soft-evicting connections from pool.
