# Lambda Rank

In [46]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
import trueskill
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from lightgbm import LGBMRanker
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit

from JapanHorseRaceAnalytics.utilities.base import get_spark_session, read_hive_table
from JapanHorseRaceAnalytics.utilities.structured_logger import logger

In [2]:
spark = get_spark_session()

24/03/17 11:26:39 WARN Utils: Your hostname, Hanks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.6.115 instead (on interface en0)
24/03/17 11:26:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/17 11:26:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 11:26:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
data = read_hive_table(
    table_name="features_20240304_v1",
    schema="jhra_curated",
    spark_session=spark,
    # use_cache=False,
    parse_dates=["meta_発走日時"],
)

rows_before = data.shape[0]
logger.info(f"Original data length: {rows_before}")

data = data[
    # (data["cat_トラック種別"] != "障害")
    (~data["meta_着順"].isna())
    # & (data["meta_異常区分"] == "0")
    # & (data["num_1走前着順"].notnull())
    # & (data["num_2走前着順"].notnull())
    # & (data["num_3走前着順"].notnull())
    # & (data["meta_発走日時"] >= "2000-01-01")
]

rows_after = data.shape[0]
logger.info(
    f"Data length after filtering: {rows_after} (dropped {rows_before - rows_after} rows, {100 * (rows_before - rows_after) / rows_before:.2f}%)"
)

# Interpolate missing values for num_馬体重 (20 instances from 1999 ~ 2017)
data["num_馬体重"] = (
    data.sort_values("meta_発走日時")
    .groupby("meta_血統登録番号")["num_馬体重"]
    .transform(lambda x: x.interpolate(method="linear", limit_direction="both"))
)

data.reset_index(drop=True, inplace=True)
data.head()

{"event": "Read from parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_20240304_v1.snappy.parquet to pandas", "level": "info", "timestamp": "2024-03-17T02:26:40.926060Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
{"event": "Original data length: 1217019", "level": "info", "timestamp": "2024-03-17T02:26:41.534836Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}
{"event": "Data length after filtering: 1206122 (dropped 10897 rows, 0.90%)", "level": "info", "timestamp": "2024-03-17T02:26:41.919006Z", "logger": "JapanHorseRaceAnalytics.utilities.base"}


Unnamed: 0,meta_単勝払戻金,meta_複勝払戻金,meta_レースキー,meta_馬番,meta_血統登録番号,meta_発走日時,meta_単勝的中,meta_複勝的中,meta_複勝オッズ,meta_着順,...,cat_6走前休養理由分類コード,num_6走前3着タイム差,cat_トラック種別,num_距離,num_過去3走重み付き着順成績,num_入厩何日前逆数,cat_堅実な馬,cat_過去3走中1走訳あり凡走,cat_過去3走中2走好走,cat_過去3走繋がりあり
0,0,0,6033104,9,100045,2003-03-01 11:20:00+09:00,0,0,20.5,14.0,...,,,ダート,1800,,1.0,False,False,False,False
1,0,0,1032410,3,100051,2003-09-21 14:55:00+09:00,0,0,3.6,7.0,...,,,芝,2600,,1.0,False,False,False,False
2,0,0,1032809,4,100051,2003-10-05 14:10:00+09:00,0,0,11.4,12.0,...,,,芝,2000,,1.0,False,False,False,False
3,0,1330,4022306,2,100076,2002-07-20 12:50:00+09:00,0,1,9.5,3.0,...,,,芝,1200,,1.0,False,False,False,False
4,0,0,4022706,3,100076,2002-08-03 12:50:00+09:00,0,0,1.7,6.0,...,,,芝,1600,,1.0,False,False,False,False


In [4]:
# Assuming df is your DataFrame
df_sorted = data.sort_values(by=["meta_発走日時", "meta_レースキー"])

# Initialize the TrueSkill environment
env = trueskill.TrueSkill(draw_probability=0)  # No draws in horse racing

# Initialize ratings for all horses
horse_ratings = {
    horse_id: env.create_rating()
    for horse_id in df_sorted["meta_血統登録番号"].unique()
}

# Placeholder for ratings at each point in time
df_sorted["rating_post_race"] = pd.NA

# Iterate through races in chronological order
for (race_datetime, race_id), race_data in tqdm.tqdm(df_sorted.groupby(["meta_発走日時", "meta_レースキー"])):
    race_results = race_data.sort_values("meta_着順")
    horse_ids = race_results["meta_血統登録番号"].tolist()
    horse_groups = [[horse_ratings[horse_id]] for horse_id in horse_ids]
    ranks = list(range(len(horse_groups)))  # Lower rank number means a better position

    # Update ratings based on the race outcome
    updated_ratings = env.rate(horse_groups, ranks=ranks)

    # Directly update the DataFrame with the new ratings
    for index, (horse_id, new_rating_group) in zip(race_results.index, zip(horse_ids, updated_ratings)):
        horse_ratings[horse_id] = new_rating_group[0]  # Update with new rating
        df_sorted.at[index, "rating_post_race"] = horse_ratings[horse_id].mu  # Directly assign the rating

100%|██████████| 86237/86237 [03:03<00:00, 470.60it/s]


In [5]:
# Lag the rating_post_race by one row per horse (meta_血統登録番号) chronologically (meta_発走日時)
df_sorted["rating_pre_race"] = (
    df_sorted.sort_values("meta_発走日時")
    .groupby("meta_血統登録番号")["rating_post_race"]
    .shift(1)
    .fillna(25.0)
)

In [6]:
# Step 1: Calculate the total rating sum for each race
race_rating_sum = df_sorted.groupby("meta_レースキー")["rating_pre_race"].transform("sum")

# Step 2: Calculate the count of horses in each race
race_horse_count = df_sorted.groupby("meta_レースキー")["rating_pre_race"].transform("count")

# Step 3: Calculate the mean competitor rating for each horse
df_sorted["mean_competitor_rating_pre_race"] = (race_rating_sum - df_sorted["rating_pre_race"]) / (race_horse_count - 1)

In [7]:
df_sorted["num_1走前標準化着順"] = (df_sorted["num_1走前着順"] - 1) / (df_sorted["num_1走前頭数"] - 1)
df_sorted["num_2走前標準化着順"] = (df_sorted["num_2走前着順"] - 1) / (df_sorted["num_2走前頭数"] - 1)
df_sorted["num_3走前標準化着順"] = (df_sorted["num_3走前着順"] - 1) / (df_sorted["num_3走前頭数"] - 1)

# We want our factor (num_1走前経過日数) to start from 0
# so subtract the minimum value for "days since last race" across all horses
# Add a small number to avoid division by zero
df_sorted["num_1走前重み"] = 1 / (df_sorted["num_1走前経過日数"] - df_sorted["num_1走前経過日数"].min() + 1e-6)
df_sorted["num_2走前重み"] = 1 / (df_sorted["num_2走前経過日数"] - df_sorted["num_1走前経過日数"].min() + 1e-6)
df_sorted["num_3走前重み"] = 1 / (df_sorted["num_3走前経過日数"] - df_sorted["num_1走前経過日数"].min() + 1e-6)

# Calculate weighted average of the feature
df_sorted["num_過去3走重み付き標準化着順"] = (
    (df_sorted["num_1走前標準化着順"] * df_sorted["num_1走前重み"])
    + (df_sorted["num_2走前標準化着順"] * df_sorted["num_2走前重み"])
    + (df_sorted["num_3走前標準化着順"] * df_sorted["num_3走前重み"])
) / (df_sorted["num_1走前重み"] + df_sorted["num_2走前重み"] + df_sorted["num_3走前重み"])

# 2. Weighted average time difference between the horse and the 3 horses behind it
df_sorted["num_1走前後続馬平均タイム差"] = (
    df_sorted[["num_1走前後続馬1タイム差", "num_1走前後続馬2タイム差", "num_1走前後続馬3タイム差"]]
    .mean(axis=1)
    .fillna(0)
)
df_sorted["num_2走前後続馬平均タイム差"] = (
    df_sorted[["num_2走前後続馬1タイム差", "num_2走前後続馬2タイム差", "num_2走前後続馬3タイム差"]]
    .mean(axis=1)
    .fillna(0)
)
df_sorted["num_3走前後続馬平均タイム差"] = (
    df_sorted[["num_3走前後続馬1タイム差", "num_3走前後続馬2タイム差", "num_3走前後続馬3タイム差"]]
    .mean(axis=1)
    .fillna(0)
)
df_sorted["num_過去3走重み付き後続馬平均タイム差"] = (
    (df_sorted["num_1走前後続馬平均タイム差"] * df_sorted["num_1走前重み"])
    + (df_sorted["num_2走前後続馬平均タイム差"] * df_sorted["num_2走前重み"])
    + (df_sorted["num_3走前後続馬平均タイム差"] * df_sorted["num_3走前重み"])
) / (df_sorted["num_1走前重み"] + df_sorted["num_2走前重み"] + df_sorted["num_3走前重み"])

In [8]:
df_final = df_sorted[
    (df_sorted["cat_トラック種別"] != "障害")
    & (df_sorted["num_1走前着順"].notnull())
    & (df_sorted["num_2走前着順"].notnull())
    & (df_sorted["num_3走前着順"].notnull())
]

In [9]:
df_final

Unnamed: 0,meta_単勝払戻金,meta_複勝払戻金,meta_レースキー,meta_馬番,meta_血統登録番号,meta_発走日時,meta_単勝的中,meta_複勝的中,meta_複勝オッズ,meta_着順,...,num_2走前標準化着順,num_3走前標準化着順,num_1走前重み,num_2走前重み,num_3走前重み,num_過去3走重み付き標準化着順,num_1走前後続馬平均タイム差,num_2走前後続馬平均タイム差,num_3走前後続馬平均タイム差,num_過去3走重み付き後続馬平均タイム差
462840,0,0,08992301,03,96100370,1999-02-06 10:05:00+09:00,0,0,1.7,10.0,...,0.375000,0.428571,0.500000,0.062500,0.038462,0.676562,-1.000000,-1.166667,-1.566667,-1.053600
343401,0,0,08992401,09,96105434,1999-02-07 10:05:00+09:00,0,0,53.0,15.0,...,0.818182,0.727273,0.100000,0.058824,0.035714,0.784803,-1.333333,-0.300000,-0.666667,-0.898488
698678,0,0,08992401,01,96103587,1999-02-07 10:05:00+09:00,0,0,18.7,14.0,...,1.000000,0.916667,0.500000,0.062500,0.035714,0.995025,0.000000,0.000000,-0.400000,-0.023881
466183,0,0,08992411,02,96109217,1999-02-07 15:45:00+09:00,0,0,29.5,10.0,...,0.818182,0.866667,0.500000,0.062500,0.043478,0.421606,-0.333333,-3.000000,-0.400000,-0.613154
694678,0,0,08992412,06,93108209,1999-02-07 16:20:00+09:00,0,0,25.5,11.0,...,0.833333,0.714286,0.100000,0.058824,0.035714,0.794343,-0.400000,-0.350000,-0.133333,-0.335925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032015,0,0,06235912,05,18103535,2023-12-28 16:25:00+09:00,0,0,,4.0,...,0.153846,0.333333,0.004484,0.004098,0.003774,0.304044,-0.166667,-0.266667,-0.300000,-0.240555
1034923,3850,580,06235912,06,19100684,2023-12-28 16:25:00+09:00,1,1,,1.0,...,0.800000,0.764706,0.029412,0.005319,0.004630,0.665082,-0.300000,-0.400000,-0.133333,-0.293910
1035182,0,210,06235912,01,19101408,2023-12-28 16:25:00+09:00,0,1,,3.0,...,0.133333,0.235294,0.017857,0.011905,0.004082,0.368411,-0.233333,-0.366667,-0.066667,-0.260134
1037133,0,0,06235912,04,19105177,2023-12-28 16:25:00+09:00,0,0,,12.0,...,0.000000,0.200000,0.018182,0.007937,0.005319,0.033840,-0.600000,-0.066667,-0.400000,-0.431518


In [18]:
# Initialize GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Assume df['group'] is your grouping variable
# Splitting the data
for train_idx, test_idx in gss.split(df_final, groups=df_final["meta_レースキー"]):
    train_df = df_final.iloc[train_idx]
    test_df = df_final.iloc[test_idx]

# Now, train_df and test_df contain your split data, respecting group boundaries
X_train = train_df
y_train = train_df["meta_着順"].apply(lambda x: int(1.0 / x * 10) if x < 4 else 0)
groups_train = train_df.groupby("meta_レースキー").size().to_numpy()

X_test = test_df
y_test = test_df["meta_着順"].apply(lambda x: int(1.0 / x * 10) if x < 4 else 0)
groups_test = test_df.groupby("meta_レースキー").size().to_numpy()

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")
print(f"groups_train: {groups_train.shape}")
print(f"groups_test: {groups_test.shape}")

X_train: (664321, 122)
X_test: (166817, 122)
y_train: (664321,)
y_test: (166817,)
groups_train: (59352,)
groups_test: (14839,)


In [20]:
space = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.2),
    "num_leaves": hp.choice("num_leaves", list(range(20, 100))),
    "min_data_in_leaf": hp.choice("min_data_in_leaf", list(range(1, 50))),
    "feature_fraction": hp.uniform("feature_fraction", 0.5, 1.0),
    "bagging_fraction": hp.uniform("bagging_fraction", 0.5, 1.0),
    "max_depth": hp.choice("max_depth", list(range(3, 10))),
    # Conatant parameters
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [1, 3, 5],
    "verbose": -1,
    "seed": 42,
    "boosting_type": "gbdt",
}

In [53]:
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tempfile
import matplotlib.pyplot as plt
from JapanHorseRaceAnalytics.utilities.plot import plot_feature_importances


def create_objective_fn(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    groups_train: np.ndarray,
    groups_test: np.ndarray,
    experiment_name: str,
):
    def objective(params):
        mlflow.set_experiment(experiment_name=experiment_name)
        with mlflow.start_run():
            preprocessor = ColumnTransformer(
                transformers=[
                    (
                        "num",
                        StandardScaler(),
                        [
                            "num_複勝率",
                            "num_1走前経過日数",
                            "num_過去3走重み付き標準化着順",
                            "num_過去3走重み付き後続馬平均タイム差",
                            "mean_competitor_rating_pre_race",
                            "rating_pre_race",
                        ],
                    ),
                ],
                remainder="drop",
            )

            X_train_prep = preprocessor.fit_transform(X_train)
            X_test_prep = preprocessor.transform(X_test)
            # Prepare the LightGBM datasets
            train_dataset = lgb.Dataset(X_train_prep, label=y_train, group=groups_train)
            valid_dataset = lgb.Dataset(X_test_prep, label=y_test, group=groups_test, reference=train_dataset)
            # model = LGBMRanker(**params)
            # model.fit(
            #     X_train_prep,
            #     y_train,
            #     group=groups_train,
            #     # eval_set=[(X_test_prep, y_test)],
            #     # eval_group=[groups_test],
            #     # early_stopping_rounds=10,
            # )
            model = lgb.train(
                params,
                train_dataset,
                num_boost_round=1000,
                valid_sets=[valid_dataset],
                # early_stopping_rounds=10,
                # verbose_eval=100
            )

            y_pred = model.predict(X_test_prep)

            # Reshape y_test and y_pred to compute groupwise NDCG
            # y_test_reshaped = [
            #     y_test[i : i + size] for i, size in enumerate(groups_test)
            # ]
            # y_pred_reshaped = [
            #     y_pred[i : i + size] for i, size in enumerate(groups_test)
            # ]

            # # Compute NDCG scores for each group and take the average
            # ndcg_scores = [
            #     ndcg_score(np.asarray([true]), np.asarray([pred]))
            #     for true, pred in zip(y_test_reshaped, y_pred_reshaped)
            # ]
            # avg_ndcg_score = np.mean(ndcg_scores)

            # # Loss must be minimized
            # loss = -avg_ndcg_score
            # loss = -model.best_score["valid_0"]["ndcg@5"]

            mlflow.log_params(params)
            mlflow.log_metric("loss", loss)
            mlflow.sklearn.log_model(sk_model=model, artifact_path="model")

            # Feature Importances Plot
            fig, ax = plot_feature_importances(
                model.named_steps["ranker"].feature_name_,
                model.named_steps["ranker"].feature_importances_,
                top_n=50,
            )
            with tempfile.NamedTemporaryFile(
                prefix="feature_importance_", suffix=".png"
            ) as f:
                fig.savefig(f.name)
                plt.close()
                mlflow.log_artifact(f.name)

            # Feature Importances Data
            feature_importances = zip(
                model.named_steps["ranker"].feature_name_,
                model.named_steps["ranker"].feature_importances_,
            )
            feature_importances_df = (
                pd.DataFrame(feature_importances, columns=["feature", "importance"])
                .sort_values("importance", ascending=False)
                .reset_index(drop=True)
            )
            with tempfile.NamedTemporaryFile(
                prefix="feature_importance_", suffix=".csv"
            ) as f:
                feature_importances_df.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name)

            return {"loss": loss, "params": params, "status": STATUS_OK}

    return objective

In [54]:
# payouts = (
#     X_test[["meta_発走日時", "meta_複勝払戻金", "meta_複勝オッズ"]]
#     .reset_index(drop=True)
#     .rename(
#         columns={
#             "meta_発走日時": "発走日時",
#             "meta_複勝払戻金": "payout",
#             "meta_複勝オッズ": "odds",
#         }
#     )
#     .assign(発走日時=lambda x: pd.to_datetime(x["発走日時"]))
# )

experiment_name = "20240318-eda1-lambda"

if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

fn = create_objective_fn(
    X_train,
    y_train,
    X_test,
    y_test,
    groups_train,
    groups_test,
    experiment_name=experiment_name,
)

In [55]:
trials = Trials()
best = fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=1, trials=trials)
print(f"Best parameters: {best}")

# trials = SparkTrials(parallelism=3, spark_session=spark)
# fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=30, trials=trials)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

build_posterior_wrapper took 0.004403 seconds
TPE using 0 trials
job exception: Computing NDCG is only meaningful when there is more than 1 document. Got 1 instead.


  0%|          | 0/1 [00:11<?, ?trial/s, best loss=?]


ValueError: Computing NDCG is only meaningful when there is more than 1 document. Got 1 instead.