In [1]:
import os

import japanize_matplotlib
import keras_tuner as kt
import lightgbm as lgb
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import tensorflow as tf
import tqdm
from pyspark.sql import SparkSession
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs,
# please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam

from JapanHorseRaceAnalytics.utilities.base import get_base_dir, read_hive_table

# Single seed shared by every random number generator configured in this notebook.
random_state = 42

# Raise the pandas display limits to 100 rows and 200 columns.
for option_name, option_value in {
    "display.max_rows": 100,
    "display.max_columns": 200,
}.items():
    pd.set_option(option_name, option_value)

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning
# it here should only affect child processes, not hashing in the running kernel.
os.environ["PYTHONHASHSEED"] = str(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)

In [2]:
# Paths are built under whatever directory get_base_dir() returns.
warehouse_dir = f"{get_base_dir()}/spark-warehouse"
postgres_driver_path = f"{get_base_dir()}/jars/postgresql-42.7.1.jar"

# Spark settings, applied to the builder in insertion order. The PostgreSQL JDBC
# jar is registered via spark.jars and added to both driver and executor classpaths.
spark_settings = {
    "spark.driver.memory": "21g",
    "spark.sql.warehouse.dir": warehouse_dir,
    "spark.jars": postgres_driver_path,
    "spark.executor.extraClassPath": postgres_driver_path,
    "spark.driver.extraClassPath": postgres_driver_path,
}

session_builder = SparkSession.builder.appName("20240211_competitors")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Hive support is enabled; getOrCreate() returns an existing session when one is
# already running, otherwise it starts a new one with the settings above.
spark = session_builder.enableHiveSupport().getOrCreate()

In [None]:
# Read the curated feature table through the project helper. use_cache=False is
# passed straight through; its exact effect is defined by read_hive_table.
data = read_hive_table(
    spark_session=spark,
    schema="jhra_curated",
    table_name="features_20240217_v1",
    use_cache=False,
)

# Keep only rows whose cat_トラック種別 is not "障害", then renumber the index from 0.
# Rows are intentionally NOT filtered on meta_int_race_horses_異常区分 == "0":
# dropping them would mess up the number of horses in the race.
keep_row = data["cat_トラック種別"] != "障害"
data = data[keep_row].reset_index(drop=True)

data.head()

In [None]:
# X is the full frame and y is its meta_複勝的中 column.
# NOTE(review): X therefore still contains the target column (and every other
# non-feature column), so feature columns must be selected explicitly before
# fitting any model on X_train / X_test.
X, y = data, data["meta_複勝的中"]

# 80/20 random hold-out split, seeded with the notebook-wide random_state.
split_parts = train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition, one per line.
print(
    f"X_train: {X_train.shape}",
    f"X_test: {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Feature names are derived from the column-name prefixes of the training frame.
# Numerical features: columns whose name starts with "num_".
num_prefixed = X_train.filter(regex="^num_", axis="columns")
numerical_features = num_prefixed.columns.to_list()

# Categorical features: columns whose name starts with "cat_".
cat_prefixed = X_train.filter(regex="^cat_", axis="columns")
categorical_features = cat_prefixed.columns.to_list()

In [None]:
# Show the names of all X_train columns whose name contains "実績". The search
# runs over every column of X_train, not only those in categorical_features.
matching_columns = X_train.filter(regex="実績", axis="columns")
matching_columns.columns.to_list()

In [None]:
# LightGBM hyperparameters for the pipeline's classifier. They are defined first
# because Pipeline(...) below unpacks them at construction time; defining them
# afterwards raises NameError on a fresh kernel.
# NOTE(review): per the LightGBM parameter docs, `colsample_bytree`/`feature_fraction`,
# `reg_alpha`/`lambda_l1` and `reg_lambda`/`lambda_l2` are alias pairs, so only one
# value of each pair can take effect — confirm which values are intended and drop
# the others.
params = {
    "boosting_type": "gbdt",
    "class_weight": "balanced",
    "colsample_bytree": 0.8016642153767848,
    "feature_fraction": 0.5578235667548754,
    "lambda_l1": 2.551673582227088,
    "lambda_l2": 1.3506414200964172,
    "learning_rate": 0.02904727910263315,
    "max_depth": 10,
    "min_child_samples": 68,
    "min_child_weight": 7.736782598405014,
    "min_split_gain": 0.0071078853628913415,
    "n_estimators": 861,
    "num_leaves": 121,
    "objective": "binary",
    "reg_alpha": 0.25409327833670503,
    "reg_lambda": 0.4275373164043184,
    "seed": 42,
    "subsample": 0.9179630226670973,
    "verbose": -1,
}

# Candidate features: every numeric column of X_train except the target itself.
# X_train is the full frame, so the target column would otherwise be used as a
# feature whenever it has a numeric dtype.
# NOTE(review): other outcome-derived `meta_` columns with a numeric dtype are
# still selected here — confirm, or restrict this list to the `num_`-prefixed columns.
target_column = y_train.name
numeric_features = [
    column
    for column in X_train.select_dtypes("number").columns
    if column != target_column
]

# Only the numeric columns are scaled and passed on; ColumnTransformer drops all
# other columns by default. Categorical columns are not encoded yet (a
# OneHotEncoder step over categorical_features was sketched here but left disabled).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
    ]
)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", lgb.LGBMClassifier(**params)),
    ]
)

# Recursive feature elimination, removing one feature per iteration, ranked by a
# default-configured LightGBM classifier. It is fit on the numeric feature columns
# only: the full X_train also holds string-valued columns, which RFE's numeric
# input validation is expected to reject, as well as the target column.
rfe = RFE(estimator=lgb.LGBMClassifier(), step=1)
rfe.fit(X_train[numeric_features], y_train)