# Introduction

Random baseline: guess at random whether each horse's place bet (複勝) will hit, then calculate the resulting payout.

In [2]:
import re
import tempfile
import warnings

import lightgbm as lgb
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import STATUS_OK, SparkTrials, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sqlalchemy import create_engine

from JapanHorseRaceAnalytics.models.features_20240202_v1 import Features
from JapanHorseRaceAnalytics.utilities.base import get_base_dir, get_data_dir
from JapanHorseRaceAnalytics.utilities.metrics import (
    calculate_binary_classifier_statistics,
)
from JapanHorseRaceAnalytics.utilities.mlflow import get_colspecs
from JapanHorseRaceAnalytics.utilities.structured_logger import logger


# Widen pandas' display limits so wide/long frames are shown without truncation.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 100)

In [3]:
# Locations of the Hive warehouse and the PostgreSQL JDBC driver jar,
# both resolved relative to the project base directory.
warehouse_dir = f"{get_base_dir()}/spark-warehouse"
postgres_driver_path = f"{get_base_dir()}/jars/postgresql-42.7.1.jar"

# Spark settings, applied to the builder in the order listed here.
_spark_settings = {
    "spark.driver.memory": "21g",
    "spark.driver.maxResultSize": "5g",
    "spark.sql.warehouse.dir": warehouse_dir,
    "spark.jars": postgres_driver_path,
    "spark.executor.extraClassPath": postgres_driver_path,
    "spark.driver.extraClassPath": postgres_driver_path,
}

_spark_builder = SparkSession.builder.appName("20240211_competitors")
for _setting_name, _setting_value in _spark_settings.items():
    _spark_builder = _spark_builder.config(_setting_name, _setting_value)

# Hive support is required so that `read.table` can resolve warehouse tables.
spark = _spark_builder.enableHiveSupport().getOrCreate()

def read_hive_table(
    table_name: str,
    schema: str,
    spark_session: SparkSession,
    use_cache: bool = True,
):
    """Load a Hive table into pandas, going through a local parquet copy.

    The parquet copy lives at ``<data dir>/sql_tables/<table_name>.snappy.parquet``.
    When ``use_cache`` is true and that path already exists, it is read
    directly. Otherwise the table ``<schema>.<table_name>`` is read with
    ``spark_session``, written to that path (overwriting whatever is there),
    and then read back with pandas.

    Args:
        table_name: Name of the Hive table; also used for the parquet file name.
        schema: Hive schema (database) that contains the table.
        spark_session: Spark session used to read the table on a cache miss.
        use_cache: If False, always refresh the parquet copy from Hive.

    Returns:
        The result of ``pd.read_parquet`` on the parquet path.
    """
    parquet_path = get_data_dir() / "sql_tables" / f"{table_name}.snappy.parquet"

    needs_refresh = not (use_cache and parquet_path.exists())
    if needs_refresh:
        logger.info(f"Read from hive {schema}.{table_name}")
        hive_df = spark_session.read.table(f"{schema}.{table_name}")
        logger.info(f"Write to parquet {parquet_path}")
        hive_df.write.mode("overwrite").parquet(str(parquet_path))

    # Both the cache-hit and the refresh path end by loading the parquet copy.
    logger.info(f"Read from parquet {parquet_path} to pandas")
    return pd.read_parquet(parquet_path)


# Load the feature table; add `use_cache=False` to force a fresh read from Hive.
data = read_hive_table(
    table_name="features_20240217_v1",
    schema="jhra_curated",
    spark_session=spark,
)

# Only the `meta_` columns (race keys, results, payouts) are needed here.
meta_columns = [name for name in data.columns if name.startswith("meta_")]
data = data[meta_columns]
data.head()

24/02/17 15:58:34 WARN Utils: Your hostname, Hanks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.40.105 instead (on interface en0)
24/02/17 15:58:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/02/17 15:58:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/17 15:58:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/02/17 15:58:35 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/02/17 15:58:35 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
{"event": "Read from parquet /Users/hankehly/Projects/JapanHorseRaceAnalytics/data/sql_tables/features_20240217_v1.snappy.parquet to pandas", "level": "info", "timestamp": "2024-02-1

Unnamed: 0,meta_レースキー,meta_馬番,meta_着順,meta_本賞金,meta_単勝的中,meta_単勝払戻金,meta_複勝的中,meta_複勝払戻金,meta_int_races_レースキー,meta_発走日時,meta_場コード,meta_int_race_horses_レースキー,meta_int_race_horses_馬番,meta_int_race_jockeys_レースキー,meta_int_race_jockeys_馬番,meta_騎手コード,meta_int_race_trainers_レースキー,meta_int_race_trainers_馬番,meta_調教師コード,meta_int_combinations_レースキー,meta_int_combinations_馬番,meta_int_race_weather_レースキー
0,1011103,4,6.0,0.0,False,0,False,0,1011103,2001-08-04 01:45:00,1,1011103,4,1011103,4,10356,1011103,4,10263,1011103,4,1011103
1,1011103,9,2.0,200.0,False,0,True,120,1011103,2001-08-04 01:45:00,1,1011103,9,1011103,9,10366,1011103,9,10305,1011103,9,1011103
2,1011204,14,6.0,0.0,False,0,False,0,1011204,2001-08-05 02:15:00,1,1011204,14,1011204,14,10392,1011204,14,10219,1011204,14,1011204
3,1011303,6,3.0,130.0,False,0,True,1090,1011303,2001-08-11 01:45:00,1,1011303,6,1011303,6,10412,1011303,6,10303,1011303,6,1011303
4,1011304,7,1.0,510.0,True,230,True,120,1011304,2001-08-11 02:15:00,1,1011304,7,1011304,7,10076,1011304,7,10256,1011304,7,1011304


In [7]:
# Random baseline: predict hit / no-hit for every row with a fair coin flip,
# then score those predictions and compute the payout they would have produced.

# Fixed seed so the baseline numbers are reproducible after Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

y_test = data["meta_複勝的中"]  # boolean flag: did the place bet hit
y_pred = rng.choice([0, 1], size=len(y_test))  # 1 = bet on this row, 0 = no bet
y_pred_proba = np.full(len(y_test), 0.5)  # constant probability of a coin flip
payout = data["meta_複勝払戻金"]  # place-bet payout amount for the row


metrics = {
    # "loss": log_loss(y_test, y_pred_proba),
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    # NOTE: ROC AUC is computed from the hard 0/1 predictions, not from scores.
    "roc_auc": roc_auc_score(y_test, y_pred),
}

# Reuse `y_pred_proba` instead of repeating the 0.5 literal.
results = pd.DataFrame(
    {
        "actual": y_test,
        "pred": y_pred,
        "pred_proba_true": y_pred_proba,
        "payout": payout,
    }
)

# group_by=None: statistics are computed over all rows without grouping.
payout_all = calculate_binary_classifier_statistics(
    results, group_by=None, payout_column_name="payout"
)

In [8]:
# Show the classification metrics of the random baseline.
metrics

{'accuracy': 0.49981500966461706,
 'precision': 0.21142801465851824,
 'recall': 0.4998485656393303,
 'f1': 0.2971614496818582,
 'roc_auc': 0.4998272861124029}

In [10]:
# Show the payout statistics stored under the "*" key
# (presumably the overall, ungrouped bucket since group_by=None — TODO confirm).
payout_all["*"]

{'payout_rate': 73.91461207075474,
 'hit_rate': 21.15432164495211,
 'precision': 0.21142801465851824,
 'recall': 0.4998485656393303,
 'f1_score': 0.2971614496818582,
 'total_bets': 1108166,
 'total_hits': 234425,
 'bet_rate': 100.0,
 'total_payout_amount': 81909660.0,
 'total_bet_amount': 110816600}