In [1]:
import datetime as dt
import json
import os
import re

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from joblib import dump, load
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

from sqlalchemy import create_engine, text
from sqlalchemy import Column, Integer, Float, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.mysql import JSON

In [2]:
OUTPUT_PATH = "output"
MODE = "train"

## 学習データ取得

In [3]:
load_dotenv()
user = os.environ.get('DB_USER')
password = os.environ.get('DB_PASS')
host = os.environ.get('DB_HOST')
port = os.environ.get('DB_PORT')
database = os.environ.get('DB_NAME')
url = f'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}'

engine = create_engine(url)

query_races = 'SELECT * FROM races'
query_race_results = 'SELECT * FROM race_results'

with engine.connect() as connection:
    race_df = pd.read_sql_query(sql=text(query_races), con=connection)
    race_results_df = pd.read_sql_query(sql=text(query_race_results), con=connection)

## 特徴量化

In [4]:
def get_race_state_features(df):
    return_df = df.copy()
    return_df["race_course"] = df["race_state"].str[1]
    return_df["race_distance"] = df["race_state"].str[2:6]
    return_df["race_weather"] = df["race_state"].str[15]
    return_df["race_state"] = df["race_state"].str[23]
    return_df["race_start"] =  df["race_state"].str[32:37].str.replace(":", "")
    return return_df

def get_sex_and_age(df):
    return_df = df.copy()
    return_df["sex"] = return_df["sex_and_age"].str[0]
    return_df["age"] = return_df["sex_and_age"].str[1]
    return return_df

def get_horse_weight(df):
    return_df = df.copy()
    return_df["difference_weight"] = return_df["horse_weight"].str[3:]
    return_df["difference_weight"] = return_df["difference_weight"].replace(re.compile("\(|\)"), "", regex=True)
    return_df[return_df['difference_weight'] == ''] = -9999
    return_df["difference_weight"] = return_df["difference_weight"].astype(int)
    return_df[return_df['difference_weight'] == -9999] = None
    return_df["horse_weight"] = return_df["horse_weight"].str[0:3]
    return return_df

def get_date(df, mode):
    return_df = df.copy()
    return_df['date'] = return_df['date'].str.split(' ', expand=True)[0]
    if mode == "train":
        return_df['date'] = pd.to_datetime(return_df['date'], format='%Y年%m月%d日')
        return_df['day_of_year'] = return_df['date'].dt.day_of_year
    elif mode == "predict":
        return_df['date'] = pd.to_datetime(return_df['date'].str[:-3], format='%m月%d日')
        return_df['day_of_year'] = 2024
    return_df['date_cos'] = np.cos(2 * np.pi * return_df['day_of_year'] / return_df['day_of_year'].max())
    return_df['date_sin'] = np.sin(2 * np.pi * return_df['day_of_year'] / return_df['day_of_year'].max())
    return return_df

In [5]:
def get_all_features(race_df, race_results_df, mode):
    USE_COLUMNS = [
        "id", "race_name", "race_place", "number_of_entries", "race_state", "date",
        "box", "horse_order", "sex_and_age", "burden_weight",
        "jockey", "horse_weight", "horse_trainer", "horse_owner"
    ]

    if mode == "train":
        merge_df = pd.merge(race_df, race_results_df, on='id', how='left').dropna(subset=["id"])
        USE_COLUMNS.append("rank")
        use_df = merge_df[USE_COLUMNS]
    elif mode == "predict":
        merge_df = pd.merge(race_df, race_results_df, left_on='id', right_on='race_id', how='left').dropna(subset=["id"])
        use_df = merge_df[USE_COLUMNS]
    else:
        raise ValueError("Unsupported mode. Use 'train' or 'predict'.")

    df = use_df.copy()
    df = get_race_state_features(df)
    df = get_sex_and_age(df)
    df = get_horse_weight(df)
    df = get_date(df, mode)
    
    return df

In [6]:
def label_encoder(df, cols, mode, output_path):
    return_df = df.copy()
    return_df[cols] = return_df[cols].fillna('missing')

    oe = preprocessing.OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )

    categories_filename = os.path.join(output_path, "categories.joblib")

    if mode == "train":
        return_df[cols] = oe.fit_transform(return_df[cols])
        dump(oe, categories_filename)
    elif mode == "predict":
        loaded_encoder = load(categories_filename)
        return_df[cols] = loaded_encoder.transform(return_df[cols])
    else:
        raise ValueError("Unsupported mode. Use 'train' or 'predict'.")
    return_df = return_df.replace({'nan': np.nan})
    return return_df

In [7]:
def clean_df(df, int_columns, float_columns, mode):
    return_df = df.copy()
    if mode == "train":
        return_df['rank'] = return_df['rank'].replace({'1': 1, '2': 1, '3': 1})
        return_df.loc[~(return_df['rank'] == 1), 'rank'] = 0

    for col in int_columns:
        return_df[col] = pd.to_numeric(return_df[col], errors='coerce').fillna(0).astype('int64')
    for col in float_columns:
        return_df[col] = pd.to_numeric(return_df[col], errors='coerce')
    return return_df

In [8]:
def preprocess_data(df, mode, output_path):
    ENCODING_COLUMNS = [
        "race_name", "race_place",
        "race_state", "race_course", "race_weather",
        "sex_and_age", "sex",
        "jockey", "horse_trainer", "horse_owner"
    ]
    
    INT_COLUMNS = [
        "id", "box", "horse_order", "horse_weight", "race_distance",
        "race_start", "age", "day_of_year", "number_of_entries",
        "difference_weight", "day_of_year"
    ]
    if mode == "train":
        INT_COLUMNS.append("rank")
    
    FLOAT_COLUMNS =[
        "burden_weight"
    ]
    
    encoded_df = label_encoder(df, ENCODING_COLUMNS, mode, output_path)
    
    cleaned_df = clean_df(encoded_df, INT_COLUMNS, FLOAT_COLUMNS, mode)
    
    return cleaned_df

In [9]:
feature_engineered_df = get_all_features(race_df, race_results_df, MODE)
preprocessed_df = preprocess_data(feature_engineered_df, MODE, OUTPUT_PATH)

## 学習結果保存

In [10]:
def evaluate_model_performance(model, X_test, y_test, version, output_dir):
    y_pred = model.predict_proba(X_test)[:, 1]
    y_pred_binary = model.predict(X_test)
    
    auc_score = roc_auc_score(y_test, y_pred)
    print(f'Test AUC Score: {auc_score}')
    
    cm = confusion_matrix(y_test, y_pred_binary)
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('True')
    cm_filename = os.path.join(output_dir, f"{version}_confusion_matrix.png")
    plt.savefig(cm_filename)
    plt.close()

    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    rc_filename = os.path.join(output_dir, f"{version}_roc_curve.png")
    plt.savefig(rc_filename)
    plt.close()

    feature_importances = pd.DataFrame(model.feature_importances_,
                                        index = X_test.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
    print(feature_importances)

    lgbm.plot_importance(model, importance_type='split', max_num_features=10)
    plt.title('Feature Importance')
    fi_filename = os.path.join(output_dir, f"{version}_feature_importance.png")
    plt.savefig(fi_filename)
    plt.close()

    evaluation_results = {
        'importances': feature_importances, 
        'AUC': auc_score,
        'TP': int(cm[1][1]),
        'FP': int(cm[0][1]),
        'FN': int(cm[1][0]),
        'TN': int(cm[0][0]),
        'FPR': fpr, 
        'TPR': tpr,
        'memo': 'memo',
        'version': version,
    }

    return evaluation_results

In [11]:
Base = declarative_base()

class ModelEvaluation(Base):
    __tablename__ = 'model_evaluation'
    
    id = Column(Integer, primary_key=True)
    feature_importance_json = Column(JSON)
    TP = Column(Integer)
    FP = Column(Integer)
    FN = Column(Integer)
    TN = Column(Integer)
    FPR = Column(JSON)
    TPR = Column(JSON)
    AUC = Column(Float)
    memo = Column(Text)
    version = Column(String(255))
    created_date = Column(DateTime, default=dt.datetime.utcnow)

def save_evaluation(evaluation_results, engine):
    Base.metadata.create_all(engine)
    
    Session = sessionmaker(bind=engine)
    session = Session()

    new_evaluation = ModelEvaluation(
        feature_importance_json=evaluation_results["importances"].to_json(),
        TP=evaluation_results["TP"],
        FP=evaluation_results["FP"],
        FN=evaluation_results["FN"],
        TN=evaluation_results["TN"],
        FPR=json.dumps(evaluation_results["FPR"].tolist()),
        TPR=json.dumps(evaluation_results["TPR"].tolist()),
        AUC=evaluation_results["AUC"],
        memo=evaluation_results["memo"],
        version=evaluation_results["version"],
        created_date=dt.datetime.utcnow()
    )
    
    session.add(new_evaluation)
    session.commit()
    session.close()

  Base = declarative_base()


## 検証

In [12]:
train_dates = [dt.datetime(2018, 12, 31), dt.datetime(2019, 12, 31), dt.datetime(2020, 12, 31), dt.datetime(2021, 12, 31), dt.datetime(2022, 12, 31)]
val_dates = [dt.datetime(2019, 12, 31), dt.datetime(2020, 12, 31), dt.datetime(2021, 12, 31), dt.datetime(2022, 12, 31), dt.datetime(2023, 12, 31)]
test_dates = [dt.datetime(2020, 12, 31), dt.datetime(2021, 12, 31), dt.datetime(2022, 12, 31), dt.datetime(2023, 12, 31), dt.datetime(2024, 12, 31)]

def split_df(df, train_date, val_date, test_date):
    return_df = df.copy()
    train_df = return_df[return_df['date'] < train_date].drop('date', axis=1)
    val_df = return_df[(return_df['date'] > train_date) & (return_df['date'] < val_date)].drop('date', axis=1)
    test_df = return_df[(return_df['date'] > val_date) & (return_df['date'] < test_date)].drop('date', axis=1)
    return train_df, val_df, test_df

def split_target(df):
    return_df = df.copy()
    X = return_df.drop('rank', axis=1)
    y = return_df['rank']
    return X, y

for i in range(5):
    print(f"fold_{i}---------------------")
    train_df, val_df, test_df = split_df(preprocessed_df, train_dates[i], val_dates[i], test_dates[i])

    X_train, y_train = split_target(train_df)
    X_val, y_val = split_target(val_df)
    X_test, y_test = split_target(test_df)

    train_set = lgbm.Dataset(X_train, y_train)
    val_set = lgbm.Dataset(X_val, y_val)

    params = {
        "objective": "binary",
        "metric": "auc",
        'boosting_type': 'gbdt',
        'n_estimators': 10000,
        'random_state': 74,
    }

    clf = lgbm.LGBMClassifier(**params)

    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            callbacks=[lgbm.early_stopping(stopping_rounds=100, verbose=True), lgbm.log_evaluation(10)])
    
    evaluation_results = evaluate_model_performance(clf, X_test, y_test, str(i), OUTPUT_PATH)
    save_evaluation(evaluation_results, engine)

fold_0---------------------
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.725888	valid_1's auc: 0.668764
[20]	training's auc: 0.75588	valid_1's auc: 0.682515
[30]	training's auc: 0.776103	valid_1's auc: 0.688889
[40]	training's auc: 0.793976	valid_1's auc: 0.690527
[50]	training's auc: 0.808807	valid_1's auc: 0.692421
[60]	training's auc: 0.82016	valid_1's auc: 0.69265
[70]	training's auc: 0.831724	valid_1's auc: 0.69288
[80]	training's auc: 0.842345	valid_1's auc: 0.689974
[90]	training's auc: 0.852332	valid_1's auc: 0.688711
[100]	training's auc: 0.86037	valid_1's auc: 0.687754
[110]	training's auc: 0.868594	valid_1's auc: 0.68368
[120]	training's auc: 0.875331	valid_1's auc: 0.680024
[130]	training's auc: 0.883289	valid_1's auc: 0.677443
[140]	training's auc: 0.88939	valid_1's auc: 0.675278
[150]	training's auc: 0.896548	valid_1's auc: 0.674036
[160]	training's auc: 0.902354	valid_1's auc: 0.674389
Early stopping, best iteration is:
[69]	traini

## 全てのデータで学習

In [13]:
X_train, y_train = split_target(preprocessed_df.drop('date', axis=1))

train_set = lgbm.Dataset(X_train, y_train)

params = {
    "objective": "binary",
    "metric": "auc",
    'boosting_type': 'gbdt',
    'n_estimators': 120,
    'random_state': 74,
}

clf = lgbm.LGBMClassifier(**params)

clf.fit(X_train, y_train)

In [15]:
model_filename = os.path.join(OUTPUT_PATH, "model.joblib")
dump(clf, model_filename)

['output\\model.joblib']

# 予測フロー

In [27]:
MODE = "predict"

## 開催予定レースデータ取得(RDSから)

In [28]:
def load_new_data():
    query_weekly_races = 'SELECT * FROM weekly_races'
    query_race_entries = 'SELECT * FROM race_entries'

    with engine.connect() as connection:
        weekly_races_df = pd.read_sql_query(sql=text(query_weekly_races), con=connection)
        race_entries_df = pd.read_sql_query(sql=text(query_race_entries), con=connection)

    return weekly_races_df, race_entries_df

In [29]:
weekly_races_df, race_entries_df = load_new_data()

## 特徴量化

In [30]:
feature_engineered_df = get_all_features(weekly_races_df, race_entries_df, MODE)
preprocessed_df = preprocess_data(feature_engineered_df, MODE, OUTPUT_PATH).drop('date', axis=1)

## 学習済みモデル取得(S3から)

In [31]:
def load_model(output_path):
    model_path = os.path.join(output_path, "model.joblib")
    
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")
    
    model = load(model_path)
    return model

In [32]:
model = load_model(OUTPUT_PATH)

## 予測

In [33]:
predictions = model.predict_proba(preprocessed_df)[:, 1]

## 予測結果保存(RDSに)

In [34]:
def add_predict_proba_column(engine, table_name='race_entries', column_name='predict_proba'):
    Base.metadata.reflect(engine)
    table = Base.metadata.tables[table_name]
    
    if column_name not in table.c:
        with engine.connect() as conn:
            sql_statement = text(f'ALTER TABLE {table_name} ADD COLUMN {column_name} FLOAT')
            conn.execute(sql_statement)
            print(f"Added '{column_name}' column to '{table_name}' table.")
    else:
        print(f"Column '{column_name}' already exists in '{table_name}' table.")

In [35]:
def save_predictions(race_entries_df, predictions, table_name='race_entries', column_name='predict_proba'):
    add_predict_proba_column(engine, table_name, column_name)

    race_entries_df[column_name] = predictions

    race_entries_df.to_sql(table_name, con=engine, if_exists='replace', index=False)
    print(f"Saved predictions to '{table_name}' table.")

In [36]:
save_predictions(race_entries_df, predictions)

Column 'predict_proba' already exists in 'race_entries' table.
Saved predictions to 'race_entries' table.
