In [1]:
#インポート文
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from lightgbm import LGBMRanker

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
#データのインポート
race_df = pd.read_csv("../data/main/race_data.csv",parse_dates=['datetime'])
horse_df = pd.read_csv("../data/main/horse_data.csv")

In [3]:
#データのマージ
race_df_for_merge = race_df[['race_id',
                             "race_round",
                             "race_title",
                             "weather",
                             "ground_status",
                             "where_racecourse",
                             "total_horse_number",
                             "frame_number_first",
                             "horse_number_first",
                             "frame_number_second",
                             "horse_number_second",
                             "frame_number_third",
                             "horse_number_third",
                             "tansyo",
                             "hukusyo_first",
                             "hukusyo_second",
                             "hukusyo_third",
                             "wakuren",
                             "umaren",
                             "wide_1_2",
                             "wide_1_3",
                             "wide_2_3",
                             "umatan",
                             "renhuku3",
                             "rentan3",
                             "is_obstacle",
                             "ground_type",
                             "is_left_right_straight",
                             "distance",
                             "weather_rain",
                             "weather_snow",
                             "datetime"
                            ]]

merged_horse_df = pd.merge(horse_df, race_df_for_merge, on='race_id')

In [4]:
#datetimeをUnix時間に変換してint32に変換する
unix_time = merged_horse_df['datetime'].astype('int64')
unix_time_int32 = unix_time.astype('int32')

merged_horse_df['datetime'] = unix_time_int32

In [5]:
#エンコード
for column in ['race_title',
               'weather',
               'where_racecourse',
               'ground_type',
               'is_left_right_straight']:
    le = LabelEncoder()
    le.fit(merged_horse_df[column])
    merged_horse_df[column] = le.transform(merged_horse_df[column])

In [6]:
#使う特徴量の設定
feature = ["race_id",
#            "rank",
           "frame_number",
           "horse_number",
           "horse_id",
           "sex_and_age",
           "burden_weight",
           "rider_id",
           "goal_time",
           "goal_time_dif",
           "half_way_rank",
           "last_time",
           "odds",
           "popular",
           "horse_weight",
           "tamer_id",
           "owner_id",
           "is_down",
           "is_senba",
           "is_mesu",
           "is_osu",
           "horse_weight_dif",
           "burden_weight_rate",
           "avg_velocity",
           
             "race_round",
             "race_title",
             "weather",
             "ground_status",
             "where_racecourse",
             "total_horse_number",
             "frame_number_first",
             "horse_number_first",
             "frame_number_second",
             "horse_number_second",
             "frame_number_third",
             "horse_number_third",
             "tansyo",
             "hukusyo_first",
             "hukusyo_second",
             "hukusyo_third",
             "wakuren",
             "umaren",
             "wide_1_2",
             "wide_1_3",
             "wide_2_3",
             "umatan",
             "renhuku3",
             "rentan3",
             "is_obstacle",
             "ground_type",
             "is_left_right_straight",
             "distance",
             "weather_rain",
             "weather_snow",
             "datetime"
            ]

target = ['rank']

X = merged_horse_df[feature]
y = merged_horse_df[target]

In [7]:
#LGBMで使用するグループの設定
train_baskets = merged_horse_df.groupby(["race_id"])["horse_id"].count().values

In [8]:
#LGBMRankerの設定
ranker = LGBMRanker(boosting_type="gbdt", 
                    num_leaves=31, 
                    learning_rate=0.05, 
                    n_estimators=20, 
                    random_state=42)

In [9]:
#正確度を入れるリスト
auc_scores = []

#ランダムステートを変更し、kfoldを繰り返す
for i in range(5):
    
    #予測結果を入れるリスト
    pred_df_list = []
    
    #kflodの設定
    kfold = KFold(n_splits=5, shuffle=True, random_state=i)
    
    #kflodで学習用と検証用に分ける
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # ここでtrain_basketsを計算する
        train_baskets = X_train.groupby(["race_id"])["horse_id"].count().values

        # モデルの学習
        model = ranker.fit(X_train,
                           y_train,
                           group=train_baskets)

        # モデルの予測
        y_pred = ranker.predict(X_val)

        #結果を入れるデータフレームを用意
        pred_df_fold = pd.DataFrame({
           "race_id": X_val['race_id'],
           "horse_id": X_val['horse_id'],
           "rank":y_val['rank'],
           "pred": y_pred
           })
        
        #そのデータフレームをリストに保存
        pred_df_list.append(pred_df_fold)

    # 各foldの予測結果を結合してpred_dfを作成
    pred_df = pd.concat(pred_df_list, axis=0).reset_index(drop=True)
    
    #予測した結果をrace_idごとでグループ化し順位をつける
    pred_df['pred_rank'] = pred_df.groupby('race_id')['pred'].rank(method='min', ascending=False)
    
    #3位以内ならprizeを1にする
    pred_df['rank_prize'] = pred_df['rank'].apply(lambda x: 1 if x <= 3 else 0)
    pred_df['pred_rank_prize'] = pred_df['pred_rank'].apply(lambda x: 1 if x <= 3 else 0)
    
    #正確度を計算
    auc_score = metrics.accuracy_score(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    #auc_scoresのリストに保存
    auc_scores.append(auc_score)

In [10]:
#各ランダムステートの正確度平均と標準偏差を出力
avg_auc_score = np.mean(auc_scores)
std_score = np.std(auc_scores)
print(f"Average AUC score: {avg_auc_score:.4f}")
print(f"Std score: {std_score:.4f}")

Average AUC score: 0.5801
Std score: 0.0007
