In [1]:
#pip install google-cloud-bigquery これ必要
#pip install pydata_google_auth
#pip install db-dtypes

In [2]:
#インポート文
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from lightgbm import LGBMRanker

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix

import optuna

import pydata_google_auth
from google.cloud import bigquery 

import pickle

In [3]:
credentials = pydata_google_auth.get_user_credentials(
    ['https://www.googleapis.com/auth/bigquery'],
)

In [4]:
client = bigquery.Client(project='keiba-381006', credentials=credentials)

In [5]:
pre_race_query = '''
    SELECT * 
    FROM osaka.race_table
    '''
pre_race_df = client.query(pre_race_query).to_dataframe()

In [6]:
pd.set_option('display.max_columns', 100)
pre_race_df.head(1)

Unnamed: 0,race_id,race_round,race_title,weather,ground_status,date,where_racecourse,total_horse_number,frame_number_first,horse_number_first,frame_number_second,horse_number_second,frame_number_third,horse_number_third,tansyo,hukusyo_first,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3,is_obstacle,ground_type,is_left_right_straight,distance,datetime
0,201709020411,11,第61回大阪杯(G1),0,1,2017-04-02,9,14,4,5,3,4,8,13,240,110,280,280,2130,2320,540,480,1800,3390,4830,23910,0,1,1,2000,2017-04-02 15:40:00+00:00


In [7]:
pre_race_horse_query = '''
    SELECT * 
    FROM osaka.horse_table 
    '''
pre_race_horse_df = client.query(pre_race_horse_query).to_dataframe()

In [8]:
pd.set_option('display.max_columns', 100)
pre_race_horse_df.head(1)

Unnamed: 0,race_id,rank,frame_number,horse_number,horse_id,burden_weight,rider_id,goal_time,goal_time_dif,half_way_rank,last_time,odds,popular,horse_weight,tamer_id,owner_id,is_down,age,sex,horse_weight_dif,burden_weight_rate,avg_velocity
0,202009020411,2,8,12,2016104750,55,1102,118.4,0.0,3.0,34,5.2,4,454,1151,226800,0,4,1,-6,0.121145,16.891892


In [9]:
all_horse_query = '''
    SELECT * 
    FROM osaka.horse_info_table
    '''
all_horse_df = client.query(all_horse_query).to_dataframe()

In [10]:
pd.set_option('display.max_columns', 100)
all_horse_df.head(1)

Unnamed: 0,horse_id,bday,tame_id,owner_id,producer_id,production_area,auction_price,winnings,lifetime_record,wined_race_title,inbreeding_1,inbreeding_2,father,faths_father,faths_mother,mother,moths_father,moths_mother
0,2012104463,2012年3月17日,1128,471033,393126,千歳市,928万円 (2014年 千葉サラブレッド・セール),"2億3,061万円 (中央)",59戦9勝 [9-8-5-37],201803020811,2011104480,0,2003102205,000a000d77,1997102385,2004103210,1998101516,000a00fa62


In [11]:
all_horse_race_query = '''
    SELECT * 
    FROM osaka.horse_race_table
    '''
all_horse_race_df = client.query(all_horse_race_query).to_dataframe()

In [12]:
pd.set_option('display.max_columns', 100)
all_horse_race_df.head(1)

Unnamed: 0,date,where_racecourse,weather,race_round,race_title,race_id,total_horse_number,frame_number,horse_number,odds,popular,rank,rider_id,burden_weight,distance,ground_status,goal_time,goal_time_dif,half_way_rank,pace,last_time,horse_weight,runner_up_horse_id,prize,horse_id,target_race_id,race_rank,horse_weight_dif,ground_type,race_date
0,2014-11-01,8,1,9,萩S(OP),201408040809,6,4,4,5.8,3,5,1018,55.0,1800,1,1:48.0,0.4,6-6,35.9-34.8,34.3,448,2012105703,150.0,2012104463,201809020411,0,0,1,2018-04-01


In [13]:
today_race_query = '''
    SELECT * 
    FROM osaka.targetrace_table
    '''
today_race_df = client.query(today_race_query).to_dataframe()

In [14]:
pd.set_option('display.max_columns', 100)
today_race_df.head(1)

Unnamed: 0,race_id,race_title,date,race_round,weather,ground_status,where_racecourse,total_horse_number,race_rank,is_obstacle,ground_type,is_left_right_straight,distance,datetime
0,202309020411,大阪杯(G1),2023-04-02,11,0,1,9,16,3,0,1,1,2000,2023-04-02 15:40:00+00:00


In [15]:
today_race_horse_query = '''
    SELECT * 
    FROM osaka.targethorse_table
    '''
today_race_horse_df = client.query(today_race_horse_query).to_dataframe()

In [16]:
pd.set_option('display.max_columns', 100)
today_race_horse_df.head(1)

Unnamed: 0,race_id,frame_number,horse_number,horse_id,burden_weight,rider_id,tamer_id,horse_weight,odds,popular,horse_name,age,sex,horse_weight_dif,burden_weight_rate
0,202309020411,1,1,2018105081,56,1174,1151,472,,,ジェラルディーナ,5,1,2,0.118644


In [17]:
pre_race_horse_columns = ['rank',
                         'race_id',
                          'horse_id',
                          'rider_id',
                          'horse_number',
                          'age',
                          'sex',
#                           'odds',
#                           'popular',
                          'horse_weight'
                         ]
today_race_horse_columns = [
                         'race_id',
                          'horse_id',
                          'rider_id',
                          'horse_number',
                          'age',
                          'sex',
#                           'odds',
#                           'popular',
                          'horse_weight'
                         ]
df_main = pre_race_horse_df[pre_race_horse_columns]
df_target = today_race_horse_df[today_race_horse_columns]

In [18]:
#データのマージ
df_tmp_1 = pre_race_df[['race_id',
                      'ground_status',
                     ]]
df_main_1 = pd.merge(df_main, df_tmp_1, on='race_id', how='left')

df_tmp_1 = today_race_df[['race_id',
                      'ground_status',
                     ]]
df_target_1 = pd.merge(df_target, df_tmp_1, on='race_id', how='left')

In [19]:
# データのマージ
df_tmp_2 = all_horse_df[['horse_id',
                       'tame_id',
                       'inbreeding_1',
                       'inbreeding_2'
                     ]]
#重複データの削除
df_tmp_2 = df_tmp_2.drop_duplicates(subset=['horse_id'])

df_main_2 = pd.merge(df_main_1, df_tmp_2, on='horse_id', how='left')
df_target_2 = pd.merge(df_target_1, df_tmp_2, on='horse_id', how='left')

In [20]:
df_main_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 0 to 84
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   rank           85 non-null     Int64 
 1   race_id        85 non-null     Int64 
 2   horse_id       85 non-null     Int64 
 3   rider_id       85 non-null     Int64 
 4   horse_number   85 non-null     Int64 
 5   age            85 non-null     Int64 
 6   sex            85 non-null     Int64 
 7   horse_weight   85 non-null     Int64 
 8   ground_status  85 non-null     Int64 
 9   tame_id        85 non-null     object
 10  inbreeding_1   85 non-null     object
 11  inbreeding_2   85 non-null     Int64 
dtypes: Int64(10), object(2)
memory usage: 9.5+ KB


##### ラベルエンコーディング

In [21]:
#df_target_2['horse_weight'] = df_target_2['horse_weight'].astype(float)

In [22]:
df_target_2['rank']=0

In [23]:
df_merged = pd.concat([df_target_2, df_main_2], axis=0)

In [24]:
le = LabelEncoder()
df_merged = pd.concat([df_target_2, df_main_2], axis=0)

In [25]:
tame_id = le.fit(df_merged['tame_id'])
df_main_2['tame_id'] = le.transform(df_main_2['tame_id'])
df_target_2['tame_id'] = le.transform(df_target_2['tame_id'])

inbreeding_1 = le.fit(df_merged['inbreeding_1'])
df_main_2['inbreeding_1'] = le.transform(df_main_2['inbreeding_1'])
df_target_2['inbreeding_1'] = le.transform(df_target_2['inbreeding_1'])

inbreeding_2 = le.fit(df_merged['inbreeding_2'])
df_main_2['inbreeding_2'] = le.transform(df_main_2['inbreeding_2'])
df_target_2['inbreeding_2'] = le.transform(df_target_2['inbreeding_2'])

In [26]:
df_main_2['rank'] = df_main_2['rank'].astype('int64')
df_main_2['race_id'] = df_main_2['race_id'].astype('int64')
df_main_2['horse_id'] = df_main_2['horse_id'].astype('int64')
df_main_2['rider_id'] = df_main_2['rider_id'].astype('int64')
df_main_2['horse_number'] = df_main_2['horse_number'].astype('int64')
df_main_2['age'] = df_main_2['age'].astype('int64')
df_main_2['sex'] = df_main_2['sex'].astype('int64')
# df_main_2['popular'] = df_main_2['popular'].astype('int64')
df_main_2['ground_status'] = df_main_2['ground_status'].astype('int64')

In [27]:
df_target_2['rank'] = df_target_2['rank'].astype('int64')
df_target_2['race_id'] = df_target_2['race_id'].astype('int64')
df_target_2['horse_id'] = df_target_2['horse_id'].astype('int64')
df_target_2['rider_id'] = df_target_2['rider_id'].astype('int64')
df_target_2['horse_number'] = df_target_2['horse_number'].astype('int64')
df_target_2['age'] = df_target_2['age'].astype('int64')
df_target_2['sex'] = df_target_2['sex'].astype('int64')
# df_target_2['popular'] = df_target_2['popular'].astype('int64')
df_target_2['ground_status'] = df_target_2['ground_status'].astype('int64')

In [28]:
#使う特徴量の設定
id = {"race_id",
      "horse_id"
    }

feature = [
           "race_id",
           "horse_id",
           "rider_id",
           "horse_number",
           "age",
           "sex",
#             "odds",
#             'popular',
            'horse_weight',
            'ground_status',
            'tame_id',
            'inbreeding_1',
            'inbreeding_2'
            ]

target = ['rank']

X_id = df_main_2[feature]
X_id['horse_weight'] = X_id['horse_weight'].astype('int64')
X_id['tame_id'] = X_id['tame_id'].astype('int64')
X_id['inbreeding_1'] = X_id['inbreeding_1'].astype('int64')
y = df_main_2[target]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_id['horse_weight'] = X_id['horse_weight'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_id['tame_id'] = X_id['tame_id'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_id['inbreeding_1'] = X_id['inbreeding_1'].astype('int64')


In [29]:
#使う特徴量の設定
id = {"race_id",
      "horse_id"
    }

feature = [
           "race_id",
           "horse_id",
           "rider_id",
           "horse_number",
           "age",
           "sex",
#             "odds",
#             'popular',
            'horse_weight',
            'ground_status',
            'tame_id',
            'inbreeding_1',
            'inbreeding_2'
            ]

target = ['rank']

today_race_X = df_target_2[feature]
today_race_X['horse_weight'] = today_race_X['horse_weight'].astype('int64')
today_race_X['tame_id'] = today_race_X['tame_id'].astype('int64')
today_race_X['inbreeding_1'] = today_race_X['inbreeding_1'].astype('int64')
today_race_X.to_csv("../data/main/today.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_race_X['horse_weight'] = today_race_X['horse_weight'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_race_X['tame_id'] = today_race_X['tame_id'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_race_X['inbreeding_1'] = today_race_X['inbreeding_1'].ast

In [31]:
today_race_X_with_name = pd.merge(today_race_X, today_race_horse_df[['horse_id', 'horse_name']], on='horse_id', how='left')
today_race_X.to_csv("../data/main/today_withname.csv",index=False)

In [62]:
pred_df = pd.DataFrame()
#スコア結果を入れるリスト
reports = []
auc_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
support_scores = []

#予測結果を入れるリスト
pred_df_list = []

def objective(trial):
    
# Optuna
    # ハイパーパラメータの探索範囲
    params = {
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
        'verbose': -1
    }
    
    #LGBMRankerの設定
    ranker = LGBMRanker(**params)

# K-fold
    #kflodの設定
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    #kflodで学習用と検証用に分ける
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_id, y)):
        #id入りの学習データ
        X_train_id, X_val_id = X_id.iloc[train_idx], X_id.iloc[val_idx]
        
        #id抜きの学習データ
        X = X_id.drop(id, axis =1)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        
        #検証データ
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Fit
        # train_basketsを計算
        train_baskets = X_train_id.groupby(["race_id"])["horse_id"].count().values

        # モデルの学習
        model = ranker.fit(X_train,
                           y_train,
                           group=train_baskets)

# Predict
        # モデルの予測
        y_pred = model.predict(X_val)

        #結果を入れるデータフレームを用意
        pred_df_fold = pd.DataFrame({
           "race_id": X_val_id['race_id'],
           "horse_id": X_val_id['horse_id'],
           "rank":y_val['rank'],
           "pred": y_pred
           })

        #そのデータフレームをリストに保存
        pred_df_list.append(pred_df_fold)

# Rank
    # 各foldの予測結果を結合してpred_dfを作成
    pred_df = pd.concat(pred_df_list, axis=0).reset_index(drop=True)

    #予測した結果をrace_idごとでグループ化し順位をつける
    pred_df['pred_rank'] = pred_df.groupby('race_id')['pred'].rank(method='min', ascending=False)

    #3位以内ならprizeを1にする
    pred_df['rank_prize'] = pred_df['rank'].apply(lambda x: 1 if x <= 3 else 0)
    pred_df['pred_rank_prize'] = pred_df['pred_rank'].apply(lambda x: 1 if x <= 3 else 0)

#Score
    #正確度を計算
    auc_score = metrics.accuracy_score(pred_df['rank_prize'], pred_df['pred_rank_prize'])
        
    #auc_scoresのリストに保存
    auc_scores.append(auc_score)
    
    #f1を計算            
    f1 = f1_score(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    
    #reportを計算
    report = metrics.classification_report(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    report_num = metrics.precision_recall_fscore_support(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    reports.append(report)

    #reportから情報をとる
    precision_scores.append(report_num[0])
    recall_scores.append(report_num[1])
    f1_scores.append(report_num[2])
    support_scores.append(report_num[3])
    
    return f1

In [65]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2023-04-02 15:47:20,434][0m A new study created in memory with name: no-name-008cca55-513c-4aab-95e1-a3e42bc37689[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:20,827][0m Trial 0 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 94, 'learning_rate': 0.06945264036293636, 'feature_fraction': 0.8336700300495288, 'bagging_fraction': 0.2928339063891271, 'bagging_freq': 9, 'lambda_l1': 3.1982358688464707, 'lambda_l2': 4.602533643232549}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:21,206][0m Trial 1 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 38, 'learning_rate': 0.013106338093601518, 'feature_fraction': 0.37488412137919425, 'bagging_fraction': 0.2987301575633059, 'bagging_freq': 10, 'lambda_l1': 6.347728447581765, 'lambda_l2': 7.200989549339676}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:21,582][0m Trial 2 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 81, 'learning_rate': 0.06696176793227346, 'feature_fraction': 0.6344111669727113, 'bagging_fraction': 0.504137796101827, 'bagging_freq': 2, 'lambda_l1': 2.4104362813666524, 'lambda_l2': 8.063652623163868}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:22,011][0m Trial 3 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 40, 'learning_rate': 0.05409497025709154, 'feature_fraction': 0.9245271533222009, 'bagging_fraction': 0.22673303638332037, 'bagging_freq': 3, 'lambda_l1': 2.902858507728678, 'lambda_l2': 4.8015666671363775}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:22,404][0m Trial 4 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 42, 'learning_rate': 0.047813163047734204, 'feature_fraction': 0.48126507136360375, 'bagging_fraction': 0.6564097258337149, 'bagging_freq': 1, 'lambda_l1': 9.865222870596797, 'lambda_l2': 8.361857808880751}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:22,802][0m Trial 5 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 12, 'learning_rate': 0.05136141438366371, 'feature_fraction': 0.6902582610130062, 'bagging_fraction': 0.19901731150622354, 'bagging_freq': 1, 'lambda_l1': 7.182894362698116, 'lambda_l2': 9.258484154386416}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:23,195][0m Trial 6 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 21, 'learning_rate': 0.09108198120324217, 'feature_fraction': 0.7092530558159365, 'bagging_fraction': 0.791640807002418, 'bagging_freq': 10, 'lambda_l1': 7.143213731541669, 'lambda_l2': 8.084241435872803}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:23,585][0m Trial 7 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 98, 'learning_rate': 0.04132526987878367, 'feature_fraction': 0.352927338260863, 'bagging_fraction': 0.5668745718482291, 'bagging_freq': 3, 'lambda_l1': 6.884726425634296, 'lambda_l2': 4.399621777568123}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:23,973][0m Trial 8 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 17, 'learning_rate': 0.022438719759216234, 'feature_fraction': 0.9003784856920694, 'bagging_fraction': 0.13528158688230038, 'bagging_freq': 7, 'lambda_l1': 6.778394700927349, 'lambda_l2': 9.297471776017298}. Best is trial 0 with value: 0.34951456310679613.[0m




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m[I 2023-04-02 15:47:24,365][0m Trial 9 finished with value: 0.34951456310679613 and parameters: {'num_leaves': 62, 'learning_rate': 0.0999597585402107, 'feature_fraction': 0.37302715994906555, 'bagging_fraction': 0.45611400136043745, 'bagging_freq': 6, 'lambda_l1': 2.548258882359731, 'lambda_l2': 6.443363833790782}. Best is trial 0 with value: 0.34951456310679613.[0m




In [66]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_bagging_freq,params_feature_fraction,params_lambda_l1,params_lambda_l2,params_learning_rate,params_num_leaves,state
0,0,0.349515,2023-04-02 15:47:20.438605,2023-04-02 15:47:20.826071,0 days 00:00:00.387466,0.292834,9,0.83367,3.198236,4.602534,0.069453,94,COMPLETE
1,1,0.349515,2023-04-02 15:47:20.827988,2023-04-02 15:47:21.202162,0 days 00:00:00.374174,0.29873,10,0.374884,6.347728,7.20099,0.013106,38,COMPLETE
2,2,0.349515,2023-04-02 15:47:21.206160,2023-04-02 15:47:21.582997,0 days 00:00:00.376837,0.504138,2,0.634411,2.410436,8.063653,0.066962,81,COMPLETE
3,3,0.349515,2023-04-02 15:47:21.582997,2023-04-02 15:47:22.011223,0 days 00:00:00.428226,0.226733,3,0.924527,2.902859,4.801567,0.054095,40,COMPLETE
4,4,0.349515,2023-04-02 15:47:22.015218,2023-04-02 15:47:22.404780,0 days 00:00:00.389562,0.65641,1,0.481265,9.865223,8.361858,0.047813,42,COMPLETE
5,5,0.349515,2023-04-02 15:47:22.408778,2023-04-02 15:47:22.799622,0 days 00:00:00.390844,0.199017,1,0.690258,7.182894,9.258484,0.051361,12,COMPLETE
6,6,0.349515,2023-04-02 15:47:22.802647,2023-04-02 15:47:23.195835,0 days 00:00:00.393188,0.791641,10,0.709253,7.143214,8.084241,0.091082,21,COMPLETE
7,7,0.349515,2023-04-02 15:47:23.195835,2023-04-02 15:47:23.581054,0 days 00:00:00.385219,0.566875,3,0.352927,6.884726,4.399622,0.041325,98,COMPLETE
8,8,0.349515,2023-04-02 15:47:23.585053,2023-04-02 15:47:23.973707,0 days 00:00:00.388654,0.135282,7,0.900378,6.778395,9.297472,0.022439,17,COMPLETE
9,9,0.349515,2023-04-02 15:47:23.973707,2023-04-02 15:47:24.365713,0 days 00:00:00.392006,0.456114,6,0.373027,2.548259,6.443364,0.09996,62,COMPLETE


In [67]:
#Optunaによる、ベストパラメータの取得
best_params = study.best_params

In [68]:
best_ranker = LGBMRanker(**best_params)

In [69]:
train_baskets = X_id.groupby(["race_id"])["horse_id"].count().values
X = X_id.drop(id, axis =1)
model = best_ranker.fit(X,
                        y,
                        group=train_baskets)



In [38]:
result = model.predict(X)

In [39]:
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [40]:
#predict
train_baskets = X_id.groupby(["race_id"])["horse_id"].count().values
X = X_id.drop(["race_id", "horse_id"], axis=1)
y_pred = model.predict(X, group=train_baskets)

# 結果を入れるデータフレームを用意
pred_df = pd.DataFrame({
   "race_id": X_id['race_id'],
   "horse_id": X_id['horse_id'],
   "rank": y['rank'],
   "pred": y_pred
})

#予測した結果をrace_idごとでグループ化し順位をつける
pred_df['pred_rank'] = pred_df.groupby('race_id')['pred'].rank(method='min', ascending=False)

#3位以内ならprizeを1にする
pred_df['rank_prize'] = pred_df['rank'].apply(lambda x: 1 if x <= 3 else 0)
pred_df['pred_rank_prize'] = pred_df['pred_rank'].apply(lambda x: 1 if x <= 3 else 0)

In [41]:
pred_df

Unnamed: 0,race_id,horse_id,rank,pred,pred_rank,rank_prize,pred_rank_prize
0,202009020411,2016104750,2,0.0,1.0,1,1
1,202109020411,2017105335,1,0.0,1.0,1,1
2,202209020411,2018105554,8,0.0,1.0,0,1
3,201709020411,2013105788,4,0.0,1.0,0,1
4,201709020411,2013106099,7,0.0,1.0,0,1
...,...,...,...,...,...,...,...
80,202209020411,2015101654,7,0.0,1.0,0,1
81,201809020411,2010102459,9,0.0,1.0,0,1
82,201709020411,2009102678,10,0.0,1.0,0,1
83,201809020411,2010103602,15,0.0,1.0,0,1


In [42]:
#predict
train_baskets = today_race_X.groupby(["race_id"])["horse_id"].count().values
X = today_race_X.drop(["race_id", "horse_id"], axis=1)
y_pred = model.predict(X, group=train_baskets)

# 結果を入れるデータフレームを用意
pred_df = pd.DataFrame({
   "race_id": today_race_X['race_id'],
   "horse_id": today_race_X['horse_id'],
    "horse_name":today_race_X_with_name['horse_name'],
        "horse_number":today_race_X['horse_number'],
   "pred": y_pred
})

#予測した結果をrace_idごとでグループ化し順位をつける
pred_df['pred_rank'] = pred_df.groupby('race_id')['pred'].rank(method='min', ascending=False)

#3位以内ならprizeを1にする
pred_df['pred_rank_prize'] = pred_df['pred_rank'].apply(lambda x: 1 if x <= 3 else 0)

In [43]:
pred_df.sort_values('horse_number')

Unnamed: 0,race_id,horse_id,horse_name,horse_number,pred,pred_rank,pred_rank_prize
0,202309020411,2018105081,ジェラルディーナ,1,0.0,1.0,1
1,202309020411,2018100927,マリアエレーナ,2,0.0,1.0,1
3,202309020411,2016100915,モズベッロ,3,0.0,1.0,1
4,202309020411,2018102348,ノースブリッジ,4,0.0,1.0,1
5,202309020411,2018105074,ワンダフルタウン,5,0.0,1.0,1
6,202309020411,2017105567,ェルトライゼンデ,6,0.0,1.0,1
7,202309020411,2019102879,マテンロウレオ,7,0.0,1.0,1
8,202309020411,2019101782,ラーグルフ,8,0.0,1.0,1
9,202309020411,2018100274,ジャックドール,9,0.0,1.0,1
10,202309020411,2017105376,ポタジェ,10,0.0,1.0,1


In [44]:
# Unpickle model
model = pd.read_pickle('./model.pickle')

In [45]:
# one = float(request.POST.get('one'))
# two = float(request.POST.get('two'))
# three = float(request.POST.get('three'))
# four = float(request.POST.get('four'))
# five = float(request.POST.get('five'))
# six = float(request.POST.get('six'))
# seven = float(request.POST.get('seven'))
# eight = float(request.POST.get('eight'))
# nine = float(request.POST.get('nine'))
# ten = float(request.POST.get('ten'))
# eleven = float(request.POST.get('eleven'))
# twelve = float(request.POST.get('twelve'))
# thirteen = float(request.POST.get('thirteen'))
# fourteen = float(request.POST.get('fourteen'))
# fifteen = float(request.POST.get('fifteen'))
# sixteen = float(request.POST.get('sixteen'))
# seventeen = float(request.POST.get('seventeen'))
# eighteen = float(request.POST.get('eighteen'))

In [46]:
one = 500
two = 500
three = 500
four = 500
five = 500
six = 500
seven = 500
eight = 500
nine = 500
ten = 500
eleven = 500
twelve = 500
thirteen = 500
fourteen = 500
fifteen = 500
sixteen = 500
# seventeen = 500
# eighteen = 500

In [47]:
today_race_X = pd.read_csv('../data/main/today.csv')
today_race_X_withname = pd.read_csv('../data/main/today_withname.csv')

In [48]:
today_race_X.loc[today_race_X['horse_number'] == 1, 'horse_weight'] = one
today_race_X.loc[today_race_X['horse_number'] == 2, 'horse_weight'] = two
today_race_X.loc[today_race_X['horse_number'] == 3, 'horse_weight'] = three
today_race_X.loc[today_race_X['horse_number'] == 4, 'horse_weight'] = four
today_race_X.loc[today_race_X['horse_number'] == 5, 'horse_weight'] = five
today_race_X.loc[today_race_X['horse_number'] == 6, 'horse_weight'] = six
today_race_X.loc[today_race_X['horse_number'] == 7, 'horse_weight'] = seven
today_race_X.loc[today_race_X['horse_number'] == 8, 'horse_weight'] = eight
today_race_X.loc[today_race_X['horse_number'] == 9, 'horse_weight'] = nine
today_race_X.loc[today_race_X['horse_number'] == 10, 'horse_weight'] = ten
today_race_X.loc[today_race_X['horse_number'] == 11, 'horse_weight'] = eleven
today_race_X.loc[today_race_X['horse_number'] == 12, 'horse_weight'] = twelve
today_race_X.loc[today_race_X['horse_number'] == 13, 'horse_weight'] = thirteen
today_race_X.loc[today_race_X['horse_number'] == 14, 'horse_weight'] = fourteen
today_race_X.loc[today_race_X['horse_number'] == 15, 'horse_weight'] = fifteen
today_race_X.loc[today_race_X['horse_number'] == 16, 'horse_weight'] = sixteen
# today_race_X.loc[today_race_X['horse_number'] == 17, 'horse_weight'] = seventeen
# today_race_X.loc[today_race_X['horse_number'] == 18, 'horse_weight'] = eighteen

In [49]:
today_race_X = today_race_X.sort_values('horse_number')
today_race_X

Unnamed: 0,race_id,horse_id,rider_id,horse_number,age,sex,horse_weight,ground_status,tame_id,inbreeding_1,inbreeding_2
0,202309020411,2018105081,1174,1,5,1,500,1,49,71,44
1,202309020411,2018100927,1115,2,5,1,500,1,49,71,44
3,202309020411,2016100915,1171,3,7,0,500,1,35,20,43
4,202309020411,2018102348,5203,4,5,0,500,1,49,71,44
5,202309020411,2018105074,1018,5,5,0,500,1,49,71,44
6,202309020411,2017105567,1088,6,6,0,500,1,49,71,44
7,202309020411,2019102879,660,7,4,0,500,1,49,71,44
8,202309020411,2019101782,5386,8,4,0,500,1,49,71,44
9,202309020411,2018100274,666,9,5,0,500,1,17,56,21
10,202309020411,2017105376,1163,10,6,0,500,1,16,14,19


In [50]:
#predict
train_baskets = today_race_X.groupby(["race_id"])["horse_id"].count().values
X = today_race_X.drop(["race_id", "horse_id"], axis=1)
y_pred = model.predict(X, group=train_baskets)

In [51]:
y_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [52]:
rank = np.arange(1, 19)
sorted_rank = rank[np.argsort(y_pred)[::-1]]

In [53]:
sorted_rank[0]

16