In [16]:
#pip install google-cloud-bigquery これ必要
#pip install pydata_google_auth
#pip install db-dtypes

In [17]:
#インポート文
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from lightgbm import LGBMRanker

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix

import optuna

import pydata_google_auth
from google.cloud import bigquery 

import pickle

In [18]:
credentials = pydata_google_auth.get_user_credentials(
    ['https://www.googleapis.com/auth/bigquery'],
)

In [19]:
client = bigquery.Client(project='keiba-381006', credentials=credentials)

In [20]:
horse_query = '''
    SELECT * 
    FROM horse_dataset.main_horse_table 
    LIMIT 1000
    '''
horse_df = client.query(horse_query).to_dataframe()

In [21]:
race_query = '''
    SELECT * 
    FROM race_dataset.main_race_table 
    LIMIT 1000
    '''
race_df = client.query(race_query).to_dataframe()

In [22]:
#データのマージ
race_df_for_merge = race_df[['race_id',
                             "distance",
                            ]]

merged_horse_df = pd.merge(horse_df, race_df_for_merge, on='race_id')

In [23]:
#使う特徴量の設定
id = {"race_id",
      "horse_id"
    }

feature = [
           "race_id",
           "horse_id",
           "frame_number",
           "horse_number",
           "horse_weight",
           "distance",
            ]

target = ['rank']

X_id = merged_horse_df[feature]
y = merged_horse_df[target]

In [24]:
X_id= X_id.astype(float)
y= y.astype(float)

In [25]:
#スコア結果を入れるリスト
reports = []
auc_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
support_scores = []

#予測結果を入れるリスト
pred_df_list = []

def objective(trial):
    
# Optuna
    # ハイパーパラメータの探索範囲
    params = {
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
        'verbose': -1
    }
    
    #LGBMRankerの設定
    ranker = LGBMRanker(**params)

# K-fold
    #kflodの設定
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    #kflodで学習用と検証用に分ける
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_id, y)):
        #id入りの学習データ
        X_train_id, X_val_id = X_id.iloc[train_idx], X_id.iloc[val_idx]
        
        #id抜きの学習データ
        X = X_id.drop(id, axis =1)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        
        #検証データ
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Fit
        # train_basketsを計算
        train_baskets = X_train_id.groupby(["race_id"])["horse_id"].count().values

        # モデルの学習
        model = ranker.fit(X_train,
                           y_train,
                           group=train_baskets)

# Predict
        # モデルの予測
        y_pred = model.predict(X_val)

        #結果を入れるデータフレームを用意
        pred_df_fold = pd.DataFrame({
           "race_id": X_val_id['race_id'],
           "horse_id": X_val_id['horse_id'],
           "rank":y_val['rank'],
           "pred": y_pred
           })

        #そのデータフレームをリストに保存
        pred_df_list.append(pred_df_fold)

# Rank
    # 各foldの予測結果を結合してpred_dfを作成
    pred_df = pd.concat(pred_df_list, axis=0).reset_index(drop=True)

    #予測した結果をrace_idごとでグループ化し順位をつける
    pred_df['pred_rank'] = pred_df.groupby('race_id')['pred'].rank(method='min', ascending=False)

    #3位以内ならprizeを1にする
    pred_df['rank_prize'] = pred_df['rank'].apply(lambda x: 1 if x <= 3 else 0)
    pred_df['pred_rank_prize'] = pred_df['pred_rank'].apply(lambda x: 1 if x <= 3 else 0)

#Score
    #正確度を計算
    auc_score = metrics.accuracy_score(pred_df['rank_prize'], pred_df['pred_rank_prize'])
        
    #auc_scoresのリストに保存
    auc_scores.append(auc_score)
    
    #f1を計算            
    f1 = f1_score(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    
    #reportを計算
    report = metrics.classification_report(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    report_num = metrics.precision_recall_fscore_support(pred_df['rank_prize'], pred_df['pred_rank_prize'])
    reports.append(report)

    #reportから情報をとる
    precision_scores.append(report_num[0])
    recall_scores.append(report_num[1])
    f1_scores.append(report_num[2])
    support_scores.append(report_num[3])
    
    return f1

In [26]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2023-03-21 16:07:59,729][0m A new study created in memory with name: no-name-b0a8d98f-aee8-4f87-94d2-f84786ed6a37[0m




[32m[I 2023-03-21 16:08:00,200][0m Trial 0 finished with value: 0.26552462526766596 and parameters: {'num_leaves': 48, 'learning_rate': 0.0523007787592449, 'feature_fraction': 0.996191518631241, 'bagging_fraction': 0.6954961834276343, 'bagging_freq': 7, 'lambda_l1': 8.326288249678386, 'lambda_l2': 0.5127166285613973}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:00,703][0m Trial 1 finished with value: 0.18641390205371247 and parameters: {'num_leaves': 63, 'learning_rate': 0.002629206481840648, 'feature_fraction': 0.9927851423604727, 'bagging_fraction': 0.4315480586584398, 'bagging_freq': 9, 'lambda_l1': 3.713952805726751, 'lambda_l2': 8.074647571584114}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:01,234][0m Trial 2 finished with value: 0.12077294685990338 and parameters: {'num_leaves': 62, 'learning_rate': 0.08739456355748115, 'feature_fraction': 0.6399434857184209, 'bagging_fraction': 0.7451218629066981, 'bagging_freq': 4, 'lambda_l1': 2.590181205422719, 'lambda_l2': 9.16242626916039}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:01,562][0m Trial 3 finished with value: 0.10616113744075828 and parameters: {'num_leaves': 58, 'learning_rate': 0.09847627344814058, 'feature_fraction': 0.8703636525866414, 'bagging_fraction': 0.18351338632813613, 'bagging_freq': 10, 'lambda_l1': 8.631580986527714, 'lambda_l2': 4.45331077953838}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:02,033][0m Trial 4 finished with value: 0.0821256038647343 and parameters: {'num_leaves': 83, 'learning_rate': 0.015961882467922822, 'feature_fraction': 0.25468643393486723, 'bagging_fraction': 0.7014629632278517, 'bagging_freq': 5, 'lambda_l1': 2.3099911644784252, 'lambda_l2': 6.814421777918993}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:02,556][0m Trial 5 finished with value: 0.07039337474120085 and parameters: {'num_leaves': 81, 'learning_rate': 0.0008796336183157943, 'feature_fraction': 0.9414036623759877, 'bagging_fraction': 0.5227201188631443, 'bagging_freq': 8, 'lambda_l1': 2.0571162112179215, 'lambda_l2': 7.606861882478177}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:02,932][0m Trial 6 finished with value: 0.05917874396135265 and parameters: {'num_leaves': 11, 'learning_rate': 0.047649115893559864, 'feature_fraction': 0.9334472761725516, 'bagging_fraction': 0.6289626858666207, 'bagging_freq': 9, 'lambda_l1': 9.230135980306068, 'lambda_l2': 5.122873913347379}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:03,366][0m Trial 7 finished with value: 0.052603327965646804 and parameters: {'num_leaves': 45, 'learning_rate': 0.004715513487947986, 'feature_fraction': 0.5949274517407911, 'bagging_fraction': 0.21057378485151015, 'bagging_freq': 4, 'lambda_l1': 5.079159110002386, 'lambda_l2': 6.951162986027604}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:03,782][0m Trial 8 finished with value: 0.05410628019323671 and parameters: {'num_leaves': 79, 'learning_rate': 0.09369724579047568, 'feature_fraction': 0.33546531611532066, 'bagging_fraction': 0.4273254902875412, 'bagging_freq': 7, 'lambda_l1': 5.193010724140269, 'lambda_l2': 1.9601273378896504}. Best is trial 0 with value: 0.26552462526766596.[0m




[32m[I 2023-03-21 16:08:04,181][0m Trial 9 finished with value: 0.04830917874396135 and parameters: {'num_leaves': 22, 'learning_rate': 0.02814976721121546, 'feature_fraction': 0.7555298354909579, 'bagging_fraction': 0.6526785658761382, 'bagging_freq': 4, 'lambda_l1': 9.352026749576217, 'lambda_l2': 8.568315228978372}. Best is trial 0 with value: 0.26552462526766596.[0m




In [27]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_bagging_freq,params_feature_fraction,params_lambda_l1,params_lambda_l2,params_learning_rate,params_num_leaves,state
0,0,0.265525,2023-03-21 16:07:59.736567,2023-03-21 16:08:00.200199,0 days 00:00:00.463632,0.695496,7,0.996192,8.326288,0.512717,0.052301,48,COMPLETE
1,1,0.186414,2023-03-21 16:08:00.208205,2023-03-21 16:08:00.687732,0 days 00:00:00.479527,0.431548,9,0.992785,3.713953,8.074648,0.002629,63,COMPLETE
2,2,0.120773,2023-03-21 16:08:00.703656,2023-03-21 16:08:01.234825,0 days 00:00:00.531169,0.745122,4,0.639943,2.590181,9.162426,0.087395,62,COMPLETE
3,3,0.106161,2023-03-21 16:08:01.234825,2023-03-21 16:08:01.562403,0 days 00:00:00.327578,0.183513,10,0.870364,8.631581,4.453311,0.098476,58,COMPLETE
4,4,0.082126,2023-03-21 16:08:01.562403,2023-03-21 16:08:02.033158,0 days 00:00:00.470755,0.701463,5,0.254686,2.309991,6.814422,0.015962,83,COMPLETE
5,5,0.070393,2023-03-21 16:08:02.035178,2023-03-21 16:08:02.548420,0 days 00:00:00.513242,0.52272,8,0.941404,2.057116,7.606862,0.00088,81,COMPLETE
6,6,0.059179,2023-03-21 16:08:02.556523,2023-03-21 16:08:02.925960,0 days 00:00:00.369437,0.628963,9,0.933447,9.230136,5.122874,0.047649,11,COMPLETE
7,7,0.052603,2023-03-21 16:08:02.934508,2023-03-21 16:08:03.365106,0 days 00:00:00.430598,0.210574,4,0.594927,5.079159,6.951163,0.004716,45,COMPLETE
8,8,0.054106,2023-03-21 16:08:03.366604,2023-03-21 16:08:03.782557,0 days 00:00:00.415953,0.427325,7,0.335465,5.193011,1.960127,0.093697,79,COMPLETE
9,9,0.048309,2023-03-21 16:08:03.782557,2023-03-21 16:08:04.173004,0 days 00:00:00.390447,0.652679,4,0.75553,9.352027,8.568315,0.02815,22,COMPLETE


In [28]:
#Optunaによる、ベストパラメータの取得
best_params = study.best_params

In [29]:
best_ranker = LGBMRanker(**best_params)

In [30]:
train_baskets = X_id.groupby(["race_id"])["horse_id"].count().values
X = X_id.drop(id, axis =1)
model = best_ranker.fit(X,
                        y,
                        group=train_baskets)



In [35]:
result = model.predict(X)
result[0]

0.0011927359905513865

In [32]:
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)