# 準備

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

#モデル
import lightgbm as lgb

#パラメータ探索
import optuna

#クロスバリデーション
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

# 評価指標
import sys
sys.path.append('../evaluation')
from rmse import rmse
from recall5 import recall5
from profit import profit

# #エヴァリュエーション
# from sklearn.metrics import precision_score, recall_score, mean_squared_error

#可視化
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report

#保存
import pickle

In [2]:
#データを読み込む
n_uma_race_df = pd.read_pickle('../../datasets/traindata/n_uma_race.pkl')
n_race_df = pd.read_pickle('../../datasets/traindata/n_race.pkl')
n_payout_df = pd.read_pickle('../../datasets/traindata/n_payout.pkl')

In [3]:
# 新しいグループを作成
n_uma_race_df['group'] = n_uma_race_df['kaisai_nen'].astype(int).astype(str) +"-"+ n_uma_race_df['kaisai_tsukihi'].astype(int).astype(str) +"-"+  n_uma_race_df['keibajo_code'].astype(int).astype(str) +"-"+  n_uma_race_df['race_bango'].astype(int).astype(str)
n_race_df['group'] = n_race_df['kaisai_nen'].astype(int).astype(str) +"-"+ n_race_df['kaisai_tsukihi'].astype(int).astype(str) +"-"+  n_race_df['keibajo_code'].astype(int).astype(str) +"-"+  n_race_df['race_bango'].astype(int).astype(str)
n_payout_df['group'] = n_payout_df['kaisai_nen'].astype(int).astype(str) +"-"+ n_payout_df['kaisai_tsukihi'].astype(int).astype(str) +"-"+  n_payout_df['keibajo_code'].astype(int).astype(str) +"-"+  n_payout_df['race_bango'].astype(int).astype(str)

n_race_df_drop = n_race_df.drop(['kaisai_nen', 'kaisai_tsukihi', 'keibajo_code', 'kaisai_kai', 'kaisai_nichime', 'race_bango'],axis=1)
n_payout_df_drop = n_payout_df.drop(['kaisai_nen', 'kaisai_tsukihi', 'keibajo_code', 'kaisai_kai', 'kaisai_nichime', 'race_bango', 'toroku_tosu', 'shusso_tosu'],axis=1)

merged_df = pd.merge(n_uma_race_df, n_race_df_drop, on='group', how='left')
merged_df = pd.merge(merged_df, n_payout_df_drop, on='group', how='left')
merged_df['group'].nunique()

205318

In [4]:
merged_df['group'].unique()

array(['2023-916-46-1', '2023-916-46-2', '2023-916-46-3', ...,
       '2023-907-50-10', '2023-907-50-11', '2023-907-50-12'], dtype=object)

In [5]:
pd.set_option('display.max_columns', None)
merged_df.head(1)

Unnamed: 0,kaisai_nen,kaisai_tsukihi,keibajo_code,kaisai_kai,kaisai_nichime,race_bango,wakuban,umaban,ketto_toroku_bango,bamei,umakigo_code,seibetsu_code,hinshu_code,moshoku_code,barei,tozai_shozoku_code,chokyoshi_code,banushi_code,banushimei,futan_juryo,blinker_shiyo_kubun,kishu_code,kishumei_ryakusho,kishu_minarai_code,bataiju,zogen_fugo,zogen_sa,ijo_kubun_code,nyusen_juni,kakutei_chakujun,dochaku_kubun,dochaku_tosu,soha_time,chakusa_code_1,chakusa_code_2,chakusa_code_3,corner_1,corner_2,corner_3,corner_4,tansho_odds,tansho_ninkijun,kakutoku_honshokin,kakutoku_fukashokin,kohan_4f,kohan_3f,aiteuma_joho_1,aiteuma_joho_2,aiteuma_joho_3,time_sa,record_koshin_kubun,kyakushitsu_hantei,group,yobi_code,jusho_kaiji,grade_code,kyoso_shubetsu_code,kyoso_kigo_code,juryo_shubetsu_code,kyoso_joken_code_2sai,kyoso_joken_code_3sai,kyoso_joken_code_4sai,kyoso_joken_code_5sai_ijo,kyoso_joken_code,kyori,track_code,course_kubun,honshokin,fukashokin,hasso_jikoku,toroku_tosu,shusso_tosu,nyusen_tosu,tenko_code,babajotai_code_shiba,babajotai_code_dirt,fuseiritsu_flag_sanrenpuku,tokubarai_flag_sanrenpuku,henkan_flag_sanrenpuku,haraimodoshi_sanrenpuku_1a,haraimodoshi_sanrenpuku_1b,haraimodoshi_sanrenpuku_1c
0,2023,916,46,13,1,1,1,1,2020106227,スターサファイア,0,1,1.0,3,3,3,5319,875800,ＪＰＮ技研,560.0,0,5300,葛山晃平,0,466.0,1,0.0,0,8,8,0,0,1359,4.0,,,9,9,9,9,899,8,0,0,0,411,2020105068フェイマスグローリ,0,0,50,0,0,2023-916-46-1,1,0,0,49,0,4,0,0,0,0,0,1400,24,0,4e+51,0,1140,9,9,9,1,0,1.0,0.0,0.0,0.0,60809.0,210.0,1.0


# 前処理

In [6]:
merged_df['hutan_wariai'] = merged_df['futan_juryo'] / merged_df['bataiju']

In [7]:
def combine_sign_and_diff(row):
    if row['zogen_fugo'] == 2:
        return row['zogen_sa']
    elif row['zogen_fugo'] == 0:
        return -row['zogen_sa']
    else:
        return 0

merged_df['zogen_ryou'] = merged_df.apply(combine_sign_and_diff, axis=1)

In [8]:
merged_df = merged_df[merged_df['ijo_kubun_code'] == 0]
# 1	出走取消	取消	SCRATCHED	S

In [9]:
columns_to_convert = [
                    'wakuban',
                    'umaban',

                    'kyori',
                    'grade_code',
                    'seibetsu_code',
                    'moshoku_code',
                    'barei',
                    'chokyoshi_code',
                    'banushi_code',
                    'kishu_code',
                    'kishu_minarai_code',
                    'kyoso_shubetsu_code',
                    'juryo_shubetsu_code',
                    'shusso_tosu',
                    'tenko_code',
                    'babajotai_code_dirt',
                    'hutan_wariai',
                    'zogen_ryou',
                    'track_code',
                    'keibajo_code',
                    'hinshu_code',
                    'umakigo_code'
                    ]

for column in columns_to_convert:
    merged_df[column].fillna(0, inplace=True)
    try:
        if merged_df[column].astype(float).apply(lambda x: x.is_integer()).all():
            merged_df[column] = merged_df[column].astype(int)
        else:
            merged_df[column] = merged_df[column].astype(float)
    except ValueError:
        merged_df[column] = merged_df[column].astype(float)

    print(f"{column} のデータ型: {merged_df[column].dtype}")

wakuban のデータ型: int32
umaban のデータ型: int32
kyori のデータ型: int32
grade_code のデータ型: int32
seibetsu_code のデータ型: int32
moshoku_code のデータ型: int32
barei のデータ型: int32
chokyoshi_code のデータ型: int32
banushi_code のデータ型: int32
kishu_code のデータ型: int32
kishu_minarai_code のデータ型: int32
kyoso_shubetsu_code のデータ型: int32
juryo_shubetsu_code のデータ型: int32
shusso_tosu のデータ型: int32
tenko_code のデータ型: int32
babajotai_code_dirt のデータ型: int32
hutan_wariai のデータ型: float64
zogen_ryou のデータ型: int32
track_code のデータ型: int32
keibajo_code のデータ型: int32
hinshu_code のデータ型: int32
umakigo_code のデータ型: int32


In [10]:
merged_df[column].info()

<class 'pandas.core.series.Series'>
Int64Index: 2017402 entries, 0 to 2042540
Series name: umakigo_code
Non-Null Count    Dtype
--------------    -----
2017402 non-null  int32
dtypes: int32(1)
memory usage: 23.1 MB


In [11]:
def invert_rankings(df, column_name='kakutei_chakujun'):
    """
    この関数はDataFrameの特定の列のランキングを逆転させます。
    - df: DataFrameオブジェクト
    - column_name: 逆転させる列の名前（デフォルトは 'kakutei_chakujun'）
    """
    max_rank = df.groupby('group')[column_name].transform('max')
    df[column_name] = max_rank - df[column_name] + 1
    
    return df

In [12]:
def modify_rankings(df, column_name='kakutei_chakujun'):
    """
    この関数はDataFrameの特定の列のランキングを修正します。
    - df: DataFrameオブジェクト
    - column_name: 修正する列の名前（デフォルトは 'kakutei_chakujun'）
    """
    conditions = [
        (df[column_name] == 1),
        (df[column_name] == 2),
        (df[column_name] == 3),
        (df[column_name] == 4),
        (df[column_name] == 5)
    ]
    
    values = [5, 4, 3, 2, 1]
    
    df['target'] = np.select(conditions, values, default=0)
    
    return df

In [13]:
merged_df = modify_rankings(merged_df)

In [14]:
merged_df['target'].value_counts()

0    999416
5    203842
4    203673
3    203665
2    203430
1    203376
Name: target, dtype: int64

# lightgbm.LGBMRanker

In [15]:
# 2010年から2019年のデータを学習データとして取得
train_data = merged_df[merged_df['kaisai_nen'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
test_data = merged_df[merged_df['kaisai_nen'].isin([2020, 2021, 2022])]

In [16]:
# ランキング学習のために必要な特徴量とターゲットを設定
features = [
            # 'wakuban',
            'umaban',

            # 'seibetsu_code',
            # 'moshoku_code',
            # 'barei',
            # 'chokyoshi_code',
            # 'banushi_code',
            # 'kishu_code',
            # 'kishu_minarai_code',
            # 'hutan_wariai',
            'zogen_ryou',
            # 'hinshu_code',
            # 'umakigo_code',
            ]

target = 'target'

In [17]:
categorical_features = [
                        # 'seibetsu_code',
                        # 'moshoku_code',
                        # 'chokyoshi_code',
                        # 'banushi_code',
                        # 'kishu_code',
                        # 'kishu_minarai_code',
                        # 'hinshu_code',
                        # 'umakigo_code',
                        ]

In [18]:
def ndcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    
    gain = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def mean_ndcg_score(y_true, y_score, groups, k=5):
    ndcgs = []
    idx_start = 0
    for group in groups:
        ndcgs.append(ndcg_score(y_true[idx_start:idx_start+group], y_score[idx_start:idx_start+group], k))
        idx_start += group
    return np.mean(ndcgs)

In [19]:
group_sizes = train_data.groupby('group').size()

def objective(trial):
    # Optuna parameters
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_at': 5,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': trial.suggest_int('verbose', 0, 1),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-5, 10.0, log=True),  # L1正則化
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-5, 10.0, log=True)  # L2正則化
    }
    
    gkf = GroupKFold(n_splits=5)
    ndcgs = []
    
    for train_index, valid_index in gkf.split(train_data, groups=train_data['group']):
        train_fold_data = train_data.iloc[train_index]
        valid_fold_data = train_data.iloc[valid_index]
        for feature in categorical_features:
            train_fold_data = train_fold_data.copy()
            valid_fold_data = valid_fold_data.copy()
            train_fold_data[feature] = train_fold_data[feature].astype('category')
            valid_fold_data[feature] = valid_fold_data[feature].astype('category')

        train_fold_group_sizes = train_fold_data.groupby('group').size().tolist()
        valid_fold_group_sizes = valid_fold_data.groupby('group').size().tolist()

        ranker = lgb.LGBMRanker(**params)
        ranker.fit(train_fold_data[features], train_fold_data[target], 
           group=train_fold_group_sizes, 
           eval_set=[(valid_fold_data[features], valid_fold_data[target])], 
           eval_group=[valid_fold_group_sizes], categorical_feature=categorical_features) 
        
        y_pred = ranker.predict(valid_fold_data[features])
        ndcg_value = mean_ndcg_score(valid_fold_data[target].values, y_pred, valid_fold_group_sizes)
        ndcgs.append(ndcg_value)
    
    return np.mean(ndcgs)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3)

[32m[I 2023-11-06 16:25:02,504][0m A new study created in memory with name: no-name-20116ce5-3245-44c1-a725-b89996fc0966[0m
New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's ndcg@5: 0.411943
[2]	valid_0's ndcg@5: 0.416151
[3]	valid_0's ndcg@5: 0.403845
[4]	valid_0's ndcg@5: 0.39581
[5]	valid_0's ndcg@5: 0.406336
[6]	valid_0's ndcg@5: 0.412887
[7]	valid_0's ndcg@5: 0.409894
[8]	valid_0's ndcg@5: 0.401347
[9]	valid_0's ndcg@5: 0.403921
[10]	valid_0's ndcg@5: 0.409607
[11]	valid_0's ndcg@5: 0.411725
[12]	valid_0's ndcg@5: 0.403804
[13]	valid_0's ndcg@5: 0.405648
[14]	valid_0's ndcg@5: 0.408088
[15]	valid_0's ndcg@5: 0.409471
[16]	valid_0's ndcg@5: 0.406685
[17]	valid_0's ndcg@5: 0.404035
[18]	valid_0's ndcg@5: 0.406944
[19]	valid_0's ndcg@5: 0.409496
[20]	valid_0's ndcg@5: 0.405784
[21]	valid_0's ndcg@5: 0.404103
[22]	valid_0's ndcg@5: 0.407271
[23]	valid_0's ndcg@5: 0.408754
[24]	valid_0's ndcg@5: 0.407113
[25]	valid_0's ndcg@5: 0.402147
[26]	valid_0's ndcg@5: 0.405412
[27]	valid_0's ndcg@5: 0.411777
[28]	valid_0's ndcg@5: 0

New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's ndcg@5: 0.411901
[2]	valid_0's ndcg@5: 0.416704
[3]	valid_0's ndcg@5: 0.403882
[4]	valid_0's ndcg@5: 0.393447
[5]	valid_0's ndcg@5: 0.406684
[6]	valid_0's ndcg@5: 0.414294
[7]	valid_0's ndcg@5: 0.412028
[8]	valid_0's ndcg@5: 0.400586
[9]	valid_0's ndcg@5: 0.406371
[10]	valid_0's ndcg@5: 0.411789
[11]	valid_0's ndcg@5: 0.413298
[12]	valid_0's ndcg@5: 0.403315
[13]	valid_0's ndcg@5: 0.403108
[14]	valid_0's ndcg@5: 0.410621
[15]	valid_0's ndcg@5: 0.412461
[16]	valid_0's ndcg@5: 0.401836
[17]	valid_0's ndcg@5: 0.403897
[18]	valid_0's ndcg@5: 0.410538
[19]	valid_0's ndcg@5: 0.412466
[20]	valid_0's ndcg@5: 0.406021
[21]	valid_0's ndcg@5: 0.406909
[22]	valid_0's ndcg@5: 0.408505
[23]	valid_0's ndcg@5: 0.410549
[24]	valid_0's ndcg@5: 0.404933
[25]	valid_0's ndcg@5: 0.406991
[26]	valid_0's ndcg@5: 0.409282
[27]	valid_0's ndcg@5: 0.411164
[28]	valid_0's ndcg@5: 0.403318
[29]	valid_0's ndcg@5: 0.406177
[30]	valid_0's ndcg@5: 

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's ndcg@5: 0.414134
[2]	valid_0's ndcg@5: 0.416435
[3]	valid_0's ndcg@5: 0.403079
[4]	valid_0's ndcg@5: 0.395761
[5]	valid_0's ndcg@5: 0.408424
[6]	valid_0's ndcg@5: 0.413476
[7]	valid_0's ndcg@5: 0.412257
[8]	valid_0's ndcg@5: 0.403456
[9]	valid_0's ndcg@5: 0.40444
[10]	valid_0's ndcg@5: 0.409522
[11]	valid_0's ndcg@5: 0.412063
[12]	valid_0's ndcg@5: 0.405199
[13]	valid_0's ndcg@5: 0.401321
[14]	valid_0's ndcg@5: 0.406252
[15]	valid_0's ndcg@5: 0.412926
[16]	valid_0's ndcg@5: 0.408249
[17]	valid_0's ndcg@5: 0.402599
[18]	valid_0's ndcg@5: 0.405656
[19]	valid_0's ndcg@5: 0.41211
[20]	valid_0's ndcg@5: 0.408357
[21]	valid_0's ndcg@5: 0.403198
[22]	valid_0's ndcg@5: 0.405613
[23]	valid_0's ndcg@5: 0.410575
[24]	valid_0's ndcg@5: 0.409262
[25]	valid_0's ndcg@5: 0.403697
[26]	valid_0's ndcg@5: 0.405599
[27]	valid_0's ndcg@5: 0.409819
[28]	valid_0's ndcg@5: 0.

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's ndcg@5: 0.409025
[2]	valid_0's ndcg@5: 0.413273
[3]	valid_0's ndcg@5: 0.407646
[4]	valid_0's ndcg@5: 0.39874
[5]	valid_0's ndcg@5: 0.405632
[6]	valid_0's ndcg@5: 0.412009
[7]	valid_0's ndcg@5: 0.410407
[8]	valid_0's ndcg@5: 0.403342
[9]	valid_0's ndcg@5: 0.405323
[10]	valid_0's ndcg@5: 0.40998
[11]	valid_0's ndcg@5: 0.412009
[12]	valid_0's ndcg@5: 0.406284
[13]	valid_0's ndcg@5: 0.406299
[14]	valid_0's ndcg@5: 0.409886
[15]	valid_0's ndcg@5: 0.408839
[16]	valid_0's ndcg@5: 0.40448
[17]	valid_0's ndcg@5: 0.40659
[18]	valid_0's ndcg@5: 0.40894
[19]	valid_0's ndcg@5: 0.409247
[20]	valid_0's ndcg@5: 0.405388
[21]	valid_0's ndcg@5: 0.406357
[22]	valid_0's ndcg@5: 0.409307
[23]	valid_0's ndcg@5: 0.409264
[24]	valid_0's ndcg@5: 0.404475
[25]	valid_0's ndcg@5: 0.406331
[26]	valid_0's ndcg@5: 0.407023
[27]	valid_0's ndcg@5: 0.409629
[28]	valid_0's ndcg@5: 0.407

New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's ndcg@5: 0.410553
[2]	valid_0's ndcg@5: 0.414999
[3]	valid_0's ndcg@5: 0.404668
[4]	valid_0's ndcg@5: 0.395745
[5]	valid_0's ndcg@5: 0.406632
[6]	valid_0's ndcg@5: 0.413317
[7]	valid_0's ndcg@5: 0.414218
[8]	valid_0's ndcg@5: 0.402546
[9]	valid_0's ndcg@5: 0.404012
[10]	valid_0's ndcg@5: 0.408591
[11]	valid_0's ndcg@5: 0.411613
[12]	valid_0's ndcg@5: 0.404223
[13]	valid_0's ndcg@5: 0.404212
[14]	valid_0's ndcg@5: 0.407224
[15]	valid_0's ndcg@5: 0.409836
[16]	valid_0's ndcg@5: 0.406128
[17]	valid_0's ndcg@5: 0.404602
[18]	valid_0's ndcg@5: 0.405775
[19]	valid_0's ndcg@5: 0.409784
[20]	valid_0's ndcg@5: 0.407189
[21]	valid_0's ndcg@5: 0.405029
[22]	valid_0's ndcg@5: 0.406893
[23]	valid_0's ndcg@5: 0.408038
[24]	valid_0's ndcg@5: 0.406429
[25]	valid_0's ndcg@5: 0.406515
[26]	valid_0's ndcg@5: 0.406163
[27]	valid_0's ndcg@5: 0.408169
[28]	valid_0's ndcg@5: 0.406731
[29]	valid_0's ndcg@5: 0.406198
[30]	valid_0's ndcg@5: 

[32m[I 2023-11-06 16:26:13,100][0m Trial 0 finished with value: 17.239063042447277 and parameters: {'num_leaves': 255, 'learning_rate': 0.06042277876149864, 'feature_fraction': 0.7113479132324596, 'bagging_freq': 6, 'verbose': 0, 'lambda_l1': 3.7795848495489845e-05, 'lambda_l2': 0.1778180298410174}. Best is trial 0 with value: 17.239063042447277.[0m
New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.414447
[2]	valid_0's ndcg@5: 0.404389
[3]	valid_0's ndcg@5: 0.405127
[4]	valid_0's ndcg@5: 0.408148
[5]	valid_0's ndcg@5: 0.404787
[6]	valid_0's ndcg@5: 0.407547
[7]	valid_0's ndcg@5: 0.405348
[8]	valid_0's ndcg@5: 0.408029
[9]	valid_0's ndcg@5: 0.405784
[10]	valid_0's ndcg@5: 0.406504
[11]	valid_0's ndcg@5: 0.403892
[12]	valid_0's ndcg@5: 0.407614
[13]	valid_0's ndcg@5: 0.405519
[14]	valid_0's ndcg@5: 0.405541
[15]	valid_0's ndcg@5: 0.407294
[16]	valid_0's ndcg@5: 0.406988
[17]	valid_0's ndcg@5: 0.406244
[18]	valid_0's ndcg@5: 0.406634
[19]	valid_0's ndcg@5: 0.40498
[20]	valid_0's ndcg@5: 0.409718
[21]	valid_0's ndcg@5: 0.406147
[22]	valid_0's ndcg@5: 0.406965
[23]	valid_0's ndcg@5: 0.407056
[24]	valid_0's ndcg@5: 0.

New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.415659
[2]	valid_0's ndcg@5: 0.403349
[3]	valid_0's ndcg@5: 0.407365
[4]	valid_0's ndcg@5: 0.408644
[5]	valid_0's ndcg@5: 0.402825
[6]	valid_0's ndcg@5: 0.408893
[7]	valid_0's ndcg@5: 0.40653
[8]	valid_0's ndcg@5: 0.406879
[9]	valid_0's ndcg@5: 0.40444
[10]	valid_0's ndcg@5: 0.406853
[11]	valid_0's ndcg@5: 0.408497
[12]	valid_0's ndcg@5: 0.406009
[13]	valid_0's ndcg@5: 0.406341
[14]	valid_0's ndcg@5: 0.406704
[15]	valid_0's ndcg@5: 0.406212
[16]	valid_0's ndcg@5: 0.406044
[17]	valid_0's ndcg@5: 0.40705
[18]	valid_0's ndcg@5: 0.407304
[19]	valid_0's ndcg@5: 0.405323
[20]	valid_0's ndcg@5: 0.408259
[21]	valid_0's ndcg@5: 0.408127
[22]	valid_0's ndcg@5: 0.407387
[23]	valid_0's ndcg@5: 0.404707
[24]	valid_0's ndcg@5: 0.410528
[25]	valid_0's ndcg@5: 0.406867
[26]	valid_0's ndcg@5: 0.40

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.414124
[2]	valid_0's ndcg@5: 0.406601
[3]	valid_0's ndcg@5: 0.407604
[4]	valid_0's ndcg@5: 0.407306
[5]	valid_0's ndcg@5: 0.402175
[6]	valid_0's ndcg@5: 0.411619
[7]	valid_0's ndcg@5: 0.407055
[8]	valid_0's ndcg@5: 0.406232
[9]	valid_0's ndcg@5: 0.408851
[10]	valid_0's ndcg@5: 0.40805
[11]	valid_0's ndcg@5: 0.405822
[12]	valid_0's ndcg@5: 0.407728
[13]	valid_0's ndcg@5: 0.405798
[14]	valid_0's ndcg@5: 0.407983
[15]	valid_0's ndcg@5: 0.406333
[16]	valid_0's ndcg@5: 0.409061
[17]	valid_0's ndcg@5: 0.406449
[18]	valid_0's ndcg@5: 0.407025
[19]	valid_0's ndcg@5: 0.403413
[20]	valid_0's ndcg@5: 0.408481
[21]	valid_0's ndcg@5: 0.405591
[22]	valid_0's ndcg@5: 0.405806
[23]	valid_0's ndcg@5: 0.40676
[24]	valid_0's ndcg@5: 0.4

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168486, number of used features: 2
[1]	valid_0's ndcg@5: 0.411592
[2]	valid_0's ndcg@5: 0.407158
[3]	valid_0's ndcg@5: 0.405323
[4]	valid_0's ndcg@5: 0.404655
[5]	valid_0's ndcg@5: 0.406716
[6]	valid_0's ndcg@5: 0.406877
[7]	valid_0's ndcg@5: 0.404546
[8]	valid_0's ndcg@5: 0.406953
[9]	valid_0's ndcg@5: 0.404593
[10]	valid_0's ndcg@5: 0.410477
[11]	valid_0's ndcg@5: 0.404973
[12]	valid_0's ndcg@5: 0.404731
[13]	valid_0's ndcg@5: 0.407794
[14]	valid_0's ndcg@5: 0.405006
[15]	valid_0's ndcg@5: 0.407167
[16]	valid_0's ndcg@5: 0.407788
[17]	valid_0's ndcg@5: 0.403195
[18]	valid_0's ndcg@5: 0.407258
[19]	valid_0's ndcg@5: 0.406815
[20]	valid_0's ndcg@5: 0.403728
[21]	valid_0's ndcg@5: 0.407044
[22]	valid_0's ndcg@5: 0.406794
[23]	valid_0's ndcg@5: 0.40462
[24]	valid_0's ndcg@5: 0.

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168486, number of used features: 2
[1]	valid_0's ndcg@5: 0.414294
[2]	valid_0's ndcg@5: 0.404872
[3]	valid_0's ndcg@5: 0.405581
[4]	valid_0's ndcg@5: 0.407056
[5]	valid_0's ndcg@5: 0.405631
[6]	valid_0's ndcg@5: 0.407073
[7]	valid_0's ndcg@5: 0.40553
[8]	valid_0's ndcg@5: 0.407655
[9]	valid_0's ndcg@5: 0.407797
[10]	valid_0's ndcg@5: 0.403922
[11]	valid_0's ndcg@5: 0.408765
[12]	valid_0's ndcg@5: 0.404423
[13]	valid_0's ndcg@5: 0.408057
[14]	valid_0's ndcg@5: 0.407513
[15]	valid_0's ndcg@5: 0.405966
[16]	valid_0's ndcg@5: 0.405662
[17]	valid_0's ndcg@5: 0.406606
[18]	valid_0's ndcg@5: 0.406101
[19]	valid_0's ndcg@5: 0.408099
[20]	valid_0's ndcg@5: 0.40388
[21]	valid_0's ndcg@5: 0.40574
[22]	valid_0's ndcg@5: 0.405338
[23]	valid_0's ndcg@5: 0.407047
[24]	valid_0's ndcg@5: 0.40

[32m[I 2023-11-06 16:28:07,772][0m Trial 1 finished with value: 17.290179868586876 and parameters: {'num_leaves': 192, 'learning_rate': 0.003195299921086197, 'feature_fraction': 0.7865838919063934, 'bagging_freq': 2, 'verbose': 1, 'lambda_l1': 1.8988063368464292, 'lambda_l2': 0.8775552989184024}. Best is trial 1 with value: 17.290179868586876.[0m
New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.414993
[2]	valid_0's ndcg@5: 0.40517
[3]	valid_0's ndcg@5: 0.402451
[4]	valid_0's ndcg@5: 0.407133
[5]	valid_0's ndcg@5: 0.406511
[6]	valid_0's ndcg@5: 0.406898
[7]	valid_0's ndcg@5: 0.404625
[8]	valid_0's ndcg@5: 0.404589
[9]	valid_0's ndcg@5: 0.405892
[10]	valid_0's ndcg@5: 0.40816
[11]	valid_0's ndcg@5: 0.40638
[12]	valid_0's ndcg@5: 0.405841
[13]	valid_0's ndcg@5: 0.404535
[14]	valid_0's ndcg@5: 0.404737
[15]	valid_0's ndcg@5: 0.404498
[16]	valid_0's ndcg@5: 0.406322
[17]	valid_0's ndcg@5: 0.404207
[18]	valid_0's ndcg@5: 0.406974
[19]	valid_0's ndcg@5: 0.407908
[20]	valid_0's ndcg@5: 0.404946
[21]	valid_0's ndcg@5: 0.406487
[22]	valid_0's ndcg@5: 0.407761
[23]	valid_0's ndcg@5: 0.405281
[24]	valid_0's ndcg@5: 0.40

New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.414825
[2]	valid_0's ndcg@5: 0.403297
[3]	valid_0's ndcg@5: 0.40907
[4]	valid_0's ndcg@5: 0.40762
[5]	valid_0's ndcg@5: 0.403764
[6]	valid_0's ndcg@5: 0.408064
[7]	valid_0's ndcg@5: 0.405586
[8]	valid_0's ndcg@5: 0.408299
[9]	valid_0's ndcg@5: 0.407077
[10]	valid_0's ndcg@5: 0.406268
[11]	valid_0's ndcg@5: 0.406176
[12]	valid_0's ndcg@5: 0.409586
[13]	valid_0's ndcg@5: 0.405992
[14]	valid_0's ndcg@5: 0.407405
[15]	valid_0's ndcg@5: 0.405504
[16]	valid_0's ndcg@5: 0.4084
[17]	valid_0's ndcg@5: 0.405279
[18]	valid_0's ndcg@5: 0.408148
[19]	valid_0's ndcg@5: 0.40754
[20]	valid_0's ndcg@5: 0.40596
[21]	valid_0's ndcg@5: 0.405511
[22]	valid_0's ndcg@5: 0.405504
[23]	valid_0's ndcg@5: 0.406059
[24]	valid_0's ndcg@5: 0.40768
[25]	valid_0's ndcg@5: 0.406225
[26]	valid_0's ndcg@5: 0.4055
[

New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[1]	valid_0's ndcg@5: 0.413195
[2]	valid_0's ndcg@5: 0.406351
[3]	valid_0's ndcg@5: 0.407057
[4]	valid_0's ndcg@5: 0.407689
[5]	valid_0's ndcg@5: 0.404364
[6]	valid_0's ndcg@5: 0.412034
[7]	valid_0's ndcg@5: 0.402838
[8]	valid_0's ndcg@5: 0.40557
[9]	valid_0's ndcg@5: 0.408631
[10]	valid_0's ndcg@5: 0.406424
[11]	valid_0's ndcg@5: 0.406574
[12]	valid_0's ndcg@5: 0.407791
[13]	valid_0's ndcg@5: 0.407415
[14]	valid_0's ndcg@5: 0.404372
[15]	valid_0's ndcg@5: 0.40824
[16]	valid_0's ndcg@5: 0.405273
[17]	valid_0's ndcg@5: 0.407167
[18]	valid_0's ndcg@5: 0.40679
[19]	valid_0's ndcg@5: 0.407393
[20]	valid_0's ndcg@5: 0.407214
[21]	valid_0's ndcg@5: 0.40739
[22]	valid_0's ndcg@5: 0.406535
[23]	valid_0's ndcg@5: 0.408523
[24]	valid_0's ndcg@5: 0.406499
[25]	valid_0's ndcg@5: 0.405038
[26]	valid_0's ndcg@5: 0.408

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168486, number of used features: 2
[1]	valid_0's ndcg@5: 0.411084
[2]	valid_0's ndcg@5: 0.405935
[3]	valid_0's ndcg@5: 0.406482
[4]	valid_0's ndcg@5: 0.407158
[5]	valid_0's ndcg@5: 0.401287
[6]	valid_0's ndcg@5: 0.407817
[7]	valid_0's ndcg@5: 0.407525
[8]	valid_0's ndcg@5: 0.407285
[9]	valid_0's ndcg@5: 0.406391
[10]	valid_0's ndcg@5: 0.405702
[11]	valid_0's ndcg@5: 0.406361
[12]	valid_0's ndcg@5: 0.40576
[13]	valid_0's ndcg@5: 0.404852
[14]	valid_0's ndcg@5: 0.407481
[15]	valid_0's ndcg@5: 0.406196
[16]	valid_0's ndcg@5: 0.407065
[17]	valid_0's ndcg@5: 0.405871
[18]	valid_0's ndcg@5: 0.403939
[19]	valid_0's ndcg@5: 0.406546
[20]	valid_0's ndcg@5: 0.406834
[21]	valid_0's ndcg@5: 0.40543
[22]	valid_0's ndcg@5: 0.405347
[23]	valid_0's ndcg@5: 0.404363
[24]	valid_0's ndcg@5: 0.4

New categorical_feature is []


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168486, number of used features: 2
[1]	valid_0's ndcg@5: 0.413601
[2]	valid_0's ndcg@5: 0.406909
[3]	valid_0's ndcg@5: 0.403846
[4]	valid_0's ndcg@5: 0.405579
[5]	valid_0's ndcg@5: 0.406369
[6]	valid_0's ndcg@5: 0.407664
[7]	valid_0's ndcg@5: 0.403735
[8]	valid_0's ndcg@5: 0.409776
[9]	valid_0's ndcg@5: 0.40474
[10]	valid_0's ndcg@5: 0.408561
[11]	valid_0's ndcg@5: 0.403808
[12]	valid_0's ndcg@5: 0.40648
[13]	valid_0's ndcg@5: 0.406571
[14]	valid_0's ndcg@5: 0.408352
[15]	valid_0's ndcg@5: 0.404901
[16]	valid_0's ndcg@5: 0.409703
[17]	valid_0's ndcg@5: 0.403187
[18]	valid_0's ndcg@5: 0.405066
[19]	valid_0's ndcg@5: 0.409356
[20]	valid_0's ndcg@5: 0.404456
[21]	valid_0's ndcg@5: 0.407226
[22]	valid_0's ndcg@5: 0.406037
[23]	valid_0's ndcg@5: 0.404193
[24]	valid_0's ndcg@5: 0.4

[32m[I 2023-11-06 16:30:30,182][0m Trial 2 finished with value: 17.26865312879388 and parameters: {'num_leaves': 237, 'learning_rate': 0.0002491364683993565, 'feature_fraction': 0.9567831154498239, 'bagging_freq': 1, 'verbose': 1, 'lambda_l1': 0.7919856183140275, 'lambda_l2': 0.004234111169157586}. Best is trial 1 with value: 17.290179868586876.[0m


In [20]:
# 最適なパラメータ
best_params = study.best_params

# KFoldでのモデル訓練
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []

for train_index, valid_index in kf.split(train_data):
    train_fold_data = train_data.iloc[train_index]
    valid_fold_data = train_data.iloc[valid_index]
    
    train_fold_group_sizes = train_fold_data.groupby('group').size().tolist()
    valid_fold_group_sizes = valid_fold_data.groupby('group').size().tolist()
    
    ranker = lgb.LGBMRanker(**best_params)
    ranker.fit(
        train_fold_data[features], train_fold_data[target], 
        group=train_fold_group_sizes, 
        eval_set=[(valid_fold_data[features], valid_fold_data[target])], 
        eval_group=[valid_fold_group_sizes], 
        eval_at=5, early_stopping_rounds=20, verbose=10
    )
    models.append(ranker)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 1168487, number of used features: 2
[10]	valid_0's ndcg@5: 0.872934
[20]	valid_0's ndcg@5: 0.873042




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 1168487, number of used features: 2
[10]	valid_0's ndcg@5: 0.872845
[20]	valid_0's ndcg@5: 0.873422




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 1168487, number of used features: 2
[10]	valid_0's ndcg@5: 0.872724
[20]	valid_0's ndcg@5: 0.872531




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168487, number of used features: 2
[10]	valid_0's ndcg@5: 0.872337
[20]	valid_0's ndcg@5: 0.872182




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 1168488, number of used features: 2
[10]	valid_0's ndcg@5: 0.872868
[20]	valid_0's ndcg@5: 0.87317


# モデル保存

In [21]:
# 学習
full_train_dataset = lgb.Dataset(merged_df[features], label=merged_df[target])
full_model = lgb.train(best_params, full_train_dataset, num_boost_round=500, verbose_eval=100)

# モデルを保存
model_save_path = '../../models/model.pkl'
with open(model_save_path, 'wb') as f:
    pickle.dump(full_model, f)

print(f"Model saved to {model_save_path}")



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 2017402, number of used features: 2
[LightGBM] [Info] Start training from score 1.514390
Model saved to ../../models/model.pkl
