In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
from itertools import combinations, permutations
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'lightgbm'

In [2]:
class DataProcessor:
    """    
    Attributes:
    ----------
    data : pd.DataFrame
        rawデータ
    data_p : pd.DataFrame
        preprocessing後のデータ
    data_h : pd.DataFrame
        merge_horse_results後のデータ
    data_pe : pd.DataFrame
        merge_peds後のデータ
    data_c : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """
    
    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        
    def merge_horse_results(self, hr, n_samples_list=[5, 9, 'all']):
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す

        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """

        self.data_h = self.data_p.copy()
        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)
            
        #6/6追加： 馬の出走間隔追加
        self.data_h['interval'] = (self.data_h['date'] - self.data_h['latest']).dt.days
        self.data_h.drop(['開催', 'latest'], axis=1, inplace=True)
        
    def merge_peds(self, peds):
        """
        5世代分血統データを追加してdata_peに返す

        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """

        self.data_pe = \
            self.data_h.merge(peds, left_on='horse_id', right_index=True,
                                                             how='left')
        self.no_peds = self.data_pe[self.data_pe['peds_0'].isnull()]\
            ['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')
            
    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        カテゴリ変数を処理してdata_cに返す

        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        le_jockey : sklearn.preprocessing.LabelEncoder
            jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        results_m : Results.data_pe
            ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
        """

        df = self.data_pe.copy()
        
        #ラベルエンコーディング。horse_id, jockey_idを0始まりの整数に変換
        mask_horse = df['horse_id'].isin(le_horse.classes_)
        new_horse_id = df['horse_id'].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df['horse_id'] = le_horse.transform(df['horse_id'])
        mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
        new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df['jockey_id'] = le_jockey.transform(df['jockey_id'])
        
        #horse_id, jockey_idをpandasのcategory型に変換
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')
        
        #そのほかのカテゴリ変数をpandasのcategory型に変換してからダミー変数化
        #列を一定にするため
        weathers = results_m['weather'].unique()
        race_types = results_m['race_type'].unique()
        ground_states = results_m['ground_state'].unique()
        sexes = results_m['性'].unique()
        df['weather'] = pd.Categorical(df['weather'], weathers)
        df['race_type'] = pd.Categorical(df['race_type'], race_types)
        df['ground_state'] = pd.Categorical(df['ground_state'], ground_states)
        df['性'] = pd.Categorical(df['性'], sexes)
        df = pd.get_dummies(df, columns=['weather', 'race_type', 'ground_state', '性'])
        
        self.data_c = df

In [3]:
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        """
        レース結果データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """

        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                html = requests.get(url)
                html.encoding = "EUC-JP"

                #メインとなるテーブルデータを取得
                df = pd.read_html(html.text)[0]
                # 列名に半角スペースがあれば除去する
                df = df.rename(columns=lambda x: x.replace(' ', ''))

                # 天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                soup = BeautifulSoup(html.text, "html.parser")
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(df) #20211212：[0]→[-1]に修正
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                #インデックスをrace_idにする
                df.index = [race_id] * len(df)

                race_results[race_id] = df
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df
    
    #前処理    
    def preprocessing(self):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x:1 if x<4 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0]
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1]
        
        #errors='coerce'で、"計不"など変換できない時に欠損値にする
        df['体重'] = pd.to_numeric(df['体重'], errors='coerce')
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')

        # 単勝をfloatに変換
        df["単勝"] = df["単勝"].astype(float)
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 不要な列を削除
        df.drop(["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手', '人気', '着順'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")
        
        #開催場所
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        self.data_p = df
    
    #カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)

In [4]:
class ShutubaTable(DataProcessor):
    def __init__(self, shutuba_tables):
        super(ShutubaTable, self).__init__()
        self.data = shutuba_tables
    
    @classmethod
    def scrape(cls, race_id_list, date):
        data = pd.DataFrame()
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id

            html = requests.get(url)
            html.encoding = "EUC-JP"

            df = pd.read_html(html.text)[0]
            # 列名に半角スペースがあれば除去する
            df = df.rename(columns=lambda x: x.replace(' ', ''))
            df = df.T.reset_index(level=0, drop=True).T

            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            texts = re.findall(r'\w+', texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[-1])] * len(df) #20211212：[0]→[-1]に修正
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                # 2020/12/13追加
                if '稍' in text:
                    df["ground_state"] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            # horse_id
            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class': 'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)
            # jockey_id
            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class': 'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(jockey_id)
            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
            data = pd.concat([data, df])
        return cls(data)
             
    #前処理            
    def preprocessing(self):
        df = self.data.copy()
        
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df = df[df["馬体重(増減)"] != '--']
        df["体重"] = df["馬体重(増減)"].str.split("(", expand=True)[0].astype(int)
        df["体重変化"] = df["馬体重(増減)"].str.split("(", expand=True)[1].str[:-1]
        # 2020/12/13追加：増減が「前計不」などのとき欠損値にする
        df['体重変化'] = pd.to_numeric(df['体重変化'], errors='coerce')
        
        df["date"] = pd.to_datetime(df["date"])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        df['開催'] = df.index.map(lambda x:str(x)[4:6])
        
        #6/6出走数追加
        df['n_horses'] = df.index.map(df.index.value_counts())

        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 使用する列を選択
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
       '体重', '体重変化', '開催', 'n_horses']]
        
        self.data_p = df.rename(columns={'枠': '枠番'})

In [5]:
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離は10の位を切り捨てる
        #一部の馬で欠損値があり、intに変換できないためfloatに変換する
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(float) // 100
        df.drop(['距離'], axis=1, inplace=True)
        #インデックス名を与える
        df.index.name = 'horse_id'
        
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
    
    #n_samplesレース分馬ごとに平均する
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
        
        #集計して辞書型に入れる
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))

        #6/6追加: 馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')

        #6/6追加：馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            merged_df = merged_df.merge(self.latest, left_on='horse_id',
                             right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

#開催場所をidに変換するための辞書型
place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

#レースタイプをレース結果データと整合させるための辞書型
race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}

In [6]:
class Peds:
    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame() #after label encoding and transforming into category
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df
    
    def encode(self):
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')

In [7]:
def update_data(old, new):
    """
    Parameters:
    ----------
    old : pandas.DataFrame
        古いデータ
    new : pandas.DataFrame
        新しいデータ
    """

    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old, new])

In [8]:
hr = HorseResults.read_pickle(['beta/horse_result.pickle'])
# r.merge_horse_results(hr, n_samples_list=[5, 9, 'all'])
# r.data_h.head() #jupyterで出力

In [9]:
p = Peds.read_pickle(['beta/horse_peds.pickle'])
p.encode()
p.peds_e #jupyterで出力

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2015100713,540,9496,218,487,821,7111,50,224,321,598,...,194,906,273,508,716,1293,343,430,1003,1290
2015103211,787,6004,202,432,416,7004,63,270,326,597,...,285,331,28,305,385,96,117,1231,832,3001
2015103578,786,1061,202,432,930,8318,63,270,326,597,...,76,612,135,523,455,554,343,690,1720,4388
2015101520,707,4170,218,427,743,4208,50,224,9,54,...,165,861,29,291,438,309,343,1425,51,2591
2015101217,819,3347,204,330,823,5310,63,2,202,130,...,456,45,135,523,455,554,633,1510,1582,3954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020106734,734,10852,44,94,87,3593,32,229,95,196,...,27,398,172,170,514,752,121,202,133,1675
2020104905,519,8637,270,530,1022,3597,140,53,298,638,...,249,536,85,54,209,424,346,1549,149,234
2020103477,764,7221,227,657,1012,9219,132,275,284,663,...,401,109,135,221,534,985,285,686,189,1477
2020100674,774,4729,202,515,977,4836,63,270,309,540,...,64,364,85,54,209,424,443,592,832,2203


In [10]:
r = Results.read_pickle(['beta/race.pickle'])
r.preprocessing()
r.data_p.head() #jupyterで出力

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,開催,n_horses
201701010101,3,3,54.0,3.0,18.0,晴,芝,良,2017-07-29,2015100713,1091,1,牝,2,406,-4,1,7
201701010101,5,5,54.0,1.5,18.0,晴,芝,良,2017-07-29,2015103211,5339,1,牡,2,484,-4,1,7
201701010101,7,7,54.0,6.2,18.0,晴,芝,良,2017-07-29,2015103578,1014,1,牡,2,450,-16,1,7
201701010101,1,1,54.0,31.1,18.0,晴,芝,良,2017-07-29,2015101520,663,0,牡,2,442,-10,1,7
201701010101,2,2,53.0,22.8,18.0,晴,芝,良,2017-07-29,2015101217,1153,0,牡,2,472,-18,1,7


In [11]:
r.merge_horse_results(hr, n_samples_list=[5, 9, 'all'])
r.data_h.head() #jupyterで出力

  0%|          | 0/671 [00:00<?, ?it/s]

  0%|          | 0/671 [00:00<?, ?it/s]

  0%|          | 0/671 [00:00<?, ?it/s]

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,final_to_rank_race_type_allR,着順_開催_allR,賞金_開催_allR,着差_開催_allR,first_corner_開催_allR,final_corner_開催_allR,first_to_rank_開催_allR,first_to_final_開催_allR,final_to_rank_開催_allR,interval
201701010101,3,3,54.0,3.0,18.0,晴,芝,良,2017-07-29,2015100713,...,-0.5,,,,,,,,,20.0
201701010101,5,5,54.0,1.5,18.0,晴,芝,良,2017-07-29,2015103211,...,3.5,,,,,,,,,35.0
201701010101,7,7,54.0,6.2,18.0,晴,芝,良,2017-07-29,2015103578,...,-1.0,,,,,,,,,14.0
201701010101,1,1,54.0,31.1,18.0,晴,芝,良,2017-07-29,2015101520,...,-1.0,,,,,,,,,13.0
201701010101,2,2,53.0,22.8,18.0,晴,芝,良,2017-07-29,2015101217,...,-2.0,,,,,,,,,27.0


In [12]:
r.merge_peds(p.peds_e)
r.data_pe.head() #jupyterで出力

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
201701010101,3,3,54.0,3.0,18.0,晴,芝,良,2017-07-29,2015100713,...,194,906,273,508,716,1293,343,430,1003,1290
201701010101,5,5,54.0,1.5,18.0,晴,芝,良,2017-07-29,2015103211,...,285,331,28,305,385,96,117,1231,832,3001
201701010101,7,7,54.0,6.2,18.0,晴,芝,良,2017-07-29,2015103578,...,76,612,135,523,455,554,343,690,1720,4388
201701010101,1,1,54.0,31.1,18.0,晴,芝,良,2017-07-29,2015101520,...,165,861,29,291,438,309,343,1425,51,2591
201701010101,2,2,53.0,22.8,18.0,晴,芝,良,2017-07-29,2015101217,...,456,45,135,523,455,554,633,1510,1582,3954


In [13]:
r.process_categorical() #r.le_horse, r.le_jockeyに対応関係が保存される

In [14]:
#時系列に沿って訓練データとテストデータに分ける関数
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_values("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df.loc[train_id_list]
    test = df.loc[test_id_list]
    return train, test

In [15]:
train, test = split_data(r.data_c)


In [16]:
print(list(r.data_c.columns))

['枠番', '馬番', '斤量', '単勝', 'course_len', 'date', 'horse_id', 'jockey_id', 'rank', '年齢', '体重', '体重変化', 'n_horses', '着順_5R', '賞金_5R', '着差_5R', 'first_corner_5R', 'final_corner_5R', 'first_to_rank_5R', 'first_to_final_5R', 'final_to_rank_5R', '着順_course_len_5R', '賞金_course_len_5R', '着差_course_len_5R', 'first_corner_course_len_5R', 'final_corner_course_len_5R', 'first_to_rank_course_len_5R', 'first_to_final_course_len_5R', 'final_to_rank_course_len_5R', '着順_race_type_5R', '賞金_race_type_5R', '着差_race_type_5R', 'first_corner_race_type_5R', 'final_corner_race_type_5R', 'first_to_rank_race_type_5R', 'first_to_final_race_type_5R', 'final_to_rank_race_type_5R', '着順_開催_5R', '賞金_開催_5R', '着差_開催_5R', 'first_corner_開催_5R', 'final_corner_開催_5R', 'first_to_rank_開催_5R', 'first_to_final_開催_5R', 'final_to_rank_開催_5R', '着順_9R', '賞金_9R', '着差_9R', 'first_corner_9R', 'final_corner_9R', 'first_to_rank_9R', 'first_to_final_9R', 'final_to_rank_9R', '着順_course_len_9R', '賞金_course_len_9R', '着差_course_len_9R', 'first

In [17]:
train, valid = split_data(train)

#説明変数と目的変数に分ける。dateはこの後不要なので省く。単勝オッズも学習時には使わない。
X_train = train.drop(['rank', 'date', '単勝'], axis=1)
y_train = train['rank']
X_valid = valid.drop(['rank', 'date', '単勝'], axis=1)
y_valid = valid['rank']

In [18]:
#データセットを作成
lgb_train = lgb_o.Dataset(X_train.values, y_train.values)
lgb_valid = lgb_o.Dataset(X_valid.values, y_valid.values)

params = {
    'objective': 'binary', #今回は0or1の二値予測なのでbinaryを指定
    'random_state': 100
}

#チューニング実行
lgb_clf_o = lgb_o.train(params, lgb_train,
                        valid_sets=(lgb_train, lgb_valid),
                        verbose_eval=100,
                        early_stopping_rounds=10,
                        optuna_seed=100 #optunaのseed固定
                        )

[32m[I 2023-06-16 21:13:50,312][0m A new study created in memory with name: no-name-6d10f714-636f-4194-b1fa-e0905815a1c4[0m


[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.458954:  14%|#4        | 1/7 [00:11<01:09, 11.57s/it][32m[I 2023-06-16 21:14:01,901][0m Trial 0 finished with value: 0.4589541275278615 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4589541275278615.[0m
feature_fraction, val_score: 0.458954:  14%|#4        | 1/7 [00:11<01:09, 11.57s/it]

Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.429877	valid_1's binary_logloss: 0.458954
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.458954:  29%|##8       | 2/7 [00:21<00:52, 10.50s/it][32m[I 2023-06-16 21:14:11,657][0m Trial 1 finished with value: 0.4590172981746239 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.4589541275278615.[0m
feature_fraction, val_score: 0.458954:  29%|##8       | 2/7 [00:21<00:52, 10.50s/it]

[100]	valid_0's binary_logloss: 0.426213	valid_1's binary_logloss: 0.45905
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.428283	valid_1's binary_logloss: 0.459017
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425878	valid_1's binary_logloss: 0.458622


feature_fraction, val_score: 0.458377:  43%|####2     | 3/7 [00:32<00:42, 10.68s/it][32m[I 2023-06-16 21:14:22,554][0m Trial 2 finished with value: 0.45837705814437285 and parameters: {'feature_fraction': 0.8}. Best is trial 2 with value: 0.45837705814437285.[0m
feature_fraction, val_score: 0.458377:  43%|####2     | 3/7 [00:32<00:42, 10.68s/it]

Early stopping, best iteration is:
[116]	valid_0's binary_logloss: 0.421869	valid_1's binary_logloss: 0.458377
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.42761	valid_1's binary_logloss: 0.458106


feature_fraction, val_score: 0.457693:  57%|#####7    | 4/7 [00:41<00:30, 10.02s/it][32m[I 2023-06-16 21:14:31,566][0m Trial 3 finished with value: 0.4576925443720778 and parameters: {'feature_fraction': 0.4}. Best is trial 3 with value: 0.4576925443720778.[0m
feature_fraction, val_score: 0.457693:  57%|#####7    | 4/7 [00:41<00:30, 10.02s/it]

Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.421155	valid_1's binary_logloss: 0.457693
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


feature_fraction, val_score: 0.457693:  71%|#######1  | 5/7 [00:50<00:19,  9.89s/it][32m[I 2023-06-16 21:14:41,224][0m Trial 4 finished with value: 0.4585877919892416 and parameters: {'feature_fraction': 0.6}. Best is trial 3 with value: 0.4576925443720778.[0m
feature_fraction, val_score: 0.457693:  71%|#######1  | 5/7 [00:50<00:19,  9.89s/it]

[100]	valid_0's binary_logloss: 0.426761	valid_1's binary_logloss: 0.458621
Early stopping, best iteration is:
[90]	valid_0's binary_logloss: 0.429247	valid_1's binary_logloss: 0.458588
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426989	valid_1's binary_logloss: 0.458641


feature_fraction, val_score: 0.457693:  86%|########5 | 6/7 [01:01<00:10, 10.15s/it][32m[I 2023-06-16 21:14:51,882][0m Trial 5 finished with value: 0.4584926733800296 and parameters: {'feature_fraction': 0.5}. Best is trial 3 with value: 0.4576925443720778.[0m
feature_fraction, val_score: 0.457693:  86%|########5 | 6/7 [01:01<00:10, 10.15s/it]

Early stopping, best iteration is:
[122]	valid_0's binary_logloss: 0.421651	valid_1's binary_logloss: 0.458493
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425286	valid_1's binary_logloss: 0.458539


feature_fraction, val_score: 0.457693: 100%|##########| 7/7 [01:12<00:00, 10.55s/it][32m[I 2023-06-16 21:15:03,266][0m Trial 6 finished with value: 0.458539349234576 and parameters: {'feature_fraction': 1.0}. Best is trial 3 with value: 0.4576925443720778.[0m
feature_fraction, val_score: 0.457693: 100%|##########| 7/7 [01:12<00:00, 10.42s/it]


Early stopping, best iteration is:
[100]	valid_0's binary_logloss: 0.425286	valid_1's binary_logloss: 0.458539


num_leaves, val_score: 0.457693:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457693:   5%|5         | 1/20 [00:09<03:03,  9.67s/it][32m[I 2023-06-16 21:15:12,973][0m Trial 7 finished with value: 0.4599144273855329 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 0.4599144273855329.[0m
num_leaves, val_score: 0.457693:   5%|5         | 1/20 [00:09<03:03,  9.67s/it]

Early stopping, best iteration is:
[52]	valid_0's binary_logloss: 0.395498	valid_1's binary_logloss: 0.459914
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457693:  10%|#         | 2/20 [00:19<02:53,  9.62s/it][32m[I 2023-06-16 21:15:22,557][0m Trial 8 finished with value: 0.4585164727400624 and parameters: {'num_leaves': 72}. Best is trial 8 with value: 0.4585164727400624.[0m
num_leaves, val_score: 0.457693:  10%|#         | 2/20 [00:19<02:53,  9.62s/it]

Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.405391	valid_1's binary_logloss: 0.458516
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457693:  15%|#5        | 3/20 [00:29<02:50, 10.05s/it][32m[I 2023-06-16 21:15:33,104][0m Trial 9 finished with value: 0.45913518773477074 and parameters: {'num_leaves': 110}. Best is trial 8 with value: 0.4585164727400624.[0m
num_leaves, val_score: 0.457693:  15%|#5        | 3/20 [00:29<02:50, 10.05s/it]

Early stopping, best iteration is:
[79]	valid_0's binary_logloss: 0.385587	valid_1's binary_logloss: 0.459135
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457693:  20%|##        | 4/20 [00:41<02:50, 10.64s/it][32m[I 2023-06-16 21:15:44,642][0m Trial 10 finished with value: 0.46179906327548953 and parameters: {'num_leaves': 217}. Best is trial 8 with value: 0.4585164727400624.[0m
num_leaves, val_score: 0.457693:  20%|##        | 4/20 [00:41<02:50, 10.64s/it]

Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.352871	valid_1's binary_logloss: 0.461799
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.462357	valid_1's binary_logloss: 0.470007
[200]	valid_0's binary_logloss: 0.457263	valid_1's binary_logloss: 0.466046
[300]	valid_0's binary_logloss: 0.454291	valid_1's binary_logloss: 0.464253
[400]	valid_0's binary_logloss: 0.451814	valid_1's binary_logloss: 0.462785
[500]	valid_0's binary_logloss: 0.449895	valid_1's binary_logloss: 0.46187
[600]	valid_0's binary_logloss: 0.448224	valid_1's binar

num_leaves, val_score: 0.457693:  25%|##5       | 5/20 [00:55<03:00, 12.03s/it][32m[I 2023-06-16 21:15:59,132][0m Trial 11 finished with value: 0.4600577671377222 and parameters: {'num_leaves': 3}. Best is trial 8 with value: 0.4585164727400624.[0m
num_leaves, val_score: 0.457693:  25%|##5       | 5/20 [00:55<03:00, 12.03s/it]

Early stopping, best iteration is:
[763]	valid_0's binary_logloss: 0.445734	valid_1's binary_logloss: 0.460058
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425902	valid_1's binary_logloss: 0.457852


num_leaves, val_score: 0.457596:  30%|###       | 6/20 [01:05<02:36, 11.18s/it][32m[I 2023-06-16 21:16:08,682][0m Trial 12 finished with value: 0.4575955806949461 and parameters: {'num_leaves': 33}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  30%|###       | 6/20 [01:05<02:36, 11.18s/it]

Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.419659	valid_1's binary_logloss: 0.457596
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  35%|###5      | 7/20 [01:16<02:24, 11.12s/it][32m[I 2023-06-16 21:16:19,661][0m Trial 13 finished with value: 0.4601438664506896 and parameters: {'num_leaves': 173}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  35%|###5      | 7/20 [01:16<02:24, 11.12s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.365049	valid_1's binary_logloss: 0.460144
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  40%|####      | 8/20 [01:26<02:10, 10.89s/it][32m[I 2023-06-16 21:16:30,052][0m Trial 14 finished with value: 0.46149494536877006 and parameters: {'num_leaves': 212}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  40%|####      | 8/20 [01:26<02:10, 10.89s/it]

Early stopping, best iteration is:
[55]	valid_0's binary_logloss: 0.366881	valid_1's binary_logloss: 0.461495
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  45%|####5     | 9/20 [01:36<01:54, 10.38s/it][32m[I 2023-06-16 21:16:39,323][0m Trial 15 finished with value: 0.4585660169279525 and parameters: {'num_leaves': 36}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  45%|####5     | 9/20 [01:36<01:54, 10.38s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.427491	valid_1's binary_logloss: 0.458566
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  50%|#####     | 10/20 [01:46<01:45, 10.51s/it][32m[I 2023-06-16 21:16:50,110][0m Trial 16 finished with value: 0.46008439717671856 and parameters: {'num_leaves': 148}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  50%|#####     | 10/20 [01:46<01:45, 10.51s/it]

Early stopping, best iteration is:
[71]	valid_0's binary_logloss: 0.373125	valid_1's binary_logloss: 0.460084
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.39902	valid_1's binary_logloss: 0.458601


num_leaves, val_score: 0.457596:  55%|#####5    | 11/20 [01:56<01:33, 10.37s/it][32m[I 2023-06-16 21:17:00,174][0m Trial 17 finished with value: 0.45855884159368004 and parameters: {'num_leaves': 68}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  55%|#####5    | 11/20 [01:56<01:33, 10.37s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.399499	valid_1's binary_logloss: 0.458559
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  60%|######    | 12/20 [02:06<01:20, 10.10s/it][32m[I 2023-06-16 21:17:09,666][0m Trial 18 finished with value: 0.4586794115388186 and parameters: {'num_leaves': 77}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  60%|######    | 12/20 [02:06<01:20, 10.10s/it]

Early stopping, best iteration is:
[74]	valid_0's binary_logloss: 0.40626	valid_1's binary_logloss: 0.458679
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.467142	valid_1's binary_logloss: 0.474037
[200]	valid_0's binary_logloss: 0.463187	valid_1's binary_logloss: 0.470723
[300]	valid_0's binary_logloss: 0.460947	valid_1's binary_logloss: 0.468932
[400]	valid_0's binary_logloss: 0.459439	valid_1's binary_logloss: 0.467787
[500]	valid_0's binary_logloss: 0.458326	valid_1's binary_logloss: 0.467064
[600]	valid_0's binary_logloss: 0.457458	valid_1's binar

num_leaves, val_score: 0.457596:  65%|######5   | 13/20 [02:18<01:13, 10.57s/it][32m[I 2023-06-16 21:17:21,296][0m Trial 19 finished with value: 0.46601682490064955 and parameters: {'num_leaves': 2}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  65%|######5   | 13/20 [02:18<01:13, 10.57s/it]

Early stopping, best iteration is:
[681]	valid_0's binary_logloss: 0.456872	valid_1's binary_logloss: 0.466017
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.39902	valid_1's binary_logloss: 0.458601


num_leaves, val_score: 0.457596:  70%|#######   | 14/20 [02:27<01:02, 10.38s/it][32m[I 2023-06-16 21:17:31,248][0m Trial 20 finished with value: 0.45855884159368004 and parameters: {'num_leaves': 68}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  70%|#######   | 14/20 [02:27<01:02, 10.38s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.399499	valid_1's binary_logloss: 0.458559
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  75%|#######5  | 15/20 [02:37<00:51, 10.27s/it][32m[I 2023-06-16 21:17:41,279][0m Trial 21 finished with value: 0.45909827120436686 and parameters: {'num_leaves': 102}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  75%|#######5  | 15/20 [02:38<00:51, 10.27s/it]

Early stopping, best iteration is:
[76]	valid_0's binary_logloss: 0.391995	valid_1's binary_logloss: 0.459098
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.421958	valid_1's binary_logloss: 0.457871


num_leaves, val_score: 0.457596:  80%|########  | 16/20 [02:47<00:39,  9.99s/it][32m[I 2023-06-16 21:17:50,605][0m Trial 22 finished with value: 0.45762608106512864 and parameters: {'num_leaves': 38}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  80%|########  | 16/20 [02:47<00:39,  9.99s/it]

Early stopping, best iteration is:
[115]	valid_0's binary_logloss: 0.417906	valid_1's binary_logloss: 0.457626
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.428257	valid_1's binary_logloss: 0.457767


num_leaves, val_score: 0.457596:  85%|########5 | 17/20 [02:56<00:29,  9.80s/it][32m[I 2023-06-16 21:17:59,965][0m Trial 23 finished with value: 0.45765662929498974 and parameters: {'num_leaves': 30}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  85%|########5 | 17/20 [02:56<00:29,  9.80s/it]

Early stopping, best iteration is:
[111]	valid_0's binary_logloss: 0.425802	valid_1's binary_logloss: 0.457657
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425902	valid_1's binary_logloss: 0.457852


num_leaves, val_score: 0.457596:  90%|######### | 18/20 [03:06<00:19,  9.79s/it][32m[I 2023-06-16 21:18:09,731][0m Trial 24 finished with value: 0.4575955806949461 and parameters: {'num_leaves': 33}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  90%|######### | 18/20 [03:06<00:19,  9.79s/it]

Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.419659	valid_1's binary_logloss: 0.457596
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596:  95%|#########5| 19/20 [03:16<00:09,  9.88s/it][32m[I 2023-06-16 21:18:19,815][0m Trial 25 finished with value: 0.46043019605780844 and parameters: {'num_leaves': 177}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596:  95%|#########5| 19/20 [03:16<00:09,  9.88s/it]

Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.376181	valid_1's binary_logloss: 0.46043
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


num_leaves, val_score: 0.457596: 100%|##########| 20/20 [03:25<00:00,  9.73s/it][32m[I 2023-06-16 21:18:29,205][0m Trial 26 finished with value: 0.4582499348456999 and parameters: {'num_leaves': 45}. Best is trial 12 with value: 0.4575955806949461.[0m
num_leaves, val_score: 0.457596: 100%|##########| 20/20 [03:25<00:00, 10.30s/it]


[100]	valid_0's binary_logloss: 0.416287	valid_1's binary_logloss: 0.458329
Early stopping, best iteration is:
[95]	valid_0's binary_logloss: 0.418	valid_1's binary_logloss: 0.45825


bagging, val_score: 0.457596:   0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425642	valid_1's binary_logloss: 0.45891


bagging, val_score: 0.457596:  10%|#         | 1/10 [00:09<01:26,  9.58s/it][32m[I 2023-06-16 21:18:38,815][0m Trial 27 finished with value: 0.45837086303089986 and parameters: {'bagging_fraction': 0.7260429650751228, 'bagging_freq': 2}. Best is trial 27 with value: 0.45837086303089986.[0m
bagging, val_score: 0.457596:  10%|#         | 1/10 [00:09<01:26,  9.58s/it]

Early stopping, best iteration is:
[131]	valid_0's binary_logloss: 0.417894	valid_1's binary_logloss: 0.458371
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425772	valid_1's binary_logloss: 0.459469


bagging, val_score: 0.457596:  20%|##        | 2/10 [00:18<01:11,  8.98s/it][32m[I 2023-06-16 21:18:47,374][0m Trial 28 finished with value: 0.45927097596025246 and parameters: {'bagging_fraction': 0.6547105544499044, 'bagging_freq': 6}. Best is trial 27 with value: 0.45837086303089986.[0m
bagging, val_score: 0.457596:  20%|##        | 2/10 [00:18<01:11,  8.98s/it]

Early stopping, best iteration is:
[109]	valid_0's binary_logloss: 0.423314	valid_1's binary_logloss: 0.459271
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.457596:  30%|###       | 3/10 [00:25<00:57,  8.28s/it][32m[I 2023-06-16 21:18:54,819][0m Trial 29 finished with value: 0.462087635108612 and parameters: {'bagging_fraction': 0.4028313137145883, 'bagging_freq': 1}. Best is trial 27 with value: 0.45837086303089986.[0m
bagging, val_score: 0.457596:  30%|###       | 3/10 [00:25<00:57,  8.28s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.434405	valid_1's binary_logloss: 0.462088
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426084	valid_1's binary_logloss: 0.458724


bagging, val_score: 0.457596:  40%|####      | 4/10 [00:34<00:52,  8.71s/it][32m[I 2023-06-16 21:19:04,183][0m Trial 30 finished with value: 0.4584386289675136 and parameters: {'bagging_fraction': 0.802449450836738, 'bagging_freq': 6}. Best is trial 27 with value: 0.45837086303089986.[0m
bagging, val_score: 0.457596:  40%|####      | 4/10 [00:34<00:52,  8.71s/it]

Early stopping, best iteration is:
[111]	valid_0's binary_logloss: 0.423242	valid_1's binary_logloss: 0.458439
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.42683	valid_1's binary_logloss: 0.460369


bagging, val_score: 0.457596:  50%|#####     | 5/10 [00:43<00:43,  8.73s/it][32m[I 2023-06-16 21:19:12,959][0m Trial 31 finished with value: 0.4601910265662322 and parameters: {'bagging_fraction': 0.4820239538111085, 'bagging_freq': 5}. Best is trial 27 with value: 0.45837086303089986.[0m
bagging, val_score: 0.457596:  50%|#####     | 5/10 [00:43<00:43,  8.73s/it]

Early stopping, best iteration is:
[107]	valid_0's binary_logloss: 0.424915	valid_1's binary_logloss: 0.460191
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.42561	valid_1's binary_logloss: 0.458075


bagging, val_score: 0.457596:  60%|######    | 6/10 [00:53<00:35,  8.97s/it][32m[I 2023-06-16 21:19:22,398][0m Trial 32 finished with value: 0.45768536981918356 and parameters: {'bagging_fraction': 0.9347931725882498, 'bagging_freq': 2}. Best is trial 32 with value: 0.45768536981918356.[0m
bagging, val_score: 0.457596:  60%|######    | 6/10 [00:53<00:35,  8.97s/it]

Early stopping, best iteration is:
[111]	valid_0's binary_logloss: 0.42275	valid_1's binary_logloss: 0.457685
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.457596:  70%|#######   | 7/10 [01:01<00:26,  8.72s/it][32m[I 2023-06-16 21:19:30,586][0m Trial 33 finished with value: 0.4601640585376917 and parameters: {'bagging_fraction': 0.5111969317302304, 'bagging_freq': 1}. Best is trial 32 with value: 0.45768536981918356.[0m
bagging, val_score: 0.457596:  70%|#######   | 7/10 [01:01<00:26,  8.72s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.432699	valid_1's binary_logloss: 0.460164
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


bagging, val_score: 0.457596:  80%|########  | 8/10 [01:09<00:17,  8.67s/it][32m[I 2023-06-16 21:19:39,164][0m Trial 34 finished with value: 0.4606432271156078 and parameters: {'bagging_fraction': 0.531818495575215, 'bagging_freq': 7}. Best is trial 32 with value: 0.45768536981918356.[0m
bagging, val_score: 0.457596:  80%|########  | 8/10 [01:09<00:17,  8.67s/it]

[100]	valid_0's binary_logloss: 0.426458	valid_1's binary_logloss: 0.460729
Early stopping, best iteration is:
[90]	valid_0's binary_logloss: 0.429229	valid_1's binary_logloss: 0.460643
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425979	valid_1's binary_logloss: 0.458639


bagging, val_score: 0.457596:  90%|######### | 9/10 [01:19<00:08,  8.89s/it][32m[I 2023-06-16 21:19:48,535][0m Trial 35 finished with value: 0.45805012402168344 and parameters: {'bagging_fraction': 0.8870098894544057, 'bagging_freq': 2}. Best is trial 32 with value: 0.45768536981918356.[0m
bagging, val_score: 0.457596:  90%|######### | 9/10 [01:19<00:08,  8.89s/it]

Early stopping, best iteration is:
[111]	valid_0's binary_logloss: 0.423046	valid_1's binary_logloss: 0.45805
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425918	valid_1's binary_logloss: 0.458405


bagging, val_score: 0.457596: 100%|##########| 10/10 [01:28<00:00,  9.04s/it][32m[I 2023-06-16 21:19:57,907][0m Trial 36 finished with value: 0.4583661514224208 and parameters: {'bagging_fraction': 0.8897348492363203, 'bagging_freq': 2}. Best is trial 32 with value: 0.45768536981918356.[0m
bagging, val_score: 0.457596: 100%|##########| 10/10 [01:28<00:00,  8.87s/it]


Early stopping, best iteration is:
[101]	valid_0's binary_logloss: 0.425609	valid_1's binary_logloss: 0.458366


feature_fraction_stage2, val_score: 0.457596:   0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425707	valid_1's binary_logloss: 0.457974


feature_fraction_stage2, val_score: 0.457377:  33%|###3      | 1/3 [00:09<00:19,  9.71s/it][32m[I 2023-06-16 21:20:07,642][0m Trial 37 finished with value: 0.4573770317493555 and parameters: {'feature_fraction': 0.41600000000000004}. Best is trial 37 with value: 0.4573770317493555.[0m
feature_fraction_stage2, val_score: 0.457377:  33%|###3      | 1/3 [00:09<00:19,  9.71s/it]

Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.41849	valid_1's binary_logloss: 0.457377
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425395	valid_1's binary_logloss: 0.457707
Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.424579	valid_1's binary_logloss: 0.457704


feature_fraction_stage2, val_score: 0.457377:  67%|######6   | 2/3 [00:18<00:08,  8.90s/it][32m[I 2023-06-16 21:20:15,979][0m Trial 38 finished with value: 0.4577042071568392 and parameters: {'feature_fraction': 0.44800000000000006}. Best is trial 37 with value: 0.4573770317493555.[0m
feature_fraction_stage2, val_score: 0.457377:  67%|######6   | 2/3 [00:18<00:08,  8.90s/it]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


feature_fraction_stage2, val_score: 0.457377: 100%|##########| 3/3 [00:25<00:00,  8.28s/it][32m[I 2023-06-16 21:20:23,509][0m Trial 39 finished with value: 0.45815976252175 and parameters: {'feature_fraction': 0.48000000000000004}. Best is trial 37 with value: 0.4573770317493555.[0m
feature_fraction_stage2, val_score: 0.457377: 100%|##########| 3/3 [00:25<00:00,  8.53s/it]


Early stopping, best iteration is:
[83]	valid_0's binary_logloss: 0.430033	valid_1's binary_logloss: 0.45816


regularization_factors, val_score: 0.457377:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425765	valid_1's binary_logloss: 0.458042


regularization_factors, val_score: 0.457309:   5%|5         | 1/20 [00:08<02:46,  8.79s/it][32m[I 2023-06-16 21:20:32,324][0m Trial 40 finished with value: 0.4573093799131221 and parameters: {'lambda_l1': 0.0007773998922821829, 'lambda_l2': 3.2012859298995277e-06}. Best is trial 40 with value: 0.4573093799131221.[0m
regularization_factors, val_score: 0.457309:   5%|5         | 1/20 [00:08<02:46,  8.79s/it]

Early stopping, best iteration is:
[145]	valid_0's binary_logloss: 0.414766	valid_1's binary_logloss: 0.457309
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426427	valid_1's binary_logloss: 0.45734


regularization_factors, val_score: 0.457106:  10%|#         | 2/20 [00:17<02:35,  8.62s/it][32m[I 2023-06-16 21:20:40,818][0m Trial 41 finished with value: 0.45710606817445343 and parameters: {'lambda_l1': 6.616957066014342e-05, 'lambda_l2': 0.400853048601546}. Best is trial 41 with value: 0.45710606817445343.[0m
regularization_factors, val_score: 0.457106:  10%|#         | 2/20 [00:17<02:35,  8.62s/it]

Early stopping, best iteration is:
[120]	valid_0's binary_logloss: 0.421559	valid_1's binary_logloss: 0.457106
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425707	valid_1's binary_logloss: 0.457974


regularization_factors, val_score: 0.457106:  15%|#5        | 3/20 [00:25<02:22,  8.40s/it][32m[I 2023-06-16 21:20:48,951][0m Trial 42 finished with value: 0.4573770550368445 and parameters: {'lambda_l1': 1.1027313099672533e-08, 'lambda_l2': 1.242001404761155e-07}. Best is trial 41 with value: 0.45710606817445343.[0m
regularization_factors, val_score: 0.457106:  15%|#5        | 3/20 [00:25<02:22,  8.40s/it]

Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.41849	valid_1's binary_logloss: 0.457377
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426174	valid_1's binary_logloss: 0.457945


regularization_factors, val_score: 0.457106:  20%|##        | 4/20 [00:34<02:16,  8.54s/it][32m[I 2023-06-16 21:20:57,717][0m Trial 43 finished with value: 0.4577240336423796 and parameters: {'lambda_l1': 0.010882827930218712, 'lambda_l2': 0.2708162972907513}. Best is trial 41 with value: 0.45710606817445343.[0m
regularization_factors, val_score: 0.457106:  20%|##        | 4/20 [00:34<02:16,  8.54s/it]

Early stopping, best iteration is:
[113]	valid_0's binary_logloss: 0.423073	valid_1's binary_logloss: 0.457724
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.457106:  25%|##5       | 5/20 [00:41<02:02,  8.19s/it][32m[I 2023-06-16 21:21:05,281][0m Trial 44 finished with value: 0.4582431359878819 and parameters: {'lambda_l1': 1.6996492507894156e-07, 'lambda_l2': 0.0014991323116035308}. Best is trial 41 with value: 0.45710606817445343.[0m
regularization_factors, val_score: 0.457106:  25%|##5       | 5/20 [00:41<02:02,  8.19s/it]

Early stopping, best iteration is:
[76]	valid_0's binary_logloss: 0.432197	valid_1's binary_logloss: 0.458243
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426341	valid_1's binary_logloss: 0.457181


regularization_factors, val_score: 0.456989:  30%|###       | 6/20 [00:50<01:57,  8.36s/it][32m[I 2023-06-16 21:21:13,980][0m Trial 45 finished with value: 0.4569889597973449 and parameters: {'lambda_l1': 1.0517138394360073, 'lambda_l2': 7.635176818135586e-07}. Best is trial 45 with value: 0.4569889597973449.[0m
regularization_factors, val_score: 0.456989:  30%|###       | 6/20 [00:50<01:57,  8.36s/it]

Early stopping, best iteration is:
[115]	valid_0's binary_logloss: 0.42274	valid_1's binary_logloss: 0.456989
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425707	valid_1's binary_logloss: 0.457973


regularization_factors, val_score: 0.456989:  35%|###5      | 7/20 [00:59<01:50,  8.47s/it][32m[I 2023-06-16 21:21:22,680][0m Trial 46 finished with value: 0.4573784540545382 and parameters: {'lambda_l1': 4.655367559816141e-07, 'lambda_l2': 9.449134137745608e-08}. Best is trial 45 with value: 0.4569889597973449.[0m
regularization_factors, val_score: 0.456989:  35%|###5      | 7/20 [00:59<01:50,  8.47s/it]

Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.41849	valid_1's binary_logloss: 0.457378
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.428023	valid_1's binary_logloss: 0.457579


regularization_factors, val_score: 0.456982:  40%|####      | 8/20 [01:07<01:43,  8.59s/it][32m[I 2023-06-16 21:21:31,525][0m Trial 47 finished with value: 0.45698150883851635 and parameters: {'lambda_l1': 9.490245203532942e-07, 'lambda_l2': 6.421168438428032}. Best is trial 47 with value: 0.45698150883851635.[0m
regularization_factors, val_score: 0.456982:  40%|####      | 8/20 [01:08<01:43,  8.59s/it]

Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.422087	valid_1's binary_logloss: 0.456982
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.42626	valid_1's binary_logloss: 0.458187


regularization_factors, val_score: 0.456982:  45%|####5     | 9/20 [01:16<01:33,  8.49s/it][32m[I 2023-06-16 21:21:39,784][0m Trial 48 finished with value: 0.45768057293913356 and parameters: {'lambda_l1': 0.2019055894080857, 'lambda_l2': 3.5275169933928286e-07}. Best is trial 47 with value: 0.45698150883851635.[0m
regularization_factors, val_score: 0.456982:  45%|####5     | 9/20 [01:16<01:33,  8.49s/it]

Early stopping, best iteration is:
[130]	valid_0's binary_logloss: 0.418803	valid_1's binary_logloss: 0.457681
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.426003	valid_1's binary_logloss: 0.458173


regularization_factors, val_score: 0.456982:  50%|#####     | 10/20 [01:24<01:25,  8.54s/it][32m[I 2023-06-16 21:21:48,427][0m Trial 49 finished with value: 0.45777922863647086 and parameters: {'lambda_l1': 0.22183125618514202, 'lambda_l2': 2.9286247167445133e-06}. Best is trial 47 with value: 0.45698150883851635.[0m
regularization_factors, val_score: 0.456982:  50%|#####     | 10/20 [01:24<01:25,  8.54s/it]

Early stopping, best iteration is:
[139]	valid_0's binary_logloss: 0.416569	valid_1's binary_logloss: 0.457779
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.428578	valid_1's binary_logloss: 0.457559


regularization_factors, val_score: 0.456982:  55%|#####5    | 11/20 [01:33<01:18,  8.68s/it][32m[I 2023-06-16 21:21:57,441][0m Trial 50 finished with value: 0.457220615409413 and parameters: {'lambda_l1': 3.0057843641607915e-05, 'lambda_l2': 8.38297710342227}. Best is trial 47 with value: 0.45698150883851635.[0m
regularization_factors, val_score: 0.456982:  55%|#####5    | 11/20 [01:33<01:18,  8.68s/it]

Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.424051	valid_1's binary_logloss: 0.457221
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.430216	valid_1's binary_logloss: 0.457314


regularization_factors, val_score: 0.456655:  60%|######    | 12/20 [01:43<01:11,  8.90s/it][32m[I 2023-06-16 21:22:06,843][0m Trial 51 finished with value: 0.45665487605444693 and parameters: {'lambda_l1': 9.113641784001668, 'lambda_l2': 7.523632870105917e-05}. Best is trial 51 with value: 0.45665487605444693.[0m
regularization_factors, val_score: 0.456655:  60%|######    | 12/20 [01:43<01:11,  8.90s/it]

Early stopping, best iteration is:
[159]	valid_0's binary_logloss: 0.419499	valid_1's binary_logloss: 0.456655
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.428073	valid_1's binary_logloss: 0.457375


regularization_factors, val_score: 0.456536:  65%|######5   | 13/20 [01:52<01:03,  9.00s/it][32m[I 2023-06-16 21:22:16,077][0m Trial 52 finished with value: 0.4565360225003592 and parameters: {'lambda_l1': 3.649389294042618, 'lambda_l2': 0.00012558479384857605}. Best is trial 52 with value: 0.4565360225003592.[0m
regularization_factors, val_score: 0.456536:  65%|######5   | 13/20 [01:52<01:03,  9.00s/it]

Early stopping, best iteration is:
[162]	valid_0's binary_logloss: 0.415102	valid_1's binary_logloss: 0.456536
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.427325	valid_1's binary_logloss: 0.457134


regularization_factors, val_score: 0.456536:  70%|#######   | 14/20 [02:01<00:53,  8.97s/it][32m[I 2023-06-16 21:22:24,958][0m Trial 53 finished with value: 0.45669038044641436 and parameters: {'lambda_l1': 2.2484581452667936, 'lambda_l2': 0.0002539292304807585}. Best is trial 52 with value: 0.4565360225003592.[0m
regularization_factors, val_score: 0.456536:  70%|#######   | 14/20 [02:01<00:53,  8.97s/it]

Early stopping, best iteration is:
[135]	valid_0's binary_logloss: 0.419333	valid_1's binary_logloss: 0.45669
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.429253	valid_1's binary_logloss: 0.457541


regularization_factors, val_score: 0.456536:  75%|#######5  | 15/20 [02:10<00:45,  9.07s/it][32m[I 2023-06-16 21:22:34,304][0m Trial 54 finished with value: 0.4569392392015818 and parameters: {'lambda_l1': 5.673852371775013, 'lambda_l2': 7.767625929948251e-05}. Best is trial 52 with value: 0.4565360225003592.[0m
regularization_factors, val_score: 0.456536:  75%|#######5  | 15/20 [02:10<00:45,  9.07s/it]

Early stopping, best iteration is:
[145]	valid_0's binary_logloss: 0.420011	valid_1's binary_logloss: 0.456939
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.428943	valid_1's binary_logloss: 0.45717


regularization_factors, val_score: 0.456536:  80%|########  | 16/20 [02:19<00:35,  8.91s/it][32m[I 2023-06-16 21:22:42,816][0m Trial 55 finished with value: 0.4566797588091834 and parameters: {'lambda_l1': 5.018847808162718, 'lambda_l2': 1.220872659800592e-08}. Best is trial 52 with value: 0.4565360225003592.[0m
regularization_factors, val_score: 0.456536:  80%|########  | 16/20 [02:19<00:35,  8.91s/it]

Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.424343	valid_1's binary_logloss: 0.45668
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.456536:  85%|########5 | 17/20 [02:27<00:25,  8.62s/it][32m[I 2023-06-16 21:22:50,743][0m Trial 56 finished with value: 0.45787284114078697 and parameters: {'lambda_l1': 0.032488863187699966, 'lambda_l2': 3.4829067233683635e-05}. Best is trial 52 with value: 0.4565360225003592.[0m
regularization_factors, val_score: 0.456536:  85%|########5 | 17/20 [02:27<00:25,  8.62s/it]

[100]	valid_0's binary_logloss: 0.425791	valid_1's binary_logloss: 0.457926
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.427815	valid_1's binary_logloss: 0.457873
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.430241	valid_1's binary_logloss: 0.457011


regularization_factors, val_score: 0.455679:  90%|######### | 18/20 [02:36<00:17,  8.94s/it][32m[I 2023-06-16 21:23:00,452][0m Trial 57 finished with value: 0.4556789783497547 and parameters: {'lambda_l1': 8.955559716312521, 'lambda_l2': 0.002306507164824572}. Best is trial 57 with value: 0.4556789783497547.[0m
regularization_factors, val_score: 0.455679:  90%|######### | 18/20 [02:36<00:17,  8.94s/it]

Early stopping, best iteration is:
[170]	valid_0's binary_logloss: 0.417397	valid_1's binary_logloss: 0.455679
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds


regularization_factors, val_score: 0.455679:  95%|#########5| 19/20 [02:44<00:08,  8.68s/it][32m[I 2023-06-16 21:23:08,524][0m Trial 58 finished with value: 0.45811644266226587 and parameters: {'lambda_l1': 0.38822388291282683, 'lambda_l2': 0.0025617784247268655}. Best is trial 57 with value: 0.4556789783497547.[0m
regularization_factors, val_score: 0.455679:  95%|#########5| 19/20 [02:45<00:08,  8.68s/it]

[100]	valid_0's binary_logloss: 0.426275	valid_1's binary_logloss: 0.458125
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.428295	valid_1's binary_logloss: 0.458116
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.425646	valid_1's binary_logloss: 0.457971


regularization_factors, val_score: 0.455679: 100%|##########| 20/20 [02:53<00:00,  8.65s/it][32m[I 2023-06-16 21:23:17,096][0m Trial 59 finished with value: 0.4573904030837565 and parameters: {'lambda_l1': 0.025948843511080254, 'lambda_l2': 0.005152339738372678}. Best is trial 57 with value: 0.4556789783497547.[0m
regularization_factors, val_score: 0.455679: 100%|##########| 20/20 [02:53<00:00,  8.68s/it]


Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.419513	valid_1's binary_logloss: 0.45739


min_data_in_leaf, val_score: 0.455679:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.4303	valid_1's binary_logloss: 0.457113


min_data_in_leaf, val_score: 0.455679:  20%|##        | 1/5 [00:08<00:33,  8.45s/it][32m[I 2023-06-16 21:23:25,570][0m Trial 60 finished with value: 0.45635055704496147 and parameters: {'min_child_samples': 50}. Best is trial 60 with value: 0.45635055704496147.[0m
min_data_in_leaf, val_score: 0.455679:  20%|##        | 1/5 [00:08<00:33,  8.45s/it]

Early stopping, best iteration is:
[131]	valid_0's binary_logloss: 0.424337	valid_1's binary_logloss: 0.456351
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.430347	valid_1's binary_logloss: 0.457008


min_data_in_leaf, val_score: 0.455679:  40%|####      | 2/5 [00:17<00:25,  8.60s/it][32m[I 2023-06-16 21:23:34,284][0m Trial 61 finished with value: 0.45650412105773047 and parameters: {'min_child_samples': 25}. Best is trial 60 with value: 0.45635055704496147.[0m
min_data_in_leaf, val_score: 0.455679:  40%|####      | 2/5 [00:17<00:25,  8.60s/it]

Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.426004	valid_1's binary_logloss: 0.456504
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.43018	valid_1's binary_logloss: 0.457195


min_data_in_leaf, val_score: 0.455679:  60%|######    | 3/5 [00:26<00:18,  9.04s/it][32m[I 2023-06-16 21:23:43,844][0m Trial 62 finished with value: 0.45625641826781316 and parameters: {'min_child_samples': 5}. Best is trial 62 with value: 0.45625641826781316.[0m
min_data_in_leaf, val_score: 0.455679:  60%|######    | 3/5 [00:26<00:18,  9.04s/it]

Early stopping, best iteration is:
[155]	valid_0's binary_logloss: 0.419743	valid_1's binary_logloss: 0.456256
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.430328	valid_1's binary_logloss: 0.457052


min_data_in_leaf, val_score: 0.455679:  80%|########  | 4/5 [00:35<00:09,  9.01s/it][32m[I 2023-06-16 21:23:52,810][0m Trial 63 finished with value: 0.45612170218943626 and parameters: {'min_child_samples': 100}. Best is trial 63 with value: 0.45612170218943626.[0m
min_data_in_leaf, val_score: 0.455679:  80%|########  | 4/5 [00:35<00:09,  9.01s/it]

Early stopping, best iteration is:
[156]	valid_0's binary_logloss: 0.41993	valid_1's binary_logloss: 0.456122
[LightGBM] [Info] Number of positive: 28756, number of negative: 104111
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34110
[LightGBM] [Info] Number of data points in the train set: 132867, number of used features: 185
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216427 -> initscore=-1.286611
[LightGBM] [Info] Start training from score -1.286611
Training until validation scores don't improve for 10 rounds
[100]	valid_0's binary_logloss: 0.43018	valid_1's binary_logloss: 0.457195


min_data_in_leaf, val_score: 0.455679: 100%|##########| 5/5 [00:44<00:00,  9.10s/it][32m[I 2023-06-16 21:24:02,066][0m Trial 64 finished with value: 0.45632970732309136 and parameters: {'min_child_samples': 10}. Best is trial 63 with value: 0.45612170218943626.[0m
min_data_in_leaf, val_score: 0.455679: 100%|##########| 5/5 [00:44<00:00,  8.99s/it]

Early stopping, best iteration is:
[152]	valid_0's binary_logloss: 0.420518	valid_1's binary_logloss: 0.45633





In [19]:
lgb_clf_o.params #jupyterで出力

{'objective': 'binary',
 'random_state': 100,
 'feature_pre_filter': False,
 'lambda_l1': 8.955559716312521,
 'lambda_l2': 0.002306507164824572,
 'num_leaves': 33,
 'feature_fraction': 0.41600000000000004,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': 10}

In [20]:
train, test = split_data(r.data_c)

#説明変数と目的変数に分ける。dateはこの後不要なので省く。
X_train = train.drop(['rank', 'date', '単勝'], axis=1)
y_train = train['rank']
#2021/3/12追加： テストデータの単勝オッズはシミュレーション時に使用するので残しておく
X_test = test.drop(['rank', 'date'], axis=1)
y_test = test['rank']

lgb_clf = lgb.LGBMClassifier(**lgb_clf_o.params)
lgb_clf.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], early_stopping_rounds=10)



[1]	valid_0's binary_logloss: 0.527706
[2]	valid_0's binary_logloss: 0.529155
[3]	valid_0's binary_logloss: 0.530083
[4]	valid_0's binary_logloss: 0.53159
[5]	valid_0's binary_logloss: 0.530871
[6]	valid_0's binary_logloss: 0.532077
[7]	valid_0's binary_logloss: 0.533025
[8]	valid_0's binary_logloss: 0.535892
[9]	valid_0's binary_logloss: 0.538047
[10]	valid_0's binary_logloss: 0.541285
[11]	valid_0's binary_logloss: 0.54535


In [21]:
class Return:
    def __init__(self, return_tables):
        self.return_tables = return_tables
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)
    
    @staticmethod
    def scrape(race_id_list):
        return_tables = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df
    
    @property
    def fukusho(self):
        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        wins = fukusho[1].str.split('br', expand=True)[[0,1,2]]
        
        wins.columns = ['win_0', 'win_1', 'win_2']
        returns = fukusho[2].str.split('br', expand=True)[[0,1,2]]
        returns.columns = ['return_0', 'return_1', 'return_2']
        
        df = pd.concat([wins, returns], axis=1)
        for column in df.columns:
            df[column] = df[column].str.replace(',', '')
        return df.fillna(0).astype(int)
    
    @property
    def tansho(self):
        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]]
        tansho.columns = ['win', 'return']
        
        for column in tansho.columns:
            tansho[column] = pd.to_numeric(tansho[column], errors='coerce')
            
        return tansho
    
    @property
    def umaren(self):
        umaren = self.return_tables[self.return_tables[0]=='馬連'][[1,2]]
        wins = umaren[1].str.split('-', expand=True)[[0,1]].add_prefix('win_')
        return_ = umaren[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def umatan(self):
        umatan = self.return_tables[self.return_tables[0]=='馬単'][[1,2]]
        wins = umatan[1].str.split('→', expand=True)[[0,1]].add_prefix('win_')
        return_ = umatan[2].rename('return')  
        df = pd.concat([wins, return_], axis=1)        
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def wide(self):
        wide = self.return_tables[self.return_tables[0]=='ワイド'][[1,2]]
        wins = wide[1].str.split('br', expand=True)[[0,1,2]]
        wins = wins.stack().str.split('-', expand=True).add_prefix('win_')
        return_ = wide[2].str.split('br', expand=True)[[0,1,2]]
        return_ = return_.stack().rename('return')
        df = pd.concat([wins, return_], axis=1)
        return df.apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce'))
    
    @property
    def sanrentan(self):
        rentan = self.return_tables[self.return_tables[0]=='三連単'][[1,2]]
        wins = rentan[1].str.split('→', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = rentan[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    
    @property
    def sanrenpuku(self):
        renpuku = self.return_tables[self.return_tables[0]=='三連複'][[1,2]]
        wins = renpuku[1].str.split('-', expand=True)[[0,1,2]].add_prefix('win_')
        return_ = renpuku[2].rename('return')
        df = pd.concat([wins, return_], axis=1) 
        return df.apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [22]:
class ModelEvaluator:
    def __init__(self, model, return_tables_path_list):
        self.model = model
        self.rt = Return.read_pickle(return_tables_path_list)
        self.fukusho = self.rt.fukusho
        self.tansho = self.rt.tansho
        self.umaren = self.rt.umaren
        self.umatan = self.rt.umatan
        self.wide = self.rt.wide
        self.sanrentan = self.rt.sanrentan
        self.sanrenpuku = self.rt.sanrenpuku
    
    #3着以内に入る確率を予測
    def predict_proba(self, X, train=True, std=True, minmax=False):
        if train:
            proba = pd.Series(
                self.model.predict_proba(X.drop(['単勝'], axis=1))[:, 1], index=X.index
            )
        else:
            proba = pd.Series(
                self.model.predict_proba(X, axis=1)[:, 1], index=X.index
            )
        if std:
            #レース内で標準化して、相対評価する。「レース内偏差値」みたいなもの。
            standard_scaler = lambda x: (x - x.mean()) / x.std(ddof=0)
            proba = proba.groupby(level=0).transform(standard_scaler)
        if minmax:
            #データ全体を0~1にする
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba
    
    #0か1かを予測
    def predict(self, X, threshold=0.5):
        y_pred = self.predict_proba(X)
        self.proba = y_pred
        return [0 if p<threshold else 1 for p in y_pred]
    
    def score(self, y_true, X):
        return roc_auc_score(y_true, self.predict_proba(X))
    
    def feature_importance(self, X, n_display=20):
        importances = pd.DataFrame({"features": X.columns, 
                                    "importance": self.model.feature_importances_})
        return importances.sort_values("importance", ascending=False)[:n_display]
    
    def pred_table(self, X, threshold=0.5, bet_only=True):
        pred_table = X.copy()[['馬番', '単勝']]
        pred_table['pred'] = self.predict(X, threshold)
        pred_table['score'] = self.proba
        if bet_only:
            return pred_table[pred_table['pred']==1][['馬番', '単勝', 'score']]
        else:
            return pred_table[['馬番', '単勝', 'score', 'pred']]
        
    def bet(self, race_id, kind, umaban, amount):
        if kind == 'fukusho':
            rt_1R = self.fukusho.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1', 'win_2']]==umaban).values * \
                rt_1R[['return_0', 'return_1', 'return_2']].values * amount/100
            return_ = np.sum(return_)
        if kind == 'tansho':
            rt_1R = self.tansho.loc[race_id]
            return_ = (rt_1R['win']==umaban) * rt_1R['return'] * amount/100
        if kind == 'umaren':
            rt_1R = self.umaren.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
        if kind == 'umatan':
            rt_1R = self.umatan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1']]) == list(umaban))\
                * rt_1R['return']/100 * amount
        if kind == 'wide':
            rt_1R = self.wide.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1']].\
                           apply(lambda x: set(x)==set(umaban), axis=1)) \
                * rt_1R['return']/100 * amount
            return_ = return_.sum()
        if kind == 'sanrentan':
            rt_1R = self.sanrentan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1', 'win_2']]) == list(umaban)) * \
                rt_1R['return']/100 * amount
        if kind == 'sanrenpuku':
            rt_1R = self.sanrenpuku.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1', 'win_2']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
        if not (return_ >= 0):
                return_ = amount
        return return_
        
    def fukusho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(np.sum([
                self.bet(race_id, 'fukusho', umaban, 1) for umaban in preds['馬番']
            ]))
        return_rate = np.sum(return_list) / n_bets
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return n_bets, return_rate, n_hits, std
    
    def tansho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        self.sample = pred_table
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum([self.bet(race_id, 'tansho', umaban, 1) for umaban in preds['馬番']])
            )
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def tansho_return_proper(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum(preds.apply(lambda x: self.bet(
                    race_id, 'tansho', x['馬番'], 1/x['単勝']), axis=1)))
        
        bet_money = (1 / pred_table['単勝']).sum()
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / bet_money
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / bet_money
        return n_bets, return_rate, n_hits, std
    
    def umaren_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue   
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'wide', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std  
        
    def sanrentan_box(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            if len(preds)<3:
                continue
            else:
                for umaban in permutations(preds['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrentan', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrenpuku_box(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            if len(preds)<3:
                continue
            else:
                for umaban in combinations(preds['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrenpuku', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umaren_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'umaren', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += n_aite
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'umatan', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += n_aite
                
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_nagashi(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'wide', [preds_jiku['馬番'].values[0], x], 1
                    )
                ).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'wide', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrentan_nagashi(self, X, threshold = 1.5, n_aite=7):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) == 2:
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[2:(n_aite+2)]['馬番']
                return_ = preds_aite.map(
                    lambda x: self.bet(
                        race_id, 'sanrentan',
                        np.append(preds_jiku['馬番'].values, x),
                        1
                    )
                ).sum()
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 3:
                return_ = 0
                for umaban in permutations(preds_jiku['馬番'], 3):
                    return_ += self.bet(race_id, 'sanrentan', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std

In [23]:
#11Rの日本ダービーの出馬表をスクレイピング
def current_race(race_id, month, day):
    race_id = str(race_id)
    date = '2023' + str(month).zfill(2) + str(day).zfill(2)
    st = ShutubaTable.scrape([race_id], date)
    #データ加工
    st.preprocessing() #前処理
    st.merge_horse_results(hr) #馬の過去成績結合
    st.merge_peds(p.peds_e) #血統データ結合
    st.process_categorical(r.le_horse, r.le_jockey, r.data_h) #カテゴリ変数処理
    today = st.data_c.copy()
    today.drop('date', axis=1, inplace=True)
    pred = lgb_clf.predict_proba(today)[:, 1]
    pred_data = pd.DataFrame({'pred': pred}, index=[today.馬番])
    
    return pred_data.sort_values('pred', ascending=False)
    

In [106]:
tokyo = current_race(202305030611, 6, 18)
hanshin = current_race(202309030611, 6, 18)
hakodate = current_race(202302010411, 6, 18)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [107]:
tokyo

Unnamed: 0_level_0,pred
馬番,Unnamed: 1_level_1
3,0.242886
9,0.242886
12,0.242886
13,0.242886
14,0.242886
5,0.232887
1,0.232085
2,0.232085
4,0.232085
7,0.232085


In [108]:
hanshin

Unnamed: 0_level_0,pred
馬番,Unnamed: 1_level_1
4,0.242886
9,0.242886
1,0.229987
7,0.223436
3,0.221014
11,0.216272
12,0.216272
13,0.216272
8,0.213187
2,0.210189


In [109]:
hakodate

Unnamed: 0_level_0,pred
馬番,Unnamed: 1_level_1
6,0.232085
10,0.232085
5,0.225581
3,0.223658
7,0.223658
4,0.223436
11,0.223436
13,0.221119
12,0.221014
8,0.213187


In [28]:
r.data_c

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,rank,年齢,...,race_type_芝,race_type_ダート,race_type_障害,ground_state_良,ground_state_稍重,ground_state_重,ground_state_不良,性_牝,性_牡,性_セ
201701010101,3,3,54.0,3.0,18.0,2017-07-29,8710,67,1,2,...,True,False,False,True,False,False,False,True,False,False
201701010101,5,5,54.0,1.5,18.0,2017-07-29,10340,197,1,2,...,True,False,False,True,False,False,False,False,True,False
201701010101,7,7,54.0,6.2,18.0,2017-07-29,10613,28,1,2,...,True,False,False,True,False,False,False,False,True,False
201701010101,1,1,54.0,31.1,18.0,2017-07-29,9222,7,0,2,...,True,False,False,True,False,False,False,False,True,False
201701010101,2,2,53.0,22.8,18.0,2017-07-29,9020,117,0,2,...,True,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202306030412,2,3,56.0,2.9,12.0,2023-04-02,28862,134,0,4,...,False,True,False,True,False,False,False,True,False,False
202306030412,4,8,53.0,24.2,12.0,2023-04-02,30769,169,0,4,...,False,True,False,True,False,False,False,True,False,False
202306030412,7,13,56.0,22.9,12.0,2023-04-02,16718,148,0,7,...,False,True,False,True,False,False,False,False,True,False
202306030412,2,4,58.0,59.1,12.0,2023-04-02,22122,1,0,6,...,False,True,False,True,False,False,False,False,True,False


In [29]:
for i in range(0, 20000, 1000):
    d1 = pd.read_pickle(f'beta/horse_data/horse_result{i}.pickle')
    d2 = pd.read_pickle(f'beta/horse_data_raceID_added_ver2/horse_result{i}.pickle')
    print(len(d2)-len(d1))

18
9
9
14
18
11
24
18
11
13
19
36
44
51
34
38
53
56
62
57
