In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
import math

In [2]:
# Load the table stored in 'results.pickle' from the working directory.
# NOTE(review): presumably the scraped race results — confirm against how the file was written.
# NOTE(review): unpickling can execute arbitrary code; only load pickles you created yourself.
results = pd.read_pickle('results.pickle')

In [3]:
# Load the table stored in 'horse_results.pickle' from the working directory.
# NOTE(review): presumably per-horse race histories — confirm against how the file was written.
# NOTE(review): unpickling can execute arbitrary code; only load pickles you created yourself.
horse_results = pd.read_pickle('horse_results.pickle')

In [4]:
# Load the table stored in 'peds.pickle' from the working directory.
# NOTE(review): presumably pedigree data — confirm against how the file was written.
# NOTE(review): unpickling can execute arbitrary code; only load pickles you created yourself.
peds = pd.read_pickle('peds.pickle')

In [5]:
# Load the table stored in 'return_tables.pickle' from the working directory.
# NOTE(review): presumably payout tables — confirm against how the file was written.
# NOTE(review): unpickling can execute arbitrary code; only load pickles you created yourself.
return_tables = pd.read_pickle('return_tables.pickle')

In [6]:
def update_data(old, new):
    """
    Combine two tables, letting ``new`` win wherever an index label is shared.

    Parameters:
    ----------
    old : pandas.DataFrame
        Previously stored data.
    new : pandas.DataFrame
        Newly obtained data.

    Returns:
    ----------
    pandas.DataFrame
        The rows of ``old`` whose index label does not occur in ``new``,
        followed by every row of ``new``.
    """

    superseded = old.index.isin(new.index)
    kept_rows = old.loc[~superseded]
    return pd.concat([kept_rows, new])

In [7]:
class DataProcessor:
    """
    Pipeline steps shared by the race tables in this notebook.

    Every step reads the table stored by the previous step and stores its own
    result on the instance, so each intermediate stage can be inspected.

    Attributes:
    ----------
    data : pd.DataFrame
        Raw data.
    data_p : pd.DataFrame
        Data after preprocessing.
    data_h : pd.DataFrame
        Data after merge_horse_results (and merge_previous_data).
    data_pe : pd.DataFrame
        Data after merge_peds.
    data_c : pd.DataFrame
        Data after process_categorical.
    no_peds : numpy.ndarray
        horse_id values that had no pedigree row when merge_peds was run.
    """

    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        # Defined here so the documented attribute exists before merge_peds runs.
        self.no_peds = np.array([], dtype=object)

    def merge_horse_results(self, hr, n_samples_list=(5, 9, 'all')):
        """
        Add past-performance features from ``hr`` and store the result in data_h.

        Parameters:
        ----------
        hr : HorseResults
            Object whose ``merge_all(df, n_samples=...)`` returns ``df`` with
            the past-performance columns added.
        n_samples_list : iterable, default (5, 9, 'all')
            One ``merge_all`` call is made per entry; each entry says how many
            past races to use.
        """

        self.data_h = self.data_p.copy()
        # course_len has already been divided by 100, so "multiple of 4" means a
        # multiple of 400 m. Vectorised: a row-wise apply is slow and does not
        # return a Series for an empty frame.
        self.data_h['core_distance'] = np.where(
            self.data_h['course_len'] % 4 == 0, '根幹距離', '非根幹距離'
        )

        for n_samples in n_samples_list:
            self.data_h = hr.merge_all(self.data_h, n_samples=n_samples)

        # The venue code is only needed as a merge key inside merge_all.
        self.data_h = self.data_h.drop(columns=['開催'])

    def merge_previous_data(self, hr):
        """
        Replace data_h with ``hr.merge_pre_data(data_h)`` (previous-race features).

        Parameters:
        ----------
        hr : HorseResults
            Object providing ``merge_pre_data``.
        """

        self.data_h = hr.merge_pre_data(self.data_h)

    def merge_peds(self, peds):
        """
        Left-join pedigree data onto data_h by horse_id and store it in data_pe.

        Horses without a pedigree row are collected in ``no_peds`` and a
        reminder to scrape them is printed.

        Parameters:
        ----------
        peds : Peds.peds_e
            Pedigree table indexed by horse_id; it must contain a ``peds_0`` column.
        """

        self.data_pe = self.data_h.merge(
            peds, left_on='horse_id', right_index=True, how='left'
        )
        missing = self.data_pe['peds_0'].isnull()
        self.no_peds = self.data_pe[missing]['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')

    @staticmethod
    def _label_encode(series, encoder):
        """Encode ``series`` with ``encoder`` after appending labels it has not seen."""
        known = series.isin(encoder.classes_)
        unseen = series.mask(known).dropna().unique()
        # Unseen labels are appended after the fitted classes, so existing codes keep
        # their meaning and new ids get the next free integers.
        encoder.classes_ = np.concatenate([encoder.classes_, unseen])
        return encoder.transform(series)

    def process_categorical(self, le_horse, le_jockey, le_trainer, results_m):
        """
        Encode the categorical columns of data_pe and store the result in data_c.

        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            Fitted encoder turning horse_id into integers starting at 0.
        le_jockey : sklearn.preprocessing.LabelEncoder
            Fitted encoder turning jockey_id into integers starting at 0.
        le_trainer : sklearn.preprocessing.LabelEncoder
            Fitted encoder turning trainer_id into integers starting at 0.
        results_m : Results.data_pe
            Reference table whose observed values define the dummy columns, so
            Results and ShutubaTable produce the same columns.

        Note: the ``classes_`` of the three encoders are extended in place with
        ids that appear in data_pe but were not seen when they were fitted.
        """

        df = self.data_pe.copy()

        # Label-encode the id columns, then mark them as pandas categories.
        id_encoders = (
            ('horse_id', le_horse),
            ('jockey_id', le_jockey),
            ('trainer_id', le_trainer),
        )
        for column, encoder in id_encoders:
            df[column] = self._label_encode(df[column], encoder)
            df[column] = df[column].astype('category')

        # Fix the category set from the reference table so get_dummies always
        # emits the same columns. Missing values are dropped from the category
        # list because pandas rejects null categories.
        reference_columns = ['weather', 'race_type', 'ground_state', '性',
                             'class_type', '前走場所', 'core_distance']
        for column in reference_columns:
            categories = results_m[column].dropna().unique()
            df[column] = pd.Categorical(df[column], categories)
        df['straight_type'] = pd.Categorical(df['straight_type'], ['long', 'short'])

        df = pd.get_dummies(df, columns=reference_columns + ['straight_type'])

        self.data_c = df

In [8]:
class Results(DataProcessor):
    """Race result table, with the steps to scrape it and prepare it for training."""

    def __init__(self, results):
        super().__init__()
        self.data = results

    @classmethod
    def read_pickle(cls, path_list):
        """
        Build an instance from one or more pickled result tables.

        Parameters:
        ----------
        path_list : list
            Paths of pickle files; the tables are concatenated in this order.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)

    @staticmethod
    def _ids_from_links(table, href_pattern):
        """Return the first run of digits of every link in ``table`` whose href matches."""
        links = table.find_all("a", attrs={"href": re.compile(href_pattern)})
        return [re.findall(r"\d+", a["href"])[0] for a in links]

    @staticmethod
    def scrape(race_id_list):
        """
        Scrape race result tables from db.netkeiba.com.

        Parameters:
        ----------
        race_id_list : list
            Race ids (strings).

        Returns:
        ----------
        race_results_df : pandas.DataFrame
            All scraped result tables concatenated, indexed by race_id.
            An empty DataFrame when nothing could be scraped.
        """

        # One DataFrame per race, keyed by race_id.
        race_results = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                # Main result table.
                # NOTE(review): the page is downloaded twice (here and below).
                df = pd.read_html(url)[0]

                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")

                # Weather, race type, course length, ground state and date are
                # picked out of the first two paragraphs of the intro block.
                paragraphs = soup.find("div", attrs={"class": "data_intro"}).find_all("p")
                texts = paragraphs[0].text + paragraphs[1].text
                for text in re.findall(r'\w+', texts):
                    if text in ["芝", "ダート"]:
                        df["race_type"] = text
                    if "障" in text:
                        df["race_type"] = "障害"
                    if "m" in text:
                        df["course_len"] = int(re.findall(r"\d+", text)[0])
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = text
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = text
                    if "年" in text:
                        df["date"] = text

                # Horse, jockey and trainer ids come from the links in the result table.
                result_table = soup.find("table", attrs={"summary": "レース結果"})
                df["horse_id"] = Results._ids_from_links(result_table, "^/horse")
                df["jockey_id"] = Results._ids_from_links(result_table, "^/jockey")
                df["trainer_id"] = Results._ids_from_links(result_table, "^/trainer")

                # Every row of a race shares the race_id as its index label.
                df.index = [race_id] * len(df)

                race_results[race_id] = df
                time.sleep(1)
            # Skip race ids that do not exist.
            except IndexError:
                continue
            # On e.g. a dropped connection, stop and return what was collected so far.
            except Exception as e:
                print(e)
                break
            # Same when the Jupyter stop button is pressed.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing scraped" explicitly.
        if not race_results:
            return pd.DataFrame()
        race_results_df = pd.concat(list(race_results.values()))

        return race_results_df

    def preprocessing(self):
        """Clean the raw result table (``data``) and store the outcome in ``data_p``."""
        df = self.data.copy()

        # Drop rows whose finishing position is not a number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        # Target: 1 for a top-three finish, 0 otherwise.
        df['rank'] = (df['着順'] < 4).astype(int)

        # Split the sex/age column into sex (first character) and age (the rest).
        sex_age = df["性齢"].map(str)
        df["性"] = sex_age.str[0]
        df["年齢"] = sex_age.str[1:].astype(int)

        # Win odds as float.
        df["単勝"] = df["単勝"].astype(float)
        # Course length in units of 100 m (tens and ones are discarded).
        df["course_len"] = df["course_len"].astype(float) // 100

        # Drop columns that are not used as features.
        df.drop(["タイム", "着差", "調教師", "性齢", '馬名', '騎手', '人気', '着順', '馬体重'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")

        # Venue code: characters 5-6 of the race_id used as index.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])

        # Number of remaining rows that share the race_id.
        df['n_horses'] = df.index.map(df.index.value_counts())

        self.data_p = df

    def process_categorical(self):
        """Fit the id encoders on ``data_pe`` and run the shared categorical processing."""
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        self.le_trainer = LabelEncoder().fit(self.data_pe['trainer_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.le_trainer, self.data_pe)

In [9]:
class ShutubaTable(DataProcessor):
    """Entry table of upcoming races, with the steps to scrape and preprocess it."""

    # (substring of the race name, class label). All entries are checked in
    # order and the last match wins.
    _RACE_NAME_CLASS_TYPES = (
        ('未勝利', '未勝利'),
        ('新馬', '新馬'),
        ('2勝クラス', '2勝'),
        ('1勝クラス', '1勝'),
        ('500万下', '500万'),
        ('1000万下', '1000万'),
        ('1600万下', '1600万'),
    )

    # (CSS class of the grade icon, class label). A grade icon overrides the
    # label derived from the race name. Icon_GradeType8 has no mapping.
    _GRADE_ICON_CLASS_TYPES = (
        ('Icon_GradeType1', 'Ｇ１'),
        ('Icon_GradeType2', 'Ｇ２'),
        ('Icon_GradeType3', 'Ｇ３'),
        ('Icon_GradeType4', '重賞'),
        ('Icon_GradeType5', 'ｵｰﾌﾟﾝ'),
        ('Icon_GradeType6', '1600万'),
        ('Icon_GradeType7', '1000万'),
        ('Icon_GradeType9', '500万'),
        ('Icon_GradeType10', 'ＪＧ１'),
        ('Icon_GradeType11', 'ＪＧ２'),
        ('Icon_GradeType12', 'ＪＧ３'),
        ('Icon_GradeType15', 'OP(L)'),
        ('Icon_GradeType16', '3勝'),
        ('Icon_GradeType17', '2勝'),
        ('Icon_GradeType18', '1勝'),
    )

    def __init__(self, shutuba_tables):
        super().__init__()
        self.data = shutuba_tables

    @classmethod
    def _detect_class_type(cls, soup):
        """Return the class label found on the page, or None when nothing matches."""
        class_type = None

        race_name = soup.find('div', attrs={'class': 'RaceName'}).text
        for keyword, label in cls._RACE_NAME_CLASS_TYPES:
            if keyword in race_name:
                class_type = label

        icons = soup.find_all('span', class_='Icon_GradeType')
        if len(icons) > 0:
            icon_classes = icons[0]['class']
            for icon_class, label in cls._GRADE_ICON_CLASS_TYPES:
                if icon_class in icon_classes:
                    class_type = label

        return class_type

    @staticmethod
    def _ids_from_cells(soup, td_class):
        """Return the first run of digits of the link inside every ``td`` of that class."""
        cells = soup.find_all("td", attrs={'class': td_class})
        return [re.findall(r'\d+', td.find('a')['href'])[0] for td in cells]

    @classmethod
    def scrape(cls, race_id_list, date):
        """
        Scrape entry tables from race.netkeiba.com.

        Parameters:
        ----------
        race_id_list : list
            Race ids (strings).
        date : object
            Value written to the ``date`` column of every row.

        Returns:
        ----------
        ShutubaTable
            Instance whose ``data`` holds all scraped tables, indexed by race_id.
        """
        frames = []
        for race_id in tqdm(race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            # NOTE(review): the page is downloaded twice (here and below).
            df = pd.read_html(url)[0]
            # Drop the first level of the two-level column header.
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            for text in re.findall(r'\w+', texts):
                if 'm' in text:
                    df['course_len'] = int(re.findall(r'\d+', text)[0])
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = text
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = text
                if '不' in text:
                    df["ground_state"] = '不良'
                # Added 2020/12/13: also match tokens that merely contain this character.
                if '稍' in text:
                    df["ground_state"] = '稍重'
                if '芝' in text:
                    df['race_type'] = '芝'
                if '障' in text:
                    df['race_type'] = '障害'
                if 'ダ' in text:
                    df['race_type'] = 'ダート'
            df['date'] = [date] * len(df)

            # Assign the label as a scalar so it is broadcast to every row. The
            # GradeType6/7/9 branches used to assign a one-element list, which
            # raises a length mismatch for any race with more than one row.
            class_type = cls._detect_class_type(soup)
            if class_type is not None:
                df['class_type'] = class_type

            df['horse_id'] = cls._ids_from_cells(soup, 'HorseInfo')
            df['jockey_id'] = cls._ids_from_cells(soup, 'Jockey')
            df['trainer_id'] = cls._ids_from_cells(soup, 'Trainer')

            df.index = [race_id] * len(df)

            frames.append(df)
            time.sleep(1)

        # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
        # and copied the whole table on every iteration.
        data = pd.concat(frames) if frames else pd.DataFrame()
        return cls(data)

    def preprocessing(self):
        """Clean the raw entry table (``data``) and store the outcome in ``data_p``."""
        df = self.data.copy()

        # Split the sex/age column into sex (first character) and age (the rest).
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # Remove excluded and cancelled entries. Copy so that the assignments
        # below write to an independent frame instead of a slice.
        df = df[~df['印'].isin(['除外', '取消'])].copy()

        df["date"] = pd.to_datetime(df["date"])

        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        # Venue code: characters 5-6 of the race_id used as index.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])
        # Course length in units of 100 m (tens and ones are discarded).
        df["course_len"] = df["course_len"].astype(float) // 100
        # Number of remaining rows that share the race_id.
        df['n_horses'] = df.index.map(df.index.value_counts())

        # Keep only the columns used downstream.
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather','race_type',
        'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
        '開催', 'class_type', 'n_horses', 'trainer_id']].copy()

        # Not known before the race; added as an all-NaN column.
        df['上り'] = np.nan

        self.data_p = df.rename(columns={'枠': '枠番'})

In [10]:
class HorseResults:
    """
    Past race history of horses (indexed by horse_id) and the feature merges built on it.

    NOTE(review): several methods read columns that nothing in this class creates
    ('y-x', 'diff_final-diff_y', '間隔', 'class_type'). They must be added to
    ``horse_results`` elsewhere before ``average`` / ``merge`` are called — confirm.
    """

    def __init__(self, horse_results):
        # Keep only the columns preprocessing reads.
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離','上り', '馬番']]
        self.preprocessing()

    @classmethod
    def read_pickle(cls, path_list):
        """
        Build an instance from one or more pickled history tables.

        Parameters:
        ----------
        path_list : list
            Paths of pickle files; the tables are concatenated in this order.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)

    @staticmethod
    def scrape(horse_id_list):
        """
        Scrape the past-results table of each horse from db.netkeiba.com.

        Parameters:
        ----------
        horse_id_list : list
            Horse ids (strings).

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            All scraped tables concatenated, indexed by horse_id.
            An empty DataFrame when nothing could be scraped.
        """

        # One DataFrame per horse, keyed by horse_id.
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                # Parse the page once and pick the table from the parsed list.
                tables = pd.read_html(url)
                df = tables[3]
                # For horses with awards, the awards table takes this slot and the
                # results table is the next one.
                if df.columns[0]=='受賞歴':
                    df = tables[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            # Skip pages that do not have the expected table.
            except IndexError:
                continue
            # On e.g. a dropped connection, stop and return what was collected so far.
            except Exception as e:
                print(e)
                break
            # Same when the Jupyter stop button is pressed.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing scraped" explicitly.
        if not horse_results:
            return pd.DataFrame()
        horse_results_df = pd.concat(list(horse_results.values()))

        return horse_results_df

    def preprocessing(self):
        """Clean ``horse_results`` in place of the raw table and define ``target_list``."""
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)

        # Missing prize money means no prize. Assign the result back instead of
        # calling fillna(inplace=True) on a column selection, which is not
        # guaranteed to write through to the frame.
        df['賞金'] = df['賞金'].fillna(0)

        # Negative margins are set to 0 (margin of the winner).
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)

        # Corner positions: n=1 is the first recorded corner, n=4 the last one.
        def corner(x, n):
            if not isinstance(x, str):
                return x
            positions = re.findall(r'\d+', x)
            if not positions:
                # A string without any digits carries no position.
                return np.nan
            if n == 4:
                return int(positions[-1])
            elif n == 1:
                return int(positions[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))

        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']

        # Venue: the non-digit part mapped through place_dict; anything else becomes '11'.
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        # Race type from the non-digit prefix of the distance column.
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        # Course length in units of 100 m (tens and ones are discarded).
        df['course_len'] = df['距離'].str.extract(r'(\d+)')[0].astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        # Name the index so it can be used as a merge / groupby key.
        df.index.name = 'horse_id'

        self.horse_results = df
        # NOTE(review): 'y-x' and 'diff_final-diff_y' are not created above; they
        # must exist in horse_results before average() is called — confirm.
        self.target_list = ['着順', '賞金', '着差', 'first_corner',
                            'first_to_rank', 'first_to_final','final_to_rank', '上り', 'y-x', 'diff_final-diff_y']

    @staticmethod
    def _past_races(df, date, n_samples, error_message):
        """Rows of ``df`` dated before ``date``; for an int, the latest ``n_samples`` per index label."""
        past = df[df['date'] < date]
        if n_samples == 'all':
            return past
        elif n_samples > 0:
            return past.sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception(error_message)

    def _grouped_mean(self, df, column, n_samples):
        """Mean of the target columns per (horse_id, column), suffixed with column and sample size."""
        return df.groupby(['horse_id', column])[self.target_list].mean()\
            .add_suffix('_{}_{}R'.format(column, n_samples))

    def average(self, horse_id_list, date, n_samples='all'):
        """
        Average the target columns per horse over races before ``date``.

        The result is stored in ``average_dict``: key 'non_category' holds the
        per-horse means, the other keys hold means per (horse, category value).
        Requires ``self.results`` to be set (``merge`` does this).

        Parameters:
        ----------
        horse_id_list : list-like
            Horses to include.
        date : datetime-like
            Only races strictly before this date are used.
        n_samples : int or 'all', default 'all'
            Number of most recent races per horse.

        Raises:
        ----------
        Exception
            If ``n_samples`` is a number that is not positive.
        """
        target_df = self.horse_results.query('index in @horse_id_list')

        filtered_df = self._past_races(target_df, date, n_samples, 'n_samples must be >0')
        self.filtered_df = filtered_df
        self.target_df = target_df

        # Aggregate into a dictionary keyed by grouping column.
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        # NOTE(review): 'class_type' is not created by preprocessing — confirm where it comes from.
        for column in ['course_len', 'race_type', '開催', 'class_type']:
            self.average_dict[column] = self._grouped_mean(filtered_df, column, n_samples)

        # These groupings are only computed over the full history.
        if n_samples == 'all':
            for column in ['core_distance', 'course_around']:
                self.average_dict[column] = self._grouped_mean(filtered_df, column, n_samples)

        # Bring in race-level columns from the results table passed to merge().
        target_df_r = self.results.query('horse_id in @horse_id_list')
        self.pre_data_r_dict = {}
        self.target_df_r = target_df_r.set_index('horse_id')

        filtered_df_r = self._past_races(self.target_df_r, date, n_samples, 'n_samples must be_r >0')

        # NOTE(review): this joins every past history row of a horse with every
        # past results row of the same horse (keyed on horse_id only) — confirm
        # this cross-combination is intended.
        self.filtered_df_r = self.filtered_df
        for column in ['ground_state', 'straight_type']:
            target_column = filtered_df_r[[column]]
            self.filtered_df_r = self.filtered_df_r.merge(target_column, left_on='horse_id', right_index=True, how='left')
        if n_samples == 'all':
            for column in ['ground_state', 'straight_type']:
                self.average_dict[column] = self._grouped_mean(self.filtered_df_r, column, n_samples)

    def merge(self, results, date, n_samples='all'):
        """
        Return the rows of ``results`` dated ``date`` with past-performance columns added.

        Parameters:
        ----------
        results : pandas.DataFrame
            Race table with at least 'date' and 'horse_id' plus the category columns merged on.
        date : datetime-like
            Race date to process.
        n_samples : int or 'all', default 'all'
            Number of past races used for the averages.
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']

        self.results = results

        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')

        # Interval column of the history rows dated on the race date.
        # NOTE(review): '間隔' is not created by preprocessing — confirm where it comes from.
        # NOTE(review): when merge_all runs several times on its own output, this
        # column and '前走距離' are merged again and pandas adds _x/_y suffixes — confirm.
        self.data_dict = {}
        self.target_date_df = self.target_df[self.target_df['date'] == date]
        self.data_dict['間隔'] = self.target_date_df[['間隔']]
        merged_df = merged_df.merge(self.data_dict['間隔'], left_on='horse_id', right_index=True, how='left')

        self.merged_df = merged_df

        # Course length of the most recent race before the race date.
        self.last_time = self.target_df[self.target_df['date'] < date].sort_values('date', ascending=False).groupby(level=0).head(1)
        self.data_dict['前走距離'] = self.last_time[['course_len']].rename(columns={'course_len': '前走距離'})
        merged_df = merged_df.merge(self.data_dict['前走距離'], left_on='horse_id', right_index=True, how='left')

        for column in ['course_len','race_type', '開催', 'class_type']:
            merged_df = merged_df.merge(self.average_dict[column],
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')

        # These averages only exist for the full history (see average()).
        if n_samples == 'all':
            for column in ['ground_state', 'straight_type', 'core_distance', 'course_around']:
                merged_df = merged_df.merge(self.average_dict[column],
                                            left_on=['horse_id', column],
                                            right_index=True, how='left')

        return merged_df

    def merge_all(self, results, n_samples='all'):
        """
        Apply ``merge`` to every distinct date in ``results`` and concatenate the outcome.

        Also (re)computes the 'core_distance' and 'course_around' columns of ``horse_results``.
        """
        date_list = results['date'].unique()
        # course_len is in units of 100 m, so a multiple of 4 means a multiple of 400 m.
        # Vectorised instead of a row-wise apply over the whole history.
        self.horse_results['core_distance'] = np.where(
            self.horse_results['course_len'] % 4 == 0, '根幹距離', '非根幹距離'
        )
        self.horse_results['course_around'] = self.horse_results['開催'].map(convert_around_dict)

        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

    def merge_pre_data(self, results):
        """Apply ``merge_pre`` to every distinct date in ``results`` and concatenate the outcome."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge_pre(results, date) for date in tqdm(date_list)])
        self.merged_df1 = merged_df

        return merged_df

    def merge_pre(self, results, date):
        """
        Return the rows of ``results`` dated ``date`` with previous-race features added.

        Adds '前走場所' (venue of the most recent earlier race) and '出走回数'
        (number of earlier races).
        """
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        target_df = self.horse_results.query('index in @horse_id_list')
        self.pre_data_dict = {}
        filtered_df = target_df[target_df['date'] < date]

        # Venue of the most recent earlier race.
        self.pre_run_data = filtered_df.sort_values('date', ascending=False).groupby(level=0).head(1)
        self.pre_data_dict['前走場所'] = self.pre_run_data[['開催']].rename(columns={'開催': '前走場所'})
        merged_df = df.merge(self.pre_data_dict['前走場所'], left_on='horse_id', right_index=True, how='left')
        # No earlier race: mark as first start.
        merged_df['前走場所'] = merged_df['前走場所'].fillna('初出走')
        # Map venue codes back to venue names; codes outside place_dict become 'その他'.
        place_string_dict = {v: k for k, v in place_dict.items()}
        place_string_dict['初出走'] = '初出走'
        merged_df['前走場所'] = merged_df['前走場所'].map(place_string_dict)
        merged_df['前走場所'] = merged_df['前走場所'].fillna('その他')

        # Number of earlier races; horses without any get 0.
        number_of_runs = filtered_df.groupby(level=0).size()
        merged_df = merged_df.merge(number_of_runs.to_frame('出走回数'), left_on='horse_id', right_index=True, how='left')
        merged_df['出走回数'] = merged_df['出走回数'].fillna(0)

        # Intermediate tables are kept on the instance for inspection; they do
        # not affect the returned frame.
        target_df_r = results.query('horse_id in @horse_id_list')
        self.pre_data_r_dict = {}
        self.target_df = target_df
        self.date = date
        self.results = results
        self.target_df_r = target_df_r.set_index('horse_id')
        self.pre_run_data_r = self.target_df_r[self.target_df_r['date'] < date].sort_values('date', ascending=False)

        return merged_df

In [11]:
class Peds:
    """Container for pedigree data with helpers to load, scrape and encode it.

    Attributes:
    ----------
    peds : pandas.DataFrame
        Pedigree table exactly as passed to the constructor.
    peds_e : pandas.DataFrame
        Label-encoded, category-typed copy of ``peds``; empty until ``encode`` runs.
    """

    def __init__(self, peds):
        self.peds = peds
        self.peds_e = pd.DataFrame()  # after label encoding and transforming into category

    @classmethod
    def read_pickle(cls, path_list):
        """Concatenate the pickled frames found at ``path_list`` and wrap them in a ``Peds``."""
        df = pd.concat([pd.read_pickle(path) for path in path_list])
        return cls(df)

    @staticmethod
    def scrape(horse_id_list):
        """
        Scrape pedigree data from db.netkeiba.com.

        Parameters:
        ----------
        horse_id_list : list
            List of horse IDs (strings; each is appended to the URL).

        Returns:
        ----------
        peds_df : pandas.DataFrame
            One row per successfully scraped horse, columns prefixed with 'peds_'.
            An empty DataFrame is returned when nothing could be scraped.
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                # Peel columns 4..0 off the table one at a time, de-duplicating the
                # remaining rows after each step, then stack them into a single Series.
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df = df.drop([i], axis=1).drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
                time.sleep(1)
            except IndexError:
                # Skip this horse and carry on with the next one.
                continue
            except Exception as e:
                # Any other error: report it and stop, keeping what was scraped so far.
                print(e)
                break
            except KeyboardInterrupt:
                # Presumably here so that interrupting the kernel still returns the
                # partial result instead of discarding it.
                break

        # pd.concat raises ValueError on an empty list, so handle "nothing scraped" explicitly.
        if not peds_dict:
            return pd.DataFrame()

        # Name the columns peds_0, ..., peds_61
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df

    def encode(self):
        """Label-encode every column of ``peds`` (NaN replaced by 'Na') and store the
        category-typed result in ``peds_e``."""
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')

In [12]:
# Other shared definitions

# Racecourse name -> two-digit code string
place_dict = dict(
    zip(
        ['札幌', '函館', '福島', '新潟', '東京', '中山', '中京', '京都', '阪神', '小倉'],
        ['{:02d}'.format(code) for code in range(1, 11)],
    )
)

# Abbreviated race-type label -> full label
race_type_dict = dict([('芝', '芝'), ('ダ', 'ダート'), ('障', '障害')])

def update_data(old, new):
    """
    Combine two frames so that rows of ``new`` supersede rows of ``old``
    that carry the same index label.

    Parameters:
    ----------
    old : pandas.DataFrame
        Existing data
    new : pandas.DataFrame
        Newer data; appended after the surviving rows of ``old``
    """

    superseded = old.index.isin(new.index)
    surviving_old = old.loc[~superseded]
    return pd.concat([surviving_old, new])

def add_data(pe_data, add_data):
    """Left-merge ``add_data`` onto ``pe_data`` by 'horse_id' and restore ``pe_data``'s index labels."""
    merged = pe_data.merge(add_data, how='left', on='horse_id')
    # merge() discards the left index, so put the original labels back.
    merged.index = pe_data.index
    return merged
def add_data1(pe_data, add_data):
    """Left-merge ``add_data`` onto ``pe_data`` by 'date' and restore ``pe_data``'s index labels."""
    merged = pe_data.merge(add_data, how='left', on='date')
    # merge() discards the left index, so put the original labels back.
    merged.index = pe_data.index
    return merged

# One-character racecourse abbreviation -> two-digit code string
place_dict_s = dict(
    zip('札函福新東中名京阪小', ['{:02d}'.format(code) for code in range(1, 11)])
)

# Two-digit code string -> '右' or '左'; only '04', '05' and '07' map to '左'
convert_around_dict = {
    code: ('左' if code in ('04', '05', '07') else '右')
    for code in ['{:02d}'.format(number) for number in range(1, 11)]
}

# Previous-race info: rename 'date' to 'date_f', parse it as datetime, keep four columns.
pre_info = pd.read_pickle('pre_info.pickle').rename(columns={'date': 'date_f'})
pre_info['date_f'] = pd.to_datetime(pre_info['date_f'])
pre_info = pre_info[['horse_id', 'date_f', '前芝ダ', '前走馬場状態']]

# Course table from CSV, without its 'Unnamed: 7' column.
cource_info = pd.read_csv('コース.csv', encoding='utf-8').drop('Unnamed: 7', axis=1)

df_day = pd.read_pickle('df_day.pickle')



In [13]:
horse_results = pd.read_pickle('horse_results_new.pickle')

In [14]:
# Remove unwanted data: drop the rows labelled with each of these index values in turn
# (the labels look like horse IDs -- TODO confirm).
for unwanted_label in ['2005190008', '2006190005', '2006190004', '2006190001', '2005190005']:
    horse_results = horse_results.drop(unwanted_label)

In [15]:
# Wrap the horse results, then de-duplicate them in place; prints the number of
# duplicated rows and the row count before and after the de-duplication.
hr = HorseResults(horse_results)
print(hr.horse_results.duplicated().sum())
print(len(hr.horse_results))
hr.horse_results.drop_duplicates(inplace=True)
print(len(hr.horse_results))
# Copy the index into a temporary column so it can serve as a merge key below.
hr.horse_results['horse_id_'] = hr.horse_results.index
# Drop the existing '上り' column; a column of the same name comes in from target_new below.
hr.horse_results.drop('上り', axis=1, inplace=True)
# Load the preprocessed target table, rename 'date' to 'date_f' and parse it as datetime.
target_new = pd.read_pickle('target_new_prepro.pickle')
target_new.rename(columns={'date': 'date_f'}, inplace=True)
target_new['date_f'] = pd.to_datetime(target_new['date_f'])
# Keep the columns to merge in and index them by (horse_id, date_f).
target_new_hr = target_new[['class_type', 'date_f', 'horse_id', '上り', '間隔']]
target_new_hr = target_new_hr.set_index(['horse_id', 'date_f'])
# Bare expression: not the last line of the cell, so nothing is displayed.
target_new_hr
# Left-merge on (horse_id_, date) against the (horse_id, date_f) index of target_new_hr.
hr.horse_results = hr.horse_results.merge(target_new_hr, left_on=['horse_id_', 'date'], right_index=True, how='left')
# The temporary key column is no longer needed.
hr.horse_results.drop('horse_id_', inplace=True, axis=1)
# Bare expression: not the last line of the cell, so nothing is displayed.
hr.horse_results
hr.horse_results.drop(['馬番'], axis=1, inplace=True)
# Append 'date' to the index so the next merge can match on the index together with 'date'.
hr.horse_results = hr.horse_results.set_index('date', append=True)
# Load the diff table, keep four columns and index it by (horse_id, date).
diff = pd.read_pickle('diff_master.pickle')
diff = diff[['date', 'horse_id', 'y-x', 'diff_final-diff_y']]
diff = diff.set_index(['horse_id', 'date'])
# Left-merge the diff columns using the left index against right_on=['horse_id', 'date'].
hr.horse_results = hr.horse_results.merge(diff, left_index=True, right_on=['horse_id', 'date'], how='left')
# Move 'date' back from the index into a regular column.
hr.horse_results = hr.horse_results.reset_index('date')

In [37]:
# Load every results pickle (one path per line, same order as before) into one Results object.
r = Results.read_pickle([
    'results_20220827_0828.pickle',
    'results_20220820_0821.pickle',
    'results_20220813_0814.pickle',
    'results_20220730_0807.pickle',
    'results_20220716_0724.pickle',
    'results_20220702_0710.pickle',
    'results_20220618_0626.pickle',
    'results_20220611_0612.pickle',
    'results_20220604_0605.pickle',
    'results_20220528_0529.pickle',
    'results_20220521_0522.pickle',
    'results_20220514_0515.pickle',
    'results_20220507_0508.pickle',
    'results_20220416_0501.pickle',
    'results_20220402_0410.pickle',
    'results_20220326_0327.pickle',
    'results_20220319_0321.pickle',
    'results_20220305_0313.pickle',
    'results_20220226_0227.pickle',
    'results_20220219_0220.pickle',
    'results_20220212_0213.pickle',
    'results_20220205_0206.pickle',
    'results_20220129_0130.pickle',
    'results_20220122_0123.pickle',
    'results_20220101_0116.pickle',
    'results_20211225_1228.pickle',
    'results_20211218_1219.pickle',
    'results_20211211_1212.pickle',
    'results_20211204_1205.pickle',
    'results_20211127_1128.pickle',
    'results_20211120_1121.pickle',
    'results_20211113_1114.pickle',
    'results_20211106_1107.pickle',
    'results_20211030_1031.pickle',
    'results_20211023_1024.pickle',
    'results_.pickle',
])

In [38]:
r.data.sort_values('date_list')

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,...,weather,race_type,ground_state,date,horse_id,jockey_id,trainer_id,date_list,index_name,class_type
200906010106,4,1,2,ゴールドアカデミー,牡3,56.0,田中勝春,2:16.7,1.1/4,35,...,晴,芝,良,2009年1月4日,2006104915,00684,01079,2009-01-04,200906010106,未勝利
200906010105,7,4,7,マイネルエルドラド,牡3,56.0,和田竜二,1:36.7,1.3/4,17.6,...,晴,芝,良,2009年1月4日,2006104441,01018,01051,2009-01-04,200906010105,未勝利
200906010105,8,6,11,マイネルフレンズ,牡3,56.0,津村明秀,1:36.9,1.1/4,61.9,...,晴,芝,良,2009年1月4日,2006105778,01092,01029,2009-01-04,200906010105,未勝利
200906010105,9,6,12,トリプルスレット,牝3,54.0,後藤浩輝,1:36.9,クビ,31.1,...,晴,芝,良,2009年1月4日,2006103006,00711,00390,2009-01-04,200906010105,未勝利
200906010105,10,8,16,コスモドロス,牡3,56.0,松岡正海,1:37.2,1.3/4,26.3,...,晴,芝,良,2009年1月4日,2006104837,01085,01080,2009-01-04,200906010105,未勝利
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210040612,4,8,11,デルマヤクシ,牡4,57.0,酒井学,1:08.1,クビ,9.9,...,晴,芝,良,2022年8月28日,2018100480,01034,01180,2022-08-28,202210040612,1勝
202210040612,5,6,7,シーズザデイ,牡3,54.0,富田暁,1:08.2,クビ,5.7,...,晴,芝,良,2022年8月28日,2019101797,01168,01003,2022-08-28,202210040612,1勝
202210040612,6,5,5,ファタリテ,牡6,57.0,国分優作,1:08.3,3/4,80,...,晴,芝,良,2022年8月28日,2016102111,01125,01032,2022-08-28,202210040612,1勝
202204030601,2,8,8,ヴァンクールシルク,セ8,60.0,伴啓太,3:09.3,5,2.8,...,曇,障害,稍重,2022年8月28日,2014106228,01149,01133,2022-08-28,202204030601,未勝利


In [40]:
# Attach the pre_info columns to the raw results, matching on (horse_id, date_list).
# NOTE: this rebinds the notebook-level name `results` and renames a column of the
# shared `pre_info` frame in place.
results = r.data.copy()
pre_info.rename(columns={'date_f': 'date_list'}, inplace=True)
# Parse date_list so both merge keys are datetimes (pre_info's date was converted with pd.to_datetime earlier).
results["date_list"] = pd.to_datetime(results["date_list"], format="%Y年%m月%d日")
# merge() resets the index, so remember the original labels and restore them afterwards.
index_ = results.index
results = results.merge(pre_info, on=['horse_id', 'date_list'], how='left')
results.index = index_
r.data = results
# Last expression of the cell: shows the horse results sorted by date (sort result is not stored).
hr.horse_results.sort_values('date')

In [43]:
r.data.drop(['index_name', 'date_list'], axis=1, inplace=True)

In [44]:
# r.data = r.data[pd.to_datetime(r.data['date'], format="%Y年%m月%d日") > '20191231']
# Rename the three '...万' class labels to '1勝' / '2勝' / '3勝' in one vectorized pass
# instead of three row-by-row apply() scans; all other values are left unchanged.
r.data['class_type'] = r.data['class_type'].replace({'500万': '1勝', '1000万': '2勝', '1600万': '3勝'})


In [45]:
r.preprocessing()

In [46]:
# Attach the per-runner course key, then the course attributes keyed by 'コース'.
cource_prepro = pd.read_pickle('cource_prepro.pickle')
cource_prepro['date'] = pd.to_datetime(cource_prepro['date'])
# merge() returns a fresh index, so keep the original labels and restore them after
# each merge. `left_index=True` was dropped: combining it with `on=` is rejected with
# a MergeError by current pandas, and the index is overwritten right afterwards anyway.
index_ = r.data_p.index
r.data_p = r.data_p.merge(cource_prepro, on=['horse_id', 'date'], how='left')
r.data_p.index = index_
r.data_p = r.data_p.merge(cource_info, on='コース', how='left')
r.data_p.index = index_

In [47]:
r.data_p.sort_values('date')

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,年齢,開催,n_horses,コース,最後の直線,高低差,幅員,straight_type,course_around,最後の直線高低差
201406010106,4,7,54.0,5.3,18.0,晴,ダート,良,2014-01-05,2010102582,...,4,06,16,中山ダ,308.0,4.5,22.5,short,右,2.0
201406010105,5,9,56.0,33.2,20.0,晴,芝,良,2014-01-05,2011101164,...,3,06,16,中山芝C,310.0,5.3,22.0,short,右,2.5
201406010105,3,6,54.0,29.2,20.0,晴,芝,良,2014-01-05,2011100779,...,3,06,16,中山芝C,310.0,5.3,22.0,short,右,2.5
201406010105,3,5,54.0,33.0,20.0,晴,芝,良,2014-01-05,2011101086,...,3,06,16,中山芝C,310.0,5.3,22.0,short,右,2.5
201406010105,5,10,54.0,24.2,20.0,晴,芝,良,2014-01-05,2011106127,...,3,06,16,中山芝C,310.0,5.3,22.0,short,右,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210040612,6,6,55.0,7.6,12.0,晴,芝,良,2022-08-28,2018103063,...,4,10,11,小倉芝B,293.0,3.0,27.0,short,右,0.0
202210040612,7,8,57.0,5.0,12.0,晴,芝,良,2022-08-28,2017100169,...,5,10,11,小倉芝B,293.0,3.0,27.0,short,右,0.0
202210040612,8,11,57.0,9.9,12.0,晴,芝,良,2022-08-28,2018100480,...,4,10,11,小倉芝B,293.0,3.0,27.0,short,右,0.0
202210040612,5,5,57.0,80.0,12.0,晴,芝,良,2022-08-28,2016102111,...,6,10,11,小倉芝B,293.0,3.0,27.0,short,右,0.0


In [48]:
start = time.time()
r.merge_horse_results(hr, n_samples_list=[4, 8, 'all'])
# r.data_h.head() #jupyterで出力
elapsed_time = time.time() - start
elapsed_time

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=942.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=942.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=942.0), HTML(value='')))




2788.604732275009

In [49]:
r.merge_previous_data(hr)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=942.0), HTML(value='')))




In [50]:
r.data_h.sort_values('date')

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,着差_course_around_allR,first_corner_course_around_allR,first_to_rank_course_around_allR,first_to_final_course_around_allR,final_to_rank_course_around_allR,上り_course_around_allR,y-x_course_around_allR,diff_final-diff_y_course_around_allR,前走場所,出走回数
201406010106,1,2,54.0,14.2,18.0,晴,ダート,良,2014-01-05,2010106367,...,1.250000,14.000000,11.500000,5.500000,6.000000,2.500000,-8.250000,-1.425000,中京,9.0
201406010105,3,6,54.0,29.2,20.0,晴,芝,良,2014-01-05,2011100779,...,0.300000,2.000000,0.000000,0.000000,0.000000,7.000000,-1.500000,1.450000,中山,1.0
201406010105,3,5,54.0,33.0,20.0,晴,芝,良,2014-01-05,2011101086,...,1.725000,10.750000,3.000000,1.000000,2.000000,4.750000,-2.175000,-1.525000,中山,6.0
201406010105,5,10,54.0,24.2,20.0,晴,芝,良,2014-01-05,2011106127,...,0.300000,10.000000,7.000000,1.000000,6.000000,3.000000,-1.500000,-5.450000,中山,1.0
201406010105,1,2,56.0,156.0,20.0,晴,芝,良,2014-01-05,2011100853,...,0.700000,7.000000,1.000000,0.500000,0.500000,6.500000,-3.450000,-0.585000,中山,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210040612,6,6,55.0,7.6,12.0,晴,芝,良,2022-08-28,2018103063,...,1.028571,4.000000,-5.714286,-1.142857,-4.571429,11.571429,0.900000,1.771429,阪神,14.0
202210040612,7,8,57.0,5.0,12.0,晴,芝,良,2022-08-28,2017100169,...,1.008333,9.000000,2.083333,1.833333,0.250000,4.833333,-2.925000,0.213333,小倉,25.0
202210040612,8,11,57.0,9.9,12.0,晴,芝,良,2022-08-28,2018100480,...,1.000000,6.571429,0.000000,0.857143,-0.857143,5.857143,-1.242857,0.962857,中京,9.0
202210040612,5,5,57.0,80.0,12.0,晴,芝,良,2022-08-28,2016102111,...,0.912500,9.000000,0.333333,0.916667,-0.583333,7.458333,-1.425000,-1.494167,小倉,29.0


In [52]:
p = Peds.read_pickle(['peds.pickle'])
p.encode()
p.peds_e #jupyterで出力

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2004110237,488,1103,190,512,346,410,151,161,201,339,...,634,1129,192,388,220,719,486,1178,1633,1283
2003104570,870,16527,118,890,102,11923,74,88,487,654,...,383,990,238,190,175,1332,409,899,446,915
2001100925,875,2949,285,752,1723,12953,134,512,533,932,...,184,149,231,411,829,1700,776,2078,1827,5055
2006101121,1030,9916,363,1129,1859,5349,76,464,314,492,...,255,1151,192,337,607,1230,418,147,1926,5048
2004102051,1209,15903,240,397,1166,5709,74,44,189,383,...,150,351,203,208,612,1255,487,1191,1160,2846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020106820,856,15154,407,1255,1970,9364,224,485,448,342,...,525,54,170,421,307,1171,398,756,186,6581
2020104539,1094,21195,407,1225,1304,10138,224,485,273,229,...,13,1014,238,239,109,231,286,1709,800,4025
2020106275,703,4256,407,995,1433,6153,224,485,448,156,...,62,1179,126,129,763,993,776,2004,755,1907
2020106120,1283,7991,340,1259,1766,9105,99,532,339,856,...,545,170,75,396,151,450,433,897,1899,6090


In [53]:
r.merge_peds(p.peds_e)
r.data_pe.head() #jupyterで出力

scrape peds at horse_id_list "no_peds"


Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
202201020612,4,5,56.0,22.1,18.0,晴,芝,良,2022-08-28,2018102158,...,294,769,185,132,752,1619,199,1925,1089,1061
202201020612,7,12,58.0,4.9,18.0,晴,芝,良,2022-08-28,2018101562,...,97,524,201,218,366,697,776,1954,1970,5624
202201020612,3,3,56.0,3.9,18.0,晴,芝,良,2022-08-28,2019102820,...,249,887,198,381,567,1767,519,1303,2070,6348
202201020612,5,7,58.0,16.0,18.0,晴,芝,良,2022-08-28,2015102127,...,370,336,203,208,341,517,398,756,11,3996
202201020612,7,11,56.0,7.1,18.0,晴,芝,良,2022-08-28,2017105623,...,766,1108,367,40,131,1349,338,586,1747,2663


In [54]:
# Derive family-number features from the text of the 'peds_1' pedigree column.
fn = p.peds[['peds_1']].copy()
fn['peds_1'] = fn['peds_1'].astype('str')
# Everything after 'FNo.' with the square brackets removed; NaN when the marker is absent.
fn['family_number'] = [
    text.split('FNo.')[1].replace('[', '').replace(']', '') if 'FNo.' in text else np.nan
    for text in fn['peds_1']
]
fn = fn.dropna()
# Keep the full value as the sub number; the main number is the part before the first '-'.
fn['family_number_sub'] = fn['family_number']
fn['family_number'] = [number.split('-')[0] for number in fn['family_number_sub']]
fn = fn[['family_number', 'family_number_sub']]
fn

Unnamed: 0,family_number,family_number_sub
2004110237,22,22-d
2003104570,18,18
2001100925,5,5-d
2006101121,7,7-d
2004102051,2,2-n
...,...,...
2020106820,4,4-d
2020104539,16,16-g
2020106275,8,8-f
2020106120,9,9-f


In [55]:
# Label-encode each column of fn independently (a fresh LabelEncoder per column, NaN
# replaced by 'Na' first), then convert every column to the 'category' dtype.
# NOTE(review): the encoders are not kept, so the integer codes depend on the set of
# values present in fn at the time this cell runs -- confirm that is intended.
for column in fn.columns:
    fn[column] = LabelEncoder().fit_transform(fn[column].fillna('Na'))
fn = fn.astype('category')

In [56]:
r.data_pe = r.data_pe.merge(fn, left_on='horse_id', right_index=True, how='left')
r.data_pe

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,...,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,family_number,family_number_sub
202201020612,4,5,56.0,22.1,18.0,晴,芝,良,2022-08-28,2018102158,...,185,132,752,1619,199,1925,1089,1061,0,16
202201020612,7,12,58.0,4.9,18.0,晴,芝,良,2022-08-28,2018101562,...,201,218,366,697,776,1954,1970,5624,26,130
202201020612,3,3,56.0,3.9,18.0,晴,芝,良,2022-08-28,2019102820,...,198,381,567,1767,519,1303,2070,6348,26,143
202201020612,5,7,58.0,16.0,18.0,晴,芝,良,2022-08-28,2015102127,...,203,208,341,517,398,756,11,3996,33,182
202201020612,7,11,56.0,7.1,18.0,晴,芝,良,2022-08-28,2017105623,...,367,40,131,1349,338,586,1747,2663,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202110020812,2,3,52.0,132.9,17.0,曇,ダート,重,2021-03-07,2016104645,...,319,613,48,339,506,660,1049,4278,28,153
202110020812,6,11,51.0,165.6,17.0,曇,ダート,重,2021-03-07,2017103403,...,75,396,151,450,566,1813,366,4790,0,9
202110020812,8,14,53.0,185.3,17.0,曇,ダート,重,2021-03-07,2013100206,...,126,82,341,400,433,897,358,917,26,130
202110020812,4,7,52.0,52.2,17.0,曇,ダート,重,2021-03-07,2017103320,...,126,82,341,400,433,897,1899,6090,33,187


In [57]:
r.process_categorical() #r.le_horse, r.le_jockeyに対応関係が保存される

In [58]:
r.data_c = add_data1(r.data_c, df_day)

In [59]:
r.data_c.drop_duplicates(inplace=True)

In [60]:
r.data_c.drop('コース', axis=1, inplace=True)

In [61]:
r.data_c.sort_values('date')

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,trainer_id,前芝ダ,...,前走場所_初出走,前走場所_小倉,前走場所_阪神,前走場所_京都,core_distance_非根幹距離,core_distance_根幹距離,straight_type_long,straight_type_short,cos_day,sin_day
201406010106,1,2,54.0,14.2,18.0,2014-01-05,3755,201,204,ダ,...,0,0,0,0,1,0,0,1,0.085965,0.996298
201406010105,3,6,54.0,29.2,20.0,2014-01-05,4328,71,104,芝,...,0,0,0,0,0,1,0,1,0.085965,0.996298
201406010105,3,5,54.0,33.0,20.0,2014-01-05,4477,4,65,芝,...,0,0,0,0,0,1,0,1,0.085965,0.996298
201406010105,5,10,54.0,24.2,20.0,2014-01-05,7641,7,56,芝,...,0,0,0,0,0,1,0,1,0.085965,0.996298
201406010105,1,2,56.0,156.0,20.0,2014-01-05,4367,46,81,芝,...,0,0,0,0,0,1,0,1,0.085965,0.996298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210040612,6,6,55.0,7.6,12.0,2022-08-28,37774,117,152,芝,...,0,0,1,0,0,1,0,1,-0.835925,-0.548843
202210040612,7,8,57.0,5.0,12.0,2022-08-28,31295,53,231,芝,...,0,1,0,0,0,1,0,1,-0.835925,-0.548843
202210040612,8,11,57.0,9.9,12.0,2022-08-28,36208,50,250,芝,...,0,0,0,0,0,1,0,1,-0.835925,-0.548843
202210040612,5,5,57.0,80.0,12.0,2022-08-28,27858,105,106,芝,...,0,1,0,0,0,1,0,1,-0.835925,-0.548843


In [62]:
# Difference between this race's 'course_len' and '前走距離', computed column-wise
# instead of with a row-by-row apply().
r.data_c['前走距離差'] = r.data_c['course_len'] - r.data_c['前走距離']

In [63]:
r.data_c.drop('前走距離', axis=1, inplace=True)

In [64]:
r.data_c.drop(['前走馬場状態', '前芝ダ'], axis=1, inplace=True)

In [65]:
date = '2022/9/3'

In [66]:
# Characters 4-5 of each index label (a string), taken with a vectorized string slice
# instead of a row-by-row apply().
r.data_c['開催場所'] = r.data_c.index.str[4:6]

In [67]:
r.data_c['course_around'] = r.data_c['開催場所'].map(convert_around_dict)

In [68]:
# Rows with place code '04' and course_len 10 get '直線'; every other row keeps its
# existing course_around value. Done with a boolean mask instead of a row-by-row apply().
_is_straight_course = (r.data_c['開催場所'] == '04') & (r.data_c['course_len'] == 10)
r.data_c['course_around'] = r.data_c['course_around'].mask(_is_straight_course, '直線')

In [69]:
r.data_c = pd.get_dummies(r.data_c, columns=['course_around'])

In [70]:
# r.data_c = pd.get_dummies(r.data_c, columns=['開催回'])

In [71]:
r.data_c.drop('開催場所', axis=1, inplace=True)

In [59]:
r.data_c = r.data_c[r.data_c['date'] >  '20131231']
r.data_c.sort_values('date')
# hr.horse_results.sort_values('date')

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,trainer_id,rank,...,core_distance_根幹距離,core_distance_非根幹距離,straight_type_long,straight_type_short,cos_day,sin_day,前走距離差,course_around_右,course_around_左,course_around_直線
201406010106,1,2,54.0,14.2,18.0,2014-01-05,3755,201,204,0,...,0,1,0,1,0.085965,0.996298,0.0,1,0,0
201406010105,3,6,54.0,29.2,20.0,2014-01-05,4328,71,104,0,...,1,0,0,1,0.085965,0.996298,2.0,1,0,0
201406010105,3,5,54.0,33.0,20.0,2014-01-05,4477,4,65,0,...,1,0,0,1,0.085965,0.996298,0.0,1,0,0
201406010105,5,10,54.0,24.2,20.0,2014-01-05,7641,7,56,0,...,1,0,0,1,0.085965,0.996298,4.0,1,0,0
201406010105,1,2,56.0,156.0,20.0,2014-01-05,4367,46,81,0,...,1,0,0,1,0.085965,0.996298,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202109040612,2,4,57.0,5.8,14.0,2021-10-24,35709,22,102,0,...,0,1,0,1,-0.920971,0.389630,0.0,1,0,0
202109040612,3,6,57.0,70.2,14.0,2021-10-24,27226,120,246,0,...,0,1,0,1,-0.920971,0.389630,-2.0,1,0,0
202109040612,6,11,57.0,105.0,14.0,2021-10-24,19823,40,169,0,...,0,1,0,1,-0.920971,0.389630,-4.0,1,0,0
202105040601,3,3,55.0,195.2,16.0,2021-10-24,42189,20,51,0,...,1,0,1,0,-0.920971,0.389630,2.0,0,1,0


In [72]:
r.data_c = r.data_c.sort_values('date')
r.data_c

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,trainer_id,rank,...,core_distance_非根幹距離,core_distance_根幹距離,straight_type_long,straight_type_short,cos_day,sin_day,前走距離差,course_around_右,course_around_左,course_around_直線
201406010106,1,2,54.0,14.2,18.0,2014-01-05,3755,201,204,0,...,1,0,0,1,0.085965,0.996298,0.0,1,0,0
201406010105,3,6,54.0,29.2,20.0,2014-01-05,4328,71,104,0,...,0,1,0,1,0.085965,0.996298,2.0,1,0,0
201406010105,3,5,54.0,33.0,20.0,2014-01-05,4477,4,65,0,...,0,1,0,1,0.085965,0.996298,0.0,1,0,0
201406010105,5,10,54.0,24.2,20.0,2014-01-05,7641,7,56,0,...,0,1,0,1,0.085965,0.996298,4.0,1,0,0
201406010105,1,2,56.0,156.0,20.0,2014-01-05,4367,46,81,0,...,0,1,0,1,0.085965,0.996298,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202210040612,6,6,55.0,7.6,12.0,2022-08-28,37774,117,152,1,...,0,1,0,1,-0.835925,-0.548843,-2.0,1,0,0
202210040612,7,8,57.0,5.0,12.0,2022-08-28,31295,53,231,1,...,0,1,0,1,-0.835925,-0.548843,0.0,1,0,0
202210040612,8,11,57.0,9.9,12.0,2022-08-28,36208,50,250,0,...,0,1,0,1,-0.835925,-0.548843,-2.0,1,0,0
202210040612,5,5,57.0,80.0,12.0,2022-08-28,27858,105,106,0,...,0,1,0,1,-0.835925,-0.548843,0.0,1,0,0


In [522]:
race_id_list = ['2022010208{}'.format(str(i).zfill(2)) for i in range(1, 13, 1)]
st = ShutubaTable.scrape(race_id_list, date)
ut = time.time()
ut
#使用芝コース
turf_status = 'C'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [523]:
hr.horse_results.drop_duplicates(inplace=True)
print(len(hr.horse_results))
date

1523435


'2022/9/4'

In [526]:
st.preprocessing()

In [527]:
# Build the 'コース' key (place + surface [+ inner/outer]) for every running horse of
# every race in st.data_p, scraped from each race's shutuba_past page.
race_list = st.data_p.index.unique()
cource_info_list = []
known_places = {'札幌', '函館', '新潟', '福島', '東京', '中山', '中京', '阪神', '京都', '小倉'}
# Places whose key never carries an inner/outer suffix.
places_without_uchisoto = {'小倉', '中京', '中山', '東京', '福島', '函館', '札幌'}
for race in race_list:
    url = 'https://race.netkeiba.com/race/shutuba_past.html?race_id=' + race +'&rf=shutuba_submenu'
    html = requests.get(url)
    html.encoding = 'EUC-JP'
    soup = BeautifulSoup(html.text, 'html.parser')

    # Reset per race so that a page lacking the information cannot silently reuse
    # the surface or place found for the previous race.
    baba = None
    place = None
    uchisoto = '内'

    texts1 = re.findall(r'\w+', soup.find('div', attrs={'class': 'RaceData01'}).text)
    for text1 in texts1:
        if '芝' in text1:
            # Turf: first character of the token plus the turf course currently in use.
            baba = text1[0] + turf_status
        if 'ダ' in text1:
            baba = text1[0]
        if text1 == '外':
            uchisoto = text1

    texts2 = re.findall(r'\w+', soup.find('div', attrs={'class': 'RaceData02'}).text)
    for text2 in texts2:
        if text2 in known_places:
            place = text2

    if baba is None or place is None:
        raise ValueError('could not determine place/surface for race_id {}'.format(race))

    horse_len = len(soup.find_all('div', attrs={'class': 'Horse06'}))
    if baba == 'ダ' or place in places_without_uchisoto:
        uchisoto = ''

    # Explicit positional access: the Series is labelled by race id, not by 0..n-1.
    marks = st.data.loc[race]['印']
    for i in range(horse_len):
        # Cancelled ('取消') and excluded ('除外') entries get no course key.
        if marks.iloc[i] not in ('取消', '除外'):
            cource_info_list.append(place + baba + uchisoto)

In [528]:
# Attach the course attributes to the entry table via the scraped 'コース' key.
st.data_p['コース'] = cource_info_list
# merge() returns a fresh index, so keep the original labels and restore them.
# `left_index=True` was dropped: combining it with `on=` is rejected with a MergeError
# by current pandas, and the index is overwritten right afterwards anyway.
index_ = st.data_p.index
st.data_p = st.data_p.merge(cource_info, on='コース', how='left')
st.data_p.index = index_
st.data_p = st.data_p.drop('コース', axis=1)

In [529]:
#馬の過去成績データの追加。新馬はNaNが追加される
st.merge_horse_results(hr, n_samples_list=[4, 8, 'all'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [531]:
# st.data_h.drop('騎手の通算騎乗回数', axis=1, inplace=True)
horse_id_list_st = st.data_p['horse_id']
target_df_st = r.data_p.query('horse_id in @horse_id_list_st')
pre_data_dict = {}
target_df_st = target_df_st.set_index('horse_id')
# pre_run_data_r = target_df_st[target_df_st['date'] < date].sort_values('date', ascending=False)
# pre_data_dict['騎手の通算騎乗回数'] = pre_run_data_r.groupby(['horse_id', 'jockey_id']).size().reset_index().rename(columns={0: '騎手の通算騎乗回数'})
# index = st.data_h.index
# st.data_h = st.data_h.merge(pre_data_dict['騎手の通算騎乗回数'], left_on=['horse_id','jockey_id'], right_on=['horse_id','jockey_id'], how='left')
# st.data_h.index = index
# st.data_h.fillna(0.0, inplace=True)

In [532]:
def average(data_p, horse_id_list, date, n_samples='all'):
    """
    Compute per-horse means of past-result columns, grouped by 'ground_state' and by
    'straight_type', and store them in the notebook-level ``average_dict``.

    Reads the notebook-level ``hr`` (its ``horse_results`` frame) and writes
    ``average_dict['ground_state']`` and ``average_dict['straight_type']``.

    Parameters:
    ----------
    data_p : pandas.DataFrame
        Preprocessed race results; must have 'horse_id', 'date', 'ground_state'
        and 'straight_type' columns.
    horse_id_list : list-like
        Horse IDs to include.
    date : str or datetime-like
        Only rows with 'date' strictly before this value are used.
    n_samples : int or 'all', default 'all'
        Number of most recent rows per horse to use; 'all' uses every earlier row.

    Raises:
    ----------
    Exception
        If ``n_samples`` is a number that is not > 0.
    """
    target_list = ['着順', '賞金', '着差', 'first_corner','first_to_rank', 'first_to_final','final_to_rank', '上り']

    def _before_date(frame):
        # Rows before `date`, optionally limited to the newest n_samples per index label.
        past = frame[frame['date'] < date]
        if n_samples == 'all':
            return past
        if n_samples > 0:
            return past.sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        raise Exception('n_samples must be > 0')

    # Use the arguments rather than the notebook globals r.data_p / horse_id_list_st.
    target_df_r = data_p.query('horse_id in @horse_id_list').set_index('horse_id')
    filtered_df_r = _before_date(target_df_r)
    target_df = hr.horse_results.query('horse_id in @horse_id_list')
    filtered_df = _before_date(target_df)

    # NOTE(review): these merges match on horse_id only, so every past-result row is
    # paired with every race row of the same horse -- confirm that is intended.
    for column in ['ground_state', 'straight_type']:
        filtered_df = filtered_df.merge(filtered_df_r[[column]], left_on='horse_id', right_index=True, how='left')
    for column in ['ground_state', 'straight_type']:
        average_dict[column] = filtered_df.groupby(['horse_id', column])[target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))

In [533]:
target_columns_list = []
average_dict = {}

In [538]:
average(r.data_p, horse_id_list_st, date, 'all')

In [539]:
# For each grouping column, left-merge its averages onto st.data_p by (horse_id, column)
# and record the names of the added columns as one list per grouping column.
# NOTE: not idempotent -- each run merges into st.data_p again and appends again.
for column in ['ground_state', 'straight_type']:
    st.data_p = st.data_p.merge(average_dict[column], left_on=['horse_id', column], right_index=True, how='left')
    target_columns_list.append(list(average_dict[column].columns)) 
# Print the resulting number of columns.
print(len(st.data_p.columns))

39


In [540]:
# Flatten the list of per-column name lists into one flat list of names.
_flat_columns = []
for _column_group in target_columns_list:
    _flat_columns = _flat_columns + _column_group
target_columns_list = _flat_columns
target_columns_list

['着順_ground_state_allR',
 '賞金_ground_state_allR',
 '着差_ground_state_allR',
 'first_corner_ground_state_allR',
 'first_to_rank_ground_state_allR',
 'first_to_final_ground_state_allR',
 'final_to_rank_ground_state_allR',
 '上り_ground_state_allR',
 '着順_straight_type_allR',
 '賞金_straight_type_allR',
 '着差_straight_type_allR',
 'first_corner_straight_type_allR',
 'first_to_rank_straight_type_allR',
 'first_to_final_straight_type_allR',
 'final_to_rank_straight_type_allR',
 '上り_straight_type_allR']

In [541]:
st.data_p[target_columns_list]

Unnamed: 0,着順_ground_state_allR,賞金_ground_state_allR,着差_ground_state_allR,first_corner_ground_state_allR,first_to_rank_ground_state_allR,first_to_final_ground_state_allR,final_to_rank_ground_state_allR,上り_ground_state_allR,着順_straight_type_allR,賞金_straight_type_allR,着差_straight_type_allR,first_corner_straight_type_allR,first_to_rank_straight_type_allR,first_to_final_straight_type_allR,final_to_rank_straight_type_allR,上り_straight_type_allR
202201020801,6.000000,0.000000,0.600000,6.000000,0.000000,0.000000,0.000000,4.000000,6.000000,0.000000,0.600000,6.000000,0.000000,0.000000,0.000000,4.000000
202201020801,7.000000,23.333333,2.466667,5.666667,-1.333333,0.000000,-1.333333,6.666667,7.000000,23.333333,2.466667,5.666667,-1.333333,0.000000,-1.333333,6.666667
202201020801,,,,,,,,,3.000000,160.000000,0.600000,6.000000,3.000000,-0.500000,3.500000,2.500000
202201020801,,,,,,,,,3.000000,180.000000,0.200000,3.000000,0.000000,-1.000000,1.000000,4.000000
202201020801,2.000000,280.000000,0.000000,1.000000,-1.000000,0.000000,-1.000000,3.000000,2.000000,280.000000,0.000000,1.000000,-1.000000,0.000000,-1.000000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202201020812,6.714286,108.642857,0.800000,9.857143,3.142857,0.750000,2.392857,4.285714,6.714286,108.642857,0.800000,9.857143,3.142857,0.750000,2.392857,4.285714
202201020812,6.000000,237.142857,1.400000,8.571429,2.571429,1.571429,1.000000,5.571429,6.000000,237.142857,1.400000,8.571429,2.571429,1.571429,1.000000,5.571429
202201020812,3.777778,234.555556,0.455556,6.666667,2.888889,1.333333,1.555556,4.000000,3.777778,234.555556,0.455556,6.666667,2.888889,1.333333,1.555556,4.000000
202201020812,7.520000,135.504000,1.412000,6.080000,-1.440000,-0.560000,-0.880000,8.160000,7.520000,135.504000,1.412000,6.080000,-1.440000,-0.560000,-0.880000,8.160000


In [542]:
st.data_h[target_columns_list] = st.data_p[target_columns_list]

In [543]:
# Scrape the text of each running horse's 'Horse06' cell from the shutuba_past page and
# store the digits it contains as a float in kankaku_list.
race_list = st.data_h.index.unique()
kankaku_list = []
horse_len_list = []
for race in race_list:
    url = 'https://race.netkeiba.com/race/shutuba_past.html?race_id=' + race +'&rf=shutuba_submenu'
    html = requests.get(url)
    html.encoding = 'EUC-JP'
    soup = BeautifulSoup(html.text, 'html.parser')
    # Search the page once per race instead of once per horse.
    interval_cells = soup.find_all('div', attrs={'class': 'Horse06'})
    horse_len = len(interval_cells)
    horse_len_list.append(horse_len)
    # Explicit positional access: the Series is labelled by race id, not by 0..n-1.
    marks = st.data.loc[race]['印']
    for i, cell in enumerate(interval_cells):
        # Cancelled ('取消') and excluded ('除外') entries are skipped.
        if marks.iloc[i] in ('取消', '除外'):
            continue
        kankaku = cell.text
        if kankaku == '':
            # Empty cell: no value available.
            kankaku_list.append(np.nan)
            continue
        # Keep only the digits; text without any digit counts as 0.
        kankaku = re.sub(r'\D', '', kankaku)
        kankaku_list.append(float(kankaku) if kankaku != '' else 0.0)

In [544]:
st.data_h['間隔'] = kankaku_list

In [545]:
# For every running horse, look up its most recent race before `date` on its
# db.netkeiba.com page and record that race's distance (/100), surface and going.
last_len_list = []
pre_race_kinds_list = []
pre_race_baba_list = []
# NOTE: only receives NaN for horses without a usable previous race; the weight
# parsing for the other horses is disabled, so this list is not aligned with the others.
pre_horse_weight_list = []
# The cutoff is the same for every horse: 'YYYY/M/D' -> 'YYYYMMDD'.
date_parts = date.split('/')
target_date = date_parts[0] + date_parts[1].zfill(2) + date_parts[2].zfill(2)
for race in race_list:
    url = 'https://race.netkeiba.com/race/shutuba_past.html?race_id=' + race +'&rf=shutuba_submenu'
    html = requests.get(url)
    html.encoding = 'EUC-JP'
    soup = BeautifulSoup(html.text, 'html.parser')
    horse_len = len(soup.find_all('div', attrs={'class': 'Horse06'}))
    # Explicit positional access: these Series are labelled by race id, not by 0..n-1.
    marks = st.data.loc[race]['印']
    horse_ids = st.data.loc[race]['horse_id']
    for i in range(horse_len):
        # Cancelled ('取消') and excluded ('除外') entries are skipped.
        if marks.iloc[i] in ('取消', '除外'):
            continue
        horse_url = 'https://db.netkeiba.com/horse/' + horse_ids.iloc[i]
        # Download and parse the horse page once (it used to be fetched up to three times).
        tables = pd.read_html(horse_url)
        past_races = None
        if len(tables) >= 4:
            past_races = tables[3]
            # For a horse with awards, the awards table comes at this position,
            # so the results table is the next one.
            if past_races.columns[0] == '受賞歴':
                past_races = tables[4] if len(tables) > 4 else None
        # A table with a column labelled 0 is not a usable results table.
        if past_races is not None and 0 in past_races.columns:
            past_races = None
        if past_races is not None:
            past_races = past_races[pd.to_datetime(past_races['日付']) < target_date]
        if past_races is None or len(past_races) == 0:
            last_len_list.append(np.nan)
            pre_race_kinds_list.append(np.nan)
            pre_race_baba_list.append(np.nan)
            pre_horse_weight_list.append(np.nan)
            continue
        # The first remaining row is taken as the previous race.
        previous_info_kinds = past_races['距離'].iloc[0]
        previous_info_baba = past_races['馬場'].iloc[0]
        # '距離' mixes a surface label and a distance: digits -> distance, the rest -> surface.
        last_len = re.sub(r'\D', '', previous_info_kinds)
        pre_race_kinds = re.sub(r'\d', '', previous_info_kinds)
        last_len_list.append(float(int(last_len) / 100))
        pre_race_kinds_list.append(pre_race_kinds)
        pre_race_baba_list.append(previous_info_baba)

In [546]:
def convert_kinds(k):
    """Normalize a previous-race surface label.

    The label '障' is replaced by '芝'; any other value (including a missing
    value) is returned unchanged.
    """
    return '芝' if k == '障' else k

In [547]:
# Run every scraped previous-race surface label through convert_kinds.
pre_race_kinds_list = [convert_kinds(kind) for kind in pre_race_kinds_list]

In [548]:
# Merge previous-race information into `st` using the horse-results object `hr`
# (see merge_previous_data for exactly which columns it adds).
st.merge_previous_data(hr)
# st.data_h['前走体重'] = pre_horse_weight_list

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [550]:
# Store the scraped previous-race surface and ground-condition lists as columns.
# NOTE(review): assumes each list holds one entry per row of st.data_h, in row order — confirm.
st.data_h['前芝ダ'] = pre_race_kinds_list
st.data_h['前走馬場状態'] = pre_race_baba_list

In [551]:
# Store the previous-race distance; the scraping loop records it divided by 100.
# NOTE(review): assumes one list entry per row of st.data_h, in row order — confirm.
st.data_h['前走距離'] = last_len_list

In [552]:
# Add five generations of pedigree data.
st.merge_peds(p.peds_e)

scrape peds at horse_id_list "no_peds"


In [553]:
# Scrape pedigrees for the horse ids that had no pedigree data during merge_peds
# (st.no_peds), then fold them into the stored pedigree table.
if len(st.no_peds) > 0 :
    peds_new = Peds.scrape(st.no_peds)
    peds.to_pickle('peds_h.pickle') # back up the current peds before updating it
    peds = update_data(peds, peds_new)
    peds.to_pickle('peds.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [554]:
# Reload the pedigree data from peds.pickle (the previous cell rewrites that file
# when new pedigrees were scraped).
p = Peds.read_pickle(['peds.pickle'])
# p = Peds.read_pickle(['peds_new.pickle'])

In [555]:
# Encode the reloaded pedigree data.
# NOTE(review): presumably this populates p.peds_e, which the next merge uses — confirm in Peds.encode.
p.encode()

In [556]:
# Run the pedigree merge again with the reloaded and re-encoded pedigree object.
st.merge_peds(p.peds_e)

In [557]:
# Derive family-number features from the first pedigree column and attach them to st.data_pe.
# The column slice is taken first and then copied, so the assignments below write to an
# independent frame instead of to a slice of another frame.
fn = p.peds[['peds_1']].copy()
fn['peds_1'] = fn['peds_1'].astype('str')

# Full family number = the text following 'FNo.' with square brackets removed.
# Rows without the marker become NaN and are dropped.
fn['family_number'] = fn['peds_1'].map(
    lambda text: text.split('FNo.')[1].replace('[', '').replace(']', '') if 'FNo.' in text else np.nan
)
fn = fn.dropna().copy()

# Keep the full value as the sub-family feature and reduce family_number to the part
# before the first '-' (str.split leaves values without '-' unchanged).
fn['family_number_sub'] = fn['family_number']
fn['family_number'] = fn['family_number'].map(lambda value: value.split('-')[0])
fn = fn[['family_number', 'family_number_sub']].copy()

# Label-encode both columns. Rows with NaN were dropped above, so no fill value is needed.
for column in fn.columns:
    fn[column] = LabelEncoder().fit_transform(fn[column])
fn = fn.astype('category')

# Remove family-number columns left over from an earlier run of this cell so the merge
# does not produce *_x / *_y duplicates, then match st.data_pe['horse_id'] against fn's index.
st.data_pe = st.data_pe.drop(columns=list(fn.columns), errors='ignore').merge(
    fn, left_on='horse_id', right_index=True, how='left'
)
st.data_pe

Unnamed: 0,枠番,馬番,斤量,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,...,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,family_number,family_number_sub
202201020801,1,1,54,20.0,晴,芝,良,2022-09-04,2020106402,01140,...,115,158,866,1709,514,1496,800,3813,26,139
202201020801,2,2,53,20.0,晴,芝,良,2022-09-04,2020105612,01186,...,182,617,19,17,236,966,1372,3542,0,16
202201020801,3,3,54,20.0,晴,芝,良,2022-09-04,2020100742,01091,...,127,675,485,767,399,1870,1943,5531,27,144
202201020801,4,4,54,20.0,晴,芝,良,2022-09-04,2020105576,01093,...,203,192,742,1549,532,693,289,2475,28,152
202201020801,5,5,54,20.0,晴,芝,良,2022-09-04,2020103235,05339,...,127,675,872,1681,4,711,1669,104,26,143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202201020812,6,10,57,17.0,晴,ダート,良,2022-09-04,2017106662,01143,...,203,208,109,1227,492,1669,2055,5581,28,150
202201020812,7,11,54,17.0,晴,ダート,良,2022-09-04,2019100800,01109,...,245,672,775,791,310,1114,329,2800,14,101
202201020812,7,12,52,17.0,晴,ダート,良,2022-09-04,2019104537,01093,...,75,396,151,450,305,420,660,1024,33,187
202201020812,8,13,55,17.0,晴,ダート,良,2022-09-04,2016100161,01192,...,192,337,549,1044,636,1461,1201,3880,7,56


In [559]:
# Run categorical processing on `st`, passing r's le_horse / le_jockey / le_trainer
# objects and r.data_pe.
# NOTE(review): presumably this keeps the encoding consistent with `r` — confirm in process_categorical.
st.process_categorical(r.le_horse, r.le_jockey, r.le_trainer, r.data_pe)

In [560]:
# Remove the '上り' column from the processed frame.
st.data_c = st.data_c.drop(columns=['上り'])

In [561]:
# Replace st.data_c with the result of add_data1 applied to it and df_day
# (both add_data1 and df_day are defined elsewhere in the notebook).
st.data_c = add_data1(st.data_c, df_day)

In [562]:
# Distance change versus the previous race, computed column-wise instead of with a
# row-by-row apply. Both columns come from the same frame, so rows stay aligned.
st.data_c['前走距離差'] = st.data_c['course_len'] - st.data_c['前走距離']
# The raw previous-race distance is dropped once the difference exists.
st.data_c = st.data_c.drop(columns=['前走距離'])

In [565]:
# Remove the previous-race surface and ground-condition columns.
st.data_c = st.data_c.drop(columns=['前芝ダ', '前走馬場状態'])


In [568]:
# Characters 4-5 of the index label are used as the venue code ('開催場所').
# Sliced on the whole index at once instead of row by row.
st.data_c['開催場所'] = st.data_c.index.str[4:6]
# Look up each venue code in convert_around_dict.
st.data_c['course_around'] = st.data_c['開催場所'].map(convert_around_dict)
# Venue '04' with course_len == 10 is labelled '直線' instead of the mapped value.
# The mask is a plain boolean array, so it is applied positionally.
is_straight = ((st.data_c['開催場所'] == '04') & (st.data_c['course_len'] == 10)).to_numpy()
st.data_c['course_around'] = st.data_c['course_around'].mask(is_straight, '直線')
# One-hot encode the label, then remove the temporary venue column.
st.data_c = pd.get_dummies(st.data_c, columns=['course_around'])
st.data_c = st.data_c.drop(columns=['開催場所'])

In [573]:
# Columns present in r.data_c but absent from st.data_c, listed in r.data_c's column order.
# A set difference has no defined ordering, which made the order of the columns later
# created from this list unpredictable between runs.
sabun_list = [col for col in dict.fromkeys(r.data_c.columns) if col not in st.data_c.columns]

In [575]:
# 'rank' and '単勝' are excluded from the columns to create. Filtering (rather than
# list.remove) keeps this cell re-runnable and does not raise ValueError when a name
# is already absent from the list.
sabun_list = [col for col in sabun_list if col not in ('rank', '単勝')]

In [576]:
# Create every column named in sabun_list on st.data_c, filled with 0.
if len(sabun_list) > 0:
    print('ok')
    st.data_c[sabun_list] = 0

ok


In [580]:
# Print the column counts of r.data_c and st.data_c for comparison.
print(len(r.data_c.columns))
print(len(st.data_c.columns))


324
326


In [582]:
# Drop the four suffixed columns from both frames; the presence of '間隔_x' is the
# marker that they exist.
for _frame_owner in (r, st):
    if '間隔_x' in _frame_owner.data_c.columns:
        _frame_owner.data_c.drop(['間隔_x', '間隔_y', '前走距離_x', '前走距離_y'], axis=1, inplace=True)

In [584]:
def split_data(df, test_size=0.3):
    """Split a frame chronologically by its index ids.

    The unique index ids are ordered by the "date" column; the earliest
    (1 - test_size) share of ids forms the first frame and the remaining
    ids form the second, so all rows sharing an id stay together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "date" column.
    test_size : float, default 0.3
        Fraction of unique index ids assigned to the second frame.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, test)
    """
    ordered_ids = df.sort_values("date").index.unique()
    # Position of the boundary between the earlier and the later ids.
    cut = round(len(ordered_ids) * (1 - test_size))
    return df.loc[ordered_ids[:cut]], df.loc[ordered_ids[cut:]]

In [585]:
# Hold out the latest 28% of unique index ids (ordered by date) as the test set.
train, test = split_data(r.data_c, test_size=0.28)

In [586]:
# Split the training part again, keeping its latest 28% of ids for validation.
# NOTE: `train` is rebound here, so re-running only this cell shrinks it a second time.
train, valid = split_data(train, test_size=0.28)

In [587]:
# Display st.data_c to inspect the processed columns.
st.data_c

Unnamed: 0,枠番,馬番,斤量,course_len,date,horse_id,jockey_id,年齢,n_horses,trainer_id,...,core_distance_非根幹距離,core_distance_根幹距離,straight_type_long,straight_type_short,cos_day,sin_day,前走距離差,course_around_右,course_around_直線,course_around_左
202201020801,1,1,54,20.0,2022-09-04,46313,119,2,8,116,...,0,1,0,1,-0.895839,-0.444378,2.0,1,0,0
202201020801,2,2,53,20.0,2022-09-04,46200,165,2,8,111,...,0,1,0,1,-0.895839,-0.444378,2.0,1,0,0
202201020801,3,3,54,20.0,2022-09-04,45455,79,2,8,123,...,0,1,0,1,-0.895839,-0.444378,2.0,1,0,0
202201020801,4,4,54,20.0,2022-09-04,46197,81,2,8,66,...,0,1,0,1,-0.895839,-0.444378,2.0,1,0,0
202201020801,5,5,54,20.0,2022-09-04,45814,213,2,8,175,...,0,1,0,1,-0.895839,-0.444378,2.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202201020812,6,10,57,17.0,2022-09-04,35632,122,5,14,81,...,1,0,0,1,-0.895839,-0.444378,0.0,1,0,0
202201020812,7,11,54,17.0,2022-09-04,41181,91,3,14,232,...,1,0,0,1,-0.895839,-0.444378,0.0,1,0,0
202201020812,7,12,52,17.0,2022-09-04,43425,81,3,14,187,...,1,0,0,1,-0.895839,-0.444378,0.0,1,0,0
202201020812,8,13,55,17.0,2022-09-04,26559,171,6,14,94,...,1,0,0,1,-0.895839,-0.444378,0.0,1,0,0


In [588]:
# Features are every column except 'rank', 'date' and '単勝'; the target is 'rank'.
_non_feature_columns = ['rank', 'date', '単勝']
X_train, y_train = train.drop(columns=_non_feature_columns), train['rank']
X_valid, y_valid = valid.drop(columns=_non_feature_columns), valid['rank']

In [589]:
# Print the column counts of r.data_c and st.data_c again for comparison.
print(len(r.data_c.columns))
print(len(st.data_c.columns))


324
322


In [338]:
# Time the whole tuning run.
start = time.time()
# Build the datasets. Features are passed as plain arrays (.values), so column names
# are not carried along.
lgb_train = lgb_o.Dataset(X_train.values, y_train.values)
lgb_valid = lgb_o.Dataset(X_valid.values, y_valid.values)

params = {
    'objective': 'binary', # the target is a 0-or-1 label, so use the binary objective
    'random_state': 100
}

# Run the tuning through Optuna's LightGBM integration (imported as lgb_o).
lgb_clf_o = lgb_o.train(params, lgb_train,
                        valid_sets=(lgb_train, lgb_valid),
                        verbose_eval=100,
                        early_stopping_rounds=50)
# Elapsed wall-clock time in seconds, shown as the cell output.
elapsed_time = time.time() - start
elapsed_time

[32m[I 2022-09-02 23:35:58,562][0m A new study created in memory with name: no-name-c7e3dd99-5dd6-4522-b562-6b69afeea396[0m
feature_fraction, val_score: inf:   0%|                                                          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420283	valid_1's binary_logloss: 0.457823


feature_fraction, val_score: 0.457304:  14%|######4                                      | 1/7 [00:20<02:03, 20.60s/it][32m[I 2022-09-02 23:36:19,248][0m Trial 0 finished with value: 0.45730375685641106 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.45730375685641106.[0m
feature_fraction, val_score: 0.457304:  14%|######4                                      | 1/7 [00:20<02:03, 20.60s/it]

Early stopping, best iteration is:
[113]	valid_0's binary_logloss: 0.417726	valid_1's binary_logloss: 0.457304
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418248	valid_1's binary_logloss: 0.457292


feature_fraction, val_score: 0.456828:  29%|############8                                | 2/7 [00:43<01:45, 21.17s/it][32m[I 2022-09-02 23:36:41,750][0m Trial 1 finished with value: 0.4568282884928387 and parameters: {'feature_fraction': 1.0}. Best is trial 1 with value: 0.4568282884928387.[0m
feature_fraction, val_score: 0.456828:  29%|############8                                | 2/7 [00:43<01:45, 21.17s/it]

Early stopping, best iteration is:
[148]	valid_0's binary_logloss: 0.408971	valid_1's binary_logloss: 0.456828
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.419252	valid_1's binary_logloss: 0.45742
[200]	valid_0's binary_logloss: 0.402319	valid_1's binary_logloss: 0.456652


feature_fraction, val_score: 0.456360:  43%|###################2                         | 3/7 [00:58<01:18, 19.54s/it][32m[I 2022-09-02 23:36:57,496][0m Trial 2 finished with value: 0.4563604870579909 and parameters: {'feature_fraction': 0.5}. Best is trial 2 with value: 0.4563604870579909.[0m
feature_fraction, val_score: 0.456360:  43%|###################2                         | 3/7 [00:58<01:18, 19.54s/it]

Early stopping, best iteration is:
[157]	valid_0's binary_logloss: 0.408941	valid_1's binary_logloss: 0.45636
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418648	valid_1's binary_logloss: 0.456584


feature_fraction, val_score: 0.456182:  57%|#########################7                   | 4/7 [01:18<00:58, 19.59s/it][32m[I 2022-09-02 23:37:17,219][0m Trial 3 finished with value: 0.45618154255468857 and parameters: {'feature_fraction': 0.8}. Best is trial 3 with value: 0.45618154255468857.[0m
feature_fraction, val_score: 0.456182:  57%|#########################7                   | 4/7 [01:18<00:58, 19.59s/it]

Early stopping, best iteration is:
[143]	valid_0's binary_logloss: 0.410446	valid_1's binary_logloss: 0.456182
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418944	valid_1's binary_logloss: 0.457633


feature_fraction, val_score: 0.456182:  71%|################################1            | 5/7 [01:33<00:36, 18.15s/it][32m[I 2022-09-02 23:37:31,999][0m Trial 4 finished with value: 0.4571436076032529 and parameters: {'feature_fraction': 0.6}. Best is trial 3 with value: 0.45618154255468857.[0m
feature_fraction, val_score: 0.456182:  71%|################################1            | 5/7 [01:33<00:36, 18.15s/it]

Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.420631	valid_1's binary_logloss: 0.457144
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418705	valid_1's binary_logloss: 0.457491
[200]	valid_0's binary_logloss: 0.401246	valid_1's binary_logloss: 0.457358


feature_fraction, val_score: 0.456182:  86%|######################################5      | 6/7 [01:53<00:18, 18.80s/it][32m[I 2022-09-02 23:37:52,294][0m Trial 5 finished with value: 0.45683766665649184 and parameters: {'feature_fraction': 0.7}. Best is trial 3 with value: 0.45618154255468857.[0m
feature_fraction, val_score: 0.456182:  86%|######################################5      | 6/7 [01:53<00:18, 18.80s/it]

Early stopping, best iteration is:
[167]	valid_0's binary_logloss: 0.406463	valid_1's binary_logloss: 0.456838
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418294	valid_1's binary_logloss: 0.458316


feature_fraction, val_score: 0.456182: 100%|#############################################| 7/7 [02:12<00:00, 18.71s/it][32m[I 2022-09-02 23:38:10,843][0m Trial 6 finished with value: 0.45798651621081315 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 3 with value: 0.45618154255468857.[0m
feature_fraction, val_score: 0.456182: 100%|#############################################| 7/7 [02:12<00:00, 18.89s/it]
num_leaves, val_score: 0.456182:   0%|                                                          | 0/20 [00:00<?, ?it/s]

Early stopping, best iteration is:
[95]	valid_0's binary_logloss: 0.419302	valid_1's binary_logloss: 0.457987
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.333655	valid_1's binary_logloss: 0.460537


num_leaves, val_score: 0.456182:   5%|##5                                               | 1/20 [00:27<08:42, 27.51s/it][32m[I 2022-09-02 23:38:38,379][0m Trial 7 finished with value: 0.4592274471724152 and parameters: {'num_leaves': 199}. Best is trial 7 with value: 0.4592274471724152.[0m
num_leaves, val_score: 0.456182:   5%|##5                                               | 1/20 [00:27<08:42, 27.51s/it]

Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.370572	valid_1's binary_logloss: 0.459227
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.398856	valid_1's binary_logloss: 0.458764


num_leaves, val_score: 0.456182:  10%|#####                                             | 2/20 [00:47<07:34, 25.25s/it][32m[I 2022-09-02 23:38:58,361][0m Trial 8 finished with value: 0.4582492627263538 and parameters: {'num_leaves': 64}. Best is trial 8 with value: 0.4582492627263538.[0m
num_leaves, val_score: 0.456182:  10%|#####                                             | 2/20 [00:47<07:34, 25.25s/it]

Early stopping, best iteration is:
[67]	valid_0's binary_logloss: 0.411628	valid_1's binary_logloss: 0.458249
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.400538	valid_1's binary_logloss: 0.458114


num_leaves, val_score: 0.456182:  15%|#######5                                          | 3/20 [01:08<06:48, 24.01s/it][32m[I 2022-09-02 23:39:19,463][0m Trial 9 finished with value: 0.45732875565601266 and parameters: {'num_leaves': 61}. Best is trial 9 with value: 0.45732875565601266.[0m
num_leaves, val_score: 0.456182:  15%|#######5                                          | 3/20 [01:08<06:48, 24.01s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.408866	valid_1's binary_logloss: 0.457329
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.456182:  20%|##########                                        | 4/20 [01:36<06:42, 25.17s/it][32m[I 2022-09-02 23:39:47,412][0m Trial 10 finished with value: 0.4598121008554167 and parameters: {'num_leaves': 254}. Best is trial 9 with value: 0.45732875565601266.[0m
num_leaves, val_score: 0.456182:  20%|##########                                        | 4/20 [01:36<06:42, 25.17s/it]

Early stopping, best iteration is:
[35]	valid_0's binary_logloss: 0.388392	valid_1's binary_logloss: 0.459812
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.35643	valid_1's binary_logloss: 0.460004


num_leaves, val_score: 0.456182:  25%|############5                                     | 5/20 [02:00<06:13, 24.90s/it][32m[I 2022-09-02 23:40:11,625][0m Trial 11 finished with value: 0.4578214181080509 and parameters: {'num_leaves': 148}. Best is trial 9 with value: 0.45732875565601266.[0m
num_leaves, val_score: 0.456182:  25%|############5                                     | 5/20 [02:00<06:13, 24.90s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.379039	valid_1's binary_logloss: 0.457821
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.42525	valid_1's binary_logloss: 0.457692


num_leaves, val_score: 0.456182:  30%|###############                                   | 6/20 [02:16<05:12, 22.30s/it][32m[I 2022-09-02 23:40:27,863][0m Trial 12 finished with value: 0.4569945848347222 and parameters: {'num_leaves': 21}. Best is trial 12 with value: 0.4569945848347222.[0m
num_leaves, val_score: 0.456182:  30%|###############                                   | 6/20 [02:17<05:12, 22.30s/it]

Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.421575	valid_1's binary_logloss: 0.456995
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.314513	valid_1's binary_logloss: 0.459963


num_leaves, val_score: 0.456182:  35%|#################5                                | 7/20 [02:51<05:38, 26.01s/it][32m[I 2022-09-02 23:41:02,545][0m Trial 13 finished with value: 0.4583905801528264 and parameters: {'num_leaves': 249}. Best is trial 12 with value: 0.4569945848347222.[0m
num_leaves, val_score: 0.456182:  35%|#################5                                | 7/20 [02:51<05:38, 26.01s/it]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.351727	valid_1's binary_logloss: 0.458391
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.35643	valid_1's binary_logloss: 0.460004


num_leaves, val_score: 0.456182:  40%|####################                              | 8/20 [03:14<05:01, 25.13s/it][32m[I 2022-09-02 23:41:25,596][0m Trial 14 finished with value: 0.4578214181080509 and parameters: {'num_leaves': 148}. Best is trial 12 with value: 0.4569945848347222.[0m
num_leaves, val_score: 0.456182:  40%|####################                              | 8/20 [03:14<05:01, 25.13s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.379039	valid_1's binary_logloss: 0.457821
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.43623	valid_1's binary_logloss: 0.460445
[200]	valid_0's binary_logloss: 0.428263	valid_1's binary_logloss: 0.457766
[300]	valid_0's binary_logloss: 0.422661	valid_1's binary_logloss: 0.457259


num_leaves, val_score: 0.456182:  45%|######################5                           | 9/20 [03:35<04:21, 23.73s/it][32m[I 2022-09-02 23:41:46,051][0m Trial 15 finished with value: 0.45673630991101066 and parameters: {'num_leaves': 9}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  45%|######################5                           | 9/20 [03:35<04:21, 23.73s/it]

Early stopping, best iteration is:
[278]	valid_0's binary_logloss: 0.423786	valid_1's binary_logloss: 0.456736
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.37889	valid_1's binary_logloss: 0.459013


num_leaves, val_score: 0.456182:  50%|########################5                        | 10/20 [03:57<03:53, 23.39s/it][32m[I 2022-09-02 23:42:08,689][0m Trial 16 finished with value: 0.45798611528899774 and parameters: {'num_leaves': 101}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  50%|########################5                        | 10/20 [03:57<03:53, 23.39s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.392037	valid_1's binary_logloss: 0.457986
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.330785	valid_1's binary_logloss: 0.459694


num_leaves, val_score: 0.456182:  55%|##########################9                      | 11/20 [04:24<03:40, 24.53s/it][32m[I 2022-09-02 23:42:35,836][0m Trial 17 finished with value: 0.45843731923795095 and parameters: {'num_leaves': 206}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  55%|##########################9                      | 11/20 [04:24<03:40, 24.53s/it]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.364215	valid_1's binary_logloss: 0.458437
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.456182:  60%|#############################4                   | 12/20 [04:51<03:21, 25.14s/it][32m[I 2022-09-02 23:43:02,420][0m Trial 18 finished with value: 0.4586381489438264 and parameters: {'num_leaves': 200}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  60%|#############################4                   | 12/20 [04:51<03:21, 25.14s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.381332	valid_1's binary_logloss: 0.458638
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.373006	valid_1's binary_logloss: 0.459377


num_leaves, val_score: 0.456182:  65%|###############################8                 | 13/20 [05:13<02:48, 24.11s/it][32m[I 2022-09-02 23:43:24,153][0m Trial 19 finished with value: 0.45892039857123423 and parameters: {'num_leaves': 112}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  65%|###############################8                 | 13/20 [05:13<02:48, 24.11s/it]

Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.397871	valid_1's binary_logloss: 0.45892
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.406394	valid_1's binary_logloss: 0.456935


num_leaves, val_score: 0.456182:  70%|##################################3              | 14/20 [05:31<02:13, 22.22s/it][32m[I 2022-09-02 23:43:41,915][0m Trial 20 finished with value: 0.4569185045156109 and parameters: {'num_leaves': 50}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  70%|##################################3              | 14/20 [05:31<02:13, 22.22s/it]

Early stopping, best iteration is:
[98]	valid_0's binary_logloss: 0.407009	valid_1's binary_logloss: 0.456919
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.44329	valid_1's binary_logloss: 0.464191
[200]	valid_0's binary_logloss: 0.436148	valid_1's binary_logloss: 0.460054
[300]	valid_0's binary_logloss: 0.432103	valid_1's binary_logloss: 0.458198
[400]	valid_0's binary_logloss: 0.429157	valid_1's binary_logloss: 0.457456
[500]	valid_0's binary_logloss: 0.426497	valid_1's binary_logloss: 0.457159


num_leaves, val_score: 0.456182:  75%|####################################7            | 15/20 [05:57<01:57, 23.51s/it][32m[I 2022-09-02 23:44:08,451][0m Trial 21 finished with value: 0.45709954292113375 and parameters: {'num_leaves': 5}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  75%|####################################7            | 15/20 [05:57<01:57, 23.51s/it]

Early stopping, best iteration is:
[508]	valid_0's binary_logloss: 0.426296	valid_1's binary_logloss: 0.4571
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.421028	valid_1's binary_logloss: 0.457955


num_leaves, val_score: 0.456182:  80%|#######################################2         | 16/20 [06:16<01:28, 22.16s/it][32m[I 2022-09-02 23:44:27,453][0m Trial 22 finished with value: 0.4572266833165413 and parameters: {'num_leaves': 27}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  80%|#######################################2         | 16/20 [06:16<01:28, 22.16s/it]

[200]	valid_0's binary_logloss: 0.40535	valid_1's binary_logloss: 0.457321
Early stopping, best iteration is:
[153]	valid_0's binary_logloss: 0.412263	valid_1's binary_logloss: 0.457227
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.383022	valid_1's binary_logloss: 0.458676


num_leaves, val_score: 0.456182:  85%|#########################################6       | 17/20 [06:39<01:06, 22.30s/it][32m[I 2022-09-02 23:44:50,110][0m Trial 23 finished with value: 0.45807588636625535 and parameters: {'num_leaves': 92}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  85%|#########################################6       | 17/20 [06:39<01:06, 22.30s/it]

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.394379	valid_1's binary_logloss: 0.458076
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.441168	valid_1's binary_logloss: 0.462726
[200]	valid_0's binary_logloss: 0.433869	valid_1's binary_logloss: 0.458991
[300]	valid_0's binary_logloss: 0.429405	valid_1's binary_logloss: 0.457962


num_leaves, val_score: 0.456182:  90%|############################################1    | 18/20 [06:59<00:43, 21.66s/it][32m[I 2022-09-02 23:45:10,229][0m Trial 24 finished with value: 0.4575985286104998 and parameters: {'num_leaves': 6}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  90%|############################################1    | 18/20 [06:59<00:43, 21.66s/it]

Early stopping, best iteration is:
[286]	valid_0's binary_logloss: 0.429901	valid_1's binary_logloss: 0.457599
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.360192	valid_1's binary_logloss: 0.459423


num_leaves, val_score: 0.456182:  95%|##############################################5  | 19/20 [07:24<00:22, 22.65s/it][32m[I 2022-09-02 23:45:35,190][0m Trial 25 finished with value: 0.4584090611435693 and parameters: {'num_leaves': 139}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182:  95%|##############################################5  | 19/20 [07:24<00:22, 22.65s/it]

Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.385163	valid_1's binary_logloss: 0.458409
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.344499	valid_1's binary_logloss: 0.460146


num_leaves, val_score: 0.456182: 100%|#################################################| 20/20 [07:51<00:00, 23.98s/it][32m[I 2022-09-02 23:46:02,316][0m Trial 26 finished with value: 0.45894437017736245 and parameters: {'num_leaves': 175}. Best is trial 15 with value: 0.45673630991101066.[0m
num_leaves, val_score: 0.456182: 100%|#################################################| 20/20 [07:51<00:00, 23.57s/it]
bagging, val_score: 0.456182:   0%|                                                             | 0/10 [00:00<?, ?it/s]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.37451	valid_1's binary_logloss: 0.458944
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418212	valid_1's binary_logloss: 0.456006


bagging, val_score: 0.455402:  10%|#####3                                               | 1/10 [00:19<02:51, 19.03s/it][32m[I 2022-09-02 23:46:21,363][0m Trial 27 finished with value: 0.4554022412771171 and parameters: {'bagging_fraction': 0.7509744205670051, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  10%|#####3                                               | 1/10 [00:19<02:51, 19.03s/it]

Early stopping, best iteration is:
[127]	valid_0's binary_logloss: 0.412851	valid_1's binary_logloss: 0.455402
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418543	valid_1's binary_logloss: 0.456826


bagging, val_score: 0.455402:  20%|##########6                                          | 2/10 [00:37<02:31, 18.89s/it][32m[I 2022-09-02 23:46:39,935][0m Trial 28 finished with value: 0.45606125440860457 and parameters: {'bagging_fraction': 0.7674492923933173, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  20%|##########6                                          | 2/10 [00:37<02:31, 18.89s/it]

Early stopping, best iteration is:
[119]	valid_0's binary_logloss: 0.414698	valid_1's binary_logloss: 0.456061
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418229	valid_1's binary_logloss: 0.456389


bagging, val_score: 0.455402:  30%|###############9                                     | 3/10 [00:56<02:13, 19.04s/it][32m[I 2022-09-02 23:46:59,313][0m Trial 29 finished with value: 0.4558829703591304 and parameters: {'bagging_fraction': 0.7612539014732796, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  30%|###############9                                     | 3/10 [00:56<02:13, 19.04s/it]

Early stopping, best iteration is:
[110]	valid_0's binary_logloss: 0.415987	valid_1's binary_logloss: 0.455883
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418591	valid_1's binary_logloss: 0.457567


bagging, val_score: 0.455402:  40%|#####################2                               | 4/10 [01:17<01:56, 19.42s/it][32m[I 2022-09-02 23:47:19,621][0m Trial 30 finished with value: 0.4569374392637041 and parameters: {'bagging_fraction': 0.7719933275512607, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  40%|#####################2                               | 4/10 [01:17<01:56, 19.42s/it]

Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.415753	valid_1's binary_logloss: 0.456937
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418654	valid_1's binary_logloss: 0.456824


bagging, val_score: 0.455402:  50%|##########################5                          | 5/10 [01:36<01:37, 19.43s/it][32m[I 2022-09-02 23:47:39,090][0m Trial 31 finished with value: 0.45658262530515903 and parameters: {'bagging_fraction': 0.755951338659078, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  50%|##########################5                          | 5/10 [01:36<01:37, 19.43s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.418897	valid_1's binary_logloss: 0.456583
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418401	valid_1's binary_logloss: 0.456275


bagging, val_score: 0.455402:  60%|###############################8                     | 6/10 [01:55<01:16, 19.23s/it][32m[I 2022-09-02 23:47:57,846][0m Trial 32 finished with value: 0.4562552870382858 and parameters: {'bagging_fraction': 0.7446077624983114, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  60%|###############################8                     | 6/10 [01:55<01:16, 19.23s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.418597	valid_1's binary_logloss: 0.456255
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.41854	valid_1's binary_logloss: 0.45772


bagging, val_score: 0.455402:  70%|#####################################                | 7/10 [02:18<01:01, 20.48s/it][32m[I 2022-09-02 23:48:21,259][0m Trial 33 finished with value: 0.45704884290992603 and parameters: {'bagging_fraction': 0.9581283686035754, 'bagging_freq': 2}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  70%|#####################################                | 7/10 [02:18<01:01, 20.48s/it]

Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.412941	valid_1's binary_logloss: 0.457049
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418769	valid_1's binary_logloss: 0.457839


bagging, val_score: 0.455402:  80%|##########################################4          | 8/10 [02:36<00:39, 19.66s/it][32m[I 2022-09-02 23:48:38,992][0m Trial 34 finished with value: 0.457347338255947 and parameters: {'bagging_fraction': 0.5772954027820191, 'bagging_freq': 5}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  80%|##########################################4          | 8/10 [02:36<00:39, 19.66s/it]

Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.417564	valid_1's binary_logloss: 0.457347
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.41841	valid_1's binary_logloss: 0.458181


bagging, val_score: 0.455402:  90%|###############################################7     | 9/10 [03:00<00:20, 20.84s/it][32m[I 2022-09-02 23:49:02,573][0m Trial 35 finished with value: 0.4567479069773668 and parameters: {'bagging_fraction': 0.8695358023049389, 'bagging_freq': 1}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402:  90%|###############################################7     | 9/10 [03:00<00:20, 20.84s/it]

Early stopping, best iteration is:
[149]	valid_0's binary_logloss: 0.409003	valid_1's binary_logloss: 0.456748
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418959	valid_1's binary_logloss: 0.45786
[200]	valid_0's binary_logloss: 0.401156	valid_1's binary_logloss: 0.45846


bagging, val_score: 0.455402: 100%|####################################################| 10/10 [03:21<00:00, 20.86s/it][32m[I 2022-09-02 23:49:23,541][0m Trial 36 finished with value: 0.4576038636020981 and parameters: {'bagging_fraction': 0.6222299701696431, 'bagging_freq': 4}. Best is trial 27 with value: 0.4554022412771171.[0m
bagging, val_score: 0.455402: 100%|####################################################| 10/10 [03:21<00:00, 20.12s/it]
feature_fraction_stage2, val_score: 0.455402:   0%|                                              | 0/6 [00:00<?, ?it/s]

Early stopping, best iteration is:
[160]	valid_0's binary_logloss: 0.407874	valid_1's binary_logloss: 0.457604
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418184	valid_1's binary_logloss: 0.457095


feature_fraction_stage2, val_score: 0.455402:  17%|######3                               | 1/6 [00:19<01:35, 19.01s/it][32m[I 2022-09-02 23:49:42,587][0m Trial 37 finished with value: 0.4567819836303981 and parameters: {'feature_fraction': 0.88}. Best is trial 37 with value: 0.4567819836303981.[0m
feature_fraction_stage2, val_score: 0.455402:  17%|######3                               | 1/6 [00:19<01:35, 19.01s/it]

Early stopping, best iteration is:
[85]	valid_0's binary_logloss: 0.421587	valid_1's binary_logloss: 0.456782
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418442	valid_1's binary_logloss: 0.456683


feature_fraction_stage2, val_score: 0.455402:  33%|############6                         | 2/6 [00:34<01:11, 17.94s/it][32m[I 2022-09-02 23:49:58,019][0m Trial 38 finished with value: 0.4566828942200165 and parameters: {'feature_fraction': 0.7520000000000001}. Best is trial 38 with value: 0.4566828942200165.[0m
feature_fraction_stage2, val_score: 0.455402:  33%|############6                         | 2/6 [00:34<01:11, 17.94s/it]

Early stopping, best iteration is:
[100]	valid_0's binary_logloss: 0.418442	valid_1's binary_logloss: 0.456683
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418425	valid_1's binary_logloss: 0.457419


feature_fraction_stage2, val_score: 0.455402:  50%|###################                   | 3/6 [00:50<00:51, 17.31s/it][32m[I 2022-09-02 23:50:13,849][0m Trial 39 finished with value: 0.4572080967208952 and parameters: {'feature_fraction': 0.8160000000000001}. Best is trial 38 with value: 0.4566828942200165.[0m
feature_fraction_stage2, val_score: 0.455402:  50%|###################                   | 3/6 [00:50<00:51, 17.31s/it]

Early stopping, best iteration is:
[96]	valid_0's binary_logloss: 0.419288	valid_1's binary_logloss: 0.457208
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.41842	valid_1's binary_logloss: 0.457549


feature_fraction_stage2, val_score: 0.455402:  67%|#########################3            | 4/6 [01:06<00:33, 16.93s/it][32m[I 2022-09-02 23:50:29,957][0m Trial 40 finished with value: 0.4571454778714004 and parameters: {'feature_fraction': 0.7200000000000001}. Best is trial 38 with value: 0.4566828942200165.[0m
feature_fraction_stage2, val_score: 0.455402:  67%|#########################3            | 4/6 [01:06<00:33, 16.93s/it]

Early stopping, best iteration is:
[116]	valid_0's binary_logloss: 0.415144	valid_1's binary_logloss: 0.457145
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418347	valid_1's binary_logloss: 0.456696


feature_fraction_stage2, val_score: 0.455402:  83%|###############################6      | 5/6 [01:22<00:16, 16.68s/it][32m[I 2022-09-02 23:50:45,996][0m Trial 41 finished with value: 0.4565825089398031 and parameters: {'feature_fraction': 0.8480000000000001}. Best is trial 41 with value: 0.4565825089398031.[0m
feature_fraction_stage2, val_score: 0.455402:  83%|###############################6      | 5/6 [01:22<00:16, 16.68s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.418562	valid_1's binary_logloss: 0.456583
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418414	valid_1's binary_logloss: 0.456561


feature_fraction_stage2, val_score: 0.455402: 100%|######################################| 6/6 [01:39<00:00, 16.90s/it][32m[I 2022-09-02 23:51:03,418][0m Trial 42 finished with value: 0.45599759019210795 and parameters: {'feature_fraction': 0.784}. Best is trial 42 with value: 0.45599759019210795.[0m
feature_fraction_stage2, val_score: 0.455402: 100%|######################################| 6/6 [01:39<00:00, 16.64s/it]
regularization_factors, val_score: 0.455402:   0%|                                              | 0/20 [00:00<?, ?it/s]

Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.413784	valid_1's binary_logloss: 0.455998
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.41875	valid_1's binary_logloss: 0.457088


regularization_factors, val_score: 0.455402:   5%|#9                                    | 1/20 [00:18<05:52, 18.56s/it][32m[I 2022-09-02 23:51:22,017][0m Trial 43 finished with value: 0.45662003004315893 and parameters: {'lambda_l1': 2.8287343197190196e-07, 'lambda_l2': 0.0022450591110968364}. Best is trial 43 with value: 0.45662003004315893.[0m
regularization_factors, val_score: 0.455402:   5%|#9                                    | 1/20 [00:18<05:52, 18.56s/it]

Early stopping, best iteration is:
[123]	valid_0's binary_logloss: 0.414167	valid_1's binary_logloss: 0.45662
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420379	valid_1's binary_logloss: 0.456595
[200]	valid_0's binary_logloss: 0.40528	valid_1's binary_logloss: 0.455672


regularization_factors, val_score: 0.455402:  10%|###8                                  | 2/20 [00:45<06:19, 21.07s/it][32m[I 2022-09-02 23:51:48,929][0m Trial 44 finished with value: 0.45566488988711834 and parameters: {'lambda_l1': 6.8527272213935735, 'lambda_l2': 1.9864141413522038e-08}. Best is trial 44 with value: 0.45566488988711834.[0m
regularization_factors, val_score: 0.455402:  10%|###8                                  | 2/20 [00:45<06:19, 21.07s/it]

Early stopping, best iteration is:
[201]	valid_0's binary_logloss: 0.405156	valid_1's binary_logloss: 0.455665
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.419587	valid_1's binary_logloss: 0.456548


regularization_factors, val_score: 0.455402:  15%|#####7                                | 3/20 [01:05<05:53, 20.77s/it][32m[I 2022-09-02 23:52:08,981][0m Trial 45 finished with value: 0.4554472114722202 and parameters: {'lambda_l1': 3.1338373166716877, 'lambda_l2': 2.0955986051808027e-08}. Best is trial 45 with value: 0.4554472114722202.[0m
regularization_factors, val_score: 0.455402:  15%|#####7                                | 3/20 [01:05<05:53, 20.77s/it]

Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.414996	valid_1's binary_logloss: 0.455447
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420896	valid_1's binary_logloss: 0.456007


regularization_factors, val_score: 0.455370:  20%|#######6                              | 4/20 [01:26<05:32, 20.76s/it][32m[I 2022-09-02 23:52:29,723][0m Trial 46 finished with value: 0.4553698933851962 and parameters: {'lambda_l1': 8.381120681618567, 'lambda_l2': 1.3390998983191105e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  20%|#######6                              | 4/20 [01:26<05:32, 20.76s/it]

Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.41684	valid_1's binary_logloss: 0.45537
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420392	valid_1's binary_logloss: 0.456845


regularization_factors, val_score: 0.455370:  25%|#########5                            | 5/20 [01:47<05:13, 20.93s/it][32m[I 2022-09-02 23:52:51,053][0m Trial 47 finished with value: 0.4560077942843901 and parameters: {'lambda_l1': 5.9004812034634195, 'lambda_l2': 1.0813875456035107e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  25%|#########5                            | 5/20 [01:47<05:13, 20.93s/it]

Early stopping, best iteration is:
[120]	valid_0's binary_logloss: 0.416851	valid_1's binary_logloss: 0.456008
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420764	valid_1's binary_logloss: 0.456496


regularization_factors, val_score: 0.455370:  30%|###########4                          | 6/20 [02:08<04:52, 20.92s/it][32m[I 2022-09-02 23:53:11,932][0m Trial 48 finished with value: 0.4555186100751818 and parameters: {'lambda_l1': 7.589703354751703, 'lambda_l2': 2.105381159347725e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  30%|###########4                          | 6/20 [02:08<04:52, 20.92s/it]

Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.416523	valid_1's binary_logloss: 0.455519
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420988	valid_1's binary_logloss: 0.456552


regularization_factors, val_score: 0.455370:  35%|#############3                        | 7/20 [02:29<04:32, 20.97s/it][32m[I 2022-09-02 23:53:33,045][0m Trial 49 finished with value: 0.4556766363690781 and parameters: {'lambda_l1': 7.6739286360397285, 'lambda_l2': 1.2544715723218584e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  35%|#############3                        | 7/20 [02:29<04:32, 20.97s/it]

Early stopping, best iteration is:
[120]	valid_0's binary_logloss: 0.41741	valid_1's binary_logloss: 0.455677
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.421153	valid_1's binary_logloss: 0.456689


regularization_factors, val_score: 0.455370:  40%|###############2                      | 8/20 [02:52<04:18, 21.53s/it][32m[I 2022-09-02 23:53:55,907][0m Trial 50 finished with value: 0.45596431375773977 and parameters: {'lambda_l1': 9.044911931946118, 'lambda_l2': 2.1806108948812332e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  40%|###############2                      | 8/20 [02:52<04:18, 21.53s/it]

Early stopping, best iteration is:
[147]	valid_0's binary_logloss: 0.413516	valid_1's binary_logloss: 0.455964
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420651	valid_1's binary_logloss: 0.455905


regularization_factors, val_score: 0.455370:  45%|#################1                    | 9/20 [03:15<04:00, 21.90s/it][32m[I 2022-09-02 23:54:18,634][0m Trial 51 finished with value: 0.455611912090175 and parameters: {'lambda_l1': 7.962621049189688, 'lambda_l2': 2.0520087593900677e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  45%|#################1                    | 9/20 [03:15<04:00, 21.90s/it]

Early stopping, best iteration is:
[132]	valid_0's binary_logloss: 0.415414	valid_1's binary_logloss: 0.455612
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420896	valid_1's binary_logloss: 0.456007


regularization_factors, val_score: 0.455370:  50%|##################5                  | 10/20 [03:36<03:36, 21.61s/it][32m[I 2022-09-02 23:54:39,574][0m Trial 52 finished with value: 0.45550449941320426 and parameters: {'lambda_l1': 8.376156375671854, 'lambda_l2': 3.5082227732759784e-08}. Best is trial 46 with value: 0.4553698933851962.[0m
regularization_factors, val_score: 0.455370:  50%|##################5                  | 10/20 [03:36<03:36, 21.61s/it]

Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.41774	valid_1's binary_logloss: 0.455504
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420705	valid_1's binary_logloss: 0.455616


regularization_factors, val_score: 0.454705:  55%|####################3                | 11/20 [04:00<03:21, 22.39s/it][32m[I 2022-09-02 23:55:03,848][0m Trial 53 finished with value: 0.454705288833924 and parameters: {'lambda_l1': 8.82874334859225, 'lambda_l2': 3.4996874947378043e-07}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  55%|####################3                | 11/20 [04:00<03:21, 22.39s/it]

Early stopping, best iteration is:
[134]	valid_0's binary_logloss: 0.415127	valid_1's binary_logloss: 0.454705
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418491	valid_1's binary_logloss: 0.456574


regularization_factors, val_score: 0.454705:  60%|######################2              | 12/20 [04:23<03:00, 22.62s/it][32m[I 2022-09-02 23:55:26,939][0m Trial 54 finished with value: 0.4560307835072251 and parameters: {'lambda_l1': 0.20355316072717966, 'lambda_l2': 4.359406485422969e-06}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  60%|######################2              | 12/20 [04:23<03:00, 22.62s/it]

Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.413643	valid_1's binary_logloss: 0.456031
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418504	valid_1's binary_logloss: 0.456441


regularization_factors, val_score: 0.454705:  65%|########################             | 13/20 [04:44<02:34, 22.09s/it][32m[I 2022-09-02 23:55:47,798][0m Trial 55 finished with value: 0.45639425309644005 and parameters: {'lambda_l1': 0.06411056638277571, 'lambda_l2': 2.270766348216896e-06}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  65%|########################             | 13/20 [04:44<02:34, 22.09s/it]

Early stopping, best iteration is:
[99]	valid_0's binary_logloss: 0.418703	valid_1's binary_logloss: 0.456394
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418479	valid_1's binary_logloss: 0.458009


regularization_factors, val_score: 0.454705:  70%|#########################9           | 14/20 [05:07<02:13, 22.29s/it][32m[I 2022-09-02 23:56:10,561][0m Trial 56 finished with value: 0.4573140252206462 and parameters: {'lambda_l1': 0.18674480645962857, 'lambda_l2': 6.268964023466392e-07}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  70%|#########################9           | 14/20 [05:07<02:13, 22.29s/it]

Early stopping, best iteration is:
[127]	valid_0's binary_logloss: 0.413294	valid_1's binary_logloss: 0.457314
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420991	valid_1's binary_logloss: 0.456045


regularization_factors, val_score: 0.454705:  75%|###########################7         | 15/20 [05:34<01:58, 23.70s/it][32m[I 2022-09-02 23:56:37,555][0m Trial 57 finished with value: 0.4550727754544044 and parameters: {'lambda_l1': 7.445498934311096, 'lambda_l2': 2.3275369593877784}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  75%|###########################7         | 15/20 [05:34<01:58, 23.70s/it]

Early stopping, best iteration is:
[135]	valid_0's binary_logloss: 0.415035	valid_1's binary_logloss: 0.455073
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418983	valid_1's binary_logloss: 0.456336


regularization_factors, val_score: 0.454705:  80%|#############################6       | 16/20 [05:57<01:34, 23.62s/it][32m[I 2022-09-02 23:57:00,958][0m Trial 58 finished with value: 0.45579901684557517 and parameters: {'lambda_l1': 0.294724303466649, 'lambda_l2': 0.6800162863529573}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  80%|#############################6       | 16/20 [05:57<01:34, 23.62s/it]

Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.414998	valid_1's binary_logloss: 0.455799
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.419453	valid_1's binary_logloss: 0.455961


regularization_factors, val_score: 0.454705:  85%|###############################4     | 17/20 [06:19<01:09, 23.21s/it][32m[I 2022-09-02 23:57:23,238][0m Trial 59 finished with value: 0.45570017204186836 and parameters: {'lambda_l1': 0.0019420582517861045, 'lambda_l2': 3.094583645746526}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  85%|###############################4     | 17/20 [06:19<01:09, 23.21s/it]

Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.418493	valid_1's binary_logloss: 0.4557
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418867	valid_1's binary_logloss: 0.456509


regularization_factors, val_score: 0.454705:  90%|#################################3   | 18/20 [06:43<00:46, 23.41s/it][32m[I 2022-09-02 23:57:47,152][0m Trial 60 finished with value: 0.4558749360856644 and parameters: {'lambda_l1': 1.4071033189327737, 'lambda_l2': 3.100342532706636e-07}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  90%|#################################3   | 18/20 [06:43<00:46, 23.41s/it]

Early stopping, best iteration is:
[126]	valid_0's binary_logloss: 0.413987	valid_1's binary_logloss: 0.455875
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.421153	valid_1's binary_logloss: 0.456
[200]	valid_0's binary_logloss: 0.406539	valid_1's binary_logloss: 0.455685


regularization_factors, val_score: 0.454705:  95%|###################################1 | 19/20 [07:16<00:26, 26.10s/it][32m[I 2022-09-02 23:58:19,475][0m Trial 61 finished with value: 0.45561578676724723 and parameters: {'lambda_l1': 9.525348999572989, 'lambda_l2': 1.193567135140404e-07}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705:  95%|###################################1 | 19/20 [07:16<00:26, 26.10s/it]

Early stopping, best iteration is:
[194]	valid_0's binary_logloss: 0.407277	valid_1's binary_logloss: 0.455616
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.418982	valid_1's binary_logloss: 0.455712


regularization_factors, val_score: 0.454705: 100%|#####################################| 20/20 [07:39<00:00, 25.39s/it][32m[I 2022-09-02 23:58:43,218][0m Trial 62 finished with value: 0.45518515320758773 and parameters: {'lambda_l1': 1.5011768546531503, 'lambda_l2': 0.00022346263147027248}. Best is trial 53 with value: 0.454705288833924.[0m
regularization_factors, val_score: 0.454705: 100%|#####################################| 20/20 [07:39<00:00, 22.99s/it]
min_data_in_leaf, val_score: 0.454705:   0%|                                                     | 0/5 [00:00<?, ?it/s]

Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.413684	valid_1's binary_logloss: 0.455185
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.42082	valid_1's binary_logloss: 0.455482


min_data_in_leaf, val_score: 0.454705:  20%|#########                                    | 1/5 [00:24<01:39, 25.00s/it][32m[I 2022-09-02 23:59:08,245][0m Trial 63 finished with value: 0.45488568108303723 and parameters: {'min_child_samples': 100}. Best is trial 63 with value: 0.45488568108303723.[0m
min_data_in_leaf, val_score: 0.454705:  20%|#########                                    | 1/5 [00:25<01:39, 25.00s/it]

Early stopping, best iteration is:
[121]	valid_0's binary_logloss: 0.417358	valid_1's binary_logloss: 0.454886
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420705	valid_1's binary_logloss: 0.455616


min_data_in_leaf, val_score: 0.454705:  40%|##################                           | 2/5 [00:51<01:16, 25.56s/it][32m[I 2022-09-02 23:59:35,148][0m Trial 64 finished with value: 0.4548417638468623 and parameters: {'min_child_samples': 10}. Best is trial 64 with value: 0.4548417638468623.[0m
min_data_in_leaf, val_score: 0.454705:  40%|##################                           | 2/5 [00:51<01:16, 25.56s/it]

Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.415927	valid_1's binary_logloss: 0.454842
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.42083	valid_1's binary_logloss: 0.456032


min_data_in_leaf, val_score: 0.454705:  60%|###########################                  | 3/5 [01:16<00:50, 25.23s/it][32m[I 2022-09-02 23:59:59,564][0m Trial 65 finished with value: 0.45561690556308754 and parameters: {'min_child_samples': 50}. Best is trial 64 with value: 0.4548417638468623.[0m
min_data_in_leaf, val_score: 0.454705:  60%|###########################                  | 3/5 [01:16<00:50, 25.23s/it]

Early stopping, best iteration is:
[134]	valid_0's binary_logloss: 0.415428	valid_1's binary_logloss: 0.455617
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420705	valid_1's binary_logloss: 0.455616
[200]	valid_0's binary_logloss: 0.405997	valid_1's binary_logloss: 0.454908


min_data_in_leaf, val_score: 0.454705:  80%|####################################         | 4/5 [01:45<00:26, 26.27s/it][32m[I 2022-09-03 00:00:28,261][0m Trial 66 finished with value: 0.4549078505309696 and parameters: {'min_child_samples': 5}. Best is trial 64 with value: 0.4548417638468623.[0m
min_data_in_leaf, val_score: 0.454705:  80%|####################################         | 4/5 [01:45<00:26, 26.27s/it]

Early stopping, best iteration is:
[200]	valid_0's binary_logloss: 0.405997	valid_1's binary_logloss: 0.454908
[LightGBM] [Info] Number of positive: 46409, number of negative: 175091
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 56974
[LightGBM] [Info] Number of data points in the train set: 221500, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209521 -> initscore=-1.327812
[LightGBM] [Info] Start training from score -1.327812
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.420705	valid_1's binary_logloss: 0.455616


min_data_in_leaf, val_score: 0.454705: 100%|#############################################| 5/5 [02:08<00:00, 25.34s/it][32m[I 2022-09-03 00:00:51,457][0m Trial 67 finished with value: 0.4547244409537142 and parameters: {'min_child_samples': 25}. Best is trial 67 with value: 0.4547244409537142.[0m
min_data_in_leaf, val_score: 0.454705: 100%|#############################################| 5/5 [02:08<00:00, 25.65s/it]

Early stopping, best iteration is:
[125]	valid_0's binary_logloss: 0.416514	valid_1's binary_logloss: 0.454724





1496.551155090332

In [339]:
lgb_clf_o.params # shown as the cell output in Jupyter

{'objective': 'binary',
 'random_state': 100,
 'feature_pre_filter': False,
 'lambda_l1': 8.82874334859225,
 'lambda_l2': 3.4996874947378043e-07,
 'num_leaves': 31,
 'feature_fraction': 0.8,
 'bagging_fraction': 0.7509744205670051,
 'bagging_freq': 2,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': 50}

In [590]:
# Earlier variant, kept for reference: split with split_data's default
# test_size and drop only 'rank' and 'date' from both feature frames.
# train, test = split_data(r.data_c)
# X_train = train.drop(['rank', 'date'], axis=1)
# y_train = train['rank']
# X_test = test.drop(['rank', 'date'], axis=1)
# y_test = test['rank']
train, test = split_data(r.data_c, test_size=0.28)

# Separate the target ('rank') from the explanatory variables.
# 'date' is not needed from here on, so it is dropped as well.
y_train, y_test = train['rank'], test['rank']
X_train = train.drop(columns=['rank', 'date', '単勝'])
# Added 2021/3/12: the test set keeps the win odds ('単勝') because the
# betting simulation uses them.
X_test = test.drop(columns=['rank', 'date'])

# lgb_clf = lgb.LGBMClassifier(**lgb_clf_o.params)
# lgb_clf.fit(X_train.values, y_train.values)

In [341]:
# `params` is bound to the object returned by lgb_clf_o.params (no copy is
# made), so the two keys are removed from that object in place.
# pop(..., None) instead of `del` keeps this cell re-runnable: a second run
# finds the keys already gone and would otherwise raise KeyError.
params = lgb_clf_o.params
params.pop('early_stopping_round', None)
params.pop('num_iterations', None)

In [344]:
class Return:
    """Payout ("return") tables with one parsed view per bet type.

    ``return_tables`` is a DataFrame in which column 0 holds the bet-type
    label (e.g. '単勝', '複勝'), column 1 the winning horse numbers and
    column 2 the payouts. ``scrape`` builds it with race_id as the index.

    Each property selects the rows of one bet type and returns them as a
    numeric DataFrame with ``win*`` columns and ``return*`` column(s).
    """

    def __init__(self, return_tables):
        self.return_tables = return_tables

    @classmethod
    def read_pickle(cls, path_list):
        """Build a Return from several pickled tables.

        The first pickle is the base; every later one is merged in with
        ``update_data``, in list order.
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(race_id_list):
        """Scrape the payout tables of the given races from db.netkeiba.com.

        Parameters
        ----------
        race_id_list : iterable of str
            Race ids appended to the race URL.

        Returns
        -------
        pandas.DataFrame
            All scraped tables concatenated, indexed by race_id. An empty
            DataFrame is returned when nothing could be scraped.
        """
        return_tables = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                # Scraped naively, multi-value cells (place, wide, ...) get
                # concatenated with no separator. Replace the <br /> tag with
                # the literal string 'br' so the cells can be split later.
                with urlopen(url) as f:  # close the HTTP response promptly
                    html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                # dfs[1] holds 単勝 through 馬連, dfs[2] holds ワイド through 三連単.
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
                time.sleep(1)
            except IndexError:
                # The page lacks the expected tables: skip this race.
                continue
            except Exception as e:
                print(e)
                break
            except BaseException:
                # e.g. KeyboardInterrupt: stop scraping but still return
                # what has been collected so far.
                break

        # pd.concat raises on an empty list, so handle "nothing scraped".
        if not return_tables:
            return pd.DataFrame()
        # Combine everything into a single DataFrame.
        return pd.concat(list(return_tables.values()))

    def _rows(self, label):
        """Columns 1 and 2 of the rows whose bet-type label (column 0) is ``label``."""
        return self.return_tables[self.return_tables[0] == label][[1, 2]]

    def _single_combination(self, label, sep, n_horses):
        """Parse a bet type whose cell holds one ``sep``-separated combination.

        Returns a numeric DataFrame with columns win_0..win_{n_horses-1} and
        'return'. Values that are not numeric become NaN.
        """
        rows = self._rows(label)
        # reindex (rather than [[0, 1, ...]]) so a missing split column
        # becomes NaN instead of raising KeyError, e.g. when no row matches.
        wins = (
            rows[1].str.split(sep, expand=True)
            .reindex(columns=list(range(n_horses)))
            .add_prefix('win_')
        )
        df = pd.concat([wins, rows[2].rename('return')], axis=1)
        return df.apply(lambda col: pd.to_numeric(col, errors='coerce'))

    @property
    def fukusho(self):
        """'複勝' rows as integer columns win_0..win_2 and return_0..return_2.

        Only the first three 'br'-separated entries are kept. Missing
        entries (fewer than three payouts) are filled with 0.
        """
        rows = self._rows('複勝')

        def _first_three(cells, prefix):
            # Strip thousands separators before splitting so that a padded,
            # all-NaN column never has to go through the .str accessor.
            parts = cells.str.replace(',', '', regex=False).str.split('br', expand=True)
            parts = parts.reindex(columns=[0, 1, 2])
            parts.columns = [prefix + str(i) for i in range(3)]
            return parts

        df = pd.concat(
            [_first_three(rows[1], 'win_'), _first_three(rows[2], 'return_')],
            axis=1,
        )
        return df.fillna(0).astype(int)

    @property
    def tansho(self):
        """'単勝' rows as numeric columns 'win' and 'return' (non-numeric -> NaN)."""
        # Work on a copy so the rename below never writes into a slice of
        # self.return_tables.
        rows = self._rows('単勝').copy()
        rows.columns = ['win', 'return']
        return rows.apply(lambda col: pd.to_numeric(col, errors='coerce'))

    @property
    def umaren(self):
        """'馬連' rows: win_0, win_1 (split on '-') and 'return'."""
        return self._single_combination('馬連', '-', 2)

    @property
    def umatan(self):
        """'馬単' rows: win_0, win_1 (split on '→') and 'return'."""
        return self._single_combination('馬単', '→', 2)

    @property
    def wide(self):
        """'ワイド' rows, one output row per paying combination.

        Each cell holds 'br'-separated entries; the first three are stacked
        so the result carries an extra index level numbering the entries.
        Columns are win_0, win_1 and 'return'.
        """
        rows = self._rows('ワイド')
        pairs = rows[1].str.split('br', expand=True)[[0, 1, 2]]
        wins = pairs.stack().str.split('-', expand=True).add_prefix('win_')
        payouts = rows[2].str.split('br', expand=True)[[0, 1, 2]]
        payouts = payouts.stack().rename('return')
        df = pd.concat([wins, payouts], axis=1)
        # Payouts may contain thousands separators; strip them first.
        return df.apply(lambda x: pd.to_numeric(x.str.replace(',', ''), errors='coerce'))

    @property
    def sanrentan(self):
        """'三連単' rows: win_0..win_2 (split on '→') and 'return'."""
        return self._single_combination('三連単', '→', 3)

    @property
    def sanrenpuku(self):
        """'三連複' rows: win_0..win_2 (split on '-') and 'return'."""
        return self._single_combination('三連複', '-', 3)

In [345]:
# Wrap the payout tables loaded earlier and preview the parsed '複勝' view.
rt = Return(return_tables)
rt.fukusho # shown as the cell output in Jupyter

Unnamed: 0,win_0,win_1,win_2,return_0,return_1,return_2
200906050811,11,2,13,200,340,460
200906050810,9,2,6,150,140,380
200906050809,14,11,4,120,140,320
200906050808,3,4,5,220,170,270
200906050807,11,7,13,150,240,320
...,...,...,...,...,...,...
202210040505,2,9,3,170,130,700
202210040504,5,7,1,270,290,140
202210040503,15,12,14,110,250,510
202210040502,4,14,3,110,140,1630


In [347]:
class ModelEvaluator:
    def __init__(self, model, return_tables_path_list):
        """Keep the model and load the payout tables used by the simulations.

        Parameters
        ----------
        model : object
            Stored as ``self.model``. Other methods call ``predict_proba``
            on it and read ``feature_importances_``.
        return_tables_path_list : list
            Passed to ``Return.read_pickle``.
        """
        self.model = model
        self.rt = Return.read_pickle(return_tables_path_list)
        # Evaluate every parsed payout view once and keep it on the instance,
        # so ``bet`` does not re-parse the tables on each call.
        for bet_kind in ('fukusho', 'tansho', 'umaren', 'umatan',
                         'wide', 'sanrentan', 'sanrenpuku'):
            setattr(self, bet_kind, getattr(self.rt, bet_kind))
    
    #3着以内に入る確率を予測
    def predict_proba(self, X, train=True, std=True, minmax=False):
        if train:
            proba = pd.Series(
                self.model.predict_proba(X.drop(['単勝'], axis=1))[:, 1], index=X.index
            )
        else:
            proba = pd.Series(
                self.model.predict_proba(X, axis=1)[:, 1], index=X.index
            )
        if std:
            #レース内で標準化して、相対評価する。「レース内偏差値」みたいなもの。
            standard_scaler = lambda x: (x - x.mean()) / x.std()
            proba = proba.groupby(level=0).transform(standard_scaler)
        if minmax:
            #データ全体を0~1にする
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba
    
    #0か1かを予測
    def predict(self, X, threshold=0.5):
        y_pred = self.predict_proba(X)
        self.proba = y_pred
        return [0 if p<threshold else 1 for p in y_pred]
    
    def score(self, y_true, X):
        """Return ``roc_auc_score`` of ``y_true`` against ``predict_proba(X)``."""
        y_score = self.predict_proba(X)
        return roc_auc_score(y_true, y_score)
    
    def feature_importance(self, X, n_display=20):
        importances = pd.DataFrame({"features": X.columns, 
                                    "importance": self.model.feature_importances_})
        return importances.sort_values("importance", ascending=False)[:n_display]
    
    def pred_table(self, X, threshold=0.5, bet_only=True):
        pred_table = X.copy()[['馬番', '単勝']]
        pred_table['pred'] = self.predict(X, threshold)
        pred_table['score'] = self.proba
        if bet_only:
            return pred_table[pred_table['pred']==1][['馬番', '単勝', 'score']]
        else:
            return pred_table[['馬番', '単勝', 'score', 'pred']]
        
    def bet(self, race_id, kind, umaban, amount):
        if kind == 'fukusho':
            rt_1R = self.fukusho.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1', 'win_2']]==umaban).values * \
                rt_1R[['return_0', 'return_1', 'return_2']].values * amount/100
            return_ = np.sum(return_)
        if kind == 'tansho':
            rt_1R = self.tansho.loc[race_id]
            return_ = (rt_1R['win']==umaban) * rt_1R['return'] * (amount/100)
        if kind == 'umaren':
            rt_1R = self.umaren.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
        if kind == 'umatan':
            rt_1R = self.umatan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1']]) == list(umaban))\
                * rt_1R['return']/100 * amount
        if kind == 'wide':
            rt_1R = self.wide.loc[race_id]
            return_ = (rt_1R[['win_0', 'win_1']].\
                           apply(lambda x: set(x)==set(umaban), axis=1)) \
                * rt_1R['return']/100 * amount
            return_ = return_.sum()
        if kind == 'sanrentan':
            rt_1R = self.sanrentan.loc[race_id]
            return_ = (list(rt_1R[['win_0', 'win_1', 'win_2']]) == list(umaban)) * \
                rt_1R['return']/100 * amount
        if kind == 'sanrenpuku':
            rt_1R = self.sanrenpuku.loc[race_id]
            return_ = (set(rt_1R[['win_0', 'win_1', 'win_2']]) == set(umaban)) \
                * rt_1R['return']/100 * amount
#         if not (return_ >= 0):
#                 return_ = amount
        if not (isinstance(return_, float)):
#             if not (return_.dtypes == 'int64' or return_.dtypes == 'float64'):
            return_ = amount
        return return_
        
    def fukusho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(np.sum([
                self.bet(race_id, 'fukusho', umaban, 1) for umaban in preds['馬番']
            ]))
        return_rate = np.sum(return_list) / n_bets
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        n_hits = np.sum([x>0 for x in return_list])
        return n_bets, return_rate, n_hits, std
    
    def tansho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        self.sample = pred_table
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum([self.bet(race_id, 'tansho', umaban, 1) for umaban in preds['馬番']])
            )
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def tansho_return_proper(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_list.append(
                np.sum(preds.apply(lambda x: self.bet(
                    race_id, 'tansho', x['馬番'], (1/x['単勝'])), axis=1)))
        
        bet_money = (1 / pred_table['単勝']).sum()
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / bet_money
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / bet_money
        return n_bets, return_rate, n_hits, std
    
    def umaren_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_box(self, X, threshold=0.5, n_aite=5):
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0
        
        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                continue   
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
            return_list.append(return_)
        
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets
        
        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_box(self, X, threshold=0.5, n_aite=5):
        """
        Simulate a 'wide' box bet over every race in the prediction table.

        One unit is staked on every unordered pair of horses flagged
        ``pred == 1``; races with fewer than two flagged horses are skipped.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``
        n_aite : int, default 5
            unused here

        Returns:
        -------
        tuple
            (n_bets, return_rate, n_hits, std) where return_rate is the summed
            return divided by n_bets, n_hits counts races with a positive
            return and std is np.std(per-race returns) * sqrt(races) / n_bets
        """
        table = self.pred_table(X, threshold, bet_only = False)
        race_returns = []
        n_bets = 0

        for race_id, race_preds in table.groupby(level=0):
            flagged = race_preds.query('pred == 1')
            if len(flagged) < 2:
                # zero or one flagged horse: no pair can be formed
                continue
            pairs = list(combinations(flagged['馬番'], 2))
            race_returns.append(
                sum(self.bet(race_id, 'wide', pair, 1) for pair in pairs)
            )
            n_bets += len(pairs)

        n_races = len(race_returns)
        std = np.std(race_returns) * np.sqrt(n_races) / n_bets
        n_hits = np.sum([race_return > 0 for race_return in race_returns])
        return_rate = np.sum(race_returns) / n_bets
        return n_bets, return_rate, n_hits, std
        
    def sanrentan_box(self, X, threshold=0.5):
        """
        Simulate a trifecta ('sanrentan') box bet over every race.

        ``self.pred_table`` is called with its default ``bet_only`` setting and
        every row it returns for a race is treated as a pick. One unit is
        staked on each ordered triple of picks; races with fewer than three
        picks are skipped.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``

        Returns:
        -------
        tuple
            (n_bets, return_rate, n_hits, std) where return_rate is the summed
            return divided by n_bets, n_hits counts races with a positive
            return and std is np.std(per-race returns) * sqrt(races) / n_bets
        """
        table = self.pred_table(X, threshold)
        race_returns = []
        n_bets = 0

        for race_id, picks in table.groupby(level=0):
            if len(picks) < 3:
                continue
            orderings = list(permutations(picks['馬番'], 3))
            race_returns.append(
                sum(self.bet(race_id, 'sanrentan', ordering, 1) for ordering in orderings)
            )
            n_bets += len(orderings)

        n_races = len(race_returns)
        std = np.std(race_returns) * np.sqrt(n_races) / n_bets
        n_hits = np.sum([race_return > 0 for race_return in race_returns])
        return_rate = np.sum(race_returns) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrenpuku_box(self, X, threshold=0.5):
        """
        Simulate a trio ('sanrenpuku') box bet over every race.

        ``self.pred_table`` is called with its default ``bet_only`` setting and
        every row it returns for a race is treated as a pick. One unit is
        staked on each unordered triple of picks; races with fewer than three
        picks are skipped.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``

        Returns:
        -------
        tuple
            (n_bets, return_rate, n_hits, std) where return_rate is the summed
            return divided by n_bets, n_hits counts races with a positive
            return and std is np.std(per-race returns) * sqrt(races) / n_bets
        """
        table = self.pred_table(X, threshold)
        race_returns = []
        n_bets = 0

        for race_id, picks in table.groupby(level=0):
            if len(picks) < 3:
                continue
            trios = list(combinations(picks['馬番'], 3))
            race_returns.append(
                sum(self.bet(race_id, 'sanrenpuku', trio, 1) for trio in trios)
            )
            n_bets += len(trios)

        n_races = len(race_returns)
        std = np.std(race_returns) * np.sqrt(n_races) / n_bets
        n_hits = np.sum([race_return > 0 for race_return in race_returns])
        return_rate = np.sum(race_returns) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umaren_nagashi(self, X, threshold=0.5, n_aite=2):
        """
        Simulate a quinella ('umaren') wheel bet over every race.

        * Exactly one horse flagged ``pred == 1``: that horse is the axis and
          is paired with up to ``n_aite`` partner horses taken from the rows
          ranked directly below the top score.
        * Two or more flagged horses: box bet on every unordered pair of them.
        * No flagged horse: the race is skipped.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``
        n_aite : int, default 2
            maximum number of partner horses for a single-axis race

        Returns:
        -------
        n_bets : int
            number of tickets actually staked
        return_rate :
            sum of the ``self.bet`` results divided by n_bets
        n_hits :
            number of bet races whose summed return is positive
        std :
            np.std of the per-race returns * sqrt(number of bet races) / n_bets
        """
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0

        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                # NOTE(review): iloc[1:] assumes the single flagged horse is
                # also the top-scored row of the race - confirm in pred_table.
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                if len(preds_aite) == 0:
                    # no partner available, so nothing is staked on this race
                    continue
                jiku = preds_jiku['馬番'].values[0]
                return_ = preds_aite.map(
                    lambda x: self.bet(race_id, 'umaren', [jiku, x], 1)
                ).sum()
                # Count the tickets really bought: a small field can offer
                # fewer than n_aite partners.
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in combinations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umaren', umaban, 1)
                    n_bets += 1
                return_list.append(return_)

        # NOTE: when no race qualifies n_bets is 0 and the ratios below are NaN
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets

        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def umatan_nagashi(self, X, threshold=0.5, n_aite=5):
        """
        Simulate an exacta ('umatan') wheel bet over every race.

        * Exactly one horse flagged ``pred == 1``: that horse is bet to finish
          first, with up to ``n_aite`` partner horses (the rows ranked directly
          below the top score) as the runner-up.
        * Two or more flagged horses: box bet on every ordered pair of them.
        * No flagged horse: the race is skipped and enters no statistic.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``
        n_aite : int, default 5
            maximum number of partner horses for a single-axis race

        Returns:
        -------
        n_bets : int
            number of tickets actually staked
        return_rate :
            sum of the ``self.bet`` results divided by n_bets
        n_hits :
            number of bet races whose summed return is positive
        std :
            np.std of the per-race returns * sqrt(number of bet races) / n_bets
        """
        pred_table = self.pred_table(X, threshold, bet_only = False)
        n_bets = 0

        return_list = []
        for race_id, preds in pred_table.groupby(level=0):
            return_ = 0
            preds_jiku = preds.query('pred == 1')
            if len(preds_jiku) == 1:
                # NOTE(review): iloc[1:] assumes the single flagged horse is
                # also the top-scored row of the race - confirm in pred_table.
                preds_aite = preds.sort_values('score', ascending = False)\
                    .iloc[1:(n_aite+1)]['馬番']
                if len(preds_aite) == 0:
                    # no partner available, so nothing is staked on this race
                    continue
                jiku = preds_jiku['馬番'].values[0]
                return_ = preds_aite.map(
                    lambda x: self.bet(race_id, 'umatan', [jiku, x], 1)
                ).sum()
                # Count the tickets really bought: a small field can offer
                # fewer than n_aite partners.
                n_bets += len(preds_aite)
                return_list.append(return_)
            elif len(preds_jiku) >= 2:
                for umaban in permutations(preds_jiku['馬番'], 2):
                    return_ += self.bet(race_id, 'umatan', umaban, 1)
                    n_bets += 1
                # Only races that were bet on are recorded; a race without any
                # flagged horse must not add a phantom 0 return.
                return_list.append(return_)

        # NOTE: when no race qualifies n_bets is 0 and the ratios below are NaN
        std = np.std(return_list) * np.sqrt(len(return_list)) / n_bets

        n_hits = np.sum([x>0 for x in return_list])
        return_rate = np.sum(return_list) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def wide_nagashi(self, X, threshold=0.5, n_aite=2):
        """
        Simulate a 'wide' wheel bet over every race.

        * Exactly one horse flagged ``pred == 1``: that horse is paired with
          the partner horses found in positions 1..n_aite of the race sorted
          by descending score.
        * Two or more flagged horses: box bet on every unordered pair of them.
        * No flagged horse: the race is skipped.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 0.5
            passed through unchanged to ``self.pred_table``
        n_aite : int, default 2
            maximum number of partner horses for a single-axis race

        Returns:
        -------
        tuple
            (n_bets, return_rate, n_hits, std) where return_rate is the summed
            return divided by n_bets, n_hits counts races with a positive
            return and std is np.std(per-race returns) * sqrt(races) / n_bets
        """
        table = self.pred_table(X, threshold, bet_only = False)
        race_returns = []
        n_bets = 0

        for race_id, race_preds in table.groupby(level=0):
            axis = race_preds.query('pred == 1')
            n_axis = len(axis)
            if n_axis == 0:
                continue

            if n_axis == 1:
                axis_no = axis['馬番'].values[0]
                ranked = race_preds.sort_values('score', ascending = False)
                partners = ranked.iloc[1:(n_aite+1)]['馬番']
                race_return = partners.map(
                    lambda partner: self.bet(race_id, 'wide', [axis_no, partner], 1)
                ).sum()
                n_bets += len(partners)
            else:
                pairs = list(combinations(axis['馬番'], 2))
                race_return = sum(self.bet(race_id, 'wide', pair, 1) for pair in pairs)
                n_bets += len(pairs)
            race_returns.append(race_return)

        n_races = len(race_returns)
        std = np.std(race_returns) * np.sqrt(n_races) / n_bets
        n_hits = np.sum([race_return > 0 for race_return in race_returns])
        return_rate = np.sum(race_returns) / n_bets
        return n_bets, return_rate, n_hits, std
    
    def sanrentan_nagashi(self, X, threshold = 1.5, n_aite=7):
        """
        Simulate a trifecta ('sanrentan') wheel bet over every race.

        * Fewer than two horses flagged ``pred == 1``: the race is skipped.
        * Exactly two flagged horses: they are bet, in table order, as the
          first two finishers, with each partner horse found in positions
          2..n_aite+1 of the race sorted by descending score as third.
        * Three or more flagged horses: box bet on every ordered triple.

        Parameters:
        ----------
        X :
            passed through unchanged to ``self.pred_table``
        threshold : float, default 1.5
            passed through unchanged to ``self.pred_table``
        n_aite : int, default 7
            maximum number of partner horses for a two-axis race

        Returns:
        -------
        tuple
            (n_bets, return_rate, n_hits, std) where return_rate is the summed
            return divided by n_bets, n_hits counts races with a positive
            return and std is np.std(per-race returns) * sqrt(races) / n_bets
        """
        table = self.pred_table(X, threshold, bet_only = False)
        race_returns = []
        n_bets = 0

        for race_id, race_preds in table.groupby(level=0):
            axis = race_preds.query('pred == 1')
            n_axis = len(axis)
            if n_axis < 2:
                continue

            if n_axis == 2:
                axis_numbers = axis['馬番'].values
                ranked = race_preds.sort_values('score', ascending = False)
                partners = ranked.iloc[2:(n_aite+2)]['馬番']
                race_return = partners.map(
                    lambda partner: self.bet(
                        race_id, 'sanrentan', np.append(axis_numbers, partner), 1
                    )
                ).sum()
                n_bets += len(partners)
            else:
                orderings = list(permutations(axis['馬番'], 3))
                race_return = sum(
                    self.bet(race_id, 'sanrentan', ordering, 1) for ordering in orderings
                )
                n_bets += len(orderings)
            race_returns.append(race_return)

        n_races = len(race_returns)
        std = np.std(race_returns) * np.sqrt(n_races) / n_bets
        n_hits = np.sum([race_return > 0 for race_return in race_returns])
        return_rate = np.sum(race_returns) / n_bets
        return n_bets, return_rate, n_hits, std

In [348]:
# #回収率を計算する関数
# def gain(return_func, X, n_samples=100, lower=50, min_threshold=0.5):
#     gain = {}
#     for i in tqdm(range(n_samples)):
#         threshold = 1 * i / n_samples + min_threshold * (1-(i/n_samples))
#         n_bets, return_rate = return_func(X, threshold)
#         if n_bets > lower:
#             gain[n_bets] = return_rate
#     return pd.Series(gain)
def gain(return_func, X, n_samples=100, t_range=[0.5, 3.5]):
    """
    Sweep the betting threshold and tabulate what ``return_func`` reports.

    Parameters:
    ----------
    return_func : callable
        called as return_func(X, threshold); must return the tuple
        (n_bets, return_rate, n_hits, std)
    X :
        passed through unchanged to return_func
    n_samples : int, default 100
        number of thresholds tried
    t_range : list, default [0.5, 3.5]
        thresholds start at t_range[0] and move linearly towards t_range[1]
        (t_range[1] itself is never evaluated)

    Returns:
    -------
    pandas.DataFrame
        one row per threshold whose n_bets exceeds 2, with the columns
        return_rate, n_hits, std and n_bets
    """
    rows_by_threshold = {}
    for step in tqdm(range(n_samples)):
        # linear interpolation between the two ends of t_range
        threshold = t_range[1] * step / n_samples + t_range[0] * (1-(step/n_samples))
        n_bets, return_rate, n_hits, std = return_func(X, threshold)
        if not (n_bets > 2):
            # too few tickets for the statistics to be worth recording
            continue
        rows_by_threshold[threshold] = {
            'return_rate': return_rate,
            'n_hits': n_hits,
            'std': std,
            'n_bets': n_bets,
        }
    return pd.DataFrame(rows_by_threshold).T

In [591]:
%%time
# The target variable 'rank' holds 0/1 data for "finished within the top 3 or not"
X = r.data_c.drop(['rank', 'date','単勝'], axis=1)
y = r.data_c['rank']

# LightGBM hyperparameters
# NOTE(review): this literal is commented out, so `params` must already exist
# in the kernel (defined by some other cell) for LGBMClassifier(**params) to run.
# params = {
#     'num_leaves': 4,
#     'n_estimators': 80,
#     'class_weight': 'balanced',
#     'random_state': 100
# }

# Build and train the prediction model
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(X.values, y.values)

# Create an instance of the custom ModelEvaluator class
me = ModelEvaluator(lgb_clf, ['return_tables.pickle'])

Wall time: 42.5 s


In [592]:
# Predict each horse's chance of winning
# NOTE(review): the values displayed below fall outside [0, 1], so
# me.predict_proba appears to return a rescaled score, not a raw probability - confirm.
pred = me.predict_proba(st.data_c.drop(['date'], axis=1), train=False)

# Attach the predictions to a table keyed like st.data_c, next to the horse number
pred_table = st.data_c[['馬番']].copy()
pred_table['pred'] = pred

# Show the rows from the highest prediction down
pred_table.sort_values('pred', ascending=False)

Unnamed: 0,馬番,pred
202201020806,5,2.722218
202201020811,5,2.671789
202201020809,14,2.647747
202201020812,9,2.219956
202201020808,6,1.959243
...,...,...
202201020810,5,-1.307603
202201020810,4,-1.317019
202201020804,3,-1.346342
202201020801,2,-1.353446


In [515]:
# gain_optuna.max()


In [516]:
# r.data_h.loc['201109020411']
# hr.horse_results.loc['2007100727'].sort_values('date')
# hr.horse_results[hr.horse_results['date'] > '20101231'].sort_values('date')

In [593]:
me.feature_importance(X_valid, 50)

Unnamed: 0,features,importance
5,jockey_id,150
8,n_horses,116
204,出走回数,102
123,間隔,82
135,賞金_race_type_allR,77
288,class_type_未勝利,71
317,前走距離差,60
114,賞金_allR,49
4,horse_id,48
7,年齢,47


In [518]:
# gain_optuna.max()

In [519]:
aa = 1.1
# type(aa) is a class object and never equals the string 'float64', so the
# original comparison was always False. isinstance covers Python floats and
# numpy.float64 (which subclasses float).
if isinstance(aa, float):
    print('od')
type(aa)

float

In [594]:
# race_number = 1
# for counter in range(len(pred_table)):
#     if counter == 0:
#         print('-----' + str(race_number) + 'R' + '-----')
#     if (counter + 1) % 16 == 0:
#         race_number += 1
#         print('-----' + str(race_number) + 'R' + '-----')
#     print(pred_table.values[counter])
def horse_count(pred_table) :
    """
    Count how many consecutive rows belong to each race.

    A new race is assumed to start whenever the horse number in the first
    column is smaller than the number of rows seen since the last reset,
    i.e. when the numbering drops back instead of continuing upwards.
    NOTE(review): this relies on rows being ordered by ascending horse number
    within each race - confirm against how pred_table is built.

    Parameters:
    ----------
    pred_table : pandas.DataFrame
        table whose first column holds the horse number

    Returns:
    -------
    list of int
        number of rows per race, in table order (empty list for an empty table)
    """
    # Extract the column once; the original rebuilt pred_table.values on
    # every iteration, which copies the whole frame each time.
    umaban_column = pred_table.iloc[:, 0].to_numpy()

    horse_count_list = []
    pred_num = 0
    count = 0
    for umaban in umaban_column:
        pred_num += 1
        if int(umaban) < pred_num:
            # numbering dropped back: close the previous race
            horse_count_list.append(count)
            count = 0
            pred_num = 0
        count += 1
    if count:
        # close the last race
        horse_count_list.append(count)
    return horse_count_list
hc = horse_count(pred_table)
def torank(hc, table=None):
    """
    Build a per-race ranking of horses ordered by descending prediction.

    Parameters:
    ----------
    hc : list of int
        number of consecutive rows that belong to each race
        (as produced by horse_count)
    table : pandas.DataFrame, default None
        table whose first column is the horse number and second column the
        prediction; when omitted the notebook-level ``pred_table`` is used,
        which is what the original implementation always read

    Returns:
    -------
    list of list
        one list per race: a header string "-----------<n>R----------"
        followed by [horse_number, prediction] pairs, best prediction first.
        Horses with equal predictions keep their table order.
    """
    if table is None:
        table = pred_table
    values = table.values

    return_rank = []
    start = 0
    for race_no, n_horses in enumerate(hc, start=1):
        race_rows = values[start:start + n_horses]
        # A single stable sort per race replaces the original re-sort and
        # re-scan for every horse, which also listed the same horse twice
        # whenever two predictions were equal.
        ordered = sorted(race_rows, key=lambda row: row[1], reverse=True)
        rank = [[math.floor(row[0]), float(row[1])] for row in ordered]
        rank.insert(0, "-----------" + str(race_no) + "R----------")
        return_rank.append(rank)
        start += n_horses
    return return_rank
torank(hc)

[['-----------1R----------',
  [4, 1.311176373073828],
  [7, 1.2955268213334328],
  [3, 0.5622203856180105],
  [8, -0.1153573671781424],
  [5, -0.16353638978460908],
  [6, -0.42629578014524444],
  [1, -1.1102883850463627],
  [2, -1.3534456578709133]],
 ['-----------2R----------',
  [8, 1.73842791705694],
  [14, 1.6713414374891253],
  [10, 1.360622530422904],
  [7, 0.6288352459905229],
  [1, 0.1844771513550349],
  [3, 0.05322375191976288],
  [4, -0.05371942732036163],
  [13, -0.3922312426380393],
  [9, -0.649245183084055],
  [11, -0.7292531326687433],
  [12, -0.7396488793423116],
  [2, -0.7713298596320409],
  [5, -1.137492437075573],
  [6, -1.1640078724731646]],
 ['-----------3R----------',
  [4, 1.8306206736158652],
  [14, 1.3599046438033922],
  [13, 1.0142518998054337],
  [11, 0.4647623826365914],
  [7, 0.352867621334876],
  [9, 0.3300189854220102],
  [6, 0.15689926412163874],
  [5, -0.133295508513628],
  [1, -0.214663772432932],
  [3, -0.26921177991823914],
  [10, -1.0305560541907095

In [595]:
# Build a list of the picked horses with their names.
pt = pred_table.copy()
# Keep only the rows whose prediction reaches the hand-picked cut-off of 1.30
pt = pt[pt['pred'] >= 1.30]
# Characters 10-11 of the index label are stored as the race key
# (presumably the race number at the end of a 12-character race_id - confirm)
pt['レース'] = pt.apply(lambda x: x.name[10:12], axis=1)
pt = pt.drop('pred', axis=1)
# Same race key on the raw table, which carries the horse names
for_horse_name = st.data.copy()
for_horse_name['レース'] = for_horse_name.apply(lambda x: x.name[10:12], axis=1)
for_horse_name = for_horse_name[['レース', '馬番', '馬名']]
# Join on (race key, horse number) to pick up the horse name
pt = pt.merge(for_horse_name, on=['レース', '馬番'])
pt = pt.set_index('レース')
print(len(pt))
pt

20


Unnamed: 0_level_0,馬番,馬名
レース,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,ヴェルデシチー
2,8,サクラトップクリス
2,10,ショーモン
2,14,タリエシン
3,4,スターザサンライズ
3,14,ベニッシモ
4,11,ディージェーサン
4,15,コーリングユー
5,6,カフジペンタゴン
5,8,アカノストロング


In [358]:
# combinations / permutations are used as globals by the betting methods called below
from itertools import combinations, permutations
start = time.time()
# Create an instance of the ModelEvaluator class
me = ModelEvaluator(lgb_clf, ['return_tables.pickle'])

# Original note: "win-bet proper return value = return rate when stakes are sized
# so that the payout is always constant".
# NOTE(review): the call below actually sweeps me.wide_nagashi, so the note looks stale.
gain_optuna = gain(me.wide_nagashi, X_test)
# NOTE(review): `.plot` is only referenced, never called, so nothing is drawn here.
gain_optuna.plot


# Wall-clock seconds spent in this cell
elapsed_time = time.time() - start
elapsed_time

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




4594.713486909866

In [359]:
gain_optuna.max()

return_rate        1.062500
n_hits          5247.000000
std                0.274307
n_bets         51119.000000
dtype: float64

In [360]:
r.data_c.sort_values('diff_final-diff_y_course_around_allR').head(20)

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,trainer_id,rank,...,core_distance_非根幹距離,core_distance_根幹距離,straight_type_long,straight_type_short,cos_day,sin_day,前走距離差,course_around_右,course_around_左,course_around_直線
201505010209,1,1,56.0,141.9,18.0,2015-02-01,12476,102,111,1,...,1,0,1,0,0.523416,0.852078,2.0,0,1,0
201505020202,5,10,56.0,9.2,14.0,2015-04-26,10256,23,48,0,...,1,0,1,0,0.910605,-0.413279,0.0,0,1,0
201705010505,2,3,53.0,11.7,16.0,2017-02-11,19187,137,123,1,...,0,1,1,0,0.661635,0.749826,2.0,0,1,0
201905040811,4,4,54.0,10.7,16.0,2019-10-26,34476,197,145,0,...,0,1,1,0,-0.907014,0.421101,-2.0,0,1,0
201904030110,7,7,54.0,1.8,22.0,2019-10-05,28527,107,45,1,...,1,0,0,1,-0.997325,0.073095,2.0,0,1,0
201405050302,2,2,54.0,49.1,20.0,2014-11-15,8314,11,70,0,...,0,1,1,0,-0.711657,0.702527,4.0,0,1,0
201506020606,8,12,56.0,26.1,16.0,2015-03-15,11692,4,204,0,...,0,1,0,1,0.956235,0.2926,4.0,1,0,0
201505050901,6,11,54.0,8.8,14.0,2015-11-29,14931,201,90,0,...,1,0,1,0,-0.523416,0.852078,0.0,0,1,0
202206010209,7,13,56.0,316.2,18.0,2022-01-08,44606,75,138,0,...,1,0,0,1,0.137279,0.990532,2.0,1,0,0
201405050403,5,9,54.0,126.4,18.0,2014-11-16,8150,64,170,0,...,1,0,1,0,-0.699458,0.714673,2.0,0,1,0


In [361]:
r.data_c.loc['202107060210'].sort_values('diff_final-diff_y_course_around_allR')

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,trainer_id,rank,...,core_distance_非根幹距離,core_distance_根幹距離,straight_type_long,straight_type_short,cos_day,sin_day,前走距離差,course_around_右,course_around_左,course_around_直線
202107060210,4,4,55.0,26.7,16.0,2021-12-05,30581,106,79,0,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,2,2,56.0,1.7,16.0,2021-12-05,38771,77,234,1,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,7,11,54.0,5.4,16.0,2021-12-05,40600,39,126,1,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,8,12,57.0,153.5,16.0,2021-12-05,29638,133,143,0,...,0,1,1,0,-0.432776,0.901502,-2.0,0,1,0
202107060210,1,1,54.0,4.2,16.0,2021-12-05,39181,213,140,0,...,0,1,1,0,-0.432776,0.901502,-2.0,0,1,0
202107060210,6,9,57.0,25.9,16.0,2021-12-05,31901,149,167,0,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,5,7,56.0,8.5,16.0,2021-12-05,39271,197,132,1,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,4,5,55.0,38.1,16.0,2021-12-05,33837,82,198,0,...,0,1,1,0,-0.432776,0.901502,-2.0,0,1,0
202107060210,8,13,57.0,405.4,16.0,2021-12-05,19015,142,33,0,...,0,1,1,0,-0.432776,0.901502,0.0,0,1,0
202107060210,3,3,56.0,70.1,16.0,2021-12-05,39026,9,180,0,...,0,1,1,0,-0.432776,0.901502,-2.0,0,1,0


In [362]:
aaa = pd.read_csv('data20220618_0626.csv', encoding='cp932')

In [363]:
jounetunomak

NameError: name 'jounetunomak' is not defined

In [None]:
hr.horse_results.loc['2019101000']

In [None]:
button = widgets.Button(description="Click me")
button

In [None]:
import sys

print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    if not var_name.startswith("_"):
        print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(eval(var_name)),'|'))

In [None]:
hr.horse_results['race_type'].map(race_type_dict).value_counts()

In [None]:
me.feature_importance(X_train)
st.data_c['最後の直線'].isnull().sum()

In [None]:
r.data_p.merge(hr, left_on=['horse_id', '開催'], right_index=True, how='left')

In [None]:
hr.horse_results.groupby(['horse_id', '開催'])[['着順', '賞金']].mean()

In [None]:
len(hr.horse_results.loc['2015104287'])

In [None]:
# Mean finishing position and prize money per (horse_id, race_type).
# (A stray leading ')' made the original line a SyntaxError.)
df = hr.horse_results.groupby(['horse_id', 'race_type'])[['着順', '賞金']].mean()

In [None]:
r.data_p.merge(df, left_on=['horse_id', '開催'], right_index=True, how='left')

In [None]:
hr.horse_results.to_pickle('horseresult_temp.pickle')

In [None]:
st.data_c['最後の直線']
hr.horse_results.sort_values('date')

In [None]:
scores = me.predict_proba(st.data_c.drop(['date'], axis=1))
pred= st.data_c[['馬番']].copy()
pred['score'] = scores
pred.loc['202104040312'].sort_values('score', ascending = False)

In [None]:
customer = pd.DataFrame([['0001', 'John'], ['0002', 'Lily']], columns=['customer', 'name'])

In [None]:
order = pd.DataFrame([["0001", "Smartphone"],
                          ["0001", "Wireless Charger"],
                          ["0002", "Wearable watch"]],
                          columns=['customer', 'product_name'])

In [None]:
hr.horse_results.sort_values('date')

In [None]:
type(me.return_) == 'numpy.float64'
isinstance(me.return_, float)

In [None]:
me = ModelEvaluator(lgb_clf, 'return_tables.pickle')

In [None]:
race_number = 1
for counter in range(len(pred_table)):
    if counter == 0:
        print('-----' + str(race_number) + 'R' + '-----')
    if (counter + 1) % 16 == 0:
        race_number += 1
        print('-----' + str(race_number) + 'R' + '-----')
    print(pred_table.values[counter])
    
    

In [None]:
st.data_c[['賞金_core_distance_5R']]

In [None]:
list1 = [12,np.nan]
a = pd.Series(data=list1)

In [None]:
a.dtypes  == 'int64' or a.dtypes == 'float64'

In [None]:
hr.horse_results.loc['2017104612'].sort_values('date')

In [None]:
for i in range(len(hr.horse_results.index)):
    date_list = hr.horse_results.loc[(hr.horse_results.index == hr.horse_results.index[i]), 'date']
    date_list = list(set(date_list))
    for j in range(len(date_list)):
            if len(hr.horse_results.loc[(hr.horse_results.index == hr.horse_results.index[i]) & (hr.horse_results['date'] == date_list[j])]) > 1:
                df = pd.DataFrame()
                df = hr.horse_results.loc[(hr.horse_results.index == hr.horse_results.index[i]) & (hr.horse_results['date'] == date_list[j])].sort_values('上り').dropna(how='any')
                hr.horse_results.loc[(hr.horse_results.index == hr.horse_results.index[i]) & (hr.horse_results['date'] == date_list[j])] = df

In [None]:
date_list = hr.horse_results.loc[(hr.horse_results.index == '2017104612'), 'date']

In [None]:
date_list = list(set(date_list))

In [None]:
hr.horse_results.loc[(hr.horse_results.index == '2017104612') & (hr.horse_results['date'] == date_list[0])] = hr.horse_results.loc[(hr.horse_results.index == '2017104612') & (hr.horse_results['date'] == date_list[0])].sort_values('上り').dropna(how='any')

In [None]:
hr.horse_results.loc[(hr.horse_results.index == '2017104612') & (hr.horse_results['date'] == date_list[1])].sort_values('上り').dropna(how='any')

In [None]:
hr.horse_results.loc[(hr.horse_results.index == '2017104612') & (hr.horse_results['date'] == date_list[0])]

In [None]:
hr.horse_results.loc[(hr.horse_results.index == '2017104612') & (hr.horse_results['date'] == date_list[0])] = a

In [None]:
hr.horse_results = pd.read_pickle('horse_results_kai20192020.pickle')

In [None]:
# Parse a prize-money caption into integers and total them: drop the trailing
# unit, keep the part after the colon, then split on commas.
shoukin = '本賞金:6700,2700,1700,1000,670万円'
shoukin = shoukin.replace('万円', '').split(':')[1]
shoukin = [int(amount) for amount in shoukin.split(',')]
sum(shoukin)

In [None]:
list = [[]]*12
race_number = 1
horse_number = 0
list_number = 0
for counter in range(len(pred_table)):
    if horse_number + 1 == pred_table.values[counter]:


In [None]:
def read_file(file, all_races) :
    """
    Read a CSV file of races and collect its contents.

    Every CSV row is appended to ``all_races``. The first cell of a row is
    split on '|'; when it has more than five fields, fields 1, 2 and 4 are
    appended to the notebook-level lists ``all_where_str``, ``all_baba_str``
    and ``all_tenki_str``. Each remaining cell is treated as one result record
    and, when it has at least three fields, fields 1 and 2 are appended to
    ``all_horse_name`` and ``all_jockey_name``.

    Parameters:
    ----------
    file : str
        path of the CSV file
    all_races : list
        list that receives every row (mutated in place)
    """
    # Local import: csv is not among the imports at the top of the notebook.
    import csv

    # newline='' is what the csv module documentation asks for when opening files
    with open(file, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        for race in csvreader:
            all_races.append(race)
            race_meta = race[0].split('|')
            if len(race_meta) > 5:
                all_where_str.append(race_meta[1])
                all_baba_str.append(race_meta[2])
                all_tenki_str.append(race_meta[4])
            for e in range(1, len(race)):
                # NOTE(review): the original never assigned `result`; assumed
                # each remaining cell is a '|'-separated record like race[0] - confirm.
                result = race[e].split('|')
                if len(result) >= 3:
                    all_horse_name.append(result[1])
                    all_jockey_name.append(result[2])

In [None]:
race1 = []
for a in range(16):
    race1.append(pred_table.values[a])
race1

In [None]:
def horse_count(pred_table) :
    """
    Count how many consecutive rows belong to each race.

    A new race is assumed to start whenever the horse number in the first
    column is smaller than the number of rows seen since the last reset,
    i.e. when the numbering drops back instead of continuing upwards.
    NOTE(review): this relies on rows being ordered by ascending horse number
    within each race - confirm against how pred_table is built.
    NOTE(review): a function with this name is also defined in an earlier
    cell; whichever cell ran last wins.

    Parameters:
    ----------
    pred_table : pandas.DataFrame
        table whose first column holds the horse number

    Returns:
    -------
    list of int
        number of rows per race, in table order (empty list for an empty table)
    """
    # Extract the column once; the original rebuilt pred_table.values on
    # every iteration, which copies the whole frame each time.
    umaban_column = pred_table.iloc[:, 0].to_numpy()

    horse_count_list = []
    pred_num = 0
    count = 0
    for umaban in umaban_column:
        pred_num += 1
        if int(umaban) < pred_num:
            # numbering dropped back: close the previous race
            horse_count_list.append(count)
            count = 0
            pred_num = 0
        count += 1
    if count:
        # close the last race
        horse_count_list.append(count)
    return horse_count_list

In [None]:
hc = horse_count(pred_table)

In [None]:
race_number = 1
for counter in range(len(pred_table)):
    if counter == 0:
        print('-----' + str(race_number) + 'R' + '-----')
    if (counter + 1) % 16 == 0:
        race_number += 1
        print('-----' + str(race_number) + 'R' + '-----')
    print(pred_table.values[counter])
    
    

In [None]:
race_number = 1
list = [[0]]*12
list_number = 0
counter = 1
for i in range(len(pred_table)):
    if counter > hc[]
    list[list_number].append(i)
    

In [None]:
hr.horse_results.sort_values('date')

In [None]:
def TrainBatch(train_data_size, train_loader, device, model, loss_func, optimizer):
    train_loss = 0
    train_acc = 0
    cnt = 0
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), targe

In [None]:
# Prepare data set 1 (per-ID attributes)
attri_data1 = {'ID':['100','101','102','105','106']
        ,'city':['Tokyo','Chiba','Kyoto','Gunma','Tokyo']
        ,'birth_year':[1991,1992,1985,1996,1981]
        ,'name':['Yamada','Sato','Suzuki','Kitamura','Aoki']}
data_frame1 = pd.DataFrame(attri_data1)

# Prepare data set 2 (per-ID scores); it shares only the IDs '100'-'102' with data set 1
attri_data2 = {'ID':['100','101','102','103','104']
        ,'math':[34,77,45,81,98]
        ,'English':[47,64,16,53,37]
        ,'sex':['F','M','F','F','M']
        ,'index_num':[0,1,2,3,4]}
data_frame2 = pd.DataFrame(attri_data2)


In [None]:
data_frame1

In [None]:
data_frame2

In [None]:
k = pd.read_pickle('kankaku_p2020.pickle')
k

In [None]:
r.data_p

In [None]:
hr.filtered_df

In [None]:
hr.data_dict['間隔']

In [None]:
r.data_c.sort_values('date')

In [None]:
hr.target_df[hr.target_df['date'] < '20200314']

In [None]:
hr.target_df[hr.target_df['date'] < '20200314'].sort_values('date', ascending=False).head(1)

In [None]:
r.data_c

In [None]:
a = pd.read_pickle('results.pickle')
a

In [None]:
LabelEncoder().fit_transform(results['horse_id']).max()

In [None]:
results['horse_id'].nunique()

In [None]:
### マスク関数

In [None]:
pd.get_dummies(results[['weather', 'race_type']])

In [None]:
r.data_pe['class_type']

In [None]:
results['weather'].unique()

In [None]:
weathers = results['weather'].unique()

In [None]:
sample = st.data_pe[['weather', 'race_type', 'ground_state', '性']].copy()
sample['weather'] = pd.Categorical(sample['weather'], weathers)

In [None]:
# Keep only the rows dated after 2019-12-31.
# (The original indexed the undefined name `result4`; `results4` is the frame
# whose 'date' column is being tested.)
results4 = results4[pd.to_datetime(results4['date'], format="%Y年%m月%d日") > '20191231']
# Fix the category set to the class types that actually occur
class_types = results4['class_type'].unique()
s = pd.DataFrame()
s['class_type'] = results4['class_type']
s['class_type'] = pd.Categorical(s['class_type'], class_types)

In [None]:
pd.get_dummies(s)

In [None]:
rc = pd.read_pickle('rc_202104_0613.pickle')
rc = rc[['horse_id', 'class_type', 'race_id']]
rc['horse_id'] = rc['horse_id'].astype(str)
rc

In [None]:
rc = pd.read_pickle('rc_2020.pickle')
rc = rc[['horse_id', 'class_type', 'race_id']]
rc['horse_id'] = rc['horse_id'].astype(str)
rc

In [None]:
results = pd.read_pickle('race_data202104_0613.pickle')

In [None]:
results['date_time'] = pd.to_datetime(results['date'], format="%Y年%m月%d日")
results['race_id'] = results.index

In [None]:
results = results.merge(rc, how='left',on=['horse_id', 'race_id'], left_index=True)

In [None]:
results.to_pickle('results_p_2021040_0613')

In [None]:
results = pd.read_pickle('results.pickle')
results['date_time'] = pd.to_datetime(results['date'], format="%Y年%m月%d日")
results['race_id'] = results.index

In [None]:
results

In [None]:
r1 = results[results['date_time'] < '20200101']
r2 = results[~(results['date_time'] < '20200101')]
len(r1) + len(r2) == len(results)

In [None]:
results['date_time'] = pd.to_datetime(results['date'], format="%Y年%m月%d日")

In [None]:
results[results['date_time'] < '20200101'].sort_values('date_time')

In [None]:
r3 = r2.merge(rc, on=['horse_id', 'race_id'], left_index=True)
r3

In [None]:
results4 = pd.concat([r1, r3])

In [None]:
type(results['date'][0])

In [None]:
results4

In [None]:
r1 = results[results['date_time'] > '20161231']
r2 = results[~(results['date_time'] > '20161231')]

In [None]:
#rc前準備
rc = pd.read_pickle('rc_202017.pickle')
rc = rc[['horse_id', 'class_type', 'race_id']]
rc['horse_id'] = rc['horse_id'].astype(str)
rc

In [None]:
print(type(rc['race_id'][0]), type(rc['horse_id'][0]))
print(type(r1['race_id'][0]), type(r1['horse_id'][0]))

In [None]:
r3 = r1.merge(rc, on=['horse_id', 'race_id'], left_index=True)

In [None]:
r = pd.concat([r2, r3])

In [None]:
url = 'https://race.netkeiba.com/race/shutuba.html?race_id=202105020709'
df = pd.read_html(url)[0]
df = df.T.reset_index(level=0, drop=True).T
html = requests.get(url)
html.encoding = "EUC-JP"
soup = BeautifulSoup(html.text, "html.parser")
race_name = soup.find('div', attrs={'class': 'RaceName'}).text


In [None]:
race_name = soup.find('div', attrs={'class': 'RaceName'}).text

In [None]:
re.findall(r'\w+', race_t)

In [None]:
soup.find('span', attrs={'class': 'Icon_GradeType'}).text

In [None]:
soup.select('.Icon_GradeType', ::after).content

In [None]:
e = soup.find_all('span', class_='Icon_GradeTypea')
e

In [None]:
r.data['class_type'].unique()

In [None]:
# Fetch one race card and work out the race grade from its grade icon.
url = 'https://race.netkeiba.com/race/shutuba.html?race_id=202105020709'
df = pd.read_html(url)[0]
# Drop the first level of the column header
df = df.T.reset_index(level=0, drop=True).T
html = requests.get(url)
html.encoding = "EUC-JP"
soup = BeautifulSoup(html.text, "html.parser")
race_name = soup.find('div', attrs={'class': 'RaceName'}).text
e = soup.find_all('span', class_='Icon_GradeType')

# CSS class of the grade icon -> grade label.
# The original cell was a chain of `if` statements whose bodies were only
# these labels written as comments, which is a SyntaxError.
# NOTE(review): '1660下' is copied verbatim from the original comment - confirm the figure.
GRADE_TYPE_LABELS = {
    'Icon_GradeType1': 'Ｇ１',
    'Icon_GradeType2': 'Ｇ２',
    'Icon_GradeType3': 'Ｇ３',
    'Icon_GradeType4': '重賞',
    'Icon_GradeType5': 'ｵｰﾌﾟﾝ',
    'Icon_GradeType6': '1660下',
    'Icon_GradeType7': '1000下',
    'Icon_GradeType8': '900下',
    'Icon_GradeType9': '500下',
    'Icon_GradeType10': 'ＪＧ１',
    'Icon_GradeType11': 'ＪＧ２',
    'Icon_GradeType12': 'ＪＧ３',
    'Icon_GradeType15': 'OP(L)',
    'Icon_GradeType16': '3勝',
    'Icon_GradeType17': '2勝',
    'Icon_GradeType18': '1勝',
}

# e[0]['class'] is the list of CSS classes of the first icon; a page without
# a grade icon yields no label instead of an IndexError.
grade_classes = e[0]['class'] if e else []
grade_label = next(
    (label for css_class, label in GRADE_TYPE_LABELS.items() if css_class in grade_classes),
    None,
)
grade_label
    

In [None]:
if e in ['Icon_GradeType']:
    de = e

In [None]:
for a in e:
    print(a['class'])

In [None]:
type(e[0]['class'])

In [None]:
if e[0]['class'] in ['Icon_GradeType']:
    de = e[0]['class']

In [None]:
if 'Icon_GradeType17'  in e[0]['class']:
    ab  = 0

In [None]:
ab

In [None]:
'新馬' in '3歳未勝利'

In [None]:
r.data = r.data[pd.to_datetime(r.data['date'], format="%Y年%m月%d日") > '20191231']

In [None]:
r.data['class_type'].unique()

In [None]:
r.data['class_type'] = r.data['class_type'].map(lambda x: '1勝' if x == '500万' else x)

In [None]:
r.data

In [None]:
hr.horse_results

In [None]:
rc2 = pd.read_pickle('rc_202017.pickle')
rc2

In [None]:
r.data['date'] = pd.to_datetime(r.data['date'], format="%Y年%m月%d日")

In [None]:
r.data['date']

In [None]:
r = Results.read_pickle(['results_addclass.pickle'])
r

In [None]:
r.data

In [None]:
r.data

In [None]:
hr.horse_results

In [None]:
a = pd.read_pickle('rc_202017_date.pickle')

In [None]:
cource_info_list

In [None]:
peds['peds_0'].unique()[350:400]

In [None]:
type(a) == 'pandas.core.series.Series'

In [None]:
r.data_c

In [None]:
'芝' in '芝1200' 


In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
Nx = 30
Ny = 30
learntime = 50000
alpha = 0.08
weight = np.random.random([Nx,Ny,3])

def som(ColorVec):
    """
    Run one self-organising-map update step on the global ``weight`` grid.

    The cell whose weight vector is closest (squared Euclidean distance) to
    ``ColorVec`` is located, and every cell in the 5x5 window centred on it
    is moved towards ``ColorVec`` by the global learning rate ``alpha``.
    The window is clipped at the grid border. The original instead let
    negative indices wrap around to the opposite edge, and hid the
    out-of-range errors on the other side behind a bare ``except``.

    Parameters:
    ----------
    ColorVec : numpy.ndarray
        input vector, broadcast against the last axis of ``weight``
    """
    n_rows, n_cols = weight.shape[0], weight.shape[1]
    min_index = np.argmin(((weight-ColorVec)**2).sum(axis=2))
    # argmin indexes the flattened (row, column) grid
    mini, minj = divmod(int(min_index), n_cols)
    rows = slice(max(mini - 2, 0), min(mini + 3, n_rows))
    cols = slice(max(minj - 2, 0), min(minj + 3, n_cols))
    # in-place update of the shared grid
    weight[rows, cols] += alpha * (ColorVec - weight[rows, cols])
    
# Train the map on random colour vectors.
# The loop variable was called `time` in the original, which rebinds the
# `time` module imported at the top of the notebook to an int and breaks any
# later time.time() / time.sleep() call.
for step in range(learntime):
    ColorVec = np.random.rand(3)
    som(ColorVec)
    
print("imgshow")
im = plt.imshow(weight,interpolation='none')
plt.show()

In [None]:
cource_prepro['コース'].value_counts()

In [None]:
r.data_h.columns

In [None]:
df1 = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['col_0', 'col_1', 'col_2', 'col_3'],
                  index=['row_0', 'row_1', 'row_2'])

In [None]:
df2 = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['col_4', 'col_5', 'col_6', 'col_7'],
                  index=['row_0', 'row_1', 'row_2'])

In [None]:
df1 = df2
df1

In [None]:
df1

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
from ipywidgets import IntProgress
IntProgress(10,max=100)
for i in tqdm(range(10)):
  time.sleep(1)

In [None]:
a = r.data.copy()

In [None]:
df = pd.DataFrame({
    'city': ['osaka', 'osaka', 'osaka', 'osaka', 'tokyo', 'tokyo', 'tokyo'],
    'food': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
df

In [None]:
df.groupby('city').size()

In [None]:
hr = HorseResults(horse_results)

In [None]:
hr.horse_results.index

In [None]:
hr.horse_results = hr.horse_results.merge(a.to_frame(), right_index=True, left_index=True, how='left')

In [None]:
hr.horse_results.rename(columns={0: '出走回数'})

In [None]:
hr.horse_results

In [None]:
n = '1.3/4'
int(n.split('.')[1].split('/')[0]) / int(n.split('.')[1].split('/')[1])

In [None]:
import pyparsing as pp
# Column order of the table returned by ParsePass.parse().
columns = ['diff', 'horse_no']
# Amount added to the running diff between consecutive horses inside "(...)".
DIFF_GROUP = 0.3
# Amount added for a ',' separator or for no separator at all.
DIFF_MIN = 1.5
# Amount added for a '-' separator.
DIFF_MID = 3.0
# Amount added for a '=' separator.
DIFF_MUCH = 6.0


class ParsePass():
    """Parse a passing-order string such as ``'2(5,9)13-(3,8)7'``.

    The string lists horse numbers from front to back. Horses inside
    parentheses form one group; ',' (or nothing), '-' and '=' between
    elements stand for increasingly large gaps. While parsing, a running
    ``diff`` is accumulated from the DIFF_* constants and recorded for each
    horse.

    Rows are collected in a plain list and converted to a DataFrame once per
    parse. ``DataFrame.append`` is not used: it was removed in pandas 2.0 and
    copies the whole frame on every call.
    """

    def __init__(self):
        # Per-parse state; parse() resets it before every run.
        self._rows = []
        self._diff = 0
        self._data = pd.DataFrame(columns=columns)

        # Horse number
        horse_no = pp.Word(pp.nums).setParseAction(self._horse_no_action)

        # Group of horses: "(n,n,...)"
        group = pp.Suppress(pp.Literal('(')) + \
                    pp.Optional(pp.delimitedList(pp.Word(pp.nums), delim=',')) + \
                    pp.Suppress(pp.Literal(')'))
        group.ignore('*')
        group.setParseAction(self._group_action)

        # One element of the order: either a group or a single horse
        element = (group | horse_no)

        # Gap to the preceding element, chosen by the separator in front of it
        diff_min = pp.Suppress(pp.Optional(pp.Literal(','))).setParseAction(self._diff_min_action) + element
        diff_mid = pp.Suppress(pp.Literal('-')).setParseAction(self._diff_mid_action) + element
        diff_much = pp.Suppress(pp.Literal('=')).setParseAction(self._diff_much_action) + element

        # Whole expression
        self._passing_order = element + pp.ZeroOrMore( diff_mid | diff_much | diff_min )

    def _horse_no_action(self, token):
        """Record a single horse at the current running diff."""
        self._rows.append({'diff': self._diff, 'horse_no': token[0]})
        return

    def _group_action(self, token):
        """Record every horse of a group, spaced DIFF_GROUP apart."""
        members = list(token)
        if not members:
            # An empty "()" records nothing and must not move the diff backwards.
            return
        for no in members:
            self._rows.append({'diff': self._diff, 'horse_no': no})
            self._diff += DIFF_GROUP
        # Undo the step added after the last member so the running diff
        # stays at the position of the group's last horse.
        self._diff -= DIFF_GROUP
        return

    def _diff_min_action(self, token):
        """Advance the running diff for a ',' (or missing) separator."""
        self._diff += DIFF_MIN
        return

    def _diff_mid_action(self, token):
        """Advance the running diff for a '-' separator."""
        self._diff += DIFF_MID
        return

    def _diff_much_action(self, token):
        """Advance the running diff for a '=' separator."""
        self._diff += DIFF_MUCH
        return

    def _build_frame(self):
        """Turn the collected rows into a DataFrame indexed by 1-based 'rank'."""
        data = pd.DataFrame(self._rows, columns=columns)
        data.index = np.arange(1, len(data) + 1)
        data.index.name = 'rank'
        return data

    def parse(self, pass_str):
        """Parse ``pass_str`` and return its table.

        Parameters:
        ----------
        pass_str : str
            Passing-order string.

        Returns:
        -------
        pandas.DataFrame
            Columns 'diff' and 'horse_no', one row per horse in the order it
            appears in the string, indexed by 'rank' starting at 1. The same
            frame is kept in ``self._data``.
        """
        # Reset state from any previous parse
        self._rows = []
        self._diff = 0
        # The parse actions fill self._rows as a side effect
        self._passing_order.parseString(pass_str)
        self._data = self._build_frame()

        return self._data

In [None]:
# Sample passing-order strings used to exercise ParsePass.
pass_data = ['2(5,9)13(1,10)15(4,12)-(3,8,14)(6,11)7', '2(5,9)(1,13)10(15,12)(4,8,14)3(6,11)7', '2,5(9,13)1(10,12,14)-(15,8)(6,4,11)3,7', '(*2,13)5,14,12(1,9)(10,8)(15,11)6,4-(3,7)']

# Parse every sample with one parser instance and print each resulting table.
pass_parsing = ParsePass()
for pass_str in pass_data:
    print(pass_parsing.parse(pass_str))

In [None]:
# Load results_.pickle into `r`.
# NOTE(review): other cells read r.data, r.data_h, r.data_p and r.data_pe, i.e.
# they expect a different kind of object under the same name. Rebinding `r`
# here makes those cells depend on execution order.
r = pd.read_pickle('results_.pickle')

In [None]:
# Display r.
r

In [None]:
# The 59 most frequent values of the '着差' column (value_counts sorts by
# descending frequency by default).
r['着差'].value_counts()[:59]

In [None]:
# Value of each single margin notation. The numbers are the ones this
# notebook already used for the single notations.
_TYAKUSA_BASE = {
    'ハナ': 0.08,
    'アタマ': 0.15,
    'クビ': 0.35,
    '1/2': 0.5,
    '3/4': 0.75,
    '1.1/4': 1.25,
    '1.1/2': 1.5,
    '1.3/4': 1.75,
    '2.1/2': 2.5,
    '3.1/2': 3.5,
    '大': 15,
}
# Whole-number margins '1' .. '10' ('10' was missing from the hand-written table).
_TYAKUSA_BASE.update({str(n): float(n) for n in range(1, 11)})

# Combined notations ("x+y") that are mapped. Their values are derived from
# _TYAKUSA_BASE rather than typed by hand: the hand-typed sums disagreed with
# the base values (e.g. 'クビ+3/4' was 1.05 while '3/4+クビ' was 1.1).
_TYAKUSA_COMBINED = [
    'クビ+クビ',
    '1.1/4+クビ',
    'クビ+3/4',
    'クビ+1.3/4',
    '1/2+1/2',
    '1/2+ハナ',
    'クビ+2',
    'クビ+1',
    'ハナ+クビ',
    '1.3/4+クビ',
    'クビ+1/2',
    '2.1/2+ハナ',
    'アタマ+クビ',
    'ハナ+1.1/4',
    '2+アタマ',
    'ハナ+3/4',
    '1.1/4+ハナ',
    '1.1/4+2.1/2',
    '3/4+クビ',
    'クビ+2.1/2',
    '2+1/2',
    '1.3/4+2.1/2',
    '1.1/2+3.1/2',
    '1.1/4+3/4',
    '1.3/4+1/2',
    '1/2+クビ',
    '7+大',
    '3/4+3/4',
    '3.1/2+6',
    '1+1/2',
    '3/4+アタマ',
    '2+3/4',
    '1.3/4+ハナ',
]

# Lookup table from margin notation to numeric value.
tyakusa_dict = dict(_TYAKUSA_BASE)
tyakusa_dict.update({
    key: round(sum(_TYAKUSA_BASE[part] for part in key.split('+')), 2)
    for key in _TYAKUSA_COMBINED
})
# '同着' is passed through unchanged as a marker.
tyakusa_dict['同着'] = '同着'

In [None]:
# Look up the rows of r labelled '202110020812'.
r.loc['202110020812']

In [None]:
def add_tyakusa(arr):
    """Turn per-position margins into cumulative margins.

    Parameters:
    ----------
    arr : iterable
        One entry per row, in order. Each entry is a number, NaN/None
        (no value) or the string '同着'.

    Returns:
    -------
    list
        Same length as ``arr``:
        * entries before the first numeric margin (NaN or '同着') get the
          negative of the first cumulative value;
        * a numeric entry gets the running total so far, rounded to 2 places;
        * a later '同着' repeats the current running total;
        * a later NaN stays NaN.
        If ``arr`` holds no numeric margin at all (empty or all missing),
        a list of NaN of the same length is returned.
    """
    acc = 0
    return_arr = []
    n_leading = 0      # rows seen before the first numeric margin
    f_1tyaku = True    # True until the first numeric margin is seen
    for a in arr:
        if a == '同着':
            if f_1tyaku:
                n_leading += 1
            else:
                # Same position as the previous row: repeat the running total.
                return_arr.append(round(acc, 2))
            continue

        if pd.isna(a):
            if f_1tyaku:
                n_leading += 1
            else:
                return_arr.append(np.nan)
            continue

        f_1tyaku = False

        acc += a
        return_arr.append(round(acc, 2))

    if not return_arr:
        # No numeric margin to mirror onto the leading rows.
        return [np.nan] * n_leading

    # Leading rows get the first margin with the sign flipped.
    return [return_arr[0] * -1] * n_leading + return_arr

In [None]:
a = [1,3,4]
# map() returns a lazy, single-use iterator rather than a list.
b = map(str,a)

In [None]:
# Materialise b as a list.
list(b)

In [None]:
# '着差_頭身' values of the rows labelled '202109010711'.
r.loc['202109010711']['着差_頭身']

In [None]:
# Run add_tyakusa on the '着差_頭身' values of one index label, then print
# the result and its length.
a = add_tyakusa(r.loc['202105040107']['着差_頭身'])
print(a)
print(len(a))

In [None]:
# Two more passing-order strings to exercise ParsePass.
pass_data = ['10-8-5-(2,3)(1,12)6-(4,7)9,11', '10(8,5)-6(2,7)(12,1)-(3,11)-9,4']

# Parse every sample with one parser instance and print each resulting table.
pass_parsing = ParsePass()
for pass_str in pass_data:
    print(pass_parsing.parse(pass_str))

In [None]:
# Parse one passing-order string and keep only its 'diff' column.
a = pass_parsing.parse('10-8-5-(2,3)(1,12)6-(4,7)9,11')['diff']
# Store those values as a new 'first_' column of r1.
# NOTE(review): r1 is not defined in the visible part of the notebook —
# confirm where it comes from and that its row count matches.
r1['first_'] = a.values.tolist()

In [None]:
# Display whatever `a` is currently bound to.
a

In [None]:
# Display r1.
r1

In [None]:
# Translate the '着差' strings through tyakusa_dict into a new '着差_頭身'
# column; values with no dictionary entry become NaN.
r['着差_頭身'] = r['着差'].map(tyakusa_dict)
r

In [None]:
# Convert numeric entries to float; leave missing values and the markers
# '頭身' / '同着' untouched. pd.isna() is used for the missing-value check
# because `x == np.nan` is always False (NaN never compares equal to anything).
r['着差_頭身'] = r['着差_頭身'].map(
    lambda x: x if (pd.isna(x) or x in ('頭身', '同着')) else float(x)
)

In [None]:
# Rows whose '着差' value is '同着'.
r[r['着差'] == '同着']

In [None]:
# Add 2 to every element. Python lists have no .map() method, so a
# comprehension is used. The list is named `values` because binding the name
# `list` would shadow the builtin and break later calls such as list(b).
values = [1, 3]
[x + 2 for x in values]

In [None]:
# Add a 'day_of_week' column holding the weekday name of each 'date' value,
# formatted with strftime('%A').
st.data_c['day_of_week'] = st.data_c['date'].map(lambda x: x.strftime('%A'))

In [None]:
# Check: arithmetic involving NaN yields NaN.
1.0 + np.nan

In [None]:
# First 50 distinct values of the 'peds_0' column.
peds['peds_0'].unique()[:50]

In [None]:
# Check whether the character '系' occurs in a sample string; print the match.
res = re.compile("[系]").search(
    "ワイルドラッシュ Wild Rush(米) 1994 鹿毛 [血統][産駒] Nearctic系"
)
print(res)

In [None]:
# Split a sample string on ']' and keep the resulting list of pieces in `a`.
a = "スキャターザゴールド Scatter the Gold(加) 1997 黒鹿毛 [血統][産駒] Mr. Prospector系".split(']')

In [None]:
#         mask_jockey = df['jockey_id'].isin(le_jockey.classes_)
#         new_jockey_id = df['jockey_id'].mask(mask_jockey).dropna().unique()
#         le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
#         df['jockey_id'] = le_jockey.transform(df['jockey_id'])

In [None]:
# Take the 'jockey_id' column of r.data_pe.
maskjockey = r.data_pe['jockey_id']

In [None]:
# Bind `a` to a copy of peds.
a = peds.copy()

In [None]:
# Print the 'peds_0' value of every row. axis=1 hands each row to the
# function; with the default axis=0 each column is passed instead, and the
# label 'peds_0' would then be looked up in the row index.
a.apply(lambda x: print(x['peds_0']), axis=1)

In [None]:
# Display whatever `a` is currently bound to.
a

In [None]:
# Frequency of every distinct value in the '着差' column of results.
results['着差'].value_counts()

In [None]:
# Building blocks for a small pyparsing grammar.
# A word is a run of letters and digits.
word = pp.Word(pp.alphanums)
# One or more words, returned as a single nested group.
command = pp.Group(pp.OneOrMore(word))
# The '->' separator; matched but dropped from the results.
token =pp.Suppress('->')
# device and argument are defined exactly like command.
device = pp.Group(pp.OneOrMore(word))
argument = pp.Group(pp.OneOrMore(word))

In [None]:
# Full expression: command '->' device, optionally followed by '->' argument.
event = command + token + device + pp.Optional(token + argument)

In [None]:
# Parse a sample string. The trailing "-> argument" part of the grammar is
# optional, so the result holds two or three groups; unpacking it straight
# into three names raises ValueError whenever the argument is absent (as in
# this sample). arg is None in that case.
parsed = event.parseString("increase -> frigde tempear")
cmd, dev = parsed[0], parsed[1]
arg = parsed[2] if len(parsed) > 2 else None

In [None]:
# Rows of r.data_p whose 'straight_type' value is missing.
a = r.data_p[r.data_p['straight_type'].isnull()]

In [None]:
# Display r.data_p.
r.data_p

In [None]:
# Display cource_info.
cource_info

In [None]:
from pycaret.datasets import get_data

In [None]:
# Page address that the next cell passes to pd.read_html.
url = "https://db.netkeiba.com/race/202105050608"

In [None]:
# Read all HTML tables found at `url` and keep the one at position 4
# (the fifth table). This needs network access.
df = pd.read_html(url)[4]

In [None]:
# Display df.
df

In [None]:
# Load the object stored in 'r.data_h.pickle'.
rh = pd.read_pickle('r.data_h.pickle')

In [None]:
# Display rh.
rh

In [None]:
# Load 'horse_results_.pickle' and display it.
# NOTE(review): this rebinds `hr`, which another cell binds to a HorseResults
# object and accesses through hr.horse_results — the meaning of `hr` therefore
# depends on which cell ran last.
hr = pd.read_pickle('horse_results_.pickle')
hr

In [None]:
# Display r.data_p.
r.data_p