In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
from itertools import combinations, permutations
import matplotlib.pyplot as plt



In [3]:
df = pd.read_pickle(
    "/Users/KeD/Scripts/python/keiba/KeibaAI/Horse_results/horse_results_0.pickle"
)
df

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金,jockey_id
2016104880,2022/02/27,2中山2,晴,11.0,中山記念(G2),,16,7.0,14,17.5,...,97.0,4-4-4-2,35.2-37.3,38.1,478(+2),,,パンサラッサ,,01127
2016104880,2021/11/14,2福島4,晴,11.0,福島記念(G3),,16,8.0,16,22.6,...,87.0,2-2-2-2,33.6-37.6,40.2,476(-4),,,パンサラッサ,,01043
2016104880,2021/09/12,4中山2,曇,11.0,京成杯オータムH(G3),,16,5.0,9,36.8,...,110.0,1-1-1,34.6-35.2,35.2,480(0),,,カテドラル,1616.4,01096
2016104880,2021/06/13,1札幌2,晴,11.0,函館スプリントS(G3),,16,6.0,11,10.5,...,97.0,8-5,32.8-34.8,34.5,480(0),,,ビアンフェ,,01127
2016104880,2021/03/06,2中山3,晴,11.0,夕刊フジオーシャンS(G3),,16,4.0,8,33.4,...,106.0,2-2,33.7-34.7,34.3,480(+6),,,(カレンモエ),4157.4,01127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016103957,2018/12/19,川崎,晴,11.0,全日本2歳優駿【国際(G1),,14,4.0,5,3.7,...,89.0,7-7-6-5,36.7-40.0,39.4,493(+2),,,ノーヴァレンダ,1225.0,05212
2016103957,2018/11/28,園田,曇,10.0,兵庫ジュニアグランプ(G2),,12,8.0,12,2.0,...,94.0,8-8-5-4,0.0-38.6,38.1,491(-7),,,(オルトグラフ),2200.0,05339
2016103957,2018/11/10,5東京3,晴,9.0,オキザリス賞(500万下),,16,1.0,2,2.1,...,93.0,11-9,35.6-36.3,35.3,498(+6),,出遅れ,(ナンヨーイザヨイ),1036.4,05339
2016103957,2018/09/22,4中山6,曇,1.0,2歳未勝利,,15,5.0,9,2.0,...,80.0,3-3-3-2,36.4-39.1,38.8,492(-4),,出遅れ,(モーンストルム),500.0,05339


In [2]:
class HorseResults:
    def __init__(self, horse_results):
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()
    
    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        return cls(df)
    
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                df = pd.read_html(url)[3]
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    
    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)
        
        #賞金のNaNを0で埋める
        df['賞金'].fillna(0, inplace=True)
        
        #1着の着差を0にする
        df['着差'] = df['着差'].map(lambda x: 0 if x<0 else x)
        
        #レース展開データ
        #n=1: 最初のコーナー位置, n=4: 最終コーナー位置
        def corner(x, n):
            if type(x) != str:
                return x
            elif n==4:
                return int(re.findall(r'\d+', x)[-1])
            elif n==1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))
        
        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']
        
        #開催場所
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        #race_type
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        #距離は10の位を切り捨てる
        #一部の馬で欠損値があり、intに変換できないためfloatに変換する
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(float) // 100
        df.drop(['距離'], axis=1, inplace=True)
        #インデックス名を与える
        df.index.name = 'horse_id'
        
        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']
    
    #n_samplesレース分馬ごとに平均する
    def average(self, horse_id_list, date, n_samples='all'):
        target_df = self.horse_results.query('index in @horse_id_list')
        
        #過去何走分取り出すか指定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date', ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception('n_samples must be >0')
        
        #集計して辞書型に入れる
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))

        #6/6追加: 馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            self.latest = filtered_df.groupby('horse_id')['date'].max().rename('latest')
    
    def merge(self, results, date, n_samples='all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len','race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column], 
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')

        #6/6追加：馬の出走間隔追加のために、全レースの日付を変数latestに格納
        if n_samples == 5:
            merged_df = merged_df.merge(self.latest, left_on='horse_id',
                             right_index=True, how='left')
        return merged_df
    
    def merge_all(self, results, n_samples='all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

#開催場所をidに変換するための辞書型
place_dict = {
    '札幌':'01',  '函館':'02',  '福島':'03',  '新潟':'04',  '東京':'05', 
    '中山':'06',  '中京':'07',  '京都':'08',  '阪神':'09',  '小倉':'10'
}

#レースタイプをレース結果データと整合させるための辞書型
race_type_dict = {
    '芝': '芝', 'ダ': 'ダート', '障': '障害'
}

In [13]:
def split_data(df, test_size=0.3):
    sorted_id_list = df.sort_values("date").index.unique()
    train_id_list = sorted_id_list[: round(len(sorted_id_list) * (1 - test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list) * (1 - test_size)) :]
    train = df.loc[train_id_list]
    test = df.loc[test_id_list]
    return train, test



In [63]:
df = pd.read_pickle('race.pickle')
def convert_to_seconds(timestr):
    minutes, seconds = timestr.split(':')
    return float(minutes)*60 + float(seconds)
df['タイム'] = df['タイム'].fillna('0:00').apply(convert_to_seconds)
df_label = df['着順']
df.drop(['馬名','着差', '調教師'], axis=1, inplace=True)


In [64]:
horse = pd.read_csv('horse_result.csv')


In [40]:
horse.drop(['Unnamed: 0', '日付', '開催', '天気', 'R', '映像', '頭数', '枠番', '馬番',
       'オッズ', '人気', '着順', '騎手', '斤量', '距離', '馬場','タイム', '着差',
       '通過', 'ペース', '上り', '馬体重', '備考', '賞金'], axis=1, inplace=True)

Index(['Unnamed: 0', '日付', '開催', '天気', 'R', 'レース名', '映像', '頭数', '枠番', '馬番',
       'オッズ', '人気', '着順', '騎手', '斤量', '距離', '馬場', '馬場指数', 'タイム', '着差', 'ﾀｲﾑ指数',
       '通過', 'ペース', '上り', '馬体重', '厩舎ｺﾒﾝﾄ', '備考', '勝ち馬(2着馬)', '賞金'],
      dtype='object')

In [76]:
def convert_to_seconds(timestr):
    if '.' in timestr:
        minutes, seconds_decimal = timestr.split(':')
        seconds, decimal = seconds_decimal.split('.')
        return float(minutes) * 60 + float(seconds) + float('0.' + decimal)
    else:
        minutes, seconds = timestr.split(':')
        return float(minutes) * 60 + float(seconds)

horse_temp = horse[(horse['レース名'].str.contains('東京優駿')) & (horse['日付'] >= '2017')].sort_values('日付')
horse_temp['タイム'].fillna('0:00', inplace=True)
horse_temp['タイム'] = horse_temp['タイム'].apply(convert_to_seconds)

numeric_columns = horse_temp.select_dtypes(include='number')
numeric_columns.corr()['タイム']

Unnamed: 0   -0.589190
R            -0.509454
映像                 NaN
頭数            0.290211
枠番            0.003512
馬番            0.012483
オッズ           0.211793
人気            0.237757
斤量            0.115716
タイム           1.000000
着差            0.397711
上り            0.183977
厩舎ｺﾒﾝﾄ             NaN
備考                 NaN
賞金           -0.099759
Name: タイム, dtype: float64

In [73]:
type(horse_temp['タイム'].iloc[0])

str

In [None]:

url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + '202305021201'

html = requests.get(url)
html.encoding = "EUC-JP"

df = pd.read_html(html.text)[0]
# 列名に半角スペースがあれば除去する
df = df.rename(columns=lambda x: x.replace(' ', ''))
df = df.T.reset_index(level=0, drop=True).T

soup = BeautifulSoup(html.text, "html.parser")

texts = soup.find('div', attrs={'class': 'RaceData01'}).text
texts = re.findall(r'\w+', texts)

In [None]:
import pandas as pd

for i in range(1000, 8000, 1000):
    path = f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/Lap_2022_{i}-{i+1000}.pickle"
    df = pd.read_csv(path)
    df.to_pickle(
        f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/2022/Lap_2022_{i}-{i+1000}.pickle"
    )

In [52]:
import numpy as np

year = 2023
for i in range(0, 8000, 1000):
    path = f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/{year}/Race_{year}_{i}-{i+1000}.pickle"
    df = pd.read_pickle(path)
    print(df.shape)

(3059, 29)
(3195, 29)
(4420, 29)
(7266, 29)
(6963, 29)
(4199, 29)
(5766, 29)
(7010, 29)


In [53]:
year = 2023
i = 3000
f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/{year}/Race_{year}_{i}-{i+1000}.pickle"
df = pd.read_pickle(path)
df

Unnamed: 0,枠番,馬番,斤量,タイム,タイム指数,上り,単勝,備考,馬主,賞金（万円）,...,性,年齢,体重,体重変化,調教場所,調教師名前,通過1,通過2,通過3,通過4
202309010805,3,3,56.0,135.1,79.0,35.0,1.5,,飯塚知一,550.0,...,牡,3,500,-6,西,辻野泰之,4,4,3.0,3.0
202309010805,5,8,54.0,135.5,76.0,35.7,7.3,,中西浩一,220.0,...,牝,3,474,-4,西,河内洋,1,1,1.0,1.0
202309010805,8,14,56.0,135.5,76.0,35.2,80.1,,ラッキーフィールド,140.0,...,牡,3,476,2,西,新谷功一,5,6,6.0,6.0
202309010805,7,12,56.0,135.6,76.0,35.1,8.2,出遅れ,ルクス,83.0,...,牡,3,464,0,西,清水久詞,9,9,8.0,7.0
202309010805,7,11,56.0,135.7,75.0,35.8,34.7,,国本哲秀,55.0,...,牡,3,472,-10,西,茶木太樹,2,2,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202310020612,5,10,58.0,107.2,68.0,38.7,15.1,,ゴドルフィン,,...,牡,5,494,0,東,高柳瑞樹,9,10,10.0,12.0
202310020612,4,8,57.0,107.2,68.0,39.1,33.4,,松本好雄,,...,牡,5,444,4,西,飯田祐史,5,4,6.0,5.0
202310020612,4,7,57.0,107.4,67.0,38.8,41.1,出遅れ,コウトミックレーシング,,...,牡,4,466,4,東,高橋裕,13,14,13.0,12.0
202310020612,7,13,58.0,108.5,57.0,40.2,122.3,出遅れ,松本好雄,,...,牡,5,490,19,西,河内洋,7,7,7.0,14.0
