In [207]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
from itertools import combinations, permutations
import matplotlib.pyplot as plt

In [208]:
class DataProcessor:
    """
    Attributes:
    ----------
    data_raw : pd.DataFrame
        rawデータ
    data_preprocessed : pd.DataFrame
        preprocessing後のデータ
    data_horse_result_merged : pd.DataFrame
        merge_horse_results後のデータ
    data_peds_merged : pd.DataFrame
        merge_peds後のデータ
    data_categorical_processed : pd.DataFrame
        process_categorical後のデータ
    no_peds: Numpy.array
        merge_pedsを実行した時に、血統データが存在しなかった馬のhorse_id一覧
    """

    def __init__(self):
        self.data_raw = pd.DataFrame()
        self.data_preprocessed = pd.DataFrame()
        self.data_horse_result_merged = pd.DataFrame()
        self.data_peds_merged = pd.DataFrame()
        self.data_categorical_processed = pd.DataFrame()

    def merge_horse_results(self, hr, n_samples_list=[5, 9, "all"]):
        """
        馬の過去成績データから、
        n_samples_listで指定されたレース分の着順と賞金の平均を追加してdata_hに返す

        Parameters:
        ----------
        hr : HorseResults
            馬の過去成績データ
        n_samples_list : list, default [5, 9, 'all']
            過去何レース分追加するか
        """

        self.data_horse_result_merged = self.data_preprocessed.copy()
        for n_samples in n_samples_list:
            self.data_horse_result_merged = hr.merge_all(self.data_horse_result_merged, n_samples=n_samples)

        # 6/6追加： 馬の出走間隔追加
        self.data_horse_result_merged["interval"] = (self.data_horse_result_merged["date"] - self.data_horse_result_merged["latest"]).dt.days
        self.data_horse_result_merged.drop(["開催", "latest"], axis=1, inplace=True)

    def merge_peds(self, peds):
        """
        5世代分血統データを追加してdata_peに返す

        Parameters:
        ----------
        peds : Peds.peds_e
            Pedsクラスで加工された血統データ。
        """

        self.data_peds_merged = self.data_horse_result_merged.merge(
            peds, left_on="horse_id", right_index=True, how="left"
        )
        self.no_peds = self.data_peds_merged[self.data_peds_merged["peds_0"].isnull()][
            "horse_id"
        ].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')

    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        カテゴリ変数を処理してdata_cに返す

        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            horse_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        le_jockey : sklearn.preprocessing.LabelEncoder
            jockey_idを0始まりの整数に変換するLabelEncoderオブジェクト。
        results_m : Results.data_pe
            ダミー変数化のとき、ResultsクラスとShutubaTableクラスで列を合わせるためのもの
        """

        df = self.data_peds_merged.copy()

        # ラベルエンコーディング。horse_id, jockey_idを0始まりの整数に変換
        mask_horse = df["horse_id"].isin(le_horse.classes_)
        new_horse_id = df["horse_id"].mask(mask_horse).dropna().unique()
        le_horse.classes_ = np.concatenate([le_horse.classes_, new_horse_id])
        df["horse_id"] = le_horse.transform(df["horse_id"])
        mask_jockey = df["jockey_id"].isin(le_jockey.classes_)
        new_jockey_id = df["jockey_id"].mask(mask_jockey).dropna().unique()
        le_jockey.classes_ = np.concatenate([le_jockey.classes_, new_jockey_id])
        df["jockey_id"] = le_jockey.transform(df["jockey_id"])

        # horse_id, jockey_idをpandasのcategory型に変換
        df["horse_id"] = df["horse_id"].astype("category")
        df["jockey_id"] = df["jockey_id"].astype("category")

        # そのほかのカテゴリ変数をpandasのcategory型に変換してからダミー変数化
        # 列を一定にするため
        weathers = results_m["weather"].unique()
        race_types = results_m["race_type"].unique()
        ground_states = results_m["ground_state"].unique()
        sexes = results_m["性"].unique()
        df["weather"] = pd.Categorical(df["weather"], weathers)
        df["race_type"] = pd.Categorical(df["race_type"], race_types)
        df["ground_state"] = pd.Categorical(df["ground_state"], ground_states)
        df["性"] = pd.Categorical(df["性"], sexes)
        df = pd.get_dummies(df, columns=["weather", "race_type", "ground_state", "性"])

        self.data_categorical_processed = df

In [209]:
def update_data(old, new):
    """
    Parameters:
    ----------
    old : pandas.DataFrame
        古いデータ
    new : pandas.DataFrame
        新しいデータ
    """

    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old, new])

In [210]:
class Results(DataProcessor):
    def __init__(self, results):
        super(Results, self).__init__()
        self.data = results
        # ネット競馬プレミアムのログイン情報
        self.user_id = "ken.osechi@gmail.com"
        self.password = "BQj4WGwzLxJZNkWiw"
        self.login_info = {
            "login_id": self.user_id,
            "pswd": self.password,
        }
        self.session = requests.session()
        self.url_login = "https://regist.netkeiba.com/account/?pid=login&action=auth"
        self.ses = self.session.post(self.url_login, data=self.login_info)
        self.cookie = self.ses.cookies

    @classmethod
    def read_pickle(cls, path_list):
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(self, race_id_list):
        """
        レース結果データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """

        # race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.sp.netkeiba.com/race/" + race_id

                # html = requests.get(url)
                # html.encoding = "EUC-JP"

                html = self.session.get(url, cookies=self.cookie)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.content, "html.parser")

                # メインとなるテーブルデータを取得
                df = pd.read_html(html.text)[0]
                # 列名に半角スペースがあれば除去する
                df = df.rename(columns=lambda x: x.replace(" ", ""))

                # 天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                soup = BeautifulSoup(html.text, "html.parser")
                texts = (
                    soup.find("div", attrs={"class": "data_intro"})
                    .find_all("p")[0]
                    .text
                    + soup.find("div", attrs={"class": "data_intro"})
                    .find_all("p")[1]
                    .text
                )
                info = re.findall(r"\w+", texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(
                            df
                        )  
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                # 馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                # インデックスをrace_idにする
                df.index = [race_id] * len(df)

                race_results[race_id] = df
            # 存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError:  # 存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            # wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            # Jupyterで停止ボタンを押した時の対処
            except:
                break

        # pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])

        return race_results_df

    # 前処理
    def preprocessing(self):
        df = self.data.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df["着順"] = pd.to_numeric(df["着順"], errors="coerce")
        df.dropna(subset=["着順"], inplace=True)
        df["着順"] = df["着順"].astype(int)
        df["rank"] = df["着順"].map(lambda x: 1 if x < 4 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0]
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1]

        # errors='coerce'で、"計不"など変換できない時に欠損値にする
        df["体重"] = pd.to_numeric(df["体重"], errors="coerce")
        df["体重変化"] = pd.to_numeric(df["体重変化"], errors="coerce")

        # 単勝をfloatに変換
        df["単勝"] = df["単勝"].astype(float)
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        # 不要な列を削除
        df.drop(
            ["タイム", "着差", "調教師", "性齢", "馬体重", "馬名", "騎手", "人気", "着順"],
            axis=1,
            inplace=True,
        )

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")

        # 開催場所
        df["開催"] = df.index.map(lambda x: str(x)[4:6])

        # 6/6出走数追加
        df["n_horses"] = df.index.map(df.index.value_counts())

        self.data_p = df

    # カテゴリ変数の処理
    def process_categorical(self):
        self.le_horse = LabelEncoder().fit(self.data_peds_merged["horse_id"])
        self.le_jockey = LabelEncoder().fit(self.data_peds_merged["jockey_id"])
        super().process_categorical(
            self.le_horse, self.le_jockey, self.data_peds_merged
        )

In [240]:
class RaceResultsSP:
    def __init__(self):
        super().__init__()
        self.user_id = "ken.osechi@gmail.com"
        self.password = "BQj4WGwzLxJZNkWiw"
        self.login_info = {
            "login_id": self.user_id,
            "pswd": self.password,
        }
        self.session = requests.session()
        self.url_login = "https://regist.netkeiba.com/account/?pid=login&action=auth"
        self.ses = self.session.post(self.url_login, data=self.login_info)
        self.cookie = self.ses.cookies

    def scrape(self, race_id_list):
        # race_idをkeyにしてDataFrame型を格納
        race_results = {}
        odds_results = {}
        lap_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.sp.netkeiba.com/race/" + race_id
                html = self.session.get(url, cookies=self.cookie)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.content, "html.parser")
                race_data = soup.find("div", class_="RaceData").find_all("span")
                race_data = "".join(tag.get_text() for tag in race_data)

                # htmlからBeautifulSoupでデータを抽出
                df = pd.read_html(html.text)[0]
                df = df.rename(columns=lambda x: x.replace(" ", ""))  # カラム名の空白削除

                # 天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                soup = BeautifulSoup(html.text, "html.parser")
                texts = (
                    soup.find("div", attrs={"class": "RaceHeader_Value"}).text
                    + soup.find("div", attrs={"class": "RaceHeader_Value_Others"}).text
                )

                date = soup.find("span", attrs={"class": "Race_Date"}).text.strip()
                df["date"] = [date] * len(df)

                race_info = re.findall(r"\w+", texts)

                for text in race_info:
                    if "芝" in text:
                        df["race_type"] = ["芝"] * len(df)
                    if "ダート" in text:
                        df["race_type"] = ["ダート"] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                        # 馬場指数
                        if "障" in text:
                            df["ground_index"] = [0] * len(df)
                        else:
                            baba_index = pd.read_html(html.text)[2].iloc[0, 1]
                            df["ground_index"] = [baba_index] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)

                # horse_id
                horse_id_list = []
                horse_a_list = (
                    soup.find(
                        "table", attrs={"class": "table_slide_body ResultsByRaceDetail"}
                    )
                    .find_all("tbody")[0]
                    .find_all("a", attrs={"href": re.compile(r"horse/(\d+)/")})
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])

                jockey_id_list = []
                jockey_a_list = (
                    soup.find(
                        "table", attrs={"class": "table_slide_body ResultsByRaceDetail"}
                    )
                    .find_all("tbody")[0]
                    .find_all("a", attrs={"href": re.compile(r"jockey/(\d+)/")})
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])

                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list

                # indexをrace_idにする
                df.index = [race_id] * len(df)
                race_results[race_id] = df

                # オッズの取得
                # odds = pd.read_html(html.text)[1]
                # type = odds.iloc[:, 0]
                # win = odds.iloc[:, 2]
                # odds_df = pd.concat([type, win], axis=1).set_index(0).T
                # odds_df.index = [race_id]
                # odds_results[race_id] = odds_df

                # laptimeの取得
                len_ls = (
                    [i for i in range(100, 2600, 100)]
                    + [i + 50 for i in range(100, 1200, 100)]
                    + [i for i in range(2600, 3601, 200)]
                )
                len_ls = sorted(len_ls)
                len_ls = [f"{i}m" for i in len_ls]
                lap_df = pd.DataFrame(columns=len_ls)
                if "障" in race_data:
                    lap_df.loc[0] = np.nan
                else:
                    lap_time_read = pd.read_html(html.text)[5]
                    lap_time_read = lap_time_read.T.reset_index().T.reset_index(
                        drop=True
                    )

                    for i in range(lap_time_read.shape[0]):
                        if i % 2 == 1:
                            pass
                        else:
                            for j in range(lap_time_read.shape[1]):
                                if pd.isna(lap_time_read.iloc[i, j]):
                                    break
                                length = str(lap_time_read.iloc[i, j])
                                lap_df[length] = [
                                    float(lap_time_read.iloc[i + 1, j].split()[1])
                                ]
                lap_df.index = [race_id]
                lap_results[race_id] = lap_df

            # 存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError:  # 存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            # wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            # Jupyterで停止ボタンを押した時の対処
            except:
                break

        # pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])
        # odds_results_df = pd.concat([odds_results[key] for key in odds_results], ignore_index=True)
        lap_results_df = pd.concat([lap_results[key] for key in lap_results])

        return race_results_df, lap_results_df

    def preprocess(self, race_res):
        df = race_res.copy()

        df["着順"] = pd.to_numeric(df["着順"], errors="coerce")
        df.dropna(subset=["着順"], inplace=True)
        df["着順"] = df["着順"].astype(int)
        df["rank"] = df["着順"].map(lambda x: 1 if x < 6 else 0)

        # 性齢を性と年齢に分ける
        df["性"] = df["性齢"].map(lambda x: str(x)[0])
        df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

        # 馬体重を体重と体重変化に分ける
        df["体重"] = df["馬体重"].str.split("(", expand=True)[0]
        df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1]

        # errors='coerce'で、"計不"など変換できない時に欠損値にする
        df["体重"] = pd.to_numeric(df["体重"], errors="coerce")
        df["体重変化"] = pd.to_numeric(df["体重変化"], errors="coerce")

        # 単勝をfloatに変換
        df["単勝"] = df["単勝"].astype(float)
        # 距離は10の位を切り捨てる
        df["course_len"] = df["course_len"].astype(float) // 100

        def convert_time_to_seconds(time_str):
            # タイムの形式が '分:秒.ミリ秒' と仮定
            # 分と秒を分離
            minutes, seconds = time_str.split(":")
            # 分を秒に変換し、秒と合算
            return int(minutes) * 60 + float(seconds)

        # 'タイム' 列の各エントリに対して変換関数を適用
        df["タイム"] = df["タイム"].apply(convert_time_to_seconds)

        # def convert_margin_to_number(margin):
        #     if pd.isna(margin):
        #         return 0
        #     if margin == "同着":
        #         return 0
        #     if margin == "ハナ":
        #         return 0.2
        #     if margin == "アタマ":
        #         return 0.4
        #     if margin == "クビ":
        #         return 0.8
        #     if margin == "大":
        #         return 10
        #     # その他の値を数値に変換
        #     total = 0

        #     parts = margin.split(".")
        #     if len(parts) == 2:
        #         total += float(parts[0])
        #         fraction = parts[1].split("/")
        #         if len(fraction) == 2:
        #             total += float(fraction[0]) / float(fraction[1])

        #     else:
        #         fraction = margin.split("/")
        #         if len(fraction) == 2:
        #             total += float(fraction[0]) / float(fraction[1])
        #         else:
        #             total += float(margin)
        #     return total * 2.8

        # '着差' 列の各エントリに変換関数を適用し、2.8を掛ける
        # df["着差_数値"] = df["着差"].apply(convert_margin_to_number)
        # 累積合計を計算
        # df["着差累積"] = df["着差_数値"].cumsum()

        df["調教場所"] = df["調教師"].map(lambda x: str(x)[1:2])
        df["調教師名前"] = df["調教師"].map(lambda x: str(x)[3:])

        df["date"] = pd.to_datetime(df["date"].str.split("(").str[0])

        # 通過の列を分割する関数
        def split_pass_columns(pass_str):
            # 通過位置を'-'で分割し、最大4つの位置まで取得する
            if "-" not in str(pass_str):
                return [pass_str]
            parts = pass_str.split("-") + [None] * (4 - len(pass_str.split("-")))
            return parts[:4]

        # 新しい列を作成
        df[["通過1", "通過2", "通過3", "通過4"]] = df["通過"].apply(
            lambda x: pd.Series(split_pass_columns(x))
        )

        df.drop(
            ["着差", "通過", "調教師", "性齢", "馬体重", "馬名", "騎手", "人気", "着順", "調教タイム", "厩舎コメント"],
            axis=1,
            inplace=True,
        )

        return df

In [241]:
race_res = RaceResultsSP()

In [256]:
year = 2021
race_id_list = []
for place in range(1, 11, 1):
    for kai in range(1, 7, 1):
        for day in range(1, 13, 1):
            for r in range(1, 13, 1):
                race_id = (
                    str(year)
                    + str(place).zfill(2)
                    + str(kai).zfill(2)
                    + str(day).zfill(2)
                    + str(r).zfill(2)
                )
                race_id_list.append(race_id)
len(race_id_list)

8640

In [257]:
for i in range(0, len(race_id_list), 1000):
    a, b = race_res.scrape(race_id_list[i : i + 1000])
    df_a = race_res.preprocess(a)
    
    df_a.to_pickle(
        f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/Race_{year}_{i}-{i+1000}.pickle"
    )
    b.to_pickle(
        f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/Lap_{year}_{i}-{i+1000}.pickle"
    )

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


ValueError: No objects to concatenate

In [246]:
df_a = race_res.preprocess(a)
df_a.to_pickle(
    f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/Race_2023_{i}-{i+1000}.pickle"
)
b.to_pickle(
    f"/Users/KeD/Scripts/python/keiba/KeibaAI/keiba/RaceRes/Lap_2023_{i}-{i+1000}.pickle"
)

In [248]:
len(a)

3083