In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.request import urlopen
import optuna.integration.lightgbm as lgb_o
from itertools import combinations, permutations
import matplotlib.pyplot as plt


In [3]:
class Results:
    @staticmethod
    def scrape(race_id_list):

        """
        レース結果データをスクレイピングする関数
        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト
        Returns:
        ----------
        race_results_df : pandas.DataFrame
            全レース結果データをまとめてDataFrame型にしたもの
        """
        #race_idをkeyにしてDataFrame型を格納
        race_results = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                # スクレイピング
                html = requests.get(url)
                html.encoding = "EUC-JP"
                # メインとなるテーブルデータを取得
                df = pd.read_html(html.text)[0]
                # 列名に半角スペースがあれば除去する
                df = df.rename(columns=lambda x: x.replace(' ', ''))
                # 天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
                soup = BeautifulSoup(html.text, "html.parser")
                texts = (
                    soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                    + soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
                )
                info = re.findall(r'\w+', texts)
                for text in info:
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[-1])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)
                #馬ID、騎手IDをスクレイピング
                horse_id_list = []
                horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                for a in horse_a_list:
                    horse_id = re.findall(r"\d+", a["href"])
                    horse_id_list.append(horse_id[0])
                jockey_id_list = []
                jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                for a in jockey_a_list:
                    jockey_id = re.findall(r"\d+", a["href"])
                    jockey_id_list.append(jockey_id[0])
                df["horse_id"] = horse_id_list
                df["jockey_id"] = jockey_id_list
                #インデックスをrace_idにする
                df.index = [race_id] * len(df)
                race_results[race_id] = df
            #存在しないrace_idを飛ばす
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            #wifiの接続が切れた時などでも途中までのデータを返せるようにする
            except Exception as e:
                print(e)
                break
            #Jupyterで停止ボタンを押した時の対処
            except:
                break
        #pd.DataFrame型にして一つのデータにまとめる
        race_results_df = pd.concat([race_results[key] for key in race_results])
        return race_results_df

In [4]:
class HorseResults:
    @staticmethod
    def scrape(horse_id_list):
        """
        馬の過去成績データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            全馬の過去成績データをまとめてDataFrame型にしたもの
        """

        #horse_idをkeyにしてDataFrame型を格納
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(0.5)
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                html = requests.get(url)
                html.encoding = "EUC-JP"
                df = pd.read_html(html.text)[3]
                soup = BeautifulSoup(html.text, "html.parser")
                race_a_list = []
                races_data = soup.find("table", attrs={"class": "db_h_race_results nk_tb_common"}).find('tbody').find_all('tr')
                for i in range(len(races_data)):
                    a_to_add = races_data[i].find('a', attrs={'href': re.compile(r'^/race/\d+/$')})
                    if a_to_add == None:
                        a_to_add = races_data[i].find('a', attrs={'title': re.compile(r'.*')})
                    race_a_list.append(a_to_add)
                race_id_list = []
                for a in race_a_list:
                    if a==None:
                        print(horse_id, 'a==none')
                        race_id_list.append(np.nan)
                        continue
                    race_id = re.findall(r"\d+", a["href"])
                    race_id_list.append(race_id[0])

                df['race_id'] = pd.Series(race_id_list)
                #受賞歴がある馬の場合、3番目に受賞歴テーブルが来るため、4番目のデータを取得する
                if df.columns[0]=='受賞歴':
                    df = pd.read_html(url)[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる        
        horse_results_df = pd.concat([horse_results[key] for key in horse_results])

        return horse_results_df
    

In [4]:
class Peds:
    @staticmethod
    def scrape(horse_id_list):
        """
        血統データをスクレイピングする関数

        Parameters:
        ----------
        horse_id_list : list
            馬IDのリスト

        Returns:
        ----------
        peds_df : pandas.DataFrame
            全血統データをまとめてDataFrame型にしたもの
        """

        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                #重複を削除して1列のSeries型データに直す
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
            except IndexError:
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #列名をpeds_0, ..., peds_61にする
        peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')

        return peds_df

In [6]:
class Return:
    @staticmethod
    def scrape(race_id_list):
        """
        払い戻し表データをスクレイピングする関数

        Parameters:
        ----------
        race_id_list : list
            レースIDのリスト

        Returns:
        ----------
        return_tables_df : pandas.DataFrame
            全払い戻し表データをまとめてDataFrame型にしたもの
        """

        return_tables = {}
        for race_id in tqdm(race_id_list):
            time.sleep(1)
            try:
                url = "https://db.netkeiba.com/race/" + race_id

                #普通にスクレイピングすると複勝やワイドなどが区切られないで繋がってしまう。
                #そのため、改行コードを文字列brに変換して後でsplitする
                f = urlopen(url)
                html = f.read()
                html = html.replace(b'<br />', b'br')
                dfs = pd.read_html(html)

                #dfsの1番目に単勝〜馬連、2番目にワイド〜三連単がある
                df = pd.concat([dfs[1], dfs[2]])

                df.index = [race_id] * len(df)
                return_tables[race_id] = df
            except IndexError:
                continue
            except AttributeError: #存在しないrace_idでAttributeErrorになるページもあるので追加
                continue
            except Exception as e:
                print(e)
                break
            except:
                break

        #pd.DataFrame型にして一つのデータにまとめる
        return_tables_df = pd.concat([return_tables[key] for key in return_tables])
        return return_tables_df

In [5]:
def update_data(old, new):
    """
    Parameters:
    ----------
    old : pandas.DataFrame
        古いデータ
    new : pandas.DataFrame
        新しいデータ
    """

    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old, new])

In [14]:
horse_res = pd.read_pickle('beta/horse_result.pickle')
horse_id_list = horse_res.index.unique()
horse_id = horse_id_list[0]

url = 'https://db.netkeiba.com/horse/' + '2014106058'
html = requests.get(url)
html.encoding = "EUC-JP"
df = pd.read_html(html.text)[3]

In [35]:
soup = BeautifulSoup(html.text, "html.parser")
race_a_list = []
races_data = soup.find("table", attrs={"class": "db_h_race_results nk_tb_common"}).find('tbody').find_all('tr')
for i in range(len(races_data)):
    race_a_list.append(races_data[i].find('a', attrs={'href': re.compile(r'^/race/\d+/$')}))
race_id_list = []
for a in race_a_list:
      race_id = re.findall(r"\d+", a["href"])
      race_id_list.append(race_id[0])

df['race_id'] = pd.Series(race_id_list)
df

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金,race_id
0,2022/01/04,園田,晴,5,C2四,,10,5,5,2.9,...,**,7-6,0.0-39.0,,520(-3),,,デライーガー,,202250010405
1,2021/12/16,園田,曇,6,C2三,,10,3,3,3.1,...,**,5-5-3-4,0.0-39.2,39.6,523(0),,,シゲルジルコン,15.0,202150121606
2,2021/11/13,高知,晴,5,徳島県うずしお特別,,11,7,9,4.5,...,**,6-5-5-1,0.0-41.3,41.2,523(0),,,ミラクルヒッター,21.0,202154111305
3,2021/10/30,高知,晴,10,教養C新館完成特別,,11,6,7,4.4,...,**,8-6-6-5,0.0-40.0,39.3,523(0),,,アポロダーウィン,9.0,202154103010
4,2021/10/16,高知,曇,10,目指せジョッキー特別,,12,4,4,3.4,...,**,10-10-10-4,0.0-39.6,39.3,523(+1),,,ケイアイマボラ,9.0,202154101610
5,2021/10/02,高知,晴,10,大西輝門復帰待望特別,,12,2,2,1.6,...,**,9-9-7-5,0.0-40.7,41.5,522(+2),,,セイウンデルレイ,6.0,202154100210
6,2021/09/19,高知,晴,11,寒風山特別,,11,6,6,14.6,...,**,8-8-9-6,0.0-39.8,39.0,520(-1),,,イノバティブ,24.0,202154091911
7,2021/09/05,高知,曇,11,住吉池特別,,11,2,2,7.9,...,**,8-6-6-3,0.0-40.1,39.8,521(+4),,,サンライズフォルテ,24.0,202154090511
8,2021/08/21,高知,雨,11,Bー2,,11,8,12,5.3,...,**,9-9-8-5,0.0-38.7,38.2,517(+1),,,ペイシャクレア,35.0,202154082111
9,2021/07/31,高知,晴,5,エピカリス賞,,10,3,3,113.0,...,**,9-9-9-5,0.0-42.6,42.8,516(-1),,,クラウンシャイン,,202154073105


In [7]:
horse_res = pd.read_pickle('beta/horse_result.pickle')
horse_id_list = horse_res.index.unique()

for i in range(6000, len(horse_id_list), 1000):
    horse_res = HorseResults.scrape(horse_id_list[i:i+1000])
    df_prev = pd.read_pickle(f'beta/horse_data/horse_result{i}.pickle')
    horse_res.to_pickle(f'beta/horse_data_raceID_added_ver2/horse_result{i}.pickle')
    


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

In [2]:
horse_data_raceID_added_ver2 = pd.read_pickle('beta/horse_data_raceID_added_ver2/horse_result0.pickle')
for i in range(1000, 37000, 1000):
    horse_data_raceID_added_ver2_temp = pd.read_pickle(f'beta/horse_data_raceID_added_ver2/horse_result{i}.pickle')
    horse_data_raceID_added_ver2 = pd.concat([horse_data_raceID_added_ver2, horse_data_raceID_added_ver2_temp])
horse_data_raceID_added_ver2.to_pickle('horse_result_raceID_added.pickle')

In [17]:
horse_result = pd.read_pickle('Ver1.0/horse_result.pickle')
len(horse_result)

FileNotFoundError: [Errno 2] No such file or directory: 'Ver1.0/horse_result.pickle'

In [13]:
merged = horse_data_raceID_added_ver2.merge(horse_result, how='outer', indicator=True)

only_in_horse_data_raceID_added_ver2 = merged.loc[merged['_merge'] == 'left_only']
only_in_horse_result = merged.loc[merged['_merge'] == 'right_only']
only_in_horse_data_raceID_added_ver2 

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,オッズ,人気,着順,斤量,馬場,馬場指数,ﾀｲﾑ指数,厩舎ｺﾒﾝﾄ,勝ち馬(2着馬),_merge
2941,2023/05/30,高知,雨,4.0,C3ー14,,9.0,8.0,8,20.1,...,,,,,,,,,,left_only
6695,2023/05/28,高知,曇,11.0,丸塚池特別,,9.0,7.0,7,60.2,...,,,,,,,,,,left_only
6918,2023/05/30,高知,雨,8.0,Aー3,,10.0,3.0,3,6.3,...,,,,,,,,,,left_only
7114,2023/05/25,園田,曇,2.0,C3一,,12.0,1.0,1,33.0,...,,,,,,,,,,left_only
7599,2023/06/01,浦和,晴,11.0,’23武蔵国OP(OP),,8.0,4.0,4,22.3,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782830,2023/05/27,1京都11,晴,5.0,3歳未勝利,,13.0,4.0,5,3.6,...,,,,,,,,,,left_only
782834,2023/05/27,1京都11,晴,5.0,3歳未勝利,,13.0,7.0,10,18.6,...,,,,,,,,,,left_only
782866,2023/05/26,園田,曇,9.0,3歳AB,,9.0,3.0,3,16.9,...,,,,,,,,,,left_only
782906,2023/05/27,1京都11,晴,4.0,3歳未勝利,,18.0,6.0,12,243.5,...,,,,,,,,,,left_only


In [18]:
only_in_horse_data_raceID_added_ver2.to_pickle('horse_result_raceID_added.pickle')

In [None]:
import pandas as pd
horse_res = pd.read_pickle('beta/horse_result.pickle')
horse_id_list = horse_res.index.unique()

horse_id_list[6357]
horse_res = HorseResults.scrape(horse_id_list[6355:6360])
    # horse_res.to_pickle('beta/horse_data_raceID_added/horse_result.pickle')

In [11]:
df = pd.read_pickle('beta/horse_data_raceID_added/horse_result0.pickle')
for i in range(10000, 37000, 1000):
    d = pd.read_pickle(f'beta/horse_data_raceID_added/horse_result{i}.pickle')
    df = pd.concat([df, d])
df.columns = df.columns.str.replace(' ', '')
df
        


Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,...,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金,race_id
2015100713,2020/11/21,5阪神5,晴,11.0,アンドロメダS(L),,15.0,1.0,1,68.8,...,**,12-11-11-12,35.4-35.5,35.5,428(-6),,,アドマイヤビルゴ,,202009050511
2015100713,2020/10/24,4新潟5,雨,11.0,新潟牝馬S(OP),,13.0,6.0,8,12.1,...,**,6-7-9-7,36.3-35.6,37.0,434(+6),,,ウラヌスチャーム,,202004040511
2015100713,2020/09/06,3新潟8,晴,11.0,新潟記念(G3),,18.0,7.0,15,75.8,...,**,4-5,36.8-33.1,33.5,428(+4),,,ブラヴァス,,202004030811
2015100713,2020/08/02,1札幌4,晴,11.0,クイーンS(G3),,14.0,6.0,10,16.8,...,**,10-10-10-11,34.8-36.0,35.1,424(+4),,,レッドアネモス,,202001010411
2015100713,2020/06/14,3阪神4,曇,11.0,マーメイドS(G3),,16.0,4.0,8,10.1,...,**,10-10-8-8,36.3-36.5,36.8,420(-4),,,サマーセント,,202009030411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020100674,2023/02/05,1中京12,晴,6.0,3歳新馬,,15.0,5.0,8,20.3,...,**,9-10-10,37.4-34.5,34.2,400(0),,,タイキバルドル,62.0,202307011206
2020106815,2023/05/14,1京都8,曇,4.0,3歳未勝利,,10.0,5.0,5,27.1,...,**,10-9-9-9,35.8-37.7,37.1,464(+4),,,ニホンピロアリー,,202308010804
2020106815,2023/02/26,1阪神6,曇,5.0,3歳未勝利,,17.0,7.0,14,75.0,...,**,13-13,36.5-35.0,34.8,460(-2),,,ルクスドヌーヴ,,202309010605
2020106815,2023/02/12,1阪神2,晴,6.0,3歳未勝利,,12.0,1.0,1,23.5,...,**,2-1-1-1,35.3-36.6,37.6,462(-2),,,レシプロシティ,,202309010206


In [15]:
for i in range(0, 37000, 1000):
    d1 = pd.read_pickle(f'beta/horse_data/horse_result{i}.pickle')
    d2 = pd.read_pickle(f'beta/horse_data_raceID_added/horse_result{i}.pickle')
    if len(d2) == len(d1):
        print(i)

In [4]:
df = pd.read_pickle('beta/horse_data_raceID_added/horse_result0.pickle')
for i in range(10000, 37000, 1000):
    d = pd.read_pickle(f'beta/horse_data_raceID_added/horse_result{i}.pickle')
    df = pd.concat([df, d])
df.columns = df.columns.str.replace(' ', '')

df2 = pd.read_pickle('beta/horse_result.pickle')
df = df.reset_index()
df2 = df2.reset_index()
df2 = pd.merge(df2, df, on=list(df2.columns), how='left')


Unnamed: 0,index,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,...,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金,race_id
0,2015100713,2020/11/21,5阪神5,晴,11.0,アンドロメダS(L),,15.0,1.0,1,...,**,12-11-11-12,35.4-35.5,35.5,428(-6),,,アドマイヤビルゴ,,202009050511
1,2015100713,2020/10/24,4新潟5,雨,11.0,新潟牝馬S(OP),,13.0,6.0,8,...,**,6-7-9-7,36.3-35.6,37.0,434(+6),,,ウラヌスチャーム,,202004040511
2,2015100713,2020/09/06,3新潟8,晴,11.0,新潟記念(G3),,18.0,7.0,15,...,**,4-5,36.8-33.1,33.5,428(+4),,,ブラヴァス,,202004030811
3,2015100713,2020/08/02,1札幌4,晴,11.0,クイーンS(G3),,14.0,6.0,10,...,**,10-10-10-11,34.8-36.0,35.1,424(+4),,,レッドアネモス,,202001010411
4,2015100713,2020/06/14,3阪神4,曇,11.0,マーメイドS(G3),,16.0,4.0,8,...,**,10-10-8-8,36.3-36.5,36.8,420(-4),,,サマーセント,,202009030411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780402,2020100674,2023/02/05,1中京12,晴,6.0,3歳新馬,,15.0,5.0,8,...,**,9-10-10,37.4-34.5,34.2,400(0),,,タイキバルドル,62.0,202307011206
780403,2020106815,2023/05/14,1京都8,曇,4.0,3歳未勝利,,10.0,5.0,5,...,**,10-9-9-9,35.8-37.7,37.1,464(+4),,,ニホンピロアリー,,202308010804
780404,2020106815,2023/02/26,1阪神6,曇,5.0,3歳未勝利,,17.0,7.0,14,...,**,13-13,36.5-35.0,34.8,460(-2),,,ルクスドヌーヴ,,202309010605
780405,2020106815,2023/02/12,1阪神2,晴,6.0,3歳未勝利,,12.0,1.0,1,...,**,2-1-1-1,35.3-36.6,37.6,462(-2),,,レシプロシティ,,202309010206


In [20]:

horse_id = '2015106464'
url = 'https://db.netkeiba.com/horse/' + horse_id
html = requests.get(url)
html.encoding = "EUC-JP"
df = pd.read_html(html.text)[3]
soup = BeautifulSoup(html.text, "html.parser")
race_a_list = []
races_data = soup.find("table", attrs={"class": "db_h_race_results nk_tb_common"}).find('tbody').find_all('tr')
for i in range(len(races_data)):
    a_to_add = races_data[i].find('a', attrs={'href': re.compile(r'^/race/\d+/$')})
    if a_to_add == None:
        a_to_add = races_data[i].find('a', attrs={'title': re.compile(r'.*')})
    race_a_list.append(a_to_add)
race_id_list = []
for a in race_a_list:
    if a==None:
        print(horse_id, 'a==none')
        race_id_list.append(np.nan)
        continue
    race_id = re.findall(r"\d+", a["href"])
    race_id_list.append(race_id[0])

for i in race_id_list:
    if i==np.nan: print(i)


In [5]:
ls = df2[df2['race_id'].isnull()]['index'].unique()

for i in range(0, len(ls), 1000):
    horse_res = HorseResults.scrape(ls[i:i+1000])
    # horse_res.to_pickle(f'beta/horse_data_raceID_added/horse_result_supplement{i}.pickle')

  0%|          | 0/1000 [00:00<?, ?it/s]

2015101621 a==none
2014105785 a==none
2013102955 a==none
2013102955 a==none
2014105304 a==none
2015104624 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2014106191 a==none
2013101999 a==none
2013101999 a==none
2010100855 a==none


KeyboardInterrupt: 

In [17]:
url = 'https://db.netkeiba.com/horse/'+horse_id_list[43]
html = requests.get(url)
html.encoding = "EUC-JP"
df = pd.read_html(html.text)[3]
soup = BeautifulSoup(html.text, "html.parser")
race_a_list = []
races_data = soup.find("table", attrs={"class": "db_h_race_results nk_tb_common"}).find('tbody').find_all('tr')
for i in range(len(races_data)):
    race_a_list.append(races_data[i].find('a', attrs={'href': re.compile(r'^/race/\d+/$')}))
race_id_list = []
for a in race_a_list:
      if a==None:
           race_id_list.append(np.nan)
           continue
      race_id = re.findall(r"\d+", a["href"])
      race_id_list.append(race_id)

df['race_id'] = pd.Series(race_id_list)
df

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金,race_id
0,2020/12/13,シャティ,,,香港スプリント(G1),,14,,6,8.9,...,,,,,計不,,,,,
1,2020/05/16,2東京7,雨,11.0,京王杯スプリングC(G2),,13,7.0,10,3.5,...,**,4-5,35.2-33.1,33.3,516(+2),,,ダノンスマッシュ,,[202005020711]
2,2020/03/29,1中京8,晴,11.0,高松宮記念(G1),,18,5.0,9,3.8,...,**,8-9,34.2-34.5,34.5,514(-6),,,モズスーパーフレア,,[202007010811]
3,2020/03/07,2中山3,曇,11.0,夕刊フジオーシャンS(G3),,16,1.0,1,2.5,...,**,9-6,33.1-34.3,34.4,520(+6),,,ダノンスマッシュ,1008.0,[202006020311]
4,2019/09/29,4中山9,曇,11.0,スプリンターズS(G1),,16,4.0,8,2.9,...,**,11-8,32.8-34.3,33.5,514(-2),,,(モズスーパーフレア),11344.4,[201906040911]
5,2019/09/08,4阪神2,晴,11.0,産経賞セントウルS(G2),,13,5.0,7,2.7,...,**,7-7,33.0-33.7,33.2,516(-4),,,(ファンタジスト),5993.8,[201909040211]
6,2019/08/25,2札幌4,晴,11.0,キーンランドC(G3),,16,4.0,7,4.7,...,**,12-12,33.2-36.0,34.9,520(+2),,,ダノンスマッシュ,1616.2,[201901020411]
7,2019/06/16,1函館2,小雨,11.0,函館スプリントS(G3),,7,8.0,13,1.8,...,**,5-5,34.4-34.0,33.5,518(0),,,カイザーメランジェ,986.7,[201902010211]
8,2019/05/11,2東京7,晴,11.0,京王杯スプリングC(G2),,15,5.0,9,3.6,...,**,7-6,34.2-33.9,33.1,518(-8),,,(リナーテ),6013.4,[201905020711]
9,2019/02/03,1東京4,晴,11.0,東京新聞杯(G3),,15,3.0,5,3.6,...,**,5-5,34.5-34.7,34.2,526(+10),,,インディチャンプ,390.0,[201905010411]


In [10]:
horse_peds = pd.DataFrame()
for i in range(0, len(horse_id_list), 1000):
    peds_temp = pd.read_pickle('horse_peds'+str(i)+'.pickle')
    horse_peds = pd.concat([horse_peds, peds_temp])
horse_peds.to_pickle('horse_peds.pickle')
horse_peds.to_csv('horse_peds.csv')

In [37]:
for i in range(0, len(horse_id_list), 1000):
    d = pd.read_pickle('horse_result'+str(i)+'.pickle')
    if len(d.index.unique()) != 1000: print(i, len(d.index.unique()))

36000 118


In [55]:
# soup = BeautifulSoup(html.content, 'html.parser')
# soup.find('table', attrs={'class': 'blood_table'}).find_all('a')
df = pd.read_html(url)[0]
peds_dict = {}
horse_id = '2015100713'
#重複を削除して1列のSeries型データに直す
generations = {}
for i in reversed(range(5)):
    generations[i] = df[i]
    df.drop([i], axis=1, inplace=True)
    df = df.drop_duplicates()
ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

peds_dict[horse_id] = ped.reset_index(drop=True)

peds_dict
peds_df = pd.concat([peds_dict[key] for key in peds_dict], axis=1).T.add_prefix('peds_')
peds_df

Unnamed: 0,peds_0,peds_1,peds_2,peds_3,peds_4,peds_5,peds_6,peds_7,peds_8,peds_9,...,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
2015100713,ステイゴールド 1994 黒鹿毛 [血統][産駒] Halo系,ピノブラン 2007 芦毛 [血統][産駒] FNo.[2-n],サンデーサイレンス Sunday Silence(米) 1986 青鹿毛 [血統][産駒],ゴールデンサッシュ 1988 栗毛 [血統][産駒],クロフネ 1998 芦毛 [血統][産駒],フェートデュヴァン 2000 鹿毛 [血統][産駒],Halo 1969 黒鹿毛 [血統][産駒],Wishing Well 1975 鹿毛 [血統][産駒],ディクタス Dictus(仏) 1967 栗毛 [血統][産駒],ダイナサッシュ 1979 鹿毛 [血統][産駒],...,Icecapade,コレラ,メジロアサマ,シエリル,リマンド,メジロアイリス,Northern Dancer,Flaming Page,Riva Ridge,Gliding By


In [13]:
r = pd.read_pickle('beta/race.pickle')
r[r['horse_id']=='2015101621']

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,course_len,weather,race_type,ground_state,date,horse_id,jockey_id
201701010105,1,8,8,タワーオブロンドン,牡2,54.0,ルメール,1:30.4,,4.0,2.0,516(0),[東] 藤沢和雄,1500,曇,芝,良,2017年7月29日,2015101621,5339
201701020209,2,8,10,タワーオブロンドン,牡2,54.0,ルメール,1:30.9,3/4,1.7,1.0,522(+6),[東] 藤沢和雄,1500,晴,芝,良,2017年8月20日,2015101621,5339
201705050111,1,1,1,タワーオブロンドン,牡2,55.0,ルメール,1:21.9,,1.8,1.0,516(+4),[東] 藤沢和雄,1400,曇,芝,良,2017年11月4日,2015101621,5339
201709040609,1,8,11,タワーオブロンドン,牡2,54.0,ルメール,1:21.7,,1.8,1.0,512(-10),[東] 藤沢和雄,1400,曇,芝,良,2017年9月23日,2015101621,5339
201709050611,3,2,3,タワーオブロンドン,牡2,55.0,ルメール,1:33.9,クビ,3.9,2.0,518(+2),[東] 藤沢和雄,1600,晴,芝,良,2017年12月17日,2015101621,5339
201805020611,12,4,7,タワーオブロンドン,牡3,57.0,ルメール,1:33.8,クビ,2.6,1.0,510(0),[東] 藤沢和雄,1600,晴,芝,良,2018年5月6日,2015101621,5339
201805050711,2,6,9,タワーオブロンドン,牡3,56.0,ビュイッ,1:32.6,クビ,3.9,2.0,516(+6),[東] 藤沢和雄,1600,晴,芝,良,2018年11月24日,2015101621,5495
201809020711,1,5,6,タワーオブロンドン,牡3,56.0,ルメール,1:33.4,,3.1,1.0,510(-8),[東] 藤沢和雄,1600,雨,芝,良,2018年4月14日,2015101621,5339
201901020411,2,4,7,タワーオブロンドン,牡4,58.0,ルメール,1:09.3,3/4,4.7,2.0,520(+2),[東] 藤沢和雄,1200,晴,芝,稍重,2019年8月25日,2015101621,5339
201902010211,3,8,13,タワーオブロンドン,牡4,58.0,レーン,1:08.6,クビ,1.8,1.0,518(0),[東] 藤沢和雄,1200,小雨,芝,稍重,2019年6月16日,2015101621,5585
