In [8]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
from tqdm.notebook import tqdm
import time
import os
import random

# Scrape every race from the first year through the last year (inclusive).
# First year to fetch
year_start = 2022
# Last year to fetch
year_end = 2023


# Two-digit venue code (as embedded in the race id) -> venue name.
place_dict = {
    "01": "札幌",
    "02": "函館",
    "03": "福島",
    "04": "新潟",
    "05": "東京",
    "06": "中山",
    "07": "中京",
    "08": "京都",
    "09": "阪神",
    "10": "小倉",
}

# Pool of User-Agent strings; one is picked at random for each request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.67",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
]

def fetch_race_data(url, max_retries=3, timeout=10):
    """Download a page and return its raw body bytes.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled connection would block the
            calling worker thread indefinitely.

    Returns:
        The response body (``bytes``) of the first successful attempt, or
        ``None`` when every attempt raised a ``requests`` exception
        (including a non-2xx status via ``raise_for_status``).
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Pick a random User-Agent for every attempt.
            headers = {'User-Agent': random.choice(user_agents)}
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            return r.content
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            if attempt == max_retries:
                # No further attempt follows, so sleeping would only waste time.
                break
            # Random pause before the next attempt.
            wait_time = random.uniform(2, 5)
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    return None

def _parse_course_span(span):
    """Split one header <span> into (surface, direction, distance, going, weather).

    The span's string form is cut on "/" and ":" at fixed positions; an
    ``IndexError`` is raised when the span does not have that layout.
    """
    segments = str(span).split("/")
    course = segments[0].split(">")[1]
    sur = course[0]
    rou = course[1]
    dis = course.split("m")[0][-4:]
    con = segments[2].split(":")[1][1]
    wed = segments[1].split(":")[1][1]
    return sur, rou, dis, con, wed


def _parse_body_weight(text):
    """Parse a "weight(diff)" cell into two ints; (0, 0) when it is not parseable."""
    try:
        parts = text.split("(")
        return int(parts[0]), int(parts[1].replace(")", ""))
    except (ValueError, IndexError):
        return 0, 0


def parse_race_data(race_id, content):
    """Parse one race result page into a list of per-horse rows.

    Args:
        race_id: Race identifier string. Characters 4-5 are used as the
            venue code (the ids are built as year + venue code + meeting +
            day + race number).
        content: Raw page bytes, or ``None`` when the download failed.

    Returns:
        A list with one 28-element list per horse, in the column order the
        caller turns into a DataFrame. Returns ``[]`` when ``content`` is
        ``None``, when the result table is missing, or when the table has
        no data rows.

    Raises:
        IndexError: If the race header (course span, "smalltxt" paragraph,
            second <h1>) does not have the expected layout.
    """
    if content is None:
        return []
    soup = BeautifulSoup(content, "html.parser", from_encoding="euc-jp")
    soup_span = soup.find_all("span")
    main_table = soup.find("table", {"class": "race_table_01 nk_tb_common"})
    if not main_table:
        print('continue: ' + race_id)
        return []

    rows = main_table.find_all("tr")[1:]  # skip the header row
    if not rows:
        return []

    # Race-level details are identical for every row, so parse them once.
    # The course span sits at index 8, 7 or 6 depending on the page; try
    # them in that order and let the last failure propagate.
    for idx in (8, 7, 6):
        try:
            sur, rou, dis, con, wed = _parse_course_span(soup_span[idx])
            break
        except IndexError:
            if idx == 6:
                raise

    soup_smalltxt = soup.find_all("p", class_="smalltxt")
    header_fields = str(soup_smalltxt).split(">")[1].split(" ")
    date = header_fields[0]
    detail = header_fields[1]
    clas = header_fields[2].replace(u'\xa0', u' ').split(" ")[0]
    title = str(soup.find_all("h1")[1]).split(">")[1].split("<")[0]

    # Derive the venue from the race id itself. Reading module-level loop
    # variables here would tag every row with whatever venue the URL-building
    # loop finished on, because parsing runs after that loop has completed.
    place_code = race_id[4:6]
    place = place_dict.get(place_code, '')

    race_data = []
    for row in rows:
        cols = row.find_all("td")
        # Column 20 is the highest index read below; skip rows that are
        # too short to be a result row instead of raising IndexError.
        if len(cols) < 21:
            continue

        # Body weight and its change
        weight, weight_dif = _parse_body_weight(cols[14].text.strip())

        # Trainer name (text of the link inside the cell, if any)
        trainer_link = cols[18].find('a')
        trainer_name = trainer_link.text.strip() if trainer_link else ''

        # Sex is the first character; the remainder is the age, which may
        # have more than one digit.
        sex_age = cols[4].text.strip()

        race_data.append([
            race_id,
            cols[3].text.strip(),  # horse name
            cols[6].text.strip(),  # jockey name
            cols[2].text.strip(),  # horse number
            trainer_name,  # trainer
            cols[7].text.strip(),  # finishing time
            cols[12].text.strip(),  # odds
            cols[10].text.strip(),  # passing order
            cols[0].text.strip(),  # finishing position
            weight,  # body weight
            weight_dif,  # body weight change
            sex_age[:1],  # sex
            sex_age[1:],  # age
            cols[5].text.strip(),  # carried weight
            cols[20].text.strip(),  # prize money
            cols[11].text.strip(),  # last-section time
            cols[13].text.strip(),  # popularity rank
            title,  # race name
            date,  # date
            detail,
            clas,  # class
            sur,  # turf or dirt
            dis,  # distance
            rou,  # direction
            con,  # track condition
            wed,  # weather
            place_code,  # venue id
            place,  # venue name
        ])
    return race_data

def process_race(url_race_id_tuple):
    """Download and parse a single race.

    Takes a ``(url, race_id)`` pair, fetches the page at ``url`` and
    returns the rows parsed from it for ``race_id``.
    """
    page_url, rid = url_race_id_tuple
    return parse_race_data(rid, fetch_race_data(page_url))

# Driver: for each year, enumerate every candidate race id, scrape the pages
# on a small thread pool, and write one CSV per year under ./data.
total_years = year_end - year_start + 1

with tqdm(total=total_years, desc="Total Progress", position=0, leave=True) as pbar_total:
    for year in range(year_start, year_end + 1):
        race_data_all = []
        urls = []
        race_ids = []

        # NOTE: keep the loop variable names `place_code` / `place`; they are
        # module-level names and other code in this file may read them.
        for place_code, place in place_dict.items():
            for z in range(1, 8):  # meeting number: 1 through 7
                for y in range(1, 14):  # day within the meeting: 1 through 13
                    race_id_base = f"{year}{place_code}{z:02d}{y:02d}"
                    for x in range(1, 13):  # race number: 1R through 12R
                        race_id = f"{race_id_base}{x:02d}"
                        url = f"https://db.netkeiba.com/race/{race_id}"
                        urls.append(url)
                        race_ids.append(race_id)

        with tqdm(total=len(urls), desc=f"Year {year}", position=1, leave=True) as pbar_year:
            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                future_to_url = {executor.submit(process_race, url_race_id): url_race_id for url_race_id in zip(urls, race_ids)}
                for future in concurrent.futures.as_completed(future_to_url):
                    try:
                        result = future.result()
                    except Exception as exc:
                        # One malformed page must not abort the whole year:
                        # report it and carry on with the remaining races.
                        _, failed_race_id = future_to_url[future]
                        print(f"failed: {failed_race_id}: {exc!r}")
                        result = []
                    race_data_all.extend(result)
                    pbar_year.update(1)

        # Convert the scraped rows into a pandas DataFrame
        df = pd.DataFrame(race_data_all, columns=[
            'race_id', '馬', '騎手', '馬番', '調教師', '走破時間', 'オッズ', '通過順', '着順', '体重', '体重変化',
            '性', '齢', '斤量', '賞金', '上がり', '人気', 'レース名', '日付', '開催', 'クラス',
            '芝・ダート', '距離', '回り', '馬場', '天気', '場id', '場名'
        ])
        # Number of rows (runners) per race_id
        headcount_series = df.groupby('race_id')['race_id'].transform('count')

        # Insert the runner count right after the 'race_id' column
        race_id_index = df.columns.get_loc('race_id') + 1
        df.insert(race_id_index, '出走頭数', headcount_series)

        # Stringify every cell before encoding to Shift-JIS
        df = df.apply(lambda col: col.map(lambda x: x if isinstance(x, str) else str(x)).fillna(''))

        # Save the resulting DataFrame as a CSV file
        output_dir = 'data'
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{year}.csv')
        df.to_csv(output_path, index=False, encoding="SHIFT-JIS", errors="replace")

        print(f"{year}年のデータを保存しました: {output_path}")

        pbar_total.update(1)

print("終了")

Total Progress:   0%|          | 0/2 [00:00<?, ?it/s]

Year 2022:   0%|          | 0/10920 [00:00<?, ?it/s]

continue: 202201010102
continue: 202201010208
continue: 202201010309
continue: 202201010312
continue: 202201010408
continue: 202201010407
continue: 202201010504
continue: 202201010507
continue: 202201010512
continue: 202201010605
continue: 202201010701
continue: 202201010612
continue: 202201010702
continue: 202201010703
continue: 202201010705
continue: 202201010706
continue: 202201010704
continue: 202201010709
continue: 202201010708
continue: 202201010707
continue: 202201010710
continue: 202201010801
continue: 202201010711
continue: 202201010802
continue: 202201010803
continue: 202201010712
continue: 202201010805
continue: 202201010806
continue: 202201010804
continue: 202201010808
continue: 202201010809
continue: 202201010810
continue: 202201010807
continue: 202201010812
continue: 202201010901
continue: 202201010811
continue: 202201010904
continue: 202201010902
continue: 202201010903
continue: 202201010905
continue: 202201010906
continue: 202201010909
continue: 202201010908
continue: 2