In [2]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
from tqdm.notebook import tqdm
import os
import random
import datetime

# ユーザーエージェントリスト
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.67",
]

def fetch_race_data(url, max_retries=3):
    retries = 0
    while retries < max_retries:
        try:
            headers = {'User-Agent': random.choice(user_agents)}
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.content
        except requests.exceptions.RequestException as e:
            print(f"Error: {str(e)[:50]}...")  # エラーメッセージ短縮表示
            retries += 1
    return None

def parse_payback_data(race_id, content):
    if not content:
        return None
    
    soup = BeautifulSoup(content, "html.parser", from_encoding="euc-jp")
    payback_tables = soup.find_all('table', class_='pay_table_01')
    
    payback_dict = {
        '単勝': [], '複勝': [], '枠連': [], '馬連': [],
        'ワイド': [], '馬単': [], '三連複': [], '三連単': []
    }
    
    for table in payback_tables:
        rows = table.find_all('tr')
        for row in rows:
            th = row.find('th')
            if not th:
                continue
                
            # 賭式判定
            bet_type = th.get('class', [''])[0]
            bet_type_map = {
                'tan': '単勝', 'fuku': '複勝', 'waku': '枠連',
                'uren': '馬連', 'wide': 'ワイド', 'utan': '馬単',
                'sanfuku': '三連複', 'santan': '三連単'
            }
            bet_type = bet_type_map.get(bet_type, '')
            if not bet_type:
                continue

            tds = row.find_all('td')
            if len(tds) < 2:
                continue

            # データ抽出処理
            numbers = [elem.text.strip() for elem in tds[0].contents if elem.text.strip()]
            payouts = [elem.text.strip().replace(',', '') for elem in tds[1].contents if elem.text.strip()]
            
            formatted_data = []
            for num, pay in zip(numbers, payouts):
                if any(c in num for c in ['-', '→']):
                    formatted_data.append(f"{num}:{pay}")
                else:
                    formatted_data.extend([num, pay])

            payback_dict[bet_type] = formatted_data

    return [
        race_id,
        payback_dict['単勝'],
        payback_dict['複勝'],
        payback_dict['枠連'],
        payback_dict['馬連'],
        payback_dict['ワイド'],
        payback_dict['馬単'],
        payback_dict['三連複'],
        payback_dict['三連単']
    ]

def process_race(url_race_id_tuple):
    url, race_id = url_race_id_tuple
    content = fetch_race_data(url)
    return parse_payback_data(race_id, content)

def scrape_payback_data(year_start, year_end):
    current_year = datetime.datetime.now().year
    if year_end > current_year:
        print(f"警告: {current_year+1}年以降のデータは存在しないため、{current_year}年までを取得します")
        year_end = current_year

    for year in range(year_start, year_end + 1):
        payback_data = []
        urls = []
        race_ids = []
        
        place_dict = {
            "01": "札幌", "02": "函館", "03": "福島", "04": "新潟", "05": "東京",
            "06": "中山", "07": "中京", "08": "京都", "09": "阪神", "10": "小倉"
        }
        
        # レースID生成ロジック
        for place_code in place_dict:
            for z in range(1, 8):  # 開催回
                for y in range(1, 14):  # 開催日
                    race_id_base = f"{year}{place_code}{z:02d}{y:02d}"
                    for x in range(1, 13):  # レース番号
                        race_id = f"{race_id_base}{x:02d}"
                        urls.append(f"https://db.netkeiba.com/race/{race_id}")
                        race_ids.append(race_id)

        with tqdm(total=len(urls), desc=f"Year {year}") as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                futures = [executor.submit(process_race, (url, race_id)) 
                          for url, race_id in zip(urls, race_ids)]
                
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result:
                        payback_data.append(result)
                    pbar.update(1)

        # データ検証
        if not payback_data:
            print(f"{year}年: 有効なデータがありませんでした")
            continue

        # DataFrame作成
        df = pd.DataFrame(payback_data, columns=[
            'race_id', 
            '単勝', 
            '複勝', 
            '枠連', 
            '馬連', 
            'ワイド', 
            '馬単', 
            '三連複', 
            '三連単'
        ])
        
        # データ保存
        output_dir = 'payback'
        os.makedirs(output_dir, exist_ok=True)
        try:
            df.to_csv(f"{output_dir}/{year}.csv", index=False, encoding='shift_jis')
            print(f"{year}年データ保存完了: {len(df)}件")
        except Exception as e:
            print(f"保存エラー({year}): {str(e)}")

# 実行例（2022-2023年のデータ取得）
scrape_payback_data(2020, 2021)

Year 2020:   0%|          | 0/10920 [00:00<?, ?it/s]

保存エラー(2020): 'shift_jis' codec can't encode character '\u0156' in position 145: illegal multibyte sequence


Year 2021:   0%|          | 0/10920 [00:00<?, ?it/s]

Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: Bad Request for url: https://db....
Error: 400 Client Error: