In [2]:
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
from tqdm.notebook import tqdm
import time
import os
import random
from functools import lru_cache

# 取得期間の設定
year_start = 2024
year_end = 2025

place_dict = {"01": "札幌", "02": "函館", "03": "福島", "04": "新潟", "05": "東京",
              "06": "中山", "07": "中京", "08": "京都", "09": "阪神", "10": "小倉"}

user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # 他のユーザーエージェントはそのまま
]

# キャッシュで404エラーを記録
failed_urls = set()

def fetch_race_data(url, max_retries=3):
    if url in failed_urls:
        return None

    retries = 0
    while retries < max_retries:
        try:
            headers = {'User-Agent': random.choice(user_agents)}
            with requests.Session() as session:
                r = session.get(url, headers=headers, timeout=10)
                if r.status_code == 404:
                    failed_urls.add(url)
                    return None
                r.raise_for_status()
                return r.content
        except requests.exceptions.RequestException as e:
            retries += 1
            wait = min(2 ** retries, 10) + random.random()
            time.sleep(wait)
    return None

def parse_race_data(race_id, content):
    # 前処理の改善
    if not content:
        return []
    
    soup = BeautifulSoup(content, "lxml")
    main_table = soup.find("table", {"class": "race_table_01 nk_tb_common"})
    if not main_table:
        return []

    # レース詳細情報の取得を効率化
    details = extract_race_details(soup)
    race_data = process_table_rows(main_table, race_id, details)
    
    return race_data

@lru_cache(maxsize=None)
def extract_race_details(soup):
    # 詳細情報抽出ロジックを最適化
    details = {}
    var = soup.find("div", class_="racedata")
    if var:
        items = var.text.split("/")
        if len(items) >= 3:
            details.update({
                'sur': items[0][0] if items[0] else '',
                'dis': items[0][1:-1] if items[0] else '',
                'wed': items[1].split(":")[-1].strip() if len(items) > 1 else '',
                'con': items[2].split(":")[-1].strip() if len(items) > 2 else ''
            })

    soup_smalltxt = soup.find("p", class_="smalltxt")
    if soup_smalltxt:
        date_info = soup_smalltxt.text.split()
        details.update({
            'date': date_info[0] if date_info else '',
            'clas': date_info[2] if len(date_info) > 2 else ''
        })
    
    return details

def process_table_rows(table, race_id, details):
    race_data = []
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if len(cols) < 19:
            continue

        # データ抽出ロジックの最適化
        try:
            runtime = cols[7].text.strip()
            weight_info = cols[14].text.strip()
            weight, weight_dif = parse_weight(weight_info)
            trainer = cols[18].find('a').text.strip() if cols[18].find('a') else ''

            entry = [
                race_id,
                cols[3].text.strip(),
                cols[6].text.strip(),
                cols[2].text.strip(),
                trainer,
                runtime,
                # 他のカラム...
                details.get('sur', ''),
                details.get('dis', ''),
                details.get('con', ''),
                details.get('wed', ''),
            ]
            race_data.append(entry)
        except Exception as e:
            print(f"Error parsing row: {e}")
            continue
    
    return race_data

def generate_race_ids(year):
    """効率的なrace_id生成"""
    valid_combinations = [
        (p, k, d, r) 
        for p in place_dict 
        for k in range(1, 7)       # 開催回数は最大6回
        for d in range(1, 10)      # 開催日数は最大9日
        for r in range(1, 13)      # レース数は最大12R
    ]
    
    for p, k, d, r in valid_combinations:
        yield f"{year}{p}{k:02}{d:02}{r:02}"

def main():
    for year in range(year_start, year_end + 1):
        output_path = f'data/{year}.csv'
        if os.path.exists(output_path):
            print(f"Skipping existing: {output_path}")
            continue

        all_data = []
        race_ids = list(generate_race_ids(year))
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            for race_id in race_ids:
                url = f"https://db.netkeiba.com/race/{race_id}"
                futures.append(executor.submit(fetch_race_data, url))

            with tqdm(total=len(futures), desc=f"Processing {year}") as pbar:
                for future, race_id in zip(concurrent.futures.as_completed(futures), race_ids):
                    content = future.result()
                    if content:
                        data = parse_race_data(race_id, content)
                        all_data.extend(data)
                    pbar.update(1)

        if all_data:
            df = pd.DataFrame(all_data)
            os.makedirs('data', exist_ok=True)
            df.to_csv(output_path, index=False, encoding='shift-jis')
            print(f"Saved: {output_path}")

if __name__ == "__main__":
    main()

Processing 2024:   0%|          | 0/6480 [00:00<?, ?it/s]

Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'par

Processing 2025:   0%|          | 0/6480 [00:00<?, ?it/s]

Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'parse_weight' is not defined
Error parsing row: name 'par