In [None]:
import concurrent.futures
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [None]:
# Scraping parameters: first and last year to process.
year_start, year_end = 2013, 2024

# Two-digit place code ("01".."10") -> place name, numbered in list order.
place_dict = {
    f"{code:02d}": name
    for code, name in enumerate(
        ["札幌", "函館", "福島", "新潟", "東京", "中山", "中京", "京都", "阪神", "小倉"],
        start=1,
    )
}

In [None]:
# Pool of User-Agent header values; one is picked at random for each request.
# Each entry is a single string, split across lines by implicit concatenation.
user_agents = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.1.2 Safari/605.1.15"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Edge/91.0.864.67"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0 Mobile/15E148 Safari/604.1"
    ),
]

In [None]:
def fetch_race_data(url, max_retries=3, timeout=10):
    """Download a page and return its raw body, retrying on request errors.

    Every attempt sends a User-Agent header chosen at random from the
    module-level ``user_agents`` list.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block the calling thread indefinitely.

    Returns:
        The response body (``response.content``), or ``None`` when every
        attempt raised a ``requests`` exception or ``max_retries`` is not
        positive.
    """
    for attempt in range(1, max_retries + 1):
        try:
            headers = {'User-Agent': random.choice(user_agents)}
            r = requests.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx responses into exceptions so they are retried too.
            r.raise_for_status()
            return r.content
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            if attempt == max_retries:
                # No attempt follows, so do not announce a retry or sleep.
                break
            wait_time = random.uniform(2, 5)
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    return None

In [None]:
def parse_race_data(race_id, content):
    """Extract one record per table row from a downloaded race page.

    Args:
        race_id: Identifier of the race; stored as the first field of every
            returned record and printed when the result table is missing.
        content: Raw page body to parse, or ``None`` when nothing was
            downloaded.

    Returns:
        A list of records, each a list starting with
        ``[race_id, horse_name, jockey_name]``. The list is empty when
        ``content`` is ``None`` or the page has no result table.
    """
    if content is None:
        return []

    # The page bytes are decoded as EUC-JP.
    soup = BeautifulSoup(content, "html.parser", from_encoding="euc-jp")
    # Not read by the code visible here; kept for the elided extraction step
    # below -- TODO confirm it is still needed.
    soup_span = soup.find_all("span")
    main_table = soup.find("table", {"class": "race_table_01 nk_tb_common"})

    if main_table is None:
        print('continue: ' + race_id)
        return []

    # Highest cell index read below is 6, so a usable row needs 7 cells.
    # Raise this if the extraction is extended to later columns.
    min_cols = 7

    race_data = []
    # The first <tr> is skipped (presumably the header row).
    for row in main_table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if len(cols) < min_cols:
            # A short row would raise IndexError and discard every record
            # already collected for this race; skip it instead.
            continue

        # Data extraction steps...

        race_data.append([
            race_id,
            cols[3].text.strip(),  # horse name
            cols[6].text.strip(),  # jockey name
            # Other fields...
        ])
    return race_data

In [None]:
def process_race(url_race_id_tuple):
    """Fetch and parse a single race.

    Args:
        url_race_id_tuple: A ``(url, race_id)`` pair.

    Returns:
        Whatever ``parse_race_data`` returns for the page fetched from the URL.
    """
    page_url, race_key = url_race_id_tuple
    return parse_race_data(race_key, fetch_race_data(page_url))

In [None]:
# Scrape every year from year_start to year_end (inclusive), one year at a
# time, with an outer progress bar over years and an inner one over races.
total_years = year_end - year_start + 1

with tqdm(total=total_years, desc="Total Progress", position=0, leave=True) as pbar_total:
    for year in range(year_start, year_end + 1):
        race_data_all = []
        urls = []
        race_ids = []

        # Build the URLs and race_ids for this year...

        with tqdm(total=len(urls), desc=f"Year {year}", position=1, leave=True) as pbar_year:
            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # Map each future back to its (url, race_id) pair so that a
                # failure can be reported against the race that caused it.
                future_to_url = {executor.submit(process_race, url_race_id): url_race_id
                               for url_race_id in zip(urls, race_ids)}

                for future in concurrent.futures.as_completed(future_to_url):
                    done_url, done_race_id = future_to_url[future]
                    try:
                        result = future.result()
                    except Exception as e:
                        # future.result() re-raises whatever the worker raised;
                        # report the race and keep going so one bad page does
                        # not abort the whole run.
                        print(f"Failed: {done_race_id} ({done_url}): {e}")
                    else:
                        race_data_all.extend(result)
                    pbar_year.update(1)

        # Create and save the DataFrame...

        pbar_total.update(1)