In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument("--headless")  # ヘッドレスモードを有効にする
chrome_options.add_argument("--disable-gpu")  # GPUを無効にする（特にWindowsで推奨）
chrome_options.add_argument("--no-sandbox")  # サンドボックスモードを無効にする（Linuxで推奨）

# Chromeドライバーパスを取得
chromedriver_path = ChromeDriverManager().install()

# Serviceクラスを使ってドライバーパスを指定
service = Service(chromedriver_path)

# サービスオブジェクトを指定してwebdriver.Chromeを起動
driver = webdriver.Chrome(service=service, options=chrome_options)


In [17]:
URL = "https://race.netkeiba.com/top/race_list.html?kaisai_date=20241014"

from selenium.webdriver.common.by import By
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(URL)
li_list = driver.find_elements(By.CLASS_NAME, "RaceList_DataItem")
# li_list = driver.find_elements(By.TAG_NAME, "li")
li = li_list[0]
li

<selenium.webdriver.remote.webelement.WebElement (session="aa8dc92e86094bffc4ea4be9c07bddbb", element="f.1157EC3CD6E02F929EF5A7F42ABA4B33.d.0C1279E1F58C043010E16E9C61805A12.e.98")>

In [18]:
href = li.find_element(By.TAG_NAME, "a").get_attribute("href")
href

'https://race.netkeiba.com/race/result.html?race_id=202405040401&rf=race_list'

In [19]:
import re

re.findall(r"race_id=(\d{12})", href)[0]

'202405040401'

In [20]:
import time

race_id_list = []
for li in li_list:
    href = li.find_element(By.TAG_NAME, "a").get_attribute("href")
    race_id = re.findall(r"race_id=(\d{12})", href)[0]
    race_id_list.append(race_id)
    time.sleep(1)
print(race_id_list)

['202405040401', '202405040402', '202405040403', '202405040404', '202405040405', '202405040406', '202405040407', '202405040408', '202405040409', '202405040410', '202405040411', '202405040412', '202404040401', '202404040402', '202404040403', '202404040404', '202404040405', '202404040406', '202404040407', '202404040408', '202404040409', '202404040410', '202404040411', '202404040412']


In [None]:
# 基本ライブラリ
import re
import time
import traceback

# Selenium関連のライブラリ
from selenium.webdriver.common.by import By

# 外部ライブラリ
from tqdm.notebook import tqdm

# 自作モジュール
import get_race_date
from logger_setting import setup_logger
from chrome_setting import get_chrome_driver

# jupyter用 auto restart
# %load_ext autoload

# ロガーの取得
logger = setup_logger(__name__)

# 定数の定義
URL_TMPLATE = "https://race.netkeiba.com/top/race_list.html?kaisai_date={kaisai_date}"


def scrape_race_id_list(kaisai_date_list: list[str]):
    # kaisai_date_listが渡されていない場合は、デフォルトで日付リストを取得
    if not kaisai_date_list:
        kaisai_date_list = get_race_date.scrape_kaisai_date(
            from_="2024-01", to_="2024-12"
        )
        logger.info(f"取得した開催日リスト: {kaisai_date_list}")
    kaisai_date_list = get_race_date.scrape_kaisai_date(from_="2024-01", to_="2024-12")
    race_id_list = []
    with get_chrome_driver(headless=True) as driver:
        for kaisai_date in tqdm(kaisai_date_list):
            url = URL_TMPLATE.format(kaisai_date=kaisai_date)
            try:
                driver.get(url)
                time.sleep(1)
                li_list = driver.find_elements(By.CLASS_NAME, "RaceList_DataItem")
                for li in li_list:
                    href = li.find_element(By.TAG_NAME, "a").get_attribute("href")
                    race_id = re.findall(r"race_id=(\d{12})", href)[0]
                    race_id_list.append(race_id)
            except:
                logger.error("stopped at {URL}")
                logger.debug(traceback.format_exc())
                break
    return race_id_list


In [19]:
import scraping

kaisai_date_list = scraping.scrape_kaisai_date(from_="2024-01", to_="2024-10")
race_id_list = scrape_race_id_list(kaisai_date_list)
print(race_id_list)

  0%|          | 0/10 [00:00<?, ?it/s]



  soup = BeautifulSoup(html)


  0%|          | 0/12 [00:00<?, ?it/s]



  soup = BeautifulSoup(html)


  0%|          | 0/94 [00:00<?, ?it/s]

[]


In [16]:
import pickle
with open("race_id_list.pickle", "wb") as f:
    pickle.dump(race_id_list, f)