# JRA Scraping Notebook

中央競馬（JRA）のデータをスクレイピングし、Google Drive上のデータセットに追加します。

In [None]:
# 1. Mount Google Drive so the project folder becomes reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import os
import sys

# *** Settings ***
# Google Drive path of the folder that contains the `scraper` folder.
# Example: '/content/drive/MyDrive/dai-keiba'
PROJECT_PATH = '/content/drive/MyDrive/dai-keiba'

if os.path.exists(PROJECT_PATH):
    print(f"Project path found: {PROJECT_PATH}")
    # Work from the project folder and make its modules importable.
    os.chdir(PROJECT_PATH)
    sys.path.append(PROJECT_PATH)
else:
    # Only report the problem; nothing else is changed in this case.
    print(f"Error: Path {PROJECT_PATH} does not exist. Please check your Drive structure.")


In [None]:
# 2. Import the required libraries
try:
    import pandas as pd
    import requests
    import bs4
except ImportError:
    # At least one of the packages is missing: install all three with pip
    # (`!` runs a shell command from the notebook; the `beautifulsoup4`
    # distribution provides the `bs4` module), then retry the imports.
    !pip install pandas requests beautifulsoup4
    import pandas as pd
    import requests
    import bs4

from datetime import datetime, date
# Project-local module; presumably resolved through the project folder that
# the setup cell appends to sys.path -- run that cell first.
from scraper.jra_scraper import scrape_jra_year, JRA_MONTH_PARAMS
import time


In [None]:
# 3. Definition of the scraping entry point

def jra_scrape_execution(year_str, start_date=None, end_date=None):
    """Scrape JRA data for one year and merge it into ``database.csv``.

    The scraping itself is delegated to ``scrape_jra_year``; this function
    supplies the callback that merges each delivered chunk into the CSV file
    located under ``PROJECT_PATH``.

    Args:
        year_str: Target year, forwarded unchanged to ``scrape_jra_year``.
        start_date: Optional start of the period, forwarded unchanged.
        end_date: Optional end of the period, forwarded unchanged.
    """
    CSV_FILE_PATH = os.path.join(PROJECT_PATH, "database.csv")
    print(f"Using CSV Path: {CSV_FILE_PATH}")

    def save_chunk(df_chunk):
        """Merge ``df_chunk`` into the CSV, dropping duplicate rows.

        Rows are considered duplicates when they share ``race_id`` and
        ``馬名`` (whichever of the two columns exist); the newest row wins.
        The caller's DataFrame is never modified.
        """
        # Start from a copy so nothing below can mutate the caller's chunk.
        combined_df = df_chunk.copy()

        if os.path.exists(CSV_FILE_PATH):
            try:
                # Read IDs as strings to prevent automatic float conversion.
                existing_df = pd.read_csv(CSV_FILE_PATH, dtype={'race_id': str, 'horse_id': str})
                combined_df = pd.concat([existing_df, df_chunk], ignore_index=True)
            except Exception as e:
                print(f"Read Error: {e}, creating new.")
                # The file is about to be rewritten with this chunk only, so
                # move the old one aside instead of silently destroying it.
                backup_path = f"{CSV_FILE_PATH}.bak-{datetime.now():%Y%m%d-%H%M%S}"
                try:
                    os.replace(CSV_FILE_PATH, backup_path)
                    print(f"  [Backup] Previous file kept as: {backup_path}")
                except OSError as backup_error:
                    # Stay best-effort: report the failure and keep saving.
                    print(f"  [Backup] Could not keep previous file: {backup_error}")

        # Existing rows carry race_id as text; give new rows the same type so
        # the duplicate check compares like with like. Missing values stay
        # missing.
        if 'race_id' in combined_df.columns:
            race_ids = combined_df['race_id']
            combined_df['race_id'] = race_ids.where(race_ids.isna(), race_ids.astype(str))

        # Deduplicate on whichever key columns are present.
        subset_cols = [c for c in ('race_id', '馬名') if c in combined_df.columns]
        if subset_cols:
            combined_df = combined_df.drop_duplicates(subset=subset_cols, keep='last')

        combined_df.to_csv(CSV_FILE_PATH, index=False, encoding="utf-8-sig")
        print(f"  [Saved] Total rows: {len(combined_df)} (+{len(df_chunk)} new)")

    print(f"Starting Scraping for {year_str} ({start_date} ~ {end_date})")
    scrape_jra_year(year_str, start_date=start_date, end_date=end_date, save_callback=save_chunk)


In [None]:
# 4. Configure the run parameters and start scraping
# --------------------------------------------------
TARGET_YEAR = "2025"

# Restrict the run to a specific period if desired. With None the start date
# is derived from the existing data below; setting it manually is currently
# the recommended approach.
# Example: datetime(2025, 1, 1).date()
START_DATE = None
END_DATE = None

# Automatic detection: resume from the latest date already stored in the CSV.
CSV_FILE_PATH = os.path.join(PROJECT_PATH, "database.csv")
if START_DATE is None and os.path.exists(CSV_FILE_PATH):
    try:
        df_exist = pd.read_csv(CSV_FILE_PATH)
        if not df_exist.empty and '日付' in df_exist.columns:
            # Unparseable dates become NaT and are ignored by max().
            df_exist['date_obj'] = pd.to_datetime(
                df_exist['日付'],
                format='%Y年%m月%d日',
                errors='coerce',
            )
            last_date = df_exist['date_obj'].max()
            if not pd.isna(last_date):
                # The last stored day itself is re-checked (not the day
                # after) to be on the safe side.
                START_DATE = last_date.date()
                print(f"既存データの最終日時: {last_date.date()} -> この翌日から取得しますか？ (手動でSTART_DATEを設定して上書きも可能)")
    except Exception as e:
        print(f"既存データ確認エラー: {e}")

print(f"Scraping Target: {TARGET_YEAR}, Start: {START_DATE}, End: {END_DATE}")
jra_scrape_execution(TARGET_YEAR, start_date=START_DATE, end_date=END_DATE)
