# NAR Scraping Notebook

地方競馬（NAR）のデータをスクレイピングし、Google Drive上のデータセットに追加します。

In [None]:
# 1. Mount Google Drive and point the session at the project folder
import os
import sys

from google.colab import drive

drive.mount('/content/drive')

# ★★★ Settings ★★★
# Google Drive path of the folder that contains the `scraper` folder,
# e.g. '/content/drive/MyDrive/dai-keiba'
PROJECT_PATH = '/content/drive/MyDrive/dai-keiba'

if not os.path.exists(PROJECT_PATH):
    print(f"Error: Path {PROJECT_PATH} does not exist. Please check your Drive structure.")
else:
    print(f"Project path found: {PROJECT_PATH}")
    os.chdir(PROJECT_PATH)
    # Notebook cells get re-run; only register the path once so sys.path
    # does not accumulate duplicate entries.
    if PROJECT_PATH not in sys.path:
        sys.path.append(PROJECT_PATH)


In [None]:
# 2. 必要なライブラリのインポート
try:
    import pandas as pd
    import requests
    import bs4
except ImportError:
    !pip install pandas requests beautifulsoup4
    import pandas as pd
    import requests
    import bs4

from datetime import datetime, date
from scraper.auto_scraper import scrape_nar_year
import time


In [None]:
# 3. スクレイピング実行関数の定義

def nar_scrape_execution(year_str, start_date=None, end_date=None):
    """Run the NAR scraper for one year and merge results into ``database_nar.csv``.

    The CSV lives directly under ``PROJECT_PATH``. Race IDs already stored in
    it are collected first and passed to ``scrape_nar_year`` as
    ``existing_race_ids``; ``save_callback`` is passed along so the scraper
    can hand back scraped rows (presumably once per batch -- confirm against
    ``scraper.auto_scraper``).

    Args:
        year_str: Target year, forwarded unchanged to ``scrape_nar_year``.
        start_date: Optional start bound, forwarded unchanged.
        end_date: Optional end bound, forwarded unchanged.

    Raises:
        OSError: If an unreadable CSV cannot be backed up, or a CSV write
            fails. The existing file is never replaced without a backup.
    """
    import shutil  # local import: only needed to back up an unreadable CSV

    CSV_FILE_PATH_NAR = os.path.join(PROJECT_PATH, "database_nar.csv")
    print(f"Using CSV Path: {CSV_FILE_PATH_NAR}")

    def save_callback(df_new):
        """Merge ``df_new`` into the CSV, keeping the newest row per key."""
        if df_new is None or df_new.empty:
            return

        if not os.path.exists(CSV_FILE_PATH_NAR):
            df_new.to_csv(CSV_FILE_PATH_NAR, index=False)
            print(f"  [Created] {CSV_FILE_PATH_NAR} with {len(df_new)} rows.")
            return

        # Only the read is guarded: a failure while merging or writing must
        # not be "handled" by replacing the whole database with one batch.
        try:
            existing = pd.read_csv(CSV_FILE_PATH_NAR, dtype={'race_id': str, 'horse_id': str})
        except Exception as e:
            # Keep a copy of the unreadable file before replacing it so the
            # accumulated data can still be recovered by hand.
            backup_path = f"{CSV_FILE_PATH_NAR}.{time.strftime('%Y%m%d-%H%M%S')}.bak"
            shutil.copy2(CSV_FILE_PATH_NAR, backup_path)
            print(f"Read Error: {e}, overwriting. Previous file kept at {backup_path}")
            df_new.to_csv(CSV_FILE_PATH_NAR, index=False)
            return

        combined = pd.concat([existing, df_new], ignore_index=True)

        # Deduplicate on (race_id, horse number), newest row wins. Keys are
        # compared as strings because race_id is read from disk as str and
        # would otherwise never match a numeric race_id in the new batch.
        key_cols = ['race_id', '馬 番']
        if all(col in combined.columns for col in key_cols):
            is_older_duplicate = combined[key_cols].astype(str).duplicated(keep='last')
            combined = combined[~is_older_duplicate]

        combined.to_csv(CSV_FILE_PATH_NAR, index=False)
        print(f"  [Saved] {len(df_new)} rows added. Total: {len(combined)}")

    print(f"Starting NAR Scraping for {year_str} ({start_date} ~ {end_date})")

    # Load existing IDs to skip
    existing_ids = set()
    if os.path.exists(CSV_FILE_PATH_NAR):
        try:
            df_e = pd.read_csv(CSV_FILE_PATH_NAR, usecols=['race_id'], dtype={'race_id': str})
        except Exception as e:
            # Best effort: carry on with an empty skip set, but say why.
            print(f"  Warning: could not load existing race IDs ({e}); continuing with an empty skip list.")
        else:
            existing_ids = set(df_e['race_id'].astype(str))
            print(f"  Loaded {len(existing_ids)} existing race IDs to skip.")

    scrape_nar_year(
        year_str,
        start_date=start_date,
        end_date=end_date,
        save_callback=save_callback,
        existing_race_ids=existing_ids,
    )


In [None]:
# 4. Configure the run parameters and start scraping
# -----------------------------
import calendar
from datetime import date

TARGET_YEAR = "2024"
# ★ Month to fetch: 1-12 restricts the run to that calendar month.
#   None resumes from the latest date found in the existing CSV
#   (no date bounds at all when nothing usable is found).
TARGET_MONTH = 1

START_DATE = None
END_DATE = None

if not TARGET_MONTH:
    # Auto-detect: resume from the latest date already stored in the CSV.
    # That date itself is used as the start.
    CSV_FILE_PATH_NAR = os.path.join(PROJECT_PATH, "database_nar.csv")
    if os.path.exists(CSV_FILE_PATH_NAR):
        try:
            df_exist = pd.read_csv(CSV_FILE_PATH_NAR)
            has_date_rows = '日付' in df_exist.columns and not df_exist.empty
            if has_date_rows:
                # Unparseable dates become NaT and are ignored by max().
                df_exist['date_obj'] = pd.to_datetime(
                    df_exist['日付'], format='%Y年%m月%d日', errors='coerce'
                )
                last_date = df_exist['date_obj'].max()
                if pd.notna(last_date):
                    START_DATE = last_date.date()
                    print(f"既存データの最終日時: {last_date.date()} -> 続きから取得します")
        except Exception as e:
            print(f"既存データ確認エラー: {e}")
else:
    # Explicit month: cover the first through the last day of that month.
    year_num = int(TARGET_YEAR)
    month_num = int(TARGET_MONTH)
    last_day = calendar.monthrange(year_num, month_num)[1]
    START_DATE = date(year_num, month_num, 1)
    END_DATE = date(year_num, month_num, last_day)
    print(f"Targeting specific month: {START_DATE} to {END_DATE}")

print(f"Scraping Target: {TARGET_YEAR}, Start: {START_DATE}, End: {END_DATE}")
nar_scrape_execution(TARGET_YEAR, start_date=START_DATE, end_date=END_DATE)


In [None]:
# 5. Run data processing (feature engineering)
# ----------------------------------------
# Generates the training data from the scraped database_nar.csv.
from ml.feature_engineering import calculate_features

INPUT_CSV_NAR = os.path.join(PROJECT_PATH, "database_nar.csv")
OUTPUT_CSV_NAR = os.path.join(PROJECT_PATH, "processed_data_nar.csv")

if not os.path.exists(INPUT_CSV_NAR):
    # Nothing has been scraped yet, so there is nothing to process.
    print("Error: database_nar.csv not found.")
else:
    print("Starting Feature Engineering (NAR)...")
    calculate_features(INPUT_CSV_NAR, OUTPUT_CSV_NAR)
    print("Done!")
