# NAR Scraping Notebook

地方競馬（NAR）のデータをスクレイピングし、Google Drive上のデータセットに追加します。

In [1]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import sys

# *** Settings ***
# Path (on Google Drive) of the project folder that contains the `scraper` folder.
# Example: '/content/drive/MyDrive/dai-keiba'
PROJECT_PATH = '/content/drive/MyDrive/dai-keiba'

if not os.path.exists(PROJECT_PATH):
    print(f"Error: Path {PROJECT_PATH} does not exist. Please check your Drive structure.")
else:
    print(f"Project path found: {PROJECT_PATH}")
    # Work from the project root so relative paths resolve against it.
    os.chdir(PROJECT_PATH)
    # Re-running this cell must not stack duplicate entries on sys.path.
    if PROJECT_PATH not in sys.path:
        sys.path.append(PROJECT_PATH)


Mounted at /content/drive
Project path found: /content/drive/MyDrive/dai-keiba


In [2]:
# 2. 必要なライブラリのインポート
try:
    import pandas as pd
    import requests
    import bs4
except ImportError:
    !pip install pandas requests beautifulsoup4
    import pandas as pd
    import requests
    import bs4

from datetime import datetime, date
from scraper.auto_scraper import scrape_nar_year
import time


DEBUG: auto_scraper module loaded (Version: Fix-Name-Cache)


In [3]:
# 3. スクレイピング実行関数の定義

def nar_scrape_execution(year_str, start_date=None, end_date=None):
    """Scrape NAR races for one year and merge the results into the NAR CSV.

    The CSV lives at ``<PROJECT_PATH>/data/raw/database_nar.csv``. Race IDs
    already present in that file are collected and handed to
    ``scrape_nar_year`` as ``existing_race_ids``.

    Args:
        year_str: Target year, passed through to ``scrape_nar_year``.
        start_date: Optional lower bound, passed through unchanged.
        end_date: Optional upper bound, passed through unchanged.

    Raises:
        Exception: Re-raised from ``save_callback`` when merging into the
            existing CSV fails, so the file is never overwritten blindly.
    """
    CSV_FILE_PATH_NAR = os.path.join(PROJECT_PATH, "data", "raw", "database_nar.csv")
    print(f"Using CSV Path: {CSV_FILE_PATH_NAR}")

    def save_callback(df_new):
        """Append ``df_new`` to the CSV, keeping the newest row per race/horse number."""
        if df_new is None or df_new.empty:
            return

        if not os.path.exists(CSV_FILE_PATH_NAR):
            df_new.to_csv(CSV_FILE_PATH_NAR, index=False)
            print(f"  [Created] {CSV_FILE_PATH_NAR} with {len(df_new)} rows.")
            return

        try:
            existing = pd.read_csv(
                CSV_FILE_PATH_NAR,
                dtype={'race_id': str, 'horse_id': str},
                low_memory=False,
            )
            combined = pd.concat([existing, df_new], ignore_index=True)
            # Deduplicate: the most recently scraped row wins.
            if 'race_id' in combined.columns and '馬 番' in combined.columns:
                combined = combined.drop_duplicates(subset=['race_id', '馬 番'], keep='last')
            combined.to_csv(CSV_FILE_PATH_NAR, index=False)
            print(f"  [Saved] {len(df_new)} rows added. Total: {len(combined)}")
        except Exception as e:
            # Never fall back to writing df_new alone: that would replace the
            # whole database with a single batch.
            print(f"❌ Read Error: {e}. Aborting to PREVENT OVERWRITE.")
            raise  # bare raise keeps the original traceback

    print(f"Starting NAR Scraping for {year_str} ({start_date} ~ {end_date})")

    # Load existing race IDs so they can be skipped.
    existing_ids = set()
    if os.path.exists(CSV_FILE_PATH_NAR):
        try:
            df_e = pd.read_csv(
                CSV_FILE_PATH_NAR,
                usecols=['race_id'],
                dtype={'race_id': str},
                low_memory=False,
            )
            existing_ids = set(df_e['race_id'].dropna().astype(str))
            print(f"  Loaded {len(existing_ids)} existing race IDs to skip.")
        except Exception as e:
            # Best effort: scraping still proceeds, but say that nothing will
            # be skipped instead of hiding the failure.
            print(f"  Warning: could not load existing race IDs ({e}). No races will be skipped.")

    scrape_nar_year(
        year_str,
        start_date=start_date,
        end_date=end_date,
        save_callback=save_callback,
        existing_race_ids=existing_ids,
    )


In [None]:
# 4. Configure the run parameters and start scraping
# -----------------------------
TARGET_YEAR = "2026"
TARGET_MONTH = None  # Month to fetch (1-12); None means the whole period

import calendar
from datetime import date

START_DATE = None
END_DATE = None

if TARGET_MONTH:
    # Cover the first through the last day of the requested month.
    _, last_day = calendar.monthrange(int(TARGET_YEAR), int(TARGET_MONTH))
    START_DATE = date(int(TARGET_YEAR), int(TARGET_MONTH), 1)
    END_DATE = date(int(TARGET_YEAR), int(TARGET_MONTH), last_day)
    print(f"Targeting specific month: {START_DATE} to {END_DATE}")
else:
    # Automatic mode: look at the existing data set to decide where to start.
    CSV_FILE_PATH_NAR = os.path.join(PROJECT_PATH, "data", "raw", "database_nar.csv")
    if os.path.exists(CSV_FILE_PATH_NAR):
        try:
            # Only the date column is needed here. Loading just that column
            # avoids parsing every other column of the database; the callable
            # form yields a frame without the column (instead of raising)
            # when the CSV has no '日付' header.
            df_exist = pd.read_csv(CSV_FILE_PATH_NAR, usecols=lambda c: c == '日付')
            if '日付' in df_exist.columns and not df_exist.empty:
                last_date = pd.to_datetime(
                    df_exist['日付'], format='%Y年%m月%d日', errors='coerce'
                ).max()
                if pd.notna(last_date):
                    # Earlier versions resumed from last_date. To fill gaps the
                    # scan now always starts on January 1 of the target year.
                    START_DATE = date(int(TARGET_YEAR), 1, 1)
                    print(f"既存データの最終日時: {last_date.date()} (欠落確認のため {START_DATE} からスキャンします)")
        except Exception as e:
            print(f"既存データ確認エラー: {e}")

print(f"Scraping Target: {TARGET_YEAR}, Start: {START_DATE}, End: {END_DATE}")
nar_scrape_execution(TARGET_YEAR, start_date=START_DATE, end_date=END_DATE)


  df_exist = pd.read_csv(CSV_FILE_PATH_NAR)


既存データの最終日時: 2025-12-31 (欠落確認のため 2026-01-01 からスキャンします)
Scraping Target: 2026, Start: 2026-01-01, End: None
Using CSV Path: /content/drive/MyDrive/dai-keiba/data/raw/database_nar.csv
Starting NAR Scraping for 2026 (2026-01-01 ~ None)
  Loaded 19324 existing race IDs to skip.
=== Starting NAR Bulk Scraping for 2026 ===
Checking 2026-01-01...
  Found 35 races.
  Enriching 11 horses with past data...


  combined = pd.concat([existing, df_new], ignore_index=True)


  [Saved] 11 rows added. Total: 196135
  Enriching 11 horses with past data...


In [None]:
# 4.2 欠損データの補完 (HorseID & 過去走 & 効率化) [NAR用]
import pandas as pd
import sys
import os
import re
from datetime import datetime
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Ensure scraper path
sys.path.append(os.path.join(PROJECT_PATH, 'scraper'))
from scraper.race_scraper import RaceScraper

def fetch_race_horse_ids(rid):
    """Fetch the horse-name -> horse-ID mapping for one race result page.

    Args:
        rid: Race ID interpolated into the netkeiba result URL.

    Returns:
        ``(rid, horse_map)`` where ``horse_map`` maps each horse name found in
        the result table to the numeric ID taken from its profile link, or
        ``None`` when the page or the result table is unavailable or an error
        occurs while fetching/parsing.
    """
    scraper = RaceScraper()
    try:
        url = f'https://race.netkeiba.com/race/result.html?race_id={rid}'
        soup = scraper._get_soup(url)
        if not soup:
            return None

        table = soup.find('table', id='All_Result_Table')
        if not table:
            return None

        horse_map = {}
        for row in table.find_all('tr', class_='HorseList'):
            name_tag = row.select_one('.Horse_Name a')
            if not name_tag:
                continue
            href = name_tag.get('href', '')
            # Single backslash: in a raw string r'\\d' means a literal
            # backslash followed by "d", which never occurs in a real href.
            match = re.search(r'/horse/(\d+)', href)
            if match:
                horse_map[name_tag.text.strip()] = match.group(1)
        return (rid, horse_map)
    except Exception as e:
        print(f'Error fetching race {rid}: {e}')
        return None

def fetch_horse_history(horse_id):
    """Fetch the past races of one horse.

    Args:
        horse_id: Horse identifier; converted to ``str`` before the lookup.

    Returns:
        ``(horse_id, history)`` where ``history`` is whatever
        ``RaceScraper.get_past_races`` returned, or an empty ``DataFrame``
        when that call raised.
    """
    scraper = RaceScraper()
    try:
        history = scraper.get_past_races(str(horse_id), n_samples=None)
    except Exception:
        # Best effort: a failed lookup is reported as "no history".
        return (horse_id, pd.DataFrame())
    return (horse_id, history)

def fill_missing_past_data_nar_notebook():
    """Backfill missing horse IDs and past-race columns in the NAR CSV.

    Step 1 looks up horse IDs (via ``fetch_race_horse_ids``) for every race
    that has at least one row without a ``horse_id`` and saves the CSV.
    Step 2 fetches the history (via ``fetch_horse_history``) of every horse
    that has a row with an empty ``past_1_date``, writes the five most recent
    races strictly before each row's date into the ``past_{1..5}_*`` columns,
    and saves the CSV again.

    Returns:
        None. Results are written back to ``data/raw/database_nar.csv``
        under ``PROJECT_PATH``.
    """
    csv_path = os.path.join(PROJECT_PATH, 'data', 'raw', 'database_nar.csv')
    if not os.path.exists(csv_path):
        print(f'Error: {csv_path} not found.')
        return

    print(f'Reading {csv_path}...')
    # Read IDs as text. With inferred dtypes a horse_id column that has gaps
    # becomes float, and str() of such an ID ends in ".0".
    df = pd.read_csv(csv_path, dtype={'race_id': str, 'horse_id': str}, low_memory=False)

    if '日付' not in df.columns:
        print('Error: 日付 column not found.')
        return

    # Race dates are kept in a separate Series so the helper values are never
    # written to the CSV. A 'date_dt' column left behind by an earlier run is
    # discarded as well.
    df = df.drop(columns=['date_dt'], errors='ignore')
    row_dates = pd.to_datetime(df['日付'], format='%Y年%m月%d日', errors='coerce')

    if 'horse_id' not in df.columns:
        df['horse_id'] = None
    else:
        # Repair IDs that were stored with a float suffix.
        df['horse_id'] = df['horse_id'].str.replace(r'\.0$', '', regex=True)

    # 1. Fill missing horse IDs (only for races that actually lack some)
    if 'race_id' in df.columns:
        missing_mask = df['horse_id'].isna() | (df['horse_id'] == '')
        if missing_mask.any():
            races_to_update = df.loc[missing_mask, 'race_id'].dropna().unique()
            print(f'Need to fetch IDs for {len(races_to_update)} races...')

            with ThreadPoolExecutor(max_workers=5) as executor:
                futures = {executor.submit(fetch_race_horse_ids, rid): rid for rid in races_to_update}
                completed = 0
                for future in as_completed(futures):
                    completed += 1
                    if completed % 10 == 0:
                        print(f'  [IDs] {completed}/{len(races_to_update)}')
                    result = future.result()
                    if not result:
                        continue
                    rid, horse_map = result
                    if not horse_map:
                        continue
                    # Match rows of this race to scraped IDs by horse name.
                    for idx in df.index[df['race_id'] == rid]:
                        h_name = df.at[idx, '馬名']
                        if h_name in horse_map:
                            df.at[idx, 'horse_id'] = horse_map[h_name]

            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print('Saved updated IDs.')
        else:
            print('All Horse IDs present.')

    # 2. Fill past history
    fields_map = {
        'date': 'date', 'rank': 'rank', 'time': 'time', 'run_style': 'run_style',
        'race_name': 'race_name', 'last_3f': 'last_3f', 'horse_weight': 'horse_weight',
        'jockey': 'jockey', 'condition': 'condition', 'odds': 'odds',
        'weather': 'weather', 'distance': 'distance', 'course_type': 'course_type'
    }

    # Create all missing past_* columns in one batch, with object dtype so
    # they can hold either strings or numbers.
    new_cols = [
        f'past_{i}_{k}'
        for k in fields_map
        for i in range(1, 6)
        if f'past_{i}_{k}' not in df.columns
    ]
    if new_cols:
        df_new = pd.DataFrame(None, index=df.index, columns=new_cols, dtype='object')
        df = pd.concat([df, df_new], axis=1)

    # Only horses with at least one row lacking past_1_date are fetched.
    # (past_1_date always exists at this point: it is created above if absent.)
    target_horses = df.loc[df['past_1_date'].isna(), 'horse_id'].dropna().unique()

    print(f'Found {len(target_horses)} horses needing update (out of {df["horse_id"].nunique()} total). Fetching history...')

    history_store = {}

    if len(target_horses) > 0:
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = {executor.submit(fetch_horse_history, hid): hid for hid in target_horses}
            completed = 0
            for future in as_completed(futures):
                completed += 1
                if completed % 50 == 0:
                    print(f'  [History] {completed}/{len(target_horses)}')
                try:
                    hid, hist_df = future.result()
                except Exception as e:
                    # Keep going for the other horses, but do not hide the failure.
                    print(f'Error fetching history for horse {futures[future]}: {e}')
                    continue
                history_store[hid] = hist_df

        # Parse each horse's race dates once instead of once per database row.
        for hist_df in history_store.values():
            if not hist_df.empty and 'date' in hist_df.columns:
                hist_df['date_obj'] = pd.to_datetime(hist_df['date'], errors='coerce')

        print('Applying history data...')
        for idx, hid, current_date in zip(df.index, df['horse_id'], row_dates):
            if pd.isna(hid) or hid not in history_store:
                continue

            hist_df = history_store[hid]
            if hist_df.empty or 'date_obj' not in hist_df.columns:
                continue
            if pd.isna(current_date):
                continue

            # The five most recent races strictly before this row's race date.
            past_races = (
                hist_df[hist_df['date_obj'] < current_date]
                .sort_values('date_obj', ascending=False)
                .head(5)
            )

            for n, (_, p_row) in enumerate(past_races.iterrows(), start=1):
                for k, v in fields_map.items():
                    df.at[idx, f'past_{n}_{k}'] = p_row.get(v)

        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print('Done filling past data for NAR.')
    else:
        print('No missing past data found. Skipping.')

# Run the backfill as soon as this cell is executed.
fill_missing_past_data_nar_notebook()


In [None]:
# 上書き保存の関数定義 (Robust Journaling Mode) - NAR版
# 確実性を最優先し、1頭ずつキャッシュファイルに記録しながら進めます。

def fill_nar_bloodline_robust():
    """Backfill father/mother/bms in the NAR CSV using an append-only cache.

    Horses whose ``father`` is empty in the database and that are not yet in
    ``data/nar_pedigree_cache.csv`` are scraped one at a time; each result is
    appended to the cache and flushed immediately. Afterwards the cache is
    merged into the database and the CSV is rewritten.

    Returns:
        None. The cache file and ``data/raw/database_nar.csv`` are updated on disk.
    """
    from tqdm.auto import tqdm
    import os
    import csv

    csv_path = os.path.join(PROJECT_PATH, 'data', 'raw', 'database_nar.csv')
    cache_path = os.path.join(PROJECT_PATH, 'data', 'nar_pedigree_cache.csv')
    cache_columns = ['horse_id', 'father', 'mother', 'bms']

    os.makedirs(os.path.dirname(cache_path), exist_ok=True)

    # 1. Load main database
    print(f'Reading {csv_path}...')
    df = pd.read_csv(csv_path, low_memory=False, dtype={'race_id': str, 'horse_id': str})

    if 'horse_id' not in df.columns:
        print('Error: horse_id column not found.')
        return
    # Strip a float suffix left over from saves where IDs were numeric.
    df['horse_id'] = df['horse_id'].astype(str).str.replace(r'\.0$', '', regex=True)

    # 2. Load or create the cache
    cached_ids = set()
    cache_usable = False
    if os.path.exists(cache_path):
        try:
            cache_df = pd.read_csv(cache_path, dtype=str)
            absent = [c for c in cache_columns if c not in cache_df.columns]
            if absent:
                raise KeyError(f'cache is missing columns: {absent}')
            cached_ids = set(cache_df['horse_id'].dropna().unique())
            cache_usable = True
            print(f"Loaded {len(cached_ids)} cached records from {os.path.basename(cache_path)}")
        except Exception as e:
            print("Cache file corrupted or empty. Starting fresh.")
            # Set the unreadable file aside rather than appending to it: rows
            # appended to a file without a valid header break the merge below.
            backup_path = cache_path + '.corrupt'
            os.replace(cache_path, backup_path)
            print(f"  Unreadable cache kept as {backup_path} ({e})")
    if not cache_usable:
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write('horse_id,father,mother,bms\n')

    # 3. Identify horses whose bloodline is missing
    for col in ['father', 'mother', 'bms']:
        if col not in df.columns:
            df[col] = None

    mask_missing_db = (df['father'].isna()) | (df['father'] == '') | (df['father'] == 'nan')
    missing_ids = df.loc[mask_missing_db, 'horse_id'].dropna().unique()
    # Keeps purely numeric IDs only; this also drops the 'nan' strings that
    # astype(str) produces for missing IDs.
    missing_ids = [h for h in missing_ids if h.isdigit()]

    target_horses = [h for h in missing_ids if h not in cached_ids]

    print(f"Total Missing in DB: {len(missing_ids)}")
    print(f"Already Cached: {len(cached_ids)}")
    print(f"Target to Scrape: {len(target_horses)}")

    if len(target_horses) > 0:
        from scraper.race_scraper import RaceScraper
        scraper = RaceScraper()

        print("Starting Row-by-Row Scraping...")

        with open(cache_path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)

            for hid in tqdm(target_horses):
                try:
                    profile = scraper.get_horse_profile(hid)
                    if profile:
                        father = profile.get('father', '')
                        mother = profile.get('mother', '')
                        bms = profile.get('bms', '')

                        writer.writerow([hid, father, mother, bms])
                        # Flush per horse so an interrupted run keeps its progress.
                        f.flush()
                except Exception as e:
                    print(f"Error scraping {hid}: {e}")

    else:
        print("No new horses to scrape.")

    # 4. Merge cache into database
    print("Merging cache into database...")
    if os.path.exists(cache_path):
        cache_df = pd.read_csv(cache_path, dtype=str)
        mask = df['horse_id'].isin(cache_df['horse_id'])
        for col in ['father', 'mother', 'bms']:
            # dict(zip(...)) keeps the last cached row when an ID repeats.
            lookup = dict(zip(cache_df['horse_id'], cache_df[col]))
            # Cached values win; where the cache has no value the existing
            # database value is kept.
            df.loc[mask, col] = df.loc[mask, 'horse_id'].map(lookup).fillna(df.loc[mask, col])

        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print("✅ NAR Database updated successfully.")

# Run the bloodline backfill as soon as this cell is executed.
fill_nar_bloodline_robust()

# 5. 血統データの補完 (Bloodline Backfill)

取得漏れの血統データ（父、母、母父）をバックグラウンドで補完します。
既存のデータを確認し、欠損がある馬のみ対象とします。

In [None]:
# Run the bloodline backfill
from scraper.auto_scraper import fill_bloodline_data

# The call below is made directly in this cell, with mode="NAR";
# the "completed" message prints after it returns.
print("Starting Bloodline Backfill process...")
fill_bloodline_data(mode="NAR")
print("Bloodline Backfill completed.")