# 🛠️ NAR データ補完ツール
欠損している血統情報および過去走履歴を補完します。

In [None]:
# Run this cell only when Google Drive needs to be mounted.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
import re
import time
from datetime import datetime

class RaceScraper:
    """Scraper for netkeiba race results, horse race histories and pedigrees.

    Every page is fetched through ``_get_soup``, which sleeps one second
    before each request and returns ``None`` when the page cannot be
    fetched.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def _get_soup(self, url):
        """Fetch ``url`` and return it parsed with BeautifulSoup.

        Returns ``None`` for a non-200 response or when the request or the
        parsing raises (the error is printed, not propagated).
        """
        try:
            time.sleep(1)  # Be polite
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = response.apparent_encoding
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"Error fetching {url}: {e}")
        return None

    def get_past_races(self, horse_id, target_date=None, n_samples=5):
        """
        Fetch past race results for ``horse_id`` from the netkeiba db.

        Args:
            horse_id: Horse identifier used in the result-page URL.
            target_date: Optional string or date/datetime. When given, only
                races STRICTLY BEFORE that date are kept (leakage
                prevention). A value that cannot be parsed becomes NaT, which
                compares False against every date, so no rows are kept.
            n_samples: Number of most recent races to keep; a falsy value
                keeps all of them.

        Returns:
            A DataFrame whose known source headers are renamed to English
            names (see ``column_map``), plus ``course_type`` and
            ``distance``. Columns that could not be found are added as
            ``None``. An empty DataFrame is returned when the page or table
            is missing or parsing fails.
        """
        url = f"https://db.netkeiba.com/horse/result/{horse_id}/"
        soup = self._get_soup(url)
        if not soup:
            return pd.DataFrame()

        # The results are usually in a table with class "db_h_race_results"
        table = soup.select_one("table.db_h_race_results")
        if not table:
            # Fallback: first table whose text contains the finishing-order header
            for t in soup.find_all("table"):
                if "ÁùÄÈ†Ü" in t.text:
                    table = t
                    break

        if not table:
            return pd.DataFrame()

        # Parse table
        try:
            df = pd.read_html(io.StringIO(str(table)))[0]

            # Basic cleaning
            df = df.dropna(how='all')

            # Normalize column names (remove spaces/newlines)
            df.columns = df.columns.astype(str).str.replace(r'\s+', '', regex=True)

            # Keep only rows that look like actual races (parsable date)
            if 'Êó•‰ªò' in df.columns:
                df['date_obj'] = pd.to_datetime(df['Êó•‰ªò'], format='%Y/%m/%d', errors='coerce')
                df = df.dropna(subset=['date_obj'])

                # Leakage Prevention: Filter races strictly before target_date
                if target_date:
                    if isinstance(target_date, str):
                        target_dt = pd.to_datetime(target_date, errors='coerce')
                    else:
                        target_dt = pd.to_datetime(target_date)  # handle date/datetime

                    if target_dt is not None:
                        # Strictly less than (<) excludes future races and the
                        # current race itself (if already in the DB).
                        df = df[df['date_obj'] < target_dt]

                df = df.sort_values('date_obj', ascending=False)

            # Take top N; copy so the column assignments below never write
            # into a view of the parsed table.
            if n_samples:
                df = df.head(n_samples).copy()

            # Process Run Style (Leg Type)
            if 'ÈÄöÈÅé' in df.columns:
                df['run_style_val'] = df['ÈÄöÈÅé'].apply(self.extract_run_style)
            else:
                df['run_style_val'] = 3  # Unknown

            # Map source headers to the English names used downstream.
            column_map = {
                'Êó•‰ªò': 'date',
                'ÈñãÂÇ¨': 'venue',
                'Â§©Ê∞ó': 'weather',
                '„É¨„Éº„ÇπÂêç': 'race_name',
                'ÁùÄÈ†Ü': 'rank',
                'Êû†Áï™': 'waku',
                'È¶¨Áï™': 'umaban',
                'È®éÊâã': 'jockey',
                'Êñ§Èáè': 'weight_carried',
                'È¶¨Â†¥': 'condition',  # track condition
                '„Çø„Ç§„É†': 'time',
                'ÁùÄÂ∑Æ': 'margin',
                '‰∏ä„Çä': 'last_3f',
                'ÈÄöÈÅé': 'passing',
                'È¶¨‰ΩìÈáç': 'horse_weight',
                'run_style_val': 'run_style',
                'ÂçòÂãù': 'odds',
                '„Ç™„ÉÉ„Ç∫': 'odds',
                'Ë∑ùÈõ¢': 'raw_distance'  # surface marker followed by metres
            }

            # Rename available columns
            df = df.rename(columns=column_map)

            # Two source headers map to 'odds'. If both are present the frame
            # would carry duplicate labels and df['odds'] would be a
            # DataFrame, breaking the numeric coercion below; keep the first.
            df = df.loc[:, ~df.columns.duplicated()]

            # Extract Surface and Distance from 'raw_distance'
            if 'raw_distance' in df.columns:
                def parse_dist(x):
                    if not isinstance(x, str):
                        return None, None
                    surf = None
                    dist = None
                    if 'Ëäù' in x:
                        surf = 'Ëäù'
                    elif '„ÉÄ' in x:
                        surf = '„ÉÄ'
                    elif 'Èöú' in x:
                        surf = 'Èöú'

                    # First run of digits is the distance
                    match = re.search(r'(\d+)', x)
                    if match:
                        dist = int(match.group(1))
                    return surf, dist

                parsed = df['raw_distance'].apply(parse_dist)
                df['course_type'] = parsed.apply(lambda x: x[0])
                df['distance'] = parsed.apply(lambda x: x[1])
            else:
                df['course_type'] = None
                df['distance'] = None

            # Coerce numeric
            if 'rank' in df.columns:
                df['rank'] = pd.to_numeric(df['rank'], errors='coerce')

            if 'odds' in df.columns:
                df['odds'] = pd.to_numeric(df['odds'], errors='coerce')

            # Guarantee every expected output column exists
            for target_col in list(column_map.values()) + ['course_type', 'distance']:
                if target_col not in df.columns:
                    df[target_col] = None

            return df

        except Exception as e:
            print(f"Error parsing past races for {horse_id}: {e}")
            return pd.DataFrame()

    def extract_run_style(self, passing_str):
        """
        Convert a passing-order string (e.g. "1-1-1", "10-10-12") to a run style.

        Only the first position in the string is used:
            1: Nige (Escape)   - first position == 1
            2: Senkou (Leader) - first position 2-4
            3: Sashi (Mid)     - first position 5-9
            4: Oikomi (Chaser) - first position >= 10

        Returns the integer code; non-string or digit-free input yields 3.
        """
        if not isinstance(passing_str, str):
            return 3  # Default to Mid

        try:
            # Keep digits and hyphens only: "1-1-1" -> [1, 1, 1]
            cleaned = re.sub(r'[^0-9-]', '', passing_str)
            parts = [int(p) for p in cleaned.split('-') if p]
        except ValueError:
            return 3

        if not parts:
            return 3

        first_corner = parts[0]
        if first_corner == 1:
            return 1  # Nige
        if first_corner <= 4:
            return 2  # Senkou
        if first_corner <= 9:
            return 3  # Sashi
        return 4  # Oikomi

    @staticmethod
    def _history_records(df_past):
        """Turn a ``get_past_races`` frame into a list of plain dicts."""
        if df_past is None or df_past.empty:
            return []
        # get_past_races has already renamed the source headers, so the
        # English names must be used here; the original headers no longer
        # exist in the frame.
        fields = ("date", "race_name", "rank", "passing", "run_style", "time")
        return [
            {field: row.get(field) for field in fields}
            for _, row in df_past.iterrows()
        ]

    def scrape_race_with_history(self, race_id):
        """
        Scrape a race result page and attach each runner's recent history.

        The race date is extracted from the page (title first, then the
        ``RaceData01`` block) and passed to ``get_past_races`` so that only
        races before it are included in each history.

        Returns:
            A list with one dict per runner (``race_id``, ``horse_id``,
            ``horse_name``, ``rank``, ``history``), or ``None`` when the page
            or its result table cannot be found. Runners without a horse id
            in their link are skipped.
        """
        url = f"https://race.netkeiba.com/race/result.html?race_id={race_id}"
        soup = self._get_soup(url)
        if not soup:
            return None

        # 0. Extract Race Date for Leakage Prevention
        race_date = None
        date_pattern = r'(\d{4})Âπ¥(\d{1,2})Êúà(\d{1,2})Êó•'
        try:
            # Strategy 1: <title>
            if soup.title:
                match = re.search(date_pattern, soup.title.text)
                if match:
                    y, m, d = match.groups()
                    race_date = datetime(int(y), int(m), int(d))

            # Strategy 2: .RaceData01 block of the result page
            if not race_date:
                rd1 = soup.find("div", class_="RaceData01")
                if rd1:
                    match = re.search(date_pattern, rd1.text)
                    if match:
                        y, m, d = match.groups()
                        race_date = datetime(int(y), int(m), int(d))

        except Exception as e:
            print(f"Warning: Could not extract race date: {e}")

        # 1. Parse Main Result Table
        result_data = []
        table = soup.find("table", id="All_Result_Table")
        if not table:
            return None

        rows = table.find_all("tr", class_="HorseList")
        print(f"Found {len(rows)} horses in race {race_id} ({race_date.date() if race_date else 'Unknown Date'}). Fetching histories...")

        for row in rows:
            # Extract basic info
            rank_elem = row.select_one(".Rank")
            rank = rank_elem.text.strip() if rank_elem else ""

            horse_name_elem = row.select_one(".Horse_Name a")
            horse_name = horse_name_elem.text.strip() if horse_name_elem else ""
            horse_url = horse_name_elem.get("href") if horse_name_elem else ""

            # Extract ID from URL
            horse_id = None
            if horse_url:
                match = re.search(r'/horse/(\d+)', horse_url)
                if match:
                    horse_id = match.group(1)

            if not horse_id:
                print(f"  Skipping {horse_name} (No ID)")
                continue

            # 2. Get Past History (with Leakage Prevention)
            df_past = self.get_past_races(horse_id, target_date=race_date, n_samples=5)

            # 3. Structure Data
            result_data.append({
                "race_id": race_id,
                "horse_id": horse_id,
                "horse_name": horse_name,
                "rank": rank,
                "history": self._history_records(df_past),
            })

        return result_data

    def get_horse_profile(self, horse_id):
        """
        Fetch the pedigree page of ``horse_id`` and extract father, mother and BMS.

        Returns:
            ``None`` when the page cannot be fetched; otherwise a dict with
            keys ``father``, ``mother`` and ``bms``. A value stays an empty
            string when it could not be located in the pedigree table.
        """
        # Use pedigree page for reliable bloodline data
        url = f"https://db.netkeiba.com/horse/ped/{horse_id}/"
        soup = self._get_soup(url)
        if not soup:
            return None

        data = {
            "father": "",
            "mother": "",
            "bms": ""
        }

        try:
            table = soup.select_one("table.blood_table")
            if table:
                rows = table.find_all("tr")
                # The code expects the father in row 0 and the mother in
                # row 16, i.e. a 5-generation table of (usually) 32 rows with
                # each parent spanning 16 of them.
                if len(rows) >= 17:
                    # Father: Row 0, Col 0
                    r0 = rows[0].find_all("td")
                    if r0:
                        txt = r0[0].text.strip()
                        # The cell holds the name followed by birth details
                        # on later lines; keep the first line only.
                        data["father"] = txt.split('\n')[0].strip()

                    # Mother & BMS: Row 16
                    r16 = rows[16].find_all("td")
                    if len(r16) >= 2:
                        # Mother
                        m_txt = r16[0].text.strip()
                        data["mother"] = m_txt.split('\n')[0].strip()

                        # BMS (Mother's Father)
                        bms_txt = r16[1].text.strip()
                        data["bms"] = bms_txt.split('\n')[0].strip()

        except Exception as e:
            print(f"Error parsing profile for {horse_id}: {e}")

        return data

    def get_race_metadata(self, race_id):
        """
        Fetch metadata for ``race_id`` from the netkeiba result page.

        Returns:
            ``None`` when the page cannot be fetched; otherwise a dict with
            keys ``race_name``, ``date``, ``venue``, ``course_type``,
            ``distance``, ``weather``, ``condition``, ``turn`` and
            ``race_id``. Fields that could not be parsed stay empty strings
            (``venue`` is never filled by this method).
        """
        url = f"https://race.netkeiba.com/race/result.html?race_id={race_id}"
        soup = self._get_soup(url)
        if not soup:
            return None

        data = {
            "race_name": "",
            "date": "",
            "venue": "",
            "course_type": "",
            "distance": "",
            "weather": "",
            "condition": "",
            "turn": "",  # turn direction
            "race_id": race_id
        }

        try:
            # Race Name
            title_elem = soup.select_one(".RaceName")
            if title_elem:
                data["race_name"] = title_elem.text.strip()

            # Weather, track condition, course and turn all come from the
            # text of the .RaceData01 block.
            rd1 = soup.select_one(".RaceData01")

            if rd1:
                txt = rd1.text.strip()

                # Weather
                if "Â§©ÂÄô:Êô¥" in txt: data["weather"] = "Êô¥"
                elif "Â§©ÂÄô:Êõá" in txt: data["weather"] = "Êõá"
                elif "Â§©ÂÄô:Â∞èÈõ®" in txt: data["weather"] = "Â∞èÈõ®"
                elif "Â§©ÂÄô:Èõ®" in txt: data["weather"] = "Èõ®"
                elif "Â§©ÂÄô:Èõ™" in txt: data["weather"] = "Èõ™"

                # Condition (the second test matches on the first character
                # only and stores the two-character label)
                if "È¶¨Â†¥:ËâØ" in txt: data["condition"] = "ËâØ"
                elif "È¶¨Â†¥:Á®ç" in txt: data["condition"] = "Á®çÈáç"
                elif "È¶¨Â†¥:Èáç" in txt: data["condition"] = "Èáç"
                elif "È¶¨Â†¥:‰∏çËâØ" in txt: data["condition"] = "‰∏çËâØ"

                # Course & Distance: surface marker followed by digits and "m"
                match = re.search(r'(Ëäù|„ÉÄ|Èöú)(\d+)m', txt)
                if match:
                    ctype_raw = match.group(1)
                    if ctype_raw == "Ëäù": data["course_type"] = "Ëäù"
                    elif ctype_raw == "„ÉÄ": data["course_type"] = "„ÉÄ„Éº„Éà"
                    elif ctype_raw == "Èöú": data["course_type"] = "ÈöúÂÆ≥"

                    data["distance"] = match.group(2)

                # Turn direction (right / left / straight), searched anywhere
                # in the block text
                if "Âè≥" in txt: data["turn"] = "Âè≥"
                elif "Â∑¶" in txt: data["turn"] = "Â∑¶"
                elif "Áõ¥Á∑ö" in txt: data["turn"] = "Áõ¥"

            # Date: taken from the <title> tag, kept in its original text form
            meta_title = soup.title.text if soup.title else ""
            match_date = re.search(r'(\d{4}Âπ¥\d{1,2}Êúà\d{1,2}Êó•)', meta_title)
            if match_date:
                data["date"] = match_date.group(1)

        except Exception as e:
            print(f"Error parsing metadata for {race_id}: {e}")

        return data

if __name__ == "__main__":
    # Manual smoke test: fetch one horse's history and, when nothing comes
    # back, print diagnostics about which tables the page contains.
    scraper = RaceScraper()
    print("Running test...")
    # Example: Do Deuce (2019105219)
    df = scraper.get_past_races("2019105219")
    if df.empty:
        print("DF is empty. Checking raw soup for 'db_h_race_results'...")
        soup = scraper._get_soup("https://db.netkeiba.com/horse/result/2019105219/")
        if soup:
            t = soup.select_one("table.db_h_race_results")
            print(f"Selector 'table.db_h_race_results' found: {t is not None}")
            if not t:
                print("Trying fallback 'table' with 'ÁùÄÈ†Ü'...")
                tables = soup.find_all("table")
                found = False
                for i, tbl in enumerate(tables):
                    print(f"Table {i} classes: {tbl.get('class')}")
                    if "ÁùÄÈ†Ü" in tbl.text or "ÁùÄ È†Ü" in tbl.text or "Êó•‰ªò" in tbl.text:
                        print("Found a table with 'ÁùÄÈ†Ü/Êó•‰ªò'.")
                        t = tbl
                        found = True
                        break
                if not found:
                    print("No table with 'ÁùÄÈ†Ü' found in soup.")
                    print("Soup snippet:", soup.text[:500])
                else:
                    # Retry parsing with the table found by the fallback.
                    try:
                        # Wrap the markup in StringIO, as get_past_races does;
                        # passing literal HTML to read_html is deprecated.
                        df = pd.read_html(io.StringIO(str(t)))[0]
                        print("Retry DF Head:")
                        print(df.head())
                    except Exception as e:
                        print(f"Retry parsing failed: {e}")
        else:
            print("Soup is None.")
    else:
        print(df.head())
        print("Columns:", df.columns)


In [None]:
import pandas as pd
import numpy as np
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

# Leftover import guard. The body of the try is only `pass`, so ImportError
# cannot be raised here and the handler below never runs; no path
# manipulation happens in this cell either.
# NOTE(review): presumably this replaced a project import of RaceScraper
# (the message below still refers to one) - confirm before removing.

try:
    pass # Replaced import
except ImportError:
    print("‚ùå Could not import RaceScraper. Make sure you are running this from the repository root or 'ids' folder structure is correct.")
    # Exit with a non-zero status
    sys.exit(1)

def fill_bloodline_data(df_path, mode="JRA"):
    """
    Backfill missing bloodline data (father, mother, bms) in a dataset file.

    Rows whose ``father`` is null, empty or the string 'nan' select the
    horses to look up. Each horse's pedigree is fetched sequentially with
    ``RaceScraper.get_horse_profile`` and every non-empty value found is
    written to all rows of that horse. The file at ``df_path`` is rewritten
    in place after each chunk that produced results, so progress survives an
    interruption.

    Args:
        df_path: Path to a ``.parquet`` file; any other extension is read
            and written as CSV.
        mode: Label used in the log output only.

    Returns:
        None. Problems (missing file, unreadable file, missing ``horse_id``
        column) are reported with ``print`` and end the run early.
    """
    print(f"\nüê¥ Starting Bloodline Backfill for {mode} ({os.path.basename(df_path)})")

    if not os.path.exists(df_path):
        print(f"‚ùå File not found: {df_path}")
        return

    is_parquet = df_path.endswith('.parquet')

    # Load Data
    try:
        if is_parquet:
            df = pd.read_parquet(df_path)
        else:
            df = pd.read_csv(df_path, low_memory=False)
            # Ensure IDs are strings; strip a trailing ".0" left by a float parse
            for id_col in ('horse_id', 'race_id'):
                if id_col in df.columns:
                    df[id_col] = df[id_col].astype(str).str.replace(r'\.0$', '', regex=True)
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return

    if 'horse_id' not in df.columns:
        print("‚ùå 'horse_id' column missing.")
        return

    # Ensure Columns Exist
    bloodline_cols = ('father', 'mother', 'bms')
    for col in bloodline_cols:
        if col not in df.columns:
            df[col] = None

    # Identify Missing: 'father' is null/empty AND 'horse_id' is all digits
    mask_missing = (df['father'].isna()) | (df['father'] == '') | (df['father'] == 'nan')
    target_ids = df.loc[mask_missing, 'horse_id'].dropna().unique()
    target_ids = [hid for hid in target_ids if str(hid).isdigit()]

    total_targets = len(target_ids)
    print(f"üéØ Found {total_targets} horses with missing bloodline data.")

    if total_targets == 0:
        print("‚úÖ No missing bloodline data found.")
        return

    scraper = RaceScraper()

    print(f"üöÄ Fetching data for {total_targets} horses (Sequential)...")

    # Work in chunks so progress can be saved along the way
    CHUNK_SIZE = 1000

    for i in range(0, total_targets, CHUNK_SIZE):
        chunk = target_ids[i:i+CHUNK_SIZE]
        print(f"  Processing chunk {i}-{i+len(chunk)}...")

        results = {}
        for hid in tqdm(chunk, leave=False):
            try:
                # Extra pause on top of the scraper's own delay
                time.sleep(0.5)
                data = scraper.get_horse_profile(hid)
            except Exception as e:
                # Best effort: report and carry on with the next horse.
                print(f"  Error fetching pedigree for {hid}: {e}")
                continue
            if data:
                results[hid] = data

        if not results:
            continue

        print("  Applying updates to DataFrame...")
        mask_chunk = df['horse_id'].isin(results.keys())
        chunk_horse_ids = df.loc[mask_chunk, 'horse_id']

        for col in bloodline_cols:
            # Only non-empty scraped values take part. The scraper returns ''
            # for anything it could not parse, and mapping '' would overwrite
            # an existing value because fillna() does not treat '' as missing.
            col_map = {h: d[col] for h, d in results.items() if d.get(col)}
            df.loc[mask_chunk, col] = chunk_horse_ids.map(col_map).fillna(df.loc[mask_chunk, col])

        # Save
        print(f"  üíæ Saving progress to {df_path}...")
        if is_parquet:
            df.to_parquet(df_path, index=False)
        else:
            df.to_csv(df_path, index=False)

    print("‚úÖ Bloodline backfill complete.")


def fill_history_data(df_path, mode="JRA"):
    """
    Backfill missing past race history (past_1_date, past_1_rank, ...).

    Target rows are those where ``past_1_date`` is NaN and the race name does
    not mark a debut race. For every such row the horse's full history is
    fetched with ``RaceScraper.get_past_races``, restricted to races strictly
    before the row's own race date, and the five most recent ones are written
    to the ``past_{n}_*`` columns. The file is rewritten in place after each
    chunk of horses.

    Args:
        df_path: Path to a ``.parquet`` file; any other extension is read
            and written as CSV.
        mode: Label used in the log output only.

    Returns:
        None. Problems are reported with ``print`` and end the run early.
    """
    print(f"\nüìú Starting History Backfill for {mode} ({os.path.basename(df_path)})")

    if not os.path.exists(df_path):
        print(f"‚ùå File not found: {df_path}")
        return

    is_parquet = df_path.endswith('.parquet')

    # Load Data
    try:
        if is_parquet:
            df = pd.read_parquet(df_path)
        else:
            df = pd.read_csv(df_path, low_memory=False)
            if 'horse_id' in df.columns:
                df['horse_id'] = df['horse_id'].astype(str).str.replace(r'\.0$', '', regex=True)
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return

    if 'horse_id' not in df.columns:
        print("‚ùå 'horse_id' column missing.")
        return

    # A file that has never been backfilled may lack the column entirely;
    # treat that as "missing everywhere" instead of failing with KeyError.
    if 'past_1_date' not in df.columns:
        df['past_1_date'] = None

    # Filter Targets
    # 1. Not a debut race
    if '„É¨„Éº„ÇπÂêç' in df.columns:
        mask_shinba = df['„É¨„Éº„ÇπÂêç'].astype(str).str.contains('Êñ∞È¶¨|„É°„Ç§„ÇØ„Éá„Éì„É•„Éº', na=False)
    else:
        # Must be a boolean Series: `~` on a plain Python bool is integer
        # bitwise inversion (~False == -1), not logical negation.
        mask_shinba = pd.Series(False, index=df.index)

    # 2. Missing History
    mask_missing = df['past_1_date'].isna() & (~mask_shinba)

    target_rows = df[mask_missing]
    target_ids = target_rows['horse_id'].unique()
    target_ids = [hid for hid in target_ids if str(hid).isdigit()]

    total_targets = len(target_ids)
    print(f"üéØ Found {len(target_rows)} rows ({total_targets} unique horses) missing history.")

    if total_targets == 0:
        print("‚úÖ No missing history found.")
        return

    # The race date of each row is required to keep only earlier races.
    date_col = 'Êó•‰ªò'
    if date_col not in df.columns:
        print(f"‚ùå '{date_col}' column missing.")
        return

    scraper = RaceScraper()

    # Fields copied from each history row into past_{n}_<field>.
    history_fields = (
        'date', 'rank', 'time', 'race_name', 'run_style', 'last_3f',
        'horse_weight', 'jockey', 'condition', 'weather', 'distance',
        'course_type', 'odds',
    )

    print(f"üöÄ Fetching history for {total_targets} horses (Sequential)...")

    CHUNK_SIZE = 500
    for i in range(0, total_targets, CHUNK_SIZE):
        chunk_ids = target_ids[i:i+CHUNK_SIZE]

        # Rebuilt for every chunk so memory use stays bounded.
        history_cache = {}
        for hid in tqdm(chunk_ids, leave=False, desc=f"Chunk {i//CHUNK_SIZE+1}"):
            try:
                time.sleep(0.5)
                hist_df = scraper.get_past_races(hid, n_samples=None)
            except Exception as e:
                # Best effort: report and carry on with the next horse.
                print(f"  Error fetching history for {hid}: {e}")
                continue
            if hist_df is None or hist_df.empty or 'date' not in hist_df.columns:
                continue
            hist_df['date_dt'] = pd.to_datetime(hist_df['date'], format='%Y/%m/%d', errors='coerce')
            history_cache[hid] = hist_df

        # Applied row by row because the usable history depends on each
        # row's own race date.
        print("  Applying history to missing rows...")

        chunk_mask = df['horse_id'].isin(chunk_ids) & mask_missing
        affected_indices = df[chunk_mask].index

        for idx in tqdm(affected_indices, desc="Updating Rows"):
            row = df.loc[idx]
            hist_df = history_cache.get(row['horse_id'])
            if hist_df is None:
                continue

            try:
                # Accept both the year/month/day-suffixed form and 'YYYY/MM/DD'
                race_date_str = str(row[date_col]).replace('Âπ¥', '/').replace('Êúà', '/').replace('Êó•', '')
                current_date = pd.to_datetime(race_date_str, errors='coerce')
                if pd.isna(current_date):
                    continue

                # Leakage prevention: only races strictly before this one,
                # newest first, at most five.
                valid_hist = hist_df[hist_df['date_dt'] < current_date]
                valid_hist = valid_hist.sort_values('date_dt', ascending=False).head(5)

                for n, (_, h_row) in enumerate(valid_hist.iterrows(), start=1):
                    prefix = f"past_{n}_"
                    for field in history_fields:
                        df.at[idx, prefix + field] = h_row.get(field)

            except Exception as e:
                # Best effort per row, but never silently.
                print(f"  Error updating row {idx}: {e}")

        # Save
        print(f"  üíæ Saving progress to {df_path}...")
        if is_parquet:
            df.to_parquet(df_path, index=False)
        else:
            df.to_csv(df_path, index=False)

    print("‚úÖ History backfill complete.")


def fill_race_metadata(df_path, mode="JRA"):
    """
    Backfill missing race metadata (course type, distance, weather, condition).

    Races are selected when the course-type, distance or weather column is
    null or empty. Each race is fetched sequentially with
    ``RaceScraper.get_race_metadata``; a result is used only when it carries
    a course type, and every non-empty value is written to all rows of that
    race. The file is rewritten in place after each chunk that produced
    results.

    Args:
        df_path: Path to a ``.parquet`` file; any other extension is read
            and written as CSV.
        mode: Label used in the log output only.

    Returns:
        None. Problems are reported with ``print`` and end the run early.
    """
    print(f"\nüèüÔ∏è Starting Race Metadata Backfill for {mode} ({os.path.basename(df_path)})")

    if not os.path.exists(df_path):
        print(f"‚ùå File not found: {df_path}")
        return

    is_parquet = df_path.endswith('.parquet')

    # Load Data
    try:
        if is_parquet:
            df = pd.read_parquet(df_path)
        else:
            df = pd.read_csv(df_path, low_memory=False)
            if 'race_id' in df.columns:
                df['race_id'] = df['race_id'].astype(str).str.replace(r'\.0$', '', regex=True)
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return

    if 'race_id' not in df.columns:
        print("‚ùå 'race_id' column missing.")
        return

    # (dataset column, key in the scraper's result dict)
    column_sources = (
        ('„Ç≥„Éº„Çπ„Çø„Ç§„Éó', 'course_type'),
        ('Ë∑ùÈõ¢', 'distance'),
        ('Â§©ÂÄô', 'weather'),
        ('È¶¨Â†¥Áä∂ÊÖã', 'condition'),
    )
    for col, _ in column_sources:
        if col not in df.columns:
            df[col] = None

    # Identify missing rows (the condition column is filled when available
    # but does not select races on its own)
    missing_mask = (df['„Ç≥„Éº„Çπ„Çø„Ç§„Éó'].isna()) | (df['„Ç≥„Éº„Çπ„Çø„Ç§„Éó'] == '') | \
                   (df['Ë∑ùÈõ¢'].isna()) | (df['Ë∑ùÈõ¢'] == '') | \
                   (df['Â§©ÂÄô'].isna()) | (df['Â§©ÂÄô'] == '')

    target_race_ids = df.loc[missing_mask, 'race_id'].unique()
    target_race_ids = [rid for rid in target_race_ids if str(rid).isdigit()]

    total_targets = len(target_race_ids)
    print(f"üéØ Found {total_targets} races with missing metadata.")

    if total_targets == 0:
        print("‚úÖ No missing metadata found.")
        return

    scraper = RaceScraper()

    print(f"üöÄ Fetching metadata for {total_targets} races (Sequential)...")

    CHUNK_SIZE = 200
    for i in range(0, total_targets, CHUNK_SIZE):
        chunk = target_race_ids[i:i+CHUNK_SIZE]

        results = {}
        for rid in tqdm(chunk, leave=False):
            try:
                time.sleep(0.5)
                data = scraper.get_race_metadata(rid)
            except Exception as e:
                # Best effort: report and carry on with the next race.
                print(f"  Error fetching metadata for {rid}: {e}")
                continue
            if data and data.get('course_type'):
                results[rid] = data

        if not results:
            continue

        print("  Applying metadata updates...")
        mask = df['race_id'].isin(results.keys())
        race_ids = df.loc[mask, 'race_id']

        for col, key in column_sources:
            # Empty scraped values are left out so they never replace data
            # that is already present.
            value_map = {rid: d[key] for rid, d in results.items() if d.get(key)}
            df.loc[mask, col] = race_ids.map(value_map).fillna(df.loc[mask, col])

        print(f"  üíæ Saving progress to {df_path}...")
        if is_parquet:
            df.to_parquet(df_path, index=False)
        else:
            df.to_csv(df_path, index=False)

    print("‚úÖ Race metadata backfill complete.")


# if __name__ == "__main__":
#     # Example Usage
#     print("Usage: Select mode to run.")
#     # Uncomment based on need
#     # fill_bloodline_data('data/raw/database.csv', mode="JRA")
#     # fill_history_data('data/raw/database.csv', mode="JRA")
    
#     # fill_bloodline_data('data/raw/database_nar.csv', mode="NAR")
#     # fill_history_data('data/raw/database_nar.csv', mode="NAR")



In [None]:
# Settings
DATA_DIR = '/content/drive/MyDrive/dai-keiba/data/raw'

# Execution: run the three backfill steps on the NAR database when it exists.
csv_path = os.path.join(DATA_DIR, 'database_nar.csv')
if not os.path.exists(csv_path):
    print(f'{csv_path} „ÅåË¶ã„Å§„Åã„Çä„Åæ„Åõ„Çì„ÄÇ')
else:
    print(f'Âá¶ÁêÜÂØæË±°: {csv_path}')
    fill_bloodline_data(csv_path, mode='NAR')
    fill_history_data(csv_path, mode='NAR')
    fill_race_metadata(csv_path, mode='NAR')
