# 🏇 NAR 全レース取得
以下の設定変数を変更して実行してください。NAR（地方競馬）のデータを日付順に取得します。

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
import re
import time
from datetime import datetime

class RaceScraper:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def _get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Every call first sleeps for one second to throttle the request rate.
        The response text is decoded with the encoding that ``requests``
        detects from the body (``apparent_encoding``).

        Returns:
            A ``BeautifulSoup`` object when the server answers with HTTP 200,
            otherwise ``None``. Any exception raised while fetching or
            parsing is printed and also results in ``None``.
        """
        try:
            time.sleep(1)  # throttle: at most one request per second
            page = requests.get(url, headers=self.headers, timeout=10)
            page.encoding = page.apparent_encoding
            if page.status_code != 200:
                return None
            return BeautifulSoup(page.text, 'html.parser')
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_past_races(self, horse_id, n_samples=5):
        """
        Fetches past n_samples race results for a given horse_id from netkeiba db.
        Returns a DataFrame of past races.
        """
        url = f"https://db.netkeiba.com/horse/result/{horse_id}/"
        soup = self._get_soup(url)
        if not soup:
            return pd.DataFrame()

        # The results are usually in a table with class "db_h_race_results"
        table = soup.select_one("table.db_h_race_results")
        if not table:
            # Try to find any table with "ÁùÄÈ†Ü"
            tables = soup.find_all("table")
            for t in tables:
                if "ÁùÄÈ†Ü" in t.text:
                    table = t
                    break
        
        if not table:
            return pd.DataFrame()

        # Parse table
        # We need to manually parse to get clean data and handle links if needed (though for past data, text is mostly fine)
        # pd.read_html is easier for the table
        try:
            df = pd.read_html(io.StringIO(str(table)))[0]
            
            # Basic cleaning
            df = df.dropna(how='all')
            
            # The columns in db.netkeiba are roughly:
            # Êó•‰ªò, ÈñãÂÇ¨, Â§©Ê∞ó, R, „É¨„Éº„ÇπÂêç, Êò†ÂÉè, È†≠Êï∞, Êû†Áï™, ... ÁùÄÈ†Ü, ... ÈÄöÈÅé, ...
            
            # We want to keep: Date, Race Name, Course info, Rank, Time, Passing (Style)
            
            # Normalize column names (remove spaces/newlines)
            df.columns = df.columns.astype(str).str.replace(r'\s+', '', regex=True)

            # Filter rows that look like actual races (Date column exists)
            if 'Êó•‰ªò' in df.columns:
                df['date_obj'] = pd.to_datetime(df['Êó•‰ªò'], format='%Y/%m/%d', errors='coerce')
                df = df.dropna(subset=['date_obj'])
                df = df.sort_values('date_obj', ascending=False)
                
            # Take top N
            if n_samples:
                df = df.head(n_samples)
            
            # Process Run Style (Leg Type)
            if 'ÈÄöÈÅé' in df.columns:
                df['run_style_val'] = df['ÈÄöÈÅé'].apply(self.extract_run_style)
            else:
                df['run_style_val'] = 3 # Unknown

            # Extract/Rename Columns
            # We want: Êó•‰ªò, ÈñãÂÇ¨, Â§©Ê∞ó, R, „É¨„Éº„ÇπÂêç, Êò†ÂÉè, È†≠Êï∞, Êû†Áï™, ... ÁùÄÈ†Ü, ... ÈÄöÈÅé, ...
            # Important: '‰∏ä„Çä' (3F), 'È¶¨‰ΩìÈáç', 'È®éÊâã'
            
            # Map standard columns if they exist
            column_map = {
                'Êó•‰ªò': 'date',
                'ÈñãÂÇ¨': 'venue',
                'Â§©Ê∞ó': 'weather',
                '„É¨„Éº„ÇπÂêç': 'race_name',
                'ÁùÄÈ†Ü': 'rank',
                'Êû†Áï™': 'waku',
                'È¶¨Áï™': 'umaban',
                'È®éÊâã': 'jockey',
                'Êñ§Èáè': 'weight_carried',
                'È¶¨Â†¥': 'condition', # ËâØ/Èáç/Á®çÈáç etc.
                '„Çø„Ç§„É†': 'time',
                'ÁùÄÂ∑Æ': 'margin',
                '‰∏ä„Çä': 'last_3f',
                'ÈÄöÈÅé': 'passing',
                'È¶¨‰ΩìÈáç': 'horse_weight',
                'run_style_val': 'run_style',
                'ÂçòÂãù': 'odds',
                '„Ç™„ÉÉ„Ç∫': 'odds',
                'Ë∑ùÈõ¢': 'raw_distance' # e.g. "Ëäù1600"
            }
            
            # Rename available columns
            df.rename(columns=column_map, inplace=True)
            
            # Extract Surface and Distance from 'raw_distance'
            if 'raw_distance' in df.columns:
                def parse_dist(x):
                    if not isinstance(x, str): return None, None
                    # "Ëäù1600", "„ÉÄ1200", "Èöú3000"
                    # Sometimes "Ëäù1600" or just "1600"
                    surf = None
                    dist = None
                    if 'Ëäù' in x: surf = 'Ëäù'
                    elif '„ÉÄ' in x: surf = '„ÉÄ'
                    elif 'Èöú' in x: surf = 'Èöú'
                    
                    # Extract number
                    match = re.search(r'(\d+)', x)
                    if match:
                        dist = int(match.group(1))
                    return surf, dist

                parsed = df['raw_distance'].apply(parse_dist)
                df['course_type'] = parsed.apply(lambda x: x[0])
                df['distance'] = parsed.apply(lambda x: x[1])
            else:
                df['course_type'] = None
                df['distance'] = None

            # Coerce numeric
            if 'rank' in df.columns:
                df['rank'] = pd.to_numeric(df['rank'], errors='coerce')
            
            if 'odds' in df.columns:
                 df['odds'] = pd.to_numeric(df['odds'], errors='coerce')
            
            # Fill missing
            for target_col in list(column_map.values()) + ['course_type', 'distance']:
                if target_col not in df.columns:
                    df[target_col] = None
                
            return df
            
        except Exception as e:
            print(f"Error parsing past races for {horse_id}: {e}")
            return pd.DataFrame()

    def extract_run_style(self, passing_str):
        """
        Converts passing order string (e.g., "1-1-1", "10-10-12") to run style (1,2,3,4).
        1: Nige (Escape) - Lead at 1st corner
        2: Senkou (Leader) - Within first ~4 or so
        3: Sashi (Mid) - Midpack
        4: Oikomi (Chaser) - Back
        Returns integer code.
        """
        if not isinstance(passing_str, str):
            return 3 # Default to Mid
            
        # Clean string "1-1-1" -> [1, 1, 1]
        try:
            cleaned = re.sub(r'[^0-9-]', '', passing_str)
            parts = [int(p) for p in cleaned.split('-') if p]
            
            if not parts:
                return 3
                
            first_corner = parts[0]
            
            # Heuristics
            if first_corner == 1:
                return 1 # Nige
            elif first_corner <= 4:
                return 2 # Senkou
            elif first_corner <= 9: # Assuming standard field size of 10-16, 9 is mid-ish limit? 
                # Actually "Sashi" is usually mid-rear. 
                # Let's say: 1=Lead, 2-4=Front, 5-10=Mid, >10=Back
                return 3 # Sashi
            else:
                return 4 # Oikomi
                
        except:
            return 3

    def scrape_race_with_history(self, race_id):
        """
        Detailed scraper that enters a race_result page, finding horse IDs, 
        then fetches history for each horse.
        Returns a dictionary or structured object with the race result + history.
        """
        url = f"https://race.netkeiba.com/race/result.html?race_id={race_id}"
        soup = self._get_soup(url)
        if not soup:
            return None
            
        # 1. Parse Main Result Table to get Horse IDs and basic result
        # Note: auto_scraper already does some of this, but we need Horse IDs specifically.
        # "All_Result_Table"
        
        result_data = []
        
        table = soup.find("table", id="All_Result_Table")
        if not table:
            return None
            
        rows = table.find_all("tr", class_="HorseList")
        
        print(f"Found {len(rows)} horses in race {race_id}. Fetching histories...")
        
        for row in rows:
            # Extract basic info
            rank_elem = row.select_one(".Rank")
            rank = rank_elem.text.strip() if rank_elem else ""
            
            horse_name_elem = row.select_one(".Horse_Name a")
            horse_name = horse_name_elem.text.strip() if horse_name_elem else ""
            horse_url = horse_name_elem.get("href") if horse_name_elem else ""
            
            # Extract ID from URL
            # https://db.netkeiba.com/horse/2018105027
            horse_id = None
            if horse_url:
                match = re.search(r'/horse/(\d+)', horse_url)
                if match:
                    horse_id = match.group(1)
            
            if not horse_id:
                print(f"  Skipping {horse_name} (No ID)")
                continue

            print(f"  Fetching history for {horse_name} ({horse_id})...")
            
            # 2. Get Past History
            df_past = self.get_past_races(horse_id, n_samples=5)
            
            # 3. Structure Data
            # converting df_past to a list of dicts or flattened fields
            history = []
            if not df_past.empty:
                for idx, r in df_past.iterrows():
                    # Extract relevant columns
                    # We need at least: Rank, RunStyle, Time(Seconds?), Pace?
                    # For now just dump raw-ish data
                    hist_item = {
                        "date": r.get('Êó•‰ªò'),
                        "race_name": r.get('„É¨„Éº„ÇπÂêç'),
                        "rank": r.get('ÁùÄÈ†Ü'),
                        "passing": r.get('ÈÄöÈÅé'),
                        "run_style": r.get('run_style_val'),
                        "time": r.get('„Çø„Ç§„É†'),
                        # Add more as needed for Feature Engineering
                    }
                    history.append(hist_item)
            
            entry = {
                "race_id": race_id,
                "horse_id": horse_id,
                "horse_name": horse_name,
                "rank": rank,
                "history": history
            }
            result_data.append(entry)
            
        return result_data

    def get_horse_profile(self, horse_id):
        """
        Fetches horse profile to get pedigree (Father, Mother, Grandfather(BMS)).
        Returns a dictionary or None.
        """
        # Use pedigree page for reliable bloodline data
        url = f"https://db.netkeiba.com/horse/ped/{horse_id}/"
        soup = self._get_soup(url)
        if not soup:
            return None
        
        # Parse Blood Table
        # table class="blood_table"
        
        data = {
            "father": "",
            "mother": "",
            "bms": ""
        }
        
        try:
            table = soup.select_one("table.blood_table")
            if table:
                rows = table.find_all("tr")
                # 5-generation table has 32 rows usually
                # Father at Row 0 (rowspan 16)
                # Mother at Row 16 (rowspan 16)
                
                if len(rows) >= 17:
                    # Father: Row 0, Col 0
                    r0 = rows[0].find_all("td")
                    if r0:
                        txt = r0[0].text.strip()
                        # Clean: "„Çπ„ÇØ„É™„Éº„É≥„Éí„Éº„É≠„Éº\n2004 Ê†óÊØõ..." -> "„Çπ„ÇØ„É™„Éº„É≥„Éí„Éº„É≠„Éº"
                        # Take first line
                        data["father"] = txt.split('\n')[0].strip()
                        
                    # Mother & BMS: Row 16
                    r16 = rows[16].find_all("td")
                    if len(r16) >= 2:
                        # Mother
                        m_txt = r16[0].text.strip()
                        data["mother"] = m_txt.split('\n')[0].strip()
                        
                        # BMS (Mother's Father)
                        bms_txt = r16[1].text.strip()
                        data["bms"] = bms_txt.split('\n')[0].strip()
                        
        except Exception as e:
            print(f"Error parsing profile for {horse_id}: {e}")
            
        return data

    def get_race_metadata(self, race_id):
        """
        Fetches metadata for a specific race ID from Netkeiba.
        Returns dict with: race_name, date, venue, course_type, distance, weather, condition, turn
        """
        url = f"https://race.netkeiba.com/race/result.html?race_id={race_id}"
        soup = self._get_soup(url)
        if not soup:
            return None
            
        data = {
            "race_name": "",
            "date": "",
            "venue": "",
            "course_type": "",
            "distance": "",
            "weather": "",
            "condition": "",
            "turn": "", # New: Dictionary key for turn direction
            "race_id": race_id
        }
        
        try:
            # Race Name
            title_elem = soup.select_one(".RaceName")
            if title_elem:
                data["race_name"] = title_elem.text.strip()
                
            # Date & Venue & Conditions
            # <div class="RaceData01">... 2023Âπ¥1Êúà5Êó• ... 1Âõû‰∏≠Â±±1Êó• ...</div>
            # Content: "15:35Áô∫Ëµ∞ / Ëäù1600m (Âè≥ Â§ñ) / Â§©ÂÄô:Êô¥ / È¶¨Â†¥:ËâØ"
            
            rd1 = soup.select_one(".RaceData01")
            
            if rd1:
                txt = rd1.text.strip()
                
                # Weather
                if "Â§©ÂÄô:Êô¥" in txt: data["weather"] = "Êô¥"
                elif "Â§©ÂÄô:Êõá" in txt: data["weather"] = "Êõá"
                elif "Â§©ÂÄô:Â∞èÈõ®" in txt: data["weather"] = "Â∞èÈõ®"
                elif "Â§©ÂÄô:Èõ®" in txt: data["weather"] = "Èõ®"
                elif "Â§©ÂÄô:Èõ™" in txt: data["weather"] = "Èõ™"
                
                # Condition
                if "È¶¨Â†¥:ËâØ" in txt: data["condition"] = "ËâØ"
                elif "È¶¨Â†¥:Á®ç" in txt: data["condition"] = "Á®çÈáç" # Covers Á®çÈáç
                elif "È¶¨Â†¥:Èáç" in txt: data["condition"] = "Èáç"
                elif "È¶¨Â†¥:‰∏çËâØ" in txt: data["condition"] = "‰∏çËâØ"
                
                # Course & Distance ("Ëäù1600m")
                # Regex for "Ëäù", "„ÉÄ", "Èöú" followed by digits
                match = re.search(r'(Ëäù|„ÉÄ|Èöú)(\d+)m', txt)
                if match:
                    ctype_raw = match.group(1)
                    if ctype_raw == "Ëäù": data["course_type"] = "Ëäù"
                    elif ctype_raw == "„ÉÄ": data["course_type"] = "„ÉÄ„Éº„Éà"
                    elif ctype_raw == "Èöú": data["course_type"] = "ÈöúÂÆ≥"
                    
                    data["distance"] = match.group(2)
                
                # Turn Direction ("Âè≥", "Â∑¶", "Áõ¥Á∑ö")
                # Usually in parentheses like "(Âè≥)" or "(Â∑¶)" or "(Ëäù Â∑¶)"
                if "Âè≥" in txt: data["turn"] = "Âè≥"
                elif "Â∑¶" in txt: data["turn"] = "Â∑¶"
                elif "Áõ¥Á∑ö" in txt: data["turn"] = "Áõ¥"

            # Date
            # Try finding date in Title or dedicated element
            date_elem = soup.select_one("dl#RaceList_DateList dd.Active") 
            if date_elem:
                 # Usually "1Êúà5Êó•(Èáë)" - needs Year
                 # We can rely on the fact that race_id contains year (2025...)
                 # But let's look for YYYYÂπ¥ in the whole text or title
                 pass
            
            # Fallback Date from Title Tag or Meta
            if not data["date"]:
                 meta_title = soup.title.text if soup.title else ""
                 match_date = re.search(r'(\d{4}Âπ¥\d{1,2}Êúà\d{1,2}Êó•)', meta_title)
                 if match_date:
                     data["date"] = match_date.group(1)

        except Exception as e:
            print(f"Error parsing metadata for {race_id}: {e}")
            
        return data

if __name__ == "__main__":
    # Manual smoke test (needs network access): fetch the past races of one
    # horse and, if nothing comes back, print diagnostics about which tables
    # the result page actually contains.
    scraper = RaceScraper()
    print("Running test...")
    # Example horse: Do Deuce (2019105219)
    df = scraper.get_past_races("2019105219")
    if not df.empty:
        print(df.head())
        print("Columns:", df.columns)
    else:
        print("DF is empty. Checking raw soup for 'db_h_race_results'...")
        soup = scraper._get_soup("https://db.netkeiba.com/horse/result/2019105219/")
        if not soup:
            print("Soup is None.")
        else:
            t = soup.select_one("table.db_h_race_results")
            print(f"Selector 'table.db_h_race_results' found: {t is not None}")
            if not t:
                print("Trying fallback 'table' with 'ÁùÄÈ†Ü'...")
                tables = soup.find_all("table")
                found = False
                for i, tbl in enumerate(tables):
                    print(f"Table {i} classes: {tbl.get('class')}")
                    # Accept any table that mentions one of the header labels.
                    if "ÁùÄÈ†Ü" in tbl.text or "ÁùÄ È†Ü" in tbl.text or "Êó•‰ªò" in tbl.text:
                        print("Found a table with 'ÁùÄÈ†Ü/Êó•‰ªò'.")
                        t = tbl
                        found = True
                        break
                if not found:
                    print("No table with 'ÁùÄÈ†Ü' found in soup.")
                    print("Soup snippet:", soup.text[:500])
                else:
                    # Retry parsing with the table located by the fallback.
                    try:
                        # Wrap in StringIO, as get_past_races does: passing
                        # literal HTML to read_html is deprecated.
                        df = pd.read_html(io.StringIO(str(t)))[0]
                        print("Retry DF Head:")
                        print(df.head())
                    except Exception as e:
                        print(f"Retry parsing failed: {e}")


In [None]:
# NAR scraping logic
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import date, timedelta
import calendar
import time
import os

def run_nar_scraping(year, start_month=1, end_month=12):
    """Walk the NAR race-list pages day by day and report the races found.

    For every date from the first day of ``start_month`` to the last day of
    ``end_month`` (clamped to today) the daily race-list page is requested
    and the number of result links on it is printed. Fetching the races
    themselves is not implemented yet.

    Args:
        year: target year (anything ``int()`` accepts).
        start_month: first month to scan, 1-12.
        end_month: last month to scan, 1-12.

    Returns:
        None. Progress and errors are printed. Nothing is requested when
        the resulting date range is empty (start after end, or entirely in
        the future).
    """
    year, start_month, end_month = int(year), int(start_month), int(end_month)

    start_date = date(year, start_month, 1)
    last_day = calendar.monthrange(year, end_month)[1]
    # Never look past today: future dates have no results yet.
    end_date = min(date(year, end_month, last_day), date.today())

    print(f'NAR„Éá„Éº„Çø„Çí {start_date} „Åã„Çâ {end_date} „Åæ„ÅßÂèñÂæó„Åó„Åæ„Åô...')

    if start_date > end_date:
        return

    # Reuse the scraper's browser-like headers so these requests look the
    # same as the ones RaceScraper itself sends.
    scraper = RaceScraper()

    curr = start_date
    while curr <= end_date:
        d_str = curr.strftime('%Y%m%d')
        url = f'https://nar.netkeiba.com/top/race_list_sub.html?kaisai_date={d_str}'
        try:
            time.sleep(0.5)  # throttle between daily requests
            # A timeout keeps one stalled connection from hanging the scan.
            resp = requests.get(url, headers=scraper.headers, timeout=10)
            # Surface HTTP errors instead of silently parsing an error page.
            resp.raise_for_status()
            resp.encoding = 'EUC-JP'
            soup = BeautifulSoup(resp.text, 'html.parser')
            links = soup.select('a[href*="race/result.html"]')
            if links:
                print(f'{curr}: {len(links)} ‰ª∂„ÅÆ„É¨„Éº„Çπ„ÅåË¶ã„Å§„Åã„Çä„Åæ„Åó„Åü„ÄÇ(ÂèñÂæóÂá¶ÁêÜ„ÅØÊú™ÂÆüË£Ö„Åß„Åô)')
                # TODO: the actual per-race scraping goes here.
        except Exception as e:
            # Best effort: report and continue with the next day.
            print(e)
        curr += timedelta(days=1)


In [None]:
# Settings (edit these values before running)
YEAR = 2024          # target year
START_MONTH = 1      # first month to fetch
END_MONTH = 12       # last month to fetch

# Run
if YEAR:
    # Create the data/raw directory if it does not exist yet
    os.makedirs('data/raw', exist_ok=True)
    run_nar_scraping(year=YEAR, start_month=START_MONTH, end_month=END_MONTH)
