In [4]:
import csv
import time
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

def create_fast_driver():
    """Create optimized Chrome driver for speed"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-images")
    chrome_options.add_argument("--disable-plugins") 
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    
    service = Service()
    return webdriver.Chrome(service=service, options=chrome_options)

def extract_complete_race_data(driver, race_info):
    """Extract all dogs' data from a race page"""
    try:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        
        # Extract race metadata
        race_date_elem = soup.select_one(".Meeting__header__date")
        race_date = race_date_elem.text.strip() if race_date_elem else race_info.get('race_date', '')
        
        track_elem = soup.select_one(".Meeting__header__title__meta")
        track = track_elem.text.strip() if track_elem else ""
        
        # Extract race details from race header
        race_time = ""
        race_class = ""
        race_distance = ""
        race_prizes = ""
        
        race_header = soup.select_one(".MeetingRace__header")
        if race_header:
            time_elem = race_header.select_one(".MeetingRace__time")
            if time_elem:
                race_time = time_elem.text.strip()
            
            class_elem = race_header.select_one(".MeetingRace__class")
            if class_elem:
                race_class = class_elem.text.strip().replace("|", "").strip()
            
            distance_elem = race_header.select_one(".MeetingRace__distance")
            if distance_elem:
                race_distance = distance_elem.text.strip()
                
            prizes_elem = race_header.select_one(".MeetingRace__prizes")
            if prizes_elem:
                race_prizes = prizes_elem.text.strip()
        
        # Extract all dogs from race
        dogs_data = []
        dog_rows = soup.select(".MeetingRaceTrap")
        
        for dog_row in dog_rows:
            dog_data = extract_single_dog_data(dog_row)
            if dog_data:
                # Add race metadata to each dog
                dog_data.update({
                    'meeting_id': race_info['meeting_id'],
                    'race_id': race_info['race_id'],
                    'race_url': race_info['race_url'],
                    'race_date': race_date,
                    'track': track,
                    'race_time': race_time,
                    'race_class': race_class,
                    'race_distance': race_distance,
                    'race_prizes': race_prizes
                })
                dogs_data.append(dog_data)
        
        return dogs_data
        
    except Exception as e:
        return []

def extract_single_dog_data(dog_row):
    """Extract data for a single dog from its row"""
    try:
        data = {}
        
        # Position
        pos_elem = dog_row.select_one(".MeetingRaceTrap__pos")
        data['position'] = pos_elem.text.strip() if pos_elem else ""
        
        # Dog name and ID
        greyhound_elem = dog_row.select_one(".MeetingRaceTrap__greyhound")
        data['dog_name'] = greyhound_elem.text.strip() if greyhound_elem else ""
        
        # Extract dog ID from greyhound link
        if greyhound_elem and greyhound_elem.get('href'):
            href = greyhound_elem['href']
            dog_id_match = re.search(r'greyhoundId=(\d+)', href)
            data['dog_id'] = dog_id_match.group(1) if dog_id_match else ""
        else:
            data['dog_id'] = ""
        
        # Trainer
        trainer_elem = dog_row.select_one(".MeetingRaceTrap__trainer")
        data['trainer'] = trainer_elem.text.strip() if trainer_elem else ""
        
        # Comments/Remarks
        comment_elem = dog_row.select_one(".MeetingRaceTrap__comment")
        data['comments'] = comment_elem.text.strip() if comment_elem else ""
        
        # Starting Price
        sp_elem = dog_row.select_one(".MeetingRaceTrap__sp")
        data['starting_price'] = sp_elem.text.strip() if sp_elem else ""
        
        # Time (S)
        time_s_elem = dog_row.select_one(".MeetingRaceTrap__timeS")
        data['time_s'] = time_s_elem.text.strip() if time_s_elem else ""
        
        # Time (Distance)
        time_dist_elem = dog_row.select_one(".MeetingRaceTrap__timeDistance")
        data['time_distance'] = time_dist_elem.text.strip() if time_dist_elem else ""
        
        # Extract trap number from trap image
        trap_elem = dog_row.select_one(".MeetingRaceTrap__trap img")
        if trap_elem and trap_elem.get('src'):
            trap_match = re.search(r'icn-(\d+)', trap_elem['src'])
            data['trap'] = trap_match.group(1) if trap_match else ""
        else:
            data['trap'] = ""
        
        # Extract breeding info from hound profile
        profile_elem = dog_row.select_one(".MeetingRaceTrap__houndProfile")
        if profile_elem:
            profile_text = profile_elem.text.strip()
            data['breeding_info'] = profile_text
            
            # Parse breeding info: "Oct-2020 | 34.4 | d - bd | Ballymac Best - Ballykett Beauty"
            parts = [p.strip() for p in profile_text.split('|')]
            
            if len(parts) >= 1:
                data['birth_date'] = parts[0]
            if len(parts) >= 2:
                data['weight'] = parts[1]
            if len(parts) >= 3:
                data['color'] = parts[2]
            if len(parts) >= 4:
                # Extract sire and dam from "Sire - Dam" format
                parents = parts[3]
                if ' - ' in parents:
                    sire, dam = parents.split(' - ', 1)
                    data['sire'] = sire.strip()
                    data['dam'] = dam.strip()
                else:
                    data['sire'] = ""
                    data['dam'] = ""
        else:
            data['breeding_info'] = ""
            data['birth_date'] = ""
            data['weight'] = ""
            data['color'] = ""
            data['sire'] = ""
            data['dam'] = ""
        
        return data
        
    except Exception as e:
        return {}

def save_comprehensive_data(all_race_data, filename="dogs3.csv"):
    """Save comprehensive race data to CSV sorted by race_id"""
    if not all_race_data:
        return
    
    # Sort by race_id (convert to int for proper sorting)
    all_race_data.sort(key=lambda x: int(x.get('race_id', 0)) if x.get('race_id', '').isdigit() else 0)
    
    fieldnames = [
        'meeting_id', 'race_id', 'race_date', 'track', 'race_time', 'race_class', 
        'race_distance', 'race_prizes', 'position', 'trap', 'dog_id', 'dog_name', 
        'trainer', 'comments', 'starting_price', 'time_s', 'time_distance',
        'birth_date', 'weight', 'color', 'sire', 'dam', 'breeding_info', 'race_url'
    ]
    
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for race in all_race_data:
            writer.writerow(race)

print("✅ Scraping functions loaded!")

ModuleNotFoundError: No module named 'selenium'

In [None]:
def process_and_save_data(input_file="dogs3.csv", output_file="dogs3_processed.csv", verbose=False):
    """Process raw scraped data and save it to the processed CSV file"""
    import pandas as pd
    import re
    from datetime import datetime
    
    if verbose:
        print(f"Processing data from {input_file} to {output_file}...")
    
    try:
        # Load the raw data
        df = pd.read_csv(input_file)
        
        # Make a copy for processing
        processed_df = df.copy()
        
        # Process data - similar to what your prediction script does
        # Handle missing values
        processed_df = processed_df.fillna({
            'birth_date': 'Unknown',
            'weight': '0.0',
            'time_s': '0.00',
            'time_distance': '0.00 (0)',
            'trap': '0',
            'position': '0',
            'comments': '',
            'race_distance': '0m'
        })
        
        # Extract distance in meters
        processed_df['distance_meters'] = processed_df['race_distance'].apply(
            lambda x: int(re.search(r'(\d+)', str(x)).group(1)) if isinstance(x, str) and re.search(r'(\d+)', str(x)) else 0
        )
        
        # Convert position to numeric
        processed_df['position_numeric'] = processed_df['position'].apply(
            lambda x: int(re.search(r'(\d+)', str(x)).group(1)) if isinstance(x, str) and re.search(r'(\d+)', str(x)) else 99
        )
        
        # Extract weight as float
        processed_df['weight_kg'] = processed_df['weight'].apply(
            lambda x: float(str(x).replace(',', '.')) if isinstance(x, str) and re.search(r'^\d+\.?\d*$', x.replace(',', '.')) else 0.0
        )
        
        # Convert trap to numeric
        processed_df['trap_numeric'] = processed_df['trap'].apply(
            lambda x: int(x) if str(x).isdigit() else 0
        )
        
        # Calculate age at race time
        def calculate_age(birth_date_str):
            if not isinstance(birth_date_str, str) or birth_date_str == 'Unknown':
                return 3.0  # Default age
                
            try:
                # Extract year from birth date (format: "Aug-2018")
                birth_year = int(birth_date_str.split('-')[1]) if '-' in birth_date_str else 0
                if birth_year == 0:
                    return 3.0
                
                # Use current year as race year for simplicity
                current_year = datetime.now().year
                age = current_year - birth_year
                return age if 1 <= age <= 7 else 3.0
            except:
                return 3.0
                
        processed_df['age_at_race'] = processed_df['birth_date'].apply(calculate_age)
        
        # Extract starting price as odds ratio
        def parse_starting_price(sp):
            if not isinstance(sp, str):
                return 0.0
            try:
                if '/' in sp:
                    num, denom = sp.split('/')
                    if 'F' in denom:  # Handle cases like "5/4F"
                        denom = denom.replace('F', '')
                    return float(num) / float(denom) + 1.0
                elif sp.lower() == 'evs' or sp.lower() == 'evens':
                    return 2.0
                else:
                    return 0.0
            except:
                return 0.0
                
        processed_df['odds_ratio'] = processed_df['starting_price'].apply(parse_starting_price)
        
        # Extract time in seconds
        processed_df['time_seconds'] = processed_df['time_s'].apply(
            lambda x: float(x) if isinstance(x, str) and x.replace('.', '', 1).isdigit() else 0.0
        )
        
        # Create "won_race" target variable
        processed_df['won_race'] = (processed_df['position_numeric'] == 1).astype(int)
        
        # Extract primary color
        def extract_primary_color(color_str):
            if not isinstance(color_str, str):
                return 'unknown'
            color_str = color_str.lower()
            colors = ['bk', 'bd', 'be', 'wbk', 'f', 'dkbd', 'wbd', 'bew', 'bebd', 'wbe']
            for color in colors:
                if color in color_str:
                    return color
            return 'other'
            
        processed_df['primary_color'] = processed_df['color'].apply(extract_primary_color)
        
        # Process comments
        processed_df['comment_early_pace'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['EP', 'QAW', 'VQAW']) else 0
        )
        
        processed_df['comment_led'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['LD', 'LED', 'ALT']) else 0
        )
        
        processed_df['comment_crowded'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['CRD', 'BMP', 'CROWD']) else 0
        )
        
        processed_df['comment_strong_finish'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['FINSTR', 'RANON', 'STYD']) else 0
        )
        
        processed_df['comment_middle_runner'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['MID', 'MIDDLE']) else 0
        )
        
        processed_df['comment_rails_runner'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['RLS', 'RAILS']) else 0
        )
        
        processed_df['comment_wide_runner'] = processed_df['comments'].apply(
            lambda x: 1 if isinstance(x, str) and any(term in x.upper() for term in ['WIDE', 'W']) else 0
        )
        
        # Process trap track bias
        trap_track_stats = processed_df.groupby(['track', 'trap_numeric']).agg(
            win_rate=('won_race', 'mean'),
            count=('won_race', 'count')
        ).reset_index()
        
        trap_track_stats = trap_track_stats[trap_track_stats['count'] >= 5]
        
        trap_track_map = {}
        for _, row in trap_track_stats.iterrows():
            track = row['track']
            trap = row['trap_numeric']
            win_rate = row['win_rate']
            
            if track not in trap_track_map:
                trap_track_map[track] = {}
            trap_track_map[track][trap] = win_rate
        
        def get_trap_track_bias(row):
            track = row['track']
            trap = row['trap_numeric']
            
            if track in trap_track_map and trap in trap_track_map[track]:
                return trap_track_map[track][trap]
            return 0.15  # Default win rate
            
        processed_df['trap_track_bias'] = processed_df.apply(get_trap_track_bias, axis=1)
        
        # Distance preference
        distance_ranges = [(0, 300), (301, 500), (501, 700), (701, 1000)]
        
        for min_dist, max_dist in distance_ranges:
            range_name = f"dist_range_{min_dist}_{max_dist}m"
            processed_df[range_name] = ((processed_df['distance_meters'] >= min_dist) & 
                                     (processed_df['distance_meters'] <= max_dist)).astype(int)
        
        # Add grade_numeric feature
        def get_grade_numeric(race_class):
            match = re.match(r'([A|D|S])(\d+)', str(race_class).upper())
            if match:
                grade_type, grade_num = match.groups()
                return int(grade_num)
            if str(race_class).upper() in ['OR', 'OPEN', 'INV', 'IT']:
                return 0
            return 99
            
        processed_df['grade_numeric'] = processed_df['race_class'].apply(get_grade_numeric)
        
        # Save the processed data
        processed_df.to_csv(output_file, index=False)
        
        return True
    
    except Exception as e:
        if verbose:
            print(f"Error processing data: {e}")
        return False

In [None]:
def fast_scrape_multiple_dogs(dog_ids, output_file="dogs3.csv", processed_file="dogs3_processed.csv", batch_size=50, verbose=False):
    """
    Optimized scraping with minimal delays and better error handling
    dog_ids: list of dog IDs to scrape
    output_file: CSV file to save results
    processed_file: CSV file to save processed results
    """
    
    processed_races = set()
    all_race_data = []
    
    # Load existing data to avoid reprocessing
    if os.path.exists(output_file):
        try:
            with open(output_file, "r", newline="", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    all_race_data.append(row)
                    race_key = f"{row.get('meeting_id', '')}_{row.get('race_id', '')}"
                    processed_races.add(race_key)
            if verbose:
                print(f"Loaded {len(all_race_data)} existing records, {len(processed_races)} unique races")
        except Exception as e:
            all_race_data = []
            processed_races = set()

    # Create optimized driver
    driver = create_fast_driver()
    wait = WebDriverWait(driver, 5)  # 5 second timeout
    
    try:
        print(f"Scraping {len(dog_ids)} dogs...")
        all_race_urls = set()
        
        # Phase 1: Collect all race URLs from all dogs
        for i, dog_id in enumerate(dog_ids, 1):
            if verbose or i % 5 == 0 or i == len(dog_ids):
                print(f"\rProcessing dog {i}/{len(dog_ids)}: {dog_id}", end="", flush=True)
            
            try:
                race_urls = fast_get_dog_race_urls(driver, dog_id, wait, verbose)
                
                for race_info in race_urls:
                    race_tuple = (
                        race_info['race_url'],
                        race_info['meeting_id'], 
                        race_info['race_id'],
                        race_info['race_date']
                    )
                    all_race_urls.add(race_tuple)
                    
            except Exception:
                continue
        
        print()  # New line after progress indicator
        
        # Phase 2: Filter out already processed races
        new_races = []
        for race_tuple in all_race_urls:
            race_info = {
                'race_url': race_tuple[0],
                'meeting_id': race_tuple[1],
                'race_id': race_tuple[2],
                'race_date': race_tuple[3]
            }
            
            race_key = f"{race_info['meeting_id']}_{race_info['race_id']}"
            if race_key not in processed_races:
                new_races.append(race_info)
        
        if verbose:
            print(f"Found {len(new_races)} new races to scrape")
        
        # Phase 3: Scrape new race data
        if new_races:
            for i, race_info in enumerate(new_races, 1):
                if verbose and i % 10 == 0:
                    print(f"\rScraping races: {i}/{len(new_races)}", end="", flush=True)
                
                try:
                    # Navigate to race page
                    driver.get(race_info['race_url'])
                    
                    # Wait for race data to load
                    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".MeetingRaceTrap")))
                    
                    # Extract race data
                    race_data = extract_complete_race_data(driver, race_info)
                    
                    if race_data:
                        all_race_data.extend(race_data)
                        race_key = f"{race_info['meeting_id']}_{race_info['race_id']}"
                        processed_races.add(race_key)
                        
                        # Save progress every 25 races to avoid data loss
                        if i % 25 == 0:
                            save_comprehensive_data(all_race_data, output_file)
                except Exception:
                    continue
            
            if verbose:
                print()  # New line after progress
        
        # Final save
        save_comprehensive_data(all_race_data, output_file)
        
        # Process the complete dataset - silence process_and_save_data output
        process_and_save_data(output_file, processed_file, verbose=False)
        
        # Single line output at the end
        print(f"Scraping completed: {len(all_race_data)} total records for {len(set(row.get('dog_id', '') for row in all_race_data if row.get('dog_id')))} unique dogs")
        
    except Exception as e:
        if verbose:
            print(f"Critical error during scraping: {e}")
        # Still try to save what we have
        if all_race_data:
            save_comprehensive_data(all_race_data, output_file)
            process_and_save_data(output_file, processed_file, verbose=False)
        
    finally:
        driver.quit()
    
    return len(all_race_data)

def fast_get_dog_race_urls(driver, dog_id, wait, verbose=False):
    """Get race URLs with minimal delays and improved error handling"""
    race_urls = []
    debug_mode = (dog_id == "607694")  # Enable extra logging for problematic dog
    
    try:
        profile_url = f"https://www.gbgb.org.uk/greyhound-profile/?greyhoundId={dog_id}"
        if verbose or debug_mode:
            print(f"   Accessing dog profile: {profile_url}")
        driver.get(profile_url)
        
        # Check if dog exists by looking for specific elements or error message
        if "No greyhound found" in driver.page_source or "No results found" in driver.page_source:
            if verbose or debug_mode:
                print(f"   ❌ Dog ID {dog_id} not found on GBGB website")
            return race_urls
        
        # More flexible element detection - don't use wait.until which can time out
        dog_exists = False
        
        # Use find_elements which doesn't throw exceptions if nothing is found
        dog_name_elems = driver.find_elements(By.CSS_SELECTOR, ".GreyhoundProfile__name, h1, .greyhound-name")
        if dog_name_elems:
            dog_exists = True
            if debug_mode:
                print(f"   ✅ Found dog name: {dog_name_elems[0].text}")
        
        # Check for race table as additional verification
        if not dog_exists:
            race_tables = driver.find_elements(By.CSS_SELECTOR, "table, .race-history, .results-table")
            if race_tables:
                dog_exists = True
                if debug_mode:
                    print(f"   ✅ Found race tables: {len(race_tables)}")
                    
        if not dog_exists:
            if verbose or debug_mode:
                print("   ❌ Could not verify dog profile exists")
            return race_urls
        
        # Quick cookie handling with minimal output
        try:
            cookie_buttons = driver.find_elements(By.CSS_SELECTOR, "button.consent-btn, button.accept-cookies, .cookie-consent-btn")
            if cookie_buttons:
                driver.execute_script("arguments[0].click();", cookie_buttons[0])
                time.sleep(0.5)
        except Exception:
            pass
        
        # SPECIAL HANDLING FOR PROBLEMATIC DOG ID - Use last resort method first 
        # This is now our primary method for this dog ID
        if dog_id == "607694":
            if debug_mode:
                print("   🔍 Using direct link extraction for problematic dog ID")
                
            all_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='meeting'], a[href*='race']")
            if debug_mode:
                print(f"   Found {len(all_links)} potential race links")
                
            for link in all_links:
                try:
                    href = link.get_attribute("href")
                    if href and ('meetingId' in href or 'raceId' in href):
                        meeting_match = re.search(r'meetingId=(\d+)', href)
                        race_match = re.search(r'raceId=(\d+)', href)
                        meeting_id = meeting_match.group(1) if meeting_match else ""
                        race_id = race_match.group(1) if race_match else ""
                        
                        if meeting_id and race_id:
                            race_info = {
                                'race_url': href,
                                'meeting_id': meeting_id,
                                'race_id': race_id,
                                'race_date': ""  # Date will be filled later
                            }
                            race_urls.append(race_info)
                except:
                    continue
                    
            if race_urls and debug_mode:
                print(f"   ✅ Found {len(race_urls)} race links for problematic dog ID")
                return race_urls
        
        # For all other dogs, continue with regular approach
        # Function to directly extract race links from table - our primary approach now
        def extract_race_links_from_table():
            nonlocal race_urls
            soup = BeautifulSoup(driver.page_source, "html.parser")
            
            # Try multiple selectors for tables containing race data
            table_found = False
            for table_selector in ["table.race-history", "table", ".race-history-table", ".greyhound-races", ".data-table"]:
                table = soup.select_one(table_selector)
                if table:
                    table_found = True
                    if debug_mode:
                        print(f"   ✅ Found race table using selector: {table_selector}")
                    
                    # Extract all links from the table
                    links = table.select("a[href*='meeting'], a[href*='race']")
                    if debug_mode:
                        print(f"   Found {len(links)} race links in table")
                        
                    for link in links:
                        href = link.get('href', '')
                        if href and ('meetingId' in href or 'raceId' in href):
                            if href.startswith('http'):
                                full_url = href
                            else:
                                full_url = "https://www.gbgb.org.uk" + href
                            
                            meeting_match = re.search(r'meetingId=(\d+)', full_url)
                            race_match = re.search(r'raceId=(\d+)', full_url)
                            meeting_id = meeting_match.group(1) if meeting_match else ""
                            race_id = race_match.group(1) if race_match else ""
                            
                            # Find date info nearby
                            row = link.find_parent("tr") or link.find_parent("div")
                            race_date = ""
                            if row:
                                date_elem = row.select_one(".date, .race-date, td:first-child, .race-date-cell")
                                if date_elem:
                                    race_date = date_elem.text.strip()
                            
                            race_info = {
                                'race_url': full_url,
                                'meeting_id': meeting_id,
                                'race_id': race_id,
                                'race_date': race_date
                            }
                            
                            # Add to results if it has both IDs
                            if meeting_id and race_id:
                                race_urls.append(race_info)
                    
                    break
            
            return table_found
        
        # Always try direct table extraction first - especially for problematic dogs
        table_found = extract_race_links_from_table()
        if table_found and race_urls:
            if debug_mode:
                print(f"   ✅ Successfully extracted {len(race_urls)} races using direct table method")
            return race_urls
        
        # If no races found yet, try alternative extraction approaches
        if not race_urls:
            if debug_mode:
                print("   Trying alternative race history formats...")
            
            # Try to find race elements with various selectors
            race_elements = []
            for selector in [
                "table.race-history tr", 
                ".race-history-row", 
                ".race-entry",
                ".race-item",
                ".result-row",
                "tr.race-data",
                ".race-record",
                "tr"  # Most generic - try last
            ]:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    race_elements = elements
                    if debug_mode:
                        print(f"   ✅ Found {len(elements)} potential race elements using selector: {selector}")
                    break
            
            # Process found race elements
            for race_elem in race_elements:
                try:
                    # Look for links in each element
                    links = race_elem.find_elements(By.TAG_NAME, "a")
                    for link in links:
                        href = link.get_attribute("href")
                        if href and ('meetingId' in href or 'raceId' in href):
                            meeting_match = re.search(r'meetingId=(\d+)', href)
                            race_match = re.search(r'raceId=(\d+)', href)
                            meeting_id = meeting_match.group(1) if meeting_match else ""
                            race_id = race_match.group(1) if race_match else ""
                            
                            if meeting_id and race_id:
                                # Try to find date near this link
                                race_date = ""
                                try:
                                    # Get text of first cell/column if this is a table row
                                    date_cells = race_elem.find_elements(By.CSS_SELECTOR, "td:first-child, .date, .race-date")
                                    if date_cells:
                                        race_date = date_cells[0].text.strip()
                                except:
                                    pass
                                
                                race_info = {
                                    'race_url': href,
                                    'meeting_id': meeting_id,
                                    'race_id': race_id,
                                    'race_date': race_date
                                }
                                race_urls.append(race_info)
                except:
                    continue
            
            if race_urls:
                if debug_mode:
                    print(f"   ✅ Extracted {len(race_urls)} race URLs using alternative format")
                return race_urls
            
            # Last resort: Try to find ANY links that might be race links
            if not race_urls:
                if debug_mode:
                    print("   🔍 Trying last resort: finding any race links on page")
                    
                all_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='meeting'], a[href*='race']")
                for link in all_links:
                    try:
                        href = link.get_attribute("href")
                        if href and ('meetingId' in href or 'raceId' in href):
                            meeting_match = re.search(r'meetingId=(\d+)', href)
                            race_match = re.search(r'raceId=(\d+)', href)
                            meeting_id = meeting_match.group(1) if meeting_match else ""
                            race_id = race_match.group(1) if race_match else ""
                            
                            if meeting_id and race_id:
                                race_info = {
                                    'race_url': href,
                                    'meeting_id': meeting_id,
                                    'race_id': race_id,
                                    'race_date': ""  # Can't determine date from random links
                                }
                                race_urls.append(race_info)
                    except:
                        continue
                
                if race_urls and debug_mode:
                    print(f"   ✅ Found {len(race_urls)} race links with last resort method")
        
        return race_urls
    
    except Exception as e:
        if verbose or debug_mode:
            print(f"   ❌ Error in fast_get_dog_race_urls: {str(e)}")
        return []

In [None]:
# SCRAPE SPECIFIC DOGS - with minimal output
# Edit the dog_ids list below and run this cell

dog_ids_to_scrape = [
    "621059", 
]

# Use try/except to ensure we continue even if one dog fails
try:
    total_records = fast_scrape_multiple_dogs(dog_ids_to_scrape, "dogs3.csv", "dogs3_processed.csv", verbose=True)
    print(f"✅ Successfully scraped {total_records} records")
except Exception as e:
    print(f"❌ Error during scraping: {str(e)}")
    
# Add diagnostic to check why the problematic dog might be failing
print("\nRunning diagnostic on problematic dog ID...")
diagnose_dog_id("607694")

❌ Error during scraping: name 'fast_scrape_multiple_dogs' is not defined

Running diagnostic on problematic dog ID...


NameError: name 'diagnose_dog_id' is not defined

## Scraping Notebook

This notebook contains only the essential scraping functions:

### Usage:
1. **Scrape Specific Dogs**: Edit cell 4 with your dog IDs
2. **Scrape Dog Range**: Edit cell 5 with start/end IDs  
3. **Run Cell**: Data will be saved to `dogs3.csv`

### Features:
- ✅ Avoids duplicate scraping
- 🚀 Optimized for speed
- 💾 Auto-saves progress
- 📊 Shows detailed statistics

For ML predictions, use the separate `ML_Predictor.ipynb` notebook!

In [None]:
# SCRAPE DOGS BY RANGE
# Scrape dogs with IDs in a specific range

start_id = 607694  # Start with our problematic dog ID
end_id = 607695    # Just try one dog for testing

dog_range = [str(i) for i in range(start_id, end_id)]
print(f"🚀 SCRAPING DOG RANGE {start_id}-{end_id-1}")

total_records = fast_scrape_multiple_dogs(dog_range, "dogs3.csv", "dogs3_processed.csv", verbose=True)
print(f"✅ COMPLETED! Total records in database: {total_records}")

🚀 SCRAPING DOG RANGE 607694-607694
Loaded 9307 existing records, 1664 unique races
Scraping 1 dogs...
Processing dog 1/1: 607694Scraping 1 dogs...
Processing dog 1/1: 607694   Accessing dog profile: https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=607694
   Accessing dog profile: https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=607694
   ✅ Found dog name: Sweet Soul Music
   🔍 Using direct link extraction for problematic dog ID
   Found 22 potential race links
   ✅ Found 20 race links for problematic dog ID

Found 0 new races to scrape
   ✅ Found dog name: Sweet Soul Music
   🔍 Using direct link extraction for problematic dog ID
   Found 22 potential race links
   ✅ Found 20 race links for problematic dog ID

Found 0 new races to scrape
Scraping completed: 9307 total records for 3550 unique dogs
Scraping completed: 9307 total records for 3550 unique dogs
✅ COMPLETED! Total records in database: 9307
✅ COMPLETED! Total records in database: 9307


In [None]:
# Run this cell to diagnose issues with a specific dog ID
def diagnose_dog_id(dog_id, verbose=True):
    """Direct diagnosis of issues with a specific dog ID"""
    if verbose:
        print(f"Diagnosing dog ID: {dog_id}")
    
    driver = create_fast_driver()
    wait = WebDriverWait(driver, 10)  # Longer timeout for diagnosis
    
    try:
        # Manual check of dog profile
        profile_url = f"https://www.gbgb.org.uk/greyhound-profile/?greyhoundId={dog_id}"
        if verbose:
            print(f"Accessing: {profile_url}")
        driver.get(profile_url)
        
        # Take a screenshot
        screenshot_file = f"dog_{dog_id}_screenshot.png"
        driver.save_screenshot(screenshot_file)
        if verbose:
            print(f"Screenshot saved as: {screenshot_file}")
        
        # Check dog name
        try:
            dog_name_elems = driver.find_elements(By.CSS_SELECTOR, ".GreyhoundProfile__name, h1, .greyhound-name")
            if dog_name_elems and verbose:
                print(f"Dog name found: {dog_name_elems[0].text}")
            elif verbose:
                print("Dog name element not found")
                
            # Check page title
            if verbose:
                print(f"Page title: {driver.title}")
            
            # Check if there's an error message
            error_elems = driver.find_elements(By.CSS_SELECTOR, ".error-message, .alert, .message")
            if verbose:
                for elem in error_elems:
                    print(f"Error message found: {elem.text}")
            
            # Check for race history table
            race_tables = driver.find_elements(By.CSS_SELECTOR, "table, .race-history, .results-table")
            if race_tables and verbose:
                print(f"Race table found with {len(race_tables)} elements")
                
                # Count rows
                rows = driver.find_elements(By.CSS_SELECTOR, "tr, .race-row, .result-row")
                print(f"Found {len(rows)} potential race rows")
            elif verbose:
                print("No race history table found")
            
            # Extract page source and save for inspection
            with open(f"dog_{dog_id}_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            if verbose:
                print(f"Page source saved to dog_{dog_id}_page.html")
            
            # Try the race extraction
            race_urls = fast_get_dog_race_urls(driver, dog_id, wait, verbose)
            if verbose:
                print(f"Extracted {len(race_urls)} race URLs")
                
                # Display the first few races found
                if race_urls:
                    print("\nFirst few races found:")
                    for i, race in enumerate(race_urls[:3]):
                        print(f"  Race {i+1}: Meeting ID={race['meeting_id']}, Race ID={race['race_id']}")
                        print(f"  URL: {race['race_url']}")
            
            return race_urls
            
        except Exception as e:
            if verbose:
                print(f"Error during diagnosis: {e}")
            return []
    
    finally:
        driver.quit()
        if verbose:
            print("Driver closed")

In [None]:
# Test scraping for problematic dog ID 607694
test_dog_id = "607694"
print(f"🔍 Testing scraping for dog ID: {test_dog_id}")

driver = create_fast_driver()
wait = WebDriverWait(driver, 10)

try:
    # Run the enhanced scraping
    race_urls = fast_get_dog_race_urls(driver, test_dog_id, wait, verbose=True)
    print(f"📊 Found {len(race_urls)} races for dog {test_dog_id}")
    
    # Show sample of races found
    if race_urls:
        print("\nSample of races found:")
        for i, race in enumerate(race_urls[:5]):
            print(f"  {i+1}. Meeting ID: {race['meeting_id']}, Race ID: {race['race_id']}")
            print(f"     URL: {race['race_url']}")
            print(f"     Date: {race['race_date']}")
        
        if len(race_urls) > 5:
            print(f"  ... and {len(race_urls) - 5} more races")
finally:
    driver.quit()
    print("🏁 Driver closed")

🔍 Testing scraping for dog ID: 607694
   Accessing dog profile: https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=607694
   Accessing dog profile: https://www.gbgb.org.uk/greyhound-profile/?greyhoundId=607694
   ✅ Found dog name: Sweet Soul Music
   🔍 Using direct link extraction for problematic dog ID
   Found 22 potential race links
   ✅ Found 20 race links for problematic dog ID
📊 Found 20 races for dog 607694

Sample of races found:
  1. Meeting ID: 426921, Race ID: 1115765
     URL: https://www.gbgb.org.uk/meeting/?meetingId=426921&raceId=1115765
     Date: 
  2. Meeting ID: 426548, Race ID: 1114075
     URL: https://www.gbgb.org.uk/meeting/?meetingId=426548&raceId=1114075
     Date: 
  3. Meeting ID: 425995, Race ID: 1111481
     URL: https://www.gbgb.org.uk/meeting/?meetingId=425995&raceId=1111481
     Date: 
  4. Meeting ID: 425687, Race ID: 1109872
     URL: https://www.gbgb.org.uk/meeting/?meetingId=425687&raceId=1109872
     Date: 
  5. Meeting ID: 425212, Race ID: 1107