In [None]:
# FAST SCRAPING NOTEBOOK - Single Cell Solution
# Import and use the optimized fast scraping function

import sys
import os
import time

# Add current directory to path to import our module
sys.path.append(os.getcwd())

# Import the fast scraping function
from scraping.fast_scraping import fast_scrape_multiple_dogs

def _diagnose_high_record_count(output_file, total_records, dogs_requested):
    """
    Print a breakdown of output_file after an implausibly high records-per-dog average

    Reads the CSV at output_file (if it exists) and prints unique dog and
    race counts next to the number of dogs requested. Errors raised while
    reading or analyzing the file are printed and swallowed.

    Args:
        output_file: Path of the CSV written by the scraper
        total_records: Record count reported by the scraper
        dogs_requested: Number of dog IDs passed to the scraper
    """
    if not os.path.exists(output_file):
        return

    import pandas as pd
    try:
        df = pd.read_csv(output_file)
        unique_dogs = df['dog_id'].nunique() if 'dog_id' in df.columns else 0
        unique_races = df['race_id'].nunique() if 'race_id' in df.columns else 0

        print("   🔍 Diagnosis:")
        print(f"     - Total records: {len(df)}")
        print(f"     - Unique dogs: {unique_dogs}")
        print(f"     - Unique races: {unique_races}")
        print(f"     - Dogs processed: {dogs_requested}")

        if unique_dogs > dogs_requested:
            print(f"     ❌ ISSUE: Found {unique_dogs} unique dogs but only processed {dogs_requested}")
            print("        The scraper is extracting ALL dogs from each race page,")
            print("        not just the target dog!")

        if unique_races > total_records / 6:  # Assuming ~6 dogs per race
            print("     ❌ ISSUE: Too many unique races relative to records")
    except Exception as e:
        print(f"   Error analyzing data: {e}")


def run_fast_scraping(start_id=637322, end_id=637332, output_file="dogs3.csv"):
    """
    Run fast scraping for a range of dog IDs and print a run summary

    Args:
        start_id: Starting dog ID (inclusive)
        end_id: Ending dog ID (exclusive)
        output_file: Output CSV file name

    Returns:
        The value returned by fast_scrape_multiple_dogs (printed as the total
        record count), or 0 if any exception is raised during the run.
    """

    # Generate dog ID list (IDs are passed to the scraper as strings)
    dog_ids = [str(i) for i in range(start_id, end_id)]

    print(f"🚀 Starting fast scraping for {len(dog_ids)} dogs (IDs {start_id}-{end_id-1})")
    print(f"📂 Output file: {output_file}")
    print("=" * 50)

    start_time = time.time()

    try:
        # Run the fast scraping
        total_records = fast_scrape_multiple_dogs(
            dog_ids=dog_ids,
            output_file=output_file,
            batch_size=25  # Save progress every 25 races
        )

        elapsed_time = time.time() - start_time

        # A coarse timer can report zero elapsed time; avoid dividing by it.
        if elapsed_time > 0:
            dogs_per_minute = len(dog_ids) / elapsed_time * 60
        else:
            dogs_per_minute = float("inf")

        print("=" * 50)
        print("✅ SCRAPING COMPLETED!")
        print(f"📊 Total records scraped: {total_records}")
        print(f"⏱️ Total time: {elapsed_time:.1f} seconds")
        print(f"🐕 Dogs processed: {len(dog_ids)}")
        print(f"📈 Speed: {dogs_per_minute:.1f} dogs per minute")
        print(f"💾 Data saved to: {output_file}")

        # Skip the per-dog average for an empty ID range (start_id >= end_id):
        # dividing by zero here would be reported as a scraping failure and
        # discard the real record count.
        if total_records > 0 and dog_ids:
            avg_races_per_dog = total_records / len(dog_ids)
            print(f"🏁 Average records per dog: {avg_races_per_dog:.1f}")

            # Check if this seems reasonable
            if avg_races_per_dog > 200:
                print("⚠️  WARNING: Unusually high records per dog detected!")
                print("   This suggests the scraper may be extracting duplicate records")
                print("   or pulling data from multiple dogs per race page.")
                print(f"   Expected: 10-100 races per dog, Got: {avg_races_per_dog:.1f}")

                # Try to diagnose the issue
                _diagnose_high_record_count(output_file, total_records, len(dog_ids))
            else:
                print("✅ Records per dog seems reasonable for greyhound racing")

        return total_records

    except Exception as e:
        # Notebook-level boundary: report the failure and return a neutral count.
        print(f"❌ Error during scraping: {str(e)}")
        return 0

# CONFIGURATION - Edit these values as needed
START_DOG_ID = 650968  # Starting dog ID (inclusive)
END_DOG_ID = 650978    # Ending dog ID (exclusive; reduced to 10 dogs for testing)
OUTPUT_FILE = "dogs3.csv"  # Output file name

# RUN THE SCRAPING
print("🔧 Fast Scraping Configuration:")
print(f"   Start ID: {START_DOG_ID}")
print(f"   End ID: {END_DOG_ID}")
print(f"   Total dogs: {END_DOG_ID - START_DOG_ID}")
print(f"   Output: {OUTPUT_FILE}")
print()

# Execute the scraping
total_scraped = run_fast_scraping(
    start_id=START_DOG_ID,
    end_id=END_DOG_ID,
    output_file=OUTPUT_FILE
)

# Final summary
print(f"\n🎯 FINAL RESULT: {total_scraped} total records scraped")
if os.path.exists(OUTPUT_FILE):
    file_size = os.path.getsize(OUTPUT_FILE) / 1024  # KB
    print(f"📁 File size: {file_size:.1f} KB")

    # Additional analysis of the whole output file
    print("\n🔬 DETAILED ANALYSIS:")
    try:
        import pandas as pd
        df = pd.read_csv(OUTPUT_FILE)

        if 'dog_id' in df.columns:
            unique_dogs = df['dog_id'].nunique()
            dogs_processed = END_DOG_ID - START_DOG_ID

            print("   📊 Data Breakdown:")
            print(f"     - Total records: {len(df)}")
            print(f"     - Unique dogs in data: {unique_dogs}")
            print(f"     - Dogs we tried to scrape: {dogs_processed}")
            # A file with no dog IDs (header only, or all values missing)
            # has nothing to average over; report that and keep going.
            if unique_dogs:
                print(f"     - Records per unique dog: {len(df)/unique_dogs:.1f}")
            else:
                print("     - Records per unique dog: n/a (no dog IDs in file)")

            # NOTE(review): this compares every dog in the file with the
            # size of this run's ID range; if the file also holds records
            # from earlier runs the two will differ - confirm intent.
            if unique_dogs != dogs_processed:
                print(f"   ⚠️  MISMATCH: Expected {dogs_processed} dogs, found {unique_dogs}")

            # Show sample of dog IDs found
            sample_dogs = sorted(df['dog_id'].unique())[:10]
            print(f"     - Sample dog IDs found: {sample_dogs}")

        else:
            print("   ❌ No 'dog_id' column found in output")
            print(f"     Available columns: {list(df.columns)}")

    except Exception as e:
        print(f"   ❌ Error analyzing output: {e}")

🔧 Fast Scraping Configuration:
   Start ID: 650968
   End ID: 650978
   Total dogs: 10
   Output: dogs3.csv

🚀 Starting fast scraping for 10 dogs (IDs 650968-650977)
📂 Output file: dogs3.csv
Loading existing data from dogs3.csv...
Loaded 11042 existing records, 1961 unique races
Collecting race URLs from all dogs...
Processing dog 1/10: 650968
Collecting race URLs from all dogs...
Processing dog 1/10: 650968
Processing dog 2/10: 650969
Processing dog 2/10: 650969
Processing dog 3/10: 650970
Processing dog 3/10: 650970


# Greyhound Racing Dataset - Column Explanations

## Overview
This CSV file contains comprehensive greyhound racing data with 46 columns capturing race information, dog characteristics, performance metrics, and derived features for machine learning analysis.

## Column Descriptions

### Basic Race Information
- **Column 1-2**: `meeting_id`, `race_id` - Unique identifiers for the racing meeting and specific race
- **Column 3**: `date` - Race date (e.g., "Thursday 6th February 2025")
- **Column 4**: `track` - Racing venue (Newcastle, Towcester, Doncaster, etc.)
- **Column 5**: `time` - Race start time
- **Column 6**: `grade` - Race grade/class (A3, A4, B4, etc.)
- **Column 7**: `distance` - Race distance in meters (450m, 480m, 500m, etc.)
- **Column 8**: `prize_info` - Prize money breakdown for winners and placed dogs

### Race Performance
- **Column 9**: `finishing_position_text` - Finishing position as text (1st, 2nd, 3rd, etc.)
- **Column 10**: `trap_number` - Starting trap number (1-6)
- **Column 11**: `dog_id` - Unique identifier for the dog
- **Column 12**: `dog_name` - Name of the greyhound
- **Column 13**: `trainer` - Trainer's name
- **Column 14**: `comment` - Race commentary describing the dog's performance
- **Column 15**: `odds` - Betting odds (e.g., "3/1", "11/8F" where F = favorite)
- **Column 16**: `sectional_time` - Split time at specific distance point
- **Column 17**: `finish_time` - Final race time with margin behind winner
- **Column 18**: `date_of_birth` - Dog's birth date
- **Column 19**: `weight` - Dog's racing weight in kg
- **Column 20**: `color_sex` - Color and sex code (e.g., "b - bk" = bitch - black)
- **Column 21**: `sire` - Father's name
- **Column 22**: `dam` - Mother's name
- **Column 23**: `breeding_info` - Combined breeding information
- **Column 24**: `url` - Link to race details

### Processed Features
- **Column 25**: `distance_numeric` - Race distance as numeric value
- **Column 26**: `finishing_position` - Finishing position as number (1.0, 2.0, etc.)
- **Column 27**: `weight_numeric` - Weight as numeric value
- **Column 28**: `trap_number_numeric` - Trap number as numeric
- **Column 29**: `sectional_time_numeric` - Sectional time as number
- **Column 30**: `won_race` - Binary indicator (1 if won, 0 if not)

### Performance Analysis Features
- **Column 31**: `margin_lengths` - Margin behind winner in lengths
- **Column 32**: `odds_numeric` - Converted odds as decimal number
- **Column 33**: `color_code` - Simplified color code (bk, bd, be, f, etc.)
- **Column 34**: `is_favorite` - Binary indicator if dog was favorite
- **Column 35**: `early_pace` - Indicator of early speed/position
- **Column 36**: `led_at_some_point` - Whether dog led during race
- **Column 37**: `bumped_or_crowded` - Indicator of racing interference
- **Column 38**: `clear_run` - Whether dog had unimpeded run
- **Column 39**: `ran_on` - Whether dog finished strongly
- **Column 40**: `checked_or_blocked` - Racing trouble indicators
- **Column 41**: `wide_run` - Whether dog raced wide

### Statistical Features
- **Column 42**: `performance_score` - Calculated performance metric
- **Column 43**: `is_short_distance` - Binary indicator for sprint races
- **Column 44**: `is_long_distance` - Binary indicator for distance races  
- **Column 45**: `is_middle_distance` - Binary indicator for middle distance
- **Column 46**: `track_type` - Numeric track type classification

## Key Insights from the Data

### Race Grades
- **A grades**: Higher class races (A2, A3, A4, etc.)
- **B grades**: Mid-level competition  
- **D grades**: Lower class/maiden races
- **HP/OR**: Handicap/Open races

### Performance Indicators
The comment field contains valuable racing information:
- **"ALed"** = Always led
- **"QAw"** = Quick away from traps
- **"Crd"** = Crowded during race
- **"Bmp"** = Bumped by other dogs
- **"RnOn"** = Ran on strongly at finish (positive - dog accelerated/finished with strong pace in final stretch)
- **"SAw"** = Slow away from traps
- **"Wide"** = Raced wide around bends

### Distance Categories
- **Short**: 245m-285m (sprint races)
- **Middle**: 400m-450m (standard distances)  
- **Long**: 480m-500m+ (staying races)

## Racing Commentary Explanation

### "Ran On" (RnOn) - Detailed Meaning:
In greyhound racing, **"Ran On"** is a **positive performance indicator** that means:

1. **Strong Finish**: The dog accelerated or maintained strong pace in the final portion of the race
2. **Late Speed**: Shows the dog has stamina and finishing kick
3. **Closing Ground**: Often indicates the dog was gaining on leaders or maintaining position strongly
4. **Good Fitness**: Suggests the dog is in good racing condition
5. **Distance Suitability**: May indicate the dog suits longer distances where stamina matters

**This is GOOD performance** - it shows the dog finished the race strongly rather than tiring. Dogs that "run on" are often considered to have good racing fitness and potential for improvement at longer distances.

**Contrast with negative terms**:
- "Tired" = Dog slowed significantly in final stretch
- "Faded" = Dog lost position/pace late in race
- "Weakened" = Dog showed lack of stamina

This dataset appears designed for predictive modeling of greyhound race outcomes, with features capturing both historical performance and race-day factors that influence results.

In [None]:
# TIME PROJECTION CALCULATOR
# Based on actual performance: 21 dogs in 204 seconds

def calculate_scraping_projections(dogs_scraped, time_taken, target_dogs=100000):
    """Print time projections for large-scale scraping from one measured run.

    Args:
        dogs_scraped: Number of dogs scraped in the measured run. Must be
            greater than 0; otherwise a notice is printed and nothing is
            projected.
        time_taken: Duration of the measured run, in seconds.
        target_dogs: Number of dogs to project the total time for.

    Returns:
        None. All results are printed.
    """

    if dogs_scraped <= 0:
        # Without at least one scraped dog there is no rate to extrapolate.
        print("⚠️ Cannot project scraping time: dogs_scraped must be greater than 0")
        return

    seconds_per_dog = time_taken / dogs_scraped
    # A zero-duration measurement means an unbounded rate, not a crash.
    dogs_per_hour = 3600 / seconds_per_dog if seconds_per_dog != 0 else float("inf")

    print("⏱️ SCRAPING TIME PROJECTIONS")
    print("=" * 50)
    print("Current Performance:")
    print(f"  - Dogs scraped: {dogs_scraped}")
    print(f"  - Time taken: {time_taken:.1f} seconds ({time_taken/60:.1f} minutes)")
    print(f"  - Rate: {seconds_per_dog:.1f} seconds per dog")
    print(f"  - Speed: {dogs_per_hour:.1f} dogs per hour")

    print(f"\nProjection for {target_dogs:,} dogs:")

    total_seconds = target_dogs * seconds_per_dog
    hours = total_seconds / 3600
    days = hours / 24

    print(f"  - Total time: {total_seconds:,.0f} seconds")
    print(f"  - Hours: {hours:.1f} hours")
    print(f"  - Days: {days:.1f} days")

    if days > 7:
        weeks = days / 7
        print(f"  - Weeks: {weeks:.1f} weeks")

    # Show different scenarios, each in the largest unit that fits
    print("\nTime estimates for different scales:")
    scales = [1000, 10000, 50000, 100000, 500000]

    for scale in scales:
        scale_hours = scale * seconds_per_dog / 3600
        scale_days = scale_hours / 24

        if scale_days < 1:
            print(f"  - {scale:,} dogs: {scale_hours:.1f} hours")
        elif scale_days < 7:
            print(f"  - {scale:,} dogs: {scale_days:.1f} days")
        else:
            scale_weeks = scale_days / 7
            print(f"  - {scale:,} dogs: {scale_days:.1f} days ({scale_weeks:.1f} weeks)")

    # Optimization suggestions
    print("\n💡 OPTIMIZATION RECOMMENDATIONS:")

    if seconds_per_dog > 5:
        print("  ⚠️ Current speed is quite slow (>5 sec/dog)")
        print("  Suggestions:")
        print("    - Use more HTTP requests, less Selenium")
        print("    - Implement parallel processing")
        print("    - Cache/skip already processed dogs")
        print("    - Use headless browser optimization")

    target_speed = 2.0  # 2 seconds per dog target
    if seconds_per_dog > target_speed:
        improvement_factor = seconds_per_dog / target_speed
        # Projected total at the target rate
        optimized_time = target_dogs * target_speed
        optimized_days = optimized_time / 86400

        print(f"  🎯 If optimized to {target_speed} sec/dog:")
        print(f"    - {target_dogs:,} dogs would take: {optimized_days:.1f} days")
        print(f"    - Speed improvement needed: {improvement_factor:.1f}x faster")

# Project from the measured run quoted above (21 dogs in 204 seconds)
calculate_scraping_projections(dogs_scraped=21, time_taken=204, target_dogs=100000)

# Closing advice, printed one line at a time
reality_check_lines = (
    "\n" + "=" * 50,
    "🚨 REALITY CHECK:",
    "11+ days of continuous scraping is not practical!",
    "\nBetter approach:",
    "1. 📊 Focus on specific valuable ranges (recent dogs)",
    "2. ⚡ Optimize scraper to <2 seconds per dog",
    "3. 🔄 Use parallel processing",
    "4. 💾 Implement smart caching/resume capability",
    "5. 🎯 Target ~10,000-50,000 most valuable dogs instead",
)
for reality_check_line in reality_check_lines:
    print(reality_check_line)

In [None]:
# FAST SCRAPING NOTEBOOK - Single Cell Solution
# Import and use the optimized fast scraping function

import sys
import os
import time

# Add current directory to path to import our module
sys.path.append(os.getcwd())

# Import the fast scraping function
from fast_scraping import fast_scrape_multiple_dogs

def _diagnose_high_record_count(output_file, total_records, dogs_requested):
    """
    Print a breakdown of output_file after an implausibly high records-per-dog average

    Reads the CSV at output_file (if it exists) and prints unique dog and
    race counts next to the number of dogs requested. Errors raised while
    reading or analyzing the file are printed and swallowed.

    Args:
        output_file: Path of the CSV written by the scraper
        total_records: Record count reported by the scraper
        dogs_requested: Number of dog IDs passed to the scraper
    """
    if not os.path.exists(output_file):
        return

    import pandas as pd
    try:
        df = pd.read_csv(output_file)
        unique_dogs = df['dog_id'].nunique() if 'dog_id' in df.columns else 0
        unique_races = df['race_id'].nunique() if 'race_id' in df.columns else 0

        print("   🔍 Diagnosis:")
        print(f"     - Total records: {len(df)}")
        print(f"     - Unique dogs: {unique_dogs}")
        print(f"     - Unique races: {unique_races}")
        print(f"     - Dogs processed: {dogs_requested}")

        if unique_dogs > dogs_requested:
            print(f"     ❌ ISSUE: Found {unique_dogs} unique dogs but only processed {dogs_requested}")
            print("        The scraper is extracting ALL dogs from each race page,")
            print("        not just the target dog!")

        if unique_races > total_records / 6:  # Assuming ~6 dogs per race
            print("     ❌ ISSUE: Too many unique races relative to records")
    except Exception as e:
        print(f"   Error analyzing data: {e}")


def run_fast_scraping(start_id=637322, end_id=637332, output_file="dogs3.csv"):
    """
    Run fast scraping for a range of dog IDs and print a run summary

    Args:
        start_id: Starting dog ID (inclusive)
        end_id: Ending dog ID (exclusive)
        output_file: Output CSV file name

    Returns:
        The value returned by fast_scrape_multiple_dogs (printed as the total
        record count), or 0 if any exception is raised during the run.
    """

    # Generate dog ID list (IDs are passed to the scraper as strings)
    dog_ids = [str(i) for i in range(start_id, end_id)]

    print(f"🚀 Starting fast scraping for {len(dog_ids)} dogs (IDs {start_id}-{end_id-1})")
    print(f"📂 Output file: {output_file}")
    print("=" * 50)

    start_time = time.time()

    try:
        # Run the fast scraping
        total_records = fast_scrape_multiple_dogs(
            dog_ids=dog_ids,
            output_file=output_file,
            batch_size=25  # Save progress every 25 races
        )

        elapsed_time = time.time() - start_time

        # A coarse timer can report zero elapsed time; avoid dividing by it.
        if elapsed_time > 0:
            dogs_per_minute = len(dog_ids) / elapsed_time * 60
        else:
            dogs_per_minute = float("inf")

        print("=" * 50)
        print("✅ SCRAPING COMPLETED!")
        print(f"📊 Total records scraped: {total_records}")
        print(f"⏱️ Total time: {elapsed_time:.1f} seconds")
        print(f"🐕 Dogs processed: {len(dog_ids)}")
        print(f"📈 Speed: {dogs_per_minute:.1f} dogs per minute")
        print(f"💾 Data saved to: {output_file}")

        # Skip the per-dog average for an empty ID range (start_id >= end_id):
        # dividing by zero here would be reported as a scraping failure and
        # discard the real record count.
        if total_records > 0 and dog_ids:
            avg_races_per_dog = total_records / len(dog_ids)
            print(f"🏁 Average records per dog: {avg_races_per_dog:.1f}")

            # Check if this seems reasonable
            if avg_races_per_dog > 200:
                print("⚠️  WARNING: Unusually high records per dog detected!")
                print("   This suggests the scraper may be extracting duplicate records")
                print("   or pulling data from multiple dogs per race page.")
                print(f"   Expected: 10-100 races per dog, Got: {avg_races_per_dog:.1f}")

                # Try to diagnose the issue
                _diagnose_high_record_count(output_file, total_records, len(dog_ids))
            else:
                print("✅ Records per dog seems reasonable for greyhound racing")

        return total_records

    except Exception as e:
        # Notebook-level boundary: report the failure and return a neutral count.
        print(f"❌ Error during scraping: {str(e)}")
        return 0

# CONFIGURATION - Edit these values as needed
START_DOG_ID = 650947  # Starting dog ID (inclusive)
END_DOG_ID = 650957    # Ending dog ID (exclusive; reduced to 10 dogs for testing)
OUTPUT_FILE = "dogs3.csv"  # Output file name

# RUN THE SCRAPING
print("🔧 Fast Scraping Configuration:")
print(f"   Start ID: {START_DOG_ID}")
print(f"   End ID: {END_DOG_ID}")
print(f"   Total dogs: {END_DOG_ID - START_DOG_ID}")
print(f"   Output: {OUTPUT_FILE}")
print()

# Execute the scraping
total_scraped = run_fast_scraping(
    start_id=START_DOG_ID,
    end_id=END_DOG_ID,
    output_file=OUTPUT_FILE
)

# Final summary
print(f"\n🎯 FINAL RESULT: {total_scraped} total records scraped")
if os.path.exists(OUTPUT_FILE):
    file_size = os.path.getsize(OUTPUT_FILE) / 1024  # KB
    print(f"📁 File size: {file_size:.1f} KB")

    # Additional analysis of the whole output file
    print("\n🔬 DETAILED ANALYSIS:")
    try:
        import pandas as pd
        df = pd.read_csv(OUTPUT_FILE)

        if 'dog_id' in df.columns:
            unique_dogs = df['dog_id'].nunique()
            dogs_processed = END_DOG_ID - START_DOG_ID

            print("   📊 Data Breakdown:")
            print(f"     - Total records: {len(df)}")
            print(f"     - Unique dogs in data: {unique_dogs}")
            print(f"     - Dogs we tried to scrape: {dogs_processed}")
            # A file with no dog IDs (header only, or all values missing)
            # has nothing to average over; report that and keep going.
            if unique_dogs:
                print(f"     - Records per unique dog: {len(df)/unique_dogs:.1f}")
            else:
                print("     - Records per unique dog: n/a (no dog IDs in file)")

            # NOTE(review): this compares every dog in the file with the
            # size of this run's ID range; if the file also holds records
            # from earlier runs the two will differ - confirm intent.
            if unique_dogs != dogs_processed:
                print(f"   ⚠️  MISMATCH: Expected {dogs_processed} dogs, found {unique_dogs}")

            # Show sample of dog IDs found
            sample_dogs = sorted(df['dog_id'].unique())[:10]
            print(f"     - Sample dog IDs found: {sample_dogs}")

        else:
            print("   ❌ No 'dog_id' column found in output")
            print(f"     Available columns: {list(df.columns)}")

    except Exception as e:
        print(f"   ❌ Error analyzing output: {e}")

# Greyhound Racing Dataset - Column Explanations

## Overview
This CSV file contains comprehensive greyhound racing data with 46 columns capturing race information, dog characteristics, performance metrics, and derived features for machine learning analysis.

## Column Descriptions

### Basic Race Information
- **Column 1-2**: `meeting_id`, `race_id` - Unique identifiers for the racing meeting and specific race
- **Column 3**: `date` - Race date (e.g., "Thursday 6th February 2025")
- **Column 4**: `track` - Racing venue (Newcastle, Towcester, Doncaster, etc.)
- **Column 5**: `time` - Race start time
- **Column 6**: `grade` - Race grade/class (A3, A4, B4, etc.)
- **Column 7**: `distance` - Race distance in meters (450m, 480m, 500m, etc.)
- **Column 8**: `prize_info` - Prize money breakdown for winners and placed dogs

### Race Performance
- **Column 9**: `finishing_position_text` - Finishing position as text (1st, 2nd, 3rd, etc.)
- **Column 10**: `trap_number` - Starting trap number (1-6)
- **Column 11**: `dog_id` - Unique identifier for the dog
- **Column 12**: `dog_name` - Name of the greyhound
- **Column 13**: `trainer` - Trainer's name
- **Column 14**: `comment` - Race commentary describing the dog's performance
- **Column 15**: `odds` - Betting odds (e.g., "3/1", "11/8F" where F = favorite)
- **Column 16**: `sectional_time` - Split time at specific distance point
- **Column 17**: `finish_time` - Final race time with margin behind winner
- **Column 18**: `date_of_birth` - Dog's birth date
- **Column 19**: `weight` - Dog's racing weight in kg
- **Column 20**: `color_sex` - Color and sex code (e.g., "b - bk" = bitch - black)
- **Column 21**: `sire` - Father's name
- **Column 22**: `dam` - Mother's name
- **Column 23**: `breeding_info` - Combined breeding information
- **Column 24**: `url` - Link to race details

### Processed Features
- **Column 25**: `distance_numeric` - Race distance as numeric value
- **Column 26**: `finishing_position` - Finishing position as number (1.0, 2.0, etc.)
- **Column 27**: `weight_numeric` - Weight as numeric value
- **Column 28**: `trap_number_numeric` - Trap number as numeric
- **Column 29**: `sectional_time_numeric` - Sectional time as number
- **Column 30**: `won_race` - Binary indicator (1 if won, 0 if not)

### Performance Analysis Features
- **Column 31**: `margin_lengths` - Margin behind winner in lengths
- **Column 32**: `odds_numeric` - Converted odds as decimal number
- **Column 33**: `color_code` - Simplified color code (bk, bd, be, f, etc.)
- **Column 34**: `is_favorite` - Binary indicator if dog was favorite
- **Column 35**: `early_pace` - Indicator of early speed/position
- **Column 36**: `led_at_some_point` - Whether dog led during race
- **Column 37**: `bumped_or_crowded` - Indicator of racing interference
- **Column 38**: `clear_run` - Whether dog had unimpeded run
- **Column 39**: `ran_on` - Whether dog finished strongly
- **Column 40**: `checked_or_blocked` - Racing trouble indicators
- **Column 41**: `wide_run` - Whether dog raced wide

### Statistical Features
- **Column 42**: `performance_score` - Calculated performance metric
- **Column 43**: `is_short_distance` - Binary indicator for sprint races
- **Column 44**: `is_long_distance` - Binary indicator for distance races  
- **Column 45**: `is_middle_distance` - Binary indicator for middle distance
- **Column 46**: `track_type` - Numeric track type classification

## Key Insights from the Data

### Race Grades
- **A grades**: Higher class races (A2, A3, A4, etc.)
- **B grades**: Mid-level competition  
- **D grades**: Lower class/maiden races
- **HP/OR**: Handicap/Open races

### Performance Indicators
The comment field contains valuable racing information:
- **"ALed"** = Always led
- **"QAw"** = Quick away from traps
- **"Crd"** = Crowded during race
- **"Bmp"** = Bumped by other dogs
- **"RnOn"** = Ran on strongly at finish (positive - dog accelerated/finished with strong pace in final stretch)
- **"SAw"** = Slow away from traps
- **"Wide"** = Raced wide around bends

### Distance Categories
- **Short**: 245m-285m (sprint races)
- **Middle**: 400m-450m (standard distances)  
- **Long**: 480m-500m+ (staying races)

## Racing Commentary Explanation

### "Ran On" (RnOn) - Detailed Meaning:
In greyhound racing, **"Ran On"** is a **positive performance indicator** that means:

1. **Strong Finish**: The dog accelerated or maintained strong pace in the final portion of the race
2. **Late Speed**: Shows the dog has stamina and finishing kick
3. **Closing Ground**: Often indicates the dog was gaining on leaders or maintaining position strongly
4. **Good Fitness**: Suggests the dog is in good racing condition
5. **Distance Suitability**: May indicate the dog suits longer distances where stamina matters

**This is GOOD performance** - it shows the dog finished the race strongly rather than tiring. Dogs that "run on" are often considered to have good racing fitness and potential for improvement at longer distances.

**Contrast with negative terms**:
- "Tired" = Dog slowed significantly in final stretch
- "Faded" = Dog lost position/pace late in race
- "Weakened" = Dog showed lack of stamina

This dataset appears designed for predictive modeling of greyhound race outcomes, with features capturing both historical performance and race-day factors that influence results.

In [None]:
# TIME PROJECTION CALCULATOR
# Based on actual performance: 21 dogs in 204 seconds

def calculate_scraping_projections(dogs_scraped, time_taken, target_dogs=100000):
    """Print time projections for large-scale scraping from one measured run.

    Args:
        dogs_scraped: Number of dogs scraped in the measured run. Must be
            greater than 0; otherwise a notice is printed and nothing is
            projected.
        time_taken: Duration of the measured run, in seconds.
        target_dogs: Number of dogs to project the total time for.

    Returns:
        None. All results are printed.
    """

    if dogs_scraped <= 0:
        # Without at least one scraped dog there is no rate to extrapolate.
        print("⚠️ Cannot project scraping time: dogs_scraped must be greater than 0")
        return

    seconds_per_dog = time_taken / dogs_scraped
    # A zero-duration measurement means an unbounded rate, not a crash.
    dogs_per_hour = 3600 / seconds_per_dog if seconds_per_dog != 0 else float("inf")

    print("⏱️ SCRAPING TIME PROJECTIONS")
    print("=" * 50)
    print("Current Performance:")
    print(f"  - Dogs scraped: {dogs_scraped}")
    print(f"  - Time taken: {time_taken:.1f} seconds ({time_taken/60:.1f} minutes)")
    print(f"  - Rate: {seconds_per_dog:.1f} seconds per dog")
    print(f"  - Speed: {dogs_per_hour:.1f} dogs per hour")

    print(f"\nProjection for {target_dogs:,} dogs:")

    total_seconds = target_dogs * seconds_per_dog
    hours = total_seconds / 3600
    days = hours / 24

    print(f"  - Total time: {total_seconds:,.0f} seconds")
    print(f"  - Hours: {hours:.1f} hours")
    print(f"  - Days: {days:.1f} days")

    if days > 7:
        weeks = days / 7
        print(f"  - Weeks: {weeks:.1f} weeks")

    # Show different scenarios, each in the largest unit that fits
    print("\nTime estimates for different scales:")
    scales = [1000, 10000, 50000, 100000, 500000]

    for scale in scales:
        scale_hours = scale * seconds_per_dog / 3600
        scale_days = scale_hours / 24

        if scale_days < 1:
            print(f"  - {scale:,} dogs: {scale_hours:.1f} hours")
        elif scale_days < 7:
            print(f"  - {scale:,} dogs: {scale_days:.1f} days")
        else:
            scale_weeks = scale_days / 7
            print(f"  - {scale:,} dogs: {scale_days:.1f} days ({scale_weeks:.1f} weeks)")

    # Optimization suggestions
    print("\n💡 OPTIMIZATION RECOMMENDATIONS:")

    if seconds_per_dog > 5:
        print("  ⚠️ Current speed is quite slow (>5 sec/dog)")
        print("  Suggestions:")
        print("    - Use more HTTP requests, less Selenium")
        print("    - Implement parallel processing")
        print("    - Cache/skip already processed dogs")
        print("    - Use headless browser optimization")

    target_speed = 2.0  # 2 seconds per dog target
    if seconds_per_dog > target_speed:
        improvement_factor = seconds_per_dog / target_speed
        # Projected total at the target rate
        optimized_time = target_dogs * target_speed
        optimized_days = optimized_time / 86400

        print(f"  🎯 If optimized to {target_speed} sec/dog:")
        print(f"    - {target_dogs:,} dogs would take: {optimized_days:.1f} days")
        print(f"    - Speed improvement needed: {improvement_factor:.1f}x faster")

# Project from the measured run quoted above (21 dogs in 204 seconds)
calculate_scraping_projections(dogs_scraped=21, time_taken=204, target_dogs=100000)

# Closing advice, printed one line at a time
reality_check_lines = (
    "\n" + "=" * 50,
    "🚨 REALITY CHECK:",
    "11+ days of continuous scraping is not practical!",
    "\nBetter approach:",
    "1. 📊 Focus on specific valuable ranges (recent dogs)",
    "2. ⚡ Optimize scraper to <2 seconds per dog",
    "3. 🔄 Use parallel processing",
    "4. 💾 Implement smart caching/resume capability",
    "5. 🎯 Target ~10,000-50,000 most valuable dogs instead",
)
for reality_check_line in reality_check_lines:
    print(reality_check_line)

In [None]:
# PARALLEL FAST SCRAPING NOTEBOOK - Multi-threaded Solution
# Import and use the optimized fast scraping function with parallel processing

import sys
import os
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing

# Add current directory to path to import our module
sys.path.append(os.getcwd())

# Import the fast scraping function
from fast_scraping import fast_scrape_multiple_dogs

class ParallelScraper:
    """Parallel scraper class to handle multi-threaded dog scraping.

    Each batch of dog IDs is scraped into its own CSV file by
    scrape_dog_batch; merge_batch_files then combines those files (plus any
    existing final file) into a single CSV.

    Attributes:
        max_workers: Number of workers this scraper was configured with.
        results_queue: A queue.Queue created at construction; the methods of
            this class do not use it.
        lock: Guards updates to total_records.
        total_records: Running sum of the record counts returned per batch.
    """

    def __init__(self, max_workers=None):
        """Store the worker count and initialise shared counters.

        Args:
            max_workers: Number of parallel workers. None selects
                min(8, 4 x CPU count).
        """
        if max_workers is None:
            # Use 4x CPU cores but cap at 8 to avoid overwhelming the server
            max_workers = min(8, multiprocessing.cpu_count() * 4)

        self.max_workers = max_workers
        self.results_queue = queue.Queue()
        self.lock = threading.Lock()
        self.total_records = 0

        print(f"🚀 Parallel Scraper initialized with {max_workers} workers")

    def scrape_dog_batch(self, dog_batch, batch_id, output_file_base):
        """Scrape one batch of dogs into its own CSV file.

        Args:
            dog_batch: Dog IDs to pass to fast_scrape_multiple_dogs.
            batch_id: Identifier used in log lines and in the file name.
            output_file_base: Prefix of the batch file; the file is named
                "<output_file_base>_batch_<batch_id>.csv".

        Returns:
            A dict with keys 'batch_id', 'records', 'output_file', 'time'
            and 'dogs_processed'. If any exception is raised the dict has
            records 0, output_file None, time 0, dogs_processed 0 and an
            extra 'error' key holding the exception text.
        """
        try:
            batch_output_file = f"{output_file_base}_batch_{batch_id}.csv"

            print(f"  🔄 Worker {batch_id}: Processing {len(dog_batch)} dogs")
            start_time = time.time()

            # Use the existing fast scraping function for this batch
            batch_records = fast_scrape_multiple_dogs(
                dog_ids=dog_batch,
                output_file=batch_output_file,
                batch_size=10  # Smaller batch size for frequent saves
            )

            elapsed_time = time.time() - start_time

            # total_records is shared between worker threads
            with self.lock:
                self.total_records += batch_records

            print(f"  ✅ Worker {batch_id}: {batch_records} records in {elapsed_time:.1f}s")

            return {
                'batch_id': batch_id,
                'records': batch_records,
                'output_file': batch_output_file,
                'time': elapsed_time,
                'dogs_processed': len(dog_batch)
            }

        except Exception as e:
            print(f"  ❌ Worker {batch_id} failed: {str(e)}")
            # Same keys as the success result, so code that reads
            # result['dogs_processed'] does not fail on a failed batch.
            return {
                'batch_id': batch_id,
                'records': 0,
                'output_file': None,
                'time': 0,
                'dogs_processed': 0,
                'error': str(e)
            }

    def merge_batch_files(self, batch_results, final_output_file):
        """Merge all batch CSV files into one final file.

        Existing rows in final_output_file (if it can be read) are kept and
        the batch rows are appended after them. Batch files are deleted only
        after the merged file has been written, so a failed write leaves the
        scraped data on disk.

        Args:
            batch_results: Result dicts as returned by scrape_dog_batch.
            final_output_file: Path of the merged CSV to write.

        Returns:
            Number of rows in the merged file, or 0 when no batch file could
            be read (in which case nothing is written).
        """
        print("🔗 Merging batch files...")

        import pandas as pd
        all_dataframes = []
        merged_batch_files = []

        # Check if final output file already exists and load existing data
        existing_data = None
        if os.path.exists(final_output_file):
            try:
                existing_data = pd.read_csv(final_output_file)
                print(f"  📂 Found existing data: {len(existing_data)} records")
            except Exception as e:
                print(f"  ⚠️ Could not read existing file: {e}")

        # Process batch files
        for result in batch_results:
            batch_file = result['output_file']
            if batch_file and os.path.exists(batch_file):
                try:
                    df = pd.read_csv(batch_file)
                    all_dataframes.append(df)
                    merged_batch_files.append(batch_file)
                    print(f"  📄 Batch {result['batch_id']}: {len(df)} records")
                except Exception as e:
                    print(f"  ⚠️ Error reading batch file {batch_file}: {e}")

        if not all_dataframes:
            print("  ❌ No valid batch files to merge")
            return 0

        # Combine all new data
        new_data = pd.concat(all_dataframes, ignore_index=True)

        # Combine with existing data if present
        if existing_data is not None:
            final_df = pd.concat([existing_data, new_data], ignore_index=True)
            print(f"  🔗 Combined {len(existing_data)} existing + {len(new_data)} new = {len(final_df)} total records")
        else:
            final_df = new_data
            print(f"  📊 New dataset with {len(final_df)} records")

        # Save to final output file
        final_df.to_csv(final_output_file, index=False)

        # Clean up batch files only now that their rows are safely written
        for batch_file in merged_batch_files:
            try:
                os.remove(batch_file)
            except OSError as e:
                print(f"  ⚠️ Could not remove batch file {batch_file}: {e}")

        print(f"  ✅ Merged {len(all_dataframes)} batch files into {final_output_file}")
        return len(final_df)

def run_parallel_scraping(start_id=650947, end_id=650967, output_file="dogs3.csv", max_workers=None):
    """
    Scrape a contiguous range of dog IDs in parallel and merge the results.

    The ID range is split into batches, each batch is submitted to
    ``ParallelScraper.scrape_dog_batch`` on a thread pool, and the per-batch
    results are handed to ``ParallelScraper.merge_batch_files``. A progress
    line is printed for every finished batch and a summary report at the end.

    Args:
        start_id: Starting dog ID (inclusive)
        end_id: Ending dog ID (exclusive)
        output_file: Kept for interface compatibility only; the value is
            ignored and the output is always written to "dogs3.csv"
        max_workers: Number of parallel workers
            (None = min(8, 2 x CPU count))

    Returns:
        The record count returned by ``merge_batch_files``, or 0 if an
        exception escaped the scraping/merging steps.
    """

    # FORCE output to be dogs3.csv to match existing data
    output_file = "dogs3.csv"

    # Generate dog ID list
    dog_ids = [str(i) for i in range(start_id, end_id)]

    if max_workers is None:
        max_workers = min(8, multiprocessing.cpu_count() * 2)

    print(f"🚀 PARALLEL SCRAPING - {len(dog_ids)} dogs with {max_workers} workers")
    print(f"📂 Output file: {output_file} (FIXED)")
    print(f"🔧 System: {multiprocessing.cpu_count()} CPU cores")
    print("=" * 60)

    start_time = time.time()

    try:
        # Initialize parallel scraper
        scraper = ParallelScraper(max_workers=max_workers)

        # Split dogs into batches for workers
        batch_size = max(1, len(dog_ids) // max_workers)
        dog_batches = [dog_ids[i:i + batch_size] for i in range(0, len(dog_ids), batch_size)]

        print(f"📦 Split into {len(dog_batches)} batches of ~{batch_size} dogs each")

        # Process batches in parallel
        batch_results = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all batch jobs
            future_to_batch = {
                executor.submit(scraper.scrape_dog_batch, batch, i, "temp_batch"): i
                for i, batch in enumerate(dog_batches)
            }

            # Collect results as they complete
            for future in as_completed(future_to_batch):
                batch_id = future_to_batch[future]
                try:
                    result = future.result()
                except Exception as e:
                    print(f"❌ Batch {batch_id} generated an exception: {e}")
                    # The merge step indexes result['output_file'] directly,
                    # so a failure record must carry that key; without it a
                    # single failed batch raises KeyError and the results of
                    # every successful batch are never merged.
                    batch_results.append({
                        'batch_id': batch_id,
                        'records': 0,
                        'output_file': None,
                        'error': str(e)
                    })
                else:
                    batch_results.append(result)

                    progress = len(batch_results) / len(dog_batches) * 100
                    print(f"📊 Progress: {len(batch_results)}/{len(dog_batches)} batches ({progress:.1f}%)")

        # Merge all batch files into dogs3.csv
        total_records = scraper.merge_batch_files(batch_results, output_file)

        elapsed_time = time.time() - start_time

        # A coarse clock can report 0 elapsed seconds for a tiny/empty run;
        # report a rate of 0 instead of letting ZeroDivisionError discard an
        # otherwise successful result.
        dogs_per_minute = len(dog_ids) / elapsed_time * 60 if elapsed_time > 0 else 0.0
        records_per_second = total_records / elapsed_time if elapsed_time > 0 else 0.0

        print("=" * 60)
        print("✅ PARALLEL SCRAPING COMPLETED!")
        print(f"📊 Total records in {output_file}: {total_records}")
        print(f"⏱️ Total time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
        print(f"🐕 Dogs processed: {len(dog_ids)}")
        print(f"🔧 Workers used: {max_workers}")
        print(f"📈 Speed: {dogs_per_minute:.1f} dogs per minute")
        print(f"💾 Data saved to: {output_file}")

        # Performance analysis
        successful_batches = [r for r in batch_results if r.get('records', 0) > 0]
        failed_batches = [r for r in batch_results if r.get('error')]

        print("\n📈 PERFORMANCE BREAKDOWN:")
        print(f"  - Successful batches: {len(successful_batches)}/{len(batch_results)}")
        print(f"  - Failed batches: {len(failed_batches)}")
        if successful_batches:
            print(f"  - Average time per batch: {sum(r.get('time', 0) for r in successful_batches)/len(successful_batches):.1f}s")
        print(f"  - Records per second: {records_per_second:.1f}")

        # Data validation
        if total_records > 0:
            new_records_added = sum(r.get('records', 0) for r in successful_batches)
            print(f"🆕 New records added this run: {new_records_added}")

            # Analyze the final dataset
            if os.path.exists(output_file):
                try:
                    import pandas as pd
                    df = pd.read_csv(output_file)

                    if 'dog_id' in df.columns:
                        unique_dogs = df['dog_id'].nunique()
                        print("📊 Final dataset analysis:")
                        print(f"  - Total records: {len(df)}")
                        print(f"  - Unique dogs: {unique_dogs}")
                        print(f"  - Average records per dog: {len(df)/unique_dogs:.1f}")

                        # Show sample of latest dogs
                        latest_dogs = sorted(df['dog_id'].unique(), reverse=True)[:5]
                        print(f"  - Latest dog IDs: {latest_dogs}")
                except Exception as e:
                    # Analysis is best-effort reporting; never fail the run over it
                    print(f"  ❌ Error analyzing final dataset: {e}")

        return total_records

    except Exception as e:
        print(f"❌ Error during parallel scraping: {str(e)}")
        return 0

# CONFIGURATION - Edit these values as needed
START_DOG_ID = 600000  # Starting dog ID (inclusive)
END_DOG_ID = 650000    # Ending dog ID (exclusive) - 50,000 dogs with these values
OUTPUT_FILE = "dogs3.csv"  # MUST be dogs3.csv to match existing data
MAX_WORKERS = None  # None = auto-detect optimal workers

# Choose your test mode
TEST_MODE = "parallel"  # Options: "parallel", "comparison", "scale_test"

if TEST_MODE == "parallel":
    print("🔧 Parallel Scraping Configuration:")
    print(f"   Start ID: {START_DOG_ID}")
    print(f"   End ID: {END_DOG_ID}")
    print(f"   Total dogs: {END_DOG_ID - START_DOG_ID}")
    print(f"   Output: {OUTPUT_FILE} (FORCED)")
    print(f"   Max workers: {MAX_WORKERS or 'Auto-detect'}")
    print()

    # Execute parallel scraping - output_file parameter ignored, always uses dogs3.csv
    total_scraped = run_parallel_scraping(
        start_id=START_DOG_ID,
        end_id=END_DOG_ID,
        output_file=OUTPUT_FILE,  # This will be forced to dogs3.csv
        max_workers=MAX_WORKERS
    )

    # Final summary
    # run_parallel_scraping returns 0 when it hit an error or had nothing to
    # merge, so only claim that data was appended when it reported records.
    if total_scraped > 0:
        print("\n🎯 FINAL RESULT: Data appended to dogs3.csv")
    else:
        print("\n⚠️ FINAL RESULT: No new data was written to dogs3.csv")
    if os.path.exists("dogs3.csv"):
        file_size = os.path.getsize("dogs3.csv") / 1024  # KB
        print(f"📁 dogs3.csv file size: {file_size:.1f} KB")

        # Show file info
        try:
            import pandas as pd
            df = pd.read_csv("dogs3.csv")
            print(f"📊 dogs3.csv contains {len(df)} total records")
            if 'dog_id' in df.columns:
                unique_dogs = df['dog_id'].nunique()
                print(f"🐕 Unique dogs in dataset: {unique_dogs}")
        except Exception as e:
            # Reporting only; a read failure here must not abort the cell
            print(f"❌ Error reading dogs3.csv: {e}")

# ...existing test modes...

print("\n💡 IMPORTANT:")
print("✅ All data is now saved to dogs3.csv in the same format")
print("✅ New records are appended to existing data")
print("✅ No separate parallel output files are created")

🔧 Parallel Scraping Configuration:
   Start ID: 651200
   End ID: 651250
   Total dogs: 50
   Output: dogs3.csv (FORCED)
   Max workers: Auto-detect

🚀 PARALLEL SCRAPING - 50 dogs with 8 workers
📂 Output file: dogs3.csv (FIXED)
🔧 System: 12 CPU cores
🚀 Parallel Scraper initialized with 8 workers
📦 Split into 9 batches of ~6 dogs each
  🔄 Worker 0: Processing 6 dogs
  🔄 Worker 1: Processing 6 dogs
  🔄 Worker 2: Processing 6 dogs
  🔄 Worker 3: Processing 6 dogs
  🔄 Worker 4: Processing 6 dogs
  🔄 Worker 5: Processing 6 dogs
  🔄 Worker 6: Processing 6 dogs
  🔄 Worker 7: Processing 6 dogs
Collecting race URLs from all dogs...
Processing dog 1/6: 651242
Collecting race URLs from all dogs...
Processing dog 1/6: 651206
Collecting race URLs from all dogs...
Processing dog 1/6: 651212
Collecting race URLs from all dogs...
Processing dog 1/6: 651200
Collecting race URLs from all dogs...
Processing dog 1/6: 651236
Collecting race URLs from all dogs...
Processing dog 1/6: 651218
Collecting race UR

In [None]:
# UPDATED TIME PROJECTION CALCULATOR - PARALLEL PERFORMANCE
# Based on actual parallel performance: 50 dogs in 64 seconds

def calculate_parallel_scraping_projections(dogs_scraped, time_taken, target_dogs=20000, old_rate=9.7):
    """Print time projections extrapolated from one measured parallel scraping run.

    The measured seconds-per-dog rate is scaled linearly to ``target_dogs``
    and to a fixed list of dataset sizes, then compared against ``old_rate``.

    Args:
        dogs_scraped: Number of dogs scraped in the measured run (must be > 0)
        time_taken: Duration of the measured run in seconds (must be > 0)
        target_dogs: Number of dogs to project the total time for
        old_rate: Seconds per dog of the previous system, used as the
            baseline for the improvement comparison

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError: If ``dogs_scraped`` or ``time_taken`` is not positive.
    """
    # Every figure below divides by one of these; fail up front with a clear
    # message instead of a ZeroDivisionError halfway through the report.
    if dogs_scraped <= 0 or time_taken <= 0:
        raise ValueError(
            "dogs_scraped and time_taken must both be positive "
            f"(got dogs_scraped={dogs_scraped!r}, time_taken={time_taken!r})"
        )

    seconds_per_dog = time_taken / dogs_scraped

    print("⚡ PARALLEL SCRAPING TIME PROJECTIONS")
    print("=" * 60)
    print("🚀 CURRENT PARALLEL PERFORMANCE:")
    print(f"  - Dogs scraped: {dogs_scraped}")
    print(f"  - Time taken: {time_taken:.1f} seconds ({time_taken/60:.1f} minutes)")
    print(f"  - Rate: {seconds_per_dog:.2f} seconds per dog")
    print(f"  - Speed: {3600/seconds_per_dog:.1f} dogs per hour")
    print(f"  - Speed: {dogs_scraped/(time_taken/60):.1f} dogs per minute")

    print(f"\n🎯 PROJECTION FOR {target_dogs:,} DOGS:")

    total_seconds = target_dogs * seconds_per_dog
    hours = total_seconds / 3600

    print(f"  - Total time: {total_seconds:,.0f} seconds")
    print(f"  - Hours: {hours:.1f} hours")

    if hours < 24:
        print(f"  - Time: {hours:.1f} hours ({hours*60:.0f} minutes)")
    else:
        days = hours / 24
        print(f"  - Days: {days:.1f} days")
        print(f"  - Work sessions: {hours/8:.1f} × 8-hour sessions")

    # Show different scenarios
    print("\n📊 TIME ESTIMATES FOR DIFFERENT SCALES:")
    scales = [1000, 5000, 10000, 20000, 50000, 100000]

    for scale in scales:
        scale_seconds = scale * seconds_per_dog
        scale_hours = scale_seconds / 3600

        # Pick the most readable unit for each scale: minutes, hours or days
        if scale_hours < 1:
            scale_minutes = scale_seconds / 60
            print(f"  - {scale:,} dogs: {scale_minutes:.1f} minutes")
        elif scale_hours < 24:
            print(f"  - {scale:,} dogs: {scale_hours:.1f} hours")
        else:
            scale_days = scale_hours / 24
            print(f"  - {scale:,} dogs: {scale_days:.1f} days ({scale_hours:.1f} hours)")

    # Performance comparison with old system
    improvement_factor = old_rate / seconds_per_dog
    old_time_for_target = target_dogs * old_rate / 3600

    print("\n🏆 PERFORMANCE IMPROVEMENT ANALYSIS:")
    print(f"  - Old system: {old_rate:.1f} seconds per dog")
    print(f"  - New parallel: {seconds_per_dog:.2f} seconds per dog")
    print(f"  - Speed improvement: {improvement_factor:.1f}x faster!")
    print(f"  - Time saved for {target_dogs:,} dogs:")
    print(f"    • Old system: {old_time_for_target:.1f} hours ({old_time_for_target/24:.1f} days)")
    print(f"    • New parallel: {hours:.1f} hours")
    print(f"    • Time saved: {old_time_for_target - hours:.1f} hours")

    # Realistic recommendations
    print("\n💡 REALISTIC SCRAPING STRATEGY:")

    # Whole dogs that fit in each session length at the measured rate
    batch_8_hours = int(8 * 3600 / seconds_per_dog)
    batch_4_hours = int(4 * 3600 / seconds_per_dog)
    batch_1_hour = int(1 * 3600 / seconds_per_dog)

    print("  🕐 What you can scrape in different time windows:")
    print(f"    • 1 hour session: ~{batch_1_hour:,} dogs")
    print(f"    • 4 hour session: ~{batch_4_hours:,} dogs")
    print(f"    • 8 hour session: ~{batch_8_hours:,} dogs")

    sessions_needed = target_dogs / batch_8_hours
    print(f"\n  🎯 For {target_dogs:,} dogs:")
    print(f"    • {sessions_needed:.1f} × 8-hour sessions")
    print(f"    • Or {target_dogs/batch_4_hours:.1f} × 4-hour sessions")
    print(f"    • Spread over {sessions_needed:.0f}-{sessions_needed*2:.0f} days")

    # System recommendations
    print("\n⚙️ SYSTEM OPTIMIZATION TIPS:")
    print("  ✅ Current parallel setup is EXCELLENT!")
    print(f"  ✅ {improvement_factor:.1f}x improvement achieved")
    print("  💡 Consider these optimizations:")
    print("    • Save progress every 1000 dogs")
    print("    • Monitor for rate limiting")
    print("    • Run during off-peak hours")
    print("    • Use resume capability for long runs")

# Calculate based on your NEW parallel results
calculate_parallel_scraping_projections(
    dogs_scraped=50,
    time_taken=64,
    target_dogs=20000
)

print("\n" + "="*60)
print("🎉 EXCELLENT PROGRESS!")
print("Your parallel optimization worked brilliantly!")
print("20,000 dogs is now achievable in ~7 hours instead of 54+ hours!")
print("\n🚀 RECOMMENDED APPROACH:")
# At the measured rate (64 s / 50 dogs = 1.28 s per dog) ~5,600 dogs takes
# about 2 hours, not 4; 2-hour chunks also match the "3-4 sessions" and
# "~7 hours" figures printed around this line.
print("1. 📊 Run in 2-hour chunks (~5,600 dogs each)")
print("2. 🔄 Take breaks between sessions")
print("3. 💾 Monitor progress and file sizes")
print("4. 🎯 Complete 20K dogs in 3-4 work sessions")
print("5. ⚡ Your 7.6x speedup makes this very practical!")