# GBGB Dog Racing API Scraper

This notebook uses the GBGB API to efficiently collect dog racing data.

Benefits of using the API instead of web scraping:
- Much faster data collection (10-100x faster)
- More reliable - no HTML parsing required
- Less resource-intensive - no browser needed
- Complete data directly from the source
- Less likely to be blocked

**Smart Append Mode**: Only scrapes dogs that haven't been scraped yet, preserving existing data.
**Batch Processing**: Saves data every 1000 dogs to prevent data loss.
**Data Location**: Writes to `dogs5.csv` in the current working directory; a later cell moves that file to `../data/dogs5.csv`.

In [4]:
import requests
import os
import pandas as pd
import time

# Base URL of the per-dog results endpoint; the dog ID is appended as a path segment.
API_BASE = "https://api.gbgb.org.uk/api/results/dog"

# Column order of the output CSV. normalize_item() looks each name up by key in
# an API 'items' entry (missing keys become ""), then overwrites "dogId" with
# the ID that was requested.
CSV_FIELDS = [
    "dogId",  # Injected by normalize_item(), not taken from the API item
    "dogName",  # Looked up in the API item like every field below
    "SP",
    "resultPosition",
    "resultBtnDistance",
    "resultSectionalTime",
    "resultComment",
    "resultRunTime",
    "resultDogWeight",
    "winnerOr2ndName",
    "winnerOr2ndId",
    "resultAdjustedTime",
    "trapNumber",
    "raceTime",
    "raceDate",
    "raceId",
    "raceNumber",
    "raceType",
    "raceClass",
    "raceDistance",
    "raceGoing",
    "raceWinTime",
    "meetingId",
    "trackName",
    "trainerName",
    "ownerName"
]

def get_existing_dog_ids(filename="dogs5.csv"):
    """Return the set of dog IDs already present in the CSV file.

    Args:
        filename: Path of the CSV file to inspect.

    Returns:
        A set of dog IDs rendered as plain integer strings (e.g. ``"500102"``).
        The set is empty when the file does not exist or cannot be read.
    """
    if not os.path.exists(filename):
        return set()

    try:
        # Read only the dogId column for efficiency
        df = pd.read_csv(filename, usecols=['dogId'])
        # Go through numbers first: a single blank dogId cell makes pandas load
        # the column as float, and stringifying that gives "500102.0" / "nan",
        # which never equal the str(dog_id) values this set is compared against.
        numeric_ids = pd.to_numeric(df['dogId'], errors='coerce').dropna()
        existing_ids = set(numeric_ids.astype('int64').astype(str))
        print(f"📊 Found {len(existing_ids)} unique dogs already in {filename}")
        return existing_ids
    except Exception as e:
        # Best effort: an unreadable file is treated as "nothing scraped yet".
        print(f"⚠️ Error reading existing file: {e}")
        return set()

def fetch_items(dog_id, per_page=1000):
    """Request the first page of result items for one dog.

    Args:
        dog_id: ID appended to API_BASE to form the request URL.
        per_page: Value sent as the ``itemsPerPage`` query parameter.

    Returns:
        The ``items`` value of the JSON response (an empty list when the key
        is absent), or None when the server answers 404 or the request/parse
        raises. Only page 1 is requested.
    """
    endpoint = f"{API_BASE}/{dog_id}"
    query = {"page": 1, "itemsPerPage": per_page}

    try:
        response = requests.get(endpoint, params=query, timeout=10)
        if response.status_code != 404:
            # Any other error status is turned into an exception here.
            response.raise_for_status()
            payload = response.json()
            return payload.get("items", [])
    except Exception as e:
        print(f"  ❌ Error fetching dog {dog_id}: {e}")

    # Reached on a 404 (dog doesn't exist) or after a reported error.
    return None

def normalize_item(item, dog_id):
    """Build one CSV row from an API item.

    Args:
        item: Mapping for a single entry of the API 'items' list.
        dog_id: ID to store in the row's "dogId" column.

    Returns:
        A dict with one key per CSV_FIELDS entry; keys absent from ``item``
        are filled with "" and "dogId" is always set to ``dog_id``.
    """
    row = {}
    for column in CSV_FIELDS:
        row[column] = item.get(column, "")
    # The requested ID always wins over whatever the item carried.
    row["dogId"] = dog_id
    return row

def save_to_csv(records, filename="dogs5.csv", header=False):
    """Append records to a CSV file, creating its parent directory if needed.

    Args:
        records: List of row dicts; columns are written in CSV_FIELDS order.
        filename: Destination path. The file is opened in append mode.
        header: Whether to write the column header row before the data.

    Returns:
        The number of records written, or 0 when ``records`` is empty or the
        write failed (the error is printed, not raised).
    """
    if not records:
        return 0

    try:
        # Appending to a path such as "../data/dogs5.csv" fails when the
        # directory is missing, so make sure it exists first.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        df = pd.DataFrame(records, columns=CSV_FIELDS)
        df.to_csv(filename, mode="a", index=False, header=header, encoding='utf-8')
        return len(records)
    except Exception as e:
        print(f"  ❌ Error saving to CSV: {e}")
        return 0

def main_smart_append(start_id=600000, end_id=600200, output_file="dogs5.csv",
                      batch_size=1000, delay=0.1):
    """Scrape a range of dog IDs, appending only dogs not yet in the CSV.

    Dogs whose ID already appears in ``output_file`` are skipped. Fetched
    records are queued in memory and flushed to the CSV every ``batch_size``
    dogs processed, plus once more at the end. A batch whose save fails stays
    queued and is retried at the next checkpoint instead of being discarded.

    Args:
        start_id: First dog ID to process (inclusive).
        end_id: Last dog ID to process (inclusive).
        output_file: CSV path to append to; created with a header if it is
            missing or empty.
        batch_size: Number of processed dogs between checkpoint saves.
        delay: Seconds to sleep after each API request.

    Returns:
        The number of records actually written to ``output_file``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_dogs = end_id - start_id + 1

    print(f"🚀 SMART APPEND MODE: Scraping dogs {start_id} to {end_id}")
    print(f"📂 Output file: {output_file}")
    print(f"💾 Auto-save every {batch_size} dogs")

    # Get existing dog IDs to avoid duplicates
    existing_dog_ids = get_existing_dog_ids(output_file)

    # A header is needed for a new file and also for an existing empty one.
    file_exists = os.path.exists(output_file)
    header_needed = not file_exists or os.path.getsize(output_file) == 0

    if file_exists:
        print(f"✅ File exists - will append new data only")
    else:
        print(f"🆕 Creating new file")

    print("=" * 60)

    total_records = 0
    successful_dogs = 0
    skipped_dogs = 0
    missing_dogs = 0
    start_time = time.time()

    # Batch processing variables
    batch_records = []
    batch_start_id = start_id
    dogs_processed = 0

    for dog_id in range(start_id, end_id + 1):
        dogs_processed += 1

        if str(dog_id) in existing_dog_ids:
            # Already scraped: no request, no delay.
            skipped_dogs += 1
        else:
            try:
                items = fetch_items(dog_id)

                if items is None:
                    # fetch_items returns None for a 404 and for request
                    # errors alike, so both are counted here.
                    missing_dogs += 1
                    if dog_id % 50 == 0:  # Show missing dogs occasionally
                        print(f"  ❌ Dog {dog_id}: No profile found")
                elif not items:
                    if dog_id % 50 == 0:  # Show empty profiles occasionally
                        print(f"  ⚠️ Dog {dog_id}: Profile exists but no race items")
                else:
                    # Queue the records; they are written at the next checkpoint.
                    records = [normalize_item(item, dog_id) for item in items]
                    batch_records.extend(records)
                    successful_dogs += 1
                    print(f"  ✅ Dog {dog_id}: Queued {len(records)} records")
            except Exception as e:
                print(f"  ❌ Error processing dog {dog_id}: {str(e)}")

            # Small delay to be nice to the API. It follows every request,
            # including ones that found nothing.
            if delay > 0:
                time.sleep(delay)

        # Single checkpoint, reached for every dog regardless of outcome.
        if dogs_processed % batch_size == 0:
            if batch_records:
                saved_count = save_to_csv(batch_records, output_file, header_needed)
                if saved_count:
                    total_records += saved_count
                    header_needed = False
                    print(f"📊 Batch save: {saved_count} records from dogs {batch_start_id}-{dog_id}")
                    batch_records = []
                else:
                    # Keep the queue (and the header flag) so nothing is lost.
                    print(f"⚠️ Batch save failed - keeping {len(batch_records)} records queued for the next save")
            if not batch_records:
                batch_start_id = dog_id + 1
            print(f"🔄 Progress: {dogs_processed}/{total_dogs} dogs processed")

    # Save any remaining records in the final batch
    if batch_records:
        saved_count = save_to_csv(batch_records, output_file, header_needed)
        if saved_count:
            total_records += saved_count
            print(f"📊 Final batch save: {saved_count} records from dogs {batch_start_id}-{end_id}")
            print(f"🎯 IMPORTANT: Final batch of {saved_count} records was saved!")
        else:
            print(f"❌ Final batch of {len(batch_records)} records could NOT be saved to {output_file}")
    else:
        print(f"⚠️ No records in final batch to save")

    elapsed_time = time.time() - start_time

    print("\n" + "=" * 60)
    print("✅ SMART APPEND COMPLETED!")
    print(f"⏱️ Time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
    print(f"🐕 Dogs processed: {total_dogs}")
    print(f"✅ New dogs scraped: {successful_dogs}")
    print(f"⏭️ Dogs skipped (already existed): {skipped_dogs}")
    print(f"❌ Dogs not found: {missing_dogs}")
    print(f"📊 New records added: {total_records}")

    if successful_dogs > 0 and elapsed_time > 0:
        print(f"⚡ Speed: {successful_dogs/elapsed_time:.2f} dogs/second")

    # Show final file stats
    if os.path.exists(output_file):
        try:
            df = pd.read_csv(output_file)
            unique_dogs = df['dogId'].nunique() if 'dogId' in df.columns else 0
            print(f"\n📈 Final file statistics:")
            print(f"  - Total records: {len(df)}")
            print(f"  - Unique dogs: {unique_dogs}")
            print(f"  - File size: {os.path.getsize(output_file) / 1024:.1f} KB")

            # Show preview of new data
            if total_records > 0:
                print(f"\n📋 Preview of newly added data:")
                print(df.tail(3).to_string())

        except Exception as e:
            print(f"⚠️ Could not read final stats: {e}")

    print(f"📂 CSV saved to: {output_file}")
    print(f"📂 Full path: {os.path.abspath(output_file)}")

    return total_records

# CONFIGURATION - Edit these values
START_DOG_ID = 500100  # First dog ID of the range to scrape (inclusive)
END_DOG_ID = 500200    # Last dog ID of the range to scrape (inclusive)
OUTPUT_FILE = "dogs5.csv"  # Relative name, so the CSV lands in the current directory

# Echo the settings with a single call; sep="\n" puts each entry on its own line.
print(
    "🎯 CONFIGURATION:",
    f"  - Start ID: {START_DOG_ID}",
    f"  - End ID: {END_DOG_ID}",
    f"  - Total dogs to process: {END_DOG_ID - START_DOG_ID + 1}",
    f"  - Output file: {OUTPUT_FILE}",
    f"  - Full output path: {os.path.abspath(OUTPUT_FILE)}",
    sep="\n",
)
print()

# Run the smart append scraper over the configured range
total_new_records = main_smart_append(START_DOG_ID, END_DOG_ID, OUTPUT_FILE)

print(f"\n🎯 FINAL RESULT: {total_new_records} new records added to {OUTPUT_FILE}")

🎯 CONFIGURATION:
  - Start ID: 500100
  - End ID: 500200
  - Total dogs to process: 101
  - Output file: dogs5.csv
  - Full output path: c:\Users\ag67236\Desktop\Projekt_psi\dogs5.csv

🚀 SMART APPEND MODE: Scraping dogs 500100 to 500200
📂 Output file: dogs5.csv
💾 Auto-save every 1000 dogs
🆕 Creating new file
  ❌ Dog 500100: No profile found
  ✅ Dog 500102: Queued 4 records
  ✅ Dog 500104: Queued 110 records
  ✅ Dog 500105: Queued 58 records
  ✅ Dog 500107: Queued 18 records
  ✅ Dog 500108: Queued 5 records
  ✅ Dog 500110: Queued 108 records
  ✅ Dog 500113: Queued 6 records
  ✅ Dog 500114: Queued 8 records
  ✅ Dog 500116: Queued 20 records
  ✅ Dog 500118: Queued 38 records
  ✅ Dog 500119: Queued 48 records
  ✅ Dog 500121: Queued 22 records
  ✅ Dog 500122: Queued 88 records
  ✅ Dog 500130: Queued 42 records
  ✅ Dog 500136: Queued 19 records
  ✅ Dog 500137: Queued 4 records
  ✅ Dog 500140: Queued 48 records
  ✅ Dog 500143: Queued 93 records
  ✅ Dog 500144: Queued 2 records
  ✅ Dog 500147:

In [None]:
# Quick fix: Move the existing dogs5.csv to the data folder if it exists
# NOTE(review): analyze_csv_file is defined in the utility cell further down;
# that cell has to be run first or the calls below raise NameError.
import shutil

old_file = "dogs5.csv"
new_file = "../data/dogs5.csv"

if not os.path.exists(old_file) or os.path.exists(new_file):
    # Nothing to move (or the destination is already taken): just report.
    print(f"📂 File locations:")
    print(f"  - Old file exists: {os.path.exists(old_file)}")
    print(f"  - New file exists: {os.path.exists(new_file)}")

    if os.path.exists(new_file):
        print(f"✅ Data file is already in the correct location")
        analyze_csv_file(new_file)
else:
    # Make sure the destination directory is there before moving.
    os.makedirs(os.path.dirname(new_file), exist_ok=True)

    shutil.move(old_file, new_file)
    print(f"✅ Moved {old_file} to {new_file}")

    # Summarise the file at its new location
    analyze_csv_file(new_file)

In [None]:
# UTILITY FUNCTIONS FOR MANAGING THE CSV FILE

def analyze_csv_file(filename="dogs5.csv"):
    """Print a summary report of a scraped CSV file.

    The report covers record/column counts and file size, and - when the
    matching columns exist - per-dog statistics (``dogId``), the smallest and
    largest ``raceDate`` value, and the most frequent ``trackName`` values.
    Sections that have no data (e.g. a header-only file) are skipped instead
    of aborting the report.

    Args:
        filename: Path of the CSV file to analyze.

    Returns:
        None. Everything is printed; read errors are printed, not raised.
    """
    if not os.path.exists(filename):
        print(f"❌ File {filename} does not exist")
        return

    try:
        df = pd.read_csv(filename)

        print(f"📊 ANALYSIS OF {filename}")
        print("=" * 50)
        print(f"📋 Basic Info:")
        print(f"  - Total records: {len(df):,}")
        print(f"  - File size: {os.path.getsize(filename) / 1024:.1f} KB")
        print(f"  - Columns: {len(df.columns)}")

        if 'dogId' in df.columns:
            unique_dogs = df['dogId'].nunique()
            print(f"  - Unique dogs: {unique_dogs:,}")
            # Guard the division: a header-only file has zero dogs.
            if unique_dogs:
                print(f"  - Average records per dog: {len(df)/unique_dogs:.1f}")

            # Show dog ID range. Coerce instead of casting so blank or
            # non-numeric cells are ignored rather than raising.
            dog_ids = pd.to_numeric(df['dogId'], errors='coerce').dropna()
            if not dog_ids.empty:
                print(f"  - Dog ID range: {int(dog_ids.min())} to {int(dog_ids.max())}")

            # Show top dogs by record count
            top_dogs = df['dogId'].value_counts().head(5)
            if not top_dogs.empty:
                print(f"\n🏆 Dogs with most records:")
                for dog_id, count in top_dogs.items():
                    print(f"  - Dog {dog_id}: {count} records")

        if 'raceDate' in df.columns:
            # NOTE(review): values are compared as loaded from the CSV (no date
            # parsing), so "earliest"/"latest" is only chronological if the
            # stored format sorts that way - confirm against the API format.
            race_dates = df['raceDate'].dropna()
            if not race_dates.empty:
                print(f"\n📅 Date range:")
                print(f"  - Earliest race: {race_dates.min()}")
                print(f"  - Latest race: {race_dates.max()}")

        if 'trackName' in df.columns:
            unique_tracks = df['trackName'].nunique()
            print(f"\n🏁 Track info:")
            print(f"  - Unique tracks: {unique_tracks}")
            top_tracks = df['trackName'].value_counts().head(3)
            for track, count in top_tracks.items():
                print(f"  - {track}: {count} records")

        print(f"\n📋 Sample records:")
        print(df.head(2).to_string())

    except Exception as e:
        print(f"❌ Error analyzing file: {e}")

def find_missing_dogs(start_id, end_id, filename="dogs5.csv"):
    """Report which dog IDs in a range have no rows in the CSV file.

    An ID counts as missing simply because no row carries it; the function
    cannot tell "never requested" apart from "requested but nothing stored".

    Args:
        start_id: First dog ID of the range (inclusive).
        end_id: Last dog ID of the range (inclusive).
        filename: CSV file whose ``dogId`` column is checked.

    Returns:
        Sorted list of the IDs in the range that are absent from the file.
        If the file is missing or unreadable, every ID in the range is
        returned.
    """
    existing_ids = set()

    if os.path.exists(filename):
        try:
            df = pd.read_csv(filename, usecols=['dogId'])
            # Coerce rather than cast: a blank cell turns the column into
            # floats, and int("500102.0") would raise and discard every ID.
            numeric_ids = pd.to_numeric(df['dogId'], errors='coerce').dropna()
            existing_ids = set(numeric_ids.astype('int64').tolist())
        except Exception as e:
            print(f"⚠️ Error reading file: {e}")

    all_ids = set(range(start_id, end_id + 1))
    missing_ids = sorted(all_ids - existing_ids)

    print(f"🔍 MISSING DOGS ANALYSIS ({start_id} to {end_id})")
    print("=" * 50)
    print(f"📊 Total dogs in range: {len(all_ids)}")
    print(f"✅ Dogs already scraped: {len(all_ids) - len(missing_ids)}")
    print(f"❌ Dogs missing: {len(missing_ids)}")

    if missing_ids:
        print(f"\n📋 Missing dog IDs:")
        # Show in groups of 10 for readability
        for i in range(0, len(missing_ids), 10):
            group = missing_ids[i:i+10]
            print(f"  {', '.join(map(str, group))}")

        # Only suggest a follow-up range when the gap list is short.
        if len(missing_ids) <= 50:
            print(f"\n💡 Suggested next scraping range:")
            print(f"  START_DOG_ID = {min(missing_ids)}")
            print(f"  END_DOG_ID = {max(missing_ids)}")
    else:
        print(f"\n✅ All dogs in range {start_id}-{end_id} have been scraped!")

    return missing_ids

# UTILITY USAGE EXAMPLES:
# NOTE(review): both calls read "dogs5.csv" from the current directory. If the
# "Quick fix" cell has already moved the file to ../data/dogs5.csv, they will
# report that the file is missing / every dog is missing.

# Analyze current CSV file
print("📊 ANALYZING CURRENT CSV FILE:")
analyze_csv_file("dogs5.csv")

print("\n" + "="*60)

# Find missing dogs in a range
# NOTE(review): this range (600000-600200) differs from the START_DOG_ID /
# END_DOG_ID configured for the scraper cell (500100-500200) - confirm which
# range is intended.
print("🔍 CHECKING FOR MISSING DOGS:")
missing = find_missing_dogs(600000, 600200, "dogs5.csv")