In [2]:
pip install selenium beautifulsoup4 pandas webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, SessionNotCreatedException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import logging
import sys
import random
from datetime import datetime

# Custom exception for interrupt handling
class SigTermException(Exception):
    """Custom exception to signal the script was interrupted."""
    pass

# Attempt to import webdriver-manager for automatic driver handling
try:
    from webdriver_manager.chrome import ChromeDriverManager
    AUTOMATIC_DRIVER = True
except ImportError:
    AUTOMATIC_DRIVER = False
    print("Warning: 'webdriver-manager' not installed. Using manual path only.")

# --- Configuration and Setup ---

# Set up logging
logging.basicConfig(filename='player_scraper_errors.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Define script parameters
WAGES_URL = "https://fbref.com/en/comps/9/wages/Premier-League-Wages"
OUTPUT_DIR = "fbref_player_match_logs"
MANUAL_DRIVER_PATH = r"C:\Users\vanim\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
START_DATE = datetime(2016, 1, 1)  # January 2016

# Create base output directory
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# --- WebDriver Initialization ---

def initialize_driver():
    """Initialize Chrome WebDriver with optimal settings."""
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    
    driver = None
    try:
        if AUTOMATIC_DRIVER:
            print("Attempting automatic WebDriver setup...")
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=options)
            print("‚úÖ Automatic WebDriver setup successful.")
        else:
            raise ImportError("webdriver-manager not available.")

    except Exception as e:
        print(f"Automatic setup failed or skipped. Trying manual setup as failsafe.")
        logging.warning(f"Automatic WebDriver setup failed/skipped: {e}")

        try:
            ser = Service(MANUAL_DRIVER_PATH)
            driver = webdriver.Chrome(service=ser, options=options)
            print("‚úÖ Manual WebDriver setup successful.")

        except SessionNotCreatedException as manual_e:
            print(f"‚ùå FATAL ERROR: Manual setup failed. Check if ChromeDriver version matches Chrome browser.")
            logging.critical(f"FATAL WebDriver Error (Manual): {manual_e}")
            sys.exit(1)

        except WebDriverException as manual_e:
            print(f"‚ùå FATAL ERROR: Manual setup failed. Check the file path: {MANUAL_DRIVER_PATH}")
            logging.critical(f"FATAL WebDriver Error (Manual): {manual_e}")
            sys.exit(1)

    if driver is None:
        print("‚ùå FATAL ERROR: Driver could not be initialized. Exiting.")
        sys.exit(1)
    
    # Set timeouts
    driver.set_page_load_timeout(60)
    driver.implicitly_wait(10)
    
    return driver

driver = initialize_driver()

# --- Helper Functions ---

def extract_player_links(driver, url, max_retries=3):
    """
    Extracts all player profile links from the Premier League wages page.
    Returns a list of dictionaries with player name and URL.
    """
    for attempt in range(max_retries):
        try:
            print(f"Fetching player list from wages page (Attempt {attempt + 1})...")
            
            # Try to refresh the driver connection if having issues
            if attempt > 0:
                print("  Refreshing driver session...")
                try:
                    driver.refresh()
                except:
                    pass
            
            driver.get(url)
            
            # Wait for the wages table to load with a more specific condition
            try:
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "table"))
                )
            except TimeoutException:
                print(f"  Page load timeout, but attempting to parse anyway...")
            
            # Give the page a moment to fully render
            time.sleep(3)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            
            # DEBUG: Save HTML to file for inspection
            if attempt == 0:
                with open("debug_wages_page.html", "w", encoding="utf-8") as f:
                    f.write(html)
                print("  üìù Saved page HTML to debug_wages_page.html for inspection")
            
            players = []
            
            # Strategy 1: Find ALL links that point to player profiles
            all_links = soup.find_all("a", href=True)
            print(f"  Found {len(all_links)} total links on page")
            
            player_links_found = 0
            for link in all_links:
                href = link.get("href", "")
                # Player URLs follow pattern: /en/players/PLAYER_ID/PLAYER_NAME
                if "/players/" in href and href.count("/") >= 4:
                    player_name = link.text.strip()
                    
                    # Skip if no name
                    if not player_name or len(player_name) < 2:
                        continue
                    
                    # Build full URL
                    if not href.startswith("http"):
                        player_url = "https://fbref.com" + href
                    else:
                        player_url = href
                    
                    # Avoid duplicates
                    if not any(p["name"] == player_name for p in players):
                        players.append({
                            "name": player_name,
                            "url": player_url
                        })
                        player_links_found += 1
                        
                        # Debug: Print first few players found
                        if player_links_found <= 5:
                            print(f"    Found: {player_name}")
            
            print(f"  Total unique players found: {len(players)}")
            
            if players:
                print(f"‚úÖ Successfully extracted {len(players)} players")
                return players
            else:
                print(f"  ‚ö†Ô∏è  No players extracted. Retrying...")
                
                # DEBUG: Print sample of what we found
                print("  Debug - Sample links found:")
                sample_links = [l.get("href") for l in all_links[:10] if l.get("href")]
                for sl in sample_links:
                    print(f"    {sl}")
                
                time.sleep(3)
                continue
            
        except TimeoutException:
            logging.error(f"Timeout fetching player list (Attempt {attempt + 1})")
            print(f"  Timeout (Attempt {attempt + 1}). Retrying...")
            time.sleep(5)
        except Exception as e:
            logging.error(f"Error extracting player links (Attempt {attempt + 1}): {str(e)}")
            print(f"  Error (Attempt {attempt + 1}): {str(e)}")
            import traceback
            print(f"  Traceback: {traceback.format_exc()}")
            time.sleep(3)
    
    print(f"‚ùå Failed to extract player links after {max_retries} attempts.")
    print(f"   Please check debug_wages_page.html to see the actual page content.")
    return []

def get_match_log_url(driver, player_url, max_retries=3):
    """
    Navigates to a player's profile and extracts the full match log URL.
    Returns the match log URL or None if not found.
    """
    for attempt in range(max_retries):
        try:
            driver.get(player_url)
            
            # Wait for page to load
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            
            # Look for "Match Logs" link
            match_log_links = soup.find_all("a", href=True)
            for link in match_log_links:
                if "matchlogs" in link["href"] and "all_comps" in link["href"]:
                    full_url = "https://fbref.com" + link["href"] if link["href"].startswith("/") else link["href"]
                    return full_url
            
            # Strategy 2: Construct URL from player ID
            player_id = player_url.split("/players/")[1].split("/")[0] if "/players/" in player_url else None
            
            if player_id:
                constructed_url = f"https://fbref.com/en/players/{player_id}/matchlogs/all_comps/"
                return constructed_url
            
            return None
            
        except TimeoutException:
            logging.error(f"Timeout getting match log URL (Attempt {attempt + 1})")
            time.sleep(2)
        except Exception as e:
            logging.error(f"Error getting match log URL (Attempt {attempt + 1}): {str(e)}")
            time.sleep(2)
    
    return None

def scrape_match_logs(driver, match_log_url, player_name, max_retries=3):
    """
    Scrapes all match logs for a player from their match log page.
    Filters data from January 2016 onwards.
    Returns a DataFrame with all match log data.
    """
    for attempt in range(max_retries):
        try:
            print(f"  Scraping match logs (Attempt {attempt + 1})...")
            driver.get(match_log_url)
            
            # Wait and give page time to load
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            
            # Find the match logs table - try multiple possible IDs
            table = None
            for table_id_pattern in ["matchlogs_all", "matchlogs", "matchlogs_for"]:
                tables = soup.find_all("table", {"id": lambda x: x and table_id_pattern in x})
                if tables:
                    table = tables[0]
                    break
            
            if table is None:
                logging.warning(f"Match logs table not found for {player_name}")
                return None
            
            # Convert to DataFrame
            try:
                dfs = pd.read_html(str(table))
                df = dfs[0] if dfs else None
                
                if df is None or df.empty:
                    logging.warning(f"Empty DataFrame for {player_name}")
                    return None
                
                # Handle multi-level column headers if present
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = ['_'.join(col).strip() for col in df.columns.values]
                
                # Add player identification
                df['Player_Name'] = player_name
                df['Player_URL'] = match_log_url
                
                # Filter by date if Date column exists
                date_columns = [col for col in df.columns if 'date' in col.lower()]
                if date_columns:
                    date_col = date_columns[0]
                    try:
                        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
                        df = df[df[date_col] >= START_DATE]
                        df = df.dropna(subset=[date_col])
                    except Exception as e:
                        logging.warning(f"Could not filter by date for {player_name}: {str(e)}")
                
                print(f"  ‚úÖ Scraped {len(df)} match logs for {player_name}")
                return df
                
            except ValueError as e:
                logging.error(f"Error reading HTML table for {player_name}: {str(e)}")
                time.sleep(2)
                continue
                
        except TimeoutException:
            logging.error(f"Timeout scraping match logs for {player_name} (Attempt {attempt + 1})")
            print(f"  Timeout (Attempt {attempt + 1}). Retrying...")
            time.sleep(3)
        except Exception as e:
            logging.error(f"Error scraping match logs for {player_name} (Attempt {attempt + 1}): {str(e)}")
            print(f"  Error (Attempt {attempt + 1}): {str(e)}. Retrying...")
            time.sleep(2)
    
    print(f"  ‚ùå Failed to scrape match logs for {player_name} after {max_retries} attempts")
    return None

def consolidate_data(all_data_frames, output_dir):
    """Consolidates all collected DataFrames and saves them to a single CSV."""
    print("\n" + "="*60)
    print("Starting data consolidation...")
    print("="*60)
    
    if all_data_frames:
        consolidated_df = pd.concat(all_data_frames, ignore_index=True)
        consolidated_filename = os.path.join(output_dir, "All_Player_Match_Logs_Consolidated.csv")
        consolidated_df.to_csv(consolidated_filename, index=False)
        
        print(f"\n‚úÖ Consolidation successful!")
        print(f"   Total match logs: {len(consolidated_df)}")
        print(f"   Unique players: {consolidated_df['Player_Name'].nunique()}")
        print(f"   Output file: {consolidated_filename}")
        print("\nSample of consolidated data:")
        print(consolidated_df.head(10).to_string())
        
        return consolidated_df
    else:
        print("‚ùå No data was successfully scraped to consolidate.")
        return None

# --- Main Execution Loop ---

all_data_frames = []
successful_players = 0
failed_players = 0

try:
    # Step 1: Extract player links from wages page
    print("\n" + "="*60)
    print("STEP 1: Extracting Player Links")
    print("="*60)
    
    players = extract_player_links(driver, WAGES_URL)
    
    if not players:
        print("‚ùå No players found. Exiting.")
        sys.exit(1)
    
    print(f"\nTotal players to scrape: {len(players)}")
    
    # Step 2: Scrape match logs for each player
    print("\n" + "="*60)
    print("STEP 2: Scraping Player Match Logs")
    print("="*60 + "\n")
    
    for i, player in enumerate(players, 1):
        print(f"\n[{i}/{len(players)}] Processing: {player['name']}")
        print("-" * 50)
        
        try:
            # Get match log URL
            match_log_url = get_match_log_url(driver, player['url'])
            
            if not match_log_url:
                print(f"  ‚ö†Ô∏è  Could not find match log URL for {player['name']}")
                failed_players += 1
                continue
            
            print(f"  Match log URL: {match_log_url}")
            
            # Scrape match logs
            player_df = scrape_match_logs(driver, match_log_url, player['name'])
            
            if player_df is not None and not player_df.empty:
                all_data_frames.append(player_df)
                successful_players += 1
                
                # Save individual player file
                safe_name = player['name'].replace(" ", "_").replace("/", "-")
                individual_file = os.path.join(OUTPUT_DIR, f"{safe_name}_match_logs.csv")
                player_df.to_csv(individual_file, index=False)
                print(f"  üíæ Saved individual file: {safe_name}_match_logs.csv")
            else:
                print(f"  ‚ö†Ô∏è  No data extracted for {player['name']}")
                failed_players += 1
            
            # Random delay between requests (0.8 to 2.0 seconds)
            sleep_time = random.uniform(0.8, 2.0)
            print(f"  ‚è±Ô∏è  Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
            
        except Exception as e:
            logging.error(f"Unexpected error processing {player['name']}: {str(e)}")
            print(f"  ‚ùå Error processing {player['name']}: {str(e)}")
            failed_players += 1
            continue

except KeyboardInterrupt:
    print("\n\n*** SCRIPT INTERRUPTED BY USER (Ctrl+C)! ***")
    logging.info("Script interrupted by user.")
    raise SigTermException("User interrupted scraping.")

except Exception as e:
    logging.critical(f"Critical error in main loop: {str(e)}")
    print(f"\n‚ùå CRITICAL SCRIPT ERROR: {str(e)}")

finally:
    # Consolidation and cleanup
    try:
        print("\n" + "="*60)
        print("FINAL CONSOLIDATION & CLEANUP")
        print("="*60)
        
        print(f"\nüìä Scraping Summary:")
        print(f"   Successful: {successful_players}")
        print(f"   Failed: {failed_players}")
        print(f"   Total attempted: {successful_players + failed_players}")
        
        consolidate_data(all_data_frames, OUTPUT_DIR)
        
    except Exception as e:
        print(f"‚ùå Error during final consolidation: {str(e)}")
        logging.error(f"Error during final consolidation: {str(e)}")
    
    print("\nüîí Closing browser...")
    try:
        driver.quit()
    except:
        pass
    
    print("\n‚úÖ Script execution completed!")
    print(f"üìÅ All output saved to: {OUTPUT_DIR}")

Attempting automatic WebDriver setup...
‚úÖ Automatic WebDriver setup successful.

STEP 1: Extracting Player Links
Fetching player list from wages page (Attempt 1)...
  Page load timeout, but attempting to parse anyway...
  üìù Saved page HTML to debug_wages_page.html for inspection
  Found 2709 total links on page
    Found: Cristiano Ronaldo
    Found: Lionel Messi
    Found: Rayan Cherki
    Found: Manu Kon√©
    Found: Erling Haaland
  Total unique players found: 633
‚úÖ Successfully extracted 633 players

Total players to scrape: 633

STEP 2: Scraping Player Match Logs


[1/633] Processing: Cristiano Ronaldo
--------------------------------------------------
  Match log URL: https://fbref.com/en/players/dea698d9/matchlogs/all_comps/
  Scraping match logs (Attempt 1)...


  dfs = pd.read_html(str(table))


  ‚úÖ Scraped 5 match logs for Cristiano Ronaldo
  üíæ Saved individual file: Cristiano_Ronaldo_match_logs.csv
  ‚è±Ô∏è  Sleeping for 0.95 seconds...

[2/633] Processing: Lionel Messi
--------------------------------------------------
  Match log URL: https://fbref.com/en/players/d70ce98e/matchlogs/all_comps/
  Scraping match logs (Attempt 1)...
  ‚ö†Ô∏è  No data extracted for Lionel Messi
  ‚è±Ô∏è  Sleeping for 1.41 seconds...

[3/633] Processing: Rayan Cherki
--------------------------------------------------
  Match log URL: https://fbref.com/en/players/b34c63a5/matchlogs/all_comps/
  Scraping match logs (Attempt 1)...
