In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, SessionNotCreatedException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import logging
import sys
import random # Import random for variable sleep times

# Custom exception for interrupt handling
class SigTermException(Exception):
    """Raised by the main loop after a Ctrl+C so the run ends as 'interrupted'."""

# Attempt to import webdriver-manager for automatic driver handling
try:
    # Optional dependency: downloads a ChromeDriver matching the local Chrome
    from webdriver_manager.chrome import ChromeDriverManager
except ImportError:
    # Package missing: only the manually configured driver path can be used
    AUTOMATIC_DRIVER = False
    print("Warning: 'webdriver-manager' not installed. Using manual path only.")
else:
    AUTOMATIC_DRIVER = True


# --- Configuration and Setup ---

# Send log records (INFO and above) to a file next to the working directory
logging.basicConfig(filename='scraper_errors.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Define script parameters
START_YEAR = 2015  # first season scraped starts in this year
END_YEAR = 2024    # last season scraped starts in this year (inclusive)
OUTPUT_DIR = "fbref_premier_league_data_insider01"
# Manual driver path (Failsafe). The default is machine-specific, so it can be
# overridden with the CHROMEDRIVER_PATH environment variable without editing
# the script.
MANUAL_DRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\vanim\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)

# Create base output directory; exist_ok avoids the check-then-create race
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Full list of relevant Premier League teams/squads.
# Each name is used both as the SQUAD_IDS key and as the URL slug.
# "Leicester" is intentionally NOT listed: SQUAD_IDS maps it to the same squad
# ID as "Leicester-City", so listing both scraped the same squad twice per
# season and duplicated its rows in the consolidated output.
TEAMS_FULL = [
    "Liverpool", "Arsenal", "Manchester-City", "Chelsea", "Newcastle-Utd",
    "Aston-Villa", "Nott'ham-Forest", "Brighton", "Bournemouth", "Brentford",
    "Fulham", "Crystal-Palace", "Everton", "West-Ham", "Manchester-Utd",
    "Wolves", "Tottenham", "Leicester-City", "Ipswich-Town", "Southampton",
    "Middlesbrough", "Hull-City", "Burnley", "Swansea-City", "Stoke-City",
    "West-Bromwich-Albion", "Huddersfield-Town", "Norwich-City",
    "Sheffield-United", "Cardiff-City", "Leeds-United", "Luton-Town",
    "Watford", "Sunderland",
]

# Team name -> squad identifier placed in the squad URL path.
# "Leicester" and "Leicester-City" deliberately share one identifier.
SQUAD_IDS = {
    "Liverpool": "822bd0ba",
    "Arsenal": "18bb7c10",
    "Manchester-City": "b8fd03ef",
    "Chelsea": "cff3d9bb",
    "Newcastle-Utd": "b2b47a98",
    "Aston-Villa": "8602292d",
    "Nott'ham-Forest": "e4a775cb",
    "Brighton": "d07537b9",
    "Bournemouth": "4ba7cbea",
    "Brentford": "cd051869",
    "Fulham": "fd962109",
    "Crystal-Palace": "47c64c55",
    "Everton": "d3fd31cc",
    "West-Ham": "7c21e445",
    "Manchester-Utd": "19538871",
    "Wolves": "8cec06e1",
    "Tottenham": "361ca564",
    "Leicester-City": "a2fb4471",
    "Ipswich-Town": "b74092de",
    "Southampton": "33c895d4",
    "Norwich-City": "1c781004",
    "Stoke-City": "17892952",
    "Swansea-City": "fb10988f",
    "Watford": "2abfe087",
    "Sunderland": "8ef52968",
    "Leicester": "a2fb4471",
    "Middlesbrough": "7f59c601",
    "Hull-City": "bd8769d1",
    "Burnley": "943e8050",
    "West-Bromwich-Albion": "60c6b05f",
    "Huddersfield-Town": "f5922ca5",
    "Sheffield-United": "1df6b87e",
    "Cardiff-City": "75fae011",
    "Leeds-United": "5bfb9659",
    "Luton-Town": "e297cd13",
}

# --- WebDriver Initialization (Automatic -> Manual Failsafe) ---

driver = None
try:
    if not AUTOMATIC_DRIVER:
        # Package missing: jump straight to the manual fallback below
        raise ImportError("webdriver-manager not available.")

    # 1. Automatic attempt using webdriver-manager
    print("Attempting automatic WebDriver setup...")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    print("✅ Automatic WebDriver setup successful.")

except Exception as auto_error:
    print("Automatic setup failed or skipped. Trying manual setup as failsafe.")
    logging.warning(f"Automatic WebDriver setup failed/skipped: {auto_error}")

    try:
        # 2. Manual failsafe attempt using the configured driver binary
        ser = Service(MANUAL_DRIVER_PATH)
        driver = webdriver.Chrome(service=ser)
        print("✅ Manual WebDriver setup successful.")

    except WebDriverException as manual_error:
        # SessionNotCreatedException is a WebDriverException subclass; it gets
        # its own hint because it points at a driver/browser version mismatch.
        if isinstance(manual_error, SessionNotCreatedException):
            print("❌ FATAL ERROR: Manual setup failed. Check if ChromeDriver version matches Chrome browser.")
        else:
            print(f"❌ FATAL ERROR: Manual setup failed. Check the file path: {MANUAL_DRIVER_PATH}")
        logging.critical(f"FATAL WebDriver Error (Manual): {manual_error}")
        sys.exit(1)

# Final check to ensure driver is ready
if driver is None:
    print("❌ FATAL ERROR: Driver could not be initialized by either method. Exiting.")
    sys.exit(1)


# --- Scraping Function ---

def scrape_team(driver, url, team, season, season_dir, max_retries=3):
    """
    Scrape one squad's standard-stats table, save it to CSV and return it.

    Loads ``url`` in ``driver``, waits for a table whose id contains
    ``stats_standard``, parses it with pandas, adds ``Team`` and ``Season``
    columns and writes ``<season_dir>/<team>_PremierLeague.csv``.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Full URL of the squad's stats page.
        team: Team name; stored in the ``Team`` column and used in the filename.
        season: Season label; stored in the ``Season`` column.
        season_dir: Existing directory the per-team CSV is written to.
        max_retries: Maximum number of load/parse attempts.

    Returns:
        The parsed DataFrame, or ``None`` when the page loads but no stats
        table can be located, or when every attempt fails.
    """
    # Local import keeps the function self-contained; pandas deprecates
    # passing literal HTML to read_html, so the markup is wrapped in StringIO.
    from io import StringIO

    for attempt in range(max_retries):
        try:
            print(f"Scraping (Attempt {attempt + 1}): {team} for {season}")

            # Set a page load timeout to catch slow/dead websites
            driver.set_page_load_timeout(45)
            driver.get(url)

            # Wait for the main table to load
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, "//table[contains(@id, 'stats_standard')]"))
            )

            # Parse page source
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find the table: try the known IDs first...
            table = None
            for table_id in ["stats_standard_9", "stats_standard", "stats_standard_12"]:
                table = soup.find("table", {"id": table_id})
                if table is not None:
                    break

            # ...then fall back to the same criterion the wait above used, so a
            # page that satisfied the wait is not reported as "table not found".
            if table is None:
                table = soup.find(
                    "table",
                    id=lambda value: bool(value) and "stats_standard" in value,
                )

            if table is None:
                logging.warning(f"Stats table not found for {team} in {season} at {url}. Skipping.")
                return None

            # Convert to DataFrame
            try:
                df = pd.read_html(StringIO(str(table)))[0]
            except ValueError:
                logging.error(f"Error reading HTML table into DataFrame for {team} in {season}. Retrying.")
                time.sleep(2)  # Short pause before the next attempt
                continue

            # Add 'Team' and 'Season' columns for consolidation
            df['Team'] = team
            df['Season'] = season

            # Save the individual file
            filename = os.path.join(season_dir, f"{team}_PremierLeague.csv")
            df.to_csv(filename, index=False)
            print(f"Saved individual file: {filename}")

            return df  # Return the DataFrame for consolidation

        except TimeoutException:
            # Raised by either the 45s page load or the 60s wait for the table
            logging.error(f"Timeout (page load or table wait) for {team} in {season} (Attempt {attempt + 1}). Website might be slow or down.")
            print(f"Timeout for {team} in {season} (Attempt {attempt + 1}). Retrying.")
            time.sleep(5)  # Back off before retrying network/site issues
        except WebDriverException as e:
            # Catches network errors (site down, connection lost, etc.)
            logging.error(f"WebDriver/Network error for {team} in {season} (Attempt {attempt + 1}): {str(e)}")
            print(f"WebDriver/Network error for {team} in {season} (Attempt {attempt + 1}). Retrying.")
            time.sleep(5)  # Back off before retrying network/site issues
        except Exception as e:
            # Catch all other unexpected errors so one team cannot abort the run
            logging.error(f"Unexpected error scraping {team} for {season} (Attempt {attempt + 1}): {str(e)}")
            print(f"Unexpected error scraping {team} for {season} (Attempt {attempt + 1}). Retrying.")
            time.sleep(2)  # Short pause before the next attempt

    print(f"Failed to scrape {team} for {season} after {max_retries} attempts.")
    return None

# --- Consolidation Function (Pulled out for reuse on interrupt) ---

def consolidate_data(all_data_frames, output_dir):
    """
    Merge the collected per-team DataFrames into one CSV inside ``output_dir``.

    Returns the concatenated DataFrame (with a fresh 0..n-1 index), or ``None``
    when ``all_data_frames`` is empty and nothing is written.
    """
    print("\nStarting data consolidation...")

    # Nothing collected: report it and leave without touching the disk
    if not all_data_frames:
        print("No data was successfully scraped to consolidate.")
        return None

    merged = pd.concat(all_data_frames, ignore_index=True)
    target_path = os.path.join(output_dir, "All_Premier_League_Stats_Consolidated.csv")
    merged.to_csv(target_path, index=False)

    print(f"Consolidation successful! Saved all data to: {target_path}")
    print(f"\n✅ Data saved in {len(merged)} rows.")
    print("\nHead of Consolidated Data (Partial/Full Output):")
    print(merged.head())

    # Hand the frame back so the caller can inspect it immediately
    return merged

# --- Main Execution Loop ---

all_data_frames = []

# Set to True only when every season/team has been visited, so the cleanup
# code can tell a full run from one cut short by Ctrl+C or a critical error.
completed = False

# Delay between squad pages, in seconds. Sports Reference's published bot
# policy limits FBref to roughly 10 requests per minute (verify the current
# policy before lowering these); sub-second delays risk a temporary block.
MIN_REQUEST_DELAY = 6.0
MAX_REQUEST_DELAY = 8.0

try:
    for year in range(START_YEAR, END_YEAR + 1):
        next_year = year + 1
        season = f"{year}-{next_year}"

        # Create season subdirectory
        season_dir = os.path.join(OUTPUT_DIR, season)
        if not os.path.exists(season_dir):
            os.makedirs(season_dir)
            print(f"Created directory: {season_dir}")

        for team in TEAMS_FULL:
            squad_id = SQUAD_IDS.get(team)
            if not squad_id:
                print(f"No squad ID for {team} in {season}. Skipping...")
                logging.error(f"No squad ID for {team} in {season}")
                continue

            url = f"https://fbref.com/en/squads/{squad_id}/{season}/{team}-Stats"

            team_df = scrape_team(driver, url, team, season, season_dir)

            if team_df is not None:
                all_data_frames.append(team_df)

            # Randomized pause so requests are spaced out but not perfectly regular
            sleep_time = random.uniform(MIN_REQUEST_DELAY, MAX_REQUEST_DELAY)
            print(f"Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)

    completed = True

except KeyboardInterrupt:
    # Handle user-initiated interrupt (Ctrl+C)
    print("\n\n*** SCRIPT INTERRUPTED BY USER (Ctrl+C)! ***")
    logging.info("Script interrupted by user.")
    # Still end the run with an error, but without chaining the
    # KeyboardInterrupt traceback onto it.
    raise SigTermException("User interrupted scraping.") from None

except Exception as e:
    logging.critical(f"A critical error occurred in the main scraping loop: {str(e)}")
    print(f"\nCRITICAL SCRIPT ERROR: {str(e)}")

finally:
    # Runs after a normal run, a critical error, or an interrupt, so whatever
    # was scraped so far is always written out.
    try:
        if not completed and all_data_frames:
            print("Processing partially scraped data before exiting...")

        consolidate_data(all_data_frames, OUTPUT_DIR)

    except Exception as e:
        print(f"Error during final consolidation/cleanup: {str(e)}")
        logging.error(f"Error during final consolidation/cleanup: {str(e)}")

    print("Closing browser...")
    # Best-effort shutdown: a failure here must not mask the scraping outcome,
    # but it is logged rather than silently swallowed.
    try:
        driver.quit()
    except Exception as quit_error:
        logging.warning(f"Browser did not shut down cleanly: {quit_error}")

Attempting automatic WebDriver setup...
✅ Automatic WebDriver setup successful.
Created directory: fbref_premier_league_data_insider01\2015-2016
Scraping (Attempt 1): Liverpool for 2015-2016
Timeout for Liverpool in 2015-2016 (Attempt 1). Retrying.
Scraping (Attempt 2): Liverpool for 2015-2016


  df = pd.read_html(str(table))[0]


Saved individual file: fbref_premier_league_data_insider01\2015-2016\Liverpool_PremierLeague.csv
Sleeping for 1.23 seconds...
Scraping (Attempt 1): Arsenal for 2015-2016


  df = pd.read_html(str(table))[0]


Saved individual file: fbref_premier_league_data_insider01\2015-2016\Arsenal_PremierLeague.csv
Sleeping for 0.76 seconds...
Scraping (Attempt 1): Manchester-City for 2015-2016


  df = pd.read_html(str(table))[0]


Saved individual file: fbref_premier_league_data_insider01\2015-2016\Manchester-City_PremierLeague.csv
Sleeping for 1.16 seconds...
Scraping (Attempt 1): Chelsea for 2015-2016


  df = pd.read_html(str(table))[0]


Saved individual file: fbref_premier_league_data_insider01\2015-2016\Chelsea_PremierLeague.csv
Sleeping for 0.99 seconds...
Scraping (Attempt 1): Newcastle-Utd for 2015-2016
