In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import logging

# Set up logging
logging.basicConfig(filename='scraper_errors.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Create base output directory
output_dir = "fbref_premier_league_data_insider01"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# ADDED TO AUTOMATE DRIVER INSTALLATION AND SETUP
# TEST IF IT WORKS FINE ON YOUR MACHINE
# IF NOT PLEASE REVERT TO MANUAL SETUP AS COMMENTED BELOW
# IF THIS WORKS, MAKE SURE TO INSTALL THE 'webdriver-manager' PACKAGE VIA PIP
# pip install webdriver-manager
# OR JUST RUN PIP INSTALL REQUREMENTS.TXT

from webdriver_manager.chrome import ChromeDriverManager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)


# Initialize WebDriver (headless mode disabled for debugging)
# ser = Service(r"C:\chromedriver-win64\chromedriver-win64\chromedriver.exe")
# driver = webdriver.Chrome(service=ser)

# Define seasons and teams (filtered for 2015-2016 Premier League teams)
START_YEAR = 2015
END_YEAR = 2024

# Full team list for later seasons
teams_full = [
    "Liverpool", "Arsenal", "Manchester-City", "Chelsea", "Newcastle-Utd",
    "Aston-Villa", "Nott'ham-Forest", "Brighton", "Bournemouth", "Brentford",
    "Fulham", "Crystal-Palace", "Everton", "West-Ham", "Manchester-Utd",
    "Wolves", "Tottenham", "Leicester-City", "Ipswich-Town", "Southampton","Middlesbrough","Hull-City","Burnley","Swansea-City","Stoke-City","West-Bromwich-Albion",
    "Huddersfield-Town","Norwich-City","Sheffield-United","Cardiff-City","Leeds-United","Luton-Town","Watford","Sunderland","Leicester"]

# Squad IDs
squad_ids = {
    "Liverpool": "822bd0ba",
    "Arsenal": "18bb7c10",
    "Manchester-City": "b8fd03ef",
    "Chelsea": "cff3d9bb",
    "Newcastle-Utd": "b2b47a98",
    "Aston-Villa": "8602292d",
    "Nott'ham-Forest": "e4a775cb",
    "Brighton": "d07537b9",
    "Bournemouth": "4ba7cbea",
    "Brentford": "cd051869",
    "Fulham": "fd962109",
    "Crystal-Palace": "47c64c55",
    "Everton": "d3fd31cc",
    "West-Ham": "7c21e445",
    "Manchester-Utd": "19538871",
    "Wolves": "8cec06e1",
    "Tottenham": "361ca564",
    "Leicester-City": "a2d435b3",
    "Ipswich-Town": "b74092de",
    "Southampton": "33c895d4",
    "Norwich-City": "1c781004",
    "Stoke-City": "17892952",
    "Swansea-City": "fb10988f",
    "Watford": "2abfe087",
    
    "Sunderland": "8ef52968",
    "Leicester": "a2fb4471",
    "Middlesbrough":"7f59c601",
    "Hull-City":"bd8769d1",
    "Burnley":"943e8050",
    "West-Bromwich-Albion":"60c6b05f",
    "Huddersfield-Town":"f5922ca5",
    "Sheffield-United":"1df6b87e",
    "Cardiff-City":"75fae011",
    "Leeds-United":"5bfb9659",
    
    "Luton-Town":"e297cd13",

}

def scrape_team(driver, url, team, season, season_dir, max_retries=3):
    for attempt in range(max_retries):
        try:
            print(f"Scraping (Attempt {attempt + 1}): {url}")
            driver.get(url)
            
            # Wait for any table with 'stats_standard' in the ID
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, "//table[contains(@id, 'stats_standard')]"))
            )
            
            # Parse page source
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            
            # Find the table (try multiple possible IDs)
            table = None
            for table_id in ["stats_standard_9", "stats_standard", "stats_standard_12"]:
                table = soup.find("table", {"id": table_id})
                if table:
                    break
            
            if table is None:
                logging.error(f"Table not found for {team} in {season}")
                print(f"Table not found for {team} in {season}. Skipping...")
                return False
            
            # Convert to DataFrame and save
            df = pd.read_html(str(table))[0]
            filename = os.path.join(season_dir, f"{team}_PremierLeague.csv")
            df.to_csv(filename, index=False)
            print(f"Saved: {filename}")
            return True
            
        except Exception as e:
            logging.error(f"Error scraping {team} for {season} (Attempt {attempt + 1}): {str(e)}")
            print(f"Error scraping {team} for {season} (Attempt {attempt + 1}): {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retrying
            continue
    return False

try:
    for year in range(START_YEAR, END_YEAR + 1):
        next_year = year + 1
        season = f"{year}-{next_year}"
        
        # Create season subdirectory
        season_dir = os.path.join(output_dir, season)
        if not os.path.exists(season_dir):
            os.makedirs(season_dir)
            print(f"Created directory: {season_dir}")
        
 
        
        for team in teams_full:
            squad_id = squad_ids.get(team)
            if not squad_id:
                print(f"No squad ID for {team}. Skipping...")
                logging.error(f"No squad ID for {team} in {season}")
                continue
                
            url = f"https://fbref.com/en/squads/{squad_id}/{season}/{team}-Stats"
            scrape_team(driver, url, team, season, season_dir)
              time.sleep(2)  # Delay to avoid rate limiting

finally:
    print("Closing browser...")
    time.sleep(2)
    driver.quit()

IndentationError: unexpected indent (1347963052.py, line 155)