In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time, os, re

 
# webdriver_manager supplies the chromedriver location that is handed to
# Selenium's Service below.
from webdriver_manager.chrome import ChromeDriverManager

# First and last season start years to scrape (both inclusive); each season
# is addressed as "<year>-<year + 1>".
START_YEAR, END_YEAR = 2015, 2024

# Build the Chrome session used for every page request in this script.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Alternative: point Selenium at a manually installed driver binary instead.
# CHROME_DRIVER_PATH = "C://chromedriver-win64//chromedriver.exe"
# service = Service(CHROME_DRIVER_PATH)
# driver = webdriver.Chrome(service=service)

# Destination folder for the per-season CSV files; reused if it already exists.
os.makedirs("fbref_premier_league_data", exist_ok=True)

# Compiled once at import time instead of on every call.
_TABLE_ID_PATTERN = re.compile(r"results\d{4}-\d{6}_overall")


def get_table_id(soup):
    """Return the first standings-table id found in page markup.

    The id has the form ``results<4 digits>-<6 digits>_overall``.

    Args:
        soup: Page markup. Usually the raw HTML string, but any object whose
            ``str()`` yields the markup (such as a parsed document) is also
            accepted; bytes are decoded as UTF-8.

    Returns:
        The matching id string, or ``None`` if ``soup`` is ``None`` or no
        id of that form occurs in the markup.
    """
    if soup is None:
        return None

    # The regex needs text: the parameter name invites passing a parsed
    # document, which would otherwise raise TypeError in ``search``.
    if isinstance(soup, (bytes, bytearray)):
        markup = bytes(soup).decode("utf-8", errors="replace")
    elif isinstance(soup, str):
        markup = soup
    else:
        markup = str(soup)

    match = _TABLE_ID_PATTERN.search(markup)
    return match.group(0) if match else None
"""##################################################"""
# Scrape the league standings table for every season from START_YEAR to
# END_YEAR (inclusive) and write one CSV per season.
from selenium.common.exceptions import TimeoutException

# Fallback column layout, used only when the table's own header row cannot
# be read from the page.
DEFAULT_HEADERS = [
    "Rk", "Squad", "MP", "W", "D", "L", "GF", "GA", "GD", "Pts", "Pts/MP", "xG", "xGA",
    "xGD", "xGD/90", "Attendance", "Top Team Scorer", "Goalkeeper", "Notes",
]

skipped_seasons = []  # seasons for which no CSV was written

try:
    for year in range(START_YEAR, END_YEAR + 1):
        next_year = year + 1
        season = f"{year}-{next_year}"
        url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats"
        print(f"\n Scraping {season} season...")
        driver.get(url)

        # Give the page a moment to render before reading its markup.
        time.sleep(3)
        html = driver.page_source

        # The table id is looked up in the markup instead of being hard-coded.
        table_id = get_table_id(html)
        if not table_id:
            print(f"Could not find table ID for {season}")
            skipped_seasons.append(season)
            continue

        # Wait until the table is actually present in the DOM. Only the
        # wait's own timeout is treated as "did not load"; anything else
        # (including Ctrl-C) propagates instead of being silently swallowed.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f"table#{table_id}"))
            )
        except TimeoutException:
            print(f"Table did not load for {season}")
            skipped_seasons.append(season)
            continue

        # Re-read the markup after the wait and parse it with BeautifulSoup.
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", {"id": table_id})

        if not table:
            print(f"Table not found for {season}")
            skipped_seasons.append(season)
            continue

        # Take the column names from the table's own header row so values
        # stay under the right column name even when a season's table has a
        # different column set than DEFAULT_HEADERS.
        headers = []
        thead = table.find("thead")
        if thead:
            header_rows = thead.find_all("tr")
            if header_rows:
                headers = [
                    cell.get_text(strip=True)
                    for cell in header_rows[-1].find_all(["th", "td"])
                ]
        if not headers or not all(headers):
            # Header row missing or containing blank names: use the fallback.
            headers = list(DEFAULT_HEADERS)
        width = len(headers)

        # Keep only rows whose first cell is a numeric rank; this drops the
        # header row and any non-data rows.
        data = []
        for row in table.find_all("tr"):
            text_data = [c.get_text(strip=True) for c in row.find_all(["th", "td"])]
            if not text_data or not text_data[0].isdigit():
                continue
            # Pad short rows / trim long rows to exactly the header width.
            text_data += [""] * (width - len(text_data))
            data.append(text_data[:width])

        if not data:
            print(f"No data rows found for {season}")
            skipped_seasons.append(season)
            continue

        df = pd.DataFrame(data, columns=headers)

        filename = f"fbref_premier_league_data/{season}_PremierLeague.csv"
        df.to_csv(filename, index=False)
        print(f"Saved {season} → {filename}")

        # Pause between seasons to avoid hammering the site.
        time.sleep(2)
finally:
    # Always close the browser, even if a request or parse step raised.
    driver.quit()

if skipped_seasons:
    print(f"\nFinished with {len(skipped_seasons)} season(s) skipped: {', '.join(skipped_seasons)}")
else:
    print("\nAll seasons scraped successfully!")



 Scraping 2015-2016 season...
Saved 2015-2016 → fbref_premier_league_data/2015-2016_PremierLeague.csv

 Scraping 2016-2017 season...
Saved 2016-2017 → fbref_premier_league_data/2016-2017_PremierLeague.csv

 Scraping 2017-2018 season...
Saved 2017-2018 → fbref_premier_league_data/2017-2018_PremierLeague.csv

 Scraping 2018-2019 season...
Saved 2018-2019 → fbref_premier_league_data/2018-2019_PremierLeague.csv

 Scraping 2019-2020 season...
Saved 2019-2020 → fbref_premier_league_data/2019-2020_PremierLeague.csv

 Scraping 2020-2021 season...
Saved 2020-2021 → fbref_premier_league_data/2020-2021_PremierLeague.csv

 Scraping 2021-2022 season...
Saved 2021-2022 → fbref_premier_league_data/2021-2022_PremierLeague.csv

 Scraping 2022-2023 season...
Saved 2022-2023 → fbref_premier_league_data/2022-2023_PremierLeague.csv

 Scraping 2023-2024 season...
Saved 2023-2024 → fbref_premier_league_data/2023-2024_PremierLeague.csv

 Scraping 2024-2025 season...
Saved 2024-2025 → fbref_premier_league_dat