In [34]:
import os
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# === Initialize driver ===
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 10)

# Go to Premier League page
driver.get("https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats")
time.sleep(3)

# Get team links
teams = driver.find_elements(By.CSS_SELECTOR, 'td[data-stat="team"] a')
team_names = [t.text for t in teams]
team_links = [t.get_attribute("href") for t in teams]

print(f" Found {len(team_names)} teams.")

 Found 40 teams.


In [35]:
team_names = team_names[:20]
team_links = team_links[:20]


In [36]:


# === Loop over teams ===
for team_name, team_link in zip(team_names, team_links):
    print(f"\n Scraping {team_name}: {team_link}")

    # Create folder for team
    team_folder = os.path.join("../data/raw", team_name.replace(" ", "_"))
    os.makedirs(team_folder, exist_ok=True)

    driver.get(team_link)
    time.sleep(3)

    # --- Scrape PLAYER STATS (standard) ---
    try:
        table1 = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//table[@id="stats_standard_9"]')
        ))

        table1_rows = table1.find_elements(By.CSS_SELECTOR, "tbody tr")

        # Build file path
        player_file = os.path.join(team_folder, "players.csv")

        with open(player_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            # Get header names


            header_rows = table1.find_elements(By.CSS_SELECTOR, "thead tr")
            headers = [th.text.strip() for th in header_rows[-1].find_elements(By.TAG_NAME, "th")][:16]
            writer.writerow(headers)

            first_row = True


            for row in table1_rows:
                cells = row.find_elements(By.XPATH, ".//th | .//td")[:16]
                row_data = [cell.text.strip() for cell in cells]

                # Skip empty or total rows
                if not row_data or row_data[0]  in ["", " ", "Squad Total", "Opponent Total"] or  row_data[2] in ["Playing Time"]:
                    continue

                # Skip the rows that represent the head unless in the first time
                if row_data[0] == "Player" and not first_row:
                    continue

                writer.writerow(row_data)
                first_row = False

        print(f"   Saved player stats → {player_file}")

    except Exception as e:
        print(f"   Could not scrape player stats for {team_name}: {e}")

    # --- Scrape MATCH STATS (keeper advanced) ---
    try:
        table2 = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//table[@id="matchlogs_for"]')
        ))

        table2_rows = table2.find_elements(By.CSS_SELECTOR, "tbody tr")

        match_file = os.path.join(team_folder, "matches.csv")

        with open(match_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            # Get header names
            headers = [th.text.strip() for th in table2.find_elements(By.CSS_SELECTOR, "thead th")][:18]
            writer.writerow(headers)

            first_row = True
            
            for row in table2_rows:
                cells = row.find_elements(By.XPATH, ".//th | .//td")[:18]
                row_data = [cell.text.strip() for cell in cells]
            
                # Skip empty, "Total", or repeated header rows inside tbody
                if not row_data or row_data[0] in ["", "Squad Total", "Opponent Total"]:
                    continue
            
                # Some rows inside tbody repeat the column names ("Date", "Comp", etc.)
                # So if the first cell equals "Date", skip it after the first occurrence
                if row_data[0] == "Date" and not first_row:
                    continue
            
                writer.writerow(row_data)
                first_row = False
            

        print(f"   Saved match stats  {match_file}")
    except Exception as e:
        print(f"  Could not scrape match stats for {team_name}: {e}")

print("\n Done scraping all teams")



 Scraping Liverpool: https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats
   Saved player stats → ../data/raw\Liverpool\players.csv
   Saved match stats  ../data/raw\Liverpool\matches.csv

 Scraping Arsenal: https://fbref.com/en/squads/18bb7c10/2024-2025/Arsenal-Stats
   Saved player stats → ../data/raw\Arsenal\players.csv
   Saved match stats  ../data/raw\Arsenal\matches.csv

 Scraping Manchester City: https://fbref.com/en/squads/b8fd03ef/2024-2025/Manchester-City-Stats
   Saved player stats → ../data/raw\Manchester_City\players.csv
   Saved match stats  ../data/raw\Manchester_City\matches.csv

 Scraping Chelsea: https://fbref.com/en/squads/cff3d9bb/2024-2025/Chelsea-Stats
   Saved player stats → ../data/raw\Chelsea\players.csv
   Saved match stats  ../data/raw\Chelsea\matches.csv

 Scraping Newcastle Utd: https://fbref.com/en/squads/b2b47a98/2024-2025/Newcastle-United-Stats
   Saved player stats → ../data/raw\Newcastle_Utd\players.csv
   Saved match stats  ../data/raw\Newc