In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Set up Selenium WebDriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

In [19]:
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
#options.add_argument("--headless")  # Run headless to reduce detection
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage") 

In [20]:
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

In [21]:
players_df = pd.read_csv("players_list_gca.csv")

In [22]:
all_players_data = []

In [None]:
for index, row in players_df.iterrows():
    try:
        player_name = row["Player"]
        player_url = row["URL"]

        print(f"Scraping data for {player_name}...")

        driver.get(player_url)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(random.uniform(5, 10))

        soup = BeautifulSoup(driver.page_source, "html.parser")

        table = soup.find("table", {"id": "stats_standard_dom_lg"})

        if table:
            for row in table.find_all("tr", {"id": "stats"}):
                season_data = [player_name]

                for cell in row.find_all(["th", "td"]):
                    season_data.append(cell.text.strip())

                all_players_data.append(season_data)
        else:
            print(f"No stats table found for {player_name}")

    except Exception as e:
        print(f"Error with {player_name}: {e}")

Scraping data for Max Aarons...
Scraping data for Joshua Acheampong...
Scraping data for Tyler Adams...
Scraping data for Tosin Adarabioyo...
Scraping data for Simon Adingra...
Scraping data for Emmanuel Agbadou...
Scraping data for Asher Agbinone...
Scraping data for Ola Aina...
Scraping data for Rayan Aït-Nouri...
Scraping data for Kristoffer Ajer...
Scraping data for Manuel Akanji...
Scraping data for Nathan Aké...
Scraping data for Carlos Alcaraz...
Scraping data for Carlos Alcaraz...
Scraping data for Trent Alexander-Arnold...
Scraping data for Alisson...
Scraping data for Miguel Almirón...
Scraping data for Edson Álvarez...
Scraping data for Will Alves...
Scraping data for Samuel Amo-Ameyaw...
Scraping data for Mathis Amougou...
Scraping data for Joachim Andersen...
Scraping data for Joachim Andersen...
Scraping data for Elliot Anderson...
Scraping data for André...
Scraping data for Michail Antonio...
Scraping data for Antony...
Scraping data for Julian Araujo...
Scraping data f

In [24]:
driver.quit()

In [33]:
if all_players_data:
    # Extract headers from the table
    table_headers = table.find("thead").find_all("th")

    # Define the correct column names as required
    correct_columns = ["Player", "Season", "Age", "Squad", "Country", "Comp", "LgRank", "MP", "Starts", "Min", "90s", 
                       "Gls", "Ast", "G+A", "G-PK", "PK", "PKatt", "CrdY", "CrdR", "xG", "npxG", "xAG", "npxG+xAG", 
                       "PrgC", "PrgP", "PrgR", "Gls", "Ast", "G+A", "G-PK", "G+A-PK", "xG", "xAG", "xG+xAG", "npxG", 
                       "npxG+xAG", "Matches"]

    # Ensure that the number of columns matches the first row in data
    num_columns = len(all_players_data[0])  # Get the actual number of data columns
    columns = ["Player"] + [header.text.strip() for header in table_headers[:num_columns-1]]  # Ensure matching count

    # If there is a mismatch, manually assign the correct column names
    if len(correct_columns) == len(columns):
        df = pd.DataFrame(all_players_data, columns=correct_columns)
    else:
        print(f"⚠ Warning: Column count mismatch! Expected {len(correct_columns)}, but got {len(columns)}.")
        print("Adjusting automatically...")
        df = pd.DataFrame(all_players_data, columns=columns[:num_columns])  # Adjust dynamically

    # Save to CSV
    df.to_csv("players_seasonal_stats_selenium.csv", index=False)
    print("✅ Player seasonal data saved to players_seasonal_stats_selenium.csv")
else:
    print("❌ No data was scraped.")


✅ Player seasonal data saved to players_seasonal_stats_selenium.csv


In [34]:
pd.set_option('display.max_columns', None)  # Show all columns

In [35]:
df.head()

Unnamed: 0,Player,Season,Age,Squad,Country,Comp,LgRank,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Matches
0,Max Aarons,2017-2018,17,Norwich City,eng ENG,Jr. PL2 — Div. 2,7th,15,15,1319,14.7,0,1,1,0,0,0,2,0,,,,,,,,0.0,0.07,0.07,0.0,0.07,,,,,,Matches
1,Max Aarons,2018-2019,18,Norwich City,eng ENG,2. Championship,1st,41,41,3664,40.7,2,6,8,2,0,0,8,0,2.6,2.6,4.6,7.2,87.0,151.0,226.0,0.05,0.15,0.2,0.05,0.2,0.06,0.11,0.18,0.06,0.18,Matches
2,Max Aarons,2019-2020,19,Norwich City,eng ENG,1. Premier League,20th,36,36,3240,36.0,0,1,1,0,0,0,7,0,0.6,0.6,3.9,4.5,92.0,78.0,151.0,0.0,0.03,0.03,0.0,0.03,0.02,0.11,0.12,0.02,0.12,Matches
3,Max Aarons,2020-2021,20,Norwich City,eng ENG,2. Championship,1st,45,45,4046,45.0,2,2,4,2,0,0,5,0,2.4,2.4,5.4,7.8,147.0,155.0,288.0,0.04,0.04,0.09,0.04,0.09,0.05,0.12,0.17,0.05,0.17,Matches
4,Max Aarons,2021-2022,21,Norwich City,eng ENG,1. Premier League,20th,34,32,2881,32.0,0,2,2,0,0,0,8,0,0.8,0.8,1.7,2.5,78.0,117.0,85.0,0.0,0.06,0.06,0.0,0.06,0.03,0.05,0.08,0.03,0.08,Matches


In [31]:
df.shape

(5880, 37)

In [32]:
df.columns

Index(['Player', '', '', 'Playing Time', 'Performance', 'Expected',
       'Progression', 'Per 90 Minutes', '', 'Season', 'Age', 'Squad',
       'Country', 'Comp', 'LgRank', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast',
       'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG',
       'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls', 'Ast', 'G+A'],
      dtype='object')

In [52]:
df.to_csv("players_seasonal_stats_selenium.csv", index=False)

In [36]:
duplicate_counts = df.duplicated(subset=["Player", "Season"], keep=False)

In [37]:
players_changed_teams = df[duplicate_counts]["Player"].nunique()

In [38]:
players_changed_teams

441