In [4]:
import time
import os
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

# Specify the path to the directory containing the ChromeDriver executable
chrome_driver_directory = "C:/Users/moren/Downloads/chromedriver-win64" #insert your own path here #User moreno: 'moren'

# Add the ChromeDriver directory to the PATH environment variable
os.environ["PATH"] += os.pathsep + chrome_driver_directory

In [9]:
# Start one Chrome session; it is reused for every match page below
driver = webdriver.Chrome()

# Inclusive range of match IDs to scrape
start_match_id = 4089710  # First match ID of this run
end_match_id = 4089711  # Last match ID of this run; adjust as required

# Player-level lineup rows of all scraped matches are accumulated here
lineups_df = pd.DataFrame()

# One home/away stats DataFrame per match is collected here
all_lineup_stats_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Construct the URL for the current match ID.
    # NOTE(review): the club slug in the path is identical for every match ID;
    # presumably the site resolves the page from the ID alone - confirm.
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Fixed pause to let the page load
    time.sleep(2)

    try:
        # Wait for the iframe that holds the accept button and switch into it
        wait = WebDriverWait(driver, 2)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)

        # Wait for the 'Accept & continue' button to be clickable inside the iframe
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()

    except TimeoutException:
        # Raised when the iframe or the button does not show up within the wait
        print("Iframe not found. Continuing after a couple of seconds...")
        time.sleep(1)  # Adjust the time delay as needed

    finally:
        # Always return to the main document. If the button wait times out
        # after switch_to.frame(), the driver would otherwise stay inside the
        # iframe and every lookup below would fail.
        driver.switch_to.default_content()

    ## SCRAPING ##

    # Extract the gameday information from the top of the page
    #gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]/a[1]').text
    gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]').text

    # Extract the home and away club names from the 'title' attribute
    home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
    away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")


    # Function to extract data from a table given its rows
    def extract_table_data(table_rows, club_name):
        """Build a per-player DataFrame from the rows of one lineup table.

        Rows at indices 0, 3, 6, ... are parsed as player rows; the rows in
        between are not read. In a player row, cell 1 is expected to hold
        "<name> (<age> years old)" and cell 4 "<position>, <market value>".

        Args:
            table_rows: Sequence of row elements exposing
                ``find_elements(By.TAG_NAME, "td")``; the returned cells must
                have a ``.text`` attribute.
            club_name: Value written to the 'Club' column of every row.

        Returns:
            pandas.DataFrame with the columns 'Position', 'Player', 'Age',
            'Market Value', 'Club' and 'Gameday'. 'Age' is the matched digit
            string or None; 'Market Value' is None when the cell has no
            ", " separator. 'Gameday' is taken from the ``gameday`` variable
            of the enclosing scope.
        """
        positions = []
        players = []
        ages = []
        market_values = []

        for i in range(0, len(table_rows), 3):  # One player every three rows
            cells = table_rows[i].find_elements(By.TAG_NAME, "td")
            if len(cells) < 5:
                # The name/position cells are missing, so this cannot be a
                # player row; skip it instead of failing with IndexError.
                continue

            # Everything before the first ' (' is the name; the age is
            # searched in the remainder of the cell text.
            player_name, _, age_part = cells[1].text.partition(' (')
            age_match = re.search(r'(\d+) years old', age_part)

            # Split on the last ', ' only, so additional separators cannot
            # break the two-value unpacking.
            position_market_value = cells[4].text
            if ', ' in position_market_value:
                position, market_value = position_market_value.rsplit(', ', 1)
            else:
                position, market_value = position_market_value, None

            players.append(player_name.strip())
            ages.append(age_match.group(1) if age_match else None)
            positions.append(position)
            market_values.append(market_value)

        # Size the constant columns from the players actually collected:
        # len(table_rows) // 3 differs from the number of loop iterations
        # whenever the row count is not a multiple of three, which made the
        # DataFrame constructor raise on unequal column lengths.
        return pd.DataFrame({
            'Position': positions,
            'Player': players,
            'Age': ages,
            'Market Value': market_values,
            'Club': [club_name] * len(players),
            'Gameday': [gameday] * len(players),
        })
    # XPath of the player table of each lineup section
    tables_xpaths = {
        'starting_lineup_home': '//*[@id="main"]/main/div[4]/div[1]/div/div[1]/table',
        'substitutes_home': '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table',
        'starting_lineup_away': '//*[@id="main"]/main/div[4]/div[2]/div/div[1]/table',
        'substitutes_away': '//*[@id="main"]/main/div[5]/div[2]/div/div[1]/table'
    }

    # Per-table DataFrames of the current match
    all_tables_df = []

    # Loop through the table paths and extract data
    for key, xpath in tables_xpaths.items():
        try:
            table = driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            print(f"Table not found for {key} in match ID: {match_id}, skipping.")
            continue  # Skip this table if it is not on the page

        rows = table.find_elements(By.TAG_NAME, "tr")
        team_type = 'Home' if 'home' in key else 'Away'
        club_name = home_club_name if 'home' in key else away_club_name
        df = extract_table_data(rows, club_name)
        df['H/A'] = team_type
        df['Status'] = 'Starting' if 'starting' in key else 'Substitute'
        all_tables_df.append(df)

    # Combine all dataframes from the current page into lineups_df
    if all_tables_df:  # Check if there's any data to concatenate
        temp_df = pd.concat(all_tables_df, ignore_index=True)
        temp_df['Match ID'] = match_id  # Add the match_id to every row in temp_df
        lineups_df = pd.concat([lineups_df, temp_df], ignore_index=True)

    def text_or_na(xpath):
        """Return the text of the element at `xpath`, or "N/A" if it is absent."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return "N/A"

    # home_club_name / away_club_name are already set earlier in this loop
    # iteration from the same page, so they are not queried a second time.

    # Manager names; {side} is 1 for the home column and 2 for the away column.
    # A missing manager cell yields "N/A" like the other stats instead of
    # aborting the whole scrape.
    manager_cell = '//*[@id="main"]/main/div[6]/div[{side}]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]'
    home_manager_name = text_or_na(manager_cell.format(side=1))
    away_manager_name = text_or_na(manager_cell.format(side=2))

    # Summary cells below each player table:
    #   {section}: 4 = starting lineup, 5 = substitutes
    #   {side}:    1 = home, 2 = away
    #   {col}:     1 = foreigners, 2 = average age, 3 = purchase value, 4 = total market value
    stats_cell = '//*[@id="main"]/main/div[{section}]/div[{side}]/div/div[2]/table/tbody/tr/td[{col}]'

    foreigners_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=1))
    foreigners_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=1))
    avg_age_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=2))
    avg_age_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=2))
    purchase_value_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=3))
    purchase_value_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=3))
    total_market_value_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=4))
    total_market_value_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=4))

    foreigners_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=1))
    foreigners_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=1))
    avg_age_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=2))
    avg_age_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=2))
    purchase_value_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=3))
    purchase_value_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=3))
    total_market_value_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=4))
    total_market_value_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=4))



    # Function to clean the extracted data by removing preceding text
    def clean_data(text, keep_eur_sign=False):
        """Strip the descriptive label from a scraped summary value.

        Args:
            text: Raw cell text, or the "N/A" placeholder used for missing cells.
            keep_eur_sign: When True, only the 'Purchase value: ' or
                'Total MV: ' label is removed and the rest of the text is
                kept as is. When False, the first number in the text
                (optionally with decimals and a trailing '%') is returned.

        Returns:
            The cleaned string. Text without a known label (keep_eur_sign=True)
            or without any number (keep_eur_sign=False) is returned unchanged.
        """
        if keep_eur_sign:
            for label in ('Purchase value: ', 'Total MV: '):
                if label in text:
                    return text.replace(label, '')
            # No known label, e.g. the "N/A" placeholder: hand the text back
            # unchanged. Falling through here used to return None.
            return text

        # Using regex to find the first numeric value or percentage
        match = re.search(r'\d+(\.\d+)?%?', text)
        return match.group(0) if match else text
    
    # Two-row table (home first, away second) with club, manager and the
    # cleaned summary values of the current match
    stats_columns = {
        'Club': [home_club_name, away_club_name],
        'H/A': ['Home', 'Away'],
        'Manager': [home_manager_name, away_manager_name],
    }

    # column name -> (home raw text, away raw text, keep the currency text?)
    raw_stats = {
        'Foreigners Starting': (foreigners_starting_home, foreigners_starting_away, False),
        'Foreigners Subs': (foreigners_subs_home, foreigners_subs_away, False),
        'Avg Age Starting': (avg_age_starting_home, avg_age_starting_away, False),
        'Avg Age Subs': (avg_age_subs_home, avg_age_subs_away, False),
        'Purchase Value Starting': (purchase_value_starting_home, purchase_value_starting_away, True),
        'Purchase Value Subs': (purchase_value_subs_home, purchase_value_subs_away, True),
        'Total Market Value Starting': (total_market_value_starting_home, total_market_value_starting_away, True),
        'Total Market Value Subs': (total_market_value_subs_home, total_market_value_subs_away, True),
    }
    stats_columns.update({
        column: [clean_data(home_raw, keep_eur), clean_data(away_raw, keep_eur)]
        for column, (home_raw, away_raw, keep_eur) in raw_stats.items()
    })
    stats_columns['Match ID'] = [match_id, match_id]

    lineups_stats_df = pd.DataFrame(stats_columns)

    # Keep the stats of this match for the final concatenation
    all_lineup_stats_dfs.append(lineups_stats_df)

    # Progress report: number of stats dataframes collected so far
    print(f"Collected {len(all_lineup_stats_dfs)} dataframes after match ID: {match_id}")


# Before the concatenation, print out the number of dataframes to be concatenated
print(f"Concatenating {len(all_lineup_stats_dfs)} dataframes.")

# Concatenate all the lineup stats dataframes in the list. pd.concat raises
# ValueError on an empty list, so fall back to an empty frame when no match
# was collected.
if all_lineup_stats_dfs:
    final_lineup_stats_df = pd.concat(all_lineup_stats_dfs, ignore_index=True)
else:
    final_lineup_stats_df = pd.DataFrame()

# Close the driver after scraping is done
driver.quit()

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")

# Save the player-level lineups to a CSV file for persistence. to_csv does
# not create missing directories, so make sure the target folder exists.
# NOTE(review): final_lineup_stats_df is not written to disk here - confirm
# it is persisted elsewhere.
os.makedirs('data', exist_ok=True)
lineups_df.to_csv('data/lineups_2023_2024_1.csv', index=False)


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Collected 1 dataframes after match ID: 4089710
Iframe not found. Continuing after a couple of seconds...
Collected 2 dataframes after match ID: 4089711
Concatenating 2 dataframes.
Webscraping successfully completed for all matches.


In [10]:
# Start one Chrome session; it is reused for every match page below
driver = webdriver.Chrome()

# Inclusive range of match IDs to scrape
start_match_id = 4089693  # First Game ID of the season
end_match_id = 4089824  # Last match ID of this run; adjust as required

# Player-level lineup rows of all scraped matches are accumulated here
lineups_df = pd.DataFrame()

# One home/away stats DataFrame per match is collected here
all_lineup_stats_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Construct the URL for the current match ID.
    # NOTE(review): the club slug in the path is identical for every match ID;
    # presumably the site resolves the page from the ID alone - confirm.
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Fixed pause to let the page load
    time.sleep(2)

    try:
        # Wait for the iframe that holds the accept button and switch into it
        wait = WebDriverWait(driver, 1)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)

        # Wait for the 'Accept & continue' button to be clickable inside the iframe
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()

    except TimeoutException:
        # Raised when the iframe or the button does not show up within the wait
        print("Iframe not found. Continuing after a couple of seconds...")
        time.sleep(1)  # Adjust the time delay as needed

    finally:
        # Always return to the main document. If the button wait times out
        # after switch_to.frame(), the driver would otherwise stay inside the
        # iframe and every lookup below would fail.
        driver.switch_to.default_content()

    ## SCRAPING ##

    # Extract the gameday information from the top of the page
    #gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]/a[1]').text
    gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]').text

    # Extract the home and away club names from the 'title' attribute
    home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
    away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")


    # Function to extract data from a table given its rows
    def extract_table_data(table_rows, club_name):
        """Build a per-player DataFrame from the rows of one lineup table.

        Rows at indices 0, 3, 6, ... are parsed as player rows; the rows in
        between are not read. In a player row, cell 1 is expected to hold
        "<name> (<age> years old)" and cell 4 "<position>, <market value>".

        Args:
            table_rows: Sequence of row elements exposing
                ``find_elements(By.TAG_NAME, "td")``; the returned cells must
                have a ``.text`` attribute.
            club_name: Value written to the 'Club' column of every row.

        Returns:
            pandas.DataFrame with the columns 'Position', 'Player', 'Age',
            'Market Value', 'Club' and 'Gameday'. 'Age' is the matched digit
            string or None; 'Market Value' is None when the cell has no
            ", " separator. 'Gameday' is taken from the ``gameday`` variable
            of the enclosing scope.
        """
        positions = []
        players = []
        ages = []
        market_values = []

        for i in range(0, len(table_rows), 3):  # One player every three rows
            cells = table_rows[i].find_elements(By.TAG_NAME, "td")
            if len(cells) < 5:
                # The name/position cells are missing, so this cannot be a
                # player row; skip it instead of failing with IndexError.
                continue

            # Everything before the first ' (' is the name; the age is
            # searched in the remainder of the cell text.
            player_name, _, age_part = cells[1].text.partition(' (')
            age_match = re.search(r'(\d+) years old', age_part)

            # Split on the last ', ' only, so additional separators cannot
            # break the two-value unpacking.
            position_market_value = cells[4].text
            if ', ' in position_market_value:
                position, market_value = position_market_value.rsplit(', ', 1)
            else:
                position, market_value = position_market_value, None

            players.append(player_name.strip())
            ages.append(age_match.group(1) if age_match else None)
            positions.append(position)
            market_values.append(market_value)

        # Size the constant columns from the players actually collected:
        # len(table_rows) // 3 differs from the number of loop iterations
        # whenever the row count is not a multiple of three, which made the
        # DataFrame constructor raise on unequal column lengths.
        return pd.DataFrame({
            'Position': positions,
            'Player': players,
            'Age': ages,
            'Market Value': market_values,
            'Club': [club_name] * len(players),
            'Gameday': [gameday] * len(players),
        })
    # XPath of the player table of each lineup section
    tables_xpaths = {
        'starting_lineup_home': '//*[@id="main"]/main/div[4]/div[1]/div/div[1]/table',
        'substitutes_home': '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table',
        'starting_lineup_away': '//*[@id="main"]/main/div[4]/div[2]/div/div[1]/table',
        'substitutes_away': '//*[@id="main"]/main/div[5]/div[2]/div/div[1]/table'
    }

    # Per-table DataFrames of the current match
    all_tables_df = []

    # Loop through the table paths and extract data
    for key, xpath in tables_xpaths.items():
        try:
            table = driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            print(f"Table not found for {key} in match ID: {match_id}, skipping.")
            continue  # Skip this table if it is not on the page

        rows = table.find_elements(By.TAG_NAME, "tr")
        team_type = 'Home' if 'home' in key else 'Away'
        club_name = home_club_name if 'home' in key else away_club_name
        df = extract_table_data(rows, club_name)
        df['H/A'] = team_type
        df['Status'] = 'Starting' if 'starting' in key else 'Substitute'
        all_tables_df.append(df)

    # Combine all dataframes from the current page into lineups_df
    if all_tables_df:  # Check if there's any data to concatenate
        temp_df = pd.concat(all_tables_df, ignore_index=True)
        temp_df['Match ID'] = match_id  # Add the match_id to every row in temp_df
        lineups_df = pd.concat([lineups_df, temp_df], ignore_index=True)

    def text_or_na(xpath):
        """Return the text of the element at `xpath`, or "N/A" if it is absent."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return "N/A"

    # home_club_name / away_club_name are already set earlier in this loop
    # iteration from the same page, so they are not queried a second time.

    # Manager names; {side} is 1 for the home column and 2 for the away column.
    # A missing manager cell yields "N/A" like the other stats instead of
    # aborting the whole scrape.
    manager_cell = '//*[@id="main"]/main/div[6]/div[{side}]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]'
    home_manager_name = text_or_na(manager_cell.format(side=1))
    away_manager_name = text_or_na(manager_cell.format(side=2))

    # Summary cells below each player table:
    #   {section}: 4 = starting lineup, 5 = substitutes
    #   {side}:    1 = home, 2 = away
    #   {col}:     1 = foreigners, 2 = average age, 3 = purchase value, 4 = total market value
    stats_cell = '//*[@id="main"]/main/div[{section}]/div[{side}]/div/div[2]/table/tbody/tr/td[{col}]'

    foreigners_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=1))
    foreigners_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=1))
    avg_age_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=2))
    avg_age_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=2))
    purchase_value_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=3))
    purchase_value_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=3))
    total_market_value_starting_home = text_or_na(stats_cell.format(section=4, side=1, col=4))
    total_market_value_subs_home = text_or_na(stats_cell.format(section=5, side=1, col=4))

    foreigners_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=1))
    foreigners_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=1))
    avg_age_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=2))
    avg_age_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=2))
    purchase_value_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=3))
    purchase_value_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=3))
    total_market_value_starting_away = text_or_na(stats_cell.format(section=4, side=2, col=4))
    total_market_value_subs_away = text_or_na(stats_cell.format(section=5, side=2, col=4))



    # Function to clean the extracted data by removing preceding text
    def clean_data(text, keep_eur_sign=False):
        """Strip the descriptive label from a scraped summary value.

        Args:
            text: Raw cell text, or the "N/A" placeholder used for missing cells.
            keep_eur_sign: When True, only the 'Purchase value: ' or
                'Total MV: ' label is removed and the rest of the text is
                kept as is. When False, the first number in the text
                (optionally with decimals and a trailing '%') is returned.

        Returns:
            The cleaned string. Text without a known label (keep_eur_sign=True)
            or without any number (keep_eur_sign=False) is returned unchanged.
        """
        if keep_eur_sign:
            for label in ('Purchase value: ', 'Total MV: '):
                if label in text:
                    return text.replace(label, '')
            # No known label, e.g. the "N/A" placeholder: hand the text back
            # unchanged. Falling through here used to return None.
            return text

        # Using regex to find the first numeric value or percentage
        match = re.search(r'\d+(\.\d+)?%?', text)
        return match.group(0) if match else text
    
    # Two-row table (home first, away second) with club, manager and the
    # cleaned summary values of the current match
    stats_columns = {
        'Club': [home_club_name, away_club_name],
        'H/A': ['Home', 'Away'],
        'Manager': [home_manager_name, away_manager_name],
    }

    # column name -> (home raw text, away raw text, keep the currency text?)
    raw_stats = {
        'Foreigners Starting': (foreigners_starting_home, foreigners_starting_away, False),
        'Foreigners Subs': (foreigners_subs_home, foreigners_subs_away, False),
        'Avg Age Starting': (avg_age_starting_home, avg_age_starting_away, False),
        'Avg Age Subs': (avg_age_subs_home, avg_age_subs_away, False),
        'Purchase Value Starting': (purchase_value_starting_home, purchase_value_starting_away, True),
        'Purchase Value Subs': (purchase_value_subs_home, purchase_value_subs_away, True),
        'Total Market Value Starting': (total_market_value_starting_home, total_market_value_starting_away, True),
        'Total Market Value Subs': (total_market_value_subs_home, total_market_value_subs_away, True),
    }
    stats_columns.update({
        column: [clean_data(home_raw, keep_eur), clean_data(away_raw, keep_eur)]
        for column, (home_raw, away_raw, keep_eur) in raw_stats.items()
    })
    stats_columns['Match ID'] = [match_id, match_id]

    lineups_stats_df = pd.DataFrame(stats_columns)

    # Keep the stats of this match for the final concatenation
    all_lineup_stats_dfs.append(lineups_stats_df)

    # Progress report: number of stats dataframes collected so far
    print(f"Collected {len(all_lineup_stats_dfs)} dataframes after match ID: {match_id}")


# Before the concatenation, print out the number of dataframes to be concatenated
print(f"Concatenating {len(all_lineup_stats_dfs)} dataframes.")

# Concatenate all the lineup stats dataframes in the list. pd.concat raises
# ValueError on an empty list, so fall back to an empty frame when no match
# was collected.
if all_lineup_stats_dfs:
    final_lineup_stats_df = pd.concat(all_lineup_stats_dfs, ignore_index=True)
else:
    final_lineup_stats_df = pd.DataFrame()

# Close the driver after scraping is done
driver.quit()

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")

# Save the player-level lineups to a CSV file for persistence. to_csv does
# not create missing directories, so make sure the target folder exists.
# NOTE(review): final_lineup_stats_df is not written to disk here - confirm
# it is persisted elsewhere.
os.makedirs('data', exist_ok=True)
lineups_df.to_csv('data/lineups_2023_2024_1.csv', index=False)


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Collected 1 dataframes after match ID: 4089693
Iframe not found. Continuing after a couple of seconds...
Collected 2 dataframes after match ID: 4089694
Iframe not found. Continuing after a couple of seconds...
Collected 3 dataframes after match ID: 4089695
Iframe not found. Continuing after a couple of seconds...
Collected 4 dataframes after match ID: 4089696
Iframe not found. Continuing after a couple of seconds...
Collected 5 dataframes after match ID: 4089697
Iframe not found. Continuing after a couple of seconds...
Collected 6 dataframes after match ID: 4089698
Iframe not found. Continuing after a couple of seconds...
Collected 7 dataframes after match ID: 4089699
Iframe not found. Continuing after a couple of seconds...
Collected 8 dataframes after match ID: 4089700
Iframe not found. Continuing after a couple of seconds...
Collected 9 dataframes after match ID: 4089701
Iframe not found. Continuing after a couple of seconds...
Collected 10 dataframes after match ID: 4089702
Iframe 

In [13]:
# Start one Chrome session; it is reused for every match page below
driver = webdriver.Chrome()

# Inclusive range of match IDs to scrape
start_match_id = 4244785  # 4244791 = first game ID of the season
end_match_id = 4244789  # alternative end: 4244838; adjust as required

# Player-level lineup rows of all scraped matches are accumulated here
lineups_df = pd.DataFrame()

# One home/away stats DataFrame per match is collected here
all_lineup_stats_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Construct the URL for the current match ID.
    # NOTE(review): the club slug in the path is identical for every match ID;
    # presumably the site resolves the page from the ID alone - confirm.
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Fixed pause to let the page load
    time.sleep(2)

    try:
        # Wait for the iframe that holds the accept button and switch into it
        wait = WebDriverWait(driver, 1)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)

        # Wait for the 'Accept & continue' button to be clickable inside the iframe
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()

    except TimeoutException:
        # Raised when the iframe or the button does not show up within the wait
        print("Iframe not found. Continuing after a couple of seconds...")
        time.sleep(1)  # Adjust the time delay as needed

    finally:
        # Always return to the main document. If the button wait times out
        # after switch_to.frame(), the driver would otherwise stay inside the
        # iframe and every lookup below would fail.
        driver.switch_to.default_content()

    ## SCRAPING ##

    # Extract the gameday information from the top of the page
    #gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]/a[1]').text
    gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]').text

    # Extract the home and away club names from the 'title' attribute
    home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
    away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")


    # Function to extract data from a table given its rows
    def extract_table_data(table_rows, club_name):
        """Build a per-player DataFrame from the rows of one lineup table.

        Rows at indices 0, 3, 6, ... are parsed as player rows; the rows in
        between are not read. In a player row, cell 1 is expected to hold
        "<name> (<age> years old)" and cell 4 "<position>, <market value>".

        Args:
            table_rows: Sequence of row elements exposing
                ``find_elements(By.TAG_NAME, "td")``; the returned cells must
                have a ``.text`` attribute.
            club_name: Value written to the 'Club' column of every row.

        Returns:
            pandas.DataFrame with the columns 'Position', 'Player', 'Age',
            'Market Value', 'Club' and 'Gameday'. 'Age' is the matched digit
            string or None; 'Market Value' is None when the cell has no
            ", " separator. 'Gameday' is taken from the ``gameday`` variable
            of the enclosing scope.
        """
        positions = []
        players = []
        ages = []
        market_values = []

        for i in range(0, len(table_rows), 3):  # One player every three rows
            cells = table_rows[i].find_elements(By.TAG_NAME, "td")
            if len(cells) < 5:
                # The name/position cells are missing, so this cannot be a
                # player row; skip it instead of failing with IndexError.
                continue

            # Everything before the first ' (' is the name; the age is
            # searched in the remainder of the cell text.
            player_name, _, age_part = cells[1].text.partition(' (')
            age_match = re.search(r'(\d+) years old', age_part)

            # Split on the last ', ' only, so additional separators cannot
            # break the two-value unpacking.
            position_market_value = cells[4].text
            if ', ' in position_market_value:
                position, market_value = position_market_value.rsplit(', ', 1)
            else:
                position, market_value = position_market_value, None

            players.append(player_name.strip())
            ages.append(age_match.group(1) if age_match else None)
            positions.append(position)
            market_values.append(market_value)

        # Size the constant columns from the players actually collected:
        # len(table_rows) // 3 differs from the number of loop iterations
        # whenever the row count is not a multiple of three, which made the
        # DataFrame constructor raise on unequal column lengths.
        return pd.DataFrame({
            'Position': positions,
            'Player': players,
            'Age': ages,
            'Market Value': market_values,
            'Club': [club_name] * len(players),
            'Gameday': [gameday] * len(players),
        })
    all_tables_df = []

    # XPath or CSS Selector for each table
    tables_xpaths = {
        'starting_lineup_home': '//*[@id="main"]/main/div[4]/div[1]/div/div[1]/table', 
        'substitutes_home': '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table',
        'starting_lineup_away': '//*[@id="main"]/main/div[4]/div[2]/div/div[1]/table',
        'substitutes_away': '//*[@id="main"]/main/div[5]/div[2]/div/div[1]/table'
    }

    all_tables_df = []

    # Loop through the table paths and extract data
    for key, xpath in tables_xpaths.items():
        try:
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.TAG_NAME, "tr")
            team_type = 'Home' if 'home' in key else 'Away'
            club_name = home_club_name if 'home' in key else away_club_name
            df = extract_table_data(rows, club_name)  # Your custom function to extract data from rows
            df['H/A'] = team_type
            df['Status'] = 'Starting' if 'starting' in key else 'Substitute'
            all_tables_df.append(df)
        except NoSuchElementException:
            print(f"Table not found for {key} in match ID: {match_id}, skipping.")
            continue  # Skip this iteration if table is not found

    # Combine all dataframes from the current page into lineups_df
    if all_tables_df:  # Check if there's any data to concatenate
        temp_df = pd.concat(all_tables_df, ignore_index=True)
        temp_df['Match ID'] = match_id  # Add the match_id to every row in temp_df
    
        # Assuming lineups_df is defined somewhere above as the final dataframe
        lineups_df = pd.concat([lineups_df, temp_df], ignore_index=True)

    # Extract the home and away club names
    home_club_name_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]')
    home_club_name = home_club_name_element.get_attribute("title")
    away_club_name_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]')
    away_club_name = away_club_name_element.get_attribute("title")

    # Extract home and away managers' names using the updated XPaths
    home_manager_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[6]/div[1]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]')
    home_manager_name = home_manager_element.text
    away_manager_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[6]/div[2]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]')
    away_manager_name = away_manager_element.text

    # Extract additional information for both home and away teams with exception handling
    try:
        foreigners_starting_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[1]/div/div[2]/table/tbody/tr/td[1]').text
    except NoSuchElementException:
        foreigners_starting_home = "N/A"

    try:
        foreigners_subs_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[1]/div/div[2]/table/tbody/tr/td[1]').text
    except NoSuchElementException:
        foreigners_subs_home = "N/A"

    try:
        avg_age_starting_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[1]/div/div[2]/table/tbody/tr/td[2]').text
    except NoSuchElementException:
        avg_age_starting_home = "N/A"

    try:
        avg_age_subs_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[1]/div/div[2]/table/tbody/tr/td[2]').text
    except NoSuchElementException:
        avg_age_subs_home = "N/A"

    try:
        purchase_value_starting_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[1]/div/div[2]/table/tbody/tr/td[3]').text
    except NoSuchElementException:
        purchase_value_starting_home = "N/A"

    try:
        purchase_value_subs_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[1]/div/div[2]/table/tbody/tr/td[3]').text
    except NoSuchElementException:
        purchase_value_subs_home = "N/A"

    try:
        total_market_value_starting_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[1]/div/div[2]/table/tbody/tr/td[4]').text
    except NoSuchElementException:
        total_market_value_starting_home = "N/A"

    try:
        total_market_value_subs_home = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[1]/div/div[2]/table/tbody/tr/td[4]').text
    except NoSuchElementException:
        total_market_value_subs_home = "N/A"

    try:
        foreigners_starting_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[2]/div/div[2]/table/tbody/tr/td[1]').text
    except NoSuchElementException:
        foreigners_starting_away = "N/A"

    try:
        foreigners_subs_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[2]/div/div[2]/table/tbody/tr/td[1]').text
    except NoSuchElementException:
        foreigners_subs_away = "N/A"

    try:
        avg_age_starting_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[2]/div/div[2]/table/tbody/tr/td[2]').text
    except NoSuchElementException:
        avg_age_starting_away = "N/A"

    try:
        avg_age_subs_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[2]/div/div[2]/table/tbody/tr/td[2]').text
    except NoSuchElementException:
        avg_age_subs_away = "N/A"

    try:
        purchase_value_starting_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[2]/div/div[2]/table/tbody/tr/td[3]').text
    except NoSuchElementException:
        purchase_value_starting_away = "N/A"

    try:
        purchase_value_subs_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[2]/div/div[2]/table/tbody/tr/td[3]').text
    except NoSuchElementException:
        purchase_value_subs_away = "N/A"

    try:
        total_market_value_starting_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[4]/div[2]/div/div[2]/table/tbody/tr/td[4]').text
    except NoSuchElementException:
        total_market_value_starting_away = "N/A"

    try:
        total_market_value_subs_away = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[2]/div/div[2]/table/tbody/tr/td[4]').text
    except NoSuchElementException:
        total_market_value_subs_away = "N/A"



    # Function to clean the extracted data by removing preceding text
    def clean_data(text, keep_eur_sign=False):
        """Strip the descriptive label from a scraped statistics cell.

        Args:
            text: raw cell text, or the "N/A" placeholder.
            keep_eur_sign: when True, only the 'Purchase value: ' / 'Total MV: '
                label is removed and the rest of the text is kept as is.
                When False, the first number in the text (optionally with a
                decimal part and a trailing '%') is returned.

        Returns:
            The cleaned string; the input text unchanged when there is nothing
            to strip or no number to extract.
        """
        if keep_eur_sign:
            for prefix in ('Purchase value: ', 'Total MV: '):
                if prefix in text:
                    return text.replace(prefix, '')
            # No known label (e.g. the "N/A" placeholder): keep the text rather
            # than falling through and returning None
            return text

        # Using regex to find numeric values or percentages and return them for other columns
        match = re.search(r'\d+(\.\d+)?%?', text)
        return match.group(0) if match else text
    
    # Create a DataFrame for the club and manager information along with the newly extracted data
    lineups_stats_df = pd.DataFrame({
        'Club': [home_club_name, away_club_name],
        'H/A': ['Home', 'Away'],
        'Manager': [home_manager_name, away_manager_name],
        'Foreigners Starting': [clean_data(foreigners_starting_home), clean_data(foreigners_starting_away)],
        'Foreigners Subs': [clean_data(foreigners_subs_home), clean_data(foreigners_subs_away)],
        'Avg Age Starting': [clean_data(avg_age_starting_home), clean_data(avg_age_starting_away)],
        'Avg Age Subs': [clean_data(avg_age_subs_home), clean_data(avg_age_subs_away)],
        'Purchase Value Starting': [clean_data(purchase_value_starting_home, True), clean_data(purchase_value_starting_away, True)],
        'Purchase Value Subs': [clean_data(purchase_value_subs_home, True), clean_data(purchase_value_subs_away, True)],
        'Total Market Value Starting': [clean_data(total_market_value_starting_home, True), clean_data(total_market_value_starting_away, True)],
        'Total Market Value Subs': [clean_data(total_market_value_subs_home, True), clean_data(total_market_value_subs_away, True)],
        'Match ID': [match_id, match_id]
    })

    # Append the lineup stats dataframe for the current match to the list
    all_lineup_stats_dfs.append(lineups_stats_df)

    # Print the number of dataframes collected after each match
    print(f"Collected {len(all_lineup_stats_dfs)} dataframes after match ID: {match_id}")


# Before the concatenation, print out the number of dataframes to be concatenated
print(f"Concatenating {len(all_lineup_stats_dfs)} dataframes.")

# Concatenate all the lineup stats dataframes in the list
final_lineup_stats_df = pd.concat(all_lineup_stats_dfs, ignore_index=True)

# Close the driver after scraping is done
driver.quit()

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")

# Finally, save the dataframe to a CSV file for persistence
lineups_df.to_csv('data/lineups_2023_2024_2.csv', index=False)


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Collected 1 dataframes after match ID: 4244785
Iframe not found. Continuing after a couple of seconds...
Collected 2 dataframes after match ID: 4244786
Iframe not found. Continuing after a couple of seconds...
Collected 3 dataframes after match ID: 4244787
Iframe not found. Continuing after a couple of seconds...
Collected 4 dataframes after match ID: 4244788
Iframe not found. Continuing after a couple of seconds...
Collected 5 dataframes after match ID: 4244789
Concatenating 5 dataframes.
Webscraping successfully completed for all matches.


In [14]:
# Start a Chrome session; chromedriver is resolved through PATH
driver = webdriver.Chrome()

# Inclusive range of match-report IDs scraped by this cell
start_match_id = 4244791 # first match ID of this batch
end_match_id = 4244838 # last match ID of this batch (inclusive); adjust as required

# Player-level lineup rows of all matches; grown with one concat per match
lineups_df = pd.DataFrame()

# One two-row (home/away) team-statistics dataframe per match
all_lineup_stats_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Only the trailing ID changes between matches; the club slug in the path is fixed.
    # NOTE(review): presumably the site resolves the report by ID alone — confirm.
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Fixed pause to let the page finish loading before elements are queried
    time.sleep(2)

    try:
        # The consent banner is served inside its own iframe: wait briefly for it and enter it
        wait = WebDriverWait(driver, 1)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)

        # Now wait for the 'Accept & continue' button to be clickable inside the iframe
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()

    except TimeoutException:
        # Either the iframe or the button did not show up in time
        print("Iframe not found. Continuing after a couple of seconds...")
        time.sleep(1)  # Adjust the time delay as needed

    finally:
        # Always return to the main document. Without this, a timeout that
        # happens after the iframe was entered leaves the driver inside the
        # iframe, and every main-page lookup below fails.
        driver.switch_to.default_content()

    
    
    ## SCRAPING ##
    
    # Gameday text from the match header. The whole first paragraph is read;
    # an earlier variant read only its first link (.../p[1]/a[1]).
    gameday = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/p[1]').text

    # Club names come from the 'title' attribute of the second link in the
    # left (home) and right (away) header boxes
    home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
    away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")


    # Function to extract data from a table given its rows
    def extract_table_data(table_rows, club_name):
        """Build a lineup DataFrame from the <tr> elements of one lineup table.

        Each player's data set spans three consecutive rows, so only every
        third row (index 0, 3, 6, ...) is parsed. Its second cell is read as
        "Name (NN years old)" and its fifth cell as "Position, Market value".

        Args:
            table_rows: sequence of row elements exposing ``find_elements``.
            club_name: club recorded for every player of this table.

        Returns:
            DataFrame with the columns Position, Player, Age, Market Value,
            Club and Gameday, one row per parsed player. ``gameday`` is read
            from the enclosing scope.
        """
        columns = ['Position', 'Player', 'Age', 'Market Value', 'Club', 'Gameday']
        records = []

        for i in range(0, len(table_rows), 3):  # Increment by 3 for each player's data set
            cells = table_rows[i].find_elements(By.TAG_NAME, "td")

            # "Name (NN years old)": the name is everything before the first " ("
            player_name, _, age_part = cells[1].text.partition(' (')
            age_match = re.search(r'(\d+) years old', age_part)

            # "Position, Market value": split on the LAST ", " so an extra comma
            # in the text cannot break the two-value unpacking
            position_market_value = cells[4].text
            if ', ' in position_market_value:
                position, market_value = position_market_value.rsplit(', ', 1)
            else:
                position, market_value = position_market_value, None

            # Club and gameday are stored per record, so every column always has
            # the same length even when len(table_rows) is not a multiple of 3
            records.append({
                'Position': position,
                'Player': player_name.strip(),
                'Age': age_match.group(1) if age_match else None,
                'Market Value': market_value,
                'Club': club_name,
                'Gameday': gameday,
            })

        return pd.DataFrame(records, columns=columns)
    # XPath of each lineup table. The key text drives the labels below:
    # 'home' in the key -> H/A = Home, 'starting' in the key -> Status = Starting
    tables_xpaths = {
        'starting_lineup_home': '//*[@id="main"]/main/div[4]/div[1]/div/div[1]/table',
        'substitutes_home': '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table',
        'starting_lineup_away': '//*[@id="main"]/main/div[4]/div[2]/div/div[1]/table',
        'substitutes_away': '//*[@id="main"]/main/div[5]/div[2]/div/div[1]/table'
    }

    # Per-table dataframes of the current match (initialised once)
    all_tables_df = []

    # Loop through the table paths and extract data
    for key, xpath in tables_xpaths.items():
        # Only the table lookup can raise NoSuchElementException
        try:
            table = driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            print(f"Table not found for {key} in match ID: {match_id}, skipping.")
            continue  # Skip this iteration if table is not found

        rows = table.find_elements(By.TAG_NAME, "tr")
        team_type = 'Home' if 'home' in key else 'Away'
        club_name = home_club_name if 'home' in key else away_club_name

        # A table with an unexpected layout must not abort the whole run:
        # nothing is written to disk until every match has been processed
        try:
            df = extract_table_data(rows, club_name)
        except (IndexError, ValueError) as exc:
            print(f"Could not parse table {key} in match ID: {match_id} ({exc}), skipping.")
            continue

        df['H/A'] = team_type
        df['Status'] = 'Starting' if 'starting' in key else 'Substitute'
        all_tables_df.append(df)

    # Combine all dataframes from the current page into lineups_df
    if all_tables_df:  # Check if there's any data to concatenate
        temp_df = pd.concat(all_tables_df, ignore_index=True)
        temp_df['Match ID'] = match_id  # Add the match_id to every row in temp_df

        # lineups_df is the accumulator created before the loop
        lineups_df = pd.concat([lineups_df, temp_df], ignore_index=True)

    # Extract the home and away club names
    home_club_name_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]')
    home_club_name = home_club_name_element.get_attribute("title")
    away_club_name_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]')
    away_club_name = away_club_name_element.get_attribute("title")

    def _text_or_na(xpath):
        """Return the text of the element at `xpath`, or "N/A" when it is not on the page."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return "N/A"

    def _lineup_stat(section, side, column):
        """Read one cell of a lineup summary table.

        section: 4 = starting lineup block, 5 = substitutes block
        side:    1 = home, 2 = away
        column:  1 = foreigners, 2 = average age, 3 = purchase value, 4 = total market value
        """
        return _text_or_na(
            f'//*[@id="main"]/main/div[{section}]/div[{side}]/div/div[2]/table/tbody/tr/td[{column}]'
        )

    # Managers' names. These use the same "N/A" fallback as the statistics
    # below, so a page without a manager box no longer aborts the whole run.
    home_manager_name = _text_or_na('//*[@id="main"]/main/div[6]/div[1]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]')
    away_manager_name = _text_or_na('//*[@id="main"]/main/div[6]/div[2]/div/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]')

    # Home team summary cells (starting lineup / substitutes)
    foreigners_starting_home = _lineup_stat(4, 1, 1)
    foreigners_subs_home = _lineup_stat(5, 1, 1)
    avg_age_starting_home = _lineup_stat(4, 1, 2)
    avg_age_subs_home = _lineup_stat(5, 1, 2)
    purchase_value_starting_home = _lineup_stat(4, 1, 3)
    purchase_value_subs_home = _lineup_stat(5, 1, 3)
    total_market_value_starting_home = _lineup_stat(4, 1, 4)
    total_market_value_subs_home = _lineup_stat(5, 1, 4)

    # Away team summary cells (starting lineup / substitutes)
    foreigners_starting_away = _lineup_stat(4, 2, 1)
    foreigners_subs_away = _lineup_stat(5, 2, 1)
    avg_age_starting_away = _lineup_stat(4, 2, 2)
    avg_age_subs_away = _lineup_stat(5, 2, 2)
    purchase_value_starting_away = _lineup_stat(4, 2, 3)
    purchase_value_subs_away = _lineup_stat(5, 2, 3)
    total_market_value_starting_away = _lineup_stat(4, 2, 4)
    total_market_value_subs_away = _lineup_stat(5, 2, 4)



    # Function to clean the extracted data by removing preceding text
    def clean_data(text, keep_eur_sign=False):
        """Strip the descriptive label from a scraped statistics cell.

        Args:
            text: raw cell text, or the "N/A" placeholder.
            keep_eur_sign: when True, only the 'Purchase value: ' / 'Total MV: '
                label is removed and the rest of the text is kept as is.
                When False, the first number in the text (optionally with a
                decimal part and a trailing '%') is returned.

        Returns:
            The cleaned string; the input text unchanged when there is nothing
            to strip or no number to extract.
        """
        if keep_eur_sign:
            for prefix in ('Purchase value: ', 'Total MV: '):
                if prefix in text:
                    return text.replace(prefix, '')
            # No known label (e.g. the "N/A" placeholder): keep the text rather
            # than falling through and returning None
            return text

        # Using regex to find numeric values or percentages and return them for other columns
        match = re.search(r'\d+(\.\d+)?%?', text)
        return match.group(0) if match else text
    
    # Team-level summary of the current match: exactly two rows, home first and
    # away second. Count/age cells go through clean_data(...) and the
    # purchase/market value cells through clean_data(..., True).
    lineups_stats_df = pd.DataFrame({
        'Club': [home_club_name, away_club_name],
        'H/A': ['Home', 'Away'],
        'Manager': [home_manager_name, away_manager_name],
        'Foreigners Starting': [clean_data(foreigners_starting_home), clean_data(foreigners_starting_away)],
        'Foreigners Subs': [clean_data(foreigners_subs_home), clean_data(foreigners_subs_away)],
        'Avg Age Starting': [clean_data(avg_age_starting_home), clean_data(avg_age_starting_away)],
        'Avg Age Subs': [clean_data(avg_age_subs_home), clean_data(avg_age_subs_away)],
        'Purchase Value Starting': [clean_data(purchase_value_starting_home, True), clean_data(purchase_value_starting_away, True)],
        'Purchase Value Subs': [clean_data(purchase_value_subs_home, True), clean_data(purchase_value_subs_away, True)],
        'Total Market Value Starting': [clean_data(total_market_value_starting_home, True), clean_data(total_market_value_starting_away, True)],
        'Total Market Value Subs': [clean_data(total_market_value_subs_home, True), clean_data(total_market_value_subs_away, True)],
        'Match ID': [match_id, match_id]
    })

    # Keep the per-match frame; all frames are concatenated once after the loop
    all_lineup_stats_dfs.append(lineups_stats_df)

    # Progress message: running count of collected matches
    print(f"Collected {len(all_lineup_stats_dfs)} dataframes after match ID: {match_id}")


# Close the browser first, so it is released even if one of the steps below fails
driver.quit()

# Before the concatenation, print out the number of dataframes to be concatenated
print(f"Concatenating {len(all_lineup_stats_dfs)} dataframes.")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
if all_lineup_stats_dfs:
    final_lineup_stats_df = pd.concat(all_lineup_stats_dfs, ignore_index=True)
else:
    final_lineup_stats_df = pd.DataFrame()

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")

# Finally, save the player-level lineups to a CSV file for persistence.
# The target directory is created when missing, so a long scrape is not lost
# to a missing folder.
# NOTE(review): final_lineup_stats_df is built here but not written to disk — confirm this is intended.
os.makedirs('data', exist_ok=True)
lineups_df.to_csv('data/lineups_2023_2024_3.csv', index=False)


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Collected 1 dataframes after match ID: 4244791
Iframe not found. Continuing after a couple of seconds...
Collected 2 dataframes after match ID: 4244792
Iframe not found. Continuing after a couple of seconds...
Collected 3 dataframes after match ID: 4244793
Iframe not found. Continuing after a couple of seconds...
Collected 4 dataframes after match ID: 4244794
Iframe not found. Continuing after a couple of seconds...
Collected 5 dataframes after match ID: 4244795
Iframe not found. Continuing after a couple of seconds...
Collected 6 dataframes after match ID: 4244796
Iframe not found. Continuing after a couple of seconds...
Collected 7 dataframes after match ID: 4244797
Iframe not found. Continuing after a couple of seconds...
Collected 8 dataframes after match ID: 4244798
Iframe not found. Continuing after a couple of seconds...
Collected 9 dataframes after match ID: 4244799
Iframe not found. Continuing after a couple of seconds...
Collected 10 dataframes after match ID: 4244800
Iframe 

In [None]:
lineups_df

In [None]:
all_lineup_stats_dfs

In [None]:
final_lineup_stats_df

In [None]:
lineups_stats_df

In [15]:
# Start a Chrome session; chromedriver is resolved through PATH
driver = webdriver.Chrome()

# Inclusive range of match-report IDs scraped by this cell
start_match_id = 4089693  # first match ID of this batch
end_match_id = 4089824 # last match ID of this batch (inclusive); 4244838 was used as an alternative end value

# One events dataframe per match that yielded at least one event
all_events_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Only the trailing ID changes between matches; the club slug in the path is fixed.
    # NOTE(review): presumably the site resolves the report by ID alone — confirm.
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/index/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Fixed pause to let the page finish loading before elements are queried
    time.sleep(2)

    # Handling the iframe and accept button if exists
    try:
        wait = WebDriverWait(driver, 2)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()
    except TimeoutException:
        # Expected case: no consent banner (or no button) appeared in time
        print("Iframe not found. Continuing after a couple of seconds...")
    except Exception as exc:
        # Stay best-effort for unexpected WebDriver errors, but report them.
        # Unlike a bare `except:`, this lets KeyboardInterrupt stop the run.
        print(f"Consent banner handling failed ({type(exc).__name__}: {exc}). Continuing...")
    finally:
        # Always return to the main document. Without this, a failure after the
        # iframe was entered leaves the driver inside the iframe, and every
        # main-page lookup below fails.
        driver.switch_to.default_content()

    ## SCRAPING ## 

    # Club names come from the 'title' attribute of the club links in the first
    # (home) and second (away) boxes of the page's fifth main block.
    # Neither lookup is guarded: a page without these links raises NoSuchElementException.
    home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[1]/div[1]/div[2]/nobr/a').get_attribute("title")
    away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[2]/div[1]/div[2]/nobr/a').get_attribute("title")

    # Function to convert pixel values to minutes based on the pattern provided
    def convert_px_to_minute(x_px, y_px):
        """Convert a pair of pixel offsets into a match-minute label.

        The horizontal offset selects the unit minute and the vertical offset
        the tens, both in steps of 36 px (presumably the tile size of the
        clock sprite — confirm): minute = x // 36 + 1 + (y // 36) * 10.
        The sign of the offsets is ignored.

        Args:
            x_px, y_px: offsets as ints or strings such as "-36px".

        Returns:
            The minute as a string with a trailing apostrophe, e.g. "43'",
            or None when either offset is missing or holds no number.
        """
        def _to_offset(value):
            # Parse the leading integer of the value and drop its sign.
            # Reading one number (instead of stripping every non-digit) keeps a
            # decimal such as 36.5 from being glued together as 365.
            if value is None:
                return None
            number = re.search(r'-?\d+', str(value))
            return abs(int(number.group(0))) if number else None

        x_offset = _to_offset(x_px)
        y_offset = _to_offset(y_px)
        if x_offset is None or y_offset is None:
            # No usable position: report "unknown" instead of raising ValueError
            return None

        unit_minutes = (x_offset // 36) + 1
        ten_minutes = (y_offset // 36) * 10
        return f"{unit_minutes + ten_minutes}'"


    def extract_px_from_style(style_str):
        """Return the first two pixel values found in an inline style string.

        Args:
            style_str: value of a ``style`` attribute, e.g.
                "background-position: -36px -72px;". May be None or empty.

        Returns:
            (x_px, y_px) as ints with their sign preserved (decimal values are
            truncated), or (None, None) when the string is missing or holds
            fewer than two pixel values.
        """
        if not style_str:
            # Missing attribute: nothing to parse
            return None, None

        # Capture the numeric part directly; the optional fraction keeps a
        # value like "-36.5px" from being misread as "5px"
        px_values = re.findall(r'(-?\d+(?:\.\d+)?)px', style_str)
        if len(px_values) < 2:
            return None, None

        x_px, y_px = (int(float(value)) for value in px_values[:2])  # Take the first two values
        return x_px, y_px


    # Function to extract events with Remark Event adjustment
    def extract_events(event_type_xpath, event_type, home_club_name, away_club_name):
        """Parse one event list (goals, substitutions or cards) of the current match page.

        Args:
            event_type_xpath: XPath of the <ul> element that holds the events.
            event_type: "Goal", "Substitution" or anything else (treated as a
                card); selects the parsing rules.
            home_club_name: club recorded for items whose class contains "heim".
            away_club_name: club recorded for all other items.

        Returns:
            A list with one dict per <li>, with the keys Timestamp, Club, H/A,
            Event, Player Event, Remark Event, Player Assist, Player Out and
            Match ID (``match_id`` is read from the enclosing scope). An empty
            list when the event list itself is not on the page.
        """
        # Only a missing list means "no events of this type". Problems inside a
        # single item are handled per item below, so one incomplete <li> no
        # longer discards the events that were already parsed.
        try:
            events_list = driver.find_element(By.XPATH, event_type_xpath)
        except NoSuchElementException:
            print(f"No {event_type} events found on the page.")
            return []

        events_data = []

        for item in events_list.find_elements(By.TAG_NAME, "li"):
            team = "Home" if "heim" in (item.get_attribute("class") or "") else "Away"
            club = home_club_name if team == "Home" else away_club_name

            # The minute is encoded in the pixel offsets of the clock span's
            # style; it stays None when the span or its offsets are missing
            timestamp = None
            try:
                style_str = item.find_element(By.XPATH, ".//div/div[1]/span").get_attribute("style")
            except NoSuchElementException:
                style_str = None
            x_px, y_px = extract_px_from_style(style_str) if style_str else (None, None)
            if x_px is not None and y_px is not None:
                timestamp = convert_px_to_minute(x_px, y_px)

            player_event = "N/A"  # Default value if player name is not found
            player_out = None
            remark_event = ""
            player_assist = None

            try:
                full_text = item.find_element(By.XPATH, ".//div/div[4]").text.strip()
                if event_type == "Substitution":
                    # The last text line is read as "<player out>[, <remark>]"
                    parts = full_text.split('\n')
                    if len(parts) > 1:
                        player_out_parts = parts[-1].split(', ')
                        player_out = player_out_parts[0]
                        if len(player_out_parts) > 1:
                            remark_event = player_out_parts[1]
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/span[1]/a").get_attribute("title")
                else:
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/a").get_attribute("title")
                    full_text = item.find_element(By.XPATH, ".//div/div[4]").text
                    if event_type == "Goal":
                        parts = full_text.split(',')
                        if len(parts) > 2:  # At least 3 parts: a remark is present
                            remark_event = parts[1].strip()
                            # The assist is only read in this branch
                            if "Assist:" in full_text:
                                player_assist = full_text.split('Assist:')[1].split(',')[0].strip()
                        else:
                            # NOTE(review): with exactly two parts this stores parts[0],
                            # i.e. the text before the first comma — confirm that is the
                            # intended remark and not the scorer's name.
                            remark_event = parts[0].strip() if len(parts) > 1 else ""
                    else:
                        # Cards: the text after the last comma, or the whole text
                        remark_event = full_text.split(',')[-1].strip() if ',' in full_text else full_text
            except NoSuchElementException:
                # Best effort: keep the defaults (and whatever was parsed before
                # the missing element) for this item
                pass

            card_type = event_type  # Default: the event type itself
            if event_type == "Card":
                try:
                    card_span_class = item.find_element(By.XPATH, ".//div/div[2]/span").get_attribute("class") or ""
                except NoSuchElementException:
                    card_span_class = ""  # Unknown colour: the event stays "Card"
                if "gelbrot" in card_span_class:
                    card_type = "Yellow-Red Card"
                elif "gelb" in card_span_class and "rot" not in card_span_class:
                    card_type = "Yellow Card"
                elif "rot" in card_span_class:
                    card_type = "Direct Red Card"

            events_data.append({
                "Timestamp": timestamp,
                "Club": club,
                "H/A": team,
                "Event": card_type,
                "Player Event": player_event,
                "Remark Event": remark_event,
                "Player Assist": player_assist,
                "Player Out": player_out,
                "Match ID": match_id,
            })

        return events_data

    # Event records of the current match, across all event types
    all_events_data = []
    # XPath of the <ul> holding each event type (goals, substitutions, cards)
    event_types = {"Goal": '//*[@id="sb-tore"]/ul', "Substitution": '//*[@id="sb-wechsel"]/ul', "Card": '//*[@id="sb-karten"]/ul'}

    # Iterate through each event type and extract data
    for event_type, xpath in event_types.items():
        events_data = extract_events(xpath, event_type, home_club_name, away_club_name)
        all_events_data.extend(events_data)

    # Create the DataFrame with a fixed column order ('Timestamp' comes third, after 'Club' and 'H/A')
    if all_events_data:  # Matches without any event contribute no dataframe
        events_df = pd.DataFrame(all_events_data)
        columns_order = ['Club', 'H/A', 'Timestamp', 'Event', 'Player Event', 'Remark Event', 'Player Assist', 'Player Out', 'Match ID']
        events_df = events_df[columns_order]
        all_events_dfs.append(events_df)
    
    # Progress message, printed for every match whether or not events were found
    print(f"Scraping completed for match ID: {match_id}")

# Release the browser before any file I/O so a failed save cannot leave Chrome running
driver.quit()

# Check if all_events_dfs is not empty before attempting to concatenate
if all_events_dfs:
    # Concatenate all per-match events dataframes
    final_events_df = pd.concat(all_events_dfs, ignore_index=True)

    # Save the dataframe to a CSV file for persistence (written once, only when data exists)
    final_events_df.to_csv('data/match_events_2023_2024_1.csv', index=False)

    # Report success only when something was actually scraped and saved
    print("Webscraping successfully completed for all matches.")
else:
    # Bind an empty frame so later cells never pick up a stale final_events_df
    # from an earlier run (or hit a NameError on a fresh kernel).
    final_events_df = pd.DataFrame()
    print("No data was scraped.")


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Scraping completed for match ID: 4089693
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089694
Iframe not found. Continuing after a couple of seconds...
No Goal events found on the page.
Scraping completed for match ID: 4089695
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089696
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089697
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089698
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089699
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089700
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089701
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4089702
Iframe not found. Continuing after

In [16]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Inclusive range of match-report IDs to scrape
start_match_id = 4244785 # 4244791 First Game ID of the season
end_match_id = 4244789 #4244838 # Adjust this according to your requirement

# Per-match events DataFrames, concatenated after the loop
all_events_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Construct the URL for the current match ID
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/index/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Wait for page to load
    time.sleep(2)

    # Dismiss the consent dialog if it is shown (best effort)
    try:
        wait = WebDriverWait(driver, 2)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()
    except TimeoutException:
        print("Iframe not found. Continuing after a couple of seconds...")
    except Exception as exc:
        # Still best effort, but no longer a bare except: KeyboardInterrupt and
        # SystemExit now propagate so the run can be interrupted.
        print(f"Consent dialog could not be dismissed ({type(exc).__name__}). Continuing...")
    finally:
        # Always return to the main document; if a wait fails after switching
        # into the iframe, every later lookup would otherwise search inside it.
        driver.switch_to.default_content()

    ## SCRAPING ##

    # Extracting club names; skip the match when the page lacks them
    # (instead of aborting the whole run and losing the matches already scraped)
    try:
        home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[1]/div[1]/div[2]/nobr/a').get_attribute("title")
        away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[2]/div[1]/div[2]/nobr/a').get_attribute("title")
    except NoSuchElementException:
        print(f"Club names not found for match ID: {match_id}. Skipping this match.")
        continue

    # Function to convert pixel values to minutes based on the pattern provided
    def convert_px_to_minute(x_px, y_px):
        """Convert a pair of pixel offsets into a minute label such as "13'".

        Each offset may be an int or a string like "-36px": everything except
        digits and minus signs is removed and the sign is then discarded.
        Every 36 px of the x offset adds one minute (counting from 1) and
        every 36 px of the y offset adds ten minutes.

        Returns:
            The label as a string, or None when either offset is None or
            contains no parsable integer (previously this raised ValueError,
            e.g. for the (None, None) that extract_px_from_style can return).
        """
        def _abs_px(value):
            # Normalise one offset to a non-negative int, or None if unusable
            if value is None:
                return None
            cleaned = re.sub(r'[^\d-]', '', str(value))
            try:
                return abs(int(cleaned))
            except ValueError:
                return None

        x_abs = _abs_px(x_px)
        y_abs = _abs_px(y_px)
        if x_abs is None or y_abs is None:
            return None

        unit_minutes = (x_abs // 36) + 1
        ten_minutes = (y_abs // 36) * 10
        return f"{unit_minutes + ten_minutes}'"


    def extract_px_from_style(style_str):
        """Return the first two integer pixel values found in a style string.

        Values are matched as an optional minus sign, digits and the suffix
        "px". When fewer than two such values are present, (None, None) is
        returned instead.
        """
        matches = re.findall(r'-?\d+px', style_str)

        # Guard clause: not enough pixel values to form a pair
        if len(matches) < 2:
            return None, None

        first, second = (int(token.strip('px')) for token in matches[:2])
        return first, second


    # Function to extract events with Remark Event adjustment
    def extract_events(event_type_xpath, event_type, home_club_name, away_club_name):
        """Scrape every event of one kind from the page currently loaded in ``driver``.

        Args:
            event_type_xpath: XPath of the element whose ``<li>`` children are the events.
            event_type: "Substitution" and "Goal" get their own parsing rules; any other
                value is parsed as a card. Stored in the "Event" field, except that
                "Card" is refined into Yellow / Yellow-Red / Direct Red Card.
            home_club_name: Club recorded for items whose class contains "heim".
            away_club_name: Club recorded for all other items.

        Returns:
            A list with one dict per event (keys: Timestamp, Club, H/A, Event,
            Player Event, Remark Event, Player Assist, Player Out, Match ID).
            Empty when the event container is not on the page.

        Relies on ``driver``, ``match_id``, ``extract_px_from_style`` and
        ``convert_px_to_minute`` from the enclosing scope.
        """
        # Only a missing container means "no events of this kind". Per-item lookups
        # are guarded separately below, so one malformed item can no longer discard
        # every event already collected for this kind.
        try:
            events_list = driver.find_element(By.XPATH, event_type_xpath)
        except NoSuchElementException:
            print(f"No {event_type} events found on the page.")
            return []

        events_data = []
        for item in events_list.find_elements(By.TAG_NAME, "li"):
            team = "Home" if "heim" in (item.get_attribute("class") or "") else "Away"
            club = home_club_name if team == "Home" else away_club_name

            # The timestamp is derived from two pixel offsets in the inline style
            # of the first span; keep the event (without timestamp) if unavailable.
            timestamp = None
            try:
                style_str = item.find_element(By.XPATH, ".//div/div[1]/span").get_attribute("style")
                x_px, y_px = extract_px_from_style(style_str or "")
                if x_px is not None and y_px is not None:
                    timestamp = convert_px_to_minute(x_px, y_px)
            except NoSuchElementException:
                pass

            player_event = "N/A"  # Default value if player name is not found
            player_out = None
            remark_event = ""
            player_assist = None

            try:
                # Fetched once and reused by all branches below
                full_text = item.find_element(By.XPATH, ".//div/div[4]").text.strip()
                if event_type == "Substitution":
                    parts = full_text.split('\n')
                    if len(parts) > 1:
                        # Last line: "<player out>[, <remark>]"
                        player_out_parts = parts[-1].split(', ')
                        player_out = player_out_parts[0]
                        if len(player_out_parts) > 1:
                            remark_event = player_out_parts[1]
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/span[1]/a").get_attribute("title")
                else:
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/a").get_attribute("title")
                    if event_type == "Goal":
                        parts = full_text.split(',')
                        if len(parts) > 2:
                            # Second comma-separated field is the remark
                            remark_event = parts[1].strip()
                            if "Assist:" in full_text:
                                player_assist = full_text.split('Assist:')[1].split(',')[0].strip()
                        else:
                            # NOTE(review): with exactly two fields the remark is taken from
                            # the FIRST field — confirm this is intended and not parts[1].
                            remark_event = parts[0].strip() if len(parts) > 1 else ""
                    else:
                        # Cards: text after the last comma, or the whole text if there is none
                        remark_event = full_text.split(',')[-1].strip() if ',' in full_text else full_text
            except NoSuchElementException:
                # Best effort: keep whatever fields were filled before the missing element
                pass

            card_type = event_type  # Default card type is the event type itself
            if event_type == "Card":
                try:
                    card_span_class = item.find_element(By.XPATH, ".//div/div[2]/span").get_attribute("class") or ""
                except NoSuchElementException:
                    card_span_class = ""  # unknown colour: event stays labelled "Card"
                if "gelbrot" in card_span_class:
                    card_type = "Yellow-Red Card"
                elif "gelb" in card_span_class and "rot" not in card_span_class:
                    card_type = "Yellow Card"
                elif "rot" in card_span_class:
                    card_type = "Direct Red Card"

            events_data.append({
                "Timestamp": timestamp,
                "Club": club,
                "H/A": team,
                "Event": card_type,
                "Player Event": player_event,
                "Remark Event": remark_event,
                "Player Assist": player_assist,
                "Player Out": player_out,
                "Match ID": match_id,
            })
        return events_data

    # XPath of the <ul> that holds each kind of event on the match page
    event_types = {
        "Goal": '//*[@id="sb-tore"]/ul',
        "Substitution": '//*[@id="sb-wechsel"]/ul',
        "Card": '//*[@id="sb-karten"]/ul',
    }

    # Collect the rows of every event kind into one flat list (dict order)
    all_events_data = []
    for event_type, xpath in event_types.items():
        all_events_data += extract_events(xpath, event_type, home_club_name, away_club_name)

    # Only build a DataFrame when this match produced rows; 'Timestamp' ends up third
    if all_events_data:
        columns_order = ['Club', 'H/A', 'Timestamp', 'Event', 'Player Event', 'Remark Event', 'Player Assist', 'Player Out', 'Match ID']
        events_df = pd.DataFrame(all_events_data)[columns_order]
        all_events_dfs.append(events_df)

    print(f"Scraping completed for match ID: {match_id}")

# Release the browser before any file I/O so a failed save cannot leave Chrome running
driver.quit()

# Check if all_events_dfs is not empty before attempting to concatenate
if all_events_dfs:
    # Concatenate all per-match events dataframes
    final_events_df = pd.concat(all_events_dfs, ignore_index=True)

    # Save the dataframe to a CSV file for persistence (written once, only when data exists)
    final_events_df.to_csv('data/match_events_2023_2024_2.csv', index=False)

    # Report success only when something was actually scraped and saved
    print("Webscraping successfully completed for all matches.")
else:
    # Bind an empty frame so later cells never pick up a stale final_events_df
    # from an earlier run (or hit a NameError on a fresh kernel).
    final_events_df = pd.DataFrame()
    print("No data was scraped.")


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Scraping completed for match ID: 4244785
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244786
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244787
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244788
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244789
Webscraping successfully completed for all matches.


In [17]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Inclusive range of match-report IDs to scrape
start_match_id = 4244791 # 4244791 First Game ID of the season
end_match_id = 4244838 #4244838 # Adjust this according to your requirement

# Per-match events DataFrames, concatenated after the loop
all_events_dfs = []

# Loop through the range of match IDs
for match_id in range(start_match_id, end_match_id + 1):
    # Construct the URL for the current match ID
    match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/index/spielbericht/{match_id}"

    # Navigate to the match URL
    driver.get(match_url)

    # Wait for page to load
    time.sleep(2)

    # Dismiss the consent dialog if it is shown (best effort)
    try:
        wait = WebDriverWait(driver, 2)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
        driver.switch_to.frame(iframe)
        accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
        accept_button.click()
    except TimeoutException:
        print("Iframe not found. Continuing after a couple of seconds...")
    except Exception as exc:
        # Still best effort, but no longer a bare except: KeyboardInterrupt and
        # SystemExit now propagate so the run can be interrupted.
        print(f"Consent dialog could not be dismissed ({type(exc).__name__}). Continuing...")
    finally:
        # Always return to the main document; if a wait fails after switching
        # into the iframe, every later lookup would otherwise search inside it.
        driver.switch_to.default_content()

    ## SCRAPING ##

    # Extracting club names; skip the match when the page lacks them
    # (instead of aborting the whole run and losing the matches already scraped)
    try:
        home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[1]/div[1]/div[2]/nobr/a').get_attribute("title")
        away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div/div/div[2]/div[1]/div[2]/nobr/a').get_attribute("title")
    except NoSuchElementException:
        print(f"Club names not found for match ID: {match_id}. Skipping this match.")
        continue

    # Function to convert pixel values to minutes based on the pattern provided
    def convert_px_to_minute(x_px, y_px):
        """Convert a pair of pixel offsets into a minute label such as "13'".

        Each offset may be an int or a string like "-36px": everything except
        digits and minus signs is removed and the sign is then discarded.
        Every 36 px of the x offset adds one minute (counting from 1) and
        every 36 px of the y offset adds ten minutes.

        Returns:
            The label as a string, or None when either offset is None or
            contains no parsable integer (previously this raised ValueError,
            e.g. for the (None, None) that extract_px_from_style can return).
        """
        def _abs_px(value):
            # Normalise one offset to a non-negative int, or None if unusable
            if value is None:
                return None
            cleaned = re.sub(r'[^\d-]', '', str(value))
            try:
                return abs(int(cleaned))
            except ValueError:
                return None

        x_abs = _abs_px(x_px)
        y_abs = _abs_px(y_px)
        if x_abs is None or y_abs is None:
            return None

        unit_minutes = (x_abs // 36) + 1
        ten_minutes = (y_abs // 36) * 10
        return f"{unit_minutes + ten_minutes}'"


    def extract_px_from_style(style_str):
        """Return the first two integer pixel values found in a style string.

        Values are matched as an optional minus sign, digits and the suffix
        "px". When fewer than two such values are present, (None, None) is
        returned instead.
        """
        matches = re.findall(r'-?\d+px', style_str)

        # Guard clause: not enough pixel values to form a pair
        if len(matches) < 2:
            return None, None

        first, second = (int(token.strip('px')) for token in matches[:2])
        return first, second


    # Function to extract events with Remark Event adjustment
    def extract_events(event_type_xpath, event_type, home_club_name, away_club_name):
        """Scrape every event of one kind from the page currently loaded in ``driver``.

        Args:
            event_type_xpath: XPath of the element whose ``<li>`` children are the events.
            event_type: "Substitution" and "Goal" get their own parsing rules; any other
                value is parsed as a card. Stored in the "Event" field, except that
                "Card" is refined into Yellow / Yellow-Red / Direct Red Card.
            home_club_name: Club recorded for items whose class contains "heim".
            away_club_name: Club recorded for all other items.

        Returns:
            A list with one dict per event (keys: Timestamp, Club, H/A, Event,
            Player Event, Remark Event, Player Assist, Player Out, Match ID).
            Empty when the event container is not on the page.

        Relies on ``driver``, ``match_id``, ``extract_px_from_style`` and
        ``convert_px_to_minute`` from the enclosing scope.
        """
        # Only a missing container means "no events of this kind". Per-item lookups
        # are guarded separately below, so one malformed item can no longer discard
        # every event already collected for this kind.
        try:
            events_list = driver.find_element(By.XPATH, event_type_xpath)
        except NoSuchElementException:
            print(f"No {event_type} events found on the page.")
            return []

        events_data = []
        for item in events_list.find_elements(By.TAG_NAME, "li"):
            team = "Home" if "heim" in (item.get_attribute("class") or "") else "Away"
            club = home_club_name if team == "Home" else away_club_name

            # The timestamp is derived from two pixel offsets in the inline style
            # of the first span; keep the event (without timestamp) if unavailable.
            timestamp = None
            try:
                style_str = item.find_element(By.XPATH, ".//div/div[1]/span").get_attribute("style")
                x_px, y_px = extract_px_from_style(style_str or "")
                if x_px is not None and y_px is not None:
                    timestamp = convert_px_to_minute(x_px, y_px)
            except NoSuchElementException:
                pass

            player_event = "N/A"  # Default value if player name is not found
            player_out = None
            remark_event = ""
            player_assist = None

            try:
                # Fetched once and reused by all branches below
                full_text = item.find_element(By.XPATH, ".//div/div[4]").text.strip()
                if event_type == "Substitution":
                    parts = full_text.split('\n')
                    if len(parts) > 1:
                        # Last line: "<player out>[, <remark>]"
                        player_out_parts = parts[-1].split(', ')
                        player_out = player_out_parts[0]
                        if len(player_out_parts) > 1:
                            remark_event = player_out_parts[1]
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/span[1]/a").get_attribute("title")
                else:
                    player_event = item.find_element(By.XPATH, ".//div/div[4]/a").get_attribute("title")
                    if event_type == "Goal":
                        parts = full_text.split(',')
                        if len(parts) > 2:
                            # Second comma-separated field is the remark
                            remark_event = parts[1].strip()
                            if "Assist:" in full_text:
                                player_assist = full_text.split('Assist:')[1].split(',')[0].strip()
                        else:
                            # NOTE(review): with exactly two fields the remark is taken from
                            # the FIRST field — confirm this is intended and not parts[1].
                            remark_event = parts[0].strip() if len(parts) > 1 else ""
                    else:
                        # Cards: text after the last comma, or the whole text if there is none
                        remark_event = full_text.split(',')[-1].strip() if ',' in full_text else full_text
            except NoSuchElementException:
                # Best effort: keep whatever fields were filled before the missing element
                pass

            card_type = event_type  # Default card type is the event type itself
            if event_type == "Card":
                try:
                    card_span_class = item.find_element(By.XPATH, ".//div/div[2]/span").get_attribute("class") or ""
                except NoSuchElementException:
                    card_span_class = ""  # unknown colour: event stays labelled "Card"
                if "gelbrot" in card_span_class:
                    card_type = "Yellow-Red Card"
                elif "gelb" in card_span_class and "rot" not in card_span_class:
                    card_type = "Yellow Card"
                elif "rot" in card_span_class:
                    card_type = "Direct Red Card"

            events_data.append({
                "Timestamp": timestamp,
                "Club": club,
                "H/A": team,
                "Event": card_type,
                "Player Event": player_event,
                "Remark Event": remark_event,
                "Player Assist": player_assist,
                "Player Out": player_out,
                "Match ID": match_id,
            })
        return events_data

    # XPath of the <ul> that holds each kind of event on the match page
    event_types = {
        "Goal": '//*[@id="sb-tore"]/ul',
        "Substitution": '//*[@id="sb-wechsel"]/ul',
        "Card": '//*[@id="sb-karten"]/ul',
    }

    # Collect the rows of every event kind into one flat list (dict order)
    all_events_data = []
    for event_type, xpath in event_types.items():
        all_events_data += extract_events(xpath, event_type, home_club_name, away_club_name)

    # Only build a DataFrame when this match produced rows; 'Timestamp' ends up third
    if all_events_data:
        columns_order = ['Club', 'H/A', 'Timestamp', 'Event', 'Player Event', 'Remark Event', 'Player Assist', 'Player Out', 'Match ID']
        events_df = pd.DataFrame(all_events_data)[columns_order]
        all_events_dfs.append(events_df)

    print(f"Scraping completed for match ID: {match_id}")

# Release the browser before any file I/O so a failed save cannot leave Chrome running
driver.quit()

# Check if all_events_dfs is not empty before attempting to concatenate
if all_events_dfs:
    # Concatenate all per-match events dataframes
    final_events_df = pd.concat(all_events_dfs, ignore_index=True)

    # Save the dataframe to a CSV file for persistence (written once, only when data exists)
    final_events_df.to_csv('data/match_events_2023_2024_3.csv', index=False)

    # Report success only when something was actually scraped and saved
    print("Webscraping successfully completed for all matches.")
else:
    # Bind an empty frame so later cells never pick up a stale final_events_df
    # from an earlier run (or hit a NameError on a fresh kernel).
    final_events_df = pd.DataFrame()
    print("No data was scraped.")


The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Scraping completed for match ID: 4244791
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244792
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244793
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244794
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244795
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244796
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244797
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244798
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244799
Iframe not found. Continuing after a couple of seconds...
Scraping completed for match ID: 4244800
Iframe not found. Continuing after a couple of seconds...
Scraping c

In [None]:
# Display final_events_df as the cell output; it holds whatever the most recently
# executed events-scraping cell left in the kernel (depends on execution order).
final_events_df

In [18]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Define the start and end match IDs (inclusive range)
start_match_id = 4089693 # First Game ID of the season
end_match_id = 4089824 # Adjust this according to your requirement

# One dict of match-level information per successfully scraped match
matches_info = []

try:
    # Loop through the range of match IDs
    for match_id in range(start_match_id, end_match_id + 1):
        # Construct the URL for the current match ID
        match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

        # Navigate to the match URL
        driver.get(match_url)

        # Wait for page to load
        time.sleep(2)

        try:
            # Wait for the consent iframe to be present and switch to it
            wait = WebDriverWait(driver, 1)
            iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
            driver.switch_to.frame(iframe)

            # Now wait for the 'Accept & continue' button to be clickable inside the iframe
            accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
            accept_button.click()

        except TimeoutException:
            # If the iframe (or its button) doesn't appear in time, carry on after a short pause
            print("Iframe not found. Continuing after a couple of seconds...")
            time.sleep(1)  # Adjust the time delay as needed

        finally:
            # Always return to the main document: if the button wait times out after
            # switching into the iframe, the lookups below would otherwise search
            # inside the iframe and report the match as "not found".
            driver.switch_to.default_content()

        # Extract match information
        try:
            # League plus home and away club names, read from the "title" attributes
            league_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[1]/div/div[2]/h2/span/a').get_attribute("title")
            home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
            away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")

            # XPath for the result of the game
            result_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/div/div/div')
            result = result_element.text

            # Append match information to the list
            matches_info.append({
                'Match ID': match_id,
                'Home Team': home_club_name,
                'Away Team': away_club_name,
                'Result': result,
                'League': league_name
            })

        except NoSuchElementException:
            print(f"Match information not found for match ID: {match_id}")
finally:
    # Close the browser even when the loop is interrupted or raises,
    # so no Chrome process is left running
    driver.quit()

# Convert the list of match information into a DataFrame
matches_df = pd.DataFrame(matches_info)

# Print a success message after scraping all matches
print("Match information successfully extracted.")

# Finally, save the dataframe to a CSV file for persistence
matches_df.to_csv('data/matches_info_2023_2024_1.csv', index=False)

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")
    

The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not fou

In [19]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Define the start and end match IDs (inclusive range)
start_match_id = 4244785 # 4244791 First Game ID of the season
end_match_id = 4244789 #4244838 # Adjust this according to your requirement

# One dict of match-level information per successfully scraped match
matches_info = []

try:
    # Loop through the range of match IDs
    for match_id in range(start_match_id, end_match_id + 1):
        # Construct the URL for the current match ID
        match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

        # Navigate to the match URL
        driver.get(match_url)

        # Wait for page to load
        time.sleep(2)

        try:
            # Wait for the consent iframe to be present and switch to it
            wait = WebDriverWait(driver, 1)
            iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
            driver.switch_to.frame(iframe)

            # Now wait for the 'Accept & continue' button to be clickable inside the iframe
            accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
            accept_button.click()

        except TimeoutException:
            # If the iframe (or its button) doesn't appear in time, carry on after a short pause
            print("Iframe not found. Continuing after a couple of seconds...")
            time.sleep(1)  # Adjust the time delay as needed

        finally:
            # Always return to the main document: if the button wait times out after
            # switching into the iframe, the lookups below would otherwise search
            # inside the iframe and report the match as "not found".
            driver.switch_to.default_content()

        # Extract match information
        try:
            # League plus home and away club names, read from the "title" attributes
            league_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[1]/div/div[2]/h2/span/a').get_attribute("title")
            home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
            away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")

            # XPath for the result of the game
            result_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/div/div/div')
            result = result_element.text

            # Append match information to the list
            matches_info.append({
                'Match ID': match_id,
                'Home Team': home_club_name,
                'Away Team': away_club_name,
                'Result': result,
                'League': league_name
            })

        except NoSuchElementException:
            print(f"Match information not found for match ID: {match_id}")
finally:
    # Close the browser even when the loop is interrupted or raises,
    # so no Chrome process is left running
    driver.quit()

# Convert the list of match information into a DataFrame
matches_df = pd.DataFrame(matches_info)

# Print a success message after scraping all matches
print("Match information successfully extracted.")

# Finally, save the dataframe to a CSV file for persistence
matches_df.to_csv('data/matches_info_2023_2024_2.csv', index=False)

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")
    

The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Match information successfully extracted.
Webscraping successfully completed for all matches.


In [20]:
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Define the start and end match IDs (inclusive range)
start_match_id = 4244791 # 4244791 First Game ID of the season
end_match_id = 4244838 #4244838 # Adjust this according to your requirement

# One dict of match-level information per successfully scraped match
matches_info = []

try:
    # Loop through the range of match IDs
    for match_id in range(start_match_id, end_match_id + 1):
        # Construct the URL for the current match ID
        match_url = f"https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/{match_id}"

        # Navigate to the match URL
        driver.get(match_url)

        # Wait for page to load
        time.sleep(2)

        try:
            # Wait for the consent iframe to be present and switch to it
            wait = WebDriverWait(driver, 1)
            iframe = wait.until(EC.presence_of_element_located((By.ID, "sp_message_iframe_953358")))
            driver.switch_to.frame(iframe)

            # Now wait for the 'Accept & continue' button to be clickable inside the iframe
            accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
            accept_button.click()

        except TimeoutException:
            # If the iframe (or its button) doesn't appear in time, carry on after a short pause
            print("Iframe not found. Continuing after a couple of seconds...")
            time.sleep(1)  # Adjust the time delay as needed

        finally:
            # Always return to the main document: if the button wait times out after
            # switching into the iframe, the lookups below would otherwise search
            # inside the iframe and report the match as "not found".
            driver.switch_to.default_content()

        # Extract match information
        try:
            # League plus home and away club names, read from the "title" attributes
            league_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[1]/div/div[2]/h2/span/a').get_attribute("title")
            home_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[1]/a[2]').get_attribute("title")
            away_club_name = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[3]/a[2]').get_attribute("title")

            # XPath for the result of the game
            result_element = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[1]/div/div/div[2]/div[2]/div/div/div')
            result = result_element.text

            # Append match information to the list
            matches_info.append({
                'Match ID': match_id,
                'Home Team': home_club_name,
                'Away Team': away_club_name,
                'Result': result,
                'League': league_name
            })

        except NoSuchElementException:
            print(f"Match information not found for match ID: {match_id}")
finally:
    # Close the browser even when the loop is interrupted or raises,
    # so no Chrome process is left running
    driver.quit()

# Convert the list of match information into a DataFrame
matches_df = pd.DataFrame(matches_info)

# Print a success message after scraping all matches
print("Match information successfully extracted.")

# Finally, save the dataframe to a CSV file for persistence
matches_df.to_csv('data/matches_info_2023_2024_3.csv', index=False)

# Print a success message after scraping all matches
print("Webscraping successfully completed for all matches.")
    

The chromedriver version (121.0.6167.85) detected in PATH at C:\Users\moren\Downloads\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (122.0.6261.129); currently, chromedriver 122.0.6261.128 is recommended for chrome 122.*, so it is advised to delete the driver in PATH and retry


Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not found. Continuing after a couple of seconds...
Iframe not fou

In [21]:
# Display the DataFrame of match information as the cell's output
matches_df

Unnamed: 0,Match ID,Home Team,Away Team,Result,League
0,4244791,FC Lausanne-Sport,Yverdon Sport FC,3:1\n(3:1),Super League
1,4244792,FC St. Gallen 1879,FC Winterthur,2:2\n(1:0),Super League
2,4244793,Grasshopper Club Zurich,FC Basel 1893,2:1\n(2:1),Super League
3,4244794,BSC Young Boys,FC Stade-Lausanne-Ouchy,1:0\n(0:0),Super League
4,4244795,FC Luzern,FC Zürich,0:1\n(0:0),Super League
5,4244796,Servette FC,FC Lugano,2:1\n(0:1),Super League
6,4244797,FC St. Gallen 1879,FC Stade-Lausanne-Ouchy,1:0\n(0:0),Super League
7,4244798,Yverdon Sport FC,FC Basel 1893,0:2\n(0:1),Super League
8,4244799,Grasshopper Club Zurich,FC Luzern,0:1\n(0:0),Super League
9,4244800,FC Lugano,FC Zürich,2:0\n(1:0),Super League
