# Coach Level Scraping

As a first draft, the following code extracts the data from the Super League game **Servette FC - Lugano (23.12.2023, Result = 2:2)**

## Line-Ups per Game

**Page Link:** https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/4089797 

**Description:** Shows the line-up of the home team and the opposing team, their substitutes, and statistics such as the players' average age and market value.



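We aim to extract the following attributes for every player listed in each game:
- Position
- Player
- Age
- Market Value
- Team (Home / Away)
- Type (Starting / Substitute)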



In [2]:
import time
import os
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Specify the full path to the ChromeDriver executable
chrome_driver_path = "C:/Users/arnol/Downloads/chromedriver-win64/chromedriver.exe" #insert your own path here #User moreno: 'moren'

# PATH entries must be directories, not files, so register the folder that
# contains the executable rather than the executable itself
chrome_driver_directory = os.path.dirname(chrome_driver_path)

# Add the ChromeDriver directory to the PATH environment variable
os.environ["PATH"] += os.pathsep + chrome_driver_directory


In [5]:
## PAGE NAVIGATION ##
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Navigate to the tm page. With Selenium's default page load strategy, get()
# blocks until the document has loaded, and the explicit wait below polls for
# the consent banner, so no fixed sleep is needed here.
driver.get('https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/4089797')

# Wait for the cookie-consent iframe and switch into it in one step.
# The iframe is matched on the id prefix instead of the full id
# ("sp_message_iframe_953358"): the numeric suffix is presumably a message id
# that can change, which would make an exact-id lookup time out.
wait = WebDriverWait(driver, 10)
wait.until(EC.frame_to_be_available_and_switch_to_it(
    (By.CSS_SELECTOR, "iframe[id^='sp_message_iframe_']")
))

# Now wait for the 'Accept & continue' button to be clickable inside the iframe
accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
accept_button.click()

# Switch back to the main document so the line-up tables can be located
driver.switch_to.default_content()





## SCRAPING ## 

# Function to extract data from a table given its rows
def extract_table_data(table_rows, rows_per_player=3):
    """Build a DataFrame with one record per player from a line-up table.

    Args:
        table_rows: Indexable sequence of row elements. Each row must offer
            ``find_elements(By.TAG_NAME, "td")`` returning cells that expose
            their content through a ``.text`` attribute.
        rows_per_player: Number of consecutive rows that make up one player's
            data set; only the first row of each set is read. Defaults to 3.

    Returns:
        pandas.DataFrame with the columns 'Position', 'Player', 'Age' and
        'Market Value'. 'Age' holds the digits found in front of
        "years old" as a string, or None when no age is present.
        'Market Value' is None when the position cell contains no ", ".
    """
    age_pattern = re.compile(r'(\d+) years old')
    records = []

    for index in range(0, len(table_rows), rows_per_player):
        cells = table_rows[index].find_elements(By.TAG_NAME, "td")
        # Columns 1 and 4 are read below; a row without them (for example a
        # header or spacer row) carries no player data and is skipped
        # instead of raising IndexError.
        if len(cells) < 5:
            continue

        # Second column: "<name> (<age> years old)". The age is searched in
        # everything after the first " (" so it is still found when the text
        # contains more than one parenthesis.
        player_name, _, age_part = cells[1].text.partition(' (')
        age_match = age_pattern.search(age_part)

        # Fifth column: "<position>, <market value>". Splitting once from the
        # right keeps the trailing item as the market value and cannot fail
        # on unpacking when the text contains more than one ", ".
        position, separator, market_value = cells[4].text.rpartition(', ')
        if not separator:
            # No comma: the whole text is the position, no market value given
            position, market_value = market_value, None

        records.append({
            'Position': position,
            'Player': player_name.strip(),
            'Age': age_match.group(1) if age_match else None,
            'Market Value': market_value,
        })

    # Passing the columns explicitly keeps the layout stable for empty tables
    return pd.DataFrame(
        records,
        columns=['Position', 'Player', 'Age', 'Market Value'],
    )

# XPath of each line-up table on the page, keyed by "<type>_<team>"
tables_xpaths = {
    'starting_lineup_home': '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table', 
    'substitutes_home': '//*[@id="main"]/main/div[6]/div[1]/div/div[1]/table',
    'starting_lineup_away': '//*[@id="main"]/main/div[5]/div[2]/div/div[1]/table',
    'substitutes_away': '//*[@id="main"]/main/div[6]/div[2]/div/div[1]/table'
}

all_tables_df = []

try:
    for key, value in tables_xpaths.items():
        table = driver.find_element(By.XPATH, value)
        rows = table.find_elements(By.TAG_NAME, "tr")
        df = extract_table_data(rows)
        # Team and line-up type are encoded in the dictionary key
        df['Team'] = 'Home' if 'home' in key else 'Away'
        df['Type'] = 'Starting' if 'starting' in key else 'Substitute'
        all_tables_df.append(df)
finally:
    # Close the driver in every case, so a table that cannot be located or
    # parsed does not leave a browser window running
    driver.quit()

# Combine all dataframes
combined_df = pd.concat(all_tables_df, ignore_index=True)

# Convert 'Age' to a nullable integer, so missing or malformed ages become <NA>
combined_df['Age'] = pd.to_numeric(combined_df['Age'], errors='coerce').astype('Int64')

# Print a success message
print("Webscraping successfully completed")

# Display the combined DataFrame
combined_df.head()






## WRAP UP ##

# Closing the driver, converting 'Age' and printing the success message are
# all done at the end of the scraping step above, on `combined_df`. They are
# not repeated here: a second pass would act on an already closed driver and
# on `df`, which only holds the last table scraped.

# Display the combined DataFrame (last expression of the cell, so it is rendered)
combined_df.head()

Webscraping successfully completed
Webscraping successfully completed


Unnamed: 0,Position,Player,Age,Market Value,Team,Type
0,Goalkeeper,Jérémy Frick,30,€500k,Home,Starting
1,Centre-Back,Yoan Severin,26,€1.10m,Home,Starting
2,Centre-Back,Steve Rouiller,33,€200k,Home,Starting
3,Left-Back,Bradley Mazikou,27,€1.00m,Home,Starting
4,Right-Back,Keigo Tsunemoto,25,€900k,Home,Starting


In [29]:
## PAGE NAVIGATION ##
# Initialize the Chrome driver
driver = webdriver.Chrome()

# Navigate to the tm page. With Selenium's default page load strategy, get()
# blocks until the document has loaded, and the explicit wait below polls for
# the consent banner, so no fixed sleep is needed here.
driver.get('https://www.transfermarkt.com/servette-fc_fc-lugano/aufstellung/spielbericht/4089797')

# Wait for the cookie-consent iframe and switch into it in one step.
# The iframe is matched on the id prefix instead of the full id
# ("sp_message_iframe_953358"): the numeric suffix is presumably a message id
# that can change, which would make an exact-id lookup time out.
wait = WebDriverWait(driver, 10)
wait.until(EC.frame_to_be_available_and_switch_to_it(
    (By.CSS_SELECTOR, "iframe[id^='sp_message_iframe_']")
))

# Now wait for the 'Accept & continue' button to be clickable inside the iframe
accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'accept')]")))
accept_button.click()

# Switch back to the main document so the line-up table can be located
driver.switch_to.default_content()





## SCRAPING ##

## SCRAPING THE TABLE HOME TEAM STARTING LINE UP ##

# Initialize lists to store each column's data
positions = []
players = []
ages = []
market_values = []

try:
    # Find the table by its XPath
    table = driver.find_element(By.XPATH, '//*[@id="main"]/main/div[5]/div[1]/div/div[1]/table')

    # Locate all rows of the table
    rows = table.find_elements(By.TAG_NAME, "tr")

    # Loop through every third row starting from row 0
    for i in range(0, len(rows), 3):  # Increment by 3 to jump to the next set of player data
        cells = rows[i].find_elements(By.TAG_NAME, "td")

        # Columns 1 and 4 are read below; a row without them carries no
        # player data and is skipped instead of raising IndexError
        if len(cells) < 5:
            continue

        # Extracting player's name and age from the second column. The age is
        # searched in everything after the first " (" so it is still found
        # when the text contains more than one parenthesis.
        player_name, _, age_part = cells[1].text.partition(' (')
        age_match = re.search(r'(\d+) years old', age_part)
        age = age_match.group(1) if age_match else None

        # Extracting position and market value from the fifth column (index 4).
        # Splitting once from the right cannot fail on unpacking when the
        # text contains more than one ", ".
        position, separator, market_value = cells[4].text.rpartition(', ')
        if not separator:
            # Assume the whole string is the position if no comma is present
            position, market_value = market_value, None

        # Appending to lists
        players.append(player_name.strip())
        ages.append(age)
        positions.append(position)
        market_values.append(market_value)
finally:
    # Close the driver in every case, so a failed lookup does not leave a
    # browser window running
    driver.quit()

# Creating the DataFrame (built once, so the age conversion below is kept)
df = pd.DataFrame({
    'Position': positions,
    'Player': players,
    'Age': ages,
    'Market Value': market_values
})

# Convert 'Age' to a nullable integer, so missing or malformed ages become <NA>
df['Age'] = pd.to_numeric(df['Age'], errors='coerce').astype('Int64')

# Print a success message
print("Webscraping successfully completed")

# Display the DataFrame
df.head()

Guguseli
Webscraping successfully completed


Unnamed: 0,Position,Player,Age,Market Value
0,Goalkeeper,Jérémy Frick,30,€500k
1,Centre-Back,Yoan Severin,26,€1.10m
2,Centre-Back,Steve Rouiller,33,€200k
3,Left-Back,Bradley Mazikou,27,€1.00m
4,Right-Back,Keigo Tsunemoto,25,€900k


## Match Sheet (optional)

Page Link: https://www.transfermarkt.com/servette-fc_fc-lugano/index/spielbericht/4089797

Description:


## Match Statistics (optional)

Page Link: https://www.transfermarkt.com/servette-fc_fc-lugano/statistik/spielbericht/4089797

Description: