In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome WebDriver session, using whatever ChromeDriverManager().install() returns
# as the argument to ChromeService.
# NOTE(review): the WebDriverManager class below opens its own session and the run cell
# never references this `driver`, so this looks like an extra browser left open — confirm
# whether this cell is still needed.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [3]:
import time

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

<h1>Project Description</h1>
<h2>Project Name: Tennis Match Data Scraper</h2>

<span>Description: This project is a Python web scraping tool designed to gather tennis match and player data from the Flashscore website. It utilizes the Selenium and BeautifulSoup libraries to automate the process of collecting data from the website. The primary goal is to retrieve match details, including player names, tournament information, and match IDs, as well as player information such as ranking, age, and country. The collected data is then stored in Pandas DataFrames for further analysis.</span>

In [4]:
class WebDriverManager:
    """Owns one Chrome WebDriver session and wraps the navigation calls the scrapers use.

    Creating an instance starts Chrome, opens the Flashscore tennis page and
    makes a best-effort attempt to click the cookie-consent button.
    Call ``close()`` when finished to quit the browser.
    """

    def __init__(self):
        self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
        self.driver.get("https://www.flashscore.com/tennis/")
        self.accept_cookies()

    def _click_if_present(self, by, value):
        """Click the first element matching the locator.

        Args:
            by: Selenium locator strategy (a ``By`` constant).
            value: Locator value for that strategy.

        Returns:
            True if an element was found and clicked, False otherwise.
        """
        try:
            self.driver.find_element(by, value).click()
        except Exception:
            # Deliberately broad: the overlay being absent (or not clickable)
            # is an expected situation and must not abort the scrape.
            return False
        return True

    def accept_cookies(self):
        """Click the cookie-consent button if it is on the current page; otherwise do nothing."""
        self._click_if_present(By.ID, "onetrust-accept-btn-handler")

    def accept_page(self):
        """Click the modal close button if it is on the current page; otherwise do nothing.

        The button carries two classes (``close`` and ``modal__closeButton``), so it
        is located with a CSS selector; a class-name locator takes only a single class.
        """
        self._click_if_present(By.CSS_SELECTOR, ".close.modal__closeButton")

    def get_page_source(self, url, wait=2):
        """Navigate to ``url``, pause, and return the page's HTML source.

        Args:
            url: Address to load.
            wait: Seconds to sleep after navigation before reading the source
                (defaults to the original fixed 2-second pause).

        Returns:
            The driver's ``page_source`` string.
        """
        self.driver.get(url)
        time.sleep(wait)
        return self.driver.page_source

    def get(self, url):
        """Navigate to ``url`` and return the underlying driver."""
        self.driver.get(url)
        return self.driver

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

<h3>Matches Class:</h3>

Purpose: Extracts match-related information, including match IDs, player names, tournament details, and more.
Methods:

<ul>
<li><code>__init__(self, driver_manager)</code>: Initializes the Matches class with a WebDriverManager instance.</li>
<li><code>get_match_ids(self)</code>: Scrapes and returns a list of match IDs from the main tennis page on Flashscore.</li>
<li><code>scrape_matches(self)</code>: Scrapes match details, including player names, tournament, and player IDs, and stores them in a Pandas DataFrame.</li>
</ul>

In [23]:
class Matches:
    """Scrapes Flashscore tennis matches into a DataFrame with one row per participant.

    Attributes:
        driver_manager: Object providing ``get_page_source(url)``.
        match_data: DataFrame filled by ``scrape_matches()``; empty until then.
    """

    # Column order of ``match_data``; applied even when nothing was scraped so
    # downstream column lookups (e.g. 'Link_player') never hit a missing column.
    COLUMNS = ['Match_ID', 'Player', 'Tournament', 'Link_match', 'Link_player',
               'Home/Away', 'Player_ID']

    def __init__(self, driver_manager):
        self.driver_manager = driver_manager
        self.match_data = pd.DataFrame()

    def get_match_ids(self):
        """Return the match IDs found on the main tennis page.

        A match ID is whatever follows the ``g_2_`` prefix in the ``id`` attribute
        of a ``<div>`` on that page.
        """
        page_source = self.driver_manager.get_page_source("https://www.flashscore.com/tennis/")
        soup = BeautifulSoup(page_source, "html.parser")
        prefix = "g_2_"
        match_divs = soup.find_all('div', id=lambda value: value and value.startswith(prefix))
        # Slice the prefix off instead of splitting on "_" so an ID that itself
        # contains an underscore is kept whole.
        return [div['id'][len(prefix):] for div in match_divs]

    def scrape_matches(self, max_matches=2):
        """Scrape participant rows for the first matches on the page into ``match_data``.

        Args:
            max_matches: How many match pages to visit. Defaults to 2, the size
                of the demo sample; pass ``None`` to visit every match found.

        A match whose page cannot be parsed is reported with ``print`` and
        skipped; rows already collected for it are kept.
        """
        match_ids = self.get_match_ids()
        if max_matches is not None:
            match_ids = match_ids[:max_matches]

        rows = []
        for match_id in match_ids:
            match_url = f"https://www.flashscore.com/match/{match_id}/#/match-summary"
            page_source = self.driver_manager.get_page_source(match_url)
            soup = BeautifulSoup(page_source, "html.parser")

            try:
                header = soup.find('span', {"class": "tournamentHeader__country"})
                if header is None:
                    raise ValueError("tournament header not found")
                tournament = header.text
                players = soup.find_all('a', {'class': 'participant__participantLink'})

                for number, player in enumerate(players):
                    player_info = self.extract_player_info(player)
                    # strip() drops the trailing space left when the slug has no first name.
                    player_name = f"{player_info['last_name']} {player_info['first_name']}".strip()

                    rows.append({
                        'Match_ID': match_id,
                        'Player': player_name,
                        'Tournament': tournament,
                        'Link_match': match_url,
                        'Link_player': "https://www.flashscore.com" + player['href'],
                        # Participants alternate: even positions are Home, odd are Away.
                        'Home/Away': "Home" if number % 2 == 0 else "Away",
                        'Player_ID': player_info['player_id']
                    })
            except Exception as e:
                print(f"Error scraping match {match_id}: {str(e)}")

        self.match_data = pd.DataFrame(rows, columns=self.COLUMNS)

    def extract_player_info(self, player):
        """Split a participant link into name parts and player ID.

        Args:
            player: Mapping-like element whose ``'href'`` has the form
                ``/player/<slug>/<player_id>/``.

        Returns:
            Dict with ``last_name`` (slug text before the first hyphen),
            ``first_name`` (slug text after it, or ``''`` when the slug has no
            hyphen) — both capitalized — and ``player_id``.
        """
        parts = player['href'].split("/")
        player_id = parts[3]
        # partition() tolerates a slug without a hyphen, which tuple-unpacking a split() does not.
        last_name, _, first_name = parts[2].partition("-")
        return {
            'last_name': last_name.capitalize(),
            'first_name': first_name.capitalize(),
            'player_id': player_id,
        }


<h3>Players Class:</h3>

Purpose: Collects player-specific information such as ranking, age, and country.

Methods:
<ul>
<li><code>__init__(self, driver_manager, match)</code>: Initializes the Players class with a WebDriverManager instance and the match DataFrame produced by <code>Matches.scrape_matches()</code>.</li>
<li><code>collect_players(self)</code>: Collects player data by visiting individual player pages, extracting relevant information, and storing it in a Pandas DataFrame.</li>
</ul>

In [7]:
class Players:
    """Collects per-player details (ranking, age, country) from Flashscore player pages.

    Attributes:
        driver_manager: Object providing ``get_page_source(url)`` and ``accept_page()``.
        matches: Match DataFrame; its ``'Link_player'`` column lists the pages to visit.
        players_data: DataFrame filled by ``collect_players()``; empty until then.
    """

    # Column order of ``players_data``; applied even when no player was scraped.
    COLUMNS = ['Player_ID', 'Full Name', 'Ranking', 'Age', 'Country']

    def __init__(self, driver_manager, match):
        self.driver_manager = driver_manager
        self.players_data = pd.DataFrame()
        self.matches = match

    @staticmethod
    def _text(soup, tag, css_class):
        """Return the text of the first ``tag`` with ``css_class``, or None if absent."""
        node = soup.find(tag, {'class': css_class})
        return None if node is None else node.text

    @staticmethod
    def _parse_ranking(text):
        """Return the text between the first ':' and the next '.', stripped; None if no ':'."""
        if text is None or ":" not in text:
            return None
        return text.split(":")[1].split(".")[0].strip()

    @staticmethod
    def _parse_age(text):
        """Return the first space-delimited token after the first non-breaking space, or None."""
        if text is None:
            return None
        pieces = text.split("\xa0")
        if len(pieces) < 2:
            return None
        return pieces[1].split(" ")[0]

    def collect_players(self):
        """Visit each distinct player link and store one row per player in ``players_data``.

        A field that is missing from the page is stored as None rather than
        discarding the whole player. Any other failure for a player is reported
        with ``print`` and that player is skipped.
        """
        # Tolerate a match frame with no 'Link_player' column (e.g. nothing was scraped).
        if 'Link_player' in self.matches:
            # A player appearing in several matches only needs to be fetched once.
            player_links = self.matches['Link_player'].drop_duplicates()
        else:
            player_links = []

        records = []
        for player_link in player_links:
            time.sleep(2)
            try:
                page_source = self.driver_manager.get_page_source(player_link)
                time.sleep(1)
                self.driver_manager.accept_page()
                soup = BeautifulSoup(page_source, "html.parser")

                records.append({
                    # Link shape: https://www.flashscore.com/player/<slug>/<player_id>/
                    'Player_ID': player_link.split("/")[5],
                    'Full Name': self._text(soup, 'div', 'heading__name'),
                    'Ranking': self._parse_ranking(self._text(soup, 'span', 'participant-detail-rank')),
                    'Age': self._parse_age(self._text(soup, 'span', 'jsl-age')),
                    'Country': self._text(soup, 'span', 'breadcrumb__text'),
                })
            except Exception as e:
                print(f"Error scraping player {player_link}: {str(e)}")

        self.players_data = pd.DataFrame(records, columns=self.COLUMNS)

Running program:

In [35]:
# Run the full pipeline: open the browser, scrape the match rows, then scrape
# the page of every player that appears in them.
driver_manager = WebDriverManager()
try:
    matches = Matches(driver_manager)
    matches.scrape_matches()
    match = matches.match_data
    players = Players(driver_manager, match)
    players.collect_players()
finally:
    # Always quit Chrome, even if scraping fails, so re-running the notebook
    # does not accumulate orphaned browser sessions. The scraped DataFrames
    # (`match`, `players.players_data`) remain available afterwards.
    driver_manager.close()

<h5>The results below were scraped from a small sample for presentation only; the code above can scrape all of today's matches.</h5>

In [28]:
# Display the match rows collected by matches.scrape_matches() (one row per participant).
match

Unnamed: 0,Match_ID,Player,Tournament,Link_match,Link_player,Home/Away,Player_ID
0,fZDViX0f,Evans Daniel,ATP - SINGLES: Davis Cup - World Group (World),https://www.flashscore.com/match/fZDViX0f/#/ma...,https://www.flashscore.com/player/evans-daniel...,Home,Ya9jkSU5
1,fZDViX0f,Arthur Fils,ATP - SINGLES: Davis Cup - World Group (World),https://www.flashscore.com/match/fZDViX0f/#/ma...,https://www.flashscore.com/player/arthur-fils/...,Away,IoIhUqIN
2,6FMNs8fp,Ajdukovic Duje,ATP - SINGLES: Davis Cup - World Group (World),https://www.flashscore.com/match/6FMNs8fp/#/ma...,https://www.flashscore.com/player/ajdukovic-du...,Home,n11bz7QK
3,6FMNs8fp,Van De-zandschulp-botic,ATP - SINGLES: Davis Cup - World Group (World),https://www.flashscore.com/match/6FMNs8fp/#/ma...,https://www.flashscore.com/player/van-de-zands...,Away,YwiLpILD


In [36]:
# Display the per-player rows built by players.collect_players().
players.players_data

Unnamed: 0,Player_ID,Full Name,Ranking,Age,Country
0,Ya9jkSU5,Daniel Evans,27,33,United Kingdom
1,IoIhUqIN,Arthur Fils,44,19,France
2,n11bz7QK,Duje Ajdukovic,206,22,Croatia
3,YwiLpILD,Botic Van De Zandschulp,68,27,Netherlands


<h3>This project will be extended to collect complete match data and to present the resulting statistics in Power BI.</h3>