## Notebook for scraping player data from the ATP Tour website

In [51]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup

### Specify the output directory to which the data will be written

In [53]:
# Directory that receives one text file per player/bio section.
# Built from the current user's home directory instead of a hardcoded
# '/Users/<name>' prefix so the notebook runs on any machine.
output_dir = Path.home() / 'Desktop' / 'atp_player_profile_data'
# parents=True: also create missing parent directories (e.g. no ~/Desktop).
output_dir.mkdir(parents=True, exist_ok=True)

### First we get the singles rankings page for the top 200 players

In [54]:
rankings_url = 'https://www.atptour.com/en/rankings/singles?rankRange=1-200'
# Without a timeout requests waits indefinitely on an unresponsive server.
rankings_page = requests.get(rankings_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would leave the player dict built from this response empty.
rankings_page.raise_for_status()

### Extract the link to each player's profile page and store in a dict

In [58]:
# Parse the rankings page and map each player's name to the base path of
# their profile (the href of the player link with '/overview' removed).
soup = BeautifulSoup(rankings_page.content, "html.parser")
player_cells = soup.find_all('span', class_='player-cell-wrapper')
player_profile_paths = {
    cell.a['ga-label']: cell.a['href'].replace('/overview', '')
    for cell in player_cells
}

### Check the dict has the right size and show an example value

In [59]:
# Sanity check: the rankings URL requested rankRange=1-200, so 200 entries are expected.
len(player_profile_paths)

200

In [60]:
# Spot-check one entry; raises KeyError if this player is not in the scraped rankings.
player_profile_paths['Novak Djokovic']

'/en/players/novak-djokovic/d643'

### Helper function to parse the Beautiful Soup tags for the player bio pages

In [61]:
def parse_bio_tag(tag):
    """Extract the bullet-point lines from a player bio section.

    The section text is expected to still contain literal markup such as
    ``<p>``, ``<br />`` and ``&gt;``; a bullet point is any line that starts
    with ``&gt;``. Lines that do not start with that marker are dropped.

    This works for the career and personal pages but not for the
    Year by Year profile page (id="playerBioCareerHighlights").

    Args:
        tag: Object exposing the section text as ``.text`` (e.g. the result
            of ``soup.find(id=...)``), or ``None`` when the section is absent.

    Returns:
        The bullet lines with their leading ``&gt;`` marker(s) and following
        spaces removed, joined by newlines. Returns ``''`` when ``tag`` is
        falsy or the text contains no bullet lines.
    """
    if not tag:
        return ''

    marker = '&gt;'
    text = (tag.text.replace('<p>', '').replace('</p>', '')
                    .replace('<br />', '').replace('\r', '')
                    # Replace the full entity first so no stray ';' is left behind.
                    .replace('&nbsp;', ' ').replace('&nbsp', ' ')
                    .replace('&#39;', "'").replace('&rsquo;', "'"))

    bullets = []
    for line in text.split('\n'):
        if not line.startswith(marker):
            continue
        # Remove the marker as a prefix. str.lstrip('&gt; ') would treat the
        # argument as a character set and also eat leading 'g'/'t'/'&'/';'
        # characters of the actual content (e.g. "good" -> "ood").
        while line.startswith(marker):
            line = line[len(marker):].lstrip(' ')
        bullets.append(line)
    return '\n'.join(bullets)

### Extract the player data and write to files

In [68]:
# Scrape each player's profile sub-pages and write the parsed bio sections
# to text files in output_dir (one file per player and bio section).
pages = ['bio', 'fedex-atp-win-loss', 'titles-and-finals', 'player-stats', 'rankings-history']
# Element ids of the bio sections that parse_bio_tag is applied to.
bio_section_ids = ['playerBioYearInReview', 'playerBioPersonal']
# Seconds to wait for a response; without a timeout a stalled request
# would hang the whole scraping loop indefinitely.
request_timeout = 30

for player, profile_path in player_profile_paths.items():
    print(f'Extracting data for {player}')

    for page in pages:
        url = f'https://www.atptour.com{profile_path}/{page}'
        html = requests.get(url, timeout=request_timeout)
        if not html.ok:
            # Do not parse an error page: it would produce empty output
            # files that look like valid (but blank) player data.
            print(f'  Skipping {url} (HTTP {html.status_code})')
            continue
        soup = BeautifulSoup(html.content, "html.parser")

        if page == 'bio':
            # 'section_id' rather than 'id' so the builtin id() is not
            # shadowed for the rest of the notebook session.
            for section_id in bio_section_ids:
                tag = soup.find(id=section_id)
                text = parse_bio_tag(tag)

                desc = section_id.replace('player', '')
                filename = output_dir / f"{player.replace(' ', '')}_{desc}.txt"
                # Explicit UTF-8: the default encoding is platform dependent
                # and may fail on accented characters in the bio text.
                with open(filename, 'wt', encoding='utf-8') as f:
                    f.write(text)

        elif page == 'fedex-atp-win-loss':
            ...
        elif page == 'titles-and-finals':
            ...
        elif page == 'player-stats':
            ...
        elif page == 'rankings-history':
            ...

Extracting data for Novak Djokovic
Extracting data for Daniil Medvedev
Extracting data for Alexander Zverev
Extracting data for Rafael Nadal
Extracting data for Stefanos Tsitsipas
Extracting data for Carlos Alcaraz
Extracting data for Andrey Rublev
Extracting data for Matteo Berrettini
Extracting data for Felix Auger-Aliassime
Extracting data for Casper Ruud
Extracting data for Cameron Norrie
Extracting data for Hubert Hurkacz
Extracting data for Jannik Sinner
Extracting data for Taylor Fritz
Extracting data for Diego Schwartzman
Extracting data for Denis Shapovalov
Extracting data for Reilly Opelka
Extracting data for Pablo Carreno Busta
Extracting data for Roberto Bautista Agut
Extracting data for Grigor Dimitrov
Extracting data for Gael Monfils
Extracting data for Alex de Minaur
Extracting data for Marin Cilic
Extracting data for Karen Khachanov
Extracting data for Nikoloz Basilashvili
Extracting data for Frances Tiafoe
Extracting data for John Isner
Extracting data for Lorenzo Sone