In [1]:
import json
import os
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def download_webpage(url, output_filename, timeout=30):
    """
    Downloads the HTML content of a webpage and saves it to a file.

    Nothing is written unless the server answers with status code 200.
    Network errors and non-200 responses are reported with ``print`` rather
    than raised, so a bulk download loop can carry on with the next URL.

    Args:
      url: The URL of the webpage to download.
      output_filename: The name of the file to save the HTML content to.
        Missing parent directories are created; a bare file name (no
        directory part) is written to the current working directory.
      timeout: Seconds to wait for the server before giving up. Without a
        timeout a single stalled connection would block forever.

    Returns:
      True if the page was downloaded and saved, False otherwise.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection failures, timeouts, invalid URLs, ...
        print(f"Failed to download {url}. Error: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download {url}. Status code: {response.status_code}")
        return False

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises, so only create the directory when there is one to create.
    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(response.text)
    return True

# Bulk download: fetch the page of every player listed in player_names.json
# that is not already saved under output_dir.
base_url = "http://www.tennisabstract.com/cgi-bin/player.cgi?p="
output_dir = "tennis_abstract"
max_downloads = 15000  # upper bound on new pages fetched per run of this cell

with open('player_names.json', 'r') as file:
    player_names = json.load(file)

# For a quick trial run, override the list:
# player_names = ["RogerFederer", "RafaelNadal"]

# Create the directory up front so os.listdir() also works on a first run.
os.makedirs(output_dir, exist_ok=True)

# Paths of pages that are already on disk. They are built with the same
# "/"-joined format as output_filename below, so the membership test does not
# depend on the platform's path separator.
existing_files = {f"{output_dir}/{f}" for f in os.listdir(output_dir)}

player_counter = 0  # downloads attempted during this run
flag = False        # set once at least one download was attempted; not read in this cell
for name in tqdm(player_names):
    output_filename = f"{output_dir}/{name}.html"
    if output_filename in existing_files:
        continue
    if player_counter >= max_downloads:
        print(f"Reached download limit of {max_downloads:,}. Stopping.")
        break
    download_webpage(f"{base_url}{name}", output_filename)
    player_counter += 1
    flag = True


100%|██████████| 17693/17693 [01:09<00:00, 254.13it/s]
100%|██████████| 17928/17928 [01:21<00:00, 219.17it/s]
100%|██████████| 18167/18167 [01:21<00:00, 222.29it/s]
  0%|          | 83/18247 [00:00<02:21, 128.15it/s]


KeyboardInterrupt: 

In [2]:
# Number of entries in player_names (as loaded from player_names.json).
len(player_names)

18247

In [3]:
# Peek at the first entry to see what a name looks like.
player_names[0]

'CarlosAlcaraz'

In [117]:
# Load api_player_ids_names.json and display how many entries it holds.
with open('api_player_ids_names.json', 'r') as file:
    api_players_lnames = json.load(file)
len(api_players_lnames)

17693

In [None]:
# Split the bets players into those whose space-stripped name occurs exactly
# once in ta_names and those whose name does not occur there at all.
# NOTE(review): `ta_names` is not defined in any visible cell; it has to exist
# in the kernel already - confirm where it is created.
with open('bets_player_ids_names.json', 'r') as file:
    bets_players = json.load(file)
with open('api_p_keys_to_bets_api_p_keys.json', 'r') as file:
    p_keys_map = json.load(file)

# Keep only the bets players whose key appears as a value of p_keys_map.
# The values are collected into a set once instead of being scanned linearly
# for every player.
mapped_bets_keys = set(p_keys_map.values())
bets_players = {k: n for k, n in bets_players.items() if k in mapped_bets_keys}
bets_player_names = bets_players.values()

# Names are compared with all spaces removed.
bets_formatted_names = [(k, n.replace(' ', '')) for k, n in bets_players.items()]

ta_counter = Counter(ta_names)
ta_name_set = set(ta_names)  # constant-time membership tests

# A name that occurs more than once in ta_names is ambiguous and lands in
# neither list.
bets_names_in_ta = [
    (k, n) for k, n in bets_formatted_names
    if n in ta_name_set and ta_counter[n] == 1
]
bets_names_not_in_ta = [(k, n) for k, n in bets_formatted_names if n not in ta_name_set]
len(bets_names_in_ta), len(bets_names_not_in_ta)


(307, 14)

In [123]:
# Invert p_keys_map so that its values can be translated back to its keys.
bets_p_keys_to_api_p_keys = {v: k for k, v in p_keys_map.items()}
api_p_keys_to_ta_names = {bets_p_keys_to_api_p_keys[k]: v for k, v in bets_names_in_ta}

with open('api_p_keys_to_ta_names.json', 'r') as file:
    current_map = json.load(file)

# Fold the previously saved mapping into the freshly computed one. A key that
# is present in both must map to the same name; keys only present in the saved
# file are carried over.
for k, v in current_map.items():
    if k in api_p_keys_to_ta_names:
        assert api_p_keys_to_ta_names[k] == v, (
            f"conflict for key {k!r}: computed {api_p_keys_to_ta_names[k]!r}, saved {v!r}"
        )
    else:
        api_p_keys_to_ta_names[k] = v

bets_names_in_ta = [(p_keys_map[k], v) for k, v in api_p_keys_to_ta_names.items()]

# Drop every unmatched entry whose key is now matched. The matched keys are
# gathered once into a set rather than rebuilt as a list for every entry.
matched_bets_keys = {key for key, _name in bets_names_in_ta}
bets_names_not_in_ta = [(k, v) for k, v in bets_names_not_in_ta if k not in matched_bets_keys]
bets_names_not_in_ta

[('197700', 'CarlosAlcarazGarfia'),
 ('349209', 'JamesMcCabe'),
 ('2079', 'RobertoBautista-Agut'),
 ('3532', "ChristopherO'Connell"),
 ('26980', 'MackenzieMcDonald'),
 ('247884', 'MurkelDellien'),
 ('208721', 'SoonwooKwon'),
 ('2033', 'AlbertRamos-Vinolas'),
 ('47505', 'Tung-LinWu')]

In [149]:
# Hand-resolved names for the entries that were still unmatched.
manual_additions = [
    ('197700', 'CarlosAlcaraz'),
    ('349209', 'JamesMccabe'),
    ('2079', 'RobertoBautistaAgut'),
    ('3532', "ChristopherOconnell"),
    ('26980', 'MackenzieMcdonald'),
    ('247884', 'MurkelAlejandroDellienVelasco'),
    ('208721', 'SoonWooKwon'),
    ('2033', 'AlbertRamos'),
    ('47505', 'TungLinWu')
]

# De-duplicate so that re-running this cell does not add the pairs twice.
# sorted() instead of list(): iteration order of a set of strings changes
# between kernel sessions, which would reshuffle everything derived from
# this list.
bets_names_in_ta = sorted(set(bets_names_in_ta + manual_additions))

# Remove the entries that are matched now; collect the matched keys once
# instead of rebuilding the key list for every entry.
matched_bets_keys = {key for key, _name in bets_names_in_ta}
bets_names_not_in_ta = [(k, v) for k, v in bets_names_not_in_ta if k not in matched_bets_keys]
len(bets_names_in_ta), len(bets_names_not_in_ta)

(321, 0)

In [150]:
# Look up how often a single name occurs in ta_counter (missing names give 0).
# NOTE(review): this key looks like a paste accident - it reads as
# 'MurkelAlejandroDellienVelasco' inserted into the middle of 'MurkelDellien' -
# so the result is probably not the check that was intended; confirm the key.
ta_counter['MurkelDellieMurkelAlejandroDellienVelascon']

0

In [154]:
# Sanity-check every (key, name) pair in bets_names_in_ta: the key must be a
# known bets player, the name must occur in ta_names, and the pair must agree
# with the saved mapping wherever that mapping has an entry for the player.
for k, v in bets_names_in_ta:
    assert k in bets_players, f"key {k!r} is not in bets_players"
    assert v in ta_names, f"name {v!r} (key {k!r}) is not in ta_names"
    # current_map is indexed by the inverted key, not by k itself.
    api_key = bets_p_keys_to_api_p_keys[k]
    if api_key in current_map:
        assert current_map[api_key] == v, (
            f"key {k!r} / {api_key!r}: saved name {current_map[api_key]!r} != {v!r}"
        )

In [156]:
# Persist both views of the final mapping, keyed by each of the two key kinds.
api_p_keys_to_ta_names = {bets_p_keys_to_api_p_keys[k]: v for k, v in bets_names_in_ta}
bets_p_keys_to_ta_names = dict(bets_names_in_ta)

# sort_keys keeps the files stable from run to run no matter in which order
# bets_names_in_ta happens to be, so regenerated files diff cleanly.
with open('bets_p_keys_to_ta_names.json', 'w', encoding='utf-8') as file:
    json.dump(bets_p_keys_to_ta_names, file, indent=4, sort_keys=True)
with open('api_p_keys_to_ta_names.json', 'w', encoding='utf-8') as file:
    json.dump(api_p_keys_to_ta_names, file, indent=4, sort_keys=True)