<a href="https://colab.research.google.com/github/jeanbouteiller-ds/tennis_prediction/blob/main/functions_fixed_table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import subprocess
import sys

In [6]:
def install_package(package_name):
    """Install ``package_name`` with pip, using the interpreter that runs this notebook.

    A one-line success or failure message is printed; a failing pip call
    (non-zero exit status) is reported, not raised.
    """
    pip_command = [sys.executable, '-m', 'pip', 'install', package_name]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        print(f"Failed to install {package_name}")
    else:
        print(f"Successfully installed {package_name}")

# `requests` is imported in the next cell; install it into this kernel's environment first.
install_package("requests")
# install_package("PyGitHub")
# install_package("pandas")

Successfully installed requests


In [7]:
import requests
import random
from bs4 import BeautifulSoup

from datetime import datetime, timedelta

import os
import io
# from github import Github

import importlib.util
import nbformat
# import pandas

In [8]:
# Raw GitHub URLs of the helper notebooks whose code cells are executed into
# this notebook's namespace (functions used further down, such as
# html_elements_from_url, are not defined in this notebook itself).
github_functions_url = "https://raw.githubusercontent.com/jeanbouteiller-ds/tennis_prediction/main/github_functions.ipynb"
scraping_functions_url = "https://raw.githubusercontent.com/jeanbouteiller-ds/tennis_prediction/main/scraping_functions.ipynb"

for file_functions in [github_functions_url,scraping_functions_url]:
  # Download the notebook as a raw .ipynb file
  response = requests.get(file_functions)
  if response.status_code == 200:
    notebook_content = response.text

    # Parse the notebook content
    notebook = nbformat.reads(notebook_content, as_version=4)

    # NOTE(security): exec() runs code fetched over the network with full
    # access to this kernel; only point these URLs at repositories you trust.
    for cell in notebook.cells:
        if cell.cell_type == 'code':
            exec(cell.source)
  else:
    # A failed download used to be skipped silently, which only surfaced later
    # as a NameError on the first missing helper; report it where it happens.
    print(f"Failed to download {file_functions} (HTTP status {response.status_code})")

Successfully installed requests
Successfully installed PyGitHub
Successfully installed pandas
Successfully installed requests


# Get the list of all players and their associated urls

In [10]:
def get_list_ranking_dates(start_date, end_date):
    """Return every Monday between ``start_date`` and ``end_date`` (both inclusive).

    Needed because the ranking url on the ATP website is updated every Monday.

    Args:
        start_date: first day of the range; a ``date``/``datetime``-like object
            (``weekday()``, ``strftime()`` and ``timedelta`` arithmetic are used on it).
        end_date: last day of the range, comparable with ``start_date``.

    Returns:
        list of 'YYYY-MM-DD' strings, one per Monday, in chronological order
        (empty when the range contains no Monday).
    """
    # weekday() is 0 on a Monday, so this is the distance to the first Monday
    # of the range (0 when start_date already is one)
    days_to_first_monday = (7 - start_date.weekday()) % 7
    monday = start_date + timedelta(days=days_to_first_monday)
    one_week = timedelta(weeks=1)

    mondays = []
    while monday <= end_date:
        mondays.append(monday.strftime('%Y-%m-%d'))
        monday += one_week
    return mondays

def find_list_weekly_ranking_url(start_date, end_date,nb_players_ranking):
  """Return the ATP singles ranking url of every week between two dates.

  ``start_date`` and ``end_date`` are handed unchanged to
  ``get_list_ranking_dates``, which calls ``weekday()``/``strftime()`` on
  them, so they have to be date objects; it is the dates embedded in the
  urls that are 'YYYY-MM-DD' strings.

  Args:
      start_date: first day of the period.
      end_date: last day of the period.
      nb_players_ranking: upper bound of the rank range requested (``1-N``).

  Returns:
      list of urls, one per Monday of the period, in chronological order.
  """
  # everything but the date is identical for all weeks, so build it once
  url_prefix=('https://www.atptour.com/en/rankings/singles?rankRange=1-'
              +str(nb_players_ranking)+'&rankDate=')
  return [url_prefix+date_ranking
          for date_ranking in get_list_ranking_dates(start_date, end_date)]



In [11]:
def players_from_url(weekly_ranking_url):
  """Extract the players listed on one weekly ranking page.

  Every ``span`` of class 'player-cell-wrapper' returned by
  ``html_elements_from_url`` is searched for its first ``<a>``; spans
  without one are reported with a printed message and skipped.

  Args:
      weekly_ranking_url: url of the ranking page to scrape.

  Returns:
      tuple ``(names, urls)`` of two index-aligned lists: the player name as
      it appears in the link (the path segment right after "players") and
      the link itself prefixed with 'https://www.atptour.com'.
  """
  spans_by_class=html_elements_from_url(weekly_ranking_url,'span',['player-cell-wrapper'])

  player_names=[]
  player_urls=[]
  for wrapper in spans_by_class['player-cell-wrapper']:
    link=wrapper.find('a')
    if not link:
      print("No <a> element found within the <span>.")
      continue

    # the href is prefixed with the site root (presumably it is site-relative)
    profile_url='https://www.atptour.com'+link.get('href')
    name_in_url=profile_url.split('players')[1].split('/')[1]
    player_names.append(name_in_url)
    player_urls.append(profile_url)

  return (player_names,player_urls)

#
def update_names_and_urls(weekly_ranking_url,list_all_names,list_all_urls):
  """Extend the accumulated names/urls with the ones first seen on a weekly ranking page.

  Names and urls are filtered independently: a name is added when it is not
  yet in ``list_all_names``, a url when it is not yet in ``list_all_urls``.
  New entries are appended in the order they appear on the page, so the
  result is the same on every run and names and urls added together stay
  side by side (a plain set difference would return them in arbitrary,
  unrelated orders).

  Args:
      weekly_ranking_url: url of the ranking page to scrape.
      list_all_names: player names collected so far (not modified).
      list_all_urls: player urls collected so far (not modified).

  Returns:
      tuple ``(names, urls)`` of two new lists: the inputs followed by the
      newly discovered entries.
  """
  names_and_urls_week=players_from_url(weekly_ranking_url)
  known_names=set(list_all_names)
  known_urls=set(list_all_urls)

  # dict.fromkeys drops duplicates inside the page while keeping first-seen order
  names_to_add=[name for name in dict.fromkeys(names_and_urls_week[0]) if name not in known_names]
  urls_to_add=[url for url in dict.fromkeys(names_and_urls_week[1]) if url not in known_urls]

  return (list_all_names+names_to_add,
          list_all_urls+urls_to_add)

In [12]:
def create_initial_list(start_date,end_date,nb_players_ranking=200):
  """Collect every player that appears in a weekly ranking between two dates.

  Each weekly ranking page of the period is scraped and the players not seen
  before are accumulated. The urls are then re-paired with the names through
  the name embedded in each url (the path segment after '/players/'), so
  that ``urls[k]`` belongs to ``names[k]``.

  Args:
      start_date: first day of the period (passed to find_list_weekly_ranking_url).
      end_date: last day of the period.
      nb_players_ranking: number of ranked players requested per week.

  Returns:
      tuple ``(names, urls)`` of two lists of the same length. When several
      collected urls carry the same name, the one collected last is used;
      a name without any matching url gets the placeholder ``0``.
  """
  list_all_names=[]
  list_all_urls=[]

  for weekly_url_ranking in find_list_weekly_ranking_url(start_date, end_date,nb_players_ranking):
    list_all_names,list_all_urls=update_names_and_urls(weekly_url_ranking,list_all_names,list_all_urls)

  # Index the urls by the player name they contain: one pass over the urls
  # instead of rescanning all of them for every name. Later urls overwrite
  # earlier ones, i.e. the last match wins.
  url_by_name={}
  for player_url in list_all_urls:
    url_by_name[player_url.split('/players/')[1].split('/')[0]]=player_url

  # One entry per NAME, so both returned lists always have the same length.
  list_all_urls_final=[url_by_name.get(player_name,0) for player_name in list_all_names]

  return (list_all_names,list_all_urls_final)




# Get Players Data

In [13]:
def find_stats_from_span(url,span_names_list=['table-height-cm-wrapper','table-weight-lbs','table-birthday'],
               feature_names=['height','weight (lbs)','Birthdate']):
  """Read player statistics stored in ``span`` elements of a page.

  The spans of the classes in ``span_names_list`` are fetched with
  ``html_elements_from_url``. Features are matched to spans BY POSITION: the
  k-th name of ``feature_names`` takes the text of the first element of the
  k-th value of the returned mapping, so this relies on the mapping's values
  being in the same order as ``feature_names``.

  Args:
      url: url of the page to scrape.
      span_names_list: classes of the spans to look for.
      feature_names: key under which each span's text is stored.

  Returns:
      dict mapping each feature name to the span text with all spaces and
      '\\r\\n' sequences removed.
  """
  spans_by_class=html_elements_from_url(url,'span',span_names_list)
  span_groups=list(spans_by_class.values())

  stats={}
  for position,feature in enumerate(feature_names):
    raw_text=span_groups[position][0].get_text()
    stats[feature]=raw_text.replace(' ','').replace('\r\n','')
  return stats

In [14]:
def find_stats_from_div(url,div_dict={'Turned Pro':['table-big-label','table-big-value'],
                                      'Plays':['table-label','table-value']}):
  """Read player statistics stored as label/value ``div`` pairs of a page.

  Args:
      url: url of the page to scrape.
      div_dict: maps the label text to look for to a pair
          ``[class of the label div, class of the value div]``.

  Returns:
      dict mapping each label that was found to the text of the next sibling
      ``div`` of the value class, with all spaces and '\\r\\n' sequences
      removed. Labels that are not found are absent from the result; when a
      label matches several divs, the last one wins.
  """
  page=parse_html_from_url(url)

  stats={}
  for label,div_classes in div_dict.items():
    label_class=div_classes[0]
    value_class=div_classes[1]

    for label_div in page.find_all('div', class_=label_class):
      # several labels share one class: keep only the div whose text holds this label
      if label not in label_div.get_text():
        continue
      value_div=label_div.find_next_sibling('div', class_=value_class)
      stats[label]=value_div.get_text().replace(' ','').replace('\r\n','')

  return stats


def find_all_stats(url):
  """Return the div-based and span-based statistics of a player page as one dict.

  The div statistics come first; a key present in both is taken from the
  span statistics.
  """
  all_stats=dict(find_stats_from_div(url))
  all_stats.update(find_stats_from_span(url))
  return all_stats

In [15]:
import pandas as pd
import concurrent.futures

def fetch_player_stats(url, player_name):
    """Fetch the statistics of one player without ever raising.

    Meant to be run for many players in parallel: a failure on one page must
    not abort the others, so any exception is turned into a ``None`` result.
    The failure is printed so that missing players can be traced back to
    their cause instead of disappearing silently.

    Args:
        url: url of the player's page.
        player_name: name returned alongside the statistics.

    Returns:
        tuple ``(player_name, stats)`` where ``stats`` is the dict returned by
        ``find_all_stats``, or ``None`` when fetching or parsing failed.
    """
    try:
        return (player_name, find_all_stats(url))
    except Exception as e:
        # best effort: report and carry on with the other players
        print(f"Could not fetch stats for {player_name} ({url}): {e!r}")
        return (player_name, None)

#the below function has been created by the chatgpt optimizer
def create_new_fixed_table(file_name, list_player_names, list_urls,write_csv=False):
    """Build the table of fixed (non time-dependent) statistics of the players.

    The player pages are fetched in parallel threads. Players whose
    statistics could not be fetched are left out of the table.

    Args:
        file_name: name under which the csv is uploaded when ``write_csv`` is
            set; an empty/None value falls back to 'df_player_fixed.csv'.
        list_player_names: player names, index-aligned with ``list_urls``.
        list_urls: urls of the player pages.
        write_csv: when true, the table is also uploaded as csv through
            ``add_file_github``.

    Returns:
        DataFrame with one row per successfully fetched player, in the order
        of the input lists: a 'Player Name' column followed by one column per
        statistic.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_player_stats, url, player_name)
                   for url, player_name in zip(list_urls, list_player_names)]
        # Read the results in submission order rather than completion order:
        # the requests still run concurrently, but the row order no longer
        # depends on which page happened to answer first.
        results = [future.result() for future in futures]

    # Flatten each stats dictionary into separate columns
    player_data = [{'Player Name': player_name, **stats}
                   for player_name, stats in results
                   if stats is not None]
    df_player_fixed = pd.DataFrame(player_data)

    if write_csv:
        # file_name used to be ignored in favour of a hard-coded name
        add_file_github(file_name or 'df_player_fixed.csv',
                        df_player_fixed.to_csv(index=False))

    return df_player_fixed


# Match tables functions

In [2]:
def find_matchs(tournament_row):
  """
  Takes a row of the df_tournament.csv and returns the different games.

  The row must provide "url", "tournament_name" and "tournament_date". Both
  the "archive" and the "current" results pages of the tournament are
  requested; when both have content the "current" one is used.

  Returns a list of dicts, one per match, with the keys tournament_name,
  tournament_date, round, winner_name, loser_name and score (whitespace runs
  in the score are replaced by '-'). When neither page has results, the list
  holds a single dict with only tournament_name and tournament_date.
  """
  import re  # imported here because no import cell of this notebook provides `re`

  # ".../tournaments/<path>/overview" + text before the first '.' of the date
  # (presumably the year) -> "/<path>/<year>/results"
  results_path=(tournament_row["url"].split('tournaments')[1].split("overview")[0]
                +tournament_row['tournament_date'].split('.')[0]+'/results')
  tournament_url_archive="https://www.atptour.com/en/scores/archive"+results_path
  tournament_url_current="https://www.atptour.com/en/scores/current"+results_path
  print(tournament_url_archive,tournament_url_current)

  scrapping_tournament_archive=html_elements_from_url(tournament_url_archive,"div",['scores-results-content'])["scores-results-content"]
  scrapping_tournament_current=html_elements_from_url(tournament_url_current,"div",['scores-results-content'])["scores-results-content"]

  tournament_info={"tournament_name":tournament_row["tournament_name"],
                   "tournament_date":tournament_row["tournament_date"]}

  if len(scrapping_tournament_current)>0:
    results_blocks=scrapping_tournament_current
  elif len(scrapping_tournament_archive)>0:
    results_blocks=scrapping_tournament_archive
  else:
    # no results page available: keep a trace of the tournament only
    return [tournament_info]

  results_table=results_blocks[0]
  round_headers=results_table.find_all("th")  # the k-th header names the k-th tbody

  list_matchs_tournament=[]
  for round_index,round_body in enumerate(results_table.find_all("tbody")):
    player_cells=round_body.find_all("td","day-table-name")
    # Scores are searched inside the round, exactly like the names. Searching
    # the whole table with a per-round index gave every round the scores of
    # the first matches of the page.
    score_cells=round_body.find_all("td","day-table-score")
    round_name=round_headers[round_index].text

    # name cells come in pairs: winner first, loser second
    for game_number in range(0,len(player_cells),2):
      raw_score=score_cells[game_number//2].text.strip().replace("\r\n",'')
      list_matchs_tournament.append({
        **tournament_info,
        "round":round_name,
        "winner_name":player_cells[game_number].text.strip(),
        "loser_name":player_cells[game_number+1].text.strip(),
        "score":re.sub(r'\s+','-',raw_score),
      })
  return list_matchs_tournament