In [4]:
# Import packages
import os
import pandas as pd
import numpy as np
import bs4 as bs
import urllib.request
#from google.colab import drive
from datetime import datetime

In [None]:
def atp_web_scr(soup: object, urls: dict, headers: dict) -> list:
  """
  Summary: Read each link and create table with scraped data

  Row 0 of the result is the header row: 'Date' followed by the stripped
  text of every <th> found in the "mega-table" table of `soup`. Every
  following row belongs to one <tr> (that contains <td> cells) of the
  "mega-table" table downloaded from one of the urls: the date key first,
  then one value per <td>.

  Scraping stops at the first url that cannot be downloaded or parsed; the
  rows collected up to that point are still returned. A downloaded page
  without a "mega-table" table is skipped.

  Parameters: object (web source), dict (key: date, item: url), dict (headers)
  Return: list of rows (each row is a list), or None when `urls` is empty /
          None or when the header row cannot be read from `soup`
  """

  if not urls:
    print("No data")
    return None

  try:
    header_cells = soup.find("table", {"class" : "mega-table"}).find_all('th')
    header_row = ['Date'] + [cell.get_text().strip() for cell in header_cells]
  except (AttributeError, TypeError):
    # soup is None / not a parsed page, or the page has no "mega-table" table
    print("No header row")
    return None

  atp_data = [header_row] # Head row (0)

  for key, value in urls.items():
    try:
      request = urllib.request.Request(url = value, headers = headers)
      # The context manager closes the HTTP response even if read() fails
      with urllib.request.urlopen(request) as response:
        data = response.read()
      page = bs.BeautifulSoup(data, 'lxml')
    except ConnectionResetError:
      print("Connection Reset Error")
      break
    except Exception as error:
      # Exception (not a bare except) so that interrupting the kernel still
      # stops the scrape instead of being silently swallowed
      print("Request failed for", key, "-", error)
      break

    table = page.find("table", {"class" : "mega-table"})
    if table is None:
      print("No ranking table for", key)
      continue

    for table_row in table.find_all('tr'):
      table_columns = table_row.find_all('td')
      if table_columns: # rows without <td> cells carry no data
        atp_data.append(_atp_parse_row(key, table_columns))

  return atp_data


def _atp_parse_row(key: str, table_columns: list) -> list:
  """Build one output row: the date key followed by one value per <td> cell."""
  parsed_row = [key]

  for column_num, column in enumerate(table_columns):
    cell_value = column.get_text().strip()

    if column_num == 1:
      # Second cell: prefix the text with a sign taken from the marker <div>;
      # without a marker the value is missing
      if column.find('div', {'class' : 'move-up'}):
        cell_value = "+" + cell_value
      elif column.find('div', {'class' : 'move-down'}):
        cell_value = "-" + cell_value
      else:
        cell_value = np.nan # np.NaN alias no longer exists in NumPy 2.0
    elif column_num == 2:
      # Third cell: use the alt text of its <img>; keep the cell text when
      # there is no image (or no alt attribute)
      image = column.find('img')
      if image is not None and image.has_attr('alt'):
        cell_value = image['alt']

    parsed_row.append(cell_value)

  return parsed_row

In [None]:
def get_dates(soup: object) -> dict:
  """
  Summary: Check dates and create dict (key: date, item: url)

  Reads every <li> inside the <ul data-value="rankDate"> element of `soup`,
  parses its text as a 'YYYY.MM.DD' date and maps the date, reformatted as
  'YYYY-MM-DD', to the rankings url for that date (rankRange=1-5000).

  Parameters: object (web source)
  Return: dict (key: date, item: url), or None when the list is missing or
          one of its entries is not a 'YYYY.MM.DD' date
  """
  url_template = 'https://www.atptour.com/en/rankings/singles?rankDate={}&rankRange=1-5000'

  try:
    dates = soup.find("ul", {"data-value" : "rankDate"}).find_all('li')
    urls = {}
    for item in dates:
      # Parse once and reuse the formatted date for both the key and the url
      date = datetime.strptime(item.get_text().strip(), '%Y.%m.%d').strftime('%Y-%m-%d')
      urls[date] = url_template.format(date)
  except Exception:
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit propagate
    print("Error while scraping dates")
    return None

  return urls

In [None]:
# Download the rankings landing page, parse it, then scrape every ranking date

agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
headers={'User-Agent': agent} 
url = 'https://www.atptour.com/en/rankings/singles'

# Bind soup up front so a failed download leaves it as None instead of
# undefined (which would raise NameError further down)
soup = None
try:
  request=urllib.request.Request(url = url, headers = headers) 
  # The context manager closes the HTTP response once the body is read
  with urllib.request.urlopen(request) as response:
    data = response.read() 
  soup = bs.BeautifulSoup(data,'lxml')
except Exception as error:
  # Show the cause as well: the failure may come from parsing, not the network
  print("No connection:", error)

if soup is None:
  # Nothing was downloaded, so there is nothing to scrape
  urls = None
  data_atp = None
else:
  urls = get_dates(soup)
  data_atp = atp_web_scr(soup, urls, headers)

### Create a Data Frame & Export to CSV

In [None]:
# Build the frame in one chain: drop the all-empty rows, promote the first
# row to column titles, then remove that first row from the data
df = (
  pd.DataFrame(data_atp)
  .dropna(how = 'all')
  .pipe(lambda table: table.rename(columns = table.iloc[0]))
  .drop([0])
)

In [None]:
# Strip the comma from the values of both points columns
for points_column in ('Points', 'Points Dropping'):
  df[points_column] = df[points_column].str.replace(',', '')

In [None]:
# Export df to CSV
output_path = "data/atp_ranking.csv"
# to_csv does not create missing folders; make sure the target folder exists
# so the scraped data is not lost to an OSError at the very last step
os.makedirs(os.path.dirname(output_path), exist_ok = True)
df.to_csv(output_path, index = False)
print("Done")