In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from webdriver_manager.chrome import ChromeDriverManager
from typing import List

In [2]:
# Instantiate a headless Chrome driver.
def load_driver() -> 'Chrome':
    """Build and return a headless Chrome WebDriver.

    The driver executable path is obtained from
    ``ChromeDriverManager().install()`` and passed to Selenium through a
    ``Service`` object.

    Returns:
        A ``Chrome`` driver started with the ``--headless`` argument. The
        caller owns the driver and should call ``driver.quit()`` when done.
    """
    # Load and configure webdriver.
    options: Options = Options()
    # Stop browser windows from actually popping up.
    options.add_argument('--headless')
    # Resolve the chromedriver executable that Selenium will launch.
    service: Service = Service(executable_path=ChromeDriverManager().install())
    return Chrome(service=service, options=options)

In [9]:
# Helper function that navigates to the search result page for a particular
# month on the USCF page for historical tournament data.
def navigate_to_uscf_page(driver: Chrome, date_to_visit: str) -> None:
    """Submit the USCF rated-events search form for one month, filtered to CA.

    Args:
        driver: Selenium Chrome driver used to load and submit the form.
        date_to_visit: Text typed into the form's ``month`` field. The caller
            in this notebook passes ``MM/YYYY`` strings.

    Returns:
        None. The form is submitted through ``driver``; the function does not
        explicitly wait for the result page to finish loading.
    """
    # Navigate to US Chess tournament search page.
    USCF_URL = 'http://www.uschess.org/datapage/events-rated.php'
    driver.get(USCF_URL)

    # Use the By constants for locators, matching the rest of this notebook.
    date_search_box: WebElement = driver.find_element(By.NAME, 'month')
    date_search_box.clear()
    date_search_box.send_keys(date_to_visit)

    # Select CA as the State Code, which is where all
    # chess.com USCF tournaments are registered.
    state_search_box: WebElement = driver.find_element(By.NAME, 'states')
    state_search_box.clear()
    state_search_box.send_keys('CA')
    # Submitting from the field submits the enclosing search form.
    state_search_box.submit()

In [10]:
# Scrapes tournament urls for the month that the driver is currently
# pointed at. Helper function navigate_to_uscf_page() navigates to the
# correct page.
def scrape_uscf_tournament_urls(driver: Chrome) -> List[str]:
    """Collect the event urls of CHESS.COM tournaments on the current page.

    Reads the third ``<tbody>`` on the page the driver currently shows. For
    each row with at least three cells, the url is the ``href`` of the link
    inside the cell whose text is all digits and longer than 10 characters.
    A row is kept only if some cell's text contains "CHESS.COM"
    (case-insensitive).

    Args:
        driver: Selenium Chrome driver already showing a search result page.

    Returns:
        One url per matching row, in page order. Empty if the page has fewer
        than three ``<tbody>`` elements or no row matches.
    """
    table_bodies: List[WebElement] = driver.find_elements(By.TAG_NAME, 'tbody')
    # The code indexes the third <tbody>; bail out with no results instead of
    # raising IndexError when the page does not contain it.
    if len(table_bodies) < 3:
        return []
    table_body: WebElement = table_bodies[2]
    table_rows: List[WebElement] = table_body.find_elements(By.TAG_NAME, 'tr')

    url_list: List[str] = []
    for row in table_rows:
        # Each row is a WebElement with data about one tournament.
        cells: List[WebElement] = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) < 3:
            continue
        url = None  # href of this row's event-id link, once found
        is_chess_com = False
        for cell in cells:
            # Read the text once; every .text access is a WebDriver call.
            text: str = cell.text
            if text.isnumeric() and len(text) > 10:
                url = cell.find_element(By.TAG_NAME, 'a').get_attribute('href')
            # Only keep tournament urls labeled "CHESS.COM"
            if 'CHESS.COM' in text.upper():
                is_chess_com = True
        # Append after the whole row is scanned so the result does not depend
        # on cell order, never contains None, and has one entry per row.
        if is_chess_com and url is not None:
            url_list.append(url)
    return url_list

In [16]:
import sqlite3
from sqlite3 import Connection, Cursor

def get_cursor(db_path: str = 'scrape_data.db') -> Cursor:
    """Open a SQLite database and return a cursor on it.

    Args:
        db_path: Database file to open. Defaults to ``'scrape_data.db'``,
            the file this notebook stores its scraped data in.

    Returns:
        A cursor on a newly opened connection. The connection is reachable
        as ``cursor.connection``; the caller is responsible for committing
        and closing it.
    """
    # sqlite3.connect is the documented way to open a connection.
    return sqlite3.connect(db_path).cursor()

# Drop then create tables for all data scraped by this module.
def init_db() -> None:
    """Reset the database schema: drop ``uscf_urls`` and create it empty.

    Any existing rows in ``uscf_urls`` are lost. The connection opened by
    ``get_cursor()`` is committed on success and always closed before
    returning, so no database handle is left open.
    """
    cur: Cursor = get_cursor()
    try:
        cur.execute("""DROP TABLE IF EXISTS uscf_urls""")
        # Stores urls to allow us to navigate to all relevant USCF tournaments.
        cur.execute("""CREATE TABLE IF NOT EXISTS uscf_urls (
        id INTEGER PRIMARY KEY,
        date TEXT,
        url TEXT,
        scraped INTEGER
        )""")
        cur.connection.commit()
    finally:
        # Release the connection even if one of the statements fails.
        cur.connection.close()

In [14]:
# Scrapes a list of urls. Each url is a USCF tournament with a parallel entry
# on chess.com. By default every month between 2015 and 2023 inclusive is
# checked.
def scrape_all_uscf_urls(
    driver: Chrome, newest_year: int = 2023, oldest_year: int = 2015
) -> List[str]:
    """Scrape CHESS.COM tournament urls for every month in a range of years.

    Years are visited from ``newest_year`` down to ``oldest_year`` (both
    inclusive), and months 01 through 12 within each year. Each ``MM/YYYY``
    date is printed before its page is visited, as a progress indicator.

    Args:
        driver: Selenium Chrome driver used for all page visits.
        newest_year: First (most recent) year to visit. Defaults to 2023.
        oldest_year: Last (earliest) year to visit. Defaults to 2015.

    Returns:
        All urls returned by ``scrape_uscf_tournament_urls`` for the visited
        months, concatenated in visiting order.
    """
    url_list: List[str] = []
    for year in range(newest_year, oldest_year - 1, -1):
        for month in range(1, 13):
            # Page requires single digit months to have a 0 in front.
            date: str = f'{month:02d}/{year}'
            print(date)
            navigate_to_uscf_page(driver, date)
            url_list.extend(scrape_uscf_tournament_urls(driver))
    return url_list

In [15]:
# Create the headless Chrome driver that is passed to the scraping helpers.
driver: Chrome = load_driver()
# Uncomment to run the full scrape and print the collected urls
# (visits one search page per month of every year in range).
# print(scrape_all_uscf_urls(driver))

01/2023
02/2023
03/2023
04/2023
05/2023
06/2023
07/2023
08/2023
09/2023
10/2023
11/2023
12/2023
01/2022
02/2022
03/2022
04/2022
05/2022
06/2022
07/2022
08/2022
09/2022
10/2022
11/2022
12/2022
['http://www.uschess.org/msa/XtblMain.php?202301023902', 'http://www.uschess.org/msa/XtblMain.php?202301023912', 'http://www.uschess.org/msa/XtblMain.php?202301023922', 'http://www.uschess.org/msa/XtblMain.php?202301023932', 'http://www.uschess.org/msa/XtblMain.php?202301043942', 'http://www.uschess.org/msa/XtblMain.php?202301045042', 'http://www.uschess.org/msa/XtblMain.php?202301065052', 'http://www.uschess.org/msa/XtblMain.php?202301065062', 'http://www.uschess.org/msa/XtblMain.php?202301075072', 'http://www.uschess.org/msa/XtblMain.php?202301075082', 'http://www.uschess.org/msa/XtblMain.php?202301098332', 'http://www.uschess.org/msa/XtblMain.php?202301098342', 'http://www.uschess.org/msa/XtblMain.php?202301098352', 'http://www.uschess.org/msa/XtblMain.php?202301098362', 'http://www.uschess.org

In [8]:
# from datetime import datetime

# STARTING_URL: str = 'https://www.chess.com/tournament/live?&page='

# def go_to_tournament(day, month, year, starting_url):
#     driver.get(starting_url)
#     date_string = driver.find_element(By.CLASS_NAME, 'tournaments-live-date')
#     date = datetime.strptime(date_string.text, '%b %d, %Y, %I:%M %p')

# 0 to 9000 is what chess.com currently allows
# def go_to_right_year(target, url, page) -> bool:
#     driver.get(url + page)
#     date_string = driver.find_element(By.CLASS_NAME, 'tournaments-live-date')
#     date = datetime.strptime(date_string.text, '%b %d, %Y, %I:%M %p')
#     if target < date.year:
#         page = page + step_size
#         step_size = step_size / 2


        