In [181]:
from typing import Generator, List, Optional, Tuple

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from webdriver_manager.chrome import ChromeDriverManager

In [182]:
def load_driver() -> Chrome:
    """Instantiate a headless Chrome driver.

    Returns:
        A Chrome WebDriver configured with the ``--headless`` flag. The driver
        is not closed here; the caller owns it and should quit it when done.
    """
    # Load and configure webdriver.
    options: Options = Options()
    # Stop browser windows from actually popping up.
    options.add_argument('--headless')
    # ChromeDriverManager().install() returns the path of the driver
    # executable, which is handed to the Selenium service.
    service: Service = Service(executable_path=ChromeDriverManager().install())
    return Chrome(service=service, options=options)

In [183]:
def navigate_to_uscf_page(driver: Chrome, date_to_visit: str) -> None:
    """Open the USCF rated-events search results for one month.

    Loads the US Chess tournament search page, types ``date_to_visit`` into
    the ``month`` field and ``CA`` into the ``states`` field, then submits.

    Args:
        driver: Selenium driver used for navigation; it is left pointing at
            whatever page the submit produces.
        date_to_visit: Text typed into the ``month`` search box.
    """
    driver.get('http://www.uschess.org/datapage/events-rated.php')

    # CA is the State Code where all chess.com USCF tournaments are
    # registered, so it is the only state searched.
    search_fields = (('month', date_to_visit), ('states', 'CA'))

    field = None
    for field_name, text in search_fields:
        field = driver.find_element('name', field_name)
        field.clear()
        field.send_keys(text)

    # Submit from the last field filled in (the state box).
    field.submit()

In [184]:
def scrape_uscf_tournament_urls(driver: Chrome) -> List[str]:
    """Scrape tournament urls for the month the driver is currently pointed at.

    The helper navigate_to_uscf_page() is expected to have navigated to the
    correct search-result page first.

    Args:
        driver: Selenium driver showing a search-result page.

    Returns:
        One url per table row that both links an event id (an all-numeric
        cell longer than 10 characters) and mentions "CHESS.COM" in some
        cell. Rows without a usable link are skipped, so the list never
        contains None, and a row contributes at most one url.

    Raises:
        IndexError: If the page has fewer than three <tbody> elements.
    """
    # The results listing is taken to be the third <tbody> on the page.
    table_body: WebElement = driver.find_elements(By.TAG_NAME, 'tbody')[2]
    url_list: List[str] = []
    for row in table_body.find_elements(By.TAG_NAME, 'tr'):
        # Each row is a WebElement with data about one tournament.
        cells: List[WebElement] = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) < 3:
            continue
        url: Optional[str] = None
        is_chess_com: bool = False
        for cell in cells:
            # Read the text once per cell instead of once per check.
            text: str = cell.text
            if text.isnumeric() and len(text) > 10:
                url = cell.find_element(By.TAG_NAME, 'a').get_attribute('href')
            # Only keep tournament urls labeled "CHESS.COM"
            if 'CHESS.COM' in text.upper():
                is_chess_com = True
        # Decide after the whole row has been scanned so the result does not
        # depend on whether the id cell precedes the label cell.
        if is_chess_com and url is not None:
            url_list.append(url)
    return url_list

In [185]:
import sqlite3
from sqlite3 import Connection, Cursor

def init_db() -> None:
    """Drop then create the tables for all data scraped by this module.

    Works on ``scrape_data.db`` in the current working directory. The tables
    uscf_rounds, uscf_player_observations and uscf_tournaments are dropped
    (losing any rows they held) and recreated empty. The connection is closed
    before returning, including when a statement fails.
    """
    conn: Connection = sqlite3.connect('scrape_data.db')
    try:
        cur: Cursor = conn.cursor()
        # uscf_urls is neither dropped nor created here, so urls that were
        # already scraped survive a re-initialisation. Its expected shape,
        # kept for reference because uscf_tournaments.urls_id points at it:
        #
        # cur.execute("""CREATE TABLE IF NOT EXISTS uscf_urls (
        #     id INTEGER PRIMARY KEY,
        #     date TEXT,
        #     url TEXT,
        #     scraped INTEGER
        #     )""")

        # Drop child tables before the table they reference.
        cur.execute("DROP TABLE IF EXISTS uscf_rounds")
        cur.execute("DROP TABLE IF EXISTS uscf_player_observations")
        cur.execute("DROP TABLE IF EXISTS uscf_tournaments")
        cur.execute("""CREATE TABLE IF NOT EXISTS uscf_tournaments (
            id INTEGER PRIMARY KEY,
            tournament_name TEXT,
            tournament_code INTEGER,
            event_date TEXT,
            received_date TEXT,
            entered_date TEXT,
            rated_date TEXT,
            section_count INTEGER,
            player_count INTEGER,
            k_factor TEXT,
            rating_system TEXT,
            tournament_type TEXT,
            time_control TEXT,
            urls_id INTEGER,
            FOREIGN KEY (urls_id)
                REFERENCES uscf_urls (id)
        )""")
        cur.execute("""CREATE TABLE IF NOT EXISTS uscf_player_observations (
            id INTEGER PRIMARY KEY,
            name TEXT,
            seed_number INTEGER,
            url TEXT,
            uscf_id INTEGER,
            record TEXT,
            state_code TEXT,
            rating_type TEXT,
            before_rating TEXT,
            after_rating TEXT,
            color_assignments TEXT,
            uscf_tournaments_id INTEGER,
            FOREIGN KEY (uscf_tournaments_id)
                REFERENCES uscf_tournaments (id)
        )""")
        cur.execute("""CREATE TABLE IF NOT EXISTS uscf_rounds (
            id INTEGER PRIMARY KEY,
            round_number INTEGER,
            result TEXT,
            opponent INTEGER,
            uscf_tournaments_id INTEGER,
            uscf_player_id INTEGER,
            FOREIGN KEY (uscf_tournaments_id)
                REFERENCES uscf_tournaments (id),
            FOREIGN KEY (uscf_player_id)
                REFERENCES uscf_player_observations (id)
        )""")
        conn.commit()
    finally:
        # Release the database file handle even if a statement failed.
        conn.close()

In [186]:
def scrape_all_uscf_urls(driver: Chrome, cur: Optional[Cursor] = None,
                         first_year: int = 2015, last_year: int = 2023) -> List[str]:
    """Scrape the tournament urls for every month in a range of years.

    Each url is a USCF tournament with a parallel entry on chess.com. With
    the defaults, every month between 2015 and 2023 inclusive is checked,
    newest year first and January to December within a year.

    Args:
        driver: Selenium driver used to visit each monthly search page.
        cur: Unused. Kept (now optional) so existing calls that pass a
            cursor keep working.
        first_year: Earliest year to scrape, inclusive.
        last_year: Latest year to scrape, inclusive.

    Returns:
        The urls from all visited months, in visiting order.
    """
    url_list: List[str] = []
    for year in range(last_year, first_year - 1, -1):
        for month in range(1, 13):
            # Page requires single digit months to have a 0 in front.
            date: str = f'{month:02d}/{year}'
            navigate_to_uscf_page(driver, date)
            url_list.extend(scrape_uscf_tournament_urls(driver))
    return url_list

In [187]:
# driver: Chrome = load_driver()
# url_list: List[str] = scrape_all_uscf_urls(driver)

# url_tuples: List[Tuple] = []
# for url in url_list:
#     url_tuples.append((None, url, 0))

# conn: Connection = sqlite3.Connection('scrape_data.db')
# cur: Cursor = conn.cursor()
# cur.executemany("""INSERT INTO uscf_urls (
#     date, url, scraped
#     ) VALUES (?, ?, ? )""", url_tuples)
# conn.commit()

In [188]:
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
import requests
from requests.models import Response

def get_tournament_page(tournament_id: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download and parse the tournament page stored under one uscf_urls id.

    Looks up the url in the ``uscf_urls`` table of ``scrape_data.db``, fetches
    it, prints the url and the HTTP status code, and parses the body.

    Args:
        tournament_id: Value of ``uscf_urls.id`` for the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The response body parsed with the ``html.parser`` backend. The status
        code is only printed, not checked, so an error page is parsed too.

    Raises:
        IndexError: If no row in uscf_urls has the given id.
    """
    conn: Connection = sqlite3.connect('scrape_data.db')
    try:
        row = conn.execute("SELECT url FROM uscf_urls WHERE id = ?",
                           (tournament_id,)).fetchone()
    finally:
        # The connection is only needed for the lookup; do not leak it.
        conn.close()
    if row is None:
        raise IndexError(f'no row in uscf_urls with id {tournament_id!r}')

    url: str = row[0]
    response: Response = requests.get(url, timeout=timeout)
    print(url)
    print('Status Code:', response.status_code)
    return BeautifulSoup(response.text, 'html.parser')

In [189]:
def extract_tournament_info1(soup: BeautifulSoup) -> List[str]:
    """Pull the first block of tournament metadata out of a tournament page.

    Reads the <td> cells of the first row of the first table whose
    attributes are border=0, bgcolor=FFFFFF, cellpadding=3, cellspacing=0.

    Returns:
        Eight strings, in order: tournament_name, tournament_id, event_date,
        received_date, entered_date, rated_date, section_count, player_count.
        All values are picked by fixed cell and word positions, so a page
        with a different layout raises IndexError or AttributeError.
    """
    summary_tables: ResultSet[Tag] = soup.find_all('table', attrs={
        'border': '0',
        'bgcolor': 'FFFFFF',
        'cellpadding': '3',
        'cellspacing': '0'
    })
    first_row: Tag = summary_tables[0].find_all('tr')[0]
    cells: ResultSet[Tag] = first_row.find_all('td')

    # Cell 3: bold text is the name; the <small> text is the id with one
    # wrapping character on each side, which is sliced off.
    name_cell: Tag = cells[3]
    info: List[str] = [
        name_cell.b.text,
        name_cell.small.text[1:-1],
        cells[7].b.text,
    ]

    # Cell 13: the three dates are words 1, 4 and 7 of the bold text.
    date_words: List[str] = cells[13].b.text.split(' ')
    info += [date_words[1], date_words[4], date_words[7]]

    # Cell 15: first word and second-to-last word of the bold text.
    count_words: List[str] = cells[15].b.text.split(' ')
    info += [count_words[0], count_words[-2]]
    return info

In [190]:
def extract_tournament_info2(soup: BeautifulSoup) -> List[str]:
    """Pull the second block of tournament metadata out of a tournament page.

    Uses the second table whose attributes are border=0, bgcolor=FFFFFF,
    cellpadding=3, cellspacing=0, takes the text of its fourth <b> tag and
    splits it on single spaces.

    Returns:
        Four strings, in order: k_factor, rating_system, tournament_type,
        time_control, taken from words 7, 11, 16 and 20 of that text.
    """
    matching_tables: ResultSet[Tag] = soup.find_all('table', attrs={
        'border': '0',
        'bgcolor': 'FFFFFF',
        'cellpadding': '3',
        'cellspacing': '0'
    })
    bold_tags: ResultSet[Tag] = matching_tables[1].find_all('b')
    words: List[str] = bold_tags[3].text.split(' ')

    # Fixed word positions within the rules sentence.
    return [words[position] for position in (7, 11, 16, 20)]


In [191]:
import re
from re import Match

def get_player_names_and_urls(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """Collect every player's name and profile link from a tournament page.

    A player link is any <a> whose href starts with ``MbrDtlMain``. Those
    hrefs are relative, so each one is prefixed with
    ``https://www.uschess.org/msa/`` to form a complete url.

    Returns:
        One ``(name, url)`` tuple per matching link, in page order.
    """
    # The `x and ...` guard skips anchors that have no href at all.
    results: ResultSet[Tag] = soup.find_all('a', {'href': lambda x: x and x.startswith('MbrDtlMain')})
    return [
        (result.text, 'https://www.uschess.org/msa/' + result['href'])
        for result in results
    ]

def extract_tabular_results(soup: BeautifulSoup) -> List[list]:
    """Parse the crosstable in the page's first <pre> into one list per player.

    Every text fragment of the <pre> is tested against three patterns (score
    line, state code, rating line). A fragment matching at least one of them
    yields one row; fragments matching none are ignored.

    Returns:
        One list per kept fragment. When all parts are present the layout is
        ``[rounds, state_code, uscf_id, rating_type, before_rating,
        after_rating, color_assignments]`` where ``rounds`` is a list whose
        first entry is the total score followed by one entry per round.
        A part that did not match is simply absent from the row, so later
        items shift left.

    Raises:
        IndexError: If the page contains no <pre> element.
    """
    pre_results: ResultSet[Tag] = soup.find_all('pre')
    # Using .stripped_strings returns the data not surrounded by an HTML tag, which is the
    # results data we want.
    results_raw: Generator = pre_results[0].stripped_strings
    results_query: str = r'\|\d+\.\d+\s*(?:\|[A-Z]\s*\d*)+'
    state_query: str = r'\|\s+[A-Z][A-Z]\s+\|'
    rating_query: str = r'(\|\s*\d+\s*\/\s*[A-Z]+:[A-Za-z\s0-9]+->[A-Za-z\s0-9]+)([\sBW\|]*)'

    clean_tabs: List[list] = []
    for string in results_raw:
        score: List[str] = re.findall(results_query, string)
        state: List[str] = re.findall(state_query, string)
        ratings_and_pairings: List[Tuple[str, str]] = re.findall(rating_query, string)
        if not (score or state or ratings_and_pairings):
            continue

        # A fragment can match the score or state pattern without containing
        # a rating line; treat the rating parts as missing instead of
        # indexing into an empty match list.
        if ratings_and_pairings:
            rating_text, colors_text = ratings_and_pairings[0]
        else:
            rating_text, colors_text = '', ''

        player_data: list = []
        # Player scores in different rounds
        if score:
            rounds: List[str] = []
            for piece in score[0].split('|')[1:]:
                piece = piece.strip()
                if piece != '' and piece != ',':
                    rounds.append(piece)
            player_data.append(rounds)
        # State codes
        if state:
            player_data.append(state[0].strip('|').strip('\n').strip(' '))
        # Rating changes during the tournament: "| <id> / <type>: <before> -> <after>"
        if rating_text:
            split_ratings: List[str] = rating_text.strip('|').split('/')
            player_data.append(split_ratings[0].strip())
            second_split: List[str] = split_ratings[-1].split(':')
            player_data.append(second_split[0].strip())
            before_rating, after_rating = second_split[1].split('->')
            player_data.append(before_rating.strip())
            player_data.append(after_rating.strip())
        # Per-round colour column that trails the rating text
        if colors_text:
            player_data.append(colors_text.strip())
        clean_tabs.append(player_data)
    return clean_tabs

In [192]:
def get_max_id(conn: Connection) -> int:
    """Return the largest rowid in the uscf_urls table.

    The aggregate query always yields exactly one row; its single value is
    None when the table has no rows.
    """
    (max_rowid,) = conn.cursor().execute("SELECT MAX(rowid) FROM uscf_urls").fetchone()
    return max_rowid

In [193]:
def store_tournament(conn: Connection, metadata1: List[str], metadata2: List[str], url_fk: int) -> int:
    """Insert one tournament row and return its primary key.

    Args:
        conn: Open connection to a database containing uscf_tournaments.
        metadata1: The eight values from extract_tournament_info1(), in the
            order tournament_name, tournament_code, event_date,
            received_date, entered_date, rated_date, section_count,
            player_count.
        metadata2: The four values from extract_tournament_info2(), in the
            order k_factor, rating_system, tournament_type, time_control.
        url_fk: Value stored in the urls_id column.

    Returns:
        The id of the row that was just inserted.

    Raises:
        sqlite3.Error: If the combined values do not fill the 13 placeholders.
    """
    cur: Cursor = conn.cursor()
    values: tuple = (*metadata1, *metadata2, url_fk)
    cur.execute("""INSERT INTO uscf_tournaments (
                tournament_name, tournament_code,
                event_date, received_date, entered_date,
                rated_date, section_count, player_count,
                k_factor, rating_system, tournament_type,
                time_control, urls_id
                ) VALUES (?, ?, ?, ?, ?, ?, ?,
                ?, ?, ?, ?, ?, ?)
    """, values)
    conn.commit()
    # lastrowid is the key of this very insert; no second query is needed
    # and it cannot be confused with a row written by someone else.
    return cur.lastrowid

def store_players(conn: Connection, players: List[Tuple[str, str]], tournament_results: List[list], tournament_fk: int) -> List[int]:
    """Insert one uscf_player_observations row per player.

    ``players[i]`` and ``tournament_results[i]`` must describe the same
    player; the seed number stored is the 1-based position in the lists.

    Args:
        conn: Open connection to a database containing
            uscf_player_observations.
        players: ``(name, url)`` tuples as returned by
            get_player_names_and_urls().
        tournament_results: Rows as returned by extract_tabular_results(),
            laid out as ``[rounds, state_code, uscf_id, rating_type,
            before_rating, after_rating, color_assignments]``.
        tournament_fk: Primary key of the tournament these players played in.

    Returns:
        The primary keys of the inserted rows, in input order.

    Raises:
        ValueError: If the two lists differ in length.
        IndexError: If a result row is shorter than the layout above.
    """
    if len(players) != len(tournament_results):
        raise ValueError('players and tournament_results should be the same shape')

    cur: Cursor = conn.cursor()
    player_ids: List[int] = []
    # One transaction for the whole tournament: committed on success,
    # rolled back if any row fails.
    with conn:
        for seed_number, (player, result) in enumerate(zip(players, tournament_results), start=1):
            name, url = player[0], player[1]
            # result[0] is the rounds list; its first entry is the total score.
            overall_record: str = result[0][0]
            values: tuple = (
                name, seed_number, url,
                result[2],       # uscf_id
                overall_record,
                result[1],       # state_code
                result[3],       # rating_type
                result[4],       # before_rating
                result[5],       # after_rating
                result[6],       # color_assignments
                tournament_fk,
            )
            cur.execute("""INSERT INTO uscf_player_observations (
                name, seed_number, url, uscf_id, record, state_code, rating_type,
                before_rating, after_rating, color_assignments, uscf_tournaments_id)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
                )""", values)
            player_ids.append(cur.lastrowid)
    return player_ids

def store_rounds(conn: Connection, tournament_fk: int, player_fks: List[int], tournament_results: List[str]) -> None:
    """Insert one uscf_rounds row per player per round.

    ``player_fks[i]`` and ``tournament_results[i]`` must describe the same
    player. Each result row starts with a rounds list such as
    ``['6.5', 'D  26', 'W  46']``: the first entry is the total score and is
    skipped; every later entry is one round, stored as its leading letter
    (``result``) and the number that follows it (``opponent``).

    NOTE(review): the number is stored exactly as it appears in the
    crosstable, presumably the opponent's pairing number rather than a
    uscf_player_observations id - confirm this is what ``opponent`` means.

    Args:
        conn: Open connection to a database containing uscf_rounds.
        tournament_fk: Primary key of the tournament.
        player_fks: Primary keys returned by store_players().
        tournament_results: Rows as returned by extract_tabular_results().

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(player_fks) != len(tournament_results):
        raise ValueError('player_fks and tournament_results should be the same shape')

    rows: List[tuple] = []
    for player_fk, result in zip(player_fks, tournament_results):
        # Rows without a parsed rounds list carry no per-round data.
        if not result or not isinstance(result[0], list):
            continue
        for round_number, entry in enumerate(result[0][1:], start=1):
            outcome: str = entry[:1]
            opponent_text: str = entry[1:].strip()
            # Entries with no number after the letter have no opponent.
            opponent = int(opponent_text) if opponent_text.isdecimal() else None
            rows.append((round_number, outcome, opponent, tournament_fk, player_fk))

    # One transaction: committed on success, rolled back on failure.
    with conn:
        conn.cursor().executemany("""INSERT INTO uscf_rounds (
            round_number, result, opponent, uscf_tournaments_id, uscf_player_id)
            VALUES (?, ?, ?, ?, ?)""", rows)

# Rebuild the result tables, then scrape and store one tournament page.
init_db()
conn: Connection = sqlite3.connect('scrape_data.db')
try:
    max_id: int = get_max_id(conn)
    # The range currently covers a single url id (max_id - 1); widen it to
    # process more of uscf_urls.
    for url_fk in range(max_id - 1, max_id):
        # Fetch the page that belongs to url_fk so the tournament row stored
        # below references the url it was actually scraped from.
        soup: BeautifulSoup = get_tournament_page(str(url_fk))
        metadata1: List[str] = extract_tournament_info1(soup)
        metadata2: List[str] = extract_tournament_info2(soup)
        players: List[Tuple[str]] = get_player_names_and_urls(soup)
        tournament_results: List[str] = extract_tabular_results(soup)
        for r in tournament_results:
            print(r)
        tournament_fk: int = store_tournament(conn, metadata1, metadata2, url_fk)
        player_fks: List[int] = store_players(conn, players, tournament_results, tournament_fk)
        # store_rounds needs the parsed results as its fourth argument.
        store_rounds(conn, tournament_fk, player_fks, tournament_results)
finally:
    conn.close()

http://www.uschess.org/msa/XtblMain.php?202209146062
Status Code: 200
[['6.5', 'D  26', 'W  46', 'W   4', 'W  13', 'W  36', 'W   2', 'W   5'], 'WA', '16948874', 'OB', '2026', '2047', '|     |B    |W    |B    |W    |B    |B    |W    |']
[['5.5', 'W  15', 'W  11', 'W   3', 'D  36', 'W  35', 'L   1', 'W   7'], 'IN', '16115411', 'OB', '1800', '1902', '|     |B    |W    |B    |W    |B    |W    |B    |']
[['5.0', 'W  42', 'W  10', 'L   2', 'L   7', 'W  20', 'W  39', 'W  11'], 'VA', '16180893', 'OB', '1873', '1882', '|     |W    |B    |W    |B    |W    |B    |B    |']
[['5.0', 'X  51', 'W  22', 'L   1', 'W  18', 'W  10', 'L   7', 'W  16'], 'CA', '16083648', 'OB', '1843', '1855', '|     |B    |W    |W    |B    |B    |W    |W    |']
[['5.0', 'W  50', 'L  38', 'W  17', 'W  40', 'W  13', 'W  11', 'L   1'], 'WA', '20048591', 'OB', '1748', '1827', '|     |B    |W    |W    |B    |B    |W    |B    |']
[['5.0', 'L  11', 'W  20', 'W  48', 'D  41', 'W  19', 'W  36', 'D   9'], 'OH', '16368693', 'OB', '16

In [194]:
# soup: BeautifulSoup = get_tournament_data('501')
# pretty_soup = soup.prettify()
# with open('page.txt', 'w') as fout:
#     for line in pretty_soup:
#         fout.write(line)