In [134]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from webdriver_manager.chrome import ChromeDriverManager
from typing import List, Tuple, Generator

In [135]:
def load_driver() -> Chrome:
    """Instantiate a headless Chrome driver.

    Returns:
        A ``Chrome`` WebDriver started with the ``--headless`` argument.
        This function never quits the driver; that is left to the caller.
    """
    # Load and configure webdriver.
    options: Options = Options()
    # Stop browser windows from actually popping up.
    options.add_argument('--headless')
    # ChromeDriverManager().install() supplies the path of the chromedriver
    # executable that Selenium's Service will launch.
    service: Service = Service(executable_path=ChromeDriverManager().install())
    return Chrome(service=service, options=options)

In [136]:
def navigate_to_uscf_page(driver: Chrome, date_to_visit: str) -> None:
    """Load the US Chess rated-events search page and submit a search.

    The form's ``month`` field is filled with ``date_to_visit`` and its
    ``states`` field with ``'CA'``, after which the form is submitted. When
    this returns, ``driver`` has been asked to load the search results.

    Args:
        driver: Selenium Chrome driver used to load and fill in the page.
        date_to_visit: Text typed into the ``month`` field. The caller in
            this notebook passes ``'MM/YYYY'``.
    """
    driver.get('http://www.uschess.org/datapage/events-rated.php')

    # (form field name, text to type), in the order the fields are filled.
    # CA is the State Code where all chess.com USCF tournaments are
    # registered.
    form_values = (
        ('month', date_to_visit),
        ('states', 'CA'),
    )
    field: WebElement = None
    for field_name, text in form_values:
        field = driver.find_element('name', field_name)
        field.clear()
        field.send_keys(text)

    # Submitting from the last field filled (the state box) sends the form.
    field.submit()

In [137]:
def scrape_uscf_tournament_urls(driver: Chrome) -> List[str]:
    """Scrape tournament urls from the results page the driver is pointed at.

    The helper navigate_to_uscf_page() navigates to the correct page first.
    Only rows that mention "CHESS.COM" (case-insensitive) in one of their
    cells are kept, and each such row contributes its url at most once.

    Args:
        driver: Selenium Chrome driver already showing a search result page.

    Returns:
        The ``href`` of the tournament-id link of every matching row, in page
        order. Empty when the page has fewer than three ``tbody`` elements
        (the results are read from the third one).
    """
    table_bodies: List[WebElement] = driver.find_elements(By.TAG_NAME, 'tbody')
    if len(table_bodies) < 3:
        # No results table on this page, so there is nothing to scrape.
        return []
    results_body: WebElement = table_bodies[2]

    url_list: List[str] = []
    for row in results_body.find_elements(By.TAG_NAME, 'tr'):
        # Each row is a WebElement with data about one tournament.
        cells: List[WebElement] = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) < 3:
            continue

        url = None  # href of this row's tournament-id link, once found
        is_chess_com: bool = False
        for cell in cells:
            # Read the text once; every .text access is a driver round trip.
            text: str = cell.text
            # The tournament id cell is the all-digit cell longer than 10
            # characters; its anchor links to the tournament page.
            if text.isnumeric() and len(text) > 10:
                url = cell.find_element(By.TAG_NAME, 'a').get_attribute('href')
            # Only keep tournament urls labeled "CHESS.COM"
            if 'CHESS.COM' in text.upper():
                is_chess_com = True

        # Decide after scanning the whole row, so cell order cannot produce
        # a None entry and several matching cells cannot duplicate the url.
        if is_chess_com and url is not None:
            url_list.append(url)
    return url_list

In [138]:
import sqlite3
from sqlite3 import Connection, Cursor

def init_db(db_path: str = 'scrape_data.db') -> None:
    """Drop then create the tables for all data scraped by this module.

    Any existing ``uscf_urls`` table (and its rows) is discarded and an empty
    one is created in its place.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'scrape_data.db'``.
    """
    conn: Connection = sqlite3.connect(db_path)
    try:
        cur: Cursor = conn.cursor()
        cur.execute("""DROP TABLE IF EXISTS uscf_urls""")
        # Stores urls to allow us to navigate to all relevant USCF tournaments.
        cur.execute("""CREATE TABLE IF NOT EXISTS uscf_urls (
        id INTEGER PRIMARY KEY,
        date TEXT,
        url TEXT,
        scraped INTEGER
        )""")
        conn.commit()
    finally:
        # Always release the database file handle, even if a statement fails.
        conn.close()

In [139]:
def scrape_all_uscf_urls(driver: Chrome, cur: 'Cursor | None' = None) -> List[str]:
    """Scrape tournament urls for every month from 2023 back to 2015.

    Each url is a USCF tournament with a parallel entry on chess.com. Years
    are visited newest first (2023 down to 2015 inclusive) and months in
    calendar order within each year.

    Args:
        driver: Selenium Chrome driver used for navigation and scraping.
        cur: Unused. Kept (now optional) so existing calls that pass a
            cursor keep working.

    Returns:
        All urls found, concatenated in visiting order.
    """
    url_list: List[str] = []
    for year in range(2023, 2014, -1):
        for month in range(1, 13):
            # Page requires single digit months to have a 0 in front.
            date: str = f'{month:02d}/{year}'
            navigate_to_uscf_page(driver, date)
            url_list.extend(scrape_uscf_tournament_urls(driver))
    return url_list

### TOURNAMENT OBSERVATIONS
INTEGER PRIMARY KEY
Round count
Player count
Number of rounds
Section date(s)
Received date
Entered date
Rated date
Re-Rated date
K Factor
Rating System
Tournament type
Time control
Tournament name
Tournament ID
Total points

### ROUNDS RESULT OBSERVATIONS
INTEGER PRIMARY KEY
TOURNAMENT FOREIGN KEY
Round number
Round result
Round opponent

### PLAYER OBSERVATIONS
INTEGER PRIMARY KEY
Real name
USCF before rating
USCF after rating
State
Pair number
Total points
USCF id


In [140]:
# from datetime import datetime

# STARTING_URL: str = 'https://www.chess.com/tournament/live?&page='

# def go_to_tournament(day, month, year, starting_url):
#     driver.get(starting_url)
#     date_string = driver.find_element(By.CLASS_NAME, 'tournaments-live-date')
#     date = datetime.strptime(date_string.text, '%b %d, %Y, %I:%M %p')

# 0 to 9000 is what chess.com currently allows
# def go_to_right_year(target, url, page) -> bool:
#     driver.get(url + page)
#     date_string = driver.find_element(By.CLASS_NAME, 'tournaments-live-date')
#     date = datetime.strptime(date_string.text, '%b %d, %Y, %I:%M %p')
#     if target < date.year:
#         page = page + step_size
#         step_size = step_size / 2

In [141]:
# driver: Chrome = load_driver()
# url_list: List[str] = scrape_all_uscf_urls(driver)

# url_tuples: List[Tuple] = []
# for url in url_list:
#     url_tuples.append((None, url, 0))

# conn: Connection = sqlite3.Connection('scrape_data.db')
# cur: Cursor = conn.cursor()
# cur.executemany("""INSERT INTO uscf_urls (
#     date, url, scraped
#     ) VALUES (?, ?, ? )""", url_tuples)
# conn.commit()

In [142]:
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
import requests
from requests.models import Response

# Row id (uscf_urls.id) of the tournament page to download and parse.
# Try 499 instead of 1900
TOURNAMENT_ROW_ID: int = 499
# Upper bound in seconds on the HTTP request, so the cell cannot hang forever.
REQUEST_TIMEOUT_SECONDS: int = 30

cur: Cursor = sqlite3.connect('scrape_data.db').cursor()
cur.execute("SELECT url FROM uscf_urls WHERE id = ?", (TOURNAMENT_ROW_ID,))
url_row = cur.fetchone()
if url_row is None:
    # Fail with a readable message instead of a bare IndexError.
    raise LookupError(f'No row with id {TOURNAMENT_ROW_ID} in uscf_urls.')
url: str = url_row[0]
# NOTE: despite its name, `request` holds the HTTP response.
request: Response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
print(url)
print('Status Code:', request.status_code)
soup = BeautifulSoup(request.text, 'html.parser')

http://www.uschess.org/msa/XtblMain.php?202211214932
Status Code: 200


In [143]:
# Select the tables matching these attributes. The first one is read here
# for the tournament summary; the second is used by the next cell.
upper_table: ResultSet[Tag] = soup.find_all('table', attrs={
    'border': '0',
    'bgcolor': 'FFFFFF',
    'cellpadding': '3',
    'cellspacing': '0'
})
rows: ResultSet[Tag] = upper_table[0].find_all('tr')
# Every <td> found under the first row; the indexes below are positions
# in this list.
row1_tags: ResultSet[Tag] = rows[0].find_all('td')

name: str = row1_tags[3].b.text
# Drop the first and last character around the id (presumably the
# surrounding parentheses -- confirm against the page markup).
tournament_id: str = row1_tags[3].small.text[1:-1]
# The raw cell text carries trailing whitespace; strip it so the date is clean.
event_date: str = row1_tags[7].b.text.strip()

# The processing line reads "Received: <date> ... Entered: <date> ... Rated:
# <date> ..."; after splitting on single spaces the dates are tokens 1, 4, 7.
dates_split: List[str] = row1_tags[13].b.text.split(' ')
received_date: str = dates_split[1]
entered_date: str = dates_split[4]
rated_date: str = dates_split[7]

# First token is the section count, second-to-last token the player count.
stats_split: List[str] = row1_tags[15].b.text.split(' ')
section_count: int = int(stats_split[0])
player_count: str = stats_split[-2]

print(name)
print(tournament_id)
print(event_date)
print(received_date)
print(entered_date)
print(rated_date)
print(section_count)
print(player_count)

US CHESS BLITZ ON CHESS.COM #2
202211214932
2022-11-21 
2022-12-04
2022-12-04
2022-12-14
1
46


In [144]:
# The second matching table is the section header; the text of its fourth
# <b> element is the stats line ("... K Factor: ... Rating Sys: ...
# Tnmt Type: ... Time Control: ...").
header_box: Tag = upper_table[1]
rules: str = header_box.find_all('b')[3].text
rules_list: List[str] = rules.split(' ')

# Token positions, after splitting on single spaces, of the K factor, rating
# system, tournament type and time control values respectively.
k_factor, rating_system, tournament_type, time_control = [
    rules_list[position] for position in (7, 11, 16, 20)
]

# One value per line, in the same order as above.
print(k_factor, rating_system, tournament_type, time_control, sep='\n')





Section 1 - 1




Section Date(s)



2022-11-21




Processed



Received: 2022-12-04   Entered: 2022-12-04   Rated: 2022-12-14 Re-Rated: 2023-01-31




Stats



7 Rounds,  46 Players;   K Factor: F   Rating Sys: OB    Tnmt Type: S  Time Control: G/5;+0



F
OB
S
G/5;+0


In [145]:
import re
from re import Match

# Includes both player names and links. Each link needs to be
# preceded by https://www.uschess.org/msa/.
players: ResultSet[Tag] = soup.find_all('a', {'href': lambda x: x and x.startswith('MbrDtlMain')})
pre_results: ResultSet[Tag] = soup.find_all('pre')

# Using .stripped_strings returns the data not surrounded by an HTML tag, which is the
# results data we want.
results_raw: Generator = pre_results[0].stripped_strings
results_clean: List[Tuple[str]] = []
# "|<total points>" followed by one "|<letter> <opponent number>" per round.
results_query: str = r'\|\d+\.\d+\s*(?:\|[A-Z]\s*\d*)+'
# "|" followed by a two-letter state code padded with whitespace.
state_query: str = r'\|\s+[A-Z][A-Z]\s+'
# "|<USCF id> / <rating system>: <before> -> <after>".
rating_query: str = r'\|\s*\d+\s*\/\s*[A-Z]+:[A-Za-z\s0-9]+->[A-Za-z\s0-9]+'
scores: List[str] = []
states: List[str] = []
ratings: List[str] = []
for string in results_raw:
    # Keep only the first match of each pattern per text fragment.
    score: List[str] = re.findall(results_query, string)
    state: List[str] = re.findall(state_query, string)
    rating: List[str] = re.findall(rating_query, string)
    if score:
        scores.append(score[0])
    if state:
        states.append(state[0])
    if rating:
        ratings.append(rating[0])

clean_tabs: List[List[str]] = []
# The three lists are paired up by position below, so they must line up.
if not (len(scores) == len(states) == len(ratings)):
    raise ValueError(
        'Scraped data is the wrong shape: '
        f'{len(scores)} score rows, {len(states)} state rows, '
        f'{len(ratings)} rating rows.'
    )

for score_row, state_cell, rating_cell in zip(scores, states, ratings):
    # Total points followed by one entry per round.
    player_data: List[str] = [
        field.strip() for field in score_row.split('|')
        if field.strip() not in ('', ',')
    ]

    # state_query's \s+ can match any whitespace, so strip all of it rather
    # than only newlines and spaces.
    player_data.append(state_cell.strip('|').strip())

    # "<id> / <system>: <before> -> <after>" -> id, system, before, after.
    id_part, _, rating_part = rating_cell.strip('|').partition('/')
    player_data.append(id_part.strip())
    system_part, _, change_part = rating_part.partition(':')
    player_data.append(system_part.strip())
    before_rating, after_rating = change_part.split('->')
    player_data.append(before_rating.strip())
    player_data.append(after_rating.strip())
    clean_tabs.append(player_data)

for tab in clean_tabs:
    print(tab)


['6.5', 'D  15', 'W  14', 'W   6', 'W  31', 'W   2', 'W   3', 'W   7', 'SC', '12451516', 'OB', '2000', '2037']
['6.0', 'W  21', 'W  35', 'W   5', 'W   3', 'L   1', 'W   9', 'W   6', 'VA', '12841181', 'OB', '1887', '1961']
['5.0', 'W  12', 'W  23', 'W   7', 'L   2', 'W  13', 'L   1', 'W   9', 'RI', '30031880', 'OB', '1917', '1931']
['5.0', 'L  18', 'W  15', 'L   8', 'W  42', 'W  16', 'W  13', 'W  10', 'CA', '17347717', 'OB', '1922', '1914']
['5.0', 'W  33', 'W  41', 'L   2', 'W  20', 'L   8', 'W  16', 'W  12', 'TX', '16019966', 'OB', '1802', '1826']
['5.0', 'W  42', 'W  24', 'L   1', 'W  34', 'W   7', 'W   8', 'L   2', 'UT', '20043111', 'OB', '1907', '1919']
['4.0', 'W   8', 'W   9', 'L   3', 'W  10', 'L   6', 'W  20', 'L   1', 'OK', '15409954', 'OB', '1761', '1797']
['4.0', 'L   7', 'W  29', 'W   4', 'W  15', 'W   5', 'L   6', 'L  11', 'TX', '30047917', 'OB', '1715', '1757']
['4.0', 'W  26', 'L   7', 'W  12', 'W  18', 'W  31', 'L   2', 'L   3', 'GA', '16788958', 'OB', '1828', '1833']
[

In [146]:
# Dump the indented HTML to disk so the page structure can be inspected by hand.
pretty_soup = soup.prettify()
# Write the whole string at once (iterating a str yields single characters),
# with an explicit encoding so non-ASCII page content cannot fail to encode
# under the platform default.
with open('page.txt', 'w', encoding='utf-8') as fout:
    fout.write(pretty_soup)