In [294]:
import requests
import sqlite3
from sqlite3 import Connection, Cursor
from requests import Response
from bs4 import BeautifulSoup
from bs4 import ResultSet
from typing import List, Tuple, Dict

In [295]:
def get_tournament_urls(page_number: int) -> List[Tuple[str]]:
    url: str = 'https://www.chess.com/club/live-tournaments/uschess-members-only?&page=' + str(page_number)
    response: Response = requests.get(url)
    soup: BeautifulSoup = BeautifulSoup(response.text)
    results: ResultSet = soup.find_all('a', {'class': 'tournaments-live-name'})
    urls: List[str] = []
    for r in results:
        if r['href']:
            urls.append((r['href'],))
    return urls

In [296]:
def create_url_table(conn: Connection) -> None:
    cur: Cursor = conn.cursor()
    query: str = """CREATE TABLE IF NOT EXISTS chess_urls (
                        id INTEGER PRIMARY KEY,
                        url TEXT
                    )"""
    cur.execute(query)
    conn.commit()

def store_urls(conn: Connection, urls: List[Tuple[str]]) -> None:
    cur: Cursor = conn.cursor()
    query: str = "INSERT INTO chess_urls (url) VALUES (?)"
    cur.executemany(query, urls)
    conn.commit()


In [297]:
# Used this code to get list of chess tournament URLs

# conn: Connection = sqlite3.Connection('scrape_data.db')
# # create_url_table(conn)

# CHESS_PAGE_COUNT = 106
# for i in range(1, CHESS_PAGE_COUNT + 1):
#     urls: List[Tuple[str]] = get_tournament_urls(i)
#     store_urls(conn, urls)


In [298]:
conn: Connection = sqlite3.Connection('scrape_data.db')
cur: Cursor = conn.cursor()
cur.execute('SELECT url from chess_urls ORDER BY RANDOM() LIMIT 1')
# url = cur.fetchall()[0][0]
url = 'https://www.chess.com/tournament/live/-us-chess-100-blitz-1079993'

In [299]:
response: Response = requests.get(url)
soup: BeautifulSoup = BeautifulSoup(response.text)
results: ResultSet = soup.find_all('a', {'v-tooltip': True})
test: str = ''

for tag in results:
    test += tag['v-tooltip'] + '\n'

print(test)

National Master
RMPerry (#1)   vs gpayne280 (#10) 
PhaseShift (#12)   vs RMPerry (#1) 
RMPerry (#1)   vs KingsBishop (#4) 
angusmc (#8)   vs RMPerry (#1) 
TedBelanoff (#2)   vs RMPerry (#1) 
Chesslover2158 (-)   vs ArkansasChessJunkie (#2) 
ArkansasChessJunkie (#2)   vs Putz57 (-) 
learnychess (#11)   vs ArkansasChessJunkie (#2) 
ArkansasChessJunkie (#2)   vs GopherBeast (#14) 
BlackSpock (#8)   vs ArkansasChessJunkie (#2) 
National Master
JoshTheDragon (-)   vs TedBelanoff (#2) 
TedBelanoff (#2)   vs ImpeneTreble (#19) 
el_sammo (#16)   vs TedBelanoff (#2) 
TedBelanoff (#2)   vs RMPerry (#1) 
GopherBeast (#14)   vs KingsBishop (#4) 
KingsBishop (#4)   vs TheCosmicWave (-) 
RMPerry (#1)   vs KingsBishop (#4) 
KingsBishop (#4)   vs PhaseShift (#12) 
KingsBishop (#4)   vs angusmc (#8) 
FozzieBando (#5)   vs FCD2018 (#13) 
Blues4 (#6)   vs FozzieBando (#5) 
Chesslover2158 (-)   vs FozzieBando (#5) 
FozzieBando (#5)   vs BlackSpock (#8) 
aslan7788n (#18)   vs FozzieBando (#5) 
Blues4 (#6) 

In [300]:
# Create a round number feature, that counts the number of rounds in every tournaments. 

# conn: Connection = sqlite3.Connection('scrape_data.db')
# cur: Cursor = conn.cursor()

# add_column = """ALTER TABLE uscf_tournaments
#                 ADD COLUMN rounds INTEGER"""
# cur.execute(add_column)
# conn.commit()

# query = """
#     SELECT ut.id, MAX(ur.round_number)
#     FROM uscf_rounds ur JOIN uscf_tournaments ut ON ur.uscf_tournaments_id = ut.id
#     GROUP BY ut.id
# """
# cur.execute(query)
# id_and_rounds: List[Tuple[int]] = cur.fetchall()

# add_rounds = """
# UPDATE uscf_tournaments
# SET rounds = ?
# WHERE id = ?
# """

# id_and_rounds_reversed: List[Tuple[int]] = []
# for element in id_and_rounds:
#     id_and_rounds_reversed.append(tuple(reversed(element)))

# cur.executemany(add_rounds, id_and_rounds_reversed)
# conn.commit()

In [301]:
test_list: List[str] = test.split('\n')
formatted = []
for line in test_list:
    pairing = line.strip().split('vs')
    formatted.append(tuple(pairing))

In [302]:
import pandas as pd
array = []
for row in formatted:
    if len(row) > 1:
        white = row[0]
        black = row[1]
        white = white.strip()
        black = black.strip()
        white_split = white.split(' ')
        black_split = black.split(' ')
        white_user = white_split[0]
        white_seed = white_split[1]
        black_user = black_split[0]
        black_seed = black_split[1]
        white_seed= white_seed.strip('(')
        white_seed=white_seed.strip(')')
        white_seed=white_seed.strip('#')
        black_seed=black_seed.strip('(')
        black_seed=black_seed.strip(')')
        black_seed=black_seed.strip('#')
        array.append((white_user, white_seed, black_user, black_seed))
df = pd.DataFrame(array, columns=['white', 'white_seed', 'black', 'black_seed'])

In [303]:
import numpy as np

df_subset = df[~df['white_seed'].isin(['-']) & ~df['black_seed'].isin(['-'])][['white_seed', 'black_seed']]
df_subset = df_subset.astype({'white_seed': 'int32', 'black_seed': 'int32'})

slice = df_subset.sort_values(by=['white_seed', 'black_seed'], ascending=True)
df_no_duplicates = slice.drop_duplicates(inplace=False, subset=['white_seed', 'black_seed'])
slice_pairs = df_no_duplicates.values.tolist()
slice_pairs

[[1, 4],
 [1, 10],
 [2, 1],
 [2, 14],
 [2, 19],
 [4, 8],
 [4, 12],
 [5, 8],
 [5, 13],
 [6, 5],
 [6, 13],
 [7, 8],
 [7, 21],
 [8, 1],
 [8, 2],
 [8, 16],
 [8, 21],
 [10, 14],
 [10, 19],
 [11, 2],
 [11, 14],
 [12, 1],
 [12, 6],
 [12, 20],
 [13, 19],
 [14, 4],
 [14, 7],
 [15, 20],
 [15, 21],
 [16, 2],
 [16, 7],
 [17, 10],
 [18, 5],
 [18, 6],
 [18, 19],
 [19, 17],
 [20, 10],
 [20, 21],
 [21, 8],
 [21, 11],
 [21, 16]]

In [304]:
conn: Connection = sqlite3.Connection('scrape_data.db')
cur: Cursor = conn.cursor()
cur.execute("""
    SELECT id
    FROM uscf_tournaments 
    WHERE rounds = 5
        AND SUBSTR(event_date, 1, 4) = '2019'
        AND (
            SUBSTR(event_date, 6, 2) = '06'
            OR SUBSTR(event_date, 6, 2) = '07'
            OR SUBSTR(event_date, 6, 2) = '08'
        )
    """)
matching_ids: List[Tuple[int]] = cur.fetchall()
matching_ids_unpacked: List[int] = []
for id in matching_ids:
    matching_ids_unpacked.append(id[0])

find_matching_seeds: str = """
SELECT upo.seed_number, ur.opponent, ur.uscf_tournaments_id
FROM uscf_rounds ur 
    JOIN uscf_player_observations upo 
    ON ur.uscf_player_id = upo.id
WHERE ur.uscf_tournaments_id IN ("""

parameter_str: str = ''
for i in range(len(matching_ids)):
    parameter_str = parameter_str + '?, '
parameter_str = parameter_str[:-2] # Remove last comma

find_matching_seeds = find_matching_seeds + parameter_str + ") ORDER BY ur.uscf_tournaments_id DESC"

cur.execute(find_matching_seeds, matching_ids_unpacked)
matching_seeds: List[Tuple[int]] = cur.fetchall()
matching_seeds

[(1, 24, 1686),
 (1, 21, 1686),
 (1, 2, 1686),
 (1, 9, 1686),
 (1, 15, 1686),
 (2, 11, 1686),
 (2, 6, 1686),
 (2, 1, 1686),
 (2, 4, 1686),
 (2, 8, 1686),
 (3, 15, 1686),
 (3, 10, 1686),
 (3, 8, 1686),
 (3, 7, 1686),
 (3, 9, 1686),
 (4, 23, 1686),
 (4, 16, 1686),
 (4, 9, 1686),
 (4, 2, 1686),
 (4, None, 1686),
 (5, None, 1686),
 (5, 13, 1686),
 (5, 6, 1686),
 (5, 15, 1686),
 (5, 11, 1686),
 (6, 14, 1686),
 (6, 2, 1686),
 (6, 5, 1686),
 (6, 12, 1686),
 (6, 10, 1686),
 (7, 19, 1686),
 (7, 14, 1686),
 (7, 21, 1686),
 (7, 3, 1686),
 (7, 12, 1686),
 (8, 25, 1686),
 (8, 9, 1686),
 (8, 3, 1686),
 (8, 18, 1686),
 (8, 2, 1686),
 (9, 20, 1686),
 (9, 8, 1686),
 (9, 4, 1686),
 (9, 1, 1686),
 (9, 3, 1686),
 (10, 17, 1686),
 (10, 3, 1686),
 (10, 14, 1686),
 (10, 13, 1686),
 (10, 6, 1686),
 (11, 2, 1686),
 (11, 12, 1686),
 (11, 18, 1686),
 (11, None, 1686),
 (11, 5, 1686),
 (12, None, 1686),
 (12, 11, 1686),
 (12, 15, 1686),
 (12, 6, 1686),
 (12, 7, 1686),
 (13, 16, 1686),
 (13, 5, 1686),
 (13, None, 

In [305]:
import copy

# Package each player as a class to keep things organized
class Player:
    def __init__(self, seed: int):
        self.__own_seed: int = seed
        self.__opponents: List[int] = []
        # self.__tournament_id: int = tournament

    def get_own_seed(self) -> int:
        return self.__own_seed

    def add_opponent_seed(self, opponent_seed: int) -> None:
        self.__opponents.append(opponent_seed)

    def get_opponents(self) -> List[int]:
        return copy.deepcopy(self.__opponents)

    # This method returns the number of pairings that matched between
    # the opponent in the USCF dataset and the chess.com dataset.
    def get_match_percentage(self, candidate_opps: List[int]) -> float:
        if type(candidate_opps) != list:
            raise Exception('Parameters was not a list.')

        matches: int = 0
        checked_indices: List[int] = []
        for i in range(len(self.__opponents)):
            for j in range(len(candidate_opps)):
                if self.__opponents[i] == candidate_opps[j]:
                    # print(self.__opponents[i], candidate_opps[j])
                    matches += 1
                    break
        return matches 
        #/ len(self.__opponents)

    def __str__(self) -> str:
        return str(self.__own_seed) + ' ' + str(self.__opponents)

In [306]:
# Input a list with all the pairs from all the tournaments that might
# match based on the date.
def wrap_uscf_players(uscf_pairs: List[Tuple[int]]) -> Dict:
    if len(uscf_pairs) == 0:
        raise Exception("wrap_uscf_players received a blank list.")
    current_tournament: int = uscf_pairs[0][2]
    output_dict: Dict = dict()
    tournament_list: List[Player] = []
    current_player: Player = Player(uscf_pairs[0][0])
    for tup in uscf_pairs:
        own_seed: int = tup[0]
        opp_seed: int = tup[1]
        tournament: int = tup[2]
        # If we've moved onto a new player, store the current player and move on.
        if current_player.get_own_seed() != own_seed:
            tournament_list.append(current_player)
            current_player = Player(own_seed)
        if tournament != current_tournament:
            # We've reached a new tournament. Store the player list and move on.
            output_dict[current_tournament] = tournament_list
            # Update the tournament id.
            current_tournament = tournament
        current_player.add_opponent_seed(opp_seed)
    return output_dict

In [307]:
# Outputs a dict where the keys are player numbers and the values
# are a corresponding player object.
def wrap_chesscom_players(chesscom_pairs: List[List[int]]) -> Dict:
    if len(chesscom_pairs) == 0:
        raise Exception('Received empty list.')
    current_player: Player = Player(chesscom_pairs[0][0])
    player_dict: Dict = dict()
    for pair in chesscom_pairs:
        own_seed: int = pair[0]
        opp_seed: int = pair[1]
        if current_player.get_own_seed() != own_seed:
            # If we reached another player, move onto that player.
            player_dict[own_seed] = current_player
            current_player = Player(own_seed)
        current_player.add_opponent_seed(opp_seed)
    return player_dict

In [312]:
from statistics import mean

# Spit out a List[Tuple[float]] indicating the match percentages
def find_tournament_match(chesscom_players: Dict, uscf_players: Dict) -> List[Tuple[float]]:
    if type(chesscom_players) != dict or type(uscf_players) != dict:
        raise Exception('Function parameters were not dicts.')
            
    percentages: List[Tuple] = []
    for tournament in uscf_players.keys():
        match_percentages: List[float] = []
        player_list: List[Player] = uscf_players[tournament]
        for uscf_player in player_list:
            uscf_own_seed: int = uscf_player.get_own_seed()
            if uscf_own_seed in chesscom_players:
                chesscom_opps: List[int] = chesscom_players[uscf_own_seed].get_opponents()
                match_percentages.append(uscf_player.get_match_percentage(chesscom_opps))
        percentages.append((tournament, mean(match_percentages)))
    return percentages

chesscom_players: Dict = wrap_chesscom_players(slice_pairs)
uscf_players: Dict = wrap_uscf_players(matching_seeds)
percentages: List[Tuple] = find_tournament_match(chesscom_players, uscf_players)
for tournament in percentages:
    print(tournament)

IndexError: tuple index out of range

In [309]:
conn: Connection = sqlite3.Connection('scrape_data.db')
cur: Cursor = conn.cursor()
cur.execute("""
    SELECT id
    FROM uscf_tournaments 
    WHERE rounds = 7
        AND SUBSTR(event_date, 1, 4) = '2019'
        AND (
            SUBSTR(event_date, 6, 2) = '06'
            OR SUBSTR(event_date, 6, 2) = '07'
            OR SUBSTR(event_date, 6, 2) = '08'
        )
    """)
matching_ids: List[Tuple[int]] = cur.fetchall()
matching_ids_unpacked: List[int] = []
for id in matching_ids:
    matching_ids_unpacked.append(id[0])

find_matching_seeds: str = """
SELECT uu.url
FROM uscf_tournaments ut JOIN uscf_urls uu ON ut.urls_id = uu.id
WHERE ut.id IN ("""

parameter_str: str = ''
for i in range(len(matching_ids)):
    parameter_str = parameter_str + '?, '
parameter_str = parameter_str[:-2] # Remove last comma

find_matching_seeds = find_matching_seeds + parameter_str + ")"

cur.execute(find_matching_seeds, matching_ids_unpacked)
matching_seeds: List[Tuple[int]] = cur.fetchall()
matching_seeds

[('http://www.uschess.org/msa/XtblMain.php?201906036292',),
 ('http://www.uschess.org/msa/XtblMain.php?201906056302',),
 ('http://www.uschess.org/msa/XtblMain.php?201906109642',),
 ('http://www.uschess.org/msa/XtblMain.php?201906129652',),
 ('http://www.uschess.org/msa/XtblMain.php?201906172722',),
 ('http://www.uschess.org/msa/XtblMain.php?201906192732',),
 ('http://www.uschess.org/msa/XtblMain.php?201906246082',),
 ('http://www.uschess.org/msa/XtblMain.php?201906266092',),
 ('http://www.uschess.org/msa/XtblMain.php?201907016812',),
 ('http://www.uschess.org/msa/XtblMain.php?201907036822',),
 ('http://www.uschess.org/msa/XtblMain.php?201907082312',),
 ('http://www.uschess.org/msa/XtblMain.php?201907102322',),
 ('http://www.uschess.org/msa/XtblMain.php?201907152382',),
 ('http://www.uschess.org/msa/XtblMain.php?201907172392',),
 ('http://www.uschess.org/msa/XtblMain.php?201907225852',),
 ('http://www.uschess.org/msa/XtblMain.php?201907248862',),
 ('http://www.uschess.org/msa/XtblMain.p