In [1]:
import pandas as pd # DataFrame library
import re

In [2]:
chess_games_2023 = pd.read_csv("./data/chess_games_2023.csv") # loads the 2023 data
chess_games_2024 = pd.read_csv("./data/chess_games_2024.csv") # loads the 2024 data
chess_games = pd.concat([chess_games_2023, chess_games_2024]) # combines both years of data

In [3]:
chess_games.info() # Gives us an overview of the combined dataframe

<class 'pandas.core.frame.DataFrame'>
Index: 671 entries, 0 to 175
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        671 non-null    int64  
 1   url               671 non-null    object 
 2   pgn               671 non-null    object 
 3   time_control      671 non-null    object 
 4   end_time          671 non-null    int64  
 5   rated             671 non-null    bool   
 6   tcn               671 non-null    object 
 7   uuid              671 non-null    object 
 8   initial_setup     671 non-null    object 
 9   fen               671 non-null    object 
 10  time_class        671 non-null    object 
 11  rules             671 non-null    object 
 12  white.rating      671 non-null    int64  
 13  white.result      671 non-null    object 
 14  white.@id         671 non-null    object 
 15  white.username    671 non-null    object 
 16  white.uuid        671 non-null    object 
 17  bl

In [4]:
chess_games_2023.shape # the number of rows and columns for the 2023 games

(495, 25)

In [5]:
chess_games_2024.shape # the number of rows and columns for the 2024 games

(176, 25)

In [6]:
assert chess_games_2023.shape[0] + chess_games_2024.shape[0] == chess_games.shape[0] # Tests to make sure that the 2023 and 2024 dataframes add up (if successful there shouldn't be an output)

In [7]:
chess_games.columns # the columns currently in the dataframe

Index(['Unnamed: 0', 'url', 'pgn', 'time_control', 'end_time', 'rated', 'tcn',
       'uuid', 'initial_setup', 'fen', 'time_class', 'rules', 'white.rating',
       'white.result', 'white.@id', 'white.username', 'white.uuid',
       'black.rating', 'black.result', 'black.@id', 'black.username',
       'black.uuid', 'start_time', 'accuracies.white', 'accuracies.black'],
      dtype='object')

In [8]:
columns_to_drop = ["Unnamed: 0", "url", "rated", "tcn", "uuid", "initial_setup", 
                   "fen", "rules", "white.@id", "white.uuid", "black.@id", "black.uuid"]
chess_games.drop(columns = columns_to_drop, inplace = True)

In [9]:
chess_games["time_class"].unique() # types of games that were played

array(['blitz', 'bullet', 'daily', 'rapid'], dtype=object)

In [29]:
# Writing out PGN data to a file (this will be helpful to extract missing information that wasn't already available in the API)
pgn_data = chess_games["pgn"]
with open("./data/pgn_data.pgn", "w") as p:
    p.writelines(pgn_data + "\n")

In [5]:
# Reading in the data from the newly created PGN file to extract information (i.e. the start and end times and the dates)
with open("./data/pgn_data.pgn", "r") as p:
    pgn_text = p.readlines()
pgn_text[:24]

['[Event "Live Chess"]\n',
 '[Site "Chess.com"]\n',
 '[Date "2023.01.18"]\n',
 '[Round "-"]\n',
 '[White "hfactor13"]\n',
 '[Black "MrEvi10verlord"]\n',
 '[Result "0-1"]\n',
 '[CurrentPosition "r6k/ppp4p/2n1r3/8/1nPp4/1P3p2/P1q5/RK4R1 w - -"]\n',
 '[Timezone "UTC"]\n',
 '[ECO "A01"]\n',
 '[ECOUrl "https://www.chess.com/openings/Nimzowitsch-Larsen-Attack-Modern-Variation"]\n',
 '[UTCDate "2023.01.18"]\n',
 '[UTCTime "02:03:52"]\n',
 '[WhiteElo "660"]\n',
 '[BlackElo "795"]\n',
 '[TimeControl "180"]\n',
 '[Termination "MrEvi10verlord won by checkmate"]\n',
 '[StartTime "02:03:52"]\n',
 '[EndDate "2023.01.18"]\n',
 '[EndTime "02:08:39"]\n',
 '[Link "https://www.chess.com/game/live/67746611751"]\n',
 '\n',
 '1. b3 {[%clk 0:02:55.3]} 1... e5 {[%clk 0:02:59.8]} 2. Nf3 {[%clk 0:02:50.5]} 2... Nc6 {[%clk 0:02:58]} 3. g3 {[%clk 0:02:48.6]} 3... d5 {[%clk 0:02:57.8]} 4. c4 {[%clk 0:02:42.1]} 4... Bc5 {[%clk 0:02:54.4]} 5. Bb2 {[%clk 0:02:35.1]} 5... f6 {[%clk 0:02:46.2]} 6. h4 {[%clk 0:02:27.3]}

In [20]:
# Creates separate dataframes for each type of game played
blitz_games = chess_games[chess_games["time_class"] == "blitz"]
bullet_games = chess_games[chess_games["time_class"] == "bullet"]
daily_games = chess_games[chess_games["time_class"] == "daily"]
rapid_games = chess_games[chess_games["time_class"] == "rapid"]