In [1]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re
import pandas as pd
from custom_chess_utils.utils import *
from umap.parametric_umap import ParametricUMAP
from matplotlib import pyplot as plt
import tensorflow as tf
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Seed NumPy's global RNG and TensorFlow with the same value.
seed = 2
np.random.seed(seed)
tf.random.set_seed(seed)

# Player name passed to get_games_played_by and embedded in the output CSV name.
player_name = 'AminG92'

# Input PGN file; the commented-out lines are alternative locations.
# chess_games_path = '/mnt/d/Work/CG Institute/chess/lichess data/lichess_db_standard_rated_2021-08.pgn'
# chess_games_path = '/pse/notebooks/chess/data/lichess_db_standard_rated_2021-08_subsampled.pgn'
chess_games_path = './data/lichess_db_standard_rated_2021-08_subsampled.pgn'
# chess_games_path = 'data/eco_games.pgn'
# Folder that the per-game PGN paths are built from.
pgn_folder = 'games'
# umap_path = 'lichess_param_umap_nodup_custom_seed'+str(seed)+'.csv'
# Output CSV path handed to write_csv. Note that no separator is inserted
# between the 'lichess_games_player_data' prefix and the player name.
umap_path = 'lichess_games_player_data'+player_name+'.csv'

Num GPUs Available:  1


### Data loading and preprocessing

#### Extract individual games from PGN file

In [2]:
# split_games is rebound by the filtering cells below, so re-run from this cell
# after changing any filter.
# NOTE(review): lines_to_read presumably caps how much of the PGN file is
# parsed -- confirm in custom_chess_utils.utils.
split_games = extract_individual_games_from_pgn(chess_games_path, pgn_folder, lines_to_read=1500000)

#### Filter out games without eval score or without clk (currently disabled: the code below is commented out)

In [3]:
# split_games = get_games_with_eval_and_clk(split_games)
# print(len(split_games))

#### Filter games based on player name

In [4]:
# Print the game count before and after keeping only player_name's games.
# split_games is overwritten, so re-running this cell on its own operates on
# the already-filtered list rather than on the full extraction result.
print(len(split_games), 'games before filtering for player', player_name)
split_games = get_games_played_by(split_games, player_name)
print(len(split_games), 'games after filtering for player', player_name)

74960 games before filtering for player AminG92
19 games after filtering for player AminG92


#### keep n games

In [5]:
# n_games is an upper bound on the number of games kept, not the actual count:
# the slice below leaves split_games unchanged when it is shorter than n_games.
n_games = 99999
split_games = split_games[:n_games]
print(len(split_games))

19


#### store individual games as PGN files

In [6]:
# Persist the remaining games into pgn_folder.
# NOTE(review): the files are assumed to be named game-00001.pgn, game-00002.pgn, ...
# to match the pattern below -- confirm in store_games_as_pgn.
store_games_as_pgn(split_games, pgn_folder)
# Build exactly one path per stored game. n_games is only an upper bound
# (99999), so iterating over it would list paths for games that were never
# written and could pick up stale files left in pgn_folder by earlier runs.
game_paths = [
    pgn_folder+'/game-{:05d}.pgn'.format(game_number)
    for game_number in range(1, len(split_games) + 1)
]

#### Loading games from individual PGN files

In [7]:
# The cells below use game_paths_checked rather than game_paths.
# NOTE(review): game_paths_checked is presumably the subset of game_paths that
# could actually be read -- confirm in get_metadata_from_pgns.
game_paths_checked, metadata, metadata_evals, metadata_clks, md_keys = get_metadata_from_pgns(game_paths)
# md_keys is later passed to write_csv; print it to inspect the collected keys.
print(md_keys)

['White', 'BlackElo', 'Round', 'UTCDate', 'Event', 'ECO', 'UTCTime', 'Result', 'Black', 'WhiteRatingDiff', 'Site', 'Opening', 'Date', 'WhiteElo', 'TimeControl', 'BlackRatingDiff', 'Termination']


In [8]:
# NOTE(review): first_moves_filter=None presumably disables filtering by the
# games' first moves -- confirm in game_matrices_from_pgn.
game_matrices, games_pgn = game_matrices_from_pgn(pgn_folder, game_paths_checked, first_moves_filter=None)
# Print how many game matrices were produced.
print(len(game_matrices))

19


#### Keep opening moves only
This step uses the list of openings that corresponds to the Lichess dataset: https://github.com/niklasf/chess-openings

In [9]:
# eco_df is read by the ECO filtering and opening cut-off cells below.
# NOTE(review): 'ECOs' is presumably the location of the opening tables --
# confirm in get_eco_df.
eco_df = get_eco_df('ECOs')

#### add ECO category (A,B,C,D,E,F) to metadata

In [10]:
# Rebinds metadata to the helper's return value; later cells see the new object.
metadata = create_opening_categories_feature(metadata)

#### filter games with Openings that don't exist in our ECO dataset

In [11]:
# All five per-game collections are passed in and rebound together.
# NOTE(review): presumably so they stay index-aligned after games are dropped
# -- confirm in filter_unknown_ecos.
game_matrices, metadata, metadata_evals, metadata_clks, games_pgn = filter_unknown_ecos(eco_df, game_matrices, metadata, metadata_evals, metadata_clks, games_pgn)

#### Use the ECO dataframe to determine the number of moves n in each game's opening, then cut off each game after n moves

In [12]:
# Only game_matrices is rebound here; eco_df and metadata are passed as inputs.
game_matrices = cut_off_games_after_opening(game_matrices, eco_df, metadata)

#### get captured pieces throughout games and turns

In [13]:
# get_captures returns two values; only the first is kept, the second is discarded.
captures, _ = get_captures(game_matrices, games_pgn)

#### Create dummy csv without embedding to be able to load it as a pandas df

In [14]:
# NOTE(review): presumably a zero-filled placeholder embedding shaped to match
# game_matrices, going by the helper's name -- confirm in all_zeros_embedding_shape.
embedding_split = all_zeros_embedding_shape(game_matrices)

In [15]:
# Passes the placeholder embedding together with the per-game data to write_csv.
# NOTE(review): presumably writes one combined CSV at umap_path -- confirm in write_csv.
write_csv(umap_path, md_keys, embedding_split, game_matrices, metadata, captures, metadata_evals, metadata_clks)