In [1]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re
import pandas as pd
from custom_chess_utils.utils import *
import umap.umap_ as umap
from matplotlib import pyplot as plt
# Reproducibility: the same seed is handed to NumPy here and to t-SNE / UMAP further down.
seed = 0
np.random.seed(seed)

# Input PGN dump. NOTE(review): machine-specific absolute path; adjust when running elsewhere.
chess_games_path = '/mnt/d/Work/CG Institute/chess/lichess data/lichess_db_standard_rated_2021-08.pgn'
# Folder used for the individual per-game PGN files.
pgn_folder = 'games'
# Output CSV files; the seed is embedded in the file name.
tsne_path = f'lichess_project_captures_tsne_seed{seed}.csv'
umap_path = f'lichess_project_captures_umap_seed{seed}.csv'

2022-01-13 10:55:29.435333: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-13 10:55:29.435397: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Data loading and preprocessing

#### Extract individual games from PGN file

In [2]:
# Split the PGN dump into individual games.
# NOTE(review): lines_to_read presumably caps how much of the dump is parsed — confirm in custom_chess_utils.
split_games = extract_individual_games_from_pgn(
    chess_games_path,
    pgn_folder,
    lines_to_read=800_000,
)

#### Filter out games without eval scores or without clock (clk) annotations

In [3]:
# Replace split_games with the subset returned by the eval/clk filter helper,
# then report how many games remain.
split_games = get_games_with_eval_and_clk(split_games)
print(len(split_games))

2228


#### keep n games

In [4]:
# Upper bound on the number of games carried through the rest of the notebook.
n_games = 250
# Slicing keeps at most n_games entries; the result is shorter if fewer games are available.
split_games = split_games[:n_games]
print(len(split_games))

250


#### store individual games as PGN files

In [5]:
# Write every retained game into pgn_folder.
store_games_as_pgn(split_games, pgn_folder)
# Build the path list from the games actually kept rather than from n_games:
# split_games[:n_games] holds fewer than n_games entries when fewer games passed the
# filter, and range(n_games) would then name files that were never written.
# NOTE(review): assumes store_games_as_pgn numbers its files game-00001.pgn upward — confirm.
game_paths = [
    f'{pgn_folder}/game-{game_no:05d}.pgn'
    for game_no in range(1, len(split_games) + 1)
]

#### Loading games from individual PGN files

In [6]:
# Read the per-game PGN files; the helper returns five values:
# the checked path list, the metadata, the eval and clk series, and the metadata keys.
(
    game_paths_checked,
    metadata,
    metadata_evals,
    metadata_clks,
    md_keys,
) = get_metadata_from_pgns(game_paths)
print(md_keys)

['Termination', 'Round', 'Black', 'WhiteElo', 'Result', 'UTCTime', 'White', 'Opening', 'UTCDate', 'BlackElo', 'TimeControl', 'Event', 'Site', 'ECO', 'Date']


In [7]:
# Build one matrix per game from the checked PGN paths; no first-move filter is applied.
game_matrices, games_pgn = game_matrices_from_pgn(
    pgn_folder,
    game_paths_checked,
    first_moves_filter=None,
)
print(len(game_matrices))

250


#### keep opening moves only
The list of openings comes from https://github.com/niklasf/chess-openings, which corresponds to the openings used in the lichess dataset.

In [8]:
# Load the ECO opening table; 'ECOs' is the argument handed to the helper
# (presumably a folder or file name — confirm in custom_chess_utils).
eco_df = get_eco_df('ECOs')

#### add ECO category (A,B,C,D,E,F) to metadata

In [9]:
# Replace metadata with the version returned by the opening-category helper.
# Note: the cell overwrites its own input, so it is applied again on every re-run.
metadata = create_opening_categories_feature(metadata)

#### filter games with Openings that don't exist in our ECO dataset

In [10]:
# Filter all per-game collections together against eco_df so that they stay aligned
# with each other; every input collection is replaced by its filtered counterpart.
(
    game_matrices,
    metadata,
    metadata_evals,
    metadata_clks,
    games_pgn,
) = filter_unknown_ecos(
    eco_df,
    game_matrices,
    metadata,
    metadata_evals,
    metadata_clks,
    games_pgn,
)

#### Use the ECO dataframe to determine the number of moves n in each game's opening, then cut each game off after n moves

In [11]:
# Replace game_matrices with the truncated version produced by the cut-off helper,
# which receives the ECO table and the per-game metadata.
game_matrices = cut_off_games_after_opening(
    game_matrices,
    eco_df,
    metadata,
)

#### get captured pieces throughout games and turns

In [12]:
# The helper returns a pair; only the first element (the captures) is used below,
# the second one is discarded.
captures, _ = get_captures(
    game_matrices,
    games_pgn,
)

#### concatenate games into final data that will be projected

In [13]:
# Join all per-game matrices along the first axis (np.concatenate's default axis=0)
# into one array that serves as the projection input.
final_data = np.concatenate(game_matrices)

#### Concatenate the captured-pieces data onto the chess board data so that it is also used for the projection

In [14]:
# Append the capture data column-wise (axis=1) to the board data so both feed the projection.
# The board part is rebuilt from game_matrices instead of being read back from final_data,
# so re-running this cell cannot append the capture columns a second time.
final_data = np.concatenate(
    [np.concatenate(game_matrices), np.concatenate(captures)],
    axis=1,
)

### Calculating Embeddings / Projections and Writing to Files

#### project using t-sne

In [15]:
# t-SNE configuration; seeded with the notebook-wide seed.
# NOTE(review): n_jobs=6 is tied to the machine this was run on — adjust to the available cores.
tsne = TSNE(
    metric='euclidean',
    perplexity=200,
    random_state=seed,
    n_jobs=6,
    verbose=True,
)

In [16]:
%time embedding = tsne.fit(np.array(final_data))

--------------------------------------------------------------------------------
TSNE(n_jobs=6, perplexity=200, random_state=0, verbose=True)
--------------------------------------------------------------------------------
===> Finding 600 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 0.84 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.26 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.09 seconds
===> Running optimization with exaggeration=12.00, lr=200.00 for 250 iterations...
Iteration   50, KL divergence 1.7260, 50 iterations in 0.3235 sec
Iteration  100, KL divergence 1.7435, 50 iterations in 0.3409 sec
Iteration  150, KL divergence 1.7406, 50 iterations in 0.3211 sec
Iteration  200, KL divergence 1.7225, 50 iterations in 0.3500 sec
Iteration  250, KL divergence 1.7180, 50 iterations in 0.3186 sec
   --> Time elapsed: 1.66 seconds
===> Running optimization with exaggeration=1.00, lr=

#### reshape resulting embedding into games, states, 2

In [17]:
# Regroup the flat t-SNE embedding per game; game_matrices is passed alongside
# (presumably to supply the number of states per game — confirm in custom_chess_utils).
embedding_split = reshape_embedding(embedding, game_matrices)

#### Create the resulting CSV file from all metadata and the embedding

In [18]:
# Write the t-SNE result together with the metadata, captures, evals and clks to tsne_path.
write_csv(
    tsne_path,
    md_keys,
    embedding_split,
    game_matrices,
    metadata,
    captures,
    metadata_evals,
    metadata_clks,
)

['Termination', 'Round', 'Black', 'WhiteElo', 'Result', 'UTCTime', 'White', 'Opening', 'UTCDate', 'BlackElo', 'TimeControl', 'Event', 'Site', 'ECO', 'Date']


#### project using umap

In [19]:
# UMAP with library defaults apart from the notebook-wide seed and verbose logging.
reducer = umap.UMAP(
    random_state=seed,
    verbose=True,
)

In [20]:
# Fit UMAP on the same combined board + capture data that was used for t-SNE.
umap_embedding = reducer.fit_transform(final_data)

UMAP(dens_frac=0.0, dens_lambda=0.0, random_state=0, verbose=True)
Construct fuzzy simplicial set
Thu Jan 13 10:55:56 2022 Finding Nearest Neighbors
Thu Jan 13 10:55:59 2022 Finished Nearest Neighbor Search
Thu Jan 13 10:56:01 2022 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Jan 13 10:56:10 2022 Finished embedding


#### reshape resulting embedding into games, states, 2

In [21]:
# Regroup the flat UMAP embedding per game, mirroring the t-SNE step.
# NOTE(review): umap_embedding_split is not referenced by any later cell shown in this notebook.
umap_embedding_split = reshape_embedding(umap_embedding, game_matrices)

#### use existing csv and simply overwrite x,y before storing to new file

In [22]:
# Reuse the t-SNE CSV as a template and store the result, with UMAP coordinates, at umap_path.
# NOTE(review): the flat umap_embedding is passed here, not umap_embedding_split — confirm
# that this is the layout store_embedding_over_existing_file expects.
store_embedding_over_existing_file(
    tsne_path,
    umap_embedding,
    umap_path,
)