In [None]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re
import pandas as pd
from custom_chess_utils.utils import *
from umap.parametric_umap import ParametricUMAP
from matplotlib import pyplot as plt
import tensorflow as tf
# Uncomment the next two lines to hide all CUDA devices from TensorFlow (forces CPU execution).
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# One seed for NumPy's and TensorFlow's global RNGs; it is also embedded in the output
# file names below and passed to ParametricUMAP as random_state.
seed = 2
np.random.seed(seed)
tf.random.set_seed(seed)

# Input PGN file. The commented-out assignments are alternative locations kept for reference.
# chess_games_path = '/mnt/d/Work/CG Institute/chess/lichess data/lichess_db_standard_rated_2021-08.pgn'
# chess_games_path = '/pse/notebooks/chess/data/lichess_db_standard_rated_2021-08_subsampled.pgn'
chess_games_path = './data/lichess_db_standard_rated_2021-08_subsampled.pgn'
# chess_games_path = 'data/eco_games.pgn'
# Folder handed to the PGN extraction/storage helpers and used as prefix of the per-game file paths.
pgn_folder = 'games'
# CSV path that is written twice in this notebook: first by write_csv (placeholder embedding),
# later by concat_df.to_csv (final projection).
# umap_path = 'lichess_param_umap_nodup_custom_seed'+str(seed)+'.csv'
umap_path = 'lichess_games_seed'+str(seed)+'_no_duplicate_projection.csv'

### Data loading and preprocessing

#### Extract individual games from PGN file

In [None]:
# Split the PGN file at chess_games_path into individual games (helper comes from the
# star import of custom_chess_utils.utils).
# NOTE(review): lines_to_read=1500000 presumably caps how many lines of the file are read,
# and pgn_folder is presumably an output/work folder — confirm in the helper.
split_games = extract_individual_games_from_pgn(chess_games_path, pgn_folder, lines_to_read=1500000)

#### Filter out games without eval score or without clk

In [None]:
# Replace split_games with the helper's filtered result and print how many games remain.
# NOTE(review): per the helper's name it keeps only games carrying both eval and clk
# annotations — confirm. Re-running this cell filters the already-filtered list.
split_games = get_games_with_eval_and_clk(split_games)
print(len(split_games))

#### keep n games

In [None]:
# Upper bound on the number of games to keep (the trailing 99999 is an alternative value).
n_games = 3000#99999
# Slicing silently returns fewer than n_games items when the list is shorter;
# n_games itself is NOT adjusted in that case, so compare it with the printed length.
split_games = split_games[:n_games]
print(len(split_games))

#### store individual games as PGN files

In [None]:
# Write every kept game to its own PGN file inside pgn_folder.
store_games_as_pgn(split_games, pgn_folder)
# Build one path per game that was actually stored. The count comes from len(split_games)
# rather than n_games: when fewer than n_games games survived the filtering above,
# range(n_games) would yield paths to files that were never written.
# NOTE(review): assumes store_games_as_pgn names files 'game-00001.pgn', 'game-00002.pgn', ...
# (1-based, zero-padded to 5 digits) — confirm in the helper.
game_paths = [
    pgn_folder+'/game-{:05d}.pgn'.format(game_number)
    for game_number in range(1, len(split_games) + 1)
]

#### Loading games from individual PGN files

In [None]:
# Read metadata from the per-game PGN files. The helper returns five values: a checked list
# of paths, the metadata, eval and clk metadata, and the metadata keys (printed below).
# NOTE(review): the exact structure of each return value is defined in custom_chess_utils — confirm.
game_paths_checked, metadata, metadata_evals, metadata_clks, md_keys = get_metadata_from_pgns(game_paths)
print(md_keys)

In [None]:
# Convert the checked PGN files into per-game matrices plus the parsed PGN games;
# no first-move filter is applied (first_moves_filter=None). Prints the number of games converted.
# NOTE(review): each entry of game_matrices is presumably an array of board states for one game — confirm.
game_matrices, games_pgn = game_matrices_from_pgn(pgn_folder, game_paths_checked, first_moves_filter=None)
print(len(game_matrices))

#### keep opening moves only
This uses the list of openings that corresponds to the Lichess dataset: https://github.com/niklasf/chess-openings

In [None]:
# Load the ECO opening table used by the filtering and cut-off cells below.
# NOTE(review): 'ECOs' is presumably a relative folder/file holding the opening data — confirm.
eco_df = get_eco_df('ECOs')

#### add ECO category (A,B,C,D,E,F) to metadata

In [None]:
# Rebind metadata to the helper's output, which per its name adds an opening-category feature.
# NOTE(review): re-running this cell feeds already-augmented metadata back into the helper;
# confirm the helper tolerates that.
metadata = create_opening_categories_feature(metadata)

#### Filter out games whose openings don't exist in our ECO dataset

In [None]:
# Run all per-game collections through filter_unknown_ecos together and rebind them to the
# filtered results, so they stay aligned with each other.
(
    game_matrices,
    metadata,
    metadata_evals,
    metadata_clks,
    games_pgn,
) = filter_unknown_ecos(
    eco_df,
    game_matrices,
    metadata,
    metadata_evals,
    metadata_clks,
    games_pgn,
)

#### Use the ECO dataframe to determine the number of moves n in each game's opening, then cut each game off after n moves

In [None]:
# Rebind game_matrices to the truncated games returned by the helper.
# NOTE(review): this cell is not idempotent in form (input and output share a name);
# presumably truncating an already-truncated game is a no-op — confirm.
game_matrices = cut_off_games_after_opening(game_matrices, eco_df, metadata)

#### get captured pieces throughout games and turns

In [None]:
# Compute capture information from the (already truncated) game matrices and parsed games.
# The helper returns two values; the second one is deliberately discarded.
captures, _ = get_captures(game_matrices, games_pgn)

#### Create dummy csv without embedding to be able to load it as a pandas df

In [None]:
# Placeholder embedding derived from game_matrices; it is only used to write the
# preliminary CSV in the next cell.
# NOTE(review): presumably all zeros with one 2-D point per state — confirm in the helper.
embedding_split = all_zeros_embedding_shape(game_matrices)

In [None]:
# Write the preliminary CSV (placeholder embedding plus metadata, captures, evals and clks)
# to umap_path. The same path is overwritten later by concat_df.to_csv(umap_path).
write_csv(umap_path, md_keys, embedding_split, game_matrices, metadata, captures, metadata_evals, metadata_clks)

#### finding duplicates

In [None]:
# Run duplicate detection on the CSV just written at umap_path. Returns two dataframes:
# NOTE(review): presumably dup_df holds the duplicate rows and drop_dup_df the rows that
# remain after dropping duplicates — confirm in the helper.
dup_df, drop_dup_df = find_duplicates(umap_path)

#### concatenate games into final data that will be projected, remove duplicates

In [None]:
# Row labels of drop_dup_df, used as positional indices into the stacked states.
# NOTE(review): this relies on the CSV row order matching np.concatenate(game_matrices) — confirm.
drop_dup_idx = drop_dup_df.index.array
# Stack all games along axis 0 and keep only the selected rows.
final_data = np.concatenate(game_matrices).take(drop_dup_idx, axis=0)

### Calculating Embeddings / Projections and Writing to Files

#### define the network, create UMAP object

In [None]:
# Input shape of one sample and dimensionality of the projection.
# NOTE(review): presumably 8x8 board squares with 13 channels per square — confirm against
# game_matrices_from_pgn.
dims = (8, 8, 13)
n_components = 2

# Filter counts of the five stride-2 convolutions, in order.
# With padding="same" and stride 2 the spatial size goes 8 -> 4 -> 2 -> 1 -> 1 -> 1,
# so the last two convolutions operate on a 1x1 feature map.
conv_filters = (16, 32, 64, 128, 256)

# NOTE(review): the three Dense(100) layers carry no activation, so stacked they are
# equivalent to a single linear map. Confirm this is intended (the ParametricUMAP
# example encoder uses activation="relu" on its hidden Dense layers).
encoder = tf.keras.Sequential(
    [tf.keras.layers.InputLayer(input_shape=dims)]
    + [
        tf.keras.layers.Conv2D(
            filters=n_filters,
            kernel_size=3,
            strides=(2, 2),
            activation="relu",
            padding="same",
        )
        for n_filters in conv_filters
    ]
    + [tf.keras.layers.Flatten()]
    + [tf.keras.layers.Dense(units=100) for _ in range(3)]
    + [tf.keras.layers.Dense(units=n_components)]
)
encoder.summary()

In [None]:
# Stop training once the training loss has not improved by at least 10**-2
# for 10 consecutive Keras epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=10**-2,
    patience=10,
    verbose=1,
)
# Extra keyword arguments that ParametricUMAP passes through to the Keras fit call.
keras_fit_kwargs = dict(callbacks=[early_stopping])

In [None]:
# pass encoder network to ParametricUMAP
reducer = ParametricUMAP(
    verbose=True,
    keras_fit_kwargs = keras_fit_kwargs,
    encoder=encoder,
    dims=dims,
    random_state=seed,
    n_training_epochs=20)

#### project using parametric UMAP

In [None]:
# Train the parametric UMAP model on the de-duplicated states and return their projection
# (one row per row of final_data). This is the expensive step of the notebook.
umap_embedding = reducer.fit_transform(final_data)

In [None]:
# Plot the training loss recorded on the reducer during fit_transform.
# NOTE(review): `_history` is a private attribute of the reducer and may change between
# umap-learn versions.
fig, ax = plt.subplots()
ax.plot(reducer._history['loss'])
# A title lets the figure stand alone when the notebook is skimmed.
ax.set_title('Parametric UMAP training loss')
ax.set_ylabel('Cross Entropy')
# The trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_xlabel('Epoch');

#### Reshape the resulting embedding into (games, states, 2)

In [None]:
# Regroup the flat embedding per game, using game_matrices as the template.
# NOTE(review): umap_embedding has one row per row of final_data (i.e. after duplicate
# removal), while game_matrices still contains every state — verify that reshape_embedding
# copes with the differing row counts. The result is not used by any later cell in this view.
umap_embedding_split = reshape_embedding(umap_embedding, game_matrices)

#### merge the embeddings with duplicate states and store

In [None]:
# Combine the projection of the de-duplicated states with the duplicate rows into one dataframe.
# NOTE(review): presumably each duplicate row receives the coordinates of its retained twin — confirm.
concat_df = merge_embeddings_with_duplicate_states(dup_df, drop_dup_df, umap_embedding)
# Overwrites the placeholder CSV that write_csv created earlier at the same path.
concat_df.to_csv(umap_path)

#### store the umap model for future embeddings

In [None]:
# Only the Keras encoder is saved, not the ParametricUMAP wrapper (that alternative is
# left commented out below).
# NOTE(review): the target path has no file extension; newer Keras releases require a
# '.keras' or '.h5' suffix for Model.save — confirm against the installed TensorFlow version.
# reducer.save('./parametric_umap_model_eco_no_duplicates_seed'+str(seed))
encoder.save('./parametric_umap_model_eco_no_duplicates_seed'+str(seed))

In [None]:
# model = tf.keras.models.load_model('./test_save_model')

In [None]:
# loaded_reducer = ParametricUMAP(
#     encoder=model,
#     dims=dims,
#     random_state=seed)

In [None]:
# from umap.parametric_umap import load_ParametricUMAP
# embedder = load_ParametricUMAP('parametric_umap_model_eco_no_duplicates_seed2')

In [None]:
# test_embedding = loaded_reducer.transform(final_data.reshape(-1,8,8,13))

In [None]:
# import pickle
# print(pickle.format_version)
# print(tf.__version__)