In [1]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re
import pandas as pd

# FILE_NAME = '/mnt/d/Work/CG Institute/chess/lichess data/lichess_db_standard_rated_2021-08.pgn'
FILE_NAME = '/mnt/d/Work/CG Institute/path explorer/PSE/projection-space-explorer/notebooks/chess/data/eco_games.pgn'

In [2]:
def state_to_vector(state):
    piece_dict = {
        'wr': [1,0,0,0,0,0,0,0,0,0,0,0,0],
        'wn': [0,1,0,0,0,0,0,0,0,0,0,0,0],
        'wb': [0,0,1,0,0,0,0,0,0,0,0,0,0],
        'wk': [0,0,0,1,0,0,0,0,0,0,0,0,0],
        'wq': [0,0,0,0,1,0,0,0,0,0,0,0,0],
        'wp': [0,0,0,0,0,1,0,0,0,0,0,0,0],
        'br': [0,0,0,0,0,0,1,0,0,0,0,0,0],
        'bn': [0,0,0,0,0,0,0,1,0,0,0,0,0],
        'bb': [0,0,0,0,0,0,0,0,1,0,0,0,0],
        'bk': [0,0,0,0,0,0,0,0,0,1,0,0,0],
        'bq': [0,0,0,0,0,0,0,0,0,0,1,0,0],
        'bp': [0,0,0,0,0,0,0,0,0,0,0,1,0],
        '':   [0,0,0,0,0,0,0,0,0,0,0,0,1],
    }    
    state_list = list(state.values())    
    vector = []
    for piece in state_list:
        vector.append(piece_dict[piece])
    return np.array(vector).ravel()

def vector_to_state(vector):
    vec_dict = {
        '1000000000000': "wr",
        '0100000000000': "wn",
        '0010000000000': "wb",
        '0001000000000': "wk",
        '0000100000000': "wq",
        '0000010000000': "wp",
        '0000001000000': "br",
        '0000000100000': "bn",
        '0000000010000': "bb",
        '0000000001000': "bk",
        '0000000000100': "bq",
        '0000000000010': "bp",
        '0000000000001': ""
    }
    
    return vec_dict[vector]

In [3]:
def game_to_vectors(file):
    game = chess.ChessGame(file)
    vectors = [state_to_vector(game.state)]
    while not game.is_finished:
        try:
            game.next()
        except:
            pass
        vectors.append(state_to_vector(game.state))
    return np.stack(vectors)

In [4]:
def get_moves_from_pgn(pgn, keep_x=False):
    with open(pgn) as p:
        data = p.read()
        data = re.sub(r'\{.*?\}', '', data) # Removes pgn comments
        data = re.sub(r'\[.*?\]', '', data) # removes metadata
        moves = re.findall(
            r'[a-h]x?[a-h]?[1-8]=?[BKNRQ]?|O-O-?O?|[BKNRQ][a-h1-8]?[a-h1-8]?x?[a-h][1-8]',
            data)
        if keep_x:
            return moves
        else:
            return [move.replace('x', '') for move in moves]

# method for retrieving all metadata

In [5]:
def get_metadata_from_pgn(pgn):
    with open(pgn) as p:
        data = p.read()
        
        # per mvoe metadata
        evals = re.findall(r'\[%eval (.*?)\]',data)
        clks = re.findall(r'\[%clk (.*?)\]',data)
        # add metadata for before first move
        evals = ['0']+evals
        clks = ['0:00:00'] + clks
        
        # per game metadata
        data = re.sub(r'\{.*?\}', '', data)  # Removes pgn comments
        m = re.findall(r'\[(.*) "(.*)"]',data)
        metadata_keys = [i[0] for i in m]
        metadata_values = [i[1].replace(',',';') for i in m]
        
        return dict(zip(metadata_keys, metadata_values)), evals, clks

In [6]:
print("start loading")

start loading


# Extract individual games from PGN file

In [7]:
lines_to_read = 800000

In [8]:
from pathlib import Path
Path("games").mkdir(parents=True, exist_ok=True)

with open(FILE_NAME, 'r') as f:
    all_games = ''.join([f.readline() for i in range(lines_to_read)])

span = 2
all_games = all_games.split("\n\n")
split_games  = ["\n\n".join(all_games[i:i+span]) for i in range(0, len(all_games), span)]
print(split_games[-2])
print(len(split_games))

[Event "N/A"]
[Site "N/A"]
[Date "??"]
[Round "-"]
[White "White"]
[Black "Black"]
[Result "*"]
[ECO "E99"]
[Opening "King's Indian Defense: Orthodox Variation, Classical System, Benko Attack"]

1. d4 Nf6 2. c4 g6 3. Nc3 Bg7 4. e4 d6 5. Nf3 O-O 6. Be2 e5 7. O-O Nc6 8. d5 Ne7 9. Ne1 Nd7 10. f3 f5 11. g4
3397


## filter out games without eval score or without clk

In [9]:
# # only games that contain clk and eval metadata for moves
# filtered = []
# for game in split_games:
#     if '%eval' in game and '%clk' in game:
#         filtered.append(game)
# split_games = filtered

# # sometimes there are individual turns that only have a clk but no eval, discard those games
# filtered = []
# for game in split_games:
#     m = re.findall(r'{ \[%clk(.*?) (.*?)\] }', game)
#     if len(m) == 0:
#         filtered.append(game)
# split_games = filtered

# print(len(split_games))

## keep n games

In [10]:
# n_games = 2000
# split_games = split_games[:n_games]
# print(len(split_games))
# print(split_games[-1])

## store individual games as PGN files

In [11]:
for i in range(len(split_games)):
    with open('games/game-{:05d}.pgn'.format(i+1),'w') as f:
        f.write(split_games[i])

In [12]:
notrandgames = ['games/game-{:05d}.pgn'.format(n+1) for n in range(10000)]
print(len(notrandgames))

10000


# Loading games from individual PGN files

In [13]:
notrandgames_checked = []
metadata = []
metadata_evals = []
metadata_clks = []
old_md_keys = None
for id, g in enumerate(notrandgames):
    try:
        game_to_vectors(g)
    except:
        pass
    else:
        notrandgames_checked.append((id,g))
        metadata_dict, evals, clks = get_metadata_from_pgn(g)
        # get least common denominator among keys in all samples such that there aren't outlier samples that have more metadata than others
        md_keys = [k for k in metadata_dict]
        if old_md_keys:
            md_keys = list(set(md_keys).intersection(old_md_keys))
        old_md_keys = md_keys
        metadata.append(metadata_dict)
        metadata_evals.append(evals)
        metadata_clks.append(clks)
        
# remove outlier metadata such that only shared metadata among all samples remains
for d in metadata:
    keys = [k for k in d]
    dif = list(set(keys) - set(md_keys))
    for k in dif:
        d.pop(k)
        
print(md_keys)
print(metadata[:100])

['Site', 'Date', 'Event', 'Black', 'Round', 'White', 'Result', 'ECO', 'Opening']
[{'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amar Gambit'}, {'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amar Opening'}, {'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amar Opening: Gent Gambit'}, {'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amar Opening: Paris Gambit'}, {'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amsterdam Attack'}, {'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00'

In [14]:
d1 = {'a': 1, 'b': 2, 'c': 3}
d2 = {'a': 1, 'b': 2}
d1k = [k for k in d1]
d2k = [k for k in d2]
keys = list(set(d1k).difference(d2k))
print(keys)
for k in keys:
    d1.pop(k)
print(d1)
    

['c']
{'a': 1, 'b': 2}


In [15]:
# add all games regardless of first move
firstmoves = [(g[0],get_moves_from_pgn(g[1])[0]) for g in notrandgames_checked]
indices = []
for idx, fm in firstmoves:
    indices.append(idx)
games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(indices)]
game_matrices = [game_to_vectors(g) for g in games]

### handling an error where the last 2 states of each game are equivalent - remove redundant one

In [16]:
# game matrices is num_games * num_turns * 832 
for game in range(len(game_matrices)):
    # check for each game whether last 2 game states are equivalent
    if np.all(game_matrices[game][-2] == game_matrices[game][-1]):
        # if so, remove the last state
        game_matrices[game] = game_matrices[game][:-1]

## keep opening moves only
using the list of openings that corresponds to the lichess dataset https://github.com/niklasf/chess-openings

In [17]:
eco_a_df = pd.read_csv('ECos/a.tsv', sep='\t', header=0)
eco_b_df = pd.read_csv('ECos/b.tsv', sep='\t', header=0)
eco_c_df = pd.read_csv('ECos/c.tsv', sep='\t', header=0)
eco_d_df = pd.read_csv('ECos/d.tsv', sep='\t', header=0)
eco_e_df = pd.read_csv('ECos/e.tsv', sep='\t', header=0)

In [18]:
eco_df = pd.concat([eco_a_df, eco_b_df, eco_c_df, eco_d_df, eco_e_df])

In [19]:
print(metadata[111]['Opening'])
moves = eco_df.loc[eco_df['name'] == metadata[111]['Opening'].replace(';',',')]['pgn'].iloc[0]
pattern = r'.\..'
print(moves)
print(moves.count(' '))
moves = (re.sub(pattern, '', moves))
print(moves)
print(len(moves.split(' ')))

Van Geet Opening: Düsseldorf Gambit
1. Nc3 c5 2. b4
4
Nc3 c5 b4
3


In [20]:
print(game_matrices[0])
print(metadata[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
{'Event': 'N/A', 'Site': 'N/A', 'Date': '??', 'Round': '-', 'White': 'White', 'Black': 'Black', 'Result': '*', 'ECO': 'A00', 'Opening': 'Amar Gambit'}


### add ECO category (A,B,C,D,E,F) to metadata

In [21]:
for m in metadata:
    if m['ECO'][0] == 'A':
        m['Opening Category'] = 'A - Flank Opening'
    elif m['ECO'][0] == 'B':
        m['Opening Category'] = 'B - Semi-Open Games other than the French Defense'
    elif m['ECO'][0] == 'C':
        m['Opening Category'] = 'C - Open Games and the French Defense'
    elif m['ECO'][0] == 'D':
        m['Opening Category'] = 'D - Closed Games and Semi-Closed Games'
    elif m['ECO'][0] == 'E':
        m['Opening Category'] = 'E - Indian Defenses'

### filter games with Openings that don't exist in our ECO dataset

In [22]:
pattern = r'.\..'
to_delete = []
for id, gm in enumerate(game_matrices):
    filtered_df = eco_df.loc[eco_df['name'] == metadata[id]['Opening'].replace(';',',')]
    if filtered_df.empty:
        to_delete += [id]

for idx in sorted(to_delete, reverse=True):
    del game_matrices[idx]
    del metadata[idx]
    del metadata_evals[idx]
    del metadata_clks[idx]
    del games[idx]

### using eco dataframe to determine amount n of moves in the corresponding opening, cutting off each game after n

In [23]:
# pattern used to remove turn numbers such that we can determine amount of moves
pattern = r'.\..'
for id, gm in enumerate(game_matrices):
#     print(metadata[id]['Opening'].replace(';',','))
    filtered_df = eco_df.loc[eco_df['name'] == metadata[id]['Opening'].replace(';',',')]
    moves = filtered_df['pgn'].iloc[0]
    moves = (re.sub(pattern, '', moves))
    n_moves = len(moves.split(' '))
    # +1 because the 0th is before any moves have happened
    game_matrices[id] = game_matrices[id][:n_moves+1]

In [24]:
print(game_matrices[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
final_data = np.concatenate(game_matrices)

In [26]:
tsne = TSNE(
    perplexity=200,
    n_jobs=6,
    metric='euclidean',
    random_state=42
)

In [27]:
# %time embedding = tsne.fit(np.array(final_data))

In [28]:
# embedding_split = np.array_split(embedding, np.add.accumulate([len(l) for l in game_matrices]))[:-1]

In [29]:
# add empty coordinates for further processing instead of calculating t-sne
embedding_split = []
for i, game in enumerate(game_matrices):
    embedding_split += [[[0, 0] for move in game]]

In [30]:
# fig = plt.figure(figsize=(15,15))
# ax = fig.add_subplot(111)
# ax.set_aspect(1)
# for game in embedding_split[:10]:
#     tck, u = interpolate.splprep(game.transpose(), s=0)
#     unew = np.arange(0, 1.01, 0.01)
#     out = interpolate.splev(unew, tck)
#     ax.plot(out[0], out[1], '-r', alpha=0.03, color='red')
#     ax.scatter(game[:,0], game[:,1], s=0.1, color='red')
# #for game in embedding_split[10:800]:
# #    tck, u = interpolate.splprep(game.transpose(), s=0)
# #    unew = np.arange(0, 1.01, 0.01)
# #    out = interpolate.splev(unew, tck)
# #    ax.plot(out[0], out[1], '-r', alpha=0.03, color='blue')
# #    ax.scatter(game[:,0], game[:,1], s=0.1, color='blue')
# plt.xlim((-40,50));
# plt.ylim((-60,40));
# #for game in embedding_split[100:]:
# #    ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='blue')


# write header
csv = open("lichess_tsne.csv", "w")
features = "x,y,line,cp,algo,player,age,"
print(md_keys)
features += ','.join(md_keys)
features += ",eval,clk,a8,b8,c8,d8,e8,f8,g8,h8,a7,b7,c7,d7,e7,f7,g7,h7,a6,b6,c6,d6,e6,f6,g6,h6,a5,b5,c5,d5,e5,f5,g5,h5,a4,b4,c4,d4,e4,f4,g4,h4,a3,b3,c3,d3,e3,f3,g3,h3,a2,b2,c2,d2,e2,f2,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1"
csv.write(features)
csv.write("\n")
idx = 0

# for gameIndex, game in enumerate(embedding_split[:450]):
for gameIndex, game in enumerate(embedding_split):
    pi = 0
    for idx, pos in enumerate(game):
        csv.write(str(pos[0]))
        csv.write(",")
        csv.write(str(pos[1]))
        
        # number of game
        csv.write(",")
        csv.write(str(gameIndex))
        
        # checkpoint
        csv.write(",")
        if idx == 0:
            csv.write("1")
        elif idx == len(game) - 1:
            csv.write("1")
        else:
            csv.write("0")
        
        
        
        # 'algo', i.e., path coloring method
        csv.write(",")
        # using opening category from A through E
        csv.write(metadata[gameIndex]['Opening Category'])
        # in this case the winner
        # winner = metadata[gameIndex]['Result']
        # winner = winner.replace('1-0', metadata[gameIndex]['White'])
        # winner = winner.replace('0-1', metadata[gameIndex]['Black'])
        # csv.write(winner)

        # player - whose turn is it
        csv.write(",")
        if idx % 2 == 0:
            csv.write(metadata[gameIndex]['Black'])
        else:
            csv.write(metadata[gameIndex]['White'])
        
        # age
        csv.write(",")
        csv.write(str(idx))
        csv.write(",")
        
        # per game metadata
        md_values = [metadata[gameIndex][k] for k in md_keys]
        csv.write(','.join(md_values))
        # write metadata gameIndex idx %clk and %eval
        
        # per move metadata
        csv.write(',')
        if 0 <= idx < len(metadata_evals[gameIndex]):
            csv.write(metadata_evals[gameIndex][idx])
        else:
            csv.write('N/A')
        csv.write(',')
        if 0 <= idx < len(metadata_clks[gameIndex]):
            csv.write(metadata_clks[gameIndex][idx])
        else:
            csv.write('N/A')
        
        for n in range(0, 64):
            csv.write(",")
            str1 = ''.join(str(e) for e in game_matrices[gameIndex][idx][n * 13: (n+1) * 13])
            csv.write(vector_to_state(str1))
            
        csv.write("\n")
    gameIndex += 1
        
csv.close()

['Site', 'Date', 'Event', 'Black', 'Round', 'White', 'Result', 'ECO', 'Opening']


In [None]:
from umap.parametric_umap import ParametricUMAP
from matplotlib import pyplot as plt
np.random.seed(2)

In [None]:
# define the network
import tensorflow as tf
tf.random.set_seed(2)
dims = (8, 8, 13)
n_components = 2
encoder = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=dims),
    tf.keras.layers.Conv2D(
        filters=16, kernel_size=3, strides=(2, 2), activation="relu", padding="same"
    ),
    tf.keras.layers.Conv2D(
        filters=32, kernel_size=3, strides=(2, 2), activation="relu", padding="same"
    ),
    tf.keras.layers.Conv2D(
        filters=64, kernel_size=3, strides=(2, 2), activation="relu", padding="same"
    ),
    tf.keras.layers.Conv2D(
        filters=128, kernel_size=3, strides=(2, 2), activation="relu", padding="same"
    ),
    tf.keras.layers.Conv2D(
        filters=256, kernel_size=3, strides=(2, 2), activation="relu", padding="same"
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=100),
    tf.keras.layers.Dense(units=100),
    tf.keras.layers.Dense(units=100),
    tf.keras.layers.Dense(units=n_components),
])
encoder.summary()

In [None]:
keras_fit_kwargs = {"callbacks": [
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        min_delta=10**-2,
        patience=10,
        verbose=1,
    )
]}

In [None]:
# pass encoder network to ParametricUMAP
reducer = ParametricUMAP(
    verbose=True,
    keras_fit_kwargs = keras_fit_kwargs,
    encoder=encoder,
    dims=dims,
    random_state=2,
    n_training_epochs=20)

In [None]:
print(final_data.shape)

In [None]:
import pandas as pd
df = pd.read_csv('lichess_tsne.csv', header=0, index_col=False)
df.reset_index(drop=True, inplace=True)
df.head()

### list of features (fields/pieces of the chessboard)

In [None]:
features = ['a8','b8','c8','d8','e8','f8','g8','h8','a7','b7','c7','d7','e7','f7','g7','h7','a6','b6','c6','d6','e6','f6','g6','h6','a5','b5','c5','d5','e5','f5','g5','h5','a4','b4','c4','d4','e4','f4','g4','h4','a3','b3','c3','d3','e3','f3','g3','h3','a2','b2','c2','d2','e2','f2','g2','h2','a1','b1','c1','d1','e1','f1','g1','h1']

### finding duplicates

In [None]:
dup_df = df[df.duplicated(features)]
print(len(dup_df))

### df without duplicates

In [None]:
drop_dup_df = df.drop_duplicates(subset=features)
print(len(drop_dup_df))

### filter numpy array final_data using filtered df without duplicates

In [None]:
# get indices from drop_dup_df
drop_dup_idx = drop_dup_df.index.array

filtered_final_data = np.take(final_data, drop_dup_idx, axis=0)
print(filtered_final_data.shape)

### calculate embedding

In [None]:
umap_embedding = reducer.fit_transform(filtered_final_data)

In [None]:
fig, ax = plt.subplots()
ax.plot(reducer._history['loss'])
ax.set_ylabel('Cross Entropy')
ax.set_xlabel('Epoch')

### add embedding results as x,y into filtered df

In [None]:
drop_dup_df=drop_dup_df.drop(['x', 'y'], axis=1)
drop_dup_df.insert(0, "x", umap_embedding[:,0], True)
drop_dup_df.insert(1, "y", umap_embedding[:,1], True)
drop_dup_df.head()

### build a dictionary of state->coords, iterating through filtered df

In [None]:
state_to_coords = {}

for i, row in drop_dup_df.iterrows():
    values = row[features]
    k = ''.join([str(x) for x in values.tolist()])
    state_to_coords[k] = [row['x'], row['y']]

### add coords to duplicates df using dict

In [None]:
dup_df=dup_df.drop(['x', 'y'], axis=1)
xs = []
ys = []
for i, row in dup_df.iterrows():
    values = row[features]
    k = ''.join([str(x) for x in values.tolist()])
    x, y = state_to_coords[k]
    xs += [x]
    ys += [y]
dup_df.insert(0, 'x', xs, True)
dup_df.insert(1, 'y', ys, True)
dup_df.head()

### concatenating filtered df with df of duplicates after they have been updated by x,y

In [None]:
conc_df = pd.concat([drop_dup_df, dup_df]).sort_index()
conc_df.head()

In [None]:
# umap_embedding_split = np.array_split(umap_embedding, np.add.accumulate([len(l) for l in game_matrices]))[:-1]

In [None]:
conc_df.to_csv('lichess_umap_seed0_no_duplicate_projection_eco_games.csv')

In [None]:
# fig = plt.figure(figsize=(8,8))
# ax = fig.add_subplot(111)
# ax.set_aspect(1)
# for game in umap_embedding_split[:100]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='red')
# for game in umap_embedding_split[100:]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='blue')