In [1]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re

# Combined PGN file; a later cell reads it and splits it into one file per game under games/.
FILE_NAME = '../alphazero_vs_stockfish_all.pgn'

In [2]:
def state_to_vector(state):
    """One-hot encode a board state into a flat NumPy vector.

    Every value of ``state`` (taken in the mapping's iteration order) must be
    one of the twelve piece codes (``'wr'``, ``'bn'``, ...) or ``''`` for an
    empty square.  Each value becomes a 13-element one-hot row and the rows
    are flattened into a single 1-D array of length ``13 * len(state)``.

    Raises:
        KeyError: if a value is not a known piece code.
    """
    # The position of a code in this tuple is the index of its hot bit.
    piece_order = ('wr', 'wn', 'wb', 'wk', 'wq', 'wp',
                   'br', 'bn', 'bb', 'bk', 'bq', 'bp', '')
    width = len(piece_order)
    one_hot = {
        code: [1 if slot == hot else 0 for slot in range(width)]
        for hot, code in enumerate(piece_order)
    }
    rows = [one_hot[piece] for piece in state.values()]
    return np.array(rows).ravel()

def vector_to_state(vector):
    """Decode one square's one-hot string back into its piece code.

    ``vector`` is the 13-character string form of a one-hot row, e.g.
    ``'1000000000000'`` -> ``'wr'`` and ``'0000000000001'`` -> ``''``
    (empty square).

    Raises:
        KeyError: if ``vector`` is not one of the thirteen one-hot strings.
    """
    # The position of a code in this tuple is the index of its '1' character.
    piece_order = ('wr', 'wn', 'wb', 'wk', 'wq', 'wp',
                   'br', 'bn', 'bb', 'bk', 'bq', 'bp', '')
    width = len(piece_order)
    decode = {
        '0' * hot + '1' + '0' * (width - hot - 1): code
        for hot, code in enumerate(piece_order)
    }
    return decode[vector]

In [3]:
# def game_to_vectors(file):
#     moves = chess.get_moves_from_pgn(file)
#     game = chess.ChessGame()
#     vectors = [state_to_vector(game.state)]
#     for move in moves:
#         game.push(move)
#         vectors.append(state_to_vector(game.state))
#     return np.stack(vectors)

In [4]:
def game_to_vectors(file):
    """Replay a PGN game and return one encoded board state per step.

    Args:
        file: path of the PGN file handed to ``chess.ChessGame``.

    Returns:
        A 2-D array built with ``np.stack``: row 0 is the encoding of the
        state right after construction, followed by one row per loop
        iteration until ``game.is_finished`` becomes true.
    """
    game = chess.ChessGame(file)
    vectors = [state_to_vector(game.state)]
    while not game.is_finished:
        try:
            game.next()
        except Exception:
            # Best effort: a move that fails to apply is ignored and the
            # current state is recorded anyway.  Only ``Exception`` is caught
            # so that KeyboardInterrupt/SystemExit can still stop the loop.
            # NOTE(review): if next() keeps failing without ever setting
            # is_finished this loop will not terminate - confirm against
            # ChessGame's behaviour.
            pass
        vectors.append(state_to_vector(game.state))
    return np.stack(vectors)

In [5]:
def get_moves_from_pgn(pgn):
    """Extract the list of moves (SAN, capture marker removed) from a PGN file.

    The whole file text is scanned, after ``{...}`` comments have been
    stripped.  Comments may span several lines, so the stripping regex is run
    with ``re.DOTALL``; without it, move-like text inside a multi-line
    comment would be reported as a move.

    Args:
        pgn: path of the PGN file to read.

    Returns:
        List of move strings in file order, with every ``'x'`` removed
        (e.g. ``'Nxe5'`` becomes ``'Ne5'``).
    """
    with open(pgn) as p:
        data = p.read()
    data = re.sub(r'\{.*?\}', '', data, flags=re.DOTALL)  # Removes pgn comments
    moves = re.findall(
        r'[a-h]x?[a-h]?[1-8]=?[BKNRQ]?|O-O-?O?|[BKNRQ][a-h1-8]?[a-h1-8]?x?[a-h][1-8]',
        data)
    return [move.replace('x', '') for move in moves]

In [6]:
def get_metadata_from_pgn(pgn):
    """Read the White, Black and Result tags of a PGN file.

    Args:
        pgn: path of the PGN file to read.

    Returns:
        Dict with the keys ``'white'``, ``'black'`` and ``'result'``, each
        holding the value of the first matching tag in the file.

    Raises:
        IndexError: if one of the three tags does not occur in the file.
    """
    with open(pgn) as handle:
        text = handle.read()
    text = re.sub(r'\{.*?\}', '', text)  # Removes pgn comments

    # (output key, tag pattern) pairs, looked up in this order.
    tag_patterns = (
        ('white', r'\[White "(.*)"]'),
        ('black', r'\[Black "(.*)"]'),
        ('result', r'\[Result "(.*)"]'),
    )
    return {key: re.findall(pattern, text)[0] for key, pattern in tag_patterns}

In [7]:
# Progress marker printed before the data-loading cells below.
print("start loading")

start loading


In [8]:
# Seed NumPy's global RNG so the sampled game numbers are reproducible.
np.random.seed(0)
# Draw 50 game numbers from [0, 40000), de-duplicate them (np.unique also sorts)
# and turn them into per-game file paths.
# NOTE(review): 0 can be drawn, but the split files are numbered from 1 - confirm intent.
sampled_numbers = np.unique(np.random.randint(40000, size=50))
randgames = ['games/game-{:05d}.pgn'.format(number) for number in sampled_numbers]

In [9]:
from pathlib import Path

# Make sure the output directory for the per-game files exists.
Path("games").mkdir(parents=True, exist_ok=True)

with open(FILE_NAME, 'r') as f:
    all_games = f.read()

# Number of blank-line-separated chunks that make up one game
# (presumably the tag section followed by the movetext - confirm against the file).
span = 2
# Drop whitespace-only chunks first: a trailing blank line or a doubled separator
# would otherwise yield an empty game file or shift the header/movetext pairing.
all_games = [chunk for chunk in all_games.split("\n\n") if chunk.strip()]
split_games = ["\n\n".join(all_games[i:i+span]) for i in range(0, len(all_games), span)]

# Write every game to games/game-00001.pgn, games/game-00002.pgn, ...
for game_number, game_text in enumerate(split_games, start=1):
    with open('games/game-{:05d}.pgn'.format(game_number), 'w') as f:
        f.write(game_text)

In [10]:
# Paths of the first 10000 per-game files (file numbering starts at 1).
notrandgames = ['games/game-{:05d}.pgn'.format(number) for number in range(1, 10001)]

In [11]:
# notrandgames = [FILE_NAME]

In [12]:
# Smoke test: encode the first game and display the resulting state matrix.
game_to_vectors(notrandgames[0])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [13]:
# Keep only the games that game_to_vectors can process, together with their metadata.
# notrandgames_checked holds (position in notrandgames, path) pairs; metadata[i]
# belongs to notrandgames_checked[i].
notrandgames_checked = []
metadata = []
for game_id, g in enumerate(notrandgames):
    try:
        game_to_vectors(g)
    except Exception:
        # Deliberately skip games that cannot be loaded or replayed; only
        # Exception is caught so Ctrl-C still aborts this long-running loop.
        continue
    notrandgames_checked.append((game_id, g))
    metadata.append(get_metadata_from_pgn(g))

In [14]:
# notrandgames_checked

In [15]:
# get_moves_from_pgn(FILE_NAME)

In [16]:
# firstmoves = [(g[0],get_moves_from_pgn(g[1])[0]) for g in notrandgames_checked]

In [17]:
# nf3_indices = []
# for idx, fm in firstmoves:
#     if fm == 'Nf3':
#         nf3_indices.append(idx)

In [18]:
# d4_indices = []
# for idx, fm in firstmoves:
#     if fm == 'd4':
#         d4_indices.append(idx)

In [19]:
# c4_indices = []
# for idx, fm in firstmoves:
#     if fm == 'c4':
#         c4_indices.append(idx)

In [20]:
# e4_indices = []
# for idx, fm in firstmoves:
#     if fm == 'e4':
#         e4_indices.append(idx)

In [21]:
# nf3_games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(nf3_indices)[:150]]
# d4_games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(d4_indices)[:150]]
# c4_games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(c4_indices)[:150]]
# e4_games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(e4_indices)[:150]]

In [22]:
# print(len(nf3_games))
# print(len(d4_games))
# print(len(c4_games))
# print(len(e4_games))

In [23]:
# both_games = np.concatenate([nf3_games, d4_games, c4_games, e4_games])

In [24]:
# game_matrices = [game_to_vectors(g) for g in both_games]

In [25]:
# Use every checked game, whatever its first move was.
# firstmoves pairs each checked game's position in notrandgames with its first move.
firstmoves = [
    (checked_id, get_moves_from_pgn(pgn_path)[0])
    for checked_id, pgn_path in notrandgames_checked
]
indices = [checked_id for checked_id, _first_move in firstmoves]
# File numbers are 1-based, list positions 0-based.
games = ['games/game-{:05d}.pgn'.format(checked_id + 1) for checked_id in indices]
# One state matrix per game, in the same order as `games`.
game_matrices = [game_to_vectors(pgn_path) for pgn_path in games]

### Handling an error where the last two states of each game are identical: remove the redundant one

In [26]:
# game_matrices holds one (num_turns x 832) matrix per game
for game_idx, states in enumerate(game_matrices):
    # A game with fewer than two states has nothing to compare against;
    # indexing [-2] on it would raise IndexError, so leave it untouched.
    if len(states) < 2:
        continue
    # check whether the last 2 game states are equivalent
    if np.all(states[-2] == states[-1]):
        # if so, remove the last state
        game_matrices[game_idx] = states[:-1]

### Removing the last state after checkmate
After checkmate there is no move left to play, so lc0 (or any other chess network) would not make a prediction for that position; the lc0 dataset does not contain this state.

In [27]:
# # game matrices is num_games * num_turns * 832 
# for game in range(len(game_matrices)):
#     game_matrices[game] = game_matrices[game][:-1]

In [28]:
# TODO remove index restriction
# Stack the per-game state matrices row-wise into one array covering every position.
final_data = np.concatenate(game_matrices, axis=0)

In [29]:
# Sanity check: show the metadata dict (white, black, result) of the first checked game.
print(metadata[0])

{'white': 'AlphaZero', 'black': 'Stockfish 8', 'result': '1-0'}


In [30]:
# t-SNE configuration, collected in one place so the settings are easy to tweak.
tsne_settings = {
    'perplexity': 200,
    'n_jobs': 6,
    'metric': 'euclidean',
    'random_state': 42,  # fixed seed
}
tsne = TSNE(**tsne_settings)

In [31]:
# Fit t-SNE on all board states and time the call; np.array(...) passes a copy of final_data.
%time embedding = tsne.fit(np.array(final_data))

CPU times: user 11min 32s, sys: 12.5 s, total: 11min 44s
Wall time: 2min 35s


In [32]:
# Cut the flat embedding back into one chunk per game: split at the running totals
# of the per-game state counts and discard the piece after the final boundary.
tsne_game_lengths = [len(matrix) for matrix in game_matrices]
tsne_split_points = np.cumsum(tsne_game_lengths)
embedding_split = np.array_split(embedding, tsne_split_points)[:-1]

In [33]:
# fig = plt.figure(figsize=(15,15))
# ax = fig.add_subplot(111)
# ax.set_aspect(1)
# for game in embedding_split[:10]:
#     tck, u = interpolate.splprep(game.transpose(), s=0)
#     unew = np.arange(0, 1.01, 0.01)
#     out = interpolate.splev(unew, tck)
#     ax.plot(out[0], out[1], '-r', alpha=0.03, color='red')
#     ax.scatter(game[:,0], game[:,1], s=0.1, color='red')
# #for game in embedding_split[10:800]:
# #    tck, u = interpolate.splprep(game.transpose(), s=0)
# #    unew = np.arange(0, 1.01, 0.01)
# #    out = interpolate.splev(unew, tck)
# #    ax.plot(out[0], out[1], '-r', alpha=0.03, color='blue')
# #    ax.scatter(game[:,0], game[:,1], s=0.1, color='blue')
# plt.xlim((-40,50));
# plt.ylim((-60,40));
# #for game in embedding_split[100:]:
# #    ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='blue')


# Export the t-SNE embedding to tsne.csv, one row per board state.
# The file is opened with `with` so it is flushed and closed even if a row fails.
with open("tsne.csv", "w") as csv_file:
    # write header
    csv_file.write("x,y,line,cp,algo,player,age,a8,b8,c8,d8,e8,f8,g8,h8,a7,b7,c7,d7,e7,f7,g7,h7,a6,b6,c6,d6,e6,f6,g6,h6,a5,b5,c5,d5,e5,f5,g5,h5,a4,b4,c4,d4,e4,f4,g4,h4,a3,b3,c3,d3,e3,f3,g3,h3,a2,b2,c2,d2,e2,f2,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1")
    csv_file.write("\n")

    # for gameIndex, game in enumerate(embedding_split[:450]):
    for gameIndex, game in enumerate(embedding_split):
        states = game_matrices[gameIndex]
        white = metadata[gameIndex]['white']
        black = metadata[gameIndex]['black']

        # "algo" column: name of the winner; any other result string is written unchanged.
        winner = metadata[gameIndex]['result']
        winner = winner.replace('1-0', white)
        winner = winner.replace('0-1', black)

        last_idx = len(game) - 1
        for idx, pos in enumerate(game):
            # checkpoint: first and last state of a game are flagged with 1
            is_checkpoint = idx == 0 or idx == last_idx

            row = [
                str(pos[0]),                      # x
                str(pos[1]),                      # y
                str(gameIndex),                   # number of game
                "1" if is_checkpoint else "0",    # cp
                winner,                           # algo
                # player: even rows get the black player, odd rows the white one.
                # NOTE(review): row 0 is the initial position, so this labels it
                # with black - confirm whether "side to move" or "side that just
                # moved" is intended.
                black if idx % 2 == 0 else white,
                str(idx),                         # age
            ]

            # 64 board columns: decode each 13-wide one-hot slice back to a piece code
            for n in range(0, 64):
                str1 = ''.join(str(e) for e in states[idx][n * 13: (n+1) * 13])
                row.append(vector_to_state(str1))

            csv_file.write(",".join(row))
            csv_file.write("\n")

In [34]:
import umap.umap_ as umap
from matplotlib import pyplot as plt
# Re-seed NumPy's global RNG before the UMAP run.
# NOTE(review): the reducer below is created without an explicit random_state;
# presumably it falls back to this global seed - confirm.
np.random.seed(0)

In [35]:
# UMAP reducer using the cosine metric; every other parameter is left at its library default.
reducer = umap.UMAP(metric='cosine')

In [36]:
# Fit UMAP on all board states; columns 0 and 1 of the result are used as x/y further down.
umap_embedding = reducer.fit_transform(final_data)

In [37]:
# Cut the flat UMAP embedding back into one chunk per game: split at the running totals
# of the per-game state counts and discard the piece after the final boundary.
umap_game_lengths = [len(matrix) for matrix in game_matrices]
umap_split_points = np.cumsum(umap_game_lengths)
umap_embedding_split = np.array_split(umap_embedding, umap_split_points)[:-1]

In [38]:
import pandas as pd
# Reload the t-SNE export so its metadata and board columns can be reused for the UMAP export.
df = pd.read_csv('tsne.csv')
df.head()

Unnamed: 0,x,y,line,cp,algo,player,age,a8,b8,c8,...,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1
0,51.900578,-11.508027,0,1,AlphaZero,Stockfish 8,0,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,wn,wr
1,50.325249,-11.06261,0,0,AlphaZero,AlphaZero,1,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
2,52.491436,-8.243605,0,0,AlphaZero,Stockfish 8,2,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
3,53.0567,-5.131282,0,0,AlphaZero,AlphaZero,3,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
4,54.079796,-3.365637,0,0,AlphaZero,Stockfish 8,4,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr


In [39]:
# Overwrite the t-SNE coordinates with the UMAP ones; all other columns stay as loaded.
# assumes the rows of umap_embedding are in the same order as the rows of tsne.csv
# (both are produced from final_data / game_matrices) - TODO confirm
df['x'] = umap_embedding[:,0]
df['y'] = umap_embedding[:,1]
df.head()
# store

Unnamed: 0,x,y,line,cp,algo,player,age,a8,b8,c8,...,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1
0,-12.203864,5.410787,0,1,AlphaZero,Stockfish 8,0,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,wn,wr
1,-7.167518,10.975452,0,0,AlphaZero,AlphaZero,1,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
2,-6.991467,11.288157,0,0,AlphaZero,Stockfish 8,2,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
3,-6.883981,11.161638,0,0,AlphaZero,AlphaZero,3,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
4,-6.886517,11.086916,0,0,AlphaZero,Stockfish 8,4,br,bn,bb,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr


In [40]:
# Save the UMAP variant of the table.
# NOTE(review): to_csv is called without index=False, so this file gets a leading
# index column that tsne.csv does not have - confirm that consumers expect it.
df.to_csv('umap_cosine_seed0.csv')

In [41]:
# fig = plt.figure(figsize=(8,8))
# ax = fig.add_subplot(111)
# ax.set_aspect(1)
# for game in umap_embedding_split[:100]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='red')
# for game in umap_embedding_split[100:]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='blue')

## default parameters for UMAP over multiple seeds

In [42]:
# import pandas as pd
# import numpy as np

# for seed in range(20):
#     np.random.seed(seed)
#     reducer = umap.UMAP()
#     umap_embedding = reducer.fit_transform(final_data)
#     df = pd.read_csv('tsne.csv')
#     df['x'] = umap_embedding[:,0]
#     df['y'] = umap_embedding[:,1]
#     save_path = 'umap_seed_'+str(seed)+'.csv'
#     print('storing to', save_path)
#     df.to_csv(save_path)

## UMAP Hparam Search

In [43]:
# import pandas as pd
# import umap.umap_ as umap
# from matplotlib import pyplot as plt
# OUTPUT_FILE_NAME = 'umap_outputs/'

In [44]:
# def run_umap(data, path, learning_rate, nn, n_epochs, min_dist=0.1
#     reducer = umap.UMAP(n_neighbors=nn, learning_rate=learning_rate, n_epochs=n_epochs, min_dist=min_dist)
    
#     csv_path = path+'_nn'+str(nn)+'_lr'+str(learning_rate)+'_nepochs'+str(n_epochs)+'_mindist'+str(min_dist)+'.csv'
#     image_file_name = path+'_nn'+str(nn)+'_lr'+str(learning_rate)+'_nepochs'+str(n_epochs)+'_mindist'+str(min_dist)+'.png'
    
#     print('fitting umap embedding for', csv_path)
#     umap_embedding = reducer.fit_transform(data)
    
#     umap_df = pd.DataFrame(umap_embedding)
#     umap_df.to_csv(csv_path)
    
#     plt.figure()
#     plt.scatter(umap_embedding[:,0],umap_embedding[:,1])
#     print('storing to', csv_path)
#     plt.savefig(image_file_name)
#     plt.close()

In [45]:
# nns = [25,30,35,40,45]
# n_epochs_settings = [200, 300, 400, 500, 600, 700,]
# min_dist = [0.1]
# learning_rate = [1.0]
# for nn in nns:
#     for n_epochs in n_epochs_settings:
#         for md in min_dist:
#             for lr in learning_rate:
#                 run_umap(data=final_data, path=OUTPUT_FILE_NAME, learning_rate=lr, nn=nn, n_epochs=n_epochs, min_dist=md)