In [1]:
from pgn2gif import chess
import numpy as np
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
from scipy import interpolate
import re
import pandas as pd

FILE_NAME = '/mnt/d/Work/CG Institute/chess/lichess data/lichess_db_standard_rated_2021-08.pgn'

In [2]:
def state_to_vector(state):
    piece_dict = {
        'wr': [1,0,0,0,0,0,0,0,0,0,0,0,0],
        'wn': [0,1,0,0,0,0,0,0,0,0,0,0,0],
        'wb': [0,0,1,0,0,0,0,0,0,0,0,0,0],
        'wk': [0,0,0,1,0,0,0,0,0,0,0,0,0],
        'wq': [0,0,0,0,1,0,0,0,0,0,0,0,0],
        'wp': [0,0,0,0,0,1,0,0,0,0,0,0,0],
        'br': [0,0,0,0,0,0,1,0,0,0,0,0,0],
        'bn': [0,0,0,0,0,0,0,1,0,0,0,0,0],
        'bb': [0,0,0,0,0,0,0,0,1,0,0,0,0],
        'bk': [0,0,0,0,0,0,0,0,0,1,0,0,0],
        'bq': [0,0,0,0,0,0,0,0,0,0,1,0,0],
        'bp': [0,0,0,0,0,0,0,0,0,0,0,1,0],
        '':   [0,0,0,0,0,0,0,0,0,0,0,0,1],
    }    
    state_list = list(state.values())    
    vector = []
    for piece in state_list:
        vector.append(piece_dict[piece])
    return np.array(vector).ravel()

def vector_to_state(vector):
    vec_dict = {
        '1000000000000': "wr",
        '0100000000000': "wn",
        '0010000000000': "wb",
        '0001000000000': "wk",
        '0000100000000': "wq",
        '0000010000000': "wp",
        '0000001000000': "br",
        '0000000100000': "bn",
        '0000000010000': "bb",
        '0000000001000': "bk",
        '0000000000100': "bq",
        '0000000000010': "bp",
        '0000000000001': ""
    }
    
    return vec_dict[vector]

In [3]:
def game_to_vectors(file):
    game = chess.ChessGame(file)
    vectors = [state_to_vector(game.state)]
    while not game.is_finished:
        try:
            game.next()
        except:
            pass
        vectors.append(state_to_vector(game.state))
    return np.stack(vectors)

In [4]:
def get_moves_from_pgn(pgn, keep_x=False):
    with open(pgn) as p:
        data = p.read()
        data = re.sub(r'\{.*?\}', '', data) # Removes pgn comments
        data = re.sub(r'\[.*?\]', '', data) # removes metadata
        moves = re.findall(
            r'[a-h]x?[a-h]?[1-8]=?[BKNRQ]?|O-O-?O?|[BKNRQ][a-h1-8]?[a-h1-8]?x?[a-h][1-8]',
            data)
        if keep_x:
            return moves
        else:
            return [move.replace('x', '') for move in moves]

In [5]:
chrs = {
    'we': "\u25FB",
    'wp': "\u265F",
    'wr': "\u265C",
    'wn': "\u265E",
    'wb': "\u265D",
    'wk': "\u265A",
    'wq': "\u265B",
    'be': "\u25FC",
    'bp': "\u2659",
    'br': "\u2656",
    'bn': "\u2658",
    'bb': "\u2657",
    'bk': "\u2654",
    'bq': "\u2655",
}

def print_game(game, symbols=True):
    game = np.array(game).reshape(-1,8,8,13)
    for si, s in enumerate(game):
        print('move:', si)
        for y_i, y in enumerate(s):
            for x_i, x in enumerate(y):
                vec = np.array2string(x,separator='').replace('[','').replace(']','')
                st = vector_to_state(vec)
                if symbols:
                    if st == '':
                        if ((1-y_i)%2==0 and x_i%2==1) or ((1-y_i)%2==1 and x_i%2==0):
                            st = 'we'
                        else:
                            st = 'be'
                    print(chrs[st]+',',end='')
                else:
                    if st == '':
                        st = '  '
                    print(st+',',end='')
            print()
        print()

# method for retrieving all metadata

In [6]:
def get_metadata_from_pgn(pgn):
    with open(pgn) as p:
        data = p.read()
        
        # per mvoe metadata
        evals = re.findall(r'\[%eval (.*?)\]',data)
        clks = re.findall(r'\[%clk (.*?)\]',data)
        # add metadata for before first move
        evals = ['0']+evals
        clks = ['0:00:00'] + clks
        
        # per game metadata
        data = re.sub(r'\{.*?\}', '', data)  # Removes pgn comments
        m = re.findall(r'\[(.*) "(.*)"]',data)
        metadata_keys = [i[0] for i in m]
        metadata_values = [i[1].replace(',',';') for i in m]
        
        return dict(zip(metadata_keys, metadata_values)), evals, clks

In [7]:
print("start loading")

start loading


# Extract individual games from PGN file

In [8]:
lines_to_read = 800000

In [9]:
from pathlib import Path
Path("games").mkdir(parents=True, exist_ok=True)

with open(FILE_NAME, 'r') as f:
    all_games = ''.join([f.readline() for i in range(lines_to_read)])

span = 2
all_games = all_games.split("\n\n")
split_games  = ["\n\n".join(all_games[i:i+span]) for i in range(0, len(all_games), span)]
print(split_games[-2])
print(len(split_games))

[Event "Rated UltraBullet game"]
[Site "https://lichess.org/1kMq0rEz"]
[Date "2021.08.01"]
[Round "-"]
[White "Body9"]
[Black "Snlyldrm37"]
[Result "0-1"]
[UTCDate "2021.08.01"]
[UTCTime "00:30:19"]
[WhiteElo "1300"]
[BlackElo "1486"]
[WhiteRatingDiff "-3"]
[BlackRatingDiff "+3"]
[ECO "A40"]
[Opening "English Defense"]
[TimeControl "15+0"]
[Termination "Time forfeit"]

1. d4 { [%clk 0:00:15] } 1... b6 { [%clk 0:00:15] } 2. c4 { [%clk 0:00:15] } 2... Bb7 { [%clk 0:00:15] } 3. e4 { [%clk 0:00:14] } 3... e6 { [%clk 0:00:13] } 4. d5 { [%clk 0:00:14] } 4... d6 { [%clk 0:00:13] } 5. dxe6 { [%clk 0:00:13] } 5... fxe6 { [%clk 0:00:13] } 6. c5 { [%clk 0:00:13] } 6... Bxe4 { [%clk 0:00:12] } 7. c6 { [%clk 0:00:13] } 7... Bxb1 { [%clk 0:00:11] } 8. Rxb1 { [%clk 0:00:12] } 8... Nxc6 { [%clk 0:00:10] } 9. Qxd6 { [%clk 0:00:12] } 9... Qxd6 { [%clk 0:00:09] } 10. f3 { [%clk 0:00:11] } 10... Qb4+ { [%clk 0:00:08] } 11. Bd2 { [%clk 0:00:10] } 11... Qa5 { [%clk 0:00:08] } 12. Bxa5 { [%clk 0:00:08] } 12.

## filter out games without eval score or without clk

In [10]:
# only games that contain clk and eval metadata for moves
filtered = []
for game in split_games:
    if '%eval' in game and '%clk' in game:
        filtered.append(game)
split_games = filtered

# sometimes there are individual turns that only have a clk but no eval, discard those games
filtered = []
for game in split_games:
    m = re.findall(r'{ \[%clk(.*?) (.*?)\] }', game)
    if len(m) == 0:
        filtered.append(game)
split_games = filtered

print(len(split_games))

2228


## keep n games

In [11]:
n_games = 250
split_games = split_games[:n_games]
print(len(split_games))
print(split_games[-1])

250
[Event "Rated Blitz game"]
[Site "https://lichess.org/yNHg7sBf"]
[Date "2021.08.01"]
[Round "-"]
[White "nightphone"]
[Black "Fabrik81"]
[Result "1-0"]
[UTCDate "2021.08.01"]
[UTCTime "00:03:18"]
[WhiteElo "1520"]
[BlackElo "1508"]
[WhiteRatingDiff "+13"]
[BlackRatingDiff "-5"]
[ECO "C25"]
[Opening "Vienna Game"]
[TimeControl "180+2"]
[Termination "Normal"]

1. e4 { [%eval 0.24] [%clk 0:03:00] } 1... e5 { [%eval 0.12] [%clk 0:03:00] } 2. Nc3 { [%eval 0.13] [%clk 0:02:59] } 2... c6?! { [%eval 0.65] [%clk 0:02:59] } 3. Bc4 { [%eval 0.67] [%clk 0:02:59] } 3... Nf6 { [%eval 0.67] [%clk 0:02:58] } 4. d3? { [%eval -0.36] [%clk 0:02:58] } 4... Bb4 { [%eval -0.12] [%clk 0:02:54] } 5. Bd2 { [%eval -0.32] [%clk 0:02:57] } 5... O-O { [%eval -0.16] [%clk 0:02:53] } 6. Nf3 { [%eval -0.01] [%clk 0:02:57] } 6... Bd6 { [%eval 0.17] [%clk 0:02:46] } 7. Qe2 { [%eval -0.16] [%clk 0:02:53] } 7... b5 { [%eval -0.22] [%clk 0:02:45] } 8. Bb3 { [%eval -0.27] [%clk 0:02:52] } 8... Bb7? { [%eval 0.73] [%clk

## store individual games as PGN files

In [12]:
for i in range(len(split_games)):
    with open('games/game-{:05d}.pgn'.format(i+1),'w') as f:
        f.write(split_games[i])

In [13]:
notrandgames = ['games/game-{:05d}.pgn'.format(n+1) for n in range(10000)]
print(len(notrandgames))

10000


# Loading games from individual PGN files

In [14]:
notrandgames_checked = []
metadata = []
metadata_evals = []
metadata_clks = []
old_md_keys = None
for id, g in enumerate(notrandgames):
    try:
        game_to_vectors(g)
    except:
        pass
    else:
        notrandgames_checked.append((id,g))
        metadata_dict, evals, clks = get_metadata_from_pgn(g)
        # get least common denominator among keys in all samples such that there aren't outlier samples that have more metadata than others
        md_keys = [k for k in metadata_dict]
        if old_md_keys:
            md_keys = list(set(md_keys).intersection(old_md_keys))
        old_md_keys = md_keys
        metadata.append(metadata_dict)
        metadata_evals.append(evals)
        metadata_clks.append(clks)
        
# remove outlier metadata such that only shared metadata among all samples remains
for d in metadata:
    keys = [k for k in d]
    dif = list(set(keys) - set(md_keys))
    for k in dif:
        d.pop(k)
        
print(md_keys)
print(metadata[:100])

['Opening', 'TimeControl', 'Termination', 'ECO', 'WhiteElo', 'White', 'Result', 'Site', 'Round', 'Black', 'Date', 'UTCDate', 'BlackElo', 'Event', 'UTCTime']
[{'Event': 'Rated Blitz tournament https://lichess.org/tournament/zTLnP8ob', 'Site': 'https://lichess.org/q5HJFu3Z', 'Date': '2021.08.01', 'Round': '-', 'White': 'Gersonz', 'Black': 'Scheyla_Perdomo26', 'Result': '1-0', 'UTCDate': '2021.08.01', 'UTCTime': '00:00:24', 'WhiteElo': '1552', 'BlackElo': '1321', 'ECO': 'B22', 'Opening': 'Sicilian Defense: Alapin Variation', 'TimeControl': '180+2', 'Termination': 'Time forfeit'}, {'Event': 'Rated Bullet game', 'Site': 'https://lichess.org/PstuIwhh', 'Date': '2021.08.01', 'Round': '-', 'White': 'Mezoo777000222', 'Black': 'rj270', 'Result': '0-1', 'UTCDate': '2021.08.01', 'UTCTime': '00:00:25', 'WhiteElo': '1083', 'BlackElo': '1099', 'ECO': 'B10', 'Opening': 'Caro-Kann Defense', 'TimeControl': '60+0', 'Termination': 'Time forfeit'}, {'Event': 'Rated Bullet game', 'Site': 'https://lichess.or

In [15]:
d1 = {'a': 1, 'b': 2, 'c': 3}
d2 = {'a': 1, 'b': 2}
d1k = [k for k in d1]
d2k = [k for k in d2]
keys = list(set(d1k).difference(d2k))
print(keys)
for k in keys:
    d1.pop(k)
print(d1)
    

['c']
{'a': 1, 'b': 2}


In [16]:
# add all games regardless of first move
firstmoves = [(g[0],get_moves_from_pgn(g[1])[0]) for g in notrandgames_checked]
indices = []
for idx, fm in firstmoves:
    indices.append(idx)
games = ['games/game-{:05d}.pgn'.format(n+1) for n in np.array(indices)]
game_matrices = [game_to_vectors(g) for g in games]

In [17]:
def piece_by_game_state_position(game_matrices, game, state, position):
    x = ord(position[0]) - ord('a')
    y = 8 - int(position[1])
    idx = y*8+x
    str1 = ''.join(str(e) for e in game_matrices[game][state][idx * 13: (idx+1) * 13])
    return vector_to_state(str1)

In [18]:
def get_captured_piece(game_matrices, game, g_i, s_i):
    m = get_moves_from_pgn(game, keep_x=True)
    if 'x' in m[s_i]:
        piece = piece_by_game_state_position(game_matrices, g_i, s_i, m[s_i][-2:])
        return piece

### handling an error where the last 2 states of each game are equivalent - remove redundant one

In [19]:
# game matrices is num_games * num_turns * 832 
for game in range(len(game_matrices)):
    # check for each game whether last 2 game states are equivalent
    if np.all(game_matrices[game][-2] == game_matrices[game][-1]):
        # if so, remove the last state
        game_matrices[game] = game_matrices[game][:-1]

## keep opening moves only
using the list of openings that corresponds to the lichess dataset https://github.com/niklasf/chess-openings

In [20]:
eco_a_df = pd.read_csv('ECos/a.tsv', sep='\t', header=0)
eco_b_df = pd.read_csv('ECos/b.tsv', sep='\t', header=0)
eco_c_df = pd.read_csv('ECos/c.tsv', sep='\t', header=0)
eco_d_df = pd.read_csv('ECos/d.tsv', sep='\t', header=0)
eco_e_df = pd.read_csv('ECos/e.tsv', sep='\t', header=0)

In [21]:
eco_df = pd.concat([eco_a_df, eco_b_df, eco_c_df, eco_d_df, eco_e_df])

In [22]:
print(metadata[111]['Opening'])
moves = eco_df.loc[eco_df['name'] == metadata[111]['Opening'].replace(';',',')]['pgn'].iloc[0]
pattern = r'.\..'
print(moves)
print(moves.count(' '))
moves = (re.sub(pattern, '', moves))
print(moves)
print(len(moves.split(' ')))

Sicilian Defense: Smith-Morra Gambit Declined; Center Formation
1. e4 c5 2. c3 e5 3. d4 cxd4
8
e4 c5 c3 e5 d4 cxd4
6


In [23]:
print(game_matrices[0])
print(metadata[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
{'Event': 'Rated Blitz tournament https://lichess.org/tournament/zTLnP8ob', 'Site': 'https://lichess.org/q5HJFu3Z', 'Date': '2021.08.01', 'Round': '-', 'White': 'Gersonz', 'Black': 'Scheyla_Perdomo26', 'Result': '1-0', 'UTCDate': '2021.08.01', 'UTCTime': '00:00:24', 'WhiteElo': '1552', 'BlackElo': '1321', 'ECO': 'B22', 'Opening': 'Sicilian Defense: Alapin Variation', 'TimeControl': '180+2', 'Termination': 'Time forfeit'}


### add ECO category (A,B,C,D,E,F) to metadata

In [24]:
for m in metadata:
    if m['ECO'][0] == 'A':
        m['Opening Category'] = 'A - Flank Opening'
    elif m['ECO'][0] == 'B':
        m['Opening Category'] = 'B - Semi-Open Games other than the French Defense'
    elif m['ECO'][0] == 'C':
        m['Opening Category'] = 'C - Open Games and the French Defense'
    elif m['ECO'][0] == 'D':
        m['Opening Category'] = 'D - Closed Games and Semi-Closed Games'
    elif m['ECO'][0] == 'E':
        m['Opening Category'] = 'E - Indian Defenses'

### filter games with Openings that don't exist in our ECO dataset

In [25]:
pattern = r'.\..'
to_delete = []
for id, gm in enumerate(game_matrices):
    filtered_df = eco_df.loc[eco_df['name'] == metadata[id]['Opening'].replace(';',',')]
    if filtered_df.empty:
        to_delete += [id]

for idx in sorted(to_delete, reverse=True):
    del game_matrices[idx]
    del metadata[idx]
    del metadata_evals[idx]
    del metadata_clks[idx]
    del games[idx]

### using eco dataframe to determine amount n of moves in the corresponding opening, cutting off each game after n

In [26]:
# pattern used to remove turn numbers such that we can determine amount of moves
pattern = r'.\..'
for id, gm in enumerate(game_matrices):
#     print(metadata[id]['Opening'].replace(';',','))
    filtered_df = eco_df.loc[eco_df['name'] == metadata[id]['Opening'].replace(';',',')]
    moves = filtered_df['pgn'].iloc[0]
    moves = (re.sub(pattern, '', moves))
    n_moves = len(moves.split(' '))
    # +1 because the 0th is before any moves have happened
    game_matrices[id] = game_matrices[id][:n_moves+1]

In [27]:
print(game_matrices[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
final_data = np.concatenate(game_matrices)

In [29]:
tsne = TSNE(
    perplexity=200,
    n_jobs=6,
    metric='euclidean',
    random_state=42,
    verbose=True
)

In [30]:
%time embedding = tsne.fit(np.array(final_data))

--------------------------------------------------------------------------------
TSNE(n_jobs=6, perplexity=200, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 600 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 0.83 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.26 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.09 seconds
===> Running optimization with exaggeration=12.00, lr=200.00 for 250 iterations...
Iteration   50, KL divergence 1.7374, 50 iterations in 0.3489 sec
Iteration  100, KL divergence 1.7468, 50 iterations in 0.3146 sec
Iteration  150, KL divergence 1.7227, 50 iterations in 0.3088 sec
Iteration  200, KL divergence 1.7100, 50 iterations in 0.3170 sec
Iteration  250, KL divergence 1.7104, 50 iterations in 0.3123 sec
   --> Time elapsed: 1.61 seconds
===> Running optimization with exaggeration=1.00, lr

In [31]:
embedding_split = np.array_split(embedding, np.add.accumulate([len(l) for l in game_matrices]))[:-1]

In [32]:
# write header
csv = open("lichess_captures_tsne.csv", "w")
features = "x,y,line,cp,algo,player,age,"
print(md_keys)
features += ','.join(md_keys)
features += ",eval,clk,a8,b8,c8,d8,e8,f8,g8,h8,a7,b7,c7,d7,e7,f7,g7,h7,a6,b6,c6,d6,e6,f6,g6,h6,a5,b5,c5,d5,e5,f5,g5,h5,a4,b4,c4,d4,e4,f4,g4,h4,a3,b3,c3,d3,e3,f3,g3,h3,a2,b2,c2,d2,e2,f2,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1"
features += ',bb,bk,bn,bp,bq,br,wb,wk,wn,wp,wq,wr'
csv.write(features)
csv.write("\n")
idx = 0

# for gameIndex, game in enumerate(embedding_split[:450]):
for gameIndex, game in enumerate(embedding_split):
    capture_dict = {}
    # every game initialize dictionary that says no piece has been captured (this means we can't load incomplete games)
    for piece in 'bb,bk,bn,bp,bq,br,wb,wk,wn,wp,wq,wr'.split(','):
        capture_dict[piece] = 0
    pi = 0
    for idx, pos in enumerate(game):
        csv.write(str(pos[0]))
        csv.write(",")
        csv.write(str(pos[1]))
        
        # number of game
        csv.write(",")
        csv.write(str(gameIndex))
        
        # checkpoint
        csv.write(",")
        if idx == 0:
            csv.write("1")
        elif idx == len(game) - 1:
            csv.write("1")
        else:
            csv.write("0")
        
        
        
        # 'algo', i.e., path coloring method
        csv.write(",")
        # using opening category from A through E
        csv.write(metadata[gameIndex]['Opening Category'])
        # in this case the winner
        # winner = metadata[gameIndex]['Result']
        # winner = winner.replace('1-0', metadata[gameIndex]['White'])
        # winner = winner.replace('0-1', metadata[gameIndex]['Black'])
        # csv.write(winner)

        # player - whose turn is it
        csv.write(",")
        if idx % 2 == 0:
            csv.write(metadata[gameIndex]['Black'])
        else:
            csv.write(metadata[gameIndex]['White'])
        
        # age
        csv.write(",")
        csv.write(str(idx))
        csv.write(",")
        
        # per game metadata
        md_values = [metadata[gameIndex][k] for k in md_keys]
        csv.write(','.join(md_values))
        # write metadata gameIndex idx %clk and %eval
        
        # per move metadata
        csv.write(',')
        if idx < len(metadata_evals[gameIndex]):
            csv.write(metadata_evals[gameIndex][idx])
        csv.write(',')
        if idx < len(metadata_clks[gameIndex]):
            csv.write(metadata_clks[gameIndex][idx])
        
        for n in range(0, 64):
            csv.write(",")
            str1 = ''.join(str(e) for e in game_matrices[gameIndex][idx][n * 13: (n+1) * 13])
            csv.write(vector_to_state(str1))
            
        csv.write(',')
        # captured pieces
        # bb,bk,bn,bp,bq,br,wb,wk,wn,wp,wq,wr
        # update capture dict
        if idx > 0:
            captured = get_captured_piece(game_matrices, games[gameIndex], gameIndex, idx-1)
            if captured:
                capture_dict[captured] = capture_dict[captured] + 1
        # write capture dict
        for piece in 'bb,bk,bn,bp,bq,br,wb,wk,wn,wp,wq,wr'.split(','):
            csv.write(str(capture_dict[piece]))
            csv.write(',')
            
        csv.write("\n")
    gameIndex += 1
        
csv.close()

['Opening', 'TimeControl', 'Termination', 'ECO', 'WhiteElo', 'White', 'Result', 'Site', 'Round', 'Black', 'Date', 'UTCDate', 'BlackElo', 'Event', 'UTCTime']


In [33]:
import umap.umap_ as umap
from matplotlib import pyplot as plt
np.random.seed(0)

2022-01-11 14:12:50.411238: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-11 14:12:50.411304: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [34]:
reducer = umap.UMAP()

In [35]:
umap_embedding = reducer.fit_transform(final_data)

In [36]:
umap_embedding_split = np.array_split(umap_embedding, np.add.accumulate([len(l) for l in game_matrices]))[:-1]

In [37]:
import pandas as pd
df = pd.read_csv('lichess_captures_tsne.csv', header=0, index_col=False)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,x,y,line,cp,algo,player,age,Opening,TimeControl,Termination,...,bn,bp,bq,br,wb,wk,wn,wp,wq,wr
0,-10.350784,1.190717,0,1,B - Semi-Open Games other than the French Defense,Scheyla_Perdomo26,0,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
1,4.44209,-1.633663,0,0,B - Semi-Open Games other than the French Defense,Gersonz,1,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
2,4.353424,0.020715,0,0,B - Semi-Open Games other than the French Defense,Scheyla_Perdomo26,2,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
3,3.066621,-0.800069,0,1,B - Semi-Open Games other than the French Defense,Gersonz,3,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
4,-8.897102,3.586104,1,1,B - Semi-Open Games other than the French Defense,rj270,0,Caro-Kann Defense,60+0,Time forfeit,...,0,0,0,0,0,0,0,0,0,0


In [38]:
df['x'] = umap_embedding[:,0]
df['y'] = umap_embedding[:,1]
df.head()
# store

Unnamed: 0,x,y,line,cp,algo,player,age,Opening,TimeControl,Termination,...,bn,bp,bq,br,wb,wk,wn,wp,wq,wr
0,26.894022,43.695774,0,1,B - Semi-Open Games other than the French Defense,Scheyla_Perdomo26,0,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
1,9.903042,-28.5289,0,0,B - Semi-Open Games other than the French Defense,Gersonz,1,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
2,-27.536922,-19.170029,0,0,B - Semi-Open Games other than the French Defense,Scheyla_Perdomo26,2,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
3,-27.266459,-19.041006,0,1,B - Semi-Open Games other than the French Defense,Gersonz,3,Sicilian Defense: Alapin Variation,180+2,Time forfeit,...,0,0,0,0,0,0,0,0,0,0
4,27.89164,43.239712,1,1,B - Semi-Open Games other than the French Defense,rj270,0,Caro-Kann Defense,60+0,Time forfeit,...,0,0,0,0,0,0,0,0,0,0


In [39]:
df.to_csv('lichess_captures_umap_seed0.csv')

In [40]:
# fig = plt.figure(figsize=(8,8))
# ax = fig.add_subplot(111)
# ax.set_aspect(1)
# for game in umap_embedding_split[:100]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='red')
# for game in umap_embedding_split[100:]:
#     ax.plot(game[:,0], game[:,1], '-r', alpha=0.1, color='blue')

## default parameters for UMAP over multiple seeds

In [41]:
# import pandas as pd
# import numpy as np

# for seed in range(20):
#     np.random.seed(seed)
#     reducer = umap.UMAP()
#     umap_embedding = reducer.fit_transform(final_data)
#     df = pd.read_csv('tsne.csv')
#     df['x'] = umap_embedding[:,0]
#     df['y'] = umap_embedding[:,1]
#     save_path = 'umap_seed_'+str(seed)+'.csv'
#     print('storing to', save_path)
#     df.to_csv(save_path)

## UMAP Hparam Search

In [42]:
# import pandas as pd
# import umap.umap_ as umap
# from matplotlib import pyplot as plt
# OUTPUT_FILE_NAME = 'umap_outputs/'

In [43]:
# def run_umap(data, path, learning_rate, nn, n_epochs, min_dist=0.1
#     reducer = umap.UMAP(n_neighbors=nn, learning_rate=learning_rate, n_epochs=n_epochs, min_dist=min_dist)
    
#     csv_path = path+'_nn'+str(nn)+'_lr'+str(learning_rate)+'_nepochs'+str(n_epochs)+'_mindist'+str(min_dist)+'.csv'
#     image_file_name = path+'_nn'+str(nn)+'_lr'+str(learning_rate)+'_nepochs'+str(n_epochs)+'_mindist'+str(min_dist)+'.png'
    
#     print('fitting umap embedding for', csv_path)
#     umap_embedding = reducer.fit_transform(data)
    
#     umap_df = pd.DataFrame(umap_embedding)
#     umap_df.to_csv(csv_path)
    
#     plt.figure()
#     plt.scatter(umap_embedding[:,0],umap_embedding[:,1])
#     print('storing to', csv_path)
#     plt.savefig(image_file_name)
#     plt.close()

In [44]:
# nns = [25,30,35,40,45]
# n_epochs_settings = [200, 300, 400, 500, 600, 700,]
# min_dist = [0.1]
# learning_rate = [1.0]
# for nn in nns:
#     for n_epochs in n_epochs_settings:
#         for md in min_dist:
#             for lr in learning_rate:
#                 run_umap(data=final_data, path=OUTPUT_FILE_NAME, learning_rate=lr, nn=nn, n_epochs=n_epochs, min_dist=md)