This file was created for the multi-level embedding resubmission to try out a different encoding and whether that would lead to slightly nicer chess embeddings.
The idea is to change the one-hot encoding so that empty squares are not encoded just like the pieces, each with its own unique one-hot vector, but as an all-zero vector. This allows us to use the cosine metric, under which empty squares are treated differently from all occupied squares.

In [18]:
from openTSNE import TSNE
from openTSNE.callbacks import ErrorApproximations
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# Path of the CSV file that is passed to `pd.read_csv` below.
file = 'openings30.csv'

In [19]:
# Load the CSV; header=0 takes the column names from the first row.
df = pd.read_csv(file, header=0)

In [20]:
# Names of the 64 board squares, used as column labels: rank 8 down to
# rank 1, and files a through h within each rank.
fields = [col + rank for rank in '87654321' for col in 'abcdefgh']
# Print the number of rows, then preview the first rows of the square columns.
print(len(df[fields]))
df[fields].head()

13428


Unnamed: 0,a8,b8,c8,d8,e8,f8,g8,h8,a7,b7,...,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1
0,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,wn,wr
1,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
2,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
3,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
4,br,bn,bb,bq,bk,bb,,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr


In [21]:
# Restrict the frame to the square columns and replace missing entries
# with empty strings.
df = df.loc[:, fields].fillna(value='')
df.head()

Unnamed: 0,a8,b8,c8,d8,e8,f8,g8,h8,a7,b7,...,g2,h2,a1,b1,c1,d1,e1,f1,g1,h1
0,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,wn,wr
1,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
2,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
3,br,bn,bb,bq,bk,bb,bn,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr
4,br,bn,bb,bq,bk,bb,,br,bp,bp,...,wp,wp,wr,wn,wb,wq,wk,wb,,wr


In [22]:
# Convert the square columns to a NumPy array of labels
# (one row per CSV row, one column per square).
data = df[fields].to_numpy()
print(data)

[['br' 'bn' 'bb' ... 'wb' 'wn' 'wr']
 ['br' 'bn' 'bb' ... 'wb' '' 'wr']
 ['br' 'bn' 'bb' ... 'wb' '' 'wr']
 ...
 ['br' '' '' ... '' '' 'wr']
 ['br' '' '' ... '' '' 'wr']
 ['br' '' '' ... '' '' 'wr']]


## One-hot encode, then change the encoding for empty fields to all zeros

In [23]:
def one_hot(array):
    """One-hot encode every element of ``array``.

    The categories are the sorted unique values of ``array`` as returned by
    ``np.unique``; element ``i`` of the last axis is 1.0 when the input
    element equals category ``i`` and 0.0 otherwise.

    Returns a float64 array whose last axis has one entry per category. The
    leading shape is the shape of the inverse-index array returned by
    ``np.unique`` (for 1-D input: one row per input element).
    """
    categories, codes = np.unique(array, return_inverse=True)
    category_ids = np.arange(categories.shape[0])
    # Comparing each code against all category ids yields one boolean
    # one-hot row per element; cast it to the float encoding.
    return (codes[..., np.newaxis] == category_ids).astype(np.float64)

# Mark the empty squares on the raw labels. The mask is used below so that
# the zeroing does not rely on comparing one-hot values element by element.
is_empty = data == ''
idx = np.argwhere(is_empty)

oh_data = one_hot(data)

# one one-hot vector per square: (rows, squares, categories)
oh_data_3d = oh_data.reshape(data.shape[0], -1, oh_data.shape[-1])

# replace oh encoding for empty string with all zeros; the mask selects whole
# vectors, and np.where builds a new array so oh_data itself stays untouched
oh_data_3d = np.where(is_empty[..., np.newaxis], 0.0, oh_data_3d)

# sanity check: show one zeroed vector (skipped when no square is empty)
if len(idx):
    print('"'+str(oh_data_3d[idx[0][0]][idx[0][1]])+'"')

# flatten back to one feature row per board for the embedding methods below
data = oh_data_3d.reshape(data.shape[0],-1)
print(data.shape)

"[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]"
(13428, 832)


In [24]:
# [print(s) for s in oh_data_3d[0]]

In [25]:
# Run parameters; they are also written into the output CSV file name.
seed = 42
n_epochs = 100  # passed to TSNE as n_iter below
perplexity = 50

# openTSNE configuration: cosine metric, random initialization, fixed seed.
tsne = TSNE(
    perplexity=perplexity,
    initialization='random',
    n_jobs=6,
    metric='cosine',
    random_state=seed,
    n_iter=n_epochs,
    verbose=True
)

In [26]:
# Fit t-SNE on the encoded boards. `%time` is an IPython magic, so this line
# only runs inside IPython/Jupyter.
%time embedding = tsne.fit(np.array(data))

--------------------------------------------------------------------------------
TSNE(initialization='random', metric='cosine', n_iter=100, n_jobs=6,
     perplexity=50, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 150 nearest neighbors using Annoy approximate search using cosine distance...
   --> Time elapsed: 8.98 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.65 seconds
===> Running optimization with exaggeration=12.00, lr=1119.00 for 250 iterations...
Iteration   50, KL divergence 5.1147, 50 iterations in 1.6070 sec
Iteration  100, KL divergence 4.0727, 50 iterations in 1.8285 sec
Iteration  150, KL divergence 3.9188, 50 iterations in 1.5827 sec
Iteration  200, KL divergence 3.8639, 50 iterations in 1.5287 sec
Iteration  250, KL divergence 3.8378, 50 iterations in 1.5254 sec
   --> Time elapsed: 8.07 seconds
===> Running optimization with exaggeration=1.00, lr=1119.00 for 100 iterat

In [27]:
# Small check of the "replace a matching vector with zeros" idea:
# zero out every row of `a` that equals `x` in all positions.
a = np.array([
    [1,2,3],
    [4,5,6]
])
x = [1,2,3]
r = np.zeros_like(x)
# `a == x` on its own compares element by element, so a row that matches `x`
# in only some positions would be partly zeroed. Require the whole row to
# match before replacing it.
row_matches = (a == x).all(axis=-1, keepdims=True)
a = np.where(row_matches, r, a)
print(a)

[[0 0 0]
 [4 5 6]]


In [28]:
# Re-read the full CSV: `df` was reduced to the square columns earlier, so the
# remaining columns are reloaded before the embedding is attached.
df = pd.read_csv(file, header=0)

In [29]:
# Store the two t-SNE coordinates in columns 'x' and 'y'. This assumes the
# embedding rows are in the same order as the CSV rows.
df['x'] = embedding[:,0]
df['y'] = embedding[:,1]
df.head()

Unnamed: 0.1,Unnamed: 0,x,y,line,cp,algo,oldage,a8,b8,c8,...,b1,c1,d1,e1,f1,g1,h1,max20,max30,max40
0,0,-21.603554,0.421642,0,1,0,0,br,bn,bb,...,wn,wb,wq,wk,wb,wn,wr,True,True,True
1,1,-5.870071,21.133714,0,0,0,1,br,bn,bb,...,wn,wb,wq,wk,wb,,wr,True,True,True
2,2,0.605484,25.142196,0,0,0,2,br,bn,bb,...,wn,wb,wq,wk,wb,,wr,True,True,True
3,3,-3.239994,15.836035,0,0,0,3,br,bn,bb,...,wn,wb,wq,wk,wb,,wr,True,True,True
4,4,2.459086,10.6057,0,0,0,4,br,bn,bb,...,wn,wb,wq,wk,wb,,wr,True,True,True


In [30]:
# Save the frame with the t-SNE coordinates; the file name records the seed,
# the iteration count and the perplexity of this run.
df.to_csv(f'cosine_30_tsne_seed{seed}_epochs{n_epochs}_prepl{perplexity}.csv')

In [None]:
import umap.umap_ as umap
from matplotlib import pyplot as plt
# Seed NumPy's global random generator with the same seed as the t-SNE run.
np.random.seed(seed)

In [11]:
# NOTE: this rebinds n_epochs (100 for the t-SNE run); the UMAP CSV file name
# written at the end uses the new value.
n_epochs=500
n_neighbors=30
# UMAP configuration: cosine metric, same seed as the t-SNE run.
reducer = umap.UMAP(random_state=seed,
                    n_neighbors=n_neighbors,
                    verbose=True,
                    metric='cosine',
                    n_epochs=n_epochs)

NameError: name 'umap' is not defined

In [None]:
# Fit UMAP on the encoded boards and get the embedded coordinates.
umap_embedding = reducer.fit_transform(data)

In [None]:
# Presumably (number of rows, 2); the next cell reads columns 0 and 1 — confirm.
print(umap_embedding.shape)

In [None]:
# Overwrite columns 'x' and 'y' with the two UMAP coordinates.
df['x'] = umap_embedding[:,0]
df['y'] = umap_embedding[:,1]
df.head()

In [None]:
# Save the frame with the UMAP coordinates; the file name records the seed,
# the epoch count and the neighbourhood size of this run.
df.to_csv(f'cosine_30_umap_seed{seed}_epochs{n_epochs}_neighbors{n_neighbors}.csv')