In [None]:
import numpy as np
import pandas as pd
import chess.pgn

# Normalise the raw Stockfish export so it can be parsed as a purely
# whitespace-delimited table: commas become spaces and 'NA' markers (moves
# that weren't able to be assessed by Stockfish) are removed.
# NOTE(review): because runs of whitespace collapse into one delimiter, dropping
# an 'NA' shifts every later value of that row one column to the left —
# confirm this is the intended handling of unassessed moves.
with open('stockfish.csv','r') as src:
    data = src.read()
data = data.replace(',',' ').replace('NA','')
with open('new_stockfish.csv','w') as dst:
    dst.write(data)

# Parse the normalised file into a fixed-width frame of 331 columns; the first
# line of the file is skipped and rows with fewer values are padded with NaN.
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2.
df = pd.read_csv('new_stockfish.csv',header=None,skiprows=1,sep=r'\s+',names=range(331))

# Missing trailing values (rows shorter than 331 columns) become 0
df = df.fillna(0)

# Column 0 is excluded from the features; the first 25000 rows form the
# training set and the remaining rows the test set.
train_features = df.values[:25000,1:]
test_features = df.values[25000:,1:]


In [None]:
# Read the white/black Elo headers of the first 25000 games in data.pgn into
# two numpy arrays (one entry per game, in file order).
train_label_white = np.zeros(25000)
train_label_black = np.zeros(25000)

# The context manager closes the PGN file once the labels have been read,
# instead of leaving the handle open for the lifetime of the kernel.
with open("data.pgn") as pgn:
    for x in range(25000):
        game = chess.pgn.read_game(pgn)
        if game is None:
            # read_game returns None at end of file; fail with a clear message
            # rather than an AttributeError on `game.headers`.
            raise ValueError(f"data.pgn ended after {x} games; expected 25000")
        train_label_white[x] = float(game.headers['WhiteElo'])
        train_label_black[x] = float(game.headers['BlackElo'])
print(train_label_white)
print(np.shape(train_label_white))

[2354. 2523. 1915. ... 2634. 2319. 1717.]
(25000,)


In [None]:
# Selecting ~20% of the training data for a validation set.
# A seeded generator makes the split (and every score derived from it)
# reproducible across kernel restarts.
# NOTE: this cell overwrites train_features / train_label_*; re-running it
# without re-running the cells that build them splits the already-reduced set.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.choice([False,True], size=len(train_label_white), p=[0.8,0.2])

# Rows where mask is True go to the validation set ...
valid_features = train_features[mask]
valid_label_white = train_label_white[mask]
valid_label_black = train_label_black[mask]

# ... and the remaining rows stay in the training set.
train_features = train_features[~mask]
train_label_white = train_label_white[~mask]
train_label_black = train_label_black[~mask]

In [None]:
# KNN method
def KNN(t_features,v_features,k,black_labels,white_labels):
    """Predict white and black Elo as the mean Elo of the k nearest training points.

    Parameters
    ----------
    t_features : array-like, shape (t_num, n_features)
        Feature rows of the training points.
    v_features : array-like, shape (v_num, n_features)
        Feature rows of the points to predict for.
    k : int
        Number of neighbours to average; must satisfy 1 <= k <= t_num.
    black_labels, white_labels : array-like, shape (t_num,)
        Black / white Elo of each training point. Note that the black labels
        come first in the argument list.

    Returns
    -------
    (label_predictions_white, label_predictions_black) : tuple of ndarray
        Two arrays of shape (v_num,), white predictions first.

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of training points.
    """
    t_features = np.asarray(t_features, dtype=float)
    v_features = np.asarray(v_features, dtype=float)

    t_num = t_features.shape[0]
    if not 1 <= k <= t_num:
        raise ValueError(f"k must be between 1 and {t_num} (number of training points), got {k}")

    # Squared Euclidean distance via ||v||^2 + ||t||^2 - 2 v.t; each row holds the
    # distances from one validation point to every training point.
    # The square root is deliberately skipped: it does not change the ordering,
    # and rounding can make the squared distance of (near-)identical points
    # slightly negative, which sqrt would turn into NaN and thereby push the
    # very closest neighbour to the end of the ranking.
    sq_distances = (
        (v_features**2).sum(axis=1)[:,np.newaxis]
        + (t_features**2).sum(axis=1)
        - 2 * v_features.dot(t_features.T)
    )

    # Partitioning around position k-1 puts the k smallest distances in the first
    # k columns and, unlike kth=k, is also valid when k equals t_num.
    nearest_indexes = np.argpartition(sq_distances,k-1,axis=1)[:,:k]

    # For each validation point, the prediction is the average Elo of its k neighbours
    label_predictions_white = np.take(white_labels,nearest_indexes).mean(axis=1)
    label_predictions_black = np.take(black_labels,nearest_indexes).mean(axis=1)

    return label_predictions_white, label_predictions_black

In [None]:

# Predict the validation Elos from the 10 nearest training games. Keyword
# arguments make it explicit that KNN takes the black labels before the white
# ones while returning the white predictions first.
label_predictions_white, label_predictions_black = KNN(
    t_features=train_features,
    v_features=valid_features,
    k=10,
    black_labels=train_label_black,
    white_labels=train_label_white,
)

# White predictions, their shape, then black predictions
print(label_predictions_white)
print(np.shape(label_predictions_white))
print(label_predictions_black)

[2233.1 2264.9 1931.6 ... 2342.3 2315.7 2256.8]
(5084,)
[2034.2 2193.  2143.2 ... 2258.  2124.4 2266.7]
