In [59]:
import numpy as np
import pandas as pd
import chess.pgn
import math

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Rewrite stockfish.csv as a purely whitespace-delimited file.
# Commas become spaces so that every field can be split on whitespace.
# 'NA' tokens (moves that weren't able to be assessed by Stockfish) are deleted outright.
# NOTE(review): because runs of whitespace collapse into a single delimiter when the
# file is parsed below, the scores that follow a deleted 'NA' shift one column to the
# left instead of leaving a gap -- confirm that this is the intended treatment.
with open('stockfish.csv','r') as file:
    data = file.read()
data = data.replace(',',' ').replace('NA','')
with open('new_stockfish.csv','w') as file:
    file.write(data)

# Parse on runs of whitespace. sep=r'\s+' is the documented replacement for the
# deprecated delim_whitespace=True and splits the file the same way.
# skiprows=1 drops the first line of the file; names=range(331) gives every row
# 331 columns, with rows that have fewer fields padded with NaN.
df = pd.read_csv('new_stockfish.csv',header=None,skiprows=1,sep=r'\s+',names=range(331))

# Extracting train/test features: column 0 is left out of the features,
# the first 25000 rows are the training games and the remaining rows the test games.
train_features = df.values[:25000,1:]
test_features = df.values[25000:,1:]


In [60]:
# Code for creating new training features, one row per game:
#   'First Move'   - the first score of the game
#   'Mean Score'   - running centipawn (cp) total divided by total_num
#   bucket columns - number of moves in each 10-point accuracy band divided by (total_num - 1)
train_feature_columns = ['First Move','Mean Score','90%+','90-80%','80-70%','70-60%','60-50%','50-40%',
                         '40-30%','30-20%','20-10%','10-0%']
train_bucket_columns = train_feature_columns[2:]

# Rows are collected as plain dicts and turned into a DataFrame once at the end.
# Growing the frame with pd.concat inside the loop is quadratic and gives every row index 0.
train_rows = []

# For each of the games in the training set
for scores in train_features:
    first_move = 0
    second_move = 1
    total = scores[first_move]  # running total of cp
    bucket_counts = [0] * len(train_bucket_columns)  # same order as train_bucket_columns

    # While there is a cp pair. The length check stops the loop at the end of the row,
    # so a game that fills every score column no longer raises IndexError.
    while second_move < len(scores) and not pd.isnull(scores[second_move]):
        total += scores[second_move]
        # Win percentage of the first and second cp in the pair
        # (the constants appear to be those of the Lichess accuracy formulas)
        win_percent_before = 50+50*(2/(1+math.exp(-0.00368208*scores[first_move]))-1)
        win_percent_after = 50+50*(2/(1+math.exp(-0.00368208*scores[second_move]))-1)
        # An even first_move is treated as a black move: the win percentage is 100 - calculated
        if first_move % 2 == 0:
            win_percent_before = 100 - win_percent_before
            win_percent_after = 100 - win_percent_after
        # Accuracy of the move from the drop in win percentage
        accuracy = 103.1668 * math.exp(-0.04354*(win_percent_before - win_percent_after)) - 3.1669
        first_move += 1
        second_move += 1

        # Increment the bucket corresponding to the accuracy recorded.
        # Bounds are strict: an accuracy exactly on a boundary (90, 80, ..., 10) or
        # not above 0 is counted in no bucket.
        if accuracy > 90:
            bucket_counts[0] += 1
        else:
            for slot, upper in enumerate(range(90, 0, -10), start=1):
                if upper - 10 < accuracy < upper:
                    bucket_counts[slot] += 1
                    break

    # NOTE(review): second_move scores were summed and second_move - 1 moves were bucketed,
    # yet the denominators below are second_move + 1 and second_move, so both the mean and
    # the bucket shares come out slightly low. Kept unchanged so that these features stay
    # on the same scale as the test features built with the same formula.
    total_num = second_move + 1

    # Add the new row of features using the first move, mean, and amount of moves in each accuracy bucket
    row = {'First Move': scores[0], 'Mean Score': total/total_num}
    for column, count in zip(train_bucket_columns, bucket_counts):
        row[column] = count/(total_num-1)
    train_rows.append(row)

# Missing values are replaced with 0
new_train_features = pd.DataFrame(train_rows, columns=train_feature_columns).fillna(value=0)
display(new_train_features)

Unnamed: 0,First Move,Mean Score,90%+,90-80%,80-70%,70-60%,60-50%,50-40%,40-30%,30-20%,20-10%,10-0%
0,18.0,23.333333,0.894737,0.078947,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
0,26.0,32.142857,0.923077,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
0,26.0,-685.728972,0.783019,0.150943,0.018868,0.009434,0.018868,0.000000,0.000000,0.009434,0.0,0.0
0,2.0,48.615385,0.753247,0.194805,0.038961,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
0,26.0,299.920000,0.693878,0.224490,0.020408,0.020408,0.000000,0.020408,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
0,26.0,-3.335878,0.776923,0.146154,0.046154,0.007692,0.007692,0.007692,0.000000,0.000000,0.0,0.0
0,26.0,-115.985507,0.838235,0.102941,0.014706,0.029412,0.000000,0.000000,0.000000,0.000000,0.0,0.0
0,19.0,56.154412,0.903704,0.044444,0.022222,0.007407,0.007407,0.007407,0.000000,0.000000,0.0,0.0
0,18.0,117.186047,0.858824,0.117647,0.011765,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0


In [61]:
# Read the WhiteElo / BlackElo headers of the first 25000 games in data.pgn
# into two numpy arrays (one entry per training game).
train_label_white = np.zeros(25000)
train_label_black = np.zeros(25000)

# The context manager closes the PGN file handle once the labels have been read.
with open("data.pgn") as pgn:
    for x in range(25000):
        game = chess.pgn.read_game(pgn)
        # read_game returns None at end of file; fail with a clear message instead
        # of an AttributeError on game.headers.
        if game is None:
            raise ValueError("data.pgn ended after %d games; expected 25000" % x)
        # Header values are stored into float arrays
        train_label_white[x] = game.headers['WhiteElo']
        train_label_black[x] = game.headers['BlackElo']
print(train_label_white)
print(np.shape(train_label_white))
print(np.shape(train_label_black))

[2354. 2523. 1915. ... 2634. 2319. 1717.]
(25000,)
(25000,)


In [62]:
# XGBoost model
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Regressor constructed with the library's default arguments
model = XGBRegressor()

# Shapes of the feature matrix and of the white-Elo label vector
print(new_train_features.shape)
print(train_label_white.shape)

# Model evaluation method: 10-fold cross validation repeated three times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate the model on the white-Elo labels. The scorer reports the negated mean
# absolute error per fold, so the absolute value is taken.
scores = np.absolute(
    cross_val_score(
        model,
        new_train_features,
        train_label_white,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)

print('Mean Absolute Error: %.3f elo' % (scores.mean()) )


(25000, 12)
(25000,)
Mean Absolute Error: 201.711 elo


In [63]:
# Code for creating new test features (run in separate cell from training to reduce time to run)
# One row per game:
#   'First Move'   - the first score of the game
#   'Mean Score'   - running centipawn (cp) total divided by total_num
#   bucket columns - number of moves in each 10-point accuracy band divided by (total_num - 1)
test_feature_columns = ['First Move','Mean Score','90%+','90-80%','80-70%','70-60%','60-50%','50-40%',
                        '40-30%','30-20%','20-10%','10-0%']
test_bucket_columns = test_feature_columns[2:]

# Rows are collected as plain dicts and turned into a DataFrame once at the end.
# Growing the frame with pd.concat inside the loop is quadratic and gives every row index 0.
test_rows = []

for scores in test_features:
    first_move = 0
    second_move = 1
    total = scores[first_move]  # running total of cp
    bucket_counts = [0] * len(test_bucket_columns)  # same order as test_bucket_columns

    # While there is a cp pair; the length check replaces the hard-coded 330 and
    # stops the loop at the end of the row whatever its width.
    while second_move < len(scores) and not pd.isnull(scores[second_move]):
        total += scores[second_move]
        # Win percentage of the first and second cp in the pair
        # (the constants appear to be those of the Lichess accuracy formulas)
        win_percent_before = 50+50*(2/(1+math.exp(-0.00368208*scores[first_move]))-1)
        win_percent_after = 50+50*(2/(1+math.exp(-0.00368208*scores[second_move]))-1)
        # An even first_move is treated as a black move: the win percentage is 100 - calculated
        if first_move % 2 == 0:
            win_percent_before = 100 - win_percent_before
            win_percent_after = 100 - win_percent_after
        # Accuracy of the move from the drop in win percentage
        accuracy = 103.1668 * math.exp(-0.04354*(win_percent_before - win_percent_after)) - 3.1669
        first_move += 1
        second_move += 1

        # Increment the bucket corresponding to the accuracy recorded.
        # Bounds are strict: an accuracy exactly on a boundary (90, 80, ..., 10) or
        # not above 0 is counted in no bucket.
        if accuracy > 90:
            bucket_counts[0] += 1
        else:
            for slot, upper in enumerate(range(90, 0, -10), start=1):
                if upper - 10 < accuracy < upper:
                    bucket_counts[slot] += 1
                    break

    # NOTE(review): second_move scores were summed and second_move - 1 moves were bucketed,
    # yet the denominators below are second_move + 1 and second_move, so both the mean and
    # the bucket shares come out slightly low. Kept unchanged so that these features stay
    # on the same scale as the training features built with the same formula.
    total_num = second_move + 1

    row = {'First Move': scores[0], 'Mean Score': total/total_num}
    for column, count in zip(test_bucket_columns, bucket_counts):
        row[column] = count/(total_num-1)
    test_rows.append(row)

# Missing values are replaced with 0
new_test_features = pd.DataFrame(test_rows, columns=test_feature_columns).fillna(value=0)
display(new_test_features)

Unnamed: 0,First Move,Mean Score,90%+,90-80%,80-70%,70-60%,60-50%,50-40%,40-30%,30-20%,20-10%,10-0%
0,26.0,25.583333,0.884211,0.052632,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,19.0,-273.185714,0.753623,0.159420,0.028986,0.028986,0.000000,0.000000,0.000000,0.014493,0.000000,0.000000
0,19.0,71.785714,0.771242,0.084967,0.045752,0.019608,0.013072,0.006536,0.032680,0.006536,0.006536,0.006536
0,19.0,52.444444,0.887097,0.096774,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,26.0,21.770492,0.850000,0.100000,0.016667,0.016667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
0,19.0,116.827869,0.834711,0.082645,0.041322,0.008264,0.008264,0.008264,0.008264,0.000000,0.000000,0.000000
0,19.0,106.171642,0.827068,0.082707,0.052632,0.015038,0.007519,0.000000,0.007519,0.000000,0.000000,0.000000
0,26.0,152.671429,0.681159,0.173913,0.101449,0.000000,0.000000,0.014493,0.014493,0.000000,0.000000,0.000000
0,18.0,95.750000,0.906667,0.066667,0.000000,0.013333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [66]:
# Final prediction for white elo: fit on the white labels, then predict the test games.
# fit() returns the estimator itself, so predict() can be chained onto it.
test_white_predictions = model.fit(new_train_features, train_label_white).predict(new_test_features)

# Final prediction for black elo: the same estimator is refit on the black labels,
# after the white predictions have already been taken.
test_black_predictions = model.fit(new_train_features, train_label_black).predict(new_test_features)

# Both prediction vectors side by side, one row per test game
df = pd.DataFrame({
    'WhiteElo': test_white_predictions,
    'BlackElo': test_black_predictions,
})
display(df)

Unnamed: 0,WhiteElo,BlackElo
0,2357.085938,2335.202637
1,2138.393066,2253.043457
2,2371.539551,2282.745605
3,2333.196777,2280.451172
4,2282.516113,2253.440674
...,...,...
24995,2428.752197,2460.075684
24996,2436.783936,2421.132568
24997,2290.244629,2111.347412
24998,2491.107178,2330.479980
