In [139]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import ElasticNetCV
import numpy as np
from sklearn.metrics import r2_score
import warnings
import math
# Silence every FutureWarning for the rest of the session; warnings of other
# categories are not affected by this filter.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [145]:
# Load the working dataset; lines=True reads the file as newline-delimited JSON
# (one record per line).
df = pd.read_json(path_or_buf='data/curated/FIDE_MAIA_shifted.ndjson', lines=True)
# Alternative inputs kept for reference:
# df = pd.read_json('data/curated/FIDE_ALL.ndjson', lines = True)
#df = pd.read_json("data/raw/stockfish_150.ndjson", lines = True)
#df = df.fillna(0)

In [146]:
def _is_sequence_longer_than(value, min_exclusive=0):
    """Return True if `value` is a list or str with more than `min_exclusive` items.

    The type check runs before len(), so entries that are not sequences
    (e.g. NaN for a missing field) are rejected instead of raising TypeError.
    """
    return isinstance(value, (list, str)) and len(value) > min_exclusive


# Per-move sequence columns mapped to the prefix used for their summary columns.
SEQUENCE_PREFIXES = {
    'white_cpl': 'white',
    'black_cpl': 'black',
    'stockfish_eval': 'stockfish',
    'maia_cpl_w': 'maia_w',
    'maia_cpl_b': 'maia_b',
}
# Summary statistics computed for every sequence column, in output-column order.
SUMMARY_STATS = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}

# --- Row filtering (the row count is printed after each stage) ---------------
print(len(df))
# Drop games where either player name is the "?" placeholder.
df = df[(df['white'] != "?") & (df['black'] != "?")]
print(len(df))
# Keep games with more than 7 engine evaluations. This also guarantees the
# column is a non-empty list/str, so no separate non-empty check is needed.
df = df[df['stockfish_eval'].apply(_is_sequence_longer_than, min_exclusive=7)]
print(len(df))
for column in ('white_cpl', 'black_cpl'):
    df = df[df[column].apply(_is_sequence_longer_than)]
print(len(df))
for column in ('maia_cpl_w', 'maia_cpl_b'):
    df = df[df[column].apply(_is_sequence_longer_than)]
print(len(df))

# Take an independent copy so the column assignments below write to a real
# frame rather than to a filtered view (avoids SettingWithCopyWarning).
df = df.copy()

# --- Type normalisation ------------------------------------------------------
# Unparseable ratings become NaN instead of raising.
df['white_elo'] = pd.to_numeric(df['white_elo'], errors='coerce')
df['black_elo'] = pd.to_numeric(df['black_elo'], errors='coerce')

for column in ("ECO", "Opening", "result"):
    df[column] = df[column].astype('category')

# --- Targets -----------------------------------------------------------------
df["mean_elos"] = df[['white_elo', 'black_elo']].mean(axis=1)
df["diff_elos"] = df['white_elo'] - df['black_elo']

# --- Summary features: <prefix>_mean / _std / _min / _max per sequence column -
for column, prefix in SEQUENCE_PREFIXES.items():
    for stat_name, stat_func in SUMMARY_STATS.items():
        df[f'{prefix}_{stat_name}'] = df[column].apply(stat_func)

dep_var = "mean_elos" # 19955

40030
32506
28935
28934
28933


In [6]:
#df.to_json('data/curated/FIDE_MAIA_curated.ndjson', orient='records', lines=True)
# df = pd.read_json('data/curated/FIDE_MAIA_curated.ndjson', lines = True)

In [148]:
# Numeric feature columns fed to the models, grouped by where they come from.
# The concatenation order below is the column order of the feature matrix.
_game_event_features = [
    'queen_moved_at', 'total_checks', 'first_check_at', 'queen_changed_at', 'total_moves',
]
_end_state_features = [
    'end_r', 'end_p', 'end_b', 'end_n', 'end_q',
    'end_N', 'end_P', 'end_Q', 'end_B', 'end_R',
    'promotion', 'can_claim_draw',
]
_white_cpl_features = ['white_mean', 'white_std', 'white_min', 'white_max']
_black_cpl_features = ['black_mean', 'black_std', 'black_min', 'black_max']
_stockfish_features = ['stockfish_mean', 'stockfish_std', 'stockfish_min', 'stockfish_max']
_maia_white_features = ['maia_w_mean', 'maia_w_std', 'maia_w_min', 'maia_w_max']
_maia_black_features = ['maia_b_mean', 'maia_b_std', 'maia_b_min', 'maia_b_max']

cont = [
    *_game_event_features,
    *_end_state_features,
    *_white_cpl_features,
    *_black_cpl_features,
    *_stockfish_features,
    *_maia_white_features,
    *_maia_black_features,
]

In [149]:
import joblib

# One-hot encode the game result so it can sit next to the numeric features.
results = pd.get_dummies(df["result"])

# Feature matrix: the `cont` columns followed by the result indicators, with
# any missing value replaced by 0.
X = pd.concat([df[cont], results], axis=1).fillna(0)

# Regression targets: mean of the two ratings, and the rating difference.
y1, y2 = df["mean_elos"], df["diff_elos"]

In [150]:
from joblib import load

# Restore the two previously saved models (mean-rating and rating-difference),
# in that order.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# model files that come from a trusted source.
rf_mean, rf_diff = (load(model_path) for model_path in ('rf_mean.pkl', 'rf_diff.pkl'))

In [151]:
# Feature subsets passed to each saved model. The order of the names is kept
# exactly as-is because it determines the column order handed to predict().

# Columns used by the rating-difference model (includes the '0-1' / '1-0'
# result indicators).
to_keep_diff = [
    '0-1', 'stockfish_max', 'stockfish_mean', 'stockfish_min',
    '1-0', 'black_std', 'black_max', 'black_mean',
    'white_mean', 'maia_b_mean', 'stockfish_std', 'white_max',
    'white_std', 'maia_b_min', 'maia_w_mean', 'black_min',
    'white_min', 'maia_w_min', 'maia_b_max', 'maia_b_std',
    'maia_w_max', 'maia_w_std', 'total_moves', 'queen_moved_at',
    'first_check_at', 'queen_changed_at', 'total_checks', 'end_p',
    'end_P',
]

# Columns used by the mean-rating model.
to_keep_mean = [
    'white_mean', 'black_mean', 'stockfish_mean', 'stockfish_std',
    'maia_w_mean', 'white_min', 'maia_b_mean', 'black_min',
    'total_moves', 'stockfish_min', 'stockfish_max', 'first_check_at',
    'white_std', 'maia_b_max', 'maia_b_min', 'maia_w_min',
    'maia_b_std', 'maia_w_max', 'white_max', 'queen_moved_at',
    'maia_w_std', 'black_std', 'black_max', 'queen_changed_at',
    'total_checks', 'end_P', 'end_p',
]

In [152]:
# Each model only sees its own feature subset, in the order listed in
# to_keep_diff / to_keep_mean.
diff_features = X[to_keep_diff]
mean_features = X[to_keep_mean]

pred_diff = rf_diff.predict(diff_features)
pred_mean = rf_mean.predict(mean_features)

In [153]:
# Attach the predictions to the games frame. pred_diff / pred_mean were just
# computed from the same models and the same inputs in the previous cell, so
# they are reused here instead of running both forests a second time.
df["pred_diff"] = pred_diff
df["pred_mean"] = pred_mean

In [154]:
# Recover one rating per player from the two model outputs: the predicted
# difference is split evenly around the predicted mean
# (white = mean + diff/2, black = mean - diff/2).
df["white_pred"] = df["pred_mean"].add(df["pred_diff"].div(2))
df["black_pred"] = df["pred_mean"].sub(df["pred_diff"].div(2))

In [155]:
# Display the column labels currently on df (now including the prediction columns).
df.columns

Index(['event', 'round', 'white', 'black', 'result', 'moves', 'white_elo',
       'black_elo', 'ECO', 'Opening', 'white_cpl', 'black_cpl',
       'stockfish_eval', 'queen_moved_at', 'queen_changed_at', 'total_checks',
       'first_check_at', 'total_moves', 'end_r', 'end_k', 'end_p', 'end_Q',
       'end_b', 'end_P', 'end_R', 'end_K', 'end_n', 'end_q', 'end_B', 'end_N',
       'promotion', 'can_claim_draw', 'insufficient_material', 'maia_cpl_w',
       'maia_cpl_b', 'mean_elos', 'diff_elos', 'white_mean', 'white_std',
       'white_min', 'white_max', 'black_mean', 'black_std', 'black_min',
       'black_max', 'stockfish_mean', 'stockfish_std', 'stockfish_min',
       'stockfish_max', 'maia_w_mean', 'maia_w_std', 'maia_w_min',
       'maia_w_max', 'maia_b_mean', 'maia_b_std', 'maia_b_min', 'maia_b_max',
       'pred_diff', 'pred_mean', 'white_pred', 'black_pred'],
      dtype='object')

In [156]:
# df.to_json('data/curated/FIDE_MAIA_PREDS.ndjson', orient='records', lines=True)
# df = pd.read_json('data/curated/FIDE_MAIA_PREDS.ndjson', lines=True)

In [157]:
# True ratings as plain lists, taken straight from the columns rather than
# iterating row by row. Row order is preserved.
white_actual = df["white_elo"].tolist()
black_actual = df["black_elo"].tolist()

# All white ratings first, then all black ratings.
all_actual = white_actual + black_actual

In [158]:
# Predicted ratings as plain lists, taken straight from the columns rather
# than iterating row by row. Row order is preserved.
white_pred = df["white_pred"].tolist()
black_pred = df["black_pred"].tolist()

# All white predictions first, then all black predictions.
all_preds = white_pred + black_pred

In [159]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Work on numpy arrays so boolean masking is available.
all_actual = np.array(all_actual)
all_preds = np.array(all_preds)

# A pair is usable only when neither the actual nor the predicted value is NaN.
mask = ~(np.isnan(all_actual) | np.isnan(all_preds))
all_actual_clean, all_preds_clean = all_actual[mask], all_preds[mask]

# Mean absolute error over all usable (actual, predicted) rating pairs.
mae = mean_absolute_error(all_actual_clean, all_preds_clean)
print(mae)

346.7468325699287


In [160]:
# Number of (actual, predicted) pairs that survived the NaN mask and entered the MAE.
len(all_actual_clean)

46312

Now we try to predict each player's Elo rating directly, without predicting the rating difference.

In [161]:
# Display the column labels available on df before reshaping it to one row per player.
df.columns

Index(['event', 'round', 'white', 'black', 'result', 'moves', 'white_elo',
       'black_elo', 'ECO', 'Opening', 'white_cpl', 'black_cpl',
       'stockfish_eval', 'queen_moved_at', 'queen_changed_at', 'total_checks',
       'first_check_at', 'total_moves', 'end_r', 'end_k', 'end_p', 'end_Q',
       'end_b', 'end_P', 'end_R', 'end_K', 'end_n', 'end_q', 'end_B', 'end_N',
       'promotion', 'can_claim_draw', 'insufficient_material', 'maia_cpl_w',
       'maia_cpl_b', 'mean_elos', 'diff_elos', 'white_mean', 'white_std',
       'white_min', 'white_max', 'black_mean', 'black_std', 'black_min',
       'black_max', 'stockfish_mean', 'stockfish_std', 'stockfish_min',
       'stockfish_max', 'maia_w_mean', 'maia_w_std', 'maia_w_min',
       'maia_w_max', 'maia_b_mean', 'maia_b_std', 'maia_b_min', 'maia_b_max',
       'pred_diff', 'pred_mean', 'white_pred', 'black_pred'],
      dtype='object')

In [162]:
# (output key, white-side source column, black-side source column).
# Game-level columns repeat the same source name for both sides. The tuple
# order is the key order of every record.
_PLAYER_FIELDS = (
    ("name", "white", "black"),
    ("event", "event", "event"),
    ("opening", "Opening", "Opening"),
    ("elo", "white_elo", "black_elo"),
    ("stockfish_mean", "white_mean", "black_mean"),
    ("stockfish_min", "white_min", "black_min"),
    ("stockfish_max", "white_max", "black_max"),
    ("stockfish_std", "white_std", "black_std"),
    ("maia_mean", "maia_w_mean", "maia_b_mean"),
    ("maia_min", "maia_w_min", "maia_b_min"),
    ("maia_max", "maia_w_max", "maia_b_max"),
    ("maia_std", "maia_w_std", "maia_b_std"),
    ("result", "result", "result"),
)


def _player_record(game, is_white):
    """Build the one-player record for one side of a game row.

    game: a row of df (as yielded by iterrows).
    is_white: 1 to read the white-side columns, 0 to read the black-side ones;
        the value is stored unchanged under "is_white".
    Returns a dict with the keys of _PLAYER_FIELDS followed by "is_white" and
    "moves".
    """
    source_slot = 1 if is_white else 2
    record = {field[0]: game[field[source_slot]] for field in _PLAYER_FIELDS}
    record["is_white"] = is_white
    record['moves'] = game["moves"]
    return record


# Reshape from one row per game to one record per player, white first then
# black for each game.
games = []
for game_label, game in df.iterrows():
    white = _player_record(game, 1)
    black = _player_record(game, 0)
    games += [white, black]

In [163]:
# One row per player-game, built in a single step from the list of record dicts.
g = pd.DataFrame(data=games)

In [164]:
# Persist the per-player table as newline-delimited JSON (one record per line).
g.to_json(
    path_or_buf='data/curated/FIDE_INDIVIDUAL.ndjson',
    orient='records',
    lines=True,
)

In [165]:
# Preview the first rows of the per-player table.
g.head()

Unnamed: 0,name,event,opening,elo,stockfish_mean,stockfish_min,stockfish_max,stockfish_std,maia_mean,maia_min,maia_max,maia_std,result,is_white,moves
0,"Tran, Thi Bich Thuy",FIDE Online Olympiad for people with disabilities,"Trompovsky attack (Ruth, Opovcensky opening)",1422.0,22.057143,-28,121,34.22943,3.777778,-117,104,37.087068,1-0,1,"[d2d4, g8f6, c1g5, d7d6, g5f6, g7f6, e2e3, e7e..."
1,"Zhukovskaya, Nina",FIDE Online Olympiad for people with disabilities,"Trompovsky attack (Ruth, Opovcensky opening)",1475.0,35.939394,-22,408,75.443713,-12.542857,-634,624,185.118239,1-0,0,"[d2d4, g8f6, c1g5, d7d6, g5f6, g7f6, e2e3, e7e..."
2,"Molenda, Marcin",FIDE Online Olympiad for people with disabilities,QGD Slav,2349.0,15.15,-25,193,46.988589,-9.75,-165,171,60.9917,1-0,1,"[d2d4, d7d5, c2c4, c7c6, g1f3, g8f6, d1b3, d8b..."
3,"Alam, Md. Khorshed",FIDE Online Olympiad for people with disabilities,QGD Slav,2065.0,64.473684,-6,566,125.665122,27.210526,-89,451,104.022052,1-0,0,"[d2d4, d7d5, c2c4, c7c6, g1f3, g8f6, d1b3, d8b..."
4,"Husain, Ejaz",FIDE Online Olympiad for people with disabilities,Sicilian,1952.0,16.135135,-18,195,34.810811,-9.513514,-227,58,44.854492,0-1,1,"[e2e4, c7c5, g1f3, d7d6, c2c3, g8f6, f1e2, c8d..."
