In [85]:
import pandas as pd
import numpy as np

In [86]:
# Load the FIDE games file; lines=True reads it as newline-delimited JSON (one record per line).
fide = pd.read_json('data/curated/FIDE_MAIA_PREDS.ndjson', lines=True)

In [87]:
# Show the available columns; the flattening step further down picks its fields from these.
fide.columns

Index(['event', 'round', 'white', 'black', 'result', 'moves', 'white_elo',
       'black_elo', 'ECO', 'Opening', 'white_cpl', 'black_cpl',
       'stockfish_eval', 'queen_moved_at', 'queen_changed_at', 'total_checks',
       'first_check_at', 'total_moves', 'end_r', 'end_k', 'end_p', 'end_Q',
       'end_b', 'end_P', 'end_R', 'end_K', 'end_n', 'end_q', 'end_B', 'end_N',
       'promotion', 'can_claim_draw', 'insufficient_material', 'maia_cpl_w',
       'maia_cpl_b', 'mean_elos', 'diff_elos', 'white_mean', 'white_std',
       'white_min', 'white_max', 'black_mean', 'black_std', 'black_min',
       'black_max', 'stockfish_mean', 'stockfish_std', 'stockfish_min',
       'stockfish_max', 'maia_w_mean', 'maia_w_std', 'maia_w_min',
       'maia_w_max', 'maia_b_mean', 'maia_b_std', 'maia_b_min', 'maia_b_max',
       'pred_diff', 'pred_mean', 'white_pred', 'black_pred'],
      dtype='object')

In [88]:
# Score of each result string from White's and from Black's point of view
# (1 = win, 0.5 = draw, 0 = loss).
outcome_w = {'1-0': 1, '0-1': 0, '1/2-1/2': 0.5}
outcome_b = {'1-0': 0, '0-1': 1, '1/2-1/2': 0.5}


def _player_record(game, colour, opponent, maia_tag, outcomes, is_white):
    """
    Build the flat, colour-agnostic record for one side of a single game.

    :param game: One row of `fide` (a game with per-colour columns).
    :param colour: Column prefix of the side being described ("white" or "black").
    :param opponent: Column prefix of the other side.
    :param maia_tag: Infix used by the maia columns for this side ("w" or "b").
    :param outcomes: Mapping from the result string to this side's score.
    :param is_white: 1 when the record describes White, 0 for Black.
    :return: Dict with the same keys for either colour.
    """
    return {
        "name": game[colour],
        "event": game["event"],
        "opening": game["Opening"],
        "elo": game[f"{colour}_elo"],
        "stockfish_mean": game[f"{colour}_mean"],
        "stockfish_min": game[f"{colour}_min"],
        "stockfish_max": game[f"{colour}_max"],
        "stockfish_std": game[f"{colour}_std"],
        "maia_mean": game[f"maia_{maia_tag}_mean"],
        "maia_min": game[f"maia_{maia_tag}_min"],
        "maia_max": game[f"maia_{maia_tag}_max"],
        "maia_std": game[f"maia_{maia_tag}_std"],
        "result": outcomes[game["result"]],
        "is_white": is_white,
        "moves": game["moves"],
        "pred": game[f"{colour}_pred"],
        "opponent_elo": game[f"{opponent}_elo"],
    }


# Flatten the dataframe: every game yields two records, White first, then Black.
games = []

for _, game in fide.iterrows():
    both_sides = [
        _player_record(game, "white", "black", "w", outcome_w, 1),
        _player_record(game, "black", "white", "b", outcome_b, 0),
    ]
    games.extend(both_sides)

In [89]:
# One row per (game, player): each game contributed a White record followed by a Black record.
df = pd.DataFrame(games)

In [90]:
# Columns of the flattened frame.
# NOTE(review): the saved output lists a 'pred_diff' column, but the records built in the
# flattening loop have no such key; 'pred_diff' is only assigned to `df` in a later cell.
# The output therefore appears to come from an out-of-order run -- re-run top to bottom to confirm.
df.columns

Index(['name', 'event', 'opening', 'elo', 'stockfish_mean', 'stockfish_min',
       'stockfish_max', 'stockfish_std', 'maia_mean', 'maia_min', 'maia_max',
       'maia_std', 'result', 'is_white', 'moves', 'pred', 'pred_diff',
       'opponent_elo'],
      dtype='object')

In [91]:
# Some rows have no Elo rating for the player; collect them and peek at the first two.
no_elo = df.loc[df["elo"].isna(), :]
no_elo.head(2)

Unnamed: 0,name,event,opening,elo,stockfish_mean,stockfish_min,stockfish_max,stockfish_std,maia_mean,maia_min,maia_max,maia_std,result,is_white,moves,pred,pred_diff,opponent_elo
55,"Belyayeva, Nadezhda",FIDE Online Olympiad for people with disabilities,Ruy Lopez,,26.783784,-29,198,47.18631,-24.297297,-350,143,89.223335,0.0,0,"[e2e4, e7e5, g1f3, b8c6, f1b5, g8f6, d1e2, d7d...",1580.572697,,2001.0
70,"Arsova, Marija",FIDE Online Olympiad for people with disabilities,Sicilian defence,,40.821429,-17,261,57.579171,31.96875,-203,576,130.843056,0.0,1,"[e2e4, c7c5, h2h3, d7d6, g1f3, g7g6, b2b3, f8g...",1336.277012,,1199.0


In [92]:
# Estimate an Elo rating, from game results and opponent ratings, for players
# that do not have one.
def calculate_elo(initial_rating, games, K=32):
    """
    Calculate a player's Elo rating based on the initial rating, game outcomes, and opponent ratings.

    Games are applied in the order given. A game is skipped when either the
    opponent's rating or the outcome is missing (None, NaN or pd.NA), because
    no expected score or rating change can be computed for it.

    :param initial_rating: The starting Elo rating of the player.
    :param games: An iterable of (opponent_rating, outcome) pairs.
                  Game outcome is 1 for a win, 0.5 for a draw, and 0 for a loss.
    :param K: The K-factor, which determines how much the rating changes after each game (default is 32).
    :return: The player's final Elo rating after all usable games; `initial_rating`
             unchanged when there are none.
    """
    rating = initial_rating

    for opponent_rating, outcome in games:
        # pd.isna handles None and pd.NA as well as NaN (np.isnan raises TypeError on None).
        # A missing outcome is skipped too, otherwise it would turn the rating into NaN
        # for every following game.
        if pd.isna(opponent_rating) or pd.isna(outcome):
            continue

        # Expected score from the standard logistic Elo curve (400-point scale)
        expected_score = 1 / (1 + 10 ** ((opponent_rating - rating) / 400))
        # Move the rating towards the actual result, scaled by the K-factor
        rating = rating + K * (outcome - expected_score)

    return rating

def calculate_player_elo(df, initial_rating=1500, K=32):
    """
    Estimate one player's Elo rating from a frame holding that player's games.

    :param df: DataFrame with an 'opponent_elo' and a 'result' column, one row per game.
               Rows are processed in the frame's existing order.
    :param initial_rating: Rating the estimate starts from (default is 1500).
    :param K: K-factor forwarded to calculate_elo (default is 32, matching its default).
    :return: The rating returned by calculate_elo for these games.
    """
    # Convert the DataFrame into a list of tuples (opponent_elo, result)
    games = list(zip(df['opponent_elo'], df['result']))
    # Use the calculate_elo function
    return calculate_elo(initial_rating, games, K=K)


In [102]:
# Group by 'name' and calculate the final Elo rating for each player.
# Only the two columns calculate_player_elo reads are selected, so the grouping column is
# not handed to the applied function (newer pandas versions deprecate that and warn).
final_ratings = (
    no_elo.groupby('name')[['opponent_elo', 'result']].apply(calculate_player_elo)
)

# Plain {name: estimated rating} mapping used to fill in the missing ratings
final_ratings_dict = final_ratings.to_dict()

  final_ratings = no_elo.groupby('name').apply(calculate_player_elo)


In [None]:
# Fill in the estimated rating only where the Elo is actually missing.
# Rows that already carry a rating keep it, even when the same player also has unrated rows:
# mapping first and falling back to df['elo'] would replace those known ratings with the estimate.
# Players that are not in the dictionary are left untouched.
df['elo'] = df['elo'].fillna(df['name'].map(final_ratings_dict))

In [100]:
# Recompute the gap between rating and prediction from the current Elo column
df["pred_diff"] = df["elo"].sub(df["pred"])

In [150]:
# Summarise each player's games per event: mean / std / min / max of the prediction gap
# and of the per-game stockfish and maia means.
_agg_sources = {
    'pred_diff': 'pred_diff',
    'stockfish_cpl': 'stockfish_mean',
    'maia_cpl': 'maia_mean',
}
_agg_stats = {'avg': 'mean', 'std': 'std', 'min': 'min', 'max': 'max'}

# Output column name -> (source column, aggregation), e.g. 'maia_cpl_avg': ('maia_mean', 'mean')
_named_aggs = {
    f'{prefix}_{suffix}': (column, func)
    for prefix, column in _agg_sources.items()
    for suffix, func in _agg_stats.items()
}

agg_df = df.groupby(['name', 'event']).agg(**_named_aggs).reset_index()

# Display the resulting DataFrame
agg_df.head(2)

Unnamed: 0,name,event,pred_diff_avg,pred_diff_std,pred_diff_min,pred_diff_max,stockfish_cpl_avg,stockfish_cpl_std,stockfish_cpl_min,stockfish_cpl_max,maia_cpl_avg,maia_cpl_std,maia_cpl_min,maia_cpl_max
0,"-, Shweta",Queen’s Chess Festival - Asia - Category D,-48.931294,72.419104,-201.952115,50.165799,50.538535,17.783643,24.076923,72.923077,19.669882,48.737898,-43.15,138.178571
1,"., Kulvinder",WUOCC - Rapid Div C,-5.379325,144.109774,-186.572245,210.12233,20.494096,17.379366,-3.920635,43.105263,-26.398804,18.141012,-50.190476,2.473684


In [110]:
# Reading in the cheaters data; its 'Player name' column is what player names are matched against.
cheaters = pd.read_csv('data/landing/Cheating.csv')

In [152]:
# Adding cheater column to our aggregated dataframe.
# Every (name, event) row whose name occurs in cheaters["Player name"] is flagged; isin is an
# exact string comparison.
# NOTE(review): presumably both sources spell and format names identically -- confirm,
# otherwise listed players are silently left unflagged.
agg_df["is_cheater"] = agg_df["name"].isin(cheaters["Player name"])

In [153]:
# Number of flagged (name, event) rows -- a player appearing in several events counts once per event
int(agg_df["is_cheater"].sum())

42

In [154]:
# Spot-check one flagged player
agg_df.loc[agg_df["name"].eq("Alkortabi,, Abdalhakeem A.")]

Unnamed: 0,name,event,pred_diff_avg,pred_diff_std,pred_diff_min,pred_diff_max,stockfish_cpl_avg,stockfish_cpl_std,stockfish_cpl_min,stockfish_cpl_max,maia_cpl_avg,maia_cpl_std,maia_cpl_min,maia_cpl_max,is_cheater
322,"Alkortabi,, Abdalhakeem A.",FIDE World Youth Rapid Champ - Africa - U18 - ...,88.981992,51.759154,11.878673,122.12876,11.821371,5.075262,7.642857,19.205128,-37.06743,15.095297,-52.8125,-21.415094,True


In [155]:
# Spot-check a flagged player who appears in more than one event
agg_df.loc[agg_df["name"].eq("Abhishek, Bhargav")]

Unnamed: 0,name,event,pred_diff_avg,pred_diff_std,pred_diff_min,pred_diff_max,stockfish_cpl_avg,stockfish_cpl_std,stockfish_cpl_min,stockfish_cpl_max,maia_cpl_avg,maia_cpl_std,maia_cpl_min,maia_cpl_max,is_cheater
75,"Abhishek, Bhargav",WUOCC - Blitz - Main list,-343.996023,232.553072,-508.435876,-179.556169,29.307407,31.358876,7.133333,51.481481,-21.477236,4.9348,-24.966667,-17.987805,True
76,"Abhishek, Bhargav",WUOCC - Blitz Div D,-528.040551,136.490258,-712.651362,-369.47101,13.985537,9.211209,0.770833,27.446429,-60.066516,26.364209,-95.448276,-21.395833,True


In [156]:
# Write the aggregated table as newline-delimited JSON (one record per line).
agg_df.to_json("data/curated/FIDE_agg.ndjson", orient='records', lines=True)

In [157]:
# Number of (name, event) rows in the aggregated table
len(agg_df)

7543