In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

# Location of the Parquet file to load.
file_path = "/Users/Macington/Documents/Projects/Project Gambit/Games/ply.parquet"

# Load it as an Arrow table first, then materialise a pandas DataFrame from it.
table = pq.read_table(source=file_path)
df = table.to_pandas()

# Show the (truncated) frame as a quick sanity check of the load.
print(df)

                      pgn_id  ply             board_sum     progression_hash  \
0        4966373121947802697    0  18446462598732906495  6548726006382385350   
1        1613272150230853183    0  18446462598732906495  6548726006382385350   
2        1613272150230853183    1  18446462598735003583  6838202551196677307   
3         127762129624462438    0  18446462598732906495  6548726006382385350   
4         127762129624462438    1  18446462598749683455 -1959233720066286525   
...                      ...  ...                   ...                  ...   
7054364 -2014757526705662466  236  13835102769989829121 -3085291962456116356   
7054365 -2014757526705662466  237  13835102769985634817  2208772014170310088   
7054366 -2014757526705662466  238  13835102767846539777  5917023950694882328   
7054367 -2014757526705662466  239  13835102767578105345 -7270769053707336975   
7054368 -2014757526705662466  240  13835102492700198401  7625980533342292681   

         centipawn  
0             24.0

In [2]:
import numpy as np
import pandas as pd

# Flag 'centipawn' entries that look like "mate in X" counts stored as ordinary
# evaluations, and re-encode them as large-magnitude scores. This cell adds
# several helper columns to `df` and overwrites 'centipawn' in place.
# NOTE(review): the per-game diff / rolling / shift steps use the existing row
# order inside each 'pgn_id' -- presumably rows are already ordered by 'ply';
# confirm before relying on the result.
# NOTE(review): because 'centipawn' is overwritten below, re-running this cell
# operates on already-corrected values; reload `df` first.

# Highest ply recorded for each game, broadcast back onto every row of that game.
df['max_ply'] = df.groupby('pgn_id')['ply'].transform('max')

# Number of plies remaining until the last recorded ply of the game.
df['ply_from_end'] = df['max_ply'] - df['ply']

# Change in 'centipawn' versus the previous row of the same game.
# The first row of a game, and any diff involving a NaN, becomes 0.
df['centipawn_diff'] = df.groupby('pgn_id')['centipawn'].diff().fillna(0)

# Rolling mean of the per-row change over the last `window_size` rows of the game
# (min_periods=1, so shorter windows at the start of a game are still averaged).
window_size = 10
df['running_avg'] = df.groupby('pgn_id')['centipawn_diff'].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())

# Rolling standard deviation of the per-row change over the same window.
# A single-row window has no defined std (NaN); it is replaced by 0.
df['running_std'] = df.groupby('pgn_id')['centipawn_diff'].transform(lambda x: x.rolling(window=window_size, min_periods=1).std().fillna(0))

# Per-row threshold: two rolling standard deviations.
dynamic_significance_level = df['running_std'] * 2

# Rows whose change is at least the threshold away from the rolling mean.
# Because the comparison is `>=`, rows where the rolling std is 0 also qualify.
df['significant_deviation'] = np.abs(df['centipawn_diff'] - df['running_avg']) >= dynamic_significance_level

# 'centipawn' lies in the closed interval [-50, 50] and is not exactly 0.
df['interval_condition'] = (df['centipawn'].between(-50, 50)) & (df['centipawn'] != 0)

# |centipawn| is within 5 of the number of plies left in the game.
df['near_end_condition'] = np.abs(np.abs(df['centipawn']) - df['ply_from_end']) <= 5

# Ignore the starting position (ply 0).
df['ply_not_zero'] = df['ply'] != 0

# A row is flagged only when all four conditions above hold.
df['incorrect_notation'] = df['significant_deviation'] & df['interval_condition'] & df['near_end_condition'] & df['ply_not_zero']

# Also flag a row whose predecessor in the same game was flagged and whose value
# moved by exactly 1 in either direction.
# `fill_value=False` fills the first row of each game directly, so the column is
# not pushed through an object-dtype NaN round-trip (`shift().fillna(False)`),
# whose implicit downcast back to bool is deprecated in recent pandas.
df['prev_incorrect_notation'] = df.groupby('pgn_id')['incorrect_notation'].shift(1, fill_value=False)
df['subsequent_mate_condition'] = df['prev_incorrect_notation'] & df['centipawn_diff'].isin([1, -1])

# Fold the follow-up rows into the flag.
df['incorrect_notation'] = df['incorrect_notation'] | df['subsequent_mate_condition']

# Final row of a game with a missing 'centipawn' value.
df['checkmate_condition'] = (df['ply_from_end'] == 0) & df['centipawn'].isna()

# Fold those rows into the flag as well.
df['incorrect_notation'] = df['incorrect_notation'] | df['checkmate_condition']

# Magnitude used to re-encode flagged rows.
large_constant = 10 ** 5

# Flagged rows with a value: keep the sign, map v to sign(v) * large_constant - v
# (e.g. 3 -> 99997, -3 -> -99997).
non_nan_condition = df['incorrect_notation'] & ~df['centipawn'].isna()
df.loc[non_nan_condition, 'centipawn'] = np.sign(df.loc[non_nan_condition, 'centipawn']) * large_constant - df.loc[non_nan_condition, 'centipawn']

# Flagged rows without a value are set to -large_constant.
nan_condition = df['incorrect_notation'] & df['centipawn'].isna()
df.loc[nan_condition, 'centipawn'] = -large_constant

# Optionally, the helper columns can be dropped afterwards:
# df.drop(['max_ply', 'ply_from_end', 'centipawn_diff', 'running_avg', 'running_std', 'significant_deviation', 'interval_condition', 'near_end_condition', 'ply_not_zero', 'incorrect_notation', 'prev_incorrect_notation', 'subsequent_mate_condition', 'checkmate_condition'], axis=1, inplace=True)

# Show the frame with the re-encoded 'centipawn' column and the helper columns.
print(df)


                      pgn_id  ply             board_sum     progression_hash  \
0        4966373121947802697    0  18446462598732906495  6548726006382385350   
1        1613272150230853183    0  18446462598732906495  6548726006382385350   
2        1613272150230853183    1  18446462598735003583  6838202551196677307   
3         127762129624462438    0  18446462598732906495  6548726006382385350   
4         127762129624462438    1  18446462598749683455 -1959233720066286525   
...                      ...  ...                   ...                  ...   
7054364 -2014757526705662466  236  13835102769989829121 -3085291962456116356   
7054365 -2014757526705662466  237  13835102769985634817  2208772014170310088   
7054366 -2014757526705662466  238  13835102767846539777  5917023950694882328   
7054367 -2014757526705662466  239  13835102767578105345 -7270769053707336975   
7054368 -2014757526705662466  240  13835102492700198401  7625980533342292681   

         centipawn  max_ply  ply_from_e