In [15]:
import pandas as pd

# Load the single parquet file of the total_ply=43 partition for inspection.
# NOTE(review): the path is absolute and machine-specific; this cell only runs on that machine.
df = pd.read_parquet('/Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=43/data.parquet')

# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,game_id,pgn,ply,board_sum,centipawn_evaluation,final_centipawn_value
0,4.316888e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",0,18446462598732906495,,5.0
1,4.316888e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",1,18446462598867122175,20.0,5.0
2,4.316888e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",2,13834811764811823103,23.0,5.0
3,4.316888e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",3,13834811764878930943,18.0,5.0
4,4.316888e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",4,13830325757437604863,19.0,5.0
...,...,...,...,...,...,...
24161,5.151859e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",38,5256002066029849169,-112.0,-556.0
24162,5.151859e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",39,5256002066029587033,-505.0,-556.0
24163,5.151859e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",40,5256002066096695897,-533.0,-556.0
24164,5.151859e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",41,5256002066094598745,-579.0,-556.0


In [13]:
import pandas as pd
from pathlib import Path
import os

def _with_final_centipawn_value(df):
    """Return a cleaned copy of ``df`` with ``final_centipawn_value`` recomputed.

    Steps:
      * drop the leftover helper/merge columns if present,
      * keep the first row for every ``(game_id, ply)`` pair (row order is preserved),
      * set ``final_centipawn_value`` to the last non-null ``centipawn_evaluation``
        of each game, where "last" means highest ``ply``.
    """
    cleaned = (
        df.drop(
            columns=['centipawn_diff', 'final_centipawn_value_x', 'final_centipawn_value_y'],
            errors='ignore',
        )
        .drop_duplicates(subset=['game_id', 'ply'])
        .reset_index(drop=True)
    )

    # GroupBy.last() follows row order, so order by ply for the aggregation only;
    # the frame that gets written keeps its original row order.
    final_by_game = (
        cleaned.sort_values('ply', kind='stable')
        .groupby('game_id')['centipawn_evaluation']
        .last()
    )
    cleaned['final_centipawn_value'] = cleaned['game_id'].map(final_by_game)
    return cleaned


def _replace_parquet(df, file_path):
    """Write ``df`` to ``file_path`` via a sibling temp file and an ``os.replace`` swap."""
    # The ".tmp" suffix keeps the temp file out of the "*.parquet" glob.
    tmp_path = file_path.with_name(file_path.name + ".tmp")
    try:
        df.to_parquet(tmp_path)
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the write or the swap failed.
        if tmp_path.exists():
            tmp_path.unlink()


def update_partitions(storage_directory):
    """Clean every parquet file in every partition folder of ``storage_directory``.

    Each immediate subdirectory is treated as a partition. Every ``*.parquet``
    file inside it is read, cleaned (helper columns dropped, duplicate
    ``(game_id, ply)`` rows removed, ``final_centipawn_value`` recomputed from the
    highest-ply non-null evaluation of each game) and written back to the same path.

    Args:
        storage_directory: Path (str or Path) of the folder holding the partition
            subdirectories.

    Returns:
        None. Files are rewritten on disk and progress is printed.
    """
    # Sorted so that the processing order (and the log) is reproducible.
    for partition_path in sorted(Path(storage_directory).iterdir()):
        if not partition_path.is_dir():
            continue

        # Materialise the listing before any file in the folder is rewritten.
        for file_path in sorted(partition_path.glob("*.parquet")):
            print(f"Processing file: {file_path}")

            updated = _with_final_centipawn_value(pd.read_parquet(file_path))

            # A failed write must not destroy the existing partition file.
            _replace_parquet(updated, file_path)

            print(f"File {file_path} updated.")

    print("All partitions processed.")

# Run the partition update only when this code is executed directly, not when imported.
if __name__ == "__main__":
    # Root folder whose immediate subdirectories are the partitions to rewrite.
    # NOTE(review): absolute, machine-specific path.
    storage_directory = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"
    update_partitions(storage_directory)

Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet
File /Users/Macington/Documents/Projects

In [None]:
import chess.pgn
import pandas as pd
from pathlib import Path
from io import StringIO
import gc

def load_previously_evaluated_sequences(storage_directory, start_ply, end_ply):
    """Build a lookup from move-sequence prefix to centipawn evaluation.

    Reads ``<storage_directory>/total_ply=<n>/data.parquet`` for every ``n`` in
    ``start_ply..end_ply`` (inclusive); missing partitions are skipped.

    When a partition has a ``ply`` column, each row's evaluation is stored under
    the space-joined first ``ply`` moves of that row's game. This is the same
    prefix-length/ply pairing that ``match_and_join_sequences`` uses when it
    looks sequences up.
    NOTE(review): assumes ``ply`` counts the moves already played (ply 0 being
    the start position) -- confirm against whatever writes these partitions.

    Without a ``ply`` column the previous behaviour is kept: the row's
    evaluation is stored under every prefix of its game.

    Rows with a missing evaluation, a non-string PGN, or a PGN that contains no
    game are skipped so they cannot overwrite a known value or crash the load.

    Args:
        storage_directory: Folder containing the ``total_ply=<n>`` partitions.
        start_ply: First ``total_ply`` partition to read (inclusive).
        end_ply: Last ``total_ply`` partition to read (inclusive).

    Returns:
        dict mapping ``"move1 move2 ..."`` (moves rendered with ``str``) to the
        evaluation. Later rows/partitions overwrite earlier ones for the same key.
    """
    evaluated_sequences = {}
    for total_ply in range(start_ply, end_ply + 1):
        partition_path = Path(storage_directory, f"total_ply={total_ply}", "data.parquet")
        if not partition_path.exists():
            continue

        df = pd.read_parquet(partition_path)
        has_ply = 'ply' in df.columns
        plies = df['ply'] if has_ply else [None] * len(df)

        # The same PGN text may appear on many rows; parse each distinct text once.
        moves_by_pgn = {}

        for pgn, ply, evaluation in zip(df['pgn'], plies, df['centipawn_evaluation']):
            if pd.isna(evaluation) or not isinstance(pgn, str):
                continue

            moves = moves_by_pgn.get(pgn)
            if moves is None:
                game = chess.pgn.read_game(StringIO(pgn))
                # read_game returns None when the text holds no game.
                moves = [] if game is None else [str(move) for move in game.mainline_moves()]
                moves_by_pgn[pgn] = moves

            if has_ply:
                if pd.isna(ply):
                    continue
                prefix_length = int(ply)
                # Ply 0 (empty prefix) is never looked up; plies past the game end are invalid.
                if 0 < prefix_length <= len(moves):
                    evaluated_sequences[' '.join(moves[:prefix_length])] = evaluation
            else:
                for i in range(1, len(moves) + 1):
                    evaluated_sequences[' '.join(moves[:i])] = evaluation

    return evaluated_sequences

def match_and_join_sequences(storage_directory, storage_test_directory, start_ply, end_ply, evaluated_sequences):
    """Attach known evaluations to each partition and save the result elsewhere.

    For every ``total_ply`` in ``start_ply..end_ply`` (inclusive) the partition
    ``<storage_directory>/total_ply=<n>/data.parquet`` is read (missing ones are
    skipped). Each distinct game is replayed from its PGN; for every prefix of
    ``i`` moves found in ``evaluated_sequences`` a ``(game_id, i, evaluation)``
    record is produced. The records are left-joined onto the partition on
    ``(game_id, ply)`` and the result is written to
    ``<storage_test_directory>/total_ply=<n>/data.parquet``.

    Each game is processed once even when the partition holds several rows per
    game, so the join cannot multiply rows. If the partition already has a
    ``centipawn_evaluation`` column, existing values are kept and only missing
    ones are filled from the lookup (instead of producing ``_x``/``_y`` columns).

    Args:
        storage_directory: Folder holding the input ``total_ply=<n>`` partitions.
        storage_test_directory: Folder receiving the output partitions; created
            if it does not exist.
        start_ply: First ``total_ply`` to process (inclusive).
        end_ply: Last ``total_ply`` to process (inclusive).
        evaluated_sequences: dict mapping space-joined move prefixes to evaluations.

    Returns:
        None. Output is written to disk and progress is printed.
    """
    print("Starting matching and joining sequences...")
    join_keys = ['game_id', 'ply']

    for total_ply in range(start_ply, end_ply + 1):
        partition_path = Path(storage_directory, f"total_ply={total_ply}", "data.parquet")
        if not partition_path.exists():
            continue

        df = pd.read_parquet(partition_path)
        evaluations = []

        # One pass per distinct game: repeating the work per row would emit the
        # same (game_id, ply) record many times and blow up the merge below.
        distinct_games = df[['game_id', 'pgn']].drop_duplicates()
        for game_id, pgn in distinct_games.itertuples(index=False):
            if not isinstance(pgn, str):
                continue
            game = chess.pgn.read_game(StringIO(pgn))
            if game is None:  # text contained no game
                continue

            prefix_moves = []
            for ply, move in enumerate(game.mainline_moves(), start=1):
                prefix_moves.append(str(move))
                evaluation = evaluated_sequences.get(' '.join(prefix_moves))
                if evaluation is not None:
                    evaluations.append((game_id, ply, evaluation))

        # Create a DataFrame with the evaluations; keys must be unique for a 1:1 join.
        eval_df = pd.DataFrame(
            evaluations, columns=['game_id', 'ply', 'centipawn_evaluation']
        ).drop_duplicates(subset=join_keys)

        if 'centipawn_evaluation' in df.columns:
            # Coalesce instead of letting merge create suffixed duplicate columns.
            df = df.merge(eval_df, on=join_keys, how='left', suffixes=('', '_matched'))
            matched = df.pop('centipawn_evaluation_matched')
            df['centipawn_evaluation'] = df['centipawn_evaluation'].combine_first(matched)
        else:
            df = df.merge(eval_df, on=join_keys, how='left')

        new_partition_dir = Path(storage_test_directory, f"total_ply={total_ply}")
        new_partition_dir.mkdir(parents=True, exist_ok=True)
        new_partition_path = new_partition_dir / "data.parquet"
        df.to_parquet(new_partition_path)

        print(f"Saved partition for total_ply={total_ply}...")
        gc.collect()  # Release the partition's memory before loading the next one

    print("Processing completed!")

def main():
    """Load known evaluations, then join them onto the unevaluated partitions.

    Evaluations are loaded from the "Project Gambit" storage for partitions one
    ply below the processed range (46..349); partitions 47..350 are then read
    from the "Project Scotch" storage, joined with those evaluations, and written
    into the "Project Gambit" storage.
    """
    print("Starting the main function...")

    source_root = "/Users/Macington/Documents/Projects/Project Scotch/Games/Storage"
    evaluated_root = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"
    first_ply, last_ply = 47, 350

    # Step 1: collect the evaluations that already exist (range shifted down by one ply).
    print("Calling load_previously_evaluated_sequences function...")
    known_evaluations = load_previously_evaluated_sequences(
        evaluated_root,
        first_ply - 1,
        last_ply - 1,
    )

    # Step 2: attach them to each source partition and save into the evaluated store.
    print("Calling match_and_join_sequences function...")
    match_and_join_sequences(
        source_root,
        evaluated_root,
        first_ply,
        last_ply,
        known_evaluations,
    )

    print("Main function execution completed.")

# Run main() only when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()
