In [35]:
import pandas as pd

# Load the total_ply=90 partition; the bare `df` expression at the end of the
# cell makes the notebook render the frame as the cell output.
partition_file = '/Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=90/data.parquet'
df = pd.read_parquet(partition_file)

df

Unnamed: 0,game_id,pgn,ply,board_sum,centipawn_evaluation,final_centipawn_value
0,7.256935e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",0,18446462598732906495,,-1.0
1,7.256935e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",1,18446462599001337855,210.0,-1.0
2,7.256935e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",2,18445336716274364415,210.0,-1.0
3,7.256935e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",3,18445336716276461503,210.0,-1.0
4,7.256935e+20,"[Event ""Sparkassen GM""]\n[Site ""Dortmund GER""]...",4,18440850708835135423,541.0,-1.0
...,...,...,...,...,...,...
80365,9.996075e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",85,9263953104615850144,,-20.0
80366,9.996075e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",86,9263953104615784608,,-20.0
80367,9.996075e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",87,9263953104615784544,,-20.0
80368,9.996075e+20,"[Event ""DSB-21.Kongress""]\n[Site ""Hamburg""]\n[...",88,9263953104611590240,,-20.0


In [37]:
import pandas as pd

def count_records(partition_path, column='centipawn_evaluation'):
    """Count the rows of a parquet partition and how many have a value in ``column``.

    Args:
        partition_path: Path of the parquet file to read.
        column: Name of the column whose non-null entries are counted.
            Defaults to ``'centipawn_evaluation'``, the column this function
            has always counted (not ``'ply'``).

    Returns:
        A ``(total_count, non_null_count)`` tuple of plain Python ints.

    Raises:
        KeyError: If ``column`` is not present in the partition.
    """
    # Read the DataFrame from the given partition
    df = pd.read_parquet(partition_path)

    # Total number of rows in the partition
    total_count = len(df)

    # Rows that actually carry a value in `column`; cast so callers get an
    # int rather than a numpy scalar.
    non_null_count = int(df[column].notna().sum())

    return total_count, non_null_count

# Path to the partition
partition_path = '/Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=100/data.parquet'

# Get the counts. The second value is the number of rows whose
# 'centipawn_evaluation' is not null; the variable keeps its historical name
# (`ply_not_null_count`) so anything else in the kernel that reads it still works.
total_count, ply_not_null_count = count_records(partition_path)

print(f"Total count of records: {total_count}")
# The label names the column that is really being counted.
print(f"Count of records where 'centipawn_evaluation' is not null: {ply_not_null_count}")

Total count of records: 67000
Count of records where 'ply' is not null: 5480


In [30]:
import pandas as pd
from pathlib import Path
import os

def update_partitions(storage_directory):
    """Clean every parquet file under ``storage_directory`` and refresh ``final_centipawn_value``.

    For each ``*.parquet`` file found in each sub-directory of
    ``storage_directory`` this:

    1. drops the leftover columns ``centipawn_diff``,
       ``final_centipawn_value_x`` and ``final_centipawn_value_y`` if present;
    2. removes duplicate ``(game_id, ply)`` rows, keeping the first occurrence
       and the existing row order;
    3. sets ``final_centipawn_value`` to the game's last non-null
       ``centipawn_evaluation`` in ``ply`` order (NaN when the game has no
       evaluation at all);
    4. writes the result back over the original file.

    Args:
        storage_directory: Directory whose sub-directories hold the parquet files.

    Raises:
        FileNotFoundError: If ``storage_directory`` does not exist.
        KeyError: If a file lacks ``game_id``, ``ply`` or ``centipawn_evaluation``.
    """
    # Sorted so the processing (and log) order is reproducible between runs.
    for partition_path in sorted(Path(storage_directory).iterdir()):
        if not partition_path.is_dir():
            continue

        # Materialise the listing before any file in the directory is written.
        for file_path in sorted(partition_path.glob("*.parquet")):
            print(f"Processing file: {file_path}")

            df = (
                pd.read_parquet(file_path)
                .drop(
                    columns=['centipawn_diff', 'final_centipawn_value_x', 'final_centipawn_value_y'],
                    errors='ignore',
                )
                # Keeps the first row of every (game_id, ply) pair, order unchanged.
                .drop_duplicates(subset=['game_id', 'ply'])
                .reset_index(drop=True)
            )

            # GroupBy.last() returns the last non-null value in *row* order, so
            # it is evaluated on a ply-sorted view; the stored row order of
            # `df` itself is left untouched.
            final_by_game = (
                df.sort_values('ply', kind='stable')
                .groupby('game_id')['centipawn_evaluation']
                .last()
            )
            df['final_centipawn_value'] = df['game_id'].map(final_by_game)

            # Write to a sibling temp file and swap it in, so an interrupted
            # run cannot leave a half-written parquet in place of the data.
            # The ".tmp" suffix keeps it out of the "*.parquet" glob.
            tmp_path = file_path.with_name(file_path.name + ".tmp")
            try:
                df.to_parquet(tmp_path)
                os.replace(tmp_path, file_path)
            finally:
                if tmp_path.exists():
                    tmp_path.unlink()

            print(f"File {file_path} updated.")

    print("All partitions processed.")

if __name__ == "__main__":
    # Root folder that contains the partition sub-directories to rewrite.
    storage_directory = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"
    update_partitions(storage_directory=storage_directory)

Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet
File /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet updated.
Processing file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=85/data.parquet
File /Users/Macington/Documents/Projects

In [27]:
import chess.pgn
import pandas as pd
from pathlib import Path
from io import StringIO
import gc

def load_previously_evaluated_sequences(storage_directory, start_ply, end_ply):
    """Build a lookup from move sequence to centipawn evaluation.

    Reads ``<storage_directory>/total_ply=<N>/data.parquet`` for every ``N`` in
    ``start_ply..end_ply`` (inclusive); missing partitions are skipped.

    When a partition has a ``ply`` column, each row's evaluation is stored
    under the first ``ply`` moves of that row's game, so every position keeps
    its own value. Partitions without a ``ply`` column fall back to assigning
    the game's last non-null evaluation to every prefix of the game.

    Rows with a null evaluation are ignored so they cannot overwrite a real
    value, each distinct PGN is parsed only once, and PGN text that
    ``chess.pgn.read_game`` cannot parse (it returns ``None``) is skipped.

    Args:
        storage_directory: Directory holding the ``total_ply=<N>`` partitions.
        start_ply: First ``total_ply`` partition to read.
        end_ply: Last ``total_ply`` partition to read (inclusive).

    Returns:
        dict mapping a space-joined move sequence (``str(move)`` of each
        mainline move) to its evaluation.
    """
    evaluated_sequences = {}
    for total_ply in range(start_ply, end_ply + 1):
        partition_path = Path(storage_directory, f"total_ply={total_ply}", "data.parquet")
        if not partition_path.exists():
            continue

        df = pd.read_parquet(partition_path)
        # A null evaluation carries no information and must not clobber a real one.
        scored = df[df['centipawn_evaluation'].notna()]
        has_ply = 'ply' in scored.columns

        # Group by PGN text so each game is parsed once instead of once per row.
        for pgn, rows in scored.groupby('pgn', sort=False):
            game = chess.pgn.read_game(StringIO(pgn))
            if game is None:
                continue
            moves = [str(move) for move in game.mainline_moves()]

            if has_ply:
                for ply, evaluation in zip(rows['ply'], rows['centipawn_evaluation']):
                    if pd.isna(ply):
                        continue
                    ply = int(ply)
                    # ply 0 is the position before any move and has no move
                    # sequence to key on; a ply beyond the game is inconsistent data.
                    if 1 <= ply <= len(moves):
                        evaluated_sequences[' '.join(moves[:ply])] = evaluation
            else:
                # No per-row ply available: keep the previous behaviour of
                # spreading one evaluation over every prefix of the game.
                evaluation = rows['centipawn_evaluation'].iloc[-1]
                for i in range(1, len(moves) + 1):
                    evaluated_sequences[' '.join(moves[:i])] = evaluation

    return evaluated_sequences

def match_and_join_sequences(storage_directory, storage_test_directory, start_ply, end_ply, evaluated_sequences):
    """Attach known evaluations to each partition and save it under ``storage_test_directory``.

    For every ``total_ply`` in ``start_ply..end_ply`` (inclusive) the partition
    ``<storage_directory>/total_ply=<N>/data.parquet`` is read (missing ones are
    skipped). Each game's move prefixes are looked up in ``evaluated_sequences``
    and the hits are left-joined onto the partition on ``(game_id, ply)`` as a
    ``centipawn_evaluation`` column. The result is written to
    ``<storage_test_directory>/total_ply=<N>/data.parquet``.

    Each distinct ``(game_id, pgn)`` pair is processed once and the matches are
    de-duplicated on ``(game_id, ply)``, so the left join never multiplies the
    partition's rows. NaN lookups are treated as "no evaluation", and PGN text
    that ``chess.pgn.read_game`` cannot parse (it returns ``None``) is skipped.

    Args:
        storage_directory: Directory holding the input ``total_ply=<N>`` partitions.
        storage_test_directory: Directory the joined partitions are written to;
            created (with parents) when missing.
        start_ply: First ``total_ply`` partition to process.
        end_ply: Last ``total_ply`` partition to process (inclusive).
        evaluated_sequences: dict mapping a space-joined move sequence to its
            evaluation.

    Raises:
        KeyError: If a partition lacks ``game_id``, ``pgn`` or ``ply``.
    """
    print("Starting matching and joining sequences...")
    for total_ply in range(start_ply, end_ply + 1):
        partition_path = Path(storage_directory, f"total_ply={total_ply}", "data.parquet")
        if not partition_path.exists():
            continue

        df = pd.read_parquet(partition_path)
        evaluations = []

        # The partition may repeat a game's PGN on many rows; walking every row
        # would emit the same (game_id, ply) match repeatedly and blow up the merge.
        games = df[['game_id', 'pgn']].drop_duplicates()
        for game_id, pgn in games.itertuples(index=False):
            game = chess.pgn.read_game(StringIO(pgn))
            if game is None:
                continue

            played = []
            for ply, move in enumerate(game.mainline_moves(), start=1):
                played.append(str(move))
                evaluation = evaluated_sequences.get(' '.join(played))
                # NaN is "unknown", the same as a missing key.
                if evaluation is not None and not pd.isna(evaluation):
                    evaluations.append((game_id, ply, evaluation))

        # Create a DataFrame with the evaluations, at most one row per (game_id, ply)
        eval_df = pd.DataFrame(evaluations, columns=['game_id', 'ply', 'centipawn_evaluation'])
        eval_df = eval_df.drop_duplicates(subset=['game_id', 'ply'])
        if eval_df.empty:
            # An empty frame has object columns; align them with the partition so
            # the merge keys and the saved column keep a stable dtype.
            eval_df = eval_df.astype({
                'game_id': df['game_id'].dtype,
                'ply': df['ply'].dtype,
                'centipawn_evaluation': 'float64',
            })

        # NOTE(review): if the input partition already has a
        # 'centipawn_evaluation' column the merge will suffix both as _x/_y.
        df = df.merge(eval_df, on=['game_id', 'ply'], how='left')
        new_partition_dir = Path(storage_test_directory, f"total_ply={total_ply}")
        new_partition_dir.mkdir(parents=True, exist_ok=True)
        new_partition_path = new_partition_dir / "data.parquet"
        df.to_parquet(new_partition_path)

        print(f"Saved partition for total_ply={total_ply}...")
        gc.collect()  # Call garbage collection inside the loop

    print("Processing completed!")

def main(
    storage_directory="/Users/Macington/Documents/Projects/Project Scotch/Games/Storage",
    storage_test_directory="/Users/Macington/Documents/Projects/Project Gambit/Games/Storage",
    start_ply=47,
    end_ply=228,
):
    """Load known evaluations, then join them onto the partitions in a ply range.

    Called with no arguments this behaves exactly as before; the parameters
    only expose the values that used to be hard-coded in the body.

    Args:
        storage_directory: Directory the input ``total_ply=<N>`` partitions are
            read from.
        storage_test_directory: Directory the previously evaluated sequences
            are loaded from and the joined partitions are written to.
        start_ply: First ``total_ply`` partition to join.
        end_ply: Last ``total_ply`` partition to join (inclusive).
    """
    print("Starting the main function...")

    # Load previously evaluated sequences. The lookup is built from the
    # partitions one ply below the range being joined (start_ply - 1 through
    # end_ply - 1); adjust the offsets here if a different window is needed.
    print("Calling load_previously_evaluated_sequences function...")
    evaluated_sequences = load_previously_evaluated_sequences(storage_test_directory, start_ply - 1, end_ply - 1)

    # Match and join sequences
    print("Calling match_and_join_sequences function...")
    match_and_join_sequences(storage_directory, storage_test_directory, start_ply, end_ply, evaluated_sequences)

    print("Main function execution completed.")

# Run the pipeline only when this code is executed directly (not when it is
# imported as a module).
if __name__ == "__main__":
    main()


Starting the main function...
Calling load_previously_evaluated_sequences function...
Calling match_and_join_sequences function...
Starting matching and joining sequences...
Saved partition for total_ply=47...
Saved partition for total_ply=48...
Saved partition for total_ply=49...
Saved partition for total_ply=50...
Saved partition for total_ply=51...
Saved partition for total_ply=52...
Saved partition for total_ply=53...
Saved partition for total_ply=54...
Saved partition for total_ply=55...
Saved partition for total_ply=56...
Saved partition for total_ply=57...
Saved partition for total_ply=58...
Saved partition for total_ply=59...
Saved partition for total_ply=60...
Saved partition for total_ply=61...
Saved partition for total_ply=62...
Saved partition for total_ply=63...
Saved partition for total_ply=64...
Saved partition for total_ply=65...
Saved partition for total_ply=66...
Saved partition for total_ply=67...
Saved partition for total_ply=68...
Saved partition for total_ply=69..

KeyboardInterrupt: 