In [None]:
from pathlib import Path
from shutil import copyfile

def copy_num_rows_files(src_directory: str, dest_directory: str):
    '''
    Copy the "num_rows.txt" file from every partition folder in the source directory
    into the same-named partition folder inside the destination directory. Partitions
    that have no matching folder in the destination directory are skipped, as are
    source partitions that have no "num_rows.txt" file.

    Arguments:
        src_directory  : Source directory containing the original partitions.
        dest_directory : Destination directory where the files will be copied.

    Raises:
        FileNotFoundError : If src_directory does not exist.
    '''

    src_path  = Path(src_directory)
    dest_path = Path(dest_directory)

    # Sorted so the "Copied ..." log comes out in the same order on every run
    # (iterdir() yields entries in arbitrary, filesystem-dependent order).
    for total_ply_dir in sorted(src_path.iterdir()):
        # Stray files next to the partition folders are not partitions
        if not total_ply_dir.is_dir():
            continue

        # is_file() rather than exists(): a directory that happens to be called
        # "num_rows.txt" cannot be copied with copyfile and would abort the run.
        src_num_rows_path = total_ply_dir / "num_rows.txt"
        if not src_num_rows_path.is_file():
            continue

        # is_dir() rather than exists(): if the destination holds a regular file with
        # the partition's name, copying "into" it would raise NotADirectoryError.
        dest_total_ply_dir = dest_path / total_ply_dir.name
        if not dest_total_ply_dir.is_dir():
            continue

        dest_num_rows_path = dest_total_ply_dir / "num_rows.txt"

        # Any existing num_rows.txt in the destination partition is overwritten
        copyfile(src_num_rows_path, dest_num_rows_path)
        print(f"Copied {src_num_rows_path} to {dest_num_rows_path}")

    print("Copying completed!")

def main(
    src_directory: str  = "/Users/Macington/Documents/Projects/Project Scotch/Games/Storage",
    dest_directory: str = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage",
) -> None:
    '''
    Copy every partition's "num_rows.txt" file from the source storage directory into
    the matching partition of the destination storage directory.

    Arguments:
        src_directory  : Source directory containing original partitions.
        dest_directory : Destination directory to copy files to.
    '''

    # The defaults are the original machine-specific locations; pass other paths to
    # run the copy against a different pair of storage directories.
    copy_num_rows_files(src_directory, dest_directory)

if __name__ == "__main__":
    main()

In [6]:
import os
import pandas as pd
from pathlib import Path

def deduplicate_partition(partition_path: str) -> None:
    '''
    Read the partition from the given path, deduplicate the rows, reset the index, and
    replace the file at the same path with the modified DataFrame.

    The result is first written to a temporary file next to the partition and then
    moved over the original, so a failed or interrupted write leaves the original
    partition untouched instead of a half-written file.

    Arguments:
        partition_path : The path to the partition file (a string or a path-like object).
    '''

    target   = Path(partition_path)
    tmp_path = target.with_name(target.name + ".tmp")

    # Drop fully identical rows, then renumber so the index is 0..n-1 again
    deduplicated = pd.read_parquet(target).drop_duplicates().reset_index(drop=True)

    try:
        # Write beside the target so os.replace swaps the file within one directory
        deduplicated.to_parquet(tmp_path)
        os.replace(tmp_path, target)
    finally:
        # Only present when the write or the swap failed; never leave it behind
        if tmp_path.exists():
            tmp_path.unlink()

def main(
    storage_directory: str = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage",
) -> None:
    '''
    Deduplicate the "data.parquet" file of every partition folder found directly inside
    the storage directory.

    Arguments:
        storage_directory : Directory containing the partition folders.

    Raises:
        FileNotFoundError : If storage_directory does not exist.
    '''

    storage_path = Path(storage_directory)

    # Sorted so the progress log comes out in the same order on every run
    # (iterdir() yields entries in arbitrary, filesystem-dependent order).
    for total_ply_dir in sorted(storage_path.iterdir()):
        # Stray files next to the partition folders are not partitions
        if not total_ply_dir.is_dir():
            continue

        partition_path = total_ply_dir / "data.parquet"

        # is_file() rather than exists(): only a regular file can be read as a partition
        if partition_path.is_file():
            print(f"Processing partition: {partition_path}")
            deduplicate_partition(partition_path)

    print("Processing completed!")

if __name__ == "__main__":
    main()


Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=24/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=23/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=15/data.parquet
Processing partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply

In [7]:
import pandas as pd

# Load a single partition (total_ply=40) from the Project Gambit storage directory to
# inspect its contents.
df = pd.read_parquet('/Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet')
# Bare last expression: the notebook renders the DataFrame as the cell's output.
df

Unnamed: 0,game_id,pgn,ply,board_sum,centipawn_evaluation,centipawn_diff
0,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",0,18446462598732906495,,
1,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",1,18446462599001337855,53.0,0.0
2,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",2,18445336716274364415,49.0,4.0
3,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",3,18445336716276461503,47.0,2.0
4,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",4,18443093712555798463,42.0,5.0
...,...,...,...,...,...,...
24195,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",35,9438701631127525925,357.0,29.0
24196,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",36,9437575748400552485,412.0,55.0
24197,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",37,9437574648956033573,392.0,20.0
24198,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",38,9293461659903433253,426.0,34.0
