In [None]:
from pathlib import Path
from shutil import copyfile

def copy_num_rows_files(src_directory: str, dest_directory: str):
    '''
    Copy the "num_rows.txt" file from every partition folder in the source directory
    into the partition folder of the same name inside the destination directory.

    A partition is skipped when it has no regular "num_rows.txt" file, or when the
    destination does not contain a directory with the same name. An existing
    "num_rows.txt" in the destination partition is overwritten.

    Arguments:
        src_directory  : Source directory containing the original partitions.
        dest_directory : Destination directory where the files will be copied.

    Raises:
        FileNotFoundError : If `src_directory` does not exist (raised while listing it).
    '''

    src_path  = Path(src_directory)
    dest_path = Path(dest_directory)

    # Loop through all total_ply directories in the source path
    for total_ply_dir in src_path.iterdir():
        if not total_ply_dir.is_dir():
            continue

        # is_file() rather than exists(): a directory that happens to be called
        # num_rows.txt cannot be copied with copyfile
        src_num_rows_path = total_ply_dir / "num_rows.txt"
        if not src_num_rows_path.is_file():
            continue

        # is_dir() rather than exists(): a stray *file* sharing the partition's name
        # would make the copy below fail with NotADirectoryError and abort the run
        dest_total_ply_dir = dest_path / total_ply_dir.name
        if not dest_total_ply_dir.is_dir():
            continue

        dest_num_rows_path = dest_total_ply_dir / "num_rows.txt"
        copyfile(src_num_rows_path, dest_num_rows_path)
        print(f"Copied {src_num_rows_path} to {dest_num_rows_path}")

    print("Copying completed!")

def main():
    '''
    Copy the "num_rows.txt" files from the Project Scotch storage into the matching
    partitions of the Project Gambit storage.
    '''

    # Storage whose partitions supply the num_rows.txt files
    scotch_storage = "/Users/Macington/Documents/Projects/Project Scotch/Games/Storage"
    # Storage whose matching partitions receive a copy
    gambit_storage = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"

    copy_num_rows_files(scotch_storage, gambit_storage)

# Run only when executed directly (notebook cell or script), not when imported
if __name__ == "__main__":
    main()

In [None]:
import os
import pandas as pd
from pathlib import Path

def deduplicate_partition(partition_path: str) -> None:
    '''
    Rewrite a Parquet partition in place with its duplicate rows dropped.

    The file is loaded, duplicate rows are removed, the index is replaced by a fresh
    0..n-1 range, and the result is written back over the original file.

    Arguments:
        partition_path : The path to the partition file.
    '''

    deduplicated = (
        pd.read_parquet(partition_path)
        .drop_duplicates()         # keeps the first occurrence of each repeated row
        .reset_index(drop=True)    # discard the gapped index left behind by the drop
    )

    # Overwrite the source file with the cleaned frame
    deduplicated.to_parquet(partition_path)

def main():
    '''
    Deduplicate the data.parquet file of every total_ply directory in the Project Gambit storage.
    '''

    storage_root = Path("/Users/Macington/Documents/Projects/Project Gambit/Games/Storage")

    # Visit every total_ply directory
    for entry in storage_root.iterdir():
        if not entry.is_dir():
            continue

        # Directories without a data.parquet file are skipped
        parquet_file = entry / "data.parquet"
        if not parquet_file.exists():
            continue

        print(f"Processing partition: {parquet_file}")
        deduplicate_partition(parquet_file)

    print("Processing completed!")

# Run only when executed directly (notebook cell or script), not when imported
if __name__ == "__main__":
    main()


In [12]:
import pandas as pd
from pathlib import Path

def add_final_centipawn_value(storage_directory: str):
    '''
    Add a `final_centipawn_value` column to each partition that contains the
    `centipawn_evaluation` value for the last move (highest `ply`) within each `game_id`.

    Every `data.parquet` file found directly inside a sub-directory of `storage_directory`
    is read, updated and written back to the same path. The row order of each partition
    is left unchanged.

    Arguments:
        storage_directory : Directory containing the partitions.
    '''

    storage_path = Path(storage_directory)

    # Loop through all total_ply directories
    for total_ply_dir in storage_path.iterdir():
        if not total_ply_dir.is_dir():
            continue

        partition_path = total_ply_dir / "data.parquet"
        if not partition_path.exists():
            continue

        df = pd.read_parquet(partition_path)

        # Order by ply before grouping so "last" really means "highest ply" rather than
        # "whichever row happens to come last in the file". The stable sort keeps the
        # file order among rows that share a ply. Note that GroupBy.last() skips nulls,
        # so a null evaluation on the final move falls back to the latest evaluated ply.
        final_centipawn_values = (
            df.sort_values('ply', kind='stable')
              .groupby('game_id')['centipawn_evaluation']
              .last()
        )

        # Broadcast each game's final value onto all of its rows
        df['final_centipawn_value'] = df['game_id'].map(final_centipawn_values)

        # Write the updated DataFrame back to the same location
        df.to_parquet(partition_path)

        print(f"Processed partition: {partition_path}")

    print("Processing completed!")

def main():
    '''
    Run `add_final_centipawn_value` over the Project Gambit storage directory.
    '''

    # Directory containing partitions
    gambit_storage = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"

    add_final_centipawn_value(gambit_storage)

# Run only when executed directly (notebook cell or script), not when imported
if __name__ == "__main__":
    main()


Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=24/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=23/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=15/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=12/data.

In [38]:
import pandas as pd
from pathlib import Path

def add_final_centipawn_value(storage_directory: str):
    '''
    Replace null values with 0 in the `centipawn_evaluation` and `final_centipawn_value`
    columns of each partition, rewriting every `data.parquet` file in place.

    NOTE(review): despite its name, this definition does not add a column. It shares its
    name with the function defined in an earlier cell and replaces it once this cell runs.

    Arguments:
        storage_directory : Directory containing the partitions.
    '''

    zero_filled_columns = ('centipawn_evaluation', 'final_centipawn_value')

    # Loop through all total_ply directories
    for total_ply_dir in Path(storage_directory).iterdir():
        partition_path = total_ply_dir / "data.parquet"

        # Only directories that actually hold a data.parquet file are processed
        if not (total_ply_dir.is_dir() and partition_path.exists()):
            continue

        df = pd.read_parquet(partition_path)

        # Replace null values with 0, one column at a time
        for column in zero_filled_columns:
            df[column] = df[column].fillna(0)

        # Write the updated DataFrame back to the same location
        df.to_parquet(partition_path)
        print(f"Processed partition: {partition_path}")

    print("Processing completed!")

def main():
    '''
    Run this cell's `add_final_centipawn_value` over the Project Gambit storage directory.
    '''

    # Directory containing partitions
    gambit_storage = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"

    add_final_centipawn_value(gambit_storage)

# Run only when executed directly (notebook cell or script), not when imported
if __name__ == "__main__":
    main()


Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=24/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=23/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=15/data.parquet
Processed partition: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=12/data.

In [28]:
from pathlib import Path
import os

def remove_num_rows_txt(storage_directory: str):
    '''
    Delete the `num_rows.txt` file from every partition folder inside the storage directory.

    Arguments:
        storage_directory : Directory containing the partitions.
    '''

    # Lazily walk the total_ply directories, ignoring loose files at the top level
    partition_dirs = (
        entry for entry in Path(storage_directory).iterdir() if entry.is_dir()
    )

    for partition_dir in partition_dirs:
        stale_file = partition_dir / "num_rows.txt"

        # Nothing to do for partitions that never had the file
        if stale_file.exists():
            os.remove(stale_file)
            print(f"Removed file: {stale_file}")

    print("Processing completed!")

def main():
    '''
    Remove the `num_rows.txt` files from the partitions of the Project Gambit storage.
    '''

    # Directory containing partitions
    gambit_storage = "/Users/Macington/Documents/Projects/Project Gambit/Games/Storage"

    remove_num_rows_txt(gambit_storage)

# Run only when executed directly (notebook cell or script), not when imported
if __name__ == "__main__":
    main()


Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=13/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=14/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=22/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=25/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=24/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=23/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=15/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=12/num_rows.txt
Removed file: /Users/Macington/Documents/Projects/Proje

In [34]:
import pyarrow.dataset as ds

def read_entire_directory(storage_directory: str, columns = None):
    '''
    Print the schema of the Parquet dataset stored in the specified storage directory.

    The directory is opened as a `pyarrow.dataset` dataset and its schema is printed.
    The function does not build or return a DataFrame.

    Arguments:
        storage_directory : Directory containing the partitions.
        columns           : List of columns to read (optional). Currently unused.

    Returns:
        None
    '''

    # NOTE(review): reading the dataset into a table and converting it to pandas
    # (dataset.to_table(columns=columns).to_pandas()) is not performed here, so
    # `columns` has no effect.
    print(ds.dataset(storage_directory, format="parquet").schema)


# Print the schema of the Project Gambit partition store
read_entire_directory(storage_directory="/Users/Macington/Documents/Projects/Project Gambit/Games/Storage")

game_id: double
pgn: string
ply: int64
board_sum: uint64
centipawn_evaluation: null
final_centipawn_value: null
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 979


In [39]:
import pandas as pd

# Load the total_ply=40 partition of the Project Gambit storage into `df`
df = pd.read_parquet('/Users/Macington/Documents/Projects/Project Gambit/Games/Storage/total_ply=40/data.parquet')
# Bare expression: the notebook renders the frame as this cell's output
df

Unnamed: 0,game_id,pgn,ply,board_sum,centipawn_evaluation,final_centipawn_value
0,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",0,18446462598732906495,0.0,-87.0
1,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",1,18446462599001337855,53.0,-87.0
2,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",2,18445336716274364415,49.0,-87.0
3,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",3,18445336716276461503,47.0,-87.0
4,4.046685e+20,"[Event ""Lugano""]\n[Site ""Lugano""]\n[Date ""1970...",4,18443093712555798463,42.0,-87.0
...,...,...,...,...,...,...
24195,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",35,9438701631127525925,357.0,429.0
24196,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",36,9437575748400552485,412.0,429.0
24197,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",37,9437574648956033573,392.0,429.0
24198,5.125144e+20,"[Event ""Heidelberg""]\n[Site ""Heidelberg""]\n[Da...",38,9293461659903433253,426.0,429.0
