In [1]:
import polars as pl

In [None]:
# Read all Parquet files from ../lichess_eval_parquet, sort them out-of-core, and save the result to lichess_eval_parquet_sorted

import glob
import os

# Sort every Parquet file found in `input_dir` by the "fen" column and write
# the result to `output_dir` as numbered shards of at most `batch_size` rows.
input_dir = "../lichess_eval_parquet"
output_dir = "lichess_eval_parquet_sorted"

# Ensure the output directory exists.
os.makedirs(output_dir, exist_ok=True)

# Sorted listing so the scan order is deterministic across runs.
input_files = sorted(glob.glob(f"{input_dir}/*.parquet"))
if not input_files:
    # Fail with a clear message instead of handing an empty list to polars.
    raise FileNotFoundError(f"No .parquet files found in {input_dir!r}")

# Lazy scan: nothing is read until the query is executed.
df = pl.scan_parquet(input_files)

# Sort key; change this if a different ordering is required.
sorted_df = df.sort("fen")

# Maximum number of rows per output shard.
batch_size = 100_000

# Collecting the sorted frame would materialize the entire result in memory,
# so the sort is streamed to a temporary Parquet file instead and that file
# is then re-read one shard at a time. The ".tmp" suffix keeps the file out
# of any "*.parquet" glob over the output directory.
sorted_tmp_path = os.path.join(output_dir, "_sorted_full.parquet.tmp")

part_idx = 0
try:
    sorted_df.sink_parquet(sorted_tmp_path)
    sorted_scan = pl.scan_parquet(sorted_tmp_path)

    offset = 0
    while True:
        # Only the requested slice of the sorted file is loaded.
        batch = sorted_scan.slice(offset, batch_size).collect()
        if batch.height == 0:
            break

        out_path = os.path.join(output_dir, f"part-{part_idx:05}.parquet")
        batch.write_parquet(out_path)
        part_idx += 1

        # A short shard means the end of the data was reached; skip the
        # extra empty read.
        if batch.height < batch_size:
            break
        offset += batch_size
finally:
    # Always remove the intermediate file, even if sorting or writing failed.
    if os.path.exists(sorted_tmp_path):
        os.remove(sorted_tmp_path)

print(f"Finished sorting. Output files are in {output_dir}")