## Row-group splitting

This notebook demonstrates an initial implementation of the row-group splitting strategies discussed in recent meetings.

In [1]:
import os
import tempfile
import pyarrow.parquet as pq
import hats_import.catalog.run_import as runner

from dask.distributed import Client
from hats.pixel_math import HealpixPixel
from hats.pixel_math.spatial_index import spatial_index_to_healpix
from hats_import.catalog.arguments import ImportArguments

In [2]:
# Scratch directory for this notebook; it is also handed to the Dask client below
tmp_dir = tempfile.TemporaryDirectory()
tmp_path = tmp_dir.name

# Local Dask client that will run the import pipeline
client = Client(
    local_directory=tmp_path,
    threads_per_worker=1,
    n_workers=16,
)

### Split by `num_rows`

Let's split into row groups of size 100.

In [3]:
# Import the small sky source catalog, sorted by `source_id`, with 100-row row groups
args = ImportArguments(
    # Input file and how to read it
    input_file_list=["small_sky_source.csv"],
    file_reader="csv",
    catalog_type="source",
    ra_column="source_ra",
    dec_column="source_dec",
    # Output and scratch locations (both inside the temporary directory)
    output_artifact_name="small_sky_source_catalog",
    output_path=tmp_path,
    dask_tmp=tmp_path,
    # Pixel settings
    highest_healpix_order=2,
    pixel_threshold=3_000,
    # Sorting column
    sort_columns="source_id",
    # Custom row group size
    row_group_kwargs={"num_rows": 100},
    simple_progress_bar=True,
)
runner.run(args, client)

Planning  : 100%|██████████| 4/4 [00:00<00:00, 1195.72it/s]
Mapping   : 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Binning   : 100%|██████████| 2/2 [00:00<00:00, 418.84it/s]
Splitting : 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
Reducing  : 100%|██████████| 14/14 [00:04<00:00,  2.90it/s]
Finishing : 100%|██████████| 5/5 [00:00<00:00, 95.80it/s]


In [4]:
# Grab a single partition file
output_file = os.path.join(args.catalog_path, "dataset", "Norder=1", "Dir=0", "Npix=47.parquet")
pixel = pq.ParquetFile(output_file)
data = pixel.read()
metadata = pixel.metadata

In [5]:
# The row count stored in the file metadata agrees with the data that was read
assert metadata.num_rows == len(data) == 2397

# 2397 rows in groups of 100 -> 23 full row groups plus one partial group
assert metadata.num_row_groups == pixel.num_row_groups == 24

last_group = metadata.num_row_groups - 1
for group_index in range(metadata.num_row_groups):
    rows_in_metadata = metadata.row_group(group_index).num_rows
    rows_read = len(pixel.read_row_group(group_index))
    assert rows_in_metadata == rows_read
    # Every row group is full except the last one, which holds the remaining 97 rows
    if group_index < last_group:
        expected_num_rows = 100
    else:
        expected_num_rows = 97
    assert rows_in_metadata == expected_num_rows

Let's also check that the sorting columns were applied and saved to the parquet metadata:

In [6]:
# Sorting columns as recorded in the metadata of the first row group
recorded_sorting = metadata.row_group(0).sorting_columns
# Resolve them against the table schema; the first element holds the (name, order) pairs
ordering = pq.SortingColumn.to_ordering(data.schema, recorded_sorting)
sorting_columns = ordering[0]
sorting_columns

(('_healpix_29', 'ascending'), ('source_id', 'ascending'))

In [7]:
# Convert the partition data to a pandas DataFrame so the row order can be inspected
pixel_df = data.to_pandas()
pixel_df

Unnamed: 0,_healpix_29,source_id,source_ra,source_dec,mjd,mag,band,object_id,object_ra,object_dec
0,3388234672763677299,79057,319.689276,-35.471646,58994.729122,19.275708,u,756,319.5,-35.5
1,3388234672763677299,99998,319.689276,-35.471646,58994.729122,19.275708,u,756,319.5,-35.5
2,3388234672763677299,99999,319.689276,-35.471646,58994.729122,19.275708,u,756,319.5,-35.5
3,3388234896379895635,73886,319.592796,-35.483273,58633.906487,20.147505,y,756,319.5,-35.5
4,3388235099832987970,71841,319.679813,-35.441961,58489.957445,15.386864,r,756,319.5,-35.5
...,...,...,...,...,...,...,...,...,...,...
2392,3424208044572750931,82429,307.815932,-24.920788,59233.671893,20.213820,r,743,307.5,-25.5
2393,3424209168946108787,77865,307.979441,-24.914767,58912.155282,16.821213,z,743,307.5,-25.5
2394,3424289351527689270,85500,307.546865,-25.013211,59448.830181,17.698490,z,743,307.5,-25.5
2395,3424289397526654168,70604,307.553185,-25.007915,58406.843975,16.936253,u,743,307.5,-25.5


In [8]:
# Primary sort key: `_healpix_29` never decreases from one row to the next
healpix_values = pixel_df["_healpix_29"]
assert healpix_values.is_monotonic_increasing

# Re-sorting by the recorded keys (`_healpix_29`, then `source_id`, both ascending)
# must leave the table unchanged
resorted = data.sort_by(sorting_columns)
assert data.equals(resorted)

In [9]:
# Delete everything written by the first import (presumably so the next run can
# reuse the same output artifact name -- TODO confirm).
# NOTE(review): this also removes the directory that was passed to the Dask client
# as `local_directory`, and `tmp_path` is reused by the import below -- confirm
# that deleting it while the client is still running is intended.
tmp_dir.cleanup()

### Split by `sub-tiling`

Let's split the target pixels into HEALPix row groups of higher order (delta=2).

In [10]:
# Same import as before, but row groups are now split by HEALPix sub-tile (delta=2)
args = ImportArguments(
    # Input file and how to read it
    input_file_list=["small_sky_source.csv"],
    file_reader="csv",
    catalog_type="source",
    ra_column="source_ra",
    dec_column="source_dec",
    # Output and scratch locations (both inside the temporary directory)
    output_artifact_name="small_sky_source_catalog",
    output_path=tmp_path,
    dask_tmp=tmp_path,
    # Pixel settings
    highest_healpix_order=2,
    pixel_threshold=3_000,
    # Sorting column
    sort_columns="source_id",
    # Custom row group delta split
    row_group_kwargs={"subtile_order_delta": 2},
    simple_progress_bar=True,
)
runner.run(args, client)

Planning  : 100%|██████████| 4/4 [00:00<00:00, 1139.60it/s]
Mapping   : 100%|██████████| 1/1 [00:00<00:00, 14.65it/s]
Binning   : 100%|██████████| 2/2 [00:00<00:00, 444.43it/s]
Splitting : 100%|██████████| 1/1 [00:00<00:00,  9.29it/s]
Reducing  : 100%|██████████| 14/14 [00:01<00:00, 13.06it/s]
Finishing : 100%|██████████| 5/5 [00:00<00:00, 279.03it/s]


In [11]:
# Grab a single partition file
output_file = os.path.join(args.catalog_path, "dataset", "Norder=1", "Dir=0", "Npix=47.parquet")
pixel = pq.ParquetFile(output_file)
data = pixel.read()
metadata = pixel.metadata
assert metadata.num_rows == len(data) == 2397

With this splitting strategy, each row group of this pixel corresponds to a **non-empty** child pixel of order 3. These are the possible child pixels:

In [12]:
# List every pixel two orders above the order-1 pixel 47
parent_pixel = HealpixPixel(1, 47)
child_pixels = parent_pixel.convert_to_higher_order(delta_order=2)
print(f"Possible {len(child_pixels)} child pixels:")
child_pixels

Possible 16 child pixels:


[Order: 3, Pixel: 752,
 Order: 3, Pixel: 753,
 Order: 3, Pixel: 754,
 Order: 3, Pixel: 755,
 Order: 3, Pixel: 756,
 Order: 3, Pixel: 757,
 Order: 3, Pixel: 758,
 Order: 3, Pixel: 759,
 Order: 3, Pixel: 760,
 Order: 3, Pixel: 761,
 Order: 3, Pixel: 762,
 Order: 3, Pixel: 763,
 Order: 3, Pixel: 764,
 Order: 3, Pixel: 765,
 Order: 3, Pixel: 766,
 Order: 3, Pixel: 767]

In [13]:
# The empty sub-tiles are not kept: of the 16 possible child pixels,
# only 5 end up as row groups in this file
assert pixel.num_row_groups == 5

Let's confirm that the data was correctly partitioned:

In [14]:
# Order-3 pixel of every row group, in file order
seen_pixels = []

# Locate the `_healpix_29` column by name rather than assuming it is the first
# column, so the statistics below cannot silently come from another column
healpix29_index = metadata.schema.names.index("_healpix_29")

for i in range(metadata.num_row_groups):
    row_group = metadata.row_group(i)

    # The row group is not empty
    assert row_group.num_rows > 0
    healpix29_stats = row_group.column(healpix29_index).statistics
    min_healpix29 = healpix29_stats.min
    max_healpix29 = healpix29_stats.max

    # The data in this row group only belongs to one pixel at order 3
    pixel_min, pixel_max = spatial_index_to_healpix([min_healpix29, max_healpix29], target_order=3)
    assert pixel_min == pixel_max
    assert HealpixPixel(3, pixel_min) in child_pixels

    # The rows actually stored in the row group lie within the min/max statistics
    row_group_healpix29 = pixel.read_row_group(i)["_healpix_29"].to_numpy()
    assert (row_group_healpix29 >= min_healpix29).all()
    assert (row_group_healpix29 <= max_healpix29).all()

    seen_pixels.append(int(pixel_min))

In [15]:
# There is no overlap between row group pixels: each order-3 pixel appears only once
assert len(seen_pixels) == len(set(seen_pixels))
seen_pixels

[752, 753, 754, 755, 760]

#### WIP: Default behavior (when no strategy is specified)

For files of 100MB or more, we should split each pixel into row groups of about 100MB each. The number of rows per row group needs to be calculated dynamically from the size of each row.

In [None]:
# Shut down the Dask client first, then delete the temporary directory it was given
client.close()
tmp_dir.cleanup()