## Path Generator (applied to Gaia DR4)

This notebook demonstrates how to use custom path generators with LSDB. 

These generators are useful for HATS-like catalogs with parquet files stored in a different directory structure.

In [1]:
import os
import lsdb
import hats.io.file_io as file_io
import hats.io.paths as paths
import pandas as pd

from dask.distributed import Client
from glob import glob
from hats.catalog import PartitionInfo
from hats.catalog.dataset.table_properties import TableProperties
from hats.io.file_io import write_parquet_metadata
from hats.io.file_io.file_pointer import get_upath
from hats.pixel_math.healpix_pixel import HealpixPixel
from lsdb import ConeSearch
from lsdb.loaders.hats.path_generator import PathGenerator
from upath import UPath

For demonstration purposes we'll use Gaia DR3 formatted in the DR4 directory structure. 

We'll call it "Gaia DR4", even though that name is not accurate.

In [2]:
# Root directory of the local catalog laid out in the Gaia DR4 structure.
# The default is the original author's local path; set the GAIA_DR4_DIR
# environment variable to point the notebook at another copy without editing
# this cell.
GAIA_DR4_DIR = os.environ.get("GAIA_DR4_DIR", "/Users/scampos/HatsSource")

In [3]:
# Dask distributed client with three workers; it is closed in the last cell.
client = Client(n_workers=3)

### Gaia helper methods

These helpers convert a HEALPix pixel to a Gaia region ID and vice versa.

In [4]:
def healpix_to_region_id(order, npix):
    """Encode a HEALPix pixel as a Gaia region ID string.

    The pixel number is decomposed into ``order + 1`` base-4 place values,
    most significant first, and each quotient is written in lowercase
    hexadecimal. For a pixel number below ``12 * 4**order`` the leading
    quotient is in 0-11 (``0``-``b``) and every following digit is in 0-3.

    Args:
        order: HEALPix order of the pixel.
        npix: pixel number at that order (presumably nested numbering;
            confirm against the Gaia DR4 layout).

    Returns:
        The region ID as a string.
    """
    remainder = npix
    hex_digits = []
    for level in reversed(range(order + 1)):
        # Peel off the contribution of this level and keep the rest.
        quotient, remainder = divmod(remainder, 4**level)
        hex_digits.append(f"{quotient:x}")
    return "".join(hex_digits)


def region_id_to_healpix(region_id):
    """Decode a Gaia region ID string into a ``HealpixPixel``.

    Inverse of ``healpix_to_region_id``: the order is one less than the number
    of characters, and the pixel number is accumulated by multiplying the
    running value by 4 and adding each character parsed as a hexadecimal digit.

    Args:
        region_id: region ID string (e.g. the part after ``region=`` in a
            directory name).

    Returns:
        The ``HealpixPixel`` built from the decoded order and pixel number.
    """
    npix = 0
    for hex_digit in region_id:
        npix = 4 * npix + int(hex_digit, base=16)
    return HealpixPixel(len(region_id) - 1, npix)


def get_region_ids():
    """List the region IDs of the ``region=*`` directories under the catalog.

    Returns:
        The text following the first ``=`` of every matching directory name,
        in the order returned by ``glob``.
    """
    # NOTE: `glob` is called without recursive=True, so "**" behaves like "*"
    # and only directories exactly one level below GAIA_DR4_DIR are searched.
    region_ids = []
    for candidate in glob(os.path.join(GAIA_DR4_DIR, "**", "region=*")):
        if not os.path.isdir(candidate):
            continue
        dir_name = os.path.basename(candidate)
        region_ids.append(dir_name.split("=", 1)[1])
    return region_ids

### Create HATS properties and metadata

The catalog properties and metadata files should live where HATS expects them.

In [5]:
def read_pyarrow_dataset():
    """Read the parquet files under ``<GAIA_DR4_DIR>/data`` as one dataset.

    Returns:
        The second element of the pair returned by
        ``file_io.read_parquet_dataset``; the caller uses its ``count_rows()``
        and ``schema``.
    """
    data_root = get_upath(GAIA_DR4_DIR) / "data"
    _, parquet_dataset = file_io.read_parquet_dataset(data_root)
    return parquet_dataset


def create_hats_properties(total_rows):
    """Write the HATS properties file for the catalog into ``GAIA_DR4_DIR``.

    Args:
        total_rows: number of rows in the catalog, recorded in the properties.
    """
    TableProperties(
        catalog_name="Gaia_DR3",
        catalog_type="object",
        total_rows=total_rows,
        ra_column="ra",
        dec_column="dec",
        healpix_column="healpix29",
        healpix_order=29,
    ).to_properties_file(GAIA_DR4_DIR)


def write_partition_info():
    """Write the partition info file listing every pixel found on disk.

    The pixels are derived from the ``region=*`` directory names returned by
    ``get_region_ids`` and decoded with ``region_id_to_healpix``.
    """
    pixels = [region_id_to_healpix(region_id) for region_id in get_region_ids()]
    # Resolve the destination through the HATS path helper (as
    # write_common_metadata does) instead of hand-building the file name, so
    # the file lands wherever HATS expects it.
    partition_info_pointer = paths.get_partition_info_pointer(GAIA_DR4_DIR)
    PartitionInfo(pixels).write_to_file(partition_info_pointer)


def write_common_metadata(schema):
    """Write the catalog's common metadata file from ``schema``.

    The destination is resolved with ``paths.get_common_metadata_pointer``;
    its parent directory is created if it does not exist yet.

    Args:
        schema: schema of the catalog's parquet data (the caller passes
            ``dataset.schema``).
    """
    metadata_pointer = paths.get_common_metadata_pointer(GAIA_DR4_DIR)
    # Only the immediate parent is created; GAIA_DR4_DIR itself must exist.
    metadata_pointer.parent.mkdir(exist_ok=True)
    write_parquet_metadata(schema, metadata_pointer)


# Generate the HATS metadata files for the catalog in GAIA_DR4_DIR:
# the properties file (with the total row count), the partition info and the
# common metadata (from the dataset schema). All three write to disk.
dataset = read_pyarrow_dataset()
create_hats_properties(total_rows=dataset.count_rows())
write_partition_info()
write_common_metadata(dataset.schema)

### Read the catalogs and crossmatch

A small cone search (2-degree radius):

In [6]:
# Cone at ra=270, dec=30 with a radius of 2 * 3600 arcsec (i.e. 2 degrees).
cone = ConeSearch(ra=270, dec=30, radius_arcsec=2 * 3600)
# Columns requested when opening both catalogs below.
columns = ["source_id", "ra", "dec"]

Reading Gaia DR3 in HATS using the default PathGenerator:

In [7]:
# Open the public Gaia DR3 HATS catalog on S3, restricted to the cone and to
# the selected columns. No path generator is passed, so the default is used.
gaia_dr3 = lsdb.open_catalog(
    "s3://stpubdata/gaia/gaia_dr3/public/hats", search_filter=cone, columns=columns
)

In [8]:
# Check the type of the path generator and display its attributes.
assert isinstance(gaia_dr3.loading_config.path_generator, PathGenerator)
gaia_dr3.loading_config.path_generator.__dict__

{'catalog_base_dir': S3Path('s3://stpubdata/gaia/gaia_dr3/public/hats/gaia_10arcs'),
 'npix_suffix': '.parquet',
 'query_url_params': None}

Gaia DR4 requires a custom path generator:

In [9]:
class GaiaPathGenerator(PathGenerator):
    """Path generator for catalogs stored in the Gaia DR4 directory layout.

    Each HEALPix pixel is mapped to
    ``<catalog_base_dir>/<data_dir>/region=<region_id>``.
    """

    def __init__(self, data_dir: str):
        """Remember the sub-directory that holds the ``region=*`` directories.

        Args:
            data_dir: path, relative to the catalog base directory, under
                which the region directories live.
        """
        super().__init__()
        self.data_dir = data_dir

    def __call__(self, pixel: HealpixPixel) -> UPath:
        """Return the location of the data for ``pixel``.

        Args:
            pixel: HEALPix pixel whose ``order`` and ``pixel`` number are
                encoded into the region ID.

        Returns:
            The path of the pixel's ``region=<region_id>`` entry.
        """
        region_dir_name = "region=" + healpix_to_region_id(pixel.order, pixel.pixel)
        # `catalog_base_dir` is not assigned in this class; presumably it is
        # provided by PathGenerator when the catalog is opened - confirm.
        data_root = self.catalog_base_dir / self.data_dir
        return data_root / region_dir_name

# Open the local catalog with the same cone and columns as Gaia DR3, but
# resolve partition paths with the custom generator ("data" is the
# sub-directory that contains the region=* directories).
gaia_dr4 = lsdb.open_catalog(
    GAIA_DR4_DIR,
    search_filter=cone,
    columns=columns,
    path_generator=GaiaPathGenerator(data_dir="data"),
)

Having a look at the loading config:

In [10]:
# Check that the custom generator was kept and display its attributes.
assert isinstance(gaia_dr4.loading_config.path_generator, GaiaPathGenerator)
gaia_dr4.loading_config.path_generator.__dict__

{'data_dir': 'data',
 'catalog_base_dir': PosixUPath('/Users/scampos/HatsSource'),
 'npix_suffix': '.parquet',
 'query_url_params': None}

- The columns are not present in `query_url_params` since we're not accessing the catalog via HTTP.

- **[TODO]**: We should still generate the filters using the moc when the healpix column != "_healpix29".

Quick sanity check to make sure we can reach the parquet files:

In [11]:
# Materialize the cone-filtered catalog; this reads the files through
# GaiaPathGenerator. The result is reused below to check the crossmatch size.
cone_df = gaia_dr4.compute()

Let's also crossmatch with the existing Gaia DR3 HATS:

In [12]:
# Crossmatch within 0.1 arcsec; gaia_dr4 columns get the "_left" suffix and
# gaia_dr3 columns the "_right" suffix.
xmatch_cat = gaia_dr4.crossmatch(gaia_dr3, radius_arcsec=0.1, suffixes=("_left", "_right"))



Each object in the cone has exactly one match, and the matched `source_id` values agree:

In [13]:
xmatch_result = xmatch_cat.compute()
# The crossmatch has as many rows as the cone search result ...
assert len(xmatch_result) == len(cone_df)
# ... and every matched pair carries the same source_id on both sides.
assert all(xmatch_result["source_id_left"] == xmatch_result["source_id_right"])

In [14]:
# Close the Dask client created at the top of the notebook.
client.close()