## Read Gaia DR3 in DR4 format

This notebook demonstrates how to read Gaia DR3 formatted in the DR4 directory structure.

In [1]:
import os
import lsdb
import hats.io.file_io as file_io
import hats.io.paths as paths

from dask.distributed import Client
from glob import glob
from hats.catalog import PartitionInfo
from hats.catalog.dataset.table_properties import TableProperties
from hats.io.file_io import write_parquet_metadata
from hats.io.file_io.file_pointer import get_upath
from hats.pixel_math.healpix_pixel import HealpixPixel
from lsdb import ConeSearch
from lsdb.loaders.hats.path_generator import PathGenerator
from upath import UPath

In [2]:
# Root directory of the local Gaia catalog laid out in the DR4 structure.
# Set the GAIA_DR4_DIR environment variable to point at your own copy; the
# fallback is the path this notebook was originally written against.
gaia_dir = os.environ.get("GAIA_DR4_DIR", "/Users/scampos/HatsSource")

### Helper methods

In [3]:
def healpix_to_region_id(order, npix):
    """Encode a HEALPix pixel as a region-id string.

    The pixel number is split into ``order + 1`` place values for the powers
    ``4**order, ..., 4**0`` (most significant first). Each place value is
    formatted in lowercase hexadecimal and the pieces are concatenated. The
    leading place value is whatever is left after integer division by
    ``4**order``, so it is not limited to 0-3; the remaining ones are.

    Args:
        order: HEALPix order of the pixel.
        npix: Pixel number at that order.

    Returns:
        The region-id string (empty when ``order`` is negative).
    """
    pieces = []
    rest = npix
    for exponent in reversed(range(order + 1)):
        # Peel off the place value for 4**exponent and carry the remainder on.
        place_value, rest = divmod(rest, 4 ** exponent)
        pieces.append(format(place_value, "x"))
    return "".join(pieces)


def region_id_to_healpix(region_id):
    """Decode a region-id string into a ``HealpixPixel``.

    Each character is parsed as a hexadecimal digit and folded into the pixel
    number as a base-4 place value, most significant character first. The
    order is one less than the number of characters.

    Args:
        region_id: Region-id string, one character per place value.

    Returns:
        ``HealpixPixel(order, npix)`` built from the decoded values.
    """
    pixel_number = 0
    for char in region_id:
        pixel_number = 4 * pixel_number + int(char, base=16)
    return HealpixPixel(len(region_id) - 1, pixel_number)


def get_region_ids():
    """List the region ids of the ``region=<id>`` directories under ``gaia_dir``.

    Returns:
        The text after the first ``=`` in the name of every matching
        directory, in the order ``glob`` yields them.
    """
    # glob() is called without recursive=True, so "**" matches exactly one
    # directory level here (it behaves like "*").
    search_pattern = os.path.join(gaia_dir, "**", "region=*")
    region_ids = []
    for candidate in glob(search_pattern):
        if not os.path.isdir(candidate):
            # Skip plain files that happen to match the name pattern.
            continue
        dir_name = os.path.basename(candidate)
        region_ids.append(dir_name.split("=", 1)[1])
    return region_ids

### Create missing HATS metadata / properties

In [4]:
def read_pyarrow_dataset():
    """Open the parquet dataset stored in the ``dataset`` folder of ``gaia_dir``.

    Returns:
        The second element of the pair returned by
        ``file_io.read_parquet_dataset``.
    """
    dataset_dir = get_upath(gaia_dir) / "dataset"
    _, parquet_dataset = file_io.read_parquet_dataset(dataset_dir)
    return parquet_dataset

def create_hats_properties(total_rows):
    """Build the catalog's ``TableProperties`` and write them into ``gaia_dir``.

    Args:
        total_rows: Row count to record in the properties.
    """
    property_values = dict(
        catalog_name="Gaia_DR3",
        catalog_type="object",
        total_rows=total_rows,
        ra_column="ra",
        dec_column="dec",
        healpix_column="healpix29",
        healpix_order=29,
    )
    TableProperties(**property_values).to_properties_file(gaia_dir)

def write_partition_info():
    """Write ``partition_info.csv`` in ``gaia_dir`` from the region directories.

    Every region id returned by ``get_region_ids`` is converted with
    ``region_id_to_healpix``; the resulting pixels are handed to
    ``PartitionInfo``, which writes the file.
    """
    healpix_pixels = list(map(region_id_to_healpix, get_region_ids()))
    partition_info = PartitionInfo(healpix_pixels)
    partition_info.write_to_file(f"{gaia_dir}/partition_info.csv")

def write_common_metadata(schema):
    """Write ``schema`` to the common-metadata location of ``gaia_dir``.

    Args:
        schema: Schema passed through to ``write_parquet_metadata``.
    """
    write_parquet_metadata(schema, paths.get_common_metadata_pointer(gaia_dir))

In [5]:
# Open the existing parquet files once; the row count and schema are reused below.
dataset = read_pyarrow_dataset()
# Write the properties file, recording the dataset's total row count.
create_hats_properties(total_rows=dataset.count_rows())
# Write partition_info.csv from the region=<id> directories found on disk.
write_partition_info()
# Write the common metadata using the dataset's schema.
write_common_metadata(dataset.schema)

### Reading the catalogs and crossmatching

For demonstration purposes, we restrict both catalogs to a small cone centered on RA = 0, Dec = 0 with a radius of 3600 arcseconds (1°).

In [6]:
# Search region shared by both catalog reads below; 3600 arcsec is 1 degree.
cone = ConeSearch(ra=0,dec=0,radius_arcsec=3600)

Reading Gaia DR3 in HATS:

In [7]:
# Reference catalog: the public HATS copy of Gaia DR3 on S3, limited to the
# cone and to the three columns requested here.
gaia_dr3 = lsdb.open_catalog(
    's3://stpubdata/gaia/gaia_dr3/public/hats',
    search_filter=cone,
    columns=["source_id","ra","dec"])

Reading Gaia DR3 in DR4 format using a custom path generator:

In [8]:
class GaiaPathGenerator(PathGenerator):
    """Path generator for catalogs stored in the Gaia DR4 directory layout.

    A pixel maps to ``<base_dir>/dataset/region=<region_id>/``, where the
    region id comes from ``healpix_to_region_id``.
    """

    def __call__(self, pixel: HealpixPixel) -> UPath:
        """Return the directory path that holds the data for ``pixel``."""
        dataset_dir = get_upath(self.base_dir) / "dataset"
        region_id = healpix_to_region_id(pixel.order, pixel.pixel)
        return dataset_dir / f"region={region_id}/"
    
# Open the local DR4-layout catalog with the same cone filter, passing
# GaiaPathGenerator as the path generator type.
gaia_dr4 = lsdb.open_catalog(
    gaia_dir,
    search_filter=cone,
    path_generator_type=GaiaPathGenerator)

In [9]:
# Run the crossmatch inside a Dask client with 3 workers, scoped to this block.
with Client(n_workers=3):
    # Match within 0.5 arcsec; the suffixes tell the two catalogs' columns apart.
    xmatch_cat = gaia_dr4.crossmatch(gaia_dr3, radius_arcsec=0.5, suffixes=("_left","_right"))
    xmatch_result = xmatch_cat.compute()



Quick sanity check to make sure the `source_id` values from both catalogs match:

In [10]:
# Compare the source_id columns of the two sides of the crossmatch.
left_source_id = xmatch_result["source_id_left"]
right_source_id = xmatch_result["source_id_right"]
# NOTE(review): this passes vacuously when the crossmatch result is empty;
# consider also checking that xmatch_result is non-empty.
assert all(left_source_id == right_source_id)