# Association catalogs

This notebook demonstrates a new, work-in-progress implementation of joining two catalogs through an association catalog.

In [None]:
# Uncomment to install lsdb from the development branch named below
# (presumably the branch carrying the work-in-progress association join — confirm before use).
#%pip install git+https://github.com/astronomy-commons/lsdb.git@sandro/join-through-xmatch-association

In [1]:
import lsdb
import pandas as pd
import tempfile
from dask.distributed import Client
from pathlib import Path

In [2]:
# Scratch space for the Dask workers; `tmp_path` keeps the TemporaryDirectory
# handle and `tmp_dir` is its filesystem path.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# Local Dask cluster: 16 single-threaded workers using the scratch directory above.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    local_directory=tmp_dir,
)

Let's use Rubin DP1 Object x ZTF DR22 as an example:

In [3]:
# Root directory of the HATS catalogs used in this example.
hats_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/hats/v29_0_0")

# Load the Rubin object collection with only the ID and sky-position columns.
# `lsdb.open_catalog` is used here (as for the ZTF catalog below) so the notebook
# relies on a single catalog-loading entry point instead of mixing it with `read_hats`.
object_collection = lsdb.open_catalog(
    hats_dir / "object_collection",
    columns=["objectId", "coord_ra", "coord_dec"],
)
object_collection

Unnamed: 0_level_0,objectId,coord_ra,coord_dec
npartitions=389,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Order: 6, Pixel: 130",int64[pyarrow],double[pyarrow],double[pyarrow]
"Order: 8, Pixel: 2176",...,...,...
...,...,...,...
"Order: 9, Pixel: 2302101",...,...,...
"Order: 7, Pixel: 143884",...,...,...


In [4]:
# ZTF DR22 light-curve catalog with its margin cache, restricted to the
# ID and position columns.
ztf_dr22 = lsdb.open_catalog(
    "https://data.lsdb.io/hats/ztf_dr22/ztf_lc",
    columns=["objectid", "objra", "objdec"],
    margin_cache="https://data.lsdb.io/hats/ztf_dr22/ztf_lc_10arcs",
)
ztf_dr22

Unnamed: 0_level_0,objectid,objra,objdec
npartitions=10839,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Order: 4, Pixel: 0",int64[pyarrow],float[pyarrow],float[pyarrow]
"Order: 4, Pixel: 1",...,...,...
...,...,...,...
"Order: 5, Pixel: 12286",...,...,...
"Order: 5, Pixel: 12287",...,...,...


In [None]:
# Crossmatch Rubin objects against ZTF: up to 20 neighbors within 0.2 arcsec.
# Rubin columns keep their names; ZTF columns get the "_ztf" suffix.
xmatch = object_collection.crossmatch(
    ztf_dr22, suffixes=("", "_ztf"), radius_arcsec=0.2, n_neighbors=20
)
xmatch

Unnamed: 0_level_0,objectId,coord_ra,coord_dec,objectid_ztf,objra_ztf,objdec_ztf,_dist_arcsec
npartitions=236,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Order: 6, Pixel: 130",int64[pyarrow],double[pyarrow],double[pyarrow],int64[pyarrow],float[pyarrow],float[pyarrow],double[pyarrow]
"Order: 8, Pixel: 2176",...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
"Order: 8, Pixel: 575315",...,...,...,...,...,...,...
"Order: 7, Pixel: 143829",...,...,...,...,...,...,...


We can write the crossmatch result to disk as an association catalog with:

In [7]:
# Keep only the two ID columns and the separation produced by the crossmatch.
xmatch_links = xmatch[["objectId", "objectid_ztf", "_dist_arcsec"]]

lsdb.io.to_association(
    xmatch_links,
    # Where the association catalog is written and what it is called.
    catalog_name="object_ztf_assoc",
    base_catalog_path="object_ztf_assoc",
    overwrite=True,
    # Primary side: the Rubin object collection.
    primary_catalog_dir=object_collection.hc_structure.catalog_path,
    primary_column_association="objectId",
    primary_id_column="objectId",
    # Join side: ZTF DR22.
    join_catalog_dir=ztf_dr22.hc_structure.catalog_path,
    join_column_association="objectid_ztf",
    join_id_column="objectid",
    # Optional: records `hats_assn_max_separation` in the catalog properties.
    separation_column="_dist_arcsec",
)

We can later join through this association:

In [None]:
# Re-open the association catalog written above and use it to join
# the Rubin objects to their ZTF counterparts.
assoc = lsdb.open_catalog("object_ztf_assoc")
joined = object_collection.join(
    ztf_dr22,
    suffixes=("", "_ztf"),
    through=assoc,
)
joined

Unnamed: 0_level_0,objectId,coord_ra,coord_dec,_dist_arcsec,objectid_ztf,objra_ztf,objdec_ztf
npartitions=235,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Order: 6, Pixel: 130",int64[pyarrow],double[pyarrow],double[pyarrow],double[pyarrow],int64[pyarrow],float[pyarrow],float[pyarrow]
"Order: 8, Pixel: 2176",...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
"Order: 8, Pixel: 575315",...,...,...,...,...,...,...
"Order: 7, Pixel: 143829",...,...,...,...,...,...,...


Let's check that the IDs match:

In [None]:
# Compute both catalogs (crossmatch first, then the association join).
xmatch_df, joined_df = xmatch.compute(), joined.compute()

In [None]:
# Both ID columns must be identical between the direct crossmatch and the join
# through the association catalog.
for id_column in ("objectId", "objectid_ztf"):
    pd.testing.assert_series_equal(xmatch_df[id_column], joined_df[id_column])

### Improvements

- The association max separation calculation should not depend on a `separation_column`, since not all crossmatch algorithms produce one as an output. The calculation should instead be done directly from RA/DEC information, but we do not currently store any provenance information about the crossmatched catalogs.

- The API could be more user-friendly. We currently have to pass `to_association` many arguments that are already known at crossmatch time.

- The extra columns (e.g. `_dist_arcsec`) should be moved to the end of the joined catalog's columns; they currently appear between the primary and join catalog columns.

In [20]:
# Shut down the Dask cluster first so no worker is still using the scratch space,
# then explicitly remove the temporary directory created for `local_directory`
# instead of leaving its deletion to garbage collection / interpreter exit.
client.close()
tmp_path.cleanup()