In [1]:
# Install or upgrade the hats and lsdb packages for this kernel.
# NOTE(review): versions are not pinned (-U upgrades to the newest release);
# consider pinning them so the notebook stays reproducible.
%pip install -U hats lsdb

Note: you may need to restart the kernel to use updated packages.


In [1]:
import math
from pathlib import Path
from shutil import rmtree

import lsdb
from dask.distributed import Client
from lsdb.core.search.pixel_search import PixelSearch
from tqdm.auto import tqdm

from hats_reimport import hats_reimport

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# "Global" paths on PSC Bridges2 cluster
GLOBAL_HATS_PATH = Path("/ocean/projects/phy210048p/shared/hats/catalogs/")
# Relative directory for the catalogs kept alongside this notebook.
LOCAL_HATS_PATH = Path("./hats")

# PS1 object catalog and the catalog passed as its margin cache.
PS1_OTMO_PATH = GLOBAL_HATS_PATH.joinpath("ps1/ps1_otmo")
PS1_OTMO_MARGIN_PATH = GLOBAL_HATS_PATH.joinpath("ps1/ps1_otmo_10arcs")

# Zubercal catalog from the ztf_dr16 directory.
ZUBERCAL_PATH = GLOBAL_HATS_PATH.joinpath("ztf_dr16/zubercal")

# Gaia DR3 variable-star table to match against; the type tag is reused in
# the input paths and in the output catalog name below.
# NOTE(review): "vrrlyr" presumably selects the RR Lyrae table -- confirm.
GAIA_CATALOG_TYPE = "vrrlyr"
GAIA_VARS_PATH = LOCAL_HATS_PATH.joinpath(f"gaia_dr3_{GAIA_CATALOG_TYPE}")
GAIA_VARS_MARGIN_PATH = LOCAL_HATS_PATH.joinpath(
    f"gaia_dr3_{GAIA_CATALOG_TYPE}_10arcsec"
)

# PS1 column names are built as <filter letter><suffix>, one filter per
# character of PS1_FILTERS.
PS1_FILTERS = "grizy"
PS1_MAG_SUFFIXES = ["MeanPSFMag", "MeanPSFMagErr", "Flags"]

# Name of the resulting catalog and the directory the batches are written to.
OUTPUT_CATALOG_NAME = f"zubercal_{GAIA_CATALOG_TYPE}"
LSDB_OUTPUT_PATH = Path("./lsdb").joinpath(OUTPUT_CATALOG_NAME)

In [3]:
def matched_catalog(search_filter, output_catalog_name):
    """Cross-match Gaia variables with PS1 and attach Zubercal detections.

    All three input catalogs are opened with the same ``search_filter``.
    The Gaia variable catalog is cross-matched with the PS1 object catalog
    within 1 arcsecond, and the Zubercal rows are then joined as a nested
    column ``lc`` by matching PS1 ``objID`` to Zubercal ``objectid``.

    Parameters
    ----------
    search_filter
        Search object forwarded to every ``lsdb.read_hats`` call
        (the notebook passes a ``PixelSearch``).
    output_catalog_name : str
        Name given to the resulting joined catalog.

    Returns
    -------
    The catalog object returned by ``join_nested``.
    """
    # Only a handful of PS1 columns are needed: the identifier, the mean
    # coordinates and, per filter, the PSF magnitude, its error and flags.
    ps1_columns = ['objID', 'raMean', 'decMean'] + [
        f'{fltr}{suffix}' for fltr in PS1_FILTERS for suffix in PS1_MAG_SUFFIXES
    ]
    ps1_otmo = lsdb.read_hats(
        PS1_OTMO_PATH,
        margin_cache=PS1_OTMO_MARGIN_PATH,
        columns=ps1_columns,
        search_filter=search_filter,
    )

    # Zubercal catalog; coordinates and a few other columns are skipped.
    # Column description:
    # http://atua.caltech.edu/ZTF/Fields/ReadMe.txt
    zubercal = lsdb.read_hats(
        ZUBERCAL_PATH,
        columns=['mjd', 'mag', 'magerr', 'objectid', 'info', 'flag', 'band'],
        search_filter=search_filter,
    )

    gaia_var = lsdb.read_hats(
        GAIA_VARS_PATH,
        margin_cache=GAIA_VARS_MARGIN_PATH,
        search_filter=search_filter,
    )

    # Empty suffixes keep the original column names from both catalogs.
    gaia_x_ps1 = gaia_var.crossmatch(
        ps1_otmo,
        radius_arcsec=1.0,
        suffixes=["", ""],
        output_catalog_name="gaia_vars_x_ps1_otmo",
    )

    return gaia_x_ps1.join_nested(
        zubercal,
        left_on='objID',
        right_on='objectid',
        nested_column_name='lc',
        # Use the caller-supplied name; the global OUTPUT_CATALOG_NAME was
        # previously used here, which silently ignored this parameter.
        output_catalog_name=output_catalog_name,
    )

In [None]:
%%time


with Client(n_workers=6, memory_limit="40GB", threads_per_worker=1) as client:
    display(client)

    for npix in tqdm(range(12)):        
        batch_str = f"batch_{npix:02d}"
        output_path = LSDB_OUTPUT_PATH / batch_str
        output_catalog_name = f"{OUTPUT_CATALOG_NAME}_{batch_str}"

        if output_path.exists():
            if (properties := output_path / "properties").exists():
                print(f"{properties} exists, skipping")
                continue
            print(f"Deleting incomplete catalog {output_path}")
            rmtree(output_path)

        pixel_search = PixelSearch([(0, npix)])

        print("X-matching...")
        batch = matched_catalog(pixel_search, output_catalog_name)

        print(f"Matched partitions: {batch._ddf.npartitions}")

        print("Computing...")
        batch.to_hats(output_path)

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 223.52 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35005,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 223.52 GiB

0,1
Comm: tcp://127.0.0.1:39791,Total threads: 1
Dashboard: http://127.0.0.1:37671/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:35775,
Local directory: /var/tmp/dask-scratch-space/worker-yftherl1,Local directory: /var/tmp/dask-scratch-space/worker-yftherl1

0,1
Comm: tcp://127.0.0.1:38665,Total threads: 1
Dashboard: http://127.0.0.1:44707/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:36739,
Local directory: /var/tmp/dask-scratch-space/worker-1m1bxd0v,Local directory: /var/tmp/dask-scratch-space/worker-1m1bxd0v

0,1
Comm: tcp://127.0.0.1:43757,Total threads: 1
Dashboard: http://127.0.0.1:37949/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:34205,
Local directory: /var/tmp/dask-scratch-space/worker-aem_m1_c,Local directory: /var/tmp/dask-scratch-space/worker-aem_m1_c

0,1
Comm: tcp://127.0.0.1:36309,Total threads: 1
Dashboard: http://127.0.0.1:42479/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:43057,
Local directory: /var/tmp/dask-scratch-space/worker-2yo9nuug,Local directory: /var/tmp/dask-scratch-space/worker-2yo9nuug


  0%|          | 0/12 [00:00<?, ?it/s]

lsdb/zubercal_vrrlyr/batch_00/properties exists, skipping
lsdb/zubercal_vrrlyr/batch_01/properties exists, skipping
Deleting incomplete catalog lsdb/zubercal_vrrlyr/batch_02
X-matching...




Matched partitions: 2490
Computing...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2024-11-27 12:34:00,868 - distributed.worker - ERROR - Worker stream died during communication: tcp://127.0.0.1:36309
Traceback (most recent call last):
  File "/ocean/projects/phy210048p/malanche/zubercal-filtering/cenv/lib/python3.12/site-packages/tornado/iostream.py", line 861, in _read_to_buffer
    bytes_read = self.read_from_fd(buf)
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "/ocean/projects/phy210048p/malanche/zubercal-filtering/cenv/lib/python3.12/site-packages/tornado/iostream.py", line 1116, in read_from_fd
    return self.socket.recv_into(buf, len(buf))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [Errno 104] Connection reset by peer

The above exception was the direct cause of the f

X-matching...




Matched partitions: 10119
Computing...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [None]:
%%time

# Hand the batch output directory, the local catalog directory and the final
# catalog name to the project-local hats_reimport helper.
# NOTE(review): the helper is not shown here; presumably it combines the
# per-batch catalogs under LSDB_OUTPUT_PATH into one catalog -- confirm.
hats_reimport(LSDB_OUTPUT_PATH, LOCAL_HATS_PATH, OUTPUT_CATALOG_NAME)