# Join SFD map with a point source catalog

We need LSDB for that

In [1]:
from pathlib import Path

import dask
import lsdb
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN, hipscat_id_to_healpix
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm

Data paths

Hardcoded path to PS1 DR2 object table (OTMO) and SFD map at PSC

In [2]:
# STARS_PATH = Path('/ocean/projects/phy210048p/shared/hipscat/catalogs/ps1/ps1_otmo')

# Use SDSS DR16 Quasar catalog for a while...
STARS_PATH = Path('/ocean/projects/phy210048p/shared/hipscat/catalogs/agns_dr16q_prop_May16')

# Fixed order 14 SFD map
SFD_PATH = Path('/ocean/projects/phy210048p/shared/hipscat/catalogs/sfd/sfd_order14_map')
# Multiorder SFD map, interpolation error is <1%
# SFD_PATH = Path('/ocean/projects/phy210048p/shared/hipscat/catalogs/sfd/sfd_multiorder_map')

### We are using LSDB's cross-matching interface for joining

In [3]:
class JoinWithContinuousMap(AbstractCrossmatchAlgorithm):
    DISTANCE_COLUMN_NAME = '_DIST'
    
    def crossmatch(self) -> pd.DataFrame:
        # Check that both catalogs are sorted by HIPSCAT_ID_COLUMN
        assert np.all(np.diff(self.left.index) > 0)
        assert np.all(np.diff(self.right[HIPSCAT_ID_COLUMN]) > 0)
        
        # Initial implementation with the binary search, it is O(n_star * log(n_sfd))
        # For large star catalogs it is better to use the linear search, which is O(n_star + n_sfd)
        idx = np.searchsorted(self.right[HIPSCAT_ID_COLUMN], self.left.index, side='right') - 1
        
        # np.searchsorted output must be between 0 and N,
        # so we are checking -1 case only
        assert np.all(idx >= 0)
        
        self._rename_columns_with_suffix(self.left, self.suffixes[0])
        self._rename_columns_with_suffix(self.right, self.suffixes[1])
        
        left_join_part = self.left.reset_index()
        right_join_part = self.right.iloc[idx].reset_index(drop=True)
        
        out = pd.concat(
            [
                left_join_part,
                right_join_part,
            ],
            axis=1,
        )
        out[self.DISTANCE_COLUMN_NAME] = 0.0
        out.set_index(HIPSCAT_ID_COLUMN, inplace=True)
        
        return out

In [4]:
# Make a command for dashboard ssh-tunneling

import socket
from getpass import getuser
from urllib.parse import urlparse

local_addr = '127.0.0.1:8787'
remote_host = 'bridges2.psc.edu'


def print_client_info(client):
    display(client)
    
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.connect(('1.1.1.1', 53))
        ip = s.getsockname()[0]
    username = getuser()
    dashboard_port = urlparse(client.dashboard_link).port

    print(f'''
    Copy-paste and run in your terminal:

    ssh -N -L {local_addr}:{ip}:{dashboard_port} {username}@{remote_host}

    And open this URL in your browser to see the dashboard:
    http://{local_addr}/
    ''')

In [5]:
%%time

# I have some connect issues runiing on PSC...
dask.config.set({
    'distributed.comm.timeouts.connect': '60s',
    'distributed.comm.timeouts.tcp': '60s',
})

# with SLURMCluster(
#     # Number of Dask workers per node
#     processes=4,
#     # Regular memory node type on PSC bridges2
#     queue="RM",
#     # dask_jobqueue requires cores and memory to be specified
#     # We set them to match RM specs
#     cores=128,
#     memory="256GB",
#     walltime="12:00:00",
# ) as cluster:
#     # Run multiple jobs
#     # cluster.scale(jobs=10)
#     # Allow to run more jobs
#     cluster.adapt(maximum_jobs=10)

#     with Client(cluster) as client:
with Client(n_workers=8) as client:
        print_client_info(client)

        stars = lsdb.read_hipscat(STARS_PATH)
        sfd = lsdb.read_hipscat(SFD_PATH)
        matched = stars.crossmatch(sfd, algorithm=JoinWithContinuousMap)
        mean_sfd = matched._ddf[f'ebv_{sfd.name}'].mean().compute()
mean_sfd

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 128,Total memory: 247.07 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42759,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 128
Started: Just now,Total memory: 247.07 GiB

0,1
Comm: tcp://127.0.0.1:36541,Total threads: 16
Dashboard: http://127.0.0.1:42739/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:33867,
Local directory: /tmp/dask-scratch-space/worker-_r6_o1yg,Local directory: /tmp/dask-scratch-space/worker-_r6_o1yg

0,1
Comm: tcp://127.0.0.1:41587,Total threads: 16
Dashboard: http://127.0.0.1:39199/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:37891,
Local directory: /tmp/dask-scratch-space/worker-_0h6or7h,Local directory: /tmp/dask-scratch-space/worker-_0h6or7h

0,1
Comm: tcp://127.0.0.1:38057,Total threads: 16
Dashboard: http://127.0.0.1:37989/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:34357,
Local directory: /tmp/dask-scratch-space/worker-5yx4gh5o,Local directory: /tmp/dask-scratch-space/worker-5yx4gh5o

0,1
Comm: tcp://127.0.0.1:36397,Total threads: 16
Dashboard: http://127.0.0.1:45893/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:34153,
Local directory: /tmp/dask-scratch-space/worker-ujmj3sl2,Local directory: /tmp/dask-scratch-space/worker-ujmj3sl2

0,1
Comm: tcp://127.0.0.1:44257,Total threads: 16
Dashboard: http://127.0.0.1:43037/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:45859,
Local directory: /tmp/dask-scratch-space/worker-kv_zuyvj,Local directory: /tmp/dask-scratch-space/worker-kv_zuyvj

0,1
Comm: tcp://127.0.0.1:38309,Total threads: 16
Dashboard: http://127.0.0.1:45385/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:40251,
Local directory: /tmp/dask-scratch-space/worker-9y1sb0yi,Local directory: /tmp/dask-scratch-space/worker-9y1sb0yi

0,1
Comm: tcp://127.0.0.1:35281,Total threads: 16
Dashboard: http://127.0.0.1:33635/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:38989,
Local directory: /tmp/dask-scratch-space/worker-e5t77tg0,Local directory: /tmp/dask-scratch-space/worker-e5t77tg0

0,1
Comm: tcp://127.0.0.1:33203,Total threads: 16
Dashboard: http://127.0.0.1:38235/status,Memory: 30.88 GiB
Nanny: tcp://127.0.0.1:35103,
Local directory: /tmp/dask-scratch-space/worker-56l6mtoy,Local directory: /tmp/dask-scratch-space/worker-56l6mtoy



    Copy-paste and run in your terminal:

    ssh -N -L 127.0.0.1:8787:10.8.9.182:8787 malanche@bridges2.psc.edu

    And open this URL in your browser to see the dashboard:
    http://127.0.0.1:8787/
    


2023-12-06 11:17:11,346 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/worker.py", line 1255, in heartbeat
    response = await retry_operation(
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/utils_comm.py", line 455, in retry_operation
    return await retry(
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/utils_comm.py", line 434, in retry
    return await coro()
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/si

CPU times: user 6min 6s, sys: 34.4 s, total: 6min 40s
Wall time: 6min 33s


0.0359235139702884

Alternative approach: use dustmaps package

In [5]:
%%time
import dask
import pandas as pd
from astropy.coordinates import SkyCoord
from dustmaps.sfd import SFDQuery

# Get original SFD FITS file location, INPUT_DIR
from paths import *


def worker(df, query):
    coord = SkyCoord(
        ra=df[stars.hc_structure.catalog_info.ra_column],
        dec=df[stars.hc_structure.catalog_info.dec_column],
        unit='deg',
    )
    ebv = query(coord)
    return pd.DataFrame(dict(ebv=ebv), index=df.index)


with Client(n_workers=24) as client:
    print_client_info(client)
    
    query = dask.delayed(SFDQuery, pure=True, traverse=False)(INPUT_DIR)
    
    stars = lsdb.read_hipscat(STARS_PATH)
    values = stars._ddf.map_partitions(worker, query, meta={'ebv': np.float32})
    mean_values = values.mean().compute()
    
mean_values

Configuration file not found:

    /jet/home/malanche/.dustmapsrc

To create a new configuration file in the default location, run the following python code:

    from dustmaps.config import config
    config.reset()

Note that this will delete your configuration! For example, if you have specified a data directory, then dustmaps will forget about its location.


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 24
Total threads: 144,Total memory: 247.07 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39345,Workers: 24
Dashboard: http://127.0.0.1:8787/status,Total threads: 144
Started: Just now,Total memory: 247.07 GiB

0,1
Comm: tcp://127.0.0.1:38661,Total threads: 6
Dashboard: http://127.0.0.1:45827/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:44919,
Local directory: /tmp/dask-scratch-space/worker-bmr0mrsu,Local directory: /tmp/dask-scratch-space/worker-bmr0mrsu

0,1
Comm: tcp://127.0.0.1:36515,Total threads: 6
Dashboard: http://127.0.0.1:43387/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:42607,
Local directory: /tmp/dask-scratch-space/worker-6k7zt19q,Local directory: /tmp/dask-scratch-space/worker-6k7zt19q

0,1
Comm: tcp://127.0.0.1:37867,Total threads: 6
Dashboard: http://127.0.0.1:42417/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:45325,
Local directory: /tmp/dask-scratch-space/worker-93x4kd72,Local directory: /tmp/dask-scratch-space/worker-93x4kd72

0,1
Comm: tcp://127.0.0.1:41127,Total threads: 6
Dashboard: http://127.0.0.1:36537/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:41255,
Local directory: /tmp/dask-scratch-space/worker-82p1p4ko,Local directory: /tmp/dask-scratch-space/worker-82p1p4ko

0,1
Comm: tcp://127.0.0.1:43555,Total threads: 6
Dashboard: http://127.0.0.1:37879/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:39269,
Local directory: /tmp/dask-scratch-space/worker-4_xrl416,Local directory: /tmp/dask-scratch-space/worker-4_xrl416

0,1
Comm: tcp://127.0.0.1:35303,Total threads: 6
Dashboard: http://127.0.0.1:45109/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:39753,
Local directory: /tmp/dask-scratch-space/worker-vj7di4sj,Local directory: /tmp/dask-scratch-space/worker-vj7di4sj

0,1
Comm: tcp://127.0.0.1:33627,Total threads: 6
Dashboard: http://127.0.0.1:44255/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:39615,
Local directory: /tmp/dask-scratch-space/worker-7de5skgw,Local directory: /tmp/dask-scratch-space/worker-7de5skgw

0,1
Comm: tcp://127.0.0.1:34455,Total threads: 6
Dashboard: http://127.0.0.1:41581/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:33531,
Local directory: /tmp/dask-scratch-space/worker-_1cwa8c0,Local directory: /tmp/dask-scratch-space/worker-_1cwa8c0

0,1
Comm: tcp://127.0.0.1:35981,Total threads: 6
Dashboard: http://127.0.0.1:43051/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:38413,
Local directory: /tmp/dask-scratch-space/worker-bq9ql4v3,Local directory: /tmp/dask-scratch-space/worker-bq9ql4v3

0,1
Comm: tcp://127.0.0.1:40631,Total threads: 6
Dashboard: http://127.0.0.1:41681/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:38085,
Local directory: /tmp/dask-scratch-space/worker-6iv14hg6,Local directory: /tmp/dask-scratch-space/worker-6iv14hg6

0,1
Comm: tcp://127.0.0.1:37395,Total threads: 6
Dashboard: http://127.0.0.1:39649/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:40501,
Local directory: /tmp/dask-scratch-space/worker-khd3ra9u,Local directory: /tmp/dask-scratch-space/worker-khd3ra9u

0,1
Comm: tcp://127.0.0.1:44975,Total threads: 6
Dashboard: http://127.0.0.1:44929/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:45923,
Local directory: /tmp/dask-scratch-space/worker-n81i0x0z,Local directory: /tmp/dask-scratch-space/worker-n81i0x0z

0,1
Comm: tcp://127.0.0.1:39257,Total threads: 6
Dashboard: http://127.0.0.1:42091/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:45521,
Local directory: /tmp/dask-scratch-space/worker-bgxbw02u,Local directory: /tmp/dask-scratch-space/worker-bgxbw02u

0,1
Comm: tcp://127.0.0.1:37405,Total threads: 6
Dashboard: http://127.0.0.1:46407/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:33899,
Local directory: /tmp/dask-scratch-space/worker-4ykng7a4,Local directory: /tmp/dask-scratch-space/worker-4ykng7a4

0,1
Comm: tcp://127.0.0.1:40919,Total threads: 6
Dashboard: http://127.0.0.1:38957/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:34641,
Local directory: /tmp/dask-scratch-space/worker-kvmsuvmx,Local directory: /tmp/dask-scratch-space/worker-kvmsuvmx

0,1
Comm: tcp://127.0.0.1:42955,Total threads: 6
Dashboard: http://127.0.0.1:39459/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:34095,
Local directory: /tmp/dask-scratch-space/worker-e1tsg3hy,Local directory: /tmp/dask-scratch-space/worker-e1tsg3hy

0,1
Comm: tcp://127.0.0.1:39863,Total threads: 6
Dashboard: http://127.0.0.1:43921/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:46817,
Local directory: /tmp/dask-scratch-space/worker-ih8ifrcu,Local directory: /tmp/dask-scratch-space/worker-ih8ifrcu

0,1
Comm: tcp://127.0.0.1:38045,Total threads: 6
Dashboard: http://127.0.0.1:43025/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:39133,
Local directory: /tmp/dask-scratch-space/worker-0sk7u_oj,Local directory: /tmp/dask-scratch-space/worker-0sk7u_oj

0,1
Comm: tcp://127.0.0.1:39427,Total threads: 6
Dashboard: http://127.0.0.1:43211/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:46651,
Local directory: /tmp/dask-scratch-space/worker-pseu1e3w,Local directory: /tmp/dask-scratch-space/worker-pseu1e3w

0,1
Comm: tcp://127.0.0.1:35863,Total threads: 6
Dashboard: http://127.0.0.1:36003/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:37211,
Local directory: /tmp/dask-scratch-space/worker-8s37g84r,Local directory: /tmp/dask-scratch-space/worker-8s37g84r

0,1
Comm: tcp://127.0.0.1:40451,Total threads: 6
Dashboard: http://127.0.0.1:38267/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:44783,
Local directory: /tmp/dask-scratch-space/worker-vtr0whvi,Local directory: /tmp/dask-scratch-space/worker-vtr0whvi

0,1
Comm: tcp://127.0.0.1:36759,Total threads: 6
Dashboard: http://127.0.0.1:45523/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:42919,
Local directory: /tmp/dask-scratch-space/worker-cqp6qp8d,Local directory: /tmp/dask-scratch-space/worker-cqp6qp8d

0,1
Comm: tcp://127.0.0.1:33107,Total threads: 6
Dashboard: http://127.0.0.1:42619/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:44481,
Local directory: /tmp/dask-scratch-space/worker-8wnxspus,Local directory: /tmp/dask-scratch-space/worker-8wnxspus

0,1
Comm: tcp://127.0.0.1:35785,Total threads: 6
Dashboard: http://127.0.0.1:36611/status,Memory: 10.29 GiB
Nanny: tcp://127.0.0.1:35949,
Local directory: /tmp/dask-scratch-space/worker-pj1pl9fn,Local directory: /tmp/dask-scratch-space/worker-pj1pl9fn



    Copy-paste and run in your terminal:

    ssh -N -L 127.0.0.1:8787:10.8.10.22:8787 malanche@bridges2.psc.edu

    And open this URL in your browser to see the dashboard:
    http://127.0.0.1:8787/
    
Configuration file not found:

    /jet/home/malanche/.dustmapsrc

To create a new configuration file in the default location, run the following python code:

    from dustmaps.config import config
    config.reset()

Note that this will delete your configuration! For example, if you have specified a data directory, then dustmaps will forget about its location.
Configuration file not found:

    /jet/home/malanche/.dustmapsrc

To create a new configuration file in the default location, run the following python code:

    from dustmaps.config import config
    config.reset()

Note that this will delete your configuration! For example, if you have specified a data directory, then dustmaps will forget about its location.
Configuration file not found:

    /jet/home/malanche/.dustmapsrc

2023-12-04 15:44:58,877 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/worker.py", line 1255, in heartbeat
    response = await retry_operation(
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/utils_comm.py", line 455, in retry_operation
    return await retry(
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/site-packages/distributed/utils_comm.py", line 434, in retry
    return await coro()
  File "/jet/home/malanche/.virtualenvs/sfd/lib/python3.9/si

Configuration file not found:

    /jet/home/malanche/.dustmapsrc

To create a new configuration file in the default location, run the following python code:

    from dustmaps.config import config
    config.reset()

Note that this will delete your configuration! For example, if you have specified a data directory, then dustmaps will forget about its location.




CPU times: user 4min 13s, sys: 42.4 s, total: 4min 55s
Wall time: 4min 35s


ebv    0.035924
dtype: float64

### Validation

First, we check that both hipscat indexes and SFD pixel index-order pair are all consistent 

In [6]:
np.testing.assert_array_equal(
    hipscat_id_to_healpix(result[f'_hipscat_index_{SFD_NAME}'], result[f'pixel_Norder_{SFD_NAME}']),
    result[f'pixel_Npix_{SFD_NAME}'],
)
np.testing.assert_array_equal(
    hipscat_id_to_healpix(result.index, result[f'pixel_Norder_{SFD_NAME}']),
    result[f'pixel_Npix_{SFD_NAME}'],
)

NameError: name 'SFD_NAME' is not defined

Check that SFD map values are close enough to the ones from `dustmap` module.
The difference must be below 16% for fixed order and 1% for multiorder.

In [None]:
# Validate
from astropy.coordinates import SkyCoord
from dustmaps.sfd import SFDQuery

sfd_query = SFDQuery(INPUT_DIR)
coord = SkyCoord(ra=result['ra_small_sky_order1'], dec=result['dec_small_sky_order1'], unit='deg')
dustmaps_sfd_values = sfd_query(coord)

diff = (
    np.abs(result[f'ebv_{SFD_NAME}'] - dustmaps_sfd_values)
    / np.where(result[f'ebv_{SFD_NAME}'] > dustmaps_sfd_values, result[f'ebv_{SFD_NAME}'], dustmaps_sfd_values)
)
i = np.argsort(diff)[::-1]
display(result.assign(diff=diff, ebv_dustmap=dustmaps_sfd_values).iloc[i[:10]])
diff.max()

In [None]:
area17 = 4 ** (17 - sfd._ddf['pixel_Norder'].astype(np.uint64))
area17.sum().compute(), 12 * 4 ** 17

In [None]:
import pyarrow.parquet as pq

for norder in range(8, 18):
    count = (sfd._ddf['pixel_Norder'] == norder).sum().compute()
    count_real = pq.read_metadata(PARQUET_DIR / f'pixel_Norder={norder:02d}.parquet').num_rows
    print(norder, count - count_real) 

In [None]:
import dask.array as da

index = sfd._ddf['_hipscat_index'].to_dask_array(lengths=True)
display(da.sum(da.diff(index) <= 0).compute())
index.argmin().compute()

In [None]:
from hipscat.pixel_math.hipscat_id import healpix_to_hipscat_id

index = sfd._ddf['_hipscat_index'].to_dask_array(lengths=True)
diff_index = da.diff(index)
diff_index_from_norder = sfd._ddf['pixel_Norder'].to_dask_array(lengths=True).astype(np.uint64).map_blocks(lambda order: healpix_to_hipscat_id(order, 1))[:-1]

da.sum((diff_index != diff_index_from_norder).astype(np.uint64)).compute()