# Join SFD map with a point source catalog

We use LSDB to perform the join.

In [1]:
import os
from pathlib import Path

import lsdb
import numpy as np
import pandas as pd
from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN, hipscat_id_to_healpix
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm

from paths import *

### Load data lazily

Hardcoded path to the test catalog from LSDB - sorry for that!

In [2]:
# Location of the point-source catalog (the `small_sky_order1` test catalog of LSDB).
# Set the LSDB_SMALL_SKY_ORDER1 environment variable to point at a local copy;
# the default is the original author's checkout and will not exist elsewhere.
STARS_PATH = Path(
    os.environ.get(
        'LSDB_SMALL_SKY_ORDER1',
        '/Users/hombit/projects/lincc-frameworks/lsdb/tests/data/small_sky_order1',
    )
)
# SFD_NAME is also used further down to build the suffixed column names of the
# join result, so the directory name is derived from it instead of being repeated.
SFD_NAME = 'sfd_multiorder_map'
SFD_PATH = OUTPUT_DIR / SFD_NAME

In [3]:
# Open the point-source catalog from STARS_PATH.
# The bare name on the last line renders the catalog preview (schema and partitions).
stars = lsdb.read_hipscat(STARS_PATH)
stars

Unnamed: 0_level_0,id,ra,dec,ra_error,dec_error,Norder,Dir,Npix
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,float64,float64,int64,int64,int32,int32,int32
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...


In [4]:
# Open the SFD map catalog stored under OUTPUT_DIR.
# The bare name on the last line renders the catalog preview (schema and partitions).
sfd = lsdb.read_hipscat(SFD_PATH)
sfd

Unnamed: 0_level_0,_hipscat_index,pixel_Norder,pixel_Npix,ebv,Norder,Dir,Npix
npartitions=4035,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,uint64,uint8,uint64,float32,uint8,uint64,uint64
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


### We are using LSDB's cross-matching interface for joining

In [5]:
class JoinWithContinuousMap(AbstractCrossmatchAlgorithm):
    """Join every left-catalog row with the map pixel that contains it.

    The right catalog is treated as a continuous map: each of its rows is a
    pixel whose ``HIPSCAT_ID_COLUMN`` value is used as the lower bound of the
    id range covered by that pixel. A left row is matched with the last
    pixel whose id is less than or equal to the row's own index value.
    """

    # Name of the distance column added to the output.
    DISTANCE_COLUMN_NAME = '_DIST'

    def crossmatch(self) -> pd.DataFrame:
        """Build the joined frame for one pair of partitions.

        Returns:
            A data frame indexed by ``HIPSCAT_ID_COLUMN`` (taken from the left
            index) with the suffixed left columns, the suffixed columns of the
            matched right rows and a ``_DIST`` column filled with ``0.0``.

        Raises:
            AssertionError: if the left index or the right id column is not
                strictly increasing, or if a left id is smaller than the
                first right id.
        """
        star_ids = np.asarray(self.left.index)
        pixel_ids = np.asarray(self.right[HIPSCAT_ID_COLUMN])

        # Check that both catalogs are strictly sorted by HIPSCAT_ID_COLUMN.
        # Neighbours are compared directly instead of via np.diff: for
        # unsigned ids (uint64 in these catalogs) a decreasing pair would wrap
        # around to a large positive difference and slip through a `> 0` test.
        assert np.all(star_ids[1:] > star_ids[:-1]), 'left index is not strictly increasing'
        assert np.all(pixel_ids[1:] > pixel_ids[:-1]), (
            f'right {HIPSCAT_ID_COLUMN} is not strictly increasing'
        )

        # Initial implementation with the binary search, it is O(n_star * log(n_sfd))
        # For large star catalogs it is better to use the linear search, which is O(n_star + n_sfd)
        #
        # side='right' counts the pixels whose id is <= the star id, so after
        # subtracting one we get the last such pixel. With the default
        # side='left' a star whose id equals a pixel's starting id would be
        # assigned to the previous pixel.
        idx = np.searchsorted(pixel_ids, star_ids, side='right') - 1

        # np.searchsorted output must be between 0 and N,
        # so we are checking -1 case only
        assert np.all(idx >= 0), 'some left ids precede the first right pixel'

        # The return values are ignored and the frames are used afterwards,
        # so the inherited helper is expected to rename the columns in place.
        self._rename_columns_with_suffix(self.left, self.suffixes[0])
        self._rename_columns_with_suffix(self.right, self.suffixes[1])

        # Both parts get a fresh positional index so that concat aligns them row by row.
        left_join_part = self.left.reset_index()
        right_join_part = self.right.iloc[idx].reset_index(drop=True)

        out = pd.concat([left_join_part, right_join_part], axis=1)
        # Constant placeholder; NOTE(review): presumably the LSDB crossmatch
        # interface expects a distance column - confirm.
        out[self.DISTANCE_COLUMN_NAME] = 0.0

        return out.set_index(HIPSCAT_ID_COLUMN)

In [6]:
# Run the join through the catalog crossmatch interface with the custom algorithm
# and materialise the result with .compute(); the last line displays it.
result = stars.crossmatch(sfd, algorithm=JoinWithContinuousMap).compute()
result

Unnamed: 0_level_0,id_small_sky_order1,ra_small_sky_order1,dec_small_sky_order1,ra_error_small_sky_order1,dec_error_small_sky_order1,Norder_small_sky_order1,Dir_small_sky_order1,Npix_small_sky_order1,_hipscat_index_sfd_multiorder_map,pixel_Norder_sfd_multiorder_map,pixel_Npix_sfd_multiorder_map,ebv_sfd_multiorder_map,Norder_sfd_multiorder_map,Dir_sfd_multiorder_map,Npix_sfd_multiorder_map,_DIST
_hipscat_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
12749688880727326720,707,308.5,-69.5,0,0,1,0,44,12749688827098955776,12,185532391,0.052364,4,0,2830,0.0
12751184493818150912,792,320.5,-69.5,0,0,1,0,44,12751184489330245632,13,742216623,0.036652,4,0,2831,0.0
12753202806647685120,723,315.5,-68.5,0,0,1,0,44,12753202797541851136,13,742334104,0.043459,4,0,2831,0.0
12753202806647685121,811,315.5,-68.5,0,0,1,0,44,12753202797541851136,13,742334104,0.043459,4,0,2831,0.0
12770681119980912640,826,335.5,-69.5,0,0,1,0,44,12770681119708282880,14,2973405905,0.026550,4,0,2835,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13591216801265483776,791,312.5,-28.5,0,0,1,0,47,13591216784609902592,13,791112938,0.085012,4,0,3017,0.0
13596001812279721984,824,305.5,-28.5,0,0,1,0,47,13596001811969343488,14,3165565853,0.060051,4,0,3018,0.0
13598131468743213056,702,310.5,-27.5,0,0,1,0,47,13598131467208097792,14,3166061702,0.071358,4,0,3019,0.0
13601023174257934336,767,314.5,-29.5,0,0,1,0,47,13601023157019344896,12,197920936,0.083611,4,0,3020,0.0


### Validation

First, we check that the hipscat indexes and the SFD pixel (order, index) pairs are all consistent.

In [7]:
# Columns taken from the SFD side of the join carry the catalog name as a suffix.
sfd_orders = result[f'pixel_Norder_{SFD_NAME}']
sfd_pixels = result[f'pixel_Npix_{SFD_NAME}']

# Converted at the matched pixel's order, both the pixel's own hipscat id and
# the star's hipscat id must give the matched pixel number.
for hipscat_ids in (result[f'_hipscat_index_{SFD_NAME}'], result.index):
    np.testing.assert_array_equal(
        hipscat_id_to_healpix(hipscat_ids, sfd_orders),
        sfd_pixels,
    )

Check that the SFD map values are close enough to the ones returned by the `dustmaps` module.
The relative difference must be below 16% for the fixed-order map and below 1% for the multiorder map.

In [11]:
# Validate the joined E(B-V) values against the dustmaps implementation of SFD
from astropy.coordinates import SkyCoord
from dustmaps.sfd import SFDQuery

# Tolerance for the multiorder map, as stated in the text above this cell
MAX_REL_DIFF_MULTIORDER = 0.01

sfd_query = SFDQuery(INPUT_DIR)
coord = SkyCoord(ra=result['ra_small_sky_order1'], dec=result['dec_small_sky_order1'], unit='deg')
dustmaps_sfd_values = sfd_query(coord)

# Relative difference, normalised by the larger of the two values
ebv_joined = result[f'ebv_{SFD_NAME}']
diff = (
    np.abs(ebv_joined - dustmaps_sfd_values)
    / np.maximum(ebv_joined.to_numpy(), dustmaps_sfd_values)
)

# Show the ten rows with the largest disagreement, worst first
worst_rows = np.argsort(diff.to_numpy())[::-1][:10]
display(result.assign(diff=diff, ebv_dustmap=dustmaps_sfd_values).iloc[worst_rows])

# Enforce the tolerance instead of only displaying the maximum
max_diff = diff.max()
assert max_diff < MAX_REL_DIFF_MULTIORDER, (
    f'max relative difference {max_diff} exceeds {MAX_REL_DIFF_MULTIORDER}'
)
max_diff

Unnamed: 0_level_0,id_small_sky_order1,ra_small_sky_order1,dec_small_sky_order1,ra_error_small_sky_order1,dec_error_small_sky_order1,Norder_small_sky_order1,Dir_small_sky_order1,Npix_small_sky_order1,_hipscat_index_sfd_multiorder_map,pixel_Norder_sfd_multiorder_map,pixel_Npix_sfd_multiorder_map,ebv_sfd_multiorder_map,Norder_sfd_multiorder_map,Dir_sfd_multiorder_map,Npix_sfd_multiorder_map,_DIST,diff,ebv_dustmap
_hipscat_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
12927513300782022656,760,320.5,-53.5,0,0,1,0,44,12927513254976028672,12,188120077,0.022748,4,0,2870,0.0,0.004891,0.02286
13187453677775880192,732,337.5,-39.5,0,0,1,0,45,13187453660801531904,13,767610831,0.016534,4,0,2928,0.0,0.004392,0.016461
13557123557418336256,778,313.5,-36.5,0,0,1,0,47,13557123540372684800,13,789128450,0.058645,4,0,3010,0.0,0.003584,0.058856
12924400840801779712,758,325.5,-53.5,0,0,1,0,44,12924400829615570944,13,752299141,0.020073,4,0,2869,0.0,0.00328,0.020007
13250788433850269696,803,336.5,-25.5,0,0,1,0,45,13250788416779452416,13,771297399,0.023856,4,0,2942,0.0,0.003099,0.02393
13158407025211736064,724,323.5,-41.5,0,0,1,0,45,13158407020316983296,13,765920094,0.026594,4,0,2921,0.0,0.003063,0.026513
13601023174257934336,767,314.5,-29.5,0,0,1,0,47,13601023157019344896,12,197920936,0.083611,4,0,3020,0.0,0.003036,0.083866
13488986123334057984,752,291.5,-34.5,0,0,1,0,46,13488985945805946880,11,49072645,0.109579,4,0,2995,0.0,0.003009,0.10925
13425161974698737664,783,286.5,-42.5,0,0,1,0,46,13425161907153666048,12,195361818,0.081658,4,0,2980,0.0,0.003005,0.081412
13025270726448381952,731,343.5,-52.5,0,0,1,0,45,13025270726427410432,15,12130728668,0.011725,5,10000,11568,0.0,0.002986,0.01176


0.0048911734

In [13]:
# Coverage check: a pixel of order k contains 4 ** (17 - k) pixels of order 17,
# and a full-sky HEALPix map of order 17 has 12 * 4 ** 17 pixels, so the
# per-pixel areas must add up to exactly that number.
area17 = 4 ** (17 - sfd._ddf['pixel_Norder'].astype(np.uint64))
area17_total = area17.sum().compute()
area17_expected = 12 * 4 ** 17
# Fail loudly instead of relying on a visual comparison of the two numbers
assert area17_total == area17_expected, (
    f'map covers {area17_total} order-17 pixels, expected {area17_expected}'
)
area17_total, area17_expected

(206158430208, 206158430208)

In [14]:
import pyarrow.parquet as pq

# For every pixel order, compare the number of catalog rows with the number of
# rows in the per-order parquet file under PARQUET_DIR; each line should print 0.
for norder in range(8, 18):
    parquet_file = PARQUET_DIR / f'pixel_Norder={norder:02d}.parquet'
    rows_in_file = pq.read_metadata(parquet_file).num_rows
    rows_in_catalog = (sfd._ddf['pixel_Norder'] == norder).sum().compute()
    print(norder, rows_in_catalog - rows_in_file)

8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0


In [12]:
import dask.array as da

index = sfd._ddf['_hipscat_index'].to_dask_array(lengths=True)
# Count neighbouring pairs that are not strictly increasing (expected: 0).
# The neighbours are compared directly rather than through da.diff: the index
# is unsigned, so the difference of a decreasing pair wraps around to a large
# positive number and a `diff <= 0` test would only ever catch duplicates.
display(da.sum(index[1:] <= index[:-1]).compute())
# Position of the smallest id; 0 means the catalog starts with it.
index.argmin().compute()

0

0

In [16]:
from hipscat.pixel_math.hipscat_id import healpix_to_hipscat_id


def _hipscat_id_of_pixel_one(order):
    """Hipscat id of pixel number 1 at the given order(s)."""
    # NOTE(review): presumably this equals the id range covered by one pixel
    # of that order - confirm against the hipscat documentation.
    return healpix_to_hipscat_id(order, 1)


# Actual step between the ids of consecutive map rows
index = sfd._ddf['_hipscat_index'].to_dask_array(lengths=True)
diff_index = da.diff(index)

# Step derived from each row's own pixel order; the last row has no successor
pixel_orders = sfd._ddf['pixel_Norder'].to_dask_array(lengths=True).astype(np.uint64)
diff_index_from_norder = pixel_orders.map_blocks(_hipscat_id_of_pixel_one)[:-1]

# Number of rows where the two disagree (expected: 0)
mismatches = (diff_index != diff_index_from_norder).astype(np.uint64)
da.sum(mismatches).compute()

0