In [9]:
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
from astropy.io import fits
from astropy.wcs import WCS

import hipscat_import.pipeline as runner
from hipscat_import.catalog.arguments import ImportArguments

In [10]:
# Directory layout used by this notebook; everything lives under DATA_DIR.
DATA_DIR = Path("./data")
# Source FITS files are read from here.
INPUT_DIR = DATA_DIR.joinpath("input")
# Intermediate parquet files produced from the FITS files.
PARQUET_DIR = DATA_DIR.joinpath("tmp-parquet")
# Destination passed to the import pipeline.
OUTPUT_DIR = DATA_DIR.joinpath("output")

# Transform FITS files to parquet

In [11]:
%%time

def produce_ids(x: np.ndarray[int], y: np.ndarray[int]) -> np.ndarray:
    """Pack two non-negative integer index arrays into one array of IDs.

    Each ID is ``x * 10**d + y``, where ``d`` is the number of decimal
    digits of the largest value in ``y``. The decimal digits of ``y``
    therefore never overlap those of ``x``, so distinct ``(x, y)`` pairs
    map to distinct IDs.

    Parameters
    ----------
    x, y : array-like of int
        Index arrays with broadcast-compatible shapes. ``y`` must be
        non-negative.

    Returns
    -------
    np.ndarray
        Array of packed IDs with the broadcast shape of ``x`` and ``y``.

    Raises
    ------
    ValueError
        If the largest value in ``y`` is negative.
    """
    x, y = np.asarray(x), np.asarray(y)
    if y.size == 0:
        # Nothing to pack; any digit count gives the same (empty) result.
        d = 1
    else:
        y_max = int(y.max())
        if y_max < 0:
            raise ValueError("y must contain non-negative values")
        # Count digits on the exact integer instead of via floating-point
        # log10: log10(0) is -inf (which cannot be converted to int), and
        # float rounding is fragile near exact powers of ten.
        d = len(str(y_max))
    return x * 10**d + y

def get_hemisphere(hemisphere: Literal["n", "s"]) -> pd.DataFrame:
    """Load one hemisphere FITS map as a flat table of pixels.

    Reads ``SFD_dust_4096_{hemisphere}gp.fits.gz`` from ``INPUT_DIR``,
    computes the sky position of every pixel from the file's WCS, keeps
    only the pixels whose ``b`` coordinate has the sign of the requested
    hemisphere, and converts the positions to ICRS.

    Parameters
    ----------
    hemisphere : {"n", "s"}
        ``"n"`` keeps pixels with ``b > 0``; any other value keeps pixels
        with ``b < 0``.

    Returns
    -------
    pd.DataFrame
        One row per retained pixel with columns ``id`` (packed pixel
        indices from ``produce_ids``), ``ra_deg``, ``dec_deg`` (ICRS, in
        degrees) and ``ebv`` (the pixel value from the map).
    """
    filename = INPUT_DIR / f"SFD_dust_4096_{hemisphere}gp.fits.gz"
    b_sign = 1 if hemisphere == "n" else -1

    with fits.open(filename, memmap=False) as hdul:
        # The file must contain exactly one HDU; the unpacking fails otherwise.
        hdu, = hdul
        wcs = WCS(hdu.header)
        data = hdu.data

        # Produce pixel coordinates. NumPy stores the image as (rows, cols),
        # while the WCS expects (x, y) = (column, row). With the default
        # 'xy' indexing, meshgrid(x_values, y_values) returns arrays of
        # shape (len(y_values), len(x_values)), so the x axis must span the
        # columns for the grid (and the mask below) to match ``data.shape``
        # for non-square images as well.
        n_rows, n_cols = data.shape[:2]
        x_pix, y_pix = np.meshgrid(np.arange(n_cols), np.arange(n_rows))
        coord = wcs.pixel_to_world(x_pix, y_pix)
        # And pixel IDs.
        # NOTE(review): ids depend only on the pixel position within one
        # file, so the same position in the "n" and "s" files yields the
        # same id -- confirm whether ids must be unique across hemispheres.
        ids = produce_ids(x_pix, y_pix)

        # Filter out pixels that belong to the other hemisphere.
        mask = b_sign * coord.b > 0
        # Boolean indexing copies, so nothing below refers to the open file.
        data, coord, ids = data[mask], coord[mask], ids[mask]

    # Convert to equatorial coordinates
    eq = coord.transform_to("icrs")
    # Convert to the machine's native byte order before handing the values
    # to pandas. ``astype`` with a native-order dtype works whatever the
    # input order is, and avoids ``ndarray.newbyteorder``, which was removed
    # in NumPy 2.0.
    data = data.astype(data.dtype.newbyteorder("="))

    df = pd.DataFrame(dict(
        id=ids.ravel(),
        ra_deg=eq.ra.deg.ravel(),
        dec_deg=eq.dec.deg.ravel(),
        ebv=data.ravel(),
    ))
    return df


# Convert each hemisphere's FITS map into its own parquet file.
PARQUET_DIR.mkdir(parents=True, exist_ok=True)
for hemisphere in ("n", "s"):
    parquet_path = PARQUET_DIR / f"sfd-{hemisphere}.parquet"
    df = get_hemisphere(hemisphere)
    df.to_parquet(parquet_path)

CPU times: user 5.89 s, sys: 1.07 s, total: 6.96 s
Wall time: 6.85 s


# Import the parquet files into a HiPSCat catalog

In [12]:
%%time

# Make sure the destination directory exists before running the import.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

args = ImportArguments(
    # Input: the per-hemisphere parquet files written earlier in the notebook.
    input_path=PARQUET_DIR,
    input_format="parquet",
    # Column names as produced by get_hemisphere().
    id_column="id",
    ra_column="ra_deg",
    dec_column="dec_deg",
    # Output catalog name and location.
    output_catalog_name="sfd",
    output_path=OUTPUT_DIR,
    overwrite=True,
    # NOTE(review): presumably the per-partition row limit used when
    # binning -- confirm against the hipscat-import documentation.
    pixel_threshold=10_000,
)
runner.pipeline(args)

Planning  : 100%|██████████| 4/4 [00:00<00:00, 5616.74it/s]
Mapping   : 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
Binning   : 100%|██████████| 2/2 [01:20<00:00, 40.23s/it]
Splitting : 100%|██████████| 2/2 [00:31<00:00, 15.66s/it]
Reducing  : 100%|██████████| 3072/3072 [00:38<00:00, 79.09it/s] 
Finishing : 100%|██████████| 6/6 [00:00<00:00,  6.89it/s]


CPU times: user 1min 3s, sys: 2.43 s, total: 1min 6s
Wall time: 2min 36s
