In [1]:
import gzip
from pathlib import Path
from shutil import copyfileobj

import dask.distributed
import healpy
import numpy as np
import pandas as pd
from astropy.coordinates import SkyCoord
from dustmaps.sfd import SFDQuery
from hipscat.pixel_math.hipscat_id import HIPSCAT_ID_COLUMN, HIPSCAT_ID_HEALPIX_ORDER, healpix_to_hipscat_id

from paths import *

In [2]:
# HEALPix order of the generated map; the assert requires that it does not
# exceed HIPSCAT_ID_HEALPIX_ORDER.
ORDER = 14
assert HIPSCAT_ID_HEALPIX_ORDER >= ORDER
# Maximum number of pixels processed per batch (one parquet file per batch).
BATCH_SIZE = 2 ** 20

NSIDE = healpy.order2nside(ORDER)
NPIX = healpy.nside2npix(NSIDE)

# Ceiling division in integer arithmetic: batches needed to cover all pixels.
NBATCHES = int(-(-NPIX // BATCH_SIZE))
# Number of digits of the largest batch index, used to zero-pad file names.
NBATCHES_LEN = len(f'{NBATCHES - 1}')

Decompress the input files for `dustmaps`. The `SFDQuery` instance that reads them is created later, when the pixel batches are processed.

In [3]:
# Decompress every ``*.gz`` archive found in INPUT_DIR into a sibling file with
# the ``.gz`` suffix dropped. Archives that were already decompressed are skipped.
for gz_file in INPUT_DIR.glob('*.gz'):
    decompressed = INPUT_DIR / gz_file.stem
    if decompressed.exists():
        continue
    # Write under a temporary name and rename only after the copy succeeded, so
    # an interrupted run never leaves a truncated file that the ``exists()``
    # check above would silently accept on the next run.
    partial = decompressed.with_name(decompressed.name + '.part')
    with gzip.open(gz_file, 'rb') as f_in, open(partial, 'wb') as f_out:
        copyfileobj(f_in, f_out)
    partial.replace(decompressed)

Generate the pixel batches, compute the coordinates and E(B-V) of every pixel, and write each batch to a Parquet file

In [4]:
def gen_pixel_list():
    """Yield half-open ``(start, stop)`` pixel index ranges covering ``[0, NPIX)``.

    Every range spans ``BATCH_SIZE`` pixels except possibly the last one,
    which is clipped at ``NPIX``.
    """
    start = 0
    while start < NPIX:
        stop = min(start + BATCH_SIZE, NPIX)
        yield start, stop
        start += BATCH_SIZE

def ebv(sfd, pixels):
    """Query a dust map at the centers of the given HEALPix pixels.

    Parameters
    ----------
    sfd : callable
        Object called with a ``SkyCoord``; in this notebook an ``SFDQuery``.
    pixels : array-like of int
        Pixel indices in the NESTED scheme at resolution ``NSIDE``.

    Returns
    -------
    Whatever ``sfd`` returns for the pixel-center coordinates.
    """
    # `nest` is a boolean flag; pass True explicitly instead of relying on the
    # truthiness of a non-empty string.
    ra, dec = healpy.pix2ang(NSIDE, pixels, nest=True, lonlat=True)
    coord = SkyCoord(ra=ra, dec=dec, unit='deg')
    return sfd(coord)

def pixel_to_hipscat_id(pixels):
    """Convert NESTED HEALPix indices at ``ORDER`` into hipscat IDs.

    Delegates to hipscat's ``healpix_to_hipscat_id`` rather than shifting bits
    by hand. A nested HEALPix index gains two bits per order, so a shift by
    ``HIPSCAT_ID_HEALPIX_ORDER - ORDER`` bits does not map a pixel to a higher
    order, and shifting a narrow integer array such as ``uint32`` can overflow.

    Parameters
    ----------
    pixels : int or array-like of int
        Pixel indices at order ``ORDER``.

    Returns
    -------
    The value returned by ``healpix_to_hipscat_id(ORDER, pixels)``.
    """
    return healpix_to_hipscat_id(ORDER, pixels)

def parquet_worker(sfd, i, start_end):
    """Compute E(B-V) for one batch of pixels and save it as a parquet file.

    Parameters
    ----------
    sfd : callable
        Dust-map query object forwarded to ``ebv``.
    i : int
        Batch index; zero-padded to ``NBATCHES_LEN`` digits to build the
        output file name inside ``PARQUET_DIR``.
    start_end : tuple of int
        Arguments for ``np.arange`` selecting the pixel indices of the batch
        (a half-open ``(start, stop)`` range as produced by ``gen_pixel_list``).
    """
    pixels = np.arange(*start_end, dtype=np.uint32)

    table = pd.DataFrame({
        HIPSCAT_ID_COLUMN: healpix_to_hipscat_id(ORDER, pixels),
        # A 0-d array gives a constant column while fixing its dtype to uint8.
        'pixel_Norder': np.asarray(ORDER, dtype=np.uint8),
        'pixel_Npix': pixels,
        'ebv': np.asarray(ebv(sfd, pixels), dtype=np.float32),
    })
    table.to_parquet(PARQUET_DIR / f'{i:0{NBATCHES_LEN}}.parquet', index=False)


# The workers write straight into PARQUET_DIR, so make sure it exists before
# any task runs (assumes PARQUET_DIR is a pathlib.Path, as its use with `/` in
# parquet_worker suggests). The call is a no-op if the directory already exists.
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

with dask.distributed.Client() as client:
    # Build the SFD query lazily so that it becomes a node of the task graph
    # shared by every batch task.
    sfd = dask.delayed(SFDQuery)(INPUT_DIR)

    pixel_lists = list(gen_pixel_list())
    delayed_worker = dask.delayed(parquet_worker)

    # One lazy task per pixel batch. Despite the name, these are dask Delayed
    # objects; nothing is executed until dask.compute() below.
    futures = [delayed_worker(sfd, i, start_end) for i, start_end in enumerate(pixel_lists)]
    # parquet_worker returns None; the side effect is the written parquet files.
    _result = dask.compute(*futures)