In [1]:
import gzip
from pathlib import Path
from shutil import copyfileobj

import dask.distributed
import healpy
import hipscat_import.pipeline as runner
import numpy as np
import pandas as pd
from astropy.coordinates import SkyCoord
from dustmaps.sfd import SFDQuery
from hipscat_import.catalog.arguments import ImportArguments
from tqdm import tqdm

In [2]:
# HEALPix resolution of the generated map and the number of pixels handled
# by a single worker task.
ORDER = 14
BATCH_SIZE = 2 ** 20

# Map geometry derived from ORDER.
NSIDE = healpy.order2nside(ORDER)
NPIX = healpy.nside2npix(NSIDE)

# Number of batches (ceiling division) and the digit count of the largest
# batch index; the latter is used to zero-pad per-batch file names.
NBATCHES = int(-(-NPIX // BATCH_SIZE))
NBATCHES_LEN = len(f'{NBATCHES - 1}')

In [3]:
# All notebook data lives under one root so it can be relocated in one place.
DATA_DIR = Path('./data')
# Compressed inputs (and their decompressed copies).
INPUT_DIR = DATA_DIR.joinpath('input')
# Intermediate per-batch parquet files.
PARQUET_DIR = DATA_DIR.joinpath('tmp-parquet')
# Final output location.
OUTPUT_DIR = DATA_DIR.joinpath('output')

Decompress the input files for `dustmaps` and create an `SFDQuery` instance.

In [4]:
# Decompress every ``*.gz`` archive found in INPUT_DIR, placing the result
# next to the archive under the same name minus the ``.gz`` suffix.
for gz_file in INPUT_DIR.glob('*.gz'):
    decompressed = INPUT_DIR / gz_file.stem
    if decompressed.exists():
        # Already unpacked by a previous run.
        continue
    # Stream into a sibling ``.part`` file and rename it only after the copy
    # has finished. Writing directly to the final name would leave a
    # truncated file behind if the run is interrupted, and the ``exists()``
    # check above would then skip it forever.
    partial = decompressed.with_name(decompressed.name + '.part')
    with gzip.open(gz_file, 'rb') as f_in, open(partial, 'wb') as f_out:
        copyfileobj(f_in, f_out)
    partial.replace(decompressed)

In [5]:
# Query object built from INPUT_DIR, the directory the decompression cell
# above writes into. It is called with sky coordinates in `ebv` below.
sfd = SFDQuery(INPUT_DIR)

Generate pixel list, coordinates and E(B-V)

In [None]:
def gen_pixel_list():
    """Yield ``(start, stop)`` pixel-index pairs that tile ``[0, NPIX)``.

    Each pair is a half-open range of at most ``BATCH_SIZE`` pixels; the
    last pair is clipped so that ``stop`` never exceeds ``NPIX``.
    """
    for start in range(0, NPIX, BATCH_SIZE):
        stop = start + BATCH_SIZE
        yield start, (stop if stop <= NPIX else NPIX)

def ebv(pixels):
    """Return the ``sfd`` query result at the centres of the given pixels.

    Parameters
    ----------
    pixels : int or array-like of int
        HEALPix pixel indices at resolution ``NSIDE``, in nested ordering.

    Returns
    -------
    The value(s) returned by the module-level ``sfd`` query for the pixel
    centres (E(B-V), per the notebook's section heading).
    """
    # `nest` is a boolean flag in healpy. The previous value 'nested' only
    # worked because a non-empty string is truthy (so would 'ring').
    # `lonlat=True` makes pix2ang return longitude/latitude in degrees.
    ra, dec = healpy.pix2ang(NSIDE, pixels, nest=True, lonlat=True)
    coord = SkyCoord(ra=ra, dec=dec, unit='deg')
    return sfd(coord)

def write_parquet(filename, pixels, values):
    """Write one batch of pixel indices and values to a parquet file.

    Parameters
    ----------
    filename : str or pathlib.Path
        Destination file. Its parent directory is created if missing.
    pixels : array-like
        Pixel indices, stored in the ``healpix{ORDER}`` column.
    values : array-like
        Values matching ``pixels`` one-to-one, stored in the ``ebv`` column.
    """
    # Nothing else in the notebook creates the output directory, so make
    # sure it exists. `exist_ok=True` keeps this safe when several workers
    # reach this line at the same time.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame({f'healpix{ORDER}': pixels, 'ebv': values})
    df.to_parquet(filename, index=False)

def parquet_worker(i, start_end):
    """Compute and store one batch of the map.

    Parameters
    ----------
    i : int
        Batch index; zero-padded to ``NBATCHES_LEN`` digits to form the
        output file name.
    start_end : tuple
        Arguments forwarded to ``np.arange`` to build the batch's pixel
        indices (a ``(start, stop)`` pair from ``gen_pixel_list``).
    """
    batch_pixels = np.arange(*start_end)
    batch_values = ebv(batch_pixels)
    out_name = f'{i:0{NBATCHES_LEN}}.parquet'
    write_parquet(PARQUET_DIR / out_name, batch_pixels, batch_values)


# Fan the batches out over a local two-worker dask cluster: one task per
# (batch index, pixel range) pair.
with dask.distributed.Client(n_workers=2) as client:
    futures = []
    for batch_index, pixel_range in enumerate(gen_pixel_list()):
        futures.append(client.submit(parquet_worker, batch_index, pixel_range))
    # Block until every task has finished. The workers return nothing, so
    # the gathered values are not used.
    _results = client.gather(futures)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
