In [1]:
# %pip install -q -r requirements.txt --upgrade

In [2]:
# Absolute path of the raw HDF5 (.h5) catalog file that is passed to the
# importer as its only input file.
path = "/ocean/projects/phy210048p/shared/hats/raw/DESY3_metacal_v03-004.h5"

In [3]:
import hats
# Bare last expression: the notebook displays the installed hats version,
# recording which release this import was run with.
hats.__version__

'0.6.4'

In [15]:
from hats_import.catalog.file_readers import InputReader
import pyarrow as pa
import h5py
import numpy as np

class ShapeCatalogReader(InputReader):
    """Chunked reader for an HDF5 shape catalog laid out as ``catalog/<table>/<column>``.

    The file is expected to contain a ``catalog`` group holding an
    ``unsheared`` table plus the four sheared tables listed in
    ``SHEARED_TABLES``. Rows are streamed as ``pyarrow.Table`` chunks of at
    most ``chunksize`` rows. Columns of the unsheared table keep their HDF5
    key as name; columns of a sheared table are named ``<table>_<key>``.

    NOTE(review): every table is sliced with the row range derived from
    ``catalog/unsheared/ra``, so all datasets are assumed to have the same
    length -- confirm against the file.
    """

    # Sheared variants whose columns are exported with a ``<table>_`` prefix.
    SHEARED_TABLES = ('sheared_1m', 'sheared_1p', 'sheared_2m', 'sheared_2p')

    def __init__(self, chunksize=10_000):
        """Create a reader.

        Args:
            chunksize: maximum number of rows per yielded table. Defaults to
                10000, the value the reader previously hard-coded.

        Raises:
            ValueError: if ``chunksize`` is smaller than 1.
        """
        if chunksize < 1:
            raise ValueError("chunksize must be a positive integer")
        self.chunksize = chunksize

    @staticmethod
    def _to_native_byteorder(values):
        """Return ``values`` as an array whose dtype uses the native byte order."""
        # Presumably needed because pyarrow does not accept byte-swapped
        # numpy arrays -- TODO confirm.
        return np.asanyarray(values, dtype=values.dtype.newbyteorder("="))

    def read(self, input_file, read_columns=None):
        """Yield the catalog in consecutive, non-overlapping row chunks.

        Args:
            input_file: path of the HDF5 file to read.
            read_columns: when ``None``, every column of the unsheared and
                sheared tables is read. Any other value selects the reduced
                read, which always returns exactly the unsheared ``ra`` and
                ``dec`` columns (the requested names are not inspected).

        Yields:
            pyarrow.Table: one table per chunk of up to ``chunksize`` rows.
        """
        # The context manager closes the file once the generator is exhausted
        # or closed early, instead of leaking the handle.
        with h5py.File(input_file, 'r') as fh:
            catalog = fh['catalog']
            unsheared = catalog['unsheared']
            dataset_len = unsheared['ra'].len()

            # Resolve (output name, dataset) pairs once, up front, so the
            # chunk loop only has to slice.
            if read_columns is None:
                sources = [(f"{key}", unsheared[key]) for key in unsheared.keys()]
                for table in self.SHEARED_TABLES:
                    group = catalog[table]
                    sources.extend(
                        (f"{table}_{key}", group[key]) for key in group.keys()
                    )
            else:
                sources = [("ra", unsheared['ra']), ("dec", unsheared['dec'])]

            col_names = [name for name, _ in sources]

            # Each row range is visited exactly once; the first chunk is no
            # longer emitted twice for the reduced (ra/dec) read.
            for chunk_start in range(0, dataset_len, self.chunksize):
                chunk_end = min(chunk_start + self.chunksize, dataset_len)
                col_vals = [
                    self._to_native_byteorder(dataset[chunk_start:chunk_end])
                    for _, dataset in sources
                ]
                yield pa.Table.from_arrays(col_vals, names=col_names)
    

In [16]:
from hats_import.catalog.arguments import ImportArguments

# Import configuration for the catalog.
# - ra_column / dec_column use the names "ra" and "dec", which are the column
#   names ShapeCatalogReader emits for the unsheared positions.
# - sort_columns is presumably a key of the unsheared table in the input
#   file -- verify against the file.
# - file_reader plugs in the custom chunked HDF5 reader defined in this notebook.
args = ImportArguments(
    sort_columns="coadd_object_id",
    ra_column="ra",
    dec_column="dec",
    input_file_list=[path],
    file_reader=ShapeCatalogReader(),
    output_artifact_name="DESY3_metacal",
    output_path="/ocean/projects/phy210048p/shared/hats/catalogs/des/",
)

In [None]:
from hats_import import pipeline

# Run the import with the arguments built in `args`.
# NOTE(review): the recorded output shows the run resuming from intermediate
# files left by an earlier attempt; if the reader's output has changed since
# then, those intermediates may be stale -- confirm before relying on a resume.
pipeline(args)

Planning  : 100%|██████████| 4/4 [00:00<00:00, 1668.38it/s]


tmp_path (/ocean/projects/phy210048p/shared/hats/catalogs/des/DESY3_metacal/intermediate) contains intermediate files; resuming prior progress.


Binning   : 100%|██████████| 2/2 [00:23<00:00, 11.62s/it]
Splitting :   0%|          | 0/1 [00:00<?, ?it/s]