In [1]:
# %pip install -q -r requirements.txt --upgrade

In [2]:
import hats
from hats_import.catalog.file_readers import InputReader
import pyarrow as pa
import h5py
import numpy as np
from hats_import.catalog.arguments import ImportArguments


# Display the installed hats version so the notebook output records which release was used.
hats.__version__

  from .autonotebook import tqdm as notebook_tqdm


'0.6.4'

In [3]:
class ShapeCatalogReader(InputReader):
    """Chunked reader for a shape catalog stored in a single HDF5 file.

    The file is expected to contain a ``catalog`` group holding an
    ``unsheared`` table plus the four sheared tables listed in
    ``SHEARED_TABLES``. Columns of the unsheared table keep their own names;
    columns of a sheared table are emitted as ``"<table>_<column>"``.

    Because all data lives in one file, the "input files" handed to ``read``
    are not paths but row intervals of the form ``"start:end"``.

    Args:
        input_path: path of the HDF5 file to read.
        chunksize: maximum number of rows per yielded table.

    Raises:
        ValueError: if ``chunksize`` is not a positive integer.
    """

    # Sheared tables flattened into the output next to the unsheared columns.
    SHEARED_TABLES = ("sheared_1m", "sheared_1p", "sheared_2m", "sheared_2p")

    def __init__(self, input_path, chunksize=100_000):
        if chunksize <= 0:
            # A zero chunk size would never advance through the interval.
            raise ValueError("chunksize must be a positive integer")
        self.input_path = input_path
        self.chunksize = chunksize

    @staticmethod
    def _to_native_byteorder(arr):
        """Return ``arr`` converted to the machine's native byte order."""
        return np.asanyarray(arr, dtype=arr.dtype.newbyteorder("="))

    def read(self, input_file, read_columns=None):
        """Yield pyarrow tables covering one row interval of the HDF5 file.

        Args:
            input_file: row interval formatted like
                ``"interval_start:interval_end"`` (end exclusive). The end is
                clamped to the length of the ``unsheared/ra`` dataset.
            read_columns: when ``None``, every column of every table is read.
                When not ``None``, only ``ra`` and ``dec`` from the unsheared
                table are returned, regardless of the names requested.

        Yields:
            pyarrow.Table: consecutive slices of at most ``chunksize`` rows,
            never extending past the end of the requested interval.
        """
        interval_pieces = input_file.split(":")
        interval_start, interval_end = int(interval_pieces[0]), int(interval_pieces[1])

        # The context manager closes the file once the generator is exhausted
        # (or discarded), so repeated read() calls do not leak file handles.
        with h5py.File(self.input_path, "r") as fh:
            catalog = fh["catalog"]
            unsheared = catalog["unsheared"]
            interval_end = min(interval_end, unsheared["ra"].len())

            # Resolve the (output name, dataset) pairs once; they do not
            # change from chunk to chunk.
            if read_columns is None:
                columns = [(f"{key}", unsheared[key]) for key in unsheared.keys()]
                for table in self.SHEARED_TABLES:
                    group = catalog[table]
                    columns.extend((f"{table}_{key}", group[key]) for key in group.keys())
            else:
                columns = [("ra", unsheared["ra"]), ("dec", unsheared["dec"])]
            col_names = [name for name, _ in columns]

            for chunk_start in range(interval_start, interval_end, self.chunksize):
                # Clamp every chunk (including the first) so rows belonging to
                # the next interval are never read twice.
                chunk_end = min(chunk_start + self.chunksize, interval_end)
                col_vals = [
                    self._to_native_byteorder(dataset[chunk_start:chunk_end])
                    for _, dataset in columns
                ]
                yield pa.Table.from_arrays(col_vals, names=col_names)

In [4]:
# HDF5 file holding the catalog to import; it is passed to ShapeCatalogReader, which opens it with h5py.
path = "/ocean/projects/phy210048p/shared/hats/raw/DESY3_metacal_v03-004.h5"

In [5]:
# Number of rows covered by each "start:end" key handed to the reader.
chunk = 1_000_000

# One key per row interval, covering rows [0, 400_000_000) in steps of `chunk`.
read_keys = [
    f"{first_row}:{first_row + chunk}"
    for first_row in range(0, 400_000_000, chunk)
]
len(read_keys)

400

In [8]:
args = ImportArguments(
    # Input: the "start:end" row-interval keys, each read through the custom reader.
    input_file_list=read_keys,
    file_reader=ShapeCatalogReader(path),
    expected_total_rows=399263026,
    # Column roles.
    ra_column="ra",
    dec_column="dec",
    sort_columns="coadd_object_id",
    # Output location and naming.
    output_path="/ocean/projects/phy210048p/shared/hats/catalogs/des/",
    output_artifact_name="DESY3_metacal",
    resume=False,
)

In [9]:
import os

from dask.distributed import Client
from hats_import import pipeline_with_client

# Scratch directory for the dask workers, taken from the LOCAL environment
# variable. os.path.expandvars("$LOCAL") would return the literal string
# "$LOCAL" when the variable is unset, making dask create a directory with
# that name in the working directory; fall back to None (dask's default
# temporary location) instead.
local_tmp = os.environ.get("LOCAL") or None

with Client(
    local_directory=local_tmp,
    n_workers=30,
    threads_per_worker=1,
    memory_limit=None,
) as client:
    # The client (and its local cluster) is shut down when the block exits.
    pipeline_with_client(args, client)

Planning  : 100%|██████████| 4/4 [00:00<00:00, 781.75it/s]
Mapping   : 100%|██████████| 400/400 [00:29<00:00, 13.46it/s]
Binning   : 100%|██████████| 2/2 [00:57<00:00, 28.54s/it]
Splitting : 100%|██████████| 400/400 [17:03<00:00,  2.56s/it]  
Reducing  : 100%|██████████| 1051/1051 [09:07<00:00,  1.92it/s] 
Finishing : 100%|██████████| 5/5 [25:25<00:00, 305.04s/it]   


In [13]:
from hats_import import VerificationArguments, pipeline

# NOTE(review): this rebinds `args`, which earlier held the ImportArguments;
# re-running the import cell after this one would pick up the verification
# arguments instead.
args = VerificationArguments(
    output_path="./verification/DESY3_metacal",
    input_catalog_path="/ocean/projects/phy210048p/shared/hats/catalogs/des/DESY3_metacal",
)
pipeline(args)

Loading dataset and schema.

Starting: Test hats.io.validation.is_valid_catalog.
Validating catalog at path /ocean/projects/phy210048p/shared/hats/catalogs/des/DESY3_metacal ... 
Found 1051 partitions.
Approximate coverage is 20.05 % of the sky.
Result: PASSED

Starting: Test that files in _metadata match the data files on disk.
Result: PASSED

Starting: Test that number of rows are equal.
	file footers vs catalog properties
	file footers vs _metadata
Result: PASSED

Starting: Test that schemas are equal, excluding metadata.
	_common_metadata vs truth
	_metadata vs truth
	file footers vs truth
Result: PASSED

Verifier results written to verification/DESY3_metacal/verifier_results.csv
Elapsed time (seconds): 14.19
