## DASH pipeline benchmarking

Let's obtain a Dask performance report for each dataset import, with particular attention to the most time-consuming one (*object_forced_source*).

The goal is to assess how long the I/O operations take in the **hats-import** pipeline:

1. Mapping: Read ra/dec
2. Splitting: Read every input parquet
3. Splitting: Write shards
4. Reducing: Read shards
5. Reducing: Write every output parquet

In [1]:
import tempfile
import hats_import.pipeline as runner

from dask.distributed import performance_report, Client
from dimension_reader import DimensionParquetReader
from hats_import.catalog.arguments import ImportArguments
from pathlib import Path

In [2]:
# Root of the raw data; the index CSVs are looked up under raw_dir / "index".
raw_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/raw/v29_0_0_rc5")
# Directory passed to the import pipeline as its output path.
hats_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/sandro_test")

In [3]:
# Scratch directory handed to Dask as its local directory. Note that
# `tmp_path` is the TemporaryDirectory object itself, not a path string;
# it is cleaned up explicitly at the end of the notebook.
tmp_path = tempfile.TemporaryDirectory()

# Dask client with 16 workers, one thread each.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    local_directory=tmp_path.name,
)

In [4]:
def get_paths(dataset_type):
    """Return the index CSV files of a dataset, in sorted order.

    The files are looked up (non-recursively) in
    ``raw_dir / "index" / dataset_type``.

    Args:
        dataset_type: Name of the dataset sub-directory inside the index
            directory (e.g. ``"dia_object"``).

    Returns:
        A list of ``Path`` objects, one per ``*.csv`` file. The list is empty
        when the directory holds no CSV files.
    """
    index_dir = raw_dir / "index" / dataset_type
    # Directory listings come back in filesystem-dependent order; sorting
    # keeps the input file list identical from one benchmark run to the next.
    return sorted(index_dir.glob("*.csv"))

def get_common_import_args():
    """Build the keyword arguments shared by every import in this notebook.

    All catalogs are written under ``hats_dir``. Resuming is disabled, and
    neither the intermediate parquet files nor the resume log files are
    deleted, so the intermediate directory can still be inspected after a run.

    Returns:
        A fresh dict, meant to be unpacked into ``ImportArguments(...)``.
    """
    return dict(
        output_path=hats_dir,
        simple_progress_bar=True,
        resume=False,
        delete_intermediate_parquet_files=False,
        delete_resume_log_files=False,
    )

In [5]:
def profile_import(args):
    """Run one import under a Dask performance report and print file statistics.

    The report is written to ``<output_artifact_name>.html``. The pipeline
    runs on the notebook-level ``client``.

    Args:
        args: Import arguments for the data product to profile; its
            ``output_artifact_name`` names both the report and the catalog.
    """
    name = args.output_artifact_name
    with performance_report(filename=f"{name}.html"):
        runner.pipeline_with_client(args, client)

    # Once the pipeline has finished, gather statistics on the number of
    # files and save the intermediate file tree.
    post_run_steps = (
        get_num_input,
        get_num_shards,
        get_num_output,
        save_intermediate_filetree,
    )
    for step in post_run_steps:
        step(name)

def get_num_input(catalog_name):
    """Print how many input parquet files the index CSVs of a catalog list.

    The pipeline is handed the index CSVs, which group the input files, so the
    number of CSVs is not the number of parquet files read. Every non-header
    line of every ``*.csv`` found (recursively) under
    ``raw_dir / "index" / catalog_name`` is counted as one input file
    (presumably one parquet file per row; confirm against the index format).

    Args:
        catalog_name: Name of the dataset sub-directory inside the index
            directory.
    """
    index_dir = raw_dir / "index" / catalog_name
    num_input_files = 0
    for csv_path in index_dir.rglob("*.csv"):
        if not csv_path.is_file():
            continue
        # Binary mode: lines are split on "\n" only and no decoding is needed.
        with csv_path.open("rb") as csv_file:
            num_lines = sum(1 for _ in csv_file)
        # The first line of each CSV is its header; an empty file adds nothing.
        num_input_files += max(num_lines - 1, 0)
    # Print the integer itself rather than a captured shell output list.
    print(f"Num of input parquet files: {num_input_files}")

def get_num_shards(catalog_name):
    """Print the number of shard files left in a catalog's intermediate directory.

    A file counts as a shard when it lives under
    ``hats_dir / catalog_name / "intermediate"`` and at least one component of
    its path below that directory starts with ``order_`` followed by at least
    one more character.

    Args:
        catalog_name: Name of the catalog directory inside ``hats_dir``.
    """
    intermediate_dir = hats_dir / catalog_name / "intermediate"
    marker = "order_"
    num_shards = sum(
        1
        for path in intermediate_dir.rglob("*")
        if path.is_file()
        and any(
            part.startswith(marker) and len(part) > len(marker)
            # Only inspect components below the intermediate directory, so
            # the location of the catalog itself can never cause a match.
            for part in path.relative_to(intermediate_dir).parts
        )
    )
    # Print the integer itself rather than a captured shell output list.
    print(f"Num of generated shards: {num_shards}")

def get_num_output(catalog_name):
    """Print the number of output files written for a catalog.

    A file counts as output when it lives under ``hats_dir / catalog_name``
    and at least one component of its path below that directory starts with
    ``Norder=`` followed by at least one more character. The file extension is
    not checked.

    Args:
        catalog_name: Name of the catalog directory inside ``hats_dir``.
    """
    catalog_dir = hats_dir / catalog_name
    marker = "Norder="
    num_output_files = sum(
        1
        for path in catalog_dir.rglob("*")
        if path.is_file()
        and any(
            part.startswith(marker) and len(part) > len(marker)
            # Only inspect components below the catalog directory, so the
            # location of the catalog itself can never cause a match.
            for part in path.relative_to(catalog_dir).parts
        )
    )
    # Print the integer itself rather than a captured shell output list.
    print(f"Num of output parquet: {num_output_files}")

def save_intermediate_filetree(catalog_name):
    intermediate_dir = hats_dir / catalog_name / "intermediate"
    !tree {intermediate_dir} > {catalog_name}.txt

#### dia_object

In [6]:
# Columns requested from the reader for the dia_object import.
dia_obj_default_columns = [
    "diaObjectId",
    "ra",
    "dec",
    "nDiaSources",
    "radecMjdTai",
]

args = ImportArguments(
    # Catalog identity
    output_artifact_name="dia_object",
    catalog_type="object",
    # Input: index CSVs, read through the project-local reader
    input_file_list=get_paths("dia_object"),
    file_reader=DimensionParquetReader(column_names=dia_obj_default_columns),
    # Sky position columns
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings
    highest_healpix_order=0,
    pixel_threshold=5_000_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 257.25it/s]
Mapping   : 100%|███████████████████████████████████████████████████████████| 25/25 [00:04<00:00,  5.70it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 62.48it/s]
Splitting : 100%|███████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 89.73it/s]
Reducing  : 100%|█████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 14.53it/s]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 59.97it/s]


Num of input parquet files: ['25']
Num of generated shards: ['26']
Num of output parquet: ['4']


#### dia_source

In [7]:
args = ImportArguments(
    # Catalog identity
    output_artifact_name="dia_source",
    catalog_type="source",
    # Input: index CSVs, read through the project-local reader (no column list given)
    input_file_list=get_paths("dia_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings
    highest_healpix_order=0,
    pixel_threshold=4_000_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 566.89it/s]
Mapping   : 100%|██████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 111.66it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 61.45it/s]
Splitting : 100%|███████████████████████████████████████████████████████████| 25/25 [00:02<00:00, 11.86it/s]
Reducing  : 100%|█████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.12s/it]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 48.36it/s]


Num of input parquet files: ['25']
Num of generated shards: ['27']
Num of output parquet: ['4']


#### dia_object_forced_source

In [8]:
args = ImportArguments(
    # Catalog identity
    output_artifact_name="dia_object_forced_source",
    catalog_type="source",
    # Input: index CSVs, read through the project-local reader (no column list given)
    input_file_list=get_paths("dia_object_forced_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings
    highest_healpix_order=9,
    pixel_threshold=25_000_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 207.56it/s]
Mapping   : 100%|███████████████████████████████████████████████████████████| 25/25 [00:07<00:00,  3.14it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.32s/it]
Splitting : 100%|███████████████████████████████████████████████████████████| 25/25 [01:42<00:00,  4.10s/it]
Reducing  : 100%|███████████████████████████████████████████████████████████| 28/28 [00:21<00:00,  1.32it/s]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.43it/s]


Num of input parquet files: ['596']
Num of generated shards: ['1006']
Num of output parquet: ['28']


#### object

In [9]:
# For each band: psf and kron flux with their errors, then the kron radius.
cols_per_band = [
    column
    for band in "ugrizy"
    for column in (
        f"{band}_psfFlux",
        f"{band}_psfFluxErr",
        f"{band}_kronFlux",
        f"{band}_kronFluxErr",
        f"{band}_kronRad",
    )
]

# Band-independent columns first, followed by the per-band columns.
obj_default_columns = [
    "objectId",
    "refFwhm",
    "shape_flag",
    "sky_object",
    "parentObjectId",
    "x",
    "y",
    "xErr",
    "yErr",
    "shape_yy",
    "shape_xx",
    "shape_xy",
    "coord_ra",
    "coord_dec",
    "coord_raErr",
    "coord_decErr",
    "tract",
    "patch",
    "detect_isIsolated",
    *cols_per_band,
]

# Reader limited to the selected object columns, with a chunk size of 250,000.
object_reader = DimensionParquetReader(
    column_names=obj_default_columns,
    chunksize=250_000,
)

args = ImportArguments(
    # Catalog identity
    output_artifact_name="object",
    catalog_type="object",
    # Input: index CSVs, read through the reader configured above
    input_file_list=get_paths("object"),
    file_reader=object_reader,
    # Sky position columns
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings
    highest_healpix_order=7,
    pixel_threshold=300_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|█████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 61.46it/s]
Mapping   : 100%|██████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 109.67it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.39it/s]
Splitting : 100%|███████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 19.05it/s]
Reducing  : 100%|███████████████████████████████████████████████████████████| 21/21 [00:01<00:00, 15.93it/s]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 31.98it/s]


Num of input parquet files: ['29']
Num of generated shards: ['56']
Num of output parquet: ['21']


#### source

In [10]:
args = ImportArguments(
    # Catalog identity
    output_artifact_name="source",
    catalog_type="source",
    # Input: index CSVs, read through the project-local reader (no column list given)
    input_file_list=get_paths("source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings
    highest_healpix_order=9,
    pixel_threshold=1_000_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 241.35it/s]
Mapping   : 100%|███████████████████████████████████████████████████████████| 75/75 [00:02<00:00, 36.86it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.20s/it]
Splitting : 100%|███████████████████████████████████████████████████████████| 75/75 [00:38<00:00,  1.93it/s]
Reducing  : 100%|█████████████████████████████████████████████████████████| 127/127 [00:20<00:00,  6.31it/s]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.13it/s]


Num of input parquet files: ['1786']
Num of generated shards: ['4846']
Num of output parquet: ['127']


#### object_forced_source

In [11]:
args = ImportArguments(
    # Catalog identity
    output_artifact_name="object_forced_source",
    catalog_type="source",
    # Input: index CSVs, read through the project-local reader (no column list given)
    input_file_list=get_paths("object_forced_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings
    highest_healpix_order=9,
    pixel_threshold=25_000_000,
    # Settings shared by every import in this notebook
    **get_common_import_args(),
)
profile_import(args)

Planning  : 100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 262.01it/s]
Mapping   : 100%|███████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.35it/s]
Binning   : 100%|█████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.29s/it]
Splitting : 100%|███████████████████████████████████████████████████████████| 29/29 [02:37<00:00,  5.43s/it]
Reducing  : 100%|███████████████████████████████████████████████████████████| 34/34 [00:22<00:00,  1.49it/s]
Finishing : 100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  8.13it/s]


Num of input parquet files: ['635']
Num of generated shards: ['1469']
Num of output parquet: ['34']


### Cleanup

In [12]:
# Close the Dask client first: it was created with the temporary directory
# as its local directory, so that directory is removed only afterwards.
client.close()
# `tmp_path` is the TemporaryDirectory object; cleanup() deletes it from disk.
tmp_path.cleanup()