# PartitionInfo.read_from_dir Benchmarking

## Section 0

Set up.

In [1]:
# Imports

import os
import time
from unittest.mock import patch

import pandas as pd

from hats.catalog.partition_info import PartitionInfo
from hats.io import file_io

In [2]:
# Catalogs

# Root directory that holds the HATS catalogs. The default is the original
# shared location; set the HATS_CATALOG_ROOT environment variable to run this
# notebook against a copy stored elsewhere.
CATALOG_ROOT = os.environ.get("HATS_CATALOG_ROOT", "/epyc/data3/hats/catalogs").rstrip("/")

ztf_path = f"{CATALOG_ROOT}/ztf_dr22/ztf_lc"
gaia_path = f"{CATALOG_ROOT}/gaia_dr3/gaia"
neowise_path = f"{CATALOG_ROOT}/wise/neowise/neowise"

In [None]:
# Helper functions


def time_call(fn, *args, **kwargs):
    """Invoke ``fn(*args, **kwargs)`` once and measure how long it takes.

    Returns:
        A ``(result, elapsed)`` tuple, where ``elapsed`` is the wall-clock
        duration in seconds as measured by ``time.perf_counter``.
    """
    t0 = time.perf_counter()
    value = fn(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    return value, elapsed


def quick_inspect(res_and_time, name: str = "PartitionInfo"):
    """Print a short summary of one timed call.

    Args:
        res_and_time: ``(result, elapsed_seconds)`` tuple as returned by
            ``time_call``. ``result`` must expose a ``pixel_list`` sequence
            whose items have ``order`` and ``pixel`` attributes.
        name: Label printed in front of the timing line.
    """
    result, elapsed_s = res_and_time
    pixels = result.pixel_list

    print(f"{name}: took {elapsed_s*1000:,.2f} ms")
    print(
        "\tPartition Info:",
        # Only the first five pixels are shown, as (order, pixel) pairs.
        ", ".join(str((hp.order, hp.pixel)) for hp in pixels[:5]),
        "...",
    )
    # The format spec belongs after the closing parenthesis of len(...);
    # placing it inside the call is a syntax error.
    print(f"\t{len(pixels):,} pixels.\n")

## Section 1

Start with the fastest option: `hats.catalog.PartitionInfo.read_from_dir` locates `partition_info.csv` and uses it to construct the `PartitionInfo` object.

We'll want to time this, then print or otherwise inspect/verify its values.

In [4]:
# Time the default code path for every catalog first, then report, so the
# printing never overlaps with a measurement.
ztf_1, gaia_1, neo_1 = [
    time_call(PartitionInfo.read_from_dir, catalog_dir)
    for catalog_dir in (ztf_path, gaia_path, neowise_path)
]

for label, timed in (("ZTF", ztf_1), ("Gaia", gaia_1), ("NEOWISE", neo_1)):
    quick_inspect(timed, label)

ZTF: took 10.14 ms
	Partition Info: (2, 137), (2, 138), (3, 257), (3, 258), (3, 321) ...
	10839 pixels.

Gaia: took 2.33 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (2, 8) ...
	2016 pixels.

NEOWISE: took 14.24 ms
	Partition Info: (5, 0), (5, 1), (5, 2), (5, 3), (5, 4) ...
	39774 pixels.



## Section 2

(Mock) delete the `partition_info.csv` file.

Then, run `hats.catalog.PartitionInfo.read_from_dir` again (forcing it to reference `_metadata` instead) and see how long it takes.

Inspect and verify the values.

In [5]:
# Grab the real implementation *before* patching. Inside the `with` block,
# `file_io.does_file_or_directory_exist` resolves to the mock, so the fallback
# must not look the original up through that attribute.
real_exists = file_io.does_file_or_directory_exist


def fake_exists(path):
    """Pretend `partition_info.csv` is missing while `_metadata` is present.

    Any other path is answered by the real existence check.
    """
    path = str(path)
    if path.endswith("partition_info.csv"):
        return False
    if path.endswith("_metadata"):
        return True
    return real_exists(path)


with patch.object(file_io, "does_file_or_directory_exist", autospec=True) as mock_exists:
    mock_exists.side_effect = fake_exists

    ztf_2 = time_call(PartitionInfo.read_from_dir, ztf_path)
    gaia_2 = time_call(PartitionInfo.read_from_dir, gaia_path)
    neo_2 = time_call(PartitionInfo.read_from_dir, neowise_path)

print()

quick_inspect(ztf_2, "ZTF")
quick_inspect(gaia_2, "Gaia")
quick_inspect(neo_2, "NEOWISE")




ZTF: took 147.40 ms
	Partition Info: (4, 0), (4, 1), (5, 8), (5, 9), (5, 10) ...
	10839 pixels.

Gaia: took 943.07 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (3, 16) ...
	2016 pixels.

NEOWISE: took 4,634.71 ms
	Partition Info: (5, 0), (5, 1), (5, 2), (5, 3), (5, 4) ...
	39774 pixels.



## Section 3

(Mock) delete both `partition_info.csv` and `_metadata`.

Call `hats.catalog.PartitionInfo.read_from_dir` with `compute_from_catalog=True`, and see how long it takes (probably a while).

Inspect and verify the values.

In [6]:
# Grab the real implementation *before* patching. Inside the `with` block,
# `file_io.does_file_or_directory_exist` resolves to the mock, so the fallback
# must not look the original up through that attribute.
real_exists = file_io.does_file_or_directory_exist


def fake_exists(path):
    """Pretend both `partition_info.csv` and `_metadata` are missing.

    Any other path is answered by the real existence check.
    """
    path = str(path)
    if path.endswith(("partition_info.csv", "_metadata")):
        return False
    return real_exists(path)


with patch.object(file_io, "does_file_or_directory_exist", autospec=True) as mock_exists:
    mock_exists.side_effect = fake_exists

    # Pass the flag as a real keyword argument. `time_call` forwards
    # *args/**kwargs unchanged, so a dict handed over positionally would reach
    # `read_from_dir` as one positional dict instead of `compute_from_catalog=True`.
    ztf_3 = time_call(PartitionInfo.read_from_dir, ztf_path, compute_from_catalog=True)
    gaia_3 = time_call(PartitionInfo.read_from_dir, gaia_path, compute_from_catalog=True)
    neo_3 = time_call(PartitionInfo.read_from_dir, neowise_path, compute_from_catalog=True)

print()

quick_inspect(ztf_3, "ZTF")
quick_inspect(gaia_3, "Gaia")
quick_inspect(neo_3, "NEOWISE")




ZTF: took 102.37 ms
	Partition Info: (2, 137), (2, 138), (3, 257), (3, 258), (3, 321) ...
	10839 pixels.

Gaia: took 20.82 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (2, 8) ...
	2016 pixels.

NEOWISE: took 458.40 ms
	Partition Info: (5, 0), (5, 1), (5, 2), (5, 3), (5, 4) ...
	39774 pixels.



## Comparison

In [7]:
# The strategies may list pixels in a different order, so each pixel list is
# sorted before the three results per catalog are compared for equality.
ztf_sorted_1, ztf_sorted_2, ztf_sorted_3 = [
    sorted(timed[0].pixel_list) for timed in (ztf_1, ztf_2, ztf_3)
]
print(ztf_sorted_1 == ztf_sorted_2 == ztf_sorted_3)  # Should be True

gaia_sorted_1, gaia_sorted_2, gaia_sorted_3 = [
    sorted(timed[0].pixel_list) for timed in (gaia_1, gaia_2, gaia_3)
]
print(gaia_sorted_1 == gaia_sorted_2 == gaia_sorted_3)  # Should be True

neo_sorted_1, neo_sorted_2, neo_sorted_3 = [
    sorted(timed[0].pixel_list) for timed in (neo_1, neo_2, neo_3)
]
print(neo_sorted_1 == neo_sorted_2 == neo_sorted_3)  # Should be True

True
True
True


In [8]:
# Timed results per catalog, listed in the same order as `strategy_order`.
timings_by_catalog = {
    "ZTF": (ztf_1, ztf_2, ztf_3),
    "Gaia": (gaia_1, gaia_2, gaia_3),
    "NEOWISE": (neo_1, neo_2, neo_3),
}
strategy_order = ["read csv", "read metadata", "compute"]

# Long-format table: one row per (catalog, strategy) with the elapsed time in ms.
df = pd.DataFrame(
    [
        {"catalog": catalog, "strategy": strategy, "ms": timed[1] * 1_000}
        for catalog, runs in timings_by_catalog.items()
        for strategy, timed in zip(strategy_order, runs)
    ]
)

# Wide-format table (catalog x strategy), converted to seconds for display.
wide_ms = df.pivot(index="catalog", columns="strategy", values="ms")
summary_seconds = (wide_ms.loc[list(timings_by_catalog), strategy_order] / 1_000).round(2)

summary_seconds.columns = [f"{c} (s)" for c in summary_seconds.columns]
summary_seconds.style.set_caption("PartitionInfo.read_from_dir timings (seconds)").format("{:,.2f}")

Unnamed: 0_level_0,read csv (s),read metadata (s),compute (s)
catalog,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ZTF,0.01,0.15,0.1
Gaia,0.0,0.94,0.02
NEOWISE,0.01,4.63,0.46
