# PartitionInfo.read_from_dir Benchmarking

## Section 0

Set up.

In [1]:
# Imports

import time
from unittest.mock import patch

import pandas as pd

from hats.catalog.partition_info import PartitionInfo
from hats.io import file_io

In [2]:
# Catalogs
# Absolute filesystem paths of the two catalogs that every benchmark below reads.
# NOTE(review): these are hard-coded to one machine's layout — adjust before
# re-running elsewhere.

ztf_path = "/epyc/data3/hats/catalogs/ztf_dr22/ztf_lc"
gaia_path = "/epyc/data3/hats/catalogs/gaia_dr3/gaia"

In [3]:
# Helper functions


def time_call(fn, *args, **kwargs):
    """Invoke ``fn`` once and measure how long the call takes.

    Args:
        fn: Callable to execute.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple, where ``elapsed_seconds`` is the
        wall-clock duration measured with ``time.perf_counter``.
    """
    t0 = time.perf_counter()
    outcome = fn(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    return outcome, elapsed


def quick_inspect(res_and_time, name: str = "PartitionInfo"):
    """Print a short summary of one timed ``read_from_dir`` call.

    Args:
        res_and_time: A ``(result, elapsed_seconds)`` pair as returned by
            ``time_call``; ``result`` must expose a ``pixel_list`` whose items
            have ``order`` and ``pixel`` attributes.
        name: Label printed in front of the timing line.

    Prints the elapsed time in milliseconds, the ``(order, pixel)`` pairs of the
    first five pixels, and the total number of pixels.
    """
    elapsed_seconds = res_and_time[1]
    print(f"{name}: took {elapsed_seconds*1000:.2f} ms")

    leading_pixels = res_and_time[0].pixel_list[:5]
    preview = ", ".join(str((hp.order, hp.pixel)) for hp in leading_pixels)
    print("\tPartition Info:", preview, "...")

    pixel_count = len(res_and_time[0].pixel_list)
    print(f"\t{pixel_count} pixels.\n")

## Section 1

Call the fastest option: `hats.catalog.PartitionInfo.read_from_dir` is able to locate and reference `partition_info.csv`, and use it to construct the `PartitionInfo` object.

We'll want to time this, then print or otherwise inspect/verify its values.

In [4]:
# Baseline: time read_from_dir against each catalog as it is on disk
# (ZTF first, then Gaia), keeping the (result, seconds) pairs for later cells.
ztf_1, gaia_1 = (
    time_call(PartitionInfo.read_from_dir, catalog_path) for catalog_path in (ztf_path, gaia_path)
)

for label, timed in (("ZTF", ztf_1), ("Gaia", gaia_1)):
    quick_inspect(timed, label)

ZTF: took 9.42 ms
	Partition Info: (2, 137), (2, 138), (3, 257), (3, 258), (3, 321) ...
	10839 pixels.

Gaia: took 2.16 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (2, 8) ...
	2016 pixels.



## Section 2

(Mock) delete the `partition_info.csv` file.

Then, run `hats.catalog.PartitionInfo.read_from_dir` again (forcing it to reference `_metadata` instead) and see how long it takes.

Inspect and verify the values.

In [5]:
# Keep a handle on the real implementation *before* patching. Inside the
# ``with`` block, ``file_io.does_file_or_directory_exist`` resolves to the
# patched stand-in, so looking up ``__wrapped__`` on it never reaches the
# original function; the fallback below must call this saved reference.
real_exists = file_io.does_file_or_directory_exist

with patch.object(file_io, "does_file_or_directory_exist", autospec=True) as mock_exists:

    def fake_exists(path):
        """Report ``partition_info.csv`` as missing and ``_metadata`` as present.

        Any other path is answered by the real existence check.
        """
        path_str = str(path)
        if path_str.endswith("partition_info.csv"):
            return False
        if path_str.endswith("_metadata"):
            return True
        return real_exists(path)

    mock_exists.side_effect = fake_exists

    # Timed while the patch is active so the csv file appears to be absent.
    ztf_2 = time_call(PartitionInfo.read_from_dir, ztf_path)
    gaia_2 = time_call(PartitionInfo.read_from_dir, gaia_path)

print()

quick_inspect(ztf_2, "ZTF")
quick_inspect(gaia_2, "Gaia")




ZTF: took 143.91 ms
	Partition Info: (4, 0), (4, 1), (5, 8), (5, 9), (5, 10) ...
	10839 pixels.

Gaia: took 925.47 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (3, 16) ...
	2016 pixels.



## Section 3

(Mock) delete both `partition_info.csv` and `_metadata`.

Call `hats.catalog.PartitionInfo.read_from_dir` with `compute_from_catalog=True`, and see how long it takes (probably a while).

Inspect and verify the values.

In [6]:
# Keep a handle on the real implementation *before* patching. Inside the
# ``with`` block, ``file_io.does_file_or_directory_exist`` resolves to the
# patched stand-in, so looking up ``__wrapped__`` on it never reaches the
# original function; the fallback below must call this saved reference.
real_exists = file_io.does_file_or_directory_exist

with patch.object(file_io, "does_file_or_directory_exist", autospec=True) as mock_exists:

    def fake_exists(path):
        """Report both ``partition_info.csv`` and ``_metadata`` as missing.

        Any other path is answered by the real existence check.
        """
        if str(path).endswith(("partition_info.csv", "_metadata")):
            return False
        return real_exists(path)

    mock_exists.side_effect = fake_exists

    # ``time_call`` forwards keyword arguments to the timed function, so the
    # flag is passed as a real keyword instead of a positional dict (which
    # would only behave like ``True`` by virtue of being a non-empty dict).
    ztf_3 = time_call(PartitionInfo.read_from_dir, ztf_path, compute_from_catalog=True)
    gaia_3 = time_call(PartitionInfo.read_from_dir, gaia_path, compute_from_catalog=True)

print()

quick_inspect(ztf_3, "ZTF")
quick_inspect(gaia_3, "Gaia")


ZTF: took 99.85 ms
	Partition Info: (2, 137), (2, 138), (3, 257), (3, 258), (3, 321) ...
	10839 pixels.

Gaia: took 20.63 ms
	Partition Info: (2, 0), (2, 1), (2, 2), (2, 3), (2, 8) ...
	2016 pixels.





## Comparison

In [12]:
# The strategies can return pixels in different orders (compare the previews
# printed above), so sort each list before checking that they agree.
ztf_sorted_1 = sorted(ztf_1[0].pixel_list)
ztf_sorted_2 = sorted(ztf_2[0].pixel_list)
ztf_sorted_3 = sorted(ztf_3[0].pixel_list)

gaia_sorted_1 = sorted(gaia_1[0].pixel_list)
gaia_sorted_2 = sorted(gaia_2[0].pixel_list)
gaia_sorted_3 = sorted(gaia_3[0].pixel_list)

ztf_consistent = ztf_sorted_1 == ztf_sorted_2 == ztf_sorted_3
gaia_consistent = gaia_sorted_1 == gaia_sorted_2 == gaia_sorted_3

# A bare comparison in the middle of a cell is evaluated and then discarded
# (only the last expression is displayed), so both checks are asserted to make
# a mismatch in either catalog fail loudly.
assert ztf_consistent, "ZTF pixel lists differ between strategies"
assert gaia_consistent, "Gaia pixel lists differ between strategies"

ztf_consistent and gaia_consistent  # Should be True

True

In [7]:
# One entry per (catalog, strategy) pair, in the order the benchmarks were run.
timed_runs = [
    ("ZTF", "read csv", ztf_1),
    ("Gaia", "read csv", gaia_1),
    ("ZTF", "read metadata", ztf_2),
    ("Gaia", "read metadata", gaia_2),
    ("ZTF", "compute", ztf_3),
    ("Gaia", "compute", gaia_3),
]
# Index 1 of each timed run is the elapsed time in seconds.
rows = [
    {"catalog": catalog, "strategy": strategy, "seconds": run[1]}
    for catalog, strategy, run in timed_runs
]
# Report milliseconds (rounded to 2 decimals) in place of raw seconds.
df = (
    pd.DataFrame(rows)
    .assign(milliseconds=lambda frame: (frame["seconds"] * 1_000).round(2))
    .drop(columns="seconds")
)
df

Unnamed: 0,catalog,strategy,milliseconds
0,ZTF,read csv,9.42
1,Gaia,read csv,2.16
2,ZTF,read metadata,143.91
3,Gaia,read metadata,925.47
4,ZTF,compute,99.85
5,Gaia,compute,20.63
