# Optimizing `get_pixel_paths` and Skymap at higher orders

By Sean McGuire

This notebook shows how the method that builds the hive-style paths of the partition files was made faster.

Before: 

In [2]:
from IPython.core.display import HTML
from hipscat.pixel_math import HealpixPixel
import hipscat as hc
from hipscat.io.file_io.file_pointer import get_fs

In [3]:
# Example catalog root plus three sample pixels, listed as (order, pixel number).
base_dir = "/test/base_dir"
pixels = [HealpixPixel(order, number) for order, number in [(0, 0), (0, 1), (3, 100)]]

# "Before" approach: one library call per pixel.
paths = [
    hc.io.paths.pixel_catalog_file(
        catalog_base_dir=base_dir, pixel_order=hp.order, pixel_number=hp.pixel
    )
    for hp in pixels
]
paths

['/test/base_dir/Norder=0/Dir=0/Npix=0.parquet',
 '/test/base_dir/Norder=0/Dir=0/Npix=1.parquet',
 '/test/base_dir/Norder=3/Dir=0/Npix=100.parquet']

`pixel_catalog_file` computes `pixel_dir` and calls `os.path.join`, and it is called once for every pixel.

Profiling shows it's slow, 3.75s to get the paths:

![Profile of the original per-pixel path construction (3.75 s)](images/Screenshot%202024-04-04%20at%202.36.36%E2%80%AFPM.png)

New method:

In [4]:
from hipscat.io.paths import ORDER_DIRECTORY_PREFIX, DIR_DIRECTORY_PREFIX, PIXEL_DIRECTORY_PREFIX


def pixel_catalog_files(catalog_base_dir, pixels, storage_options=None):
    """Build the partition file path of every pixel in a single pass.

    The file system is looked up once with ``get_fs``; every path is then assembled
    by joining plain strings with that file system's separator.

    Args:
        catalog_base_dir: Base directory of the catalog, as a string. One trailing
            separator, if present, is dropped so the result has no doubled separator.
        pixels: Iterable of objects exposing ``order`` and ``pixel`` attributes
            (for example ``HealpixPixel``).
        storage_options: Passed unchanged to ``get_fs``. Defaults to ``None``.

    Returns:
        A list with one path string per pixel, in the order of ``pixels``, built as
        base / ``<ORDER_DIRECTORY_PREFIX>=<order>`` / ``<DIR_DIRECTORY_PREFIX>=<dir>`` /
        ``<PIXEL_DIRECTORY_PREFIX>=<pixel>.parquet``.
    """
    # The directory level is the pixel number rounded down to a multiple of this value.
    pixels_per_dir = 10000

    fs, _ = get_fs(catalog_base_dir, storage_options)
    sep = fs.sep
    base_path_stripped = catalog_base_dir.removesuffix(sep)
    return [
        sep.join(
            [
                base_path_stripped,
                f"{ORDER_DIRECTORY_PREFIX}={pixel.order}",
                f"{DIR_DIRECTORY_PREFIX}={pixel.pixel // pixels_per_dir * pixels_per_dir}",
                f"{PIXEL_DIRECTORY_PREFIX}={pixel.pixel}.parquet",
            ]
        )
        for pixel in pixels
    ]

In [5]:
pixel_catalog_files(base_dir, pixels, None)

['/test/base_dir/Norder=0/Dir=0/Npix=0.parquet',
 '/test/base_dir/Norder=0/Dir=0/Npix=1.parquet',
 '/test/base_dir/Norder=3/Dir=0/Npix=100.parquet']

New profiling: down to 0.5 s.

![Profile of the new batched path construction (0.5 s)](images/Screenshot%202024-04-04%20at%203.10.22%E2%80%AFPM.png)

## Skymap histogram at high orders

In [2]:
import lsdb

In [3]:
ztf = lsdb.read_hipscat("/data3/epyc/data3/hipscat/catalogs/ztf_axs/ztf_dr14")

In [35]:
# Number of partitions remaining after the order and box filters, read from the private
# `_ddf` attribute (presumably the underlying Dask DataFrame — confirm).
# NOTE(review): the upper RA bound of 630 is above 360; if RA is in degrees, confirm this
# range is intended.
ztf.order_search(6, 6).box(ra=[0,630])._ddf.npartitions

4

In [36]:
ztf.order_search(1, 1)._ddf.npartitions

3

In [4]:
from distributed import Client

# Local cluster for the timings below: six workers with one thread each and a
# 10GB memory limit per worker.
client = Client(
    n_workers=6,
    threads_per_worker=1,
    memory_limit="10GB",
    local_directory="/data3/epyc/projects3/sean_hipscat",
    dashboard_address=":41899",
)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36670 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:36670/status,

0,1
Dashboard: http://127.0.0.1:36670/status,Workers: 6
Total threads: 6,Total memory: 55.88 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41291,Workers: 6
Dashboard: http://127.0.0.1:36670/status,Total threads: 6
Started: Just now,Total memory: 55.88 GiB

0,1
Comm: tcp://127.0.0.1:41926,Total threads: 1
Dashboard: http://127.0.0.1:40299/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:33018,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-ninibbqg,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-ninibbqg

0,1
Comm: tcp://127.0.0.1:34923,Total threads: 1
Dashboard: http://127.0.0.1:39095/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:46188,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-nax5mm8q,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-nax5mm8q

0,1
Comm: tcp://127.0.0.1:35751,Total threads: 1
Dashboard: http://127.0.0.1:43271/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:45396,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-ymsvg222,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-ymsvg222

0,1
Comm: tcp://127.0.0.1:40389,Total threads: 1
Dashboard: http://127.0.0.1:46483/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:37481,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-52ua4oc0,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-52ua4oc0

0,1
Comm: tcp://127.0.0.1:34479,Total threads: 1
Dashboard: http://127.0.0.1:40951/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:35329,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-zqwtuw6w,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-zqwtuw6w

0,1
Comm: tcp://127.0.0.1:39684,Total threads: 1
Dashboard: http://127.0.0.1:44780/status,Memory: 9.31 GiB
Nanny: tcp://127.0.0.1:32842,
Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-t9kuimy3,Local directory: /data3/epyc/projects3/sean_hipscat/dask-scratch-space/worker-t9kuimy3


In [40]:
%timeit ztf.order_search(1, 1).skymap_histogram(lambda x, _: len(x), order=5)

474 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
%timeit ztf.order_search(6, 6).box(ra=[0,630]).skymap_histogram(lambda x, _: len(x), order=10)

545 ms ± 37.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%timeit ztf.order_search(1, 1).skymap_histogram(lambda x, _: len(x), order=10)

6.7 s ± 1.58 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
from distributed import performance_report

# Capture a distributed performance report for the skymap_histogram call and write it to
# the HTML file below. The callback returns the number of rows it is handed.
with performance_report(filename="/astro/users/seanmcgu/histogram_skymap.html"):
    ztf.order_search(1, 1).skymap_histogram(lambda x, _: len(x), order=10)

Traceback (most recent call last):
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/distributed/sizeof.py", line 17, in safe_sizeof
    return sizeof(obj)
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/dask/utils.py", line 767, in __call__
    return meth(arg, *args, **kwargs)
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/dask/sizeof.py", line 96, in sizeof_python_dict
    + sizeof(list(d.values()))
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/dask/utils.py", line 767, in __call__
    return meth(arg, *args, **kwargs)
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/dask/sizeof.py", line 59, in sizeof_python_collection
    return sys.getsizeof(seq) + sum(map(sizeof, seq))
  File "/astro/users/seanmcgu/anaconda3/envs/lsdb/lib/python3.10/site-packages/dask/utils.py", line 767, in __call__
    return meth(arg, *args, **kwargs)
  Fil

In [49]:
HTML("/astro/users/seanmcgu/histogram_skymap.html")

In [6]:
from distributed import performance_report

# Same call as the earlier performance_report cell, written to a separate HTML file.
# NOTE(review): the file name suggests this run used the old implementation — confirm
# which version of the library was installed when this cell was executed.
with performance_report(filename="/astro/users/seanmcgu/histogram_skymap_old_code.html"):
    ztf.order_search(1, 1).skymap_histogram(lambda x, _: len(x), order=10)

In [3]:
HTML("/astro/users/seanmcgu/histogram_skymap_old_code.html")

## Old Code

```python
@delayed
def perform_inner_skymap(
    partition: pd.DataFrame,
    func: Callable[[pd.DataFrame, HealpixPixel], Any],
    pixel: HealpixPixel,
    target_order: int,
    **kwargs,
) -> np.ndarray:
    """Splits a partition into pixels at a target order and performs a given function on the new pixels

    Args:
        partition: the rows of one catalog partition.
        func: called as ``func(filtered_rows, HealpixPixel(target_order, p), **kwargs)``
            for every target-order pixel number ``p`` in the range built below.
        pixel: the pixel of ``partition``; its ``order`` and ``pixel`` attributes are used.
        target_order: order of the pixels that ``func`` is evaluated on.
        **kwargs: forwarded to ``func`` on every call.

    Returns:
        The array produced by ``np.vectorize``, one entry per target-order pixel in
        ascending pixel-number order.
    """
    delta_order = target_order - pixel.order
    # Target-order pixel numbers spanned by `pixel`: the 4**delta_order consecutive values
    # starting at pixel.pixel << (2 * delta_order).
    pixels = np.arange(pixel.pixel << (2 * delta_order), (pixel.pixel + 1) << (2 * delta_order))
    # The lambda runs for every value in `pixels`; each run hands the whole partition to
    # filter_by_hipscat_index_to_pixel before calling func, so the work grows with the
    # number of target-order pixels.
    return np.vectorize(
        lambda p: func(
            filter_by_hipscat_index_to_pixel(partition, target_order, p),
            HealpixPixel(target_order, p),
            **kwargs,
        )
    )(pixels)
```

## New Code

```python
@delayed
def perform_inner_skymap(
    partition: pd.DataFrame,
    func: Callable[[pd.DataFrame, HealpixPixel], Any],
    pixel: HealpixPixel,
    target_order: int,
    default_value: Any = 0,
    **kwargs,
) -> np.ndarray:
    """Splits a partition into pixels at a target order and performs a given function on the new pixels

    Args:
        partition: the rows of one catalog partition; its index values are passed to
            ``hipscat_id_to_healpix``.
        func: called as ``func(group_rows, HealpixPixel(target_order, p), **kwargs)`` once
            per group of rows that share a target-order pixel.
        pixel: the pixel of ``partition``; its ``order`` and ``pixel`` attributes are used.
        target_order: order of the pixels that ``func`` is evaluated on.
        default_value: value the output array is initialised with; positions whose pixel
            has no group in ``partition`` keep it.
        **kwargs: forwarded to ``func`` on every call.

    Returns:
        An array of length ``1 << (2 * delta_order)``; position ``i`` holds the result for
        target-order pixel ``(pixel.pixel << (2 * delta_order)) + i``.
    """
    hipscat_index = partition.index.values
    # Pixel numbers at target_order for the whole index in a single call (presumably one
    # value per row, since the result is used as the groupby key below).
    order_pixels = hipscat_id_to_healpix(hipscat_index, target_order=target_order)

    def apply_func(df):
        # Recover this group's target-order pixel from the index value of its first row.
        p = hipscat_id_to_healpix([df.index.values[0]], target_order=target_order)[0]
        return func(df, HealpixPixel(target_order, p), **kwargs)

    # func runs once per group, i.e. only for pixels that have rows. sort=False is enough
    # because each result is placed by its pixel offset below, not by group order.
    gb = partition.groupby(order_pixels, sort=False).apply(apply_func)
    delta_order = target_order - pixel.order
    # NOTE(review): np.full takes its dtype from default_value, so with the default 0 the
    # array is integer and func results are cast on assignment — confirm this is intended
    # when func returns non-integer values.
    img = np.full(1 << 2 * delta_order, fill_value=default_value)
    min_pixel_value = pixel.pixel << 2 * delta_order
    # Group keys are absolute pixel numbers; subtracting the first pixel number covered by
    # `pixel` turns them into positions in img. Assumes every row's pixel lies inside
    # `pixel` — TODO confirm.
    img[gb.index.values - min_pixel_value] = gb.values
    return img
```