<img align='left' src = '../../images/linea.png' width=150 style='padding: 20px'> 

# Report HATS
## Generating the margin cache for DP0.2 (Object) Skinny catalog

Performance report on LINCC libraries.

Contacts: Luigi Silva ([luigi.silva@linea.org.br](mailto:luigi.silva@linea.org.br)); Julia Gschwend ([julia@linea.org.br](mailto:julia@linea.org.br)).

Last check: 08/11/2024

#### Acknowledgments

'_This notebook used computational resources from the Associação Laboratório Interinstitucional de e-Astronomia (LIneA) with financial support from the INCT of e-Universe (Process No. 465376/2014-2)._'

'_This notebook uses libraries from the LSST Interdisciplinary Network for Collaboration and Computing (LINCC) Frameworks project, such as the hats, hats_import, and lsdb libraries. The LINCC Frameworks project is supported by Schmidt Sciences. It is also based on work supported by the National Science Foundation under Grant No. AST-2003196. Additionally, it receives support from the DIRAC Institute at the Department of Astronomy of the University of Washington. The DIRAC Institute is supported by gifts from the Charles and Lisa Simonyi Fund for Arts and Sciences and the Washington Research Foundation._'

# Imports

Let us import the packages that we will need.

In [None]:
########################### GENERAL ##########################
import os
import gc
import re
import sys
import glob
import time
import math
import getpass
import warnings
import tables_io
import subprocess
import numpy as np
import pandas as pd
import healpy as hp
from pathlib import Path
from datetime import datetime
############################ DASK ############################
import dask
from dask import dataframe as dd
from dask import delayed
from dask.distributed import Client, performance_report, wait
import dask_jobqueue
from dask_jobqueue import SLURMCluster
########################## HATS ###########################
import hats
from hats.inspection.visualize_catalog import plot_pixels
from hats.pixel_math import HealpixPixel
########################## HATS IMPORT ###########################
import hats_import
from hats_import.catalog.file_readers import ParquetReader, FitsReader
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hats_import.pipeline import ImportArguments, pipeline_with_client
############################ LSDB ############################
import lsdb
from lsdb.core.search import BoxSearch
######################## VISUALIZATION #######################
### BOKEH
import bokeh
from bokeh.io import output_notebook, show
from bokeh.models import ColorBar, LinearColorMapper
from bokeh.palettes import Viridis256

### HOLOVIEWS
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import rasterize, dynspread

### GEOVIEWS
import geoviews as gv
import geoviews.feature as gf
from cartopy import crs

### DATASHADER
import datashader as ds

### MATPLOTLIB
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
########################## ASTRONOMY #########################
from astropy.io import fits
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.units.quantity import Quantity

Defining the plots to be inline.

In [None]:
# Select the Bokeh backend for both HoloViews and GeoViews.
hv.extension('bokeh')
gv.extension('bokeh')
# Send standalone Bokeh output to the notebook.
output_notebook()
# IPython magic: draw matplotlib figures inline in the notebook.
%matplotlib inline

Printing the versions of the libraries.

In [None]:
### Getting the hats version manually, because it has no __version__ attribute.
# Start from a placeholder so the report below (and the cell that later saves
# the versions to disk) never fails with a NameError when the lookup fails.
hats_version = "unknown"
try:
    result = subprocess.run(
        ["conda", "run", "-p", "/lustre/t0/scratch/users/luigi.silva/hats_env_081124", "conda", "list", "hats"],
        stdout=subprocess.PIPE, text=True
    )
except OSError as conda_error:
    # conda is not available on PATH (or could not be started).
    print(f'Could not query conda for the hats version: {conda_error}')
else:
    # "conda list" prints one "<name> <version> <build> <channel>" row per package.
    for line in result.stdout.splitlines():
        if line.startswith("hats "):
            hats_version = line.split()[1]
            break

if hats_version == "unknown":
    # Fall back to the metadata of the package installed in the running kernel.
    from importlib.metadata import PackageNotFoundError, version

    try:
        hats_version = version("hats")
    except PackageNotFoundError:
        pass

### Printing the versions.
print(f'python version: {sys.version}')
print(f'numpy version: {np.__version__}')
print(f'dask version: {dask.__version__}')
print(f'dask_jobqueue version: {dask_jobqueue.__version__}')
print(f'hats version: {hats_version}')
print(f'hats_import version: {hats_import.__version__}')
print(f'lsdb version: {lsdb.__version__}')

# Configurations

## Running configurations

Set the configuration for this run.

In [None]:
# Run the margin cache pipeline? When False, an existing margin cache is
# used by the analysis cells instead.
run_the_pipeline = False

# Save the stdout/stderr of the Dask SLURMCluster jobs to log files?
save_the_dask_jobs_info = True

# Save the general information of this run (main library versions, input
# file sizes, jobs scontrol info, output file sizes)?
save_the_info = True

# Print the collected information inline in the notebook?
show_info_inline = True

# Close the client and the cluster at the end?
close_the_cluster = True

## Catalogs paths configurations

Defining the path to the INPUT HATS catalog.

In [None]:
# Directory of the input HATS catalog read by the cells below.
hats_input_catalog = '/lustre/t1/cl/lsst/dp02/secondary/catalogs/skinny_hats'

Defining the name of the RA and DEC columns in the input catalog.

In [None]:
# Column names holding right ascension and declination in the input catalog.
hats_input_catalog_ra = 'coord_ra'
hats_input_catalog_dec = 'coord_dec'

If you choose not to run the pipeline, give the path to an existing margin cache for the above HATS catalog.

In [None]:
# Existing margin cache used by the analysis cells; the pipeline cell
# reassigns this name when run_the_pipeline is True.
CATALOG_MARGIN_CACHE_DIR='/lustre/t1/cl/lsst/pz_project/dp02_skinny_margin_cache'

Defining the OUTPUT catalog path and name.

In [None]:
# Output location of the margin cache; only needed when the pipeline runs.
if run_the_pipeline:
    hats_margin_cache_name = 'dp02_skinny_margin_cache'
    hats_margin_cache_path = '/lustre/t1/cl/lsst/pz_project'

Defining the USER base path, used to save logs, plots and other information about the run.

In [None]:
# A per-user scratch directory is only required when something gets saved.
if save_the_info or save_the_dask_jobs_info:
    user = getpass.getuser()
    user_base_path = '/lustre/t0/scratch/users/{}/report_hats/DP02-skinny-margin-cache'.format(user)

### Creating directories

In [None]:
if save_the_dask_jobs_info or save_the_info:
    # Timestamp identifying the output directory of this run.
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

    # <user_base_path>/run_hats_<timestamp>/logs/dask_logs
    run_path = os.path.join(user_base_path, 'run_hats_' + current_date)
    logs_dir = os.path.join(run_path, 'logs')
    dask_logs_dir = os.path.join(logs_dir, 'dask_logs')

    # os.makedirs also creates every missing parent, so asking for the
    # deepest directory creates the whole tree in one call.
    os.makedirs(dask_logs_dir, exist_ok=True)

## Cluster configurations

Do you want to customize extra dask parameters?

In [None]:
# When True, the next code cell applies custom Dask worker-memory settings.
extra_dask_configs=False

If you choose ```True```, see the explanation of the parameters and customize them below.

**Explanation of Parameters**

* ```distributed.worker.memory.target```: fraction of the worker memory limit that the data managed by Dask may occupy before the worker starts moving the least recently used data to disk. Acting at this threshold reduces the risk of excessive accumulation in RAM.

* ```distributed.worker.memory.spill```: defines the point at which Dask starts spilling data to disk (swap) instead of keeping it in RAM. This helps free up memory for new tasks.

* ```distributed.worker.memory.pause```: when memory usage reaches the specified percentage, Dask will temporarily pause the worker to prevent excessive resource use.

* ```distributed.worker.memory.terminate```: if memory usage reaches the specified percentage, the worker will be restarted, which prevents crashes and helps keep usage under control.

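* ```distributed.worker.memory.recent-to-old```: determines the fraction of recently accessed data Dask considers as “old” and, therefore, eligible for spilling to disk. A lower percentage (e.g., 0.2 for 20%) means only the most recent data is retained in RAM, while older data is more likely to be released, helping to manage cache memory efficiently. Note: the worker-memory setting listed in the Dask documentation is ```recent-to-old-time```, which is a time duration rather than a fraction; confirm that ```recent-to-old``` is honoured by the installed version before relying on it.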

In [None]:
if not extra_dask_configs:
    print("Running DASK with the standard memory configuration.")
else:
    # Custom worker-memory thresholds, as fractions of the worker memory limit.
    dask_config = {
        "distributed.worker.memory.target": 0.75,
        "distributed.worker.memory.spill": 0.85,
        "distributed.worker.memory.pause": 0.92,
        "distributed.worker.memory.terminate": 0.98,
        # NOTE(review): confirm this key is recognised by the installed Dask.
        "distributed.worker.memory.recent-to-old": 0.2,
    }

    # Make the settings effective for this session.
    dask.config.set(dask_config)

Defining the configurations for the cluster.

In [None]:
# Resources requested from SLURM through dask_jobqueue.
interface = "ib0"
queue = 'cpu_small'
cores = 48
processes = 2
memory = '114GB'
walltime = '04:00:00'
number_of_nodes = 20

# Where SLURM writes each job's stdout/stderr: per-job log files when the
# jobs info is kept, otherwise discarded.
if save_the_dask_jobs_info:
    dask_job_stdout = f'{dask_logs_dir}/dask_job_%j_{current_date}.out'
    dask_job_stderr = f'{dask_logs_dir}/dask_job_%j_{current_date}.err'
else:
    dask_job_stdout = dask_job_stderr = '/dev/null'

job_extra_directives = [
    '--propagate',
    f'--output={dask_job_stdout}',
    f'--error={dask_job_stderr}',
]

Starting the cluster.

In [None]:
# Refresh the timestamp used in the names of files written from here on.
current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

# Configuring the SLURMCluster.
cluster = SLURMCluster(
    interface=interface,         # Network interface name (set above)
    queue=queue,                 # Name of the queue
    cores=cores,                 # Cores requested per job (presumably one job per node; confirm)
    processes=processes,         # Number of dask processes per job
    memory=memory,               # Memory requested per job
    walltime=walltime,           # Maximum execution time
    job_extra_directives=job_extra_directives,
)

# Scaling the cluster to number_of_nodes SLURM jobs
cluster.scale(jobs=number_of_nodes)

# Defining the dask client
client = Client(cluster)

# Wait until number_of_nodes*processes workers have connected
cluster.wait_for_workers(n_workers=number_of_nodes*processes)
# Run a garbage collection on every worker before starting.
client.run(lambda: gc.collect())

Showing information about the cluster.

In [None]:
# Last expression of the cell: the notebook displays the cluster object.
cluster_info = client.cluster
cluster_info

Saving the requested resources.

In [None]:
if save_the_info:

    # Memory-related Dask settings reported in their own section.
    memory_param_names = [
        "distributed.worker.memory.target",
        "distributed.worker.memory.spill",
        "distributed.worker.memory.pause",
        "distributed.worker.memory.terminate",
        "distributed.worker.memory.recent-to-old",
        "distributed.worker.memory.recent-to-old-time",
    ]

    # Resources requested from SLURM for the Dask cluster.
    requested_resources = {
        "interface": f"{interface}",
        "queue": f"{queue}",
        "cores": cores,
        "processes": processes,
        "memory": f"{memory}",
        "walltime": f"{walltime}",
        "job_extra_directives": job_extra_directives,
        "number_of_nodes": number_of_nodes
    }

    # dask.config.get resolves dotted keys through the nested configuration
    # and returns the default when any level is missing or is not a mapping,
    # so unset parameters are uniformly reported as None.
    memory_params = {
        param: dask.config.get(param, default=None) for param in memory_param_names
    }

    # Full Dask configuration, dumped in the last section of the report.
    dask_config = dask.config.config

    # Requested resources section
    output = ["# Requested resources"]
    output.extend(f"{key}={value}" for key, value in requested_resources.items())

    # Memory configuration section
    output.append("\n# Dask memory configuration:")
    output.extend(f'"{key}": {value}' for key, value in memory_params.items())

    # Section with all Dask configurations (one level of nesting is expanded).
    output.append("\n# Dask all configurations:")
    for section, config in dask_config.items():
        if isinstance(config, dict):
            output.append(f"[{section}]")
            output.extend(f"{key}: {value}" for key, value in config.items())
        else:
            output.append(f"{section}: {config}")

    # Saving the report to the logs directory.
    with open(f'{logs_dir}/requested_resources_info.txt', 'w') as f:
        f.write("\n".join(output))

    print("Informations saved in requested_resources_info.txt")

# Reading the input catalog

Loading the input catalog with both lsdb and hats.

In [None]:
# The same catalog is opened twice: the lsdb object is used below for the
# column list, the hats object for the pixel plot and partition/row info.
loaded_hats_catalog_from_disk_lsdb = lsdb.read_hats(hats_input_catalog)
loaded_hats_catalog_from_disk_hats = hats.read_hats(hats_input_catalog)

Making the pixels plot.

In [None]:
# Draw the pixel map of the input catalog; additionally save it when requested.
plot_pixels(loaded_hats_catalog_from_disk_hats)
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    plt.savefig(f"{logs_dir}/input_pixels_plot_{current_date}.png")

Computing the number of rows in the HATS catalog.

In [None]:
# Row count comes from the hats catalog metadata, columns from the lsdb object.
hats_catalog_total_rows = loaded_hats_catalog_from_disk_hats.catalog_info.total_rows
hats_catalog_total_columns = loaded_hats_catalog_from_disk_lsdb.columns.to_list()

if show_info_inline:
    print(f"HATS catalog path: {hats_input_catalog} \n")
    print(f"Total number of rows: {hats_catalog_total_rows}\n")
    print(f"Total number of columns: {len(hats_catalog_total_columns)}\n\n")

if save_the_info:
    input_summary_lines = [
        f"HATS catalog path: {hats_input_catalog}\n",
        f"Total number of rows: {hats_catalog_total_rows}\n",
        f"Total number of columns: {len(hats_catalog_total_columns)}\n\n",
    ]
    # Append mode: repeated executions accumulate in the same file.
    with open(f'{logs_dir}/input_total_len_of_catalog_{current_date}.txt', 'a') as f:
        f.writelines(input_summary_lines)

### Summarize pixels and sizes of input catalog
* "healpix orders: distinct healpix orders represented in the partitions

* num partitions: total number of partition files

Size on disk data - using the file sizes fetched above, check the balance of your data. If your rows are fixed-width (e.g. no nested arrays, and few NaNs), the ratio here should be similar to the ratio above. If they’re very different, and you experience problems when parallelizing operations on your data, you may consider re-structuring the data representation.

* min size_on_disk: smallest file (in GB)

* max size_on_disk: largest file size (in GB)

* size_on_disk ratio: max/min

* total size_on_disk: sum of all parquet catalog files (actual catalog size may vary due to other metadata files)"

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
catalog_dir = hats_input_catalog

catalog = loaded_hats_catalog_from_disk_hats

# One row per partition, with its HEALPix order (Norder) and pixel (Npix).
info_frame = catalog.partition_info.as_dataframe()

# Size in bytes of each partition's data file, filled in a single column
# assignment (no row-by-row .loc writes and no reuse of unrelated names).
info_frame["size_on_disk"] = [
    os.path.getsize(
        hats.io.paths.pixel_catalog_file(catalog_dir, HealpixPixel(norder, npix))
    )
    for norder, npix in zip(info_frame["Norder"], info_frame["Npix"])
]

info_frame = info_frame.astype(int)
# Same sizes expressed in GiB (bytes / 1024**3).
info_frame["gbs"] = info_frame["size_on_disk"] / (1024 * 1024 * 1024)

In [None]:
if save_the_info or show_info_inline:
    # Statistics shared by the saved and the inline report.
    healpix_orders = info_frame["Norder"].unique()
    num_partitions = len(info_frame["Npix"])
    min_gbs = info_frame["gbs"].min()
    max_gbs = info_frame["gbs"].max()
    gbs_ratio = max_gbs / min_gbs
    total_gbs = info_frame["gbs"].sum()

if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    # The saved report uses 8 decimal places.
    with open(f"{logs_dir}/input_summarize_pixels_{current_date}.txt", "w") as file:
        file.writelines([
            f'healpix orders: {healpix_orders}\n',
            f'num partitions: {num_partitions}\n',
            "------\n",
            f'min size_on_disk: {min_gbs:.8f}\n',
            f'max size_on_disk: {max_gbs:.8f}\n',
            f'size_on_disk ratio: {gbs_ratio:.8f}\n',
            f'total size_on_disk: {total_gbs:.8f}\n',
        ])

if show_info_inline:
    # The inline report uses 7 decimal places.
    for report_line in (
        f'healpix orders: {healpix_orders}',
        f'num partitions: {num_partitions}',
        "------",
        f'min size_on_disk: {min_gbs:.7f}',
        f'max size_on_disk: {max_gbs:.7f}',
        f'size_on_disk ratio: {gbs_ratio:.7f}',
        f'total size_on_disk: {total_gbs:.7f}',
    ):
        print(report_line)

### File size distribution of input catalog
"Below we look at histograms of file sizes.

In our initial testing, we find that there’s a “sweet spot” file size of 100MB-1GB. Files that are smaller create more overhead for individual reads. Files that are much larger may create slow-downs when cross-matching between catalogs. Files that are much larger can create out-of-memory issues for dask when loading from disk.

The majority of your files should be in the “sweet spot”, and no files in the “too-big” category."

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
if save_the_info or show_info_inline:
    # File-size classes in GB and the share of partitions falling in each one.
    bins = [0, 0.5, 1, 2, 100]
    labels = ["small-ish", "sweet-spot", "big-ish", "too-big"]
    hist = np.histogram(info_frame["gbs"], bins=bins)[0]
    pcts = hist / len(info_frame)
    distribution_lines = [
        f"{label} \t: {count} \t({pct*100:.1f} %)"
        for label, count, pct in zip(labels, hist, pcts)
    ]

if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

    # Histogram written to disk and closed without being displayed.
    plt.hist(info_frame["gbs"], edgecolor='black')
    plt.xlabel("File size (GB)")
    plt.ylabel("Number of files")
    plt.savefig(f"{logs_dir}/input_file_size_histogram_{current_date}.png")
    plt.close()

    with open(f"{logs_dir}/input_file_size_distribution_{current_date}.txt", "w") as file:
        file.writelines(f"{text}\n" for text in distribution_lines)

if show_info_inline:
    # Same histogram, shown inline this time.
    plt.hist(info_frame["gbs"], edgecolor='black')
    plt.xlabel("File size (GB)")
    plt.ylabel("Number of files")
    plt.show()

    for text in distribution_lines:
        print(text)

# Saving library and job information

Saving the libraries versions information.

In [None]:
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    versions_file = f'{logs_dir}/main_lib_versions_{current_date}.txt'

    # One line per library, in the same order as the inline report.
    version_lines = [
        f'python version: {sys.version} \n',
        f'numpy version: {np.__version__} \n',
        f'dask version: {dask.__version__} \n',
        f'dask_jobqueue version: {dask_jobqueue.__version__} \n',
        f'hats version: {hats_version} \n',
        f'hats_import version: {hats_import.__version__} \n',
        f'lsdb version: {lsdb.__version__} \n',
    ]
    with open(versions_file, 'w') as f:
        f.writelines(version_lines)

    print(f'File saved as: {versions_file} \n')

Defining functions to get information about the jobs running in the cluster.

In [None]:
# Function to collect information about a job using the scontrol show job command
def get_scontrol_job_info(job_id):
    """Return the fields reported by ``scontrol show job`` as a dictionary.

    Any bracketed part of ``job_id`` (``[...]``) is removed before the id is
    passed to ``scontrol``. The command output is split on whitespace and
    every ``key=value`` token becomes an entry, split at the first ``=``;
    tokens without ``=`` are ignored and a repeated key keeps its last value.

    Parameters
    ----------
    job_id : str
        Job identifier as printed by ``squeue``.

    Returns
    -------
    dict
        Mapping from field name to its value, both as strings.
    """
    scontrol_target = re.sub(r'\[.*?\]', '', job_id)

    completed = subprocess.run(
        ['scontrol', 'show', 'job', scontrol_target], stdout=subprocess.PIPE
    )
    # Whitespace splitting covers both the line breaks and the spaces that
    # separate the tokens inside each line.
    tokens = completed.stdout.decode('utf-8').split()

    return dict(token.split("=", 1) for token in tokens if "=" in token)

# Function to collect information about all jobs of the user
def get_all_jobs_info_MINE():
    """Collect the ``scontrol`` information of every job of the current user.

    The job ids are listed with ``squeue -u <user> -h -o %i`` and each one is
    passed to ``get_scontrol_job_info``, which strips bracketed ranges from
    the id itself. A job whose lookup raises is reported on stdout and
    skipped, so a single failure does not discard the remaining jobs.

    Returns
    -------
    pandas.DataFrame
        One row per job and one column per ``scontrol`` field; empty when
        the user has no jobs.
    """
    # getpass.getuser() is what the rest of the notebook uses; unlike
    # os.getenv('USER') it does not return None when USER is unset, which
    # would make subprocess.run fail with a TypeError.
    user = getpass.getuser()

    # List the ids of the user's jobs, one per line and without a header.
    result = subprocess.run(['squeue', '-u', user, '-h', '-o', '%i'], stdout=subprocess.PIPE)
    job_ids = result.stdout.decode('utf-8').splitlines()

    jobs_info = []
    for job_id in job_ids:
        try:
            jobs_info.append(get_scontrol_job_info(job_id))
        except Exception as e:  # best effort: keep collecting the other jobs
            print(f"Error processing job {job_id}: {e}")

    return pd.DataFrame(jobs_info)


# Function to collect information about all jobs that do not belong to the current user
def get_all_jobs_info_NOT_MINE():
    """Collect the ``scontrol`` information of the jobs of every other user.

    The jobs are listed with ``squeue -h -o "%i %u"`` (job id and user name),
    the jobs of the current user are dropped, and each remaining id is passed
    to ``get_scontrol_job_info``. A job whose lookup raises is reported on
    stdout and skipped.

    Returns
    -------
    pandas.DataFrame
        One row per job and one column per ``scontrol`` field; empty when no
        other user has jobs.
    """
    # Same user lookup as the rest of the notebook. With os.getenv('USER')
    # an unset variable yields None, and every job would count as "not mine".
    current_user = getpass.getuser()

    # Captures the list of jobs with their owners.
    result = subprocess.run(['squeue', '-h', '-o', '%i %u'], stdout=subprocess.PIPE)
    job_lines = result.stdout.decode('utf-8').splitlines()

    jobs_info = []
    for line in job_lines:
        fields = line.split()
        if len(fields) != 2:
            # Blank or unexpected line: skip it instead of failing to unpack.
            continue
        job_id, user = fields

        # Ignores jobs belonging to the current user.
        if user == current_user:
            continue

        try:
            jobs_info.append(get_scontrol_job_info(job_id))
        except Exception as e:  # best effort: keep collecting the other jobs
            print(f"Error processing job {job_id}: {e}")

    return pd.DataFrame(jobs_info)

Getting my jobs.

In [None]:
# Gather the scontrol fields of the current user's jobs into a DataFrame.
df_jobs_MINE = get_all_jobs_info_MINE()

if show_info_inline:
    # Only the resource-related fields are shown inline.
    my_jobs_columns = ['JobId', 'NodeList', 'NumNodes', 'NumCPUs', 'NumTasks', 'CPUs/Task', 'TRES']
    print(df_jobs_MINE[my_jobs_columns])

Getting the jobs of other users.

In [None]:
# Gather the scontrol fields of the other users' jobs into a DataFrame.
df_jobs_NOT_MINE = get_all_jobs_info_NOT_MINE()

if len(df_jobs_NOT_MINE) == 0:
    # Placeholder frame, saved later in place of an empty CSV.
    df_jobs_NOT_MINE_EMPTY_MSG = pd.DataFrame({"EMPTY": ["There are no other jobs running in the cluster."]})
    print("There are no other jobs running in the cluster.")
elif show_info_inline:
    print(df_jobs_NOT_MINE[['JobId', 'NodeList', 'NumNodes', 'NumCPUs', 'NumTasks', 'CPUs/Task', 'TRES']])

Saving the data of the jobs in a csv.

In [None]:
if save_the_info==True:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    
    file_name_MINE = f'{logs_dir}/jobs_info_MINE_{current_date}.csv'
    file_name_NOT_MINE = f'{logs_dir}/jobs_info_NOT_MINE_{current_date}.csv'
    
    df_jobs_MINE.to_csv(file_name_MINE, index=False)
    if len(df_jobs_NOT_MINE)!=0:
        df_jobs_NOT_MINE.to_csv(file_name_NOT_MINE, index=False)
    else:
        df_jobs_NOT_MINE_EMPTY_MSG.to_csv(file_name_NOT_MINE, index=False)
        
    print(f'Files saved as: \n')
    print(f'{file_name_MINE} \n')
    print(f'{file_name_NOT_MINE} \n')

# Generating the margin cache for the HATS catalog

Generating the margin cache for the HATS catalog.

In [None]:
if run_the_pipeline==True:
    ################################## INPUT CONFIGS #################################
    ### Directory of the input catalog.
    CATALOG_HATS_DIR = Path(hats_input_catalog)
    MARGIN_CACHE_THRESHOLD = 1.0 #arcsec
    ###########################################################################################

    ################################## OUTPUT CONFIGS #################################
    ### Name of the margin cache to be saved.
    CATALOG_MARGIN_CACHE_NAME = hats_margin_cache_name
    
    ### Output directory for the margin cache and logs.
    ### NOTE(review): logs_dir is only defined when one of the save flags is True.
    HATS_DIR = Path(hats_margin_cache_path)
    LOGS_DIR = Path(logs_dir)
    
    ### Replaces the path of the pre-existing margin cache set earlier.
    CATALOG_MARGIN_CACHE_DIR = HATS_DIR / CATALOG_MARGIN_CACHE_NAME

    ### Path to dask performance report.
    PERFORMANCE_REPORT_NAME = f'dask_performance_report_{current_date}.html'
    PERFORMANCE_DIR = LOGS_DIR / PERFORMANCE_REPORT_NAME
    ###########################################################################################

    ################################## RUNNING THE PIPELINE ##################################
    with performance_report(filename=PERFORMANCE_DIR):   
        ### Getting informations from the catalog.
        ### Note: this rebinds the notebook-level catalog and info_frame names.
        catalog = hats.read_hats(CATALOG_HATS_DIR)
        info_frame = catalog.partition_info.as_dataframe()
        info_frame = info_frame.astype(int)
        
        ### Computing the margin cache, if it is possible.
        ### With a single partition (or none) the pipeline is skipped with a warning.
        number_of_pixels = len(info_frame["Npix"])
        if number_of_pixels <= 1:
            warnings.warn(f"Number of pixels is equal to {number_of_pixels}. Impossible to compute margin cache.")
        else:
            margin_cache_args = MarginCacheArguments(
                input_catalog_path=CATALOG_HATS_DIR,
                output_path=HATS_DIR,
                margin_threshold=MARGIN_CACHE_THRESHOLD,  # arcsec
                output_artifact_name=CATALOG_MARGIN_CACHE_NAME,
            )
            pipeline_with_client(margin_cache_args, client)
###########################################################################################
else:
    print('You selected not to run the pipeline.') 

# Analysing the outputs

## Loading the catalog with the margin cache

Define if you want to load the full catalog or just a region in the sky.

In [None]:
# True: load the whole catalog. False: load only the region selected with
# the box search defined in the next code cell.
load_full_catalog_with_margin_cache = True

If you choose ```False```, select the region of the sky.

In [None]:
# Uncomment and edit the lines below when load_full_catalog_with_margin_cache
# is False: the loading cell then reads the names ra, dec and box.
#ra=(48, 76)
#dec=(-45, -26)
#box = BoxSearch(ra=ra, dec=dec)

Loading the catalog.

In [None]:
# Options common to both loading modes: attach the margin cache and read
# only the two coordinate columns.
margin_read_options = {
    'margin_cache': CATALOG_MARGIN_CACHE_DIR,
    'columns': [hats_input_catalog_ra, hats_input_catalog_dec],
}

if load_full_catalog_with_margin_cache:
    loading_message = 'You choose to load the full catalog and margin cache.'
else:
    loading_message = f'You choose to load a limited portion of the catalog and its margin cache. They were limited by the box search in the region R.A {ra} and DEC {dec}.'
    # Restrict the read to the selected sky region.
    margin_read_options['search_filter'] = box

catalog_with_margin = lsdb.read_hats(hats_input_catalog, **margin_read_options)

if save_the_info or show_info_inline:
    margin_size_line = f"Margin size: {catalog_with_margin.margin.hc_structure.catalog_info.margin_threshold} arcsec \n"

if save_the_info:
    with open(f'{logs_dir}/margin_cache_catalog_info_{current_date}.txt', 'a') as f:
        f.write(loading_message)
        f.write("\n " + margin_size_line)

if show_info_inline:
    print(loading_message, '\n')
    print(margin_size_line)
    print(catalog_with_margin.margin)

## Getting the number of rows and columns

In [None]:
# The margin cache is opened twice: the lsdb object is used below for the
# column list, the hats object for the pixel plot and the row count.
loaded_margin_cache_from_disk_lsdb = lsdb.read_hats(CATALOG_MARGIN_CACHE_DIR)
loaded_margin_cache_from_disk_hats = hats.read_hats(CATALOG_MARGIN_CACHE_DIR)

In [None]:
# Draw the pixel map of the margin cache; additionally save it when requested.
plot_pixels(loaded_margin_cache_from_disk_hats)
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    plt.savefig(f"{logs_dir}/margin_cache_pixels_plot_{current_date}.png")

In [None]:
# Row count comes from the hats catalog metadata, columns from the lsdb object.
margin_cache_total_rows = loaded_margin_cache_from_disk_hats.catalog_info.total_rows
margin_cache_total_columns = loaded_margin_cache_from_disk_lsdb.columns.to_list()

if show_info_inline:
    print(f"Margin cache path: {CATALOG_MARGIN_CACHE_DIR} \n")
    print(f"Total number of rows: {margin_cache_total_rows}\n")
    print(f"Total number of columns: {len(margin_cache_total_columns)}\n\n")

if save_the_info:
    margin_summary_lines = [
        f"Margin cache path: {CATALOG_MARGIN_CACHE_DIR}\n",
        f"Total number of rows: {margin_cache_total_rows}\n",
        f"Total number of columns: {len(margin_cache_total_columns)}\n\n",
    ]
    # Append mode: repeated executions accumulate in the same file.
    with open(f'{logs_dir}/margin_cache_total_len_of_catalog_{current_date}.txt', 'a') as f:
        f.writelines(margin_summary_lines)

## Plotting the margin cache

Defining a function to plot the points in a pixel and the pixel boundary.

In [None]:
def plot_points(
    pixel_dfs, order, pixel, colors, ra_columns, dec_columns, xlim=None, ylim=None, markers=None, alphas=None, save_path=None
):
    """Scatter several point sets on top of the outline of one HEALPix pixel.

    Parameters
    ----------
    pixel_dfs : sequence of DataFrame-like
        One table of points per set to draw.
    order, pixel : int
        HEALPix order and pixel number (NESTED scheme) of the outline to draw.
    colors, ra_columns, dec_columns : sequence
        Colour and coordinate column names for each entry of ``pixel_dfs``.
    xlim, ylim : pair, optional
        Axis limits. By default they are the outline's extent plus a margin
        of 2 (in the units of the axes).
    markers, alphas : sequence, optional
        Marker and opacity for each entry; default to ``"+"`` and ``1``.
    save_path : str, optional
        When given, the figure is saved there before being shown.

    Notes
    -----
    The per-set sequences are combined with ``zip``, so sets beyond the
    shortest sequence are silently not drawn.
    NOTE(review): the outline longitudes come straight from ``hp.vec2dir``;
    confirm they use the same range convention as the catalog's RA column.
    """
    axes = plt.subplot()

    # Outline of the pixel, sampled with 100 points per side.
    boundary = hp.vec2dir(
        hp.boundaries(hp.order2nside(order), pixel, step=100, nest=True), lonlat=True
    )
    lon, lat = boundary[0], boundary[1]
    outline = Polygon(
        np.column_stack((lon.ravel(), lat.ravel())),
        closed=True,
        edgecolor="#3b81db",
        facecolor="none",
    )
    axes.add_patch(outline)

    # Per-set defaults: "+" markers, fully opaque.
    markers = ["+"] * len(pixel_dfs) if markers is None else markers
    alphas = [1] * len(pixel_dfs) if alphas is None else alphas

    # One scatter layer per point set.
    for frame, color, ra_name, dec_name, marker, alpha in zip(
        pixel_dfs, colors, ra_columns, dec_columns, markers, alphas
    ):
        axes.scatter(
            frame[ra_name].to_numpy(),
            frame[dec_name].to_numpy(),
            c=color,
            marker=marker,
            linewidths=1,
            alpha=alpha,
        )

    # Default view: the outline's bounding box, widened on every side.
    view_margin = 2
    if xlim is None:
        xlim = (np.min(lon) - view_margin, np.max(lon) + view_margin)
    if ylim is None:
        ylim = (np.min(lat) - view_margin, np.max(lat) + view_margin)

    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.xlabel("ra")
    plt.ylabel("dec")

    # Save before showing the figure.
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches="tight")

    plt.show()

Plotting the points for the first pixel in the info_frame.

In [None]:
# Partition table of the loaded catalog; its Norder and Npix columns are
# used below to choose the pixel to plot.
catalog_with_margin_cache_info_frame = catalog_with_margin.hc_structure.partition_info.as_dataframe()

In [None]:
def get_valid_partition(catalog_with_margin, orders, pixels):
    """Return the first (order, pixel) pair that loads from the catalog and its margin.

    ``orders`` and ``pixels`` are walked in lockstep. A pair is accepted when
    computing its partition succeeds for both ``catalog_with_margin`` and
    ``catalog_with_margin.margin``; a ``ValueError`` from either lookup marks
    the pair as invalid and the search moves on. Any other exception propagates.

    Raises:
        ValueError: If no pair can be loaded from both catalogs.
    """

    def _both_partitions_load(norder, npix):
        # Computing the partitions is the validity probe; the results are discarded.
        try:
            catalog_with_margin.get_partition(norder, npix).compute()
            catalog_with_margin.margin.get_partition(norder, npix).compute()
        except ValueError:
            return False
        return True

    for candidate in zip(orders, pixels):
        if _both_partitions_load(*candidate):
            return candidate
    raise ValueError("Nenhum pixel válido encontrado no catálogo.")


# Candidate (order, pixel) pairs, in the row order of the partition table.
orders = catalog_with_margin_cache_info_frame['Norder']
pixels = catalog_with_margin_cache_info_frame['Npix']

# Only the lookup is guarded: a ValueError here means "no loadable pixel".
# Errors raised while plotting are left to surface instead of being printed
# as if no valid pixel had been found.
try:
    order, pixel = get_valid_partition(catalog_with_margin, orders, pixels)
except ValueError as e:
    print(e)
else:
    print(f'Plotting points for healpix of order {order} and pixel {pixel}. \n')

    # The plot is written to the logs directory only when saving is enabled.
    save_path = None
    if save_the_info:
        current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
        save_path = f"{logs_dir}/margin_cache_order_{order}_pixel_{pixel}_plot_{current_date}.png"

    # Catalog points are drawn in green, margin cache points in red.
    plot_points(
        [
            catalog_with_margin.get_partition(order, pixel).compute(),
            catalog_with_margin.margin.get_partition(order, pixel).compute(),
        ],
        order,
        pixel,
        ["green", "red"],
        [hats_input_catalog_ra, hats_input_catalog_ra],
        [hats_input_catalog_dec, hats_input_catalog_dec],
        save_path=save_path,
    )

Below, you can personalize the plot if you want.

In [None]:
# Switch for the customized plot: the cells below that test this flag only act when it is True.
plot_personalized_plot = True

First, take a look at the info_frame dataframe to find a pixel order and number.

In [None]:
# Show the first five rows of the partition table so a pixel can be picked by hand.
catalog_with_margin_cache_info_frame.head(5)

Then, select the pixel you want to plot.

In [None]:
# Hand-picked HEALPix pixel for the customized plot; order and pixel are only
# overridden when the customized plot is enabled.
if plot_personalized_plot:
    order = 3
    pixel = 535

Now, select a region of the sky to restrict the x and y axes.

In [None]:
restrict_x_and_y_axis = True

if plot_personalized_plot:
    # None makes plot_points fall back to the pixel extent plus its default margin.
    xlim = [72.5, 74.5] if restrict_x_and_y_axis else None
    ylim = [-37, -34] if restrict_x_and_y_axis else None

Select an alpha level for the catalog and its margin cache.

In [None]:
select_alpha_levels = True

if plot_personalized_plot:
    # One opacity per plotted dataframe, in plotting order (catalog, then margin cache).
    # None makes plot_points use full opacity for both.
    alphas = [0.1, 1.0] if select_alpha_levels else None

If you want to save the plot, define the output path.

In [None]:
# Output file for the customized plot; None means the plot is only displayed.
save_path = None
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    save_path = f"{logs_dir}/margin_cache_order_{order}_pixel_{pixel}_personalized_plot_{current_date}.png"

Plotting the points from the specified pixel in green, and from the pixel's margin cache in red.

In [None]:
if plot_personalized_plot:
    print(f'Plotting points for healpix of order {order} and pixel {pixel}. \n')

    # Catalog points are drawn in green, margin cache points in red, using the
    # axis limits, opacities and output path chosen in the cells above.
    plot_points(
        [
            catalog_with_margin.get_partition(order, pixel).compute(),
            catalog_with_margin.margin.get_partition(order, pixel).compute(),
        ],
        order,
        pixel,
        ["green", "red"],
        [hats_input_catalog_ra, hats_input_catalog_ra],
        [hats_input_catalog_dec, hats_input_catalog_dec],
        xlim=xlim,
        ylim=ylim,
        alphas=alphas,
        save_path=save_path,
    )

## Summarize pixels and sizes of margin cache

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
catalog_dir = CATALOG_MARGIN_CACHE_DIR

catalog = hats.read_hats(catalog_dir)

info_frame = catalog.partition_info.as_dataframe()

# Size in bytes of each partition file, in the same row order as info_frame.
# Built as one column assignment so the column exists even for an empty table
# and no row-by-row writes (with their dtype conversions) are needed.
info_frame["size_on_disk"] = [
    os.path.getsize(
        hats.io.paths.pixel_catalog_file(catalog_dir, HealpixPixel(norder, npix))
    )
    for norder, npix in zip(info_frame["Norder"], info_frame["Npix"])
]

info_frame = info_frame.astype(int)
# Bytes -> GiB (1024**3 bytes).
info_frame["gbs"] = info_frame["size_on_disk"] / (1024 * 1024 * 1024)

In [None]:
def _size_summary_lines(frame, precision):
    """Return the partition-size summary of ``frame`` as a list of text lines.

    The orders come from the ``Norder`` column, the partition count from the
    ``Npix`` column and the size figures from the ``gbs`` column, formatted
    with ``precision`` decimals.
    """
    sizes = frame["gbs"]
    smallest = sizes.min()
    largest = sizes.max()
    return [
        f'healpix orders: {frame["Norder"].unique()}',
        f'num partitions: {len(frame["Npix"])}',
        "------",
        f"min size_on_disk: {smallest:.{precision}f}",
        f"max size_on_disk: {largest:.{precision}f}",
        f"size_on_disk ratio: {largest / smallest:.{precision}f}",
        f"total size_on_disk: {sizes.sum():.{precision}f}",
    ]


if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    with open(f"{logs_dir}/margin_cache_summarize_pixels_{current_date}.txt", "w") as file:
        # The log file keeps 8 decimals; the inline report below uses 7.
        file.writelines(f"{line}\n" for line in _size_summary_lines(info_frame, 8))
if show_info_inline:
    for line in _size_summary_lines(info_frame, 7):
        print(line)

## File size distribution of margin cache

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
def _draw_file_size_histogram(frame):
    """Draw the histogram of the ``gbs`` column of ``frame`` on the current figure."""
    plt.hist(frame["gbs"], edgecolor='black')
    plt.xlabel("File size (GB)")
    plt.ylabel("Number of files")


def _file_size_bucket_lines(frame):
    """Return one text line per size class with its file count and percentage.

    Percentages are relative to the number of rows in ``frame``. Sizes outside
    the 0-100 range covered by the bins are not counted in any class.
    """
    bins = [0, 0.5, 1, 2, 100]
    labels = ["small-ish", "sweet-spot", "big-ish", "too-big"]
    hist = np.histogram(frame["gbs"], bins=bins)[0]
    pcts = hist / len(frame)
    return [
        f"{label} \t: {count} \t({fraction*100:.1f} %)"
        for label, count, fraction in zip(labels, hist, pcts)
    ]


if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    _draw_file_size_histogram(info_frame)
    plt.savefig(f"{logs_dir}/margin_cache_file_size_histogram_{current_date}.png")
    # Close the figure so the saved histogram is not drawn again inline.
    plt.close()

    with open(f"{logs_dir}/margin_cache_file_size_distribution_{current_date}.txt", "w") as file:
        file.writelines(f"{line}\n" for line in _file_size_bucket_lines(info_frame))

if show_info_inline:
    _draw_file_size_histogram(info_frame)
    plt.show()

    for line in _file_size_bucket_lines(info_frame):
        print(line)

# Closing the cluster

In [None]:
# Shut down the Dask client first, then the cluster it was connected to.
if close_the_cluster:
    client.close()
    cluster.close()