<img align='left' src = '../../images/linea.png' width=150 style='padding: 20px'> 

# Report HATS
## Doing the crossmatching for DP0.2 (Object) Skinny VS DP0.1 Truth Random Sample catalogs

Performance report on LINCC libraries.

Contacts: Luigi Silva ([luigi.silva@linea.org.br](mailto:luigi.silva@linea.org.br)); Julia Gschwend ([julia@linea.org.br](mailto:julia@linea.org.br)).

Last check: 08/11/2024

#### Acknowledgments

'_This notebook used computational resources from the Associação Laboratório Interinstitucional de e-Astronomia (LIneA) with financial support from the INCT of e-Universe (Process No. 465376/2014-2)._'

'_This notebook uses libraries from the LSST Interdisciplinary Network for Collaboration and Computing (LINCC) Frameworks project, such as the hats, hats_import, and lsdb libraries. The LINCC Frameworks project is supported by Schmidt Sciences. It is also based on work supported by the National Science Foundation under Grant No. AST-2003196. Additionally, it receives support from the DIRAC Institute at the Department of Astronomy of the University of Washington. The DIRAC Institute is supported by gifts from the Charles and Lisa Simonyi Fund for Arts and Sciences and the Washington Research Foundation._'

# Imports

Let us import the packages that we will need.

In [None]:
########################### GENERAL ##########################
import os
import gc
import re
import sys
import glob
import time
import math
import getpass
import warnings
import tables_io
import subprocess
import numpy as np
import pandas as pd
import healpy as hp
from pathlib import Path
from datetime import datetime
############################ DASK ############################
import dask
from dask import dataframe as dd
from dask import delayed
from dask.distributed import Client, performance_report, wait
import dask_jobqueue
from dask_jobqueue import SLURMCluster
########################## HATS ###########################
import hats
from hats.inspection.visualize_catalog import plot_pixels
from hats.pixel_math import HealpixPixel
########################## HATS IMPORT ###########################
import hats_import
from hats_import.catalog.file_readers import ParquetReader, FitsReader
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from hats_import.pipeline import ImportArguments, pipeline_with_client
############################ LSDB ############################
import lsdb
from lsdb.core.search import BoxSearch
######################## VISUALIZATION #######################
### BOKEH
import bokeh
from bokeh.io import output_notebook, show
from bokeh.models import ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.palettes import Viridis256

### HOLOVIEWS
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade, rasterize, dynspread

### GEOVIEWS
import geoviews as gv
import geoviews.feature as gf
from cartopy import crs

### DATASHADER
import datashader as ds
from datashader.colors import viridis

### MATPLOTLIB
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
########################## ASTRONOMY #########################
from astropy.io import fits
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.units.quantity import Quantity

Defining the plots to be inline.

In [None]:
# Select the Bokeh backend for both HoloViews and GeoViews.
hv.extension('bokeh')
gv.extension('bokeh')
# Send Bokeh output to the notebook.
output_notebook()
# IPython magic: render Matplotlib figures inline in the notebook.
%matplotlib inline

Printing the versions of the libraries.

In [None]:
### Getting hats version manually, because it has no __version__ attribute.
# Start with no version so that a failed lookup can never leave `hats_version`
# undefined for the prints below.
hats_version = None

try:
    # Ask conda for the package listing of the environment used by this notebook.
    result = subprocess.run(
        ["conda", "run", "-p", "/lustre/t0/scratch/users/luigi.silva/hats_env_081124", "conda", "list", "hats"],
        stdout=subprocess.PIPE, text=True
    )
except OSError:
    # The conda executable could not be launched (e.g. it is not on PATH).
    result = None

if result is not None:
    for line in result.stdout.splitlines():
        # The version is the second whitespace-separated field of the row; the
        # trailing space in the prefix requires the name to be exactly "hats".
        if line.startswith("hats "):
            hats_version = line.split()[1]
            break

if hats_version is None:
    # Fall back to the metadata of the distribution installed in the running
    # interpreter, and finally to a placeholder.
    from importlib import metadata

    try:
        hats_version = metadata.version("hats")
    except metadata.PackageNotFoundError:
        hats_version = "unknown"

### Printing the versions.
print(f'python version: {sys.version}')
print(f'numpy version: {np.__version__}')
print(f'dask version: {dask.__version__}')
print(f'dask_jobqueue version: {dask_jobqueue.__version__}')
print(f'hats version: {hats_version}')
print(f'hats_import version: {hats_import.__version__}')
print(f'lsdb version: {lsdb.__version__}')

# Configurations

## Running configurations

Set the configuration for this run.

In [None]:
# Run the cross-matching pipeline? When False, an existing output is analysed instead.
run_the_pipeline = False

# Save the stdout/stderr files of every job submitted by the Dask SLURMCluster?
save_the_dask_jobs_info = True

# Save the general information about this run (main library versions,
# input file sizes, scontrol job info, output file sizes)?
save_the_info = True

# Print that information inline as well?
show_info_inline = True

# Close the client and the cluster at the end?
close_the_cluster = True

If you choose not to run the pipeline, give the path to an existing x-matching.

In [None]:
# Existing cross-matching output, analysed when the pipeline is not run.
OUTPUT_HATS_DIR = "/lustre/t1/cl/lsst/pz_project/test_data/dp01_truth_random_sample_hats_x_skinny_hats"

## Catalogs paths configurations

First of all, what catalog do you want to use as 'left'? Set ```True``` for using the first catalog as 'left', or ```False``` for using the second catalog as 'left'. The left catalog doesn't need a margin cache, however you need to define the margin_cache variable below anyway, as ```None``` for example.

In [None]:
# True: the first catalog is the 'left' side of the cross-match; False: the second one is.
use_first_catalog_as_left = False

Defining the path to the first HATS catalog and its margin cache.

In [None]:
# First catalog: parent directory, catalog directory name and margin cache path.
hats_first_catalog_path = "/lustre/t1/cl/lsst/dp02/secondary/catalogs"
hats_first_catalog_name = "skinny_hats"
hats_first_margin_cache = "/lustre/t1/cl/lsst/pz_project/test_data/dp02_skinny_margin_cache"

Defining the path to the second HATS catalog and its margin cache.

In [None]:
# Second catalog: parent directory, catalog directory name and margin cache path.
hats_second_catalog_path = "/lustre/t1/cl/lsst/pz_project/test_data"
hats_second_catalog_name = "dp01_truth_random_sample_hats"
hats_second_margin_cache = "/lustre/t1/cl/lsst/pz_project/test_data/dp01_truth_random_sample_margin_cache"

Defining the names of key columns.

In [None]:
# Identifier, right-ascension and declination column names of the first catalog.
id_first_name, ra_first_name, dec_first_name = "objectId", "coord_ra", "coord_dec"

# Same three columns for the second catalog.
id_second_name, ra_second_name, dec_second_name = "id", "ra", "dec"

Defining the OUTPUT catalog path and name.

In [None]:
# The output parent directory is only needed when the pipeline actually runs.
if run_the_pipeline:
    x_matching_path = "/lustre/t1/cl/lsst/pz_project/test_data"

Defining the USER base path, used for saving logs, graphs and other information about the run.

In [None]:
# A per-user report directory is only needed when something is going to be saved.
if save_the_info or save_the_dask_jobs_info:
    user = getpass.getuser()
    user_base_path = f'/lustre/t0/scratch/users/{user}/report_hats/DP02-skinny-VS-DP01-truth-random-sample'

### Creating directories

In [None]:
if save_the_info or save_the_dask_jobs_info:
    os.makedirs(user_base_path, exist_ok=True)

    # Timestamp (minute resolution) that names this run's directory.
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

    # Layout: <user_base_path>/run_hats_<timestamp>/logs/dask_logs
    run_path = os.path.join(user_base_path, f'run_hats_{current_date}')
    logs_dir = os.path.join(run_path, 'logs')
    dask_logs_dir = os.path.join(logs_dir, 'dask_logs')

    # Create each level, outermost first.
    for directory in (run_path, logs_dir, dask_logs_dir):
        os.makedirs(directory, exist_ok=True)

## Cluster configurations

Do you want to customize extra dask parameters?

In [None]:
# Set to True to apply the custom Dask memory settings defined in the next code cell.
extra_dask_configs = False

If you choose ```True```, see the explanation of the parameters and customize them below.

**Explanation of Parameters**

* ```distributed.worker.memory.target```: sets the memory limit before Dask attempts to release memory from completed tasks. At the specified percentage, Dask will start memory collection earlier, reducing the risk of excessive accumulation.

* ```distributed.worker.memory.spill```: defines the point at which Dask starts spilling data to disk (swap) instead of keeping it in RAM. This helps free up memory for new tasks.

* ```distributed.worker.memory.pause```: when memory usage reaches the specified percentage, Dask will temporarily pause the worker to prevent excessive resource use.

* ```distributed.worker.memory.terminate```: if memory usage reaches the specified percentage, the worker will be restarted, which prevents crashes and helps keep usage under control.

* ```distributed.worker.memory.recent-to-old```: determines the fraction of recently accessed data Dask considers as “old” and, therefore, eligible for spilling to disk. A lower percentage (e.g., 0.2 for 20%) means only the most recent data is retained in RAM, while older data is more likely to be released, helping to manage cache memory efficiently.

In [None]:
if not extra_dask_configs:
    print("Running DASK with the standard memory configuration.")
else:
    # Custom worker-memory thresholds, expressed as fractions of the memory limit.
    dask_config = {
        "distributed.worker.memory.target": 0.75,
        "distributed.worker.memory.spill": 0.85,
        "distributed.worker.memory.pause": 0.92,
        "distributed.worker.memory.terminate": 0.98,
        "distributed.worker.memory.recent-to-old": 0.2,
    }

    # Register the values in the global Dask configuration.
    dask.config.set(dask_config)

Defining the configurations for the cluster.

In [None]:
# Parameters handed to SLURMCluster when the cluster is started.
interface = "ib0"
queue = 'cpu_small'
cores = 48
processes = 2
memory = '114GB'
walltime = '04:00:00'

# Where SLURM writes each job's stdout/stderr: the Dask logs directory when
# they are kept, /dev/null otherwise.
if save_the_dask_jobs_info:
    _job_stdout = f'--output={dask_logs_dir}/dask_job_%j_{current_date}.out'
    _job_stderr = f'--error={dask_logs_dir}/dask_job_%j_{current_date}.err'
else:
    _job_stdout = '--output=/dev/null'
    _job_stderr = '--error=/dev/null'

job_extra_directives = ['--propagate', _job_stdout, _job_stderr]

# Number of SLURM jobs requested when scaling the cluster.
number_of_nodes = 20

Starting the cluster.

In [None]:
# Refresh the timestamp used to name the files written from here on.
current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

# Configuring the SLURMCluster with the parameters defined in the previous cell.
cluster = SLURMCluster(
    interface=interface,         # Network interface name ("ib0" in the configuration cell)
    queue=queue,                 # Name of the SLURM queue
    cores=cores,                 # Number of cores per job
    processes=processes,         # Number of dask worker processes per job
    memory=memory,               # Memory per job
    walltime=walltime,           # Maximum execution time
    job_extra_directives=job_extra_directives,
)

# Request `number_of_nodes` jobs from the scheduler.
cluster.scale(jobs=number_of_nodes)

# Defining the dask client
client = Client(cluster)

# Block until every expected worker (jobs x processes per job) has connected.
cluster.wait_for_workers(n_workers=number_of_nodes*processes)
# Run a garbage collection on the workers before any work is submitted.
client.run(lambda: gc.collect())

Showing information about the cluster.

In [None]:
# Keep a handle to the cluster behind the client; the bare expression on the
# last line makes the notebook display it.
cluster_info = client.cluster
cluster_info

Saving the requested resources.

In [None]:
if save_the_info:

    # Memory-related Dask settings reported in their own section.
    memory_param_names = [
        "distributed.worker.memory.target",
        "distributed.worker.memory.spill",
        "distributed.worker.memory.pause",
        "distributed.worker.memory.terminate",
        "distributed.worker.memory.recent-to-old",
        "distributed.worker.memory.recent-to-old-time",
    ]

    # dask.config.get resolves dotted keys and returns the default when any
    # level of the path is missing, so unset parameters are reported as None.
    memory_params = {
        name: dask.config.get(name, default=None) for name in memory_param_names
    }

    # Resources requested from SLURM for this run.
    requested_resources = {
        "interface": f"{interface}",
        "queue": f"{queue}",
        "cores": cores,
        "processes": processes,
        "memory": f"{memory}",
        "walltime": f"{walltime}",
        "job_extra_directives": job_extra_directives,
        "number_of_nodes": number_of_nodes,
    }

    # Complete Dask configuration, dumped in the last section of the report.
    dask_config = dask.config.config

    # Requested resources section
    output = ["# Requested resources"]
    output.extend(f"{key}={value}" for key, value in requested_resources.items())

    # Memory configuration section
    output.append("\n# Dask memory configuration:")
    output.extend(f'"{key}": {value}' for key, value in memory_params.items())

    # Section with all Dask configurations: top-level dictionaries become
    # "[section]" headers followed by their first-level entries.
    output.append("\n# Dask all configurations:")
    for section, config in dask_config.items():
        if isinstance(config, dict):
            output.append(f"[{section}]")
            output.extend(f"{key}: {value}" for key, value in config.items())
        else:
            output.append(f"{section}: {config}")

    # Saving the report next to the other logs of this run.
    with open(f'{logs_dir}/requested_resources_info.txt', 'w') as f:
        f.write("\n".join(output))

    print("Informations saved in requested_resources_info.txt")

# Preview of the input catalogs

## Loading the catalogs

Loading the input catalogs with hats.

In [None]:
# Settings of each configured catalog, in a fixed order:
# (name, parent path, margin cache, id column, ra column, dec column).
_first_settings = (
    hats_first_catalog_name, hats_first_catalog_path, hats_first_margin_cache,
    id_first_name, ra_first_name, dec_first_name,
)
_second_settings = (
    hats_second_catalog_name, hats_second_catalog_path, hats_second_margin_cache,
    id_second_name, ra_second_name, dec_second_name,
)

# Decide which configured catalog plays the 'left' role of the cross-match.
if use_first_catalog_as_left:
    _left_settings, _right_settings = _first_settings, _second_settings
else:
    _left_settings, _right_settings = _second_settings, _first_settings

# Left catalog: open it with both lsdb and hats.
(hats_left_catalog_name, _left_parent, hats_left_margin_cache,
 id_left_name, ra_left_name, dec_left_name) = _left_settings
left_catalog_complete_dir = Path(_left_parent)/Path(hats_left_catalog_name)
left_catalog_from_disk_lsdb = lsdb.read_hats(left_catalog_complete_dir)
left_catalog_from_disk_hats = hats.read_hats(left_catalog_complete_dir)

# Right catalog: same treatment.
(hats_right_catalog_name, _right_parent, hats_right_margin_cache,
 id_right_name, ra_right_name, dec_right_name) = _right_settings
right_catalog_complete_dir = Path(_right_parent)/Path(hats_right_catalog_name)
right_catalog_from_disk_lsdb = lsdb.read_hats(right_catalog_complete_dir)
right_catalog_from_disk_hats = hats.read_hats(right_catalog_complete_dir)

## Making the pixels plot

Making the pixels plot for the left catalog.

In [None]:
# The timestamp is only needed when the figure is saved.
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

# The plot itself is always drawn.
plot_pixels(left_catalog_from_disk_hats)

if save_the_info:
    plt.savefig(f"{logs_dir}/input_left_catalog_pixels_plot_{current_date}.png")

Making the pixels plot for the right catalog.

In [None]:
# The timestamp is only needed when the figure is saved.
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

# The plot itself is always drawn.
plot_pixels(right_catalog_from_disk_hats)

if save_the_info:
    plt.savefig(f"{logs_dir}/input_right_catalog_pixels_plot_{current_date}.png")

## Summarize pixels and sizes
* "healpix orders: distinct healpix orders represented in the partitions

* num partitions: total number of partition files

Size on disk data - using the file sizes fetched above, check the balance of your data. If your rows are fixed-width (e.g. no nested arrays, and few NaNs), the ratio here should be similar to the ratio above. If they’re very different, and you experience problems when parallelizing operations on your data, you may consider re-structuring the data representation.

* min size_on_disk: smallest file (in GB)

* max size_on_disk: largest file size (in GB)

* size_on_disk ratio: max/min

total size_on_disk: sum of all parquet catalog files (actual catalog size may vary due to other metadata files)"

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
def _partition_size_frame(catalog_dir):
    """Read the HATS catalog at *catalog_dir* and measure its partition files.

    Returns a ``(catalog, info_frame)`` pair. ``info_frame`` is the catalog's
    partition-info table cast to integers, extended with ``size_on_disk``
    (file size in bytes) and ``gbs`` (the same size divided by 1024**3).
    """
    catalog = hats.read_hats(catalog_dir)
    info_frame = catalog.partition_info.as_dataframe()

    # One file per (Norder, Npix) partition; sizes are collected column-wise
    # instead of through a row-by-row .loc assignment.
    info_frame["size_on_disk"] = [
        os.path.getsize(
            hats.io.paths.pixel_catalog_file(catalog_dir, HealpixPixel(norder, npix))
        )
        for norder, npix in zip(info_frame["Norder"], info_frame["Npix"])
    ]

    info_frame = info_frame.astype(int)
    info_frame["gbs"] = info_frame["size_on_disk"] / (1024 * 1024 * 1024)
    return catalog, info_frame


############################################################################################
left_catalog, left_output_info_frame = _partition_size_frame(left_catalog_complete_dir)

############################################################################################
right_catalog, right_output_info_frame = _partition_size_frame(right_catalog_complete_dir)

In [None]:
#############################################################
def _input_summary_lines(side, name, frame):
    """Return the summary lines (without line endings) for one input catalog."""
    return [
        f'{side} catalog: {name}',
        f'healpix orders: {frame["Norder"].unique()}',
        f'num partitions: {len(frame["Npix"])}',
        "------",
        f'min size_on_disk: {frame["gbs"].min():.6f}',
        f'max size_on_disk: {frame["gbs"].max():.6f}',
        f'size_on_disk ratio: {frame["gbs"].max()/frame["gbs"].min():.6f}',
        f'total size_on_disk: {frame["gbs"].sum():.6f}',
    ]


if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    with open(f"{logs_dir}/input_summarize_pixels_{current_date}.txt", "w") as file:
        # Left catalog information, followed by a blank line.
        file.write("\n".join(_input_summary_lines('Left', hats_left_catalog_name, left_output_info_frame)) + "\n\n")

        # Right catalog information, followed by a blank line.
        file.write("\n".join(_input_summary_lines('Right', hats_right_catalog_name, right_output_info_frame)) + "\n\n")

if show_info_inline:
    # Left catalog information; the extra newline separates it from the right one.
    print("\n".join(_input_summary_lines('Left', hats_left_catalog_name, left_output_info_frame)) + "\n")

    # Right catalog information.
    print("\n".join(_input_summary_lines('Right', hats_right_catalog_name, right_output_info_frame)))

## File size distribution
"Below we look at histograms of file sizes.

In our initial testing, we find that there’s a “sweet spot” file size of 100MB-1GB. Files that are smaller create more overhead for individual reads. Files that are much larger may create slow-downs when cross-matching between catalogs. Files that are much larger can create out-of-memory issues for dask when loading from disk.

The majority of your files should be in the “sweet spot”, and no files in the “too-big” category."

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
def process_file_size_info(info_frame, type_of_files, bins, labels, logs_dir, save=False, show=False):
    """Plot the file-size histogram of a catalog and report its size categories.

    Parameters
    ----------
    info_frame : DataFrame with a ``"gbs"`` column holding one file size per row.
    type_of_files : str, prefix of the names of the files that are written.
    bins : sequence of bin edges passed to ``np.histogram``.
    labels : sequence of category names, one per bin.
    logs_dir : directory that receives the PNG and the text report; only used
        when ``save`` is true.
    save : when true, save the figure and the category report to ``logs_dir``.
    show : when true, show the figure and print the category report.
    """
    # Draw on a fresh figure so that consecutive calls never stack their
    # histograms on top of each other.
    plt.figure()
    plt.hist(info_frame["gbs"], edgecolor='black')
    plt.xlabel("File size (GB)")
    plt.ylabel("Number of files")

    if not (save or show):
        return

    # The category counts are computed once and shared by both outputs.
    hist = np.histogram(info_frame["gbs"], bins=bins)[0]
    total_files = len(info_frame)
    # An empty frame yields 0 % everywhere instead of a division by zero.
    pcts = hist / total_files if total_files else np.zeros(len(hist))
    report_lines = [
        f"{label} \t: {hist[i]} \t({pcts[i]*100:.1f} %)"
        for i, label in enumerate(labels)
    ]

    if save:
        current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
        plt.savefig(f"{logs_dir}/{type_of_files}_file_size_histogram_{current_date}.png")

        with open(f"{logs_dir}/{type_of_files}_file_size_distribution_{current_date}.txt", "w") as file:
            file.writelines(f"{line}\n" for line in report_lines)

    if show:
        plt.show()

        for line in report_lines:
            print(line)

In [None]:
# Prefixes of the files written for each input catalog.
left_type_of_files = 'input_left'
right_type_of_files = 'input_right'

# Size categories (bin edges in GB) and their labels.
bins = [0, 0.5, 1, 2, 100]
labels = ["small-ish", "sweet-spot", "big-ish", "too-big"]

# `logs_dir` only has to exist as a name for the calls below; it is read only
# when something is saved. An existing value is kept untouched, because the
# pipeline cell still needs the real directory for the Dask performance report.
try:
    logs_dir
except NameError:
    logs_dir = None

Left catalog.

In [None]:
# Histogram and size-category report for the left input catalog.
process_file_size_info(
    left_output_info_frame,
    left_type_of_files,
    bins,
    labels,
    logs_dir,
    save=save_the_info,
    show=show_info_inline,
)

Right catalog.

In [None]:
# Histogram and size-category report for the right input catalog.
process_file_size_info(
    right_output_info_frame,
    right_type_of_files,
    bins,
    labels,
    logs_dir,
    save=save_the_info,
    show=show_info_inline,
)

## Computing the total number of rows and columns

Computing the total number of rows and columns of the input HATS catalogs.

In [None]:
# Column names come from the lsdb view, row totals from the hats catalog info.
left_catalog_total_columns = left_catalog_from_disk_lsdb.columns.to_list()
left_catalog_total_rows = left_catalog_from_disk_hats.catalog_info.total_rows

right_catalog_total_columns = right_catalog_from_disk_lsdb.columns.to_list()
right_catalog_total_rows = right_catalog_from_disk_hats.catalog_info.total_rows

if show_info_inline:
    _catalog_totals = (
        ("Left", left_catalog_complete_dir, left_catalog_total_rows, left_catalog_total_columns),
        ("Right", right_catalog_complete_dir, right_catalog_total_rows, right_catalog_total_columns),
    )
    for _side, _path, _n_rows, _columns in _catalog_totals:
        print(f"{_side} HATS catalog path: {_path} \n")
        print(f"Total number of rows: {_n_rows}\n")
        print(f"Total number of columns: {len(_columns)}\n\n")

In [None]:
if save_the_info:
    _catalog_totals = (
        ("Left", left_catalog_complete_dir, left_catalog_total_rows, left_catalog_total_columns),
        ("Right", right_catalog_complete_dir, right_catalog_total_rows, right_catalog_total_columns),
    )
    # Opened in append mode: an existing file with the same name is extended.
    with open(f'{logs_dir}/input_total_len_of_input_catalogs_{current_date}.txt', 'a') as f:
        for _side, _path, _n_rows, _columns in _catalog_totals:
            f.write(f"{_side} HATS catalog path: {_path}\n")
            f.write(f"Total number of rows: {_n_rows}\n")
            f.write(f"Total number of columns: {len(_columns)}\n\n")

# Saving library and job information

Saving the libraries versions information.

In [None]:
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

    # (label, version) pairs, in the order they appear in the file.
    _versions = [
        ('python', sys.version),
        ('numpy', np.__version__),
        ('dask', dask.__version__),
        ('dask_jobqueue', dask_jobqueue.__version__),
        ('hats', hats_version),
        ('hats_import', hats_import.__version__),
        ('lsdb', lsdb.__version__),
    ]

    with open(f'{logs_dir}/main_lib_versions_{current_date}.txt', 'w') as f:
        for _label, _version in _versions:
            f.write(f'{_label} version: {_version} \n')
    print(f'File saved as: {logs_dir}/main_lib_versions_{current_date}.txt \n')

Defining functions to get information about the jobs running in the cluster.

In [None]:
# Function to collect information about a job using the scontrol show job command
def get_scontrol_job_info(job_id):
    """Return the ``key=value`` fields printed by ``scontrol show job`` as a dict.

    Any bracketed part of *job_id* (``[...]``) is stripped before the command
    is run. Each whitespace-separated token of the output that contains ``=``
    is split at its first ``=``; when a key appears more than once, the last
    value wins.
    """
    bare_job_id = re.sub(r'\[.*?\]', '', job_id)

    completed = subprocess.run(['scontrol', 'show', 'job', bare_job_id], stdout=subprocess.PIPE)
    tokens = completed.stdout.decode('utf-8').split()

    return dict(token.split("=", 1) for token in tokens if "=" in token)

# Function to collect information about all jobs of the user
def get_all_jobs_info_MINE():
    """Collect the ``scontrol`` information of every job of the current user.

    Returns a pandas DataFrame with one row per job id listed by ``squeue``.
    Jobs whose information cannot be collected are reported on stdout and
    skipped.
    """
    # getpass.getuser() is what the rest of the notebook uses; unlike
    # os.getenv('USER') it does not return None when USER is unset.
    user = getpass.getuser()

    # Captures the list of job ids of the user (one per line, no header).
    result = subprocess.run(['squeue', '-u', user, '-h', '-o', '%i'], stdout=subprocess.PIPE)
    job_ids = [
        line.strip()
        for line in result.stdout.decode('utf-8').splitlines()
        if line.strip()
    ]

    # Collects information for each job; get_scontrol_job_info strips any
    # bracketed part of the job id itself.
    jobs_info = []
    for job_id in job_ids:
        try:
            jobs_info.append(get_scontrol_job_info(job_id))
        except Exception as e:
            # Best effort: one failing job must not abort the whole collection.
            print(f"Error processing job {job_id}: {e}")

    # Converts the list of dictionaries into a Pandas DataFrame
    return pd.DataFrame(jobs_info)


# Function to collect information about all jobs that do not belong to the current user
def get_all_jobs_info_NOT_MINE():
    """Collect the ``scontrol`` information of every job owned by other users.

    Returns a pandas DataFrame with one row per job listed by ``squeue`` whose
    owner differs from the current user. Jobs whose information cannot be
    collected are reported on stdout and skipped.
    """
    # Same user lookup as the rest of the notebook; never None.
    current_user = getpass.getuser()

    # Captures the list of jobs as "<job id> <user>" lines, without header.
    result = subprocess.run(['squeue', '-h', '-o', '%i %u'], stdout=subprocess.PIPE)
    job_lines = result.stdout.decode('utf-8').splitlines()

    jobs_info = []
    for line in job_lines:
        fields = line.split()
        # Blank or malformed lines are skipped instead of aborting the scan
        # with an unpacking error.
        if len(fields) != 2:
            continue
        job_id, user = fields

        # Ignores jobs belonging to the current user
        if user == current_user:
            continue

        try:
            # get_scontrol_job_info strips any bracketed part of the job id.
            jobs_info.append(get_scontrol_job_info(job_id))
        except Exception as e:
            # Best effort: one failing job must not abort the whole collection.
            print(f"Error processing job {job_id}: {e}")

    # Converts to DataFrame
    return pd.DataFrame(jobs_info)

Getting my jobs.

In [None]:
# Collects information of all jobs and saves it in the DataFrame
df_jobs_MINE = get_all_jobs_info_MINE()

if show_info_inline==True:
    print(df_jobs_MINE[['JobId','NodeList','NumNodes','NumCPUs','NumTasks','CPUs/Task','TRES']])

Getting other users' jobs.

In [None]:
# Collects information of all jobs and saves it in the DataFrame
df_jobs_NOT_MINE = get_all_jobs_info_NOT_MINE()

if len(df_jobs_NOT_MINE)!=0:
    if show_info_inline==True:
        print(df_jobs_NOT_MINE[['JobId','NodeList','NumNodes','NumCPUs','NumTasks','CPUs/Task','TRES']])
else:
    df_jobs_NOT_MINE_EMPTY_MSG = pd.DataFrame({"EMPTY": ["There are no other jobs running in the cluster."]})
    print("There are no other jobs running in the cluster.")

Saving the data of the jobs in a csv.

In [None]:
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

    file_name_MINE = f'{logs_dir}/jobs_info_MINE_{current_date}.csv'
    file_name_NOT_MINE = f'{logs_dir}/jobs_info_NOT_MINE_{current_date}.csv'

    df_jobs_MINE.to_csv(file_name_MINE, index=False)

    # The placeholder frame is written when no job of other users was found.
    _not_mine_frame = (
        df_jobs_NOT_MINE if len(df_jobs_NOT_MINE) != 0 else df_jobs_NOT_MINE_EMPTY_MSG
    )
    _not_mine_frame.to_csv(file_name_NOT_MINE, index=False)

    print(f'Files saved as: \n')
    for _saved_file in (file_name_MINE, file_name_NOT_MINE):
        print(f'{_saved_file} \n')

# Doing the cross-matching

Doing the cross-matching.

In [None]:
if not run_the_pipeline:
    print('You selected not to run the pipeline.')
else:
    # ---------------------------- Input configuration ----------------------------
    LEFT_HATS_DIR = left_catalog_complete_dir
    LEFT_CATALOG_HATS_NAME = hats_left_catalog_name
    RIGHT_HATS_DIR = right_catalog_complete_dir
    RIGHT_CATALOG_HATS_NAME = hats_right_catalog_name
    RIGHT_MARGIN_CACHE_DIR = hats_right_margin_cache

    # Matching radius in arcseconds and number of neighbours kept per object.
    CROSS_MATCHING_RADIUS = 1.0
    NEIGHBORS_NUMBER = 1

    # ---------------------------- Output configuration ---------------------------
    # Parent directory of the cross-matching output and directory of the logs.
    HATS_DIR = Path(x_matching_path)
    LOGS_DIR = Path(logs_dir)

    XMATCH_NAME = f'{LEFT_CATALOG_HATS_NAME}_x_{RIGHT_CATALOG_HATS_NAME}'
    OUTPUT_HATS_DIR = HATS_DIR / XMATCH_NAME

    # File that receives the Dask performance report.
    PERFORMANCE_REPORT_NAME = f'dask_performance_report_{current_date}.html'
    PERFORMANCE_DIR = LOGS_DIR / PERFORMANCE_REPORT_NAME

    # ---------------------------- Running the pipeline ---------------------------
    # Everything inside the context manager is covered by the performance report.
    with performance_report(filename=PERFORMANCE_DIR):
        left_catalog = lsdb.read_hats(LEFT_HATS_DIR)
        # Only the right catalog is loaded with its margin cache.
        right_catalog = lsdb.read_hats(RIGHT_HATS_DIR, margin_cache=RIGHT_MARGIN_CACHE_DIR)

        # NOTE(review): the suffixes are the bare catalog names, without a
        # leading separator - confirm the resulting column names are intended.
        xmatched = left_catalog.crossmatch(
            right_catalog,
            radius_arcsec=CROSS_MATCHING_RADIUS,
            n_neighbors=NEIGHBORS_NUMBER,
            suffixes=(LEFT_CATALOG_HATS_NAME, RIGHT_CATALOG_HATS_NAME),
        )
        # An existing output at the same location is overwritten.
        xmatched.to_hats(OUTPUT_HATS_DIR, overwrite=True)

# Analysing the outputs

## Loading the catalog

Loading the crossmatching output catalog.

In [None]:
# OUTPUT_HATS_DIR is either the path configured at the top of the notebook or
# the one set by the pipeline cell when the pipeline was run.
output_catalog_complete_dir = OUTPUT_HATS_DIR
# Open the same output with both readers, as done for the input catalogs.
output_catalog_from_disk_lsdb = lsdb.read_hats(output_catalog_complete_dir)
output_catalog_from_disk_hats = hats.read_hats(output_catalog_complete_dir)

# Show which output location is being analysed.
print(OUTPUT_HATS_DIR)

Loading the catalog as a dask dataframe.

In [None]:
# Convert the lsdb catalog to delayed objects and build a Dask DataFrame from them.
output_catalog_from_disk_delayed = output_catalog_from_disk_lsdb.to_delayed()
output_catalog_from_disk_ddf = dd.from_delayed(output_catalog_from_disk_delayed)

## Making the pixels plot

Making the pixels plot.

In [None]:
# The timestamp is only needed when the figure is saved.
if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')

# The plot itself is always drawn.
plot_pixels(output_catalog_from_disk_hats)

if save_the_info:
    plt.savefig(f"{logs_dir}/output_catalog_pixels_plot_{current_date}.png")

## Summarize pixels and sizes
* "healpix orders: distinct healpix orders represented in the partitions

* num partitions: total number of partition files

Size on disk data - using the file sizes fetched above, check the balance of your data. If your rows are fixed-width (e.g. no nested arrays, and few NaNs), the ratio here should be similar to the ratio above. If they’re very different, and you experience problems when parallelizing operations on your data, you may consider re-structuring the data representation.

* min size_on_disk: smallest file (in GB)

* max size_on_disk: largest file size (in GB)

* size_on_disk ratio: max/min

total size_on_disk: sum of all parquet catalog files (actual catalog size may vary due to other metadata files)"

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
# Partition table of the cross-matching output.
catalog = hats.read_hats(OUTPUT_HATS_DIR)

output_info_frame = catalog.partition_info.as_dataframe()

# Size in bytes of the file of each (Norder, Npix) partition, collected
# column-wise and without overwriting the unrelated `result` variable.
output_info_frame["size_on_disk"] = [
    os.path.getsize(
        hats.io.paths.pixel_catalog_file(OUTPUT_HATS_DIR, HealpixPixel(norder, npix))
    )
    for norder, npix in zip(output_info_frame["Norder"], output_info_frame["Npix"])
]

output_info_frame = output_info_frame.astype(int)
# Same sizes divided by 1024**3.
output_info_frame["gbs"] = output_info_frame["size_on_disk"] / (1024 * 1024 * 1024)

In [None]:
def _output_summary_lines(frame):
    """Return the summary lines (without line endings) for the output catalog."""
    return [
        f'healpix orders: {frame["Norder"].unique()}',
        f'num partitions: {len(frame["Npix"])}',
        "------",
        f'min size_on_disk: {frame["gbs"].min():.6f}',
        f'max size_on_disk: {frame["gbs"].max():.6f}',
        f'size_on_disk ratio: {frame["gbs"].max()/frame["gbs"].min():.6f}',
        f'total size_on_disk: {frame["gbs"].sum():.6f}',
    ]


if save_the_info:
    current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
    with open(f"{logs_dir}/output_summarize_pixels_{current_date}.txt", "w") as file:
        file.write("\n".join(_output_summary_lines(output_info_frame)) + "\n")
if show_info_inline:
    print("\n".join(_output_summary_lines(output_info_frame)))

## File size distribution
"Below we look at histograms of file sizes.

In our initial testing, we find that there’s a “sweet spot” file size of 100MB-1GB. Files that are smaller create more overhead for individual reads. Files that are much larger may create slow-downs when cross-matching between catalogs. Files that are much larger can create out-of-memory issues for dask when loading from disk.

The majority of your files should be in the “sweet spot”, and no files in the “too-big” category."

Source: https://hats.readthedocs.io/en/stable/notebooks/catalog_size_inspection.html

In [None]:
def process_file_size_info(info_frame, type_of_files, bins, labels, logs_dir, save=False, show=False):
    """Plot the file-size histogram and report how many files fall into each size bin.

    Args:
        info_frame: Table with a ``"gbs"`` column holding one file size per row
            (plotted on an axis labelled in GB).
        type_of_files: Prefix used in the names of the saved files.
        bins: Bin specification forwarded to ``np.histogram``.
        labels: One label per bin, in the same order as the bins.
        logs_dir: Directory that receives the PNG and the text report; only
            used when ``save`` is true.
        save: When true, write the histogram PNG and the per-bin report to
            ``logs_dir``.
        show: When true, display the figure and print the per-bin report.
            When false, the figure is closed so it neither lingers in pyplot's
            global state nor gets drawn over by a later call.
    """
    sizes = info_frame["gbs"]

    plt.hist(sizes, edgecolor='black')
    plt.xlabel("File size (GB)")
    plt.ylabel("Number of files")
    # Keep a handle on the figure so it can be saved and, if not shown, released.
    figure = plt.gcf()

    # The per-bin report is identical for the file and the inline output: build it once.
    report_lines = []
    if save or show:
        counts = np.histogram(sizes, bins=bins)[0]
        n_files = len(info_frame)
        for label, count in zip(labels, counts):
            # An empty frame would otherwise divide by zero and report "nan %".
            pct = count / n_files * 100 if n_files else 0.0
            report_lines.append(f"{label} \t: {count} \t({pct:.1f} %)")

    if save:
        current_date = datetime.now().strftime('%Y-%m-%d_%H-%M')
        # Save before showing: the figure must still hold the drawn histogram.
        figure.savefig(f"{logs_dir}/{type_of_files}_file_size_histogram_{current_date}.png")

        with open(f"{logs_dir}/{type_of_files}_file_size_distribution_{current_date}.txt", "w") as file:
            file.writelines(f"{line}\n" for line in report_lines)

    if show:
        plt.show()

        for line in report_lines:
            print(line)
    else:
        plt.close(figure)

In [None]:
# Size bins (same unit as the "gbs" column) and their labels for the output catalog files.
type_of_files = 'output'
bins = [0, 0.5, 1, 2, 100]
labels = ["small-ish", "sweet-spot", "big-ish", "too-big"]

process_file_size_info(
    info_frame=output_info_frame,
    type_of_files=type_of_files,
    bins=bins,
    labels=labels,
    # Pass None when nothing is saved, without rebinding the notebook-wide
    # logs_dir: later cells still need the real path if saving is enabled.
    logs_dir=logs_dir if save_the_info else None,
    save=save_the_info,
    show=show_info_inline
)

## Computing the total number of rows and columns

Computing the total number of rows and columns for the output catalog.

In [None]:
# The row count is read from the HATS catalog info, the column names from the LSDB catalog object.
output_catalog_total_rows = output_catalog_from_disk_hats.catalog_info.total_rows
output_catalog_total_columns = output_catalog_from_disk_lsdb.columns.to_list()
n_output_columns = len(output_catalog_total_columns)

if show_info_inline == True:
    print(
        f"HATS catalog path: {output_catalog_complete_dir} \n",
        f"Total number of rows: {output_catalog_total_rows}\n",
        f"Total number of columns: {n_output_columns}\n\n",
        sep="\n",
    )

if save_the_info == True:
    # NOTE(review): current_date is not assigned in this cell; it must already
    # exist from an earlier cell that ran with saving enabled.
    with open(f'{logs_dir}/output_total_len_of_files_{current_date}.txt', 'a') as f:
        f.writelines([
            f"HATS catalog path: {output_catalog_complete_dir}\n",
            f"Total number of rows: {output_catalog_total_rows}\n",
            f"Total number of columns: {n_output_columns}\n\n",
        ])

## Plots

Columns to be used for plotting.

In [None]:
# Output column names are the input column names with the catalog name appended.
id_left_output_name, ra_left_output_name, dec_left_output_name = (
    column_name + hats_left_catalog_name
    for column_name in (id_left_name, ra_left_name, dec_left_name)
)

id_right_output_name, ra_right_output_name, dec_right_output_name = (
    column_name + hats_right_catalog_name
    for column_name in (id_right_name, ra_right_name, dec_right_name)
)

### Plotting a region in the sky

First, select the coordinates of the region for the plot.

In [None]:
# Bounds of the sky region to plot (same units as the catalog R.A./DEC columns).
ra_min, ra_max = 70, 70.5
dec_min, dec_max = -30, -29.5

In [None]:
# Echo the selected region bounds, one per line.
for bound_label, bound_value in (
    ("R.A. min", ra_min),
    ("R.A. max", ra_max),
    ("DEC min", dec_min),
    ("DEC max", dec_max),
):
    print(f"{bound_label}: {bound_value}")

We use the `polygon_search` method from the LSDB library to select this region of interest in each catalog.

In [None]:
# Corners of the rectangular region as [R.A., DEC] pairs.
polygon_coords = [
    [ra_min, dec_max],
    [ra_max, dec_max],
    [ra_max, dec_min],
    [ra_min, dec_min],
]

# Run the same polygon selection on the left, right and output catalogs, in that order.
left_catalog_box, right_catalog_box, output_catalog_box = (
    catalog.polygon_search(polygon_coords).compute()
    for catalog in (
        left_catalog_from_disk_lsdb,
        right_catalog_from_disk_lsdb,
        output_catalog_from_disk_lsdb,
    )
)

Converting the R.A. coordinates to the interval $(-180^{\circ}, 180^{\circ}]$.

In [None]:
def _to_signed_ra(ra_values):
    """Return ``ra_values`` with every entry above 180 shifted down by 360."""
    return np.where(ra_values > 180, ra_values - 360, ra_values)


ra_left = _to_signed_ra(left_catalog_box[ra_left_name])
ra_right = _to_signed_ra(right_catalog_box[ra_right_name])
ra_output = _to_signed_ra(output_catalog_box[ra_right_output_name])

Making the plot with matplotlib.

In [None]:
# Overlay the left, right and cross-matched points of the selected region on one set of axes.
region_fig, region_ax = plt.subplots(figsize=(6, 6))

region_ax.scatter(ra_left, left_catalog_box[dec_left_name], s=10, alpha=0.5, marker="+", color="blue", label="left")
region_ax.scatter(ra_right, right_catalog_box[dec_right_name], s=1, alpha=0.05, color="green", label="right")
region_ax.scatter(ra_output, output_catalog_box[dec_right_output_name], s=5, alpha=0.8, color="red", label="x-matched")

region_ax.set_xlabel("R.A. (deg)")
region_ax.set_ylabel("DEC (deg)")
region_ax.set_title(f'X-matched points - {hats_left_catalog_name} vs {hats_right_catalog_name}')
region_ax.legend(loc='lower right')

# Invert the x-axis so that R.A. increases towards the left.
region_ax.invert_xaxis()

if save_the_info == True:
    region_fig.savefig(f"{logs_dir}/output_small_region_{ra_min}_{ra_max}_{dec_min}_{dec_max}.png")

plt.show()

### Plotting all the points

Selecting only the ID, R.A., DEC and separation-distance columns of the output catalog, and computing the result into memory.

In [None]:
# Columns needed for the plots: ids and coordinates from both sides, plus the match distance.
plot_columns = [
    id_left_output_name, ra_left_output_name, dec_left_output_name,
    id_right_output_name, ra_right_output_name, dec_right_output_name,
    '_dist_arcsec',
]
# Columns that are cast to float64 after the compute.
float_columns = [
    ra_left_output_name,
    dec_left_output_name,
    ra_right_output_name,
    dec_right_output_name,
    '_dist_arcsec',
]

# Materialize only the selected columns as a new DataFrame.
filtered_df = output_catalog_from_disk_ddf[plot_columns].compute()

# Cast the coordinate and distance columns to float64.
filtered_df = filtered_df.astype({column: 'float64' for column in float_columns})

# Smallest and largest separation distance among the matches.
separation_values = filtered_df['_dist_arcsec']
lowest_dist_value = separation_values.min()
highest_dist_value = separation_values.max()

Plotting the spatial distribution of points.

In [None]:
# Build the 2D point set with HoloViews.
# Both axes use the right-catalog coordinates, matching the small-region plot;
# pairing the left R.A. with the right DEC would plot a position that belongs
# to neither object.
points_counts = hv.Points(
    filtered_df,
    kdims=[ra_right_output_name, dec_right_output_name]
)

# Rasterize the points, aggregating by point count
rasterized_counts = rasterize(points_counts, aggregator='count')

# Plot options, with a custom tick format for the color bar
rasterized_counts = rasterized_counts.opts(
    width=750, height=500,
    cmap=viridis,
    colorbar=True,
    colorbar_opts={
        'formatter': PrintfTickFormatter(format="%.0f"),  # Integer format for the counts
        'title': 'Counts',  # Color-bar title
    },
    tools=['hover'], 
    cnorm='linear',  # Linear color normalization for the counts
    xlabel='RA (deg)', ylabel='DEC (deg)',
    fontsize={'xticks': 12, 'yticks': 12, 'xlabel': 14, 'ylabel': 14},
    invert_xaxis=True  # Invert the x-axis only
)

# Save the plot as an HTML file, if requested
if save_the_info:
    output_image_path = os.path.join(logs_dir, 'output_spatial_distribution_counts.html')
    hv.save(rasterized_counts, output_image_path, fmt='html')
    print(f"Plot saved in: {output_image_path}")

# Render and display the plot
hv.output(rasterized_counts)

Plotting the spatial distribution of points and coloring according to the separation distances.

In [None]:
# Build the 2D point set with HoloViews, carrying the separation distance as the value.
# Both axes use the right-catalog coordinates, matching the small-region plot;
# pairing the left R.A. with the right DEC would plot a position that belongs
# to neither object.
points = hv.Points(
    filtered_df,
    kdims=[ra_right_output_name, dec_right_output_name],
    vdims=['_dist_arcsec']
)

# Rasterize the points, aggregating by the mean separation distance
rasterized = rasterize(points, aggregator='mean')

# Plot options, with a custom tick format for the color bar
rasterized = rasterized.opts(
    width=750, height=500,
    cmap=viridis,
    colorbar=True,
    colorbar_opts={
        'formatter': PrintfTickFormatter(format="%.2f"),  # Readable format with 2 decimal places
        'title': 'Dist. (Arcsec)',  # Color-bar title
    },
    tools=['hover'], 
    cnorm='eq_hist',  # Histogram-equalized color normalization
    xlabel='RA (deg)', ylabel='DEC (deg)',
    fontsize={'xticks': 12, 'yticks': 12, 'xlabel': 14, 'ylabel': 14},
    invert_xaxis=True  # Invert the x-axis only
)

# Save the plot as an HTML file, if requested
if save_the_info:
    output_image_path = os.path.join(logs_dir, 'output_spatial_distribution_distances.html') 
    hv.save(rasterized, output_image_path, fmt='html')
    print(f"Plot saved in: {output_image_path}")

# Render and display the plot
hv.output(rasterized)

## Plotting the histogram of distances

In [None]:
# Build a Dataset from the DataFrame
dataset = hv.Dataset(filtered_df, kdims='_dist_arcsec')

# Apply the histogram operation
hist = hv.operation.histogram(dataset, dimension='_dist_arcsec', normed=False, bins=20)

# Upper end of the frequency axis, used to place the y ticks
max_count = int(hist.range(1)[1])
# With fewer than 10 entries in the tallest bin, max_count // 10 is 0 and
# range() would raise ValueError; fall back to a step of 1.
tick_step = max(1, max_count // 10)

# Customize the histogram
hist = hist.opts(
    xlabel='Distance (arcsec)',
    ylabel='Frequency',
    title='Histogram of separation distances',
    color='blue',
    tools=['hover'],
    width=750,
    height=500,
    yticks=[(i, f"{i}") for i in range(0, max_count + 1, tick_step)]  # Plain integer labels on the y-axis
)

if save_the_info==True:
    output_image_path = os.path.join(logs_dir, 'output_separation_distances_histogram.html') 
    hv.save(hist, output_image_path, fmt='html')
    print(f"Plot saved in: {output_image_path}")

# Show the histogram
hist

## Checking for duplicates

In [None]:
# Flag every row whose right-catalog id occurs more than once (all occurrences are flagged)
is_repeated_id = filtered_df.duplicated(subset=[id_right_output_name], keep=False)

# Collect the flagged rows, sorted by ascending right-catalog id
df_duplicates_total = filtered_df[is_repeated_id].sort_values(by=id_right_output_name)

# Keep a single row (the first in sorted order) for each repeated id
df_duplicates_individual = df_duplicates_total.drop_duplicates(subset=[id_right_output_name], keep='first')

# Write both DataFrames to CSV files, if requested
if save_the_info:
    output_csv_path_total = os.path.join(logs_dir, 'output_duplicates_total.csv')
    output_csv_path_individual = os.path.join(logs_dir, 'output_duplicates_individual.csv')

    # All rows involved in a duplication
    df_duplicates_total.to_csv(output_csv_path_total, index=False)
    print(f"DataFrame with total duplicates saved in: {output_csv_path_total}")

    # One row per duplicated id
    df_duplicates_individual.to_csv(output_csv_path_individual, index=False)
    print(f"DataFrame with individual duplicates saved in: {output_csv_path_individual}")

# Report how many rows each DataFrame holds
print("Total duplicates:", len(df_duplicates_total), sep="\n")
print("\nIndividual duplicates: ", len(df_duplicates_individual), sep="\n")

# Closing the cluster

In [None]:
if close_the_cluster == True:
    # Close the client first, then the cluster it is connected to.
    for dask_resource in (client, cluster):
        dask_resource.close()