# Loading catalogs with pyarrow types

In [1]:
# Enable autoreload (mode 2) so imported modules are re-imported before each cell executes.
%reload_ext autoreload
%autoreload 2

import dask
import logging

from dask.distributed import Client

# Dask distributed client: one worker, memory limit "30G", dashboard bound to
# port 38766, and distributed logging silenced below ERROR level.
client = Client(
    n_workers=1,
    memory_limit="30G",
    dashboard_address=':38766',
    silence_logs=logging.ERROR,
)

In [2]:
# Baseline: install LSDB from the `main` branch. This is not pinned to a commit,
# so the installed version depends on when the cell is run (the recorded run
# reported 0.2.2.dev8+g1609fe7 in the next cell).
%pip install -q git+https://github.com/astronomy-commons/lsdb.git@main

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Record which LSDB build the baseline measurements below were taken with.
import lsdb
print(lsdb.__version__)

0.2.2.dev8+g1609fe7


In [4]:
from dask.distributed import performance_report
from IPython.display import IFrame, display

from lsdb.core.search import ConeSearch

def bottleneck_workflow():
    """Read a cone of the ZTF catalog and materialize it in memory.

    Opens the HiPSCat catalog at the hard-coded path, restricted to the
    ``band`` and ``mag`` columns and to a cone of radius ``5 * 3600`` arcsec
    (5 degrees) centred on ``ra=20, dec=-20``. Prints the catalog dtypes,
    then prints the fully computed result.
    """
    cone = ConeSearch(ra=20, dec=-20, radius_arcsec=5 * 3600)
    ztf = lsdb.read_hipscat(
        "/epyc/data3/hipscat/catalogs/ztf_axs/ztf_zource",
        columns=["band", "mag"],
        search_filter=cone,
    )
    print(ztf.dtypes)
    # .compute() triggers the actual Dask execution being profiled.
    print(ztf.compute())
    
def compute_performance(filename):
    """Run ``bottleneck_workflow`` under a Dask performance report and show it.

    Args:
        filename: Base name of the report; ``.html`` is appended to form the
            path the report is written to.

    The report is written when the ``performance_report`` context exits and is
    then embedded in the notebook as a 950x500 IFrame.
    """
    report_path = f"{filename}.html"
    with performance_report(report_path):
        bottleneck_workflow()
    report_frame = IFrame(report_path, width=950, height=500)
    display(report_frame)

We have been loading catalogs with whatever data types the user imported them with. Operating on string columns with Dask has been expensive, largely because Python strings were being converted to pyarrow strings via `to_pyarrow_string` tasks.

In [5]:
# Computing with pyarrow string conversion active
dask.config.set({"dataframe.convert-string": True})
# %time reports the CPU/wall times for this run, matching the other benchmark cells.
%time compute_performance("with_conversion")
# Compute time: 181.35 s

band    string[pyarrow]
mag             float32
dtype: object
                     band        mag
_hipscat_index                      
4701758013566877696     r  20.654202
4701758013566877697     r  20.231649
4701758013566877698     r  19.789747
4701758013566877699     r  20.559649
4701758013566877700     r  20.640318
...                   ...        ...
10281717893410324481    r  21.176765
10281717893410324482    r  21.047438
10281717893410324483    r  20.985826
10281717926935396352    r  21.149563
10281717926935396353    r  20.556887

[144178423 rows x 2 columns]


CPU times: user 4.65 s, sys: 7.34 s, total: 12 s
Wall time: 1min 8s


In [6]:
# Computing with pyarrow string conversion inactive: string columns stay as
# `object` dtype (see the dtypes printed below). The setting is global and
# stays in effect for later cells until it is changed again.
dask.config.set({"dataframe.convert-string": False})
%time compute_performance("with_conversion_disabled")
# Compute time: 88.31 s

band     object
mag     float32
dtype: object
                     band        mag
_hipscat_index                      
4701758013566877696     r  20.654202
4701758013566877697     r  20.231649
4701758013566877698     r  19.789747
4701758013566877699     r  20.559649
4701758013566877700     r  20.640318
...                   ...        ...
10281717893410324481    r  21.176765
10281717893410324482    r  21.047438
10281717893410324483    r  20.985826
10281717926935396352    r  21.149563
10281717926935396353    r  20.556887

[144178423 rows x 2 columns]


CPU times: user 6.92 s, sys: 6.37 s, total: 13.3 s
Wall time: 32.3 s


This was clearly a performance bottleneck, so we decided to update the code to read data using pyarrow types by default. This removes the string conversion tasks and brings additional benefits (e.g. support for nullable types)!

In [7]:
# Install LSDB from the branch that reads data with pyarrow types by default.
# This is not pinned to a commit (the recorded run reported 0.2.2.dev18+g513e5df
# in the next cell).
%pip install -q git+https://github.com/astronomy-commons/lsdb.git@issue/279/default-pyarrow-backend

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Confirm that the newly installed branch build is the one now in use.
import lsdb
print(lsdb.__version__)

0.2.2.dev18+g513e5df


In [9]:
# Computing with the pyarrow-by-default build.
# NOTE: "dataframe.convert-string" is still False here (set in the earlier
# benchmark cell), so no Dask string conversion is applied on top.
# %time reports the CPU/wall times for this run, matching the other benchmark cells.
%time compute_performance("with_pyarrow_types")
# Compute time: 83.81 s

band    string[pyarrow]
mag      float[pyarrow]
dtype: object
                     band        mag
_hipscat_index                      
4701758013566877696     r  20.654202
4701758013566877697     r  20.231649
4701758013566877698     r  19.789747
4701758013566877699     r  20.559649
4701758013566877700     r  20.640318
...                   ...        ...
10281717893410324481    r  21.176765
10281717893410324482    r  21.047438
10281717893410324483    r  20.985826
10281717926935396352    r  21.149563
10281717926935396353    r  20.556887

[144178423 rows x 2 columns]


CPU times: user 4.44 s, sys: 12.1 s, total: 16.5 s
Wall time: 34.1 s


In [10]:
# Close the Dask client created in the first cell.
client.close()