# Demo Mar 19, 2025

In [1]:
import lsdb
from nested_pandas import NestedDtype

Reading the small sky nested sources catalog:

In [2]:
# Catalog and margin-cache directories (relative paths into the test data).
nested_catalog_dir = 'tests/data/small_sky_order1_nested_sources'
nested_margin_dir = 'tests/data/small_sky_order1_nested_sources_margin'
# Pass the margin directory so the margin is available as `nested_catalog.margin`.
nested_catalog = lsdb.read_hats(nested_catalog_dir, margin_cache=nested_margin_dir)

#### `map_partitions` also updates margins

We currently need to cast nested columns to the appropriate type after invoking `read_hats`:

In [3]:
# Before casting: the margin's "sources" column is still a plain pyarrow struct.
nested_catalog.margin["sources"].dtype

struct<source_id: list<element: int64>, source_ra: list<element: double>, source_dec: list<element: double>, mjd: list<element: double>, mag: list<element: double>, band: list<element: string>, object_ra: list<element: double>, object_dec: list<element: double>>[pyarrow]

In [4]:
def cast_nested(df, columns):
    """Cast the given columns of ``df`` to their nested dtype.

    For every name in ``columns`` the column's current dtype (looked up in
    ``df.dtypes``) is converted with ``NestedDtype.from_pandas_arrow_dtype``
    and the column is cast to that dtype with ``astype``.

    Parameters
    ----------
    df
        Frame holding the columns to convert.
    columns
        Iterable of column names to cast.

    Returns
    -------
    The result of ``df.assign`` with the converted columns replacing the
    originals; columns not listed are passed through untouched.
    """
    converted = {}
    for name in columns:
        nested_dtype = NestedDtype.from_pandas_arrow_dtype(df.dtypes[name])
        converted[name] = df[name].astype(nested_dtype)
    return df.assign(**converted)

# Apply the cast to every partition; only the "sources" column is converted.
nested_catalog = nested_catalog.map_partitions(cast_nested, columns=["sources"])
# Bare last expression: display the resulting catalog.
nested_catalog

Unnamed: 0_level_0,id,ra,dec,ra_error,dec_error,sources
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Order: 1, Pixel: 44",int64[pyarrow],double[pyarrow],double[pyarrow],int64[pyarrow],int64[pyarrow],"nested<source_id: [int64], source_ra: [double]..."
"Order: 1, Pixel: 45",...,...,...,...,...,...
"Order: 1, Pixel: 46",...,...,...,...,...,...
"Order: 1, Pixel: 47",...,...,...,...,...,...


Now `map_partitions` applies the custom user function to the margins as well:

In [5]:
# After map_partitions: the margin's "sources" column carries the nested dtype as well.
nested_catalog.margin["sources"].dtype

<nested_pandas.series.dtype.NestedDtype at 0x127142ea0>

#### `sort_nested_values`

We can also sort the nested data of a catalog according to one of its columns (powered by *nested-pandas*).

In [6]:
# Compute the first rows of the catalog so individual light curves can be inspected.
head = nested_catalog.head()

In [7]:
# Select the light curve (the nested "sources" entry) of the first object.
head["sources"].iloc[0]

Unnamed: 0,source_id,source_ra,source_dec,mjd,mag,band,object_ra,object_dec
0,74859,308.584555,-69.462753,58702.504576,16.338498,i,308.5,-69.5
1,77874,309.001417,-69.498428,58912.758183,19.832633,y,308.5,-69.5
...,...,...,...,...,...,...,...,...
129,85947,308.863222,-68.916876,59480.071192,16.105387,i,308.5,-69.5
130,77334,308.912122,-68.909644,58873.975676,18.780901,i,308.5,-69.5


In [8]:
# Before sorting, the first light curve is not ordered by mjd (expected: False).
head["sources"].iloc[0]["mjd"].is_monotonic_increasing

False

Let's sort all object light curves by mjd and compute again:

In [9]:
# "sources.mjd" addresses the mjd field inside the nested "sources" column.
nested_catalog = nested_catalog.sort_nested_values(by="sources.mjd")
# Recompute the preview so it reflects the sorted light curves.
head = nested_catalog.head()

In [10]:
# Select the first object's light curve again, now taken from the sorted catalog.
head["sources"].iloc[0]

Unnamed: 0,source_id,source_ra,source_dec,mjd,mag,band,object_ra,object_dec
0,70083,308.886403,-69.096314,58369.631492,16.274146,u,308.5,-69.5
1,70684,308.931966,-69.360047,58413.642635,16.703503,i,308.5,-69.5
...,...,...,...,...,...,...,...,...
129,86609,308.766311,-69.345622,59525.228143,20.708147,r,308.5,-69.5
130,87095,308.901041,-69.495312,59557.673848,20.886599,u,308.5,-69.5


In [11]:
# Confirm the first light curve is now ordered by mjd (expected: True).
head["sources"].iloc[0]["mjd"].is_monotonic_increasing

True

Sorting on base (non-nested) columns is not allowed, so the following call raises a `ValueError`:

In [12]:
# "id" is a base column, not a nested one, so this call is expected to fail.
# Catch the error and print it instead of letting the traceback abort the
# notebook, so that "Restart Kernel -> Run All" still completes.
try:
    nested_catalog.sort_nested_values(by="id")
except ValueError as err:
    # Printed as "<ExceptionType>: <message>" to mirror the traceback summary line.
    print(f"{type(err).__name__}: {err}")

ValueError: id not found in nested columns