In [1]:
import numpy as np
import pandas as pd

#from: Jake Vanderplas https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb
# No explicit `object` base: later cells in this notebook rebind the name
# `object` to a DataFrame, and re-running this cell would then use that
# DataFrame as the base instead of the builtin.
class display:
    """Display HTML representation of multiple objects.

    Each argument is a string containing a Python expression (typically a
    variable name). The string is shown as a label and the value it
    evaluates to is rendered next to it, so several frames can be compared
    side by side in a single notebook output.

    The strings are passed to ``eval``; only pass trusted, hand-written
    expressions.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings; they are evaluated each time a repr is requested.
        self.args = args

    @staticmethod
    def _html_for(obj):
        """Return ``obj``'s rich HTML repr, or its escaped plain repr."""
        from html import escape  # local import keeps the class self-contained

        render = getattr(obj, '_repr_html_', None)
        rendered = render() if callable(render) else None
        if rendered is None:
            # No usable rich repr (hook missing, or it returned None):
            # fall back to the text repr instead of raising or printing "None".
            rendered = '<pre>{0}</pre>'.format(escape(repr(obj)))
        return rendered

    def _repr_html_(self):
        """Return one floated ``<div>`` per expression, joined by newlines."""
        return '\n'.join(self.template.format(a, self._html_for(eval(a)))
                         for a in self.args)

    def __repr__(self):
        """Return the plain-text form: each expression above its ``repr``."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

# Introducing Nested-Pandas

A Pandas Extension for Efficient Representation of Nested Associated Datasets.

## Motivation: A Data Format for Time-Domain Analysis

In time-domain astronomy, a dataset often comprises two tables: "Object" and "Source".

In [2]:
# Load the two tables as independent pandas DataFrames.
# NOTE: the name `object` shadows the Python builtin; it is kept because
# later cells refer to this table by that name.
object = pd.read_parquet("objects.parquet")

# Read the source table, then order it by its index before displaying it.
source = pd.read_parquet("ztf_sources.parquet")
source = source.sort_index()

display('object', 'source')

Unnamed: 0,ra,dec
0,17.447868,35.547046
1,1.020437,4.353613
2,3.695975,31.130105
3,13.242558,6.099142
4,2.744142,48.444456
...,...,...
995,6.547263,40.249140
996,18.391919,17.643616
997,18.587638,46.568135
998,10.871655,6.719466

Unnamed: 0_level_0,mjd,flux,band
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


The tables above are completely independent from an API perspective, but in the context of doing science these two tables are much more closely connected.

In [3]:
# An object query only affects the object table
queried_object = object.query("ra > 15")

# I probably want to propagate the result to my source table.
# `queried_object[[]]` selects zero columns, so only its index takes part in
# the join; how="right" keeps the index labels of `queried_object`.
queried_source = source.join(queried_object[[]], how="right")

display('queried_object', 'source', 'queried_source')

Unnamed: 0,ra,dec
0,17.447868,35.547046
13,16.868734,8.958951
15,15.678835,23.491108
16,15.574044,21.706137
30,15.722621,47.896184
...,...,...
979,19.180670,10.274467
981,15.474806,21.436014
996,18.391919,17.643616
997,18.587638,46.568135

Unnamed: 0_level_0,mjd,flux,band
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r

Unnamed: 0,mjd,flux,band
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


## Nested-Pandas: Make These Two Tables Feel Like One Dataset

In [4]:
import nested_pandas as npd

# Same two files as before, now read through nested_pandas instead of pandas.
# This rebinds `object` and `source`, replacing the earlier pandas frames.
object = npd.read_parquet("objects.parquet")

# Read the source table, then order it by its index before displaying it.
source = npd.read_parquet("ztf_sources.parquet")
source = source.sort_index()

display('object', 'source')

Unnamed: 0,ra,dec
0,17.447868,35.547046
1,1.020437,4.353613
2,3.695975,31.130105
3,13.242558,6.099142
4,2.744142,48.444456
...,...,...
995,6.547263,40.249140
996,18.391919,17.643616
997,18.587638,46.568135
998,10.871655,6.719466

Unnamed: 0_level_0,mjd,flux,band
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


We can use `add_nested()` to "nest" the sources into the object table.

In [5]:
# Add a "ztf_sources" column of all tied sources to each object.
# The result is bound to a new name; `object` and `source` are not reassigned here.
# presumably sources are matched to objects through the shared index — confirm
# against the add_nested documentation.
object_with_sources = object.add_nested(source, "ztf_sources")
object_with_sources

Unnamed: 0,ra,dec,ztf_sources
0,17.447868,35.547046,mjd flux band 0 8.420511...
1,1.020437,4.353613,mjd flux band 0 15.019776...
2,3.695975,31.130105,mjd flux band 0 13.168814...
3,13.242558,6.099142,mjd flux band 0 18.708019...
4,2.744142,48.444456,mjd flux band 0 1.035325...
...,...,...,...
995,6.547263,40.249140,mjd flux band 0 17.260228...
996,18.391919,17.643616,mjd flux band 0 18.159739...
997,18.587638,46.568135,mjd flux band 0 7.700205...
998,10.871655,6.719466,mjd flux band 0 0.886458...


Each row now has an additional column, "ztf_sources", whose value is a dataframe containing the rows of the source table that belong to that specific object.

In [6]:
# Let's look at one row: the "ztf_sources" value for the object with index
# label 0 is itself a dataframe (columns mjd, flux, band) holding the source
# rows nested under that object.
object_with_sources.loc[0]["ztf_sources"]

Unnamed: 0,mjd,flux,band
0,8.420511,259.454128,r
1,14.442831,29.947062,g
2,17.276088,250.422340,r
3,11.874109,0.395589,g
4,15.418783,228.769717,g
...,...,...,...
995,6.206966,41.100829,g
996,0.181429,217.918431,g
997,15.897106,224.327657,g
998,5.327694,217.656239,r


We're not limited to one "nested" column:

In [7]:
# Load a second source table from its own parquet file.
ps1_sources = npd.read_parquet("ps1_sources.parquet")

# Nest it next to "ztf_sources" under a second column name.
# NOTE(review): this rebinds `object_with_sources` to a frame derived from
# itself, so re-running only this cell starts from a frame that already has a
# "ps1_sources" column — confirm add_nested tolerates that, or re-run the
# previous cell first.
object_with_sources = object_with_sources.add_nested(ps1_sources, "ps1_sources")
object_with_sources

Unnamed: 0,ra,dec,ztf_sources,ps1_sources
0,17.447868,35.547046,mjd flux band 0 8.420511...,mjd flux band 0 0.091356...
1,1.020437,4.353613,mjd flux band 0 15.019776...,mjd flux band 0 12.475696...
2,3.695975,31.130105,mjd flux band 0 13.168814...,mjd flux band 0 13.717712...
3,13.242558,6.099142,mjd flux band 0 18.708019...,mjd flux band 0 16.759764...
4,2.744142,48.444456,mjd flux band 0 1.035325...,mjd flux band 0 18.139101...
...,...,...,...,...
995,6.547263,40.249140,mjd flux band 0 17.260228...,mjd flux band 0 5.474614...
996,18.391919,17.643616,mjd flux band 0 18.159739...,mjd flux band 0 11.889307...
997,18.587638,46.568135,mjd flux band 0 7.700205...,mjd flux band 0 16.421570...
998,10.871655,6.719466,mjd flux band 0 0.886458...,mjd flux band 0 14.044775...


## Extending the API to Support Nested Columns

### Hierarchical Column Access

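Existing pandas functions such as `query` have been extended to accept hierarchical column names of the form `nested_column.sub_column`. For example, "ztf_sources.mjd" refers to the `mjd` column inside the nested `ztf_sources` frames, so a query written against it is applied to the rows of those nested frames: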

In [8]:
# Let's look at query again: a condition on a top-level column ("ra") is
# written exactly as in plain pandas, and the nested columns stay attached to
# the rows that are kept.

object_with_sources.query("ra > 15")

Unnamed: 0,ra,dec,ztf_sources,ps1_sources
0,17.447868,35.547046,mjd flux band 0 8.420511...,mjd flux band 0 0.091356...
13,16.868734,8.958951,mjd flux band 0 7.073765...,mjd flux band 0 0.910268...
15,15.678835,23.491108,mjd flux band 0 0.756320...,mjd flux band 0 7.842562...
16,15.574044,21.706137,mjd flux band 0 9.838356...,mjd flux band 0 17.813559...
30,15.722621,47.896184,mjd flux band 0 18.979234...,mjd flux band 0 6.391691...
...,...,...,...,...
979,19.180670,10.274467,mjd flux band 0 7.713358...,mjd flux band 0 7.842704...
981,15.474806,21.436014,mjd flux band 0 3.073578...,mjd flux band 0 17.369936...
996,18.391919,17.643616,mjd flux band 0 18.159739...,mjd flux band 0 11.889307...
997,18.587638,46.568135,mjd flux band 0 7.700205...,mjd flux band 0 16.421570...


In [9]:
# Querying Nested Layer
# The condition names a sub-column of the nested "ztf_sources" column; the
# result is kept in `ztf_g` so the next cell can flatten it.

ztf_g = object_with_sources.query("ztf_sources.band == 'g'") # nested_layer.column syntax allows access to sub-queries
ztf_g

Unnamed: 0,ra,dec,ztf_sources,ps1_sources
0,17.447868,35.547046,mjd flux band 0 14.442831...,mjd flux band 0 0.091356...
1,1.020437,4.353613,mjd flux band 0 15.019776...,mjd flux band 0 12.475696...
2,3.695975,31.130105,mjd flux band 0 13.168814...,mjd flux band 0 13.717712...
3,13.242558,6.099142,mjd flux band 0 0.911046...,mjd flux band 0 16.759764...
4,2.744142,48.444456,mjd flux band 0 7.315952...,mjd flux band 0 18.139101...
...,...,...,...,...
995,6.547263,40.249140,mjd flux band 0 8.131266...,mjd flux band 0 5.474614...
996,18.391919,17.643616,mjd flux band 0 17.188258...,mjd flux band 0 11.889307...
997,18.587638,46.568135,mjd flux band 0 7.700205...,mjd flux band 0 16.421570...
998,10.871655,6.719466,mjd flux band 0 0.886458...,mjd flux band 0 14.044775...


In [10]:
# Flatten the filtered nested column back into one ordinary table with one
# row per source; the object index label repeats for each of its sources.
ztf_g["ztf_sources"].nest.to_flat() # using the "nest" accessor

Unnamed: 0,mjd,flux,band
0,14.442831,29.947062,g
0,11.874109,0.395589,g
0,15.418783,228.769717,g
0,2.557222,75.081593,g
0,13.989636,126.599935,g
...,...,...,...
999,11.01821,120.45817,g
999,1.98392,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g


### The "Nest" Accessor

Nested columns have a custom "nest" accessor that provides additional functionality.

In [11]:
# Accessing `.nest` on a nested column returns the accessor object itself;
# its methods and attributes are used in the following cells.
object_with_sources["ztf_sources"].nest

<nested_pandas.series.accessor.NestSeriesAccessor at 0x1627db3a0>

In [12]:
# `fields` lists the names of the sub-columns stored inside the nested column.
object_with_sources["ztf_sources"].nest.fields

['mjd', 'flux', 'band']

In [13]:
# `to_lists()` gives one row per object, with each sub-column holding an
# array of that object's values.
object_with_sources["ztf_sources"].nest.to_lists()

Unnamed: 0,mjd,flux,band
0,[ 8.420511 14.4428307 17.27608833 11.874109...,[2.59454128e+02 2.99470615e+01 2.50422340e+02 ...,['r' 'g' 'r' 'g' 'g' 'g' 'r' 'g' 'g' 'g' 'r' '...
1,[1.50197761e+01 1.62759275e+01 1.95249948e+01 ...,[2.18915184e+02 2.05306999e+02 2.58942815e+02 ...,['g' 'g' 'g' 'g' 'r' 'g' 'r' 'g' 'r' 'g' 'g' '...
2,[1.31688137e+01 3.97885604e+00 1.10253835e+01 ...,[2.45973181e+02 2.68172075e+02 1.23332535e+02 ...,['g' 'r' 'g' 'r' 'r' 'r' 'g' 'g' 'g' 'g' 'g' '...
3,[18.70801899 0.91104621 14.66964087 16.346055...,[209.62874153 263.73554858 131.2688076 126.92...,['r' 'g' 'g' 'g' 'r' 'r' 'r' 'g' 'g' 'g' 'r' '...
4,[1.03532518e+00 7.31595153e+00 1.79369142e+01 ...,[4.23415665e+01 1.50842931e+02 9.87072444e+01 ...,['r' 'g' 'r' 'g' 'r' 'r' 'r' 'g' 'g' 'g' 'g' '...
...,...,...,...
995,[1.72602277e+01 1.84054985e+00 8.13126560e+00 ...,[115.89353928 99.53544249 246.40052166 281.30...,['r' 'r' 'g' 'g' 'g' 'g' 'g' 'g' 'r' 'g' 'r' '...
996,[1.81597388e+01 1.71882583e+01 1.36051488e+01 ...,[278.96727235 199.20984588 53.18154158 133.42...,['r' 'g' 'g' 'r' 'g' 'g' 'g' 'r' 'r' 'r' 'r' '...
997,[ 7.70020501 6.15943084 15.79614033 12.123862...,[ 64.99182313 156.24787909 189.99684145 227.43...,['g' 'r' 'r' 'r' 'r' 'r' 'r' 'r' 'r' 'g' 'r' '...
998,[8.86458380e-01 9.11417097e+00 3.27679867e+00 ...,[2.60842240e+02 2.55076058e+02 5.50011273e+01 ...,['g' 'r' 'r' 'r' 'r' 'g' 'r' 'r' 'g' 'r' 'r' '...


### Additional API Functions

The `reduce` function is similar to Pandas `apply` but packages inputs from nested columns into arrays.

In [14]:
# Find the max ZTF flux for each object: np.max is called once per row with
# that row's "ztf_sources.flux" values, giving one result per object.
object_with_sources.reduce(np.max, "ztf_sources.flux")

Unnamed: 0,0
0,299.924162
1,299.026207
2,299.990637
3,299.787405
4,299.736281
...,...
995,299.784388
996,299.868455
997,299.908486
998,299.968485


In [15]:
def test(*args):
    """Echo the positional arguments back unchanged, as a tuple.

    Used in this notebook to show what ``reduce`` passes to a user function.
    """
    received = args
    return received

# Pass one top-level column ("ra") and one nested sub-column
# ("ztf_sources.mjd") through `test`, which returns its arguments as-is, so
# the resulting frame shows what the function received for each row.
inputs = object_with_sources.reduce(test, "ra", "ztf_sources.mjd")
inputs

Unnamed: 0,0,1
0,17.447868,"[8.420511002293274, 14.442830695475122, 17.276..."
1,1.020437,"[15.019776119882977, 16.27592751795432, 19.524..."
2,3.695975,"[13.168813738794169, 3.9788560356494895, 11.02..."
3,13.242558,"[18.708018993053507, 0.911046210332267, 14.669..."
4,2.744142,"[1.0353251757192172, 7.315951527439497, 17.936..."
...,...,...
995,6.547263,"[17.26022771511918, 1.8405498469691928, 8.1312..."
996,18.391919,"[18.159738764417707, 17.188258267692188, 13.60..."
997,18.587638,"[7.700205012551913, 6.159430843711425, 15.7961..."
998,10.871655,"[0.886458380208146, 9.114170969038186, 3.27679..."


## Where We're Going

### Nested-Dask

Nested-Dask introduces Dask to enable work at the scale of full-survey datasets. We have seen very encouraging performance results from Nested-Pandas/Nested-Dask when compared to TAPE (another LINCC package with a similar goal), with initial tests showing ~3x speedups and ~2x lower memory usage.

In [16]:
import nested_dask as nd

# Build a nested_dask frame from the in-memory nested-pandas frame; its repr
# shows the partition count and column dtypes rather than the values.
nd.NestedFrame.from_nested_pandas(object_with_sources)

Unnamed: 0_level_0,ra,dec,ztf_sources,ps1_sources
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,float64,float64,"nested<mjd: [double], flux: [double], band: [string]>","nested<mjd: [double], flux: [double], band: [string]>"
999,...,...,...,...


### LSDB Integration

Our goal over the next ~1-2 months is to get Nested-Dask integrated with LSDB, meaning that LSDB users will have this nested functionality implicitly when working with their Catalog objects.

## Try it out for yourself!

[nested-pandas.readthedocs.io](https://nested-pandas.readthedocs.io/en/latest/)