In [1]:
import numpy as np
import pandas as pd

#from: Jake Vanderplas https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a string containing a Python expression
    (typically a variable name). The expression text is used as a label and
    the evaluated value is rendered next to the others.

    Expressions are evaluated with ``eval`` against this module's globals, so
    only pass trusted strings.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Store the expression strings; they are evaluated at render time.
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Evaluate ``expr`` in the module's global namespace.

        Passing ``globals()`` explicitly keeps helper-local names (loop
        variables, ``self``) from shadowing a user variable of the same name.
        """
        return eval(expr, globals())

    @staticmethod
    def _html_for(value):
        """Return an HTML fragment for ``value``.

        Uses the value's ``_repr_html_`` when it exists and returns something;
        otherwise falls back to the escaped plain ``repr`` in a ``<pre>`` tag.
        """
        render = getattr(value, '_repr_html_', None)
        fragment = render() if callable(render) else None
        if fragment is None:
            from html import escape
            fragment = '<pre>{0}</pre>'.format(escape(repr(value)))
        return fragment

    def _repr_html_(self):
        return '\n'.join(
            self.template.format(expr, self._html_for(self._evaluate(expr)))
            for expr in self.args)

    def __repr__(self):
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)

# Nested-Pandas as an Engine for Time-Domain Analysis

Outline:
* What is Nested-Pandas
* How does it work: hierarchical column access, nest accessor
* Why use it: performance benchmarking comparison based on the PLAsTiCC dataset

From a software perspective, working with time-domain data means finding the best way to handle object-level information alongside its time-stamped measurements. In Rubin language, we refer to these as "Object" and "Source" respectively:

In [3]:
# Load the object-level table and the source (time-stamped measurement) table
# as two plain pandas DataFrames; the source table is sorted by its index.
# NOTE(review): `object` rebinds the Python builtin of the same name. The
# variable names are kept as-is because `display` looks them up by these
# exact strings.
object = pd.read_parquet("objects.parquet")
source = pd.read_parquet("ztf_sources.parquet").sort_index()

display('object', 'source')

Unnamed: 0,ra,dec
0,17.447868,35.547046
1,1.020437,4.353613
2,3.695975,31.130105
3,13.242558,6.099142
4,2.744142,48.444456
...,...,...
995,6.547263,40.249140
996,18.391919,17.643616
997,18.587638,46.568135
998,10.871655,6.719466

Unnamed: 0_level_0,mjd,flux,band
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


We load the above as two independent Pandas DataFrames. However, for analysis workflows these two tables are often closely connected.

In [4]:
# An object query only affects the object table
queried_object = object.query("ra > 15")

# I probably want to propagate the result to my source table.
# `queried_object[[]]` selects no columns, so the join contributes only its
# index; how="right" keeps the index values of the queried objects.
queried_source = source.join(queried_object[[]], how="right")

# Show the filtered objects, the untouched sources and the filtered sources
display('queried_object', 'source', 'queried_source')

Unnamed: 0,ra,dec
0,17.447868,35.547046
13,16.868734,8.958951
15,15.678835,23.491108
16,15.574044,21.706137
30,15.722621,47.896184
...,...,...
979,19.180670,10.274467
981,15.474806,21.436014
996,18.391919,17.643616
997,18.587638,46.568135

Unnamed: 0_level_0,mjd,flux,band
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r

Unnamed: 0,mjd,flux,band
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.422340,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.983920,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


## Nested-Pandas: Nesting DataFrames within DataFrames

The core idea of Nested-Pandas is the ability to "nest" DataFrames within DataFrames. Below, we load our data again using Nested-Pandas but this time we additionally nest our sources within our object table.

In [8]:
import nested_pandas as npd

# Read the same two parquet files, this time through Nested-Pandas
object_df = npd.read_parquet("objects.parquet")
source_df = npd.read_parquet("ztf_sources.parquet").sort_index()

# Add a "ztf_sources" column of all tied sources to each object
object_nf = object_df.add_nested(source_df, "ztf_sources")
# Last expression of the cell: display the resulting frame
object_nf

Unnamed: 0,ra,dec,ztf_sources
0,17.447868,35.547046,mjd flux band 0 8.420511...
1,1.020437,4.353613,mjd flux band 0 15.019776...
2,3.695975,31.130105,mjd flux band 0 13.168814...
3,13.242558,6.099142,mjd flux band 0 18.708019...
4,2.744142,48.444456,mjd flux band 0 1.035325...
...,...,...,...
995,6.547263,40.249140,mjd flux band 0 17.260228...
996,18.391919,17.643616,mjd flux band 0 18.159739...
997,18.587638,46.568135,mjd flux band 0 7.700205...
998,10.871655,6.719466,mjd flux band 0 0.886458...


Our DataFrame (or "NestedFrame" for Nested-Pandas) now has a new column which contains the full contents of our source table. Every row now has a DataFrame of nested source information available to it. For example, let's look at the first row:

In [9]:
# The dataframe has all ztf_source rows for object 0
object_nf.loc[0]["ztf_sources"]

Unnamed: 0,mjd,flux,band
0,8.420511,259.454128,r
1,14.442831,29.947062,g
2,17.276088,250.422340,r
3,11.874109,0.395589,g
4,15.418783,228.769717,g
...,...,...,...
995,6.206966,41.100829,g
996,0.181429,217.918431,g
997,15.897106,224.327657,g
998,5.327694,217.656239,r


## Working with Nested Data

The Nested-Pandas API tweaks the Pandas API to support work with nested columns and adds additional functionality on top to further enable analysis.

### Using the Pandas API

Nested-Pandas is an extension of Pandas, meaning all functionality of Pandas is present in Nested-Pandas.

In [10]:
# Query works as expected
object_nf.query("ra > 15")

Unnamed: 0,ra,dec,ztf_sources
0,17.447868,35.547046,mjd flux band 0 8.420511...
13,16.868734,8.958951,mjd flux band 0 7.073765...
15,15.678835,23.491108,mjd flux band 0 0.756320...
16,15.574044,21.706137,mjd flux band 0 9.838356...
30,15.722621,47.896184,mjd flux band 0 18.979234...
...,...,...,...
979,19.180670,10.274467,mjd flux band 0 7.713358...
981,15.474806,21.436014,mjd flux band 0 3.073578...
996,18.391919,17.643616,mjd flux band 0 18.159739...
997,18.587638,46.568135,mjd flux band 0 7.700205...


Nested-Pandas equips a number of Pandas functions with the ability to work directly with nested columns:

In [13]:
# nested_layer.column syntax allows access to sub-queries:
# the condition here targets the `band` field inside the nested
# `ztf_sources` column rather than a top-level column
object_nf_g = object_nf.query("ztf_sources.band == 'g'")
object_nf_g

Unnamed: 0,ra,dec,ztf_sources
0,17.447868,35.547046,mjd flux band 0 14.442831...
1,1.020437,4.353613,mjd flux band 0 15.019776...
2,3.695975,31.130105,mjd flux band 0 13.168814...
3,13.242558,6.099142,mjd flux band 0 0.911046...
4,2.744142,48.444456,mjd flux band 0 7.315952...
...,...,...,...
995,6.547263,40.249140,mjd flux band 0 8.131266...
996,18.391919,17.643616,mjd flux band 0 17.188258...
997,18.587638,46.568135,mjd flux band 0 7.700205...
998,10.871655,6.719466,mjd flux band 0 0.886458...


Operations like this affect the nested data directly:

In [14]:
# Nested sources of object 0 in the g-band-queried frame
object_nf_g.loc[0]["ztf_sources"]

Unnamed: 0,mjd,flux,band
0,14.442831,29.947062,g
1,11.874109,0.395589,g
2,15.418783,228.769717,g
3,2.557222,75.081593,g
4,13.989636,126.599935,g
...,...,...,...
488,5.904323,299.269974,g
489,13.271755,101.348946,g
490,6.206966,41.100829,g
491,0.181429,217.918431,g


### Nested-Pandas Unique Functions

The `reduce` function is similar to Pandas `apply` but packages inputs from nested columns into arrays:

In [19]:
# Mean g-band flux per object: reduce applies np.mean to the nested flux
# values, then the output column (labelled 0) is given a descriptive name
(
    object_nf_g
    .reduce(np.mean, "ztf_sources.flux")
    .rename(columns={0: "g_mean_flux"})
)

Unnamed: 0,g_mean_flux
0,146.386013
1,149.530865
2,158.365247
3,150.590805
4,154.176187
...,...
995,148.844946
996,149.470083
997,153.178028
998,150.725953


### The `nest` Accessor

Nested Columns have their own API available via the `nest` Accessor. This API provides methods to preview and transform data from a nested column.

In [20]:
# How to access the accessor
object_nf["ztf_sources"].nest

<nested_pandas.series.accessor.NestSeriesAccessor at 0x11edcb910>

In [21]:
# See the sub-columns (or fields) of the nested column
object_nf["ztf_sources"].nest.fields

['mjd', 'flux', 'band']

In [22]:
# Retrieve the full original source table
object_nf["ztf_sources"].nest.to_flat()

Unnamed: 0,mjd,flux,band
0,8.420511,259.454128,r
0,14.442831,29.947062,g
0,17.276088,250.42234,r
0,11.874109,0.395589,g
0,15.418783,228.769717,g
...,...,...,...
999,1.98392,77.994248,g
999,18.761759,77.129061,g
999,14.686133,1.661199,g
999,7.810396,249.381225,r


## Why Nested-Pandas?

We think that Nested-Pandas offers a straightforward API for astronomers doing time-domain analysis, or potentially other types of analysis that work with similarly structured data (e.g. spectra). As a result, we hope it can serve as an approachable package for researchers looking to do time-domain analysis with Rubin data.

In addition, in initial testing we've been excited about the performance of Nested-Pandas and its potential to meet Rubin's data scale. To finish, let's use the Nested-Dask package (which enables Dask on Nested-Pandas) and show a more sizable analysis example:

### Example: Eclipsing Binary Search in the PLAsTiCC dataset

The goal of the workflow is to identify a subset of Eclipsing Binary Candidates within the dataset.

Photometric LSST Astronomical Time-Series Classification Challenge (PLAsTiCC)
* 3,492,890 Objects
* 453,653,104 Sources

In [1]:
import os

import nested_dask as nd
import numpy as np
import light_curve as licu

from dask.distributed import Client

# Start a Dask client with 4 workers and a fixed dashboard port
client = Client(n_workers=4,
                dashboard_address=':39876')

# Root directory of the PLAsTiCC parquet files. Set the PLASTICC_DATA_DIR
# environment variable to point at your own copy; the default is the path
# this notebook was originally run with.
DATA_DIR = os.environ.get(
    "PLASTICC_DATA_DIR",
    "/Users/dbranton/lincc/timeseries/data/plasticc/parquet",
)

# Last expression of the cell: display the client summary
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:39876/status,

0,1
Dashboard: http://127.0.0.1:39876/status,Workers: 4
Total threads: 12,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:56554,Workers: 4
Dashboard: http://127.0.0.1:39876/status,Total threads: 12
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:56565,Total threads: 3
Dashboard: http://127.0.0.1:56567/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:56557,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-c8bk32nq,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-c8bk32nq

0,1
Comm: tcp://127.0.0.1:56566,Total threads: 3
Dashboard: http://127.0.0.1:56571/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:56558,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-iw6hupoz,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-iw6hupoz

0,1
Comm: tcp://127.0.0.1:56569,Total threads: 3
Dashboard: http://127.0.0.1:56573/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:56559,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-4yhm4nxt,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-4yhm4nxt

0,1
Comm: tcp://127.0.0.1:56570,Total threads: 3
Dashboard: http://127.0.0.1:56575/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:56560,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-kijrp3lh,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-kijrp3lh


In [3]:
%%time
# Load in PLAsTiCC data: the object and source parquet files under DATA_DIR,
# both indexed by object_id and read with the pyarrow dtype backend
# NOTE(review): `object` rebinds the Python builtin of the same name
object = nd.read_parquet(DATA_DIR+"/object/*.parquet", 
                         dtype_backend="pyarrow", 
                         index="object_id",
                         calculate_divisions=True)
source = nd.read_parquet(DATA_DIR+"/source/*.parquet", 
                         dtype_backend="pyarrow",
                         index="object_id",
                         calculate_divisions=True)
# Nest the source table into the object table as a "source" column
objsor = object.add_nested(source, "source")

# First, let's select only Galactic objects, by cutting on hostgal_photoz.
objsor = objsor.query("hostgal_photoz < 0.001")

# Second, let's select persistent sources, by cutting on the duration of the light curve.
def calc_ptp(time, detected):
    """Return the peak-to-peak span of the times flagged as detected.

    Parameters
    ----------
    time : array
        Observation times; indexed with a boolean mask built from `detected`.
    detected : array-like
        Per-observation flags, converted to a boolean mask.

    Returns
    -------
    dict
        ``{"duration": value}`` where value is ``np.ptp`` of the selected
        times, or ``0.0`` when a ``ValueError`` is raised (``np.ptp`` raises
        one when no observation is selected).
    """
    try:
        mask = np.asarray(detected, dtype=bool)
        return {"duration": np.ptp(time[mask])}
    except ValueError:
        # Return a float so the fallback matches the float type of the
        # normal path instead of mixing in an int.
        return {"duration": 0.0}

# Run calc_ptp per object on the nested mjd and detected_bool fields;
# `meta` declares the single output column "duration" as float
duration = objsor.reduce(calc_ptp, 'source.mjd', 'source.detected_bool',
                         meta={"duration":"float"})
# Filter by the calculated duration
# (threshold is in the units of source.mjd, presumably days; confirm)
objsor = objsor.assign(duration=duration["duration"])
objsor = objsor.query("duration > 366")

# Next, we use Otsu's method to split light curves into two groups:
# one with high flux, and one with low flux. Eclipsing binaries should have
# lower flux group smaller than the higher flux group, but having larger 
# variability. We use light-curve package to extract these features.
# (https://github.com/light-curve/light-curve-python)
# For simplicity, we only calculate these features for the i (3) band.
def otsu_fmt(*args, **kwargs):
    """Evaluate ``licu.OtsuSplit`` and return its outputs keyed by name.

    All positional and keyword arguments are forwarded unchanged to a freshly
    created ``licu.OtsuSplit()`` extractor. The first four elements of its
    result are returned in a dict under the keys ``otsu_mean_diff``,
    ``otsu_std_lower``, ``otsu_std_upper`` and ``otsu_lower_to_all_ratio``.
    """
    extractor = licu.OtsuSplit()
    values = extractor(*args, **kwargs)

    labelled = {}
    labelled['otsu_mean_diff'] = values[0]
    labelled['otsu_std_lower'] = values[1]
    labelled['otsu_std_upper'] = values[2]
    labelled['otsu_lower_to_all_ratio'] = values[3]
    return labelled

# Restrict the nested sources to passband 3 before extracting features
objsor_3 = objsor.query("source.passband == 3")
# Run otsu_fmt per object on the nested mjd and flux fields; `meta` declares
# the four float output columns
otsu_features = objsor_3.reduce(otsu_fmt, 'source.mjd', 'source.flux',
                               meta={'otsu_mean_diff': float,
                                     'otsu_std_lower': float,
                                     'otsu_std_upper': float,
                                     'otsu_lower_to_all_ratio': float,})
# Assign Columns
# (otsu_mean_diff is computed but not attached; the filter below does not use it)
objsor = objsor.assign(
    otsu_lower_to_all_ratio=otsu_features['otsu_lower_to_all_ratio'],
    otsu_std_lower=otsu_features['otsu_std_lower'],
    otsu_std_upper=otsu_features['otsu_std_upper'],
)
# Filter by Otsu Features
objsor = objsor.query(
    "otsu_lower_to_all_ratio < 0.1 and otsu_std_lower > otsu_std_upper",
)

# Let's compute the result (dask)
objsor.compute()

CPU times: user 3.37 s, sys: 320 ms, total: 3.69 s
Wall time: 9.6 s


Unnamed: 0_level_0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,...,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y,source,duration,otsu_lower_to_all_ratio,otsu_std_lower,otsu_std_upper
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1040934,192.1674,-46.3751,0,-9.0,0.0,0.0,-9.0,0.083,0,65,...,33.7,110.5,539.1,1183.9,1991.1,mjd passband flux flux_er...,410.0927,0.090909,10.038970,7.000799
1145188,104.4141,-12.1773,0,-9.0,0.0,0.0,-9.0,0.625,0,16,...,3106.7,3207.9,2379.2,1631.3,717.8,mjd passband flux flux_...,1073.9476,0.090909,111.621817,67.960107
1184639,326.9531,-22.8314,0,-9.0,0.0,0.0,-9.0,0.032,0,16,...,19954.1,20819.5,15006.2,9573.4,3997.2,mjd passband flux flux_e...,728.8966,0.086957,563.869493,25.370866
1226593,48.5156,-32.2662,0,-9.0,0.0,0.0,-9.0,0.014,0,65,...,25.1,43.4,140.4,285.6,467.0,mjd passband flux flux_er...,830.8233,0.080000,12.008922,4.690816
1227515,183.3398,-16.9578,0,-9.0,0.0,0.0,-9.0,0.049,0,16,...,11232.6,16446.4,14553.2,10543.4,4789.5,mjd passband flux flux_...,1056.0646,0.095238,765.642029,122.818703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130633244,152.9297,-12.4828,0,-9.0,0.0,0.0,-9.0,0.058,0,16,...,1957.9,1864.3,1267.8,781.9,319.2,mjd passband flux flux_er...,1086.8704,0.080000,57.311277,12.255049
130634660,36.8182,-46.7685,0,-9.0,0.0,0.0,-9.0,0.014,0,16,...,379.4,331.9,214.8,128.2,51.3,mjd passband flux flux_er...,839.7401,0.080000,8.145193,7.080347
130708654,95.0977,-36.0536,0,-9.0,0.0,0.0,-9.0,0.052,0,16,...,20970.8,23290.0,17505.6,11486.4,4893.3,mjd passband flux flux_er...,1008.0936,0.095238,1425.256064,480.315334
130711141,133.0349,-47.1613,0,-9.0,0.0,0.0,-9.0,1.271,0,16,...,6468.5,7909.4,6421.0,4922.0,2323.3,mjd passband flux flu...,1025.3035,0.095238,139.958460,136.504984
