In [1]:
import nested_dask as nd
import nested_pandas as npd
import numpy as np
import light_curve as licu
import pandas as pd

import os
from pathlib import Path

from dask.distributed import Client

# Start a local Dask cluster with 4 workers; the dashboard is requested on
# port 38764.
client = Client(n_workers=4,
                dashboard_address=':38764')

# Directory holding the PLAsTiCC parquet files. Later cells expect "object/"
# and "source/" sub-directories underneath it.
# Set the PLASTICC_DATA_DIR environment variable to point the notebook at a
# different copy of the data; when it is unset, the original location is used.
# Kept as a plain str because later cells build glob patterns with "+".
DATA_DIR = os.environ.get(
    "PLASTICC_DATA_DIR",
    "/Users/dbranton/lincc/timeseries/data/plasticc/parquet",
)

# Last expression of the cell: display the client summary.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:38764/status,

0,1
Dashboard: http://127.0.0.1:38764/status,Workers: 4
Total threads: 12,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60187,Workers: 4
Dashboard: http://127.0.0.1:38764/status,Total threads: 12
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:60198,Total threads: 3
Dashboard: http://127.0.0.1:60201/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:60190,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-2ntwwsny,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-2ntwwsny

0,1
Comm: tcp://127.0.0.1:60199,Total threads: 3
Dashboard: http://127.0.0.1:60203/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:60191,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-tw3pe4hh,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-tw3pe4hh

0,1
Comm: tcp://127.0.0.1:60200,Total threads: 3
Dashboard: http://127.0.0.1:60206/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:60192,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-351jk1m0,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-351jk1m0

0,1
Comm: tcp://127.0.0.1:60205,Total threads: 3
Dashboard: http://127.0.0.1:60208/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:60193,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-f891w4qr,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-f891w4qr


# Nested-Pandas/Dask Prove-it Notebook: PLAsTiCC Eclipsing Binaries

This notebook is meant to test/showcase the performance of Nested-Pandas relative to TAPE for the PLAsTiCC Eclipsing Binary notebook, here: https://github.com/lincc-frameworks/notebooks_lf/blob/main/plasticc-eclipsing-binaries.ipynb

The data used here is available from Zenodo, but it must first be rewritten to Parquet files after download. See the original notebook for download details.

In [2]:
%%time
# Load in PLAsTiCC data.
# NOTE: the object table is bound to `object_ddf` rather than `object`, so the
# Python builtin `object` is not shadowed for the rest of the notebook.
object_ddf = nd.read_parquet(DATA_DIR+"/object/*.parquet",
                             dtype_backend="pyarrow",
                             index="object_id",
                             calculate_divisions=True)
source = nd.read_parquet(DATA_DIR+"/source/*.parquet",
                         dtype_backend="pyarrow",
                         index="object_id",
                         calculate_divisions=True)

# Attach the source table to the object table as a nested column named
# "source"; its fields are addressed below as "source.<field>".
objsor = object_ddf.add_nested(source, "source")

# Let's run some analysis!
print("Starting analysis...")
# First, select only Galactic objects by cutting on hostgal_photoz.
print("First, filter by photoz")
objsor = objsor.query("hostgal_photoz < 0.001")

# Second, select persistent sources by cutting on the duration of the
# light curve.
print("Extract durations")
def calc_ptp(time, detected):
    """Return the peak-to-peak span of ``time`` over the detected entries.

    ``detected`` is converted to a boolean mask and used to select entries of
    ``time``; the result is ``np.ptp`` of that selection, wrapped in a dict
    under the key ``"duration"``.

    If the computation raises ``ValueError`` (for example when the selection
    is empty), the duration is reported as 0.
    """
    try:
        detected_mask = np.asarray(detected, dtype=bool)
        detected_times = time[detected_mask]
        span = np.ptp(detected_times)
    except ValueError:
        span = 0
    return {"duration": span}

# Reduce every nested light curve to a single "duration" value with calc_ptp,
# feeding it the observation times and the detection flags.
duration_meta = {"duration": "float"}
duration = objsor.reduce(
    calc_ptp,
    'source.mjd',
    'source.detected_bool',
    meta=duration_meta,
)

print("Assign a column")
duration_column = duration["duration"]
objsor = objsor.assign(duration=duration_column)

print("Filter by duration")
# Keep only objects whose duration exceeds 366 (in the units of mjd).
objsor = objsor.query("duration > 366")

# Next, Otsu's method is used to split each light curve into two groups:
# one with high flux and one with low flux. For an eclipsing binary, the
# low-flux group should be smaller than the high-flux group but show larger
# variability. The light-curve package is used to extract these features
# (https://github.com/light-curve/light-curve-python).
# For simplicity, the features are only calculated for the i (3) band.
print("Extract Otsu features")
def otsu_fmt(*args, **kwargs):
    """Evaluate ``licu.OtsuSplit`` and label its first four outputs.

    All positional and keyword arguments are forwarded unchanged to a freshly
    created ``licu.OtsuSplit`` extractor. Entries 0-3 of its result are
    returned in a dict keyed, in order, by ``otsu_mean_diff``,
    ``otsu_std_lower``, ``otsu_std_upper`` and ``otsu_lower_to_all_ratio``.
    """
    feature_names = (
        'otsu_mean_diff',
        'otsu_std_lower',
        'otsu_std_upper',
        'otsu_lower_to_all_ratio',
    )
    values = licu.OtsuSplit()(*args, **kwargs)
    return {name: values[position] for position, name in enumerate(feature_names)}

# Work on a view whose nested source data is cut to passband 3 (the i band).
objsor_3 = objsor.query("source.passband == 3")

# Every column produced by otsu_fmt is declared as float in the meta.
otsu_columns = [
    'otsu_mean_diff',
    'otsu_std_lower',
    'otsu_std_upper',
    'otsu_lower_to_all_ratio',
]
otsu_meta = dict.fromkeys(otsu_columns, float)
otsu_features = objsor_3.reduce(
    otsu_fmt,
    'source.mjd',
    'source.flux',
    meta=otsu_meta,
)

print("Assign columns")
# Only the three features used by the cut below are copied onto the object
# table; otsu_mean_diff is computed but not attached.
otsu_cut_columns = ('otsu_lower_to_all_ratio', 'otsu_std_lower', 'otsu_std_upper')
objsor = objsor.assign(
    **{column: otsu_features[column] for column in otsu_cut_columns}
)

print('Filter by Otsu features')
otsu_cut = "otsu_lower_to_all_ratio < 0.1 and otsu_std_lower > otsu_std_upper"
objsor = objsor.query(otsu_cut)

print("Compute object table")
# Last expression of the cell: trigger the computation and display the result.
objsor.compute()



Starting analysis...
First, filter by photoz
Extract durations
Assign a column
Filter by duration
Extract Otsu features
Assign columns
Filter by Otsu features
Compute object table
CPU times: user 3.47 s, sys: 283 ms, total: 3.75 s
Wall time: 9.36 s


Unnamed: 0_level_0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,...,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y,source,duration,otsu_lower_to_all_ratio,otsu_std_lower,otsu_std_upper
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1040934,192.1674,-46.3751,0,-9.0,0.0,0.0,-9.0,0.083,0,65,...,33.7,110.5,539.1,1183.9,1991.1,mjd passband flux flux_er...,410.0927,0.090909,10.038970,7.000799
1145188,104.4141,-12.1773,0,-9.0,0.0,0.0,-9.0,0.625,0,16,...,3106.7,3207.9,2379.2,1631.3,717.8,mjd passband flux flux_...,1073.9476,0.090909,111.621817,67.960107
1184639,326.9531,-22.8314,0,-9.0,0.0,0.0,-9.0,0.032,0,16,...,19954.1,20819.5,15006.2,9573.4,3997.2,mjd passband flux flux_e...,728.8966,0.086957,563.869493,25.370866
1226593,48.5156,-32.2662,0,-9.0,0.0,0.0,-9.0,0.014,0,65,...,25.1,43.4,140.4,285.6,467.0,mjd passband flux flux_er...,830.8233,0.080000,12.008922,4.690816
1227515,183.3398,-16.9578,0,-9.0,0.0,0.0,-9.0,0.049,0,16,...,11232.6,16446.4,14553.2,10543.4,4789.5,mjd passband flux flux_...,1056.0646,0.095238,765.642029,122.818703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130633244,152.9297,-12.4828,0,-9.0,0.0,0.0,-9.0,0.058,0,16,...,1957.9,1864.3,1267.8,781.9,319.2,mjd passband flux flux_er...,1086.8704,0.080000,57.311277,12.255049
130634660,36.8182,-46.7685,0,-9.0,0.0,0.0,-9.0,0.014,0,16,...,379.4,331.9,214.8,128.2,51.3,mjd passband flux flux_er...,839.7401,0.080000,8.145193,7.080347
130708654,95.0977,-36.0536,0,-9.0,0.0,0.0,-9.0,0.052,0,16,...,20970.8,23290.0,17505.6,11486.4,4893.3,mjd passband flux flux_er...,1008.0936,0.095238,1425.256064,480.315334
130711141,133.0349,-47.1613,0,-9.0,0.0,0.0,-9.0,1.271,0,16,...,6468.5,7909.4,6421.0,4922.0,2323.3,mjd passband flux flu...,1025.3035,0.095238,139.958460,136.504984
