In [1]:
import light_curve as licu
import dask.dataframe as dd
from tape import Ensemble, ColumnMapper
import numpy as np
from pathlib import Path

# LINCC Demo-Day: PLAsTiCC Eclipsing Binary Exploration with TAPE

This notebook demonstrates a TAPE analysis of the PLAsTiCC dataset (converted from CSV files to Parquet files). The workflow was created by Kostya, who was interested in exploring eclipsing binaries within the dataset.

Dataset Details:
* Total Size: ~10 GB
* Number of Sources: 453,653,104
* Number of Objects: 3,492,890

## Setup and Loading

We begin by initializing an Ensemble. We can also grab the Dask Dashboard link to inspect the work done by the Dask cluster as we run through the cells.

In [2]:
# Set some paths and variables
# DATA_DIR is the root of the PLAsTiCC parquet files; N_PROCESSORS is the
# number of Dask workers the Ensemble is asked to start.
DATA_DIR = "../../../data/plasticc/parquet" # Not bundled with the notebook: you'll need to grab this data yourself
N_PROCESSORS = 4

# Initialize an Ensemble with N_PROCESSORS workers
ens = Ensemble(n_workers=N_PROCESSORS)
# Last expression of the cell: displays the Dask client/cluster summary,
# which includes the dashboard link
ens.client_info()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 12,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:64757,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:64768,Total threads: 3
Dashboard: http://127.0.0.1:64772/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:64760,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-xf61295u,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-xf61295u

0,1
Comm: tcp://127.0.0.1:64769,Total threads: 3
Dashboard: http://127.0.0.1:64773/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:64761,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-qtkequmr,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-qtkequmr

0,1
Comm: tcp://127.0.0.1:64770,Total threads: 3
Dashboard: http://127.0.0.1:64776/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:64762,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-l9i32n4k,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-l9i32n4k

0,1
Comm: tcp://127.0.0.1:64771,Total threads: 3
Dashboard: http://127.0.0.1:64778/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:64763,
Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-bi8jwgui,Local directory: /var/folders/lc/dws63_cs5gz5mf8s869hjpx40000gn/T/dask-scratch-space/worker-bi8jwgui


In [3]:
# Load PLAsTiCC into the Ensemble

# The ColumnMapper establishes which table columns map to which
# timeseries quantities (id, time, flux, flux error, band).
colmap = ColumnMapper(
    id_col="object_id",
    time_col="mjd",
    flux_col="flux",
    err_col="flux_err",
    band_col="passband",
)

# Glob patterns for the source and object parquet files
source_glob = DATA_DIR + "/source/*.parquet"
object_glob = DATA_DIR + "/object/*.parquet"

# Read both tables from parquet
ens.from_parquet(
    source_file=source_glob,
    object_file=object_glob,
    column_mapper=colmap,
    sync_tables=False,  # avoid doing an initial sync
    sorted=True,  # the input data is already sorted by the chosen index
    sort=False,
)

<tape.ensemble.Ensemble at 0x15b261960>

We've loaded the data with the `sorted` flag set to `True`; this populates the divisions for the Ensemble dataframes. Below, we see the divisions populated (the numbers along the index) even though the data itself is still represented lazily.

_**Divisions**: Given a sorted index, the boundary values for each partition that indicate which index slices live in which partition. Used to search for data only in a single partition, rather than needing to search all partitions._

In [4]:
# Display the (still lazy) source table; the index values shown are the partition divisions
ens._source

Unnamed: 0_level_0,mjd,passband,flux,flux_err,detected_bool,provenance
npartitions=193,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,float64,int64,float64,float64,int64,string
88856,...,...,...,...,...,...
...,...,...,...,...,...,...
130106493,...,...,...,...,...,...
130788054,...,...,...,...,...,...


In [5]:
# Display the (still lazy) object table; the index values shown are the partition divisions
ens._object

Unnamed: 0_level_0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,true_submodel,true_z,true_distmod,true_lensdmu,true_vpec,true_rv,true_av,true_peakmjd,libid_cadence,tflux_u,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
13,float64,float64,int64,float64,float64,float64,float64,float64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,int64,float64,float64,float64,float64,float64,float64
26008612,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104609945,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130788054,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


## Analysis

First, let's select only Galactic objects by cutting on `hostgal_photoz`. We use `query` to cut on a column of the object table; the cut will propagate to the source table the next time it is used.

In [6]:
# Cut on the object table's hostgal_photoz column to keep only Galactic objects
galactic_cut = "hostgal_photoz < 1e-3"
ens = ens.query(galactic_cut, table="object")

Second, let's select persistent sources, by cutting on the duration of the light curve. In this case, we use the `batch` interface to apply a custom function to each light curve.

In [7]:
def _detected_time_span(time, detected):
    """Return the peak-to-peak range of ``time`` over the detected points.

    ``detected`` is cast to a boolean mask and used to select the entries of
    ``time`` that feed into ``np.ptp``.
    """
    is_detected = np.asarray(detected, dtype=bool)
    return np.ptp(time[is_detected])


# Apply the helper to every light curve; compute=False keeps the result lazy
duration = ens.batch(
    _detected_time_span,
    ens._time_col,
    "detected_bool",
    use_map=True,
    compute=False,
)

# `duration` is a dask series, which we can assign as a column of the object table
ens.assign(table="object", duration=duration)

<tape.ensemble.Ensemble at 0x15b261960>

In [8]:
# Preview the first 5 rows of the object table to check the new `duration` column
ens.head("object", 5)

Unnamed: 0_level_0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,...,true_av,true_peakmjd,libid_cadence,tflux_u,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y,duration
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
168,349.4295,-62.5086,1,-9.0,0.0,0.0,-9.0,0.02,0,16,...,0.0,59570.0,104,50.4,399.3,366.0,243.5,147.8,59.8,785.8281
1063,53.7891,-27.7844,1,-9.0,0.0,0.0,-9.0,0.009,0,92,...,0.0,59570.0,63,3201.3,19615.8,23366.0,23362.2,23175.5,22985.7,848.8198
2270,151.1719,2.2387,1,-9.0,0.0,0.0,-9.0,0.024,0,65,...,0.0,59570.0,49,1.0,9.8,28.0,102.8,176.9,226.8,0.0371
2790,349.2857,-62.8847,1,-9.0,0.0,0.0,-9.0,0.018,0,92,...,0.0,59570.0,100,637.1,3830.5,4167.1,3869.8,3639.3,3608.1,873.7903
4025,33.2227,-4.7802,1,-9.0,0.0,0.0,-9.0,0.018,0,991,...,0.0,60193.742,91,50.9,844.5,3668.8,7450.0,11709.1,15460.1,121.8277


Now we can use our new duration column to further filter the dataset. Once again, we use `query`.

In [9]:
# Keep only objects whose `duration` column exceeds 366
persistent_cut = "duration > 366"
ens = ens.query(persistent_cut, table="object")

Next, we use Otsu's method to split light curves into two groups:
* one with high flux
* one with low flux

For eclipsing binaries, the lower-flux group should be smaller than the higher-flux group but show larger variability. We use the light-curve package (https://github.com/light-curve/light-curve-python) to extract these features. For simplicity, we only calculate these features for the i band (passband 3).

In [10]:
# Passband index of the i band, the only band the features are calculated for
I_BAND = 3

# Use batch again, this time with the light-curve OtsuSplit feature extractor;
# compute=False keeps the result lazy.
otsu_features = ens.batch(
    licu.OtsuSplit(),
    band_to_calc=I_BAND,
    use_map=True,
    compute=False,
)

# otsu_features is a dataframe with multiple columns; assign the ones we
# need to the object table under the same names.
otsu_columns = ("otsu_lower_to_all_ratio", "otsu_std_lower", "otsu_std_upper")
ens = ens.assign(
    table="object",
    **{name: otsu_features[name] for name in otsu_columns},
)

Now we can query by these columns to filter down to our objects of interest.

In [11]:
# Keep objects whose lower-flux group is a small fraction of all points
# and has a larger standard deviation than the upper-flux group.
otsu_cut = "otsu_lower_to_all_ratio < 0.1 and otsu_std_lower > otsu_std_upper"
ens = ens.query(otsu_cut, table="object")

Thus far, everything has mostly been run lazily. We can kick off the analysis by bringing the resulting object table into memory.

In [12]:
# Kick off the analysis: bring the resulting object table into memory
df = ens.compute("object")
# Last expression of the cell: display the resulting table
df



Unnamed: 0_level_0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,...,tflux_u,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y,duration,otsu_lower_to_all_ratio,otsu_std_lower,otsu_std_upper
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1040934,192.1674,-46.3751,0,-9.0,0.0,0.0,-9.0,0.083,0,65,...,5.7,33.7,110.5,539.1,1183.9,1991.1,410.0927,0.090909,10.038970,7.000799
1145188,104.4141,-12.1773,0,-9.0,0.0,0.0,-9.0,0.625,0,16,...,341.5,3106.7,3207.9,2379.2,1631.3,717.8,1073.9476,0.090909,111.621817,67.960107
1184639,326.9531,-22.8314,0,-9.0,0.0,0.0,-9.0,0.032,0,16,...,2114.8,19954.1,20819.5,15006.2,9573.4,3997.2,728.8966,0.086957,563.869493,25.370866
1226593,48.5156,-32.2662,0,-9.0,0.0,0.0,-9.0,0.014,0,65,...,15.0,25.1,43.4,140.4,285.6,467.0,830.8233,0.080000,12.008922,4.690816
1227515,183.3398,-16.9578,0,-9.0,0.0,0.0,-9.0,0.049,0,16,...,733.2,11232.6,16446.4,14553.2,10543.4,4789.5,1056.0646,0.095238,765.642029,122.818703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130633244,152.9297,-12.4828,0,-9.0,0.0,0.0,-9.0,0.058,0,16,...,232.1,1957.9,1864.3,1267.8,781.9,319.2,1086.8704,0.080000,57.311277,12.255049
130634660,36.8182,-46.7685,0,-9.0,0.0,0.0,-9.0,0.014,0,16,...,51.2,379.4,331.9,214.8,128.2,51.3,839.7401,0.080000,8.145193,7.080347
130708654,95.0977,-36.0536,0,-9.0,0.0,0.0,-9.0,0.052,0,16,...,2059.7,20970.8,23290.0,17505.6,11486.4,4893.3,1008.0936,0.095238,1425.256064,480.315334
130711141,133.0349,-47.1613,0,-9.0,0.0,0.0,-9.0,1.271,0,16,...,489.5,6468.5,7909.4,6421.0,4922.0,2323.3,1025.3035,0.095238,139.958460,136.504984
