# PLAsTiCC data exploration with TAPE

Let's explore [PLAsTiCC](http://plasticc.org) data!

It is publicly available through [this Zenodo repository](https://zenodo.org/record/2539456).

In [1]:
## Uncomment to install packages

# !pip install tape joblib requests

In [3]:
%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


### Download the data

Please download the data from [Zenodo](https://zenodo.org/record/2539456) and put it in the `./plasticc` folder (you can change the location below via `DATA_DIR`).

In [1]:
# Folder that holds the PLAsTiCC CSV files; point it at your local copy.
DATA_DIR = Path("./plasticc")

# Test-set files: one metadata table plus light-curve chunks numbered 01..11.
META_FILENAME = "plasticc_test_metadata.csv"
LC_FILENAMES = ["plasticc_test_lightcurves_%02d.csv" % chunk for chunk in range(1, 12)]
# To use the training set instead, swap in these two lines:
# META_FILENAME = "plasticc_train_metadata.csv.gz"
# LC_FILENAMES = ['plasticc_train_lightcurves.csv.gz']

# Number of light-curve files, usable as a partition count.
N_PARTITIONS = len(LC_FILENAMES)

You may skip the next cell if you already have the data downloaded in the `DATA_DIR`.

# Read and analyse the data with TAPE

In [4]:
%%time

import dask.dataframe as dd
import light_curve as licu
import numpy as np
from tape import Ensemble, ColumnMapper

# TAPE follows the LSST vocabulary: "sources" are individual detections,
# "objects" are the astrophysical objects those detections belong to.

# The object table is read (lazily) from the metadata file.
print("Loading object table...")
object_table = dd.read_csv(DATA_DIR / META_FILENAME, blocksize=100e6)
# Optional: declare the already-sorted index up front.
# object_table = object_table.set_index('object_id', sorted=True, sort=False,)# divisions=[13, 130788054])

# The source table is read (lazily) from all light-curve files at once.
print("Loading source tables...")
lightcurve_paths = [DATA_DIR / name for name in LC_FILENAMES]
source_table = dd.read_csv(lightcurve_paths, blocksize=100e6)
# Optional: declare the already-sorted index up front.
# source_table = source_table.set_index('object_id', sorted=True, sort=False,) # divisions=[13, 1000183, 13952428, 26956806, 39933855, 52935297, 65930419, 78887012, 91875367, 104853940, 117792961, 130788054])

# Build the Ensemble from the two lazy tables.
# Parallel processing needs partitioned data: when an analysis runs,
# TAPE hands the partitions out to the workers.
print("Building Ensemble...")
# Alternative with a fixed dashboard address:
# ens = Ensemble(dashboard_address="127.0.0.1:8787", memory_limit='16GB')
ens = Ensemble(sync_mode=False, memory_limit='16GB', n_workers=2)

# Tell TAPE which PLAsTiCC column plays which role.
plasticc_columns = ColumnMapper(
    id_col='object_id',
    time_col='mjd',
    flux_col='flux',
    err_col='flux_err',
    band_col='passband',
)
ens.from_dask_dataframe(
    source_frame=source_table,
    object_frame=object_table,
    npartitions=None,  # alternative: npartitions=N_PARTITIONS
    column_mapper=plasticc_columns,
    sync_tables=False,
)

# Let's run some analysis!

print("Starting analysis...")
# First, let's select only Galactic objects, by cutting on hostgal_photoz.
print("First, filter by photoz")
ens = ens.query("hostgal_photoz < 1e-3", table="object")


def detection_duration(time, detected):
    """Return the time span covered by the detected observations of one object.

    Parameters
    ----------
    time : array-like
        Observation times of a single light curve.
    detected : array-like
        Per-observation flags; non-zero / True marks a detection.

    Returns
    -------
    float
        ``max - min`` of the detected times, or NaN when nothing was detected
        (``np.ptp`` would raise on an empty selection). A NaN duration fails
        the ``duration > 366`` cut below, so such objects are simply dropped.
    """
    detected_time = np.asarray(time)[np.asarray(detected, dtype=bool)]
    if detected_time.size == 0:
        return np.nan
    return np.ptp(detected_time)


# Second, let's select persistent sources, by cutting on the duration of the light curve.
print("Extract durations")
duration = ens.batch(
    detection_duration,
    ens._time_col, 'detected_bool',
    meta=('duration', "float64"),
    use_map=False,
    compute=False,
)
print("Assign a column")
# Rebind like every other Ensemble call in this cell, so the chain reads uniformly.
ens = ens.assign(table="object", duration=duration)
print("Filter by duration")
ens = ens.query("duration > 366", table="object")

# Otsu's method splits each light curve into a low-flux and a high-flux group.
# For an eclipsing binary we expect the low-flux group to be the smaller one
# while showing the larger scatter. The features come from the light-curve
# package (https://github.com/light-curve/light-curve-python).
# To keep things simple they are computed for a single band only:
# passband 3, the i band.
print("Extract Otsu features")
otsu_features = ens.batch(licu.OtsuSplit(), band_to_calc=3, use_map=False, compute=False)

print("Assign columns")
otsu_columns = ('otsu_lower_to_all_ratio', 'otsu_std_lower', 'otsu_std_upper')
ens = ens.assign(
    table="object",
    **{name: otsu_features[name] for name in otsu_columns},
)

print('Filter by Otsu features')
otsu_cut = "otsu_lower_to_all_ratio < 0.1 and otsu_std_lower > otsu_std_upper"
ens = ens.query(otsu_cut, table="object")

# The batches above were requested with compute=False; this call runs the
# pipeline and rebinds `ens` to whatever `Ensemble.compute()` returns.
ens = ens.compute()

Loading object table...
Loading source tables...
Building Ensemble...




Starting analysis...
First, filter by photoz
Extract durations
Assign a column
Filter by duration
Extract Otsu features
Assign columns
Filter by Otsu features


2023-09-28 12:19:22,088 - distributed.worker - ERROR - failed during get data with tcp://127.0.0.1:53035 -> None
Traceback (most recent call last):
  File "/Users/hombit/.virtualenvs/tape/lib/python3.11/site-packages/tornado/iostream.py", line 861, in _read_to_buffer
    bytes_read = self.read_from_fd(buf)
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/hombit/.virtualenvs/tape/lib/python3.11/site-packages/tornado/iostream.py", line 1116, in read_from_fd
    return self.socket.recv_into(buf, len(buf))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [Errno 54] Connection reset by peer

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/hombit/.virtualenvs/tape/lib/python3.11/site-packages/distributed/worker.py", line 1800, in get_data
    response = await comm.read(deserializers=serializers)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/hombit/.virtualenvs/tape/li

KeyboardInterrupt: 

### Do the same, but with bare Pandas + PyArrow and nested arrays

In [None]:
%%time
# %%memit

import light_curve as licu
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.csv as pacsv
from joblib import Parallel, delayed

# Read the data
# -------------

# The object table comes from the metadata file.
print("Loading object table...")
object_arrow = pacsv.read_csv(
    DATA_DIR / META_FILENAME,
    # A block size this large is meant to bring the whole file in as one partition
    read_options=pacsv.ReadOptions(block_size=(1<<31)-1),
)

# Wrap every Arrow column (except the id) in an Arrow-backed pandas Series,
# indexed by object_id, asking pandas not to copy the data.
object_ids = object_arrow['object_id']
object_columns = {}
for column_name in object_arrow.column_names:
    if column_name == 'object_id':
        continue
    arrow_column = object_arrow[column_name]
    object_columns[column_name] = pd.Series(
        arrow_column,
        dtype=pd.ArrowDtype(arrow_column.type),
        index=object_ids,
        copy=False,
    )
object_table = pd.DataFrame(object_columns)

# Next come the sources (individual observations).
print("Loading source tables...")

def read_source_table(filename):
    """Read one light-curve CSV from ``DATA_DIR`` into an Arrow-backed DataFrame.

    Parameters
    ----------
    filename : str
        File name relative to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        All columns except ``object_id``, stored with ``pd.ArrowDtype`` and
        indexed by the file's ``object_id`` column.
    """
    arrow_table = pacsv.read_csv(
        DATA_DIR / filename,
        # A block size this large is meant to keep each file as one partition
        read_options=pacsv.ReadOptions(block_size=(1<<31)-1),
    )
    ids = arrow_table['object_id']
    series_by_name = {}
    for name in arrow_table.column_names:
        if name == 'object_id':
            continue
        values = arrow_table[name]
        series_by_name[name] = pd.Series(
            values,
            dtype=pd.ArrowDtype(values.type),
            index=ids,
            copy=False,
        )
    return pd.DataFrame(series_by_name)

# Read every light-curve file into its own DataFrame, two files at a time.
# (Sequential equivalent: [read_source_table(f) for f in LC_FILENAMES].)
source_tables = Parallel(backend='threading', n_jobs=2)(
    delayed(read_source_table)(filename) for filename in LC_FILENAMES
)
# Keep the object_id index when concatenating: with ignore_index=True the
# result gets a fresh 0..N-1 index, which is always sorted, so any later
# "is the source table sorted by object_id?" check would pass vacuously.
source_table = pd.concat(source_tables, sort=False)


# Add sources to the object table
# -------------------------------

# First, let's do some sanity checks
print("Sanity checks...")
np.testing.assert_array_equal(
    object_table.index.values,
    np.unique(object_table.index.values),
    err_msg="Object table has duplicate indices or is not sorted.",
)

# We need an offsets array to know where each source light curve starts.
# While computing it we also record the object_id that opens every run of
# identical ids, so the nesting can be validated against the object table.
source_offsets = []
run_start_ids = []
for table in source_tables:
    ids = np.asarray(table.index)
    # Positions where the id changes; prepend/append=-1 add the first and the
    # one-past-last position (this assumes -1 is never a real object_id).
    offset = np.nonzero(np.diff(ids, prepend=-1, append=-1))[0]
    source_offsets.append(pa.array(offset))
    run_start_ids.append(ids[offset[:-1]])

# The list-arrays below are attached to the object table by position, not by
# id. They are only correct if every object owns exactly one contiguous run of
# sources and the runs come in the same order as the object table rows. Since
# the object index is sorted and unique (checked above), this also proves the
# sources are sorted by object_id, both within and across files.
np.testing.assert_array_equal(
    np.concatenate(run_start_ids),
    object_table.index.values,
    err_msg=(
        "Source tables must be sorted by object_id and contain exactly one "
        "contiguous light curve per row of the object table."
    ),
)

# Update the object table with list-arrays built from the source table

print("Updating object table with list-arrays...")
for column in source_table.columns:
    # One ListArray per file; together they form one chunked column.
    list_arrays = [
        pa.ListArray.from_arrays(offset, pa.array(table[column]))
        for table, offset in zip(source_tables, source_offsets)
    ]
    chunked_array = pa.chunked_array(list_arrays)
    object_table[column] = pd.Series(
        chunked_array,
        dtype=pd.ArrowDtype(chunked_array.type),
        index=object_table.index,
    )
    
# Do analysis
# -----------

print("Starting analysis...")
# First, let's select only Galactic objects, by cutting on hostgal_photoz.
df = object_table[object_table['hostgal_photoz'] < 1e-3]


def row_detection_duration(row):
    """Return the time span of the detected observations in one object row.

    Reads ``row['mjd']`` and ``row['detected_bool']``. Returns NaN when the
    object has no detected observation (``np.ptp`` would raise on an empty
    selection); NaN fails the ``duration > 366`` cut, so the object is dropped.
    """
    detected_mjd = row['mjd'][np.asarray(row['detected_bool'], dtype=bool)]
    if len(detected_mjd) == 0:
        return np.nan
    return np.ptp(detected_mjd)


# Second, let's select persistent sources, by cutting on the duration of the light curve.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning that
# `df['duration'] = ...` triggers on a frame obtained by boolean filtering.
df = df.assign(
    duration=df[['mjd', 'detected_bool']].apply(row_detection_duration, axis=1),
)
df = df[df['duration'] > 366]

# Next, we use Otsu's method to split light curves into two groups:
# one with high flux, and one with low flux. Eclipsing binaries should have
# lower flux group significantly smaller than the higher flux group,
# but having larger variability.
# We use light-curve package to extract these features.
# (https://github.com/light-curve/light-curve-python)
# For simplicity, we only calculate these features for the i band.
def extract_band(*arrays, bands, band_to_calc):
    """Keep only the observations taken in a single passband.

    Parameters
    ----------
    *arrays : array-like
        Per-observation arrays, each as long as ``bands``.
    bands : array-like
        Passband label of every observation.
    band_to_calc
        The passband label to keep.

    Returns
    -------
    list of numpy.ndarray
        The filtered arrays, in the order they were passed in.
    """
    in_band = np.asarray(bands) == band_to_calc
    selected = []
    for values in arrays:
        selected.append(np.asarray(values)[in_band])
    return selected

otsu_split = licu.OtsuSplit()


def otsu_features_for_row(row):
    """Compute the Otsu-split features of one object from its i-band points.

    Returns a Series labelled with ``otsu_split.names``.
    """
    # Passband 3 is the i band.
    band_arrays = extract_band(
        row['mjd'],
        row['flux'],
        row['flux_err'],
        bands=row['passband'],
        band_to_calc=3,  # i band
    )
    return pd.Series(
        otsu_split(*band_arrays, sorted=True, check=False),
        index=otsu_split.names,
    )


otsu_features = df[['mjd', 'flux', 'flux_err', 'passband']].apply(
    otsu_features_for_row,
    axis=1,
)

# Apply both cuts through ONE mask that shares `df`'s index. Filtering `df`
# twice with masks built from the unfiltered `otsu_features` makes the second
# step rely on pandas silently reindexing the mask (with a UserWarning).
otsu_mask = (
    (otsu_features['otsu_lower_to_all_ratio'] < 0.1)
    & (otsu_features['otsu_std_lower'] > otsu_features['otsu_std_upper'])
)
df = df[otsu_mask]

Loading object table...
Loading source tables...


In [7]:
df

Unnamed: 0,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,true_target,...,tflux_r,tflux_i,tflux_z,tflux_y,mjd,passband,flux,flux_err,detected_bool,duration
168,349.4295,-62.5086,1,-9.0,0.0,0.0,-9.0,0.020,0,16,...,366.0,243.5,147.8,59.8,[59750.4229 59750.4306 59750.4383 59750.445 5...,[2 1 3 4 2 1 3 4 5 2 1 3 4 5 2 1 3 4 5 2 1 3 4...,[ 8.04076000e-01 1.87697700e+00 1.80772400e+...,[ 1.653534 2.331876 2.394275 10.431469 1.26...,[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,785.8281
104007,1.3636,-46.7685,1,-9.0,0.0,0.0,-9.0,0.008,0,16,...,658.3,497.2,325.6,138.3,[59770.3662 59770.374 59770.3817 59770.3928 5...,[2 1 3 4 5 2 1 3 4 5 2 1 3 4 5 2 1 3 4 5 2 1 3...,[ 8.1388500e-01 1.0303110e+00 1.4764110e+00 ...,[ 1.07554 1.322442 1.359521 1.942241 4.68...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,713.0030
268222,52.9102,-26.2768,1,-9.0,0.0,0.0,-9.0,0.008,0,16,...,407.2,267.6,161.0,64.7,[59825.26 59825.2676 59825.2752 59825.2862 5...,[2 1 3 4 5 2 1 3 4 5 2 1 3 4 5 0 2 1 3 4 5 2 1...,[ 7.4472100e-01 -2.7603720e+00 -1.7110180e+00 ...,[1.174541 1.48612 1.978493 3.024909 7.139806 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,412.9690
1000889,106.1719,-40.0330,0,-9.0,0.0,0.0,-9.0,0.136,0,65,...,117.9,426.1,737.4,949.2,[59583.298 59583.3544 59584.0467 59589.0609 5...,[0 4 4 5 3 5 5 3 5 4 5 4 5 5 5 5 5 4 4 4 5 5 2...,[ 2.80417700e+00 -7.52903700e+00 -5.13894700e+...,[ 8.161173 13.003259 19.156862 22.240126 9.21...,[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,616.0312
1014192,98.9648,2.6867,0,-9.0,0.0,0.0,-9.0,1.849,0,92,...,3574.3,3425.8,3972.8,4227.6,[59638.0155 59653.0841 59662.0036 59682.0035 5...,[4 5 5 5 5 5 4 5 5 5 4 4 4 0 1 2 4 2 2 0 2 0 3...,[-1.78175034e+02 4.11250040e+01 4.01937256e+...,[ 169.611145 423.33371 249.60318 214....,[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,980.1724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130773164,279.8438,-29.4845,0,-9.0,0.0,0.0,-9.0,0.162,0,16,...,10962.4,9186.6,6481.9,2894.3,[59698.4292 59700.2404 59722.3742 59729.4327 5...,[4 0 4 4 3 4 1 5 4 3 2 4 4 2 5 5 0 5 1 5 2 4 4...,[-6.01033500e+00 -1.71352540e+01 2.04168870e+...,[43.856094 24.732826 17.160511 21.696514 22.08...,[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0...,595.3709
130773875,288.8086,-12.6356,0,-9.0,0.0,0.0,-9.0,0.180,0,16,...,22191.3,18744.6,13295.9,5959.6,[59696.3432 59698.3456 59698.4209 59701.3375 5...,[3 3 4 2 0 5 3 4 4 5 5 4 4 5 2 1 5 5 4 4 0 4 1...,[-3.22548462e+02 -2.54991400e+01 -4.62516440e+...,[15.321712 15.063808 37.548401 19.989159 21.80...,[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,831.7051
130775146,328.8867,-34.9539,0,-9.0,0.0,0.0,-9.0,0.019,0,16,...,4365.2,3988.1,2935.0,1347.0,[59753.2174 59769.2853 59770.2698 59771.1771 5...,[1 4 5 4 5 5 5 5 4 1 2 5 0 3 5 4 5 1 3 2 3 2 3...,[ 2.37857150e+01 4.81287120e+01 1.95647320e+...,[ 6.818478 10.075212 17.565044 13.315582 30.69...,[0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 0...,828.6954
130786299,169.6289,1.4922,0,-9.0,0.0,0.0,-9.0,0.041,0,16,...,20735.3,14952.9,9547.2,3988.9,[59610.2991 59610.3785 59611.3766 59612.3737 5...,[3 4 4 4 4 0 5 4 5 5 1 5 4 4 4 4 4 2 4 3 2 3 4...,[ 1.13636230e+01 -2.06327606e+02 1.00225400e+...,[16.489899 26.350208 23.926029 24.907831 26.94...,[0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1...,848.6973


In [None]:
import matplotlib.pyplot as plt

LSST_BANDS = 'ugrizy'

def plot(row):
    """Draw the multi-band light curve of one object in a new figure.

    Parameters
    ----------
    row : namedtuple
        One row as produced by ``DataFrame.itertuples()``. The attributes
        ``Index``, ``true_target``, ``mjd``, ``flux``, ``flux_err`` and
        ``passband`` are read.
    """
    _, ax = plt.subplots()
    ax.set_title(f"Object {row.Index}, true class {row.true_target}")
    ax.set_xlabel('MJD')
    ax.set_ylabel('Flux, zp=27.5')
    for band_idx, band_name in enumerate(LSST_BANDS):
        # The passband column stores the position of the band in LSST_BANDS.
        mjd, flux, flux_err = extract_band(
            row.mjd,
            row.flux,
            row.flux_err,
            bands=row.passband,
            band_to_calc=band_idx,
        )
        color = f'C{band_idx}'
        ax.scatter(mjd, flux, c=color, label=band_name)
        ax.errorbar(mjd, flux, yerr=flux_err, ls='none', c=color)
    # Build the legend once, after all bands are drawn, not on every iteration.
    ax.legend()

# Five random objects with true_target == 16, drawn from the full object table
true_class_16 = object_table[object_table['true_target'] == 16]
for row in true_class_16.sample(5, random_state=0).itertuples():
    plot(row)

# Five random objects from the sample selected by the cuts above
selected_sample = df.sample(5, random_state=0)
for row in selected_sample.itertuples():
    plot(row)

In [23]:
# Data-buffer address of every object's `mjd` array, via element-wise apply.
# (`.ctypes.data` is the NumPy attribute holding the pointer to an array's
# data, so each element is expected to arrive here as a NumPy array.)
object_table['mjd'].apply(lambda t: t.ctypes.data)

13           20468334592
14           20468337232
17           20468339872
23           20468342688
34           20468345328
                ...     
130787966    24097554064
130787971    24097555224
130787974    24097556344
130788053    24097557480
130788054    24097558304
Name: mjd, Length: 3492890, dtype: int64

In [21]:
# Same data-buffer addresses, this time obtained through a row-wise apply
# over the whole frame, for comparison with the column-wise result.
object_table.apply(lambda row: row['mjd'].ctypes.data, axis=1)

13           20468334592
14           20468337232
17           20468339872
23           20468342688
34           20468345328
                ...     
130787966    24097554064
130787971    24097555224
130787974    24097556344
130788053    24097557480
130788054    24097558304
Length: 3492890, dtype: int64

In [34]:
# Light-curve lengths according to the Arrow list offsets of the first chunk.
print(np.diff(pa.array(object_table['mjd']).chunks[0].offsets))
# Distance between consecutive `mjd` data pointers, divided by 8
# (presumably the byte size of one float64 value — TODO confirm the dtype),
# to compare against the lengths printed above.
object_table['mjd'].apply(lambda t: t.ctypes.data).diff() / 8

[330 330 352 ... 350 255 255]


13             NaN
14           330.0
17           330.0
23           352.0
34           330.0
             ...  
130787966    139.0
130787971    145.0
130787974    140.0
130788053    142.0
130788054    103.0
Name: mjd, Length: 3492890, dtype: float64