In [1]:
import dask
import lsdb
import matplotlib.pyplot as plt
import numpy as np
from tape import Ensemble, ColumnMapper
# Point dask's temporary directory at a dedicated scratch location
dask.config.set({'temporary_directory': '/data/epyc/users/brantd/tmp'})

from dask.distributed import Client

# Local cluster sizing: 10 workers with 10 threads each, and a 60G
# memory limit passed through to the workers
cluster_options = dict(
    n_workers=10,
    threads_per_worker=10,
    memory_limit="60G",
)
client = Client(**cluster_options)

# Display the client summary (dashboard link, workers, memory)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 10
Total threads: 100,Total memory: 558.79 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33658,Workers: 10
Dashboard: http://127.0.0.1:8787/status,Total threads: 100
Started: Just now,Total memory: 558.79 GiB

0,1
Comm: tcp://127.0.0.1:45180,Total threads: 10
Dashboard: http://127.0.0.1:33583/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:40061,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-gk3ieb2c,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-gk3ieb2c

0,1
Comm: tcp://127.0.0.1:44949,Total threads: 10
Dashboard: http://127.0.0.1:42335/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:41537,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-yruv_bgu,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-yruv_bgu

0,1
Comm: tcp://127.0.0.1:32834,Total threads: 10
Dashboard: http://127.0.0.1:42354/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:39713,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-mi7nl_xv,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-mi7nl_xv

0,1
Comm: tcp://127.0.0.1:42099,Total threads: 10
Dashboard: http://127.0.0.1:43797/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:42518,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-rqth5dl_,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-rqth5dl_

0,1
Comm: tcp://127.0.0.1:34255,Total threads: 10
Dashboard: http://127.0.0.1:44103/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:46879,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-8i481u7n,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-8i481u7n

0,1
Comm: tcp://127.0.0.1:36649,Total threads: 10
Dashboard: http://127.0.0.1:45083/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:45928,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-0q0u2qf4,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-0q0u2qf4

0,1
Comm: tcp://127.0.0.1:45113,Total threads: 10
Dashboard: http://127.0.0.1:43574/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:44996,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-0ct50nh3,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-0ct50nh3

0,1
Comm: tcp://127.0.0.1:36769,Total threads: 10
Dashboard: http://127.0.0.1:39735/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:39417,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-gl36bqgg,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-gl36bqgg

0,1
Comm: tcp://127.0.0.1:40673,Total threads: 10
Dashboard: http://127.0.0.1:35465/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:46689,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-wba7od3_,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-wba7od3_

0,1
Comm: tcp://127.0.0.1:34985,Total threads: 10
Dashboard: http://127.0.0.1:32800/status,Memory: 55.88 GiB
Nanny: tcp://127.0.0.1:41126,
Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-m_p0k934,Local directory: /data/epyc/users/brantd/tmp/dask-scratch-space/worker-m_p0k934


# Generating a custom-sized subset of ZTF for TAPE Analysis

In [2]:
# Filesystem locations of the ZTF AXS catalogs used below:
# the source catalog and the object catalog.
ztf_source_path = "/astro/store/epyc3/data3/hipscat/catalogs/ztf_axs/ztf_source/"
ztf_object_path = "/astro/store/epyc3/data3/hipscat/catalogs/ztf_axs/ztf_dr14"

In [3]:
# Load and subsample using LSDB

# Columns to read from each catalog. Both lists are passed to the readers
# below so that only the listed columns are requested.
object_cols = ["ps1_objid", "ra", "dec"]
source_cols = ["ps1_objid", "ra", "dec", "mjd", "mag", "magerr", "band"]

# Load into LSDB catalog objects, restricted to the columns listed above
ztf_object = lsdb.read_hipscat(ztf_object_path, columns=object_cols)  # ZTF Object
ztf_source = lsdb.read_hipscat(ztf_source_path, columns=source_cols)  # ZTF Source

# Box search to filter both catalogs down to the same small subset
ra = (340, 342)
dec = (10, 12)
ztf_object = ztf_object.box(ra=ra, dec=dec)
ztf_source = ztf_source.box(ra=ra, dec=dec)

# Join Source to Object to set proper object-level _hipscat_index.
# Object-side columns take the "_object" suffix; source columns keep their names.
joined_source = ztf_object.join(
    ztf_source, left_on="ps1_objid", right_on="ps1_objid", suffixes=("_object", "")
)

In [4]:
# Load into TAPE

# Which table column plays each timeseries role for the ColumnMapper
timeseries_columns = {
    'id_col': '_hipscat_index',
    'time_col': 'mjd',
    'flux_col': 'mag',
    'err_col': 'magerr',
    'band_col': 'band',
}
colmap = ColumnMapper(**timeseries_columns)

# Create the Ensemble on the existing dask client, then populate it from the
# joined source catalog and the object catalog. The call is left as the final
# expression so its return value is displayed by the cell.
ens = Ensemble(client=client)
ens.from_lsdb(joined_source, ztf_object, column_mapper=colmap)

<tape.ensemble.Ensemble at 0x7fe5913a8ac0>

In [5]:
# Report the partition counts of the object table and the source table
n_object_partitions = ens.object.npartitions
n_source_partitions = ens.source.npartitions
print(n_object_partitions, n_source_partitions)

3 11


In [9]:
# Length of the ensemble's object table
n_objects = len(ens.object)
n_objects

75944

In [6]:
# Save to parquet for easy loading
# (numpy is imported as np in the notebook's import cell.)

# Cast the source table's index to uint64 before saving
# (need this until lsdb 0.2)
ens.source.index = ens.source.index.astype(np.uint64)

# Write the ensemble into the current directory under "ztf_small_ensemble"
ens.save_ensemble(".", "ztf_small_ensemble")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Saved to ./ztf_small_ensemble
