This notebook runs a simple TAPE analysis with the `ztf_axs` dataset on the Bridges2 cluster of PSC. You can run it by starting a Jupyter Lab session on https://ondemand.bridges2.psc.edu with the default parameters (single RM node, time limit of up to 24 hours).

**Note**: you cannot run it on a small node (such as RM-small), because, while that node doesn't host Dask workers itself, it still requires a large amount of RAM for something.

In [1]:
import numpy as np
import time

from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from tape import Ensemble, ColumnMapper

In [2]:
# Describe a single SLURM job; every job submitted by the cluster uses
# these settings.
cluster = SLURMCluster(
    # Number of Dask workers per node
    processes=1,
    # Regular memory node type on PSC bridges2
    queue="RM",
    # dask_jobqueue requires cores and memory to be specified
    # We set them to match RM specs
    cores=128,
    memory="256GB",
    walltime="12:00:00",
)
# Once adaptive scaling is enabled it owns the job count, and its default
# lower bound is zero, so a plain `cluster.scale(jobs=10)` issued before
# `adapt` would be overridden. State the floor explicitly instead: keep at
# least 10 jobs running and allow growth up to 100 under load.
cluster.adapt(minimum_jobs=10, maximum_jobs=100)

# Connect this notebook to the cluster's scheduler
client = Client(cluster)
# Last expression of the cell: displays the client summary in the notebook
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.8.10.12:8787/status,

0,1
Dashboard: http://10.8.10.12:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.8.10.12:41693,Workers: 0
Dashboard: http://10.8.10.12:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [3]:
# Print the ssh port-forwarding command that exposes the Dask dashboard
# on the user's own machine

import socket
from getpass import getuser
from urllib.parse import urlparse

local_addr = '127.0.0.1:8787'
remote_host = 'bridges2.psc.edu'

# No data is written to this UDP socket: connecting it is only used to find
# out which local address is selected for outgoing traffic.
probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
with probe:
    probe.connect(('1.1.1.1', 53))
    ip = probe.getsockname()[0]

username = getuser()
dashboard_port = urlparse(client.dashboard_link).port

# Assemble the two user-facing pieces first, then print the instructions
tunnel_command = f'ssh -N -L {local_addr}:{ip}:{dashboard_port} {username}@{remote_host}'
dashboard_url = f'http://{local_addr}/'

print(f'''
Copy-paste and run in your terminal:

{tunnel_command}

And open this URL in your browser to see the dashboard:
{dashboard_url}
''')


Copy-paste and run in your terminal:

ssh -N -L 127.0.0.1:8787:10.8.10.12:8787 malanche@bridges2.psc.edu

And open this URL in your browser to see the dashboard:
http://127.0.0.1:8787/



In [4]:
# Reference point for the "Time spent" value written to the result file by
# a later cell; a monotonic clock is used, so only differences are meaningful.
start_time = time.monotonic()

In [5]:
%%time

# Root directory holding both the object and the source catalogs
root_dir = '/ocean/projects/phy210048p/shared/hipscat/catalogs/ztf_axs'

ens = Ensemble(client=client)

# Map catalog column names onto the roles TAPE expects; note that the
# magnitude columns are registered in the flux / flux-error slots.
ztf_columns = ColumnMapper(
    id_col='ps1_objid',
    time_col='mjd',
    band_col='band',
    flux_col='mag',
    err_col='magerr',
)

# Last expression of the cell, so the returned value is displayed
ens.from_hipscat(
    dir=root_dir,
    source_subdir='ztf_source',
    object_subdir='ztf_dr14',
    sync_tables=False,
    column_mapper=ztf_columns,
)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


CPU times: user 23.6 s, sys: 21.2 s, total: 44.8 s
Wall time: 15min 26s




<tape.ensemble.Ensemble at 0x14e89596c5e0>

In [6]:
%%time 

# Selection pipeline: filter the source table, then successively cut the
# object table on observation count, time span and reduced chi^2, and
# finally count the surviving objects.

# Keep only r-band source rows whose `catflags` value is 0
ens.query('band == "r" and catflags == 0', table='source')

# Count observations per object (all bands together) and store the result
# persistently; the following query reads it as `ngoodobs_total`
ens.calc_nobs(by_band=False, temporary=False, label="ngoodobs")
ens.query('ngoodobs_total >= 100', table='object')

# Peak-to-peak range (max - min) of the time column per light curve
# (presumably in days, assuming the time column is MJD — confirm)
duration = ens.batch(np.ptp, ens._time_col, schema={'duration': float})
ens.assign(table='object', duration=duration, temporary=False)
ens.query('duration >= 50.0', table='object')

# NOTE(review): `licu` is not imported anywhere in the visible notebook —
# presumably `import light_curve as licu`; confirm and add the import.
reduced_chi2_extractor = licu.ReducedChi2()
# NOTE(review): the ColumnMapper used when loading the ensemble registers the
# magnitude column as `flux_col`; confirm that `ens._mag_col` exists (as
# opposed to `ens._flux_col`).
rchi2 = ens.batch(reduced_chi2_extractor, ens._time_col, ens._mag_col, ens._err_col,
                  fill_value=-1.0, check=False, sorted=True,
                  schema={'rchi2': float})
ens.assign(table='object', rchi2=rchi2, temporary=False)
ens.query('rchi2 >= 10.0', table='object')

# `.compute()` triggers the actual evaluation; `size` is the number of rows
# left in the object table and is read by the result-writing cell
size = ens._object.shape[0].compute()

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
2023-11-09 15:47:42,330 - distributed.protocol.core - CRITICAL - Failed to Serialize
Traceback (most recent call last):
  File "/jet/home/malanche/.virtualenvs/tape/lib/python3.9/site-packages/distributed/protocol/core.py", line 109, in dumps
    frames[0] = msgpack.dumps(msg, default=_encode_default, use_bin_type=True)
  File "/jet/home/malanche/.virtualenvs/tape/lib/python3.9/site-packages/msgpack/__init__.py", line 36, in packb
    return Packer(**kwargs).pack(o)
  File "msgpack/_packer.pyx", line 294, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 300, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 297, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 264, in msgpack._cmsgpack.Packer._pack
  File "msgpack/_packer.pyx", line 231, in msgpack._cmsgpack

CancelledError: ('query-index-dc155bc4e0b5b6ec2fe2e05d43c28f57', 1006)

In [7]:
import os
from pathlib import Path

# Results are appended to a text file in the user's home directory, so
# repeated runs accumulate in the same file
path = Path(os.environ['HOME']).joinpath('result.txt')
with path.open('a') as report:
    # Two lines per run: surviving object count and seconds elapsed since
    # `start_time` was recorded
    report.write(
        f'Final object table size: {size}\n'
        f'Time spent: {time.monotonic() - start_time}\n'
    )

NameError: name 'size' is not defined

In [9]:
# Disconnect the notebook's client first so it is not left attached to a
# scheduler that is going away, then shut the cluster down.
client.close()
cluster.close()

2023-11-09 16:00:02,369 - distributed.deploy.adaptive_core - INFO - Adaptive stop


In [None]:
# Scratch cell: a bare literal with no side effects
1

In [8]:
# Scratch cell: a bare literal with no side effects (its value is echoed as the cell output)
1

1