In [1]:
import json
import hist

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

#### Using Skyhook with PyArrow

In [2]:
# Build the Skyhook-backed file format first, then open the dataset with it.
# NOTE(review): the second argument reads like a shell command ("ls ...") rather
# than a plain path to ceph.conf — confirm this is what SkyhookFileFormat expects.
skyhook_format = ds.SkyhookFileFormat("parquet", "ls /opt/ceph/ceph.conf", "cephfs-data0")
dataset = ds.dataset("file:///mnt/cephfs/nyc", format=skyhook_format)

# Read only the two amount columns, keeping rows whose trip_distance exceeds 20.0,
# and show the result as a pandas DataFrame.
long_trip_filter = ds.field("trip_distance") > 20.0
dataset.to_table(
    columns=["total_amount", "fare_amount"],
    filter=long_trip_filter,
).to_pandas()

Unnamed: 0,total_amount,fare_amount
0,75.84,52.00
1,69.99,52.00
2,59.84,53.00
3,68.50,53.50
4,70.01,52.00
...,...,...
376,78.88,67.00
377,64.84,58.50
378,0.31,0.01
379,58.80,57.50


#### Load ROOT file URIs

In [3]:
# Parse the ntuple catalogue from disk.
with open('ntuples.json', 'r') as f:
    data = json.load(f)

# Collect the 'path' entry of every file listed under data -> nominal -> files.
uris = [entry['path'] for entry in data['data']['nominal']['files']]

# Preview the first five URIs.
uris[0:5]

['https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/Run2015D/SingleMuon/MINIAOD/16Dec2015-v1/10000/00006301-CAA8-E511-AD39-549F35AD8BC9.root',
 'https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/Run2015D/SingleMuon/MINIAOD/16Dec2015-v1/10000/0034202D-A3A8-E511-BA9C-00259073E3DA.root',
 'https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/Run2015D/SingleMuon/MINIAOD/16Dec2015-v1/10000/0043758E-ECA8-E511-B849-002618FDA287.root',
 'https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/Run2015D/SingleMuon/MINIAOD/16Dec2015-v1/10000/004C08BC-C8A8-E511-943C-00266CFAE6E0.root',
 'https://xrootd-local.unl.edu:1094//store/user/AGC/datasets/Run2015D/SingleMuon/MINIAOD/16Dec2015-v1/10000/005416D9-E0A8-E511-8AA1-0CC47A4C8E46.root']

In [4]:
# Disabled ServiceX example, kept for reference only; nothing in this cell runs.
# NOTE(review): if this is re-enabled, the `ds = ...` assignment below would
# rebind `ds`, which the imports cell uses as the alias for pyarrow.dataset —
# rename that variable before uncommenting.
# import servicex as sx
# from func_adl_servicex import ServiceXSourceUpROOT
# dataset_name = uris[0:10]
# sx_dataset = sx.ServiceXDataset(dataset_name, "uproot", result_destination="volume")
# ds = ServiceXSourceUpROOT(sx_dataset, "Events")
# missing_ET = ds.Select(lambda event: {'met': event.met_pt}).AsAwkwardArray().value()

#### Initialize Dask client

In [5]:
import os

from dask.distributed import Client

# The literal address below only matches one particular running scheduler, so a
# fresh session would fail to connect. Allow it to be overridden through the
# DASK_SCHEDULER_ADDRESS environment variable; the original address is the default,
# so behaviour is unchanged when the variable is not set.
scheduler_address = os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://127.0.0.1:42565")
client = Client(scheduler_address)

# Display the client summary (scheduler and worker info) as the cell output.
client


+---------+------------------------+-----------+------------------------+
| Package | client                 | scheduler | workers                |
+---------+------------------------+-----------+------------------------+
| dask    | 2022.04.0+7.gdd15a6aca | 2022.04.0 | 2022.04.0+7.gdd15a6aca |
+---------+------------------------+-----------+------------------------+


0,1
Connection method: Direct,
Dashboard: /user/jayjeetc@ucsc.edu/proxy/33611/status,

0,1
Comm: tcp://127.0.0.1:42565,Workers: 4
Dashboard: /user/jayjeetc@ucsc.edu/proxy/33611/status,Total threads: 16
Started: 40 minutes ago,Total memory: 62.80 GiB

0,1
Comm: tcp://127.0.0.1:37405,Total threads: 4
Dashboard: /user/jayjeetc@ucsc.edu/proxy/46533/status,Memory: 15.70 GiB
Nanny: tcp://127.0.0.1:46043,
Local directory: /home/cms-jovyan/dask-worker-space/worker-2_5cbl2g,Local directory: /home/cms-jovyan/dask-worker-space/worker-2_5cbl2g
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 4.0%,Last seen: Just now
Memory usage: 551.09 MiB,Spilled bytes: 0 B
Read bytes: 108.40 kiB,Write bytes: 110.20 kiB

0,1
Comm: tcp://127.0.0.1:40699,Total threads: 4
Dashboard: /user/jayjeetc@ucsc.edu/proxy/45515/status,Memory: 15.70 GiB
Nanny: tcp://127.0.0.1:34269,
Local directory: /home/cms-jovyan/dask-worker-space/worker-wkb22twk,Local directory: /home/cms-jovyan/dask-worker-space/worker-wkb22twk
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 490.87 MiB,Spilled bytes: 0 B
Read bytes: 80.13 kiB,Write bytes: 76.74 kiB

0,1
Comm: tcp://127.0.0.1:43453,Total threads: 4
Dashboard: /user/jayjeetc@ucsc.edu/proxy/44201/status,Memory: 15.70 GiB
Nanny: tcp://127.0.0.1:32823,
Local directory: /home/cms-jovyan/dask-worker-space/worker-ckg018wi,Local directory: /home/cms-jovyan/dask-worker-space/worker-ckg018wi
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 489.54 MiB,Spilled bytes: 0 B
Read bytes: 79.79 kiB,Write bytes: 77.22 kiB

0,1
Comm: tcp://127.0.0.1:34945,Total threads: 4
Dashboard: /user/jayjeetc@ucsc.edu/proxy/37713/status,Memory: 15.70 GiB
Nanny: tcp://127.0.0.1:33325,
Local directory: /home/cms-jovyan/dask-worker-space/worker-4e7566wf,Local directory: /home/cms-jovyan/dask-worker-space/worker-4e7566wf
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 499.92 MiB,Spilled bytes: 0 B
Read bytes: 81.33 kiB,Write bytes: 77.96 kiB


#### Offloading Dask compute using Skyhook

In [6]:
import dask.dataframe as dd

In [7]:
# Define a Dask dataframe over the Parquet data, keeping only rows with met_pt > 450.
# NOTE(review): `format` is passed through as given — confirm how this
# dask build interprets it.
df = dd.read_parquet(
    "/mnt/cephfs/AGC/AGC_copied",
    filters=[('met_pt', '>', 450)],
    format="parquet",
)

In [8]:
# Run the filtered read defined in the previous cell and display the resulting frame.
df.compute()

Unnamed: 0,met_pt
0,460.105164
0,460.105164
0,460.105164
0,460.105164
0,460.105164
...,...
0,460.105164
0,460.105164
0,460.105164
0,460.105164


#### ADL Benchmark Query

In [9]:
from coffea import processor
from coffea.nanoevents import schemas

class Q1Processor(processor.ProcessorABC):
    """Coffea processor that histograms the ``met_pt`` field of the events it receives."""

    def process(self, events):
        """Build and fill a one-dimensional histogram from ``events.met_pt``.

        Args:
            events: the chunk of events handed over by the runner; only its
                ``met_pt`` attribute is read here.

        Returns:
            The value returned by ``fill`` on a freshly created histogram with a
            single regular axis named "met" (``Reg(100, 0, 200)``) and
            ``Double`` storage.
        """
        # A new histogram is created on every call, so nothing is shared between chunks.
        met_hist = hist.Hist.new.Reg(
            100, 0, 200, name="met", label="$E_{T}^{miss}$ [GeV]"
        ).Double()
        return met_hist.fill(events.met_pt)

    def postprocess(self, accumulator):
        """Return the accumulated result unchanged; no post-processing is applied."""
        return accumulator

#### Computation on Dask Workers

In [10]:
# One dataset label mapped to the location of its data on CephFS.
fileset = {"SingleMu": "/mnt/cephfs/AGC/AGC_copied"}

# Runner that submits work through the already-connected Dask client.
# use_skyhook=False: per this section's heading, the computation happens on the Dask workers.
dask_executor = processor.DaskExecutor(client=client)
run = processor.Runner(
    executor=dask_executor,
    schema=schemas.BaseSchema,
    use_skyhook=False,
    format="parquet",
)

# Process the "Events" tree with a fresh Q1Processor and display the result.
output = run(fileset, "Events", processor_instance=Q1Processor())
output

[########################################] | 100% Completed | 19.7s

#### Computation inside Skyhook

In [11]:
# One dataset label mapped to the location of its data on CephFS.
fileset = {"SingleMu": "/mnt/cephfs/AGC/AGC_copied"}

# Same runner configuration as the Dask-worker run, except use_skyhook=True:
# per this section's heading, the computation happens inside Skyhook.
dask_executor = processor.DaskExecutor(client=client)
run = processor.Runner(
    executor=dask_executor,
    schema=schemas.BaseSchema,
    use_skyhook=True,
    format="parquet",
)

# Process the "Events" tree with a fresh Q1Processor and display the result.
output = run(fileset, "Events", processor_instance=Q1Processor())
output

[########################################] | 100% Completed |  1min 21.8s