# Exploring The Data

Looking at the data to see how to access enough columns to make this relevant.

In [1]:
from func_adl_servicex_xaodr22 import atlas_release
from func_adl_servicex_xaodr22 import SXDSAtlasxAODR22PHYSLITE

from hist.dask import Hist
import dask_awkward as dak
from servicex import ServiceXDataset
import uproot

# Record which release string the func_adl_servicex_xaodr22 package reports.
# NOTE(review): presumably the ATLAS software release the package's types target — confirm.
print(f'Using release {atlas_release}')

Using release 22.2.107


Set up the dataset we will use for testing.

In [2]:
# Rucio dataset name of the ttbar PHYSLITE sample used throughout this notebook.
ttbar_all_rucio_dataset_name = "mc23_13p6TeV.601229.PhPy8EG_A14_ttbar_hdamp258p75_SingleLep.deriv.DAOD_PHYSLITE.e8514_s4162_r14622_p6026"
# Wrap the name in a rucio:// URI.
# NOTE(review): `?files=4` presumably caps how many files of the dataset are processed — confirm.
ttbar_all = f"rucio://{ttbar_all_rucio_dataset_name}?files=4"
ds = SXDSAtlasxAODR22PHYSLITE(ttbar_all, backend='atlasr22')

# With this flag set, `query.value()` further down yields the qastle text of the
# query (see that cell's output) instead of the event data.
ds.return_qastle = True

## ServiceX Query

Do an event-level query, so that the lists of jets, MET, etc. are all at the top level.

In [4]:
# Two-stage event-level query:
#   1. Pick out the objects needed from each event: the EventInfo object and the jet collection.
#      `e.Jets()` is called without a collection name; the rendered qastle shows it
#      resolving to 'AnalysisJets'.
#   2. Turn those objects into the output columns: event number, run number, and two
#      per-jet lists (`jet_pt`, `jet_energy_sampling`).
# NOTE(review): `j.pt()/1000` is presumably a MeV -> GeV conversion — confirm.
# TODO: The EventInfo argument should default correctly
# TODO: dataclass should be supported so as not to lose type-following!
query = (ds
         .Select(lambda e: {
             'evt': e.EventInfo("EventInfo"),
             'jet': e.Jets()
             })
         .Select(lambda ei: {
             'event_number': ei.evt.eventNumber(),
             'run_number': ei.evt.runNumber(),
             'jet_pt': ei.jet.Select(lambda j: j.pt()/1000),
             'jet_energy_sampling': ei.jet.Select(lambda j: j.getAttributeVectorFloat("EnergyPerSampling")),
         })
)

We do not have tight integration with `dask_awkward` until some extra code is working, so let's grab all the data.

In [5]:
# Render the query as qastle text. Because `ds.return_qastle` was set to True,
# `value()` gives back the query string shown below rather than the data itself;
# the data is fetched in the next cell.
# TODO: Files should remain in the S3 cache and be read directly from there
qastle_text = query.value()
qastle_text

"(call Select (call Select (call MetaData (call MetaData (call EventDataset 'bogus.root') (dict (list 'metadata_type' 'name' 'include_files' 'container_type' 'contains_collection' 'link_libraries') (list 'add_atlas_event_collection_info' 'EventInfo' (list 'xAODEventInfo/versions/EventInfo_v1.h') 'xAOD::EventInfo_v1' False (list 'xAODEventInfo')))) (dict (list 'metadata_type' 'name' 'include_files' 'container_type' 'element_type' 'contains_collection' 'link_libraries') (list 'add_atlas_event_collection_info' 'Jets' (list 'xAODJet/JetContainer.h') 'DataVector<xAOD::Jet_v1>' 'xAOD::Jet_v1' True (list 'xAODJet')))) (lambda (list e) (dict (list 'evt' 'jet') (list (call (attr e 'EventInfo') 'EventInfo') (call (attr e 'Jets') 'AnalysisJets'))))) (lambda (list ei) (dict (list 'event_number' 'run_number' 'jet_pt' 'jet_energy_sampling') (list (call (attr (attr ei 'evt') 'eventNumber')) (call (attr (attr ei 'evt') 'runNumber')) (call (attr (attr ei 'jet') 'Select') (lambda (list j) (/ (call (attr

In [6]:
# Submit the qastle text through the plain ServiceX client for the same dataset
# URI and backend as the func_adl dataset above.
ds_prime = ServiceXDataset(ttbar_all, backend_name='atlasr22')
# TODO: Why does `get_data_parquet` return no files, but `get_data_rootfiles` does?
# Download the files locally
files = ds_prime.get_data_rootfiles(qastle_text, title="First Request")
# Alternative (disabled): get URLs so the files can be opened over the internet
# instead of being downloaded.
# files = ds_prime.get_data_rootfiles_uri(qastle_text, title="First Request")
# files = [f.url for f in files]

files

First Request:   0%|          | 0/9000000000.0 [00:00]

        First Request Downloaded:   0%|          | 0/9000000000.0 [00:00]

[WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/2ab284a4-01ad-466a-ae21-13f02aa70f33/e603ae32a8ae0b03a73b04211b9d7cb4-TE.37223155._000309.pool.root.1'),
 WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/2ab284a4-01ad-466a-ae21-13f02aa70f33/37845c405fe1b9e4c6ac080e5328d900-TE.37223155._000289.pool.root.1'),
 WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/2ab284a4-01ad-466a-ae21-13f02aa70f33/84ff6ddd812ac4c37755811d3ea96436-TE.37223155._000310.pool.root.1')]

In [None]:
# Open the 'atlas_xaod_tree' tree of every downloaded file as a single lazy,
# dask-backed array; columns are accessed below as attributes (e.g. `data.jet_pt`).
data = uproot.dask({
    f: 'atlas_xaod_tree'
    for f in files
})

## Plots

Next, let's make plots of everything.

In [None]:
# Event-number histogram: 20 regular bins from 0 to 100,000,000 with integer-count storage.
# NOTE(review): the upper edge is a hard-coded guess at the event-number range —
# confirm it actually covers this sample.
h = (
    Hist.new.Reg(20, 0, 100000000, name="x", label="Event number")
    .Int64()
)
# `Hist` is the hist.dask variant, so this fill is lazy; nothing is read until
# `r1.compute()` is called.
r1 = h.fill(data.event_number)

In [None]:
# Jet pT spectrum: 20 regular bins from 0 to 200 with integer-count storage.
h = Hist.new.Reg(20, 0, 200, name="x", label="Jet $p_T$").Int64()
# `jet_pt` holds one list of jets per event, so flatten it before filling.
r2 = h.fill(dak.flatten(data.jet_pt))

In [None]:
# Run the deferred fill of the event-number histogram and display the result.
r1.compute()

In [None]:
# Run the deferred fill of the jet pT histogram and display the result.
r2.compute()