# HEP Demo

Here we show a plausible small workflow on a real excerpt of particle data.

In [1]:
import akimbo.pandas
import pandas as pd
import awkward as ak
import dask_awkward as dak

In [2]:
# Open every parquet file matching the glob on S3 (anonymous access) as a
# dask-awkward collection, then materialize it in memory with .compute().
parquet_glob = "s3://ddavistemp/hpq/*.parquet"
ds = dak.from_parquet(parquet_glob, storage_options={"anon": True}).compute()

In [3]:
# Preview the records, capping the printout at 10 rows and a width of 140.
ds.show(limit_cols=140, limit_rows=10)

[{run: 1, luminosityBlock: 13, event: 1201, MET: {pt: 19.5, phi: 3.1}, muons: [], gen: [{pt: 60.4, ...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1202, MET: {pt: 20.4, phi: -2.18}, muons: [{pt: 18.6, ...}, ...], gen: [{...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1203, MET: {pt: 28.8, phi: 2.62}, muons: [], gen: [{pt: 40.6, ...}]},
 {run: 1, luminosityBlock: 13, event: 1204, MET: {pt: 4.42, phi: -0.206}, muons: [{pt: 26.7, ...}, ...], gen: [{...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1205, MET: {pt: 5.86, phi: 2.47}, muons: [{pt: 7.62, eta: ..., ...}], gen: [...]},
 ...,
 {run: 1, luminosityBlock: 2801, event: 80097, MET: {pt: 17.6, phi: 2.15}, muons: [], gen: [{pt: 24, ...}, ...]},
 {run: 1, luminosityBlock: 2801, event: 80098, MET: {pt: 15.5, phi: 0.718}, muons: [{pt: 18.6, ...}, ...], gen: [...]},
 {run: 1, luminosityBlock: 2801, event: 80099, MET: {pt: 16.2, phi: -0.555}, muons: [{pt: 26.1, ...}, ...], gen: [...]},
 {run: 1, luminosityBlock: 2801, event: 80100, ME

In [4]:
# awkward -> Arrow -> pandas. Using Arrow-backed dtypes keeps the whole nested
# record in a single struct-typed Series.
arrow_events = ak.to_arrow(ds, extensionarray=False)
s = arrow_events.to_pandas(types_mapper=pd.ArrowDtype)

![](muons_dataset1.svg)

In [5]:
# Display the Series: one struct-typed record per row.
s

0         {'run': 1, 'luminosityBlock': 13, 'event': 120...
1         {'run': 1, 'luminosityBlock': 13, 'event': 120...
2         {'run': 1, 'luminosityBlock': 13, 'event': 120...
3         {'run': 1, 'luminosityBlock': 13, 'event': 120...
4         {'run': 1, 'luminosityBlock': 13, 'event': 120...
                                ...                        
299678    {'run': 1, 'luminosityBlock': 2801, 'event': 8...
299679    {'run': 1, 'luminosityBlock': 2801, 'event': 8...
299680    {'run': 1, 'luminosityBlock': 2801, 'event': 8...
299681    {'run': 1, 'luminosityBlock': 2801, 'event': 8...
299682    {'run': 1, 'luminosityBlock': 2801, 'event': 8...
Length: 299683, dtype: struct<run: int64 not null, luminosityBlock: int64 not null, event: int64 not null, MET: struct<pt: double not null, phi: double not null> not null, muons: large_list<item: struct<pt: double not null, eta: double not null, phi: double not null, mass: double not null, charge: int64 not null> not null> not null, gen: 

We can "unroll" the Series into a DataFrame with one column per top-level field of the record: primitive fields (e.g. ints or floats) become ordinary columns, while nested fields stay nested. We do this with the awkward accessor (`.ak` on the Series). In the accessor we have an `unpack` method:

In [6]:
# Split the struct-typed Series into a DataFrame via the `.ak` accessor.
df = s.ak.unpack()

![](muons_dataset_df.svg)

In [7]:
# Display the unpacked DataFrame.
df

Unnamed: 0,run,luminosityBlock,event,MET,muons,gen
0,1,13,1201,"{'pt': 19.49629020690918, 'phi': 3.09666585922...",[],"[{'pt': 60.43461608886719, 'eta': -0.782095849..."
1,1,13,1202,"{'pt': 20.397918701171875, 'phi': -2.180577278...","[{'pt': 18.583789825439453, 'eta': -0.17873963...","[{'pt': 18.733409881591797, 'eta': -0.17861033..."
2,1,13,1203,"{'pt': 28.81757164001465, 'phi': 2.61683297157...",[],"[{'pt': 40.565895080566406, 'eta': -0.33271655..."
3,1,13,1204,"{'pt': 4.415469169616699, 'phi': -0.2062562108...","[{'pt': 26.678863525390625, 'eta': -1.23002457...","[{'pt': 26.755929946899414, 'eta': -1.23014056..."
4,1,13,1205,"{'pt': 5.85665225982666, 'phi': 2.472323179244...","[{'pt': 7.621268272399902, 'eta': 2.1535851955...","[{'pt': 7.496843338012695, 'eta': 2.1539559364..."
...,...,...,...,...,...,...
299678,1,2801,80096,"{'pt': 13.942445755004883, 'phi': -0.285923928...","[{'pt': 5.740289211273193, 'eta': -1.979136943...","[{'pt': 5.610562324523926, 'eta': -1.979978322..."
299679,1,2801,80097,"{'pt': 17.55270004272461, 'phi': 2.15474414825...",[],"[{'pt': 24.036447525024414, 'eta': 0.494034796..."
299680,1,2801,80098,"{'pt': 15.480612754821777, 'phi': 0.7176428437...","[{'pt': 18.630128860473633, 'eta': 0.646761536...","[{'pt': 18.798992156982422, 'eta': 0.646190226..."
299681,1,2801,80099,"{'pt': 16.163414001464844, 'phi': -0.555234909...","[{'pt': 26.122941970825195, 'eta': -1.98191392...","[{'pt': 25.94921112060547, 'eta': -1.981276035..."


In [8]:
# A primitive field is now a plain Arrow-backed integer column.
df["luminosityBlock"]

0           13
1           13
2           13
3           13
4           13
          ... 
299678    2801
299679    2801
299680    2801
299681    2801
299682    2801
Name: luminosityBlock, Length: 299683, dtype: int64[pyarrow]

In [9]:
# Pull out the nested "muons" column as its own Series.
muons_series = df["muons"]

In [10]:
# Display the muons Series: each row holds a list of muon records.
muons_series

0                                                        []
1         [{'pt': 18.583789825439453, 'eta': -0.17873963...
2                                                        []
3         [{'pt': 26.678863525390625, 'eta': -1.23002457...
4         [{'pt': 7.621268272399902, 'eta': 2.1535851955...
                                ...                        
299678    [{'pt': 5.740289211273193, 'eta': -1.979136943...
299679                                                   []
299680    [{'pt': 18.630128860473633, 'eta': 0.646761536...
299681    [{'pt': 26.122941970825195, 'eta': -1.98191392...
299682    [{'pt': 30.407604217529297, 'eta': 1.319252133...
Name: muons, Length: 299683, dtype: large_list<item: struct<pt: double not null, eta: double not null, phi: double not null, mass: double not null, charge: int64 not null> not null>[pyarrow]

In [11]:
%%timeit
# Baseline: convert to Python lists and count the muons per row in a Python loop.
[len(x) for x in muons_series.tolist()]

2.38 s ± 7.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


With akimbo, the `.ak` accessor gives us access to functions from the awkward-array API:

In [12]:
%%timeit
# Same per-row count, computed with awkward's `num` through the .ak accessor.
muons_series.ak.num(axis=1)

922 μs ± 32 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Big performance improvement with compiled awkward code over Python iteration!

In [13]:
# Sanity check: the pure-Python counts and the akimbo counts must be identical.
python_counts = [len(muons) for muons in muons_series.tolist()]
akimbo_counts = muons_series.ak.num(axis=1).tolist()
python_counts == akimbo_counts

True

In [14]:
# Number of muons in each row (length of each list along axis 1).
n_muons = muons_series.ak.num(axis=1)

And we can use the results with other pandas analysis workflows...

In [15]:
# Display the per-row muon counts.
n_muons

0         0
1         2
2         0
3         3
4         1
         ..
299678    2
299679    0
299680    2
299681    3
299682    4
Length: 299683, dtype: int64[pyarrow]

In [16]:
# Attach the muon counts as a new column. `assign` returns a new DataFrame
# rather than mutating the frame displayed by earlier cells in place.
df = df.assign(n_muons=n_muons)

In [17]:
# Display the DataFrame, now including the n_muons column.
df

Unnamed: 0,run,luminosityBlock,event,MET,muons,gen,n_muons
0,1,13,1201,"{'pt': 19.49629020690918, 'phi': 3.09666585922...",[],"[{'pt': 60.43461608886719, 'eta': -0.782095849...",0
1,1,13,1202,"{'pt': 20.397918701171875, 'phi': -2.180577278...","[{'pt': 18.583789825439453, 'eta': -0.17873963...","[{'pt': 18.733409881591797, 'eta': -0.17861033...",2
2,1,13,1203,"{'pt': 28.81757164001465, 'phi': 2.61683297157...",[],"[{'pt': 40.565895080566406, 'eta': -0.33271655...",0
3,1,13,1204,"{'pt': 4.415469169616699, 'phi': -0.2062562108...","[{'pt': 26.678863525390625, 'eta': -1.23002457...","[{'pt': 26.755929946899414, 'eta': -1.23014056...",3
4,1,13,1205,"{'pt': 5.85665225982666, 'phi': 2.472323179244...","[{'pt': 7.621268272399902, 'eta': 2.1535851955...","[{'pt': 7.496843338012695, 'eta': 2.1539559364...",1
...,...,...,...,...,...,...,...
299678,1,2801,80096,"{'pt': 13.942445755004883, 'phi': -0.285923928...","[{'pt': 5.740289211273193, 'eta': -1.979136943...","[{'pt': 5.610562324523926, 'eta': -1.979978322...",2
299679,1,2801,80097,"{'pt': 17.55270004272461, 'phi': 2.15474414825...",[],"[{'pt': 24.036447525024414, 'eta': 0.494034796...",0
299680,1,2801,80098,"{'pt': 15.480612754821777, 'phi': 0.7176428437...","[{'pt': 18.630128860473633, 'eta': 0.646761536...","[{'pt': 18.798992156982422, 'eta': 0.646190226...",2
299681,1,2801,80099,"{'pt': 16.163414001464844, 'phi': -0.555234909...","[{'pt': 26.122941970825195, 'eta': -1.98191392...","[{'pt': 25.94921112060547, 'eta': -1.981276035...",3


In [18]:
# Preview the rows with at least two muons (df itself is not changed here).
df.query("n_muons >= 2")

Unnamed: 0,run,luminosityBlock,event,MET,muons,gen,n_muons
1,1,13,1202,"{'pt': 20.397918701171875, 'phi': -2.180577278...","[{'pt': 18.583789825439453, 'eta': -0.17873963...","[{'pt': 18.733409881591797, 'eta': -0.17861033...",2
3,1,13,1204,"{'pt': 4.415469169616699, 'phi': -0.2062562108...","[{'pt': 26.678863525390625, 'eta': -1.23002457...","[{'pt': 26.755929946899414, 'eta': -1.23014056...",3
9,1,13,1210,"{'pt': 23.15256118774414, 'phi': -1.3448100090...","[{'pt': 8.245718955993652, 'eta': 0.4933051466...","[{'pt': 8.270858764648438, 'eta': 0.4933516085...",2
10,1,13,1211,"{'pt': 8.993561744689941, 'phi': -0.6995754837...","[{'pt': 40.31018829345703, 'eta': -1.591983556...","[{'pt': 40.29347610473633, 'eta': -1.591879606...",2
11,1,13,1212,"{'pt': 11.699790000915527, 'phi': -2.898928403...","[{'pt': 41.209922790527344, 'eta': -1.75342404...","[{'pt': 40.541831970214844, 'eta': -1.75308358...",3
...,...,...,...,...,...,...,...
299677,1,2801,80095,"{'pt': 6.506622791290283, 'phi': -2.8128659725...","[{'pt': 26.82998275756836, 'eta': -1.692675352...","[{'pt': 23.990140914916992, 'eta': -1.70475649...",5
299678,1,2801,80096,"{'pt': 13.942445755004883, 'phi': -0.285923928...","[{'pt': 5.740289211273193, 'eta': -1.979136943...","[{'pt': 5.610562324523926, 'eta': -1.979978322...",2
299680,1,2801,80098,"{'pt': 15.480612754821777, 'phi': 0.7176428437...","[{'pt': 18.630128860473633, 'eta': 0.646761536...","[{'pt': 18.798992156982422, 'eta': 0.646190226...",2
299681,1,2801,80099,"{'pt': 16.163414001464844, 'phi': -0.555234909...","[{'pt': 26.122941970825195, 'eta': -1.98191392...","[{'pt': 25.94921112060547, 'eta': -1.981276035...",3


In [19]:
# NOTE: rebinds `df` to the filtered subset; every cell below works only on
# rows with at least two muons.
df = df.query("n_muons >= 2")

In [20]:
# Smallest MET phi within each luminosity block, sorted ascending.
met_phi = df.ak["MET", "phi"]
met_phi.groupby(df.luminosityBlock).min().sort_values()

luminosityBlock
2143    -3.14158
594    -3.141542
2597   -3.141537
1453   -3.141512
1746   -3.141483
          ...   
1799   -2.282698
1012   -2.273576
2059   -2.244525
47     -2.204018
958    -2.181129
Length: 2997, dtype: double[pyarrow]

In [21]:
# Pick the luminosity block whose minimum MET phi is the largest of all blocks.
min_met_phi_by_block = df.ak["MET", "phi"].groupby(df.luminosityBlock).min()
maybe_strange_lumiblock = int(min_met_phi_by_block.idxmax())

In [22]:
# Display the selected luminosity block number.
maybe_strange_lumiblock

958

In [23]:
# Show the rows that belong to the selected luminosity block.
df.loc[df["luminosityBlock"] == maybe_strange_lumiblock]

Unnamed: 0,run,luminosityBlock,event,MET,muons,gen,n_muons
136194,1,958,95702,"{'pt': 9.32455062866211, 'phi': 0.053808707743...","[{'pt': 17.39313507080078, 'eta': -1.847270846...","[{'pt': 17.233966827392578, 'eta': -1.84710228...",2
136195,1,958,95703,"{'pt': 15.678848266601562, 'phi': 1.0668007135...","[{'pt': 34.96195602416992, 'eta': 1.0815714597...","[{'pt': 34.21088790893555, 'eta': 1.0816322565...",2
136196,1,958,95704,"{'pt': 10.801395416259766, 'phi': -1.883632421...","[{'pt': 40.85875701904297, 'eta': 1.5373326539...","[{'pt': 39.86746597290039, 'eta': 1.5372104644...",5
136197,1,958,95705,"{'pt': 27.977304458618164, 'phi': -2.181129455...","[{'pt': 42.02450180053711, 'eta': 0.6413493156...","[{'pt': 42.0600471496582, 'eta': 0.64086186885...",2
136198,1,958,95706,"{'pt': 44.71547317504883, 'phi': 2.52462005615...","[{'pt': 9.585787773132324, 'eta': -1.828484535...","[{'pt': 9.453995704650879, 'eta': -1.828184485...",2
136202,1,958,95710,"{'pt': 4.6118950843811035, 'phi': -2.009603738...","[{'pt': 12.712407112121582, 'eta': 1.665425777...","[{'pt': 12.579483985900879, 'eta': 1.665357470...",4
136204,1,958,95712,"{'pt': 10.901251792907715, 'phi': 1.6692819595...","[{'pt': 28.988916397094727, 'eta': 1.426161408...","[{'pt': 29.23183250427246, 'eta': 1.4259384870...",2
136206,1,958,95714,"{'pt': 6.936832427978516, 'phi': 0.50806105136...","[{'pt': 33.164180755615234, 'eta': -1.03487014...","[{'pt': 33.692684173583984, 'eta': -1.03486096...",4
136207,1,958,95715,"{'pt': 12.992511749267578, 'phi': 2.5144424438...","[{'pt': 42.73493957519531, 'eta': -1.897092819...","[{'pt': 41.00784683227539, 'eta': -1.897110700...",3
136209,1,958,95717,"{'pt': 11.85525131225586, 'phi': -1.5537400245...","[{'pt': 8.502578735351562, 'eta': -1.390953898...","[{'pt': 8.506632804870605, 'eta': -1.391292810...",2


In [24]:
# Persist the rows of the selected luminosity block for further analysis.
strange_events = df[df.luminosityBlock == maybe_strange_lumiblock]
strange_events.to_parquet("strange.parquet")