In [1]:
import awkward_pandas
import pandas as pd
import awkward._v2 as ak
from dask_awkward.io.scratch import from_parquet

In [2]:
# Read every Parquet file matching the glob from the S3 bucket (anonymous
# access) and evaluate the dask-awkward collection with .compute().
ds = from_parquet(
    "s3://ddavistemp/higgs_pq/*.parquet",
    storage_options={"anon": True},
).compute()

In [3]:
# Preview the records, capping the printout at 10 rows and a width of 140.
ds.show(limit_rows=10, limit_cols=140)

[{run: 1, luminosityBlock: 13, event: 1201, MET: {pt: 19.5, phi: 3.1}, muons: [], gen: [{pt: 60.4, ...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1202, MET: {pt: 20.4, phi: -2.18}, muons: [{pt: 18.6, ...}, ...], gen: [{...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1203, MET: {pt: 28.8, phi: 2.62}, muons: [], gen: [{pt: 40.6, ...}]},
 {run: 1, luminosityBlock: 13, event: 1204, MET: {pt: 4.42, phi: -0.206}, muons: [{pt: 26.7, ...}, ...], gen: [{...}, ...]},
 {run: 1, luminosityBlock: 13, event: 1205, MET: {pt: 5.86, phi: 2.47}, muons: [{pt: 7.62, eta: ..., ...}], gen: [...]},
 ...,
 {run: 1, luminosityBlock: 2801, event: 80097, MET: {pt: 17.6, phi: 2.15}, muons: [], gen: [{pt: 24, ...}, ...]},
 {run: 1, luminosityBlock: 2801, event: 80098, MET: {pt: 15.5, phi: 0.718}, muons: [{pt: 18.6, ...}, ...], gen: [...]},
 {run: 1, luminosityBlock: 2801, event: 80099, MET: {pt: 16.2, phi: -0.555}, muons: [{pt: 26.1, ...}, ...], gen: [...]},
 {run: 1, luminosityBlock: 2801, event: 80100, ME

In [4]:
# Wrap the awkward array in a pandas extension array (dtype "awkward") so the
# whole nested dataset is held by a single Series named "nested".
s = pd.Series(
    pd.array(ds, dtype="awkward"),
    name="nested",
)

![](muons_dataset1.svg)

In [5]:
# Display the Series; in the recorded repr each row shows the record's field names.
s

0         ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
1         ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
2         ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
3         ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
4         ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
                                ...                        
299678    ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
299679    ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
299680    ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
299681    ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
299682    ['run' 'luminosityBlock' 'event' 'MET' 'muons'...
Name: nested, Length: 299683, dtype: awkward

We can "unroll" the Series into a DataFrame: top-level fields that hold primitive types (e.g. ints or floats) become ordinary pandas columns, while the remaining nested fields stay together in a single awkward column. We do this with the `to_columns` method of the awkward accessor (`.ak` on the Series):

In [6]:
# Split the record Series into a DataFrame through the `.ak` accessor.
df = s.ak.to_columns()

![](muons_dataset_df.svg)

In [7]:
# Display the DataFrame returned by to_columns().
df

Unnamed: 0,run,luminosityBlock,event,nested
0,1,13,1201,['MET' 'muons' 'gen']
1,1,13,1202,['MET' 'muons' 'gen']
2,1,13,1203,['MET' 'muons' 'gen']
3,1,13,1204,['MET' 'muons' 'gen']
4,1,13,1205,['MET' 'muons' 'gen']
...,...,...,...,...
299678,1,2801,80096,['MET' 'muons' 'gen']
299679,1,2801,80097,['MET' 'muons' 'gen']
299680,1,2801,80098,['MET' 'muons' 'gen']
299681,1,2801,80099,['MET' 'muons' 'gen']


In [8]:
# A column split out by to_columns(); the recorded output shows dtype int64.
df.luminosityBlock

0           13
1           13
2           13
3           13
4           13
          ... 
299678    2801
299679    2801
299680    2801
299681    2801
299682    2801
Name: luminosityBlock, Length: 299683, dtype: int64

In [9]:
# Peek at the awkward Array backing the "nested" column.
# NOTE(review): `_data` is a private attribute of the extension array --
# confirm whether a public accessor exists before relying on it.
df.nested.values._data

<Array [{MET: {...}, muons: [], ...}, ...] type='299683 * {MET: {pt: float6...'>

In [10]:
# Select the "muons" field of every record through the `.ak` accessor.
muons_series = df.nested.ak["muons"]

In [11]:
# Display the muons Series: one variable-length list of muon records per row.
muons_series

0                                                        []
1         [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
2                                                        []
3         [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
4                  [['pt', 'eta', 'phi', 'mass', 'charge']]
                                ...                        
299678    [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
299679                                                   []
299680    [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
299681    [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
299682    [['pt', 'eta', 'phi', 'mass', 'charge'], ['pt'...
Length: 299683, dtype: awkward

In [12]:
%%timeit
# Baseline: convert to Python lists and count the muons per row in pure Python.
[len(x) for x in muons_series.tolist()]

347 ms ± 3.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In the awkward-pandas world, we have access to functions from the Awkward Array API through the `.ak` accessor:

In [13]:
%%timeit
# Same per-row count, computed with `num` along axis=1 via the `.ak` accessor.
muons_series.ak.num(axis=1)

468 µs ± 2.35 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


That is a huge performance improvement: the compiled awkward code takes about 468 µs versus 347 ms for the Python iteration, roughly 740 times faster.

In [14]:
# Keep the per-row muon counts for the cells below.
n_muons = muons_series.ak.num(axis=1)

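The result is itself a pandas Series (with awkward dtype), so we can use it in ordinary pandas workflows, for example by adding it as a DataFrame column and filtering on it: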

In [15]:
# Display the counts; the recorded output shows the Series keeps dtype "awkward".
n_muons

0         0
1         2
2         0
3         3
4         1
         ..
299678    2
299679    0
299680    2
299681    3
299682    4
Length: 299683, dtype: awkward

In [16]:
# Attach the counts as a new column (this mutates `df` in place).
df["n_muons"] = n_muons

In [17]:
# Display the DataFrame, now including the n_muons column.
df

Unnamed: 0,run,luminosityBlock,event,nested,n_muons
0,1,13,1201,['MET' 'muons' 'gen'],0
1,1,13,1202,['MET' 'muons' 'gen'],2
2,1,13,1203,['MET' 'muons' 'gen'],0
3,1,13,1204,['MET' 'muons' 'gen'],3
4,1,13,1205,['MET' 'muons' 'gen'],1
...,...,...,...,...,...
299678,1,2801,80096,['MET' 'muons' 'gen'],2
299679,1,2801,80097,['MET' 'muons' 'gen'],0
299680,1,2801,80098,['MET' 'muons' 'gen'],2
299681,1,2801,80099,['MET' 'muons' 'gen'],3


In [18]:
# Preview the rows with at least two muons; `df` is not reassigned here.
df.query("n_muons >= 2")

Unnamed: 0,run,luminosityBlock,event,nested,n_muons
1,1,13,1202,['MET' 'muons' 'gen'],2
3,1,13,1204,['MET' 'muons' 'gen'],3
9,1,13,1210,['MET' 'muons' 'gen'],2
10,1,13,1211,['MET' 'muons' 'gen'],2
11,1,13,1212,['MET' 'muons' 'gen'],3
...,...,...,...,...,...
299677,1,2801,80095,['MET' 'muons' 'gen'],5
299678,1,2801,80096,['MET' 'muons' 'gen'],2
299680,1,2801,80098,['MET' 'muons' 'gen'],2
299681,1,2801,80099,['MET' 'muons' 'gen'],3


In [19]:
# Keep only rows with at least two muons.
# NOTE: `df` is rebound here, so every later cell sees the filtered frame.
df = df.query("n_muons >= 2")

In [20]:
# Smallest MET phi within each luminosity block, sorted ascending.
(
    df.nested.ak["MET", "phi"]
    .groupby(df.luminosityBlock)
    .min()
    .sort_values()
)

luminosityBlock
2599.0    -3.14158
268.0    -3.141537
2062.0   -3.141512
1482.0   -3.141317
826.0    -3.141196
            ...   
1758.0   -2.528853
154.0    -2.486018
961.0    -2.465839
1148.0   -2.457783
2644.0    -2.28751
Length: 1445, dtype: awkward

In [21]:
# Luminosity block whose minimum MET phi is the largest of all blocks.
# idxmax() returns the group label; int() normalizes it, since the grouped
# index is printed as floats (e.g. 2644.0) in the preceding cell's output.
maybe_strange_lumiblock = int(
    df.nested.ak["MET", "phi"]
    .groupby(df.luminosityBlock)
    .min()
    .idxmax()
)

In [22]:
# Display the selected luminosity block number.
maybe_strange_lumiblock

2644

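We can pack DataFrame columns back into a single awkward Series of records with `awkward_pandas.merge`; the underlying awkward record array is then available through `.values._data`: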

In [23]:
# Merge the two selected DataFrame columns back into one awkward-backed object.
# NOTE(review): `merged` is not used by any later cell shown in this notebook.
merged = awkward_pandas.merge(df[["run", "luminosityBlock"]])

In [24]:
# Selecting several fields of the original awkward array keeps just those fields.
ds[["run", "luminosityBlock"]].fields

['run', 'luminosityBlock']

In [25]:
# Field names of the nested MET record.
ds.MET.fields

['pt', 'phi']

In [26]:
# Write the rows of the selected luminosity block to Parquet.
# NOTE(review): the merge result is passed to ak.to_parquet directly here and
# the recorded output reports num_columns: 1; the following cell passes
# `.values._data` instead and overwrites the same file. This cell looks like
# a leftover experiment -- confirm whether it should be kept.
ak.to_parquet(awkward_pandas.merge(df[df.luminosityBlock == maybe_strange_lumiblock]), "strange.parquet")

<pyarrow._parquet.FileMetaData object at 0x1638b68e0>
  created_by: parquet-cpp-arrow version 9.0.0
  num_columns: 1
  num_rows: 41
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 0

In [27]:
# Merge the rows of the selected luminosity block and write the underlying
# awkward Array (not the pandas wrapper) to Parquet.
# NOTE(review): `_data` is a private attribute -- confirm whether a public
# accessor exists before relying on it.
ak.to_parquet(
    awkward_pandas.merge(
        df[df.luminosityBlock == maybe_strange_lumiblock]
    ).values._data,
    "strange.parquet",
)

<pyarrow._parquet.FileMetaData object at 0x16382d8f0>
  created_by: parquet-cpp-arrow version 9.0.0
  num_columns: 15
  num_rows: 41
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 0

In [28]:
# Read the file back to check the round trip (41 records in the recorded output).
ak.from_parquet("strange.parquet")

<Array [{run: 1, ...}, ..., {run: 1, ...}] type='41 * {run: int64, luminosi...'>

In [29]:
# List the top-level field names of the original nested Series.
s.ak.fields()

['run', 'luminosityBlock', 'event', 'MET', 'muons', 'gen']