In [4]:
import random
random.seed(0)  # reproducibility

In [5]:
from nested_pandas.datasets import generate_data

nf = generate_data(10, 100)
nf

Unnamed: 0,a,b,nested
0,0.497468,0.213846,t flux band 0 19.168439 7...
1,0.080885,1.677626,t flux band 0 3.670942 3...
2,0.598974,1.929117,t flux band 0 4.472565 3...
3,0.265504,0.654886,t flux band 0 6.047763 4...
4,0.430084,0.867972,t flux band 0 3.085544 7...
5,0.229539,0.119194,t flux band 0 5.281058 3...
6,0.37289,0.203122,t flux band 0 7.933723 9...
7,0.542007,1.169242,t flux band 0 17.323921 3...
8,0.583941,1.711632,t flux band 0 3.251722 9...
9,0.907824,0.520125,t flux band 0 4.126267 3...


In [36]:
# The incorrect parsing of some scientific notation was the original motivation;
# by leaving query() mostly alone and extending the underlying NestedFrame.eval()
# method, all parsing issues are gone.
nf_s = nf.query("a < 5e-1")
nf.shape, nf_s.shape

((10, 3), (6, 3))

In [13]:
# We can also use all the other Pandas features of eval() on nested columns
# and even on a mix of nested and base columns.  This includes calling
# numeric methods:
nf.eval("a + nested.flux.mean() / nested.t.median()")

0    5.296883
1    4.880300
2    5.398389
3    5.064918
4    5.229499
5    5.028954
6    5.172305
7    5.341422
8    5.383356
9    5.707239
Name: a, dtype: float64

In [14]:
# You can also use assignment to create not only new base columns...
nf_b = nf.eval("d = a + b")
nf_b.columns

Index(['a', 'b', 'nested', 'd'], dtype='object')

In [20]:
# ...but also new nested columns...
nf_n = nf.eval("nested.f2 = nested.flux * 2")
nf_n.nested.nest.fields

['t', 'flux', 'band', 'f2']

In [25]:
# ...and even new nests!
# Note that you can mix base columns into these expressions, too.
nf_n2 = nf.eval("packed.g = nested.flux * 2 + a")
nf_n2.packed.nest["g"]

0    145.865779
0      61.82493
0    118.180083
0    106.113991
0    144.867418
        ...    
9      4.856042
9    111.532587
9     90.839491
9    136.551176
9    124.248486
Name: g, Length: 1000, dtype: double[pyarrow]

In [32]:
# You can also now use the multi-line assignment feature of
# eval() across nested and base columns:
nf_x = nf.eval("""
   d = a + b * 2
   p2.g = nested.flux * 0.5 + a
   nested.f2 = nested.flux * 2
   """)
display(nf_x.columns)
display(nf_x.p2.nest.fields)
display(nf_x.nested.nest.fields)
nf_x

Index(['a', 'b', 'nested', 'd', 'p2'], dtype='object')

['g']

['t', 'flux', 'band', 'f2']

Unnamed: 0,a,b,nested,d,p2
0,0.497468,0.213846,t flux band f2 0 ...,0.92516,g 0 36.839546 1 15.829333 2 ...
1,0.080885,1.677626,t flux band f2 0 ...,3.436136,g 0 16.888618 1 4.874770 2 ...
2,0.598974,1.929117,t flux band f2 0 ...,4.457208,g 0 19.830519 1 30.668843 2 ...
3,0.265504,0.654886,t flux band f2 0 ...,1.575276,g 0 22.886778 1 4.881611 2 ...
4,0.430084,0.867972,t flux band f2 0 ...,2.166027,g 0 39.739070 1 40.990789 2 ...
5,0.229539,0.119194,t flux band f2 0 ...,0.467927,g 0 16.277913 1 8.316289 2 ...
6,0.37289,0.203122,t flux band f2 0 ...,0.779134,g 0 45.534103 1 5.589673 2 ...
7,0.542007,1.169242,t flux band f2 0 ...,2.880491,g 0 19.324947 1 5.110645 2 ...
8,0.583941,1.711632,t flux band f2 0 ...,4.007204,g 0 47.816739 1 11.572494 2 ...
9,0.907824,0.520125,t flux band f2 0 ...,1.948073,g 0 17.354108 1 49.114442 2 ...


In [27]:
nf_x.p2.nest["g"]

0    36.839546
0    15.829333
0    29.918122
0    26.901599
0    36.589956
       ...    
9     1.894878
9    28.564015
9    23.390741
9    34.818662
9    31.742989
Name: g, Length: 1000, dtype: double[pyarrow]

In [41]:
nf_x.nested.nest["t"]

0    19.168439
0     3.913644
0    10.513982
0    16.397308
0     3.151468
       ...    
9    11.313674
9    15.501648
9     11.32539
9    17.268869
9      4.09131
Name: t, Length: 1000, dtype: double[pyarrow]

In [58]:
nf_x.nested.nest["band"] == "r"

0    False
0    False
0    False
0     True
0     True
     ...  
9    False
9    False
9    False
9     True
9    False
Name: band, Length: 1000, dtype: bool[pyarrow]

In [59]:
# The fact that NestedFrame.query depends on this improved eval() function
# means that more sophisticated queries are possible, too.  But do note
# that querying across base and nested columns remains an unsolved problem,
# one that the user has to resolve themselves.
nf_x2 = nf_x.query("nested.band == 'r' and nested.t > 10.5")
print("nested rows before:", nf_x.nested.nest["band"].size, "nested rows after:", nf_x2.nested.nest["band"].size)

nested.band before: 1000 nested band after: 252


In [64]:
# So within a nest is fine, and among base columns is fine:
nf_x3 = nf_x.query("a > 0.5 and b > 0.2")
print("base rows before:", nf_x.size, "base rows after:", nf_x3.size)

base rows before: 50 base rows after: 20


In [65]:
# ...but across base and nested columns is not doable yet
nf_x.query("a > 0.5 and nested.t > 10.5")

ValueError: Queries cannot target multiple structs/layers, write a separate query for each

In [104]:
# Unless, of course, you reduce those nests first to a single value!
# But a word of CAUTION: note that this is equivalent to
#   nf_x.a > 0.5 and nf_x.nested.nest["t"].max() > 10.5
# that is, the max of the entire unpacked column, NOT:
#   nf_x.a > 0.5 and nf_x.reduce(max, 'nested.t') > 10.5
nf_x.query("a > 0.5 and nested.t.max() > 10.5")

Unnamed: 0,a,b,nested,d,p2
2,0.598974,1.929117,t flux band f2 0 ...,4.457208,g 0 19.830519 1 30.668843 2 ...
7,0.542007,1.169242,t flux band f2 0 ...,2.880491,g 0 19.324947 1 5.110645 2 ...
8,0.583941,1.711632,t flux band f2 0 ...,4.007204,g 0 47.816739 1 11.572494 2 ...
9,0.907824,0.520125,t flux band f2 0 ...,1.948073,g 0 17.354108 1 49.114442 2 ...
