# Nested-Pandas: Creating nested `reduce` results with "Nesting Inference"

In [17]:
import nested_pandas as npd
import pandas as pd
from nested_pandas.datasets import generate_data
import numpy as np
import pyarrow as pa
# Nesting inference for `reduce` was added in nested-pandas v0.3.8;
# print the installed version to confirm it is available.
print(npd.__version__)

0.3.8


In [18]:
# Generate some dummy data; the fixed seed keeps the outputs below reproducible.
# NOTE(review): the positional args are presumably (number of base rows,
# number of nested rows per base row) — confirm against generate_data's signature.
ndf = generate_data(3,20, seed=1)

### Normal `reduce` Behavior

In [19]:
# Normal behavior of reduce: plain output names, so every key becomes a flat column
def complex_output(flux):
    """Summarise one set of flux values as a dict of named outputs.

    Returns the maximum flux, the flux quantiles at levels 0.1 through 0.5,
    and those same levels under "labels".
    """
    levels = [0.1, 0.2, 0.3, 0.4, 0.5]
    summary = {}
    summary["max_flux"] = np.max(flux)
    summary["flux_quantiles"] = np.quantile(flux, levels)
    summary["labels"] = levels
    return summary

# Apply complex_output to each row's "nested.flux" values; each key of the
# returned dict becomes a column of the result (array-valued outputs stay as
# list-like cells in ordinary columns).
ndf.reduce(complex_output, "nested.flux")

Unnamed: 0,max_flux,flux_quantiles,labels
0,96.484005,"[10.987139293024393, 13.88482277263824, 22.195400645721197, 45.50049253382897, 60.18499770613584]","[0.1, 0.2, 0.3, 0.4, 0.5]"
1,99.732285,"[29.66015102826585, 39.79633942095249, 55.60423742880927, 69.22914543044622, 73.84702096888944]","[0.1, 0.2, 0.3, 0.4, 0.5]"
2,94.459476,"[11.46745145770277, 16.536734750551034, 31.45441958947629, 37.81654389824572, 43.90016616756178]","[0.1, 0.2, 0.3, 0.4, 0.5]"


### Using Nesting Inference to create nested structure

In [20]:
# Regenerate the dummy data with the same seed so the nesting-inference
# example starts from the same input as the previous example.
ndf = generate_data(3,20, seed=1)

# Use nested column naming ("<nested>.<field>") to signal nested structure
def complex_output(flux):
    """Summarise one set of flux values, naming array outputs for nesting.

    Returns the maximum flux under "max_flux", plus the flux quantiles at
    levels 0.1 through 0.5 and the levels themselves under "lc."-prefixed keys.
    """
    levels = [0.1, 0.2, 0.3, 0.4, 0.5]
    return {
        "max_flux": np.max(flux),
        "lc.flux_quantiles": np.quantile(flux, levels),
        "lc.labels": levels,
    }

# Same call as in the normal-behavior example: the "lc." key prefixes alone
# cause the prefixed outputs to be packed into a nested column named "lc".
ndf.reduce(complex_output, "nested.flux")

Unnamed: 0_level_0,max_flux,lc
flux_quantiles,labels,Unnamed: 2_level_1
0,96.484005,flux_quantiles  labels  10.987139  0.1  5 rows × 2 columns
flux_quantiles,labels,
10.987139,0.1,
1,99.732285,29.660151  0.1  5 rows × 2 columns
29.660151,0.1,
2,94.459476,11.467451  0.1  5 rows × 2 columns
11.467451,0.1,

flux_quantiles,labels

0,1
10.987139,0.1

0,1
29.660151,0.1

0,1
11.467451,0.1


### On by default, but can be turned off

In [21]:
# Use nested column naming ("<nested>.<field>") to signal nested structure
def complex_output(flux):
    """Summarise one set of flux values using "lc."-prefixed output names.

    Returns the maximum flux, the flux quantiles at levels 0.1 through 0.5,
    and the levels themselves; the latter two use "lc."-prefixed keys.
    """
    levels = [0.1, 0.2, 0.3, 0.4, 0.5]
    peak = np.max(flux)
    quantile_values = np.quantile(flux, levels)
    return {"max_flux": peak, "lc.flux_quantiles": quantile_values, "lc.labels": levels}

# infer_nesting=False switches nesting inference off: the "lc."-prefixed keys
# are kept as ordinary flat columns named "lc.flux_quantiles" and "lc.labels".
ndf.reduce(complex_output, "nested.flux", infer_nesting=False)

Unnamed: 0,max_flux,lc.flux_quantiles,lc.labels
0,96.484005,"[10.987139293024393, 13.88482277263824, 22.195400645721197, 45.50049253382897, 60.18499770613584]","[0.1, 0.2, 0.3, 0.4, 0.5]"
1,99.732285,"[29.66015102826585, 39.79633942095249, 55.60423742880927, 69.22914543044622, 73.84702096888944]","[0.1, 0.2, 0.3, 0.4, 0.5]"
2,94.459476,"[11.46745145770277, 16.536734750551034, 31.45441958947629, 37.81654389824572, 43.90016616756178]","[0.1, 0.2, 0.3, 0.4, 0.5]"


## Dask side: almost the same, but `meta` has to be supplied

In [22]:
import nested_dask as nd
# NOTE(review): the original note read "Added in nested-dask v0.3.8", but the
# version printed by this cell is 0.3.4 — confirm the minimum nested-dask
# release that supports nesting inference.
print(nd.__version__)

0.3.4


In [23]:
# Import the Dask generator under its own name. A bare `generate_data` import
# would shadow the nested_pandas `generate_data` imported at the top of the
# notebook, so re-running an earlier cell would call the Dask version instead.
from nested_dask.datasets import generate_data as generate_dask_data

# Dask-backed dummy data split across 2 partitions, with a fixed seed.
ndd = generate_dask_data(20, 20, npartitions=2, seed=1)

In [27]:
def complex_output(flux):
    """Summarise one set of flux values using "lc."-prefixed output names.

    Returns the maximum flux under "max_flux", and the flux quantiles at
    levels 0.1 through 0.5 plus the levels themselves under "lc.flux_quantiles"
    and "lc.labels".
    """
    levels = [0.1, 0.2, 0.3, 0.4, 0.5]
    result = dict(max_flux=np.max(flux))
    result["lc.flux_quantiles"] = np.quantile(flux, levels)
    result["lc.labels"] = levels
    return result

# Build a NestedDtype for the nested column "lc": an Arrow struct with one
# list-of-float64 field per "lc."-prefixed output of complex_output.
from nested_pandas.series.dtype import NestedDtype

float_list_type = pa.list_(pa.float64())
lc_struct_type = pa.struct(
    [
        pa.field("flux_quantiles", float_list_type),
        pa.field("labels", float_list_type),
    ]
)
lc_dtype = NestedDtype(lc_struct_type)

# Empty frame whose columns carry the expected output dtypes; passed as `meta`.
result_meta = npd.NestedFrame(
    {
        "max_flux": pd.Series([], dtype="float"),
        "lc": pd.Series([], dtype=lc_dtype),
    }
)

ndd.reduce(complex_output, "nested.flux", meta=result_meta)

Unnamed: 0_level_0,max_flux,lc
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,float64,"nested<flux_quantiles: [double], labels: [double]>"
9,...,...
19,...,...
