Skip to content

Commit

Permalink
Add dim-expression support in Dataset.select (#3920)
Browse files Browse the repository at this point in the history
* Add compute and keep_index kwargs to interface.values. Adds new `compute` and `keep_index` kwargs to the `values` method of all data interfaces. For dask data structures, compute controls whether value is evaluated before being returned. For pandas/dask Series, keep_index controls whether the values are returned as a series (keep_index=True) or array (keep_index=False).

* Support indexing Dataset with pandas/Dask series. Makes it possible to filter a dataset using a pandas or dask series whose index matches the index of the element's `data` data frame.

* Add expression selection tests

* Add selection_expr argument to Dataset.select. This may be set to a dim predicate expression indicating which rows should be kept. Add an error message if the first argument is not a dim expression, referring users to the selection_specs keyword argument.
  • Loading branch information
jonmmease committed Sep 3, 2019
1 parent ba24986 commit 996e7c4
Show file tree
Hide file tree
Showing 18 changed files with 165 additions and 30 deletions.
37 changes: 31 additions & 6 deletions holoviews/core/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def add_dimension(self, dimension, dim_pos, dim_val, vdim=False, **kwargs):
return self.clone(data, **dimensions)


def select(self, selection_specs=None, **selection):
def select(self, selection_expr=None, selection_specs=None, **selection):
"""Applies selection by dimension name
Applies a selection along the dimensions of the object using
Expand All @@ -362,7 +362,14 @@ def select(self, selection_specs=None, **selection):
ds.select(x=[0, 1, 2])
* predicate expression: A holoviews.dim expression, e.g.:
from holoviews import dim
ds.select(selection_expr=dim('x') % 2 == 0)
Args:
selection_expr: holoviews.dim predicate expression
specifying selection.
selection_specs: List of specs to match on
A list of types, functions, or type[.group][.label]
strings specifying which objects to apply the
Expand All @@ -375,15 +382,33 @@ def select(self, selection_specs=None, **selection):
Returns an Dimensioned object containing the selected data
or a scalar if a single value was selected
"""
from ...util.transform import dim
if selection_expr is not None and not isinstance(selection_expr, dim):
raise ValueError("""\
The first positional argument to the Dataset.select method is expected to be a
holoviews.util.transform.dim expression. Use the selection_specs keyword
argument to specify a selection specification""")

if selection_specs is not None and not isinstance(selection_specs, (list, tuple)):
selection_specs = [selection_specs]
selection = {dim: sel for dim, sel in selection.items()
if dim in self.dimensions()+['selection_mask']}
selection = {dim_name: sel for dim_name, sel in selection.items()
if dim_name in self.dimensions()+['selection_mask']}
if (selection_specs and not any(self.matches(sp) for sp in selection_specs)
or not selection):
or (not selection and not selection_expr)):
return self

data = self.interface.select(self, **selection)
# Handle selection dim expression
if selection_expr is not None:
mask = selection_expr.apply(self, compute=False, keep_index=True)
dataset = self[mask]
else:
dataset = self

# Handle selection kwargs
if selection:
data = dataset.interface.select(dataset, **selection)
else:
data = dataset.data

if np.isscalar(data):
return data
Expand Down Expand Up @@ -455,7 +480,7 @@ def __getitem__(self, slices):
object.
"""
slices = util.process_ellipses(self, slices, vdim_selection=True)
if isinstance(slices, np.ndarray) and slices.dtype.kind == 'b':
if getattr(getattr(slices, 'dtype', None), 'kind', None) == 'b':
if not len(slices) == len(self):
raise IndexError("Boolean index must match length of sliced object")
return self.clone(self.select(selection_mask=slices))
Expand Down
4 changes: 3 additions & 1 deletion holoviews/core/data/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,9 @@ def sort(cls, dataset, by=[], reverse=False):


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls, dataset, dim, expanded=True, flat=True, compute=True, keep_index=False
):
data = dataset.data
dim_idx = dataset.get_dimension_index(dim)
if data.ndim == 1:
Expand Down
15 changes: 13 additions & 2 deletions holoviews/core/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,23 @@ def sort(cls, dataset, by=[], reverse=False):
return dataset.data

@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls,
dataset,
dim,
expanded=True,
flat=True,
compute=True,
keep_index=False,
):
dim = dataset.get_dimension(dim)
data = dataset.data[dim.name]
if not expanded:
data = data.unique()
return data.compute().values if compute else data.values
if keep_index:
return data.compute() if compute else data
else:
return data.compute().values if compute else data.values

@classmethod
def select_mask(cls, dataset, selection):
Expand Down
4 changes: 3 additions & 1 deletion holoviews/core/data/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ def range(cls, dataset, dimension):


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls, dataset, dim, expanded=True, flat=True, compute=True, keep_index=False
):
dim = dataset.get_dimension(dim).name
values = dataset.data.get(dim)
if isscalar(values):
Expand Down
4 changes: 3 additions & 1 deletion holoviews/core/data/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,9 @@ def ndloc(cls, dataset, indices):


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls, dataset, dim, expanded=True, flat=True, compute=True, keep_index=False
):
dim = dataset.get_dimension(dim, strict=True)
if dim in dataset.vdims or dataset.data[dim.name].ndim > 1:
data = dataset.data[dim.name]
Expand Down
4 changes: 3 additions & 1 deletion holoviews/core/data/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ def range(cls, obj, dim):


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls, dataset, dim, expanded=True, flat=True, compute=True, keep_index=False
):
"""
The set of samples available along a particular dimension.
"""
Expand Down
14 changes: 12 additions & 2 deletions holoviews/core/data/multipath.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,15 @@ def redim(cls, dataset, dimensions):
return new_data

@classmethod
def values(cls, dataset, dimension, expanded=True, flat=True, compute=True):
def values(
cls,
dataset,
dimension,
expanded=True,
flat=True,
compute=True,
keep_index=False,
):
"""
Returns a single concatenated array of all subpaths separated
by NaN values. If expanded keyword is False an array of arrays
Expand All @@ -293,7 +301,9 @@ def values(cls, dataset, dimension, expanded=True, flat=True, compute=True):
ds = cls._inner_dataset_template(dataset)
for d in dataset.data:
ds.data = d
dvals = ds.interface.values(ds, dimension, expanded, flat, compute)
dvals = ds.interface.values(
ds, dimension, expanded, flat, compute, keep_index
)
if not len(dvals):
continue
elif expanded:
Expand Down
19 changes: 16 additions & 3 deletions holoviews/core/data/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,20 +269,33 @@ def select(cls, dataset, selection_mask=None, **selection):
df = dataset.data
if selection_mask is None:
selection_mask = cls.select_mask(dataset, selection)

indexed = cls.indexed(dataset, selection)
df = df.iloc[selection_mask]
if isinstance(selection_mask, pd.Series):
df = df[selection_mask]
else:
df = df.iloc[selection_mask]
if indexed and len(df) == 1 and len(dataset.vdims) == 1:
return df[dataset.vdims[0].name].iloc[0]
return df


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(
cls,
dataset,
dim,
expanded=True,
flat=True,
compute=True,
keep_index=False,
):
dim = dataset.get_dimension(dim, strict=True)
data = dataset.data[dim.name]
if not expanded:
return data.unique()
return data.values

return data if keep_index else data.values


@classmethod
Expand Down
2 changes: 1 addition & 1 deletion holoviews/core/data/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def coords(cls, dataset, dimension, ordered=False, expanded=False, edges=False):


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
def values(cls, dataset, dim, expanded=True, flat=True, compute=True, keep_index=False):
dim = dataset.get_dimension(dim, strict=True)
data = dataset.data[dim.name].data
irregular = cls.irregular(dataset, dim) if dim in dataset.kdims else False
Expand Down
4 changes: 2 additions & 2 deletions holoviews/core/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1147,14 +1147,14 @@ def select(self, selection_specs=None, **kwargs):
# Apply the selection on the selected object of a different type
dimensions = selection.dimensions() + ['value']
if any(kw in dimensions for kw in kwargs):
selection = selection.select(selection_specs, **kwargs)
selection = selection.select(selection_specs=selection_specs, **kwargs)
elif isinstance(selection, Dimensioned) and selection._deep_indexable:
# Apply the deep selection on each item in local selection
items = []
for k, v in selection.items():
dimensions = v.dimensions() + ['value']
if any(kw in dimensions for kw in kwargs):
items.append((k, v.select(selection_specs, **kwargs)))
items.append((k, v.select(selection_specs=selection_specs, **kwargs)))
else:
items.append((k, v))
selection = selection.clone(items)
Expand Down
2 changes: 1 addition & 1 deletion holoviews/core/spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,7 +1371,7 @@ def select(self, selection_specs=None, **kwargs):
"""
if selection_specs is not None and not isinstance(selection_specs, (list, tuple)):
selection_specs = [selection_specs]
selection = super(DynamicMap, self).select(selection_specs, **kwargs)
selection = super(DynamicMap, self).select(selection_specs=selection_specs, **kwargs)
def dynamic_select(obj, **dynkwargs):
if selection_specs is not None:
matches = any(obj.matches(spec) for spec in selection_specs)
Expand Down
2 changes: 1 addition & 1 deletion holoviews/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def process_ellipses(obj, key, vdim_selection=False):
will be exactly one longer than the number of kdims). Note: this
flag should not be used for composite types.
"""
if isinstance(key, np.ndarray) and key.dtype.kind == 'b':
if getattr(getattr(key, 'dtype', None), 'kind', None) == 'b':
return key
wrapped_key = wrap_tuple(key)
if wrapped_key.count(Ellipsis)== 0:
Expand Down
4 changes: 2 additions & 2 deletions holoviews/element/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,11 @@ def __getitem__(self, key):
return self.clone(extents=(xstart, ystart, xstop, ystop))


def select(self, selection_specs=None, **kwargs):
def select(self, *args, **kwargs):
"""
Bypasses selection on data and sets extents based on selection.
"""
return super(Element2D, self).select(selection_specs, **kwargs)
return super(Element2D, self).select(*args, **kwargs)


def split(self, start=None, end=None, datatype=None, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion holoviews/plotting/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ def initialize_unbounded(obj, dimensions, key):
"""
select = dict(zip([d.name for d in dimensions], key))
try:
obj.select([DynamicMap], **select)
obj.select(selection_specs=[DynamicMap], **select)
except KeyError:
pass

Expand Down
24 changes: 23 additions & 1 deletion holoviews/tests/core/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from holoviews.core.data import concat
from holoviews.core.data.interface import DataError
from holoviews.element import Scatter, Curve
from holoviews.element.comparison import ComparisonTestCase
from holoviews.element.comparison import ComparisonTestCase
from holoviews.util.transform import dim

from collections import OrderedDict

Expand Down Expand Up @@ -697,6 +698,13 @@ def test_dataset_select_rows_gender_male(self):
kdims=self.kdims, vdims=self.vdims)
self.assertEquals(row, indexed)

def test_dataset_select_rows_gender_male_expr(self):
    # Filtering with the dim predicate Gender == 'M' should keep only the
    # rows whose Gender value is 'M'.
    selected = self.table.select(selection_expr=dim('Gender') == 'M')
    expected = Dataset({'Gender': ['M', 'M'], 'Age': [10, 16],
                        'Weight': [15, 18], 'Height': [0.8, 0.6]},
                       kdims=self.kdims, vdims=self.vdims)
    # Use assertEqual rather than the deprecated assertEquals alias, which
    # was removed from unittest in Python 3.12.
    self.assertEqual(selected, expected)

def test_dataset_select_rows_gender_male_alias(self):
row = self.alias_table.select(Gender='M')
alias_row = self.alias_table.select(gender='M')
Expand Down Expand Up @@ -859,10 +867,24 @@ def test_dataset_scalar_select(self):
ds = Dataset({'A': 1, 'B': np.arange(10)}, kdims=['A', 'B'])
self.assertEqual(ds.select(A=1).dimension_values('B'), np.arange(10))

def test_dataset_scalar_select_expr(self):
    # 'A' is stored as the scalar 1, so the predicate A == 1 is expected to
    # keep every value of 'B'.
    dataset = Dataset({'A': 1, 'B': np.arange(10)}, kdims=['A', 'B'])
    selected = dataset.select(selection_expr=dim('A') == 1)
    b_values = selected.dimension_values('B')
    self.assertEqual(b_values, np.arange(10))

def test_dataset_scalar_empty_select(self):
ds = Dataset({'A': 1, 'B': np.arange(10)}, kdims=['A', 'B'])
self.assertEqual(ds.select(A=0).dimension_values('B'), np.array([]))

def test_dataset_scalar_empty_select_expr(self):
    # 'A' is stored as the scalar 1, so the predicate A == 0 is expected to
    # leave no values of 'B'.
    dataset = Dataset({'A': 1, 'B': np.arange(10)}, kdims=['A', 'B'])
    selected = dataset.select(selection_expr=dim('A') == 0)
    b_values = selected.dimension_values('B')
    self.assertEqual(b_values, np.array([]))

def test_dataset_scalar_sample(self):
ds = Dataset({'A': 1, 'B': np.arange(10)}, kdims=['A', 'B'])
self.assertEqual(ds.sample([(1,)]).dimension_values('B'), np.arange(10))
Expand Down
14 changes: 14 additions & 0 deletions holoviews/tests/core/data/testdaskinterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
raise SkipTest("Could not import dask, skipping DaskInterface tests.")

from holoviews.core.data import Dataset
from holoviews.util.transform import dim

from .testpandasinterface import PandasInterfaceTests

Expand Down Expand Up @@ -92,3 +93,16 @@ def test_dataset_range_categorical_dimension_empty(self):
ds_range = ds.range(0)
self.assertTrue(np.isnan(ds_range[0]))
self.assertTrue(np.isnan(ds_range[1]))

def test_select_expression_lazy(self):
    frame = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [10, 10, 11, 11, 10]})
    lazy_frame = dd.from_pandas(frame, npartitions=2)
    selected = Dataset(lazy_frame).select(selection_expr=dim('b') == 10)

    # The selection must still be backed by a dask DataFrame, i.e. selecting
    # by expression did not trigger evaluation.
    self.assertIsInstance(selected.data, dd.DataFrame)
    # Once computed explicitly, the rows match the equivalent pandas filter.
    self.assertEqual(selected.data.compute(), frame[frame.b == 10])
15 changes: 13 additions & 2 deletions holoviews/tests/element/testelementselect.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def test_deep_layout_nesting_slice(self):
self.assertEqual(selection, hmap1 + hmap2)

def test_spec_duplicate_dim_select(self):
selection = self.duplicate_map.select((HoloMap,), x=(0, 1), y=(1, 3))
selection = self.duplicate_map.select(
selection_specs=(HoloMap,), x=(0, 1), y=(1, 3)
)
self.assertEqual(selection, self.duplicate_map[0:1, 1:3])

def test_duplicate_dim_select(self):
Expand All @@ -102,11 +104,20 @@ def test_datetime_select(self):
curve = self.datetime_fn()
overlay = curve * self.datetime_fn()
for el in [curve, overlay]:
self.assertEqual(el.select(time=(s, e)), el[s:e])
v = el.select(time=(s, e))
self.assertEqual(v, el[s:e])
self.assertEqual(el.select(time=
(dt.datetime(1999, 12, 31), dt.datetime(2000, 1, 2))), el[s:e]
)
if pd:
self.assertEqual(el.select(
time=(pd.Timestamp(s), pd.Timestamp(e))
), el[pd.Timestamp(s):pd.Timestamp(e)])

def test_selection_spec_positional_error_message(self):
    start, end = '1999-12-31', '2000-1-2'
    element = self.datetime_fn()
    # Passing a selection spec positionally is expected to raise a ValueError
    # whose message points the caller to the selection_specs keyword.
    # NOTE(review): assertRaisesRegexp is a deprecated alias on Python 3;
    # kept as-is here -- confirm which Python versions must be supported.
    expected_message = "Use the selection_specs keyword"
    with self.assertRaisesRegexp(ValueError, expected_message):
        element.select((Curve,), time=(start, end))
Loading

0 comments on commit 996e7c4

Please sign in to comment.