Optimization for heatmap aggregation with pandas #1174

Merged (3 commits, Mar 5, 2017)
Changes from 2 commits

holoviews/core/data/__init__.py (8 additions & 4 deletions)

@@ -516,13 +516,17 @@ def get_dimension_type(self, dim):
         return self.interface.dimension_type(self, dim_obj)


-    def dframe(self, dimensions=None):
+    def dframe(self, dimensions=None, copy=True):

Contributor:

I'm not sure the copy argument makes much sense if the element isn't already using a dataframe-based interface - for other interfaces, don't you always have to create a new dataframe, which would be the same as copy being fixed to True?

Member Author:

That's true, it's more like avoid_copy, but I think providing a consistent API to get hold of a dataframe with minimal overhead is useful.

Member Author:

That said, I'd also be fine with having a utility for it instead.

"""
Returns the data in the form of a DataFrame.
Returns the data in the form of a DataFrame. Supplying a list
of dimensions filters the dataframe. If the data is already
a DataFrame copy=False may be supplied to avoid making a copy.
"""
if dimensions:
if pd is None:
raise Exception("Cannot return data as dataframe, pandas is not available")
elif dimensions:
dimensions = [self.get_dimension(d, strict=True).name for d in dimensions]
return self.interface.dframe(self, dimensions)
return self.interface.dframe(self, dimensions, copy)


def columns(self, dimensions=None):
Expand Down
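
For orientation, a minimal usage sketch of the signature added above; the element, data and dimension names are illustrative, and the behaviour described assumes the copy keyword as it stands at this commit:

    import numpy as np
    import holoviews as hv

    # Force a pandas-backed element so the no-copy path can apply.
    points = hv.Points((np.arange(5), np.arange(5) * 2.0), datatype=['dataframe'])

    df_copy = points.dframe()             # default copy=True: a private copy
    df_view = points.dframe(copy=False)   # the element's own DataFrame, no copy
    print(df_view is points.data)         # True for dataframe-backed data
    print(points.dframe(['x', 'y']))      # dimension filtering always copies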

holoviews/core/data/dask.py (1 addition & 1 deletion)

@@ -235,7 +235,7 @@ def concat(cls, columns_objs):
         return dd.concat([col.data for col in cast_objs])

     @classmethod
-    def dframe(cls, columns, dimensions):
+    def dframe(cls, columns, dimensions, copy):
         return columns.data.compute()

     @classmethod
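
Since DaskInterface.dframe just computes the dask graph, the new copy flag is accepted but has nothing to skip there, which is the reviewer's point above. A small sketch of why, using toy data:

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({'x': [0, 1, 2], 'y': [3.0, 4.0, 5.0]})
    ddf = dd.from_pandas(pdf, npartitions=2)

    # compute() always materialises a brand-new pandas DataFrame, so there is
    # no stored frame that copy=False could hand back without copying.
    print(ddf.compute() is pdf)            # False
    print(ddf.compute() is ddf.compute())  # False: a fresh object every time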

holoviews/core/data/interface.py (2 additions & 2 deletions)

@@ -192,8 +192,8 @@ def array(cls, dataset, dimensions):
         return Element.array(dataset, dimensions)

     @classmethod
-    def dframe(cls, dataset, dimensions):
-        return Element.dframe(dataset, dimensions)
+    def dframe(cls, dataset, dimensions, copy):
+        return Element.dframe(dataset, dimensions, copy)

     @classmethod
     def columns(cls, dataset, dimensions):

holoviews/core/data/pandas.py (3 additions & 3 deletions)

@@ -223,13 +223,13 @@ def add_dimension(cls, columns, dimension, dim_pos, values, vdim):


     @classmethod
-    def dframe(cls, columns, dimensions):
+    def dframe(cls, columns, dimensions, copy):
         if dimensions:
             dimensions = [columns.get_dimension(d, strict=True).name
                           for d in dimensions]
             return columns.reindex(dimensions).data.copy()
-        else:
-            return columns.data.copy()
+        else:
+            return columns.data.copy() if copy else columns.data


 Interface.register(PandasInterface)
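
To make the aliasing question from the review thread concrete, here is a self-contained paraphrase of the branch added to PandasInterface.dframe, written in plain pandas rather than the holoviews code itself:

    import pandas as pd

    # Stand-in for the DataFrame held by the interface (columns.data).
    stored = pd.DataFrame({'x': [0, 1, 2], 'z': [10.0, 20.0, 30.0]})

    def dframe(dimensions=None, copy=True):
        if dimensions:
            return stored[dimensions].copy()       # filtering always copies
        return stored.copy() if copy else stored   # only this path can alias

    alias = dframe(copy=False)
    print(alias is stored)     # True: in-place edits would show up on the element
    print(dframe() is stored)  # False: the default still returns a private copy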

holoviews/core/data/xarray.py (1 addition & 1 deletion)

@@ -239,7 +239,7 @@ def length(cls, dataset):
         return np.product(dataset[dataset.vdims[0].name].shape)

     @classmethod
-    def dframe(cls, dataset, dimensions):
+    def dframe(cls, dataset, dimensions, copy):
         if dimensions:
             return dataset.reindex(columns=dimensions).data.to_dataframe().reset_index(dimensions)
         else:

holoviews/core/element.py (1 addition & 1 deletion)

@@ -144,7 +144,7 @@ def table(self, datatype=None):
         return Table(self, **(dict(datatype=datatype) if datatype else {}))


-    def dframe(self, dimensions=None):
+    def dframe(self, dimensions=None, copy=True):
         import pandas as pd
         column_names = dimensions if dimensions else self.dimensions(label=True)
         dim_vals = OrderedDict([(dim, self[dim]) for dim in column_names])
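
The generic Element.dframe above always assembles a fresh DataFrame from the dimension values, which is why the reviewer notes that copy is effectively fixed to True for non-dataframe interfaces. A stripped-down sketch of that construction with stand-in values:

    from collections import OrderedDict
    import pandas as pd

    # Stand-ins for self.dimensions(label=True) and self[dim].
    column_names = ['x', 'y']
    values = {'x': [0, 1, 2], 'y': [3.0, 4.0, 5.0]}

    dim_vals = OrderedDict([(dim, values[dim]) for dim in column_names])
    df = pd.DataFrame(dim_vals)   # freshly allocated on every call; nothing to alias
    print(df)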

holoviews/element/util.py (14 additions & 3 deletions)

@@ -5,8 +5,13 @@

 from ..core import Dataset, OrderedDict
 from ..core.operation import ElementOperation
-from ..core.util import (pd, is_nan, sort_topologically,
-                         cartesian_product, is_cyclic, one_to_one)
+from ..core.util import (is_nan, sort_topologically, one_to_one,
+                         cartesian_product, is_cyclic, get_df_data)
+
+try:
+    import pandas as pd
+except:
+    pd = None

 try:
     import dask

@@ -134,7 +139,13 @@ def _aggregate_dataset(self, obj, xcoords, ycoords):
         dtype = 'dataframe' if pd else 'dictionary'
         dense_data = Dataset(data, kdims=obj.kdims, vdims=obj.vdims, datatype=[dtype])
         concat_data = obj.interface.concatenate([dense_data, obj], datatype=[dtype])
-        agg = concat_data.reindex([xdim, ydim], vdims).aggregate([xdim, ydim], reduce_fn)
+        reindexed = concat_data.reindex([xdim, ydim], vdims)
+        if pd:

Contributor:

Why not use reindexed.interface.dframe(dimensions=None, copy=False) instead of exposing the copy keyword argument at the element level? For copy=False to work, you are already assuming a dataframe type interface is being used...

Contributor:

I suppose the other thing you could do is complain if copy=False is passed to the dframe method of any interface that isn't based on dataframes.

Member Author:

> For copy=False to work, you are already assuming a dataframe type interface is being used...

Because then I need conditional branches for the "is already dataframe" and "convert to dataframe" paths again. I guess I agree copy is confusing because you might assume you can mutate the dataframe and have an effect on the original element if you don't make a copy, when the real point of it is to avoid making pointless copies.

Contributor:

Would there be any harm in the dataframe interfaces just avoiding pointless copies automatically? Then it doesn't have to be something the user ever needs to think about...

Member Author:

In my usage of dframe I often create it and then assign to it, so that would be a bit of a pain.

+            df = reindexed.dframe(copy=False)
+            df = df.groupby([xdim, ydim], sort=False).first().reset_index()
+            agg = reindexed.clone(df)
+        else:
+            agg = reindexed.aggregate([xdim, ydim], reduce_fn)

         # Convert data to a gridded dataset
         grid_data = {xdim: xcoords, ydim: ycoords}
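
The pandas branch above swaps the generic aggregate for a single groupby. A self-contained sketch of why first() does the right thing for the dense heatmap grid (toy data, not the holoviews objects):

    import numpy as np
    import pandas as pd

    xdim, ydim = 'x', 'y'

    # Dense NaN-filled grid rows concatenated ahead of the actual samples,
    # mirroring concatenate([dense_data, obj]) in _aggregate_dataset.
    dense = pd.DataFrame({'x': np.repeat([0, 1], 2),
                          'y': np.tile([0, 1], 2),
                          'z': np.nan})
    samples = pd.DataFrame({'x': [0, 1], 'y': [1, 0], 'z': [10.0, 20.0]})
    concat = pd.concat([dense, samples])

    # groupby().first() keeps the first non-null value per (x, y) cell, so real
    # samples win over the NaN placeholders and empty cells stay NaN.
    agg = concat.groupby([xdim, ydim], sort=False).first().reset_index()
    print(agg)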

holoviews/operation/datashader.py (3 additions & 8 deletions)

@@ -20,23 +20,19 @@

 from ..core import (ElementOperation, Element, Dimension, NdOverlay,
                     Overlay, CompositeOverlay, Dataset)
-from ..core.data import ArrayInterface, PandasInterface, DaskInterface
+from ..core.data import PandasInterface, DaskInterface
 from ..core.util import get_param_values, basestring
 from ..element import GridImage, Image, Path, Curve, Contours, RGB
 from ..streams import RangeXY

-DF_INTERFACES = [PandasInterface, DaskInterface]

 @dispatch(Element)
 def discover(dataset):
     """
     Allows datashader to correctly discover the dtypes of the data
     in a holoviews Element.
     """
-    if dataset.interface in DF_INTERFACES:
-        return dsdiscover(dataset.data)
-    else:
-        return dsdiscover(dataset.dframe())
+    return dsdiscover(dataset.dframe(copy=False))


 @bypixel.pipeline.register(Element)

@@ -135,7 +131,6 @@ def get_agg_data(cls, obj, category=None):
         kdims = obj.kdims
         vdims = obj.vdims
         x, y = obj.dimensions(label=True)[:2]
-        is_df = lambda x: isinstance(x, Dataset) and x.interface in DF_INTERFACES
         if isinstance(obj, Path):
             glyph = 'line'
             for p in obj.data:

@@ -146,7 +141,7 @@
         elif isinstance(obj, CompositeOverlay):
             for key, el in obj.data.items():
                 x, y, element, glyph = cls.get_agg_data(el)
-                df = element.data if is_df(element) else element.dframe()
+                df = element.dframe(copy=False)
                 if isinstance(obj, NdOverlay):
                     df = df.assign(**dict(zip(obj.dimensions('key', True), key)))
                 paths.append(df)
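
The discover and get_agg_data changes drop the DF_INTERFACES special case because dframe(copy=False) now gives every element a cheap DataFrame view. A schematic of the before/after control flow, written as plain functions rather than the datashader API:

    def to_dataframe_old(element, df_interfaces):
        # Old approach: sniff the interface and reuse .data only for known
        # dataframe-backed interfaces, otherwise pay for a converting copy.
        if element.interface in df_interfaces:
            return element.data
        return element.dframe()

    def to_dataframe_new(element):
        # New approach: a single code path; dataframe-backed interfaces skip
        # the copy internally, everything else converts as before.
        return element.dframe(copy=False)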

holoviews/plotting/bokeh/chart.py (1 addition & 1 deletion)

@@ -579,7 +579,7 @@ class BoxPlot(ChartPlot):
     def _init_chart(self, element, ranges):
         properties = self.style[self.cyclic_index]
         label = element.dimensions('key', True)
-        dframe = element.dframe()
+        dframe = element.dframe(copy=False)

         # Fix for displaying datetimes which are not handled by bokeh
         for kd in element.kdims: