Improve import times #3055

Merged · 5 commits · Oct 8, 2018
Changes from 4 commits
32 changes: 15 additions & 17 deletions holoviews/core/data/__init__.py
@@ -8,8 +8,12 @@
 import numpy as np
 import param

-from ..dimension import redim
-from ..util import unique_iterator
+from ..dimension import redim, Dimension, process_dimensions
+from ..element import Element
+from ..ndmapping import OrderedDict
+from ..spaces import HoloMap, DynamicMap
+from ..util import (basestring, dimension_range as d_range, get_param_values,
+                    isfinite, process_ellipses, unique_iterator, wrap_tuple)
Member Author: I had some weird issues where importing `from .. import util` was getting the wrong utilities, hence I did this.

Contributor: I see. I would prefer to figure out the issue rather than switching to the unqualified version...
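A minimal, self-contained sketch (not HoloViews code; the `pkg` module below is synthetic) of the kind of attribute shadowing that can make `from .. import util` bind the wrong object:

```python
import sys
import types

# Simulate a package whose 'util' attribute has been shadowed,
# e.g. by a stray assignment or name clash during package init.
pkg = types.ModuleType("pkg")
pkg.util = "not the module you wanted"
sys.modules["pkg"] = pkg

# 'from pkg import util' resolves via attribute lookup on the already
# imported package object, so it returns the shadowing value:
from pkg import util
print(util)  # -> 'not the module you wanted'

# 'from pkg.util import name' would instead force the pkg.util submodule
# to be imported, sidestepping the stale attribute.
```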

 from .interface import Interface, iloc, ndloc
 from .array import ArrayInterface
 from .dictionary import DictInterface
@@ -34,7 +38,6 @@
                       'following error: %s' % e)

 try:
-    import xarray # noqa (Availability import)
     from .xarray import XArrayInterface # noqa (Conditional API import)
     datatypes.append('xarray')
 except ImportError:
@@ -49,11 +52,6 @@
 if 'array' not in datatypes:
     datatypes.append('array')

-from ..dimension import Dimension, process_dimensions
-from ..element import Element
-from ..ndmapping import OrderedDict
-from ..spaces import HoloMap, DynamicMap
-from .. import util


 def concat(datasets, datatype=None):

@@ -210,7 +208,7 @@ class Dataset(Element):

     def __init__(self, data, kdims=None, vdims=None, **kwargs):
         if isinstance(data, Element):
-            pvals = util.get_param_values(data)
+            pvals = get_param_values(data)
Contributor: Why remove the namespace qualification? Does it affect the import times? (It shouldn't!)

Member Author (@philippjfr, Oct 8, 2018): See my comment above, I can have another look at this, but:

> I had some weird issues where importing `from .. import util` was getting the wrong utilities, hence I did this.

             kwargs.update([(l, pvals[l]) for l in ['group', 'label']
                            if l in pvals and l not in kwargs])
         kwargs.update(process_dimensions(kdims, vdims))
@@ -281,15 +279,15 @@ def range(self, dim, data_range=True, dimension_range=True):

         if dim is None or (not data_range and not dimension_range):
             return (None, None)
-        elif all(util.isfinite(v) for v in dim.range) and dimension_range:
+        elif all(isfinite(v) for v in dim.range) and dimension_range:
             return dim.range
         elif dim in self.dimensions() and data_range and len(self):
             lower, upper = self.interface.range(self, dim)
         else:
             lower, upper = (np.NaN, np.NaN)
         if not dimension_range:
             return lower, upper
-        return util.dimension_range(lower, upper, dim.range, dim.soft_range)
+        return d_range(lower, upper, dim.range, dim.soft_range)
Contributor: I think I prefer the old version... I don't see why the name d_range should be introduced for a single use (at least in this PR diff...).

Member Author: Again, see the comment above.

Contributor: Well, really it is two things:

  1. For some reason you have problems with the qualified import (the root problem, which would ideally be fixed).
  2. You renamed it to d_range to avoid a local variable name clash.

This PR would be greatly simplified if 1 can be fixed!

Member Author: Having nightmares with this, but I should be able to fix it.

@@ -299,7 +297,7 @@ def add_dimension(self, dimension, dim_pos, dim_val, vdim=False, **kwargs):
         dimensions and a key value scalar or sequence of the same length
         as the existing keys.
         """
-        if isinstance(dimension, (util.basestring, tuple)):
+        if isinstance(dimension, (basestring, tuple)):
             dimension = Dimension(dimension)

         if dimension.name in self.kdims:
@@ -398,7 +396,7 @@ def __getitem__(self, slices):
         (4) A boolean array index matching the length of the Dataset
             object.
         """
-        slices = util.process_ellipses(self, slices, vdim_selection=True)
+        slices = process_ellipses(self, slices, vdim_selection=True)
         if isinstance(slices, np.ndarray) and slices.dtype.kind == 'b':
             if not len(slices) == len(self):
                 raise IndexError("Boolean index must match length of sliced object")
@@ -466,11 +464,11 @@ def sample(self, samples=[], closest=True, **kwargs):
             reindexed = selection.clone(new_type=Dataset).reindex(kdims)
             selection = tuple(reindexed.columns(kdims+self.vdims).values())

-            datatype = list(util.unique_iterator(self.datatype+['dataframe', 'dict']))
+            datatype = list(unique_iterator(self.datatype+['dataframe', 'dict']))
             return self.clone(selection, kdims=kdims, new_type=new_type,
                               datatype=datatype)

-        lens = set(len(util.wrap_tuple(s)) for s in samples)
+        lens = set(len(wrap_tuple(s)) for s in samples)
         if len(lens) > 1:
             raise IndexError('Sample coordinates must all be of the same length.')

@@ -479,7 +477,7 @@ def sample(self, samples=[], closest=True, **kwargs):
                 samples = self.closest(samples)
             except NotImplementedError:
                 pass
-        samples = [util.wrap_tuple(s) for s in samples]
+        samples = [wrap_tuple(s) for s in samples]
         return self.clone(self.interface.sample(self, samples), new_type=Table)


@@ -574,7 +572,7 @@ def groupby(self, dimensions=[], container_type=HoloMap, group_type=None,
             group_dims = [kd for kd in self.kdims if kd not in dimensions]
             kdims = [self.get_dimension(d) for d in kwargs.pop('kdims', group_dims)]
             drop_dim = len(group_dims) != len(kdims)
-            group_kwargs = dict(util.get_param_values(self), kdims=kdims)
+            group_kwargs = dict(get_param_values(self), kdims=kdims)
             group_kwargs.update(kwargs)
             def load_subset(*args):
                 constraint = dict(zip(dim_names, args))
23 changes: 19 additions & 4 deletions holoviews/core/data/dask.py
@@ -1,14 +1,13 @@
 from __future__ import absolute_import

+import sys
 try:
     import itertools.izip as zip
 except ImportError:
     pass

 import numpy as np
 import pandas as pd
-import dask.dataframe as dd
-from dask.dataframe import DataFrame, Series

 from .. import util
 from ..dimension import Dimension
@@ -37,16 +36,29 @@ class DaskInterface(PandasInterface):
     some functions applied with aggregate and reduce will not work.
     """

-    types = (DataFrame, Series)
+    types = ()

     datatype = 'dask'

     default_partitions = 100

+    @classmethod
+    def loaded(cls):
+        return 'dask' in sys.modules
+
+    @classmethod
+    def applies(cls, obj):
+        if not cls.loaded():
+            return False
+        import dask.dataframe as dd
+        return isinstance(obj, (dd.DataFrame, dd.Series))
+
     @classmethod
     def init(cls, eltype, data, kdims, vdims):
+        import dask.dataframe as dd
+
         data, dims, extra = PandasInterface.init(eltype, data, kdims, vdims)
-        if not isinstance(data, DataFrame):
+        if not isinstance(data, dd.DataFrame):
             data = dd.from_pandas(data, npartitions=cls.default_partitions, sort=False)
         kdims = [d.name if isinstance(d, Dimension) else d for d in dims['kdims']]
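A distilled sketch of the deferred-import pattern used by `loaded()` and `applies()` above (the helper name here is illustrative, not HoloViews API): checking `sys.modules` answers "is the user already using dask?" without paying dask's import cost at HoloViews import time.

```python
import sys

def dask_is_loaded():
    # True only if someone has already imported dask; checking
    # sys.modules performs no import of its own.
    return 'dask' in sys.modules

if dask_is_loaded():
    # dask itself is already cached in sys.modules, so importing
    # dask.dataframe here is cheap and only happens on demand.
    import dask.dataframe as dd
```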
@@ -64,6 +76,7 @@ def shape(cls, dataset):

     @classmethod
     def range(cls, columns, dimension):
+        import dask.dataframe as dd
         column = columns.data[columns.get_dimension(dimension).name]
         if column.dtype.kind == 'O':
             column = np.sort(column[column.notnull()].compute())
@@ -211,6 +224,7 @@ def unpack_scalar(cls, columns, data):
         Given a columns object and data in the appropriate format for
         the interface, return a simple scalar.
         """
+        import dask.dataframe as dd
         if len(data.columns) > 1 or len(data) != 1:
             return data
         if isinstance(data, dd.DataFrame):
@@ -245,6 +259,7 @@ def add_dimension(cls, columns, dimension, dim_pos, values, vdim):

     @classmethod
     def concat(cls, datasets, dimensions, vdims):
+        import dask.dataframe as dd
         dataframes = []
         for key, ds in datasets:
             data = ds.data.copy()
33 changes: 16 additions & 17 deletions holoviews/core/data/grid.py
@@ -7,18 +7,7 @@
 except ImportError:
     pass

-
 import numpy as np
-array_types = (np.ndarray,)
-
-try:
-    import dask.array as da
-    array_types += (da.Array,)
-except ImportError:
-    da = None
-
-def is_dask(array):
-    return da and isinstance(array, da.Array)

 from .dictionary import DictInterface
 from .interface import Interface, DataError
@@ -27,6 +16,7 @@
 from ..dimension import OrderedDict as cyODict
 from ..ndmapping import NdMapping, item_check, sorted_context
 from .. import util
+from .util import is_dask, get_dask_array, get_array_types

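The `is_dask`, `get_dask_array`, and `get_array_types` helpers now live in `holoviews/core/data/util.py`, which is not part of this diff. A plausible sketch of what they might look like, inferred from how they are used below:

```python
import sys
import numpy as np

def get_dask_array():
    # Return the dask.array module if dask is already in use, else None;
    # callers guard with 'if da and isinstance(x, da.Array)'.
    if 'dask' not in sys.modules:
        return None
    import dask.array as da
    return da

def is_dask(array):
    da = get_dask_array()
    return bool(da) and isinstance(array, da.Array)

def get_array_types():
    # NumPy arrays always count; dask arrays only once dask is loaded.
    da = get_dask_array()
    return (np.ndarray, da.Array) if da else (np.ndarray,)
```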


@@ -90,7 +80,7 @@ def init(cls, eltype, data, kdims, vdims):
             name = dimension_name(dim)
             if name not in data:
                 raise ValueError("Values for dimension %s not found" % dim)
-            if not isinstance(data[name], array_types):
+            if not isinstance(data[name], get_array_types()):
                 data[name] = np.array(data[name])

         kdim_names = [dimension_name(d) for d in kdims]
@@ -144,7 +134,7 @@ def concat_dim(cls, datasets, dim, vdims):
                                 'of arrays must match. %s found that arrays '
                                 'along the %s dimension do not match.' %
                                 (cls.__name__, vdim.name))
-            stack = np.stack if any(is_dask(arr) for arr in arrays) else da.stack
+            stack = np.stack if any(is_dask(arr) for arr in arrays) else get_dask_array().stack
             new_data[vdim.name] = stack(arrays, -1)
         return new_data

@@ -263,7 +253,7 @@ def canonicalize(cls, dataset, data, data_coords=None, virtual_coords=[]):

         # Transpose data
         dims = [name for name in data_coords
-                if isinstance(cls.coords(dataset, name), array_types)]
+                if isinstance(cls.coords(dataset, name), get_array_types())]
         dropped = [dims.index(d) for d in dims
                    if d not in dataset.kdims+virtual_coords]
         if dropped:
@@ -346,6 +336,7 @@ def values(cls, dataset, dim, expanded=True, flat=True, compute=True):
         if dim in dataset.vdims or dataset.data[dim.name].ndim > 1:
             data = dataset.data[dim.name]
             data = cls.canonicalize(dataset, data)
+            da = get_dask_array()
             if compute and da and isinstance(data, da.Array):
                 data = data.compute()
             return data.T.flatten() if flat else data
@@ -398,12 +389,12 @@ def groupby(cls, dataset, dim_names, container_type, group_type, **kwargs):
             else:
                 group_data = cls.select(dataset, **select)

-            if np.isscalar(group_data) or (isinstance(group_data, array_types) and group_data.shape == ()):
+            if np.isscalar(group_data) or (isinstance(group_data, get_array_types()) and group_data.shape == ()):
                 group_data = {dataset.vdims[0].name: np.atleast_1d(group_data)}
                 for dim, v in zip(dim_names, unique_key):
                     group_data[dim] = np.atleast_1d(v)
             elif not drop_dim:
-                if isinstance(group_data, array_types):
+                if isinstance(group_data, get_array_types()):
                     group_data = {dataset.vdims[0].name: group_data}
                 for vdim in dataset.vdims:
                     data = group_data[vdim.name]
@@ -423,7 +414,7 @@
     def key_select_mask(cls, dataset, values, ind):
         if isinstance(ind, tuple):
             ind = slice(*ind)
-        if isinstance(ind, array_types):
+        if isinstance(ind, get_array_types()):
             mask = ind
         elif isinstance(ind, slice):
             mask = True
@@ -511,19 +502,22 @@ def select(cls, dataset, selection_mask=None, **selection):

             for kdim in dataset.kdims:
                 if cls.irregular(dataset, dim):
+                    da = get_dask_array()
                     if da and isinstance(dataset.data[kdim.name], da.Array):
                         data[kdim.name] = dataset.data[kdim.name].vindex[index]
                     else:
                         data[kdim.name] = np.asarray(data[kdim.name])[index]

             for vdim in dataset.vdims:
+                da = get_dask_array()
                 if da and isinstance(dataset.data[vdim.name], da.Array):
                     data[vdim.name] = dataset.data[vdim.name].vindex[index]
                 else:
                     data[vdim.name] = np.asarray(dataset.data[vdim.name])[index]

         if indexed:
             if len(dataset.vdims) == 1:
+                da = get_dask_array()
                 arr = np.squeeze(data[dataset.vdims[0].name])
                 if da and isinstance(arr, da.Array):
                     arr = arr.compute()
@@ -559,6 +553,7 @@ def sample(cls, dataset, samples=[]):
             for d, arr in zip(dimensions, np.meshgrid(*sampled)):
                 data[d].append(arr)
             for vdim, array in zip(dataset.vdims, arrays):
+                da = get_dask_array()
                 flat_index = np.ravel_multi_index(tuple(int_inds)[::-1], array.shape)
                 if da and isinstance(array, da.Array):
                     data[vdim.name].append(array.flatten().vindex[tuple(flat_index)])
@@ -574,6 +569,7 @@ def aggregate(cls, dataset, kdims, function, **kwargs):
         data = {kdim: dataset.data[kdim] for kdim in kdims}
         axes = tuple(dataset.ndims-dataset.get_dimension_index(kdim)-1
                      for kdim in dataset.kdims if kdim not in kdims)
+        da = get_dask_array()
         for vdim in dataset.vdims:
             values = dataset.data[vdim.name]
             atleast_1d = da.atleast_1d if is_dask(values) else np.atleast_1d
@@ -649,6 +645,7 @@ def iloc(cls, dataset, index):
                 new_data.append(cls.values(dataset, d, compute=False)[rows])

         if scalar:
+            da = get_dask_array()
             if new_data and isinstance(new_data[0], da.Array):
                 return new_data[0].compute()[0]
             return new_data[0][0]
@@ -661,6 +658,8 @@ def range(cls, dataset, dimension):
             column = cls.coords(dataset, dimension, expanded=expanded, edges=True)
         else:
             column = cls.values(dataset, dimension, expanded=False, flat=False)
+
+        da = get_dask_array()
         if column.dtype.kind == 'M':
             dmin, dmax = column.min(), column.max()
             if da and isinstance(column, da.Array):
25 changes: 24 additions & 1 deletion holoviews/core/data/interface.py
@@ -97,12 +97,32 @@ class Interface(param.Parameterized):

     datatype = None

+    types = ()
+
     # Denotes whether the interface expects gridded data
     gridded = False

     # Denotes whether the interface expects ragged data
     multi = False

+    @classmethod
+    def loaded(cls):
+        """
+        Indicates whether the required dependencies are loaded.
+        """
+        return True
+
+    @classmethod
+    def applies(cls, obj):
+        """
+        Indicates whether the interface is designed specifically to
+        handle the supplied object's type. By default simply checks
+        if the object is one of the types declared on the class,
+        however if the type is expensive to import at load time the
+        method may be overridden.
+        """
+        return any(isinstance(obj, t) for t in cls.types)

     @classmethod
     def register(cls, interface):
         cls.interfaces[interface.datatype] = interface
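The default `applies()` falls back to the `types` tuple, while interfaces with heavyweight dependencies override both hooks. A hypothetical subclass following this pattern (the `expensive_lib` dependency and its `Frame` type are made up for illustration):

```python
import sys

from holoviews.core.data.interface import Interface

class LazyFrameInterface(Interface):
    """Hypothetical interface that defers importing 'expensive_lib'."""

    datatype = 'lazyframe'
    types = ()  # left empty so the default applies() needs no import

    @classmethod
    def loaded(cls):
        # No import happens here; we only ask whether the user already did it.
        return 'expensive_lib' in sys.modules

    @classmethod
    def applies(cls, obj):
        if not cls.loaded():
            return False
        import expensive_lib  # only reached once the user imported it
        return isinstance(obj, expensive_lib.Frame)
```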
@@ -176,14 +196,17 @@ def initialize(cls, eltype, data, kdims, vdims, datatype=None):
         # Set interface priority order
         prioritized = [cls.interfaces[p] for p in datatype
                        if p in cls.interfaces]
-        head = [intfc for intfc in prioritized if type(data) in intfc.types]
+        head = [intfc for intfc in prioritized if intfc.applies(data)]
         if head:
             # Prioritize interfaces which have matching types
             prioritized = head + [el for el in prioritized if el != head[0]]

         # Iterate over interfaces until one can interpret the input
         priority_errors = []
         for interface in prioritized:
+            if not interface.loaded() and len(datatype) != 1:
+                # Skip interface if it is not loaded and was not explicitly requested
+                continue
             try:
                 (data, dims, extra_kws) = interface.init(eltype, data, kdims, vdims)
                 break
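A usage sketch consistent with the new skip logic, assuming dask is installed but not yet imported: auto-detection stays on already-loaded interfaces, while explicitly requesting a single datatype still forces the import.

```python
import pandas as pd
import holoviews as hv

df = pd.DataFrame({'x': [0, 1, 2], 'y': [1.0, 2.0, 3.0]})

# Auto-detection only considers interfaces whose dependencies are already
# loaded, so this resolves to the pandas interface:
ds_auto = hv.Dataset(df, 'x', 'y')

# Requesting exactly one datatype bypasses the loaded() check, importing
# dask on demand:
ds_dask = hv.Dataset(df, 'x', 'y', datatype=['dask'])
```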