Skip to content

Commit

Permalink
Handle kdims referencing multi-index in dask
Browse files Browse the repository at this point in the history
  • Loading branch information
philippjfr committed Jun 10, 2018
1 parent 98b4ea7 commit 41e5cc4
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 5 deletions.
10 changes: 8 additions & 2 deletions holoviews/core/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dask.dataframe import DataFrame

from .. import util
from ..dimension import Dimension
from ..element import Element
from ..ndmapping import NdMapping, item_check, OrderedDict
from .interface import Interface
Expand Down Expand Up @@ -44,10 +45,15 @@ class DaskInterface(PandasInterface):

@classmethod
def init(cls, eltype, data, kdims, vdims):
    """
    Delegates to PandasInterface.init, wraps any non-dask result in
    a dask DataFrame and, if a key dimension is missing from the
    columns, resets the index when doing so exposes every key
    dimension as a column.

    Returns the (data, dims, extra) triple produced by
    PandasInterface.init with data replaced by the dask DataFrame.
    """
    data, dims, extra = PandasInterface.init(eltype, data, kdims, vdims)
    if not isinstance(data, DataFrame):
        data = dd.from_pandas(data, npartitions=cls.default_partitions, sort=False)

    # Reduce each key dimension spec to the column name it refers to:
    # Dimension objects carry it as .name and (name, label) tuples as
    # their first item; anything else is used as given.
    names = []
    for spec in dims['kdims']:
        if isinstance(spec, Dimension):
            spec = spec.name
        elif isinstance(spec, tuple):
            spec = spec[0]
        names.append(spec)

    # Test membership directly rather than the truthiness of the names
    if any(name not in data.columns for name in names):
        reset = data.reset_index()
        # Only adopt the reset frame if it resolves every key dimension
        if all(name in reset.columns for name in names):
            data = reset
    return data, dims, extra

@classmethod
def shape(cls, dataset):
Expand Down
3 changes: 1 addition & 2 deletions tests/core/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def init_column_data(self):
# all interfaces.

def test_dataset_array_init_hm(self):
    "A homogeneous 2D array yields data of the interface's data type"
    stacked = np.column_stack([self.xs, self.xs_2])
    dataset = Dataset(stacked, kdims=['x'], vdims=['x2'])
    matches_type = isinstance(dataset.data, self.data_type)
    self.assertTrue(matches_type)
Expand All @@ -106,7 +105,7 @@ def test_dataset_dataframe_init_hm(self):
if pd is None:
raise SkipTest("Pandas not available")
dataset = Dataset(pd.DataFrame({'x':self.xs, 'x2':self.xs_2}),
kdims=['x'], vdims=[ 'x2'])
kdims=['x'], vdims=['x2'])
self.assertTrue(isinstance(dataset.data, self.data_type))

def test_dataset_dataframe_init_hm_alias(self):
Expand Down
21 changes: 20 additions & 1 deletion tests/core/data/testdaskinterface.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
from nose.plugins.attrib import attr
from unittest import SkipTest

import numpy as np

# pandas and dask are optional dependencies; skip this module's tests
# when they cannot be imported.
try:
    import pandas as pd
    import dask.dataframe as dd
except Exception:
    # Deliberately broader than ImportError so a broken install still
    # skips, but no longer traps KeyboardInterrupt or SystemExit the
    # way a bare except does.
    raise SkipTest("Could not import dask, skipping DaskInterface tests.")

from holoviews.core.data import Dataset

from .testpandasinterface import PandasInterfaceTests


@attr(optional=1)
class DaskDatasetTest(PandasInterfaceTests):
"""
Test of the pandas DaskDataset interface.
Expand Down Expand Up @@ -46,3 +53,15 @@ def test_dataset_sort_string_ht(self):

def test_dataset_boolean_index(self):
raise SkipTest("Not supported")

def test_dataset_from_multi_index(self):
    "kdims naming the levels of a dask groupby index match the flat frame"
    columns = {'x': np.arange(10), 'y': np.arange(10), 'z': np.random.rand(10)}
    frame = pd.DataFrame(columns)
    # Grouping on x and y moves both columns into the index
    grouped = dd.from_pandas(frame, 1).groupby(['x', 'y']).mean()
    indexed = Dataset(grouped, ['x', 'y'])
    self.assertEqual(indexed, Dataset(frame, ['x', 'y']))

def test_dataset_from_multi_index_tuple_dims(self):
    "Tuple kdim specs naming dask groupby index levels match the flat frame"
    columns = {'x': np.arange(10), 'y': np.arange(10), 'z': np.random.rand(10)}
    frame = pd.DataFrame(columns)
    # Grouping on x and y moves both columns into the index
    grouped = dd.from_pandas(frame, 1).groupby(['x', 'y']).mean()
    indexed = Dataset(grouped, [('x', 'X'), ('y', 'Y')])
    self.assertEqual(indexed, Dataset(frame, [('x', 'X'), ('y', 'Y')]))
10 changes: 10 additions & 0 deletions tests/core/data/testpandasinterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,13 @@ def test_dataset_conversion_groupby_with_index(self):
hmap = HoloMap({0: Scatter(([0, 1], [1, 2]), 'index', 'y'),
1: Scatter([(2, 3)], 'index', 'y')}, 'x')
self.assertEqual(scatters, hmap)

def test_dataset_from_multi_index(self):
    "kdims naming the levels of a groupby index match the flat frame"
    columns = {'x': np.arange(10), 'y': np.arange(10), 'z': np.random.rand(10)}
    frame = pd.DataFrame(columns)
    # Grouping on x and y moves both columns into the index
    grouped = frame.groupby(['x', 'y']).mean()
    indexed = Dataset(grouped, ['x', 'y'])
    self.assertEqual(indexed, Dataset(frame, ['x', 'y']))

def test_dataset_from_multi_index_tuple_dims(self):
    "Tuple kdim specs naming groupby index levels match the flat frame"
    columns = {'x': np.arange(10), 'y': np.arange(10), 'z': np.random.rand(10)}
    frame = pd.DataFrame(columns)
    # Grouping on x and y moves both columns into the index
    grouped = frame.groupby(['x', 'y']).mean()
    indexed = Dataset(grouped, [('x', 'X'), ('y', 'Y')])
    self.assertEqual(indexed, Dataset(frame, [('x', 'X'), ('y', 'Y')]))

0 comments on commit 41e5cc4

Please sign in to comment.