Add cuDFInterface to work with cuDF GPU dataframes and cupy support for XArrayInterface (#3982)
holoviz · Mar 4, 2020 · 83490d7 · 83490d7
1 parent f8c1cd6
commit 83490d7
Show file tree

Hide file tree

Showing 8 changed files with 506 additions and 19 deletions.
diff --git a/holoviews/core/data/__init__.py b/holoviews/core/data/__init__.py
@@ -55,6 +55,12 @@
 except ImportError:
     pass
 
+try:
+    from .cudf import cuDFInterface   # noqa (Conditional API import)
+    datatypes.append('cuDF')
+except ImportError:
+    pass
+
 if 'array' not in datatypes:
     datatypes.append('array')
 if 'multitabular' not in datatypes:

diff --git a/holoviews/core/data/cudf.py b/holoviews/core/data/cudf.py
@@ -0,0 +1,340 @@
+from __future__ import absolute_import
+
+import sys
+import warnings
+
+try:
+    import itertools.izip as zip
+except ImportError:
+    pass
+
+from itertools import product
+
+import numpy as np
+
+from .. import util
+from ..dimension import dimension_name
+from ..element import Element
+from ..ndmapping import NdMapping, item_check, sorted_context
+from .interface import DataError, Interface
+from .pandas import PandasInterface
+
+
+class cuDFInterface(PandasInterface):
+    """
+    The cuDFInterface allows Dataset objects to wrap a cuDF
+    DataFrame object. Using cuDF allows working with columnar
+    data on a GPU. Most operations leave the data in GPU memory,
+    however to plot the data it has to be loaded into host memory.
+
+    The cuDFInterface covers almost the complete API exposed
+    by the PandasInterface with two notable exceptions:
+
+    1) Aggregation and groupby do not have a consistent sort order
+       (see https://github.com/rapidsai/cudf/issues/4237)
+    2) Not all functions can be easily applied to a cuDF so
+       some functions applied with aggregate and reduce will not work.
+    """
+
+    datatype = 'cuDF'
+
+    types = ()
+
+    @classmethod
+    def loaded(cls):
+        """
+        Return whether the cudf module has already been imported,
+        i.e. whether 'cudf' is present in sys.modules.
+        """
+        return 'cudf' in sys.modules
+
+    @classmethod
+    def applies(cls, obj):
+        """
+        Indicate whether obj is a cuDF DataFrame or Series. cudf is
+        only imported here once it is known to be loaded already.
+        """
+        if cls.loaded():
+            import cudf
+            return isinstance(obj, (cudf.DataFrame, cudf.Series))
+        return False
+
+    @classmethod
+    def init(cls, eltype, data, kdims, vdims):
+        """
+        Coerce the supplied data into a cuDF DataFrame and infer any
+        key and value dimensions that were not declared explicitly.
+
+        Returns a tuple of the DataFrame, a dictionary holding the
+        resolved 'kdims' and 'vdims' and an empty dictionary. Raises
+        a DataError if a dimension is an integer or if a dimension
+        references a duplicated column.
+        """
+        import cudf
+        import pandas as pd
+
+        element_params = eltype.param.objects()
+        kdim_param = element_params['kdims']
+        vdim_param = element_params['vdims']
+
+        # A cuDF or pandas Series becomes a single column frame
+        if isinstance(data, (cudf.Series, pd.Series)):
+            data = data.to_frame()
+
+        # Anything that is not yet a cuDF DataFrame is first handled by
+        # PandasInterface.init and the result is converted to cuDF
+        if not isinstance(data, cudf.DataFrame):
+            data, _, _ = PandasInterface.init(eltype, data, kdims, vdims)
+            data = cudf.from_pandas(data)
+
+        ncols = len(data.columns)
+        index_names = [data.index.name]
+        if index_names == [None]:
+            index_names = ['index']
+        # Single column data on an auto-indexable element falls back
+        # to the index name as its key dimension
+        if eltype._auto_indexable_1d and ncols == 1 and kdims is None:
+            kdims = list(index_names)
+
+        # Upper limits on the number of key and value dimensions, left
+        # as None when the declared upper bound is not an integer
+        if isinstance(kdim_param.bounds[1], int):
+            ndim = min([kdim_param.bounds[1], len(kdim_param.default)])
+        else:
+            ndim = None
+        nvdim = vdim_param.bounds[1] if isinstance(vdim_param.bounds[1], int) else None
+        # Derive whichever of kdims and vdims was not supplied from
+        # the columns of the frame
+        if kdims and vdims is None:
+            vdims = [c for c in data.columns if c not in kdims]
+        elif vdims and kdims is None:
+            kdims = [c for c in data.columns if c not in vdims][:ndim]
+        elif kdims is None:
+            kdims = list(data.columns[:ndim])
+            if vdims is None:
+                vdims = [d for d in data.columns[ndim:((ndim+nvdim) if nvdim else None)]
+                         if d not in kdims]
+        elif kdims == [] and vdims is None:
+            vdims = list(data.columns[:nvdim if nvdim else None])
+
+        # Handle reset of index if kdims reference index by name
+        for kd in kdims:
+            kd = dimension_name(kd)
+            if kd in data.columns:
+                continue
+            if any(kd == ('index' if name is None else name)
+                   for name in index_names):
+                data = data.reset_index()
+                break
+        if any(isinstance(d, (np.int64, int)) for d in kdims+vdims):
+            raise DataError("cudf DataFrame column names used as dimensions "
+                            "must be strings not integers.", cls)
+
+        # Single column data whose first key dimension is not a column
+        # gets an integer range inserted under that name (on a copy)
+        if kdims:
+            kdim = dimension_name(kdims[0])
+            if eltype._auto_indexable_1d and ncols == 1 and kdim not in data.columns:
+                data = data.copy()
+                data.insert(0, kdim, np.arange(len(data)))
+
+        # Reject dimensions whose name matches more than one column
+        for d in kdims+vdims:
+            d = dimension_name(d)
+            if len([c for c in data.columns if c == d]) > 1:
+                raise DataError('Dimensions may not reference duplicated DataFrame '
+                                'columns (found duplicate %r columns). If you want to plot '
+                                'a column against itself simply declare two dimensions '
+                                'with the same name. '% d, cls)
+        return data, {'kdims':kdims, 'vdims':vdims}, {}
+
+
+    @classmethod
+    def range(cls, dataset, dimension):
+        """
+        Return the (min, max) tuple of the column backing the given
+        dimension. Columns of object dtype yield (NaN, NaN) instead.
+        """
+        column = dataset.data[dataset.get_dimension(dimension, strict=True).name]
+        if column.dtype.kind == 'O':
+            # np.nan is the canonical spelling; the np.NaN alias was
+            # removed in NumPy 2.0
+            return np.nan, np.nan
+        else:
+            return (column.min(), column.max())
+
+
+    @classmethod
+    def values(cls, dataset, dim, expanded=True, flat=True, compute=True,
+               keep_index=False):
+        """
+        Return the column for the given dimension. With expanded
+        disabled only the unique values are returned; keep_index
+        returns the expanded column as is. When compute is enabled
+        the result is converted with to_array, otherwise the cuDF
+        object is returned unchanged.
+        """
+        column = dataset.data[dataset.get_dimension(dim, strict=True).name]
+        if not expanded:
+            column = column.unique()
+        elif keep_index:
+            return column
+        return column.to_array() if compute else column
+
+
+    @classmethod
+    def groupby(cls, dataset, dimensions, container_type, group_type, **kwargs):
+        """
+        Group the dataset along the supplied dimensions, returning a
+        container_type holding one group_type per combination of
+        unique values that selects at least one row.
+        """
+        # Get dimensions information
+        dimensions = [dataset.get_dimension(d).name for d in dimensions]
+        kdims = [kdim for kdim in dataset.kdims if kdim not in dimensions]
+
+        # Update the kwargs appropriately for Element group types
+        group_kwargs = {}
+        group_type = dict if group_type == 'raw' else group_type
+        if issubclass(group_type, Element):
+            group_kwargs.update(util.get_param_values(dataset))
+            group_kwargs['kdims'] = kdims
+        group_kwargs.update(kwargs)
+
+        # Propagate dataset
+        group_kwargs['dataset'] = dataset.dataset
+
+        # Find all the keys along supplied dimensions; each dimension
+        # must contribute the unique values of its own column rather
+        # than those of the first grouping dimension
+        keys = product(*(dataset.data[d].unique() for d in dimensions))
+
+        # Iterate over the unique entries applying selection masks
+        grouped_data = []
+        for unique_key in util.unique_iterator(keys):
+            group_data = dataset.select(**dict(zip(dimensions, unique_key)))
+            # Combinations that do not occur in the data are skipped
+            if not len(group_data):
+                continue
+            group_data = group_type(group_data, **group_kwargs)
+            grouped_data.append((unique_key, group_data))
+
+        if issubclass(container_type, NdMapping):
+            with item_check(False), sorted_context(False):
+                kdims = [dataset.get_dimension(d) for d in dimensions]
+                return container_type(grouped_data, kdims=kdims)
+        else:
+            return container_type(grouped_data)
+
+
+    @classmethod
+    def select_mask(cls, dataset, selection):
+        """
+        Given a Dataset object and a dictionary with dimension keys and
+        selection keys (i.e tuple ranges, slices, sets, lists or literals)
+        return a boolean mask over the rows in the Dataset object that
+        have been selected. Callables are applied to the column and
+        their result is used as the mask. Returns None if no
+        selection contributed a mask.
+        """
+        mask = None
+        for dim, sel in selection.items():
+            # Tuples are shorthand for slices
+            if isinstance(sel, tuple):
+                sel = slice(*sel)
+            arr = cls.values(dataset, dim, compute=False)
+            if util.isdatetime(arr) and util.pd:
+                # Best effort conversion, the selection is used as
+                # supplied if it cannot be parsed
+                try:
+                    sel = util.parse_datetime_selection(sel)
+                except Exception:
+                    pass
+
+            new_masks = []
+            if isinstance(sel, slice):
+                # Lower bound is inclusive, upper bound is exclusive
+                with warnings.catch_warnings():
+                    warnings.filterwarnings('ignore', r'invalid value encountered')
+                    if sel.start is not None:
+                        new_masks.append(sel.start <= arr)
+                    if sel.stop is not None:
+                        new_masks.append(arr < sel.stop)
+                if not new_masks:
+                    continue
+                new_mask = new_masks[0]
+                for imask in new_masks[1:]:
+                    new_mask = new_mask & imask
+            elif isinstance(sel, (set, list)):
+                # Rows matching any of the listed values are selected
+                for v in sel:
+                    new_masks.append(arr==v)
+                if not new_masks:
+                    continue
+                new_mask = new_masks[0]
+                for imask in new_masks[1:]:
+                    new_mask = new_mask | imask
+            elif callable(sel):
+                new_mask = sel(arr)
+            else:
+                new_mask = arr == sel
+
+            # Combine without in-place operators so that a mask object
+            # returned by a user supplied callable is never modified
+            if mask is None:
+                mask = new_mask
+            else:
+                mask = mask & new_mask
+        return mask
+
+
+    @classmethod
+    def select(cls, dataset, selection_mask=None, **selection):
+        """
+        Apply a selection to the data, computing the mask from the
+        selection unless a selection_mask is supplied. An indexed
+        selection matching a single row of a dataset with a single
+        value dimension returns that value instead of a frame.
+        """
+        data = dataset.data
+        mask = selection_mask
+        if mask is None:
+            mask = cls.select_mask(dataset, selection)
+
+        is_indexed = cls.indexed(dataset, selection)
+        if mask is not None:
+            data = data[mask]
+        single_value = is_indexed and len(data) == 1 and len(dataset.vdims) == 1
+        if single_value:
+            value_column = dataset.vdims[0].name
+            return data[value_column].iloc[0]
+        return data
+
+
+    @classmethod
+    def concat_fn(cls, dataframes, **kwargs):
+        """
+        Concatenate the supplied dataframes with cudf.concat,
+        forwarding any keyword arguments.
+        """
+        import cudf
+        return cudf.concat(dataframes, **kwargs)
+
+
+    @classmethod
+    def add_dimension(cls, dataset, dimension, dim_pos, values, vdim):
+        """
+        Return a copy of the data with the values assigned to a column
+        named after the dimension. The copy is returned unchanged if a
+        column of that name already exists.
+        """
+        df = dataset.data.copy()
+        if dimension.name in df:
+            return df
+        df[dimension.name] = values
+        return df
+
+
+    @classmethod
+    def aggregate(cls, dataset, dimensions, function, **kwargs):
+        """
+        Aggregate the value dimensions using the method named after
+        function, grouped by the key dimensions found in dimensions or
+        across the whole frame when no dimensions are supplied.
+
+        Returns the aggregated frame together with the list of value
+        dimensions missing from it. Raises a ValueError when no
+        method of the requested name is available.
+        """
+        frame = dataset.data
+        group_cols = [kd.name for kd in dataset.kdims if kd in dimensions]
+        value_cols = dataset.dimensions('value', label='name')
+        subset = frame[group_cols+value_cols]
+        method = function.__name__
+        if len(dimensions):
+            # numpy names differ from the groupby method names
+            method = {'amin': 'min', 'amax': 'max'}.get(method, method)
+            grouped = subset.groupby(group_cols, sort=False)
+            if not hasattr(grouped, method):
+                raise ValueError('%s aggregation is not supported on cudf DataFrame.' % method)
+            df = getattr(grouped, method)().reset_index()
+        else:
+            method = {'amin': 'min', 'amax': 'max', 'size': 'count'}.get(method, method)
+            if not hasattr(subset, method):
+                raise ValueError('%s aggregation is not supported on cudf DataFrame.' % method)
+            reduced = getattr(subset, method)()
+            # The reduction is assembled into a single row pandas frame
+            row = {col: [v] for col, v in zip(reduced.index, reduced.to_array())}
+            df = util.pd.DataFrame(row, columns=list(reduced.index))
+
+        dropped = [vd for vd in value_cols if vd not in df.columns]
+        return df, dropped
+
+
+    @classmethod
+    def iloc(cls, dataset, index):
+        """
+        Index the data by a (rows, cols) tuple of integer locations.
+        Columns may be given as a slice over the dimensions, a single
+        dimension or an iterable of dimensions. A scalar row together
+        with a scalar column returns the individual value.
+        """
+        import cudf
+
+        rows, cols = index
+        df = dataset.data
+        all_columns = list(df.columns)
+        is_scalar = False
+        if isinstance(cols, slice):
+            names = [d.name for d in dataset.dimensions()][cols]
+        elif np.isscalar(cols):
+            is_scalar = np.isscalar(rows)
+            names = [dataset.get_dimension(cols).name]
+        else:
+            names = [dataset.get_dimension(d).name for d in cols]
+        positions = [all_columns.index(name) for name in names]
+        if np.isscalar(rows):
+            rows = [rows]
+
+        if is_scalar:
+            return df[names[0]].iloc[rows[0]]
+        result = df.iloc[rows, positions]
+
+        # cuDF does not handle single rows and cols indexing correctly
+        # as of cudf=0.10.0 so we have to convert Series back to DataFrame
+        if not isinstance(result, cudf.Series):
+            return result
+        if len(names) == 1:
+            return result.to_frame(names[0])
+        return result.to_frame().T
+
+
+    @classmethod
+    def sort(cls, dataset, by=[], reverse=False):
+        """
+        Return the data sorted by the columns of the given dimensions,
+        in descending order when reverse is enabled.
+        """
+        ascending = not reverse
+        names = [dataset.get_dimension(dim, strict=True).name for dim in by]
+        return dataset.data.sort_values(by=names, ascending=ascending)
+
+
+    @classmethod
+    def dframe(cls, dataset, dimensions):
+        """
+        Convert the data to a pandas DataFrame, restricted to the
+        given dimensions when any are supplied.
+        """
+        data = dataset.data
+        if dimensions:
+            data = data[dimensions]
+        return data.to_pandas()
+
+
+# Register cuDFInterface with the Interface base class
+Interface.register(cuDFInterface)
diff --git a/holoviews/core/data/dask.py b/holoviews/core/data/dask.py
@@ -270,15 +270,9 @@ def add_dimension(cls, dataset, dimension, dim_pos, values, vdim):
         return data
 
     @classmethod
-    def concat(cls, datasets, dimensions, vdims):
+    def concat_fn(cls, dataframes, **kwargs):
         import dask.dataframe as dd
-        dataframes = []
-        for key, ds in datasets:
-            data = ds.data.copy()
-            for d, k in zip(dimensions, key):
-                data[d.name] = k
-            dataframes.append(data)
-        return dd.concat(dataframes)
+        return dd.concat(dataframes, **kwargs)
 
     @classmethod
     def dframe(cls, dataset, dimensions):