
Commit

Merge pull request #39 from khaeru/sparse-0.12
Adjust to sparse 0.12; enhance AttrSeries.{assign_coords,sel}
khaeru committed Mar 22, 2021
2 parents 55a2b15 + ed4d89f commit 3fa97e9
Showing 10 changed files with 291 additions and 66 deletions.
1 change: 1 addition & 0 deletions doc/conf.py
@@ -66,6 +66,7 @@
"plotnine": ("https://plotnine.readthedocs.io/en/stable/", None),
"pyam": ("https://pyam-iamc.readthedocs.io/en/stable/", None),
"python": ("https://docs.python.org/3/", None),
"xarray": ("https://xarray.pydata.org/en/stable/", None),
}

# -- Options for sphinx.ext.todo -------------------------------------------------------
14 changes: 12 additions & 2 deletions doc/whatsnew.rst
@@ -6,8 +6,18 @@ What's new
:backlinks: none
:depth: 1

- .. Next release
- .. ============
+ Next release
+ ============

+ - Bump minimum version of :mod:`sparse` from 0.10 to 0.12 and adjust to changes in this version (:pull:`39`)

+   - Remove :meth:`.SparseDataArray.equals`, obviated by improvements in :mod:`sparse`.

+ - Improve :class:`.AttrSeries` (:pull:`39`)

+   - Implement :meth:`~.AttrSeries.drop_vars` and :meth:`~.AttrSeries.expand_dims`.
+   - :meth:`~.AttrSeries.assign_coords` can relabel an entire dimension.
+   - :meth:`~.AttrSeries.sel` can accept :class:`.DataArray` indexers and rename/combine dimensions.

v1.2.1 (2021-03-08)
===================
4 changes: 1 addition & 3 deletions genno/computations.py
@@ -111,9 +111,7 @@ def aggregate(quantity, groups, keep):

# Aggregate each group
for group, members in dim_groups.items():
-         agg = (
-             quantity.sel({dim: members}).sum(dim=dim).assign_coords(**{dim: group})
-         )
+         agg = quantity.sel({dim: members}).sum(dim=dim).expand_dims({dim: [group]})

if isinstance(agg, AttrSeries):
# .transpose() is necessary for AttrSeries
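The switch from assign_coords() to expand_dims() keeps the group label as a length-1 dimension after the sum collapses `dim`, so per-group results can later be concatenated along that dimension. A minimal sketch of the same pattern in plain xarray, with hypothetical dimension and label names:

import xarray as xr

# Toy quantity with dimensions "t" and "r"; names and values are hypothetical
qty = xr.DataArray(
    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
    coords={"t": [2020, 2021], "r": ["AA", "BB", "CC"]},
    dims=["t", "r"],
)

# Sum the members of one group along "r", then restore "r" with the group's
# label as its single entry
members = ["AA", "BB"]
agg = qty.sel(r=members).sum(dim="r").expand_dims({"r": ["AA+BB"]})

print(agg.dims)  # ('r', 't')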
162 changes: 135 additions & 27 deletions genno/core/attrseries.py
@@ -1,21 +1,32 @@
import logging
- from typing import Any, Hashable, Mapping
+ from typing import Any, Hashable, Iterable, Mapping, Union

import pandas as pd
import pandas.core.indexes.base as ibase
import xarray as xr
+ from xarray.core.utils import either_dict_or_kwargs

from genno.core.quantity import Quantity

log = logging.getLogger(__name__)


+ def _multiindex_of(obj: pd.Series):
+     """Return ``obj.index``; if this is not a :class:`pandas.MultiIndex`, convert."""
+     return (
+         obj.index
+         if isinstance(obj.index, pd.MultiIndex)
+         else pd.MultiIndex.from_product([obj.index])
+     )
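This helper normalizes a plain Index to a 1-level MultiIndex, so downstream code can rely on .levels and .set_levels uniformly. A small sketch with a hypothetical dimension name:

import pandas as pd

s = pd.Series([1.0, 2.0], index=pd.Index(["x", "y"], name="dim_0"))

# from_product with a single iterable yields a 1-level MultiIndex, and the
# level name is inferred from the Index's own name
idx = pd.MultiIndex.from_product([s.index])
print(type(idx).__name__, list(idx.names))  # MultiIndex ['dim_0']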


class AttrSeries(pd.Series, Quantity):
""":class:`pandas.Series` subclass imitating :class:`xarray.DataArray`.
The AttrSeries class provides similar methods and behaviour to
:class:`xarray.DataArray`, so that :mod:`genno.computations` methods can use xarray-
like syntax.
:class:`xarray.DataArray`, so that :mod:`genno.computations` functions and user
code can use xarray-like syntax. In particular, this allows such code to be agnostic
about the order of dimensions.
Parameters
----------
@@ -70,9 +81,27 @@ def from_series(cls, series, sparse=None):
"""Like :meth:`xarray.DataArray.from_series`."""
return AttrSeries(series)

-     def assign_coords(self, **kwargs):
+     def assign_coords(self, coords=None, **coord_kwargs):
          """Like :meth:`xarray.DataArray.assign_coords`."""
-         return pd.concat([self], keys=kwargs.values(), names=kwargs.keys())
+         coords = either_dict_or_kwargs(coords, coord_kwargs, "assign_coords")
+
+         idx = _multiindex_of(self)
+
+         # Construct a new index
+         new_idx = idx.copy()
+         for dim, values in coords.items():
+             expected_len = len(idx.levels[idx.names.index(dim)])
+             if expected_len != len(values):
+                 raise ValueError(
+                     f"conflicting sizes for dimension {repr(dim)}: length "
+                     f"{expected_len} on <this-array> and length {len(values)} on "
+                     f"{repr(dim)}"
+                 )
+
+             new_idx = new_idx.set_levels(values, level=dim)
+
+         # Return a new object with the new index
+         return self.set_axis(new_idx)
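In effect, assign_coords() can now relabel an entire dimension, using MultiIndex.set_levels() plus the same length check xarray performs. A sketch of the underlying pandas operations, with hypothetical names and data:

import pandas as pd

idx = pd.MultiIndex.from_product([["red", "green"]], names=["colour"])
s = pd.Series([1.0, 2.0], index=idx)

# Relabel the entire "colour" dimension, as assign_coords(colour=[...]) does;
# a list of the wrong length would instead raise ValueError
new_idx = idx.set_levels(["rouge", "vert"], level="colour")
relabelled = s.set_axis(new_idx)

print(relabelled.index.get_level_values("colour").tolist())  # ['rouge', 'vert']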

def bfill(self, dim: Hashable, limit: int = None):
"""Like :meth:`xarray.DataArray.bfill`."""
@@ -93,7 +122,7 @@ def coords(self):
return result

def cumprod(self, dim=None, axis=None, skipna=None, **kwargs):
"""Like :attr:`xarray.DataArray.cumprod`."""
"""Like :meth:`xarray.DataArray.cumprod`."""
if axis:
log.info(f"{self.__class__.__name__}.cumprod(…, axis=…) is ignored")

@@ -114,6 +143,31 @@ def drop(self, label):
"""Like :meth:`xarray.DataArray.drop`."""
return self.droplevel(label)

def drop_vars(
self, names: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise"
):
"""Like :meth:`xarray.DataArray.drop_vars`."""

return self.droplevel(names)

def expand_dims(
self,
dim: Union[None, Mapping[Hashable, Any]] = None,
axis=None,
**dim_kwargs: Any,
):
"""Like :meth:`xarray.DataArray.expand_dims`."""
dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims")
if axis is not None:
raise NotImplementedError # pragma: no cover

result = self
for name, values in reversed(list(dim.items())):
result = pd.concat([result] * len(values), keys=values, names=[name])

return result
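The implementation builds on pandas.concat() with keys, which prepends one new index level per added dimension and broadcasts the data across its labels. A sketch with hypothetical names:

import pandas as pd

s = pd.Series(
    [1.0, 2.0],
    index=pd.MultiIndex.from_product([["x", "y"]], names=["dim_0"]),
)

# Equivalent to expand_dims(scenario=["a", "b"]): repeat the data once per
# label and stack the copies under a new outer index level
name, values = "scenario", ["a", "b"]
result = pd.concat([s] * len(values), keys=values, names=[name])

print(list(result.index.names))  # ['scenario', 'dim_0']
print(len(result))  # 4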

def ffill(self, dim: Hashable, limit: int = None):
"""Like :meth:`xarray.DataArray.ffill`."""
return self.__class__(
@@ -141,9 +195,7 @@ def rename(self, new_name_or_name_dict):

def sel(self, indexers=None, drop=False, **indexers_kwargs):
"""Like :meth:`xarray.DataArray.sel`."""
-         indexers = xr.core.utils.either_dict_or_kwargs(
-             indexers, indexers_kwargs, "indexers"
-         )
+         indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "indexers")

if len(indexers) == 1:
level, key = list(indexers.items())[0]
@@ -156,17 +208,69 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs):
# No MultiIndex; use .loc with a slice to avoid returning scalar
return self.loc[slice(key, key)]

-         # Iterate over dimensions
-         idx = []
-         for dim in self.dims:
-             # Get an indexer for this dimension
-             i = indexers.get(dim, slice(None))
-
-             # Maybe unpack an xarray DataArray indexers, for pandas
-             idx.append(i.data if isinstance(i, xr.DataArray) else i)
-
-         # Select and return
-         return AttrSeries(self.loc[tuple(idx)])
+         if len(indexers) and all(
+             isinstance(i, xr.DataArray) for i in indexers.values()
+         ):
+             # DataArray indexers
+
+             # Combine indexers in a data set; dimensions are aligned
+             ds = xr.Dataset(indexers)
+
+             # All dimensions indexed
+             dims_indexed = set(indexers.keys())
+             # Dimensions to discard
+             dims_drop = set(ds.data_vars.keys())
+
+             # Check contents of indexers
+             if any(ds.isnull().any().values()):
+                 raise IndexError(
+                     f"Dimensions of indexers mismatch: {ds.notnull().sum()}"
+                 )
+             elif len(ds.dims) > 1:
+                 raise NotImplementedError(  # pragma: no cover
+                     f"map to > 1 dimensions {repr(ds.dims)} with AttrSeries.sel()"
+                 )
+
+             # pd.Index object with names and levels of the new dimension to be
+             # created
+             idx = ds.coords.to_index()
+
+             # Dimensions to drop on sliced data to avoid duplicated dimensions
+             drop = list(dims_indexed - dims_drop)
+
+             # Dictionary of Series to concatenate
+             data = {}
+
+             # Iterate over labels in the new dimension
+             for label in idx:
+                 # Get a slice from the indexers corresponding to this label
+                 loc_ds = ds.sel({idx.name: label})
+
+                 # Assemble a key with one element for each dimension
+                 seq = [loc_ds.get(d) for d in self.dims]
+                 # Replace None from .get() with slice(None) or unpack a single value
+                 seq = [slice(None) if item is None else item.item() for item in seq]
+
+                 # Use the key to retrieve 1+ integer locations; slice; store
+                 data[label] = self.iloc[self.index.get_locs(seq)].droplevel(drop)
+
+             # Rejoin to a single series; drop the source levels
+             data = pd.concat(data, names=[idx.name]).droplevel(list(dims_drop))
+         else:
+             # Other indexers
+
+             # Iterate over dimensions
+             idx = []
+             for dim in self.dims:
+                 # Get an indexer for this dimension
+                 i = indexers.get(dim, slice(None))
+
+                 # Maybe unpack an xarray DataArray indexer, for pandas
+                 idx.append(i.data if isinstance(i, xr.DataArray) else i)
+
+             data = self.loc[tuple(idx)]
+
+         # Return
+         return AttrSeries(data, attrs=self.attrs)
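The DataArray-indexer branch imitates xarray's vectorized ("pointwise") indexing: indexers that share a new dimension select elements pairwise and collapse the indexed dimensions into that new one. The xarray behaviour being imitated, with hypothetical names:

import xarray as xr

qty = xr.DataArray(
    [[1.0, 2.0], [3.0, 4.0]],
    coords={"colour": ["red", "green"], "size": ["S", "L"]},
    dims=["colour", "size"],
)

# Both indexers share the new dimension "case", so selection is pairwise:
# ("red", "S") -> 1.0 and ("green", "L") -> 4.0
colour = xr.DataArray(["red", "green"], dims="case", coords={"case": ["c1", "c2"]})
size = xr.DataArray(["S", "L"], dims="case", coords={"case": ["c1", "c2"]})

result = qty.sel(colour=colour, size=size)
print(result.dims, result.values)  # ('case',) [1. 4.]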

def shift(
self,
@@ -175,7 +279,7 @@ def shift(
**shifts_kwargs: int,
):
"""Like :meth:`xarray.DataArray.shift`."""
-         shifts = xr.core.utils.either_dict_or_kwargs(shifts, shifts_kwargs, "shift")
+         shifts = either_dict_or_kwargs(shifts, shifts_kwargs, "shift")
if len(shifts) > 1:
raise NotImplementedError(
f"{self.__class__.__name__}.shift() with > 1 dimension"
@@ -267,21 +371,25 @@ def align_levels(self, other):
Return a copy of `self` with common levels in the same order as `other`.
"""
+         # If other.index is a (1D) Index object, convert to a MultiIndex with 1 level
+         # so .levels[…] can be used, below. See also Quantity._single_column_df()
+         other_index = _multiindex_of(other)

# Lists of common dimensions, and dimensions on `other` missing from `self`.
common, missing = [], []
-         for (i, n) in enumerate(other.index.names):
+         for (i, n) in enumerate(other_index.names):
if n in self.index.names:
common.append(n)
else:
missing.append((i, n))

          result = self
          if len(common) == 0:
-             # Broadcast over missing dimensions
-             # TODO make this more efficient, e.g. using itertools.product()
-             for i, dim in missing:
-                 result = pd.concat(
-                     {v: result for v in other.index.get_level_values(i)}, names=[dim]
-                 )
+             # No common dimensions
+             if len(missing):
+                 # Broadcast over missing dimensions
+                 result = result.expand_dims(
+                     {dim: other_index.levels[i] for i, dim in missing}
+                 )

if len(self) == len(self.index.names) == 1:
Expand All @@ -290,15 +398,15 @@ def align_levels(self, other):
result = result.droplevel(-1)

# Reordering starts with the dimensions of `other`
-             order = list(other_index.names)
+             order = list(other_index.names)
else:
# Some common dimensions exist; no need to broadcast, only reorder
order = common

# Append the dimensions of `self`
order.extend(
filter(
-                 lambda n: n is not None and n not in other.index.names, self.index.names
+                 lambda n: n is not None and n not in other_index.names, self.index.names
)
)
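align_levels() matters because pandas arithmetic aligns on index labels: if two series name the same dimensions but order their index levels differently, the levels must be reordered before a binary operation. A minimal sketch of the idea with hypothetical dimension names:

import pandas as pd

a = pd.Series(
    [1.0, 2.0],
    index=pd.MultiIndex.from_product([["x"], ["p", "q"]], names=["dim_a", "dim_b"]),
)
b = pd.Series(
    [10.0, 20.0],
    index=pd.MultiIndex.from_product([["p", "q"], ["x"]], names=["dim_b", "dim_a"]),
)

# Reorder a's levels into b's order so multiplication aligns on labels
a2 = a.reorder_levels(list(b.index.names))
print((a2 * b).tolist())  # [10.0, 40.0]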

7 changes: 6 additions & 1 deletion genno/core/quantity.py
@@ -20,7 +20,6 @@ def to_series(self) -> pd.Series:
"""Like :meth:`xarray.DataArray.to_series`."""
# Provided only for type-checking in other packages. AttrSeries implements;
# SparseDataArray uses the xr.DataArray method.
-         raise RuntimeError

@classmethod
def from_series(cls, series, sparse=True):
@@ -55,6 +54,12 @@ def _single_column_df(data, name):

# Unpack a single column; use its name if not overridden by `name`
return data.iloc[:, 0], (name or data.columns[0])
+     # NB would prefer to do this, but pandas has several bugs for MultiIndex with
+     # only 1 level
+     # elif (
+     #     isinstance(data, pd.Series) and not isinstance(data.index, pd.MultiIndex)
+     # ):
+     #     return data.set_axis(pd.MultiIndex.from_product([data.index])), name
else:
return data, name

19 changes: 7 additions & 12 deletions genno/core/sparsedataarray.py
@@ -144,19 +144,14 @@ def ffill(self, dim: Hashable, limit: int = None):
"""Override :meth:`~xarray.DataArray.ffill` to auto-densify."""
return self._sda.dense_super.ffill(dim, limit)._sda.convert()

-     def equals(self, other) -> bool:
-         """True if two SparseDataArrays have the same dims, coords, and values.
-
-         Overrides :meth:`~xarray.DataArray.equals` for sparse data.
-         """
-         # Necessary for :meth:`xarray.testing.assert_equal` to work.
-         return self.variable.equals(other.variable, equiv=np.equal)

-     def item(self):
-         """Analogous to :meth:`pandas.Series.item`."""
-         if len(self.data.shape) == 0:
-             return self.data.data[0]
-         raise ValueError("can only convert an array of size 1 to a Python scalar")
+     def item(self, *args):
+         """Like :meth:`~xarray.DataArray.item`."""
+         if len(args):  # pragma: no cover
+             return super().item(*args)
+         elif len(self.data.shape) == 0:
+             return self.data.data[0]
+         else:
+             raise ValueError("can only convert an array of size 1 to a Python scalar")
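A usage sketch, assuming genno is installed with its sparse dependency and that SparseDataArray.from_series() mirrors xarray.DataArray.from_series(); a single-element selection yields a 0-D array whose lone stored value .item() unwraps:

import pandas as pd

from genno.core.sparsedataarray import SparseDataArray

# Hypothetical 2-D data with dimensions "x" and "y"
s = pd.Series({("a", "c"): 1.0, ("b", "c"): 2.0}).rename_axis(["x", "y"])
sda = SparseDataArray.from_series(s)

print(sda.sel(x="b", y="c").item())  # 2.0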

def sel(
self, indexers=None, method=None, tolerance=None, drop=False, **indexers_kwargs
