
Commit

Merge pull request #39 from khaeru/sparse-0.12
Adjust to sparse 0.12; enhance AttrSeries.{assign_coords,sel}
khaeru committed Mar 22, 2021
2 parents 55a2b15 + ed4d89f commit 3fa97e9
Showing 10 changed files with 291 additions and 66 deletions.
1 change: 1 addition & 0 deletions doc/conf.py
@@ -66,6 +66,7 @@
"plotnine": ("https://plotnine.readthedocs.io/en/stable/", None),
"pyam": ("https://pyam-iamc.readthedocs.io/en/stable/", None),
"python": ("https://docs.python.org/3/", None),
"xarray": ("https://xarray.pydata.org/en/stable/", None),
}

# -- Options for sphinx.ext.todo -------------------------------------------------------
14 changes: 12 additions & 2 deletions doc/whatsnew.rst
@@ -6,8 +6,18 @@ What's new
:backlinks: none
:depth: 1

- .. Next release
- .. ============
+ Next release
+ ============

+ - Bump minimum version of :mod:`sparse` from 0.10 to 0.12 and adjust to changes in this version (:pull:`39`)

+   - Remove :meth:`.SparseDataArray.equals`, obviated by improvements in :mod:`sparse`.

+ - Improve :class:`.AttrSeries` (:pull:`39`)

+   - Implement :meth:`~.AttrSeries.drop_vars` and :meth:`~.AttrSeries.expand_dims`.
+   - :meth:`~.AttrSeries.assign_coords` can relabel an entire dimension.
+   - :meth:`~.AttrSeries.sel` can accept :class:`.DataArray` indexers and rename/combine dimensions.

v1.2.1 (2021-03-08)
===================
4 changes: 1 addition & 3 deletions genno/computations.py
@@ -111,9 +111,7 @@ def aggregate(quantity, groups, keep):

# Aggregate each group
for group, members in dim_groups.items():
-         agg = (
-             quantity.sel({dim: members}).sum(dim=dim).assign_coords(**{dim: group})
-         )
+         agg = quantity.sel({dim: members}).sum(dim=dim).expand_dims({dim: [group]})

if isinstance(agg, AttrSeries):
# .transpose() is necessary for AttrSeries
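The switch from assign_coords() to expand_dims() keeps the group label as a length-1 dimension after the sum collapses `dim`, so per-group results can later be concatenated along that dimension. A minimal sketch of the same pattern in plain xarray, with hypothetical dimension and label names:

import xarray as xr

# Toy quantity with dimensions "t" and "r"; names and values are hypothetical
qty = xr.DataArray(
    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
    coords={"t": [2020, 2021], "r": ["AA", "BB", "CC"]},
    dims=["t", "r"],
)

# Sum the members of one group along "r", then restore "r" with the group's
# label as its single entry
members = ["AA", "BB"]
agg = qty.sel(r=members).sum(dim="r").expand_dims({"r": ["AA+BB"]})

print(agg.dims)  # ('r', 't')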
162 changes: 135 additions & 27 deletions genno/core/attrseries.py
@@ -1,21 +1,32 @@
import logging
- from typing import Any, Hashable, Mapping
+ from typing import Any, Hashable, Iterable, Mapping, Union

import pandas as pd
import pandas.core.indexes.base as ibase
import xarray as xr
+ from xarray.core.utils import either_dict_or_kwargs

from genno.core.quantity import Quantity

log = logging.getLogger(__name__)


+ def _multiindex_of(obj: pd.Series):
+     """Return ``obj.index``; if this is not a :class:`pandas.MultiIndex`, convert."""
+     return (
+         obj.index
+         if isinstance(obj.index, pd.MultiIndex)
+         else pd.MultiIndex.from_product([obj.index])
+     )
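This helper normalizes a plain Index to a 1-level MultiIndex, so downstream code can rely on .levels and .set_levels uniformly. A small sketch with a hypothetical dimension name:

import pandas as pd

s = pd.Series([1.0, 2.0], index=pd.Index(["x", "y"], name="dim_0"))

# from_product with a single iterable yields a 1-level MultiIndex, and the
# level name is inferred from the Index's own name
idx = pd.MultiIndex.from_product([s.index])
print(type(idx).__name__, list(idx.names))  # MultiIndex ['dim_0']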


class AttrSeries(pd.Series, Quantity):
""":class:`pandas.Series` subclass imitating :class:`xarray.DataArray`.
The AttrSeries class provides similar methods and behaviour to
:class:`xarray.DataArray`, so that :mod:`genno.computations` methods can use xarray-
like syntax.
:class:`xarray.DataArray`, so that :mod:`genno.computations` functions and user
code can use xarray-like syntax. In particular, this allows such code to be agnostic
about the order of dimensions.
Parameters
----------
@@ -70,9 +81,27 @@ def from_series(cls, series, sparse=None):
"""Like :meth:`xarray.DataArray.from_series`."""
return AttrSeries(series)

-     def assign_coords(self, **kwargs):
+     def assign_coords(self, coords=None, **coord_kwargs):
          """Like :meth:`xarray.DataArray.assign_coords`."""
-         return pd.concat([self], keys=kwargs.values(), names=kwargs.keys())
+         coords = either_dict_or_kwargs(coords, coord_kwargs, "assign_coords")
+
+         idx = _multiindex_of(self)
+
+         # Construct a new index
+         new_idx = idx.copy()
+         for dim, values in coords.items():
+             expected_len = len(idx.levels[idx.names.index(dim)])
+             if expected_len != len(values):
+                 raise ValueError(
+                     f"conflicting sizes for dimension {repr(dim)}: length "
+                     f"{expected_len} on <this-array> and length {len(values)} on "
+                     f"{repr(dim)}"
+                 )
+
+             new_idx = new_idx.set_levels(values, level=dim)
+
+         # Return a new object with the new index
+         return self.set_axis(new_idx)
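In effect, assign_coords() can now relabel an entire dimension, using MultiIndex.set_levels() plus the same length check xarray performs. A sketch of the underlying pandas operations, with hypothetical names and data:

import pandas as pd

idx = pd.MultiIndex.from_product([["red", "green"]], names=["colour"])
s = pd.Series([1.0, 2.0], index=idx)

# Relabel the entire "colour" dimension, as assign_coords(colour=[...]) does;
# a list of the wrong length would instead raise ValueError
new_idx = idx.set_levels(["rouge", "vert"], level="colour")
relabelled = s.set_axis(new_idx)

print(relabelled.index.get_level_values("colour").tolist())  # ['rouge', 'vert']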

def bfill(self, dim: Hashable, limit: int = None):
"""Like :meth:`xarray.DataArray.bfill`."""
@@ -93,7 +122,7 @@ def coords(self):
return result

def cumprod(self, dim=None, axis=None, skipna=None, **kwargs):
"""Like :attr:`xarray.DataArray.cumprod`."""
"""Like :meth:`xarray.DataArray.cumprod`."""
if axis:
log.info(f"{self.__class__.__name__}.cumprod(…, axis=…) is ignored")

@@ -114,6 +143,31 @@ def drop(self, label):
"""Like :meth:`xarray.DataArray.drop`."""
return self.droplevel(label)

def drop_vars(
self, names: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise"
):
"""Like :meth:`xarray.DataArray.drop_vars`."""

return self.droplevel(names)

def expand_dims(
self,
dim: Union[None, Mapping[Hashable, Any]] = None,
axis=None,
**dim_kwargs: Any,
):
"""Like :meth:`xarray.DataArray.expand_dims`."""
dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims")
if axis is not None:
raise NotImplementedError # pragma: no cover

result = self
for name, values in reversed(list(dim.items())):
result = pd.concat([result] * len(values), keys=values, names=[name])

return result
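The implementation builds on pandas.concat() with keys, which prepends one new index level per added dimension and broadcasts the data across its labels. A sketch with hypothetical names:

import pandas as pd

s = pd.Series(
    [1.0, 2.0],
    index=pd.MultiIndex.from_product([["x", "y"]], names=["dim_0"]),
)

# Equivalent to expand_dims(scenario=["a", "b"]): repeat the data once per
# label and stack the copies under a new outer index level
name, values = "scenario", ["a", "b"]
result = pd.concat([s] * len(values), keys=values, names=[name])

print(list(result.index.names))  # ['scenario', 'dim_0']
print(len(result))  # 4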

def ffill(self, dim: Hashable, limit: int = None):
"""Like :meth:`xarray.DataArray.ffill`."""
return self.__class__(
@@ -141,9 +195,7 @@ def rename(self, new_name_or_name_dict):

def sel(self, indexers=None, drop=False, **indexers_kwargs):
"""Like :meth:`xarray.DataArray.sel`."""
-         indexers = xr.core.utils.either_dict_or_kwargs(
-             indexers, indexers_kwargs, "indexers"
-         )
+         indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "indexers")

if len(indexers) == 1:
level, key = list(indexers.items())[0]
@@ -156,17 +208,69 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs):
# No MultiIndex; use .loc with a slice to avoid returning scalar
return self.loc[slice(key, key)]

-         # Iterate over dimensions
-         idx = []
-         for dim in self.dims:
-             # Get an indexer for this dimension
-             i = indexers.get(dim, slice(None))
-
-             # Maybe unpack an xarray DataArray indexers, for pandas
-             idx.append(i.data if isinstance(i, xr.DataArray) else i)
-
-         # Select and return
-         return AttrSeries(self.loc[tuple(idx)])
+         if len(indexers) and all(
+             isinstance(i, xr.DataArray) for i in indexers.values()
+         ):
+             # DataArray indexers
+
+             # Combine indexers in a data set; dimensions are aligned
+             ds = xr.Dataset(indexers)
+
+             # All dimensions indexed
+             dims_indexed = set(indexers.keys())
+             # Dimensions to discard
+             dims_drop = set(ds.data_vars.keys())
+
+             # Check contents of indexers
+             if any(ds.isnull().any().values()):
+                 raise IndexError(
+                     f"Dimensions of indexers mismatch: {ds.notnull().sum()}"
+                 )
+             elif len(ds.dims) > 1:
+                 raise NotImplementedError(  # pragma: no cover
+                     f"map to > 1 dimensions {repr(ds.dims)} with AttrSeries.sel()"
+                 )
+
+             # pd.Index object with names and levels of the new dimension to be
+             # created
+             idx = ds.coords.to_index()
+
+             # Dimensions to drop on sliced data to avoid duplicated dimensions
+             drop = list(dims_indexed - dims_drop)
+
+             # Dictionary of Series to concatenate
+             data = {}
+
+             # Iterate over labels in the new dimension
+             for label in idx:
+                 # Get a slice from the indexers corresponding to this label
+                 loc_ds = ds.sel({idx.name: label})
+
+                 # Assemble a key with one element for each dimension
+                 seq = [loc_ds.get(d) for d in self.dims]
+                 # Replace None from .get() with slice(None) or unpack a single value
+                 seq = [slice(None) if item is None else item.item() for item in seq]
+
+                 # Use the key to retrieve 1+ integer locations; slice; store
+                 data[label] = self.iloc[self.index.get_locs(seq)].droplevel(drop)
+
+             # Rejoin to a single series; drop the source levels
+             data = pd.concat(data, names=[idx.name]).droplevel(list(dims_drop))
+         else:
+             # Other indexers
+
+             # Iterate over dimensions
+             idx = []
+             for dim in self.dims:
+                 # Get an indexer for this dimension
+                 i = indexers.get(dim, slice(None))
+
+                 # Maybe unpack an xarray DataArray indexer, for pandas
+                 idx.append(i.data if isinstance(i, xr.DataArray) else i)
+
+             data = self.loc[tuple(idx)]
+
+         # Return
+         return AttrSeries(data, attrs=self.attrs)
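The DataArray-indexer branch imitates xarray's vectorized ("pointwise") indexing: indexers that share a new dimension select elements pairwise and collapse the indexed dimensions into that new one. The xarray behaviour being imitated, with hypothetical names:

import xarray as xr

qty = xr.DataArray(
    [[1.0, 2.0], [3.0, 4.0]],
    coords={"colour": ["red", "green"], "size": ["S", "L"]},
    dims=["colour", "size"],
)

# Both indexers share the new dimension "case", so selection is pairwise:
# ("red", "S") -> 1.0 and ("green", "L") -> 4.0
colour = xr.DataArray(["red", "green"], dims="case", coords={"case": ["c1", "c2"]})
size = xr.DataArray(["S", "L"], dims="case", coords={"case": ["c1", "c2"]})

result = qty.sel(colour=colour, size=size)
print(result.dims, result.values)  # ('case',) [1. 4.]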

def shift(
self,
@@ -175,7 +279,7 @@ def shift(
**shifts_kwargs: int,
):
"""Like :meth:`xarray.DataArray.shift`."""
-         shifts = xr.core.utils.either_dict_or_kwargs(shifts, shifts_kwargs, "shift")
+         shifts = either_dict_or_kwargs(shifts, shifts_kwargs, "shift")
if len(shifts) > 1:
raise NotImplementedError(
f"{self.__class__.__name__}.shift() with > 1 dimension"
@@ -267,21 +371,25 @@ def align_levels(self, other):
Return a copy of `self` with common levels in the same order as `other`.
"""
+         # If other.index is a (1D) Index object, convert to a MultiIndex with 1 level
+         # so .levels[…] can be used, below. See also Quantity._single_column_df()
+         other_index = _multiindex_of(other)

# Lists of common dimensions, and dimensions on `other` missing from `self`.
common, missing = [], []
-         for (i, n) in enumerate(other.index.names):
+         for (i, n) in enumerate(other_index.names):
if n in self.index.names:
common.append(n)
else:
missing.append((i, n))

          result = self
          if len(common) == 0:
-             # Broadcast over missing dimensions
-             # TODO make this more efficient, e.g. using itertools.product()
-             for i, dim in missing:
-                 result = pd.concat(
-                     {v: result for v in other.index.get_level_values(i)}, names=[dim]
-                 )
+             # No common dimensions
+             if len(missing):
+                 # Broadcast over missing dimensions
+                 result = result.expand_dims(
+                     {dim: other_index.levels[i] for i, dim in missing}
+                 )

if len(self) == len(self.index.names) == 1:
Expand All @@ -290,15 +398,15 @@ def align_levels(self, other):
result = result.droplevel(-1)

# Reordering starts with the dimensions of `other`
-             order = list(other_index.names)
+             order = list(other_index.names)
else:
# Some common dimensions exist; no need to broadcast, only reorder
order = common

# Append the dimensions of `self`
order.extend(
filter(
-                 lambda n: n is not None and n not in other.index.names, self.index.names
+                 lambda n: n is not None and n not in other_index.names, self.index.names
)
)
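align_levels() matters because pandas arithmetic aligns on index labels: if two series name the same dimensions but order their index levels differently, the levels must be reordered before a binary operation. A minimal sketch of the idea with hypothetical dimension names:

import pandas as pd

a = pd.Series(
    [1.0, 2.0],
    index=pd.MultiIndex.from_product([["x"], ["p", "q"]], names=["dim_a", "dim_b"]),
)
b = pd.Series(
    [10.0, 20.0],
    index=pd.MultiIndex.from_product([["p", "q"], ["x"]], names=["dim_b", "dim_a"]),
)

# Reorder a's levels into b's order so multiplication aligns on labels
a2 = a.reorder_levels(list(b.index.names))
print((a2 * b).tolist())  # [10.0, 40.0]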

7 changes: 6 additions & 1 deletion genno/core/quantity.py
@@ -20,7 +20,6 @@ def to_series(self) -> pd.Series:
"""Like :meth:`xarray.DataArray.to_series`."""
# Provided only for type-checking in other packages. AttrSeries implements;
# SparseDataArray uses the xr.DataArray method.
-         raise RuntimeError

@classmethod
def from_series(cls, series, sparse=True):
@@ -55,6 +54,12 @@ def _single_column_df(data, name):

# Unpack a single column; use its name if not overridden by `name`
return data.iloc[:, 0], (name or data.columns[0])
+     # NB would prefer to do this, but pandas has several bugs for MultiIndex with
+     # only 1 level
+     # elif (
+     #     isinstance(data, pd.Series) and not isinstance(data.index, pd.MultiIndex)
+     # ):
+     #     return data.set_axis(pd.MultiIndex.from_product([data.index])), name
else:
return data, name

19 changes: 7 additions & 12 deletions genno/core/sparsedataarray.py
@@ -144,19 +144,14 @@ def ffill(self, dim: Hashable, limit: int = None):
"""Override :meth:`~xarray.DataArray.ffill` to auto-densify."""
return self._sda.dense_super.ffill(dim, limit)._sda.convert()

-     def equals(self, other) -> bool:
-         """True if two SparseDataArrays have the same dims, coords, and values.
-
-         Overrides :meth:`~xarray.DataArray.equals` for sparse data.
-         """
-         # Necessary for :meth:`xarray.testing.assert_equal` to work.
-         return self.variable.equals(other.variable, equiv=np.equal)

-     def item(self):
-         """Analogous to :meth:`pandas.Series.item`."""
-         if len(self.data.shape) == 0:
-             return self.data.data[0]
-         raise ValueError("can only convert an array of size 1 to a Python scalar")
+     def item(self, *args):
+         """Like :meth:`~xarray.DataArray.item`."""
+         if len(args):  # pragma: no cover
+             return super().item(*args)
+         elif len(self.data.shape) == 0:
+             return self.data.data[0]
+         else:
+             raise ValueError("can only convert an array of size 1 to a Python scalar")
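A usage sketch, assuming genno is installed with its sparse dependency and that SparseDataArray.from_series() mirrors xarray.DataArray.from_series(); a single-element selection yields a 0-D array whose lone stored value .item() unwraps:

import pandas as pd

from genno.core.sparsedataarray import SparseDataArray

# Hypothetical 2-D data with dimensions "x" and "y"
s = pd.Series({("a", "c"): 1.0, ("b", "c"): 2.0}).rename_axis(["x", "y"])
sda = SparseDataArray.from_series(s)

print(sda.sel(x="b", y="c").item())  # 2.0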

def sel(
self, indexers=None, method=None, tolerance=None, drop=False, **indexers_kwargs
