Skip to content

Commit

Permalink
ENH: add groupby & reduce support to EA
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Oct 11, 2018
1 parent a86501f commit 90fb20e
Show file tree
Hide file tree
Showing 9 changed files with 193 additions and 21 deletions.
9 changes: 8 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l
Here is an example of the usage.

We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`)
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`)

.. ipython:: python

Expand Down Expand Up @@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted.
pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
df['A'].astype(float)

Reduction and groupby operations such as 'sum' work.

.. ipython:: python

df.sum()
df.groupby('B').A.sum()

.. warning::

The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
Expand Down
23 changes: 23 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,29 @@ def all_arithmetic_operators(request):
return request.param


# Names of the numeric reductions used to parametrize the fixture below.
_all_numeric_reductions = [
    'sum', 'max', 'min', 'mean', 'prod', 'std', 'var', 'median',
]


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """
    Fixture parametrized over the names in ``_all_numeric_reductions``.

    Returns the current reduction name as a string.
    """
    reduction_name = request.param
    return reduction_name


# Names of the boolean reductions used to parametrize the fixture below.
_all_boolean_reductions = [
    'all',
    'any',
]


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """
    Fixture parametrized over the names in ``_all_boolean_reductions``.

    Returns the current reduction name as a string.
    """
    reduction_name = request.param
    return reduction_name


_cython_table = pd.core.base.SelectionMixin._cython_table.items()


Expand Down
29 changes: 29 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ class ExtensionArray(object):
* factorize / _values_for_factorize
* argsort / _values_for_argsort
One can implement methods to handle array reductions.
* _reduce
The remaining methods implemented on this class should be performant,
as they only compose abstract methods. Still, a more efficient
implementation may be available, and these methods can be overridden.
Expand Down Expand Up @@ -713,6 +717,31 @@ def _add_comparison_ops(cls):
cls.__le__ = cls._create_comparison_method(operator.le)
cls.__ge__ = cls._create_comparison_method(operator.ge)

def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
            filter_type=None, **kwargs):
    """
    Reduce the array to a scalar by applying ``op``.

    This implementation always raises; a subclass that wants to support
    reductions has to override it.

    Parameters
    ----------
    op : callable
        The function to apply to the array.
    name : str
        The name of the function.
    axis : int, default 0
        The axis to reduce over; defined as 0 currently.
    skipna : bool, default True
        Whether NaN values should be skipped.
    numeric_only : bool, optional
        If True, only perform numeric ops.
    filter_type : str, optional
    kwargs : dict

    Returns
    -------
    scalar

    Raises
    ------
    AbstractMethodError
        Always raised by this implementation.
    """
    raise AbstractMethodError(self)


class ExtensionScalarOpsMixin(ExtensionOpsMixin):
"""A mixin for defining the arithmetic and logical operations on
Expand Down
49 changes: 49 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,55 @@ def cmp_method(self, other):
name = '__{name}__'.format(name=op.__name__)
return set_function_name(cmp_method, name, cls)

def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
            filter_type=None, **kwds):
    """
    Return a scalar result of performing the reduction ``op``.

    Parameters
    ----------
    op : callable
        function to apply to the array; it is called as
        ``op(values, axis=axis, skipna=skipna, **kwds)``
    name : str
        name of the function
    axis : int, default 0
        axis over which to apply, defined as 0 currently
    skipna : bool, default True
        if True, skip NaN values
    numeric_only : bool, optional
        accepted as part of the signature; not used by this method
    filter_type : str, optional
        accepted as part of the signature; not used by this method
    kwds : dict
        additional keyword arguments, forwarded unchanged to ``op``

    Returns
    -------
    scalar
        For 'any' / 'all' an integer or float result is converted to
        ``bool``. For every other reduction a non-missing result that
        is integer valued is returned as a Python ``int``; anything
        else is returned as produced by ``op``.
    """
    data = self._data
    mask = self._mask

    # coerce to a nan-aware float if needed, so that the masked
    # positions can hold the NA marker
    if mask.any():
        data = data.astype('float64')
        data[mask] = self._na_value

    # forward the caller supplied options instead of dropping them
    result = op(data, axis=axis, skipna=skipna, **kwds)

    # if we have a boolean op, provide coercion back to a bool
    # type if possible
    if name in ['any', 'all']:
        if is_integer(result) or is_float(result):
            result = bool(int(result))

    # if we have a numeric op, provide coercion back to an integer
    # type if possible
    elif not isna(result):
        try:
            int_result = int(result)
        except OverflowError:
            # an infinite result has no integer representation;
            # hand it back untouched rather than raising
            pass
        else:
            if int_result == result:
                result = int_result

    return result

def _maybe_mask_result(self, result, mask, other, op_name):
"""
Parameters
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,15 +587,18 @@ def test_cross_type_arithmetic():
tm.assert_series_equal(result, expected)


def test_groupby_mean_included():
@pytest.mark.parametrize('op', ['sum', 'min', 'max'])
def test_preserve_groupby_dtypes(op):
# TODO(#22346): preserve Int64 dtype
# for ops that enable (mean would actually work here
# but generally it is a float return value)
df = pd.DataFrame({
"A": ['a', 'b', 'b'],
"B": [1, None, 3],
"C": integer_array([1, None, 3], dtype='Int64'),
})

result = df.groupby("A").sum()
# TODO(#22346): preserve Int64 dtype
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests):
from .interface import BaseInterfaceTests # noqa
from .methods import BaseMethodsTests # noqa
from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa
from .reduce import BaseNumericReduceTests, BaseBooleanReduceTests # noqa
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
8 changes: 4 additions & 4 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
Expand All @@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", sort=False).A.mean()
_, index = pd.factorize(data_for_grouping, sort=False)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([1, 3, 4], index=index, name="A")
self.assert_series_equal(result, expected)

Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/extension/base/reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import warnings
import pytest
import pandas.util.testing as tm
import pandas as pd
from .base import BaseExtensionTests


class BaseReduceTests(BaseExtensionTests):
    """
    Shared helper for the reduction tests.

    Reductions generally only make sense for numeric/boolean
    operations.
    """

    def check_reduce(self, s, op_name, skipna):
        """
        Compare a reduction on ``s`` with the same reduction on its
        float64 cast.

        Parameters
        ----------
        s : object
            Must provide the reduction ``op_name`` as a method and
            ``astype``; the tests in this module pass a ``pd.Series``.
        op_name : str
            Name of the reduction method to call.
        skipna : bool
            Passed to the reduction as ``skipna``.
        """
        actual = getattr(s, op_name)(skipna=skipna)
        as_float = s.astype('float64')
        wanted = getattr(as_float, op_name)(skipna=skipna)
        tm.assert_almost_equal(actual, wanted)


class BaseNumericReduceTests(BaseReduceTests):
    """
    Checks every numeric reduction supplied by the
    ``all_numeric_reductions`` fixture on a Series built from ``data``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna):
        """
        Run ``check_reduce`` for one reduction name and one ``skipna``
        value.
        """
        op_name = all_numeric_reductions
        s = pd.Series(data)

        # min/max with empty produce numpy warnings
        # Only RuntimeWarning is silenced. ``record=True`` is deliberately
        # not used: it would capture every other warning into a list that
        # is thrown away, hiding it from the test runner.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            self.check_reduce(s, op_name, skipna)


class BaseBooleanReduceTests(BaseReduceTests):
    """
    Checks every boolean reduction supplied by the
    ``all_boolean_reductions`` fixture on a Series built from ``data``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        """
        Run ``check_reduce`` for one reduction name and one ``skipna``
        value.
        """
        series = pd.Series(data)
        self.check_reduce(series, all_boolean_reductions, skipna)
48 changes: 35 additions & 13 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,17 +208,39 @@ class TestCasting(base.BaseCastingTests):

class TestGroupby(base.BaseGroupbyTests):

@pytest.mark.xfail(reason="groupby not working", strict=True)
def test_groupby_extension_no_sort(self, data_for_grouping):
super(TestGroupby, self).test_groupby_extension_no_sort(
data_for_grouping)

@pytest.mark.parametrize('as_index', [
pytest.param(True,
marks=pytest.mark.xfail(reason="groupby not working",
strict=True)),
False
])
@pytest.mark.parametrize('as_index', [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super(TestGroupby, self).test_groupby_extension_agg(
as_index, data_for_grouping)
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
"B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)

# TODO(ExtensionIndex): remove coercion to object
# we don't have an easy way to represent an EA as an Index object
index = pd.Index(index, name="B", dtype=object)
expected = pd.Series([3, 1, 4], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
else:
expected = expected.reset_index()
self.assert_frame_equal(result, expected)

def test_groupby_extension_no_sort(self, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
"B": data_for_grouping})
result = df.groupby("B", sort=False).A.mean()
_, index = pd.factorize(data_for_grouping, sort=False)

# TODO(ExtensionIndex): remove coercion to object
# we don't have an easy way to represent an EA as an Index object
index = pd.Index(index, name="B", dtype=object)
expected = pd.Series([1, 3, 4], index=index, name="A")
self.assert_series_equal(result, expected)


class TestNumericReduce(base.BaseNumericReduceTests):
    """Run the inherited numeric reduction tests without changes."""


class TestBooleanReduce(base.BaseBooleanReduceTests):
    """Run the inherited boolean reduction tests without changes."""

0 comments on commit 90fb20e

Please sign in to comment.