diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 91575c311b409e..80399ca31a9ad9 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l Here is an example of the usage. We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`) +marker of ``np.nan`` will be inferred as an integer dtype. The display of the ``Series`` will also use ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) .. ipython:: python @@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted. pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes df['A'].astype(float) +Reduction and groupby operations such as ``sum`` work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + .. warning:: The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. 
diff --git a/pandas/conftest.py b/pandas/conftest.py index 621de3ffd4b12c..d24400d09809e7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -131,6 +131,29 @@ def all_arithmetic_operators(request): return request.param +_all_numeric_reductions = ['sum', 'max', 'min', + 'mean', 'prod', 'std', 'var', 'median'] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names + """ + return request.param + + +_all_boolean_reductions = ['all', 'any'] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names + """ + return request.param + + _cython_table = pd.core.base.SelectionMixin._cython_table.items() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f7c4ee35adfe48..60f3b2a123ddc9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -59,6 +59,10 @@ class ExtensionArray(object): * factorize / _values_for_factorize * argsort / _values_for_argsort + One can implement methods to handle array reductions. + + * _reduce + The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. 
@@ -708,6 +712,31 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwargs): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwargs : dict + + Returns + ------- + scalar + """ + raise AbstractMethodError(self) + class ExtensionScalarOpsMixin(ExtensionOpsMixin): """A mixin for defining the arithmetic and logical operations on diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e58109a25e1a57..7e1e7315232aa5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -529,6 +529,55 @@ def cmp_method(self, other): name = '__{name}__'.format(name=op.__name__) return set_function_name(cmp_method, name, cls) + def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + """Return a scalar result of performing the op + + Parameters + ---------- + op : callable + function to apply to the array + name : str + name of the function + axis : int, default 0 + axis over which to apply, defined as 0 currently + skipna : bool, default True + if True, skip NaN values + numeric_only : bool, optional + if True, only perform numeric ops + filter_type : str, optional + kwds : dict + + Returns + ------- + scalar + """ + + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype('float64') + data[mask] = self._na_value + + result = op(data, axis=axis, skipna=skipna) + + # if we have a boolean op, provide 
coercion back to a bool + # type if possible + if name in ['any', 'all']: + if is_integer(result) or is_float(result): + result = bool(int(result)) + + # only integer-preserving ops are coerced back to an integer; + # mean/std/var/median keep whatever type the op returned + elif name in ['sum', 'min', 'max', 'prod'] and not isna(result): + int_result = int(result) + if int_result == result: + result = int_result + + return result + def _maybe_mask_result(self, result, mask, other, op_name): """ Parameters diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 349a6aee5701ea..3f3653f375149e 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -587,15 +587,18 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) -def test_groupby_mean_included(): +@pytest.mark.parametrize('op', ['sum', 'min', 'max']) +def test_preserve_groupby_dtypes(op): + # TODO(#22346): preserve Int64 dtype + # for ops that enable (mean would actually work here + # but generally it is a float return value) df = pd.DataFrame({ "A": ['a', 'b', 'b'], "B": [1, None, 3], "C": integer_array([1, None, 3], dtype='Int64'), }) - result = df.groupby("A").sum() - # TODO(#22346): preserve Int64 dtype + result = getattr(df.groupby("A"), op)() expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), "C": np.array([1, 3], dtype="int64") diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index b6b81bb941a59c..b5d4fd676245ef 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests): from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa +from .reduce import BaseNumericReduceTests, BaseBooleanReduceTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests 
# noqa diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 174997c7d51e17..52c635d286df6e 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() _, index = pd.factorize(data_for_grouping, sort=True) - # TODO(ExtensionIndex): remove astype - index = pd.Index(index.astype(object), name="B") + + index = pd.Index(index, name="B") expected = pd.Series([3, 1, 4], index=index, name="A") if as_index: self.assert_series_equal(result, expected) @@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping): "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) - # TODO(ExtensionIndex): remove astype - index = pd.Index(index.astype(object), name="B") + + index = pd.Index(index, name="B") expected = pd.Series([1, 3, 4], index=index, name="A") self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py new file mode 100644 index 00000000000000..ce53ba91ff2fc0 --- /dev/null +++ b/pandas/tests/extension/base/reduce.py @@ -0,0 +1,38 @@ +import warnings +import pytest +import pandas.util.testing as tm +import pandas as pd +from .base import BaseExtensionTests + + +class BaseReduceTests(BaseExtensionTests): + """ + Reduction specific tests. Generally these only + make sense for numeric/boolean operations. 
+ """ + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype('float64'), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class BaseNumericReduceTests(BaseReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + + # min/max with empty produce numpy warnings + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_reduce(s, op_name, skipna) + + +class BaseBooleanReduceTests(BaseReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + s = pd.Series(data) + self.check_reduce(s, op_name, skipna) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index fa5c89d85e5481..163f2ad6fc72b7 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -208,17 +208,39 @@ class TestCasting(base.BaseCastingTests): class TestGroupby(base.BaseGroupbyTests): - @pytest.mark.xfail(reason="groupby not working", strict=True) - def test_groupby_extension_no_sort(self, data_for_grouping): - super(TestGroupby, self).test_groupby_extension_no_sort( - data_for_grouping) - - @pytest.mark.parametrize('as_index', [ - pytest.param(True, - marks=pytest.mark.xfail(reason="groupby not working", - strict=True)), - False - ]) + @pytest.mark.parametrize('as_index', [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): - super(TestGroupby, self).test_groupby_extension_agg( - as_index, data_for_grouping) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + # 
TODO(ExtensionIndex): remove coercion to object + # we don't have an easy way to represent an EA as an Index object + index = pd.Index(index, name="B", dtype=object) + expected = pd.Series([3, 1, 4], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + # TODO(ExtensionIndex): remove coercion to object + # we don't have an easy way to represent an EA as an Index object + index = pd.Index(index, name="B", dtype=object) + expected = pd.Series([1, 3, 4], index=index, name="A") + self.assert_series_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + pass + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass