COMPAT: sum/prod on all nan will remain nan regardless of bottleneck …

…install xref pandas-dev#15507 closes pandas-dev#9422
jreback · Oct 6, 2017 · 0ab6cfa · 0ab6cfa
1 parent e63c935
commit 0ab6cfa
Show file tree

Hide file tree

Showing 10 changed files with 224 additions and 199 deletions.
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
@@ -181,6 +181,42 @@ account for missing data. For example:
    df.mean(1)
    df.cumsum()
 
+
+.. _missing_data.numeric_sum:
+
+Sum/Prod of Empties/Nans
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+   This behavior is now standard as of v0.21.0; previously sum/prod would give different
+   results if the ``bottleneck`` package was installed. See :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
+
+Summing an all-``NaN`` ``Series`` returns ``NaN``; summing an all-``NaN`` ``DataFrame`` returns a ``Series`` of all-``NaN``.
+
+.. ipython:: python
+
+   s = Series([np.nan])
+
+   s.sum()
+
+Summing of an empty ``Series``
+
+.. ipython:: python
+
+   pd.Series([]).sum()
+
+.. warning::
+
+   These behaviors differ from the default in ``numpy``, whose ``nansum`` does not propagate NaNs and treats them as zero:
+
+   .. ipython:: python
+
+      np.nansum(np.array([np.nan]))
+      np.nansum(np.array([]))
+
+
+
 NA values in GroupBy
 ~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -12,6 +12,7 @@ Highlights include:
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
 - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
   categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
+- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
 
 Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
 
@@ -411,6 +412,54 @@ Current Behavior
 
    s.loc[pd.Index([True, False, True])]
 
+.. _whatsnew_0210.api_breaking.bottleneck:
+
+Sum/Prod of all-NaN Series/DataFrames is now consistently NaN
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on
+whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed. (:issue:`9422`, :issue:`15507`).
+
+This now will *always* preserve information. You will get back a ``NaN``, indicating missing values in that Series,
+or if summing a ``DataFrame``, a ``Series`` of all-``NaN``. See the :ref:`docs <missing_data.numeric_sum>`.
+
+
+.. ipython:: python
+
+   s = Series([np.nan])
+
+Previously NO ``bottleneck``
+
+.. code-block:: ipython
+
+   In [2]: s.sum()
+   Out[2]: nan
+
+Previously WITH ``bottleneck``
+
+.. code-block:: ipython
+
+   In [2]: s.sum()
+   Out[2]: 0.0
+
+New Behavior, without regard to the bottleneck installation.
+
+.. ipython:: python
+
+   s.sum()
+
+Note that this also changes the sum of an empty ``Series``
+
+Previously regardless of ``bottleneck``
+
+.. code-block:: ipython
+
+   In [1]: pd.Series([]).sum()
+   Out[1]: 0
+
+.. ipython:: python
+
+   pd.Series([]).sum()
 
 .. _whatsnew_0210.api_breaking.pandas_eval:
 

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -18,7 +18,7 @@
     is_datetime_or_timedelta_dtype,
     is_int_or_datetime_dtype, is_any_int_dtype)
 from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
-from pandas.core.dtypes.missing import isna, notna
+from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
 from pandas.core.config import get_option
 from pandas.core.common import _values_from_object
 
@@ -89,8 +89,7 @@ def _f(*args, **kwargs):
 
 class bottleneck_switch(object):
 
-    def __init__(self, zero_value=None, **kwargs):
-        self.zero_value = zero_value
+    def __init__(self, **kwargs):
         self.kwargs = kwargs
 
     def __call__(self, alt):
@@ -108,18 +107,20 @@ def f(values, axis=None, skipna=True, **kwds):
                     if k not in kwds:
                         kwds[k] = v
             try:
-                if self.zero_value is not None and values.size == 0:
-                    if values.ndim == 1:
+                if values.size == 0:
+
+                    # we either return np.nan or pd.NaT
+                    if is_numeric_dtype(values):
+                        values = values.astype('float64')
+                    fill_value = na_value_for_dtype(values.dtype)
 
-                        # wrap the 0's if needed
-                        if is_timedelta64_dtype(values):
-                            return lib.Timedelta(0)
-                        return 0
+                    if values.ndim == 1:
+                        return fill_value
                     else:
                         result_shape = (values.shape[:axis] +
                                         values.shape[axis + 1:])
-                        result = np.empty(result_shape)
-                        result.fill(0)
+                        result = np.empty(result_shape, dtype=values.dtype)
+                        result.fill(fill_value)
                         return result
 
                 if (_USE_BOTTLENECK and skipna and
@@ -154,11 +155,16 @@ def _bn_ok_dtype(dt, name):
     # Bottleneck chokes on datetime64
     if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
 
+        # GH 15507
         # bottleneck does not properly upcast during the sum
         # so can overflow
-        if name == 'nansum':
-            if dt.itemsize < 8:
-                return False
+
+        # GH 9422
+        # further we also want to preserve NaN when all elements
+        # are NaN, unlike bottleneck/numpy which consider this
+        # to be 0
+        if name in ['nansum', 'nanprod']:
+            return False
 
         return True
     return False
@@ -297,7 +303,7 @@ def nanall(values, axis=None, skipna=True):
 
 
 @disallow('M8')
-@bottleneck_switch(zero_value=0)
+@bottleneck_switch()
 def nansum(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -448,7 +448,11 @@ def test_sum(self):
                             has_numeric_only=True, check_dtype=False,
                             check_less_precise=True)
 
-    def test_stat_operators_attempt_obj_array(self):
+    @pytest.mark.parametrize(
+        "method", ['sum', 'mean', 'prod', 'var',
+                   'std', 'skew', 'min', 'max'])
+    def test_stat_operators_attempt_obj_array(self, method):
+        # GH #676
         data = {
             'a': [-0.00049987540199591344, -0.0016467257772919831,
                   0.00067695870775883013],
@@ -458,20 +462,17 @@ def test_stat_operators_attempt_obj_array(self):
         }
         df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
                         dtype='O')
-        methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
 
-        # GH #676
         df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
                          2: [np.nan, 4]}, dtype=object)
 
         for df in [df1, df2]:
-            for meth in methods:
-                assert df.values.dtype == np.object_
-                result = getattr(df, meth)(1)
-                expected = getattr(df.astype('f8'), meth)(1)
+            assert df.values.dtype == np.object_
+            result = getattr(df, method)(1)
+            expected = getattr(df.astype('f8'), method)(1)
 
-                if not tm._incompat_bottleneck_version(meth):
-                    tm.assert_series_equal(result, expected)
+            if method in ['sum', 'prod']:
+                tm.assert_series_equal(result, expected)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean, check_dates=True)
@@ -563,15 +564,15 @@ def test_var_std(self):
         arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
         result = nanops.nanvar(arr, axis=0)
         assert not (result < 0).any()
-        if nanops._USE_BOTTLENECK:
-            nanops._USE_BOTTLENECK = False
+
+        with pd.option_context('use_bottleneck', False):
             result = nanops.nanvar(arr, axis=0)
             assert not (result < 0).any()
-            nanops._USE_BOTTLENECK = True
 
-    def test_numeric_only_flag(self):
+    @pytest.mark.parametrize(
+        "meth", ['sem', 'var', 'std'])
+    def test_numeric_only_flag(self, meth):
         # GH #9201
-        methods = ['sem', 'var', 'std']
         df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
         # set one entry to a number in str format
         df1.loc[0, 'foo'] = '100'
@@ -580,20 +581,19 @@ def test_numeric_only_flag(self):
         # set one entry to a non-number str
         df2.loc[0, 'foo'] = 'a'
 
-        for meth in methods:
-            result = getattr(df1, meth)(axis=1, numeric_only=True)
-            expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
-            tm.assert_series_equal(expected, result)
+        result = getattr(df1, meth)(axis=1, numeric_only=True)
+        expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
+        tm.assert_series_equal(expected, result)
 
-            result = getattr(df2, meth)(axis=1, numeric_only=True)
-            expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
-            tm.assert_series_equal(expected, result)
+        result = getattr(df2, meth)(axis=1, numeric_only=True)
+        expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
+        tm.assert_series_equal(expected, result)
 
-            # df1 has all numbers, df2 has a letter inside
-            pytest.raises(TypeError, lambda: getattr(df1, meth)(
-                axis=1, numeric_only=False))
-            pytest.raises(TypeError, lambda: getattr(df2, meth)(
-                axis=1, numeric_only=False))
+        # df1 has all numbers, df2 has a letter inside
+        pytest.raises(TypeError, lambda: getattr(df1, meth)(
+            axis=1, numeric_only=False))
+        pytest.raises(TypeError, lambda: getattr(df2, meth)(
+            axis=1, numeric_only=False))
 
     def test_mixed_ops(self):
         # GH 16116
@@ -606,11 +606,9 @@ def test_mixed_ops(self):
             result = getattr(df, op)()
             assert len(result) == 2
 
-            if nanops._USE_BOTTLENECK:
-                nanops._USE_BOTTLENECK = False
+            with pd.option_context('use_bottleneck', False):
                 result = getattr(df, op)()
                 assert len(result) == 2
-                nanops._USE_BOTTLENECK = True
 
     def test_cumsum(self):
         self.tsframe.loc[5:10, 0] = nan
@@ -676,11 +674,10 @@ def test_sem(self):
         arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
         result = nanops.nansem(arr, axis=0)
         assert not (result < 0).any()
-        if nanops._USE_BOTTLENECK:
-            nanops._USE_BOTTLENECK = False
+
+        with pd.option_context('use_bottleneck', False):
             result = nanops.nansem(arr, axis=0)
             assert not (result < 0).any()
-            nanops._USE_BOTTLENECK = True
 
     def test_skew(self):
         tm._skip_if_no_scipy()
@@ -767,7 +764,7 @@ def wrapper(x):
         tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
                                check_dtype=check_dtype,
                                check_less_precise=check_less_precise)
-        if not tm._incompat_bottleneck_version(name):
+        if name in ['sum', 'prod']:
             exp = frame.apply(skipna_wrapper, axis=1)
             tm.assert_series_equal(result1, exp, check_dtype=False,
                                    check_less_precise=check_less_precise)
@@ -799,7 +796,7 @@ def wrapper(x):
             all_na = self.frame * np.NaN
             r0 = getattr(all_na, name)(axis=0)
             r1 = getattr(all_na, name)(axis=1)
-            if not tm._incompat_bottleneck_version(name):
+            if name in ['sum', 'prod']:
                 assert np.isnan(r0).all()
                 assert np.isnan(r1).all()
 
@@ -1859,14 +1856,14 @@ def test_dataframe_clip(self):
             assert (clipped_df.values[ub_mask] == ub).all()
             assert (clipped_df.values[mask] == df.values[mask]).all()
 
-    @pytest.mark.xfail(reason=("clip on mixed integer or floats "
-                               "with integer clippers coerces to float"))
     def test_clip_mixed_numeric(self):
-
+        # TODO(jreback)
+        # clip on mixed integer or floats
+        # with integer clippers coerces to float
         df = DataFrame({'A': [1, 2, 3],
                         'B': [1., np.nan, 3.]})
         result = df.clip(1, 2)
-        expected = DataFrame({'A': [1, 2, 2],
+        expected = DataFrame({'A': [1, 2, 2.],
                               'B': [1., np.nan, 2.]})
         tm.assert_frame_equal(result, expected, check_like=True)
 

diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py
@@ -562,7 +562,7 @@ def _testit(name):
             exp.name = 'C'
 
             result = op(grouped)['C']
-            if not tm._incompat_bottleneck_version(name):
+            if name in ['sum', 'prod']:
                 assert_series_equal(result, exp)
 
         _testit('count')