Skip to content

Commit

Permalink
COMPAT: sum/prod on all nan will remain nan regardless of bottleneck …
Browse files Browse the repository at this point in the history
…install

xref pandas-dev#15507
closes pandas-dev#9422
  • Loading branch information
jreback committed Oct 6, 2017
1 parent e63c935 commit 0ab6cfa
Show file tree
Hide file tree
Showing 10 changed files with 224 additions and 199 deletions.
36 changes: 36 additions & 0 deletions doc/source/missing_data.rst
Expand Up @@ -181,6 +181,42 @@ account for missing data. For example:
df.mean(1)
df.cumsum()
.. _missing_data.numeric_sum:

Sum/Prod of Empties/Nans
~~~~~~~~~~~~~~~~~~~~~~~~

.. warning::

This behavior is now standard as of v0.21.0; previously sum/prod would give different
results if the ``bottleneck`` package was installed. See :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.

Summing an all-``NaN`` ``Series`` returns ``NaN``; if summing a ``DataFrame``, the result is a ``Series`` of all-``NaN``.

.. ipython:: python
s = Series([np.nan])
s.sum()
Summing an empty ``Series`` also returns ``NaN``:

.. ipython:: python
pd.Series([]).sum()
.. warning::

These behaviors differ from the default in ``numpy``, where ``np.nansum`` treats NaNs as zero and therefore returns 0 for all-NaN or empty input.

.. ipython:: python
np.nansum(np.array([np.nan]))
np.nansum(np.array([]))
NA values in GroupBy
~~~~~~~~~~~~~~~~~~~~

Expand Down
49 changes: 49 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Expand Up @@ -12,6 +12,7 @@ Highlights include:
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.

Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.

Expand Down Expand Up @@ -411,6 +412,54 @@ Current Behavior

s.loc[pd.Index([True, False, True])]

.. _whatsnew_0210.api_breaking.bottleneck:

Sum/Prod of all-NaN Series/DataFrames is now consistently NaN
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on
whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed. (:issue:`9422`, :issue:`15507`).

This now will *always* preserve information. You will get back a ``NaN``, indicating missing values in that Series,
or if summing a ``DataFrame``, a ``Series`` of all-``NaN``. See the :ref:`docs <missing_data.numeric_sum>`.


.. ipython:: python

s = Series([np.nan])

Previously NO ``bottleneck``

.. code-block:: ipython

In [2]: s.sum()
Out[2]: nan

Previously WITH ``bottleneck``

.. code-block:: ipython

In [2]: s.sum()
Out[2]: 0.0

New Behavior, without regard to the bottleneck installation.

.. ipython:: python

s.sum()

Note that this also changes the sum of an empty ``Series``.

Previously regardless of ``bottleneck``

.. code-block:: ipython

In [1]: pd.Series([]).sum()
Out[1]: 0

.. ipython:: python

pd.Series([]).sum()

.. _whatsnew_0210.api_breaking.pandas_eval:

Expand Down
36 changes: 21 additions & 15 deletions pandas/core/nanops.py
Expand Up @@ -18,7 +18,7 @@
is_datetime_or_timedelta_dtype,
is_int_or_datetime_dtype, is_any_int_dtype)
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
from pandas.core.config import get_option
from pandas.core.common import _values_from_object

Expand Down Expand Up @@ -89,8 +89,7 @@ def _f(*args, **kwargs):

class bottleneck_switch(object):

def __init__(self, zero_value=None, **kwargs):
self.zero_value = zero_value
def __init__(self, **kwargs):
self.kwargs = kwargs

def __call__(self, alt):
Expand All @@ -108,18 +107,20 @@ def f(values, axis=None, skipna=True, **kwds):
if k not in kwds:
kwds[k] = v
try:
if self.zero_value is not None and values.size == 0:
if values.ndim == 1:
if values.size == 0:

# we either return np.nan or pd.NaT
if is_numeric_dtype(values):
values = values.astype('float64')
fill_value = na_value_for_dtype(values.dtype)

# wrap the 0's if needed
if is_timedelta64_dtype(values):
return lib.Timedelta(0)
return 0
if values.ndim == 1:
return fill_value
else:
result_shape = (values.shape[:axis] +
values.shape[axis + 1:])
result = np.empty(result_shape)
result.fill(0)
result = np.empty(result_shape, dtype=values.dtype)
result.fill(fill_value)
return result

if (_USE_BOTTLENECK and skipna and
Expand Down Expand Up @@ -154,11 +155,16 @@ def _bn_ok_dtype(dt, name):
# Bottleneck chokes on datetime64
if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):

# GH 15507
# bottleneck does not properly upcast during the sum
# so can overflow
if name == 'nansum':
if dt.itemsize < 8:
return False

# GH 9422
# further we also want to preserve NaN when all elements
# are NaN, unlike bottleneck/numpy which consider this
# to be 0
if name in ['nansum', 'nanprod']:
return False

return True
return False
Expand Down Expand Up @@ -297,7 +303,7 @@ def nanall(values, axis=None, skipna=True):


@disallow('M8')
@bottleneck_switch(zero_value=0)
@bottleneck_switch()
def nansum(values, axis=None, skipna=True):
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
dtype_sum = dtype_max
Expand Down
73 changes: 35 additions & 38 deletions pandas/tests/frame/test_analytics.py
Expand Up @@ -448,7 +448,11 @@ def test_sum(self):
has_numeric_only=True, check_dtype=False,
check_less_precise=True)

def test_stat_operators_attempt_obj_array(self):
@pytest.mark.parametrize(
"method", ['sum', 'mean', 'prod', 'var',
'std', 'skew', 'min', 'max'])
def test_stat_operators_attempt_obj_array(self, method):
# GH #676
data = {
'a': [-0.00049987540199591344, -0.0016467257772919831,
0.00067695870775883013],
Expand All @@ -458,20 +462,17 @@ def test_stat_operators_attempt_obj_array(self):
}
df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
dtype='O')
methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']

# GH #676
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
2: [np.nan, 4]}, dtype=object)

for df in [df1, df2]:
for meth in methods:
assert df.values.dtype == np.object_
result = getattr(df, meth)(1)
expected = getattr(df.astype('f8'), meth)(1)
assert df.values.dtype == np.object_
result = getattr(df, method)(1)
expected = getattr(df.astype('f8'), method)(1)

if not tm._incompat_bottleneck_version(meth):
tm.assert_series_equal(result, expected)
if method in ['sum', 'prod']:
tm.assert_series_equal(result, expected)

def test_mean(self):
self._check_stat_op('mean', np.mean, check_dates=True)
Expand Down Expand Up @@ -563,15 +564,15 @@ def test_var_std(self):
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
result = nanops.nanvar(arr, axis=0)
assert not (result < 0).any()
if nanops._USE_BOTTLENECK:
nanops._USE_BOTTLENECK = False

with pd.option_context('use_bottleneck', False):
result = nanops.nanvar(arr, axis=0)
assert not (result < 0).any()
nanops._USE_BOTTLENECK = True

def test_numeric_only_flag(self):
@pytest.mark.parametrize(
"meth", ['sem', 'var', 'std'])
def test_numeric_only_flag(self, meth):
# GH #9201
methods = ['sem', 'var', 'std']
df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
# set one entry to a number in str format
df1.loc[0, 'foo'] = '100'
Expand All @@ -580,20 +581,19 @@ def test_numeric_only_flag(self):
# set one entry to a non-number str
df2.loc[0, 'foo'] = 'a'

for meth in methods:
result = getattr(df1, meth)(axis=1, numeric_only=True)
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
tm.assert_series_equal(expected, result)
result = getattr(df1, meth)(axis=1, numeric_only=True)
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
tm.assert_series_equal(expected, result)

result = getattr(df2, meth)(axis=1, numeric_only=True)
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
tm.assert_series_equal(expected, result)
result = getattr(df2, meth)(axis=1, numeric_only=True)
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
tm.assert_series_equal(expected, result)

# df1 has all numbers, df2 has a letter inside
pytest.raises(TypeError, lambda: getattr(df1, meth)(
axis=1, numeric_only=False))
pytest.raises(TypeError, lambda: getattr(df2, meth)(
axis=1, numeric_only=False))
# df1 has all numbers, df2 has a letter inside
pytest.raises(TypeError, lambda: getattr(df1, meth)(
axis=1, numeric_only=False))
pytest.raises(TypeError, lambda: getattr(df2, meth)(
axis=1, numeric_only=False))

def test_mixed_ops(self):
# GH 16116
Expand All @@ -606,11 +606,9 @@ def test_mixed_ops(self):
result = getattr(df, op)()
assert len(result) == 2

if nanops._USE_BOTTLENECK:
nanops._USE_BOTTLENECK = False
with pd.option_context('use_bottleneck', False):
result = getattr(df, op)()
assert len(result) == 2
nanops._USE_BOTTLENECK = True

def test_cumsum(self):
self.tsframe.loc[5:10, 0] = nan
Expand Down Expand Up @@ -676,11 +674,10 @@ def test_sem(self):
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
result = nanops.nansem(arr, axis=0)
assert not (result < 0).any()
if nanops._USE_BOTTLENECK:
nanops._USE_BOTTLENECK = False

with pd.option_context('use_bottleneck', False):
result = nanops.nansem(arr, axis=0)
assert not (result < 0).any()
nanops._USE_BOTTLENECK = True

def test_skew(self):
tm._skip_if_no_scipy()
Expand Down Expand Up @@ -767,7 +764,7 @@ def wrapper(x):
tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
check_dtype=check_dtype,
check_less_precise=check_less_precise)
if not tm._incompat_bottleneck_version(name):
if name in ['sum', 'prod']:
exp = frame.apply(skipna_wrapper, axis=1)
tm.assert_series_equal(result1, exp, check_dtype=False,
check_less_precise=check_less_precise)
Expand Down Expand Up @@ -799,7 +796,7 @@ def wrapper(x):
all_na = self.frame * np.NaN
r0 = getattr(all_na, name)(axis=0)
r1 = getattr(all_na, name)(axis=1)
if not tm._incompat_bottleneck_version(name):
if name in ['sum', 'prod']:
assert np.isnan(r0).all()
assert np.isnan(r1).all()

Expand Down Expand Up @@ -1859,14 +1856,14 @@ def test_dataframe_clip(self):
assert (clipped_df.values[ub_mask] == ub).all()
assert (clipped_df.values[mask] == df.values[mask]).all()

@pytest.mark.xfail(reason=("clip on mixed integer or floats "
"with integer clippers coerces to float"))
def test_clip_mixed_numeric(self):

# TODO(jreback)
# clip on mixed integer or floats
# with integer clippers coerces to float
df = DataFrame({'A': [1, 2, 3],
'B': [1., np.nan, 3.]})
result = df.clip(1, 2)
expected = DataFrame({'A': [1, 2, 2],
expected = DataFrame({'A': [1, 2, 2.],
'B': [1., np.nan, 2.]})
tm.assert_frame_equal(result, expected, check_like=True)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_aggregate.py
Expand Up @@ -562,7 +562,7 @@ def _testit(name):
exp.name = 'C'

result = op(grouped)['C']
if not tm._incompat_bottleneck_version(name):
if name in ['sum', 'prod']:
assert_series_equal(result, exp)

_testit('count')
Expand Down

0 comments on commit 0ab6cfa

Please sign in to comment.