BUG: preserve categorical & sparse types when grouping / pivot
preserve dtypes when applying a ufunc to a sparse dtype

closes pandas-dev#18502
closes pandas-dev#23743
jreback committed May 29, 2019
1 parent 7629a18 commit 8b5a3d3
Showing 16 changed files with 247 additions and 61 deletions.
60 changes: 59 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
@@ -154,7 +154,63 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwise.
Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will
cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before.

-.. _whatsnew_0250.api_breaking.incompatible_index_unions:
.. _whatsnew_0250.api_breaking.ufuncs:

ufuncs on Extension Dtype
^^^^^^^^^^^^^^^^^^^^^^^^^

Operations with ``numpy`` ufuncs on extension arrays, including sparse dtypes, will now preserve the
input dtype in the result; previously the result would be coerced to a dense dtype (:issue:`23743`).

.. ipython:: python

   df = pd.DataFrame({'A': pd.Series([1, np.nan, 3],
                                     dtype=pd.SparseDtype('float64', np.nan))})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [3]: np.sqrt(df).dtypes
   Out[3]:
   A    float64
   dtype: object

*New Behavior*:

.. ipython:: python

   np.sqrt(df).dtypes

.. _whatsnew_0250.api_breaking.groupby_categorical:

Categorical dtypes are preserved during groupby
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object``
dtype during groupby operations. pandas will now preserve these dtypes (:issue:`18502`).

.. ipython:: python

   df = pd.DataFrame({'payload': [-1, -2, -1, -2],
                      'col': pd.Categorical(["foo", "bar", "bar", "qux"],
                                            ordered=True)})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [5]: df.groupby('payload').first().col.dtype
   Out[5]: dtype('O')

*New Behavior*:

.. ipython:: python

   df.groupby('payload').first().col.dtype

.. _whatsnew_0250.api_breaking.incompatible_index_unions:

Incompatible Index Type Unions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -168,6 +224,8 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).

*Previous Behavior*:

.. code-block:: python

   In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
   ...
   ValueError: can only call with other PeriodIndex-ed objects
8 changes: 5 additions & 3 deletions pandas/core/dtypes/cast.py
@@ -606,7 +606,7 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


-def astype_nansafe(arr, dtype, copy=True, skipna=False):
def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
@@ -617,8 +617,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the item sizes don't align.
-skipna: bool, default False
skipna : bool, default False
Whether or not we should skip NaN when casting as a string-type.
casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}
    optional, default 'unsafe'

Raises
------
@@ -704,7 +706,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):

if copy or is_object_dtype(arr) or is_object_dtype(dtype):
# Explicit copy, or required since NumPy can't view from / to object.
-return arr.astype(dtype, copy=True)
return arr.astype(dtype, copy=True, casting=casting)

return arr.view(dtype)

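For context: the new ``casting`` argument is forwarded to ``ndarray.astype``, so it follows NumPy's casting rules. A minimal sketch of those semantics (illustrative, not part of the commit):

    import numpy as np

    arr = np.array([1.5, 2.5], dtype='float64')
    arr.astype('float32', casting='same_kind')    # allowed: float -> float stays the same kind
    try:
        arr.astype('int64', casting='same_kind')  # float -> int changes kind
    except TypeError as exc:
        print(exc)                                # rejected under the 'same_kind' rule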
30 changes: 29 additions & 1 deletion pandas/core/generic.py
@@ -1949,8 +1949,31 @@ def __array__(self, dtype=None):
return com.values_from_object(self)

def __array_wrap__(self, result, context=None):
"""
We are called post ufunc; reconstruct the original object and dtypes.

Parameters
----------
result : np.ndarray
context

Returns
-------
Series or DataFrame
"""

d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-return self._constructor(result, **d).__finalize__(self)
result = self._constructor(result, **d)

# we try to cast extension array types back to the original
# TODO: this fails with duplicates, ugh
if self._data.any_extension_types:
result = result.astype(self.dtypes,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
# is slower
@@ -5755,6 +5778,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
**kwargs)
return self._constructor(new_data).__finalize__(self)

if not results:
if copy:
self = self.copy()
return self

# GH 19920: retain column metadata after concat
result = pd.concat(results, axis=1, copy=False)
result.columns = self.columns
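A hedged sketch of the user-visible effect of the ``__array_wrap__`` change above, mirroring the whatsnew example (pandas 0.25-era behavior assumed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': pd.Series([1.0, np.nan, 4.0],
                                      dtype=pd.SparseDtype('float64', np.nan))})
    # the ufunc computes on dense values; __array_wrap__ then attempts to
    # cast the result back to the original extension dtype
    print(np.sqrt(df).dtypes)   # expected: A    Sparse[float64, nan]

Because the round trip uses ``errors='ignore'`` with ``casting='same_kind'``, a result that cannot be cast back safely is left dense rather than raising.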
11 changes: 9 additions & 2 deletions pandas/core/groupby/generic.py
@@ -104,12 +104,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,

obj = self.obj[data.items[locs]]
s = groupby(obj, self.grouper)
-result = s.aggregate(lambda x: alt(x, axis=self.axis))
try:
result = s.aggregate(lambda x: alt(x, axis=self.axis))
except Exception:
# we may have an exception in trying to aggregate
# continue and exclude the block
pass

finally:

dtype = block.values.dtype

# see if we can cast the block back to the original dtype
-result = block._try_coerce_and_cast_result(result)
result = block._try_coerce_and_cast_result(result, dtype=dtype)
newb = block.make_block(result)

new_items.append(locs)
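The control flow above amounts to: try the python-space aggregation, then cast the block back to its original dtype on the way out. A simplified sketch of that pattern; ``agg_and_recast``, ``alt`` and ``from_sequence`` are hypothetical names, not pandas API:

    import numpy as np

    def agg_and_recast(values, alt, original_dtype, from_sequence):
        result = alt(values)   # python-space aggregation; caller excludes the block on failure
        try:
            # best-effort cast back to the block's original dtype
            return from_sequence(np.asarray(result).ravel(), dtype=original_dtype)
        except Exception:
            return result      # keep the uncast result if the dtype round trip fails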
24 changes: 20 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -784,6 +784,8 @@ def _try_cast(self, result, obj, numeric_only=False):
elif is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.

# return the same type (Series) as our caller
try:
result = obj._values._from_sequence(result, dtype=dtype)
except Exception:
@@ -1155,7 +1157,8 @@ def mean(self, *args, **kwargs):
"""
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
try:
-return self._cython_agg_general('mean', **kwargs)
return self._cython_agg_general(
'mean', alt=lambda x, axis: Series(x).mean(), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1177,7 +1180,8 @@ def median(self, **kwargs):
Median of values within each group.
"""
try:
-return self._cython_agg_general('median', **kwargs)
return self._cython_agg_general(
'median', alt=lambda x, axis: Series(x).median(), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1233,7 +1237,10 @@ def var(self, ddof=1, *args, **kwargs):
nv.validate_groupby_func('var', args, kwargs)
if ddof == 1:
try:
-return self._cython_agg_general('var', **kwargs)
return self._cython_agg_general(
'var',
alt=lambda x, axis: Series(x).var(ddof=ddof),
**kwargs)
except Exception:
f = lambda x: x.var(ddof=ddof, **kwargs)
with _group_selection_context(self):
@@ -1261,7 +1268,6 @@ def sem(self, ddof=1):
Series or DataFrame
Standard error of the mean of values within each group.
"""

return self.std(ddof=ddof) / np.sqrt(self.count())

@Substitution(name='groupby')
@@ -1318,6 +1324,16 @@ def f(self, **kwargs):
except Exception:
result = self.aggregate(
lambda x: npfunc(x, axis=self.axis))

# coerce the columns if we can
if isinstance(result, DataFrame):
for col in result.columns:
result[col] = self._try_cast(
result[col], self.obj[col])
else:
result = self._try_cast(
result, self.obj)

if _convert:
result = result._convert(datetime=True)
return result
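Taken together, these changes let a reduction that the cython path cannot handle (for example, on a sparse column) fall back to a per-group ``Series`` reduction via ``alt``. A hedged sketch of the observable behavior (exact output dtypes may vary by version):

    import pandas as pd

    df = pd.DataFrame({'key': [0, 0, 1],
                       'x': pd.Series([1.0, 2.0, 3.0],
                                      dtype=pd.SparseDtype('float64'))})
    # the cython op raises NotImplementedError for sparse values, so groupby
    # falls back to the alt path: Series(x).mean() per group
    print(df.groupby('key')['x'].mean())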
6 changes: 3 additions & 3 deletions pandas/core/groupby/ops.py
@@ -19,7 +19,7 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse,
is_timedelta64_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import _maybe_fill, isna

@@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,

# categoricals are only 1d, so we
# are not setup for dim transforming
-if is_categorical_dtype(values):
if is_categorical_dtype(values) or is_sparse(values):
raise NotImplementedError(
"categoricals are not support in cython ops ATM")
"{} are not support in cython ops".format(values.dtype))
elif is_datetime64_any_dtype(values):
if how in ['add', 'prod', 'cumsum', 'cumprod']:
raise NotImplementedError(
16 changes: 15 additions & 1 deletion pandas/core/internals/blocks.py
@@ -604,7 +604,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
values = self.get_values(dtype=dtype)

# _astype_nansafe works fine with 1-d only
-values = astype_nansafe(values.ravel(), dtype, copy=True)
values = astype_nansafe(
values.ravel(), dtype, copy=True, **kwargs)

# TODO(extension)
# should we make this attribute?
Expand Down Expand Up @@ -1771,6 +1772,19 @@ def _slice(self, slicer):

return self.values[slicer]

def _try_cast_result(self, result, dtype=None):
"""
if we have an operation that operates on, for example, floats,
we want to try to cast back to our EA here if possible
"""
try:
result = self._holder._from_sequence(
np.asarray(result).ravel(), dtype=dtype)
except Exception:
pass

return result

def formatting_values(self):
# Deprecating the ability to override _formatting_values.
# Do the warning here, it's only used in pandas, since we
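``_try_cast_result`` relies on the extension-array ``_from_sequence`` constructor to rebuild the EA from a dense result. A minimal sketch of that round trip using a public array type (illustrative only):

    import numpy as np
    import pandas as pd

    sparse = pd.arrays.SparseArray([1.0, np.nan, 3.0])
    dense_result = np.asarray(sparse) * 2   # a plain float ndarray, as an op would produce

    # rebuild the extension array from the dense result, as the block does
    roundtripped = type(sparse)._from_sequence(dense_result.ravel(),
                                               dtype=sparse.dtype)
    print(roundtripped.dtype)               # Sparse[float64, nan]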
9 changes: 5 additions & 4 deletions pandas/core/nanops.py
@@ -89,11 +89,12 @@ def _f(*args, **kwargs):

class bottleneck_switch:

-def __init__(self, **kwargs):
def __init__(self, name=None, **kwargs):
self.name = name
self.kwargs = kwargs

def __call__(self, alt):
-bn_name = alt.__name__
bn_name = self.name or alt.__name__

try:
bn_func = getattr(bn, bn_name)
@@ -821,7 +822,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):


def _nanminmax(meth, fill_value_typ):
-@bottleneck_switch()

@bottleneck_switch(name='nan' + meth)
def reduction(values, axis=None, skipna=True, mask=None):

values, mask, dtype, dtype_max, fill_value = _get_values(
@@ -841,7 +843,6 @@ def reduction(values, axis=None, skipna=True, mask=None):
result = _wrap_results(result, dtype, fill_value)
return _maybe_null_out(result, axis, mask, values.shape)

-reduction.__name__ = 'nan' + meth
return reduction


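The new ``name`` argument exists because the decorated reduction is generated inside a factory, and ``bottleneck_switch`` needs the public name ('nanmin', 'nanmax') to look up the matching bottleneck function before the function object exists. A small self-contained sketch of the pattern, with hypothetical names:

    def named(name=None):
        def wrapper(func):
            # set the generated function's public name up front,
            # instead of patching __name__ after the fact
            func.__name__ = name or func.__name__
            return func
        return wrapper

    def make_minmax(meth):
        @named(name='nan' + meth)
        def reduction(values):
            return getattr(values, meth)()
        return reduction

    print(make_minmax('min').__name__)   # 'nanmin'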
13 changes: 11 additions & 2 deletions pandas/core/series.py
@@ -749,8 +749,17 @@ def __array_wrap__(self, result, context=None):
"""
Gets called after a ufunc.
"""
-return self._constructor(result, index=self.index,
-copy=False).__finalize__(self)
result = self._constructor(result, index=self.index,
copy=False)

# we try to cast extension array types back to the original
if is_extension_array_dtype(self):
result = result.astype(self.dtype,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

def __array_prepare__(self, result, context=None):
"""
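The ``Series`` counterpart of the ``generic.py`` change; a short hedged example (expected output per the whatsnew entry above):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 9.0], dtype=pd.SparseDtype('float64', np.nan))
    print(np.sqrt(s).dtype)   # expected: Sparse[float64, nan]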
