BUG: preserve categorical & sparse types when grouping / pivot
preserve dtypes when applying a ufunc to a sparse dtype

closes pandas-dev#18502
closes pandas-dev#23743
jreback committed Jun 21, 2019
1 parent b9b081d commit 5bec6b3
Showing 17 changed files with 282 additions and 80 deletions.
59 changes: 59 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -316,6 +316,65 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
   s
   s.str.startswith(b'a')
.. _whatsnew_0250.api_breaking.ufuncs:

Ufuncs on extension dtypes
^^^^^^^^^^^^^^^^^^^^^^^^^^

Operations with ``numpy`` ufuncs on extension arrays, including sparse dtypes, now
preserve the dtype of the input in the result; previously the result would be coerced
to a dense dtype. (:issue:`23743`)

.. ipython:: python

   df = pd.DataFrame({'A': pd.Series([1, np.nan, 3], dtype=pd.SparseDtype('float64', np.nan))})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [3]: np.sqrt(df).dtypes
   Out[3]:
   A    float64
   dtype: object

*New Behavior*:

.. ipython:: python

   np.sqrt(df).dtypes

.. _whatsnew_0250.api_breaking.groupby_categorical:

Categorical dtypes are preserved during groupby
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object`` dtype during groupby operations. pandas now preserves these dtypes. (:issue:`18502`)

.. ipython:: python

   df = pd.DataFrame({'payload': [-1, -2, -1, -2],
                      'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [5]: df.groupby('payload').first().col.dtype
   Out[5]: dtype('O')

*New Behavior*:

.. ipython:: python

   df.groupby('payload').first().col.dtype

.. _whatsnew_0250.api_breaking.incompatible_index_unions:

Incompatible Index Type Unions
8 changes: 5 additions & 3 deletions pandas/core/dtypes/cast.py
@@ -605,7 +605,7 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


-def astype_nansafe(arr, dtype, copy=True, skipna=False):
+def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
@@ -616,8 +616,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the item sizes don't align.
-    skipna: bool, default False
+    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}
+        optional, default 'unsafe'

Raises
------
Expand Down Expand Up @@ -703,7 +705,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):

if copy or is_object_dtype(arr) or is_object_dtype(dtype):
# Explicit copy, or required since NumPy can't view from / to object.
-        return arr.astype(dtype, copy=True)
+        return arr.astype(dtype, copy=True, casting=casting)

return arr.view(dtype)

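For context: the new ``casting`` argument is forwarded to ``numpy.ndarray.astype``, so it follows numpy's casting rules, and ``'same_kind'`` only permits casts within a kind. A minimal sketch of the numpy behavior this relies on:

```python
import numpy as np

arr = np.array([1.5, 2.5, 3.5])

# 'same_kind' permits casts within a kind, e.g. float64 -> float32 ...
print(arr.astype(np.float32, casting='same_kind'))

# ... but refuses casts across kinds, e.g. float64 -> int64.
try:
    arr.astype(np.int64, casting='same_kind')
except TypeError as exc:
    print(exc)
```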
45 changes: 45 additions & 0 deletions pandas/core/frame.py
@@ -2641,6 +2641,51 @@ def transpose(self, *args, **kwargs):

T = property(transpose)

# ----------------------------------------------------------------------
# Array Interface

# This is also set in IndexOpsMixin
# GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
__array_priority__ = 1000

def __array__(self, dtype=None):
return com.values_from_object(self)

def __array_wrap__(self, result: np.ndarray, context=None) -> 'DataFrame':
"""
We are called post ufunc; reconstruct the original object and dtypes.

Parameters
----------
result : np.ndarray
context

Returns
-------
DataFrame
"""

d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
result = self._constructor(result, **d)

# we try to cast extension array types back to the original
# TODO: this fails with duplicates, ugh
if self._data.any_extension_types:
result = result.astype(self.dtypes,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

# ideally we would define this to avoid the getattr checks, but it
# is slower
# @property
# def __array_interface__(self):
# """ provide numpy array interface method """
# values = self.values
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)

# ----------------------------------------------------------------------
# Picklability

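Tying the hook above to the whatsnew entry: after a ufunc produces an ndarray, ``__array_wrap__`` rebuilds the frame and, when extension dtypes are present, casts back with ``casting='same_kind'``. A quick sketch of the round trip (the dtype in the comment assumes this commit; exact repr may vary by version):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': pd.Series([1.0, np.nan, 4.0],
                                  dtype=pd.SparseDtype('float64', np.nan))})

# The ufunc yields an ndarray; __array_wrap__ reconstructs the frame and
# astype(..., casting='same_kind', errors='ignore') restores the dtypes.
print(np.sqrt(df).dtypes)  # A    Sparse[float64, nan]
```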
24 changes: 5 additions & 19 deletions pandas/core/generic.py
@@ -1919,25 +1919,6 @@ def empty(self):
# ----------------------------------------------------------------------
# Array Interface

-    # This is also set in IndexOpsMixin
-    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
-    __array_priority__ = 1000
-
-    def __array__(self, dtype=None):
-        return com.values_from_object(self)
-
-    def __array_wrap__(self, result, context=None):
-        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-        return self._constructor(result, **d).__finalize__(self)
-
-    # ideally we would define this to avoid the getattr checks, but
-    # is slower
-    # @property
-    # def __array_interface__(self):
-    #     """ provide numpy array interface method """
-    #     values = self.values
-    #     return dict(typestr=values.dtype.str,shape=values.shape,data=values)

def to_dense(self):
"""
Return dense representation of NDFrame (as opposed to sparse).
@@ -5693,6 +5674,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
**kwargs)
return self._constructor(new_data).__finalize__(self)

+        if not results:
+            if copy:
+                self = self.copy()
+            return self

# GH 19920: retain column metadata after concat
result = pd.concat(results, axis=1, copy=False)
result.columns = self.columns
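A hedged sketch of the degenerate case the new ``if not results`` branch presumably guards: a column-wise ``astype`` over a frame with no columns yields nothing to concatenate, so the object is now returned (copied) as-is:

```python
import pandas as pd

# No columns: the per-column astype loop produces an empty results list.
empty = pd.DataFrame(index=range(3))

# With the early return this hands back a copy of the frame instead of
# reaching pd.concat([]) on an empty list, which would raise.
out = empty.astype(pd.SparseDtype('float64'))
print(out.shape)  # (3, 0)
```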
11 changes: 9 additions & 2 deletions pandas/core/groupby/generic.py
@@ -156,12 +156,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,

obj = self.obj[data.items[locs]]
s = groupby(obj, self.grouper)
-            result = s.aggregate(lambda x: alt(x, axis=self.axis))
+            try:
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+            except Exception:
+                # we may have an exception in trying to aggregate
+                # continue and exclude the block
+                pass
+
+            finally:
+
+                dtype = block.values.dtype
+
                # see if we can cast the block back to the original dtype
-            result = block._try_coerce_and_cast_result(result)
+                result = block._try_coerce_and_cast_result(result, dtype=dtype)
newb = block.make_block(result)

new_items.append(locs)
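The control flow above is: attempt the fallback aggregation, swallow a failure (the block is excluded later), and in either case try to coerce the result back to the block's original dtype. A stripped-down sketch of that coercion step; ``coerce_result`` is illustrative, not the pandas-internal helper:

```python
import numpy as np
import pandas as pd

def coerce_result(result, dtype):
    """Best-effort cast of an aggregation result back to `dtype`."""
    try:
        return pd.array(np.atleast_1d(result), dtype=dtype)
    except (TypeError, ValueError):
        return result

vals = pd.Categorical(["foo", "bar", "bar"], ordered=True)
first = np.asarray(vals)[0]              # e.g. a 'first' aggregation
print(coerce_result(first, vals.dtype))  # stays categorical, not object
```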
27 changes: 23 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False):
elif is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.

+            # return the same type (Series) as our caller
try:
result = obj._values._from_sequence(result, dtype=dtype)
except Exception:
@@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs):
"""
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
try:
-            return self._cython_agg_general('mean', **kwargs)
+            return self._cython_agg_general(
+                'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1179,7 +1182,11 @@ def median(self, **kwargs):
Median of values within each group.
"""
try:
-            return self._cython_agg_general('median', **kwargs)
+            return self._cython_agg_general(
+                'median',
+                alt=lambda x, axis: Series(x).median(**kwargs),
+                **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs):
nv.validate_groupby_func('var', args, kwargs)
if ddof == 1:
try:
-                return self._cython_agg_general('var', **kwargs)
+                return self._cython_agg_general(
+                    'var',
+                    alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs),
+                    **kwargs)
except Exception:
f = lambda x: x.var(ddof=ddof, **kwargs)
with _group_selection_context(self):
@@ -1263,7 +1273,6 @@ def sem(self, ddof=1):
Series or DataFrame
Standard error of the mean of values within each group.
"""

return self.std(ddof=ddof) / np.sqrt(self.count())

@Substitution(name='groupby')
@@ -1320,6 +1329,16 @@ def f(self, **kwargs):
except Exception:
result = self.aggregate(
lambda x: npfunc(x, axis=self.axis))

+                # coerce the columns if we can
+                if isinstance(result, DataFrame):
+                    for col in result.columns:
+                        result[col] = self._try_cast(
+                            result[col], self.obj[col])
+                else:
+                    result = self._try_cast(
+                        result, self.obj)

if _convert:
result = result._convert(datetime=True)
return result
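With ``alt`` supplied, a dtype that the cython kernels reject (see the ops.py change below) no longer aborts the aggregation: the group-wise ``Series`` reduction runs instead. A sketch of what this enables, reusing the sparse dtype from the whatsnew entry:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'key': ['x', 'x', 'y'],
    'val': pd.Series([1.0, 2.0, 4.0],
                     dtype=pd.SparseDtype('float64', np.nan)),
})

# Sparse values raise NotImplementedError in the cython path, so mean()
# now falls back to the Series(x).mean() `alt` lambda per group.
print(df.groupby('key')['val'].mean())
```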
6 changes: 3 additions & 3 deletions pandas/core/groupby/ops.py
@@ -19,7 +19,7 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
-    is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
+    is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse,
is_timedelta64_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import _maybe_fill, isna

@@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,

# categoricals are only 1d, so we
# are not setup for dim transforming
-            if is_categorical_dtype(values):
+            if is_categorical_dtype(values) or is_sparse(values):
                raise NotImplementedError(
-                    "categoricals are not support in cython ops ATM")
+                    "{} are not supported in cython ops".format(values.dtype))
elif is_datetime64_any_dtype(values):
if how in ['add', 'prod', 'cumsum', 'cumprod']:
raise NotImplementedError(
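Both predicates used by the guard are public under ``pandas.api.types``; a quick check of what now gets routed away from the cython kernels:

```python
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse

print(is_categorical_dtype(pd.Categorical(['a', 'b'])))              # True
print(is_sparse(pd.Series([1.0], dtype=pd.SparseDtype('float64'))))  # True
print(is_sparse(pd.Series([1.0])))                                   # False
```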
16 changes: 15 additions & 1 deletion pandas/core/internals/blocks.py
@@ -600,7 +600,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
values = self.get_values(dtype=dtype)

# _astype_nansafe works fine with 1-d only
-            values = astype_nansafe(values.ravel(), dtype, copy=True)
+            values = astype_nansafe(
+                values.ravel(), dtype, copy=True, **kwargs)

# TODO(extension)
# should we make this attribute?
@@ -1767,6 +1768,19 @@ def _slice(self, slicer):

return self.values[slicer]

+    def _try_cast_result(self, result, dtype=None):
+        """
+        If an operation has produced, e.g., plain floats, try to cast the
+        result back to our extension array type where possible.
+        """
+        try:
+            result = self._holder._from_sequence(
+                np.asarray(result).ravel(), dtype=dtype)
+        except Exception:
+            pass
+
+        return result

def formatting_values(self):
# Deprecating the ability to override _formatting_values.
# Do the warning here, it's only used in pandas, since we
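``_from_sequence`` is the extension-array constructor classmethod; its public counterpart is ``pd.array``. A hedged sketch of the round trip ``_try_cast_result`` attempts, with the same swallow-and-keep-dense fallback:

```python
import numpy as np
import pandas as pd

dtype = pd.SparseDtype('float64', np.nan)
dense = np.array([1.0, np.nan, 3.0])  # e.g. the raw result of an op

try:
    # rebuild the extension array from the raveled ndarray
    result = pd.array(dense.ravel(), dtype=dtype)
except Exception:
    result = dense                    # keep the dense result on failure

print(result.dtype)  # Sparse[float64, nan]
```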
9 changes: 5 additions & 4 deletions pandas/core/nanops.py
@@ -72,11 +72,12 @@ def _f(*args, **kwargs):

class bottleneck_switch:

-    def __init__(self, **kwargs):
+    def __init__(self, name=None, **kwargs):
+        self.name = name
self.kwargs = kwargs

def __call__(self, alt):
-        bn_name = alt.__name__
+        bn_name = self.name or alt.__name__

try:
bn_func = getattr(bn, bn_name)
@@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):


def _nanminmax(meth, fill_value_typ):
-    @bottleneck_switch()
+    @bottleneck_switch(name='nan' + meth)
def reduction(values, axis=None, skipna=True, mask=None):

values, mask, dtype, dtype_max, fill_value = _get_values(
@@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None):
result = _wrap_results(result, dtype, fill_value)
return _maybe_null_out(result, axis, mask, values.shape)

-    reduction.__name__ = 'nan' + meth
return reduction


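Passing the name into the decorator lets ``bottleneck_switch`` resolve the matching bottleneck function up front instead of patching ``__name__`` afterwards. A self-contained sketch of the decorator-class pattern (simplified, not the pandas implementation):

```python
class switch:
    """Toy decorator class that can override the wrapped name."""

    def __init__(self, name=None):
        self.name = name

    def __call__(self, func):
        # resolve the effective name before wrapping, as nanops now does
        func.__name__ = self.name or func.__name__
        return func

@switch(name='nanmax')
def reduction(values):
    return max(values)

print(reduction.__name__)  # 'nanmax'
```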
27 changes: 23 additions & 4 deletions pandas/core/series.py
@@ -762,12 +762,31 @@ def __array__(self, dtype=None):
dtype = 'M8[ns]'
return np.asarray(self.array, dtype)

-    def __array_wrap__(self, result, context=None):
+    def __array_wrap__(self, result: np.ndarray, context=None) -> 'Series':
"""
-        Gets called after a ufunc.
+        We are called post ufunc; reconstruct the original object and dtypes.
+
+        Parameters
+        ----------
+        result : np.ndarray
+        context
+
+        Returns
+        -------
+        Series
"""
-        return self._constructor(result, index=self.index,
-                                 copy=False).__finalize__(self)
+        result = self._constructor(result, index=self.index,
+                                   copy=False)

+        # we try to cast extension array types back to the original
+        if is_extension_array_dtype(self):
+            result = result.astype(self.dtype,
+                                   copy=False,
+                                   errors='ignore',
+                                   casting='same_kind')
+
+        return result.__finalize__(self)

def __array_prepare__(self, result, context=None):
"""
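And the ``Series``-level counterpart of the frame example above (a sketch; the dtype in the comment assumes this commit):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 4.0], dtype=pd.SparseDtype('float64', np.nan))

# __array_wrap__ casts back with casting='same_kind', so the sparse
# dtype survives the ufunc instead of densifying to float64.
print(np.sqrt(s).dtype)  # Sparse[float64, nan]
```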
