BUG: preserve categorical & sparse types when grouping / pivot
preserve dtypes when applying a ufunc to a sparse dtype

closes pandas-dev#18502
closes pandas-dev#23743
jreback committed Jun 21, 2019
1 parent b9b081d commit 5bec6b3
Showing 17 changed files with 282 additions and 80 deletions.
59 changes: 59 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -316,6 +316,65 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
   s
   s.str.startswith(b'a')
.. _whatsnew_0250.api_breaking.ufuncs:

Ufuncs on extension dtypes
^^^^^^^^^^^^^^^^^^^^^^^^^^

Operations with ``numpy`` ufuncs on extension arrays, including sparse dtypes, now
preserve the dtype of the input in the result; previously the result would be coerced
to a dense dtype. (:issue:`23743`)

.. ipython:: python

   df = pd.DataFrame({'A': pd.Series([1, np.nan, 3], dtype=pd.SparseDtype('float64', np.nan))})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [3]: np.sqrt(df).dtypes
   Out[3]:
   A    float64
   dtype: object

*New Behavior*:

.. ipython:: python

   np.sqrt(df).dtypes

.. _whatsnew_0250.api_breaking.groupby_categorical:

Categorical dtypes are preserved during groupby
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object`` dtype during groupby operations. pandas now preserves these dtypes. (:issue:`18502`)

.. ipython:: python

   df = pd.DataFrame({'payload': [-1, -2, -1, -2],
                      'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [5]: df.groupby('payload').first().col.dtype
   Out[5]: dtype('O')

*New Behavior*:

.. ipython:: python

   df.groupby('payload').first().col.dtype

.. _whatsnew_0250.api_breaking.incompatible_index_unions:

Incompatible Index Type Unions
8 changes: 5 additions & 3 deletions pandas/core/dtypes/cast.py
@@ -605,7 +605,7 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


-def astype_nansafe(arr, dtype, copy=True, skipna=False):
+def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
@@ -616,8 +616,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the item sizes don't align.
-    skipna: bool, default False
+    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}
+        optional, default 'unsafe'

Raises
------
Expand Down Expand Up @@ -703,7 +705,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):

if copy or is_object_dtype(arr) or is_object_dtype(dtype):
# Explicit copy, or required since NumPy can't view from / to object.
-        return arr.astype(dtype, copy=True)
+        return arr.astype(dtype, copy=True, casting=casting)

return arr.view(dtype)

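For context: the new ``casting`` argument is forwarded to ``numpy.ndarray.astype``, so it follows numpy's casting rules, and ``'same_kind'`` only permits casts within a kind. A minimal sketch of the numpy behavior this relies on:

```python
import numpy as np

arr = np.array([1.5, 2.5, 3.5])

# 'same_kind' permits casts within a kind, e.g. float64 -> float32 ...
print(arr.astype(np.float32, casting='same_kind'))

# ... but refuses casts across kinds, e.g. float64 -> int64.
try:
    arr.astype(np.int64, casting='same_kind')
except TypeError as exc:
    print(exc)
```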
45 changes: 45 additions & 0 deletions pandas/core/frame.py
@@ -2641,6 +2641,51 @@ def transpose(self, *args, **kwargs):

T = property(transpose)

# ----------------------------------------------------------------------
# Array Interface

# This is also set in IndexOpsMixin
# GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
__array_priority__ = 1000

def __array__(self, dtype=None):
return com.values_from_object(self)

def __array_wrap__(self, result: np.ndarray, context=None) -> 'DataFrame':
"""
We are called post ufunc; reconstruct the original object and dtypes.

Parameters
----------
result : np.ndarray
context

Returns
-------
DataFrame
"""

d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
result = self._constructor(result, **d)

# we try to cast extension array types back to the original
# TODO: this fails with duplicates, ugh
if self._data.any_extension_types:
result = result.astype(self.dtypes,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

# ideally we would define this to avoid the getattr checks, but it
# is slower
# @property
# def __array_interface__(self):
# """ provide numpy array interface method """
# values = self.values
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)

# ----------------------------------------------------------------------
# Picklability

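Tying the hook above to the whatsnew entry: after a ufunc produces an ndarray, ``__array_wrap__`` rebuilds the frame and, when extension dtypes are present, casts back with ``casting='same_kind'``. A quick sketch of the round trip (the dtype in the comment assumes this commit; exact repr may vary by version):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': pd.Series([1.0, np.nan, 4.0],
                                  dtype=pd.SparseDtype('float64', np.nan))})

# The ufunc yields an ndarray; __array_wrap__ reconstructs the frame and
# astype(..., casting='same_kind', errors='ignore') restores the dtypes.
print(np.sqrt(df).dtypes)  # A    Sparse[float64, nan]
```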
24 changes: 5 additions & 19 deletions pandas/core/generic.py
@@ -1919,25 +1919,6 @@ def empty(self):
# ----------------------------------------------------------------------
# Array Interface

-    # This is also set in IndexOpsMixin
-    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
-    __array_priority__ = 1000
-
-    def __array__(self, dtype=None):
-        return com.values_from_object(self)
-
-    def __array_wrap__(self, result, context=None):
-        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-        return self._constructor(result, **d).__finalize__(self)
-
-    # ideally we would define this to avoid the getattr checks, but
-    # is slower
-    # @property
-    # def __array_interface__(self):
-    #     """ provide numpy array interface method """
-    #     values = self.values
-    #     return dict(typestr=values.dtype.str,shape=values.shape,data=values)

def to_dense(self):
"""
Return dense representation of NDFrame (as opposed to sparse).
@@ -5693,6 +5674,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
**kwargs)
return self._constructor(new_data).__finalize__(self)

+        if not results:
+            if copy:
+                self = self.copy()
+            return self

# GH 19920: retain column metadata after concat
result = pd.concat(results, axis=1, copy=False)
result.columns = self.columns
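A hedged sketch of the degenerate case the new ``if not results`` branch presumably guards: a column-wise ``astype`` over a frame with no columns yields nothing to concatenate, so the object is now returned (copied) as-is:

```python
import pandas as pd

# No columns: the per-column astype loop produces an empty results list.
empty = pd.DataFrame(index=range(3))

# With the early return this hands back a copy of the frame instead of
# reaching pd.concat([]) on an empty list, which would raise.
out = empty.astype(pd.SparseDtype('float64'))
print(out.shape)  # (3, 0)
```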
11 changes: 9 additions & 2 deletions pandas/core/groupby/generic.py
@@ -156,12 +156,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,

obj = self.obj[data.items[locs]]
s = groupby(obj, self.grouper)
-            result = s.aggregate(lambda x: alt(x, axis=self.axis))
+            try:
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+            except Exception:
+                # we may have an exception in trying to aggregate
+                # continue and exclude the block
+                pass
+
+            finally:
+
+                dtype = block.values.dtype
+
                # see if we can cast the block back to the original dtype
-            result = block._try_coerce_and_cast_result(result)
+                result = block._try_coerce_and_cast_result(result, dtype=dtype)
newb = block.make_block(result)

new_items.append(locs)
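The control flow above is: attempt the fallback aggregation, swallow a failure (the block is excluded later), and in either case try to coerce the result back to the block's original dtype. A stripped-down sketch of that coercion step; ``coerce_result`` is illustrative, not the pandas-internal helper:

```python
import numpy as np
import pandas as pd

def coerce_result(result, dtype):
    """Best-effort cast of an aggregation result back to `dtype`."""
    try:
        return pd.array(np.atleast_1d(result), dtype=dtype)
    except (TypeError, ValueError):
        return result

vals = pd.Categorical(["foo", "bar", "bar"], ordered=True)
first = np.asarray(vals)[0]              # e.g. a 'first' aggregation
print(coerce_result(first, vals.dtype))  # stays categorical, not object
```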
27 changes: 23 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False):
elif is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.

+            # return the same type (Series) as our caller
try:
result = obj._values._from_sequence(result, dtype=dtype)
except Exception:
@@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs):
"""
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
try:
-            return self._cython_agg_general('mean', **kwargs)
+            return self._cython_agg_general(
+                'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1179,7 +1182,11 @@ def median(self, **kwargs):
Median of values within each group.
"""
try:
-            return self._cython_agg_general('median', **kwargs)
+            return self._cython_agg_general(
+                'median',
+                alt=lambda x, axis: Series(x).median(**kwargs),
+                **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs):
nv.validate_groupby_func('var', args, kwargs)
if ddof == 1:
try:
-                return self._cython_agg_general('var', **kwargs)
+                return self._cython_agg_general(
+                    'var',
+                    alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs),
+                    **kwargs)
except Exception:
f = lambda x: x.var(ddof=ddof, **kwargs)
with _group_selection_context(self):
@@ -1263,7 +1273,6 @@ def sem(self, ddof=1):
Series or DataFrame
Standard error of the mean of values within each group.
"""

return self.std(ddof=ddof) / np.sqrt(self.count())

@Substitution(name='groupby')
@@ -1320,6 +1329,16 @@ def f(self, **kwargs):
except Exception:
result = self.aggregate(
lambda x: npfunc(x, axis=self.axis))

+                # coerce the columns if we can
+                if isinstance(result, DataFrame):
+                    for col in result.columns:
+                        result[col] = self._try_cast(
+                            result[col], self.obj[col])
+                else:
+                    result = self._try_cast(
+                        result, self.obj)

if _convert:
result = result._convert(datetime=True)
return result
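With ``alt`` supplied, a dtype that the cython kernels reject (see the ops.py change below) no longer aborts the aggregation: the group-wise ``Series`` reduction runs instead. A sketch of what this enables, reusing the sparse dtype from the whatsnew entry:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'key': ['x', 'x', 'y'],
    'val': pd.Series([1.0, 2.0, 4.0],
                     dtype=pd.SparseDtype('float64', np.nan)),
})

# Sparse values raise NotImplementedError in the cython path, so mean()
# now falls back to the Series(x).mean() `alt` lambda per group.
print(df.groupby('key')['val'].mean())
```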
6 changes: 3 additions & 3 deletions pandas/core/groupby/ops.py
@@ -19,7 +19,7 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
-    is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
+    is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse,
is_timedelta64_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import _maybe_fill, isna

@@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,

# categoricals are only 1d, so we
# are not setup for dim transforming
-            if is_categorical_dtype(values):
+            if is_categorical_dtype(values) or is_sparse(values):
                raise NotImplementedError(
-                    "categoricals are not support in cython ops ATM")
+                    "{} are not supported in cython ops".format(values.dtype))
elif is_datetime64_any_dtype(values):
if how in ['add', 'prod', 'cumsum', 'cumprod']:
raise NotImplementedError(
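Both predicates used by the guard are public under ``pandas.api.types``; a quick check of what now gets routed away from the cython kernels:

```python
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse

print(is_categorical_dtype(pd.Categorical(['a', 'b'])))              # True
print(is_sparse(pd.Series([1.0], dtype=pd.SparseDtype('float64'))))  # True
print(is_sparse(pd.Series([1.0])))                                   # False
```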
16 changes: 15 additions & 1 deletion pandas/core/internals/blocks.py
@@ -600,7 +600,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
values = self.get_values(dtype=dtype)

# _astype_nansafe works fine with 1-d only
-            values = astype_nansafe(values.ravel(), dtype, copy=True)
+            values = astype_nansafe(
+                values.ravel(), dtype, copy=True, **kwargs)

# TODO(extension)
# should we make this attribute?
@@ -1767,6 +1768,19 @@ def _slice(self, slicer):

return self.values[slicer]

+    def _try_cast_result(self, result, dtype=None):
+        """
+        If an operation has produced, e.g., plain floats, try to cast the
+        result back to our extension array type where possible.
+        """
+        try:
+            result = self._holder._from_sequence(
+                np.asarray(result).ravel(), dtype=dtype)
+        except Exception:
+            pass
+
+        return result

def formatting_values(self):
# Deprecating the ability to override _formatting_values.
# Do the warning here, it's only used in pandas, since we
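``_from_sequence`` is the extension-array constructor classmethod; its public counterpart is ``pd.array``. A hedged sketch of the round trip ``_try_cast_result`` attempts, with the same swallow-and-keep-dense fallback:

```python
import numpy as np
import pandas as pd

dtype = pd.SparseDtype('float64', np.nan)
dense = np.array([1.0, np.nan, 3.0])  # e.g. the raw result of an op

try:
    # rebuild the extension array from the raveled ndarray
    result = pd.array(dense.ravel(), dtype=dtype)
except Exception:
    result = dense                    # keep the dense result on failure

print(result.dtype)  # Sparse[float64, nan]
```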
9 changes: 5 additions & 4 deletions pandas/core/nanops.py
@@ -72,11 +72,12 @@ def _f(*args, **kwargs):

class bottleneck_switch:

-    def __init__(self, **kwargs):
+    def __init__(self, name=None, **kwargs):
+        self.name = name
self.kwargs = kwargs

def __call__(self, alt):
-        bn_name = alt.__name__
+        bn_name = self.name or alt.__name__

try:
bn_func = getattr(bn, bn_name)
@@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):


def _nanminmax(meth, fill_value_typ):
-    @bottleneck_switch()
+    @bottleneck_switch(name='nan' + meth)
def reduction(values, axis=None, skipna=True, mask=None):

values, mask, dtype, dtype_max, fill_value = _get_values(
@@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None):
result = _wrap_results(result, dtype, fill_value)
return _maybe_null_out(result, axis, mask, values.shape)

-    reduction.__name__ = 'nan' + meth
return reduction


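Passing the name into the decorator lets ``bottleneck_switch`` resolve the matching bottleneck function up front instead of patching ``__name__`` afterwards. A self-contained sketch of the decorator-class pattern (simplified, not the pandas implementation):

```python
class switch:
    """Toy decorator class that can override the wrapped name."""

    def __init__(self, name=None):
        self.name = name

    def __call__(self, func):
        # resolve the effective name before wrapping, as nanops now does
        func.__name__ = self.name or func.__name__
        return func

@switch(name='nanmax')
def reduction(values):
    return max(values)

print(reduction.__name__)  # 'nanmax'
```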
27 changes: 23 additions & 4 deletions pandas/core/series.py
@@ -762,12 +762,31 @@ def __array__(self, dtype=None):
dtype = 'M8[ns]'
return np.asarray(self.array, dtype)

-    def __array_wrap__(self, result, context=None):
+    def __array_wrap__(self, result: np.ndarray, context=None) -> 'Series':
"""
-        Gets called after a ufunc.
+        We are called post ufunc; reconstruct the original object and dtypes.
+
+        Parameters
+        ----------
+        result : np.ndarray
+        context
+
+        Returns
+        -------
+        Series
"""
-        return self._constructor(result, index=self.index,
-                                 copy=False).__finalize__(self)
+        result = self._constructor(result, index=self.index,
+                                   copy=False)

+        # we try to cast extension array types back to the original
+        if is_extension_array_dtype(self):
+            result = result.astype(self.dtype,
+                                   copy=False,
+                                   errors='ignore',
+                                   casting='same_kind')
+
+        return result.__finalize__(self)

def __array_prepare__(self, result, context=None):
"""
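And the ``Series``-level counterpart of the frame example above (a sketch; the dtype in the comment assumes this commit):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 4.0], dtype=pd.SparseDtype('float64', np.nan))

# __array_wrap__ casts back with casting='same_kind', so the sparse
# dtype survives the ufunc instead of densifying to float64.
print(np.sqrt(s).dtype)  # Sparse[float64, nan]
```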
