BUG: preserve categorical & sparse types when grouping / pivot
preserve dtypes when applying a ufunc to a sparse dtype

closes pandas-dev#18502
closes pandas-dev#23743
jreback committed May 29, 2019
1 parent 7629a18 commit 8b5a3d3
Showing 16 changed files with 247 additions and 61 deletions.
60 changes: 59 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
@@ -154,7 +154,63 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwise.
Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will
cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before.

-.. _whatsnew_0250.api_breaking.incompatible_index_unions:
.. _whatsnew_0250.api_breaking.ufuncs:

ufuncs on Extension Dtype
^^^^^^^^^^^^^^^^^^^^^^^^^

Operations with ``numpy`` ufuncs on extension arrays, including sparse dtypes, will now preserve the
input dtype in the result; previously the result would be coerced to a dense dtype (:issue:`23743`).

.. ipython:: python

   df = pd.DataFrame({'A': pd.Series([1, np.nan, 3],
                                     dtype=pd.SparseDtype('float64', np.nan))})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [3]: np.sqrt(df).dtypes
   Out[3]:
   A    float64
   dtype: object

*New Behavior*:

.. ipython:: python

   np.sqrt(df).dtypes

.. _whatsnew_0250.api_breaking.groupby_categorical:

Categorical dtypes are preserved during groupby
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object``
dtype during groupby operations. pandas will now preserve these dtypes (:issue:`18502`).

.. ipython:: python

   df = pd.DataFrame({'payload': [-1, -2, -1, -2],
                      'col': pd.Categorical(["foo", "bar", "bar", "qux"],
                                            ordered=True)})
   df
   df.dtypes

*Previous Behavior*:

.. code-block:: python

   In [5]: df.groupby('payload').first().col.dtype
   Out[5]: dtype('O')

*New Behavior*:

.. ipython:: python

   df.groupby('payload').first().col.dtype

.. _whatsnew_0250.api_breaking.incompatible_index_unions:

Incompatible Index Type Unions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -168,6 +224,8 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).

*Previous Behavior*:

.. code-block:: python

   In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
   ...
   ValueError: can only call with other PeriodIndex-ed objects
8 changes: 5 additions & 3 deletions pandas/core/dtypes/cast.py
@@ -606,7 +606,7 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


-def astype_nansafe(arr, dtype, copy=True, skipna=False):
def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
@@ -617,8 +617,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the item sizes don't align.
-skipna: bool, default False
skipna : bool, default False
Whether or not we should skip NaN when casting as a string-type.
casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}
    optional, default 'unsafe'

Raises
------
@@ -704,7 +706,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):

if copy or is_object_dtype(arr) or is_object_dtype(dtype):
# Explicit copy, or required since NumPy can't view from / to object.
-return arr.astype(dtype, copy=True)
return arr.astype(dtype, copy=True, casting=casting)

return arr.view(dtype)

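For context: the new ``casting`` argument is forwarded to ``ndarray.astype``, so it follows NumPy's casting rules. A minimal sketch of those semantics (illustrative, not part of the commit):

    import numpy as np

    arr = np.array([1.5, 2.5], dtype='float64')
    arr.astype('float32', casting='same_kind')    # allowed: float -> float stays the same kind
    try:
        arr.astype('int64', casting='same_kind')  # float -> int changes kind
    except TypeError as exc:
        print(exc)                                # rejected under the 'same_kind' rule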
30 changes: 29 additions & 1 deletion pandas/core/generic.py
@@ -1949,8 +1949,31 @@ def __array__(self, dtype=None):
return com.values_from_object(self)

def __array_wrap__(self, result, context=None):
"""
We are called post ufunc; reconstruct the original object and dtypes.

Parameters
----------
result : np.ndarray
context

Returns
-------
Series or DataFrame
"""

d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-return self._constructor(result, **d).__finalize__(self)
result = self._constructor(result, **d)

# we try to cast extension array types back to the original
# TODO: this fails with duplicates, ugh
if self._data.any_extension_types:
result = result.astype(self.dtypes,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
# is slower
@@ -5755,6 +5778,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
**kwargs)
return self._constructor(new_data).__finalize__(self)

if not results:
if copy:
self = self.copy()
return self

# GH 19920: retain column metadata after concat
result = pd.concat(results, axis=1, copy=False)
result.columns = self.columns
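A hedged sketch of the user-visible effect of the ``__array_wrap__`` change above, mirroring the whatsnew example (pandas 0.25-era behavior assumed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': pd.Series([1.0, np.nan, 4.0],
                                      dtype=pd.SparseDtype('float64', np.nan))})
    # the ufunc computes on dense values; __array_wrap__ then attempts to
    # cast the result back to the original extension dtype
    print(np.sqrt(df).dtypes)   # expected: A    Sparse[float64, nan]

Because the round trip uses ``errors='ignore'`` with ``casting='same_kind'``, a result that cannot be cast back safely is left dense rather than raising.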
11 changes: 9 additions & 2 deletions pandas/core/groupby/generic.py
@@ -104,12 +104,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,

obj = self.obj[data.items[locs]]
s = groupby(obj, self.grouper)
-result = s.aggregate(lambda x: alt(x, axis=self.axis))
try:
result = s.aggregate(lambda x: alt(x, axis=self.axis))
except Exception:
# we may have an exception in trying to aggregate
# continue and exclude the block
pass

finally:

dtype = block.values.dtype

# see if we can cast the block back to the original dtype
-result = block._try_coerce_and_cast_result(result)
result = block._try_coerce_and_cast_result(result, dtype=dtype)
newb = block.make_block(result)

new_items.append(locs)
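The control flow above amounts to: try the python-space aggregation, then cast the block back to its original dtype on the way out. A simplified sketch of that pattern; ``agg_and_recast``, ``alt`` and ``from_sequence`` are hypothetical names, not pandas API:

    import numpy as np

    def agg_and_recast(values, alt, original_dtype, from_sequence):
        result = alt(values)   # python-space aggregation; caller excludes the block on failure
        try:
            # best-effort cast back to the block's original dtype
            return from_sequence(np.asarray(result).ravel(), dtype=original_dtype)
        except Exception:
            return result      # keep the uncast result if the dtype round trip fails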
24 changes: 20 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -784,6 +784,8 @@ def _try_cast(self, result, obj, numeric_only=False):
elif is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.

# return the same type (Series) as our caller
try:
result = obj._values._from_sequence(result, dtype=dtype)
except Exception:
@@ -1155,7 +1157,8 @@ def mean(self, *args, **kwargs):
"""
nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
try:
-return self._cython_agg_general('mean', **kwargs)
return self._cython_agg_general(
'mean', alt=lambda x, axis: Series(x).mean(), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1177,7 +1180,8 @@ def median(self, **kwargs):
Median of values within each group.
"""
try:
-return self._cython_agg_general('median', **kwargs)
return self._cython_agg_general(
'median', alt=lambda x, axis: Series(x).median(), **kwargs)
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1233,7 +1237,10 @@ def var(self, ddof=1, *args, **kwargs):
nv.validate_groupby_func('var', args, kwargs)
if ddof == 1:
try:
-return self._cython_agg_general('var', **kwargs)
return self._cython_agg_general(
'var',
alt=lambda x, axis: Series(x).var(ddof=ddof),
**kwargs)
except Exception:
f = lambda x: x.var(ddof=ddof, **kwargs)
with _group_selection_context(self):
@@ -1261,7 +1268,6 @@ def sem(self, ddof=1):
Series or DataFrame
Standard error of the mean of values within each group.
"""

return self.std(ddof=ddof) / np.sqrt(self.count())

@Substitution(name='groupby')
@@ -1318,6 +1324,16 @@ def f(self, **kwargs):
except Exception:
result = self.aggregate(
lambda x: npfunc(x, axis=self.axis))

# coerce the columns if we can
if isinstance(result, DataFrame):
for col in result.columns:
result[col] = self._try_cast(
result[col], self.obj[col])
else:
result = self._try_cast(
result, self.obj)

if _convert:
result = result._convert(datetime=True)
return result
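Taken together, these changes let a reduction that the cython path cannot handle (for example, on a sparse column) fall back to a per-group ``Series`` reduction via ``alt``. A hedged sketch of the observable behavior (exact output dtypes may vary by version):

    import pandas as pd

    df = pd.DataFrame({'key': [0, 0, 1],
                       'x': pd.Series([1.0, 2.0, 3.0],
                                      dtype=pd.SparseDtype('float64'))})
    # the cython op raises NotImplementedError for sparse values, so groupby
    # falls back to the alt path: Series(x).mean() per group
    print(df.groupby('key')['x'].mean())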
6 changes: 3 additions & 3 deletions pandas/core/groupby/ops.py
@@ -19,7 +19,7 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse,
is_timedelta64_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import _maybe_fill, isna

@@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,

# categoricals are only 1d, so we
# are not setup for dim transforming
-if is_categorical_dtype(values):
if is_categorical_dtype(values) or is_sparse(values):
raise NotImplementedError(
"categoricals are not support in cython ops ATM")
"{} are not support in cython ops".format(values.dtype))
elif is_datetime64_any_dtype(values):
if how in ['add', 'prod', 'cumsum', 'cumprod']:
raise NotImplementedError(
16 changes: 15 additions & 1 deletion pandas/core/internals/blocks.py
@@ -604,7 +604,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
values = self.get_values(dtype=dtype)

# _astype_nansafe works fine with 1-d only
-values = astype_nansafe(values.ravel(), dtype, copy=True)
values = astype_nansafe(
values.ravel(), dtype, copy=True, **kwargs)

# TODO(extension)
# should we make this attribute?
Expand Down Expand Up @@ -1771,6 +1772,19 @@ def _slice(self, slicer):

return self.values[slicer]

def _try_cast_result(self, result, dtype=None):
"""
if we have an operation that operates on, for example, floats,
we want to try to cast back to our EA here if possible
"""
try:
result = self._holder._from_sequence(
np.asarray(result).ravel(), dtype=dtype)
except Exception:
pass

return result

def formatting_values(self):
# Deprecating the ability to override _formatting_values.
# Do the warning here, it's only used in pandas, since we
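``_try_cast_result`` relies on the extension-array ``_from_sequence`` constructor to rebuild the EA from a dense result. A minimal sketch of that round trip using a public array type (illustrative only):

    import numpy as np
    import pandas as pd

    sparse = pd.arrays.SparseArray([1.0, np.nan, 3.0])
    dense_result = np.asarray(sparse) * 2   # a plain float ndarray, as an op would produce

    # rebuild the extension array from the dense result, as the block does
    roundtripped = type(sparse)._from_sequence(dense_result.ravel(),
                                               dtype=sparse.dtype)
    print(roundtripped.dtype)               # Sparse[float64, nan]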
9 changes: 5 additions & 4 deletions pandas/core/nanops.py
@@ -89,11 +89,12 @@ def _f(*args, **kwargs):

class bottleneck_switch:

-def __init__(self, **kwargs):
def __init__(self, name=None, **kwargs):
self.name = name
self.kwargs = kwargs

def __call__(self, alt):
-bn_name = alt.__name__
bn_name = self.name or alt.__name__

try:
bn_func = getattr(bn, bn_name)
@@ -821,7 +822,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):


def _nanminmax(meth, fill_value_typ):
-@bottleneck_switch()

@bottleneck_switch(name='nan' + meth)
def reduction(values, axis=None, skipna=True, mask=None):

values, mask, dtype, dtype_max, fill_value = _get_values(
@@ -841,7 +843,6 @@ def reduction(values, axis=None, skipna=True, mask=None):
result = _wrap_results(result, dtype, fill_value)
return _maybe_null_out(result, axis, mask, values.shape)

-reduction.__name__ = 'nan' + meth
return reduction


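The new ``name`` argument exists because the decorated reduction is generated inside a factory, and ``bottleneck_switch`` needs the public name ('nanmin', 'nanmax') to look up the matching bottleneck function before the function object exists. A small self-contained sketch of the pattern, with hypothetical names:

    def named(name=None):
        def wrapper(func):
            # set the generated function's public name up front,
            # instead of patching __name__ after the fact
            func.__name__ = name or func.__name__
            return func
        return wrapper

    def make_minmax(meth):
        @named(name='nan' + meth)
        def reduction(values):
            return getattr(values, meth)()
        return reduction

    print(make_minmax('min').__name__)   # 'nanmin'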
13 changes: 11 additions & 2 deletions pandas/core/series.py
@@ -749,8 +749,17 @@ def __array_wrap__(self, result, context=None):
"""
Gets called after a ufunc.
"""
-return self._constructor(result, index=self.index,
-copy=False).__finalize__(self)
result = self._constructor(result, index=self.index,
copy=False)

# we try to cast extension array types back to the original
if is_extension_array_dtype(self):
result = result.astype(self.dtype,
copy=False,
errors='ignore',
casting='same_kind')

return result.__finalize__(self)

def __array_prepare__(self, result, context=None):
"""
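The ``Series`` counterpart of the ``generic.py`` change; a short hedged example (expected output per the whatsnew entry above):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 9.0], dtype=pd.SparseDtype('float64', np.nan))
    print(np.sqrt(s).dtype)   # expected: Sparse[float64, nan]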
