From 90930eb299f07f10c5c217f21f60e9a4d8922fae Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 12 Nov 2016 12:01:19 -0500
Subject: [PATCH] ENH: add Series & DataFrame .agg/.aggregate to provide
 convenient function application that mimics the groupby(..).agg/.aggregate
 interface

.apply is now a synonym for .agg, and will accept dict/list-likes for
aggregations

CLN: rename .name attr -> ._selection_name on SeriesGroupBy for compat
(didn't exist on DataFrameGroupBy); resolves conflicts w.r.t. setting
.name on a groupby object

closes #1623
closes #14464 custom .describe
closes #14483
closes #15015
closes #7014
---
 doc/source/api.rst                    |   4 +
 doc/source/basics.rst                 | 242 +++++++++++++++++++++++++-
 doc/source/computation.rst            |   4 +-
 doc/source/groupby.rst                |   4 +-
 doc/source/timeseries.rst             |   6 +-
 doc/source/whatsnew/v0.20.0.txt       |  69 ++++++++
 pandas/core/base.py                   | 123 +++++++++++--
 pandas/core/frame.py                  |  70 +++++++-
 pandas/core/generic.py                |  85 ++++++++-
 pandas/core/groupby.py                |  41 +++--
 pandas/core/series.py                 |  55 ++++++
 pandas/tests/frame/test_apply.py      | 169 ++++++++++++++++++
 pandas/tests/groupby/test_groupby.py  |   2 +-
 pandas/tests/series/test_apply.py     | 168 +++++++++++++++++-
 pandas/tests/tseries/test_resample.py |  17 +-
 pandas/types/cast.py                  |  17 ++
 16 files changed, 1000 insertions(+), 76 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index f6bf480bebcfc..75751bc7ce6fc 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -313,6 +313,8 @@ Function application, GroupBy & Window
    :toctree: generated/

    Series.apply
+   Series.aggregate
+   Series.transform
    Series.map
    Series.groupby
    Series.rolling
@@ -830,6 +832,8 @@ Function application, GroupBy & Window

    DataFrame.apply
    DataFrame.applymap
+   DataFrame.aggregate
+   DataFrame.transform
    DataFrame.groupby
    DataFrame.rolling
    DataFrame.expanding
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index f649b3fd8a9a3..18a0c04811439 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -702,7 +702,8 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.

 1. `Tablewise Function Application`_: :meth:`~DataFrame.pipe`
 2. `Row or Column-wise Function Application`_: :meth:`~DataFrame.apply`
-3. Elementwise_ function application: :meth:`~DataFrame.applymap`
+3. `Aggregation API`_: :meth:`~DataFrame.agg` and :meth:`~DataFrame.transform`
+4. `Applying Elementwise Functions`_: :meth:`~DataFrame.applymap`

 .. _basics.pipe:

@@ -778,6 +779,13 @@ statistics methods, take an optional ``axis`` argument:
    df.apply(np.cumsum)
    df.apply(np.exp)

+``.apply()`` will also dispatch on a string method name.
+
+.. ipython:: python
+
+   df.apply('mean')
+   df.apply('mean', axis=1)
+
 Depending on the return type of the function passed to :meth:`~DataFrame.apply`,
 the result will either be of lower dimension or the same dimension.

@@ -827,16 +835,234 @@ set to True, the passed function will instead receive an ndarray object, which
 has positive performance implications if you do not need the indexing
 functionality.

-.. seealso::
+.. _basics.aggregate:
+
+Aggregation API
+~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+The aggregation API allows one to express possibly multiple aggregation operations in a single concise way.
+This API is similar across pandas objects: :ref:`groupby aggregates `,
+:ref:`window functions `, and the :ref:`resample API `.
+
+We will use a similar starting frame from above.
+
+.. ipython:: python
+
+   tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                       index=pd.date_range('1/1/2000', periods=10))
+   tsdf.iloc[3:7] = np.nan
+   tsdf
+
+Using a single function is equivalent to ``.apply``; you can also pass named methods as strings.
+These will return a ``Series`` of the output.
+
+.. ipython:: python
+
+   tsdf.agg(np.sum)
+
+   tsdf.agg('sum')
+
+   # these are equivalent to a ``.sum()`` because we are aggregating on a single function
+   tsdf.sum()
+
+On a ``Series`` this will result in a scalar value.
+
+.. ipython:: python
+
+   tsdf.A.agg('sum')
+
+
+Aggregating multiple functions at once
+++++++++++++++++++++++++++++++++++++++
+
+You can pass arguments as a list. The results of each of the passed functions will be a row in the resulting ``DataFrame``.
+These are naturally named from the aggregation functions.
+
+.. ipython:: python
+
+   tsdf.agg(['sum'])
+
+Multiple functions yield multiple rows.

-   The section on :ref:`GroupBy ` demonstrates related, flexible
-   functionality for grouping by some criterion, applying, and combining the
-   results into a Series, DataFrame, etc.
+.. ipython:: python
+
+   tsdf.agg(['sum', 'mean'])
+
+On a Series, multiple functions return a Series, indexed by the function names.
+
+.. ipython:: python
+
+   tsdf.A.agg(['sum', 'mean'])
+
+
+Aggregating with a dict of functions
+++++++++++++++++++++++++++++++++++++
+
+Passing a dictionary of column names to functions (or lists of functions) to ``DataFrame.agg``
+allows you to customize which functions are applied to which columns.
+
+.. ipython:: python
+
+   tsdf.agg({'A': 'mean', 'B': 'sum'})
+
+Passing a list-like will generate a DataFrame output. You will get a matrix-like output
+of all of the aggregators; some cells may be missing values.
+
+.. ipython:: python
+
+   tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})

-.. _Elementwise:
+For a Series, you can pass a dict. You will get back a MultiIndex Series; the outer level will
+be the keys, the inner the names of the functions.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : ['sum', 'mean']})
+
+Alternatively, using a dict with multiple keys, you can rename the elements of the aggregation.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : 'sum', 'bar': 'mean'})
+
+Multiple keys will yield a MultiIndex Series. The outer level will be the keys, the inner
+the names of the functions.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : ['sum', 'mean'], 'bar': ['min', 'max', lambda x: x.sum()+1]})
+
+.. _basics.aggregation.mixed_dtypes:
+
+Mixed Dtypes
+++++++++++++
+
+When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
+aggregations. This is similar to how groupby ``.agg`` works.
+
+.. ipython:: python

-Applying elementwise Python functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   mdf = pd.DataFrame({'A': [1, 2, 3],
+                       'B': [1., 2., 3.],
+                       'C': ['foo', 'bar', 'baz'],
+                       'D': pd.date_range('20130101', periods=3)})
+   mdf.dtypes
+
+.. ipython:: python
+
+   mdf.agg(['min', 'sum'])
+
+.. _basics.aggregation.custom_describe:
+
+Custom describe
++++++++++++++++
+
+With ``.agg()`` it is possible to easily create a custom describe function, similar
+to the built-in :ref:`describe function `.
+
+.. ipython:: python
+
+   from functools import partial
+
+   q_25 = partial(pd.Series.quantile, q=0.25)
+   q_25.__name__ = '25%'
+   q_75 = partial(pd.Series.quantile, q=0.75)
+   q_75.__name__ = '75%'
+
+   tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max'])
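As a further illustrative sketch (the ``iqr`` helper below is hypothetical and
not part of the patch), any named callable can participate in such a list;
``.agg`` labels its row with the function's ``__name__``::

    def iqr(x):
        # interquartile range; __name__ ('iqr') becomes the row label
        return x.quantile(0.75) - x.quantile(0.25)

    tsdf.agg(['count', 'mean', iqr])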
+.. _basics.transform:
+
+Transform API
+~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+The ``transform`` method returns an object that is indexed the same (same size)
+as the original. This API allows you to provide *multiple* operations at the same
+time rather than one-by-one. Its API is quite similar to the ``.agg`` API.
+
+We will use a frame similar to the one in the sections above.
+
+.. ipython:: python
+
+   tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                       index=pd.date_range('1/1/2000', periods=10))
+   tsdf.iloc[3:7] = np.nan
+   tsdf
+
+Transform the entire frame. ``.transform()`` accepts a NumPy function, a string
+function name, or a user-defined function.
+
+.. ipython:: python
+
+   tsdf.transform(np.abs)
+   tsdf.transform('abs')
+   tsdf.transform(lambda x: x.abs())
+
+Since this is a single function, this is equivalent to a ufunc application.
+
+.. ipython:: python
+
+   np.abs(tsdf)
+
+Passing a single function to ``.transform()`` with a Series will yield a single Series in return.
+
+.. ipython:: python
+
+   tsdf.A.transform(np.abs)
+
+
+Transform with multiple functions
++++++++++++++++++++++++++++++++++
+
+Passing multiple functions will yield a column multi-indexed DataFrame.
+The first level will be the original frame column names; the second level
+will be the names of the transforming functions.
+
+.. ipython:: python
+
+   tsdf.transform([np.abs, lambda x: x+1])
+
+Passing multiple functions to a Series will yield a DataFrame. The
+resulting column names will be the transforming functions.
+
+.. ipython:: python
+
+   tsdf.A.transform([np.abs, lambda x: x+1])
+
+
+Transforming with a dict of functions
++++++++++++++++++++++++++++++++++++++
+
+
+Passing a dict of functions will allow selective transforming per column.
+
+.. ipython:: python
+
+   tsdf.transform({'A': np.abs, 'B': lambda x: x+1})
+
+Passing a dict of lists will generate a multi-indexed DataFrame with these
+selective transforms.
+
+.. ipython:: python
+
+   tsdf.transform({'A': np.abs, 'B': [lambda x: x+1, 'sqrt']})
+
+On a Series, passing a dict allows renaming as in ``.agg()``.
+
+.. ipython:: python
+
+   tsdf.A.transform({'foo': np.abs})
+   tsdf.A.transform({'foo': np.abs, 'bar': [lambda x: x+1, 'sqrt']})
+
+
+.. _basics.elementwise:
+
+Applying Elementwise Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Since not all functions can be vectorized (accept NumPy arrays and return
 another array or value), the methods :meth:`~DataFrame.applymap` on DataFrame
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 57480a244f308..a4750a920b0d6 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -565,7 +565,9 @@ Aggregation
 -----------

 Once the ``Rolling``, ``Expanding`` or ``EWM`` objects have been created, several methods are available to
-perform multiple computations on the data. This is very similar to a ``.groupby(...).agg`` seen :ref:`here `.
+perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `,
+:ref:`groupby aggregates `, and :ref:`resample API `.
+

 .. ipython:: python

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 8484ccd69a983..d4ee41c249a5a 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -439,7 +439,9 @@ Aggregation
 -----------

 Once the GroupBy object has been created, several methods are available to
-perform a computation on the grouped data.
+perform a computation on the grouped data. These operations are similar to the
+:ref:`aggregating API `, :ref:`window functions `,
+and :ref:`resample API `.

 An obvious one is aggregation via the ``aggregate`` or equivalently ``agg`` method:

diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 7136b15a7633a..783a2fea48d9d 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -1470,11 +1470,13 @@ We can instead only resample those groups where we have points as follows:

    ts.groupby(partial(round, freq='3T')).sum()

+.. _timeseries.aggregate:
+
 Aggregation
 ~~~~~~~~~~~

-Similar to :ref:`groupby aggregates ` and the :ref:`window functions `, a ``Resampler`` can be selectively
-resampled.
+Similar to the :ref:`aggregating API `, :ref:`groupby aggregates `, and :ref:`window functions `,
+a ``Resampler`` can be selectively resampled.

 Resampling a ``DataFrame``, the default will be to act on all columns with the same function.

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 37a70435ed6ff..dd7140610d036 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -9,6 +9,8 @@ users upgrade to this version.

 Highlights include:

+- new ``.agg()`` API for Series/DataFrame similar to the groupby-rolling-resample APIs, see :ref:`here `
+- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `.
 - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
 - The ``.ix`` indexer has been deprecated, see :ref:`here `
 - Switched the test framework to `pytest`_ (:issue:`13097`)
@@ -27,6 +29,73 @@ Check the :ref:`API Changes ` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~

+.. _whatsnew_0200.enhancements.agg:
+
+``agg`` API
+^^^^^^^^^^^
+
+Series & DataFrame have been enhanced to support the aggregation API. This is an already familiar API that
+is supported for groupby, window operations, and resampling. This allows one to express possibly multiple
+aggregation operations in a single concise way by using ``.agg()`` and ``.transform()``. The
+full documentation is :ref:`here ` (:issue:`1623`).
+
+Here is a sample:
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                     index=pd.date_range('1/1/2000', periods=10))
+   df.iloc[3:7] = np.nan
+   df
+
+One can operate using string function names, callables, lists, or dictionaries of these.
+
+Using a single function is equivalent to ``.apply``.
+
+.. ipython:: python
+
+   df.agg('sum')
+
+Multiple functions can be passed as a list.
+
+.. ipython:: python
+
+   df.agg(['sum', 'min'])
+
+Dictionaries provide the ability to perform calculations selectively per column.
+
+.. ipython:: python
+
+   df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+
+When operating on a Series, passing a dictionary allows one to rename multiple
+function aggregates; this will return a MultiIndexed Series. The outer level
+contains the keys, the inner the names of the functions.
+
+.. ipython:: python
+
+   df.A.agg({'foo':['sum', 'min'], 'bar' : ['count','max']})
+
+The API also supports a ``.transform()`` function to provide for broadcasting results.
+
+.. ipython:: python
+
+   df.transform(['abs', lambda x: x-x.min()])
+
+When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
+aggregations. This is similar to how groupby ``.agg`` works. (:issue:`15015`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': [1, 2, 3],
+                      'B': [1., 2., 3.],
+                      'C': ['foo', 'bar', 'baz'],
+                      'D': pd.date_range('20130101', periods=3)})
+   df.dtypes
+
+.. ipython:: python
+
+   df.agg(['min', 'sum'])
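Since ``.apply`` now dispatches list and dict arguments to ``.agg``, the two
spellings below should be interchangeable for aggregations; a minimal sketch
(not part of the patch) reusing the mixed-dtype ``df`` defined just above::

    # a list argument to .apply is routed through .agg,
    # so both lines should produce the same frame
    df[['A', 'B']].apply(['min', 'sum'])
    df[['A', 'B']].agg(['min', 'sum'])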

 .. _whatsnew_0200.enhancements.dataio_dtype:

diff --git a/pandas/core/base.py b/pandas/core/base.py
index bde60be3ddcff..d26a47f178a53 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -290,7 +290,9 @@ class SelectionMixin(object):
     }

     @property
-    def name(self):
+    def _selection_name(self):
+        """ return a name for myself; this would ideally be the 'name' property, but
+        we cannot conflict with the Series.name property which can be set """
         if self._selection is None:
             return None  # 'result'
         else:
@@ -405,6 +407,26 @@ def aggregate(self, func, *args, **kwargs):

     agg = aggregate

+    def _try_aggregate_string_function(self, arg, *args, **kwargs):
+        """
+        if arg is a string, then try to operate on it:
+        - try to find a function on ourselves
+        - try to find a numpy function
+        - raise
+
+        """
+        assert isinstance(arg, compat.string_types)
+
+        f = getattr(self, arg, None)
+        if f is not None:
+            return f(*args, **kwargs)
+
+        f = getattr(np, arg, None)
+        if f is not None:
+            return f(self, *args, **kwargs)
+
+        raise ValueError("{} is an unknown string function".format(arg))
+
     def _aggregate(self, arg, *args, **kwargs):
         """
         provide an implementation for the aggregators
@@ -428,14 +450,19 @@ def _aggregate(self, arg, *args, **kwargs):
         is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
         is_nested_renamer = False

+        _axis = kwargs.pop('_axis', None)
+        if _axis is None:
+            _axis = getattr(self, 'axis', 0)
         _level = kwargs.pop('_level', None)
+
         if isinstance(arg, compat.string_types):
-            return getattr(self, arg)(*args, **kwargs), None
+            return self._try_aggregate_string_function(arg, *args,
+                                                       **kwargs), None

         if isinstance(arg, dict):

             # aggregate based on the passed dict
-            if self.axis != 0:  # pragma: no cover
+            if _axis != 0:  # pragma: no cover
                 raise ValueError('Can only pass dict with axis=0')

             obj = self._selected_obj
@@ -555,32 +582,74 @@ def _agg(arg, func):
                 result = _agg(arg, _agg_2dim)

             # combine results
+
+            def is_any_series():
+                # return a boolean if we have *any* nested series
+                return any([isinstance(r, ABCSeries)
+                            for r in compat.itervalues(result)])
+
+            def is_any_frame():
+                # return a boolean if we have *any* nested frames
+                return any([isinstance(r, ABCDataFrame)
+                            for r in compat.itervalues(result)])
+
             if isinstance(result, list):
-                result = concat(result, keys=keys, axis=1)
-            elif isinstance(list(compat.itervalues(result))[0],
-                            ABCDataFrame):
-                result = concat([result[k] for k in keys], keys=keys, axis=1)
-            else:
-                from pandas import DataFrame
+                return concat(result, keys=keys, axis=1), True
+
+            elif is_any_frame():
+                # we have a dict of DataFrames
+                # return a MI DataFrame
+
+                return concat([result[k] for k in keys],
+                              keys=keys, axis=1), True
+
+            elif isinstance(self, ABCSeries) and is_any_series():
+
+                # we have a dict of Series
+                # return a MI Series
+                try:
+                    result = concat(result)
+                except TypeError:
+                    # we want to give a nice error here if
+                    # we have non-same sized objects, so
+                    # we don't automatically broadcast
+
+                    raise ValueError("cannot perform both aggregation "
+                                     "and transformation operations "
+                                     "simultaneously")
+
+                return result, True
+
+            # fall thru
+            from pandas import DataFrame, Series
+            try:
                 result = DataFrame(result)
+            except ValueError:
+
+                # we have a dict of scalars
+                result = Series(result,
+                                name=getattr(self, 'name', None))

             return result, True
-        elif hasattr(arg, '__iter__'):
-            return self._aggregate_multiple_funcs(arg, _level=_level), None
+        elif is_list_like(arg) and not isinstance(arg, compat.string_types):
+            # we require a list, but not an 'str'
+            return self._aggregate_multiple_funcs(arg,
+                                                  _level=_level,
+                                                  _axis=_axis), None
         else:
             result = None

-        cy_func = self._is_cython_func(arg)
-        if cy_func and not args and not kwargs:
-            return getattr(self, cy_func)(), None
+        f = self._is_cython_func(arg)
+        if f and not args and not kwargs:
+            return getattr(self, f)(), None

         # caller can react
         return result, True

-    def _aggregate_multiple_funcs(self, arg, _level):
+    def _aggregate_multiple_funcs(self, arg, _level, _axis):
         from pandas.tools.concat import concat

-        if self.axis != 0:
+        if _axis != 0:
             raise NotImplementedError("axis other than 0 is not supported")

         if self._selected_obj.ndim == 1:
@@ -615,10 +684,30 @@ def _aggregate_multiple_funcs(self, arg, _level):
                     keys.append(col)
                 except (TypeError, DataError):
                     pass
+                except ValueError:
+                    # cannot aggregate
+                    continue
                 except SpecificationError:
                     raise

-        return concat(results, keys=keys, axis=1)
+        # if we are empty
+        if not len(results):
+            raise ValueError("no results")
+
+        try:
+            return concat(results, keys=keys, axis=1)
+        except TypeError:
+
+            # we are concatting non-NDFrame objects,
+            # e.g. a list of scalars
+
+            from pandas.types.cast import is_nested_object
+            from pandas import Series
+            result = Series(results, index=keys, name=self.name)
+            if is_nested_object(result):
+                raise ValueError("cannot combine transform and "
+                                 "aggregation operations")
+            return result

     def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
         """ return a new object with the replacement attributes """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 6b5e8e0799421..24d09164399d6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4064,6 +4064,42 @@ def diff(self, periods=1, axis=0):
     # ----------------------------------------------------------------------
     # Function application

+    def _gotitem(self, key, ndim, subset=None):
+        """
+        sub-classes to define
+        return a sliced object
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : 1,2
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        if subset is None:
+            subset = self
+
+        # TODO: _shallow_copy(subset)?
+        return self[key]
+
+    @Appender(_shared_docs['aggregate'] % _shared_doc_kwargs)
+    def aggregate(self, func, axis=0, *args, **kwargs):
+        axis = self._get_axis_number(axis)
+
+        # TODO: flipped axis
+        result = None
+        if axis == 0:
+            try:
+                result, how = self._aggregate(func, axis=0, *args, **kwargs)
+            except TypeError:
+                pass
+        if result is None:
+            return self.apply(func, axis=axis, args=args, **kwargs)
+        return result
+
+    agg = aggregate
+
     def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
               args=(), **kwds):
         """
@@ -4119,22 +4155,35 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
         See also
         --------
         DataFrame.applymap: For elementwise operations
+        DataFrame.agg: only perform aggregating type operations
+        DataFrame.transform: only perform transforming type operations

         Returns
         -------
         applied : Series or DataFrame
         """
         axis = self._get_axis_number(axis)
-        if kwds or args and not isinstance(func, np.ufunc):
+        ignore_failures = kwds.pop('ignore_failures', False)
+
+        # dispatch to agg
+        if axis == 0 and isinstance(func, (list, dict)):
+            return self.aggregate(func, axis=axis, *args, **kwds)
+
+        if len(self.columns) == 0 and len(self.index) == 0:
+            return self._apply_empty_result(func, axis, reduce, *args, **kwds)
+
+        # if we are a string, try to dispatch
+        if isinstance(func, compat.string_types):
+            if axis:
+                kwds['axis'] = axis
+            return getattr(self, func)(*args, **kwds)
+
+        if kwds or args and not isinstance(func, np.ufunc):
             def f(x):
                 return func(x, *args, **kwds)
         else:
             f = func

-        if len(self.columns) == 0 and len(self.index) == 0:
-            return self._apply_empty_result(func, axis, reduce, *args, **kwds)
-
         if isinstance(f, np.ufunc):
             with np.errstate(all='ignore'):
                 results = f(self.values)
@@ -4151,7 +4200,10 @@ def f(x):
         else:
             if reduce is None:
                 reduce = True
-            return self._apply_standard(f, axis, reduce=reduce)
+            return self._apply_standard(
+                f, axis,
+                reduce=reduce,
+                ignore_failures=ignore_failures)
         else:
             return self._apply_broadcast(f, axis)

@@ -4958,7 +5010,13 @@ def f(x):
             # this can end up with a non-reduction
             # but not always. if the types are mixed
             # with datelike then need to make sure a series
-            result = self.apply(f, reduce=False)
+
+            # we only end up here if we have not specified
+            # numeric_only and yet we have tried a
+            # column-by-column reduction, where we have mixed type.
+            # So let's just do what we can
+            result = self.apply(f, reduce=False,
+                                ignore_failures=True)
             if result.ndim == self.ndim:
                 result = result.iloc[0]
             return result
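To make the new dispatch order in ``DataFrame.apply`` concrete, here is a short
illustrative session (an editorial sketch under the patch's semantics, not part
of the diff): list/dict arguments are routed to ``.agg``, while a string is
resolved as a method name on the frame::

    import pandas as pd

    df = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})

    # a list (or dict) argument is dispatched to .agg
    df.apply(['sum', 'min'])     # same result as df.agg(['sum', 'min'])

    # a string is resolved via getattr, so this calls df.mean(axis=1)
    df.apply('mean', axis=1)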
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 87052800b8fb5..f44b3f27717bd 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -32,7 +32,7 @@
                                SettingWithCopyError, SettingWithCopyWarning,
                                AbstractMethodError)

-from pandas.core.base import PandasObject
+from pandas.core.base import PandasObject, SelectionMixin
 from pandas.core.index import (Index, MultiIndex, _ensure_index,
                                InvalidIndexError)
 import pandas.core.indexing as indexing
@@ -91,7 +91,7 @@ def _single_replace(self, to_replace, method, inplace, limit):
     return result


-class NDFrame(PandasObject):
+class NDFrame(PandasObject, SelectionMixin):
     """
     N-dimensional analogue of DataFrame. Store multi-dimensional in a
     size-mutable, labeled data structure
@@ -459,6 +459,16 @@ def size(self):
         """number of elements in the NDFrame"""
         return np.prod(self.shape)

+    @property
+    def _selected_obj(self):
+        """ internal compat with SelectionMixin """
+        return self
+
+    @property
+    def _obj_with_exclusions(self):
+        """ internal compat with SelectionMixin """
+        return self
+
     def _expand_axes(self, key):
         new_axes = []
         for k, ax in zip(key, self.axes):
@@ -2853,6 +2863,66 @@ def pipe(self, func, *args, **kwargs):
         else:
             return func(self, *args, **kwargs)

+    _shared_docs['aggregate'] = ("""
+    Aggregate using input function or dict of {column ->
+    function}
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    func : callable, string, dictionary, or list of string/callables
+        Function to use for aggregating the data. If a function, must either
+        work when passed a DataFrame or when passed to DataFrame.apply. If
+        passed a dict, the keys must be DataFrame column names.
+
+        Accepted Combinations are:
+        - string function name
+        - function
+        - list of functions
+        - dict of column names -> functions (or list of functions)
+
+    Notes
+    -----
+    Numpy functions mean/median/prod/sum/std/var are special cased so the
+    default behavior is applying the function along axis=0
+    (e.g., np.mean(arr_2d, axis=0)) as opposed to
+    mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).
+
+    Returns
+    -------
+    aggregated : %(klass)s
+
+    See also
+    --------
+    """)
+
+    _shared_docs['transform'] = ("""
+    Call function producing a like-indexed %(klass)s
+    and return a %(klass)s with the transformed values
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    func : callable, string, dictionary, or list of string/callables
+        To apply to column
+
+        Accepted Combinations are:
+        - string function name
+        - function
+        - list of functions
+        - dict of column names -> functions (or list of functions)
+
+    Examples
+    --------
+    >>> df.transform(lambda x: (x - x.mean()) / x.std())
+
+    Returns
+    -------
+    transformed : %(klass)s
+    """)
+
     # ----------------------------------------------------------------------
     # Attribute access

@@ -5977,6 +6047,17 @@ def ewm(self, com=None, span=None, halflife=None, alpha=None,

         cls.ewm = ewm

+        @Appender(_shared_docs['transform'] % _shared_doc_kwargs)
+        def transform(self, func, *args, **kwargs):
+            result = self.agg(func, *args, **kwargs)
+            if is_scalar(result) or len(result) != len(self):
+                raise ValueError("transforms cannot produce "
+                                 "aggregated results")
+
+            return result
+
+        cls.transform = transform
+

 def _doc_parms(cls):
     """Return a tuple of the doc parms."""
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 727af8b8cd3eb..28b0126a43e07 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -713,7 +713,7 @@ def _python_apply_general(self, f):
                                    not_indexed_same=mutated or
                                    self.mutated)

     def _iterate_slices(self):
-        yield self.name, self._selected_obj
+        yield self._selection_name, self._selected_obj

     def transform(self, func, *args, **kwargs):
         raise AbstractMethodError(self)
@@ -912,9 +912,9 @@ def reset_identity(values):
                 result = concat(values, axis=self.axis)

             if (isinstance(result, Series) and
-                    getattr(self, 'name', None) is not None):
+                    getattr(self, '_selection_name', None) is not None):

-                result.name = self.name
+                result.name = self._selection_name

             return result

@@ -1114,7 +1114,7 @@ def size(self):
         result = self.grouper.size()

         if isinstance(self.obj, Series):
-            result.name = getattr(self, 'name', None)
+            result.name = getattr(self.obj, 'name',
+                                  None)
         return result

     sum = _groupby_function('sum', 'add', np.sum)
@@ -2655,7 +2655,7 @@ class SeriesGroupBy(GroupBy):
         exec(_def_str)

     @property
-    def name(self):
+    def _selection_name(self):
         """
         since we are a series, we by definition only have
         a single name, but may be the result of a selection or
@@ -2798,12 +2798,12 @@ def _aggregate_multiple_funcs(self, arg, _level):

     def _wrap_output(self, output, index, names=None):
         """ common agg/transform wrapping logic """
-        output = output[self.name]
+        output = output[self._selection_name]

         if names is not None:
             return DataFrame(output, index=index, columns=names)
         else:
-            name = self.name
+            name = self._selection_name
             if name is None:
                 name = self._selected_obj.name
             return Series(output, index=index, name=name)
@@ -2821,7 +2821,7 @@ def _wrap_transformed_output(self, output, names=None):
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
             # GH #6265
-            return Series([], name=self.name, index=keys)
+            return Series([], name=self._selection_name, index=keys)

         def _get_index():
             if self.grouper.nkeys > 1:
@@ -2834,7 +2834,7 @@ def _get_index():
             # GH #823
             index = _get_index()
             result = DataFrame(values, index=index).stack()
-            result.name = self.name
+            result.name = self._selection_name
             return result

         if isinstance(values[0], (Series, dict)):
@@ -2846,7 +2846,8 @@ def _get_index():
                                             not_indexed_same=not_indexed_same)
         else:
             # GH #6265
-            return Series(values, index=_get_index(), name=self.name)
+            return Series(values, index=_get_index(),
+                          name=self._selection_name)

     def _aggregate_named(self, func, *args, **kwargs):
         result = {}
@@ -3023,7 +3024,7 @@ def nunique(self, dropna=True):

         return Series(res,
                       index=ri,
-                      name=self.name)
+                      name=self._selection_name)

     @Appender(Series.describe.__doc__)
     def describe(self, **kwargs):
@@ -3081,7 +3082,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         # multi-index components
         labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
         levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
-        names = self.grouper.names + [self.name]
+        names = self.grouper.names + [self._selection_name]

         if dropna:
             mask = labels[-1] != -1
@@ -3116,7 +3117,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         if is_integer_dtype(out):
             out = _ensure_int64(out)

-        return Series(out, index=mi, name=self.name)
+        return Series(out, index=mi, name=self._selection_name)

         # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
@@ -3147,7 +3148,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         if is_integer_dtype(out):
             out = _ensure_int64(out)

-        return Series(out, index=mi, name=self.name)
+        return Series(out, index=mi, name=self._selection_name)

     def count(self):
         """ Compute count of group, excluding missing values """
@@ -3160,7 +3161,7 @@ def count(self):

         return Series(out,
                       index=self.grouper.result_index,
-                      name=self.name,
+                      name=self._selection_name,
                       dtype='int64')

     def _apply_to_column_groupbys(self, func):
@@ -3316,7 +3317,7 @@ def aggregate(self, arg, *args, **kwargs):
             try:
                 assert not args and not kwargs
                 result = self._aggregate_multiple_funcs(
-                    [arg], _level=_level)
+                    [arg], _level=_level, _axis=self.axis)
                 result.columns = Index(
                     result.columns.levels[0],
                     name=self._selected_obj.columns.name)
@@ -3548,7 +3549,8 @@ def first_non_None_value(values):
             except (ValueError, AttributeError):
                 # GH1738: values is list of arrays of unequal lengths fall
                 # through to the outer else clause
-                return Series(values, index=key_index, name=self.name)
+                return Series(values, index=key_index,
+                              name=self._selection_name)

             # if we have date/time like in the original, then coerce dates
             # as we are stacking can easily have object dtypes here
@@ -3572,8 +3574,9 @@ def first_non_None_value(values):
             # only coerce dates if we find at least 1 datetime
             coerce = True if any([isinstance(x, Timestamp)
                                   for x in values]) else False
-            # self.name not passed through to Series as the result
-            # should not take the name of original selection of columns
+            # self._selection_name not passed through to Series as the
+            # result should not take the name of original selection
+            # of columns
             return (Series(values, index=key_index)
                     ._convert(datetime=True,
                               coerce=coerce))
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0913592e055cd..5522cc9361ed7 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2129,6 +2129,49 @@ def map_f(values, f):
         return self._constructor(new_values,
                                  index=self.index).__finalize__(self)

+    def _gotitem(self, key, ndim, subset=None):
+        """
+        sub-classes to define
+        return a sliced object
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : 1,2
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        return self
+
+    @Appender(generic._shared_docs['aggregate'] % _shared_doc_kwargs)
+    def aggregate(self, func, axis=0, *args, **kwargs):
+        axis = self._get_axis_number(axis)
+        result, how = self._aggregate(func, *args, **kwargs)
+        if result is None:
+
+            # we can be called from an inner function which
+            # passes this meta-data
+            kwargs.pop('_axis', None)
+            kwargs.pop('_level', None)
+
+            # try a regular apply, this evaluates lambdas
+            # row-by-row; however, if the lambda expects a Series
+            # expression, e.g.: lambda x: x-x.quantile(0.25)
+            # this will fail, so we can try a vectorized evaluation
+
+            # we cannot FIRST try the vectorized evaluation, because
+            # then .agg and .apply would have different semantics if the
+            # operation is actually defined on the Series, e.g. str
+            try:
+                result = self.apply(func, *args, **kwargs)
+            except (ValueError, AttributeError, TypeError):
+                result = func(self, *args, **kwargs)
+
+        return result
+
+    agg = aggregate
+
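A brief illustrative session (again an editorial sketch, not part of the diff)
showing the two evaluation paths described in the comments above: ``Series.agg``
first tries a row-by-row ``apply``, and only falls back to a vectorized call on
the whole ``Series`` when that raises::

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0])

    # evaluated row-by-row: x is each scalar element
    s.agg(lambda x: x * 2)

    # row-by-row evaluation raises (a float has no .quantile), so .agg
    # falls back to calling the function on the whole Series
    s.agg(lambda x: x - x.quantile(0.25))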
     def apply(self, func, convert_dtype=True, args=(), **kwds):
         """
         Invoke function on values of Series. Can be ufunc (a NumPy function
@@ -2152,6 +2195,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         See also
         --------
         Series.map: For element-wise operations
+        Series.agg: only perform aggregating type operations
+        Series.transform: only perform transforming type operations

         Examples
         --------
@@ -2229,6 +2274,15 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
             return self._constructor(dtype=self.dtype,
                                      index=self.index).__finalize__(self)

+        # dispatch to agg
+        if isinstance(func, (list, dict)):
+            return self.aggregate(func, *args, **kwds)
+
+        # if we are a string, try to dispatch
+        if isinstance(func, compat.string_types):
+            return self._try_aggregate_string_function(func, *args, **kwds)
+
+        # handle ufuncs and lambdas
         if kwds or args and not isinstance(func, np.ufunc):
             f = lambda x: func(x, *args, **kwds)
         else:
@@ -2238,6 +2292,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         if isinstance(f, np.ufunc):
             return f(self)

+        # row-wise access
         if is_extension_type(self.dtype):
             mapped = self._values.map(f)
         else:
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 30fde4b5b78d8..8352a2a22e51d 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -106,6 +106,17 @@ def test_apply_standard_nonunique(self):
         rs = df.T.apply(lambda s: s[0], axis=0)
         assert_series_equal(rs, xp)

+    def test_with_string_args(self):
+
+        for arg in ['sum', 'mean', 'min', 'max', 'std']:
+            result = self.frame.apply(arg)
+            expected = getattr(self.frame, arg)()
+            tm.assert_series_equal(result, expected)
+
+            result = self.frame.apply(arg, axis=1)
+            expected = getattr(self.frame, arg)(axis=1)
+            tm.assert_series_equal(result, expected)
+
     def test_apply_broadcast(self):
         broadcasted = self.frame.apply(np.mean, broadcast=True)
         agged = self.frame.apply(np.mean)
@@ -455,3 +466,161 @@ def test_apply_non_numpy_dtype(self):
         df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category')
         result = df.apply(lambda x: x)
         assert_frame_equal(result, df)
+
+
+def zip_frames(*frames):
+    """
+    take a list of frames, zip the columns together for each
+    assume that these all have the first frame columns
+
+    return a new frame
+    """
+    columns = frames[0].columns
+    zipped = [f[c] for c in columns for f in frames]
+    return pd.concat(zipped, axis=1)
+
+
+class TestDataFrameAggregate(tm.TestCase, TestData):
+
+    _multiprocess_can_split_ = True
+
+    def test_agg_transform(self):
+
+        with np.errstate(all='ignore'):
+
+            f_sqrt = np.sqrt(self.frame)
+            f_abs = np.abs(self.frame)
+
+            # ufunc
+            result = self.frame.transform(np.sqrt)
+            expected = f_sqrt.copy()
+            assert_frame_equal(result, expected)
+
+            result = self.frame.apply(np.sqrt)
+            assert_frame_equal(result, expected)
+
+            result = self.frame.transform(np.sqrt)
+            assert_frame_equal(result, expected)
+
+            # list-like
+            result = self.frame.apply([np.sqrt])
+            expected = f_sqrt.copy()
+            expected.columns = pd.MultiIndex.from_product(
+                [self.frame.columns, ['sqrt']])
+            assert_frame_equal(result, expected)
+
+            result = self.frame.transform([np.sqrt])
+            assert_frame_equal(result, expected)
+
+            # multiple items in list
+            # these are in the order as if we are applying both
+            # functions per series and then concatting
+            expected = zip_frames(f_sqrt, f_abs)
+            expected.columns = pd.MultiIndex.from_product(
+                [self.frame.columns, ['sqrt', 'absolute']])
+            result = self.frame.apply([np.sqrt, np.abs])
+            assert_frame_equal(result, expected)
+
+            result = self.frame.transform(['sqrt', np.abs])
+            assert_frame_equal(result, expected)
+
+    def test_transform_and_agg_err(self):
+        # cannot both transform and agg
+        def f():
+            self.frame.transform(['max', 'min'])
+        self.assertRaises(ValueError, f)
+
+        def f():
+            with np.errstate(all='ignore'):
+                self.frame.agg(['max', 'sqrt'])
+        self.assertRaises(ValueError, f)
+
+        def f():
+            with np.errstate(all='ignore'):
+                self.frame.transform(['max', 'sqrt'])
+        self.assertRaises(ValueError, f)
+
+        df = pd.DataFrame({'A': range(5), 'B': 5})
+
+        def f():
+            with np.errstate(all='ignore'):
+                df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']})
+
+    def test_demo(self):
+        # demonstration tests
+        df = pd.DataFrame({'A': range(5), 'B': 5})
+
+        result = df.agg(['min', 'max'])
+        expected = DataFrame({'A': [0, 4], 'B': [5, 5]},
+                             columns=['A', 'B'],
+                             index=['min', 'max'])
+        tm.assert_frame_equal(result, expected)
+
+        result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']})
+        expected = DataFrame({'A': [4.0, 0.0, np.nan],
+                              'B': [5.0, np.nan, 25.0]},
+                             columns=['A', 'B'],
+                             index=['max', 'min', 'sum'])
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+    def test_agg_reduce(self):
+        # all reducers
+        expected = zip_frames(self.frame.mean().to_frame(),
+                              self.frame.max().to_frame(),
+                              self.frame.sum().to_frame()).T
+        expected.index = ['mean', 'max', 'sum']
+        result = self.frame.agg(['mean', 'max', 'sum'])
+        assert_frame_equal(result, expected)
+
+        # dict input with scalars
+        result = self.frame.agg({'A': 'mean', 'B': 'sum'})
+        expected = Series([self.frame.A.mean(), self.frame.B.sum()],
+                          index=['A', 'B'])
+        assert_series_equal(result.reindex_like(expected), expected)
+
+        # dict input with lists
+        result = self.frame.agg({'A': ['mean'], 'B': ['sum']})
+        expected = DataFrame({'A': Series([self.frame.A.mean()],
+                                          index=['mean']),
+                              'B': Series([self.frame.B.sum()],
+                                          index=['sum'])})
+        assert_frame_equal(result.reindex_like(expected), expected)
+
+        # dict input with lists with multiple
+        result = self.frame.agg({'A': ['mean', 'sum'],
+                                 'B': ['sum', 'max']})
+        expected = DataFrame({'A': Series([self.frame.A.mean(),
+                                           self.frame.A.sum()],
+                                          index=['mean', 'sum']),
+                              'B': Series([self.frame.B.sum(),
+                                           self.frame.B.max()],
+                                          index=['sum', 'max'])})
+        assert_frame_equal(result.reindex_like(expected), expected)
+
+    def test_nuiscance_columns(self):
+
+        # GH 15015
+        df = DataFrame({'A': [1, 2, 3],
+                        'B': [1., 2., 3.],
+                        'C': ['foo', 'bar', 'baz'],
+                        'D': pd.date_range('20130101', periods=3)})
+
+        result = df.agg('min')
+        expected = Series([1, 1., 'bar', pd.Timestamp('20130101')],
+                          index=df.columns)
+        assert_series_equal(result, expected)
+
+        result = df.agg(['min'])
+        expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]],
+                             index=['min'], columns=df.columns)
+        assert_frame_equal(result, expected)
+
+        result = df.agg('sum')
+        expected = Series([6, 6., 'foobarbaz'],
+                          index=['A', 'B', 'C'])
+        assert_series_equal(result, expected)
+
+        result = df.agg(['sum'])
+        expected = DataFrame([[6, 6., 'foobarbaz']],
+                             index=['sum'], columns=['A', 'B', 'C'])
+        assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index a355dca3029c7..4a1a1642a5138 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3903,7 +3903,7 @@ def test_tab_completion(self):
         expected = set(
             ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
              'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
-             'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
+             'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
              'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
              'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank',
              'cumprod', 'tail', 'resample', 'cummin', 'fillna',
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
index 16d1466bb90fe..b750a18356e5a 100644
--- a/pandas/tests/series/test_apply.py
+++ b/pandas/tests/series/test_apply.py
@@ -1,13 +1,14 @@
 # coding=utf-8
 # pylint: disable-msg=E1101,W0612

+from collections import OrderedDict
 import numpy as np
 import pandas as pd

 from pandas import (Index, Series, DataFrame, isnull)
 from pandas.compat import lrange
 from pandas import compat
-from pandas.util.testing import assert_series_equal
+from pandas.util.testing import assert_series_equal, assert_frame_equal
 import pandas.util.testing as tm

 from .common import TestData
@@ -23,16 +24,11 @@ def test_apply(self):
         import math
         assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts))

-        # how to handle Series result, #2316
-        result = self.ts.apply(lambda x: Series(
-            [x, x ** 2], index=['x', 'x^2']))
-        expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2})
-        tm.assert_frame_equal(result, expected)
-
         # empty series
         s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
         rs = s.apply(lambda x: x)
         tm.assert_series_equal(s, rs)
+
         # check all metadata (GH 9322)
         self.assertIsNot(s, rs)
         self.assertIs(s.index, rs.index)
@@ -64,6 +60,13 @@ def test_apply_dont_convert_dtype(self):
         result = s.apply(f, convert_dtype=False)
         self.assertEqual(result.dtype, object)

+    def test_with_string_args(self):
+
+        for arg in ['sum', 'mean', 'min', 'max', 'std']:
+            result = self.ts.apply(arg)
+            expected = getattr(self.ts, arg)()
+            self.assertEqual(result, expected)
+
     def test_apply_args(self):
         s = Series(['foo,bar'])

@@ -137,6 +140,157 @@ def f(x):
         tm.assert_series_equal(result, exp)


+class TestSeriesAggregate(TestData, tm.TestCase):
+
+    _multiprocess_can_split_ = True
+
+    def test_transform(self):
+        # transforming functions
+
+        with np.errstate(all='ignore'):
+
+            f_sqrt = np.sqrt(self.series)
+            f_abs = np.abs(self.series)
+
+            # ufunc
+            result = self.series.transform(np.sqrt)
+            expected = f_sqrt.copy()
+            assert_series_equal(result, expected)
+
+            result = self.series.apply(np.sqrt)
+            assert_series_equal(result, expected)
+
+            # list-like
+            result = self.series.transform([np.sqrt])
+            expected = f_sqrt.to_frame().copy()
+            expected.columns = ['sqrt']
+            assert_frame_equal(result, expected)
+
+            result = self.series.transform([np.sqrt])
+            assert_frame_equal(result, expected)
+
+            result = self.series.transform(['sqrt'])
+            assert_frame_equal(result, expected)
+
+            # multiple items in list
+            # these are in the order as if we are applying both functions per
+            # series and then concatting
+            expected = pd.concat([f_sqrt, f_abs], axis=1)
+            expected.columns = ['sqrt', 'absolute']
+            result = self.series.apply([np.sqrt, np.abs])
+            assert_frame_equal(result, expected)
+
+            result = self.series.transform(['sqrt', 'abs'])
+            expected.columns = ['sqrt', 'abs']
+            assert_frame_equal(result, expected)
+
+            # dict, provide renaming
+            expected = pd.concat([f_sqrt, f_abs], axis=1)
+            expected.columns = ['foo', 'bar']
+            expected = expected.unstack().rename('series')
+
+            result = self.series.apply({'foo': np.sqrt, 'bar': np.abs})
+            assert_series_equal(result.reindex_like(expected), expected)
+
+    def test_transform_and_agg_error(self):
+        # we are trying to transform with an aggregator
+        def f():
+            self.series.transform(['min', 'max'])
+        self.assertRaises(ValueError, f)
+
+        def f():
+            with np.errstate(all='ignore'):
+                self.series.agg(['sqrt', 'max'])
+        self.assertRaises(ValueError, f)
+
+        def f():
+            with np.errstate(all='ignore'):
+                self.series.transform(['sqrt', 'max'])
+        self.assertRaises(ValueError, f)
+
+        def f():
+            with np.errstate(all='ignore'):
+                self.series.agg({'foo': np.sqrt, 'bar': 'sum'})
+        self.assertRaises(ValueError, f)
+
+    def test_demo(self):
+        # demonstration tests
+        s = Series(range(6), dtype='int64', name='series')
+
+        result = s.agg(['min', 'max'])
+        expected = Series([0, 5], index=['min', 'max'], name='series')
+        tm.assert_series_equal(result, expected)
+
+        result = s.agg({'foo': 'min'})
+        expected = Series([0], index=['foo'], name='series')
+        tm.assert_series_equal(result, expected)
+
+        result = s.agg({'foo': ['min', 'max']})
+        expected = DataFrame(
+            {'foo': [0, 5]},
+            index=['min', 'max']).unstack().rename('series')
+        tm.assert_series_equal(result, expected)
+
+    def test_multiple_aggregators_with_dict_api(self):
+
+        s = Series(range(6), dtype='int64', name='series')
+        result = s.agg({'foo': ['min', 'max'], 'bar': ['sum', 'mean']})
+
+        expected = DataFrame(
+            {'foo': [5.0, np.nan, 0.0, np.nan],
+             'bar': [np.nan, 2.5, np.nan, 15.0]},
+            columns=['foo', 'bar'],
+            index=['max', 'mean',
+                   'min', 'sum']).unstack().rename('series')
+        tm.assert_series_equal(result.reindex_like(expected), expected)
+
+    def test_agg_apply_evaluate_lambdas_the_same(self):
+        # test that we are evaluating row-by-row first
+        # before vectorized evaluation
+        result = self.series.apply(lambda x: str(x))
+        expected = self.series.agg(lambda x: str(x))
+        tm.assert_series_equal(result, expected)
+
+        result = self.series.apply(str)
+        expected = self.series.agg(str)
+        tm.assert_series_equal(result, expected)
+
+    def test_with_nested_series(self):
+        # GH 2316
+        # .agg with a reducer and a transform, what to do
+        result = self.ts.apply(lambda x: Series(
+            [x, x ** 2], index=['x', 'x^2']))
+        expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2})
+        tm.assert_frame_equal(result, expected)
+
+        result = self.ts.agg(lambda x: Series(
+            [x, x ** 2], index=['x', 'x^2']))
+        tm.assert_frame_equal(result, expected)
+
+    def test_replicate_describe(self):
+        # this also tests a result set that is all scalars
+        expected = self.series.describe()
+        result = self.series.apply(OrderedDict(
+            [('count', 'count'),
+             ('mean', 'mean'),
+             ('std', 'std'),
+             ('min', 'min'),
+             ('25%', lambda x: x.quantile(0.25)),
+             ('50%', 'median'),
+             ('75%', lambda x: x.quantile(0.75)),
+             ('max', 'max')]))
+        assert_series_equal(result, expected)
+
+    def test_reduce(self):
+        # reductions with named functions
+        result = self.series.agg(['sum', 'mean'])
+        expected = Series([self.series.sum(),
+                           self.series.mean()],
+                          ['sum', 'mean'],
+                          name=self.series.name)
+        assert_series_equal(result, expected)
+
+
 class TestSeriesMap(TestData, tm.TestCase):

     def test_map(self):
diff --git a/pandas/tests/tseries/test_resample.py b/pandas/tests/tseries/test_resample.py
index 57a655b0b7610..6bd1c9a034b0f 100755
--- a/pandas/tests/tseries/test_resample.py
+++ b/pandas/tests/tseries/test_resample.py
@@ -751,16 +751,7 @@ def test_resample_empty_series(self):
             expected.index = s.index._shallow_copy(freq=freq)
             assert_index_equal(result.index, expected.index)
             self.assertEqual(result.index.freq, expected.index.freq)
-
-            if (method == 'size' and
-                    isinstance(result.index, PeriodIndex) and
-                    freq in ['M', 'D']):
-                # GH12871 - TODO: name should propagate, but currently
-                # doesn't on lower / same frequency with PeriodIndex
-                assert_series_equal(result, expected, check_dtype=False)
-
-            else:
-                assert_series_equal(result, expected, check_dtype=False)
+            assert_series_equal(result, expected, check_dtype=False)

     def test_resample_empty_dataframe(self):
         # GH13212
@@ -1842,10 +1833,12 @@ def test_how_lambda_functions(self):
         tm.assert_series_equal(result['foo'], foo_exp)
         tm.assert_series_equal(result['bar'], bar_exp)

+        # this is a MI Series, so comparing the names of the results
+        # doesn't make sense
         result = ts.resample('M').aggregate({'foo': lambda x: x.mean(),
                                              'bar': lambda x: x.std(ddof=1)})
-        tm.assert_series_equal(result['foo'], foo_exp)
-        tm.assert_series_equal(result['bar'], bar_exp)
+        tm.assert_series_equal(result['foo'], foo_exp, check_names=False)
+        tm.assert_series_equal(result['bar'], bar_exp, check_names=False)

     def test_resample_unequal_times(self):
         # #1772
diff --git a/pandas/types/cast.py b/pandas/types/cast.py
index 91c7d287d6d46..9306b462774fc 100644
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
@@ -45,6 +45,23 @@ def maybe_convert_platform(values):
     return values


+def is_nested_object(obj):
+    """
+    return a boolean if we have a nested object, e.g. a Series with 1 or
+    more Series elements
+
+    This may not necessarily be performant.
+
+    """
+
+    if isinstance(obj, ABCSeries) and is_object_dtype(obj):
+
+        if any(isinstance(v, ABCSeries) for v in obj.values):
+            return True
+
+    return False
+
+
 def maybe_downcast_to_dtype(result, dtype):
     """ try to cast to the specified dtype (e.g. convert back to bool/int
     or could be an astype of float64->float32