API: rolling.apply will pass Series to function

closes pandas-dev#5071
jreback · Apr 15, 2018 · 5c5abe5 · 5c5abe5
1 parent b16974a
commit 5c5abe5
Show file tree

Hide file tree

Showing 4 changed files with 477 additions and 306 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -65,6 +65,35 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp
    pd.get_dummies(df, columns=['c'], dtype=bool).dtypes
 
 
+.. _whatsnew_0230.enhancements.window_raw:
+
+Rolling/Expanding.apply() accepts a ``raw`` keyword to pass a ``Series`` to the function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
+:func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have gained a ``raw=None`` parameter.
+This is similar to :func:`DataFrame.apply`. This parameter, if ``True``, allows one to send a ``np.ndarray`` to the applied function. If ``False``, a ``Series`` will be passed. The
+default is ``None``, which preserves backward compatibility, so this will default to ``True``, sending an ``np.ndarray``.
+In a future version the default will be changed to ``False``, sending a ``Series``. (:issue:`5071`)
+
+.. ipython:: python
+
+   s = pd.Series(np.arange(5), np.arange(5) + 1)
+   s
+
+Pass a ``Series``:
+
+.. ipython:: python
+
+   s.rolling(2, min_periods=1).apply(lambda x: x.iloc[-1], raw=False)
+
+Mimic the original behavior of passing an ndarray:
+
+.. ipython:: python
+
+   s.rolling(2, min_periods=1).apply(lambda x: x[-1], raw=True)
+
+
 .. _whatsnew_0230.enhancements.merge_on_columns_and_levels:
 
 Merging on a combination of columns and index levels
@@ -843,6 +872,9 @@ Deprecations
 - ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`)
 - ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`)
 - The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`).
+- :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
+  :func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` will show a ``FutureWarning`` if the new
+  ``raw`` parameter is not explicitly passed (:issue:`20584`)
 
 .. _whatsnew_0230.prior_deprecations:
 

diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
@@ -1432,39 +1432,44 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
     return output
 
 
-def roll_generic(ndarray[float64_t, cast=True] input,
+def roll_generic(object obj,
                  int64_t win, int64_t minp, object index, object closed,
-                 int offset, object func,
+                 int offset, object func, bint raw,
                  object args, object kwargs):
     cdef:
         ndarray[double_t] output, counts, bufarr
+        ndarray[float64_t, cast=True] arr
         float64_t *buf
         float64_t *oldbuf
         int64_t nobs = 0, i, j, s, e, N
         bint is_variable
         ndarray[int64_t] start, end
 
-    if not input.flags.c_contiguous:
-        input = input.copy('C')
-
-    n = len(input)
+    n = len(obj)
     if n == 0:
-        return input
+        return obj
+
+    arr = np.asarray(obj)
+
+    # ndarray input
+    if raw:
+        if not arr.flags.c_contiguous:
+            arr = arr.copy('C')
 
-    counts = roll_sum(np.concatenate([np.isfinite(input).astype(float),
+    counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float),
                                       np.array([0.] * offset)]),
                       win, minp, index, closed)[offset:]
 
-    start, end, N, win, minp, is_variable = get_window_indexer(input, win,
+    start, end, N, win, minp, is_variable = get_window_indexer(arr, win,
                                                                minp, index,
                                                                closed,
                                                                floor=0)
 
     output = np.empty(N, dtype=float)
 
     if is_variable:
+        # variable window arr or series
 
-        # variable window
         if offset != 0:
             raise ValueError("unable to roll_generic with a non-zero offset")
 
@@ -1473,7 +1478,20 @@ def roll_generic(ndarray[float64_t, cast=True] input,
             e = end[i]
 
             if counts[i] >= minp:
-                output[i] = func(input[s:e], *args, **kwargs)
+                if raw:
+                    output[i] = func(arr[s:e], *args, **kwargs)
+                else:
+                    output[i] = func(obj.iloc[s:e], *args, **kwargs)
+            else:
+                output[i] = NaN
+
+    elif not raw:
+        # series
+        for i from 0 <= i < N:
+            if counts[i] >= minp:
+                sl = slice(int_max(i + offset - win + 1, 0),
+                           int_min(i + offset + 1, N))
+                output[i] = func(obj.iloc[sl], *args, **kwargs)
             else:
                 output[i] = NaN
 
@@ -1482,12 +1500,12 @@ def roll_generic(ndarray[float64_t, cast=True] input,
         # truncated windows at the beginning, through first full-length window
         for i from 0 <= i < (int_min(win, N) - offset):
             if counts[i] >= minp:
-                output[i] = func(input[0: (i + offset + 1)], *args, **kwargs)
+                output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs)
             else:
                 output[i] = NaN
 
         # remaining full-length windows
-        buf = <float64_t *> input.data
+        buf = <float64_t *> arr.data
         bufarr = np.empty(win, dtype=float)
         oldbuf = <float64_t *> bufarr.data
         for i from (win - offset) <= i < (N - offset):
@@ -1502,7 +1520,7 @@ def roll_generic(ndarray[float64_t, cast=True] input,
         # truncated windows at the end
         for i from int_max(N - offset, 0) <= i < N:
             if counts[i] >= minp:
-                output[i] = func(input[int_max(i + offset - win + 1, 0): N],
+                output[i] = func(arr[int_max(i + offset - win + 1, 0): N],
                                  *args,
                                  **kwargs)
             else:

diff --git a/pandas/core/window.py b/pandas/core/window.py
@@ -314,7 +314,7 @@ def _center_window(self, result, window):
     def aggregate(self, arg, *args, **kwargs):
         result, how = self._aggregate(arg, *args, **kwargs)
         if result is None:
-            return self.apply(arg, args=args, kwargs=kwargs)
+            return self.apply(arg, raw=False, args=args, kwargs=kwargs)
         return result
 
     agg = aggregate
@@ -954,23 +954,53 @@ def count(self):
     Parameters
     ----------
     func : function
-        Must produce a single value from an ndarray input
-        \*args and \*\*kwargs are passed to the function""")
+        Must produce a single value from an ndarray input if raw=True
+        or a Series if raw=False
+    raw : bool, default None
+        * ``False`` : passes each row or column as a Series to the
+          function.
+        * ``True`` or ``None`` : the passed function will receive ndarray
+          objects instead.
+          If you are just applying a NumPy reduction function this will
+          achieve much better performance.
+
+        The raw parameter should be passed explicitly; a FutureWarning
+        is shown if it is omitted. In the future raw will default to False.
+
+        .. versionadded:: 0.23.0
+
+    \*args and \*\*kwargs are passed to the function""")
+
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        from pandas import Series
 
-    def apply(self, func, args=(), kwargs={}):
         # TODO: _level is unused?
         _level = kwargs.pop('_level', None)  # noqa
         window = self._get_window()
         offset = _offset(window, self.center)
         index, indexi = self._get_index()
 
+        # raw=None keeps the pre-0.23 behavior (ndarray) but warns; the
+        # default is slated to become False (Series) in a future release
+        if raw is None:
+            warnings.warn(
+                "Currently, 'apply' passes the values as ndarrays to the "
+                "applied function (raw=True). In the future, raw will "
+                "default to False and a Series will be passed instead. "
+                "Pass raw=True or raw=False explicitly to silence "
+                "this warning", FutureWarning, stacklevel=3)
+            raw = True
+
         def f(arg, window, min_periods, closed):
             minp = _use_window(min_periods, window)
-            return _window.roll_generic(arg, window, minp, indexi, closed,
-                                        offset, func, args, kwargs)
+            if not raw:
+                arg = Series(arg, index=self.obj.index)
+            return _window.roll_generic(
+                arg, window, minp, indexi,
+                closed, offset, func, raw, args, kwargs)
 
         return self._apply(f, func, args=args, kwargs=kwargs,
-                           center=False)
+                           center=False, raw=raw)
 
     def sum(self, *args, **kwargs):
         nv.validate_window_func('sum', args, kwargs)
@@ -1498,8 +1528,9 @@ def count(self):
     @Substitution(name='rolling')
     @Appender(_doc_template)
     @Appender(_shared_docs['apply'])
-    def apply(self, func, args=(), kwargs={}):
-        return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        return super(Rolling, self).apply(
+            func, raw=raw, args=args, kwargs=kwargs)
 
     @Substitution(name='rolling')
     @Appender(_shared_docs['sum'])
@@ -1756,8 +1787,9 @@ def count(self, **kwargs):
     @Substitution(name='expanding')
     @Appender(_doc_template)
     @Appender(_shared_docs['apply'])
-    def apply(self, func, args=(), kwargs={}):
-        return super(Expanding, self).apply(func, args=args, kwargs=kwargs)
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        return super(Expanding, self).apply(
+            func, raw=raw, args=args, kwargs=kwargs)
 
     @Substitution(name='expanding')
     @Appender(_shared_docs['sum'])