BUG: Adding skipna as an option to groupby cumsum and cumprod (pandas…

…-dev#19914)
harisbal · Mar 1, 2018 · 5f271eb · 5f271eb
1 parent 072545d
commit 5f271eb
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -927,6 +927,7 @@ Groupby/Resample/Rolling
 - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`)
 - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`)
 - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`)
+- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`)
 
 Sparse
 ^^^^^^

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -139,7 +139,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           float64_t[:, :] values,
                           int64_t[:] labels,
-                          bint is_datetimelike):
+                          bint is_datetimelike,
+                          bint skipna=True):
     """
     Only transforms on axis=0
     """
@@ -163,14 +164,20 @@ def group_cumprod_float64(float64_t[:, :] out,
                 if val == val:
                     accum[lab, j] *= val
                     out[i, j] = accum[lab, j]
+                else:
+                    out[i, j] = NaN
+                    if not skipna:
+                        accum[lab, j] = NaN
+                        break
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  int64_t[:] labels,
-                 is_datetimelike):
+                 is_datetimelike,
+                 bint skipna=True):
     """
     Only transforms on axis=0
     """
@@ -196,6 +203,11 @@ def group_cumsum(numeric[:, :] out,
                     if val == val:
                         accum[lab, j] += val
                         out[i, j] = accum[lab, j]
+                    else:
+                        out[i, j] = NaN
+                        if not skipna:
+                            accum[lab, j] = NaN
+                            break
                 else:
                     accum[lab, j] += val
                     out[i, j] = accum[lab, j]

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1888,7 +1888,8 @@ def rank(self, method='average', ascending=True, na_option='keep',
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
         """Cumulative product for each group"""
-        nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
+        nv.validate_groupby_func('cumprod', args, kwargs,
+                                 ['numeric_only', 'skipna'])
         if axis != 0:
             return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
 
@@ -1898,7 +1899,8 @@ def cumprod(self, axis=0, *args, **kwargs):
     @Appender(_doc_template)
     def cumsum(self, axis=0, *args, **kwargs):
         """Cumulative sum for each group"""
-        nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
+        nv.validate_groupby_func('cumsum', args, kwargs,
+                                 ['numeric_only', 'skipna'])
         if axis != 0:
             return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
 

diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
@@ -498,6 +498,31 @@ def test_cython_transform_series(self, op, args, targop):
             tm.assert_series_equal(expected, getattr(
                 data.groupby(labels), op)(*args))
 
+    @pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
+    @pytest.mark.parametrize("skipna", [False, True])
+    @pytest.mark.parametrize('input, exp', [
+        # When everything is NaN
+        ({'key': ['b'] * 10, 'value': np.nan},
+         pd.Series([np.nan] * 10, name='value')),
+        # When there is a single NaN
+        ({'key': ['b'] * 10 + ['a'] * 2,
+          'value': [3] * 3 + [np.nan] + [3] * 8},
+         {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+          ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
+                              2187., 6561., 19683., 3.0, 9.0],
+          ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+          ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
+                             21., 24., 27., 3.0, 6.0]})])
+    def test_groupby_cum_skipna(self, op, skipna, input, exp):
+        df = pd.DataFrame(input)
+        result = df.groupby('key')['value'].transform(op, skipna=skipna)
+        if isinstance(exp, dict):
+            expected = exp[(op, skipna)]
+        else:
+            expected = exp
+        expected = pd.Series(expected, name='value')
+        tm.assert_series_equal(expected, result)
+
     @pytest.mark.parametrize(
         "op, args, targop",
         [('cumprod', (), lambda x: x.cumprod()),