Cythonized GroupBy pct_change (pandas-dev#19919)

harisbal · Mar 10, 2018 · 52cffa3 · 52cffa3
1 parent da6f827
commit 52cffa3
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 55 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -795,6 +795,7 @@ Performance Improvements
 - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`)
+- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`)
 
 .. _whatsnew_0230.docs:
 

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2044,6 +2044,23 @@ def shift(self, periods=1, freq=None, axis=0):
                                            result_is_index=True,
                                            periods=periods)
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
+                   axis=0):
+        """Calcuate pct_change of each value to previous entry in group"""
+        if freq is not None or axis != 0:
+            return self.apply(lambda x: x.pct_change(periods=periods,
+                                                     fill_method=fill_method,
+                                                     limit=limit, freq=freq,
+                                                     axis=axis))
+
+        filled = getattr(self, fill_method)(limit=limit).drop(
+            self.grouper.names, axis=1)
+        shifted = filled.shift(periods=periods, freq=freq)
+
+        return (filled / shifted) - 1
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def head(self, n=5):
@@ -3884,6 +3901,13 @@ def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)
 
+    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
+        """Calculate percent change of each value to previous entry in group"""
+        filled = getattr(self, fill_method)(limit=limit)
+        shifted = filled.shift(periods=periods, freq=freq)
+
+        return (filled / shifted) - 1
+
 
 class NDFrameGroupBy(GroupBy):
 

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2062,61 +2062,6 @@ def test_rank_object_raises(self, ties_method, ascending, na_option,
                                    ascending=ascending,
                                    na_option=na_option, pct=pct)
 
-    @pytest.mark.parametrize("mix_groupings", [True, False])
-    @pytest.mark.parametrize("as_series", [True, False])
-    @pytest.mark.parametrize("val1,val2", [
-        ('foo', 'bar'), (1, 2), (1., 2.)])
-    @pytest.mark.parametrize("fill_method,limit,exp_vals", [
-        ("ffill", None,
-         [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
-        ("ffill", 1,
-         [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
-        ("bfill", None,
-         ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
-        ("bfill", 1,
-         [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
-    ])
-    def test_group_fill_methods(self, mix_groupings, as_series, val1, val2,
-                                fill_method, limit, exp_vals):
-        vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
-        _exp_vals = list(exp_vals)
-        # Overwrite placeholder values
-        for index, exp_val in enumerate(_exp_vals):
-            if exp_val == 'val1':
-                _exp_vals[index] = val1
-            elif exp_val == 'val2':
-                _exp_vals[index] = val2
-
-        # Need to modify values and expectations depending on the
-        # Series / DataFrame that we ultimately want to generate
-        if mix_groupings:  # ['a', 'b', 'a, 'b', ...]
-            keys = ['a', 'b'] * len(vals)
-
-            def interweave(list_obj):
-                temp = list()
-                for x in list_obj:
-                    temp.extend([x, x])
-
-                return temp
-
-            _exp_vals = interweave(_exp_vals)
-            vals = interweave(vals)
-        else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
-            keys = ['a'] * len(vals) + ['b'] * len(vals)
-            _exp_vals = _exp_vals * 2
-            vals = vals * 2
-
-        df = DataFrame({'key': keys, 'val': vals})
-        if as_series:
-            result = getattr(
-                df.groupby('key')['val'], fill_method)(limit=limit)
-            exp = Series(_exp_vals, name='val')
-            assert_series_equal(result, exp)
-        else:
-            result = getattr(df.groupby('key'), fill_method)(limit=limit)
-            exp = DataFrame({'key': keys, 'val': _exp_vals})
-            assert_frame_equal(result, exp)
-
     @pytest.mark.parametrize("agg_func", ['any', 'all'])
     @pytest.mark.parametrize("skipna", [True, False])
     @pytest.mark.parametrize("vals", [

diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
@@ -636,3 +636,90 @@ def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func):
             exp = exp.astype('float')
 
         comp_func(result, exp)
+
+    @pytest.mark.parametrize("mix_groupings", [True, False])
+    @pytest.mark.parametrize("as_series", [True, False])
+    @pytest.mark.parametrize("val1,val2", [
+        ('foo', 'bar'), (1, 2), (1., 2.)])
+    @pytest.mark.parametrize("fill_method,limit,exp_vals", [
+        ("ffill", None,
+         [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
+        ("ffill", 1,
+         [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
+        ("bfill", None,
+         ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
+        ("bfill", 1,
+         [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
+    ])
+    def test_group_fill_methods(self, mix_groupings, as_series, val1, val2,
+                                fill_method, limit, exp_vals):
+        vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
+        _exp_vals = list(exp_vals)
+        # Overwrite placeholder values
+        for index, exp_val in enumerate(_exp_vals):
+            if exp_val == 'val1':
+                _exp_vals[index] = val1
+            elif exp_val == 'val2':
+                _exp_vals[index] = val2
+
+        # Need to modify values and expectations depending on the
+        # Series / DataFrame that we ultimately want to generate
+        if mix_groupings:  # ['a', 'b', 'a, 'b', ...]
+            keys = ['a', 'b'] * len(vals)
+
+            def interweave(list_obj):
+                temp = list()
+                for x in list_obj:
+                    temp.extend([x, x])
+
+                return temp
+
+            _exp_vals = interweave(_exp_vals)
+            vals = interweave(vals)
+        else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
+            keys = ['a'] * len(vals) + ['b'] * len(vals)
+            _exp_vals = _exp_vals * 2
+            vals = vals * 2
+
+        df = DataFrame({'key': keys, 'val': vals})
+        if as_series:
+            result = getattr(
+                df.groupby('key')['val'], fill_method)(limit=limit)
+            exp = Series(_exp_vals, name='val')
+            assert_series_equal(result, exp)
+        else:
+            result = getattr(df.groupby('key'), fill_method)(limit=limit)
+            exp = DataFrame({'key': keys, 'val': _exp_vals})
+            assert_frame_equal(result, exp)
+
+    @pytest.mark.parametrize("test_series", [True, False])
+    @pytest.mark.parametrize("periods,fill_method,limit", [
+        (1, 'ffill', None), (1, 'ffill', 1),
+        (1, 'bfill', None), (1, 'bfill', 1),
+        (-1, 'ffill', None), (-1, 'ffill', 1),
+        (-1, 'bfill', None), (-1, 'bfill', 1)])
+    def test_pct_change(self, test_series, periods, fill_method, limit):
+        vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
+        exp_vals = Series(vals).pct_change(periods=periods,
+                                           fill_method=fill_method,
+                                           limit=limit).tolist()
+
+        df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
+                        'vals': vals * 2})
+        grp = df.groupby('key')
+
+        def get_result(grp_obj):
+            return grp_obj.pct_change(periods=periods,
+                                      fill_method=fill_method,
+                                      limit=limit)
+
+        if test_series:
+            exp = pd.Series(exp_vals * 2)
+            exp.name = 'vals'
+            grp = grp['vals']
+            result = get_result(grp)
+            tm.assert_series_equal(result, exp)
+        else:
+            exp = DataFrame({'vals': exp_vals * 2})
+            result = get_result(grp)
+            tm.assert_frame_equal(result, exp)