Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix pandas backends to have inclusive preceding window bound #2009

Merged
merged 6 commits into from Nov 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/release.rst
Expand Up @@ -7,6 +7,8 @@ Release Notes
These release notes are for versions of ibis **1.0 and later**. Release
notes for pre-1.0 versions of ibis can be found at :doc:`/release-pre-1.0`

* :release:`1.2.1 <pending>`
* :bug:`2009` Fix pandas backend to treat trailing_window preceding arg as window bound rather than window size (e.g. preceding=0 now indicates current row rather than window size 0)
* :release:`1.2.0 <2019-06-24>`
* :feature:`1836` Add new geospatial functions to OmniSciDB backend
* :support:`1847` Skip SQLAlchemy backend tests in connect method in backends.py
Expand Down
3 changes: 2 additions & 1 deletion ibis/expr/window.py
Expand Up @@ -420,7 +420,8 @@ def trailing_window(preceding, group_by=None, order_by=None):
preceding : int, float or expression of intervals, i.e.
ibis.interval(days=1) + ibis.interval(hours=5)
Int indicates number of trailing rows to include;
0 includes only the current row.
0 includes only the current row, 1 includes the current row and one
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to update for trailing_range_window as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trailing_range_window is a time range window and not a row-based window so this doesn't apply.

preceding row.
Interval indicates a trailing range window.
group_by : expressions, default None
Either specify here or with TableExpr.group_by
Expand Down
14 changes: 8 additions & 6 deletions ibis/pandas/aggcontext.py
Expand Up @@ -297,7 +297,14 @@ def compute_window_spec(dtype, obj):

@compute_window_spec.register(type(None))
def compute_window_spec_none(_, obj):
return obj
"""Helper method only used for row-based windows:

Window spec in ibis is an inclusive window bound. A bound of 0 indicates
the current row.
Window spec in Pandas indicates window size. Therefore, we must add 1
to the ibis window bound to get the expected behavior.
"""
return obj + 1
jreback marked this conversation as resolved.
Show resolved Hide resolved


@compute_window_spec.register(dt.Interval)
Expand All @@ -306,11 +313,6 @@ def compute_window_spec_interval(_, expr):
return pd.tseries.frequencies.to_offset(value)


@compute_window_spec.register(dt.DataType)
def compute_window_spec_expr(_, expr):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this function should ever be called as Ibis throws an input exception if preceding is not an integer or an Ibis interval.

return ibis.pandas.execute(expr)


class Window(AggregationContext):
__slots__ = ('construct_window',)

Expand Down
4 changes: 2 additions & 2 deletions ibis/pandas/execution/tests/test_window.py
Expand Up @@ -288,7 +288,7 @@ def test_batting_rolling(batting, batting_df, sort_kind):
more_values = (
batting_df[columns]
.sort_values('yearID', kind=sort_kind)
.G.rolling(5, min_periods=1)
.G.rolling(6, min_periods=1)
.sum()
.astype('int64')
)
Expand All @@ -311,7 +311,7 @@ def test_batting_rolling_partitioned(batting, batting_df, sort_kind):
batting_df[columns]
.set_index(order_by)
.groupby(group_by)
.G.rolling(3, min_periods=1)
.G.rolling(4, min_periods=1)
.sum()
.rename('rolled')
)
Expand Down
4 changes: 2 additions & 2 deletions ibis/pandas/tests/test_udf.py
Expand Up @@ -264,7 +264,7 @@ def my_mean(series):
result = expr.execute().sort_values(['key', 'a'])
expected = df.sort_values(['key', 'a']).assign(
rolled=lambda df: df.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
Expand All @@ -286,7 +286,7 @@ def test_udaf_window_nan():
result = expr.execute().sort_values(['key', 'a'])
expected = df.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
Expand Down
6 changes: 3 additions & 3 deletions ibis/spark/tests/test_udf.py
Expand Up @@ -305,7 +305,7 @@ def my_mean(series):
result = expr.execute()
expected = df_random.sort_values(['key', 'a']).assign(
rolled=lambda df: df.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
Expand All @@ -323,7 +323,7 @@ def test_udaf_window_nan(con, t_nan, df_nan):
result = expr.execute()
expected = df_nan.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
Expand All @@ -338,7 +338,7 @@ def test_udaf_window_null(con, t_null, df_null):
result = expr.execute()
expected = df_null.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
Expand Down
32 changes: 22 additions & 10 deletions ibis/tests/all/test_window.py
Expand Up @@ -245,22 +245,34 @@ def test_bounded_following_window(backend, alltypes, df, con):
backend.assert_series_equal(left, right)


# TODO (ISSUE #2000): fix Csv, Pandas, and Parquet backends to have
# inclusive preceding window boundary
@pytest.mark.xfail_backends([Csv, Pandas, Parquet])
@pytest.mark.parametrize(
'window_fn',
[
param(
lambda t: ibis.window(
preceding=2,
following=0,
group_by=[t.string_col],
order_by=[t.id],
),
id='preceding-2-following-0',
),
param(
lambda t: ibis.trailing_window(
preceding=2, group_by=[t.string_col], order_by=[t.id]
),
id='trailing-2',
),
],
)
@pytest.mark.xfail_unsupported
def test_bounded_preceding_window(backend, alltypes, df, con):
def test_bounded_preceding_windows(backend, alltypes, df, con, window_fn):
if not backend.supports_window_operations:
pytest.skip(
'Backend {} does not support window operations'.format(backend)
)

window = ibis.window(
preceding=2,
following=0,
group_by=[alltypes.string_col],
order_by=[alltypes.id],
)
window = window_fn(alltypes)

expr = alltypes.mutate(val=alltypes.double_col.sum().over(window))

Expand Down