Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix pandas backends to have inclusive preceding window bound #2009

Merged
merged 6 commits into from Nov 1, 2019
@@ -297,7 +297,11 @@ def compute_window_spec(dtype, obj):

@compute_window_spec.register(type(None))
def compute_window_spec_none(_, obj):
return obj
# Window spec in ibis is an inclusive window bound. A bound of 0 indicates
# the current row.
# Window spec in Pandas indicates window size. Therefore, we must add 1
# to the ibis window bound to get the expected behavior.
return obj + 1
This conversation was marked as resolved by jreback

This comment has been minimized.

Copy link
@icexelloss

icexelloss Oct 24, 2019

Collaborator

This only works with row window right?

This comment has been minimized.

Copy link
@hjoo

hjoo Oct 24, 2019

Author Contributor

Yep, this is only used for row-based windows. There is another method on line 310 that handles windows defined by a time interval.

Added a comment here to clarify.

This comment has been minimized.

Copy link
@xmnlab

xmnlab Oct 25, 2019

Collaborator

@hjoo thanks for adding more details about this method.
could you provide the docstring using numpy format? https://numpydoc.readthedocs.io/en/latest/format.html

This comment has been minimized.

Copy link
@xmnlab

xmnlab Oct 25, 2019

Collaborator

Thanks @hjoo for the change!

I ran here pydocstyle for you change and it raised these 3 errors:

D400: First line should end with a period (not ':')
D401: First line should be in imperative mood; try rephrasing (found 'Helper')

it means that the first line should wne with a period (.) and should start in a verb in imperative mood like:

  • Do ...
  • Get ...
  • Return ...
  • Create ...


@compute_window_spec.register(dt.Interval)
@@ -306,11 +310,6 @@ def compute_window_spec_interval(_, expr):
return pd.tseries.frequencies.to_offset(value)


@compute_window_spec.register(dt.DataType)
def compute_window_spec_expr(_, expr):

This comment has been minimized.

Copy link
@hjoo

hjoo Oct 24, 2019

Author Contributor

I don't think this function should ever be called as Ibis throws an input exception if preceding is not an integer or an Ibis interval.

return ibis.pandas.execute(expr)


class Window(AggregationContext):
__slots__ = ('construct_window',)

@@ -288,7 +288,7 @@ def test_batting_rolling(batting, batting_df, sort_kind):
more_values = (
batting_df[columns]
.sort_values('yearID', kind=sort_kind)
.G.rolling(5, min_periods=1)
.G.rolling(6, min_periods=1)
.sum()
.astype('int64')
)
@@ -311,7 +311,7 @@ def test_batting_rolling_partitioned(batting, batting_df, sort_kind):
batting_df[columns]
.set_index(order_by)
.groupby(group_by)
.G.rolling(3, min_periods=1)
.G.rolling(4, min_periods=1)
.sum()
.rename('rolled')
)
@@ -264,7 +264,7 @@ def my_mean(series):
result = expr.execute().sort_values(['key', 'a'])
expected = df.sort_values(['key', 'a']).assign(
rolled=lambda df: df.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
@@ -286,7 +286,7 @@ def test_udaf_window_nan():
result = expr.execute().sort_values(['key', 'a'])
expected = df.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
@@ -305,7 +305,7 @@ def my_mean(series):
result = expr.execute()
expected = df_random.sort_values(['key', 'a']).assign(
rolled=lambda df: df.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
@@ -323,7 +323,7 @@ def test_udaf_window_nan(con, t_nan, df_nan):
result = expr.execute()
expected = df_nan.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
@@ -338,7 +338,7 @@ def test_udaf_window_null(con, t_null, df_null):
result = expr.execute()
expected = df_null.sort_values(['key', 'a']).assign(
rolled=lambda d: d.groupby('key')
.b.rolling(2, min_periods=1)
.b.rolling(3, min_periods=1)
.mean()
.reset_index(level=0, drop=True)
)
@@ -245,22 +245,36 @@ def test_bounded_following_window(backend, alltypes, df, con):
backend.assert_series_equal(left, right)


# TODO (ISSUE #2000): fix Csv, Pandas, and Parquet backends to have
# inclusive preceding window boundary
@pytest.mark.xfail_backends([Csv, Pandas, Parquet])
@pytest.mark.parametrize(
'window_fn',
[
param(
lambda t: ibis.window(
preceding=2,
following=0,
group_by=[t.string_col],
order_by=[t.id]
),
id='preceding-2-following-0',
),
param(
lambda t: ibis.trailing_window(
preceding=2,
group_by=[t.string_col],
order_by=[t.id]
),
id='trailing-2',
),
]
)
@pytest.mark.xfail_unsupported
def test_bounded_preceding_window(backend, alltypes, df, con):
def test_bounded_preceding_windows(backend, alltypes, df, con, window_fn):
if not backend.supports_window_operations:
pytest.skip(
'Backend {} does not support window operations'.format(backend)
)

window = ibis.window(
preceding=2,
following=0,
group_by=[alltypes.string_col],
order_by=[alltypes.id],
)
window = window_fn(alltypes)

expr = alltypes.mutate(val=alltypes.double_col.sum().over(window))

ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.