Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Ensure that the pandas backend can deal with unary operations in groupby #1182

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion ibis/pandas/execution/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ def execute_cast_series_date(op, data, type, **kwargs):
raise TypeError("Don't know how to cast {} to {}".format(from_type, type))


@execute_node.register(ops.Negate, pd.Series)
def execute_series_unary_op_negate(op, data, **kwargs):
    """Negate every value of a pandas Series.

    Object-dtype series are handled one element at a time by dispatching
    ``execute_node`` on each value (forwarding ``kwargs``); any other dtype
    is negated in a single call to ``np.negative``.
    """
    holds_objects = data.dtype == np.dtype(np.object_)
    if not holds_objects:
        return np.negative(data)

    # Bind the op and keyword arguments once; ``apply`` supplies each element.
    negate_element = functools.partial(execute_node, op, **kwargs)
    return data.apply(negate_element)


@execute_node.register(ops.UnaryOp, pd.Series)
def execute_series_unary_op(op, data, **kwargs):
function = getattr(np, type(op).__name__.lower())
Expand Down Expand Up @@ -462,7 +469,29 @@ def execute_binary_op_series_group_by(op, left, right, **kwargs):

@execute_node.register(ops.BinaryOp, SeriesGroupBy, simple_types)
def execute_binary_op_series_gb(op, left, right, **kwargs):
    """Apply a binary operation to a grouped series and a non-grouped operand.

    The operation is evaluated on the series underlying ``left``
    (``left.obj``) together with ``right``, and the result is regrouped
    using the groupings of ``left``.
    """
    # Evaluate exactly once, through ``execute_node``, so that ``kwargs`` are
    # forwarded; a second direct evaluation whose result is discarded would
    # only repeat the work.
    result = execute_node(op, left.obj, right, **kwargs)
    return result.groupby(left.grouper.groupings)


@execute_node.register(ops.UnaryOp, SeriesGroupBy)
def execute_unary_op_series_gb(op, operand, **kwargs):
    """Apply a unary operation to a grouped series.

    The operation is dispatched via ``execute_node`` on the series
    underlying the group-by (``operand.obj``), forwarding ``kwargs``; the
    outcome is then regrouped with the groupings of ``operand``.
    """
    ungrouped_result = execute_node(op, operand.obj, **kwargs)
    original_groupings = operand.grouper.groupings
    return ungrouped_result.groupby(original_groupings)


@execute_node.register(
    (ops.Log, ops.Round),
    SeriesGroupBy,
    (numbers.Real, decimal.Decimal, type(None)),
)
def execute_log_series_gb_others(op, left, right, **kwargs):
    """Apply ``Log`` or ``Round`` to a grouped series with a scalar argument.

    ``right`` is a real number, a ``Decimal`` or ``None``. The operation is
    dispatched via ``execute_node`` on the series underlying ``left``
    (``left.obj``), forwarding ``kwargs``, and the outcome is regrouped with
    the groupings of ``left``.
    """
    ungrouped_result = execute_node(op, left.obj, right, **kwargs)
    original_groupings = left.grouper.groupings
    return ungrouped_result.groupby(original_groupings)


@execute_node.register((ops.Log, ops.Round), SeriesGroupBy, SeriesGroupBy)
def execute_log_series_gb_series_gb(op, left, right, **kwargs):
    """Apply ``Log`` or ``Round`` when both operands are grouped series.

    Both group-bys are unwrapped to their underlying series (``.obj``)
    before dispatching via ``execute_node`` with ``kwargs`` forwarded; the
    outcome is regrouped with the groupings of ``left`` only.
    """
    ungrouped_result = execute_node(op, left.obj, right.obj, **kwargs)
    original_groupings = left.grouper.groupings
    return ungrouped_result.groupby(original_groupings)


Expand Down
1 change: 1 addition & 0 deletions ibis/pandas/execution/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def df():
'strings_with_space': [' ', 'abab', 'ddeeffgg'],
'int64_with_zeros': [0, 1, 0],
'float64_with_zeros': [1.0, 0.0, 1.0],
'float64_positive': [1.0, 2.0, 1.0],
'strings_with_nulls': ['a', None, 'b'],
'datetime_strings_naive': pd.Series(
pd.date_range(start='2017-01-02 01:02:03.234', periods=3).values,
Expand Down
19 changes: 10 additions & 9 deletions ibis/pandas/execution/tests/test_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,20 @@ def test_round(t, df, places):
@pytest.mark.parametrize(
('ibis_func', 'pandas_func'),
[
(methodcaller('round'), methodcaller('round')),
(methodcaller('round', 2), methodcaller('round', 2)),
(methodcaller('round', -2), methodcaller('round', -2)),
(methodcaller('round', 0), methodcaller('round', 0)),
(methodcaller('abs'), np.abs),
(methodcaller('ceil'), np.ceil),
(methodcaller('floor'), np.floor),
(methodcaller('exp'), np.exp),
(methodcaller('sign'), np.sign),
(methodcaller('sqrt'), np.sqrt),
(methodcaller('log', 2), lambda x: np.log(x) / np.log(2)),
(methodcaller('floor'), np.floor),
(methodcaller('ln'), np.log),
(methodcaller('log2'), np.log2),
(methodcaller('log10'), np.log10),
(methodcaller('log', 2), lambda x: np.log(x) / np.log(2)),
(methodcaller('log2'), np.log2),
(methodcaller('round', 0), methodcaller('round', 0)),
(methodcaller('round', -2), methodcaller('round', -2)),
(methodcaller('round', 2), methodcaller('round', 2)),
(methodcaller('round'), methodcaller('round')),
(methodcaller('sign'), np.sign),
(methodcaller('sqrt'), np.sqrt),
]
)
def test_math_functions(t, df, ibis_func, pandas_func):
Expand Down
40 changes: 39 additions & 1 deletion ibis/pandas/execution/tests/test_operations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import operator

from operator import methodcaller

import pytest

import numpy as np
Expand Down Expand Up @@ -93,11 +96,34 @@ def test_mutate(t, df):
lambda t: (t.dup_strings == 'd') | (t.plain_int64 < 100),
]
)
def test_aggregation_group_by(t, df, where):
@pytest.mark.parametrize(
('ibis_func', 'pandas_func'),
[
(methodcaller('abs'), np.abs),
(methodcaller('ceil'), np.ceil),
(methodcaller('exp'), np.exp),
(methodcaller('floor'), np.floor),
(methodcaller('ln'), np.log),
(methodcaller('log10'), np.log10),
(methodcaller('log', 2), lambda x: np.log(x) / np.log(2)),
(methodcaller('log2'), np.log2),
(methodcaller('round', 0), methodcaller('round', 0)),
(methodcaller('round', -2), methodcaller('round', -2)),
(methodcaller('round', 2), methodcaller('round', 2)),
(methodcaller('round'), methodcaller('round')),
(methodcaller('sign'), np.sign),
(methodcaller('sqrt'), np.sqrt),
]
)
def test_aggregation_group_by(t, df, where, ibis_func, pandas_func):
ibis_where = where(t)
expr = t.group_by(t.dup_strings).aggregate(
avg_plain_int64=t.plain_int64.mean(where=ibis_where),
sum_plain_float64=t.plain_float64.sum(where=ibis_where),
mean_float64_positive=ibis_func(
t.float64_positive
).mean(where=ibis_where),
neg_mean_int64_with_zeros=(-t.int64_with_zeros).mean(where=ibis_where),
nunique_dup_ints=t.dup_ints.nunique(),
)
result = expr.execute()
Expand All @@ -108,16 +134,28 @@ def test_aggregation_group_by(t, df, where):
'plain_int64': lambda x, mask=mask: x[mask].mean(),
'plain_float64': lambda x, mask=mask: x[mask].sum(),
'dup_ints': 'nunique',
'float64_positive': (
lambda x, mask=mask, func=pandas_func: func(x[mask]).mean()
),
'int64_with_zeros': lambda x, mask=mask: (-x[mask]).mean(),
}).reset_index().rename(
columns={
'plain_int64': 'avg_plain_int64',
'plain_float64': 'sum_plain_float64',
'dup_ints': 'nunique_dup_ints',
'float64_positive': 'mean_float64_positive',
'int64_with_zeros': 'neg_mean_int64_with_zeros',
}
)
# TODO(phillipc): Why does pandas not return floating point values here?
expected['avg_plain_int64'] = expected.avg_plain_int64.astype('float64')
result['avg_plain_int64'] = result.avg_plain_int64.astype('float64')
expected['neg_mean_int64_with_zeros'] = (
expected.neg_mean_int64_with_zeros.astype('float64')
)
result['neg_mean_int64_with_zeros'] = (
result.neg_mean_int64_with_zeros.astype('float64')
)
tm.assert_frame_equal(result[expected.columns], expected)


Expand Down