Skip to content

Commit

Permalink
test(backends): add distinct with parameters test
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Mar 24, 2023
1 parent 3720ea5 commit 4f4d962
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 2 deletions.
150 changes: 150 additions & 0 deletions ibis/backends/tests/test_generic.py
Expand Up @@ -995,3 +995,153 @@ def test_pivot_longer(backend):
)
df = res.limit(5).execute()
assert not df.empty


@pytest.mark.parametrize(
    "on",
    [
        param(
            ["cut"],
            marks=[
                pytest.mark.notimpl(
                    ["mssql", "mysql"], raises=com.OperationNotDefinedError
                ),
            ],
            id="one",
        ),
        param(
            ["clarity", "cut"],
            marks=[
                pytest.mark.notimpl(
                    ["mssql", "mysql"], raises=com.OperationNotDefinedError
                ),
            ],
            id="many",
        ),
    ],
)
@pytest.mark.parametrize(
    "keep",
    [
        param(
            "first",
            marks=pytest.mark.notimpl(
                ["trino"],
                raises=AssertionError,
                reason="trino is more arbitrary than other backends",
                strict=False,
            ),
        ),
        param(
            "last",
            marks=[
                pytest.mark.notimpl(
                    ["bigquery", "snowflake"],
                    raises=com.UnsupportedOperationError,
                    reason="backend doesn't support last argument to arbitrary",
                ),
                pytest.mark.notimpl(
                    ["trino"],
                    raises=AssertionError,
                    reason="trino is more arbitrary than other backends",
                    strict=False,
                ),
            ],
        ),
    ],
)
@pytest.mark.notimpl(
    ["druid", "impala"],
    raises=(NotImplementedError, sa.exc.ProgrammingError, com.OperationNotDefinedError),
    reason="arbitrary not implemented in the backend",
)
@pytest.mark.notimpl(
    ["dask", "datafusion", "polars"],
    raises=com.OperationNotDefinedError,
    reason="backend doesn't implement window functions",
)
@pytest.mark.notimpl(
    ["pandas"],
    raises=com.OperationNotDefinedError,
    reason="backend doesn't implement ops.WindowFunction",
)
def test_distinct_on_keep(backend, on, keep):
    """Compare ``distinct(on=..., keep=...)`` with pandas ``drop_duplicates``.

    The backend result for ``keep="first"`` / ``keep="last"`` must equal what
    pandas produces from the same table with ``drop_duplicates(subset=on,
    keep=keep)``. The ``keep=None`` case is exercised separately by
    ``test_distinct_on_keep_is_none``.

    Parameters
    ----------
    backend
        Backend test fixture providing the ``diamonds`` table, ``name()`` and
        ``assert_frame_equal``.
    on
        Column names to deduplicate on.
    keep
        Which duplicate to retain; parametrized as ``"first"`` or ``"last"``.
    """
    from ibis import _

    # Tag every row with a row number computed over a constant ordering key,
    # giving both the backend result and the pandas frame a shared ``idx``
    # column to sort by before they are compared.
    t = backend.diamonds.mutate(one=ibis.literal(1)).mutate(
        idx=ibis.row_number().over(order_by=_.one, rows=(None, 0))
    )

    # NOTE(review): presumably these backends need the intermediate table
    # materialized so ``idx`` is identical across the two executions below;
    # confirm against the backend behavior.
    if backend.name() in ("mysql", "impala"):
        t = t.cache()

    expr = t.distinct(on=on, keep=keep).order_by(ibis.asc("idx"))
    result = expr.execute()

    df = t.execute()
    # ``keep`` is always "first" or "last" here, so it is passed to pandas
    # as-is; no ``None`` -> ``False`` translation is needed in this test.
    expected = (
        df.drop_duplicates(subset=on, keep=keep)
        .sort_values(by=["idx"])
        .reset_index(drop=True)
    )
    backend.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "on",
    [
        param(
            ["cut"],
            marks=[
                pytest.mark.notimpl(
                    ["mssql", "mysql"], raises=com.OperationNotDefinedError
                ),
            ],
            id="one",
        ),
        param(
            ["clarity", "cut"],
            marks=[
                pytest.mark.notimpl(
                    ["mssql", "mysql"], raises=com.OperationNotDefinedError
                ),
            ],
            id="many",
        ),
    ],
)
@pytest.mark.notimpl(
    ["druid", "impala"],
    raises=(NotImplementedError, sa.exc.ProgrammingError, com.OperationNotDefinedError),
    reason="arbitrary not implemented in the backend",
)
@pytest.mark.notimpl(
    ["dask", "datafusion", "polars"],
    raises=com.OperationNotDefinedError,
    reason="backend doesn't implement window functions",
)
@pytest.mark.notimpl(
    ["pandas"],
    raises=com.OperationNotDefinedError,
    reason="backend doesn't implement ops.WindowFunction",
)
def test_distinct_on_keep_is_none(backend, on):
    """Compare ``distinct(on=..., keep=None)`` with pandas ``keep=False``.

    The backend result must equal the frame pandas returns from
    ``drop_duplicates(subset=on, keep=False)`` on the same table, once both
    are ordered by the synthetic ``idx`` column.
    """
    # Add a constant column, then a row number over it, so that both sides
    # of the comparison share an ``idx`` column to sort on.
    indexed = backend.diamonds.mutate(one=ibis.literal(1)).mutate(
        idx=ibis.row_number().over(order_by=ibis._.one, rows=(None, 0))
    )

    # mysql and impala run against a cached copy of the table.
    if backend.name() in ("mysql", "impala"):
        indexed = indexed.cache()

    deduped = indexed.distinct(on=on, keep=None).order_by(ibis.asc("idx"))
    result = deduped.execute()

    full_frame = indexed.execute()
    without_duplicates = full_frame.drop_duplicates(subset=on, keep=False)
    expected = without_duplicates.sort_values(by=["idx"]).reset_index(drop=True)

    backend.assert_frame_equal(result, expected)
4 changes: 4 additions & 0 deletions ibis/expr/types/relations.py
Expand Up @@ -1053,6 +1053,10 @@ def distinct(

if on is None:
# dedup everything
if keep != "first":
raise com.IbisError(
f"Only keep='first' (the default) makes sense when deduplicating all columns; got keep={keep!r}"
)
return ops.Distinct(self).to_expr()

if not isinstance(on, s.Selector):
Expand Down
15 changes: 13 additions & 2 deletions ibis/tests/expr/test_table.py
Expand Up @@ -1812,9 +1812,20 @@ def test_pivot_longer_no_match():


def test_invalid_deferred():
    """``ibis.greatest`` rejects a deferred argument with ``IbisTypeError``."""
    import ibis

    schema = {"value": "int", "lagged_value": "int"}
    table = ibis.table(schema, name="t")

    expected_message = "Deferred input is not allowed"
    with pytest.raises(com.IbisTypeError, match=expected_message):
        ibis.greatest(table.value, ibis._.lagged_value)


@pytest.mark.parametrize("keep", ["last", None])
def test_invalid_distinct(keep):
    """``distinct`` without ``on`` raises ``IbisError`` for these ``keep`` values."""
    table = ibis.table({"a": "int"}, name="t")

    expected_message = "Only keep='first'"
    with pytest.raises(com.IbisError, match=expected_message):
        table.distinct(keep=keep)


def test_invalid_keep_distinct():
    """``distinct(on=..., keep="invalid")`` raises ``IbisError``."""
    table = ibis.table({"a": "int", "b": "string"}, name="t")

    expected_message = "Invalid value for `keep`:"
    with pytest.raises(com.IbisError, match=expected_message):
        table.distinct(on="a", keep="invalid")

0 comments on commit 4f4d962

Please sign in to comment.