Skip to content

Commit

Permalink
Implement Mean and Sum for Boolean Columns and Implement approx_nunique and summary in BQ
Browse files Browse the repository at this point in the history

Closes #1516

Author: Scott Lustig <scott.lustig@twosigma.com>

Closes #1517 from missing-semicolon/reduction and squashes the following commits:

80a1672 [Scott Lustig] Implement approx_nunique for BigQuery
c40b03a [Scott Lustig] Implement test for summary and approx_nunique
65680ff [Scott Lustig] Handle where argument when casting boolean columns
3610e67 [Scott Lustig] Add tests with where statement
96b2548 [Scott Lustig] Rewrite Mean and Sum for booleans
599b549 [Scott Lustig] Implement tests
  • Loading branch information
missing-semicolon authored and cpcloud committed Jul 10, 2018
1 parent 15518da commit 24b4b72
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 1 deletion.
23 changes: 22 additions & 1 deletion ibis/bigquery/compiler.py
Expand Up @@ -23,7 +23,7 @@
import ibis.expr.lineage as lin

from ibis.impala.compiler import (
ImpalaSelect, unary, fixed_arity, ImpalaTableSetFormatter
ImpalaSelect, unary, fixed_arity, ImpalaTableSetFormatter, _reduction
)
from ibis.impala import compiler as impala_compiler

Expand Down Expand Up @@ -397,6 +397,7 @@ def _formatter(translator, expr):
ops.ArrayIndex: _array_index,
ops.ArrayLength: unary('ARRAY_LENGTH'),

ops.HLLCardinality: _reduction('APPROX_COUNT_DISTINCT'),
ops.Log: _log,
ops.Sign: unary('SIGN'),
ops.Modulus: fixed_arity('MOD', 2),
Expand Down Expand Up @@ -558,6 +559,26 @@ def log2(expr):
return arg.log(2)


@rewrites(ops.Sum)
def bq_sum(expr):
    """Rewrite ``Sum`` over a boolean column as a sum of its int64 cast.

    The operand and the ``where`` filter are taken from the first two args
    of the operation. When the operand is not a boolean column the
    expression is handed back unchanged.
    """
    operation = expr.op()
    operand, predicate = operation.args[0], operation.args[1]
    if not isinstance(operand, ir.BooleanColumn):
        return expr
    # NOTE(review): presumably BigQuery's SUM rejects BOOL input, hence the
    # cast -- confirm against the BigQuery function reference.
    as_integer = operand.cast('int64')
    return as_integer.sum(where=predicate)


@rewrites(ops.Mean)
def bq_mean(expr):
    """Rewrite ``Mean`` over a boolean column as a mean of its int64 cast.

    The operand and the ``where`` filter are taken from the first two args
    of the operation. When the operand is not a boolean column the
    expression is handed back unchanged.
    """
    operation = expr.op()
    operand, predicate = operation.args[0], operation.args[1]
    if not isinstance(operand, ir.BooleanColumn):
        return expr
    # NOTE(review): presumably BigQuery's AVG rejects BOOL input, hence the
    # cast -- confirm against the BigQuery function reference.
    as_integer = operand.cast('int64')
    return as_integer.mean(where=predicate)


UNIT_FUNCS = {
's': 'SECONDS',
'ms': 'MILLIS',
Expand Down
16 changes: 16 additions & 0 deletions ibis/bigquery/tests/test_client.py
Expand Up @@ -604,3 +604,19 @@ def test_day_of_week(client, case, dtype):
expr_name = date_var.day_of_week.full_name()
result = client.execute(expr_name)
assert result == 'Sunday'


def test_boolean_reducers(alltypes):
    """Mean and sum of a boolean column execute to float64 and int64.

    ``isinstance`` is used instead of comparing ``type(...)`` with ``==``,
    and the actual type is attached as the assertion message so a failure
    reports what came back.
    """
    bool_col = alltypes.bool_col

    bool_avg = bool_col.mean().execute()
    assert isinstance(bool_avg, np.float64), type(bool_avg)

    bool_sum = bool_col.sum().execute()
    assert isinstance(bool_sum, np.int64), type(bool_sum)


def test_column_summary(alltypes):
    """Executing a boolean column's summary gives shape (1, 7), length 1."""
    summary_expr = alltypes.bool_col.summary()
    executed = summary_expr.execute()
    expected_shape = (1, 7)
    assert executed.shape == expected_shape
    assert len(executed) == 1
54 changes: 54 additions & 0 deletions ibis/bigquery/tests/test_compiler.py
Expand Up @@ -384,3 +384,57 @@ def test_projection_fusion_only_peeks_at_immediate_parent():
FROM t3
CROSS JOIN t3 t4"""
assert result == expected


def test_bool_reducers(alltypes):
    """Check compiled SQL for plain mean and sum of a boolean column."""
    bool_col = alltypes.bool_col

    # Both reducers are expected to wrap the column in CAST(... AS INT64).
    mean_sql = bool_col.mean().compile()
    expected_mean = """\
SELECT avg(CAST(`bool_col` AS INT64)) AS `mean`
FROM `ibis-gbq.testing.functional_alltypes`"""
    assert mean_sql == expected_mean

    sum_sql = bool_col.sum().compile()
    expected_sum = """\
SELECT sum(CAST(`bool_col` AS INT64)) AS `sum`
FROM `ibis-gbq.testing.functional_alltypes`"""
    assert sum_sql == expected_sum


def test_bool_reducers_where(alltypes):
    """Check compiled SQL for filtered mean and sum of a boolean column."""
    bool_col = alltypes.bool_col
    month = alltypes.month

    # Single-condition filter: the cast is expected inside the CASE branch.
    mean_sql = bool_col.mean(where=month > 6).compile()
    expected_mean = """\
SELECT avg(CASE WHEN `month` > 6 THEN CAST(`bool_col` AS INT64) ELSE NULL END) AS `mean`
FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
    assert mean_sql == expected_mean

    # Compound filter: both comparisons are expected to be joined with AND.
    month_window = (month > 6) & (month < 10)
    sum_sql = bool_col.sum(where=month_window).compile()
    expected_sum = """\
SELECT sum(CASE WHEN (`month` > 6) AND (`month` < 10) THEN CAST(`bool_col` AS INT64) ELSE NULL END) AS `sum`
FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
    assert sum_sql == expected_sum


def test_approx_nunique(alltypes):
    """Check compiled SQL for ``approx_nunique`` with and without a filter."""
    # Unfiltered: the column is passed straight to APPROX_COUNT_DISTINCT.
    plain_sql = alltypes.double_col.approx_nunique().compile()
    expected_plain = """\
SELECT APPROX_COUNT_DISTINCT(`double_col`) AS `approx_nunique`
FROM `ibis-gbq.testing.functional_alltypes`"""
    assert plain_sql == expected_plain

    # Filtered: the where clause is expected as a CASE around the column.
    bool_col = alltypes.bool_col
    month = alltypes.month
    filtered_sql = bool_col.approx_nunique(where=month > 6).compile()
    expected_filtered = """\
SELECT APPROX_COUNT_DISTINCT(CASE WHEN `month` > 6 THEN `bool_col` ELSE NULL END) AS `approx_nunique`
FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
    assert filtered_sql == expected_filtered

0 comments on commit 24b4b72

Please sign in to comment.