From 24b4b723d39ab63a1a8503f97920bf663f0c63e9 Mon Sep 17 00:00:00 2001
From: Scott Lustig
Date: Mon, 9 Jul 2018 20:49:09 -0400
Subject: [PATCH] Implement Mean and Sum for Boolean Columns and Implement approx_nunique and summary in BQ

Closes #1516

Author: Scott Lustig

Closes #1517 from missing-semicolon/reduction and squashes the following commits:

80a1672 [Scott Lustig] Implement approx_nunique for BigQuery
c40b03a [Scott Lustig] Implement test for summary and approx_nunique
65680ff [Scott Lustig] Handle where argument when casting boolean columns
3610e67 [Scott Lustig] Add tests with where statement
96b2548 [Scott Lustig] Rewrite Mean and Sum for booleans
599b549 [Scott Lustig] Implement tests
---
 ibis/bigquery/compiler.py            | 23 +++++++++++-
 ibis/bigquery/tests/test_client.py   | 16 +++++++++
 ibis/bigquery/tests/test_compiler.py | 54 ++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/ibis/bigquery/compiler.py b/ibis/bigquery/compiler.py
index addac95d8315..5fbf1940af85 100644
--- a/ibis/bigquery/compiler.py
+++ b/ibis/bigquery/compiler.py
@@ -23,7 +23,7 @@
 import ibis.expr.lineage as lin
 
 from ibis.impala.compiler import (
-    ImpalaSelect, unary, fixed_arity, ImpalaTableSetFormatter
+    ImpalaSelect, unary, fixed_arity, ImpalaTableSetFormatter, _reduction
 )
 from ibis.impala import compiler as impala_compiler
 
@@ -397,6 +397,7 @@ def _formatter(translator, expr):
     ops.ArrayIndex: _array_index,
     ops.ArrayLength: unary('ARRAY_LENGTH'),
 
+    ops.HLLCardinality: _reduction('APPROX_COUNT_DISTINCT'),
     ops.Log: _log,
     ops.Sign: unary('SIGN'),
     ops.Modulus: fixed_arity('MOD', 2),
@@ -558,6 +559,26 @@ def log2(expr):
     return arg.log(2)
 
 
+@rewrites(ops.Sum)
+def bq_sum(expr):
+    arg = expr.op().args[0]
+    where = expr.op().args[1]
+    if isinstance(arg, ir.BooleanColumn):
+        return arg.cast('int64').sum(where=where)
+    else:
+        return expr
+
+
+@rewrites(ops.Mean)
+def bq_mean(expr):
+    arg = expr.op().args[0]
+    where = expr.op().args[1]
+    if isinstance(arg, ir.BooleanColumn):
+        return arg.cast('int64').mean(where=where)
+    else:
+        return expr
+
+
 UNIT_FUNCS = {
     's': 'SECONDS',
     'ms': 'MILLIS',
diff --git a/ibis/bigquery/tests/test_client.py b/ibis/bigquery/tests/test_client.py
index abd700a4b0b4..a3efbd0dfaf7 100644
--- a/ibis/bigquery/tests/test_client.py
+++ b/ibis/bigquery/tests/test_client.py
@@ -604,3 +604,19 @@ def test_day_of_week(client, case, dtype):
     expr_name = date_var.day_of_week.full_name()
     result = client.execute(expr_name)
     assert result == 'Sunday'
+
+
+def test_boolean_reducers(alltypes):
+    b = alltypes.bool_col
+    bool_avg = b.mean().execute()
+    assert type(bool_avg) == np.float64
+
+    bool_sum = b.sum().execute()
+    assert type(bool_sum) == np.int64
+
+
+def test_column_summary(alltypes):
+    b = alltypes.bool_col.summary()
+    result = b.execute()
+    assert result.shape == (1, 7)
+    assert len(result) == 1
diff --git a/ibis/bigquery/tests/test_compiler.py b/ibis/bigquery/tests/test_compiler.py
index 86fe0dc87ca9..e72ad788fe78 100644
--- a/ibis/bigquery/tests/test_compiler.py
+++ b/ibis/bigquery/tests/test_compiler.py
@@ -384,3 +384,57 @@ def test_projection_fusion_only_peeks_at_immediate_parent():
 FROM t3
   CROSS JOIN t3 t4"""
     assert result == expected
+
+
+def test_bool_reducers(alltypes):
+    b = alltypes.bool_col
+    expr = b.mean()
+    result = expr.compile()
+    expected = """\
+SELECT avg(CAST(`bool_col` AS INT64)) AS `mean`
+FROM `ibis-gbq.testing.functional_alltypes`"""
+    assert result == expected
+
+    expr2 = b.sum()
+    result = expr2.compile()
+    expected = """\
+SELECT sum(CAST(`bool_col` AS INT64)) AS `sum`
+FROM `ibis-gbq.testing.functional_alltypes`"""
+    assert result == expected
+
+
+def test_bool_reducers_where(alltypes):
+    b = alltypes.bool_col
+    m = alltypes.month
+    expr = b.mean(where=m > 6)
+    result = expr.compile()
+    expected = """\
+SELECT avg(CASE WHEN `month` > 6 THEN CAST(`bool_col` AS INT64) ELSE NULL END) AS `mean`
+FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
+    assert result == expected
+
+    expr2 = b.sum(where=((m > 6) & (m < 10)))
+    result = expr2.compile()
+    expected = """\
+SELECT sum(CASE WHEN (`month` > 6) AND (`month` < 10) THEN CAST(`bool_col` AS INT64) ELSE NULL END) AS `sum`
+FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
+    assert result == expected
+
+
+def test_approx_nunique(alltypes):
+    d = alltypes.double_col
+    expr = d.approx_nunique()
+    result = expr.compile()
+    expected = """\
+SELECT APPROX_COUNT_DISTINCT(`double_col`) AS `approx_nunique`
+FROM `ibis-gbq.testing.functional_alltypes`"""
+    assert result == expected
+
+    b = alltypes.bool_col
+    m = alltypes.month
+    expr2 = b.approx_nunique(where=m > 6)
+    result = expr2.compile()
+    expected = """\
+SELECT APPROX_COUNT_DISTINCT(CASE WHEN `month` > 6 THEN `bool_col` ELSE NULL END) AS `approx_nunique`
+FROM `ibis-gbq.testing.functional_alltypes`"""  # noqa: E501
+    assert result == expected