More exhaustive bucket test cases, and move dimension creation to translate_expr code path
ibis-project · May 27, 2015 · f0404e3 · f0404e3
1 parent cb90310
commit f0404e3
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 32 deletions.
diff --git a/ibis/sql/compiler.py b/ibis/sql/compiler.py
@@ -250,38 +250,6 @@ def _visit_select_expr(self, expr):
         else:
             return expr
 
-    def _visit_select_Bucket(self, expr):
-        import operator
-
-        op = expr.op()
-
-        stmt = api.case()
-
-        if op.closed == 'left':
-            l_cmp = operator.le
-            r_cmp = operator.lt
-        else:
-            l_cmp = operator.lt
-            r_cmp = operator.le
-
-        bucket_id = 0
-        if op.include_under:
-            stmt = stmt.when(r_cmp(op.arg, op.buckets[0]), bucket_id)
-            bucket_id += 1
-
-        for lower, upper in zip(op.buckets, op.buckets[1:]):
-            stmt = stmt.when(l_cmp(lower, op.arg) & r_cmp(op.arg, upper),
-                             bucket_id)
-            bucket_id += 1
-
-        if op.include_over:
-            stmt = stmt.when(l_cmp(op.buckets[-1], op.arg), bucket_id)
-            bucket_id += 1
-
-        case_expr = stmt.end()
-
-        return case_expr.name(expr.get_name())
-
     def _analyze_filter_exprs(self):
         # What's semantically contained in the filter predicates may need to be
         # rewritten. Not sure if this is the right place to do this, but a

diff --git a/ibis/sql/exprs.py b/ibis/sql/exprs.py
@@ -20,6 +20,8 @@
 
 from io import BytesIO
 
+import ibis.expr.analytics as analytics
+import ibis.expr.api as api
 import ibis.expr.types as ir
 import ibis.expr.operations as ops
 import ibis.expr.temporal as tempo
@@ -282,6 +284,49 @@ def _searched_case(translator, expr):
     return formatter.get_result()
 
 
+def _bucket(translator, expr):  # compile a Bucket expr as a searched CASE
+    import operator
+    # The op node supplies arg, buckets, closed, close_extreme, include_*.
+    op = expr.op()
+
+    stmt = api.case()
+    # Interior edges: closed='left' -> [lower, upper); else (lower, upper].
+    if op.closed == 'left':
+        l_cmp = operator.le
+        r_cmp = operator.lt
+    else:
+        l_cmp = operator.lt
+        r_cmp = operator.le
+    # Optional underflow bucket below buckets[0]; strict '<' if close_extreme.
+    bucket_id = 0
+    if op.include_under:
+        cmp = operator.lt if op.close_extreme else r_cmp
+        stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
+        bucket_id += 1
+    # Number of buckets between consecutive edges (excludes under/over).
+    user_num_buckets = len(op.buckets) - 1
+    # close_extreme closes the outer edge: bucket 0 ('right') or last ('left').
+    for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
+        if (op.close_extreme
+            and ((op.closed == 'right' and j == 0) or
+                 (op.closed == 'left' and j == (user_num_buckets - 1)))):
+            stmt = stmt.when((lower <= op.arg) & (op.arg <= upper),
+                             bucket_id)
+        else:
+            stmt = stmt.when(l_cmp(lower, op.arg) & r_cmp(op.arg, upper),
+                             bucket_id)
+        bucket_id += 1
+    # Optional overflow bucket above buckets[-1]; strict if close_extreme.
+    if op.include_over:
+        cmp = operator.lt if op.close_extreme else l_cmp
+        stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
+        bucket_id += 1
+    # NOTE(review): reads private expr._name; old code used expr.get_name().
+    case_expr = stmt.end().name(expr._name)
+    return _searched_case(translator, case_expr)  # reuse CASE rendering
+
+
+
 def _table_array_view(translator, expr):
     ctx = translator.context
     table = expr.op().table
@@ -602,6 +647,8 @@ def _not_implemented(translator, expr):
     ops.Contains: _contains,
     ops.NotContains: _not_contains,
 
+    analytics.Bucket: _bucket,
+
     ops.SimpleCase: _simple_case,
     ops.SearchedCase: _searched_case,
 

diff --git a/ibis/sql/tests/test_exprs.py b/ibis/sql/tests/test_exprs.py
@@ -375,6 +375,113 @@ def test_search_case(self):
 END"""
         assert result == expected
 
+    def test_bucket_to_case(self):  # bucket() -> CASE SQL, all edge options
+        buckets = [0, 10, 25, 50]
+        # Defaults: left-closed buckets; last one also includes its upper edge.
+        expr1 = self.table.f.bucket(buckets)
+        expected1 = """\
+CASE
+  WHEN (f >= 0) AND (f < 10) THEN 0
+  WHEN (f >= 10) AND (f < 25) THEN 1
+  WHEN (f >= 25) AND (f <= 50) THEN 2
+  ELSE NULL
+END"""
+        # close_extreme=False: the last bucket stays half-open.
+        expr2 = self.table.f.bucket(buckets, close_extreme=False)
+        expected2 = """\
+CASE
+  WHEN (f >= 0) AND (f < 10) THEN 0
+  WHEN (f >= 10) AND (f < 25) THEN 1
+  WHEN (f >= 25) AND (f < 50) THEN 2
+  ELSE NULL
+END"""
+        # closed='right': the first bucket also includes its lower edge.
+        expr3 = self.table.f.bucket(buckets, closed='right')
+        expected3 = """\
+CASE
+  WHEN (f >= 0) AND (f <= 10) THEN 0
+  WHEN (f > 10) AND (f <= 25) THEN 1
+  WHEN (f > 25) AND (f <= 50) THEN 2
+  ELSE NULL
+END"""
+        # closed='right', close_extreme=False: every bucket is (lower, upper].
+        expr4 = self.table.f.bucket(buckets, closed='right',
+                                    close_extreme=False)
+        expected4 = """\
+CASE
+  WHEN (f > 0) AND (f <= 10) THEN 0
+  WHEN (f > 10) AND (f <= 25) THEN 1
+  WHEN (f > 25) AND (f <= 50) THEN 2
+  ELSE NULL
+END"""
+
+        # include_under adds a leading bucket and shifts the other ids by one.
+        expr5 = self.table.f.bucket(buckets, include_under=True)
+        expected5 = """\
+CASE
+  WHEN f < 0 THEN 0
+  WHEN (f >= 0) AND (f < 10) THEN 1
+  WHEN (f >= 10) AND (f < 25) THEN 2
+  WHEN (f >= 25) AND (f <= 50) THEN 3
+  ELSE NULL
+END"""
+        # include_under + include_over: strict tests on both outer sides.
+        expr6 = self.table.f.bucket(buckets,
+                                    include_under=True,
+                                    include_over=True)
+        expected6 = """\
+CASE
+  WHEN f < 0 THEN 0
+  WHEN (f >= 0) AND (f < 10) THEN 1
+  WHEN (f >= 10) AND (f < 25) THEN 2
+  WHEN (f >= 25) AND (f <= 50) THEN 3
+  WHEN f > 50 THEN 4
+  ELSE NULL
+END"""
+        # close_extreme=False: the overflow bucket takes the edge (f >= 50).
+        expr7 = self.table.f.bucket(buckets,
+                                    close_extreme=False,
+                                    include_under=True,
+                                    include_over=True)
+        expected7 = """\
+CASE
+  WHEN f < 0 THEN 0
+  WHEN (f >= 0) AND (f < 10) THEN 1
+  WHEN (f >= 10) AND (f < 25) THEN 2
+  WHEN (f >= 25) AND (f < 50) THEN 3
+  WHEN f >= 50 THEN 4
+  ELSE NULL
+END"""
+        # closed='right', close_extreme=False: underflow takes the edge.
+        expr8 = self.table.f.bucket(buckets, closed='right',
+                                    close_extreme=False,
+                                    include_under=True)
+        expected8 = """\
+CASE
+  WHEN f <= 0 THEN 0
+  WHEN (f > 0) AND (f <= 10) THEN 1
+  WHEN (f > 10) AND (f <= 25) THEN 2
+  WHEN (f > 25) AND (f <= 50) THEN 3
+  ELSE NULL
+END"""
+
+        # Pair each expression with the SQL it is expected to compile to.
+        cases = [
+            (expr1, expected1),
+            (expr2, expected2),
+            (expr3, expected3),
+            (expr4, expected4),
+            (expr5, expected5),
+            (expr6, expected6),
+            (expr7, expected7),
+            (expr8, expected8)
+        ]
+        cases = [(expr,  # NOTE(review): currently an identity rebuild
+                  # disabled wrapper: self.table[[expr.name('bucket')]],
+                  exp)
+                 for expr, exp in cases]
+        self._check_expr_cases(cases)
+
     def test_where_use_if(self):
         expr = api.where(self.table.f > 0, self.table.e, self.table.a)
         assert isinstance(expr, ir.FloatValue)

diff --git a/ibis/tests/test_impala_e2e.py b/ibis/tests/test_impala_e2e.py
@@ -246,6 +246,10 @@ def test_builtins_1(self):
             d.bucket([0, 10, 25, 50], include_over=True, close_extreme=False),
             d.bucket([10, 25, 50, 100], include_under=True),
 
+            # d.histogram(10),
+            # d.histogram(5, base=10),
+            # d.histogram(base=10, binwidth=5),
+
             # coalesce-like cases
             api.coalesce(table.int_col,
                          api.null(),