308 changes: 232 additions & 76 deletions ibis/expr/analysis.py

Large diffs are not rendered by default.

34 changes: 20 additions & 14 deletions ibis/expr/api.py
@@ -1764,15 +1764,16 @@ def _table_info(self, buf=None):
counts = ['Non-null #', '----------'] + [str(x) for x in metrics[1:]]
col_metrics = util.adjoin(2, names, types, counts)

if buf is None:
import sys
buf = sys.stdout

result = ('Table rows: {0}\n\n'
'{1}'
.format(metrics[0], col_metrics))

buf.write(result)
if buf is None:
import sys
sys.stdout.write(result)
sys.stdout.write('\n')
else:
buf.write(result)


def _table_set_column(table, name, expr):
@@ -1834,20 +1835,22 @@ def filter(table, predicates):
-------
filtered_expr : TableExpr
"""
resolved_predicates = _resolve_predicates(table, predicates)
return _L.apply_filter(table, resolved_predicates)


def _resolve_predicates(table, predicates):
if isinstance(predicates, Expr):
predicates = _L.unwrap_ands(predicates)
predicates = util.promote_list(predicates)

predicates = [ir.bind_expr(table, x) for x in predicates]

resolved_predicates = []
for pred in predicates:
if isinstance(pred, ir.AnalyticExpr):
pred = pred.to_filter()
resolved_predicates.append(pred)

op = _L.apply_filter(table, resolved_predicates)
return TableExpr(op)
return resolved_predicates


def aggregate(table, metrics=None, by=None, having=None, **kwds):
@@ -1875,7 +1878,7 @@ def aggregate(table, metrics=None, by=None, having=None, **kwds):
v = table._ensure_expr(v)
metrics.append(v.name(k))

op = _ops.Aggregation(table, metrics, by=by, having=having)
op = table.op().aggregate(table, metrics, by=by, having=having)
return TableExpr(op)


@@ -1928,8 +1931,10 @@ def _table_sort_by(table, sort_exprs):
-------
sorted : TableExpr
"""
op = _ops.SortBy(table, sort_exprs)
return TableExpr(op)
op = table.op()
result = op.sort_by(table, sort_exprs)

return TableExpr(result)


def _table_union(left, right, distinct=False):
@@ -2046,8 +2051,9 @@ def projection(table, exprs):
if isinstance(exprs, (Expr,) + six.string_types):
exprs = [exprs]

exprs = [table._ensure_expr(e) for e in exprs]
op = L.Projector(table, exprs).get_result()
projector = L.Projector(table, exprs)

op = projector.get_result()
return TableExpr(op)


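For reference, a sketch of what the reworked filter path accepts (table and column names are hypothetical, mirroring the tests further down): a list of boolean predicates, a single expression whose top-level ANDs are unwrapped, or an analytic expression such as topk that _resolve_predicates converts to a filter:

import ibis

t = ibis.table([('dest', 'string'), ('arrdelay', 'double')], 'airlines')

# a list of boolean predicates
expr1 = t.filter([t.arrdelay > 0, t.dest.isin(['ORD', 'JFK'])])

# an analytic expression; converted via to_filter() during resolution
expr2 = t.filter([t.dest.topk(10, by=t.arrdelay.mean())])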
8 changes: 4 additions & 4 deletions ibis/expr/format.py
@@ -90,7 +90,7 @@ def get_result(self):
if self.memoize:
self._memoize_tables()

if isinstance(what, ir.TableNode) and what.has_schema():
if isinstance(what, ops.TableNode) and what.has_schema():
# This should also catch aggregations
if not self.memoize and what in self.memo:
text = 'Table: %s' % self.memo.get_alias(what)
@@ -128,8 +128,8 @@ def get_result(self):
return self._indent(text, self.base_level)

def _memoize_tables(self):
table_memo_ops = (ops.Aggregation, ops.Filter,
ops.Projection, ops.SelfReference)
table_memo_ops = (ops.Aggregation, ops.Selection,
ops.SelfReference)

def walk(expr):
op = expr.op()
@@ -146,7 +146,7 @@ def visit(arg):
visit(op.args)
if isinstance(op, table_memo_ops):
self.memo.observe(op, self._format_node)
elif isinstance(op, ir.TableNode) and op.has_schema():
elif isinstance(op, ops.TableNode) and op.has_schema():
self.memo.observe(op, self._format_table)

walk(self.expr)
4 changes: 3 additions & 1 deletion ibis/expr/groupby.py
@@ -34,7 +34,7 @@ class GroupedTableExpr(object):

def __init__(self, table, by, having=None, order_by=None, window=None):
self.table = table
self.by = _resolve_exprs(table, by)
self.by = by
self._order_by = order_by or []
self._having = having or []
self._window = window
@@ -158,6 +158,8 @@ def _get_window(self):

sorts = [ops.to_sort_key(self.table, k) for k in sorts]

groups = _resolve_exprs(self.table, groups)

return _window.window(preceding=preceding, following=following,
group_by=groups, order_by=sorts)

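A sketch of what the deferred resolution means in practice (hypothetical columns): group-by keys passed to group_by are now kept as given and only bound to the table when a window is built, e.g. inside mutate:

import ibis

t = ibis.table([('dest', 'string'), ('arrdelay', 'double')], 'airlines')

# 'dest' is stored unresolved; _get_window resolves it when the
# windowed expression is constructed
expr = (t.group_by('dest')
        .mutate(demeaned=t.arrdelay - t.arrdelay.mean()))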
335 changes: 249 additions & 86 deletions ibis/expr/operations.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ibis/expr/tests/mocks.py
@@ -381,7 +381,7 @@ class BasicTestCase(object):
def setUp(self):
self.schema = _all_types_schema
self.schema_dict = dict(self.schema)
self.table = ibis.table(self.schema)
self.table = ibis.table(self.schema, 'schema')

self.int_cols = ['a', 'b', 'c', 'd']
self.bool_cols = ['h']
173 changes: 94 additions & 79 deletions ibis/expr/tests/test_analysis.py
@@ -44,8 +44,7 @@ def test_rewrite_substitute_distinct_tables(self):
assert_equal(result, expected)

def test_rewrite_join_projection_without_other_ops(self):
# Drop out filters and other commutative table operations. Join
# predicates are "lifted" to reference the base, unmodified join roots
# See #790, predicate pushdown in joins not supported

# Star schema with fact table
table = self.con.table('star1')
@@ -70,10 +69,8 @@ def test_rewrite_join_projection_without_other_ops(self):

rewritten_proj = L.substitute_parents(view)
op = rewritten_proj.op()
assert_equal(op.table, ex_expr)

# Ensure that filtered table has been substituted with the base table
assert op.selections[0] is table
assert not op.table.equals(ex_expr)

def test_rewrite_past_projection(self):
table = self.con.table('test1')
@@ -92,76 +89,6 @@ def test_rewrite_past_projection(self):
result = L.substitute_parents(expr)
assert result is expr

def test_rewrite_expr_with_parent(self):
table = self.con.table('test1')

table2 = table[table['f'] > 0]

expr = table2['c'] == 2

result = L.substitute_parents(expr)
expected = table['c'] == 2
assert_equal(result, expected)

# Substitution not fully possible if we depend on a new expr in a
# projection

table4 = table[['c', (table['c'] * 2).name('foo')]]
expr = table4['c'] == table4['foo']
result = L.substitute_parents(expr)
expected = table['c'] == table4['foo']
assert_equal(result, expected)

def test_rewrite_distinct_but_equal_objects(self):
t = self.con.table('test1')
t_copy = self.con.table('test1')

table2 = t[t_copy['f'] > 0]

expr = table2['c'] == 2

result = L.substitute_parents(expr)
expected = t['c'] == 2
assert_equal(result, expected)

def test_projection_with_join_pushdown_rewrite_refs(self):
# Observed this expression IR issue in a TopK-rewrite context
table1 = ibis.table([
('a_key1', 'string'),
('a_key2', 'string'),
('a_value', 'double')
], 'foo')

table2 = ibis.table([
('b_key1', 'string'),
('b_name', 'string'),
('b_value', 'double')
], 'bar')

table3 = ibis.table([
('c_key2', 'string'),
('c_name', 'string')
], 'baz')

proj = (table1.inner_join(table2, [('a_key1', 'b_key1')])
.inner_join(table3, [(table1.a_key2, table3.c_key2)])
[table1, table2.b_name.name('b'), table3.c_name.name('c'),
table2.b_value])

cases = [
(proj.a_value > 0, table1.a_value > 0),
(proj.b_value > 0, table2.b_value > 0)
]

for higher_pred, lower_pred in cases:
result = proj.filter([higher_pred])
op = result.op()
assert isinstance(op, ops.Projection)
filter_op = op.table.op()
assert isinstance(filter_op, ops.Filter)
new_pred = filter_op.predicates[0]
assert_equal(new_pred, lower_pred)

def test_multiple_join_deeper_reference(self):
# Join predicates down the chain might reference one or more root
# tables in the hierarchy.
@@ -209,10 +136,8 @@ def test_filter_on_projected_field(self):

# Now then! Predicate pushdown here is inappropriate, so we check that
# it didn't occur.

# If filter were pushed below projection, the top-level operator type
# would be Projection instead.
assert type(result.op()) == ops.Filter
assert isinstance(result.op(), ops.Selection)
assert result.op().table is tpch

def test_bad_join_predicate_raises(self):
# Join predicate references a derived table, but we can salvage and
@@ -267,3 +192,93 @@ def test_filter_self_join(self):
# proj exprs unaffected by analysis
assert_equal(proj_exprs[0], left.region)
assert_equal(proj_exprs[1], metric)

# def test_fuse_filter_projection(self):
# data = ibis.table([('kind', 'string'),
# ('year', 'int64')], 'data')

# pred = data.year == 2010

# result = data.projection(['kind'])[pred]
# expected = data.filter(pred).kind

# assert isinstance(result, ops.Selection)
# assert result.equals(expected)

def test_fuse_projection_sort_by(self):
pass

def test_fuse_filter_sort_by(self):
pass

# Refactoring deadpool

def test_no_rewrite(self):
table = self.con.table('test1')

# Substitution not fully possible if we depend on a new expr in a
# projection
table4 = table[['c', (table['c'] * 2).name('foo')]]
expr = table4['c'] == table4['foo']
result = L.substitute_parents(expr)
expected = table['c'] == table4['foo']
assert_equal(result, expected)

# def test_projection_with_join_pushdown_rewrite_refs(self):
# # Observed this expression IR issue in a TopK-rewrite context
# table1 = ibis.table([
# ('a_key1', 'string'),
# ('a_key2', 'string'),
# ('a_value', 'double')
# ], 'foo')

# table2 = ibis.table([
# ('b_key1', 'string'),
# ('b_name', 'string'),
# ('b_value', 'double')
# ], 'bar')

# table3 = ibis.table([
# ('c_key2', 'string'),
# ('c_name', 'string')
# ], 'baz')

# proj = (table1.inner_join(table2, [('a_key1', 'b_key1')])
# .inner_join(table3, [(table1.a_key2, table3.c_key2)])
# [table1, table2.b_name.name('b'), table3.c_name.name('c'),
# table2.b_value])

# cases = [
# (proj.a_value > 0, table1.a_value > 0),
# (proj.b_value > 0, table2.b_value > 0)
# ]

# for higher_pred, lower_pred in cases:
# result = proj.filter([higher_pred])
# op = result.op()
# assert isinstance(op, ops.Selection)
# new_pred = op.predicates[0]
# assert_equal(new_pred, lower_pred)

# def test_rewrite_expr_with_parent(self):
# table = self.con.table('test1')

# table2 = table[table['f'] > 0]

# expr = table2['c'] == 2

# result = L.substitute_parents(expr)
# expected = table['c'] == 2
# assert_equal(result, expected)

# def test_rewrite_distinct_but_equal_objects(self):
# t = self.con.table('test1')
# t_copy = self.con.table('test1')

# table2 = t[t_copy['f'] > 0]

# expr = table2['c'] == 2

# result = L.substitute_parents(expr)
# expected = t['c'] == 2
# assert_equal(result, expected)
3 changes: 2 additions & 1 deletion ibis/expr/tests/test_analytics.py
@@ -76,7 +76,8 @@ def test_topk_analysis_bug(self):
'airlines')
dests = ['ORD', 'JFK', 'SFO']
t = airlines[airlines.dest.isin(dests)]
delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
delay_filter = t.origin.topk(10, by=t.arrdelay.mean())

filtered = t.filter([delay_filter])

post_pred = filtered.op().predicates[1]
23 changes: 14 additions & 9 deletions ibis/expr/tests/test_format.py
@@ -97,28 +97,28 @@ def test_format_multiple_join_with_projection(self):
('f', 'double'),
('foo_id', 'string'),
('bar_id', 'string'),
])
], 'one')

table2 = ibis.table([
('foo_id', 'string'),
('value1', 'double')
])
], 'two')

table3 = ibis.table([
('bar_id', 'string'),
('value2', 'double')
])
], 'three')

filtered = table[table['f'] > 0]

pred1 = table['foo_id'] == table2['foo_id']
pred1 = filtered['foo_id'] == table2['foo_id']
pred2 = filtered['bar_id'] == table3['bar_id']

j1 = filtered.left_join(table2, [pred1])
j2 = j1.inner_join(table3, [pred2])

# Project out the desired fields
view = j2[[table, table2['value1'], table3['value2']]]
view = j2[[filtered, table2['value1'], table3['value2']]]

# it works!
repr(view)
@@ -151,7 +151,7 @@ def test_memoize_filtered_table(self):
delay_filter = t.dest.topk(10, by=t.arrdelay.mean())

result = repr(delay_filter)
assert result.count('Filter') == 1
assert result.count('Selection') == 1

def test_memoize_insert_sort_key(self):
table = self.con.table('airlines')
@@ -161,7 +161,9 @@ def test_memoize_insert_sort_key(self):
.mutate(dest_avg=t.arrdelay.mean(),
dev=t.arrdelay - t.arrdelay.mean()))

worst = expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10)
worst = (expr[expr.dev.notnull()]
.sort_by(ibis.desc('dev'))
.limit(10))

result = repr(worst)
assert result.count('airlines') == 1
@@ -193,7 +195,10 @@ def test_memoize_filtered_tables_in_join(self):
right = agged[agged.kind == 'bar']

cond = left.region == right.region
joined = left.join(right, cond)
joined = (left.join(right, cond)
[left, right.total.name('right_total')])

result = repr(joined)
assert result.count('Filter') == 2

# Join, and one for each aggregation
assert result.count('predicates') == 3
29 changes: 14 additions & 15 deletions ibis/expr/tests/test_table.py
@@ -88,7 +88,7 @@ def test_projection(self):

proj = self.table[cols]
assert isinstance(proj, TableExpr)
assert isinstance(proj.op(), ops.Projection)
assert isinstance(proj.op(), ops.Selection)

assert proj.schema().names == cols
for c in cols:
@@ -177,10 +177,12 @@ def test_projection_mutate_analysis_bug(self):

t = self.con.table('airlines')

filtered = t[t.depdelay.notnull()]
leg = ibis.literal('-').join([t.origin, t.dest])
mutated = filtered.mutate(leg=leg)

# it works!
(t[t.depdelay.notnull()]
.mutate(leg=ibis.literal('-').join([t.origin, t.dest]))
['year', 'month', 'day', 'depdelay', 'leg'])
mutated['year', 'month', 'day', 'depdelay', 'leg']

def test_projection_self(self):
result = self.table[self.table]
@@ -278,12 +280,7 @@ def test_filter_no_list(self):
def test_add_predicate(self):
pred = self.table['a'] > 5
result = self.table[pred]
assert isinstance(result.op(), ops.Filter)

def test_filter_root_table_preserved(self):
result = self.table[self.table['a'] > 5]
roots = result.op().root_tables()
assert roots[0] is self.table.op()
assert isinstance(result.op(), ops.Selection)

def test_invalid_predicate(self):
# a lookalike
@@ -353,7 +350,9 @@ def test_sort_by(self):
# Default is ascending for anything coercable to an expression,
# and we'll have ascending/descending wrappers to help.
result = self.table.sort_by(['f'])
sort_key = result.op().keys[0].op()

sort_key = result.op().sort_keys[0].op()

assert_equal(sort_key.expr, self.table.f)
assert sort_key.ascending

@@ -365,9 +364,9 @@ def test_sort_by(self):
result3 = self.table.sort_by([('f', 'descending')])
result4 = self.table.sort_by([('f', 0)])

key2 = result2.op().keys[0].op()
key3 = result3.op().keys[0].op()
key4 = result4.op().keys[0].op()
key2 = result2.op().sort_keys[0].op()
key3 = result3.op().sort_keys[0].op()
key4 = result4.op().sort_keys[0].op()

assert not key2.ascending
assert not key3.ascending
@@ -972,7 +971,7 @@ def test_simple_existence_predicate(self):

# it works!
expr = self.t1[cond]
assert isinstance(expr.op(), ops.Filter)
assert isinstance(expr.op(), ops.Selection)

def test_cannot_use_existence_expression_in_join(self):
# Join predicates must consist only of comparisons
7 changes: 4 additions & 3 deletions ibis/expr/tests/test_window_functions.py
@@ -90,9 +90,12 @@ def test_auto_windowize_analysis_bug(self):
# GH #544
t = self.con.table('airlines')

def metric(x):
return x.arrdelay.mean().name('avg_delay')

annual_delay = (t[t.dest.isin(['JFK', 'SFO'])]
.group_by(['dest', 'year'])
.aggregate(t.arrdelay.mean().name('avg_delay')))
.aggregate(metric))
what = annual_delay.group_by('dest')
enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean())

@@ -104,9 +107,7 @@ def test_auto_windowize_analysis_bug(self):

def test_mutate_sorts_keys(self):
t = self.con.table('airlines')

m = t.arrdelay.mean()

g = t.group_by('dest')

result = g.mutate(zzz=m, yyy=m, ddd=m, ccc=m, bbb=m, aaa=m)
46 changes: 12 additions & 34 deletions ibis/expr/types.py
@@ -232,6 +232,11 @@ def _pp(x):

return '%s(%s)' % (opname, ', '.join(pprint_args))

def blocks(self):
# The contents of this node are referentially distinct and may not be
# analyzed any deeper
return False
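# Illustrative, not part of this diff: operations that produce a
# referentially distinct table (e.g. ops.Selection, ops.Aggregation)
# are expected to override this hook, roughly:
#
#     def blocks(self):
#         return True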

def flat_args(self):
for arg in self.args:
if isinstance(arg, (tuple, list)):
@@ -423,21 +428,6 @@ def root_tables(self):
return []


class TableNode(Node):

def get_type(self, name):
return self.get_schema().get_type(name)

def to_expr(self):
return TableExpr(self)


class BlockingTableNode(TableNode):
# Try to represent the fact that whatever lies here is a semantically
# distinct table. Like projections, aggregations, and so forth
pass


def distinct_roots(*args):
all_roots = []
for arg in args:
@@ -541,6 +531,13 @@ def factory(arg):
return TableExpr(arg)
return factory

def _is_valid(self, exprs):
try:
self._assert_valid(util.promote_list(exprs))
return True
except:
return False

def _assert_valid(self, exprs):
from ibis.expr.analysis import ExprValidator
ExprValidator([self]).validate_all(exprs)
@@ -1125,22 +1122,3 @@ def find_base_table(expr):
r = find_base_table(arg)
if isinstance(r, TableExpr):
return r


def find_all_base_tables(expr, memo=None):
if memo is None:
memo = {}

node = expr.op()

if (isinstance(expr, TableExpr) and
isinstance(node, BlockingTableNode)):
if id(expr) not in memo:
memo[id(expr)] = expr
return memo

for arg in expr.op().flat_args():
if isinstance(arg, Expr):
find_all_base_tables(arg, memo)

return memo
8 changes: 1 addition & 7 deletions ibis/filesystems.py
@@ -21,20 +21,14 @@
import six

from ibis.config import options
from ibis.util import implements
import ibis.common as com


class HDFSError(com.IbisError):
pass


def implements(f):
def decorator(g):
g.__doc__ = f.__doc__
return g
return decorator


class HDFS(object):

"""
6 changes: 3 additions & 3 deletions ibis/impala/api.py
@@ -46,9 +46,9 @@ def verify(expr):


def connect(host='localhost', port=21050, database='default', timeout=45,
use_ssl=False, ca_cert=None, user=None, password=None,
auth_mechanism='NOSASL', kerberos_service_name='impala',
pool_size=8, hdfs_client=None):
use_ssl=False, ca_cert=None, user=None,
password=None, auth_mechanism='NOSASL',
kerberos_service_name='impala', pool_size=8, hdfs_client=None):
"""
Create an ImpalaClient for use with Ibis.
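A usage sketch of the reflowed connect signature (host names and ports are hypothetical):

import ibis

hdfs = ibis.hdfs_connect(host='namenode.example.com', port=50070)
con = ibis.impala.connect(host='impalad.example.com', port=21050,
                          database='default', hdfs_client=hdfs)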
94 changes: 58 additions & 36 deletions ibis/impala/client.py
@@ -51,7 +51,8 @@ class ImpalaDatabase(Database):

def create_table(self, table_name, obj=None, **kwargs):
"""
Dispatch to ImpalaClient.create_table. See docs for more
Dispatch to ImpalaClient.create_table. See that function's docstring
for more details
"""
return self.client.create_table(table_name, obj=obj,
database=self.name, **kwargs)
@@ -182,18 +183,21 @@ class ImpalaCursor(object):

def __init__(self, cursor, con, impyla_con, database,
options):
self.cursor = cursor
self._cursor = cursor
self.con = con
self.impyla_con = impyla_con
self.database = database
self.options = options
self.released = False

def __del__(self):
self._close_cursor()
with self.con.lock:
self.con.connection_pool_size -= 1

def _close_cursor(self):
try:
self.cursor.close()
self._cursor.close()
except HS2Error as e:
# connection was closed elsewhere
if 'invalid session' not in e.args[0].lower():
@@ -208,17 +212,19 @@ def __exit__(self, type, value, tb):
def set_options(self):
for k, v in self.options.items():
query = 'SET {0}={1}'.format(k, v)
self.cursor.execute(query)
self._cursor.execute(query)

@property
def description(self):
return self.cursor.description
return self._cursor.description

def release(self):
self.con.release(self)
if not self.released:
self.con.release(self)
self.released = True

def execute(self, stmt, async=False):
self.cursor.execute_async(stmt)
self._cursor.execute_async(stmt)
if async:
return
else:
@@ -241,11 +247,11 @@ def _sleep_interval(start_time):
return 0.5
return 1.0

cur = self.cursor
cur = self._cursor
try:
while True:
state = cur.status()
if self.cursor._op_state_is_error(state):
if self._cursor._op_state_is_error(state):
raise OperationalError("Operation is in ERROR_STATE")
if not cur._op_state_is_executing(state):
break
@@ -259,16 +265,19 @@ def is_finished(self):
return not self.is_executing()

def is_executing(self):
return self.cursor.is_executing()
return self._cursor.is_executing()

def cancel(self):
self.cursor.cancel_operation()
self._cursor.cancel_operation()

def fetchone(self):
return self._cursor.fetchone()

def fetchall(self, columnar=False):
if columnar:
return self.cursor.fetchcolumnar()
return self._cursor.fetchcolumnar()
else:
return self.cursor.fetchall()
return self._cursor.fetchall()


class ImpalaQuery(Query):
@@ -459,6 +468,7 @@ def __init__(self, con, hdfs_client=None, **params):
raise TypeError(hdfs_client)

self._hdfs = hdfs_client
self._kudu = None

self._temp_objects = weakref.WeakValueDictionary()

@@ -481,6 +491,13 @@ def _set_hdfs(self, hdfs):

hdfs = property(fget=_get_hdfs, fset=_set_hdfs)

@property
def kudu(self):
from ibis.impala.kudu_support import KuduImpalaInterface
if self._kudu is None:
self._kudu = KuduImpalaInterface(self)
return self._kudu

@property
def _table_expr_klass(self):
return ImpalaTable
@@ -686,7 +703,7 @@ def get_schema(self, table_name, database=None):
ibis_types = []
for t in types:
t = t.lower()
t = udf._impala_to_ibis_type.get(t, t)
t = udf.parse_type(t)
ibis_types.append(t)

names = [x.lower() for x in names]
@@ -772,11 +789,14 @@ def drop_view(self, name, database=None, force=False):
return self._execute(statement)

def create_table(self, table_name, obj=None, schema=None, database=None,
format='parquet', force=False, external=False,
location=None, partition=None, like_parquet=None,
path=None):
external=False, force=False,
# HDFS options
format='parquet', location=None,
partition=None, like_parquet=None):
"""
Create a new table in Impala using an Ibis table expression
Create a new table in Impala using an Ibis table expression. This is
currently designed for tables whose data is stored in HDFS (or
eventually other filesystems).
Parameters
----------
@@ -787,12 +807,12 @@ def create_table(self, table_name, obj=None, schema=None, database=None,
Mutually exclusive with expr, creates an empty table with a
particular schema
database : string, default None (optional)
format : {'parquet'}
force : boolean, default False
Do not create table if table with indicated name already exists
external : boolean, default False
Create an external table; Impala will not delete the underlying data
when the table is dropped
format : {'parquet'}
location : string, default None
Specify the directory location where Impala reads and writes files
for the table
@@ -809,13 +829,10 @@ def create_table(self, table_name, obj=None, schema=None, database=None,
if like_parquet is not None:
raise NotImplementedError

# TODO: deprecation warning
if path is not None:
location = path

if obj is not None:
if isinstance(obj, pd.DataFrame):
writer, to_insert = _write_temp_dataframe(self, obj)
from ibis.impala.pandas_interop import write_temp_dataframe
writer, to_insert = write_temp_dataframe(self, obj)
else:
to_insert = obj
ast = self._build_ast(to_insert)
@@ -836,7 +853,7 @@ def create_table(self, table_name, obj=None, schema=None, database=None,
path=location)
elif schema is not None:
statement = ddl.CreateTableWithSchema(
table_name, schema, ddl.NoFormat(),
table_name, schema,
database=database,
format=format,
can_exist=force,
@@ -988,7 +1005,7 @@ def parquet_file(self, hdfs_dir, schema=None, name=None, database=None,
def _get_concrete_table_path(self, name, database, persist=False):
if not persist:
if name is None:
name = util.guid()
name = '__ibis_tmp_{0}'.format(util.guid())

if database is None:
self._ensure_temp_db_exists()
@@ -1003,7 +1020,11 @@ def _ensure_temp_db_exists(self):
# TODO: session memoize to avoid unnecessary `SHOW DATABASES` calls
name, path = options.impala.temp_db, options.impala.temp_hdfs_path
if not self.exists_database(name):
self.create_database(name, path=path, force=True)
if self._hdfs is None:
print('Without an HDFS connection, certain functionality'
' may be disabled')
else:
self.create_database(name, path=path, force=True)

def _wrap_new_table(self, name, database, persist):
qualified_name = self._fully_qualified_name(name, database)
@@ -1304,7 +1325,8 @@ def _to_type(x):
tuples = cur.fetchall()
if len(tuples) > 0:
result = []
for out_type, sig in tuples:
for tup in tuples:
out_type, sig = tup[:2]
name, types = _split_signature(sig)
types = _type_parser(types).types

@@ -1640,7 +1662,8 @@ def insert(self, obj=None, overwrite=False, partition=None,
t.insert(table_expr, overwrite=True)
"""
if isinstance(obj, pd.DataFrame):
writer, expr = _write_temp_dataframe(self._client, obj)
from ibis.impala.pandas_interop import write_temp_dataframe
writer, expr = write_temp_dataframe(self._client, obj)
else:
expr = obj

@@ -1801,6 +1824,12 @@ def _run_ddl(**kwds):
tbl_properties=tbl_properties,
serde_properties=serde_properties)

def set_external(self, is_external=True):
"""
Toggle EXTERNAL table property.
"""
self.alter(tbl_properties={'EXTERNAL': is_external})

def alter_partition(self, spec, location=None, format=None,
tbl_properties=None,
serde_properties=None):
@@ -1898,13 +1927,6 @@ def drop(self):
pass


def _write_temp_dataframe(client, df):
from ibis.impala.pandas_interop import DataFrameWriter
writer = DataFrameWriter(client, df)
path = writer.write_temp_csv()
return writer, writer.delimited_table(path)


def _validate_compatible(from_schema, to_schema):
if set(from_schema.names) != set(to_schema.names):
raise com.IbisInputError('Schemas have different names')
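A sketch of a create_table call against the regrouped signature (table, database, and schema names are hypothetical):

import ibis

sch = ibis.schema([('foo', 'string'), ('bar', 'int8')])

# an empty Parquet-backed table; format and location now sit together
# as the HDFS-oriented options
con.create_table('new_table', schema=sch, database='ibis_testing',
                 format='parquet', force=True)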
7 changes: 4 additions & 3 deletions ibis/impala/compiler.py
@@ -887,9 +887,10 @@ def _exists_subquery(translator, expr):
op = expr.op()
ctx = translator.context

expr = (op.foreign_table
.filter(op.predicates)
.projection([ir.literal(1).name(ir.unnamed)]))
dummy = ir.literal(1).name(ir.unnamed)

filtered = op.foreign_table.filter(op.predicates)
expr = filtered.projection([dummy])

subquery = ctx.get_compiled_expr(expr)

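For context, a minimal sketch of the kind of expression this handles (hypothetical tables): an existence predicate compiles to an EXISTS subquery projecting the dummy literal 1 from the filtered foreign table:

import ibis

t1 = ibis.table([('key1', 'string')], 'foo')
t2 = ibis.table([('key1', 'string')], 'bar')

cond = (t1.key1 == t2.key1).any()  # existence predicate
expr = t1[cond]                    # WHERE EXISTS (SELECT 1 FROM bar ...)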
75 changes: 57 additions & 18 deletions ibis/impala/ddl.py
@@ -20,6 +20,7 @@

from ibis.expr.datatypes import validate_type
from ibis.compat import py_string
import ibis.expr.datatypes as dt
import ibis.expr.rules as rules


@@ -75,12 +76,22 @@ def _sanitize_format(format):
return format


def _serdeproperties(props):
formatted_props = _format_properties(props)
return 'SERDEPROPERTIES {0}'.format(formatted_props)


def format_tblproperties(props):
formatted_props = _format_properties(props)
return 'TBLPROPERTIES {0}'.format(formatted_props)


def _format_properties(props):
tokens = []
for k, v in sorted(props.items()):
tokens.append("'{0!s}'='{1!s}'".format(k, v))
tokens.append(" '{0!s}'='{1!s}'".format(k, v))

return '({0})'.format(', '.join(tokens))
return '(\n{0}\n)'.format(',\n'.join(tokens))
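# For example (illustrative; exact whitespace follows the code above):
#
#     >>> print(format_tblproperties({'foo': 1, 'bar': 2}))
#     TBLPROPERTIES (
#      'bar'='2',
#      'foo'='1'
#     )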


class CreateTable(CreateDDL):
@@ -95,7 +106,8 @@ class CreateTable(CreateDDL):

def __init__(self, table_name, database=None, external=False,
format='parquet', can_exist=False,
partition=None, path=None):
partition=None, path=None,
tbl_properties=None):
self.table_name = table_name
self.database = database
self.partition = partition
Expand All @@ -104,6 +116,8 @@ def __init__(self, table_name, database=None, external=False,
self.can_exist = can_exist
self.format = _sanitize_format(format)

self.tbl_properties = tbl_properties

def _create_line(self):
scoped_name = self._get_scoped_name(self.table_name, self.database)

@@ -220,7 +234,7 @@ def compile(self):

class CreateTableWithSchema(CreateTable):

def __init__(self, table_name, schema, table_format, **kwargs):
def __init__(self, table_name, schema, table_format=None, **kwargs):
self.schema = schema
self.table_format = table_format

@@ -260,9 +274,10 @@ def _push_schema(x):
buf.write('\n')
_push_schema(self.schema)

format_ddl = self.table_format.to_ddl()
if format_ddl:
buf.write(format_ddl)
if self.table_format is not None:
buf.write(self.table_format.to_ddl())
else:
buf.write(self._storage())

buf.write(self._location())

@@ -272,7 +287,7 @@ def _push_schema(x):
class NoFormat(object):

def to_ddl(self):
return None
return ''


class DelimitedFormat(object):
@@ -303,8 +318,11 @@ def to_ddl(self):
buf.write("\nLOCATION '{0}'".format(self.path))

if self.na_rep is not None:
buf.write("\nTBLPROPERTIES('serialization.null.format'='{0}')"
.format(self.na_rep))
props = {
'serialization.null.format': self.na_rep
}
buf.write('\n')
buf.write(format_tblproperties(props))

return buf.getvalue()

@@ -324,9 +342,22 @@ def to_ddl(self):

schema = json.dumps(self.avro_schema, indent=2, sort_keys=True)
schema = '\n'.join([x.rstrip() for x in schema.split('\n')])
buf.write("\nTBLPROPERTIES ('avro.schema.literal'='{0}')"
.format(schema))

props = {'avro.schema.literal': schema}
buf.write('\n')
buf.write(format_tblproperties(props))
return buf.getvalue()


class ParquetFormat(object):

def __init__(self, path):
self.path = path

def to_ddl(self):
buf = StringIO()
buf.write('\nSTORED AS PARQUET')
buf.write("\nLOCATION '{0}'".format(self.path))
return buf.getvalue()


@@ -400,19 +431,29 @@ def _format_partition(partition, partition_schema):
if isinstance(partition, dict):
for name in partition_schema:
if name in partition:
tok = '{0}={1}'.format(name, partition[name])
tok = _format_partition_kv(name, partition[name],
partition_schema[name])
else:
# dynamic partitioning
tok = name
tokens.append(tok)
else:
for name, value in zip(partition_schema, partition):
tok = '{0}={1}'.format(name, value)
tok = _format_partition_kv(name, value, partition_schema[name])
tokens.append(tok)

return 'PARTITION ({0})'.format(', '.join(tokens))


def _format_partition_kv(k, v, type):
if type == dt.string:
value_formatted = '"{0}"'.format(v)
else:
value_formatted = str(v)

return '{0}={1}'.format(k, value_formatted)
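# Illustrative: string-typed partition values are double-quoted while
# other types are not, matching the DDL tests:
#
#     >>> part_schema = ibis.schema([('foo', 'int32'), ('bar', 'string')])
#     >>> _format_partition({'foo': 5, 'bar': 'qux'}, part_schema)
#     'PARTITION (foo=5, bar="qux")'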


class LoadData(ImpalaDDL):

"""
@@ -468,12 +509,10 @@ def _format_properties(self, prefix=''):
tokens.append("FILEFORMAT {0}".format(self.format))

if self.tbl_properties is not None:
props = _format_properties(self.tbl_properties)
tokens.append('TBLPROPERTIES {0}'.format(props))
tokens.append(format_tblproperties(self.tbl_properties))

if self.serde_properties is not None:
props = _format_properties(self.serde_properties)
tokens.append('SERDEPROPERTIES {0}'.format(props))
tokens.append(_serdeproperties(self.serde_properties))

if len(tokens) > 0:
return '\n{0}{1}'.format(prefix, '\n'.join(tokens))
300 changes: 300 additions & 0 deletions ibis/impala/kudu_support.py
@@ -0,0 +1,300 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from six import StringIO

import pandas as pd

from ibis.common import IbisError
from ibis.expr.api import schema
from ibis.impala import ddl
from ibis.util import implements as copydoc
import ibis.expr.datatypes as dt
import kudu


_kudu_type_to_ibis_typeclass = {
'int8': dt.Int8,
'int16': dt.Int16,
'int32': dt.Int32,
'int64': dt.Int64,
'float': dt.Float,
'double': dt.Double,
'bool': dt.Boolean,
'string': dt.String,
'timestamp': dt.Timestamp
}


class KuduImpalaInterface(object):

"""
User-facing wrapper layer for the ImpalaClient
"""

def __init__(self, impala_client):
self.impala_client = impala_client
self.client = None

@copydoc(kudu.client.Client.list_tables)
def list_tables(self, filter=''):
return self.client.list_tables(filter)

@copydoc(kudu.client.Client.table_exists)
def table_exists(self, name):
return self.client.table_exists(name)

def connect(self, host_or_hosts, port_or_ports=7051, rpc_timeout=None):
"""
Pass-through connection interface to the Kudu client
Parameters
----------
host_or_hosts : string or list of strings
If you have multiple Kudu masters for HA, pass a list
port_or_ports : int or list of int, default 7051
If you pass multiple host names, pass multiple ports
rpc_timeout : kudu.TimeDelta
See Kudu client documentation for details
Returns
-------
None
"""
self.client = kudu.connect(host_or_hosts, port_or_ports,
rpc_timeout=rpc_timeout)
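# Usage sketch (hypothetical master hosts):
#
#     con.kudu.connect('master.example.com', 7051)
#     con.kudu.connect(['m1.example.com', 'm2.example.com'],
#                      [7051, 7051])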

def _check_connected(self):
if not self.is_connected:
raise IbisError('Please first connect to a Kudu cluster '
'with client.kudu.connect')

@property
def is_connected(self):
# crude check for now
return self.client is not None

def create_table(self, impala_name, kudu_name, primary_keys=None,
obj=None, schema=None, database=None,
external=False, force=False):
"""
Create a Kudu-backed table in the connected Impala cluster. For
non-external tables, this will create a Kudu table with a compatible
storage schema.
This function is patterned after the ImpalaClient.create_table function
designed for physical filesystems (like HDFS).
Parameters
----------
impala_name : string
Name of the created Impala table
kudu_name : string
Name of the backing Kudu table. Will be created if external=False
primary_keys : list of column names
List of the columns comprising the primary key; required when
creating a new Kudu table
obj : TableExpr or pandas.DataFrame, optional
If passed, creates table from select statement results
schema : ibis.Schema, optional
Mutually exclusive with expr, creates an empty table with a
particular schema
database : string, default None (optional)
external : boolean, default False
If False, a new Kudu table will be created. Otherwise, the Kudu table
must already exist.
"""
self._check_connected()

if not external and (primary_keys is None or len(primary_keys) == 0):
raise ValueError('Must specify primary keys when DDL creates a '
'new Kudu table')

if obj is not None:
if external:
raise ValueError('Cannot create an external Kudu-Impala table '
'from an expression or DataFrame')

if isinstance(obj, pd.DataFrame):
from ibis.impala.pandas_interop import write_temp_dataframe
writer, to_insert = write_temp_dataframe(self.impala_client,
obj)
else:
to_insert = obj
# XXX: exposing a lot of internals
ast = self.impala_client._build_ast(to_insert)
select = ast.queries[0]

stmt = CTASKudu(impala_name, kudu_name,
self.client.master_addrs,
select, primary_keys,
database=database)
else:
if external:
ktable = self.client.table(kudu_name)
kschema = ktable.schema
schema = schema_kudu_to_ibis(kschema)
primary_keys = kschema.primary_keys()
elif schema is None:
raise ValueError('Must specify schema for new empty '
'Kudu-backed table')

stmt = CreateTableKudu(impala_name, kudu_name,
self.client.master_addrs,
schema, primary_keys,
external=external,
database=database,
can_exist=False)

self.impala_client._execute(stmt)

def table(self, kudu_name, name=None, database=None, persist=False,
external=True):
"""
Convenience to expose an existing Kudu table (using CREATE TABLE) as an
Impala table. To create a new table in the Hive Metastore with
storage in Kudu, use create_table.
Note: all tables created are EXTERNAL for now. Creates a temporary
table (like parquet_file and others) unless persist=True.
If you create a persistent table you can thereafter use it like any
other Impala table.
Parameters
----------
kudu_name : string
The name of the table in the Kudu cluster
name : string, optional
Name of the created table in Impala / Hive Metastore. Randomly
generated if not specified.
database : string, optional
Database to create the table in. Uses the temp db if not provided
persist : boolean, default False
If True, do not drop the table upon Ibis garbage collection /
interpreter shutdown. Be careful using this in conjunction with the
`external` option.
external : boolean, default True
If True, create the Impala table as EXTERNAL so the Kudu data is not
deleted when the Impala table is dropped
Returns
-------
impala_table : ImpalaTable
"""
# Law of Demeter, but OK for now because of internal class coupling
name, database = (self.impala_client
._get_concrete_table_path(name, database,
persist=persist))
self.create_table(name, kudu_name, database=database, external=True)
return self.impala_client._wrap_new_table(name, database, persist)
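# Usage sketch (names are hypothetical):
#
#     t = con.kudu.table('existing_kudu_table')  # temporary EXTERNAL table
#     con.kudu.create_table('impala_name', 'new_kudu_table',
#                           primary_keys=['key'], obj=t.limit(100))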


class CreateTableKudu(ddl.CreateTable):

"""
Creates an Impala table that scans from a Kudu table
"""

# TODO
# - DISTRIBUTE BY HASH
# - DISTRIBUTE BY RANGE
# - multi master test

def __init__(self, table_name, kudu_table_name,
master_addrs, schema, key_columns,
external=True, **kwargs):
self.kudu_table_name = kudu_table_name
self.master_addrs = master_addrs
self.schema = schema
self.key_columns = key_columns
ddl.CreateTable.__init__(self, table_name, external=external,
**kwargs)

self._validate()

def _validate(self):
pass

def compile(self):
buf = StringIO()
buf.write(self._create_line())

schema = ddl.format_schema(self.schema)
buf.write('\n{0}'.format(schema))

props = self._get_table_properties()
buf.write('\n')
buf.write(ddl.format_tblproperties(props))
return buf.getvalue()

_table_props_base = {
'storage_handler': 'com.cloudera.kudu.hive.KuduStorageHandler'
}

def _get_table_properties(self):
tbl_props = self._table_props_base.copy()

addr_string = ', '.join(self.master_addrs)
keys_string = ', '.join(self.key_columns)

tbl_props.update({
'kudu.table_name': self.kudu_table_name,
'kudu.master_addresses': addr_string,
'kudu.key_columns': keys_string
})

return tbl_props


class CTASKudu(CreateTableKudu):

def __init__(self, table_name, kudu_name, master_addrs,
select, key_columns, database=None,
external=False, can_exist=False):
self.select = select
CreateTableKudu.__init__(self, table_name, kudu_name,
master_addrs, None, key_columns,
database=database,
external=external,
can_exist=can_exist)

def compile(self):
buf = StringIO()
buf.write(self._create_line())

props = self._get_table_properties()
buf.write('\n')
buf.write(ddl.format_tblproperties(props))

select_query = self.select.compile()
buf.write(' AS\n{0}'.format(select_query))
return buf.getvalue()


def schema_kudu_to_ibis(kschema, drop_nn=False):
ibis_types = []
for i in range(len(kschema)):
col = kschema[i]

typeclass = _kudu_type_to_ibis_typeclass[col.type.name]

if drop_nn:
# For testing, because Impala does not have nullable types
itype = typeclass(True)
else:
itype = typeclass(col.nullable)

ibis_types.append((col.name, itype))

return schema(ibis_types)
6 changes: 4 additions & 2 deletions ibis/impala/metadata.py
@@ -43,14 +43,16 @@ def _converter(tup):

def _try_timestamp(x):
try:
return pd.Timestamp(x)
ts = pd.Timestamp(x)
return ts.to_pydatetime()
except (ValueError, TypeError):
return x


def _try_unix_timestamp(x):
try:
return pd.Timestamp.fromtimestamp(int(x))
ts = pd.Timestamp.fromtimestamp(int(x))
return ts.to_pydatetime()
except (ValueError, TypeError):
return x

6 changes: 6 additions & 0 deletions ibis/impala/pandas_interop.py
@@ -207,3 +207,9 @@ def pandas_to_ibis_schema(frame):
ibis_type = pandas_col_to_ibis_type(frame[col_name])
pairs.append((col_name, ibis_type))
return schema(pairs)


def write_temp_dataframe(client, df):
writer = DataFrameWriter(client, df)
path = writer.write_temp_csv()
return writer, writer.delimited_table(path)
46 changes: 20 additions & 26 deletions ibis/impala/tests/common.py
@@ -15,7 +15,6 @@
import os
import time
import six
from posixpath import join as pjoin

import pytest

@@ -94,7 +93,7 @@ class ImpalaE2E(object):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls)
ImpalaE2E.setup_e2e(cls, ENV)

# make sure this never gets messed up
opts = cls.con.get_options()
@@ -105,23 +104,33 @@ def tearDownClass(cls):
ImpalaE2E.teardown_e2e(cls)

@staticmethod
def setup_e2e(cls):
cls.con = connect_test(ENV)
def setup_e2e(cls, env):
cls.env = env
cls.con = connect_test(env)

# Tests run generally faster without it
if not ENV.use_codegen:
if not env.use_codegen:
cls.con.disable_codegen()
cls.hdfs = cls.con.hdfs
cls.test_data_dir = ENV.test_data_dir
cls.test_data_db = ENV.test_data_db
cls.tmp_dir = ENV.tmp_dir
cls.tmp_db = ENV.tmp_db
cls.alltypes = cls.con.table('functional_alltypes')
cls.test_data_dir = env.test_data_dir
cls.test_data_db = env.test_data_db
cls.tmp_dir = env.tmp_dir
cls.tmp_db = env.tmp_db

try:
cls.alltypes = cls.con.table('functional_alltypes')
except:
pass

cls.db = cls.con.database(ENV.test_data_db)
cls.db = cls.con.database(env.test_data_db)

if not cls.con.exists_database(cls.tmp_db):
cls.con.create_database(cls.tmp_db)

if not cls.hdfs.exists(cls.tmp_dir):
cls.hdfs.mkdir(cls.tmp_dir)
cls.hdfs.chmod(cls.tmp_dir, '777')

@staticmethod
def teardown_e2e(cls):
i, retries = 0, 3
@@ -137,21 +146,6 @@ def teardown_e2e(cls):

time.sleep(0.1)

@classmethod
def _create_777_tmp_dir(cls):
base = pjoin(cls.tmp_dir, util.guid())
tmp_path = pjoin(base, util.guid())
env = IbisTestEnv()
superuser_hdfs = ibis.hdfs_connect(host=env.nn_host,
port=env.webhdfs_port,
auth_mechanism=env.auth_mechanism,
verify=(env.auth_mechanism
not in ['GSSAPI', 'LDAP']),
user=env.hdfs_superuser)
superuser_hdfs.mkdir(base)
superuser_hdfs.chmod(base, '777')
return tmp_path

def setUp(self):
self.temp_databases = []
self.temp_tables = []
29 changes: 25 additions & 4 deletions ibis/impala/tests/test_client.py
@@ -25,10 +25,6 @@
import ibis.util as util


def approx_equal(a, b, eps):
assert abs(a - b) < eps


ENV = IbisTestEnv()


@@ -47,6 +43,11 @@ def test_execute_exprs_default_backend(self):
result = expr.execute()
assert result == expected

def test_cursor_garbage_collection(self):
for i in range(5):
self.con.raw_sql('select 1', True).fetchall()
self.con.raw_sql('select 1', True).fetchone()

def test_raise_ibis_error_no_hdfs(self):
# #299
client = connect_test(ENV, with_hdfs=False)
@@ -334,3 +335,23 @@ def test_disable_codegen(self):

assert opts1['DISABLE_CODEGEN'] == '1'
assert opts2['DISABLE_CODEGEN'] == '1'

def test_attr_name_conflict(self):
LEFT = 'testing_{0}'.format(util.guid())
RIGHT = 'testing_{0}'.format(util.guid())

schema = ibis.schema([('id', 'int32'), ('name', 'string'),
('files', 'int32')])

db = self.con.database(self.tmp_db)

for tablename in (LEFT, RIGHT):
db.create_table(tablename, schema=schema,
format='parquet')

left = db[LEFT]
right = db[RIGHT]

left.join(right, ['id'])
left.join(right, ['id', 'name'])
left.join(right, ['id', 'files'])
103 changes: 65 additions & 38 deletions ibis/impala/tests/test_ddl.py
@@ -26,15 +26,13 @@
from ibis.impala import ddl
from ibis.impala.compat import HS2Error, ImpylaError
from ibis.impala.client import build_ast
from ibis.impala.tests.common import IbisTestEnv, ImpalaE2E, connect_test
from ibis.impala.tests.common import ENV, ImpalaE2E, connect_test
from ibis.tests.util import assert_equal
import ibis.common as com
import ibis.expr.types as ir
import ibis.util as util


ENV = IbisTestEnv()


class TestDropTable(unittest.TestCase):

def test_must_exist(self):
@@ -152,6 +150,15 @@ def test_add_partition(self):
expected = 'ALTER TABLE tbl ADD PARTITION (year=2007, month=4)'
assert result == expected

def test_add_partition_string_key(self):
part_schema = ibis.schema([('foo', 'int32'),
('bar', 'string')])
stmt = ddl.AddPartition('tbl', {'foo': 5, 'bar': 'qux'}, part_schema)

result = stmt.compile()
expected = 'ALTER TABLE tbl ADD PARTITION (foo=5, bar="qux")'
assert result == expected

def test_drop_partition(self):
stmt = ddl.DropPartition(self.table_name,
{'year': 2007, 'month': 4},
@@ -201,13 +208,18 @@ def _get_ddl_string(props):
}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET TBLPROPERTIES ('bar'='2', 'foo'='1')"""
SET TBLPROPERTIES (
'bar'='2',
'foo'='1'
)"""
assert result == expected

result = _get_ddl_string({'serde_properties': {'baz': 3}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET SERDEPROPERTIES ('baz'='3')"""
SET SERDEPROPERTIES (
'baz'='3'
)"""
assert result == expected

def test_alter_table_properties(self):
@@ -236,13 +248,18 @@ def _get_ddl_string(props):
}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET TBLPROPERTIES ('bar'='2', 'foo'='1')"""
SET TBLPROPERTIES (
'bar'='2',
'foo'='1'
)"""
assert result == expected

result = _get_ddl_string({'serde_properties': {'baz': 3}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET SERDEPROPERTIES ('baz'='3')"""
SET SERDEPROPERTIES (
'baz'='3'
)"""
assert result == expected


@@ -280,8 +297,8 @@ def test_create_table_with_location(self):
('bar', 'int8'),
('baz', 'int16')])
statement = ddl.CreateTableWithSchema('another_table', schema,
ddl.NoFormat(),
can_exist=False,
format='parquet',
path=path, database='foo')
result = statement.compile()

@@ -290,6 +307,7 @@ def test_create_table_with_location(self):
(`foo` string,
`bar` tinyint,
`baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(path)
assert result == expected

@@ -410,7 +428,8 @@ def test_create_external_table_avro(self):
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES ('avro.schema.literal'='{
TBLPROPERTIES (
'avro.schema.literal'='{
"fields": [
{
"name": "a",
Expand All @@ -434,7 +453,8 @@ def test_create_external_table_avro(self):
],
"name": "my_record",
"type": "record"
}')""" % path
}'
)""" % path
assert result == expected

def test_create_table_parquet(self):
@@ -488,7 +508,7 @@ class TestDDLE2E(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls)
ImpalaE2E.setup_e2e(cls, ENV)

cls.path_uuid = 'change-location-{0}'.format(util.guid())
fake_path = pjoin(cls.tmp_dir, cls.path_uuid)
@@ -552,8 +572,9 @@ def test_create_exists_drop_database(self):
assert not self.con.exists_database(tmp_name)

def test_exists_table(self):
tmp_name = _random_table_name()
assert self.con.exists_table('functional_alltypes')
assert not self.con.exists_table(util.guid())
assert not self.con.exists_table(tmp_name)

def text_exists_table_with_database(self):
table_name = _random_table_name()
Expand All @@ -568,8 +589,7 @@ def text_exists_table_with_database(self):
assert not self.con.exists_table(table_name, database=tmp_name)

def test_create_exists_drop_view(self):
tmp_name = util.guid()

tmp_name = _random_table_name()
assert not self.con.exists_table(tmp_name)

expr = (self.con.table('functional_alltypes')
@@ -568,8 +589,7 @@ def text_exists_table_with_database(self):

def test_drop_non_empty_database(self):
tmp_db = '__ibis_test_{0}'.format(util.guid())
tmp_name = _random_table_name()

self.con.create_database(tmp_db)

self.con.create_table(util.guid(), self.alltypes, database=tmp_db)
self.con.create_table(tmp_name, self.alltypes, database=tmp_db)

# Has a view, too
self.con.create_view(util.guid(), self.alltypes,
tmp_name2 = _random_table_name()
self.con.create_view(tmp_name2, self.alltypes,
database=tmp_db)

self.assertRaises(com.IntegrityError, self.con.drop_database, tmp_db)
@@ -589,13 +609,15 @@ def test_create_exists_drop_view(self):
self.con.drop_database(name)
self.hdfs.rmdir(base)

@pytest.mark.superuser
def test_create_table_with_location(self):
base = pjoin(self.tmp_dir, util.guid())
name = 'test_{0}'.format(util.guid())
tmp_path = pjoin(base, name)

# impala user has trouble writing to jenkins-owned dir so here we give
# the tmp dir 777
superuser_hdfs = ibis.hdfs_connect(host=ENV.nn_host,
port=ENV.webhdfs_port,
auth_mechanism=ENV.auth_mechanism,
verify=(ENV.auth_mechanism
not in ['GSSAPI', 'LDAP']),
user=ENV.hdfs_superuser)
superuser_hdfs.mkdir(base)
superuser_hdfs.chmod(base, '777')

expr = self.alltypes
table_name = _random_table_name()

@@ -639,15 +649,14 @@ def test_create_table_with_location(self):
assert self.hdfs.exists(tmp_path)

def test_drop_table_not_exist(self):
random_name = util.guid()
random_name = _random_table_name()
self.assertRaises(Exception, self.con.drop_table, random_name)

self.con.drop_table(random_name, force=True)

def test_truncate_table(self):
expr = self.alltypes.limit(50)

table_name = util.guid()
table_name = _random_table_name()
self.con.create_table(table_name, obj=expr)
self.temp_tables.append(table_name)

@@ -675,7 +684,7 @@ def test_create_empty_table(self):
('c', 'decimal(12,8)'),
('d', 'double')])

table_name = util.guid()
table_name = _random_table_name()
self.con.create_table(table_name, schema=schema)
self.temp_tables.append(table_name)

@@ -824,11 +833,11 @@ def test_table_column_stats(self):
def test_drop_table_or_view(self):
t = self.db.functional_alltypes

tname = util.guid()
tname = _random_table_name()
self.con.create_table(tname, t.limit(10))
self.temp_tables.append(tname)

vname = util.guid()
vname = _random_table_name()
self.con.create_view(vname, t.limit(10))
self.temp_views.append(vname)

@@ -875,12 +884,12 @@ def test_change_properties(self):

self.table.alter(tbl_properties=props)
tbl_props = self.table.metadata().tbl_properties
for k, v in props.iteritems():
for k, v in props.items():
assert v == tbl_props[k]

self.table.alter(serde_properties=props)
serde_props = self.table.metadata().serde_properties
for k, v in props.iteritems():
for k, v in props.items():
assert v == serde_props[k]

def test_change_format(self):
@@ -988,7 +997,7 @@ def test_query_parquet_infer_schema(self):
assert_equal(table.schema(), ex_schema)

def test_create_table_persist_fails_if_called_twice(self):
tname = util.guid()
tname = _random_table_name()

hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
self.con.parquet_file(hdfs_path, name=tname, persist=True)
Expand Down Expand Up @@ -1029,6 +1038,24 @@ def test_query_delimited_file_directory(self):
finally:
self.con.drop_table(name, database=self.tmp_db)

def test_varchar_char_support(self):
statement = """\
CREATE EXTERNAL TABLE {0}
(`group1` varchar(10),
`group2` char(10))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/tmp'"""

full_path = '{0}.testing_{1}'.format(self.tmp_db, util.guid())
sql = statement.format(full_path)

self.con._execute(sql, results=False)

table = self.con.table(full_path)
assert isinstance(table['group1'], ir.StringValue)
assert isinstance(table['group2'], ir.StringValue)

def test_temp_table_concurrency(self):
pytest.skip('Cannot get this test to run under pytest')

289 changes: 289 additions & 0 deletions ibis/impala/tests/test_kudu_support.py
@@ -0,0 +1,289 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pytest

from ibis.compat import unittest
from ibis.expr.tests.mocks import MockConnection
from ibis.impala.client import build_ast
from ibis.impala.tests.common import IbisTestEnv, ImpalaE2E
from ibis.tests.util import assert_equal
import ibis.expr.datatypes as dt
import ibis.util as util
import ibis

try:
from ibis.impala import kudu_support as ksupport
import kudu
HAVE_KUDU_CLIENT = True
except ImportError:
HAVE_KUDU_CLIENT = False


pytestmark = pytest.mark.skipif(not HAVE_KUDU_CLIENT,
reason='Kudu client not installed')


class KuduImpalaTestEnv(IbisTestEnv):

def __init__(self):
IbisTestEnv.__init__(self)

# band-aid until Kudu support merged into Impala mainline
self.test_host = os.getenv('IBIS_TEST_KIMPALA_HOST',
'quickstart.cloudera')

# XXX
self.impala_host = self.test_host
self.impala_port = 21050
self.master_host = os.getenv('IBIS_TEST_KUDU_MASTER', self.test_host)
self.master_port = os.getenv('IBIS_TEST_KUDU_MASTER_PORT', 7051)
self.nn_host = os.environ.get('IBIS_TEST_KUDU_NN_HOST', self.test_host)

self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT',
50070))
self.hdfs_superuser = os.environ.get('IBIS_TEST_HDFS_SUPERUSER',
'hdfs')

ENV = KuduImpalaTestEnv()


class TestKuduTools(unittest.TestCase):

# Test schema conversion, DDL statements, etc.

def test_kudu_schema_convert(self):
spec = [
# name, type, is_nullable, is_primary_key
('a', dt.Int8(False), 'int8', False, True),
('b', dt.Int16(False), 'int16', False, True),
('c', dt.Int32(False), 'int32', False, False),
('d', dt.Int64(True), 'int64', True, False),
('e', dt.String(True), 'string', True, False),
('f', dt.Boolean(False), 'bool', False, False),
('g', dt.Float(False), 'float', False, False),
('h', dt.Double(True), 'double', True, False),

# TODO
# ('i', 'binary', False, False),

('j', dt.Timestamp(True), 'timestamp', True, False)
]

builder = kudu.schema_builder()
primary_keys = []
ibis_types = []
for name, itype, type_, is_nullable, is_primary_key in spec:
builder.add_column(name, type_, nullable=is_nullable)

if is_primary_key:
primary_keys.append(name)

ibis_types.append((name, itype))

builder.set_primary_keys(primary_keys)
kschema = builder.build()

ischema = ksupport.schema_kudu_to_ibis(kschema)
expected = ibis.schema(ibis_types)

assert_equal(ischema, expected)

def test_create_external_ddl(self):
schema = ibis.schema([('key1', 'int32'),
('key2', 'int64'),
('value1', 'double')])

stmt = ksupport.CreateTableKudu('impala_name', 'kudu_name',
['master1.d.com:7051',
'master2.d.com:7051'],
schema, ['key1', 'key2'])

result = stmt.compile()
expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
`key2` bigint,
`value1` double)
TBLPROPERTIES (
'kudu.key_columns'='key1, key2',
'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
'kudu.table_name'='kudu_name',
'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
assert result == expected

def test_ctas_ddl(self):
con = MockConnection()

select = build_ast(con.table('test1')).queries[0]
statement = ksupport.CTASKudu(
'another_table', 'kudu_name', ['dom.d.com:7051'],
select, ['string_col'], external=True,
can_exist=False, database='foo')
result = statement.compile()

expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
'kudu.key_columns'='string_col',
'kudu.master_addresses'='dom.d.com:7051',
'kudu.table_name'='kudu_name',
'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
assert result == expected


class TestKuduE2E(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls, ENV)

cls.temp_tables = []

cls.kclient = kudu.connect(cls.env.master_host, cls.env.master_port)

cls.con.kudu.connect(cls.env.master_host, cls.env.master_port)

def _new_kudu_example_table(self, kschema):
kudu_name = 'ibis-tmp-{0}'.format(util.guid())

self.kclient.create_table(kudu_name, kschema)
self.temp_tables.append(kudu_name)

return kudu_name

@classmethod
def tearDownClass(cls):
cls.teardown_e2e(cls)

for table in cls.temp_tables:
cls.kclient.delete_table(table)

@classmethod
def example_schema(cls):
builder = kudu.schema_builder()
builder.add_column('key', kudu.int32, nullable=False)
builder.add_column('int_val', kudu.int32)
builder.add_column('string_val', kudu.string)
builder.set_primary_keys(['key'])

return builder.build()

def _write_example_data(self, table_name, nrows=100):
table = self.kclient.table(table_name)
session = self.kclient.new_session()
for i in range(nrows):
op = table.new_insert()
row = i, i * 2, 'hello_%d' % i
op['key'] = row[0]
op['int_val'] = row[1]
op['string_val'] = row[2]
session.apply(op)
session.flush()
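
        # A hedged sketch of reading the rows back with the bare kudu-python
        # client (scanner method names assumed from the kudu-python API of
        # this era; verify against the installed client):
        #
        #   table = self.kclient.table(table_name)
        #   scanner = table.scanner()
        #   scanner.open()
        #   assert len(scanner.read_all_tuples()) == nrows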

@pytest.mark.kudu
def test_external_kudu_table(self):
kschema = self.example_schema()
kudu_name = self._new_kudu_example_table(kschema)

nrows = 100
self._write_example_data(kudu_name, nrows)

table = self.con.kudu.table(kudu_name)
result = table.execute()
        assert len(result) == nrows

ischema = ksupport.schema_kudu_to_ibis(kschema, drop_nn=True)
assert_equal(table.schema(), ischema)

@pytest.mark.kudu
def test_internal_kudu_table(self):
kschema = self.example_schema()
kudu_name = self._new_kudu_example_table(kschema)

nrows = 100
self._write_example_data(kudu_name, nrows)

impala_name = self._temp_impala_name()
impala_db = self.env.test_data_db
self.con.kudu.table(kudu_name, name=impala_name,
database=impala_db,
external=True,
persist=True)

t = self.con.table(impala_name, database=impala_db)
assert len(t.execute()) == nrows

# Make internal
t.set_external(False)
t.drop()

assert not self.con.kudu.table_exists(kudu_name)

@pytest.mark.kudu
def test_create_table_as_select_ctas(self):
# TODO
kschema = self.example_schema()
kudu_name = self._new_kudu_example_table(kschema)

nrows = 100
self._write_example_data(kudu_name, nrows)

impala_name = self._temp_impala_name()
impala_db = self.env.test_data_db
self.con.kudu.table(kudu_name, name=impala_name,
database=impala_db,
external=True,
persist=True)

impala_name2 = self._temp_impala_name()
expr = self.con.table(impala_name, database=impala_db)

kudu_name2 = 'ibis-ctas-{0}'.format(util.guid())

self.con.kudu.create_table(impala_name2, kudu_name2,
primary_keys=['key'],
obj=expr, database=impala_db)

# TODO: should some stats be automatically computed?
itable = self.con.table(impala_name2, database=impala_db)
assert len(itable.execute()) == len(expr.execute())

ktable = self.kclient.table(kudu_name2)
assert ktable.schema.primary_keys() == ['key']

@pytest.mark.kudu
def test_create_empty_internal_table(self):
kschema = self.example_schema()
ischema = ksupport.schema_kudu_to_ibis(kschema, drop_nn=True)

impala_name = self._temp_impala_name()
kudu_name = 'ibis-empty-{0}'.format(util.guid())

self.con.kudu.create_table(impala_name, kudu_name,
primary_keys=['key'],
schema=ischema,
database=self.env.test_data_db)

        # register the table for cleanup before asserting, so a failed
        # assertion does not leak it
        self.temp_tables.append(kudu_name)

        ktable = self.kclient.table(kudu_name)
        assert ktable.schema.equals(kschema)

def _temp_impala_name(self):
return 'kudu_test_{0}'.format(util.guid())
12 changes: 6 additions & 6 deletions ibis/impala/tests/test_pandas_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def setUpClass(cls):
cls.alltypes = cls.alltypes.execute()

def test_alltypes_roundtrip(self):
pytest.skip('IMPALA-2750')
self._check_roundtrip(self.alltypes)

def test_writer_cleanup_deletes_hdfs_dir(self):
Expand All @@ -198,25 +199,23 @@ def test_writer_cleanup_deletes_hdfs_dir(self):
writer.cleanup()
assert not self.con.hdfs.exists(path)

@pytest.mark.superuser
def test_create_table_from_dataframe(self):
pytest.skip('IMPALA-2750')
tname = 'tmp_pandas_{0}'.format(util.guid())
self.con.create_table(tname, self.alltypes, database=self.tmp_db,
location=self._create_777_tmp_dir())
self.con.create_table(tname, self.alltypes, database=self.tmp_db)
self.temp_tables.append(tname)

table = self.con.table(tname, database=self.tmp_db)
df = table.execute()
assert_frame_equal(df, self.alltypes)

@pytest.mark.superuser
def test_insert(self):
pytest.skip('IMPALA-2750')
schema = pandas_to_ibis_schema(exhaustive_df)

table_name = 'tmp_pandas_{0}'.format(util.guid())
self.con.create_table(table_name, database=self.tmp_db,
schema=schema,
location=self._create_777_tmp_dir())
schema=schema)
self.temp_tables.append(table_name)

self.con.insert(table_name, exhaustive_df.iloc[:4],
Expand All @@ -238,6 +237,7 @@ def test_insert_partition(self):
pass

def test_round_trip_exhaustive(self):
pytest.skip('IMPALA-2750')
self._check_roundtrip(exhaustive_df)

def _check_roundtrip(self, df):
Expand Down
58 changes: 19 additions & 39 deletions ibis/impala/tests/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from ibis.compat import unittest
from ibis.impala.compat import ImpylaError
from ibis.impala.tests.common import ImpalaE2E
from ibis.impala.tests.common import ImpalaE2E, ENV
from ibis.tests.util import assert_equal
import ibis
import ibis.util as util
Expand All @@ -34,93 +34,82 @@ class TestPartitioning(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls)
ImpalaE2E.setup_e2e(cls, ENV)

df = pd.DataFrame({'year': [2009, 2009, 2009, 2010, 2010, 2010],
'month': [1, 2, 3, 1, 2, 3],
'month': ['1', '2', '3', '1', '2', '3'],
'value': [1, 2, 3, 4, 5, 6]})
df = pd.concat([df] * 10, ignore_index=True)
df['id'] = df.index.values
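        # month is a string column here so that its pandas dtype lines up
        # with the 'string' partition-column type used by the schemas in
        # the tests below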

cls.df = df
cls.db = cls.con.database(cls.tmp_db)
cls.pd_name = _tmp_name()
cls.db.create_table(cls.pd_name, df,
location=cls._temp_location())

@classmethod
def _temp_location(cls):
return cls._create_777_tmp_dir()
cls.db.create_table(cls.pd_name, df)

def test_is_partitioned(self):
schema = ibis.schema([('foo', 'string'),
('year', 'int32'),
('month', 'int16')])
('month', 'string')])
name = _tmp_name()
self.db.create_table(name, schema=schema,
partition=['year', 'month'],
location=self._temp_location())
partition=['year', 'month'])
assert self.db.table(name).is_partitioned

@pytest.mark.superuser
def test_create_table_with_partition_column(self):
schema = ibis.schema([('year', 'int32'),
('month', 'int8'),
('month', 'string'),
('day', 'int8'),
('value', 'double')])

name = _tmp_name()
self.con.create_table(name, schema=schema,
database=self.tmp_db,
partition=['year', 'month'],
location=self._temp_location())
partition=['year', 'month'])
self.temp_tables.append(name)

        # the partition columns are appended to the end of the table schema
ex_schema = ibis.schema([('day', 'int8'),
('value', 'double'),
('year', 'int32'),
('month', 'int8')])
('month', 'string')])
table_schema = self.con.get_schema(name, database=self.tmp_db)
assert_equal(table_schema, ex_schema)

partition_schema = self.db.table(name).partition_schema()

expected = ibis.schema([('year', 'int32'),
('month', 'int8')])
('month', 'string')])
assert_equal(partition_schema, expected)

@pytest.mark.superuser
def test_create_partitioned_separate_schema(self):
schema = ibis.schema([('day', 'int8'),
('value', 'double')])
part_schema = ibis.schema([('year', 'int32'),
('month', 'int8')])
('month', 'string')])

name = _tmp_name()
self.con.create_table(name, schema=schema, partition=part_schema,
location=self._temp_location())
self.con.create_table(name, schema=schema, partition=part_schema)
self.temp_tables.append(name)

        # the partition columns are appended to the end of the table schema
ex_schema = ibis.schema([('day', 'int8'),
('value', 'double'),
('year', 'int32'),
('month', 'int8')])
('month', 'string')])
table_schema = self.con.get_schema(name)
assert_equal(table_schema, ex_schema)

partition_schema = self.con.table(name).partition_schema()
assert_equal(partition_schema, part_schema)

@pytest.mark.superuser
def test_unpartitioned_table_get_schema(self):
tname = 'functional_alltypes'
with self.assertRaises(ImpylaError):
self.con.table(tname).partition_schema()

@pytest.mark.superuser
def test_insert_select_partitioned_table(self):
pytest.skip('IMPALA-2750')
df = self.df

unpart_t = self.db.table(self.pd_name)
Expand All @@ -142,24 +131,20 @@ def test_insert_select_partitioned_table(self):

self._verify_partitioned_table(part_t, df, unique_keys)

@pytest.mark.superuser
def test_insert_overwrite_partition(self):
pass

@pytest.mark.superuser
def test_dynamic_partitioning(self):
pass

@pytest.mark.superuser
def test_add_drop_partition(self):
pytest.skip('HIVE-12613')
schema = ibis.schema([('foo', 'string'),
('year', 'int32'),
('month', 'int16')])
name = _tmp_name()
tmp_dir = self._temp_location()
self.db.create_table(name, schema=schema,
partition=['year', 'month'],
location=tmp_dir)
partition=['year', 'month'])

table = self.db.table(name)

Expand All @@ -174,11 +159,9 @@ def test_add_drop_partition(self):

assert len(table.partitions()) == 1

@pytest.mark.superuser
def test_set_partition_location(self):
pass

@pytest.mark.superuser
def test_load_data_partition(self):
df = self.df

Expand All @@ -188,7 +171,7 @@ def test_load_data_partition(self):
part_keys)

# trim the runtime of this test
df = df[df.month == 1].reset_index(drop=True)
df = df[df.month == '1'].reset_index(drop=True)

unique_keys = df[part_keys].drop_duplicates()

Expand Down Expand Up @@ -234,20 +217,17 @@ def _verify_partitioned_table(self, part_t, df, unique_keys):
# allow for the total line
assert len(parts) == (len(unique_keys) + 1)

def _create_partitioned_table(self, schema, part_keys):
def _create_partitioned_table(self, schema, part_keys, location=None):
part_name = _tmp_name()

self.db.create_table(part_name,
schema=schema,
partition=part_keys,
location=self._temp_location())
partition=part_keys)
self.temp_tables.append(part_name)
return self.db.table(part_name)

@pytest.mark.superuser
def test_drop_partition(self):
pass

@pytest.mark.superuser
def test_repartition_automated(self):
pass
27 changes: 27 additions & 0 deletions ibis/impala/udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from ibis.expr.datatypes import validate_type
import ibis.expr.datatypes as _dt
import ibis.expr.operations as _ops
Expand Down Expand Up @@ -293,6 +295,31 @@ def add_operation(op, func_name, db):
comp._operation_registry[op] = translator


def parse_type(t):
    t = t.lower()
    if t in _impala_to_ibis_type:
        return _impala_to_ibis_type[t]
    else:
        if 'varchar' in t or 'char' in t:
            return 'string'
        elif 'decimal' in t:
            result = _dt._parse_decimal(t)
            if result:
                return t
            else:
                raise ValueError('Invalid decimal type: {0}'.format(t))
        else:
            raise ValueError('Unable to parse type: {0}'.format(t))

_VARCHAR_RE = re.compile(r'varchar\((\d+)\)')


def _parse_varchar(t):
m = _VARCHAR_RE.match(t)
if m:
return 'string'
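

# Usage sketch for the helpers above (illustrative inputs, not taken from
# the test suite):
#
#   parse_type('bigint')         # -> _impala_to_ibis_type['bigint']
#   parse_type('varchar(255)')   # -> 'string'
#   parse_type('decimal(12,2)')  # -> 'decimal(12,2)', passed through
#   parse_type('made_up_type')   # raises ValueError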


def _impala_type_to_ibis(tval):
if tval in _impala_to_ibis_type:
return _impala_to_ibis_type[tval]
Expand Down
41 changes: 31 additions & 10 deletions ibis/sql/alchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,11 @@ def _get_sqla_table(ctx, table):
ctx_level = ctx_level.parent
sa_table = ctx_level.get_table(table)
else:
sa_table = table.op().sqla_table
op = table.op()
if isinstance(op, AlchemyTable):
sa_table = op.sqla_table
else:
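            # not a physical table (e.g. a Selection); compile the
            # expression and use the resulting selectable in its place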
sa_table = ctx.get_compiled_expr(table)

return sa_table

Expand Down Expand Up @@ -596,6 +600,8 @@ def _compile_table_set(self):

def _add_select(self, table_set):
to_select = []

has_select_star = False
for expr in self.select_set:
if isinstance(expr, ir.ValueExpr):
arg = self._translate(expr, named=True)
Expand All @@ -604,7 +610,8 @@ def _add_select(self, table_set):
cached_table = self.context.get_table(expr)
if cached_table is None:
                    # the SELECT * case from a materialized join
arg = '*'
has_select_star = True
continue
else:
arg = table_set
else:
Expand All @@ -614,18 +621,29 @@ def _add_select(self, table_set):

to_select.append(arg)

if has_select_star:
if table_set is None:
raise ValueError('table_set cannot be None here')

clauses = [table_set] + to_select
else:
clauses = to_select

if self.exists:
clause = sa.exists(to_select)
result = sa.exists(clauses)
else:
clause = sa.select(to_select)
result = sa.select(clauses)

if self.distinct:
clause = clause.distinct()
result = result.distinct()

if table_set is not None:
return clause.select_from(table_set)
if not has_select_star:
if table_set is not None:
return result.select_from(table_set)
else:
return result
else:
return clause
return result
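
    # Sketch (hypothetical tables) of the case the has_select_star branch
    # handles: a materialized join projects * from the join, so the joined
    # selectable itself must become the FROM clause rather than being
    # attached via select_from() --
    #
    #   joined = t1.join(t2, t1.id1 == t2.id2).materialize()
    #   joined.val2.execute()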

def _add_groupby(self, fragment):
# GROUP BY and HAVING
Expand Down Expand Up @@ -777,13 +795,16 @@ def _format_table(self, expr):


def _can_lower_sort_column(table_set, expr):
# TODO(wesm): This code is pending removal through cleaner internal
# semantics

    # we can currently sort by just-appeared aggregate metrics, but the way
    # these are referenced in the expression DSL is as a SortBy (blocking
    # table operation) on an aggregation. There's a hack in _collect_SortBy
    # in the generic SQL compiler that "fuses" the sort with the
    # aggregation so they appear in the same query. It's generally for
# cosmetics and doesn't really affect query semantics.
bases = ir.find_all_base_tables(expr)
bases = ops.find_all_base_tables(expr)
if len(bases) > 1:
return False

Expand All @@ -792,7 +813,7 @@ def _can_lower_sort_column(table_set, expr):

if isinstance(base_op, ops.Aggregation):
return base_op.table.equals(table_set)
elif isinstance(base_op, ops.Projection):
elif isinstance(base_op, ops.Selection):
return base.equals(table_set)
else:
return False
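
# Illustration (sketch, hypothetical names) of the fusable case the check
# above permits:
#
#   agg = table.group_by('key').aggregate(table.value.sum().name('total'))
#   expr = agg.sort_by('total')
#
# 'total' only comes into existence with the aggregation, so the ORDER BY
# can be emitted in the same SELECT as the GROUP BY rather than in a
# wrapping subquery.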
Expand Down
91 changes: 29 additions & 62 deletions ibis/sql/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import ibis.common as com
import ibis.expr.analysis as L
import ibis.expr.analytics as analytics

import ibis.expr.operations as ops
import ibis.expr.types as ir

Expand Down Expand Up @@ -290,7 +291,7 @@ def _rewrite_reduction_filter(self, expr):
# TODO: what about reductions that reference a join that isn't visible
# at this level? Means we probably have the wrong design, but will have
# to revisit when it becomes a problem.
aggregation, _ = _reduction_to_aggregation(expr, default_name='tmp')
aggregation, _ = L.reduction_to_aggregation(expr, default_name='tmp')
return aggregation.to_array()

def _visit_filter_Any(self, expr):
Expand Down Expand Up @@ -380,16 +381,6 @@ def _collect_Distinct(self, expr, toplevel=False):

self._collect(expr.op().table, toplevel=toplevel)

def _collect_Filter(self, expr, toplevel=False):
op = expr.op()

self.filters.extend(op.predicates)
if toplevel:
self.select_set = [op.table]
self.table_set = op.table

self._collect(op.table)

def _collect_Limit(self, expr, toplevel=False):
if not toplevel:
return
Expand All @@ -412,22 +403,6 @@ def _collect_Union(self, expr, toplevel=False):
else:
raise NotImplementedError

def _collect_SortBy(self, expr, toplevel=False):
op = expr.op()

self.sort_by = op.keys
if toplevel:
# HACK: yuck, need a better way to know if we should perform a
# select * from a subquery here
parent_op = op.table.op()
if (isinstance(parent_op, ir.BlockingTableNode) and
not isinstance(parent_op, ops.Aggregation)):
self.select_set = [op.table]
self.table_set = op.table
toplevel = False

self._collect(op.table, toplevel=toplevel)

def _collect_Aggregation(self, expr, toplevel=False):
# The select set includes the grouping keys (if any), and these are
# duplicated in the group_by set. SQL translator can decide how to
Expand All @@ -441,10 +416,12 @@ def _collect_Aggregation(self, expr, toplevel=False):
self.having = sub_op.having
self.select_set = sub_op.by + sub_op.agg_exprs
self.table_set = sub_op.table
self.filters = sub_op.predicates
self.sort_by = sub_op.sort_keys

self._collect(expr.op().table)

def _collect_Projection(self, expr, toplevel=False):
def _collect_Selection(self, expr, toplevel=False):
op = expr.op()
table = op.table

Expand All @@ -455,17 +432,27 @@ def _collect_Projection(self, expr, toplevel=False):
if isinstance(table.op(), ops.Join):
can_sub = self._collect_Join(table)
else:
can_sub = True
can_sub = False
self._collect(table)

selections = op.selections
sort_keys = op.sort_keys
filters = op.predicates

if can_sub:
selections = sop.selections
filters = sop.predicates
sort_keys = sop.sort_keys
table = sop.table

if len(selections) == 0:
# select *
selections = [table]

self.sort_by = sort_keys
self.select_set = selections
self.table_set = table
self.filters = filters

def _collect_MaterializedJoin(self, expr, toplevel=False):
op = expr.op()
Expand All @@ -489,11 +476,10 @@ def _collect_Join(self, expr, toplevel=False):

subtables = _get_subtables(expr)

# If any of the joined tables are non-blocking modified versions
# (e.g. with Filter) of the same table, then it's not safe to continue
# walking down the tree (see #667), and we should instead have inline
# views rather than attempting to fuse things together into the same
# SELECT query.
# If any of the joined tables are non-blocking modified versions of the
# same table, then it's not safe to continue walking down the tree (see
# #667), and we should instead have inline views rather than attempting
# to fuse things together into the same SELECT query.
can_substitute = _all_distinct_roots(subtables)
if can_substitute:
for table in subtables:
Expand Down Expand Up @@ -579,7 +565,7 @@ def _all_distinct_roots(subtables):

def _blocking_base(expr):
node = expr.op()
if isinstance(node, (ir.BlockingTableNode, ops.Join)):
if node.blocks() or isinstance(node, ops.Join):
return expr
else:
for arg in expr.op().flat_args():
Expand Down Expand Up @@ -683,17 +669,18 @@ def _visit_Aggregation(self, expr):
def _visit_Distinct(self, expr):
self.observe(expr)

def _visit_Filter(self, expr):
self.visit(expr.op().table)

def _visit_Limit(self, expr):
self.observe(expr)
self.visit(expr.op().table)

def _visit_Union(self, expr):
self.observe(expr)

def _visit_Projection(self, expr):
def _visit_MaterializedJoin(self, expr):
self.observe(expr)
self.visit(expr.op().join)

def _visit_Selection(self, expr):
self.observe(expr)
self.visit(expr.op().table)

Expand All @@ -708,10 +695,6 @@ def _visit_TableColumn(self, expr):
def _visit_SelfReference(self, expr):
self.visit(expr.op().table)

def _visit_SortBy(self, expr):
self.observe(expr)
self.visit(expr.op().table)


def _foreign_ref_check(query, expr):
checker = _CorrelatedRefCheck(query, expr)
Expand Down Expand Up @@ -808,18 +791,15 @@ def as_is(x):
if isinstance(expr, ir.TableExpr):
return expr, as_is

def _scalar_reduce(x):
return isinstance(x, ir.ScalarExpr) and ops.is_reduction(x)

def _get_scalar(field):
def scalar_handler(results):
return results[field][0]
return scalar_handler

if isinstance(expr, ir.ScalarExpr):

if _scalar_reduce(expr):
table_expr, name = _reduction_to_aggregation(
if L.is_scalar_reduce(expr):
table_expr, name = L.reduction_to_aggregation(
expr, default_name='tmp')
return table_expr, _get_scalar(name)
else:
Expand All @@ -840,7 +820,7 @@ def scalar_handler(results):
any_aggregation = False

for x in exprs:
if not _scalar_reduce(x):
if not L.is_scalar_reduce(x):
is_aggregation = False
else:
any_aggregation = True
Expand Down Expand Up @@ -888,19 +868,6 @@ def column_handler(results):
.format(type(expr)))


def _reduction_to_aggregation(expr, default_name='tmp'):
table = ir.find_base_table(expr)

try:
name = expr.get_name()
named_expr = expr
except:
name = default_name
named_expr = expr.name(default_name)

return table.aggregate([named_expr]), name


class QueryBuilder(object):

select_builder = SelectBuilder
Expand Down
2 changes: 2 additions & 0 deletions ibis/sql/sqlite/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from ibis.sql.sqlite.compiler import SQLiteExprTranslator
import ibis.sql.sqlite.api as api
import ibis.util as util

from sqlalchemy.dialects.sqlite import dialect as sqlite_dialect


Expand Down
39 changes: 39 additions & 0 deletions ibis/sql/sqlite/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pytest # noqa

from .common import SQLiteTests
Expand Down Expand Up @@ -355,3 +356,41 @@ def _execute_projection(self, table, exprs):

proj = table.projection(agg_exprs)
proj.execute()

def test_filter_has_sqla_table(self):
t = self.alltypes
pred = t.year == 2010
filt = t.filter(pred).sort_by('float_col').float_col
s = filt.execute()
result = s.squeeze().reset_index(drop=True)
expected = t.execute().query(
'year == 2010'
).sort('float_col').float_col

assert len(result) == len(expected)

def test_column_access_after_sort(self):
t = self.alltypes
expr = t.sort_by('float_col').string_col

# it works!
expr.execute(limit=10)

def test_materialized_join(self):
path = '__ibis_tmp_{0}.db'.format(ibis.util.guid())

con = ibis.sqlite.connect(path, create=True)

try:
con.raw_sql("create table mj1 (id1 integer, val1 real)")
con.raw_sql("insert into mj1 values (1, 10), (2, 20)")
con.raw_sql("create table mj2 (id2 integer, val2 real)")
con.raw_sql("insert into mj2 values (1, 15), (2, 25)")

t1 = con.table('mj1')
t2 = con.table('mj2')
joined = t1.join(t2, t1.id1 == t2.id2).materialize()
result = joined.val2.execute()
assert len(result) == 2
finally:
os.remove(path)