527 changes: 287 additions & 240 deletions ibis/expr/types.py

Large diffs are not rendered by default.

264 changes: 264 additions & 0 deletions ibis/filesystems.py
@@ -0,0 +1,264 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This file may adapt small portions of https://github.com/mtth/hdfs (MIT
# license), see the LICENSES directory.

from os import path as osp
import os
import posixpath
import shutil

import ibis.common as com
import ibis.util as util


from hdfs.util import temppath


class HDFSError(com.IbisError):
pass


def implements(f):
def decorator(g):
g.__doc__ = f.__doc__
return g
return decorator


class HDFS(object):

"""
Interface class to HDFS for ibis that abstracts away (and protects the
user/developer against) API differences among third-party HDFS libraries.
"""
def log(self, message):
print(message)

def exists(self, path):
raise NotImplementedError

def status(self, path):
raise NotImplementedError

def head(self, hdfs_path, nbytes=1024, offset=0):
raise NotImplementedError

def get(self, hdfs_path, local_path='.', overwrite=False):
"""
Download a remote file or directory to the local filesystem.

Parameters
----------
hdfs_path : string
local_path : string, default '.'
overwrite : boolean, default False
"""
raise NotImplementedError

def put(self, hdfs_path, local_path, overwrite=False, verbose=None,
**kwargs):
"""
Write a file or directory to HDFS.

Parameters
----------
hdfs_path : string
Destination directory or path
local_path : string
Relative or absolute path to the local resource
overwrite : boolean, default False
verbose : boolean, default ibis options.verbose
**kwargs
Further keyword arguments passed down to any internal API used.

Returns
-------
written_path : string
The path to the written file or directory
"""
raise NotImplementedError

def write(self, hdfs_path, buf, overwrite=False, blocksize=None,
replication=None, buffersize=None):
raise NotImplementedError

def mkdir(self, path, create_parent=False):
pass

def ls(self, hdfs_path, status=False):
"""
Return the contents of a directory.

Parameters
----------
hdfs_path : string
status : boolean, default False
If True, return (path, status) tuples instead of bare paths.
"""
raise NotImplementedError

def tail(self, hdfs_path, nbytes=1024):
raise NotImplementedError

def rm(self, path):
return self.delete(path)

def rmdir(self, path):
self.client.delete(path, recursive=True)

def find_any_file(self, hdfs_dir):
contents = self.ls(hdfs_dir, status=True)

def valid_filename(name):
head, tail = posixpath.split(name)

tail = tail.lower()
return (not tail.endswith('.tmp') and
not tail.endswith('.copying') and
not tail.startswith('_') and
not tail.startswith('.'))

for filename, meta in contents:
if meta['type'].lower() == 'file' and valid_filename(filename):
return filename
raise com.IbisError('No files found in the passed directory')


class WebHDFS(HDFS):

"""
A WebHDFS-based interface to HDFS using the HdfsCLI library
"""

def __init__(self, client):
self.client = client

@property
def protocol(self):
return 'webhdfs'

def status(self, path):
return self.client.status(path)

@implements(HDFS.exists)
def exists(self, path):
try:
self.client.status(path)
return True
except Exception:
return False

@implements(HDFS.ls)
def ls(self, hdfs_path, status=False):
contents = self.client.list(hdfs_path)
if not status:
return [path for path, detail in contents]
else:
return contents

@implements(HDFS.mkdir)
def mkdir(self, dir_path, create_parent=False):
# ugh, see #252

# create a temporary file, then delete it
dummy = posixpath.join(dir_path, util.guid())
self.client.write(dummy, '')
self.client.delete(dummy)

def delete(self, hdfs_path, recursive=False):
"""
"""
return self.client.delete(hdfs_path, recursive=recursive)

def head(self, hdfs_path, nbytes=1024, offset=0):
gen = self.client.read(hdfs_path, offset=offset, length=nbytes)
return ''.join(gen)

@implements(HDFS.put)
def put(self, hdfs_path, local_path, overwrite=False, verbose=None,
**kwargs):
if osp.isdir(local_path):
for dirpath, dirnames, filenames in os.walk(local_path):
rel_dir = osp.relpath(dirpath, local_path)
if rel_dir == '.':
rel_dir = ''
for fpath in filenames:
abs_path = osp.join(dirpath, fpath)
rel_hdfs_path = posixpath.join(hdfs_path, rel_dir, fpath)
self.put(rel_hdfs_path, abs_path, overwrite=overwrite,
verbose=verbose, **kwargs)
else:
if verbose:
self.log('Writing local {} to HDFS {}'.format(local_path,
hdfs_path))
self.client.upload(hdfs_path, local_path,
overwrite=overwrite, **kwargs)

@implements(HDFS.get)
def get(self, hdfs_path, local_path, overwrite=False):
hdfs_path = hdfs_path.rstrip(posixpath.sep)

if osp.isdir(local_path):
dest = osp.join(local_path, posixpath.basename(hdfs_path))
else:
local_dir = osp.dirname(local_path) or '.'
if osp.isdir(local_dir):
dest = local_path
else:
# fail early
raise HDFSError('Parent directory {0} does not exist'.format(local_dir))

# TODO: threadpool

def _scrape_dir(path, dst):
objs = self.client.list(path)
for hpath, detail in objs:
relpath = posixpath.relpath(hpath, hdfs_path)
full_opath = posixpath.join(dst, relpath)

if detail['type'] == 'FILE':
self.client.download(hpath, full_opath)
else:
os.makedirs(full_opath)
_scrape_dir(hpath, dst)

status = self.status(hdfs_path)
if status['type'] == 'FILE':
if not overwrite and osp.exists(local_path):
raise Exception('{0} exists'.format(local_path))

self.client.download(hdfs_path, local_path)
else:
# TODO: partitioned files

with temppath() as tpath:
_temp_dir_path = osp.join(tpath, posixpath.basename(hdfs_path))
os.makedirs(_temp_dir_path)
_scrape_dir(hdfs_path, _temp_dir_path)
shutil.move(_temp_dir_path, local_path)

return dest

def write(self, hdfs_path, buf, overwrite=False, blocksize=None,
replication=None, buffersize=None):
"""
Write a buffer-like object to the indicated HDFS path.

Parameters
----------
hdfs_path : string
buf : buffer-like or bytes
"""
self.client.write(hdfs_path, buf, overwrite=overwrite,
blocksize=blocksize, replication=replication,
buffersize=buffersize)
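
A rough usage sketch (editorial, not part of this diff) of how the WebHDFS wrapper is meant to be driven: it wraps an HdfsCLI client, as the end-to-end tests further down do. The host, port, and paths here are assumptions for illustration.

from hdfs import InsecureClient

from ibis.filesystems import WebHDFS

# Assumed WebHDFS endpoint; adjust for your cluster.
client = InsecureClient('http://localhost:5070')
hdfs = WebHDFS(client)

hdfs.mkdir('/tmp/ibis-demo')
hdfs.put('/tmp/ibis-demo/data.csv', 'data.csv')   # upload a local file
print(hdfs.head('/tmp/ibis-demo/data.csv', nbytes=64))
hdfs.rm('/tmp/ibis-demo/data.csv')
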
2 changes: 1 addition & 1 deletion ibis/server.py
@@ -88,7 +88,7 @@ def _eintr_retry(func, *args):
if e.args[0] != errno.EINTR:
raise

#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Daemon logic for spawning new child workers


205 changes: 157 additions & 48 deletions ibis/sql/compiler.py
@@ -27,6 +27,7 @@
from ibis.sql.context import QueryContext
import ibis.sql.ddl as ddl
import ibis.sql.transforms as transforms
import ibis.util as util


def build_ast(expr, context=None):
@@ -47,7 +48,7 @@ def to_sql(expr, context=None):
return query.compile(context)


#----------------------------------------------------------------------
# ---------------------------------------------------------------------


class QueryAST(object):
@@ -121,7 +122,7 @@ def __init__(self, expr, context):
self.subqueries = []
self.distinct = False

self.op_memo = set()
self.op_memo = util.IbisSet()

def get_result(self):
# make idempotent
@@ -152,7 +153,9 @@ def _generate_teardown_queries(self):

def _build_result_query(self):
self._collect_elements()
self._analyze_filter_clauses()

self._analyze_select_exprs()
self._analyze_filter_exprs()
self._analyze_subqueries()
self._populate_context()

@@ -201,10 +204,76 @@ def _make_table_aliases(self, expr):
if not ctx.is_extracted(expr):
ctx.make_alias(expr)

#----------------------------------------------------------------------
# Filter analysis / rewrites
# ---------------------------------------------------------------------
# Expr analysis / rewrites

def _analyze_select_exprs(self):
new_select_set = []

for expr in self.select_set:
new_expr = self._visit_select_expr(expr)
new_select_set.append(new_expr)

self.select_set = new_select_set

def _visit_select_expr(self, expr):
op = expr.op()

method = '_visit_select_{}'.format(type(op).__name__)
if hasattr(self, method):
f = getattr(self, method)
return f(expr)

unchanged = True

if isinstance(op, ops.ValueNode):
new_args = []
for arg in op.args:
if isinstance(arg, ir.Expr):
new_arg = self._visit_select_expr(arg)
if arg is not new_arg:
unchanged = False
new_args.append(new_arg)
else:
new_args.append(arg)

if not unchanged:
return expr._factory(type(op)(*new_args))
else:
return expr
else:
return expr

def _visit_select_Histogram(self, expr):
op = expr.op()

EPS = 1e-13

if op.binwidth is None or op.base is None:
aux_hash = op.aux_hash or util.guid()[:6]

min_name = 'min_%s' % aux_hash
max_name = 'max_%s' % aux_hash

minmax = self.table_set.aggregate([op.arg.min().name(min_name),
op.arg.max().name(max_name)])
self.table_set = self.table_set.cross_join(minmax)

def _analyze_filter_clauses(self):
if op.base is None:
base = minmax[min_name] - EPS
else:
base = op.base

binwidth = (minmax[max_name] - base) / (op.nbins - 1)
else:
# Have both a bin width and a base
binwidth = op.binwidth
base = op.base

bucket = (op.arg - base) / binwidth
return bucket.floor().name(expr._name)

def _analyze_filter_exprs(self):
# What's semantically contained in the filter predicates may need to be
# rewritten. Not sure if this is the right place to do this, but a
# starting point
@@ -248,9 +317,21 @@ def _visit_filter(self, expr):
return type(expr)(type(op)(left, right))
else:
return expr
elif isinstance(op, (ops.Any, ops.Between, ops.Contains,
ops.TableColumn, ops.Literal)):
elif isinstance(op, (ops.Any, ops.BooleanValueOp,
ops.TableColumn, ir.Literal)):
return expr
elif isinstance(op, ops.ValueNode):
visited = [self._visit_filter(arg)
if isinstance(arg, ir.Expr) else arg
for arg in op.args]
unchanged = True
for new, old in zip(visited, op.args):
if new is not old:
unchanged = False
if not unchanged:
return type(expr)(type(op)(*visited))
else:
return expr
else:
raise NotImplementedError(type(op))

@@ -288,20 +369,11 @@ def _visit_filter_TopK(self, expr):
.limit(op.k))

pred = (op.arg == getattr(rank_set, op.arg.get_name()))
filtered = self.table_set.semi_join(rank_set, [pred])

# Now, fix up the now broken select set. Is this necessary?
# new_select_set = []
# for x in self.select_set:
# new_expr = ir.sub_for(x, [(self.table_set, filtered)])
# new_select_set.append(new_expr)
# self.select_set = new_select_set

self.table_set = filtered
self.table_set = self.table_set.semi_join(rank_set, [pred])

return None

#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Analysis of table set

def _collect_elements(self):
@@ -328,7 +400,7 @@ def _collect(self, expr, toplevel=False):
method = '_collect_{}'.format(type(op).__name__)

# Do not visit nodes twice
if id(op) in self.op_memo:
if op in self.op_memo:
return

if hasattr(self, method):
@@ -341,7 +413,7 @@ def _collect(self, expr, toplevel=False):
else:
raise NotImplementedError(type(op))

self.op_memo.add(id(op))
self.op_memo.add(op)

def _collect_Aggregation(self, expr, toplevel=False):
# The select set includes the grouping keys (if any), and these are
@@ -352,9 +424,9 @@ def _collect_Aggregation(self, expr, toplevel=False):
subbed_expr = self._sub(expr)
sub_op = subbed_expr.op()

self.group_by = sub_op.by
self.group_by = range(len(sub_op.by))
self.having = sub_op.having
self.select_set = self.group_by + sub_op.agg_exprs
self.select_set = sub_op.by + sub_op.agg_exprs
self.table_set = sub_op.table

self._collect(expr.op().table)
@@ -447,7 +519,7 @@ def _sub(self, what):
else:
return L.substitute_parents(what, self.sub_memo)

#----------------------------------------------------------------------
# --------------------------------------------------------------------
# Subquery analysis / extraction

def _analyze_subqueries(self):
@@ -472,16 +544,24 @@ def _analyze_subqueries(self):
# want.

# Find the subqueries, and record them in the passed query context.
self.subqueries = _extract_subqueries(self)
for expr in self.subqueries:
self.context.set_extracted(expr)
subqueries = _extract_subqueries(self)
self.subqueries = []
for expr in subqueries:
# See #173. Might have been extracted already in a parent context.
if not self.context.is_extracted(expr):
self.subqueries.append(expr)
self.context.set_extracted(expr)


def _extract_subqueries(select_stmt):
helper = _ExtractSubqueries(select_stmt)
return helper.get_result()


def _extract_noop(self, expr):
return


class _ExtractSubqueries(object):

# Helper class to make things a little easier
@@ -490,37 +570,42 @@ def __init__(self, query, greedy=False):
self.query = query
self.greedy = greedy

# Keep track of table expressions that we find in the query structure
self.observed_exprs = {}

# Maintain order of observation
self.observed_keys = []
# Ordered set that uses object .equals to find keys
self.observed_exprs = util.IbisMap()

self.expr_counts = defaultdict(lambda: 0)

def get_result(self):
self.visit(self.query.table_set)

for clause in self.query.filters:
self.visit(clause)

to_extract = []

# Read them inside-out, to avoid nested dependency issues
for k in reversed(self.observed_keys):
v = self.expr_counts[k]
for expr, key in reversed(zip(self.observed_exprs.keys,
self.observed_exprs.values)):
v = self.expr_counts[key]

if self.greedy or v > 1:
to_extract.append(self.observed_exprs[k])
to_extract.append(expr)

return to_extract

def observe(self, expr):
key = id(expr.op())

if key not in self.expr_counts:
self.observed_keys.append(key)
if expr in self.observed_exprs:
key = self.observed_exprs.get(expr)
else:
# the key only needs to be unique; IbisMap lookups match entries via .equals()
key = id(expr.op())
self.observed_exprs.set(expr, key)

self.observed_exprs[key] = expr
self.expr_counts[key] += 1

def _has_been_observed(self, expr):
return expr in self.observed_exprs

def visit(self, expr):
node = expr.op()
method = '_visit_{}'.format(type(node).__name__)
@@ -532,6 +617,11 @@ def visit(self, expr):
self._visit_join(expr)
elif isinstance(node, ops.PhysicalTable):
self._visit_physical_table(expr)
elif isinstance(node, ops.ValueNode):
for arg in node.flat_args():
if not isinstance(arg, ir.Expr):
continue
self.visit(arg)
else:
raise NotImplementedError(type(node))

@@ -540,8 +630,9 @@ def _visit_join(self, expr):
self.visit(node.left)
self.visit(node.right)

def _visit_physical_table(self, expr):
return
_visit_physical_table = _extract_noop
_visit_ExistsSubquery = _extract_noop
_visit_NotExistsSubquery = _extract_noop

def _visit_Aggregation(self, expr):
self.observe(expr)
@@ -551,7 +642,7 @@ def _visit_Distinct(self, expr):
self.observe(expr)

def _visit_Filter(self, expr):
pass
self.visit(expr.op().table)

def _visit_Limit(self, expr):
self.visit(expr.op().table)
@@ -566,6 +657,11 @@ def _visit_Projection(self, expr):
def _visit_SQLQueryResult(self, expr):
self.observe(expr)

def _visit_TableColumn(self, expr):
table = expr.op().table
if not self._has_been_observed(table):
self.visit(table)

def _visit_SelfReference(self, expr):
self.visit(expr.op().table)

@@ -587,7 +683,8 @@ def __init__(self, query, expr):
self.expr = expr

qroots = self.query.table_set._root_tables()
self.query_roots = set([id(x) for x in qroots])

self.query_roots = util.IbisSet.from_list(qroots)

# aliasing required
self.foreign_refs = []
@@ -651,7 +748,7 @@ def _ref_check(self, node, in_subquery=False):
def _is_root(self, what):
if isinstance(what, ir.Expr):
what = what.op()
return id(what) in self.query_roots
return what in self.query_roots


def _adapt_expr(expr):
@@ -661,17 +758,29 @@ def _adapt_expr(expr):
#
# Canonical case is scalar values or arrays produced by some reductions
# (simple reductions, or distinct, say)
as_is = lambda x: x

if isinstance(expr, ir.TableExpr):
handler = lambda x: x
return expr, handler
return expr, as_is

def _scalar_reduce(x):
return isinstance(x, ir.ScalarExpr) and x.is_reduction()

if isinstance(expr, ir.ScalarExpr) and expr.is_reduction():
if _scalar_reduce(expr):
table_expr = _reduction_to_aggregation(expr, agg_name='tmp')

def scalar_handler(results):
return results['tmp'][0]

return table_expr, scalar_handler
elif isinstance(expr, ir.ExprList):
exprs = expr.exprs()
for expr in exprs:
if not _scalar_reduce(expr):
raise NotImplementedError(expr)

table = L.find_base_table(exprs[0])
return table.aggregate(exprs), as_is
elif isinstance(expr, ir.ArrayExpr):
op = expr.op()

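To make the _visit_select_Histogram rewrite above concrete: when no base or binwidth is supplied, the base becomes the column minimum minus a small epsilon, the bin width becomes (max - base) / (nbins - 1), and each value is assigned floor((value - base) / binwidth). A standalone sketch of that arithmetic (editorial, not part of this diff):

import math

def histogram_bucket(value, vmin, vmax, nbins, eps=1e-13):
    # Mirrors the derived-bin branch of _visit_select_Histogram.
    base = vmin - eps
    binwidth = (vmax - base) / (nbins - 1)
    return int(math.floor((value - base) / binwidth))

# With nbins=10 over [0, 100]: the minimum falls in bucket 0 and values
# spread across buckets 0 through nbins - 1.
print([histogram_bucket(v, 0.0, 100.0, 10) for v in (0.0, 25.0, 99.9)])  # [0, 2, 8]
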
22 changes: 17 additions & 5 deletions ibis/sql/context.py
@@ -16,7 +16,7 @@
import ibis.expr.types as ir


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# The QueryContext (temporary name) will store useful information like table
# alias names for converting value expressions to SQL.

@@ -38,6 +38,8 @@ def __init__(self, indent=2, parent=None):

self.query = None

self._table_key_memo = {}

@property
def top_context(self):
if self.parent is None:
@@ -48,7 +50,14 @@ def top_context(self):
def _get_table_key(self, table):
if isinstance(table, ir.TableExpr):
table = table.op()
return id(table)

k = id(table)
if k in self._table_key_memo:
return self._table_key_memo[k]
else:
val = table._repr()
self._table_key_memo[k] = val
return val

def set_always_alias(self):
self.always_alias = True
@@ -104,14 +113,17 @@ def make_alias(self, table_expr):

def has_alias(self, table_expr, parent_contexts=False):
key = self._get_table_key(table_expr)
return self._key_in(key, 'table_aliases',
parent_contexts=parent_contexts)

if key in self.table_aliases:
def _key_in(self, key, memo_attr, parent_contexts=False):
if key in getattr(self, memo_attr):
return True

ctx = self
while parent_contexts and ctx.parent is not None:
ctx = ctx.parent
if key in ctx.table_aliases:
if key in getattr(ctx, memo_attr):
return True

return False
@@ -150,7 +162,7 @@ def is_foreign_expr(self, expr):

# The expression isn't foreign to us. For example, the parent table set
# in a correlated WHERE subquery
if self.has_alias(expr):
if self.has_alias(expr, parent_contexts=True):
return False

exprs = [self.query.table_set] + self.query.select_set
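
A minimal sketch (editorial, not part of this diff) of the memoization pattern _get_table_key now uses: the alias key is the op's repr, so structurally identical tables share an alias entry, while the id()-keyed memo only avoids recomputing that repr for the same object. The class below is an illustrative stand-in, not the ibis QueryContext.

class ReprKeyMemo(object):
    # Illustrative stand-in for QueryContext._table_key_memo behavior.

    def __init__(self):
        self._memo = {}

    def key_for(self, op):
        slot = id(op)  # cache slot only; the durable key is the repr
        if slot not in self._memo:
            self._memo[slot] = repr(op)
        return self._memo[slot]

memo = ReprKeyMemo()
op = ('table', 'functional_alltypes')
print(memo.key_for(op) == memo.key_for(op))  # True: repr computed once, reused
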
355 changes: 323 additions & 32 deletions ibis/sql/ddl.py

Large diffs are not rendered by default.

254 changes: 236 additions & 18 deletions ibis/sql/exprs.py
@@ -18,17 +18,22 @@
# table, with optional teardown if the user wants the intermediate converted
# table to be temporary.

import datetime
from io import BytesIO

import ibis.expr.analysis as L
import ibis
import ibis.expr.analytics as analytics
import ibis.expr.types as ir
import ibis.expr.operations as ops
import ibis.expr.temporal as tempo

import ibis.sql.transforms as transforms
import ibis.sql.identifiers as identifiers

import ibis.common as com
import ibis.util as util

#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Scalar and array expression formatting

_sql_type_names = {
@@ -47,8 +52,11 @@
def _cast(translator, expr):
op = expr.op()
arg = translator.translate(op.arg)
sql_type = _type_to_sql_string(op.target_type)
return 'CAST({!s} AS {!s})'.format(arg, sql_type)
if isinstance(op.arg, ir.CategoryValue) and op.target_type == 'int32':
return arg
else:
sql_type = _type_to_sql_string(op.target_type)
return 'CAST({!s} AS {!s})'.format(arg, sql_type)


def _type_to_sql_string(tval):
@@ -73,6 +81,20 @@ def _contains(translator, expr):
return '{!s} IN {!s}'.format(comp, options)


def _like(translator, expr):
op = expr.op()
arg = translator.translate(op.arg)
pattern = translator.translate(op.pattern)
return '{!s} LIKE {!s}'.format(arg, pattern)


def _rlike(translator, expr):
op = expr.op()
arg = translator.translate(op.arg)
pattern = translator.translate(op.pattern)
return '{!s} RLIKE {!s}'.format(arg, pattern)


def _not_contains(translator, expr):
# Slight code dup
op = expr.op()
@@ -113,6 +135,33 @@ def formatter(translator, expr):
return formatter


def _reduction(func_name):
def formatter(translator, expr):
op = expr.op()

if op.where is not None:
case = op.where.ifelse(op.arg, ibis.NA)
arg = translator.translate(case)
else:
arg = translator.translate(op.arg)

return '{!s}({!s})'.format(func_name, arg)
return formatter


def _fixed_arity_call(func_name, arity):
def formatter(translator, expr):
op = expr.op()
formatted_args = []
for i in xrange(arity):
arg = op.args[i]
fmt_arg = translator.translate(arg)
formatted_args.append(fmt_arg)

return '{!s}({!s})'.format(func_name, ', '.join(formatted_args))
return formatter


def _binary_infix_op(infix_sym):
def formatter(translator, expr):
op = expr.op()
@@ -183,8 +232,18 @@ def _string_literal_format(expr):
return "'{!s}'".format(value.replace("'", "\\'"))


def quote_identifier(name, quotechar='`'):
if name.count(' ') or name in identifiers.impala_identifiers:
def _timestamp_literal_format(expr):
value = expr.op().value
if isinstance(value, datetime.datetime):
if value.microsecond != 0:
raise ValueError(value)
value = value.strftime('%Y-%m-%d %H:%M:%S')

return "'{!s}'".format(value)


def quote_identifier(name, quotechar='`', force=False):
if force or name.count(' ') or name in identifiers.impala_identifiers:
return '{0}{1}{0}'.format(quotechar, name)
else:
return name
@@ -254,14 +313,104 @@ def _searched_case(translator, expr):
return formatter.get_result()


def _bucket(translator, expr):
import operator

op = expr.op()
stmt = ibis.case()

if op.closed == 'left':
l_cmp = operator.le
r_cmp = operator.lt
else:
l_cmp = operator.lt
r_cmp = operator.le

user_num_buckets = len(op.buckets) - 1

bucket_id = 0
if op.include_under:
if user_num_buckets > 0:
cmp = operator.lt if op.close_extreme else r_cmp
else:
cmp = operator.le if op.closed == 'right' else operator.lt
stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
bucket_id += 1

for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
if (op.close_extreme
and ((op.closed == 'right' and j == 0) or
(op.closed == 'left' and j == (user_num_buckets - 1)))):
stmt = stmt.when((lower <= op.arg) & (op.arg <= upper),
bucket_id)
else:
stmt = stmt.when(l_cmp(lower, op.arg) & r_cmp(op.arg, upper),
bucket_id)
bucket_id += 1

if op.include_over:
if user_num_buckets > 0:
cmp = operator.lt if op.close_extreme else l_cmp
else:
cmp = operator.lt if op.closed == 'right' else operator.le

stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
bucket_id += 1

case_expr = stmt.end().name(expr._name)
return _searched_case(translator, case_expr)


def _category_label(translator, expr):
op = expr.op()

stmt = op.arg.case()
for i, label in enumerate(op.labels):
stmt = stmt.when(i, label)

if op.nulls is not None:
stmt = stmt.else_(op.nulls)

case_expr = stmt.end().name(expr._name)
return _simple_case(translator, case_expr)


def _table_array_view(translator, expr):
ctx = translator.context
table = expr.op().table
query = ctx.get_formatted_query(table)
return '(\n{}\n)'.format(util.indent(query, ctx.indent))


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Timestamp arithmetic and other functions

def _timestamp_delta(translator, expr):
op = expr.op()
formatted_arg = translator.translate(op.arg)
return _timestamp_format_offset(op.offset, formatted_arg)


_impala_delta_functions = {
tempo.Year: 'years_add',
tempo.Month: 'months_add',
tempo.Week: 'weeks_add',
tempo.Day: 'days_add',
tempo.Hour: 'hours_add',
tempo.Minute: 'minutes_add',
tempo.Second: 'seconds_add',
tempo.Millisecond: 'milliseconds_add',
tempo.Microsecond: 'microseconds_add',
tempo.Nanosecond: 'nanoseconds_add'
}


def _timestamp_format_offset(offset, arg):
f = _impala_delta_functions[type(offset)]
return '{}({}, {})'.format(f, arg, offset.n)


# ---------------------------------------------------------------------
# Semi/anti-join supports


@@ -337,6 +486,32 @@ def extract_field_formatter(translator, expr):
return extract_field_formatter


def _timestamp_from_unix(translator, expr):
op = expr.op()

val = op.arg
if op.unit == 'ms':
val = (val / 1000).cast('int32')
elif op.unit == 'us':
val = (val / 1000000).cast('int32')

arg = _from_unixtime(translator, val)
return 'CAST({} AS timestamp)'.format(arg)


def _from_unixtime(translator, expr):
arg = translator.translate(expr)
return 'from_unixtime({}, "yyyy-MM-dd HH:mm:ss")'.format(arg)


def _coalesce_like(func_name):
def coalesce_like_formatter(translator, expr):
op = expr.op()
trans_args = [translator.translate(arg) for arg in op.args]
return '{}({})'.format(func_name, ', '.join(trans_args))
return coalesce_like_formatter


def _substring(translator, expr):
op = expr.op()
arg_formatted = translator.translate(op.arg)
@@ -365,6 +540,16 @@ def _round(translator, expr):
return 'round({})'.format(arg_formatted)


def _hash(translator, expr):
op = expr.op()
arg_formatted = translator.translate(op.arg)

if op.how == 'fnv':
return 'fnv_hash({})'.format(arg_formatted)
else:
raise NotImplementedError(op.how)


def _log(translator, expr):
op = expr.op()
arg_formatted = translator.translate(op.arg)
@@ -388,6 +573,8 @@ def _literal(translator, expr):
typeclass = 'string'
elif isinstance(expr, ir.NumericValue):
typeclass = 'number'
elif isinstance(expr, ir.TimestampValue):
typeclass = 'timestamp'
else:
raise NotImplementedError

@@ -401,7 +588,8 @@ def _null_literal(translator, expr):
_literal_formatters = {
'boolean': _boolean_literal_format,
'number': _number_literal_format,
'string': _string_literal_format
'string': _string_literal_format,
'timestamp': _timestamp_literal_format
}


@@ -421,6 +609,9 @@ def _not_implemented(translator, expr):
ops.IsNull: _is_null,
ops.Negate: _negate,

ops.IfNull: _fixed_arity_call('isnull', 2),
ops.NullIf: _fixed_arity_call('nullif', 2),

ops.ZeroIfNull: _unary_op('zeroifnull'),

ops.Abs: _unary_op('abs'),
@@ -432,18 +623,26 @@ def _not_implemented(translator, expr):
ops.Sign: _unary_op('sign'),
ops.Sqrt: _unary_op('sqrt'),

ops.Hash: _hash,

ops.Log: _log,
ops.Ln: _unary_op('ln'),
ops.Log2: _unary_op('log2'),
ops.Log10: _unary_op('log10'),

# Unary aggregates
ops.Mean: _unary_op('avg'),
ops.Sum: _unary_op('sum'),
ops.Max: _unary_op('max'),
ops.Min: _unary_op('min'),
ops.DecimalPrecision: _unary_op('precision'),
ops.DecimalScale: _unary_op('scale'),

ops.Count: _unary_op('count'),
# Unary aggregates
ops.CMSMedian: _reduction('appx_median'),
ops.HLLCardinality: _reduction('ndv'),
ops.Mean: _reduction('avg'),
ops.Sum: _reduction('sum'),
ops.Max: _reduction('max'),
ops.Min: _reduction('min'),
ops.GroupConcat: _fixed_arity_call('group_concat', 2),

ops.Count: _reduction('count'),
ops.CountDistinct: _count_distinct,
}

@@ -454,7 +653,8 @@ def _not_implemented(translator, expr):
ops.Subtract: _binary_infix_op('-'),
ops.Multiply: _binary_infix_op('*'),
ops.Divide: _binary_infix_op('/'),
ops.Power: _binary_infix_op('^'),
ops.Power: _fixed_arity_call('pow', 2),
ops.Modulus: _binary_infix_op('%'),

# Comparisons
ops.Equals: _binary_infix_op('='),
@@ -492,25 +692,42 @@ def _not_implemented(translator, expr):


_other_ops = {
ops.Literal: _literal,
ops.Any: _any_exists,

ops.E: lambda *args: 'e()',

ir.Literal: _literal,
ops.NullLiteral: _null_literal,

ops.ValueList: _value_list,

ops.Cast: _cast,

ops.Coalesce: _coalesce_like('coalesce'),
ops.Greatest: _coalesce_like('greatest'),
ops.Least: _coalesce_like('least'),

ops.Where: _fixed_arity_call('if', 3),

ops.StringSQLLike: _like,
ops.RegexSearch: _rlike,

ops.Between: _between,
ops.Contains: _contains,
ops.NotContains: _not_contains,

analytics.Bucket: _bucket,
analytics.CategoryLabel: _category_label,

ops.SimpleCase: _simple_case,
ops.SearchedCase: _searched_case,

ops.TableColumn: _table_column,

ops.TableArrayView: _table_array_view,

ops.Any: _any_exists,
ops.TimestampDelta: _timestamp_delta,
ops.TimestampFromUNIX: _timestamp_from_unix,

transforms.ExistsSubquery: _exists_subquery,
transforms.NotExistsSubquery: _exists_subquery
@@ -547,7 +764,8 @@ def get_result(self):
if self._needs_name(self.expr):
# TODO: this could fail in various ways
name = self.expr.get_name()
translated = _name_expr(translated, quote_identifier(name))
translated = _name_expr(translated,
quote_identifier(name, force=True))
return translated

def _needs_name(self, expr):
Expand Down
565 changes: 419 additions & 146 deletions ibis/sql/tests/test_compiler.py

Large diffs are not rendered by default.

406 changes: 388 additions & 18 deletions ibis/sql/tests/test_exprs.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions ibis/sql/transforms.py
@@ -16,6 +16,7 @@
import ibis.expr.analysis as L
import ibis.expr.operations as ops
import ibis.expr.types as ir
import ibis.util as util


class ExistsSubquery(ir.Node):
@@ -51,7 +52,7 @@ def __init__(self, context, expr, parent_table):
self.parent_table = parent_table

qroots = self.parent_table._root_tables()
self.query_roots = set([id(x) for x in qroots])
self.query_roots = util.IbisSet.from_list(qroots)

def get_result(self):
self.foreign_table = None
Expand Down Expand Up @@ -108,4 +109,4 @@ def _ref_check(self, expr):
def _is_root(self, what):
if isinstance(what, ir.Expr):
what = what.op()
return id(what) in self.query_roots
return what in self.query_roots
4 changes: 2 additions & 2 deletions ibis/tasks.py
@@ -175,7 +175,7 @@ def execute(self):
self.lock.release()


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Ping pong task for testing


@@ -189,7 +189,7 @@ def run(self):

register_task('ping', PingPongTask)

#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Aggregation execution tasks


254 changes: 254 additions & 0 deletions ibis/tests/test_filesystems.py
@@ -0,0 +1,254 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from posixpath import join as pjoin
import os
import shutil
import unittest

from hdfs import InsecureClient
import pytest

from ibis.filesystems import HDFS, WebHDFS
import ibis.util as util


class MockHDFS(HDFS):

def __init__(self):
self.ls_result = []

def set_ls(self, results):
self.ls_result = results

def ls(self, *args, **kwargs):
return self.ls_result


class TestHDFSRandom(unittest.TestCase):

def setUp(self):
self.con = MockHDFS()

def test_find_any_file(self):
ls_contents = [(u'/path/foo',
{u'type': u'DIRECTORY'}),
(u'/path/bar.tmp',
{u'type': u'FILE'}),
(u'/path/baz.copying',
{u'type': u'FILE'}),
(u'/path/_SUCCESS',
{u'type': u'FILE'}),
(u'/path/.peekaboo',
{u'type': u'FILE'}),
(u'/path/0.parq',
{u'type': u'FILE'}),
(u'/path/_FILE',
{u'type': u'DIRECTORY'})]

self.con.set_ls(ls_contents)

result = self.con.find_any_file('/path')
assert result == '/path/0.parq'


class TestHDFSE2E(unittest.TestCase):

@classmethod
def setUpClass(cls):
cls.host = os.environ.get('IBIS_TEST_HOST', 'localhost')
cls.protocol = os.environ.get('IBIS_TEST_PROTOCOL', 'hiveserver2')
cls.port = os.environ.get('IBIS_TEST_PORT', 21050)

cls.test_dir = '/{}'.format(util.guid())

# Impala dev environment uses port 5070 for HDFS web interface

hdfs_host = 'localhost'
webhdfs_port = 5070
url = 'http://{}:{}'.format(hdfs_host, webhdfs_port)

try:
cls.hdfs_client = InsecureClient(url)
cls.hdfs = WebHDFS(cls.hdfs_client)
cls.hdfs.mkdir(cls.test_dir)
except Exception as e:
pytest.skip('Could not connect to HDFS: {}'.format(e.message))

@classmethod
def tearDownClass(cls):
try:
cls.hdfs.rmdir(cls.test_dir)
except:
pass

def setUp(self):
self.test_files = []

def tearDown(self):
self._delete_test_files()

def _delete_test_files(self):
for path in self.test_files:
try:
os.remove(path)
except os.error:
pass

def _make_random_file(self, units=100, directory=None):
path = util.guid()

if directory:
path = os.path.join(directory, path)

with open(path, 'wb') as f:
for i in xrange(units):
f.write(util.guid())

self.test_files.append(path)
return path

def test_mkdir(self):
path = pjoin(self.test_dir, 'mkdir-test')
self.hdfs.mkdir(path)
assert self.hdfs.exists(path)

def test_write_get_delete_file(self):
dirpath = pjoin(self.test_dir, 'write-delete-test')
self.hdfs.mkdir(dirpath)

lpath = self._make_random_file()
fpath = pjoin(dirpath, lpath)

self.hdfs.put(fpath, lpath)
assert self.hdfs.exists(fpath)

try:
dpath = util.guid()
self.hdfs.get(fpath, dpath)
assert _contents_equal(dpath, lpath)
os.remove(dpath)
finally:
self.hdfs.rm(fpath)
assert not self.hdfs.exists(fpath)

def test_overwrite_file(self):
pass

def test_write_get_directory(self):
local_dir = util.guid()
local_download_dir = util.guid()

K = 5

os.mkdir(local_dir)

try:
for i in xrange(K):
self._make_random_file(directory=local_dir)

remote_dir = pjoin(self.test_dir, local_dir)
self.hdfs.put(remote_dir, local_dir)

assert self.hdfs.exists(remote_dir)
assert len(self.hdfs.ls(remote_dir)) == K

# download directory and check contents
self.hdfs.get(remote_dir, local_download_dir)

_check_directories_equal(local_dir, local_download_dir)

self._try_delete_directory(local_download_dir)

self.hdfs.rmdir(remote_dir)
assert not self.hdfs.exists(remote_dir)
finally:
shutil.rmtree(local_dir)

def test_get_directory_nested_dirs(self):
local_dir = util.guid()
local_download_dir = util.guid()

K = 5

os.mkdir(local_dir)

try:
for i in xrange(K):
self._make_random_file(directory=local_dir)

nested_dir = os.path.join(local_dir, 'nested-dir')
shutil.copytree(local_dir, nested_dir)

remote_dir = pjoin(self.test_dir, local_dir)
self.hdfs.put(remote_dir, local_dir)

# download directory and check contents
self.hdfs.get(remote_dir, local_download_dir)

_check_directories_equal(local_dir, local_download_dir)

self._try_delete_directory(local_download_dir)

self.hdfs.rmdir(remote_dir)
assert not self.hdfs.exists(remote_dir)
finally:
shutil.rmtree(local_dir)

def _try_delete_directory(self, path):
try:
shutil.rmtree(path)
except os.error:
pass

def test_ls(self):
test_dir = pjoin(self.test_dir, 'ls-test')
self.hdfs.mkdir(test_dir)
for i in xrange(10):
local_path = self._make_random_file()
hdfs_path = pjoin(test_dir, local_path)
self.hdfs.put(hdfs_path, local_path)

assert len(self.hdfs.ls(test_dir)) == 10


def _check_directories_equal(left, right):
left_files = _get_all_files(left)
right_files = _get_all_files(right)

assert set(left_files.keys()) == set(right_files.keys())

for relpath, labspath in left_files.items():
rabspath = right_files[relpath]
assert _contents_equal(rabspath, labspath)


def _contents_equal(left, right):
with open(left) as lf:
with open(right) as rf:
return lf.read() == rf.read()


def _get_all_files(path):
paths = {}
for dirpath, _, filenames in os.walk(path):
rel_dir = os.path.relpath(dirpath, path)
if rel_dir == '.':
rel_dir = ''
for name in filenames:
abspath = os.path.join(dirpath, name)
relpath = os.path.join(rel_dir, name)
paths[relpath] = abspath

return paths
600 changes: 560 additions & 40 deletions ibis/tests/test_impala_e2e.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions ibis/tests/test_server.py
@@ -24,6 +24,11 @@
from ibis.server import IbisServerNode


# Skip this module's tests on non-POSIX systems (e.g. Windows)
pytestmark = pytest.mark.skipif(not hasattr(os, 'setpgid'),
reason='non-POSIX system')


def port_is_closed(port):
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
1 change: 0 additions & 1 deletion ibis/uda.py
@@ -13,7 +13,6 @@
# limitations under the License.

from ibis.cloudpickle import dumps as pickle_dump
from cPickle import loads as pickle_load

import base64

56 changes: 54 additions & 2 deletions ibis/util.py
@@ -14,8 +14,12 @@


def guid():
from ibis.comms import uuid4_hex
return uuid4_hex()
try:
from ibis.comms import uuid4_hex
return uuid4_hex()
except ImportError:
from uuid import uuid4
return uuid4().get_hex()


def bytes_to_uint8_array(val, width=70):
@@ -62,3 +66,51 @@ def all_of(values, t):
if not isinstance(x, t):
return False
return True


def promote_list(val):
if not isinstance(val, list):
val = [val]
return val


class IbisSet(object):

def __init__(self, keys=None):
self.keys = keys or []

@classmethod
def from_list(cls, keys):
return IbisSet(keys)

def __contains__(self, obj):
for other in self.keys:
if obj.equals(other):
return True
return False

def add(self, obj):
self.keys.append(obj)


class IbisMap(object):

def __init__(self):
self.keys = []
self.values = []

def __contains__(self, obj):
for other in self.keys:
if obj.equals(other):
return True
return False

def set(self, key, value):
self.keys.append(key)
self.values.append(value)

def get(self, key):
for k, v in zip(self.keys, self.values):
if key.equals(k):
return v
raise KeyError(key)
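
A short sketch (editorial, not part of this diff) of why these containers replace id()-keyed sets and dicts in the compiler: membership is decided by the stored objects' own equals() method, so structurally identical ops match even when they are distinct Python objects. The toy Node class here is an assumption for illustration.

from ibis.util import IbisMap, IbisSet

class Node(object):
    def __init__(self, name):
        self.name = name

    def equals(self, other):
        # structural equality, the way ibis ops/exprs compare
        return isinstance(other, Node) and self.name == other.name

seen = IbisSet()
seen.add(Node('scan:alltypes'))
print(Node('scan:alltypes') in seen)  # True, despite being a different object

memo = IbisMap()
memo.set(Node('scan:alltypes'), 'alias t0')
print(memo.get(Node('scan:alltypes')))  # 'alias t0'
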
3 changes: 3 additions & 0 deletions requirements.txt
@@ -3,3 +3,6 @@ numpy>=1.7.0
pandas>=0.12.0
impyla>=0.9.1
psutil==0.6.1
snakebite
hdfs[kerberos]
six
176 changes: 176 additions & 0 deletions scripts/load_test_data.py
@@ -0,0 +1,176 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Populates the ibis_testing Impala database

from posixpath import join as pjoin
import os
import posixpath
import shutil

from ibis.util import guid
import ibis

IMPALA_HOST = 'localhost'
HDFS_HOST = 'localhost'
WEBHDFS_PORT = 5070
TEST_DB = 'ibis_testing'
TEST_DATA_DIR = 'ibis-testing-data'
TEST_DATA_HDFS_LOC = '/__ibis/ibis-testing-data'


def make_connection():
ic = ibis.impala_connect(host=IMPALA_HOST)
hdfs = ibis.hdfs_connect(host=HDFS_HOST, port=WEBHDFS_PORT)
con = ibis.make_client(ic, hdfs_client=hdfs)

return con

# ----------------------------------------------------------------------
# Functions for creating the test data archive to begin with

TMP_DB_LOCATION = '/__ibis/{0}'.format(guid())
TMP_DB = guid()

def make_temp_database(con):
if con.exists_database(TMP_DB):
con.drop_database(TMP_DB, force=True)
con.create_database(TMP_DB, path=TMP_DB_LOCATION)
print('Created database {0} at {1}'.format(TMP_DB, TMP_DB_LOCATION))


def cleanup_temporary_stuff(con):
con.drop_database(TMP_DB, force=True)
assert not con.hdfs.exists(TMP_DB_LOCATION)

def download_parquet_files(con):
parquet_path = pjoin(TEST_DATA_DIR, 'parquet')
print("Downloading {0}".format(parquet_path))
con.hdfs.get(TMP_DB_LOCATION, parquet_path)


def download_avro_files(con):
avro_path = '/test-warehouse/tpch.region_avro'
os.mkdir(os.path.join(TEST_DATA_DIR, 'avro'))
print("Downloading {0}".format(avro_path))
con.hdfs.get(avro_path, pjoin(TEST_DATA_DIR, 'avro', 'tpch.region'))


def generate_csv_files():
import numpy as np
import pandas as pd
import pandas.util.testing as tm

N = 10
nfiles = 10

csv_base = os.path.join(TEST_DATA_DIR, 'csv')
os.mkdir(csv_base)

df = pd.DataFrame({
'foo': [tm.rands(10) for _ in xrange(N)],
'bar': np.random.randn(N),
'baz': np.random.randint(0, 100, size=N)
}, columns=['foo', 'bar', 'baz'])

for i in xrange(nfiles):
csv_path = os.path.join(csv_base, '{}.csv'.format(i))
print('Writing {}'.format(csv_path))
df.to_csv(csv_path, index=False, header=False)


def scrape_parquet_files(con):
to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')]
to_scrape.append(('functional', 'alltypes'))
for db, tname in to_scrape:
table = con.table(tname, database=db)
new_name = '{}_{}'.format(db, tname)
print('Creating {}'.format(new_name))
con.create_table(new_name, table, database=TMP_DB)


def make_local_test_archive():
con = make_connection()
make_temp_database(con)

try:
scrape_parquet_files(con)

if os.path.exists(TEST_DATA_DIR):
shutil.rmtree(TEST_DATA_DIR)
os.mkdir(TEST_DATA_DIR)

download_parquet_files(con)
download_avro_files(con)
generate_csv_files()
finally:
cleanup_temporary_stuff(con)

# ----------------------------------------------------------------------
#


def write_data_to_hdfs(con):
# TODO per #278, write directly from the gzipped tarball
con.hdfs.put(TEST_DATA_HDFS_LOC, TEST_DATA_DIR,
verbose=True, overwrite=True)


def create_test_database(con):
if con.exists_database(TEST_DB):
con.drop_database(TEST_DB, force=True)
con.create_database(TEST_DB)
print('Created database {0}'.format(TEST_DB))


def create_parquet_tables(con):
parquet_files = con.hdfs.ls(pjoin(TEST_DATA_HDFS_LOC, 'parquet'))

schemas = {
'functional_alltypes': ibis.schema(
[('id', 'int32'),
('bool_col', 'boolean'),
('tinyint_col', 'int8'),
('smallint_col', 'int16'),
('int_col', 'int32'),
('bigint_col', 'int64'),
('float_col', 'float'),
('double_col', 'double'),
('date_string_col', 'string'),
('string_col', 'string'),
('timestamp_col', 'timestamp'),
('year', 'int32'),
('month', 'int32')])
}

for path in parquet_files:
head, table_name = posixpath.split(path)
print('Creating {0}'.format(table_name))

# if no schema is given, it will be inferred from the Parquet file
schema = schemas.get(table_name)

con.parquet_file(path, schema=schema, name=table_name,
database=TEST_DB, persist=True)


def setup_test_data():
con = make_connection()
write_data_to_hdfs(con)
create_test_database(con)
create_parquet_tables(con)


if __name__ == '__main__':
setup_test_data()
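
As a hypothetical follow-up (editorial, not part of this diff), once setup_test_data() has populated the cluster, the same connection recipe used above gives access to the loaded tables; the host and port are the defaults assumed in this script.

import ibis

ic = ibis.impala_connect(host='localhost')
hdfs = ibis.hdfs_connect(host='localhost', port=5070)
con = ibis.make_client(ic, hdfs_client=hdfs)

# The Parquet-backed test table registered by create_parquet_tables
alltypes = con.table('functional_alltypes', database='ibis_testing')
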
15 changes: 13 additions & 2 deletions setup.py
@@ -41,10 +41,18 @@
from distutils.extension import Extension

MAJOR = 0
MINOR = 1
MINOR = 2
MICRO = 0
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)

ISRELEASED = True

if not ISRELEASED:
VERSION += '.dev'

# todo: acquire git hash


from distutils.command.clean import clean as _clean
class clean(_clean):
def run(self):
@@ -59,6 +67,9 @@ def run(self):

common_include = ['ibis/src', np.get_include()]

with open('requirements.txt') as f:
file_reqs = f.read().splitlines()
requirements = requirements + file_reqs

if COMMS_EXT_ENABLED:
comms_ext_libraries = []
@@ -89,7 +100,7 @@ def run(self):
ext_modules=extensions,
cmdclass=cmdclass,
install_requires=requirements,
description="Python analytics framework for Impala",
description="Productivity-centric Python Big Data Framework",
license='Apache License, Version 2.0',
maintainer="Wes McKinney",
maintainer_email="wes@cloudera.com"