439 changes: 12 additions & 427 deletions ibis/sql/ddl.py

Large diffs are not rendered by default.

180 changes: 97 additions & 83 deletions ibis/sql/exprs.py
@@ -16,13 +16,16 @@
from io import BytesIO

import ibis
import ibis.expr.analysis as L
import ibis.expr.analytics as analytics
import ibis.expr.datatypes as dt
import ibis.expr.types as ir
import ibis.expr.operations as ops
import ibis.expr.temporal as tempo

import ibis.sql.transforms as transforms
import ibis.sql.identifiers as identifiers

import ibis.impala.identifiers as identifiers

import ibis.common as com
import ibis.util as util
@@ -39,7 +42,8 @@
'double': 'double',
'string': 'string',
'boolean': 'boolean',
'timestamp': 'timestamp'
'timestamp': 'timestamp',
'decimal': 'decimal',
}


@@ -56,10 +60,10 @@ def _cast(translator, expr):


def _type_to_sql_string(tval):
if isinstance(tval, ir.DecimalType):
if isinstance(tval, dt.Decimal):
return 'decimal({0},{1})'.format(tval.precision, tval.scale)
else:
return _sql_type_names[tval]
return _sql_type_names[tval.name()]


def _between(translator, expr):
@@ -83,6 +87,8 @@ def _not_null(translator, expr):
ops.CumulativeMin: ops.Min,
ops.CumulativeMax: ops.Max,
ops.CumulativeMean: ops.Mean,
ops.CumulativeAny: ops.Any,
ops.CumulativeAll: ops.All,
}


@@ -95,7 +101,13 @@ def _cumulative_to_window(expr, window):

klass = _cumulative_to_reduction[type(op)]
new_op = klass(*op.args)
return expr._factory(new_op, name=expr._name), win
new_expr = expr._factory(new_op, name=expr._name)

if type(new_op) in _expr_rewrites:
new_expr = _expr_rewrites[type(new_op)](new_expr)

new_expr = L.windowize_function(new_expr, win)
return new_expr


def _window(translator, expr):
@@ -115,7 +127,6 @@ def _window(translator, expr):
ops.CMSMedian,
ops.GroupConcat,
ops.HLLCardinality,

)

if isinstance(window_op, _unsupported_reductions):
@@ -124,8 +135,8 @@
.format(type(window_op)))

if isinstance(window_op, ops.CumulativeOp):
arg, window = _cumulative_to_window(arg, window)
window_op = arg.op()
arg = _cumulative_to_window(arg, window)
return translator.translate(arg)

# Some analytic functions need to have the expression of interest in
# the ORDER BY part of the window clause
@@ -276,6 +287,28 @@ def formatter(translator, expr):
return formatter


def _any_expand(expr):
arg = expr.op().args[0]
return arg.sum() > 0


def _notany_expand(expr):
arg = expr.op().args[0]
return arg.sum() == 0


def _all_expand(expr):
arg = expr.op().args[0]
t = ir.find_base_table(arg)
return arg.sum() == t.count()


def _notall_expand(expr):
arg = expr.op().args[0]
t = ir.find_base_table(arg)
return arg.sum() < t.count()


def _fixed_arity_call(func_name, arity):

def formatter(translator, expr):
@@ -563,26 +596,6 @@ def _timestamp_format_offset(offset, arg):
# Semi/anti-join supports


def _any_exists(translator, expr):
# Foreign references will have been catalogued by the correlated
# ref-checking code. However, we need to rewrite this expression as a query
# of the type
#
# SELECT 1
# FROM {foreign_ref}
# WHERE {correlated_filter}
#
# It's possible there could be multiple predicates inside the Any involving
# more than one foreign reference. Will just disallow this for now until
# someone *really* needs it.
# op = expr.op()
# ctx = translator.context

# comp_op = op.arg.op()

raise NotImplementedError


def _exists_subquery(translator, expr):
op = expr.op()
ctx = translator.context
@@ -605,7 +618,8 @@ def _exists_subquery(translator, expr):

def _table_column(translator, expr):
op = expr.op()
field_name = quote_identifier(op.name)
field_name = op.name
quoted_name = quote_identifier(field_name, force=True)

table = op.table
ctx = translator.context
@@ -619,9 +633,9 @@
if ctx.need_aliases():
alias = ctx.get_alias(table)
if alias is not None:
field_name = '{0}.{1}'.format(alias, field_name)
quoted_name = '{0}.{1}'.format(alias, quoted_name)

return field_name
return quoted_name


def _extract_field(sql_attr):
@@ -838,7 +852,49 @@ def _not_implemented(translator, expr):
raise NotImplementedError


_unary_ops = {
_subtract_one = '{0} - 1'.format


_expr_transforms = {
ops.RowNumber: _subtract_one,
ops.DenseRank: _subtract_one,
ops.MinRank: _subtract_one,
}


_expr_rewrites = {
ops.Any: _any_expand,
ops.All: _all_expand,
ops.NotAny: _notany_expand,
ops.NotAll: _notall_expand,
}


_binary_infix_ops = {
# Binary operations
ops.Add: _binary_infix_op('+'),
ops.Subtract: _binary_infix_op('-'),
ops.Multiply: _binary_infix_op('*'),
ops.Divide: _binary_infix_op('/'),
ops.Power: _fixed_arity_call('pow', 2),
ops.Modulus: _binary_infix_op('%'),

# Comparisons
ops.Equals: _binary_infix_op('='),
ops.NotEquals: _binary_infix_op('!='),
ops.GreaterEqual: _binary_infix_op('>='),
ops.Greater: _binary_infix_op('>'),
ops.LessEqual: _binary_infix_op('<='),
ops.Less: _binary_infix_op('<'),

# Boolean comparisons
ops.And: _binary_infix_op('AND'),
ops.Or: _binary_infix_op('OR'),
ops.Xor: _xor,
}


_operation_registry = {
# Unary operations
ops.NotNull: _not_null,
ops.IsNull: _is_null,
@@ -850,6 +906,7 @@ def _not_implemented(translator, expr):
ops.ZeroIfNull: _unary_op('zeroifnull'),

ops.Abs: _unary_op('abs'),
ops.BaseConvert: _fixed_arity_call('conv', 3),
ops.Ceil: _unary_op('ceil'),
ops.Floor: _unary_op('floor'),
ops.Exp: _unary_op('exp'),
@@ -879,33 +936,8 @@ def _not_implemented(translator, expr):

ops.Count: _reduction('count'),
ops.CountDistinct: _count_distinct,
}


_binary_infix_ops = {
# Binary operations
ops.Add: _binary_infix_op('+'),
ops.Subtract: _binary_infix_op('-'),
ops.Multiply: _binary_infix_op('*'),
ops.Divide: _binary_infix_op('/'),
ops.Power: _fixed_arity_call('pow', 2),
ops.Modulus: _binary_infix_op('%'),

# Comparisons
ops.Equals: _binary_infix_op('='),
ops.NotEquals: _binary_infix_op('!='),
ops.GreaterEqual: _binary_infix_op('>='),
ops.Greater: _binary_infix_op('>'),
ops.LessEqual: _binary_infix_op('<='),
ops.Less: _binary_infix_op('<'),

# Boolean comparisons
ops.And: _binary_infix_op('AND'),
ops.Or: _binary_infix_op('OR'),
ops.Xor: _xor,
}

_string_ops = {
# string operations
ops.StringLength: _unary_op('length'),
ops.StringAscii: _unary_op('ascii'),
ops.Lowercase: _unary_op('lower'),
Expand All @@ -929,10 +961,8 @@ def _not_implemented(translator, expr):
ops.RegexExtract: _fixed_arity_call('regexp_extract', 3),
ops.RegexReplace: _fixed_arity_call('regexp_replace', 3),
ops.ParseURL: _parse_url,
}


_timestamp_ops = {
# Timestamp operations
ops.TimestampNow: lambda *args: 'now()',
ops.ExtractYear: _extract_field('year'),
ops.ExtractMonth: _extract_field('month'),
Expand All @@ -941,13 +971,9 @@ def _not_implemented(translator, expr):
ops.ExtractMinute: _extract_field('minute'),
ops.ExtractSecond: _extract_field('second'),
ops.ExtractMillisecond: _extract_field('millisecond'),
ops.Truncate: _truncate
}


_other_ops = {
ops.Any: _any_exists,
ops.Truncate: _truncate,

# Other operations
ops.E: lambda *args: 'e()',

ir.Literal: _literal,
@@ -996,23 +1022,7 @@ def _not_implemented(translator, expr):
ops.WindowOp: _window
}


_subtract_one = '{0} - 1'.format


_expr_transforms = {
ops.RowNumber: _subtract_one,
ops.DenseRank: _subtract_one,
ops.MinRank: _subtract_one,
}


_operation_registry = {}
_operation_registry.update(_unary_ops)
_operation_registry.update(_binary_infix_ops)
_operation_registry.update(_string_ops)
_operation_registry.update(_timestamp_ops)
_operation_registry.update(_other_ops)


class ExprTranslator(object):
@@ -1061,6 +1071,10 @@ def translate(self, expr):
# The operation node type the typed expression wraps
op = expr.op()

if type(op) in _expr_rewrites:
expr = _expr_rewrites[type(op)](expr)
op = expr.op()

# TODO: use op MRO for subclasses instead of this isinstance spaghetti
if isinstance(op, ir.Parameter):
return self._trans_param(expr)
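
As a quick illustration of the Any/All handling introduced above: a minimal sketch, assuming a table t with a boolean column b built through ibis's public API (the table and column names are illustrative).

import ibis

# hypothetical table, used only for illustration
t = ibis.table([('b', 'boolean')], 'alltypes')

# The rewrites registered in _expr_rewrites replace boolean reductions with
# sum comparisons before SQL generation, roughly:
any_expr = t.b.sum() > 0           # Any    -> sum(`b`) > 0
not_any  = t.b.sum() == 0          # NotAny -> sum(`b`) = 0
all_expr = t.b.sum() == t.count()  # All    -> sum(`b`) = count(*)
not_all  = t.b.sum() < t.count()   # NotAll -> sum(`b`) < count(*)

The cumulative variants take the same path once _cumulative_to_window lowers them to windowed reductions over the same window.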
Empty file added ibis/sql/presto/__init__.py
Empty file added ibis/sql/redshift/__init__.py
Empty file added ibis/sql/sqlite/__init__.py
727 changes: 205 additions & 522 deletions ibis/sql/tests/test_compiler.py

Large diffs are not rendered by default.

344 changes: 180 additions & 164 deletions ibis/sql/tests/test_exprs.py

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions ibis/sql/tests/test_window.py
@@ -33,7 +33,7 @@ def test_aggregate_in_projection(self):
proj = t[t, (t.f / t.f.sum()).name('normed_f')]

expected = """\
SELECT *, f / sum(f) OVER () AS `normed_f`
SELECT *, `f` / sum(`f`) OVER () AS `normed_f`
FROM alltypes"""
self._check_sql(proj, expected)

@@ -48,19 +48,19 @@ def test_add_default_order_by(self):
grouped = t.group_by('g')
proj = grouped.mutate([lag, diff, first, last, lag2])
expected = """\
SELECT *, lag(f) OVER (PARTITION BY g ORDER BY f) AS `lag`,
lead(f) OVER (PARTITION BY g ORDER BY f) - f AS `fwd_diff`,
first_value(f) OVER (PARTITION BY g ORDER BY f) AS `first`,
last_value(f) OVER (PARTITION BY g ORDER BY f) AS `last`,
lag(f) OVER (PARTITION BY g ORDER BY d) AS `lag2`
SELECT *, lag(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `lag`,
lead(`f`) OVER (PARTITION BY `g` ORDER BY `f`) - `f` AS `fwd_diff`,
first_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `first`,
last_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `last`,
lag(`f`) OVER (PARTITION BY `g` ORDER BY `d`) AS `lag2`
FROM alltypes"""
self._check_sql(proj, expected)

def test_window_frame_specs(self):
t = self.con.table('alltypes')

ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo`
FROM alltypes"""

cases = [
@@ -134,8 +134,8 @@ def test_nested_analytic_function(self):
expr = (t.f - t.f.lag()).lag().over(w).name('foo')
result = t.projection([expr])
expected = """\
SELECT lag(f - lag(f) OVER (ORDER BY f)) \
OVER (ORDER BY f) AS `foo`
SELECT lag(`f` - lag(`f`) OVER (ORDER BY `f`)) \
OVER (ORDER BY `f`) AS `foo`
FROM alltypes"""
self._check_sql(result, expected)

@@ -145,8 +145,8 @@ def test_rank_functions(self):
proj = t[t.g, t.f.rank().name('minr'),
t.f.dense_rank().name('denser')]
expected = """\
SELECT g, rank() OVER (ORDER BY f) - 1 AS `minr`,
dense_rank() OVER (ORDER BY f) - 1 AS `denser`
SELECT `g`, rank() OVER (ORDER BY `f`) - 1 AS `minr`,
dense_rank() OVER (ORDER BY `f`) - 1 AS `denser`
FROM alltypes"""
self._check_sql(proj, expected)

@@ -159,7 +159,7 @@ def test_multiple_windows(self):
proj = t.projection([t.g, expr.name('result')])

expected = """\
SELECT g, sum(f) OVER (PARTITION BY g) - sum(f) OVER () AS `result`
SELECT `g`, sum(`f`) OVER (PARTITION BY `g`) - sum(`f`) OVER () AS `result`
FROM alltypes"""
self._check_sql(proj, expected)

@@ -170,16 +170,16 @@ def test_order_by_desc(self):

proj = t[t.f, ibis.row_number().over(w).name('revrank')]
expected = """\
SELECT f, row_number() OVER (ORDER BY f DESC) - 1 AS `revrank`
SELECT `f`, row_number() OVER (ORDER BY `f` DESC) - 1 AS `revrank`
FROM alltypes"""
self._check_sql(proj, expected)

expr = (t.group_by('g')
.order_by(ibis.desc(t.f))
[t.d.lag().name('foo'), t.a.max()])
expected = """\
SELECT lag(d) OVER (PARTITION BY g ORDER BY f DESC) AS `foo`,
max(a) OVER (PARTITION BY g ORDER BY f DESC) AS `max`
SELECT lag(`d`) OVER (PARTITION BY `g` ORDER BY `f` DESC) AS `foo`,
max(`a`) OVER (PARTITION BY `g` ORDER BY `f` DESC) AS `max`
FROM alltypes"""
self._check_sql(expr, expected)

@@ -195,7 +195,7 @@ def test_row_number_requires_order_by(self):
.mutate(ibis.row_number().name('foo')))

expected = """\
SELECT *, row_number() OVER (PARTITION BY g ORDER BY f) - 1 AS `foo`
SELECT *, row_number() OVER (PARTITION BY `g` ORDER BY `f`) - 1 AS `foo`
FROM alltypes"""
self._check_sql(expr, expected)

Empty file added ibis/sql/vertica/__init__.py
14 changes: 12 additions & 2 deletions ibis/tests/conftest.py
@@ -18,9 +18,19 @@
def pytest_addoption(parser):
parser.addoption('--e2e', action='store_true', default=False,
help='Enable the e2e (end-to-end) tests')
parser.addoption('--skip-udf', action='store_true', default=False,
help='Skip tests marked udf')
parser.addoption('--skip-superuser', action='store_true', default=False,
help='Skip tests marked superuser')


def pytest_runtest_setup(item):
if getattr(item.obj, 'e2e', None): # the test item is marked e2e
if not item.config.getvalue('e2e'): # but --e2e option not set
skip('e2e was not enabled')
if not item.config.getoption('--e2e'): # but --e2e option not set
skip('--e2e NOT enabled')
if getattr(item.obj, 'udf', None):
if item.config.getoption('--skip-udf'):
skip('--skip-udf enabled')
if getattr(item.obj, 'superuser', None):
if item.config.getoption('--skip-superuser'):
skip('--skip-superuser enabled')
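
A hypothetical sketch of how the new options interact with marked tests (the test name and body are illustrative):

import pytest

@pytest.mark.e2e
@pytest.mark.udf
def test_udf_roundtrip():
    # runs only when the suite is invoked with --e2e;
    # skipped again if --skip-udf is also passed
    pass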
122 changes: 117 additions & 5 deletions ibis/tests/test_filesystems.py
@@ -19,13 +19,13 @@
import os
import shutil

from hdfs import InsecureClient
import pytest

from ibis.filesystems import HDFS, WebHDFS
from ibis.filesystems import HDFS
from ibis.compat import unittest
from ibis.tests.util import IbisTestEnv
import ibis.util as util
import ibis


ENV = IbisTestEnv()
@@ -77,8 +77,12 @@ class TestHDFSE2E(unittest.TestCase):
def setUpClass(cls):
cls.ENV = ENV
cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
cls.hdfs_client = InsecureClient(cls.ENV.hdfs_url)
cls.hdfs = WebHDFS(cls.hdfs_client)
if cls.ENV.use_kerberos:
print("Warning: ignoring invalid Certificate Authority errors")
cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
port=cls.ENV.webhdfs_port,
use_kerberos=cls.ENV.use_kerberos,
verify=(not cls.ENV.use_kerberos))
cls.hdfs.mkdir(cls.tmp_dir)

@classmethod
@@ -146,6 +150,19 @@ def test_mkdir(self):
self.hdfs.mkdir(path)
assert self.hdfs.exists(path)

def test_chmod(self):
new_permissions = '755'
path = self._make_random_hdfs_file()
self.hdfs.chmod(path, new_permissions)
assert self.hdfs.status(path)['permission'] == new_permissions

def test_chmod_directory(self):
new_permissions = '755'
path = pjoin(self.tmp_dir, util.guid())
self.hdfs.mkdir(path)
self.hdfs.chmod(path, new_permissions)
assert self.hdfs.status(path)['permission'] == new_permissions

def test_mv_to_existing_file(self):
remote_file = self._make_random_hdfs_file()
existing_remote_file_dest = self._make_random_hdfs_file()
@@ -155,7 +172,8 @@ def test_mv_to_existing_file_no_overwrite(self):
remote_file = self._make_random_hdfs_file()
existing_remote_file_dest = self._make_random_hdfs_file()
with self.assertRaises(Exception):
self.hdfs.mv(remote_file, existing_remote_file_dest, overwrite=False)
self.hdfs.mv(remote_file, existing_remote_file_dest,
overwrite=False)

def test_mv_to_directory(self):
remote_file = self._make_random_hdfs_file()
@@ -373,6 +391,100 @@ def _sample_nested_directory(self):
return dirname


@pytest.mark.e2e
@pytest.mark.superuser
class TestSuperUserHDFSE2E(unittest.TestCase):

@classmethod
def setUpClass(cls):
cls.ENV = ENV
cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
if cls.ENV.use_kerberos:
print("Warning: ignoring invalid Certificate Authority errors")
# NOTE: specifying superuser as set in IbisTestEnv
cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
port=cls.ENV.webhdfs_port,
use_kerberos=cls.ENV.use_kerberos,
verify=(not cls.ENV.use_kerberos),
user=cls.ENV.hdfs_superuser)
cls.hdfs.mkdir(cls.tmp_dir)

@classmethod
def tearDownClass(cls):
try:
cls.hdfs.rmdir(cls.tmp_dir)
except:
pass

def setUp(self):
self.test_files = []
self.test_directories = []

def tearDown(self):
self._delete_test_files()

def _delete_test_files(self):
for path in self.test_files:
try:
os.remove(path)
except os.error:
pass

for path in self.test_directories:
try:
shutil.rmtree(path)
except os.error:
pass

def _make_random_file(self, size=1024, directory=None):
path = util.guid()

if directory:
path = osp.join(directory, path)

units = size / 32

with open(path, 'wb') as f:
for i in xrange(units):
f.write(util.guid())

self.test_files.append(path)
return path

def _make_random_hdfs_file(self, size=1024, directory=None):
local_path = self._make_random_file(size=size)
remote_path = pjoin(directory or self.tmp_dir, local_path)
self.hdfs.put(remote_path, local_path)
return remote_path

def test_chown_owner(self):
new_owner = 'randomowner'
path = self._make_random_hdfs_file()
self.hdfs.chown(path, new_owner)
assert self.hdfs.status(path)['owner'] == new_owner

def test_chown_group(self):
new_group = 'randomgroup'
path = self._make_random_hdfs_file()
self.hdfs.chown(path, group=new_group)
assert self.hdfs.status(path)['group'] == new_group

def test_chown_group_directory(self):
new_group = 'randomgroup'
path = pjoin(self.tmp_dir, util.guid())
self.hdfs.mkdir(path)
self.hdfs.chown(path, group=new_group)
assert self.hdfs.status(path)['group'] == new_group

def test_chown_owner_directory(self):
new_owner = 'randomowner'
path = pjoin(self.tmp_dir, util.guid())
self.hdfs.mkdir(path)
self.hdfs.chown(path, new_owner)
assert self.hdfs.status(path)['owner'] == new_owner


def _check_directories_equal(left, right):
left_files = _get_all_files(left)
right_files = _get_all_files(right)
26 changes: 16 additions & 10 deletions ibis/tests/util.py
@@ -41,15 +41,14 @@ def __init__(self):
self.nn_host = os.environ.get('IBIS_TEST_NN_HOST', 'localhost')
# 5070 is default for impala dev env
self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT', 5070))
self.hdfs_url = 'http://{0}:{1}'.format(self.nn_host,
self.webhdfs_port)
self.hdfs_superuser = os.environ.get('IBIS_TEST_HDFS_SUPERUSER',
'hdfs')
self.use_codegen = os.environ.get('IBIS_TEST_USE_CODEGEN',
'False').lower() == 'true'
self.cleanup_test_data = os.environ.get('IBIS_TEST_CLEANUP_TEST_DATA',
'True').lower() == 'true'
self.use_kerberos = os.environ.get('IBIS_TEST_USE_KERBEROS',
'False').lower() == 'true'

# update global Ibis config where relevant
options.impala.temp_db = self.tmp_db
options.impala.temp_hdfs_path = self.tmp_dir
@@ -68,14 +67,14 @@ def connect_test(env, with_hdfs=True):
pool_size=2)
if with_hdfs:
if env.use_kerberos:
from hdfs.ext.kerberos import KerberosClient
hdfs_client = KerberosClient(env.hdfs_url, mutual_auth='REQUIRED')
else:
from hdfs.client import InsecureClient
hdfs_client = InsecureClient(env.hdfs_url)
return ibis.make_client(con, hdfs_client)
print("Warning: ignoring invalid Certificate Authority errors")
hdfs_client = ibis.hdfs_connect(host=env.nn_host,
port=env.webhdfs_port,
use_kerberos=env.use_kerberos,
verify=(not env.use_kerberos))
else:
return ibis.make_client(con)
hdfs_client = None
return ibis.make_client(con, hdfs_client)


@pytest.mark.e2e
@@ -95,6 +94,8 @@ def setUpClass(cls):
cls.tmp_db = ENV.tmp_db
cls.alltypes = cls.con.table('functional_alltypes')

cls.db = cls.con.database(ENV.test_data_db)

if not cls.con.exists_database(cls.tmp_db):
cls.con.create_database(cls.tmp_db)

@@ -117,6 +118,7 @@ def setUp(self):
self.temp_databases = []
self.temp_tables = []
self.temp_views = []
self.temp_functions = []

def tearDown(self):
for t in self.temp_tables:
@@ -125,6 +127,10 @@ def tearDown(self):
for t in self.temp_views:
self.con.drop_view(t, force=True)

for f_name, f_inputs in self.temp_functions:
self.con.drop_udf(f_name, input_types=f_inputs,
force=True)

self.con.set_database(self.test_data_db)
for t in self.temp_databases:
self.con.drop_database(t, force=True)
106 changes: 44 additions & 62 deletions ibis/util.py
@@ -12,12 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pandas.core.common as pdcom

import ibis
from ibis.common import IbisTypeError
import types
import ibis.compat as compat


def guid():
@@ -123,60 +119,46 @@ def get(self, key):
raise KeyError(key)


def pandas_col_to_ibis_type(col):
dty = col.dtype
def is_function(v):
return isinstance(v, (types.FunctionType, types.LambdaType))

# datetime types
if pdcom.is_datetime64_dtype(dty):
if pdcom.is_datetime64_ns_dtype(dty):
return 'timestamp'
else:
raise IbisTypeError(
"Column {0} has dtype {1}, which is datetime64-like but does "
"not use nanosecond units".format(col.name, dty))
if pdcom.is_timedelta64_dtype(dty):
print("Warning: encoding a timedelta64 as an int64")
return 'int64'

if pdcom.is_categorical_dtype(dty):
return 'category'

if pdcom.is_bool_dtype(dty):
return 'boolean'

# simple numerical types
if issubclass(dty.type, np.int8):
return 'int8'
if issubclass(dty.type, np.int16):
return 'int16'
if issubclass(dty.type, np.int32):
return 'int32'
if issubclass(dty.type, np.int64):
return 'int64'
if issubclass(dty.type, np.float32):
return 'float'
if issubclass(dty.type, np.float64):
return 'double'
if issubclass(dty.type, np.uint8):
return 'int16'
if issubclass(dty.type, np.uint16):
return 'int32'
if issubclass(dty.type, np.uint32):
return 'int64'
if issubclass(dty.type, np.uint64):
raise IbisTypeError("Column {0} is an unsigned int64".format(col.name))

if pdcom.is_object_dtype(dty):
# TODO: overly broad?
return 'string'

raise IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))


def pandas_to_ibis_schema(frame):
# no analog for decimal in pandas
pairs = []
for col_name in frame:
ibis_type = pandas_col_to_ibis_type(frame[col_name])
pairs.append((col_name, ibis_type))
return ibis.schema(pairs)

def adjoin(space, *lists):
"""
Glues together two sets of strings using the amount of space requested.
The idea is to prettify.
Brought over from pandas
"""
out_lines = []
newLists = []
lengths = [max(map(len, x)) + space for x in lists[:-1]]

# not the last one
lengths.append(max(map(len, lists[-1])))

maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = [x.ljust(lengths[i]) for x in lst]
nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
newLists.append(nl)
toJoin = zip(*newLists)
for lines in toJoin:
out_lines.append(_join_unicode(lines))
return _join_unicode(out_lines, sep='\n')


def _join_unicode(lines, sep=''):
try:
return sep.join(lines)
except UnicodeDecodeError:
sep = compat.unicode_type(sep)
return sep.join([x.decode('utf-8') if isinstance(x, str) else x
for x in lines])


def deprecate(f, message):
def g(*args, **kwargs):
print(message)
return f(*args, **kwargs)
return g
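
A small usage sketch of the adjoin helper added in this file (the values are illustrative):

from ibis.util import adjoin

# Every column except the last is left-justified to its widest entry plus
# the requested spacing.
print(adjoin(4, ['int64', 'string'], ['id', 'name']))
# int64     id
# string    name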
5 changes: 2 additions & 3 deletions requirements.txt
@@ -1,8 +1,7 @@
pytest
numpy>=1.7.0
pandas>=0.12.0
impyla>=0.9.1
impyla>=0.10.0
psutil==0.6.1
snakebite
hdfs[kerberos]>=1.1.1
hdfs==1.4.3
six
50 changes: 0 additions & 50 deletions scripts/cleanup_testing_data.py

This file was deleted.

124 changes: 0 additions & 124 deletions scripts/create_test_data_archive.py

This file was deleted.

154 changes: 0 additions & 154 deletions scripts/load_test_data.py

This file was deleted.

55 changes: 47 additions & 8 deletions scripts/run_jenkins.sh
@@ -23,8 +23,8 @@ set -x

printenv

mkdir -p /tmp/impyla-dbapi
TMP_DIR=$(mktemp -d -p /tmp/impyla-dbapi tmpXXXX)
mkdir -p /tmp/ibis-tests
TMP_DIR=$(mktemp -d -p /tmp/ibis-tests tmpXXXX)

function cleanup {
rm -rf $TMP_DIR
@@ -33,6 +33,11 @@ trap cleanup EXIT

cd $TMP_DIR

# Add LLVM to PATH
if [ -n "$IBIS_TEST_LLVM_CONFIG" ]; then
export PATH="$($IBIS_TEST_LLVM_CONFIG --bindir):$PATH"
fi

# Checkout ibis if necessary
if [ -z "$WORKSPACE" ]; then
: ${GIT_URL:?"GIT_URL is unset"}
@@ -45,6 +50,17 @@ else
IBIS_HOME=$WORKSPACE
fi

# pull in PR if necessary
if [ -z "$WORKSPACE" -a -n "$GITHUB_PR" ]; then
pushd $IBIS_HOME
git clean -d -f
git fetch origin pull/$GITHUB_PR/head:pr_$GITHUB_PR
git checkout pr_$GITHUB_PR
popd
fi

pushd $IBIS_HOME && git status && popd

# Setup Python
curl https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p $TMP_DIR/miniconda
@@ -56,20 +72,43 @@ conda info -a
CONDA_ENV_NAME=pyenv-ibis-test
conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION numpy pandas
source activate $CONDA_ENV_NAME
pip install click
# preempt the requirements.txt file by installing impyla master
pip install git+https://github.com/cloudera/impyla.git
pip install $IBIS_HOME

python --version
which python

if [ $IBIS_TEST_USE_KERBEROS = "True" ]; then
pip install requests-kerberos
pip install git+https://github.com/laserson/python-sasl.git@cython

# CLOUDERA INTERNAL JENKINS/KERBEROS CONFIG
kinit -l 4h -kt /cdep/keytabs/hive.keytab hive
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON SERVER TO ROLE cdep_default_admin WITH GRANT OPTION"
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON DATABASE $IBIS_TEST_DATA_DB TO ROLE cdep_default_admin"
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON DATABASE $IBIS_TEST_TMP_DB TO ROLE cdep_default_admin"
kdestroy
kinit -l 4h -kt /cdep/keytabs/systest.keytab systest
fi

cd $IBIS_HOME

python -c "from ibis.tests.util import IbisTestEnv; print(IbisTestEnv())"

# load necessary test data
scripts/load_test_data.py
# load necessary test data (without overwriting)
scripts/test_data_admin.py load --data --no-udf

# run the test suite
py.test --e2e ibis
if [ -z "$WORKSPACE" ]; then
# on kerberized cluster, skip UDF work
py.test --skip-udf --skip-superuser --e2e ibis
else
# build and load the UDFs
scripts/test_data_admin.py load --no-data --udf --overwrite
# run the full test suite
py.test --e2e ibis
fi

# cleanup
scripts/cleanup_testing_data.py
# cleanup temporary data (but not testing data)
scripts/test_data_admin.py cleanup --tmp-data --tmp-db
379 changes: 379 additions & 0 deletions scripts/test_data_admin.py
@@ -0,0 +1,379 @@
#! /usr/bin/env python
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import shutil
import tempfile
import os.path as osp
from os.path import join as pjoin
from subprocess import check_call

import numpy as np
import pandas as pd
import pandas.util.testing as tm
from click import group, option

import ibis
from ibis.util import guid
from ibis.compat import BytesIO
from ibis.common import IbisError
from ibis.tests.util import IbisTestEnv


ENV = IbisTestEnv()
IBIS_TEST_DATA_S3_BUCKET = 'ibis-test-resources'
IBIS_TEST_DATA_LOCAL_DIR = 'ibis-testing-data'
IBIS_TEST_DATA_TARBALL = 'ibis-testing-data.tar.gz'


def make_ibis_client():
ic = ibis.impala.connect(host=ENV.impala_host, port=ENV.impala_port,
protocol=ENV.impala_protocol,
use_kerberos=ENV.use_kerberos)
if ENV.use_kerberos:
print("Warning: ignoring invalid Certificate Authority errors")
hc = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port,
use_kerberos=ENV.use_kerberos,
verify=(not ENV.use_kerberos))
return ibis.make_client(ic, hdfs_client=hc)


def can_write_to_hdfs(con):
test_path = pjoin(ENV.test_data_dir, ibis.util.guid())
test_file = BytesIO(ibis.util.guid())
try:
con.hdfs.put(test_path, test_file)
con.hdfs.rm(test_path)
return True
except:
return False


def can_build_udfs():
try:
check_call('which cmake', shell=True)
check_call('which make', shell=True)
check_call('which clang++', shell=True)
return True
except:
return False


def is_data_loaded(con):
if not con.hdfs.exists(ENV.test_data_dir):
return False
if not con.exists_database(ENV.test_data_db):
return False
return True


def is_udf_loaded(con):
bitcode_dir = pjoin(ENV.test_data_dir, 'udf')
if con.hdfs.exists(bitcode_dir):
return True
return False


def dnload_ibis_test_data_from_s3(local_path):
url = 'https://{0}.s3.amazonaws.com/{1}'.format(
IBIS_TEST_DATA_S3_BUCKET, IBIS_TEST_DATA_TARBALL)
cmd = 'cd {0} && wget -q {1} && tar -xzf {2}'.format(
local_path, url, IBIS_TEST_DATA_TARBALL)
check_call(cmd, shell=True)
data_dir = pjoin(local_path, IBIS_TEST_DATA_LOCAL_DIR)
print('Downloaded {0} and unpacked it to {1}'.format(url, data_dir))
return data_dir


def upload_ibis_test_data_to_hdfs(con, data_path):
hdfs = con.hdfs
if hdfs.exists(ENV.test_data_dir):
hdfs.rmdir(ENV.test_data_dir)
hdfs.put(ENV.test_data_dir, data_path, verbose=True)


def create_test_database(con):
if con.exists_database(ENV.test_data_db):
con.drop_database(ENV.test_data_db, force=True)
con.create_database(ENV.test_data_db)
print('Created database {0}'.format(ENV.test_data_db))


def create_parquet_tables(con):
parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))
schemas = {
'functional_alltypes': ibis.schema(
[('id', 'int32'),
('bool_col', 'boolean'),
('tinyint_col', 'int8'),
('smallint_col', 'int16'),
('int_col', 'int32'),
('bigint_col', 'int64'),
('float_col', 'float'),
('double_col', 'double'),
('date_string_col', 'string'),
('string_col', 'string'),
('timestamp_col', 'timestamp'),
('year', 'int32'),
('month', 'int32')]),
'tpch_region': ibis.schema(
[('r_regionkey', 'int16'),
('r_name', 'string'),
('r_comment', 'string')])}
tables = []
for path in parquet_files:
head, table_name = osp.split(path)
print('Creating {0}'.format(table_name))
# if no schema was specified, let Impala infer it from the file
schema = schemas.get(table_name)
table = con.parquet_file(path, schema=schema, name=table_name,
database=ENV.test_data_db, persist=True)
tables.append(table)
return tables


def create_avro_tables(con):
avro_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'avro'))
schemas = {
'tpch_region_avro': {
'type': 'record',
'name': 'a',
'fields': [
{'name': 'R_REGIONKEY', 'type': ['null', 'int']},
{'name': 'R_NAME', 'type': ['null', 'string']},
{'name': 'R_COMMENT', 'type': ['null', 'string']}]}}
tables = []
for path in avro_files:
head, table_name = osp.split(path)
print('Creating {0}'.format(table_name))
schema = schemas[table_name]
table = con.avro_file(path, schema, name=table_name,
database=ENV.test_data_db, persist=True)
tables.append(table)
return tables


def build_udfs():
print('Building UDFs')
ibis_home_dir = osp.dirname(osp.dirname(osp.abspath(__file__)))
udf_dir = pjoin(ibis_home_dir, 'testing', 'udf')
check_call('cmake . && make', shell=True, cwd=udf_dir)


def upload_udfs(con):
ibis_home_dir = osp.dirname(osp.dirname(osp.abspath(__file__)))
build_dir = pjoin(ibis_home_dir, 'testing', 'udf', 'build')
bitcode_dir = pjoin(ENV.test_data_dir, 'udf')
print('Uploading UDFs to {0}'.format(bitcode_dir))
if con.hdfs.exists(bitcode_dir):
con.hdfs.rmdir(bitcode_dir)
con.hdfs.put(bitcode_dir, build_dir, verbose=True)


def scrape_parquet_files(con, tmp_db):
to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')]
to_scrape.append(('functional', 'alltypes'))
for db, tname in to_scrape:
table = con.table(tname, database=db)
new_name = '{0}_{1}'.format(db, tname)
print('Creating {0}'.format(new_name))
con.create_table(new_name, table, database=tmp_db)


def download_parquet_files(con, tmp_db_hdfs_path):
parquet_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'parquet')
print("Downloading {0}".format(parquet_path))
con.hdfs.get(tmp_db_hdfs_path, parquet_path)


def download_avro_files(con):
avro_hdfs_path = '/test-warehouse/tpch.region_avro'
avro_local_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'avro')
os.mkdir(avro_local_path)
print("Downloading {0}".format(avro_hdfs_path))
con.hdfs.get(avro_hdfs_path, pjoin(avro_local_path, 'tpch_region_avro'))


def generate_csv_files():
N = 10
nfiles = 10
df = pd.DataFrame({'foo': [tm.rands(10) for _ in xrange(N)],
'bar': np.random.randn(N),
'baz': np.random.randint(0, 100, size=N)},
columns=['foo', 'bar', 'baz'])
csv_base = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
os.mkdir(csv_base)
for i in xrange(nfiles):
csv_path = pjoin(csv_base, '{0}.csv'.format(i))
print('Writing {0}'.format(csv_path))
df.to_csv(csv_path, index=False, header=False)


def copy_tarball_to_versioned_backup(bucket):
key = bucket.get_key(IBIS_TEST_DATA_TARBALL)
if key:
names = [k.name for k in bucket.list(prefix=IBIS_TEST_DATA_TARBALL)]
names.remove(IBIS_TEST_DATA_TARBALL)
# get the highest number for this key name
last = sorted([int(name.split('.')[-1]) for name in names])[-1]
next_key = '{0}.{1}'.format(IBIS_TEST_DATA_TARBALL, last + 1)
key.copy(IBIS_TEST_DATA_S3_BUCKET, next_key)
key.delete()
assert bucket.get_key(IBIS_TEST_DATA_TARBALL) is None


# ==========================================


@group(context_settings={'help_option_names': ['-h', '--help']})
def main():
"""Manage test data for Ibis"""
pass


@main.command()
def printenv():
"""Print current IbisTestEnv"""
print(str(ENV))


@main.command()
@option('--create-tarball', is_flag=True,
help="Create a gzipped tarball")
@option('--push-to-s3', is_flag=True,
help="Also push the tarball to s3://ibis-test-resources")
def create(create_tarball, push_to_s3):
"""Create Ibis test data"""
print(str(ENV))

con = make_ibis_client()

# verify some assumptions before proceeding
if push_to_s3 and not create_tarball:
raise IbisError(
"Must specify --create-tarball if specifying --push-to-s3")
if osp.exists(IBIS_TEST_DATA_LOCAL_DIR):
raise IbisError(
'Local dir {0} already exists; please remove it first'.format(
IBIS_TEST_DATA_LOCAL_DIR))
if not con.exists_database('tpch'):
raise IbisError('`tpch` database does not exist')
if not con.hdfs.exists('/test-warehouse/tpch.region_avro'):
raise IbisError(
'HDFS dir /test-warehouse/tpch.region_avro does not exist')

# generate tmp identifiers
tmp_db_hdfs_path = pjoin(ENV.tmp_dir, guid())
tmp_db = guid()
os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)
try:
# create the tmp data locally
con.create_database(tmp_db, path=tmp_db_hdfs_path)
print('Created database {0} at {1}'.format(tmp_db, tmp_db_hdfs_path))

# create the local data set
scrape_parquet_files(con, tmp_db)
download_parquet_files(con, tmp_db_hdfs_path)
download_avro_files(con)
generate_csv_files()
finally:
con.drop_database(tmp_db, force=True)
assert not con.hdfs.exists(tmp_db_hdfs_path)

if create_tarball:
check_call('tar -czf {0} {1}'.format(IBIS_TEST_DATA_TARBALL,
IBIS_TEST_DATA_LOCAL_DIR),
shell=True)

if push_to_s3:
from boto.s3 import connect_to_region
s3_conn = connect_to_region('us-west-2')
bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET)
copy_tarball_to_versioned_backup(bucket)
key = bucket.new_key(IBIS_TEST_DATA_TARBALL)
print('Upload tarball to S3')
key.set_contents_from_filename(IBIS_TEST_DATA_TARBALL, replace=False)


@main.command()
@option('--data/--no-data', default=True, help='Load (skip) ibis testing data')
@option('--udf/--no-udf', default=True, help='Build/upload (skip) test UDFs')
@option('--data-dir',
help='Path to testing data; downloads data from S3 if unset')
@option('--overwrite', is_flag=True, help='Forces overwriting of data/UDFs')
def load(data, udf, data_dir, overwrite):
"""Load Ibis test data and build/upload UDFs"""
print(str(ENV))

con = make_ibis_client()

# validate our environment before performing possibly expensive operations
if not can_write_to_hdfs(con):
raise IbisError('Failed to write to HDFS; check your settings')
if udf and not can_build_udfs():
raise IbisError('Build environment does not support building UDFs')

# load the data files
if data and (overwrite or not is_data_loaded(con)):
try:
tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
if not data_dir:
print('Did not specify a local dir with the test data, so '
'downloading it from S3')
data_dir = dnload_ibis_test_data_from_s3(tmp_dir)
upload_ibis_test_data_to_hdfs(con, data_dir)
create_test_database(con)
parquet_tables = create_parquet_tables(con)
avro_tables = create_avro_tables(con)
for table in parquet_tables + avro_tables:
print('Computing stats for {0}'.format(table.op().name))
table.compute_stats()
finally:
shutil.rmtree(tmp_dir)

# build and upload the UDFs
if udf and (overwrite or not is_udf_loaded(con)):
build_udfs()
upload_udfs(con)


@main.command()
@option('--test-data', is_flag=True,
help='Cleanup Ibis test data, test database, and also the test UDFs '
'if they are stored in the test data directory/database')
@option('--udfs', is_flag=True, help='Cleanup Ibis test UDFs only')
@option('--tmp-data', is_flag=True,
help='Cleanup Ibis temporary HDFS directory')
@option('--tmp-db', is_flag=True, help='Cleanup Ibis temporary database')
def cleanup(test_data, udfs, tmp_data, tmp_db):
"""Cleanup Ibis test data and UDFs"""
print(str(ENV))

con = make_ibis_client()

if udfs:
# this comes before test_data because the latter clobbers this too
con.hdfs.rmdir(pjoin(ENV.test_data_dir, 'udf'))

if test_data:
con.drop_database(ENV.test_data_db, force=True)
con.hdfs.rmdir(ENV.test_data_dir)

if tmp_data:
con.hdfs.rmdir(ENV.tmp_dir)

if tmp_db:
con.drop_database(ENV.tmp_db, force=True)


if __name__ == '__main__':
main()
38 changes: 37 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from distutils.extension import Extension

MAJOR = 0
MINOR = 3
MINOR = 4
MICRO = 0
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)

@@ -91,20 +91,56 @@ def run(self):
include_dirs=common_include)
extensions = cythonize([comms_ext])

LONG_DESCRIPTION = """
Ibis is a productivity-centric Python big data framework.
See http://ibis-project.org
"""

CLASSIFIERS = [
'Development Status :: 4 - Beta',
'Operating System :: OS Independent',
'Intended Audience :: Science/Research',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Cython',
'Topic :: Scientific/Engineering',
]

setup(
name='ibis-framework',
packages=['ibis',
'ibis.expr',
'ibis.expr.tests',
'ibis.hive',
'ibis.hive.tests',
'ibis.impala',
'ibis.impala.tests',
'ibis.spark',
'ibis.spark.tests',
'ibis.sql',
'ibis.sql.tests',
'ibis.sql.presto',
'ibis.sql.presto.tests',
'ibis.sql.redshift',
'ibis.sql.redshift.tests',
'ibis.sql.sqlite',
'ibis.sql.sqlite.tests',
'ibis.sql.vertica',
'ibis.sql.vertica.tests',
'ibis.tests'],
version=VERSION,
package_data={'ibis': ['*.pxd', '*.pyx']},
ext_modules=extensions,
cmdclass=cmdclass,
install_requires=requirements,
extras_require={'kerberos': ['requests-kerberos']},
description="Productivity-centric Python Big Data Framework",
long_description=LONG_DESCRIPTION,
classifiers=CLASSIFIERS,
license='Apache License, Version 2.0',
maintainer="Wes McKinney",
maintainer_email="wes@cloudera.com"
51 changes: 51 additions & 0 deletions testing/udf/CMakeLists.txt
@@ -0,0 +1,51 @@
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 2.6)

# where to put generated libraries
set(LIBRARY_OUTPUT_PATH "build")
# where to put generated binaries
set(EXECUTABLE_OUTPUT_PATH "build")

find_program(CLANG_EXECUTABLE clang++)

SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb")

# Function to generate rule to cross compile a source file to an IR module.
# This should be called with the .cc src file and it will generate a
# src-file-ir target that can be built.
# e.g. COMPILE_TO_IR(test.cc) generates the "test-ir" make target.
set(IR_COMPILE_FLAGS "-emit-llvm" "-O3" "-c")
function(COMPILE_TO_IR SRC_FILE)
get_filename_component(BASE_NAME ${SRC_FILE} NAME_WE)
set(OUTPUT_FILE "build/${BASE_NAME}.ll")
add_custom_command(
OUTPUT ${OUTPUT_FILE}
COMMAND ${CLANG_EXECUTABLE} ${IR_COMPILE_FLAGS} ${SRC_FILE} -o ${OUTPUT_FILE}
DEPENDS ${SRC_FILE})
add_custom_target(${BASE_NAME}-ir ALL DEPENDS ${OUTPUT_FILE})
endfunction(COMPILE_TO_IR)

# Build the UDA/UDFs into a shared library.
add_library(udfsample SHARED udf-sample.cc)
add_library(udasample SHARED uda-sample.cc hyperloglog-uda.cc variance-uda.cc)

# Custom targets to cross compile UDA/UDF to IR
if (CLANG_EXECUTABLE)
COMPILE_TO_IR(udf-sample.cc )
COMPILE_TO_IR(uda-sample.cc )
endif(CLANG_EXECUTABLE)


136 changes: 136 additions & 0 deletions testing/udf/hyperloglog-uda.cc
@@ -0,0 +1,136 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <assert.h>
#include <math.h>
#include <algorithm>
#include <sstream>
#include <iostream>
#include "lib/udf.h"

using namespace std;
using namespace impala_udf;

// This sample UDA implements the hyperloglog distinct estimate aggregate
// function.
// See these papers for more details.
// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation algorithm (2007)
// 2) HyperLogLog in Practice

// Precision taken from the paper. Doesn't seem to matter very much when between [6,12]
const int HLL_PRECISION = 10;

void HllInit(FunctionContext* ctx, StringVal* dst) {
int str_len = pow(2, HLL_PRECISION);
dst->is_null = false;
dst->ptr = ctx->Allocate(str_len);
dst->len = str_len;
memset(dst->ptr, 0, str_len);
}

static const uint64_t FNV64_PRIME = 1099511628211UL;
static const uint64_t FNV64_SEED = 14695981039346656037UL;

static uint64_t FnvHash(const void* data, int32_t bytes, uint64_t hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
while (bytes--) {
hash = (*ptr ^ hash) * FNV64_PRIME;
++ptr;
}
return hash;
}

static uint64_t Hash(const IntVal& v) {
return FnvHash(&v.val, sizeof(int32_t), FNV64_SEED);
}

void HllUpdate(FunctionContext* ctx, const IntVal& src, StringVal* dst) {
if (src.is_null) return;
assert(dst != NULL);
assert(!dst->is_null);
assert(dst->len == pow(2, HLL_PRECISION));
uint64_t hash_value = Hash(src);
if (hash_value != 0) {
// Use the lower bits to index into the number of streams and then
// find the first 1 bit after the index bits.
int idx = hash_value % dst->len;
uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_PRECISION) + 1;
dst->ptr[idx] = ::max(dst->ptr[idx], first_one_bit);
}
}

void HllMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) {
assert(dst != NULL);
assert(!dst->is_null);
assert(!src.is_null);
assert(dst->len == pow(2, HLL_PRECISION));
assert(src.len == pow(2, HLL_PRECISION));
for (int i = 0; i < src.len; ++i) {
dst->ptr[i] = ::max(dst->ptr[i], src.ptr[i]);
}
}

const StringVal HllSerialize(FunctionContext* ctx, const StringVal& src) {
if (src.is_null) return src;
// Copy intermediate state into memory owned by Impala and free allocated memory
StringVal result(ctx, src.len);
memcpy(result.ptr, src.ptr, src.len);
ctx->Free(src.ptr);
return result;
}

StringVal HllFinalize(FunctionContext* ctx, const StringVal& src) {
assert(!src.is_null);
assert(src.len == pow(2, HLL_PRECISION));

const int num_streams = pow(2, HLL_PRECISION);
// Empirical constants for the algorithm.
float alpha = 0;
if (num_streams == 16) {
alpha = 0.673f;
} else if (num_streams == 32) {
alpha = 0.697f;
} else if (num_streams == 64) {
alpha = 0.709f;
} else {
alpha = 0.7213f / (1 + 1.079f / num_streams);
}

float harmonic_mean = 0;
int num_zero_registers = 0;
for (int i = 0; i < src.len; ++i) {
harmonic_mean += powf(2.0f, -src.ptr[i]);
if (src.ptr[i] == 0) ++num_zero_registers;
}
harmonic_mean = 1.0f / harmonic_mean;
int64_t estimate = alpha * num_streams * num_streams * harmonic_mean;

if (num_zero_registers != 0) {
// Estimated cardinality is too low. Hll is too inaccurate here, instead use
// linear counting.
estimate = num_streams * log(static_cast<float>(num_streams) / num_zero_registers);
}

// Free allocated memory
ctx->Free(src.ptr);

// Output the estimate as ascii string
stringstream out;
out << estimate;
string out_str = out.str();
StringVal result_str(ctx, out_str.size());
memcpy(result_str.ptr, out_str.c_str(), result_str.len);
return result_str;
}

43 changes: 43 additions & 0 deletions testing/udf/lib/udf-debug.h
@@ -0,0 +1,43 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef IMPALA_UDF_UDF_DEBUG_H
#define IMPALA_UDF_UDF_DEBUG_H

#include "udf.h"

#include <string>
#include <sstream>

namespace impala_udf {

template<typename T>
inline std::string DebugString(const T& val) {
if (val.is_null) return "NULL";
std::stringstream ss;
ss << val.val;
return ss.str();
}

template<>
inline std::string DebugString(const StringVal& val) {
if (val.is_null) return "NULL";
return std::string(reinterpret_cast<const char*>(val.ptr), val.len);
}

}

#endif

611 changes: 611 additions & 0 deletions testing/udf/lib/udf.h

Large diffs are not rendered by default.

180 changes: 180 additions & 0 deletions testing/udf/uda-sample.cc
@@ -0,0 +1,180 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "uda-sample.h"
#include <assert.h>
#include <sstream>

using namespace impala_udf;
using namespace std;

template <typename T>
StringVal ToStringVal(FunctionContext* context, const T& val) {
stringstream ss;
ss << val;
string str = ss.str();
StringVal string_val(context, str.size());
memcpy(string_val.ptr, str.c_str(), str.size());
return string_val;
}

template <>
StringVal ToStringVal<DoubleVal>(FunctionContext* context, const DoubleVal& val) {
if (val.is_null) return StringVal::null();
return ToStringVal(context, val.val);
}

// ---------------------------------------------------------------------------
// This is a sample of implementing a COUNT aggregate function.
// ---------------------------------------------------------------------------
void CountInit(FunctionContext* context, BigIntVal* val) {
val->is_null = false;
val->val = 0;
}

void CountUpdate(FunctionContext* context, const IntVal& input, BigIntVal* val) {
if (input.is_null) return;
++val->val;
}

void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
dst->val += src.val;
}

BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val) {
return val;
}

// ---------------------------------------------------------------------------
// This is a sample of implementing an AVG aggregate function.
// ---------------------------------------------------------------------------
struct AvgStruct {
double sum;
int64_t count;
};

// Initialize the StringVal intermediate to a zero'd AvgStruct
void AvgInit(FunctionContext* context, StringVal* val) {
val->is_null = false;
val->len = sizeof(AvgStruct);
val->ptr = context->Allocate(val->len);
memset(val->ptr, 0, val->len);
}

void AvgUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val) {
if (input.is_null) return;
assert(!val->is_null);
assert(val->len == sizeof(AvgStruct));
AvgStruct* avg = reinterpret_cast<AvgStruct*>(val->ptr);
avg->sum += input.val;
++avg->count;
}

void AvgMerge(FunctionContext* context, const StringVal& src, StringVal* dst) {
if (src.is_null) return;
const AvgStruct* src_avg = reinterpret_cast<const AvgStruct*>(src.ptr);
AvgStruct* dst_avg = reinterpret_cast<AvgStruct*>(dst->ptr);
dst_avg->sum += src_avg->sum;
dst_avg->count += src_avg->count;
}

// A serialize function is necessary to free the intermediate state allocation. We use the
// StringVal constructor to allocate memory owned by Impala, copy the intermediate state,
// and free the original allocation. Note that memory allocated by the StringVal ctor is
// not necessarily persisted across UDA function calls, which is why we don't use it in
// AvgInit().
const StringVal AvgSerialize(FunctionContext* context, const StringVal& val) {
assert(!val.is_null);
StringVal result(context, val.len);
memcpy(result.ptr, val.ptr, val.len);
context->Free(val.ptr);
return result;
}

StringVal AvgFinalize(FunctionContext* context, const StringVal& val) {
assert(!val.is_null);
assert(val.len == sizeof(AvgStruct));
AvgStruct* avg = reinterpret_cast<AvgStruct*>(val.ptr);
StringVal result;
if (avg->count == 0) {
result = StringVal::null();
} else {
// Copies the result to memory owned by Impala
result = ToStringVal(context, avg->sum / avg->count);
}
context->Free(val.ptr);
return result;
}

// ---------------------------------------------------------------------------
// This is a sample of implementing the STRING_CONCAT aggregate function.
// Example: select string_concat(string_col, ",") from table
// ---------------------------------------------------------------------------
// Delimiter to use if the separator is NULL.
static const StringVal DEFAULT_STRING_CONCAT_DELIM((uint8_t*)", ", 2);

void StringConcatInit(FunctionContext* context, StringVal* val) {
val->is_null = true;
}

void StringConcatUpdate(FunctionContext* context, const StringVal& str,
const StringVal& separator, StringVal* result) {
if (str.is_null) return;
if (result->is_null) {
// This is the first string, simply set the result to be the value.
uint8_t* copy = context->Allocate(str.len);
memcpy(copy, str.ptr, str.len);
*result = StringVal(copy, str.len);
return;
}

const StringVal* sep_ptr = separator.is_null ? &DEFAULT_STRING_CONCAT_DELIM :
&separator;

// We need to grow the result buffer and then append the new string and
// separator.
int new_size = result->len + sep_ptr->len + str.len;
result->ptr = context->Reallocate(result->ptr, new_size);
memcpy(result->ptr + result->len, sep_ptr->ptr, sep_ptr->len);
result->len += sep_ptr->len;
memcpy(result->ptr + result->len, str.ptr, str.len);
result->len += str.len;
}

void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst) {
if (src.is_null) return;
StringConcatUpdate(context, src, ",", dst);
}

// A serialize function is necessary to free the intermediate state allocation. We use the
// StringVal constructor to allocate memory owned by Impala, copy the intermediate
// StringVal, and free the intermediate's memory. Note that memory allocated by the
// StringVal ctor is not necessarily persisted across UDA function calls, which is why we
// don't use it in StringConcatUpdate().
const StringVal StringConcatSerialize(FunctionContext* context, const StringVal& val) {
if (val.is_null) return val;
StringVal result(context, val.len);
memcpy(result.ptr, val.ptr, val.len);
context->Free(val.ptr);
return result;
}

// Same as StringConcatSerialize().
StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val) {
if (val.is_null) return val;
StringVal result(context, val.len);
memcpy(result.ptr, val.ptr, val.len);
context->Free(val.ptr);
return result;
}
125 changes: 125 additions & 0 deletions testing/udf/uda-sample.h
@@ -0,0 +1,125 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef SAMPLES_UDA_H
#define SAMPLES_UDA_H

#include "lib/udf.h"

using namespace impala_udf;

// Note: As of Impala 1.2, UDAs must have the same intermediate and result types (see the
// udf.h header for the full Impala UDA specification, which can be found at
// https://github.com/cloudera/impala/blob/master/be/src/udf/udf.h). Some UDAs naturally
// conform to this limitation, such as Count and StringConcat. However, other UDAs return
// a numeric value but use a custom intermediate struct type that must be stored in a
// StringVal or BufferVal, such as Variance.
//
// As a workaround for now, these UDAs that require an intermediate buffer use StringVal
// for the intermediate and result type. In the UDAs' finalize functions, the numeric
// result is serialized to an ASCII string (see the ToStringVal() utility function
// provided with these samples). The returned StringVal is then cast back to the correct
// numeric type (see the Usage examples below).
//
// This restriction will be lifted in Impala 2.0.


// This is an example of the COUNT aggregate function.
//
// Usage: > create aggregate function my_count(int) returns bigint
// location '/user/cloudera/libudasample.so' update_fn='CountUpdate';
// > select my_count(col) from tbl;
void CountInit(FunctionContext* context, BigIntVal* val);
void CountUpdate(FunctionContext* context, const IntVal& input, BigIntVal* val);
void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst);
BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val);

// This is an example of the AVG(double) aggregate function. This function needs to
// maintain two pieces of state, the current sum and the count. We do this using
// the BufferVal intermediate type. When this UDA is registered, it would specify
// 16 bytes (8 byte sum + 8 byte count) as the size for this buffer.
//
// Usage: > create aggregate function my_avg(double) returns string
// location '/user/cloudera/libudasample.so' update_fn='AvgUpdate';
// > select cast(my_avg(col) as double) from tbl;
//
// TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal
// and the return type changed to DoubleVal in Impala 2.0
void AvgInit(FunctionContext* context, StringVal* val);
void AvgUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val);
void AvgMerge(FunctionContext* context, const StringVal& src, StringVal* dst);
const StringVal AvgSerialize(FunctionContext* context, const StringVal& val);
StringVal AvgFinalize(FunctionContext* context, const StringVal& val);

// This is a sample of implementing the STRING_CONCAT aggregate function.
//
// Usage: > create aggregate function string_concat(string, string) returns string
// location '/user/cloudera/libudasample.so' update_fn='StringConcatUpdate';
// > select string_concat(string_col, ",") from table;
void StringConcatInit(FunctionContext* context, StringVal* val);
void StringConcatUpdate(FunctionContext* context, const StringVal& arg1,
const StringVal& arg2, StringVal* val);
void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst);
const StringVal StringConcatSerialize(FunctionContext* context, const StringVal& val);
StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val);

// This is an example of the variance aggregate function.
//
// Usage: > create aggregate function var(double) returns string
// location '/user/cloudera/libudasample.so' update_fn='VarianceUpdate';
// > select cast(var(col) as double) from tbl;
//
// TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal
// and the return type changed to DoubleVal in Impala 2.0
void VarianceInit(FunctionContext* context, StringVal* val);
void VarianceUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val);
void VarianceMerge(FunctionContext* context, const StringVal& src, StringVal* dst);
const StringVal VarianceSerialize(FunctionContext* context, const StringVal& val);
StringVal VarianceFinalize(FunctionContext* context, const StringVal& val);

// An implementation of the Knuth online variance algorithm, which is also single pass and
// more numerically stable.
//
// Usage: > create aggregate function knuth_var(double) returns string
// location '/user/cloudera/libudasample.so' update_fn='KnuthVarianceUpdate';
// > select cast(knuth_var(col) as double) from tbl;
//
// TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal
// and the return type changed to DoubleVal in Impala 2.0
void KnuthVarianceInit(FunctionContext* context, StringVal* val);
void KnuthVarianceUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val);
void KnuthVarianceMerge(FunctionContext* context, const StringVal& src, StringVal* dst);
const StringVal KnuthVarianceSerialize(FunctionContext* context, const StringVal& val);
StringVal KnuthVarianceFinalize(FunctionContext* context, const StringVal& val);

// The different steps of a UDA are composable. In this case, the STDDEV UDA reuses
// the other steps from the Knuth variance computation.
//
// Usage: > create aggregate function stddev(double) returns string
// location '/user/cloudera/libudasample.so' update_fn='KnuthVarianceUpdate'
// finalize_fn="StdDevFinalize";
// > select cast(stddev(col) as double) from tbl;
//
// TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal
// and the return type changed to DoubleVal in Impala 2.0
StringVal StdDevFinalize(FunctionContext* context, const StringVal& val);

// Utility function for serialization to StringVal
// TODO: this will be unnecessary in Impala 2.0, when we will no longer have to serialize
// results to StringVals in order to match the intermediate type
template <typename T>
StringVal ToStringVal(FunctionContext* context, const T& val);
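//
// One plausible definition, assuming <sstream> is available (shown only as an
// illustrative sketch; the real definition lives in uda-sample.cc):
//
//   template <typename T>
//   StringVal ToStringVal(FunctionContext* context, const T& val) {
//     std::stringstream ss;
//     ss << val;
//     const std::string str = ss.str();
//     StringVal result(context, str.size());
//     memcpy(result.ptr, str.c_str(), str.size());
//     return result;
//   }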

#endif
214 changes: 214 additions & 0 deletions testing/udf/udf-sample.cc
@@ -0,0 +1,214 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "udf-sample.h"

#include <cctype>
#include <cmath>
#include <string>

// In this sample we are declaring a UDF that adds two ints and returns an int.
IntVal AddUdf(FunctionContext* context, const IntVal& arg1, const IntVal& arg2) {
if (arg1.is_null || arg2.is_null) return IntVal::null();
return IntVal(arg1.val + arg2.val);
}

// Multiple UDFs can be defined in the same file

BooleanVal FuzzyEquals(FunctionContext* ctx, const DoubleVal& x, const DoubleVal& y) {
const double EPSILON = 0.000001;
if (x.is_null || y.is_null) return BooleanVal::null();
double delta = fabs(x.val - y.val);
return BooleanVal(delta < EPSILON);
}

// Check if the input string has any occurrences of the letters (a,e,i,o,u).
// Case-insensitive, so also detects (A,E,I,O,U).
BooleanVal HasVowels(FunctionContext* context, const StringVal& input) {
if (input.is_null) return BooleanVal::null();

int index;
uint8_t *ptr;

for (ptr = input.ptr, index = 0; index < input.len; index++, ptr++) {
uint8_t c = tolower(*ptr);
if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') {
return BooleanVal(true);
}
}
return BooleanVal(false);
}

// Count all occurrences of the letters (a,e,i,o,u) in the input string.
// Case-insensitive, so also counts (A,E,I,O,U).
IntVal CountVowels(FunctionContext* context, const StringVal& arg1) {
if (arg1.is_null) return IntVal::null();

int count;
int index;
uint8_t *ptr;

for (ptr = arg1.ptr, count = 0, index = 0; index < arg1.len; index++, ptr++) {
uint8_t c = tolower(*ptr);
if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') {
count++;
}
}
return IntVal(count);
}

// Remove all occurrences of the letters (a,e,i,o,u) from the input string.
// Case-insensitive, so also removes (A,E,I,O,U).
StringVal StripVowels(FunctionContext* context, const StringVal& arg1) {
if (arg1.is_null) return StringVal::null();

int index;
std::string original((const char *)arg1.ptr, arg1.len);
std::string shorter("");

for (index = 0; index < original.length(); index++) {
uint8_t c = original[index];
uint8_t l = tolower(c);

if (l == 'a' || l == 'e' || l == 'i' || l == 'o' || l == 'u') {
continue;
}
else {
shorter.append(1, (char)c);
}
}
// The modified string is stored in 'shorter', which is destroyed when this function
// ends. We need to make a StringVal and copy the contents.
// NB: Only the version of the ctor that takes a context object allocates new memory.
StringVal result(context, shorter.size());
memcpy(result.ptr, shorter.c_str(), shorter.size());
return result;
}

// In the prepare function, allocate an IntVal and set it as the shared state. This
// IntVal holds the result to be returned: the argument value if it is constant, and
// null otherwise.
void ReturnConstantArgPrepare(
FunctionContext* context, FunctionContext::FunctionStateScope scope) {
// UDFs should check the version to avoid calling functions that are not implemented
// in older Impala releases
if (context->version() < FunctionContext::v1_3) {
context->SetError("This UDF can only be used with Impala 1.3 or higher");
return;
}
// TODO: this can be FRAGMENT_LOCAL once it's implemented since we're creating
// read-only state
if (scope == FunctionContext::THREAD_LOCAL) {
// Get the constant value of the 'const_val' argument in ReturnConstantArg(). If this
// value is not constant, 'arg' will be NULL.
IntVal* arg = reinterpret_cast<IntVal*>(context->GetConstantArg(0));
// Allocate shared state to store 'arg' or a null IntVal
IntVal* state = reinterpret_cast<IntVal*>(context->Allocate(sizeof(IntVal)));
*state = (arg != NULL) ? *arg : IntVal::null();
// Set the shared state in the function context
context->SetFunctionState(scope, state);
}
}

// Retrieves and returns the shared state set in the prepare function
IntVal ReturnConstantArg(FunctionContext* context, const IntVal& const_val) {
IntVal* state = reinterpret_cast<IntVal*>(
context->GetFunctionState(FunctionContext::THREAD_LOCAL));
return *state;
}

// Cleans up the shared state
void ReturnConstantArgClose(
FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope == FunctionContext::THREAD_LOCAL) {
// Retrieve and deallocate the shared state
void* state = context->GetFunctionState(scope);
context->Free(reinterpret_cast<uint8_t*>(state));
context->SetFunctionState(scope, NULL);
}
}


BooleanVal Identity(FunctionContext* context, const BooleanVal& arg) { return arg; }

TinyIntVal Identity(FunctionContext* context, const TinyIntVal& arg) { return arg; }

SmallIntVal Identity(FunctionContext* context, const SmallIntVal& arg) { return arg; }

IntVal Identity(FunctionContext* context, const IntVal& arg) { return arg; }

BigIntVal Identity(FunctionContext* context, const BigIntVal& arg) { return arg; }

FloatVal Identity(FunctionContext* context, const FloatVal& arg) { return arg; }

DoubleVal Identity(FunctionContext* context, const DoubleVal& arg) { return arg; }

StringVal Identity(FunctionContext* context, const StringVal& arg) { return arg; }

TimestampVal Identity(FunctionContext* context, const TimestampVal& arg) { return arg; }

DecimalVal Identity(FunctionContext* context, const DecimalVal& arg) { return arg; }

IntVal AlmostAllTypes(
FunctionContext* context, const StringVal& string, const BooleanVal& boolean,
const TinyIntVal& tiny_int, const SmallIntVal& small_int, const IntVal& int_val,
const BigIntVal& big_int, const FloatVal& float_val, const DoubleVal& double_val
) {
int result = string.len + boolean.val + tiny_int.val + small_int.val + int_val.val
+ big_int.val + static_cast<int64_t>(float_val.val)
+ static_cast<int64_t>(double_val.val);
return IntVal(result);
}

IntVal AllTypes(
FunctionContext* context, const StringVal& string, const BooleanVal& boolean,
const TinyIntVal& tiny_int, const SmallIntVal& small_int, const IntVal& int_val,
const BigIntVal& big_int, const FloatVal& float_val, const DoubleVal& double_val,
const DecimalVal& decimal) {
int result = string.len + boolean.val + tiny_int.val + small_int.val + int_val.val
+ big_int.val + static_cast<int64_t>(float_val.val)
+ static_cast<int64_t>(double_val.val) + decimal.val4;
return IntVal(result);
}


IntVal TwoArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2) {
return IntVal(v1.val + v2.val);
}

IntVal FourArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2,
const IntVal& v3, const IntVal& v4) {
return IntVal(v1.val + v2.val + v3.val + v4.val);
}

IntVal FiveArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2,
const IntVal& v3, const IntVal& v4, const IntVal& v5) {
return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val);
}

IntVal SixArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2,
const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6) {
return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val);
}

IntVal SevenArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2,
const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6,
const IntVal& v7) {
return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val + v7.val);
}

IntVal EightArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2,
const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6,
const IntVal& v7, const IntVal& v8) {
return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val + v7.val + v8.val);
}
71 changes: 71 additions & 0 deletions testing/udf/udf-sample.h
@@ -0,0 +1,71 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef SAMPLES_UDF_H
#define SAMPLES_UDF_H

#include "lib/udf.h"

using namespace impala_udf;

// Usage: > create function add(int, int) returns int
// location '/user/cloudera/libudfsample.so' SYMBOL='AddUdf';
// > select add(1, 2);
IntVal AddUdf(FunctionContext* context, const IntVal& arg1, const IntVal& arg2);

// Returns true if x is approximately equal to y.
// Usage: > create function fuzzy_equals(double, double) returns boolean
// location '/user/cloudera/libudfsample.so' SYMBOL='FuzzyEquals';
// > select fuzzy_equals(1, 1.00000001);
BooleanVal FuzzyEquals(FunctionContext* context, const DoubleVal& x, const DoubleVal& y);

// Perform tests, calculations, and transformations
// on a string value, using the set of letters 'aeiou'.

// Usage: > create function hasvowels(string) returns boolean
// location '/user/cloudera/libudfsample.so' SYMBOL='HasVowels';
// > select hasvowels('banana');
// > select hasvowels('grr hm shhh');
// > select hasvowels(c1) from t1;
BooleanVal HasVowels(FunctionContext* context, const StringVal& input);


// Usage: > create function countvowels(string) returns int
// location '/user/cloudera/libudfsample.so' SYMBOL='CountVowels';
// > select countvowels('abracadabra hocus pocus');
// > select countvowels(c1) from t1;
IntVal CountVowels(FunctionContext* context, const StringVal& arg1);

// Usage: > create function stripvowels(string) returns string
// location '/user/cloudera/libudfsample.so' SYMBOL='StripVowels';
// > select stripvowels('colour color');
// > select stripvowels(c1) from t1;
StringVal StripVowels(FunctionContext* context, const StringVal& arg1);

// If 'val' is constant, returns 'val', otherwise returns null. This is a simple toy UDF
// demonstrating how to use prepare and close functions to maintain shared state.
// Requires Impala 1.3 or higher.
// Usage: > create function constantarg(int) returns int
// location '/user/cloudera/libudfsample.so' symbol='ReturnConstantArg'
// prepare_fn='ReturnConstantArgPrepare' close_fn='ReturnConstantArgClose';
// > select constantarg(1 + 1);
// > select constantarg(c1) from t1 limit 1;
IntVal ReturnConstantArg(FunctionContext* context, const IntVal& val);
void ReturnConstantArgPrepare(
FunctionContext* context, FunctionContext::FunctionStateScope scope);
void ReturnConstantArgClose(
FunctionContext* context, FunctionContext::FunctionStateScope scope);

#endif
146 changes: 146 additions & 0 deletions testing/udf/variance-uda.cc
@@ -0,0 +1,146 @@
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <assert.h>
#include <math.h>
#include <algorithm>
#include <sstream>
#include <iostream>
#include "lib/udf.h"

#include "uda-sample.h"

using namespace std;
using namespace impala_udf;

// An implementation of a simple single-pass variance algorithm. A standard UDA must
// be single pass (i.e. it does not scan the table more than once), so the canonical
// two-pass approach is not practical.
// This algorithm suffers from numerical precision issues when the input values are
// large, due to floating-point rounding.
struct VarianceState {
// Sum of all input values.
double sum;
// Sum of the square of all input values.
double sum_squared;
// The number of input values.
int64_t count;
};

void VarianceInit(FunctionContext* ctx, StringVal* dst) {
dst->is_null = false;
dst->len = sizeof(VarianceState);
dst->ptr = ctx->Allocate(dst->len);
memset(dst->ptr, 0, dst->len);
}

void VarianceUpdate(FunctionContext* ctx, const DoubleVal& src, StringVal* dst) {
if (src.is_null) return;
VarianceState* state = reinterpret_cast<VarianceState*>(dst->ptr);
state->sum += src.val;
state->sum_squared += src.val * src.val;
++state->count;
}

void VarianceMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) {
VarianceState* src_state = reinterpret_cast<VarianceState*>(src.ptr);
VarianceState* dst_state = reinterpret_cast<VarianceState*>(dst->ptr);
dst_state->sum += src_state->sum;
dst_state->sum_squared += src_state->sum_squared;
dst_state->count += src_state->count;
}

// A serialize function is necessary to free the intermediate state allocation.
const StringVal VarianceSerialize(FunctionContext* ctx, const StringVal& src) {
StringVal result(ctx, src.len);
memcpy(result.ptr, src.ptr, src.len);
ctx->Free(src.ptr);
return result;
}

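// Finalize computes the unbiased sample variance from the accumulated sums:
//   var = (sum_squared - sum^2 / count) / (count - 1)
// which is algebraically equivalent to sum((x_i - mean)^2) / (count - 1) but can lose
// precision when the values are large relative to their spread.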
StringVal VarianceFinalize(FunctionContext* ctx, const StringVal& src) {
VarianceState state = *reinterpret_cast<VarianceState*>(src.ptr);
ctx->Free(src.ptr);
if (state.count == 0 || state.count == 1) return StringVal::null();
double mean = state.sum / state.count;
double variance =
(state.sum_squared - state.sum * state.sum / state.count) / (state.count - 1);
return ToStringVal(ctx, variance);
}

struct KnuthVarianceState {
int64_t count;
double mean;
double m2;
};

void KnuthVarianceInit(FunctionContext* ctx, StringVal* dst) {
dst->is_null = false;
dst->len = sizeof(KnuthVarianceState);
dst->ptr = ctx->Allocate(dst->len);
memset(dst->ptr, 0, dst->len);
}

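// One step of Welford's online algorithm: after n values, 'mean' is the running mean
// and 'm2' is the sum of squared deviations from it, sum((x_i - mean)^2). For a new
// value x with n' = n + 1:
//   mean' = mean + (x - mean) / n'
//   m2'   = m2 + n * (x - mean) * ((x - mean) / n')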
void KnuthVarianceUpdate(FunctionContext* ctx, const DoubleVal& src, StringVal* dst) {
if (src.is_null) return;
KnuthVarianceState* state = reinterpret_cast<KnuthVarianceState*>(dst->ptr);
double temp = 1 + state->count;
double delta = src.val - state->mean;
double r = delta / temp;
state->mean += r;
state->m2 += state->count * delta * r;
state->count = temp;
}

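// Merging two partial states uses the parallel combination of Welford states
// (Chan et al.): with delta = mean_dst - mean_src and n = n_src + n_dst,
//   mean = mean_src + delta * (n_dst / n)
//   m2   = m2_src + m2_dst + delta^2 * (n_src * n_dst / n)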
void KnuthVarianceMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) {
KnuthVarianceState* src_state = reinterpret_cast<KnuthVarianceState*>(src.ptr);
KnuthVarianceState* dst_state = reinterpret_cast<KnuthVarianceState*>(dst->ptr);
if (src_state->count == 0) return;
double delta = dst_state->mean - src_state->mean;
double sum_count = dst_state->count + src_state->count;
dst_state->mean = src_state->mean + delta * (dst_state->count / sum_count);
dst_state->m2 = (src_state->m2) + dst_state->m2 +
(delta * delta) * (src_state->count * dst_state->count / sum_count);
dst_state->count = sum_count;
}

// Same as VarianceSerialize(). A separate wrapper function is defined so that
// automatic symbol resolution still works.
const StringVal KnuthVarianceSerialize(FunctionContext* ctx, const StringVal& state_sv) {
return VarianceSerialize(ctx, state_sv);
}

// TODO: this can be used as the actual variance finalize function once the return type
// doesn't need to match the intermediate type in Impala 2.0.
DoubleVal KnuthVarianceFinalize(const StringVal& state_sv) {
KnuthVarianceState* state = reinterpret_cast<KnuthVarianceState*>(state_sv.ptr);
if (state->count == 0 || state->count == 1) return DoubleVal::null();
double variance_n = state->m2 / state->count;
double variance = variance_n * state->count / (state->count - 1);
return DoubleVal(variance);
}

StringVal KnuthVarianceFinalize(FunctionContext* ctx, const StringVal& src) {
StringVal result = ToStringVal(ctx, KnuthVarianceFinalize(src));
ctx->Free(src.ptr);
return result;
}

StringVal StdDevFinalize(FunctionContext* ctx, const StringVal& src) {
DoubleVal variance = KnuthVarianceFinalize(src);
ctx->Free(src.ptr);
if (variance.is_null) return StringVal::null();
return ToStringVal(ctx, sqrt(variance.val));
}