57 changes: 47 additions & 10 deletions ibis/impala/tests/common.py
@@ -15,6 +15,7 @@
import os
import time
import six
from posixpath import join as pjoin

import pytest

@@ -24,16 +25,26 @@
import ibis


GLOBAL_TMP_DB = os.environ.get('IBIS_TEST_TMP_DB',
'__ibis_tmp_{0}'.format(util.guid()))


GLOBAL_TMP_DIR = os.environ.get('IBIS_TEST_TMP_HDFS_DIR',
'/tmp/__ibis_test')

# update global Ibis config where relevant
options.impala.temp_db = GLOBAL_TMP_DB
options.impala.temp_hdfs_path = GLOBAL_TMP_DIR


class IbisTestEnv(object):

def __init__(self):
# TODO: allow initializing values through a constructor
self.impala_host = os.environ.get('IBIS_TEST_IMPALA_HOST', 'localhost')
self.impala_port = int(os.environ.get('IBIS_TEST_IMPALA_PORT', 21050))
self.tmp_db = os.environ.get('IBIS_TEST_TMP_DB',
'__ibis_tmp_{0}'.format(util.guid()))
self.tmp_dir = os.environ.get('IBIS_TEST_TMP_HDFS_DIR',
'/tmp/__ibis_test')
self.tmp_db = GLOBAL_TMP_DB
self.tmp_dir = GLOBAL_TMP_DIR
self.test_data_db = os.environ.get('IBIS_TEST_DATA_DB', 'ibis_testing')
self.test_data_dir = os.environ.get('IBIS_TEST_DATA_HDFS_DIR',
'/__ibis/ibis-testing-data')
@@ -48,16 +59,16 @@ def __init__(self):
'True').lower() == 'true'
self.auth_mechanism = os.environ.get('IBIS_TEST_AUTH_MECH', 'NOSASL')
self.llvm_config = os.environ.get('IBIS_TEST_LLVM_CONFIG', None)
# update global Ibis config where relevant
options.impala.temp_db = self.tmp_db
options.impala.temp_hdfs_path = self.tmp_dir

def __repr__(self):
kvs = ['{0}={1}'.format(k, v)
for (k, v) in six.iteritems(self.__dict__)]
return 'IbisTestEnv(\n {0})'.format(',\n '.join(kvs))


ENV = IbisTestEnv()


def connect_test(env, with_hdfs=True):
if with_hdfs:
if env.auth_mechanism in ['GSSAPI', 'LDAP']:
@@ -83,7 +94,18 @@ class ImpalaE2E(object):

@classmethod
def setUpClass(cls):
        ImpalaE2E.setup_e2e(cls)

        # make sure codegen stays disabled for the whole suite
opts = cls.con.get_options()
assert opts['DISABLE_CODEGEN'] == '1'

@classmethod
def tearDownClass(cls):
ImpalaE2E.teardown_e2e(cls)

@staticmethod
def setup_e2e(cls):
cls.con = connect_test(ENV)
        # Tests generally run faster with codegen disabled
if not ENV.use_codegen:
@@ -100,8 +122,8 @@ def setUpClass(cls):
if not cls.con.exists_database(cls.tmp_db):
cls.con.create_database(cls.tmp_db)

@classmethod
def tearDownClass(cls):
@staticmethod
def teardown_e2e(cls):
i, retries = 0, 3
while True:
# reduce test flakiness
@@ -115,6 +137,21 @@ def tearDownClass(cls):

time.sleep(0.1)

@classmethod
def _create_777_tmp_dir(cls):
base = pjoin(cls.tmp_dir, util.guid())
tmp_path = pjoin(base, util.guid())
env = IbisTestEnv()
superuser_hdfs = ibis.hdfs_connect(host=env.nn_host,
port=env.webhdfs_port,
auth_mechanism=env.auth_mechanism,
verify=(env.auth_mechanism
not in ['GSSAPI', 'LDAP']),
user=env.hdfs_superuser)
superuser_hdfs.mkdir(base)
superuser_hdfs.chmod(base, '777')
return tmp_path

def setUp(self):
self.temp_databases = []
self.temp_tables = []
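Note: with this refactor the temp database and temp HDFS directory are resolved once at import time (GLOBAL_TMP_DB / GLOBAL_TMP_DIR) and shared by every IbisTestEnv. A minimal sketch of pointing the suite at a different cluster; the host value is illustrative, and the overrides must happen before this module is imported, since the globals are computed at import time:

    import os
    os.environ['IBIS_TEST_IMPALA_HOST'] = 'impala-gateway'    # hypothetical host
    os.environ['IBIS_TEST_TMP_HDFS_DIR'] = '/tmp/__ibis_test'

    from ibis.impala.tests.common import ENV, connect_test
    con = connect_test(ENV)   # Impala client; pass with_hdfs=False to skip HDFS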
44 changes: 42 additions & 2 deletions ibis/impala/tests/test_client.py
@@ -160,6 +160,13 @@ def test_limit_overrides_expr(self):
result = self.con.execute(t.limit(10), limit=5)
assert len(result) == 5

def test_limit_equals_none_no_limit(self):
t = self.alltypes

with config.option_context('sql.default_limit', 10):
result = t.execute(limit=None)
assert len(result) > 10

def test_verbose_log_queries(self):
queries = []

@@ -171,8 +178,7 @@ def logger(x):
self.con.table('tpch_orders', database=self.test_data_db)

assert len(queries) == 1
expected = 'SELECT * FROM {0}.`tpch_orders` LIMIT 0'.format(
self.test_data_db)
expected = 'DESCRIBE {0}.`tpch_orders`'.format(self.test_data_db)
assert queries[0] == expected

def test_sql_query_limits(self):
@@ -294,3 +300,37 @@ def test_query_cancel(self):
assert elapsed < 5

assert q.is_finished()

def test_set_compression_codec(self):
old_opts = self.con.get_options()
assert old_opts['COMPRESSION_CODEC'].upper() == 'NONE'

self.con.set_compression_codec('snappy')
opts = self.con.get_options()
assert opts['COMPRESSION_CODEC'].upper() == 'SNAPPY'

self.con.set_compression_codec(None)
opts = self.con.get_options()
assert opts['COMPRESSION_CODEC'].upper() == 'NONE'

def test_disable_codegen(self):
self.con.disable_codegen(False)
opts = self.con.get_options()
assert opts['DISABLE_CODEGEN'] == '0'

self.con.disable_codegen()
opts = self.con.get_options()
assert opts['DISABLE_CODEGEN'] == '1'

impala_con = self.con.con
cur1 = impala_con.execute('SET')
cur2 = impala_con.execute('SET')

opts1 = dict(cur1.fetchall())
cur1.release()

opts2 = dict(cur2.fetchall())
cur2.release()

assert opts1['DISABLE_CODEGEN'] == '1'
assert opts2['DISABLE_CODEGEN'] == '1'
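Note: the two tests above pin down the session-option helpers. A short usage sketch, assuming `con` is an Impala client like the one these tests use:

    con.set_compression_codec('snappy')
    assert con.get_options()['COMPRESSION_CODEC'].upper() == 'SNAPPY'
    con.set_compression_codec(None)    # reset to NONE

    con.disable_codegen()              # sets DISABLE_CODEGEN=1 for the session
    con.disable_codegen(False)         # re-enables codegen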
315 changes: 298 additions & 17 deletions ibis/impala/tests/test_ddl.py
@@ -16,13 +16,13 @@
import gc

import ibis

import pandas as pd

from posixpath import join as pjoin
import pytest

from ibis.expr.tests.mocks import MockConnection
from ibis.compat import unittest
from ibis.compat import unittest, mock
from ibis.impala import ddl
from ibis.impala.compat import HS2Error, ImpylaError
from ibis.impala.client import build_ast
@@ -49,7 +49,7 @@ def test_must_exist(self):
assert query == expected


class TestInsert(unittest.TestCase):
class TestInsertLoadData(unittest.TestCase):

def setUp(self):
self.con = MockConnection()
@@ -81,6 +81,43 @@ def test_select_basics(self):
LIMIT 10"""
assert result == expected

def test_load_data_unpartitioned(self):
path = '/path/to/data'
stmt = ddl.LoadData('functional_alltypes', path, database='foo')

result = stmt.compile()
expected = ("LOAD DATA INPATH '/path/to/data' "
"INTO TABLE foo.`functional_alltypes`")
assert result == expected

stmt.overwrite = True
result = stmt.compile()
expected = ("LOAD DATA INPATH '/path/to/data' "
"OVERWRITE INTO TABLE foo.`functional_alltypes`")
assert result == expected

def test_load_data_partitioned(self):
path = '/path/to/data'
part = {'year': 2007, 'month': 7}
part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
stmt = ddl.LoadData('functional_alltypes', path,
database='foo',
partition=part,
partition_schema=part_schema)

result = stmt.compile()
expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
assert result == expected

stmt.overwrite = True
result = stmt.compile()
expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
assert result == expected

def test_select_overwrite(self):
pass

@@ -99,6 +136,116 @@ def test_pool_name(self):
assert query == expected


class TestAlterTablePartition(unittest.TestCase):

def setUp(self):
self.part_schema = ibis.schema([('year', 'int32'),
('month', 'int32')])
self.table_name = 'tbl'

def test_add_partition(self):
stmt = ddl.AddPartition(self.table_name,
{'year': 2007, 'month': 4},
self.part_schema)

result = stmt.compile()
expected = 'ALTER TABLE tbl ADD PARTITION (year=2007, month=4)'
assert result == expected

def test_drop_partition(self):
stmt = ddl.DropPartition(self.table_name,
{'year': 2007, 'month': 4},
self.part_schema)

result = stmt.compile()
expected = 'ALTER TABLE tbl DROP PARTITION (year=2007, month=4)'
assert result == expected

def test_add_partition_with_props(self):
props = dict(
location='/users/foo/my-data'
)
stmt = ddl.AddPartition(self.table_name,
{'year': 2007, 'month': 4},
self.part_schema, **props)

result = stmt.compile()
expected = """\
ALTER TABLE tbl ADD PARTITION (year=2007, month=4)
LOCATION '/users/foo/my-data'"""
assert result == expected

def test_alter_partition_properties(self):
part = {'year': 2007, 'month': 4}

def _get_ddl_string(props):
stmt = ddl.AlterPartition(self.table_name, part,
self.part_schema,
**props)
return stmt.compile()

result = _get_ddl_string({'location': '/users/foo/my-data'})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET LOCATION '/users/foo/my-data'"""
assert result == expected

result = _get_ddl_string({'format': 'avro'})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET FILEFORMAT AVRO"""
assert result == expected

result = _get_ddl_string({'tbl_properties': {
'bar': 2, 'foo': '1'
}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET TBLPROPERTIES ('bar'='2', 'foo'='1')"""
assert result == expected

result = _get_ddl_string({'serde_properties': {'baz': 3}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET SERDEPROPERTIES ('baz'='3')"""
assert result == expected

def test_alter_table_properties(self):
part = {'year': 2007, 'month': 4}

def _get_ddl_string(props):
stmt = ddl.AlterPartition(self.table_name, part,
self.part_schema,
**props)
return stmt.compile()

result = _get_ddl_string({'location': '/users/foo/my-data'})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET LOCATION '/users/foo/my-data'"""
assert result == expected

result = _get_ddl_string({'format': 'avro'})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET FILEFORMAT AVRO"""
assert result == expected

result = _get_ddl_string({'tbl_properties': {
'bar': 2, 'foo': '1'
}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET TBLPROPERTIES ('bar'='2', 'foo'='1')"""
assert result == expected

result = _get_ddl_string({'serde_properties': {'baz': 3}})
expected = """\
ALTER TABLE tbl PARTITION (year=2007, month=4)
SET SERDEPROPERTIES ('baz'='3')"""
assert result == expected


class TestCreateTable(unittest.TestCase):

def setUp(self):
@@ -337,7 +484,31 @@ def test_partition_by(self):
pass


class TestDDLOperations(ImpalaE2E, unittest.TestCase):
class TestDDLE2E(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls)

cls.path_uuid = 'change-location-{0}'.format(util.guid())
fake_path = pjoin(cls.tmp_dir, cls.path_uuid)

cls.table_name = 'table_{0}'.format(util.guid())

schema = ibis.schema([('foo', 'string'), ('bar', 'int64')])

cls.con.create_table(cls.table_name,
database=cls.tmp_db,
schema=schema,
format='parquet',
external=True,
location=fake_path)
cls.table = cls.con.table(cls.table_name, database=cls.tmp_db)

@classmethod
def tearDownClass(cls):
cls.con.drop_table(cls.table_name, database=cls.tmp_db)
ImpalaE2E.teardown_e2e(cls)

def test_list_databases(self):
assert len(self.con.list_databases()) > 0
@@ -462,7 +633,7 @@ def test_create_table_with_location(self):
expr = self.alltypes
table_name = _random_table_name()

self.con.create_table(table_name, expr=expr, path=tmp_path,
self.con.create_table(table_name, obj=expr, location=tmp_path,
database=self.test_data_db)
self.temp_tables.append('.'.join([self.test_data_db, table_name]))
assert self.hdfs.exists(tmp_path)
@@ -477,7 +648,7 @@ def test_truncate_table(self):
expr = self.alltypes.limit(50)

table_name = util.guid()
self.con.create_table(table_name, expr=expr)
self.con.create_table(table_name, obj=expr)
self.temp_tables.append(table_name)

try:
@@ -564,7 +735,91 @@ def test_insert_validate_types(self):
t.insert(to_insert.limit(10))

def test_compute_stats(self):
self.con.table('functional_alltypes').compute_stats()
t = self.con.table('functional_alltypes')

t.compute_stats()
t.compute_stats(incremental=True)

self.con.compute_stats('functional_alltypes')

def test_invalidate_metadata(self):
with self._patch_execute() as ex_mock:
self.con.invalidate_metadata()
ex_mock.assert_called_with('INVALIDATE METADATA')

self.con.invalidate_metadata('functional_alltypes')
t = self.con.table('functional_alltypes')
t.invalidate_metadata()

with self._patch_execute() as ex_mock:
self.con.invalidate_metadata('functional_alltypes',
database=self.test_data_db)
ex_mock.assert_called_with('INVALIDATE METADATA '
'{0}.`{1}`'
.format(self.test_data_db,
'functional_alltypes'))

def test_refresh(self):
tname = 'functional_alltypes'
with self._patch_execute() as ex_mock:
self.con.refresh(tname)
ex_cmd = 'REFRESH {0}.`{1}`'.format(self.test_data_db,
tname)
ex_mock.assert_called_with(ex_cmd)

t = self.con.table(tname)
with self._patch_execute() as ex_mock:
t.refresh()
ex_cmd = 'REFRESH {0}.`{1}`'.format(self.test_data_db,
tname)
ex_mock.assert_called_with(ex_cmd)

def _patch_execute(self):
return mock.patch.object(self.con, '_execute',
wraps=self.con._execute)

def test_describe_formatted(self):
from ibis.impala.metadata import TableMetadata

t = self.con.table('functional_alltypes')
with self._patch_execute() as ex_mock:
desc = t.describe_formatted()
ex_mock.assert_called_with('DESCRIBE FORMATTED '
'{0}.`{1}`'
.format(self.test_data_db,
'functional_alltypes'),
results=True)
assert isinstance(desc, TableMetadata)

def test_show_files(self):
t = self.con.table('functional_alltypes')
qualified_name = '{0}.`{1}`'.format(self.test_data_db,
'functional_alltypes')
with self._patch_execute() as ex_mock:
desc = t.files()
ex_mock.assert_called_with('SHOW FILES IN {0}'
.format(qualified_name),
results=True)
assert isinstance(desc, pd.DataFrame)

def test_table_column_stats(self):
t = self.con.table('functional_alltypes')

qualified_name = '{0}.`{1}`'.format(self.test_data_db,
'functional_alltypes')
with self._patch_execute() as ex_mock:
desc = t.stats()
ex_mock.assert_called_with('SHOW TABLE STATS {0}'
.format(qualified_name),
results=True)
assert isinstance(desc, pd.DataFrame)

with self._patch_execute() as ex_mock:
desc = t.column_stats()
ex_mock.assert_called_with('SHOW COLUMN STATS {0}'
.format(qualified_name),
results=True)
assert isinstance(desc, pd.DataFrame)

def test_drop_table_or_view(self):
t = self.db.functional_alltypes
@@ -585,28 +840,54 @@ def test_drop_table_or_view(self):
t3.drop()
assert vname not in self.db


class TestAlterTable(ImpalaE2E, unittest.TestCase):

def test_rename_table(self):
tmp_db = '__ibis_tmp_{0}'.format(util.guid()[:4])
self.con.create_database(tmp_db)
self.temp_databases.append(tmp_db)

self.con.create_table('tmp_rename_test',
orig_name = 'tmp_rename_test'
self.con.create_table(orig_name,
self.con.table('tpch_region'))
table = self.con.table('tmp_rename_test')
table = self.con.table(orig_name)

new_name = 'rename_test'
table.rename(new_name, database=tmp_db)
old_name = table.name

table.execute()
new_name = 'rename_test'
renamed = table.rename(new_name, database=tmp_db)
renamed.execute()

t = self.con.table(new_name, database=tmp_db)
assert_equal(table, t)
assert_equal(renamed, t)

assert table.name == old_name

def test_change_location(self):
old_loc = self.table.metadata().location

new_path = pjoin(self.tmp_dir, 'new-path')
self.table.alter(location=new_path)

new_loc = self.table.metadata().location
assert new_loc == old_loc.replace(self.path_uuid, 'new-path')

def test_change_properties(self):
props = {'foo': '1', 'bar': '2'}

self.table.alter(tbl_properties=props)
tbl_props = self.table.metadata().tbl_properties
        for k, v in props.items():
assert v == tbl_props[k]

self.table.alter(serde_properties=props)
serde_props = self.table.metadata().serde_properties
        for k, v in props.items():
assert v == serde_props[k]

def test_change_format(self):
self.table.alter(format='avro')

class TestQueryHDFSData(ImpalaE2E, unittest.TestCase):
meta = self.table.metadata()
assert 'Avro' in meta.hive_format

def test_cleanup_tmp_table_on_gc(self):
hdfs_path = pjoin(self.test_data_dir, 'parquet/tpch_region')
95 changes: 68 additions & 27 deletions ibis/impala/tests/test_exprs.py
@@ -18,6 +18,7 @@

import ibis

from ibis import literal as L
from ibis.compat import unittest, StringIO, Decimal
from ibis.expr.datatypes import Category
from ibis.expr.tests.mocks import MockConnection
@@ -57,7 +58,7 @@ def setUp(self):

def _check_literals(self, cases):
for value, expected in cases:
lit_expr = ibis.literal(value)
lit_expr = L(value)
result = self._translate(lit_expr)
assert result == expected

@@ -69,7 +70,7 @@ def test_string_literals(self):
]

for value, expected in cases:
lit_expr = ibis.literal(value)
lit_expr = L(value)
result = self._translate(lit_expr)
assert result == expected

@@ -209,7 +210,7 @@ def test_misc_conditionals(self):

def test_decimal_casts(self):
cases = [
(ibis.literal('9.9999999').cast('decimal(38,5)'),
(L('9.9999999').cast('decimal(38,5)'),
"CAST('9.9999999' AS decimal(38,5))"),
(self.table.f.cast('decimal(12,2)'), "CAST(`f` AS decimal(12,2))")
]
@@ -276,8 +277,8 @@ def test_timestamp_literals(self):
ex1 = ("'2015-01-01 12:34:56'")

cases = [
(ibis.literal(Timestamp(tv1)), ex1),
(ibis.literal(Timestamp(tv1).to_pydatetime()), ex1),
(L(Timestamp(tv1)), ex1),
(L(Timestamp(tv1).to_pydatetime()), ex1),
(ibis.timestamp(tv1), ex1)
]
self._check_expr_cases(cases)
@@ -406,7 +407,7 @@ def test_reduction_where(self):
self._check_expr_cases(cases)

def test_reduction_invalid_where(self):
condbad_literal = ibis.literal('T')
condbad_literal = L('T')
c = self.table.double_col
for reduction in [c.sum, c.count, c.mean, c.max, c.min]:
with self.assertRaises(TypeError):
@@ -698,9 +699,9 @@ def test_field_in_literals(self):

def test_literal_in_list(self):
cases = [
(ibis.literal(2).isin([self.table.a, self.table.b, self.table.c]),
(L(2).isin([self.table.a, self.table.b, self.table.c]),
'2 IN (`a`, `b`, `c`)'),
(ibis.literal(2).notin([self.table.a, self.table.b, self.table.c]),
(L(2).notin([self.table.a, self.table.b, self.table.c]),
'2 NOT IN (`a`, `b`, `c`)')
]
self._check_expr_cases(cases)
@@ -905,7 +906,7 @@ def test_find_in_set(self):

def test_string_join(self):
cases = [
(ibis.literal(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
(L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
]
self._check_expr_cases(cases)

@@ -928,17 +929,17 @@ def test_table_info(self):

def test_execute_exprs_no_table_ref(self):
cases = [
(ibis.literal(1) + ibis.literal(2), 3)
(L(1) + L(2), 3)
]

for expr, expected in cases:
result = self.con.execute(expr)
assert result == expected

# ExprList
exlist = ibis.api.expr_list([ibis.literal(1).name('a'),
exlist = ibis.api.expr_list([L(1).name('a'),
ibis.now().name('b'),
ibis.literal(2).log().name('c')])
L(2).log().name('c')])
self.con.execute(exlist)

def test_summary_execute(self):
@@ -1133,8 +1134,8 @@ def assert_cases_equality(self, cases):
assert result == expected, to_sql(expr)

def test_int_builtins(self):
i8 = ibis.literal(50)
i32 = ibis.literal(50000)
i8 = L(50)
i32 = L(50000)

mod_cases = [
(i8 % 5, 0),
@@ -1162,8 +1163,8 @@ def test_column_types(self):
assert pd.core.common.is_datetime64_dtype(df.timestamp_col.dtype)

def test_timestamp_builtins(self):
i32 = ibis.literal(50000)
i64 = ibis.literal(5 * 10 ** 8)
i32 = L(50000)
i64 = L(5 * 10 ** 8)

stamp = ibis.timestamp('2009-05-17 12:34:56')

@@ -1182,9 +1183,9 @@ def test_timestamp_builtins(self):
self.assert_cases_equality(timestamp_cases)

def test_decimal_builtins(self):
d = ibis.literal(5.245)
d = L(5.245)
general_cases = [
(ibis.literal(-5).abs(), 5),
(L(-5).abs(), 5),
(d.cast('int32'), 5),
(d.ceil(), 6),
(d.isnull(), False),
@@ -1197,7 +1198,7 @@ def test_decimal_builtins(self):
self.assert_cases_equality(general_cases)

def test_decimal_builtins_2(self):
d = ibis.literal('5.245')
d = L('5.245')
dc = d.cast('decimal(12,5)')
cases = [
(dc % 5, Decimal('0.245')),
@@ -1222,12 +1223,12 @@ def test_decimal_builtins_2(self):
approx_equal(result, expected, tol)

def test_string_functions(self):
string = ibis.literal('abcd')
strip_string = ibis.literal(' a ')
string = L('abcd')
strip_string = L(' a ')

cases = [
(string.length(), 4),
(ibis.literal('ABCD').lower(), 'abcd'),
(L('ABCD').lower(), 'abcd'),
(string.upper(), 'ABCD'),
(string.reverse(), 'dcba'),
(string.ascii_str(), 97),
@@ -1239,23 +1240,52 @@ def test_string_functions(self):
(string.left(2), 'ab'),
(string.right(2), 'cd'),
(string.repeat(2), 'abcdabcd'),
(ibis.literal('0123').translate('012', 'abc'), 'abc3'),

# global replace not available in Impala yet
# (L('aabbaabbaa').replace('bb', 'B'), 'aaBaaBaa'),

(L('0123').translate('012', 'abc'), 'abc3'),
(string.find('a'), 0),
(ibis.literal('baaaab').find('b', 2), 5),
(L('baaaab').find('b', 2), 5),
(string.lpad(1, '-'), 'a'),
(string.lpad(5), ' abcd'),
(string.rpad(1, '-'), 'a'),
(string.rpad(5), 'abcd '),
(string.find_in_set(['a', 'b', 'abcd']), 2),
(ibis.literal(', ').join(['a', 'b']), 'a, b'),
(L(', ').join(['a', 'b']), 'a, b'),
(string.like('a%'), True),
(string.re_search('[a-z]'), True),
(ibis.literal("https://www.cloudera.com").parse_url('HOST'),
"www.cloudera.com"),

(string.re_extract('[a-z]', 0), 'a'),
(string.re_replace('(b)', '2'), 'a2cd'),
]

self._check_cases(cases)

def _check_cases(self, cases):
for expr, expected in cases:
result = self.con.execute(expr)
assert result == expected

def test_parse_url(self):
cases = [
(L("https://www.cloudera.com").parse_url('HOST'),
"www.cloudera.com"),

(L('https://www.youtube.com/watch?v=kEuEcWfewf8&t=10')
.parse_url('QUERY', 'v'),
'kEuEcWfewf8'),
]
self._check_cases(cases)

def test_div_floordiv(self):
cases = [
(L(7) / 2, 3.5),
(L(7) // 2, 3),
(L(7).floordiv(2), 3),
(L(2).rfloordiv(7), 3),
]

for expr, expected in cases:
result = self.con.execute(expr)
assert result == expected
@@ -1509,3 +1539,14 @@ def test_non_equijoin(self):

# it works
expr.execute()

def test_char_varchar_types(self):
sql = """\
SELECT CAST(string_col AS varchar(20)) AS varchar_col,
CAST(string_col AS CHAR(5)) AS char_col
FROM functional_alltypes"""

t = self.con.sql(sql)

assert isinstance(t.varchar_col, api.StringArray)
assert isinstance(t.char_col, api.StringArray)
129 changes: 129 additions & 0 deletions ibis/impala/tests/test_metadata.py
@@ -0,0 +1,129 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd

from numpy import nan

from ibis.compat import unittest
from ibis.impala.metadata import parse_metadata


def _glue_lists_spacer(spacer, lists):
result = list(lists[0])
for lst in lists[1:]:
result.append(spacer)
result.extend(lst)
return result


class TestMetadataParser(unittest.TestCase):

@classmethod
def setUpClass(cls):
cls.spacer = ('', nan, nan)

cls.schema = [
('# col_name', 'data_type', 'comment'),
cls.spacer,
('foo', 'int', nan),
('bar', 'tinyint', nan),
('baz', 'bigint', nan)
]

cls.partitions = [
('# Partition Information', nan, nan),
('# col_name', 'data_type', 'comment'),
cls.spacer,
('qux', 'bigint', nan)
]

cls.info = [
('# Detailed Table Information', nan, nan),
('Database:', 'tpcds', nan),
('Owner:', 'wesm', nan),
('CreateTime:', 'Sun Nov 08 01:09:42 PST 2015', nan),
('LastAccessTime:', 'UNKNOWN', nan),
('Protect Mode:', 'None', nan),
('Retention:', '0', nan),
('Location:', ('hdfs://host-name:20500/my.db'
'/dbname.table_name'), nan),
('Table Type:', 'EXTERNAL_TABLE', nan),
('Table Parameters:', nan, nan),
('', 'EXTERNAL', 'TRUE'),
('', 'STATS_GENERATED_VIA_STATS_TASK', 'true'),
('', 'numRows', '183592'),
('', 'transient_lastDdlTime', '1447369741'),
]

cls.storage_info = [
('# Storage Information', nan, nan),
('SerDe Library:', ('org.apache.hadoop'
'.hive.serde2.lazy.LazySimpleSerDe'), nan),
('InputFormat:', 'org.apache.hadoop.mapred.TextInputFormat', nan),
('OutputFormat:',
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
nan),
('Compressed:', 'No', nan),
('Num Buckets:', '0', nan),
('Bucket Columns:', '[]', nan),
('Sort Columns:', '[]', nan),
('Storage Desc Params:', nan, nan),
('', 'field.delim', '|'),
('', 'serialization.format', '|')
]

cls.part_metadata = pd.DataFrame.from_records(
_glue_lists_spacer(cls.spacer, [cls.schema, cls.partitions,
cls.info, cls.storage_info]),
columns=['name', 'type', 'comment'])

cls.unpart_metadata = pd.DataFrame.from_records(
_glue_lists_spacer(cls.spacer, [cls.schema, cls.info,
cls.storage_info]),
columns=['name', 'type', 'comment'])

cls.parsed_part = parse_metadata(cls.part_metadata)
cls.parsed_unpart = parse_metadata(cls.unpart_metadata)

def test_table_params(self):
params = self.parsed_part.info['Table Parameters']

assert params['EXTERNAL'] is True
assert params['STATS_GENERATED_VIA_STATS_TASK'] is True
assert params['numRows'] == 183592
assert (params['transient_lastDdlTime'] ==
pd.Timestamp('2015-11-12 15:09:01'))

def test_partitions(self):
assert self.parsed_unpart.partitions is None
assert self.parsed_part.partitions == [('qux', 'bigint')]

def test_schema(self):
assert self.parsed_part.schema == [
('foo', 'int'),
('bar', 'tinyint'),
('baz', 'bigint')
]

def test_storage_info(self):
storage = self.parsed_part.storage
assert storage['Compressed'] is False
assert storage['Num Buckets'] == 0

def test_storage_params(self):
params = self.parsed_part.storage['Desc Params']

assert params['field.delim'] == '|'
assert params['serialization.format'] == '|'
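Note: these fixtures mimic the DataFrame that `DESCRIBE FORMATTED` returns. A sketch of the intended end-to-end flow, assuming `describe_formatted` (exercised in test_ddl.py) returns the parsed object tested here:

    t = con.table('functional_alltypes')
    meta = t.describe_formatted()   # TableMetadata
    meta.schema                     # e.g. [('foo', 'int'), ('bar', 'tinyint'), ...]
    meta.partitions                 # None for unpartitioned tables
    meta.storage['Compressed']      # coerced to bool by the parser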
205 changes: 115 additions & 90 deletions ibis/impala/tests/test_pandas_interop.py
@@ -13,50 +13,19 @@
# limitations under the License.

import numpy as np
import pandas as pd
import pytest

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.types as ir
from pandas.util.testing import assert_frame_equal
import pandas as pd

from ibis.compat import unittest
from ibis.common import IbisTypeError
from ibis.impala.client import pandas_to_ibis_schema
from ibis.impala.pandas_interop import pandas_to_ibis_schema, DataFrameWriter
from ibis.impala.tests.common import ImpalaE2E


functional_alltypes_with_nulls = pd.DataFrame({
'bigint_col': np.int64([0, 10, 20, 30, 40, 50, 60, 70, 80, 90]),
'bool_col': np.bool_([True, False, True, False, True, None,
True, False, True, False]),
'date_string_col': ['11/01/10', None, '11/01/10', '11/01/10',
'11/01/10', '11/01/10', '11/01/10', '11/01/10',
'11/01/10', '11/01/10'],
'double_col': np.float64([0.0, 10.1, None, 30.299999999999997,
40.399999999999999, 50.5, 60.599999999999994,
70.700000000000003, 80.799999999999997,
90.899999999999991]),
'float_col': np.float32([None, 1.1000000238418579, 2.2000000476837158,
3.2999999523162842, 4.4000000953674316, 5.5,
6.5999999046325684, 7.6999998092651367,
8.8000001907348633,
9.8999996185302734]),
'int_col': np.int32([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
'smallint_col': np.int16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'string_col': ['0', '1', None, '3', '4', '5', '6', '7', '8', '9'],
'timestamp_col': [pd.Timestamp('2010-11-01 00:00:00'),
None,
pd.Timestamp('2010-11-01 00:02:00.100000'),
pd.Timestamp('2010-11-01 00:03:00.300000'),
pd.Timestamp('2010-11-01 00:04:00.600000'),
pd.Timestamp('2010-11-01 00:05:00.100000'),
pd.Timestamp('2010-11-01 00:06:00.150000'),
pd.Timestamp('2010-11-01 00:07:00.210000'),
pd.Timestamp('2010-11-01 00:08:00.280000'),
pd.Timestamp('2010-11-01 00:09:00.360000')],
'tinyint_col': np.int8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'year': [2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010]})
import ibis.expr.datatypes as dt
import ibis.expr.types as ir
import ibis.util as util
import ibis


class TestPandasTypeInterop(unittest.TestCase):
@@ -168,58 +137,114 @@ def test_dtype_categorical(self):
assert inferred == expected


class TestPandasRoundTrip(ImpalaE2E, unittest.TestCase):
exhaustive_df = pd.DataFrame({
'bigint_col': np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
dtype='i8'),
'bool_col': np.array([True, False, True, False, True, None,
True, False, True, False], dtype=np.bool_),
'bool_obj_col': np.array([True, False, np.nan, False, True, np.nan,
True, np.nan, True, False], dtype=np.object_),
'date_string_col': ['11/01/10', None, '11/01/10', '11/01/10',
'11/01/10', '11/01/10', '11/01/10', '11/01/10',
'11/01/10', '11/01/10'],
'double_col': np.array([0.0, 10.1, np.nan, 30.299999999999997,
40.399999999999999, 50.5, 60.599999999999994,
70.700000000000003, 80.799999999999997,
90.899999999999991], dtype=np.float64),
'float_col': np.array([np.nan, 1.1000000238418579, 2.2000000476837158,
3.2999999523162842, 4.4000000953674316, 5.5,
6.5999999046325684, 7.6999998092651367,
8.8000001907348633,
9.8999996185302734], dtype='f4'),
'int_col': np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i4'),
'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
'smallint_col': np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i2'),
'string_col': ['0', '1', None, 'double , whammy', '4', '5',
'6', '7', '8', '9'],
'timestamp_col': [pd.Timestamp('2010-11-01 00:00:00'),
None,
pd.Timestamp('2010-11-01 00:02:00.100000'),
pd.Timestamp('2010-11-01 00:03:00.300000'),
pd.Timestamp('2010-11-01 00:04:00.600000'),
pd.Timestamp('2010-11-01 00:05:00.100000'),
pd.Timestamp('2010-11-01 00:06:00.150000'),
pd.Timestamp('2010-11-01 00:07:00.210000'),
pd.Timestamp('2010-11-01 00:08:00.280000'),
pd.Timestamp('2010-11-01 00:09:00.360000')],
'tinyint_col': np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='i1'),
'year': [2010, 2010, 2010, 2010, 2010, 2009, 2009, 2009, 2009, 2009]})


class TestPandasInterop(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
super(TestPandasInterop, cls).setUpClass()
cls.alltypes = cls.alltypes.execute()

def test_alltypes_roundtrip(self):
self._check_roundtrip(self.alltypes)

def test_round_trip(self):
pytest.skip('fails')
def test_writer_cleanup_deletes_hdfs_dir(self):
writer = DataFrameWriter(self.con, self.alltypes)

df1 = self.alltypes.execute()
df2 = self.con.pandas(df1, 'bamboo', database=self.tmp_db).execute()
assert (df1.columns == df2.columns).all()
assert (df1.dtypes == df2.dtypes).all()
assert (df1 == df2).all().all()
path = writer.write_temp_csv()
assert self.con.hdfs.exists(path)

def test_round_trip_non_int_missing_data(self):
pytest.skip('WM: hangs -- will investigate later')
df1 = functional_alltypes_with_nulls
table = self.con.pandas(df1, 'fawn', database=self.tmp_db)
writer.cleanup()
assert not self.con.hdfs.exists(path)

# noop
writer.cleanup()
assert not self.con.hdfs.exists(path)

@pytest.mark.superuser
def test_create_table_from_dataframe(self):
tname = 'tmp_pandas_{0}'.format(util.guid())
self.con.create_table(tname, self.alltypes, database=self.tmp_db,
location=self._create_777_tmp_dir())
self.temp_tables.append(tname)

table = self.con.table(tname, database=self.tmp_db)
df = table.execute()
assert_frame_equal(df, self.alltypes)

@pytest.mark.superuser
def test_insert(self):
schema = pandas_to_ibis_schema(exhaustive_df)

table_name = 'tmp_pandas_{0}'.format(util.guid())
self.con.create_table(table_name, database=self.tmp_db,
schema=schema,
location=self._create_777_tmp_dir())
self.temp_tables.append(table_name)

self.con.insert(table_name, exhaustive_df.iloc[:4],
database=self.tmp_db)
self.con.insert(table_name, exhaustive_df.iloc[4:],
database=self.tmp_db)

table = self.con.table(table_name, database=self.tmp_db)

result = (table.execute()
.sort_index(by='tinyint_col')
.reset_index(drop=True))
assert_frame_equal(result, exhaustive_df)

def test_insert_partition(self):
# overwrite

# no overwrite
pass

def test_round_trip_exhaustive(self):
self._check_roundtrip(exhaustive_df)

    def _check_roundtrip(self, df1):
        writer = DataFrameWriter(self.con, df1)
path = writer.write_temp_csv()

table = writer.delimited_table(path)
df2 = table.execute()
assert (df1.columns == df2.columns).all()
assert (df1.dtypes == df2.dtypes).all()
# bool/int cols should be exact
assert (df1.bool_col == df2.bool_col).all()
assert (df1.tinyint_col == df2.tinyint_col).all()
assert (df1.smallint_col == df2.smallint_col).all()
assert (df1.int_col == df2.int_col).all()
assert (df1.bigint_col == df2.bigint_col).all()
assert (df1.month == df2.month).all()
assert (df1.year == df2.year).all()
# string cols should be equal everywhere except for the NULLs
assert ((df1.string_col == df2.string_col) ==
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
assert ((df1.date_string_col == df2.date_string_col) ==
[1, 0, 1, 1, 1, 1, 1, 1, 1, 1]).all()
# float cols within tolerance, and NULLs should be False
assert ((df1.double_col - df2.double_col < 1e-9) ==
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
assert ((df1.float_col - df2.float_col < 1e-9) ==
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]).all()

def test_round_trip_missing_type_promotion(self):
pytest.skip('unfinished')

# prepare Impala table with missing ints
# TODO: switch to self.con.raw_sql once #412 is fixed
create_query = ('CREATE TABLE {0}.missing_ints '
' (tinyint_col TINYINT, bigint_col BIGINT) '
'STORED AS PARQUET'.format(self.tmp_db))
insert_query = ('INSERT INTO {0}.missing_ints '
'VALUES (NULL, 3), (-5, NULL), (19, 444444)'.format(
self.tmp_db))
self.con.con.cursor.execute(create_query)
self.con.con.cursor.execute(insert_query)

table = self.con.table('missing_ints', database=self.tmp_db)
df = table.execute() # noqa # REMOVE LATER

        # WHAT NOW?
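Note: the DataFrameWriter path replaces the old `con.pandas(...)` round trip. A condensed sketch of the cycle `_check_roundtrip` performs, assuming `con` is a connected Impala client:

    from ibis.impala.pandas_interop import DataFrameWriter

    writer = DataFrameWriter(con, df)       # df: any pandas DataFrame
    path = writer.write_temp_csv()          # stage the frame as CSV in HDFS
    table = writer.delimited_table(path)    # ibis table over the staged file
    df2 = table.execute()                   # read it back
    writer.cleanup()                        # idempotent temp-dir removal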
218 changes: 212 additions & 6 deletions ibis/impala/tests/test_partition.py
@@ -12,36 +12,242 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import ibis
from posixpath import join as pjoin
import pytest

from pandas.util.testing import assert_frame_equal
import pandas as pd

from ibis.compat import unittest
from ibis.impala.compat import ImpylaError
from ibis.impala.tests.common import ImpalaE2E
from ibis.tests.util import assert_equal

import ibis
import ibis.util as util


def _tmp_name():
return 'tmp_partition_{0}'.format(util.guid())


class TestPartitioning(ImpalaE2E, unittest.TestCase):

@classmethod
def setUpClass(cls):
ImpalaE2E.setup_e2e(cls)

df = pd.DataFrame({'year': [2009, 2009, 2009, 2010, 2010, 2010],
'month': [1, 2, 3, 1, 2, 3],
'value': [1, 2, 3, 4, 5, 6]})
df = pd.concat([df] * 10, ignore_index=True)
df['id'] = df.index.values

cls.df = df
cls.db = cls.con.database(cls.tmp_db)
cls.pd_name = _tmp_name()
cls.db.create_table(cls.pd_name, df,
location=cls._temp_location())

@classmethod
def _temp_location(cls):
return cls._create_777_tmp_dir()

def test_is_partitioned(self):
schema = ibis.schema([('foo', 'string'),
('year', 'int32'),
('month', 'int16')])
name = _tmp_name()
self.db.create_table(name, schema=schema,
partition=['year', 'month'],
location=self._temp_location())
assert self.db.table(name).is_partitioned

@pytest.mark.superuser
def test_create_table_with_partition_column(self):
schema = ibis.schema([('year', 'int32'),
('month', 'int8'),
('day', 'int8'),
('value', 'double')])

name = util.guid()
self.con.create_table(name, schema=schema, partition=['year', 'month'])
name = _tmp_name()
self.con.create_table(name, schema=schema,
database=self.tmp_db,
partition=['year', 'month'],
location=self._temp_location())
self.temp_tables.append(name)

        # the partition columns are put at the end of the table
ex_schema = ibis.schema([('day', 'int8'),
('value', 'double'),
('year', 'int32'),
('month', 'int8')])
table_schema = self.con.get_schema(name)
table_schema = self.con.get_schema(name, database=self.tmp_db)
assert_equal(table_schema, ex_schema)

partition_schema = self.con.get_partition_schema(name)
partition_schema = self.db.table(name).partition_schema()

expected = ibis.schema([('year', 'int32'),
('month', 'int8')])
assert_equal(partition_schema, expected)

@pytest.mark.superuser
def test_create_partitioned_separate_schema(self):
schema = ibis.schema([('day', 'int8'),
('value', 'double')])
part_schema = ibis.schema([('year', 'int32'),
('month', 'int8')])

name = _tmp_name()
self.con.create_table(name, schema=schema, partition=part_schema,
location=self._temp_location())
self.temp_tables.append(name)

        # the partition columns are put at the end of the table
ex_schema = ibis.schema([('day', 'int8'),
('value', 'double'),
('year', 'int32'),
('month', 'int8')])
table_schema = self.con.get_schema(name)
assert_equal(table_schema, ex_schema)

partition_schema = self.con.table(name).partition_schema()
assert_equal(partition_schema, part_schema)

@pytest.mark.superuser
def test_unpartitioned_table_get_schema(self):
tname = 'functional_alltypes'
with self.assertRaises(ImpylaError):
self.con.table(tname).partition_schema()

@pytest.mark.superuser
def test_insert_select_partitioned_table(self):
df = self.df

unpart_t = self.db.table(self.pd_name)
part_keys = ['year', 'month']
part_t = self._create_partitioned_table(unpart_t.schema(),
part_keys)
unique_keys = df[part_keys].drop_duplicates()

for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
select_stmt = unpart_t[(unpart_t.year == year) &
(unpart_t.month == month)]

# test both styles of insert
if i:
part = {'year': year, 'month': month}
else:
part = [year, month]
part_t.insert(select_stmt, partition=part)

self._verify_partitioned_table(part_t, df, unique_keys)

@pytest.mark.superuser
def test_insert_overwrite_partition(self):
pass

@pytest.mark.superuser
def test_dynamic_partitioning(self):
pass

@pytest.mark.superuser
def test_add_drop_partition(self):
schema = ibis.schema([('foo', 'string'),
('year', 'int32'),
('month', 'int16')])
name = _tmp_name()
tmp_dir = self._temp_location()
self.db.create_table(name, schema=schema,
partition=['year', 'month'],
location=tmp_dir)

table = self.db.table(name)

part = {'year': 2007, 'month': 4}

path = '/tmp/tmp-{0}'.format(util.guid())
table.add_partition(part, location=path)

assert len(table.partitions()) == 2

table.drop_partition(part)

assert len(table.partitions()) == 1

@pytest.mark.superuser
def test_set_partition_location(self):
pass

@pytest.mark.superuser
def test_load_data_partition(self):
df = self.df

unpart_t = self.db.table(self.pd_name)
part_keys = ['year', 'month']
part_t = self._create_partitioned_table(unpart_t.schema(),
part_keys)

# trim the runtime of this test
df = df[df.month == 1].reset_index(drop=True)

unique_keys = df[part_keys].drop_duplicates()

hdfs_dir = pjoin(self.tmp_dir, 'load-data-partition')

df2 = df.drop(['year', 'month'], axis='columns')

csv_props = {
'serialization.format': ',',
'field.delim': ','
}

for i, (year, month) in enumerate(unique_keys.itertuples(index=False)):
chunk = df2[(df.year == year) & (df.month == month)]
chunk_path = pjoin(hdfs_dir, '{0}.csv'.format(i))

self.con.write_dataframe(chunk, chunk_path)

# test both styles of insert
if i:
part = {'year': year, 'month': month}
else:
part = [year, month]

part_t.add_partition(part)
part_t.alter_partition(part, format='text',
serde_properties=csv_props)
part_t.load_data(chunk_path, partition=part)

self.hdfs.rmdir(hdfs_dir)
self._verify_partitioned_table(part_t, df, unique_keys)

def _verify_partitioned_table(self, part_t, df, unique_keys):
result = (part_t.execute()
.sort_index(by='id')
.reset_index(drop=True)
[df.columns])

assert_frame_equal(result, df)

parts = part_t.partitions()

        # allow for the Total summary row that SHOW PARTITIONS appends
assert len(parts) == (len(unique_keys) + 1)

def _create_partitioned_table(self, schema, part_keys):
part_name = _tmp_name()

self.db.create_table(part_name,
schema=schema,
partition=part_keys,
location=self._temp_location())
self.temp_tables.append(part_name)
return self.db.table(part_name)

@pytest.mark.superuser
def test_drop_partition(self):
pass

@pytest.mark.superuser
def test_repartition_automated(self):
pass
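Note: taken together, these tests sketch the new partitioned-table workflow. A compressed example; the table name and schema are illustrative:

    import ibis

    schema = ibis.schema([('value', 'double'),
                          ('year', 'int32'),
                          ('month', 'int8')])
    con.create_table('events', schema=schema, partition=['year', 'month'])
    t = con.table('events')

    part = {'year': 2007, 'month': 4}    # dict form; a [2007, 4] list also works
    t.add_partition(part)
    t.alter_partition(part, format='text')
    t.load_data('/path/to/chunk.csv', partition=part)
    t.partitions()                       # includes a trailing Total row
    t.drop_partition(part)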
2 changes: 2 additions & 0 deletions ibis/impala/udf.py
@@ -318,6 +318,8 @@ def _ibis_string_to_impala(tval):
'float': 'float',
'double': 'double',
'string': 'string',
'varchar': 'string',
'char': 'string',
'timestamp': 'timestamp',
'decimal': 'decimal'
}
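Note: with these two entries, UDFs declared with ibis `varchar` or `char` types are mapped to Impala's `string`, which is what lets the new `CAST(... AS varchar(n))` / `CHAR(n)` columns in test_exprs.py come back as string arrays. Assuming the mapping is consulted through `_ibis_string_to_impala`:

    _ibis_string_to_impala('varchar')   # -> 'string'
    _ibis_string_to_impala('char')      # -> 'string'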
40 changes: 34 additions & 6 deletions ibis/sql/alchemy.py
@@ -26,6 +26,7 @@
import ibis.expr.types as ir
import ibis.sql.compiler as comp
import ibis.sql.transforms as transforms
import ibis.util as util
import ibis


@@ -480,11 +481,12 @@ def get_sqla_type(self, data_type):


rewrites = AlchemyExprTranslator.rewrites
compiles = AlchemyExprTranslator.compiles


class AlchemyQuery(Query):

def _fetch_from_cursor(self, cursor):
def _fetch(self, cursor):
# No guarantees that the DBAPI cursor has data types
import pandas as pd
proxy = cursor.proxy
@@ -660,7 +662,7 @@ def _add_order_by(self, fragment):

# here we have to determine if key.expr is in the select set (as it
        # will be in the case of order_by fused with an aggregation)
if _can_lower_aggregate_column(self.table_set, sort_expr):
if _can_lower_sort_column(self.table_set, sort_expr):
arg = sort_expr.get_name()
else:
arg = self._translate(sort_expr)
@@ -774,7 +776,7 @@ def _format_table(self, expr):
return result


def _can_lower_aggregate_column(table_set, expr):
def _can_lower_sort_column(table_set, expr):
# we can currently sort by just-appeared aggregate metrics, but the way
    # these are referenced in the expression DSL is as a SortBy (blocking
# table operation) on an aggregation. There's a hack in _collect_SortBy
@@ -788,11 +790,13 @@ def _can_lower_aggregate_column(table_set, expr):
base = list(bases.values())[0]
base_op = base.op()

if not isinstance(base_op, ops.Aggregation):
if isinstance(base_op, ops.Aggregation):
return base_op.table.equals(table_set)
elif isinstance(base_op, ops.Projection):
return base.equals(table_set)
else:
return False

return base_op.table.equals(table_set)


def _and_all(clauses):
result = clauses[0]
@@ -845,3 +849,27 @@ def fetchall(self):
def _nullifzero(expr):
arg = expr.op().args[0]
return (arg == 0).ifelse(ibis.NA, arg)


@compiles(ops.Divide)
def _true_divide(t, expr):
op = expr.op()
left, right = op.args

if util.all_of(op.args, ir.IntegerValue):
new_expr = left.div(right.cast('double'))
return t.translate(new_expr)

return fixed_arity(lambda x, y: x / y, 2)(t, expr)


@compiles(ops.FloorDivide)
def _floor_divide(t, expr):
op = expr.op()
left, right = op.args

if util.any_of(op.args, ir.FloatingValue):
new_expr = expr.floor()
return t.translate(new_expr)

return fixed_arity(lambda x, y: x / y, 2)(t, expr)
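Note: these two rules give the SQLAlchemy backends true division for `/` and floor division for `//`: integer-only division casts the right operand to double, floor division with any floating operand is rewritten through floor(), and the all-integer floor-divide fallback deliberately leans on the engine's native integer `/` (which divides integrally in SQLite). An expression-level sketch:

    import ibis

    t = ibis.table([('a', 'int64'), ('b', 'int64')], 'tbl')

    t.a / t.b     # Divide: compiles like a / CAST(b AS double)
    t.a // t.b    # FloorDivide: native integer division on integer inputs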
164 changes: 119 additions & 45 deletions ibis/sql/compiler.py
@@ -315,7 +315,10 @@ def _visit_filter_SummaryFilter(self, expr):
backup_metric_name='__tmp__',
parent_table=self.table_set)

pred = (op.arg == getattr(rank_set, op.arg.get_name()))
# GH #667; this may reference a filtered version of self.table_set
arg = L.substitute_parents(op.arg)

pred = (arg == getattr(rank_set, op.arg.get_name()))
self.table_set = self.table_set.semi_join(rank_set, [pred])

return None
@@ -345,6 +348,7 @@ def _collect_elements(self):
if self.table_set is None:
raise com.InternalError('no table set')
else:
# Expressions not depending on any table
if isinstance(root_op, ir.ExpressionList):
self.select_set = source_expr.exprs()
else:
@@ -370,25 +374,6 @@ def _collect(self, expr, toplevel=False):

self.op_memo.add(op)

def _collect_Aggregation(self, expr, toplevel=False):
# The select set includes the grouping keys (if any), and these are
# duplicated in the group_by set. SQL translator can decide how to
# format these depending on the database. Most likely the
# GROUP BY 1, 2, ... style
if toplevel:
subbed_expr = self._sub(expr)
sub_op = subbed_expr.op()

self.group_by = self._convert_group_by(sub_op.by)
self.having = sub_op.having
self.select_set = sub_op.by + sub_op.agg_exprs
self.table_set = sub_op.table

self._collect(expr.op().table)

def _convert_group_by(self, exprs):
return list(range(len(exprs)))

def _collect_Distinct(self, expr, toplevel=False):
if toplevel:
self.distinct = True
@@ -421,6 +406,12 @@ def _collect_Limit(self, expr, toplevel=False):

self._collect(op.table, toplevel=toplevel)

def _collect_Union(self, expr, toplevel=False):
if not toplevel:
return
else:
raise NotImplementedError

def _collect_SortBy(self, expr, toplevel=False):
op = expr.op()

@@ -437,50 +428,83 @@ def _collect_SortBy(self, expr, toplevel=False):

self._collect(op.table, toplevel=toplevel)

def _collect_MaterializedJoin(self, expr, toplevel=False):
def _collect_Aggregation(self, expr, toplevel=False):
# The select set includes the grouping keys (if any), and these are
# duplicated in the group_by set. SQL translator can decide how to
# format these depending on the database. Most likely the
# GROUP BY 1, 2, ... style
if toplevel:
subbed_expr = self._sub(expr)
sub_op = subbed_expr.op()

self.group_by = self._convert_group_by(sub_op.by)
self.having = sub_op.having
self.select_set = sub_op.by + sub_op.agg_exprs
self.table_set = sub_op.table

self._collect(expr.op().table)

def _collect_Projection(self, expr, toplevel=False):
op = expr.op()
table = op.table

if toplevel:
subbed = self._sub(expr)
sop = subbed.op()

if isinstance(table.op(), ops.Join):
can_sub = self._collect_Join(table)
else:
can_sub = True
self._collect(table)

selections = op.selections

if can_sub:
selections = sop.selections
table = sop.table

self.select_set = selections
self.table_set = table

def _collect_MaterializedJoin(self, expr, toplevel=False):
op = expr.op()
join = op.join
join_op = join.op()

if toplevel:
subbed = self._sub(join)
self.table_set = subbed
self.select_set = [subbed]

self._collect(join_op.left, toplevel=False)
self._collect(join_op.right, toplevel=False)
self._collect_Join(join, toplevel=False)

def _convert_group_by(self, exprs):
return list(range(len(exprs)))

def _collect_Join(self, expr, toplevel=False):
op = expr.op()
if toplevel:
subbed = self._sub(expr)
self.table_set = subbed
self.select_set = [subbed]

self._collect(op.left, toplevel=False)
self._collect(op.right, toplevel=False)
subtables = _get_subtables(expr)

def _collect_Union(self, expr, toplevel=False):
if not toplevel:
return
else:
raise NotImplementedError
# If any of the joined tables are non-blocking modified versions
# (e.g. with Filter) of the same table, then it's not safe to continue
# walking down the tree (see #667), and we should instead have inline
# views rather than attempting to fuse things together into the same
# SELECT query.
can_substitute = _all_distinct_roots(subtables)
if can_substitute:
for table in subtables:
self._collect(table, toplevel=False)

def _collect_Projection(self, expr, toplevel=False):
op = expr.op()
if toplevel:
subbed = self._sub(expr)
sop = subbed.op()

self.select_set = sop.selections
self.table_set = sop.table
self._collect(op.table)
return can_substitute

def _collect_PhysicalTable(self, expr, toplevel=False):
if toplevel:
self.select_set = [expr]
self.table_set = self._sub(expr)
self.table_set = expr # self._sub(expr)

def _collect_SelfReference(self, expr, toplevel=False):
op = expr.op()
@@ -527,6 +551,42 @@ def _analyze_subqueries(self):
self.context.set_extracted(expr)


def _get_subtables(expr):
subtables = []

def _walk(expr):
op = expr.op()
if isinstance(op, ops.Join):
_walk(op.left)
_walk(op.right)
else:
subtables.append(expr)
_walk(expr)

return subtables


def _all_distinct_roots(subtables):
bases = []
for t in subtables:
base = _blocking_base(t)
for x in bases:
if base.equals(x):
return False
bases.append(base)
return True


def _blocking_base(expr):
node = expr.op()
if isinstance(node, (ir.BlockingTableNode, ops.Join)):
return expr
else:
for arg in expr.op().flat_args():
if isinstance(arg, ir.TableExpr):
return _blocking_base(arg)


def _extract_subqueries(select_stmt):
helper = _ExtractSubqueries(select_stmt)
return helper.get_result()
@@ -682,7 +742,7 @@ def get_result(self):
def _visit(self, expr, in_subquery=False):
node = expr.op()

in_subquery = self._is_subquery(node)
in_subquery = in_subquery or self._is_subquery(node)

for arg in node.flat_args():
if isinstance(arg, ir.TableExpr):
@@ -694,7 +754,9 @@ def _visit(self, expr, in_subquery=False):

def _is_subquery(self, node):
# XXX
if isinstance(node, ops.TableArrayView):
if isinstance(node, (ops.TableArrayView,
transforms.ExistsSubquery,
transforms.NotExistsSubquery)):
return True

if isinstance(node, ops.TableColumn):
@@ -800,7 +862,7 @@ def column_handler(results):
return column_handler

if isinstance(op, ops.TableColumn):
table_expr = op.table
table_expr = op.table[[op.name]]
result_handler = _get_column(op.name)
else:
# Something more complicated.
@@ -1113,6 +1175,18 @@ def decorator(f):
return decorator
else:
decorator(f)
return f

@classmethod
def compiles(cls, klass, f=None):
def decorator(f):
cls._registry[klass] = f

if f is None:
return decorator
else:
decorator(f)
return f


rewrites = ExprTranslator.rewrites
7 changes: 4 additions & 3 deletions ibis/sql/sqlite/api.py
@@ -26,7 +26,7 @@ def compile(expr):
return to_sqlalchemy(expr, dialect=SQLiteDialect)


def connect(path, create=False):
def connect(path=None, create=False):

"""
Create an Ibis client connected to a SQLite database.
@@ -35,8 +35,9 @@ def connect(path, create=False):
Parameters
----------
path : string
File path to the SQLite database file
path : string, default None
        File path to the SQLite database file. If None, creates a transient
        in-memory database; use attach() to add database files.
create : boolean, default False
If file does not exist, create it
"""
7 changes: 5 additions & 2 deletions ibis/sql/sqlite/client.py
@@ -40,12 +40,15 @@ class SQLiteClient(alch.AlchemyClient):
dialect = SQLiteDialect
database_class = SQLiteDatabase

def __init__(self, path, create=False):
def __init__(self, path=None, create=False):
self.name = path
self.database_name = 'default'

self.con = sa.create_engine('sqlite://')
self.attach(self.database_name, path, create=create)

if path:
self.attach(self.database_name, path, create=create)

self.meta = sa.MetaData(bind=self.con)

@property
5 changes: 4 additions & 1 deletion ibis/sql/sqlite/compiler.py
@@ -21,6 +21,7 @@
import ibis.expr.types as ir
import ibis.common as com


_operation_registry = alch._operation_registry.copy()


@@ -171,11 +172,13 @@ class SQLiteExprTranslator(alch.AlchemyExprTranslator):
_rewrites = alch.AlchemyExprTranslator._rewrites.copy()
_type_map = alch.AlchemyExprTranslator._type_map.copy()
_type_map.update({
dt.Double: sa.types.REAL
dt.Double: sa.types.REAL,
dt.Float: sa.types.REAL
})


rewrites = SQLiteExprTranslator.rewrites
compiles = SQLiteExprTranslator.compiles


class SQLiteDialect(alch.AlchemyDialect):
10 changes: 9 additions & 1 deletion ibis/sql/sqlite/tests/test_client.py
@@ -73,7 +73,15 @@ def test_compile_verify(self):
assert supported_expr.verify()

def test_attach_file(self):
pass
client = ibis.sqlite.connect()

client.attach('foo', self.env.db_path)
client.attach('bar', self.env.db_path)

foo_tables = client.list_tables(database='foo')
bar_tables = client.list_tables(database='bar')

assert foo_tables == bar_tables

def test_database_layer(self):
db = self.con.database()
20 changes: 19 additions & 1 deletion ibis/sql/sqlite/tests/test_functions.py
@@ -37,6 +37,7 @@ def test_cast(self):
cases = [
(d.cast('int8'), sa.cast(sa_d, sa.types.SMALLINT)),
(s.cast('double'), sa.cast(sa_s, sa.types.REAL)),
(s.cast('float'), sa.cast(sa_s, sa.types.REAL))
]
self._check_expr_cases(cases)

@@ -99,6 +100,15 @@ def test_binary_arithmetic(self):
]
self._check_e2e_cases(cases)

def test_div_floordiv(self):
cases = [
(L(7) / L(2), 3.5),
(L(7) // L(2), 3),
(L(7).floordiv(2), 3),
(L(2).rfloordiv(7), 3),
]
self._check_e2e_cases(cases)

def test_typeof(self):
cases = [
(L('foo_bar').typeof(), 'text'),
@@ -156,15 +166,23 @@ def test_string_contains(self):
]
self._check_e2e_cases(cases)

def test_string_functions(self):
def test_string_find(self):
cases = [
(L('foobar').find('bar'), 3),
(L('foobar').find('baz'), -1),
]
self._check_e2e_cases(cases)

def test_string_like(self):
cases = [
(L('foobar').like('%bar'), True),
(L('foobar').like('foo%'), True),
(L('foobar').like('%baz%'), False),
]
self._check_e2e_cases(cases)

def test_str_replace(self):
cases = [
(L('foobarfoo').replace('foo', 'H'), 'HbarH'),
]
self._check_e2e_cases(cases)
168 changes: 151 additions & 17 deletions ibis/sql/tests/test_compiler.py
@@ -151,10 +151,14 @@ def test_table_column_unbox(self):
query = ast.queries[0]

sql_query = query.compile()
expected = """SELECT `g`, sum(`f`) AS `total`
FROM alltypes
WHERE `c` > 0
GROUP BY 1"""
expected = """\
SELECT `g`
FROM (
SELECT `g`, sum(`f`) AS `total`
FROM alltypes
WHERE `c` > 0
GROUP BY 1
) t0"""

assert sql_query == expected

@@ -683,6 +687,66 @@ def _case_limit_cte_extract(self):
t2 = t.view()
return t.join(t2).projection(t)

def _case_subquery_aliased(self):
t1 = self.con.table('star1')
t2 = self.con.table('star2')

agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id'])
what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id])
[agged, t2.value1])

return what

def _case_filter_self_join_analysis_bug(self):
purchases = ibis.table([('region', 'string'),
('kind', 'string'),
('user', 'int64'),
('amount', 'double')], 'purchases')

metric = purchases.amount.sum().name('total')
agged = (purchases.group_by(['region', 'kind'])
.aggregate(metric))

left = agged[agged.kind == 'foo']
right = agged[agged.kind == 'bar']

joined = left.join(right, left.region == right.region)
result = joined[left.region,
(left.total - right.total).name('diff')]

return result, purchases

def _case_projection_fuse_filter(self):
# Probably test this during the evaluation phase. In SQL, "fusable"
# table operations will be combined together into a single select
# statement
#
# see ibis #71 for more on this

t = ibis.table([
('a', 'int8'),
('b', 'int16'),
('c', 'int32'),
('d', 'int64'),
('e', 'float'),
('f', 'double'),
('g', 'string'),
('h', 'boolean')
], 'foo')

proj = t['a', 'b', 'c']

# Rewrite a little more aggressively here
expr1 = proj[t.a > 0]

# at one point these yielded different results
filtered = t[t.a > 0]

expr2 = filtered[t.a, t.b, t.c]
expr3 = filtered.projection(['a', 'b', 'c'])

return expr1, expr2, expr3


class TestSelectSQL(unittest.TestCase, ExprTestCases):

@@ -1143,6 +1207,16 @@ def test_fuse_projections(self):
assert table3.equals(expected)
assert table3_filtered.equals(expected2)

def test_projection_filter_fuse(self):
expr1, expr2, expr3 = self._case_projection_fuse_filter()

sql1 = to_sql(expr1)
sql2 = to_sql(expr2)
sql3 = to_sql(expr3)

assert sql1 == sql2
assert sql1 == sql3

def test_bug_project_multiple_times(self):
# 108
customer = self.con.table('tpch_customer')
@@ -1196,6 +1270,12 @@ def test_aggregate_projection_subquery(self):

proj = t[t.f > 0][t, (t.a + t.b).name('foo')]

result = to_sql(proj)
expected = """SELECT *, `a` + `b` AS `foo`
FROM alltypes
WHERE `f` > 0"""
assert result == expected

def agg(x):
return x.aggregate([x.foo.sum().name('foo total')], by=['g'])

@@ -1236,14 +1316,8 @@ def agg(x):
assert result == expected

def test_subquery_aliased(self):
t1 = self.con.table('star1')
t2 = self.con.table('star2')
case = self._case_subquery_aliased()

agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id'])
what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id])
[agged, t2.value1])

result = to_sql(what)
expected = """SELECT t0.*, t1.`value1`
FROM (
SELECT `foo_id`, sum(`f`) AS `total`
Expand All @@ -1252,7 +1326,7 @@ def test_subquery_aliased(self):
) t0
INNER JOIN star2 t1
ON t0.`foo_id` = t1.`foo_id`"""
assert result == expected
self._compare_sql(case, expected)

def test_double_nested_subquery_no_aliases(self):
# We don't require any table aliasing anywhere
@@ -1335,15 +1409,15 @@ def test_subquery_factor_correlated_subquery(self):
result = to_sql(expr)
expected = """\
WITH t0 AS (
SELECT t5.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`,
SELECT t6.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`,
CAST(t3.`o_orderdate` AS timestamp) AS `odate`
FROM tpch_region t1
INNER JOIN tpch_nation t2
ON t1.`r_regionkey` = t2.`n_regionkey`
INNER JOIN tpch_customer t5
ON t5.`c_nationkey` = t2.`n_nationkey`
INNER JOIN tpch_customer t6
ON t6.`c_nationkey` = t2.`n_nationkey`
INNER JOIN tpch_orders t3
ON t3.`o_custkey` = t5.`c_custkey`
ON t3.`o_custkey` = t6.`c_custkey`
)
SELECT t0.*
FROM t0
@@ -1679,6 +1753,16 @@ def test_exists(self):
)"""
assert result == expected

def test_exists_subquery_repr(self):
# GH #660
t1, t2 = self.t1, self.t2

cond = t1.key1 == t2.key1
expr = t1[cond.any()]
stmt = build_ast(expr).queries[0]

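# smoke test: just check that repr() does not raise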
repr(stmt.where[0])

def test_not_exists(self):
expr = self._case_not_exists()
result = to_sql(expr)
@@ -1691,6 +1775,33 @@ def test_not_exists(self):
)"""
assert result == expected

def test_filter_inside_exists(self):
events = ibis.table([('session_id', 'int64'),
('user_id', 'int64'),
('event_type', 'int32'),
('ts', 'timestamp')], 'events')

purchases = ibis.table([('item_id', 'int64'),
('user_id', 'int64'),
('price', 'double'),
('ts', 'timestamp')], 'purchases')
filt = purchases.ts > '2015-08-15'
cond = (events.user_id == purchases[filt].user_id).any()
expr = events[cond]

result = to_sql(expr)
expected = """\
SELECT t0.*
FROM events t0
WHERE EXISTS (
SELECT 1
FROM purchases t1
WHERE t1.`ts` > '2015-08-15' AND
t0.`user_id` = t1.`user_id`
)"""

assert result == expected

def test_self_reference_in_exists(self):
semi, anti = self._case_self_reference_in_exists()

@@ -1725,7 +1836,7 @@ def test_self_reference_limit_exists(self):
FROM functional_alltypes
LIMIT 100
)
SELECT t0.*
SELECT *
FROM t0
WHERE NOT EXISTS (
SELECT 1
@@ -1849,6 +1960,29 @@ def test_self_aggregate_in_predicate(self):
# Per ibis #43
pass

def test_self_join_filter_analysis_bug(self):
expr, _ = self._case_filter_self_join_analysis_bug()

expected = """\
WITH t0 AS (
SELECT `region`, `kind`, sum(`amount`) AS `total`
FROM purchases
GROUP BY 1, 2
)
SELECT t1.`region`, t1.`total` - t2.`total` AS `diff`
FROM (
SELECT *
FROM t0
WHERE `kind` = 'foo'
) t1
INNER JOIN (
SELECT *
FROM t0
WHERE `kind` = 'bar'
) t2
ON t1.`region` = t2.`region`"""
self._compare_sql(expr, expected)


class TestUnions(unittest.TestCase, ExprTestCases):

43 changes: 43 additions & 0 deletions ibis/sql/tests/test_sqlalchemy.py
@@ -435,6 +435,49 @@ def test_where_correlated_subquery(self):
stmt = sa.select([t0]).where(t0.c.y > subq)
self._compare_sqla(expr, stmt)

def test_subquery_aliased(self):
expr = self._case_subquery_aliased()

s1 = self._get_sqla('star1').alias('t2')
s2 = self._get_sqla('star2').alias('t1')

agged = (sa.select([s1.c.foo_id, F.sum(s1.c.f).label('total')])
.group_by(s1.c.foo_id)
.alias('t0'))

joined = agged.join(s2, agged.c.foo_id == s2.c.foo_id)
expected = sa.select([agged, s2.c.value1]).select_from(joined)

self._compare_sqla(expr, expected)

def test_lower_projection_sort_key(self):
expr = self._case_subquery_aliased()

s1 = self._get_sqla('star1').alias('t2')
s2 = self._get_sqla('star2').alias('t1')

expr2 = (expr
[expr.total > 100]
.sort_by(ibis.desc('total')))

agged = (sa.select([s1.c.foo_id, F.sum(s1.c.f).label('total')])
.group_by(s1.c.foo_id)
.alias('t3'))

joined = agged.join(s2, agged.c.foo_id == s2.c.foo_id)
expected = sa.select([agged, s2.c.value1]).select_from(joined)

ex = expected.alias('t0')

expected2 = (sa.select([ex])
.where(ex.c.total > L(100))
.order_by(ex.c.total.desc()))

self._compare_sqla(expr2, expected2)

def test_exists(self):
e1, e2 = self._case_exists()

54 changes: 38 additions & 16 deletions ibis/sql/transforms.py
@@ -19,6 +19,12 @@
import ibis.util as util


class ExistsExpr(ir.AnalyticExpr):

def type(self):
return 'exists'


class ExistsSubquery(ir.Node):

"""
@@ -30,6 +36,9 @@ def __init__(self, foreign_table, predicates):
self.predicates = predicates
ir.Node.__init__(self, [foreign_table, predicates])

def output_type(self):
return ExistsExpr


class NotExistsSubquery(ir.Node):

@@ -38,6 +47,9 @@ def __init__(self, foreign_table, predicates):
self.predicates = predicates
ir.Node.__init__(self, [foreign_table, predicates])

def output_type(self):
return ExistsExpr


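# Rough shape of the rewrite (a sketch inferred from the compiler tests
# earlier in this diff, not a spec): a predicate like
# (t1.key1 == t2.key1).any() used in a filter becomes
# ExistsSubquery(t2, [t1.key1 == t2.key1]), which the compiler renders
# as WHERE EXISTS (SELECT 1 FROM t2 WHERE ...).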
class AnyToExistsTransform(object):

@@ -85,25 +97,35 @@ def _visit(self, expr):
def _visit_table(self, expr):
node = expr.op()

if isinstance(node, ir.BlockingTableNode):
self._ref_check(expr)

if not isinstance(node, ir.BlockingTableNode):
for arg in node.flat_args():
if isinstance(arg, ir.Expr):
self._visit(arg)

def _ref_check(self, expr):
node = expr.op()

if self._is_root(node):
pass
if isinstance(expr, ir.TableExpr):
base_table = _find_blocking_table(expr)
if base_table is not None:
base_node = base_table.op()
if self._is_root(base_node):
pass
else:
# Foreign ref
self.foreign_table = expr
else:
# Foreign ref
foreign_table = expr
self.foreign_table = foreign_table
if not isinstance(node, ir.BlockingTableNode):
for arg in node.flat_args():
if isinstance(arg, ir.Expr):
self._visit(arg)

def _is_root(self, what):
if isinstance(what, ir.Expr):
what = what.op()
return what in self.query_roots


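# Depth-first helper: returns the first expression in the tree whose op
# is a BlockingTableNode (e.g. a physical table), or None implicitly if
# no such node exists.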
def _find_blocking_table(expr):
node = expr.op()

if isinstance(node, ir.BlockingTableNode):
return expr

for arg in node.flat_args():
if isinstance(arg, ir.Expr):
result = _find_blocking_table(arg)
if result is not None:
return result
9 changes: 9 additions & 0 deletions ibis/tests/conftest.py
@@ -14,10 +14,16 @@

from pytest import skip

import ibis

groups = ['hdfs', 'impala', 'madlib', 'sqlite']


def pytest_configure(config):
if config.getvalue('iverbose'):
ibis.options.verbose = True


def pytest_addoption(parser):
for group in groups:
parser.addoption('--{0}'.format(group), action='store_true',
@@ -36,6 +42,9 @@ def pytest_addoption(parser):
parser.addoption('--skip-superuser', action='store_true', default=False,
help='Skip tests marked superuser')

parser.addoption('--iverbose', action='store_true', default=False,
help='Set Ibis to verbose')

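# Usage sketch: running e.g. `py.test ibis --iverbose` makes
# pytest_configure above flip ibis.options.verbose, so compiled queries
# get echoed while the suite runs.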

def pytest_runtest_setup(item):
only_set = False
18 changes: 16 additions & 2 deletions ibis/tests/test_server.py
@@ -15,7 +15,6 @@
import pytest

import os
import psutil
import socket
import struct
import threading
Expand All @@ -30,6 +29,18 @@
reason='non-POSIX system')


try:
import psutil
HAVE_PSUTIL = True
except ImportError:
HAVE_PSUTIL = False


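# get_proc re-imports psutil lazily so this module stays importable on
# systems without psutil; tests that need it bail out via HAVE_PSUTIL.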
def get_proc(pid):
import psutil
return psutil.Process(pid)


def port_is_closed(port):
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
@@ -136,6 +147,9 @@ def _connect(self, port=None):
return sock

def _spawn_worker(self):
if not HAVE_PSUTIL:
pytest.skip('no psutil')

sock = self._connect()

# Ask to create a worker; reply OK on successful fork
@@ -151,7 +165,7 @@ def _spawn_worker(self):
sock.close()

worker_port, worker_pid = struct.unpack('II', msg)
proc = psutil.Process(worker_pid)
proc = get_proc(worker_pid)
# the freshly forked worker should be alive
assert proc.status in ('running', 'sleeping')
return worker_port, worker_pid

Expand Down
56 changes: 56 additions & 0 deletions ibis/util.py
@@ -15,6 +15,8 @@
import types
import ibis.compat as compat

from ibis.config import options


def guid():
try:
@@ -163,3 +165,57 @@ def g(*args, **kwargs):
print(message)
return f(*args, **kwargs)
return g


def to_stdout(x):
print(x)


def log(msg):
if options.verbose:
(options.verbose_log or to_stdout)(msg)
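
# Usage sketch (the option names are real; the logger target is a
# hypothetical example):
#
#   import logging
#   import ibis
#   ibis.options.verbose = True
#   ibis.options.verbose_log = logging.getLogger('ibis').info
#
# With verbose_log unset, log() falls back to to_stdout above.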


class cache_readonly(object):

def __init__(self, func=None, allow_setting=False):
if func is not None:
self.func = func
self.name = func.__name__
self.allow_setting = allow_setting

def __call__(self, func, doc=None):
self.func = func
self.name = func.__name__
return self

def __get__(self, obj, typ):
# Get the cache or set a default one if needed

cache = getattr(obj, '_cache', None)
if cache is None:
try:
cache = obj._cache = {}
except AttributeError:
return

if self.name in cache:
val = cache[self.name]
else:
val = self.func(obj)
cache[self.name] = val
return val

def __set__(self, obj, value):
if not self.allow_setting:
raise Exception("cannot set values for [%s]" % self.name)

# Get the cache or set a default one if needed
cache = getattr(obj, '_cache', None)
if cache is None:
try:
cache = obj._cache = {}
except AttributeError:
return

cache[self.name] = value
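
# Usage sketch (class and attribute names are hypothetical):
#
#   class Table(object):
#
#       @cache_readonly
#       def schema(self):
#           return self._fetch_schema()  # computed once, then memoized
#
# The first access stores the result in the instance's _cache dict;
# later accesses return the cached value without calling the function
# again.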
3 changes: 1 addition & 2 deletions requirements.txt
@@ -1,8 +1,7 @@
pytest
numpy>=1.7.0
pandas>=0.12.0
impyla>=0.10.0
psutil==0.6.1
impyla>=0.12.0
hdfs>=2.0.0
sqlalchemy>=1.0.0
six
5 changes: 2 additions & 3 deletions scripts/run_jenkins.sh
@@ -84,15 +84,14 @@ pip install $IBIS_HOME
python --version
which python

if [ $IBIS_TEST_USE_KERBEROS = "True" ]; then
if [ "$IBIS_TEST_AUTH_MECH" = "GSSAPI" -o "$IBIS_TEST_AUTH_MECH" = "LDAP" ]; then
sudo yum install -y cyrus-sasl-devel
pip install requests-kerberos
pip install git+https://github.com/laserson/python-sasl.git@cython

# CLOUDERA INTERNAL JENKINS/KERBEROS CONFIG
kinit -l 4h -kt /cdep/keytabs/hive.keytab hive
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON SERVER TO ROLE cdep_default_admin WITH GRANT OPTION"
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON DATABASE $IBIS_TEST_DATA_DB TO ROLE cdep_default_admin"
sudo -u hive PYTHON_EGG_CACHE=/dev/null impala-shell -k -q "GRANT ALL ON DATABASE $IBIS_TEST_TMP_DB TO ROLE cdep_default_admin"
kdestroy
kinit -l 4h -kt /cdep/keytabs/systest.keytab systest
fi
7 changes: 7 additions & 0 deletions setup.cfg
@@ -0,0 +1,7 @@
[versioneer]
VCS = git
style = pep440
versionfile_source = ibis/_version.py
versionfile_build = ibis/_version.py
tag_prefix =
parentdir_prefix = ibis-
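# note: the empty tag_prefix means version tags are expected to be bare
# versions like "0.5.0" (an assumption based on versioneer's conventions)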
29 changes: 12 additions & 17 deletions setup.py
@@ -32,26 +32,16 @@

cmdclass['build_ext'] = build_ext

from setuptools import setup
import os
import sys
from setuptools import setup # noqa
import os # noqa
import sys # noqa

from distutils.extension import Extension
from distutils.extension import Extension # noqa

MAJOR = 0
MINOR = 5
MICRO = 0
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)

ISRELEASED = True
from distutils.command.clean import clean as _clean # noqa

if not ISRELEASED:
VERSION += '.dev'

# todo: acquire git hash


from distutils.command.clean import clean as _clean
class clean(_clean):
def run(self):
_clean.run(self)
@@ -73,6 +63,9 @@ def run(self):
requirements.append('argparse')
requirements.append('unittest2')

if PY2:
requirements.append('mock')


if COMMS_EXT_ENABLED:
import numpy as np
@@ -112,6 +105,8 @@ def run(self):
'Topic :: Scientific/Engineering',
]

import versioneer # noqa

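# versioneer derives the version string from git tags at build time,
# replacing the hand-maintained MAJOR/MINOR/MICRO constants removed above.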
setup(
name='ibis-framework',
packages=['ibis',
@@ -134,10 +129,10 @@ def run(self):
'ibis.sql.vertica',
'ibis.sql.vertica.tests',
'ibis.tests'],
version=VERSION,
version=versioneer.get_version(),
package_data={'ibis': ['*.pxd', '*.pyx']},
ext_modules=extensions,
cmdclass=cmdclass,
cmdclass=versioneer.get_cmdclass(),
install_requires=requirements,
extras_require={'kerberos': ['requests-kerberos']},
description="Productivity-centric Python Big Data Framework",
Expand Down
1,699 changes: 1,699 additions & 0 deletions versioneer.py

Large diffs are not rendered by default.