192 changes: 121 additions & 71 deletions ibis/sql/ddl.py

Large diffs are not rendered by default.

533 changes: 403 additions & 130 deletions ibis/sql/exprs.py

Large diffs are not rendered by default.

124 changes: 113 additions & 11 deletions ibis/sql/tests/test_compiler.py
@@ -12,14 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pandas as pd

import ibis

from ibis.sql.compiler import build_ast, to_sql
from ibis.expr.tests.mocks import MockConnection
from ibis.compat import unittest
import ibis.common as com

import ibis.expr.api as api
@@ -114,13 +113,6 @@ def test_ast_with_aggregation_join_filter(self):
def test_ast_non_materialized_join(self):
pass

def test_nonequijoin_unsupported(self):
t1 = self.con.table('star1')
t2 = self.con.table('star2')

joined = t1.inner_join(t2, [t1.f < t2.value1])[[t1]]
self.assertRaises(com.TranslationError, to_sql, joined)

def test_sort_by(self):
table = self.con.table('star1')

@@ -319,6 +311,33 @@ def test_complex_array_expr_projection(self):
) t0"""
assert query == expected

def test_scalar_exprs_no_table_refs(self):
expr1 = ibis.now()
expected1 = """\
SELECT now() AS `tmp`"""

expr2 = ibis.literal(1) + ibis.literal(2)
expected2 = """\
SELECT 1 + 2 AS `tmp`"""

cases = [
(expr1, expected1),
(expr2, expected2)
]

for expr, expected in cases:
result = to_sql(expr)
assert result == expected

def test_expr_list_no_table_refs(self):
exlist = ibis.api.expr_list([ibis.literal(1).name('a'),
ibis.now().name('b'),
ibis.literal(2).log().name('c')])
result = to_sql(exlist)
expected = """\
SELECT 1 AS `a`, now() AS `b`, ln(2) AS `c`"""
assert result == expected

def test_isnull_case_expr_rewrite_failure(self):
# #172, case expression that was not being properly converted into an
# aggregation
@@ -1149,6 +1168,21 @@ def test_self_join_subquery_distinct_equal(self):

assert result == expected

def test_limit_with_self_join(self):
t = self.con.table('functional_alltypes')
t2 = t.view()

expr = t.join(t2, t.tinyint_col < t2.timestamp_col.minute()).count()

# it works
result = to_sql(expr)
expected = """\
SELECT count(*) AS `tmp`
FROM functional_alltypes t0
INNER JOIN functional_alltypes t1
ON t0.tinyint_col < extract(t1.timestamp_col, 'minute')"""
assert result == expected

def test_cte_factor_distinct_but_equal(self):
t = self.con.table('alltypes')
tt = self.con.table('alltypes')
@@ -1352,6 +1386,35 @@ def test_topk_predicate_pushdown_bug(self):
ON t1.n_name = t3.n_name"""
assert result == expected

def test_topk_analysis_bug(self):
# GH #398
airlines = ibis.table([('dest', 'string'),
('origin', 'string'),
('arrdelay', 'int32')], 'airlines')

dests = ['ORD', 'JFK', 'SFO']
t = airlines[airlines.dest.isin(dests)]
delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
expr = t[delay_filter].group_by('origin').size()

result = to_sql(expr)
expected = """\
SELECT t0.origin, count(*) AS `count`
FROM airlines t0
LEFT SEMI JOIN (
SELECT dest, avg(arrdelay) AS `__tmp__`
FROM airlines
WHERE dest IN ('ORD', 'JFK', 'SFO')
GROUP BY 1
ORDER BY __tmp__ DESC
LIMIT 10
) t1
ON t0.dest = t1.dest
WHERE t0.dest IN ('ORD', 'JFK', 'SFO')
GROUP BY 1"""

assert result == expected

def test_bottomk(self):
pass

@@ -1412,7 +1475,7 @@ def setUp(self):

self.t1 = (table[table.int_col > 0]
[table.string_col.name('key'),
table.float_col.name('value').cast('double')])
table.float_col.cast('double').name('value')])
self.t2 = (table[table.int_col <= 0]
[table.string_col.name('key'),
table.double_col.name('value')])
@@ -1558,6 +1621,45 @@ def setUp(self):
self.t = t = self.con.table('functional_alltypes')
self.expr = t[t.bigint_col > 0]

def test_create_external_table_as(self):
path = '/path/to/table'
select = build_ast(self.con.table('test1')).queries[0]
statement = ddl.CTAS('another_table',
select,
external=True,
overwrite=True,
path=path,
database='foo')
result = statement.compile()

expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(path)
assert result == expected

def test_create_table_with_location(self):
path = '/path/to/table'
schema = ibis.schema([('foo', 'string'),
('bar', 'int8'),
('baz', 'int16')])
select = build_ast(self.con.table('test1')).queries[0]
statement = ddl.CreateTableWithSchema('another_table', schema,
ddl.NoFormat(), overwrite=True,
path=path, database='foo')
result = statement.compile()

expected = """\
CREATE TABLE foo.`another_table`
(`foo` STRING,
`bar` TINYINT,
`baz` SMALLINT)
LOCATION '{0}'""".format(path)
assert result == expected

def test_create_table_like_parquet(self):
directory = '/path/to/'
path = '/path/to/parquetfile'
Expand Down Expand Up @@ -1616,7 +1718,7 @@ def test_create_table_parquet_with_schema(self):
`bar` TINYINT,
`baz` SMALLINT)
STORED AS PARQUET
LOCATION '{}'""".format(directory)
LOCATION '{0}'""".format(directory)

assert result == expected

148 changes: 135 additions & 13 deletions ibis/sql/tests/test_exprs.py
@@ -12,13 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pandas as pd

from ibis.sql.exprs import ExprTranslator
from ibis.sql.compiler import QueryContext, to_sql
from ibis.expr.tests.mocks import MockConnection
from ibis.compat import unittest
import ibis.expr.types as ir
import ibis

@@ -218,7 +217,7 @@ def test_timestamp_extract_field(self):
'second', 'millisecond']

cases = [(getattr(self.table.i, field)(),
"extract(i, '{}')".format(field))
"extract(i, '{0}')".format(field))
for field in fields]
self._check_expr_cases(cases)

@@ -252,7 +251,7 @@ def test_timestamp_deltas(self):
for unit in units:
K = 5
offset = getattr(ibis, unit)(K)
template = '{}s_add({}, {})'
template = '{0}s_add({1}, {2})'

cases.append((t + offset, template.format(unit, f, K)))
cases.append((t - offset, template.format(unit, f, -K)))
@@ -330,7 +329,7 @@ def test_numeric_monadic_builtins(self):

for cname in ['double_col', 'int_col']:
expr = getattr(self.table[cname], ibis_name)()
cases.append((expr, '{}({})'.format(sql_name, cname)))
cases.append((expr, '{0}({1})'.format(sql_name, cname)))

self._check_expr_cases(cases)

@@ -344,7 +343,9 @@ def test_round(self):
cases = [
(self.table.double_col.round(), 'round(double_col)'),
(self.table.double_col.round(0), 'round(double_col, 0)'),
(self.table.double_col.round(2, ), 'round(double_col, 2)')
(self.table.double_col.round(2, ), 'round(double_col, 2)'),
(self.table.double_col.round(self.table.tinyint_col),
'round(double_col, tinyint_col)')
]
self._check_expr_cases(cases)

@@ -362,7 +363,7 @@ def test_hash(self):
def test_reduction_where(self):
cond = self.table.bigint_col < 70
c = self.table.double_col
tmp = '{}(CASE WHEN bigint_col < 70 THEN double_col ELSE NULL END)'
tmp = '{0}(CASE WHEN bigint_col < 70 THEN double_col ELSE NULL END)'
cases = [
(c.sum(where=cond), tmp.format('sum')),
(c.count(where=cond), tmp.format('count')),
@@ -443,7 +444,19 @@ def test_nullif_ifnull(self):
cases = [
(f.nullif(f == 0),
'nullif(l_quantity, l_quantity = 0)'),
(f.fillna(0), 'isnull(l_quantity, 0)'),
(f.fillna(0),
'isnull(l_quantity, CAST(0 AS decimal(12,2)))'),
]
self._check_expr_cases(cases)

def test_decimal_fillna_cast_arg(self):
table = self.con.table('tpch_lineitem')
f = table.l_extendedprice

cases = [
(f.fillna(0),
'isnull(l_extendedprice, CAST(0 AS decimal(12,2)))'),
(f.fillna(0.0), 'isnull(l_extendedprice, 0.0)'),
]
self._check_expr_cases(cases)

@@ -722,25 +735,60 @@ def test_least(self):
self._check_expr_cases(cases)


class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):

def setUp(self):
self.con = MockConnection()
self.table = self.con.table('functional_alltypes')

def test_analytic_exprs(self):
t = self.table

w = ibis.window(order_by=t.float_col)

cases = [
(ibis.row_number().over(w),
'row_number() OVER (ORDER BY float_col) - 1'),
(t.string_col.lag(), 'lag(string_col)'),
(t.string_col.lag(2), 'lag(string_col, 2)'),
(t.string_col.lag(default=0), 'lag(string_col, 1, 0)'),
(t.string_col.lead(), 'lead(string_col)'),
(t.string_col.lead(2), 'lead(string_col, 2)'),
(t.string_col.lead(default=0), 'lead(string_col, 1, 0)'),
(t.double_col.first(), 'first_value(double_col)'),
(t.double_col.last(), 'last_value(double_col)'),
# (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
]
self._check_expr_cases(cases)


class TestStringBuiltins(unittest.TestCase, ExprSQLTest):

def setUp(self):
self.con = MockConnection()
self.table = self.con.table('functional_alltypes')

def test_unary_ops(self):
s = self.table.string_col
cases = [
(self.table.string_col.lower(), 'lower(string_col)'),
(self.table.string_col.upper(), 'upper(string_col)'),
(self.table.string_col.length(), 'length(string_col)')
(s.lower(), 'lower(string_col)'),
(s.upper(), 'upper(string_col)'),
(s.reverse(), 'reverse(string_col)'),
(s.strip(), 'trim(string_col)'),
(s.lstrip(), 'ltrim(string_col)'),
(s.rstrip(), 'rtrim(string_col)'),
(s.capitalize(), 'initcap(string_col)'),
(s.length(), 'length(string_col)'),
(s.ascii_str(), 'ascii(string_col)')
]
self._check_expr_cases(cases)

def test_substr(self):
# Databases index strings starting from 1
cases = [
(self.table.string_col.substr(2), 'substr(string_col, 3)'),
(self.table.string_col.substr(0, 3), 'substr(string_col, 1, 3)')
(self.table.string_col.substr(2), 'substr(string_col, 2 + 1)'),
(self.table.string_col.substr(0, 3),
'substr(string_col, 0 + 1, 3)')
]
self._check_expr_cases(cases)

@@ -763,3 +811,77 @@ def test_rlike(self):
(self.table.string_col.re_search('[\d]+'), ex),
]
self._check_expr_cases(cases)

def test_re_extract(self):
sql = "regexp_extract(string_col, '[\d]+', 0)"
cases = [
(self.table.string_col.re_extract('[\d]+', 0), sql)
]
self._check_expr_cases(cases)

def test_re_replace(self):
sql = "regexp_replace(string_col, '[\d]+', 'aaa')"
cases = [
(self.table.string_col.re_replace('[\d]+', 'aaa'), sql)
]
self._check_expr_cases(cases)

def test_parse_url(self):
sql = "parse_url(string_col, 'HOST')"
cases = [
(self.table.string_col.parse_url('HOST'), sql)
]
self._check_expr_cases(cases)

def test_repeat(self):
cases = [
(self.table.string_col.repeat(2), 'repeat(string_col, 2)')
]
self._check_expr_cases(cases)

def test_translate(self):
cases = [
(self.table.string_col.translate('a', 'b'),
"translate(string_col, 'a', 'b')")
]
self._check_expr_cases(cases)

def test_find(self):
s = self.table.string_col
i1 = self.table.tinyint_col
cases = [
(s.find('a'), "locate('a', string_col) - 1"),
(s.find('a', 2), "locate('a', string_col, 3) - 1"),
(s.find('a', start=i1),
"locate('a', string_col, tinyint_col + 1) - 1")
]
self._check_expr_cases(cases)

def test_lpad(self):
cases = [
(self.table.string_col.lpad(1, 'a'), "lpad(string_col, 1, 'a')"),
(self.table.string_col.lpad(25), "lpad(string_col, 25, ' ')")
]
self._check_expr_cases(cases)

def test_rpad(self):
cases = [
(self.table.string_col.rpad(1, 'a'), "rpad(string_col, 1, 'a')"),
(self.table.string_col.rpad(25), "rpad(string_col, 25, ' ')")
]
self._check_expr_cases(cases)

def test_find_in_set(self):
cases = [
(self.table.string_col.find_in_set(['a']),
"find_in_set(string_col, 'a') - 1"),
(self.table.string_col.find_in_set(['a', 'b']),
"find_in_set(string_col, 'a,b') - 1")
]
self._check_expr_cases(cases)

def test_string_join(self):
cases = [
(ibis.literal(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
]
self._check_expr_cases(cases)
222 changes: 222 additions & 0 deletions ibis/sql/tests/test_window.py
@@ -0,0 +1,222 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ibis import window
import ibis

from ibis.sql.compiler import to_sql
from ibis.expr.tests.mocks import BasicTestCase
from ibis.compat import unittest
import ibis.common as com


class TestWindowFunctions(BasicTestCase, unittest.TestCase):

def _check_sql(self, expr, expected):
result = to_sql(expr)
assert result == expected

def test_aggregate_in_projection(self):
t = self.con.table('alltypes')
proj = t[t, (t.f / t.f.sum()).name('normed_f')]

expected = """\
SELECT *, f / sum(f) OVER () AS `normed_f`
FROM alltypes"""
self._check_sql(proj, expected)

def test_add_default_order_by(self):
t = self.con.table('alltypes')

first = t.f.first().name('first')
last = t.f.last().name('last')
lag = t.f.lag().name('lag')
diff = (t.f.lead() - t.f).name('fwd_diff')
lag2 = t.f.lag().over(window(order_by=t.d)).name('lag2')
grouped = t.group_by('g')
proj = grouped.mutate([lag, diff, first, last, lag2])
expected = """\
SELECT *, lag(f) OVER (PARTITION BY g ORDER BY f) AS `lag`,
lead(f) OVER (PARTITION BY g ORDER BY f) - f AS `fwd_diff`,
first_value(f) OVER (PARTITION BY g ORDER BY f) AS `first`,
last_value(f) OVER (PARTITION BY g ORDER BY f) AS `last`,
lag(f) OVER (PARTITION BY g ORDER BY d) AS `lag2`
FROM alltypes"""
self._check_sql(proj, expected)

def test_window_frame_specs(self):
t = self.con.table('alltypes')

ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

cases = [
(window(preceding=0),
'range between current row and unbounded following'),

(window(following=0),
'range between unbounded preceding and current row'),

(window(preceding=5),
'rows between 5 preceding and unbounded following'),
(window(preceding=5, following=0),
'rows between 5 preceding and current row'),
(window(preceding=5, following=2),
'rows between 5 preceding and 2 following'),
(window(following=2),
'rows between unbounded preceding and 2 following'),
(window(following=2, preceding=0),
'rows between current row and 2 following'),
(window(preceding=5),
'rows between 5 preceding and unbounded following'),
(window(following=[5, 10]),
'rows between 5 following and 10 following'),
(window(preceding=[10, 5]),
'rows between 10 preceding and 5 preceding'),

# # cumulative windows
(ibis.cumulative_window(),
'range between unbounded preceding and current row'),

# # trailing windows
(ibis.trailing_window(10),
'rows between 10 preceding and current row'),
]

for w, frame in cases:
w2 = w.order_by(t.f)
expr = t.projection([t.d.sum().over(w2).name('foo')])
expected = ex_template.format(frame.upper())
self._check_sql(expr, expected)

def test_cumulative_functions(self):
t = self.con.table('alltypes')

w = ibis.window(order_by=t.d)
exprs = [
(t.f.cumsum().over(w), t.f.sum().over(w)),
(t.f.cummin().over(w), t.f.min().over(w)),
(t.f.cummax().over(w), t.f.max().over(w)),
(t.f.cummean().over(w), t.f.mean().over(w)),
]

for cumulative, static in exprs:
actual = cumulative.name('foo')
expected = static.over(ibis.cumulative_window()).name('foo')

expr1 = t.projection(actual)
expr2 = t.projection(expected)

self._compare_sql(expr1, expr2)

def _compare_sql(self, e1, e2):
s1 = to_sql(e1)
s2 = to_sql(e2)
assert s1 == s2

def test_nested_analytic_function(self):
t = self.con.table('alltypes')

w = window(order_by=t.f)
expr = (t.f - t.f.lag()).lag().over(w).name('foo')
result = t.projection([expr])
expected = """\
SELECT lag(f - lag(f) OVER (ORDER BY f)) \
OVER (ORDER BY f) AS `foo`
FROM alltypes"""
self._check_sql(result, expected)

def test_rank_functions(self):
t = self.con.table('alltypes')

proj = t[t.g, t.f.rank().name('minr'),
t.f.dense_rank().name('denser')]
expected = """\
SELECT g, rank() OVER (ORDER BY f) - 1 AS `minr`,
dense_rank() OVER (ORDER BY f) - 1 AS `denser`
FROM alltypes"""
self._check_sql(proj, expected)

def test_multiple_windows(self):
t = self.con.table('alltypes')

w = window(group_by=t.g)

expr = t.f.sum().over(w) - t.f.sum()
proj = t.projection([t.g, expr.name('result')])

expected = """\
SELECT g, sum(f) OVER (PARTITION BY g) - sum(f) OVER () AS `result`
FROM alltypes"""
self._check_sql(proj, expected)

def test_order_by_desc(self):
t = self.con.table('alltypes')

w = window(order_by=ibis.desc(t.f))

proj = t[t.f, ibis.row_number().over(w).name('revrank')]
expected = """\
SELECT f, row_number() OVER (ORDER BY f DESC) - 1 AS `revrank`
FROM alltypes"""
self._check_sql(proj, expected)

expr = (t.group_by('g')
.order_by(ibis.desc(t.f))
[t.d.lag().name('foo'), t.a.max()])
expected = """\
SELECT lag(d) OVER (PARTITION BY g ORDER BY f DESC) AS `foo`,
max(a) OVER (PARTITION BY g ORDER BY f DESC) AS `max`
FROM alltypes"""
self._check_sql(expr, expected)

def test_row_number_requires_order_by(self):
t = self.con.table('alltypes')

with self.assertRaises(com.ExpressionError):
(t.group_by(t.g)
.mutate(ibis.row_number().name('foo')))

expr = (t.group_by(t.g)
.order_by(t.f)
.mutate(ibis.row_number().name('foo')))

expected = """\
SELECT *, row_number() OVER (PARTITION BY g ORDER BY f) - 1 AS `foo`
FROM alltypes"""
self._check_sql(expr, expected)

def test_unsupported_aggregate_functions(self):
t = self.con.table('alltypes')
w = ibis.window(order_by=t.d)

exprs = [
t.f.approx_nunique(),
t.f.approx_median(),
t.g.group_concat(),
]

for expr in exprs:
with self.assertRaises(com.TranslationError):
proj = t.projection([expr.over(w).name('foo')])
to_sql(proj)

def test_math_on_windowed_expr(self):
# Window clause may not be found at top level of expression
pass

def test_group_by_then_different_sort_orders(self):
pass
13 changes: 13 additions & 0 deletions ibis/tests/__init__.py
@@ -0,0 +1,13 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
26 changes: 26 additions & 0 deletions ibis/tests/conftest.py
@@ -0,0 +1,26 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pytest import skip


def pytest_addoption(parser):
parser.addoption('--e2e', action='store_true', default=False,
help='Enable the e2e (end-to-end) tests')


def pytest_runtest_setup(item):
if getattr(item.obj, 'e2e', None): # the test item is marked e2e
if not item.config.getvalue('e2e'): # but --e2e option not set
skip('e2e was not enabled')
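The two hooks above gate the end-to-end suite behind a command-line flag: tests carrying the e2e marker are skipped unless py.test is invoked with --e2e. A minimal sketch of how a test class opts in (the class and test names here are hypothetical; the marker, flag, and skip behaviour come from this conftest):

import pytest

from ibis.compat import unittest


@pytest.mark.e2e
class TestAgainstLiveCluster(unittest.TestCase):

    def test_needs_impala(self):
        # skipped by pytest_runtest_setup unless run as `py.test --e2e`
        pass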
2 changes: 1 addition & 1 deletion ibis/tests/test_comms.py
@@ -15,13 +15,13 @@
import os
import sys
import threading
import unittest

import pytest

import numpy as np

from ibis.util import guid
from ibis.compat import unittest

try:
import ibis.comms as comms
218 changes: 184 additions & 34 deletions ibis/tests/test_filesystems.py
@@ -12,18 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from six import BytesIO

from posixpath import join as pjoin
from os import path as osp
import os
import shutil
import unittest

from hdfs import InsecureClient
import pytest

from ibis.filesystems import HDFS, WebHDFS
from ibis.compat import unittest
from ibis.tests.util import IbisTestEnv
import ibis.util as util


ENV = IbisTestEnv()


class MockHDFS(HDFS):

def __init__(self):
@@ -63,41 +70,31 @@ def test_find_any_file(self):
assert result == '/path/0.parq'


@pytest.mark.e2e
class TestHDFSE2E(unittest.TestCase):

@classmethod
def setUpClass(cls):
cls.host = os.environ.get('IBIS_TEST_HOST', 'localhost')
cls.protocol = os.environ.get('IBIS_TEST_PROTOCOL', 'hiveserver2')
cls.port = os.environ.get('IBIS_TEST_PORT', 21050)

cls.test_dir = '/{}'.format(util.guid())

# Impala dev environment uses port 5070 for HDFS web interface

hdfs_host = 'localhost'
webhdfs_port = 5070
url = 'http://{}:{}'.format(hdfs_host, webhdfs_port)

try:
cls.hdfs_client = InsecureClient(url)
cls.hdfs = WebHDFS(cls.hdfs_client)
cls.hdfs.mkdir(cls.test_dir)
except Exception as e:
pytest.skip('Could not connect to HDFS: {}'.format(e.message))
cls.ENV = ENV
cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
cls.hdfs_client = InsecureClient(cls.ENV.hdfs_url)
cls.hdfs = WebHDFS(cls.hdfs_client)
cls.hdfs.mkdir(cls.tmp_dir)

@classmethod
def tearDownClass(cls):
try:
cls.hdfs.rmdir(cls.test_dir)
cls.hdfs.rmdir(cls.tmp_dir)
except:
pass

def setUp(self):
self.test_files = []
self.test_directories = []

def tearDown(self):
self._delete_test_files()
pass

def _delete_test_files(self):
for path in self.test_files:
@@ -106,11 +103,30 @@ def _delete_test_files(self):
except os.error:
pass

def _make_random_file(self, units=100, directory=None):
for path in self.test_directories:
try:
shutil.rmtree(path)
except os.error:
pass

def _make_test_directory(self, files=5, filesize=1024, directory=None):
if directory is None:
directory = util.guid()
os.mkdir(directory)
self.test_directories.append(directory)

for i in xrange(files):
self._make_random_file(size=filesize, directory=directory)

return directory

def _make_random_file(self, size=1024, directory=None):
path = util.guid()

if directory:
path = os.path.join(directory, path)
path = osp.join(directory, path)

units = size / 32

with open(path, 'wb') as f:
for i in xrange(units):
Expand All @@ -119,13 +135,39 @@ def _make_random_file(self, units=100, directory=None):
self.test_files.append(path)
return path

def _make_random_hdfs_file(self, size=1024, directory=None):
local_path = self._make_random_file(size=size)
remote_path = pjoin(directory or self.tmp_dir, local_path)
self.hdfs.put(remote_path, local_path)
return remote_path

def test_mkdir(self):
path = pjoin(self.test_dir, 'mkdir-test')
path = pjoin(self.tmp_dir, 'mkdir-test')
self.hdfs.mkdir(path)
assert self.hdfs.exists(path)

def test_write_get_delete_file(self):
dirpath = pjoin(self.test_dir, 'write-delete-test')
def test_mv_to_existing_file(self):
remote_file = self._make_random_hdfs_file()
existing_remote_file_dest = self._make_random_hdfs_file()
self.hdfs.mv(remote_file, existing_remote_file_dest)

def test_mv_to_existing_file_no_overwrite(self):
remote_file = self._make_random_hdfs_file()
existing_remote_file_dest = self._make_random_hdfs_file()
with self.assertRaises(Exception):
self.hdfs.mv(remote_file, existing_remote_file_dest, overwrite=False)

def test_mv_to_directory(self):
remote_file = self._make_random_hdfs_file()
dest_dir = pjoin(self.tmp_dir, util.guid())
self.hdfs.mkdir(dest_dir)
self.hdfs.mv(remote_file, dest_dir)
new_remote_file = pjoin(dest_dir, os.path.basename(remote_file))
file_status = self.hdfs.status(new_remote_file)
assert file_status['type'] == 'FILE'

def test_put_get_delete_file(self):
dirpath = pjoin(self.tmp_dir, 'write-delete-test')
self.hdfs.mkdir(dirpath)

lpath = self._make_random_file()
@@ -146,7 +188,7 @@ def test_write_get_delete_file(self):
def test_overwrite_file(self):
pass

def test_write_get_directory(self):
def test_put_get_directory(self):
local_dir = util.guid()
local_download_dir = util.guid()

@@ -158,7 +200,7 @@ def test_write_get_directory(self):
for i in xrange(K):
self._make_random_file(directory=local_dir)

remote_dir = pjoin(self.test_dir, local_dir)
remote_dir = pjoin(self.tmp_dir, local_dir)
self.hdfs.put(remote_dir, local_dir)

assert self.hdfs.exists(remote_dir)
@@ -176,6 +218,49 @@ def test_write_get_directory(self):
finally:
shutil.rmtree(local_dir)

def test_put_file_into_directory(self):
local_path = self._make_random_file()
self.hdfs.put(self.tmp_dir, local_path)
remote_file_path = pjoin(self.tmp_dir, local_path)
file_status = self.hdfs.status(remote_file_path)
assert file_status['type'] == 'FILE'

def test_get_file_overwrite(self):
local_path = self._make_random_file()
local_path2 = self._make_random_file()

remote_path = pjoin(self.tmp_dir, local_path)
self.hdfs.put(remote_path, local_path)

remote_path2 = pjoin(self.tmp_dir, local_path2)
self.hdfs.put(remote_path2, local_path2)

with self.assertRaises(IOError):
self.hdfs.get(remote_path, '.')

self.hdfs.get(remote_path, local_path2, overwrite=True)
assert open(local_path2).read() == open(local_path).read()

def test_put_buffer_like(self):
data = b'peekaboo'

buf = BytesIO()
buf.write(data)
buf.seek(0)

remote_path = pjoin(self.tmp_dir, util.guid())
self.hdfs.put(remote_path, buf)

local_path = util.guid()
self.test_files.append(local_path)

self.hdfs.get(remote_path, local_path)
assert open(local_path, 'rb').read() == data

def test_get_logging(self):
# TODO write a test for this
pass

def test_get_directory_nested_dirs(self):
local_dir = util.guid()
local_download_dir = util.guid()
@@ -188,10 +273,10 @@ def test_get_directory_nested_dirs(self):
for i in xrange(K):
self._make_random_file(directory=local_dir)

nested_dir = os.path.join(local_dir, 'nested-dir')
nested_dir = osp.join(local_dir, 'nested-dir')
shutil.copytree(local_dir, nested_dir)

remote_dir = pjoin(self.test_dir, local_dir)
remote_dir = pjoin(self.tmp_dir, local_dir)
self.hdfs.put(remote_dir, local_dir)

# download directory and check contents
@@ -206,22 +291,87 @@ def test_get_directory_nested_dirs(self):
finally:
shutil.rmtree(local_dir)

def test_get_directory_overwrite(self):
local_dir = self._make_test_directory()
local_dir2 = self._make_test_directory()

remote_dir = pjoin(self.tmp_dir, local_dir)
remote_dir2 = pjoin(self.tmp_dir, local_dir2)

self.hdfs.put(remote_dir, local_dir)
self.hdfs.put(remote_dir2, local_dir2)

self.hdfs.get(remote_dir, local_dir2, overwrite=True)
_check_directories_equal(local_dir2, local_dir)

self.hdfs.get(remote_dir, local_dir2, overwrite=True)
_check_directories_equal(local_dir2, local_dir)

def _try_delete_directory(self, path):
try:
shutil.rmtree(path)
except os.error:
pass

def test_ls(self):
test_dir = pjoin(self.test_dir, 'ls-test')
test_dir = pjoin(self.tmp_dir, 'ls-test')
self.hdfs.mkdir(test_dir)
for i in xrange(10):
local_path = self._make_random_file()
hdfs_path = pjoin(test_dir, local_path)
self.hdfs.put(hdfs_path, local_path)

assert len(self.hdfs.ls(test_dir)) == 10

def test_size(self):
test_dir = pjoin(self.tmp_dir, 'size-test')

K = 2048
path = self._make_random_file(size=K)
hdfs_path = pjoin(test_dir, path)
self.hdfs.put(hdfs_path, path)
assert self.hdfs.size(hdfs_path) == K

size_test_dir = self._sample_nested_directory()

hdfs_path = pjoin(test_dir, size_test_dir)
self.hdfs.put(hdfs_path, size_test_dir)

assert self.hdfs.size(hdfs_path) == K * 7

def test_put_get_tarfile(self):
test_dir = pjoin(self.tmp_dir, 'tarfile-test')

dirname = self._sample_nested_directory()

import subprocess
tf_name = '{0}.tar.gz'.format(dirname)
cmd = 'tar zc {0} > {1}'.format(dirname, tf_name)

retcode = subprocess.call(cmd, shell=True)
if retcode:
raise Exception((retcode, cmd))

self.test_files.append(tf_name)

randname = util.guid()
hdfs_path = pjoin(test_dir, randname)
self.hdfs.put_tarfile(hdfs_path, tf_name, compression='gzip')

self.hdfs.get(hdfs_path, '.')
self.test_directories.append(randname)
_check_directories_equal(osp.join(randname, dirname), dirname)

def _sample_nested_directory(self):
K = 2048
dirname = self._make_test_directory(files=2, filesize=K)
nested_dir = osp.join(dirname, util.guid())
os.mkdir(nested_dir)

self._make_test_directory(files=5, filesize=K,
directory=nested_dir)

return dirname


def _check_directories_equal(left, right):
left_files = _get_all_files(left)
@@ -243,12 +393,12 @@ def _contents_equal(left, right):
def _get_all_files(path):
paths = {}
for dirpath, _, filenames in os.walk(path):
rel_dir = os.path.relpath(dirpath, path)
rel_dir = osp.relpath(dirpath, path)
if rel_dir == '.':
rel_dir = ''
for name in filenames:
abspath = os.path.join(dirpath, name)
relpath = os.path.join(rel_dir, name)
abspath = osp.join(dirpath, name)
relpath = osp.join(rel_dir, name)
paths[relpath] = abspath

return paths
576 changes: 454 additions & 122 deletions ibis/tests/test_impala_e2e.py

Large diffs are not rendered by default.

222 changes: 222 additions & 0 deletions ibis/tests/test_pandas_interop.py
@@ -0,0 +1,222 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pytest

import ibis
import ibis.expr.types as ir
from ibis.compat import unittest
from ibis.util import pandas_to_ibis_schema
from ibis.common import IbisTypeError
from ibis.tests.util import ImpalaE2E


functional_alltypes_with_nulls = pd.DataFrame({
'bigint_col': np.int64([0, 10, 20, 30, 40, 50, 60, 70, 80, 90]),
'bool_col': np.bool_([True, False, True, False, True, None, True, False, True,
False]),
'date_string_col': ['11/01/10', None, '11/01/10', '11/01/10',
'11/01/10', '11/01/10', '11/01/10', '11/01/10',
'11/01/10', '11/01/10'],
'double_col': np.float64([0.0, 10.1, None, 30.299999999999997,
40.399999999999999, 50.5, 60.599999999999994,
70.700000000000003, 80.799999999999997, 90.899999999999991]),
'float_col': np.float32([None, 1.1000000238418579, 2.2000000476837158,
3.2999999523162842, 4.4000000953674316, 5.5,
6.5999999046325684, 7.6999998092651367, 8.8000001907348633,
9.8999996185302734]),
'int_col': np.int32([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
'smallint_col': np.int16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'string_col': ['0', '1', None, '3', '4', '5', '6', '7', '8', '9'],
'timestamp_col': [pd.Timestamp('2010-11-01 00:00:00'),
None,
pd.Timestamp('2010-11-01 00:02:00.100000'),
pd.Timestamp('2010-11-01 00:03:00.300000'),
pd.Timestamp('2010-11-01 00:04:00.600000'),
pd.Timestamp('2010-11-01 00:05:00.100000'),
pd.Timestamp('2010-11-01 00:06:00.150000'),
pd.Timestamp('2010-11-01 00:07:00.210000'),
pd.Timestamp('2010-11-01 00:08:00.280000'),
pd.Timestamp('2010-11-01 00:09:00.360000')],
'tinyint_col': np.int8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'year': [2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010]})


class TestPandasTypeInterop(unittest.TestCase):

def test_series_to_ibis_literal(self):
values = [1, 2, 3, 4]
s = pd.Series(values)

expr = ir.as_value_expr(s)
expected = ir.sequence(list(s))
assert expr.equals(expected)


class TestPandasSchemaInference(unittest.TestCase):

def test_dtype_bool(self):
df = pd.DataFrame({'col': [True, False, False]})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'boolean')])
assert inferred == expected

def test_dtype_int8(self):
df = pd.DataFrame({'col': np.int8([-3, 9, 17])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int8')])
assert inferred == expected

def test_dtype_int16(self):
df = pd.DataFrame({'col': np.int16([-5, 0, 12])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int16')])
assert inferred == expected

def test_dtype_int32(self):
df = pd.DataFrame({'col': np.int32([-12, 3, 25000])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int32')])
assert inferred == expected

def test_dtype_int64(self):
df = pd.DataFrame({'col': np.int64([102, 67228734, -0])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int64')])
assert inferred == expected

def test_dtype_float32(self):
df = pd.DataFrame({'col': np.float32([45e-3, -0.4, 99.])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'float')])
assert inferred == expected

def test_dtype_float64(self):
df = pd.DataFrame({'col': np.float64([-3e43, 43., 10000000.])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'double')])
assert inferred == expected

def test_dtype_uint8(self):
df = pd.DataFrame({'col': np.uint8([3, 0, 16])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int16')])
assert inferred == expected

def test_dtype_uint16(self):
df = pd.DataFrame({'col': np.uint16([5569, 1, 33])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int32')])
assert inferred == expected

def test_dtype_uint32(self):
df = pd.DataFrame({'col': np.uint32([100, 0, 6])})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int64')])
assert inferred == expected

def test_dtype_uint64(self):
df = pd.DataFrame({'col': np.uint64([666, 2, 3])})
with self.assertRaises(IbisTypeError):
inferred = pandas_to_ibis_schema(df)

def test_dtype_datetime64(self):
df = pd.DataFrame({
'col': [pd.Timestamp('2010-11-01 00:01:00'),
pd.Timestamp('2010-11-01 00:02:00.1000'),
pd.Timestamp('2010-11-01 00:03:00.300000')]})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'timestamp')])
assert inferred == expected

def test_dtype_timedelta64(self):
df = pd.DataFrame({
'col': [pd.Timedelta('1 days'),
pd.Timedelta('-1 days 2 min 3us'),
pd.Timedelta('-2 days +23:57:59.999997')]})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'int64')])
assert inferred == expected

def test_dtype_string(self):
df = pd.DataFrame({'col': ['foo', 'bar', 'hello']})
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'string')])
assert inferred == expected

def test_dtype_categorical(self):
df = pd.DataFrame({'col': ['a', 'b', 'c', 'a']}, dtype='category')
inferred = pandas_to_ibis_schema(df)
expected = ibis.schema([('col', 'category')])
assert inferred == expected


@pytest.mark.e2e
class TestPandasRoundTrip(ImpalaE2E, unittest.TestCase):

def test_round_trip(self):
pytest.skip('fails')

df1 = self.alltypes.execute()
df2 = self.con.pandas(df1, 'bamboo', database=self.tmp_db).execute()
assert (df1.columns == df2.columns).all()
assert (df1.dtypes == df2.dtypes).all()
assert (df1 == df2).all().all()

def test_round_trip_non_int_missing_data(self):
df1 = functional_alltypes_with_nulls
table = self.con.pandas(df1, 'fawn', database=self.tmp_db)
df2 = table.execute()
assert (df1.columns == df2.columns).all()
assert (df1.dtypes == df2.dtypes).all()
# bool/int cols should be exact
assert (df1.bool_col == df2.bool_col).all()
assert (df1.tinyint_col == df2.tinyint_col).all()
assert (df1.smallint_col == df2.smallint_col).all()
assert (df1.int_col == df2.int_col).all()
assert (df1.bigint_col == df2.bigint_col).all()
assert (df1.month == df2.month).all()
assert (df1.year == df2.year).all()
# string cols should be equal everywhere except for the NULLs
assert ((df1.string_col == df2.string_col) ==
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
assert ((df1.date_string_col == df2.date_string_col) ==
[1, 0, 1, 1, 1, 1, 1, 1, 1, 1]).all()
# float cols within tolerance, and NULLs should be False
assert ((df1.double_col - df2.double_col < 1e-9) ==
[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
assert ((df1.float_col - df2.float_col < 1e-9) ==
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]).all()

def test_round_trip_missing_type_promotion(self):
pytest.skip('unfinished')

# prepare Impala table with missing ints
# TODO: switch to self.con.raw_sql once #412 is fixed
create_query = ('CREATE TABLE {0}.missing_ints '
' (tinyint_col TINYINT, bigint_col BIGINT) '
'STORED AS PARQUET'.format(self.tmp_db))
insert_query = ('INSERT INTO {0}.missing_ints '
'VALUES (NULL, 3), (-5, NULL), (19, 444444)'.format(
self.tmp_db))
self.con.con.cursor.execute(create_query)
self.con.con.cursor.execute(insert_query)

table = self.con.table('missing_ints', database=self.tmp_db)
df = table.execute()

# WHAT NOW?
2 changes: 1 addition & 1 deletion ibis/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import socket
import struct
import threading
import unittest

from ibis.compat import unittest
from ibis.server import IbisServerNode


7 changes: 3 additions & 4 deletions ibis/tests/test_tasks.py
@@ -14,18 +14,20 @@

import os
import pytest
import unittest

import pandas as pd

from cPickle import loads as pickle_load
from ibis.cloudpickle import dumps as pickle_dump

from test_comms import double_ex

from ibis.tasks import IbisTaskMessage, IbisTaskExecutor
from ibis.util import guid
from ibis.wire import BytesIO
import ibis.wire as wire

from ibis.compat import unittest
from ibis.tests.test_server import WorkerTestFixture

try:
@@ -154,9 +156,6 @@ def delete_all_guid_files():
[os.remove(x) for x in glob.glob('*') if len(x) == 32]


from test_comms import double_ex


class NRows(object):

def __init__(self):
139 changes: 139 additions & 0 deletions ibis/tests/util.py
@@ -0,0 +1,139 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time

import pytest

from ibis import Schema
from ibis import options
import ibis.util as util
import ibis


class IbisTestEnv(object):

def __init__(self):
# TODO: allow initializing values through a constructor
self.impala_host = os.environ.get('IBIS_TEST_IMPALA_HOST', 'localhost')
self.impala_protocol = os.environ.get('IBIS_TEST_IMPALA_PROTOCOL',
'hiveserver2')
self.impala_port = int(os.environ.get('IBIS_TEST_IMPALA_PORT', 21050))
self.tmp_db = os.environ.get('IBIS_TEST_TMP_DB',
'__ibis_tmp_{0}'.format(util.guid()))
self.tmp_dir = os.environ.get('IBIS_TEST_TMP_HDFS_DIR',
'/tmp/__ibis_test')
self.test_data_db = os.environ.get('IBIS_TEST_DATA_DB', 'ibis_testing')
self.test_data_dir = os.environ.get('IBIS_TEST_DATA_HDFS_DIR',
'/__ibis/ibis-testing-data')
self.nn_host = os.environ.get('IBIS_TEST_NN_HOST', 'localhost')
# 5070 is default for impala dev env
self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT', 5070))
self.hdfs_url = 'http://{0}:{1}'.format(self.nn_host,
self.webhdfs_port)
self.use_codegen = os.environ.get('IBIS_TEST_USE_CODEGEN',
'False').lower() == 'true'
self.cleanup_test_data = os.environ.get('IBIS_TEST_CLEANUP_TEST_DATA',
'True').lower() == 'true'
self.use_kerberos = os.environ.get('IBIS_TEST_USE_KERBEROS',
'False').lower() == 'true'

# update global Ibis config where relevant
options.impala.temp_db = self.tmp_db
options.impala.temp_hdfs_path = self.tmp_dir

def __repr__(self):
kvs = ['{0}={1}'.format(k, v) for (k, v) in self.__dict__.iteritems()]
return 'IbisTestEnv(\n {0})'.format(',\n '.join(kvs))


def connect_test(env, with_hdfs=True):
con = ibis.impala_connect(host=env.impala_host,
protocol=env.impala_protocol,
database=env.test_data_db,
port=env.impala_port,
use_kerberos=env.use_kerberos,
pool_size=2)
if with_hdfs:
if env.use_kerberos:
from hdfs.ext.kerberos import KerberosClient
hdfs_client = KerberosClient(env.hdfs_url, mutual_auth='REQUIRED')
else:
from hdfs.client import InsecureClient
hdfs_client = InsecureClient(env.hdfs_url)
return ibis.make_client(con, hdfs_client)
else:
return ibis.make_client(con)


@pytest.mark.e2e
class ImpalaE2E(object):

@classmethod
def setUpClass(cls):
ENV = IbisTestEnv()
cls.con = connect_test(ENV)
# Tests generally run faster with codegen disabled
if not ENV.use_codegen:
cls.con.disable_codegen()
cls.hdfs = cls.con.hdfs
cls.test_data_dir = ENV.test_data_dir
cls.test_data_db = ENV.test_data_db
cls.tmp_dir = ENV.tmp_dir
cls.tmp_db = ENV.tmp_db
cls.alltypes = cls.con.table('functional_alltypes')

if not cls.con.exists_database(cls.tmp_db):
cls.con.create_database(cls.tmp_db)

@classmethod
def tearDownClass(cls):
i, retries = 0, 3
while True:
# reduce test flakiness
try:
cls.con.drop_database(cls.tmp_db, force=True)
break
except:
i += 1
if i >= retries:
raise

time.sleep(0.1)

def setUp(self):
self.temp_databases = []
self.temp_tables = []
self.temp_views = []

def tearDown(self):
for t in self.temp_tables:
self.con.drop_table(t, force=True)

for t in self.temp_views:
self.con.drop_view(t, force=True)

self.con.set_database(self.test_data_db)
for t in self.temp_databases:
self.con.drop_database(t, force=True)


def assert_equal(left, right):
if util.all_of([left, right], Schema):
assert left.equals(right),\
'Comparing schemas: \n%s !=\n%s' % (repr(left), repr(right))
else:
assert left.equals(right), ('Objects unequal: {0}\nvs\n{1}'
.format(repr(left), repr(right)))
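For orientation, a minimal sketch of how a suite builds on this fixture (class and test names are hypothetical): ImpalaE2E.setUpClass connects through an IbisTestEnv and exposes self.con and self.alltypes, and anything appended to self.temp_tables, self.temp_views, or self.temp_databases is dropped again in tearDown.

import pytest

from ibis.compat import unittest
from ibis.tests.util import ImpalaE2E


@pytest.mark.e2e
class TestExampleE2E(ImpalaE2E, unittest.TestCase):

    def test_fetch_alltypes(self):
        # self.con and self.alltypes are provided by ImpalaE2E.setUpClass
        df = self.alltypes.execute()
        assert len(df) > 0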
66 changes: 66 additions & 0 deletions ibis/util.py
@@ -12,6 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pandas.core.common as pdcom

import ibis
from ibis.common import IbisTypeError


def guid():
try:
@@ -114,3 +121,62 @@ def get(self, key):
if key.equals(k):
return v
raise KeyError(key)


def pandas_col_to_ibis_type(col):
dty = col.dtype

# datetime types
if pdcom.is_datetime64_dtype(dty):
if pdcom.is_datetime64_ns_dtype(dty):
return 'timestamp'
else:
raise IbisTypeError(
"Column {0} has dtype {1}, which is datetime64-like but does "
"not use nanosecond units".format(col.name, dty))
if pdcom.is_timedelta64_dtype(dty):
print("Warning: encoding a timedelta64 as an int64")
return 'int64'

if pdcom.is_categorical_dtype(dty):
return 'category'

if pdcom.is_bool_dtype(dty):
return 'boolean'

# simple numerical types
if issubclass(dty.type, np.int8):
return 'int8'
if issubclass(dty.type, np.int16):
return 'int16'
if issubclass(dty.type, np.int32):
return 'int32'
if issubclass(dty.type, np.int64):
return 'int64'
if issubclass(dty.type, np.float32):
return 'float'
if issubclass(dty.type, np.float64):
return 'double'
if issubclass(dty.type, np.uint8):
return 'int16'
if issubclass(dty.type, np.uint16):
return 'int32'
if issubclass(dty.type, np.uint32):
return 'int64'
if issubclass(dty.type, np.uint64):
raise IbisTypeError("Column {0} is an unsigned int64".format(col.name))

if pdcom.is_object_dtype(dty):
# TODO: overly broad?
return 'string'

raise IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))


def pandas_to_ibis_schema(frame):
# no analog for decimal in pandas
pairs = []
for col_name in frame:
ibis_type = pandas_col_to_ibis_type(frame[col_name])
pairs.append((col_name, ibis_type))
return ibis.schema(pairs)
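The dtype mapping above is what TestPandasSchemaInference exercises earlier in this diff; as a quick sketch of the intended use (column names and values are illustrative only):

import numpy as np
import pandas as pd

from ibis.util import pandas_to_ibis_schema

df = pd.DataFrame({'a': np.int32([1, 2, 3]),
                   'b': ['x', 'y', None],
                   'c': np.float64([0.1, 0.2, 0.3])})

schema = pandas_to_ibis_schema(df)
# equivalent to ibis.schema([('a', 'int32'), ('b', 'string'), ('c', 'double')])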
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,5 +4,5 @@ pandas>=0.12.0
impyla>=0.9.1
psutil==0.6.1
snakebite
hdfs[kerberos]
hdfs[kerberos]>=1.1.1
six
51 changes: 51 additions & 0 deletions scripts/airline.py
@@ -0,0 +1,51 @@
import ibis
import os
import pandas


def wrangle_csvs():
years = range(1987, 2009)

for year in years:
path = '%d.csv.bz2' % year
outpath = os.path.expanduser('~/data/%d_clean.csv' % year)

print 'Working on %s' % path

df = pandas.read_csv(path, compression='bz2')
df.to_csv(outpath, header=False, index=False,
float_format='%g', na_rep='\N')



schema = ibis.schema([
('year', 'int32'),
('month', 'int8'),
('day', 'int8'),
('dayofweek', 'int8'),
('dep_time', 'int32'),
('crs_dep_time', 'int32'),
('arr_time', 'int32'),
('crs_arr_time', 'int32'),
('carrier', 'string'),
('flight_num', 'int32'),
('tail_num', 'int32'),
('actual_elapsed_time', 'int32'),
('crs_elapsed_time', 'int32'),
('airtime', 'int32'),
('arrdelay', 'int32'),
('depdelay', 'int32'),
('origin', 'string'),
('dest', 'string'),
('distance', 'int32'),
('taxi_in', 'int32'),
('taxi_out', 'int32'),
('cancelled', 'int8'),
('cancellation_code', 'string'),
('diverted', 'int8'),
('carrier_delay', 'int32'),
('weather_delay', 'int32'),
('nas_delay', 'int32'),
('security_delay', 'int32'),
('late_aircraft_delay', 'int32')
])
50 changes: 50 additions & 0 deletions scripts/cleanup_testing_data.py
@@ -0,0 +1,50 @@
#! /usr/bin/env python
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cleans up the ibis-testing-data from Impala/HDFS and also the HDFS tmp data
# directory

from __future__ import print_function

from posixpath import join as pjoin
import os
import posixpath
import shutil
import sys
import tempfile
import subprocess

import ibis
from ibis.tests.util import IbisTestEnv


ENV = IbisTestEnv()


def make_connection():
ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
protocol=ENV.impala_protocol)
hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
return ibis.make_client(ic, hdfs_client=hdfs)


if __name__ == '__main__':
if ENV.cleanup_test_data:
con = make_connection()
con.drop_database(ENV.test_data_db, force=True)
con.hdfs.rmdir(ENV.test_data_dir)
con.hdfs.rmdir(ENV.tmp_dir)
else:
print('IBIS_TEST_CLEANUP_TEST_DATA not set to True; refusing to clean')
124 changes: 124 additions & 0 deletions scripts/create_test_data_archive.py
@@ -0,0 +1,124 @@
#! /usr/bin/env python
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Populates the ibis_testing Impala database

from posixpath import join as pjoin
import os
import posixpath
import shutil
import tempfile
import subprocess

import numpy as np
import pandas as pd
import pandas.util.testing as tm

from ibis.util import guid
from ibis.tests.util import IbisTestEnv
import ibis


ENV = IbisTestEnv()
TMP_DB_HDFS_PATH = pjoin(ENV.tmp_dir, guid())
TMP_DB = guid()
# hardcoded:
IBIS_TEST_DATA_LOCAL_DIR = 'ibis-testing-data'


def make_connection():
ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
protocol=ENV.impala_protocol)
hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
return ibis.make_client(ic, hdfs_client=hdfs)


def make_temp_database(con):
if con.exists_database(TMP_DB):
con.drop_database(TMP_DB, force=True)
con.create_database(TMP_DB, path=TMP_DB_HDFS_PATH)
print('Created database {0} at {1}'.format(TMP_DB, TMP_DB_HDFS_PATH))


def scrape_parquet_files(con):
to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')]
to_scrape.append(('functional', 'alltypes'))
for db, tname in to_scrape:
table = con.table(tname, database=db)
new_name = '{0}_{1}'.format(db, tname)
print('Creating {0}'.format(new_name))
con.create_table(new_name, table, database=TMP_DB)


def download_parquet_files(con):
parquet_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'parquet')
print("Downloading {0}".format(parquet_path))
con.hdfs.get(TMP_DB_HDFS_PATH, parquet_path)


def download_avro_files(con):
avro_path = '/test-warehouse/tpch.region_avro'
os.mkdir(os.path.join(IBIS_TEST_DATA_LOCAL_DIR, 'avro'))
print("Downloading {0}".format(avro_path))
con.hdfs.get(avro_path,
pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'avro', 'tpch_region_avro'))


def generate_csv_files():
N = 10
nfiles = 10

csv_base = os.path.join(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
os.mkdir(csv_base)

df = pd.DataFrame({
'foo': [tm.rands(10) for _ in xrange(N)],
'bar': np.random.randn(N),
'baz': np.random.randint(0, 100, size=N)
}, columns=['foo', 'bar', 'baz'])

for i in xrange(nfiles):
csv_path = os.path.join(csv_base, '{0}.csv'.format(i))
print('Writing {0}'.format(csv_path))
df.to_csv(csv_path, index=False, header=False)


def cleanup_temporary_stuff(con):
con.drop_database(TMP_DB, force=True)
assert not con.hdfs.exists(TMP_DB_HDFS_PATH)


def make_local_test_archive():
con = make_connection()
make_temp_database(con)

try:
scrape_parquet_files(con)

if os.path.exists(IBIS_TEST_DATA_LOCAL_DIR):
shutil.rmtree(IBIS_TEST_DATA_LOCAL_DIR)
os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)

download_parquet_files(con)
download_avro_files(con)
generate_csv_files()
finally:
cleanup_temporary_stuff(con)

# TODO: push a tarball to S3?


if __name__ == '__main__':
make_local_test_archive()
208 changes: 93 additions & 115 deletions scripts/load_test_data.py
100644 → 100755
@@ -1,3 +1,4 @@
#! /usr/bin/env python
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,131 +13,55 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Populates the ibis_testing Impala database
# Fetches the ibis-testing-data archive and loads it into Impala

from __future__ import print_function

from posixpath import join as pjoin
import os
import posixpath
import shutil
import sys
import tempfile
import subprocess

from ibis.util import guid
import ibis
from ibis.tests.util import IbisTestEnv

IMPALA_HOST = 'localhost'
HDFS_HOST = 'localhost'
WEBHDFS_PORT = 5070
TEST_DB = 'ibis_testing'
TEST_DATA_DIR = 'ibis-testing-data'
TEST_DATA_HDFS_LOC = '/__ibis/ibis-testing-data'


def make_connection():
ic = ibis.impala_connect(host=IMPALA_HOST)
hdfs = ibis.hdfs_connect(host=HDFS_HOST, port=WEBHDFS_PORT)
con = ibis.make_client(ic, hdfs_client=hdfs)

return con

# ----------------------------------------------------------------------
# Functions for creating the test data archive to begin with

TMP_DB_LOCATION = '/__ibis/{0}'.format(guid())
TMP_DB = guid()

def make_temp_database(con):
if con.exists_database(TMP_DB):
con.drop_database(TMP_DB, force=True)
con.create_database(TMP_DB, path=TMP_DB_LOCATION)
print('Created database {0} at {1}'.format(TMP_DB, TMP_DB_LOCATION))


def cleanup_temporary_stuff(con):
con.drop_database(TMP_DB, force=True)
assert not con.hdfs.exists(TMP_DB_LOCATION)

def download_parquet_files(con):
parquet_path = pjoin(TEST_DATA_DIR, 'parquet')
print("Downloading {0}".format(parquet_path))
con.hdfs.get(TMP_DB_LOCATION, parquet_path)

ENV = IbisTestEnv()
# hardcoded:
IBIS_TEST_DATA_URL = ('https://ibis-test-resources.s3.amazonaws.com/'
'ibis-testing-data.tar.gz')

def download_avro_files(con):
avro_path = '/test-warehouse/tpch.region_avro'
os.mkdir(os.path.join(TEST_DATA_DIR, 'avro'))
print("Downloading {0}".format(avro_path))
con.hdfs.get(avro_path, pjoin(TEST_DATA_DIR, 'avro', 'tpch.region'))


def generate_csv_files():
import numpy as np
import pandas as pd
import pandas.util.testing as tm

N = 10
nfiles = 10

csv_base = os.path.join(TEST_DATA_DIR, 'csv')
os.mkdir(csv_base)

df = pd.DataFrame({
'foo': [tm.rands(10) for _ in xrange(N)],
'bar': np.random.randn(N),
'baz': np.random.randint(0, 100, size=N)
}, columns=['foo', 'bar', 'baz'])

for i in xrange(nfiles):
csv_path = os.path.join(csv_base, '{}.csv'.format(i))
print('Writing {}'.format(csv_path))
df.to_csv(csv_path, index=False, header=False)


def scrape_parquet_files(con):
to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')]
to_scrape.append(('functional', 'alltypes'))
for db, tname in to_scrape:
table = con.table(tname, database=db)
new_name = '{}_{}'.format(db, tname)
print('Creating {}'.format(new_name))
con.create_table(new_name, table, database=TMP_DB)


def make_local_test_archive():
con = make_connection()
make_temp_database(con)

try:
scrape_parquet_files(con)

if os.path.exists(TEST_DATA_DIR):
shutil.rmtree(TEST_DATA_DIR)
os.mkdir(TEST_DATA_DIR)

download_parquet_files(con)
download_avro_files(con)
generate_csv_files()
finally:
cleanup_temporary_stuff(con)

# ----------------------------------------------------------------------
#
def make_connection():
ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
protocol=ENV.impala_protocol)
hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
return ibis.make_client(ic, hdfs_client=hdfs)


def write_data_to_hdfs(con):
# TODO per #278, write directly from the gzipped tarball
con.hdfs.put(TEST_DATA_HDFS_LOC, TEST_DATA_DIR,
verbose=True, overwrite=True)
def get_ibis_test_data(local_path):
cmd = 'cd {0} && wget -q {1} && tar -xzf {2}'.format(
local_path, IBIS_TEST_DATA_URL, os.path.basename(IBIS_TEST_DATA_URL))
subprocess.check_call(cmd, shell=True)
data_dir = pjoin(local_path,
os.path.basename(IBIS_TEST_DATA_URL).split('.', 2)[0])
print('Downloaded {0} and unpacked it to {1}'.format(IBIS_TEST_DATA_URL,
data_dir))
return data_dir


def create_test_database(con):
if con.exists_database(TEST_DB):
con.drop_database(TEST_DB, force=True)
con.create_database(TEST_DB)
print('Created database {0}'.format(TEST_DB))
if con.exists_database(ENV.test_data_db):
con.drop_database(ENV.test_data_db, force=True)
con.create_database(ENV.test_data_db)
print('Created database {0}'.format(ENV.test_data_db))


def create_parquet_tables(con):
parquet_files = con.hdfs.ls(pjoin(TEST_DATA_HDFS_LOC, 'parquet'))

parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet'))
schemas = {
'functional_alltypes': ibis.schema(
[('id', 'int32'),
@@ -151,26 +76,79 @@ def create_parquet_tables(con):
('string_col', 'string'),
('timestamp_col', 'timestamp'),
('year', 'int32'),
('month', 'int32')])
}
('month', 'int32')]),
'tpch_region': ibis.schema(
[('r_regionkey', 'int16'),
('r_name', 'string'),
('r_comment', 'string')])}

for path in parquet_files:
head, table_name = posixpath.split(path)
print 'Creating {0}'.format(table_name)

print('Creating {0}'.format(table_name))
# if no schema is given, it will be inferred from the Parquet file
schema = schemas.get(table_name)

con.parquet_file(path, schema=schema, name=table_name,
database=TEST_DB, persist=True)
database=ENV.test_data_db, persist=True)


def setup_test_data():
def create_avro_tables(con):
avro_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'avro'))
schemas = {
'tpch_region_avro': {
'type': 'record',
'name': 'a',
'fields': [
{'name': 'R_REGIONKEY', 'type': ['null', 'int']},
{'name': 'R_NAME', 'type': ['null', 'string']},
{'name': 'R_COMMENT', 'type': ['null', 'string']}]}}

for path in avro_files:
head, table_name = posixpath.split(path)
print('Creating {0}'.format(table_name))
schema = schemas[table_name]
con.avro_file(path, schema, name=table_name, database=ENV.test_data_db,
persist=True)


def setup_test_data(local_data_dir):
con = make_connection()
write_data_to_hdfs(con)
hdfs = con.hdfs

if hdfs.exists(ENV.test_data_dir):
hdfs.rmdir(ENV.test_data_dir)
hdfs.put(ENV.test_data_dir, local_data_dir, verbose=True)

create_test_database(con)
create_parquet_tables(con)
create_avro_tables(con)


def can_write_to_hdfs():
from ibis.compat import BytesIO
con = make_connection()

test_path = pjoin(ENV.test_data_dir, ibis.util.guid())
test_file = BytesIO(ibis.util.guid())

try:
con.hdfs.put(test_path, test_file)
con.hdfs.rm(test_path)
return True
except:
return False


if __name__ == '__main__':
setup_test_data()
if len(sys.argv) > 1:
data_dir = os.path.expanduser(sys.argv[1])
setup_test_data(data_dir)
else:
if not can_write_to_hdfs():
print('Do not have write permission to HDFS')

try:
tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp')
local_data_dir = get_ibis_test_data(tmp_dir)
setup_test_data(local_data_dir)
finally:
shutil.rmtree(tmp_dir)
75 changes: 75 additions & 0 deletions scripts/run_jenkins.sh
@@ -0,0 +1,75 @@
#! /usr/bin/env bash
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script calls machinery that initializes an ibis.tests.util.IbisTestEnv,
# so the IBIS_TEST_* environment variables it reads must be set correctly. It
# also assumes that WORKSPACE is set (i.e., that it is being run as a Jenkins
# job). If that is not the case, set GIT_URL and GIT_BRANCH instead so the
# script can check out the repository itself.

set -e
set -x

printenv

mkdir -p /tmp/impyla-dbapi
TMP_DIR=$(mktemp -d -p /tmp/impyla-dbapi tmpXXXX)

function cleanup {
rm -rf $TMP_DIR
}
trap cleanup EXIT

cd $TMP_DIR

# Checkout ibis if necessary
if [ -z "$WORKSPACE" ]; then
: ${GIT_URL:?"GIT_URL is unset"}
: ${GIT_BRANCH:?"GIT_BRANCH is unset"}
git clone $GIT_URL
pushd ibis && git checkout origin/$GIT_BRANCH && popd
IBIS_HOME=$TMP_DIR/ibis
else
# WORKSPACE is set, so I must be on a Jenkins slave
IBIS_HOME=$WORKSPACE
fi

# Setup Python
curl https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p $TMP_DIR/miniconda
export PATH="$TMP_DIR/miniconda/bin:$PATH"
conda update -y -q conda
conda info -a

# Install ibis and deps into new environment
CONDA_ENV_NAME=pyenv-ibis-test
conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION numpy pandas
source activate $CONDA_ENV_NAME
pip install $IBIS_HOME

python --version
which python

cd $IBIS_HOME

python -c "from ibis.tests.util import IbisTestEnv; print(IbisTestEnv())"

# load necessary test data
scripts/load_test_data.py

# run the test suite
py.test --e2e ibis

# cleanup
scripts/cleanup_testing_data.py
16 changes: 10 additions & 6 deletions setup.py
@@ -32,16 +32,14 @@

cmdclass['build_ext'] = build_ext

import numpy as np

from setuptools import setup
import os
import sys

from distutils.extension import Extension

MAJOR = 0
MINOR = 2
MINOR = 3
MICRO = 0
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)

@@ -65,13 +63,19 @@ def run(self):

cmdclass['clean'] = clean

common_include = ['ibis/src', np.get_include()]

with open('requirements.txt') as f:
file_reqs = f.read().splitlines()
requirements = requirements + file_reqs

PY26 = sys.version_info[0] == 2 and sys.version_info[1] == 6
if PY26:
requirements.append('argparse')
requirements.append('unittest2')

if COMMS_EXT_ENABLED:
import numpy as np

common_include = ['ibis/src', np.get_include()]
comms_ext_libraries = []
if sys.platform != 'darwin':
# libuuid is available without additional linking as part of the base
Expand All @@ -88,7 +92,7 @@ def run(self):
extensions = cythonize([comms_ext])

setup(
name='ibis',
name='ibis-framework',
packages=['ibis',
'ibis.expr',
'ibis.expr.tests',