@@ -0,0 +1,222 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ibis import window
import ibis

from ibis.sql.compiler import to_sql
from ibis.expr.tests.mocks import BasicTestCase
from ibis.compat import unittest
import ibis.common as com


class TestWindowFunctions(BasicTestCase, unittest.TestCase):

    def _check_sql(self, expr, expected):
        result = to_sql(expr)
        assert result == expected

    def test_aggregate_in_projection(self):
        t = self.con.table('alltypes')
        proj = t[t, (t.f / t.f.sum()).name('normed_f')]

        expected = """\
SELECT *, f / sum(f) OVER () AS `normed_f`
FROM alltypes"""
        self._check_sql(proj, expected)

    def test_add_default_order_by(self):
        t = self.con.table('alltypes')

        first = t.f.first().name('first')
        last = t.f.last().name('last')
        lag = t.f.lag().name('lag')
        diff = (t.f.lead() - t.f).name('fwd_diff')
        lag2 = t.f.lag().over(window(order_by=t.d)).name('lag2')
        grouped = t.group_by('g')
        proj = grouped.mutate([lag, diff, first, last, lag2])
        expected = """\
SELECT *, lag(f) OVER (PARTITION BY g ORDER BY f) AS `lag`,
       lead(f) OVER (PARTITION BY g ORDER BY f) - f AS `fwd_diff`,
       first_value(f) OVER (PARTITION BY g ORDER BY f) AS `first`,
       last_value(f) OVER (PARTITION BY g ORDER BY f) AS `last`,
       lag(f) OVER (PARTITION BY g ORDER BY d) AS `lag2`
FROM alltypes"""
        self._check_sql(proj, expected)

    def test_window_frame_specs(self):
        t = self.con.table('alltypes')

        ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

        cases = [
            (window(preceding=0),
             'range between current row and unbounded following'),

            (window(following=0),
             'range between unbounded preceding and current row'),

            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(preceding=5, following=0),
             'rows between 5 preceding and current row'),
            (window(preceding=5, following=2),
             'rows between 5 preceding and 2 following'),
            (window(following=2),
             'rows between unbounded preceding and 2 following'),
            (window(following=2, preceding=0),
             'rows between current row and 2 following'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(following=[5, 10]),
             'rows between 5 following and 10 following'),
            (window(preceding=[10, 5]),
             'rows between 10 preceding and 5 preceding'),

            # cumulative windows
            (ibis.cumulative_window(),
             'range between unbounded preceding and current row'),

            # trailing windows
            (ibis.trailing_window(10),
             'rows between 10 preceding and current row'),
        ]

        for w, frame in cases:
            w2 = w.order_by(t.f)
            expr = t.projection([t.d.sum().over(w2).name('foo')])
            expected = ex_template.format(frame.upper())
            self._check_sql(expr, expected)

    def test_cumulative_functions(self):
        t = self.con.table('alltypes')

        w = ibis.window(order_by=t.d)
        exprs = [
            (t.f.cumsum().over(w), t.f.sum().over(w)),
            (t.f.cummin().over(w), t.f.min().over(w)),
            (t.f.cummax().over(w), t.f.max().over(w)),
            (t.f.cummean().over(w), t.f.mean().over(w)),
        ]

        for cumulative, static in exprs:
            actual = cumulative.name('foo')
            expected = static.over(ibis.cumulative_window()).name('foo')

            expr1 = t.projection(actual)
            expr2 = t.projection(expected)

            self._compare_sql(expr1, expr2)

    def _compare_sql(self, e1, e2):
        s1 = to_sql(e1)
        s2 = to_sql(e2)
        assert s1 == s2

    def test_nested_analytic_function(self):
        t = self.con.table('alltypes')

        w = window(order_by=t.f)
        expr = (t.f - t.f.lag()).lag().over(w).name('foo')
        result = t.projection([expr])
        expected = """\
SELECT lag(f - lag(f) OVER (ORDER BY f)) \
OVER (ORDER BY f) AS `foo`
FROM alltypes"""
        self._check_sql(result, expected)

    def test_rank_functions(self):
        t = self.con.table('alltypes')

        proj = t[t.g, t.f.rank().name('minr'),
                 t.f.dense_rank().name('denser')]
        expected = """\
SELECT g, rank() OVER (ORDER BY f) - 1 AS `minr`,
       dense_rank() OVER (ORDER BY f) - 1 AS `denser`
FROM alltypes"""
        self._check_sql(proj, expected)

    def test_multiple_windows(self):
        t = self.con.table('alltypes')

        w = window(group_by=t.g)

        expr = t.f.sum().over(w) - t.f.sum()
        proj = t.projection([t.g, expr.name('result')])

        expected = """\
SELECT g, sum(f) OVER (PARTITION BY g) - sum(f) OVER () AS `result`
FROM alltypes"""
        self._check_sql(proj, expected)

    def test_order_by_desc(self):
        t = self.con.table('alltypes')

        w = window(order_by=ibis.desc(t.f))

        proj = t[t.f, ibis.row_number().over(w).name('revrank')]
        expected = """\
SELECT f, row_number() OVER (ORDER BY f DESC) - 1 AS `revrank`
FROM alltypes"""
        self._check_sql(proj, expected)

        expr = (t.group_by('g')
                .order_by(ibis.desc(t.f))
                [t.d.lag().name('foo'), t.a.max()])
        expected = """\
SELECT lag(d) OVER (PARTITION BY g ORDER BY f DESC) AS `foo`,
       max(a) OVER (PARTITION BY g ORDER BY f DESC) AS `max`
FROM alltypes"""
        self._check_sql(expr, expected)

    def test_row_number_requires_order_by(self):
        t = self.con.table('alltypes')

        with self.assertRaises(com.ExpressionError):
            (t.group_by(t.g)
             .mutate(ibis.row_number().name('foo')))

        expr = (t.group_by(t.g)
                .order_by(t.f)
                .mutate(ibis.row_number().name('foo')))

        expected = """\
SELECT *, row_number() OVER (PARTITION BY g ORDER BY f) - 1 AS `foo`
FROM alltypes"""
        self._check_sql(expr, expected)

    def test_unsupported_aggregate_functions(self):
        t = self.con.table('alltypes')
        w = ibis.window(order_by=t.d)

        exprs = [
            t.f.approx_nunique(),
            t.f.approx_median(),
            t.g.group_concat(),
        ]

        for expr in exprs:
            with self.assertRaises(com.TranslationError):
                proj = t.projection([expr.over(w).name('foo')])
                to_sql(proj)

    def test_math_on_windowed_expr(self):
        # Window clause may not be found at top level of expression
        pass

    def test_group_by_then_different_sort_orders(self):
        pass
@@ -0,0 +1,13 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,26 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pytest import skip


def pytest_addoption(parser):
    parser.addoption('--e2e', action='store_true', default=False,
                     help='Enable the e2e (end-to-end) tests')


def pytest_runtest_setup(item):
    if getattr(item.obj, 'e2e', None):  # the test item is marked e2e
        if not item.config.getvalue('e2e'):  # but --e2e option not set
            skip('e2e was not enabled')
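For reference, a suite opts into this gate by carrying the same `e2e` marker
that pytest_runtest_setup inspects (as the ImpalaE2E-based tests below do); a
minimal sketch, with a hypothetical test class:

import pytest


@pytest.mark.e2e
class TestAgainstCluster(object):
    # hypothetical: always collected, but skipped unless pytest is
    # invoked as `py.test --e2e`
    def test_connect(self):
        assert True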
@@ -0,0 +1,222 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pytest

import ibis
import ibis.expr.types as ir
from ibis.compat import unittest
from ibis.util import pandas_to_ibis_schema
from ibis.common import IbisTypeError
from ibis.tests.util import ImpalaE2E


functional_alltypes_with_nulls = pd.DataFrame({
    'bigint_col': np.int64([0, 10, 20, 30, 40, 50, 60, 70, 80, 90]),
    'bool_col': np.bool_([True, False, True, False, True, None, True,
                          False, True, False]),
    'date_string_col': ['11/01/10', None, '11/01/10', '11/01/10',
                        '11/01/10', '11/01/10', '11/01/10', '11/01/10',
                        '11/01/10', '11/01/10'],
    'double_col': np.float64([0.0, 10.1, None, 30.299999999999997,
                              40.399999999999999, 50.5, 60.599999999999994,
                              70.700000000000003, 80.799999999999997,
                              90.899999999999991]),
    'float_col': np.float32([None, 1.1000000238418579, 2.2000000476837158,
                             3.2999999523162842, 4.4000000953674316, 5.5,
                             6.5999999046325684, 7.6999998092651367,
                             8.8000001907348633, 9.8999996185302734]),
    'int_col': np.int32([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    'month': [11, 11, 11, 11, 2, 11, 11, 11, 11, 11],
    'smallint_col': np.int16([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    'string_col': ['0', '1', None, '3', '4', '5', '6', '7', '8', '9'],
    'timestamp_col': [pd.Timestamp('2010-11-01 00:00:00'),
                      None,
                      pd.Timestamp('2010-11-01 00:02:00.100000'),
                      pd.Timestamp('2010-11-01 00:03:00.300000'),
                      pd.Timestamp('2010-11-01 00:04:00.600000'),
                      pd.Timestamp('2010-11-01 00:05:00.100000'),
                      pd.Timestamp('2010-11-01 00:06:00.150000'),
                      pd.Timestamp('2010-11-01 00:07:00.210000'),
                      pd.Timestamp('2010-11-01 00:08:00.280000'),
                      pd.Timestamp('2010-11-01 00:09:00.360000')],
    'tinyint_col': np.int8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    'year': [2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010]})


class TestPandasTypeInterop(unittest.TestCase):

    def test_series_to_ibis_literal(self):
        values = [1, 2, 3, 4]
        s = pd.Series(values)

        expr = ir.as_value_expr(s)
        expected = ir.sequence(list(s))
        assert expr.equals(expected)


class TestPandasSchemaInference(unittest.TestCase):

    def test_dtype_bool(self):
        df = pd.DataFrame({'col': [True, False, False]})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'boolean')])
        assert inferred == expected

    def test_dtype_int8(self):
        df = pd.DataFrame({'col': np.int8([-3, 9, 17])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int8')])
        assert inferred == expected

    def test_dtype_int16(self):
        df = pd.DataFrame({'col': np.int16([-5, 0, 12])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int16')])
        assert inferred == expected

    def test_dtype_int32(self):
        df = pd.DataFrame({'col': np.int32([-12, 3, 25000])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int32')])
        assert inferred == expected

    def test_dtype_int64(self):
        df = pd.DataFrame({'col': np.int64([102, 67228734, -0])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int64')])
        assert inferred == expected

    def test_dtype_float32(self):
        df = pd.DataFrame({'col': np.float32([45e-3, -0.4, 99.])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'float')])
        assert inferred == expected

    def test_dtype_float64(self):
        df = pd.DataFrame({'col': np.float64([-3e43, 43., 10000000.])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'double')])
        assert inferred == expected

    def test_dtype_uint8(self):
        df = pd.DataFrame({'col': np.uint8([3, 0, 16])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int16')])
        assert inferred == expected

    def test_dtype_uint16(self):
        df = pd.DataFrame({'col': np.uint16([5569, 1, 33])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int32')])
        assert inferred == expected

    def test_dtype_uint32(self):
        df = pd.DataFrame({'col': np.uint32([100, 0, 6])})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int64')])
        assert inferred == expected

    def test_dtype_uint64(self):
        df = pd.DataFrame({'col': np.uint64([666, 2, 3])})
        with self.assertRaises(IbisTypeError):
            inferred = pandas_to_ibis_schema(df)

    def test_dtype_datetime64(self):
        df = pd.DataFrame({
            'col': [pd.Timestamp('2010-11-01 00:01:00'),
                    pd.Timestamp('2010-11-01 00:02:00.1000'),
                    pd.Timestamp('2010-11-01 00:03:00.300000')]})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'timestamp')])
        assert inferred == expected

    def test_dtype_timedelta64(self):
        df = pd.DataFrame({
            'col': [pd.Timedelta('1 days'),
                    pd.Timedelta('-1 days 2 min 3us'),
                    pd.Timedelta('-2 days +23:57:59.999997')]})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'int64')])
        assert inferred == expected

    def test_dtype_string(self):
        df = pd.DataFrame({'col': ['foo', 'bar', 'hello']})
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'string')])
        assert inferred == expected

    def test_dtype_categorical(self):
        df = pd.DataFrame({'col': ['a', 'b', 'c', 'a']}, dtype='category')
        inferred = pandas_to_ibis_schema(df)
        expected = ibis.schema([('col', 'category')])
        assert inferred == expected
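Tying the single-column cases above together, inference composes column-wise
over a mixed-dtype frame; a minimal sketch with hypothetical column names,
assuming the same imports as this module:

df = pd.DataFrame({'flag': [True, False, True],
                   'value': np.int32([1, 2, 3]),
                   'label': ['a', 'b', 'c']},
                  columns=['flag', 'value', 'label'])
# each column follows the per-dtype rules exercised above
assert pandas_to_ibis_schema(df) == ibis.schema([('flag', 'boolean'),
                                                 ('value', 'int32'),
                                                 ('label', 'string')])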
@pytest.mark.e2e
class TestPandasRoundTrip(ImpalaE2E, unittest.TestCase):

    def test_round_trip(self):
        pytest.skip('fails')

        df1 = self.alltypes.execute()
        df2 = self.con.pandas(df1, 'bamboo', database=self.tmp_db).execute()
        assert (df1.columns == df2.columns).all()
        assert (df1.dtypes == df2.dtypes).all()
        assert (df1 == df2).all().all()

    def test_round_trip_non_int_missing_data(self):
        df1 = functional_alltypes_with_nulls
        table = self.con.pandas(df1, 'fawn', database=self.tmp_db)
        df2 = table.execute()
        assert (df1.columns == df2.columns).all()
        assert (df1.dtypes == df2.dtypes).all()
        # bool/int cols should be exact
        assert (df1.bool_col == df2.bool_col).all()
        assert (df1.tinyint_col == df2.tinyint_col).all()
        assert (df1.smallint_col == df2.smallint_col).all()
        assert (df1.int_col == df2.int_col).all()
        assert (df1.bigint_col == df2.bigint_col).all()
        assert (df1.month == df2.month).all()
        assert (df1.year == df2.year).all()
        # string cols should be equal everywhere except for the NULLs
        assert ((df1.string_col == df2.string_col) ==
                [1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
        assert ((df1.date_string_col == df2.date_string_col) ==
                [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]).all()
        # float cols within tolerance, and NULLs should be False
        assert ((df1.double_col - df2.double_col < 1e-9) ==
                [1, 1, 0, 1, 1, 1, 1, 1, 1, 1]).all()
        assert ((df1.float_col - df2.float_col < 1e-9) ==
                [0, 1, 1, 1, 1, 1, 1, 1, 1, 1]).all()

    def test_round_trip_missing_type_promotion(self):
        pytest.skip('unfinished')

        # prepare Impala table with missing ints
        # TODO: switch to self.con.raw_sql once #412 is fixed
        create_query = ('CREATE TABLE {0}.missing_ints '
                        ' (tinyint_col TINYINT, bigint_col BIGINT) '
                        'STORED AS PARQUET'.format(self.tmp_db))
        insert_query = ('INSERT INTO {0}.missing_ints '
                        'VALUES (NULL, 3), (-5, NULL), (19, 444444)'.format(
                            self.tmp_db))
        self.con.con.cursor.execute(create_query)
        self.con.con.cursor.execute(insert_query)

        table = self.con.table('missing_ints', database=self.tmp_db)
        df = table.execute()

        # WHAT NOW?
@@ -0,0 +1,139 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time

import pytest

from ibis import Schema
from ibis import options
import ibis.util as util
import ibis


class IbisTestEnv(object):

    def __init__(self):
        # TODO: allow initializing values through a constructor
        self.impala_host = os.environ.get('IBIS_TEST_IMPALA_HOST',
                                          'localhost')
        self.impala_protocol = os.environ.get('IBIS_TEST_IMPALA_PROTOCOL',
                                              'hiveserver2')
        self.impala_port = int(os.environ.get('IBIS_TEST_IMPALA_PORT', 21050))
        self.tmp_db = os.environ.get('IBIS_TEST_TMP_DB',
                                     '__ibis_tmp_{0}'.format(util.guid()))
        self.tmp_dir = os.environ.get('IBIS_TEST_TMP_HDFS_DIR',
                                      '/tmp/__ibis_test')
        self.test_data_db = os.environ.get('IBIS_TEST_DATA_DB',
                                           'ibis_testing')
        self.test_data_dir = os.environ.get('IBIS_TEST_DATA_HDFS_DIR',
                                            '/__ibis/ibis-testing-data')
        self.nn_host = os.environ.get('IBIS_TEST_NN_HOST', 'localhost')
        # 5070 is default for impala dev env
        self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT',
                                               5070))
        self.hdfs_url = 'http://{0}:{1}'.format(self.nn_host,
                                                self.webhdfs_port)
        self.use_codegen = os.environ.get('IBIS_TEST_USE_CODEGEN',
                                          'False').lower() == 'true'
        self.cleanup_test_data = os.environ.get('IBIS_TEST_CLEANUP_TEST_DATA',
                                                'True').lower() == 'true'
        self.use_kerberos = os.environ.get('IBIS_TEST_USE_KERBEROS',
                                           'False').lower() == 'true'

        # update global Ibis config where relevant
        options.impala.temp_db = self.tmp_db
        options.impala.temp_hdfs_path = self.tmp_dir

    def __repr__(self):
        kvs = ['{0}={1}'.format(k, v) for (k, v) in self.__dict__.iteritems()]
        return 'IbisTestEnv(\n {0})'.format(',\n '.join(kvs))


def connect_test(env, with_hdfs=True):
    con = ibis.impala_connect(host=env.impala_host,
                              protocol=env.impala_protocol,
                              database=env.test_data_db,
                              port=env.impala_port,
                              use_kerberos=env.use_kerberos,
                              pool_size=2)
    if with_hdfs:
        if env.use_kerberos:
            from hdfs.ext.kerberos import KerberosClient
            hdfs_client = KerberosClient(env.hdfs_url, mutual_auth='REQUIRED')
        else:
            from hdfs.client import InsecureClient
            hdfs_client = InsecureClient(env.hdfs_url)
        return ibis.make_client(con, hdfs_client)
    else:
        return ibis.make_client(con)
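As a usage sketch, the defaults above can be overridden through the
environment before the env object is built; the host names here are
hypothetical:

import os

os.environ['IBIS_TEST_IMPALA_HOST'] = 'impala-gw.example.com'  # hypothetical
os.environ['IBIS_TEST_NN_HOST'] = 'namenode.example.com'       # hypothetical

env = IbisTestEnv()
con = connect_test(env, with_hdfs=False)  # Impala connection only, no HDFS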
@pytest.mark.e2e
class ImpalaE2E(object):

    @classmethod
    def setUpClass(cls):
        ENV = IbisTestEnv()
        cls.con = connect_test(ENV)
        # Tests run generally faster without it
        if not ENV.use_codegen:
            cls.con.disable_codegen()
        cls.hdfs = cls.con.hdfs
        cls.test_data_dir = ENV.test_data_dir
        cls.test_data_db = ENV.test_data_db
        cls.tmp_dir = ENV.tmp_dir
        cls.tmp_db = ENV.tmp_db
        cls.alltypes = cls.con.table('functional_alltypes')

        if not cls.con.exists_database(cls.tmp_db):
            cls.con.create_database(cls.tmp_db)

    @classmethod
    def tearDownClass(cls):
        i, retries = 0, 3
        while True:
            # reduce test flakiness
            try:
                cls.con.drop_database(cls.tmp_db, force=True)
                break
            except Exception:
                i += 1
                if i >= retries:
                    raise

                time.sleep(0.1)

    def setUp(self):
        self.temp_databases = []
        self.temp_tables = []
        self.temp_views = []

    def tearDown(self):
        for t in self.temp_tables:
            self.con.drop_table(t, force=True)

        for t in self.temp_views:
            self.con.drop_view(t, force=True)

        self.con.set_database(self.test_data_db)
        for t in self.temp_databases:
            self.con.drop_database(t, force=True)


def assert_equal(left, right):
    if util.all_of([left, right], Schema):
        assert left.equals(right), \
            'Comparing schemas: \n{0} !=\n{1}'.format(repr(left), repr(right))
    else:
        assert left.equals(right), ('Objects unequal: {0}\nvs\n{1}'
                                    .format(repr(left), repr(right)))
@@ -4,5 +4,5 @@ pandas>=0.12.0
impyla>=0.9.1
psutil==0.6.1
snakebite
hdfs[kerberos]>=1.1.1
six
@@ -0,0 +1,51 @@
import ibis
import os
import pandas


def wrangle_csvs():
    years = range(1987, 2009)

    for year in years:
        path = '%d.csv.bz2' % year
        outpath = os.path.expanduser('~/data/%d_clean.csv' % year)

        print 'Working on %s' % path

        df = pandas.read_csv(path, compression='bz2')
        df.to_csv(outpath, header=False, index=False,
                  float_format='%g', na_rep='\\N')


schema = ibis.schema([
    ('year', 'int32'),
    ('month', 'int8'),
    ('day', 'int8'),
    ('dayofweek', 'int8'),
    ('dep_time', 'int32'),
    ('crs_dep_time', 'int32'),
    ('arr_time', 'int32'),
    ('crs_arr_time', 'int32'),
    ('carrier', 'string'),
    ('flight_num', 'int32'),
    ('tail_num', 'int32'),
    ('actual_elapsed_time', 'int32'),
    ('crs_elapsed_time', 'int32'),
    ('airtime', 'int32'),
    ('arrdelay', 'int32'),
    ('depdelay', 'int32'),
    ('origin', 'string'),
    ('dest', 'string'),
    ('distance', 'int32'),
    ('taxi_in', 'int32'),
    ('taxi_out', 'int32'),
    ('cancelled', 'int8'),
    ('cancellation_code', 'string'),
    ('diverted', 'int8'),
    ('carrier_delay', 'int32'),
    ('weather_delay', 'int32'),
    ('nas_delay', 'int32'),
    ('security_delay', 'int32'),
    ('late_aircraft_delay', 'int32')
])
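A sketch of how the pieces fit together: reading one cleaned file back with
the schema's column names (this assumes Schema exposes the names as
`schema.names`, and that wrangle_csvs() has already produced the file):

import os

path = os.path.expanduser('~/data/1987_clean.csv')
df = pandas.read_csv(path, header=None, names=schema.names,
                     na_values=[r'\N'])  # matches na_rep in wrangle_csvs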
@@ -0,0 +1,50 @@
#! /usr/bin/env python
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cleans up the ibis-testing-data from Impala/HDFS and also the HDFS tmp data
# directory

from __future__ import print_function

import ibis
from ibis.tests.util import IbisTestEnv


ENV = IbisTestEnv()


def make_connection():
    ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
                             protocol=ENV.impala_protocol)
    hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
    return ibis.make_client(ic, hdfs_client=hdfs)


if __name__ == '__main__':
    if ENV.cleanup_test_data:
        con = make_connection()
        con.drop_database(ENV.test_data_db, force=True)
        con.hdfs.rmdir(ENV.test_data_dir)
        con.hdfs.rmdir(ENV.tmp_dir)
    else:
        print('IBIS_TEST_CLEANUP_TEST_DATA not set to True; refusing to clean')
@@ -0,0 +1,124 @@
#! /usr/bin/env python
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Populates the ibis_testing Impala database

from posixpath import join as pjoin
import os
import shutil

import numpy as np
import pandas as pd
import pandas.util.testing as tm

from ibis.util import guid
from ibis.tests.util import IbisTestEnv
import ibis


ENV = IbisTestEnv()
TMP_DB_HDFS_PATH = pjoin(ENV.tmp_dir, guid())
TMP_DB = guid()
# hardcoded:
IBIS_TEST_DATA_LOCAL_DIR = 'ibis-testing-data'


def make_connection():
    ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
                             protocol=ENV.impala_protocol)
    hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
    return ibis.make_client(ic, hdfs_client=hdfs)


def make_temp_database(con):
    if con.exists_database(TMP_DB):
        con.drop_database(TMP_DB, force=True)
    con.create_database(TMP_DB, path=TMP_DB_HDFS_PATH)
    print('Created database {0} at {1}'.format(TMP_DB, TMP_DB_HDFS_PATH))


def scrape_parquet_files(con):
    to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')]
    to_scrape.append(('functional', 'alltypes'))
    for db, tname in to_scrape:
        table = con.table(tname, database=db)
        new_name = '{0}_{1}'.format(db, tname)
        print('Creating {0}'.format(new_name))
        con.create_table(new_name, table, database=TMP_DB)


def download_parquet_files(con):
    parquet_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'parquet')
    print('Downloading {0}'.format(parquet_path))
    con.hdfs.get(TMP_DB_HDFS_PATH, parquet_path)


def download_avro_files(con):
    avro_path = '/test-warehouse/tpch.region_avro'
    os.mkdir(os.path.join(IBIS_TEST_DATA_LOCAL_DIR, 'avro'))
    print('Downloading {0}'.format(avro_path))
    con.hdfs.get(avro_path,
                 pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'avro', 'tpch_region_avro'))


def generate_csv_files():
    N = 10
    nfiles = 10

    csv_base = os.path.join(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
    os.mkdir(csv_base)

    df = pd.DataFrame({
        'foo': [tm.rands(10) for _ in xrange(N)],
        'bar': np.random.randn(N),
        'baz': np.random.randint(0, 100, size=N)
    }, columns=['foo', 'bar', 'baz'])

    for i in xrange(nfiles):
        csv_path = os.path.join(csv_base, '{0}.csv'.format(i))
        print('Writing {0}'.format(csv_path))
        df.to_csv(csv_path, index=False, header=False)


def cleanup_temporary_stuff(con):
    con.drop_database(TMP_DB, force=True)
    assert not con.hdfs.exists(TMP_DB_HDFS_PATH)


def make_local_test_archive():
    con = make_connection()
    make_temp_database(con)

    try:
        scrape_parquet_files(con)

        if os.path.exists(IBIS_TEST_DATA_LOCAL_DIR):
            shutil.rmtree(IBIS_TEST_DATA_LOCAL_DIR)
        os.mkdir(IBIS_TEST_DATA_LOCAL_DIR)

        download_parquet_files(con)
        download_avro_files(con)
        generate_csv_files()
    finally:
        cleanup_temporary_stuff(con)

    # TODO: push a tarball to S3?


if __name__ == '__main__':
    make_local_test_archive()
@@ -0,0 +1,75 @@
#! /usr/bin/env bash
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script calls machinery that initializes an ibis.tests.util.IbisTestEnv,
# so it needs those environment variables set correctly. It also assumes that
# WORKSPACE is set (i.e., that it is being run as a Jenkins job); if it is
# not, you can instead set GIT_URL and GIT_BRANCH to check out ibis manually.
# PYTHON_VERSION must also be set for the conda environment created below.

set -e
set -x

printenv

mkdir -p /tmp/impyla-dbapi
TMP_DIR=$(mktemp -d -p /tmp/impyla-dbapi tmpXXXX)

function cleanup {
    rm -rf "$TMP_DIR"
}
trap cleanup EXIT

cd "$TMP_DIR"

# Check out ibis if necessary
if [ -z "$WORKSPACE" ]; then
    : ${GIT_URL:?"GIT_URL is unset"}
    : ${GIT_BRANCH:?"GIT_BRANCH is unset"}
    git clone "$GIT_URL"
    pushd ibis && git checkout "origin/$GIT_BRANCH" && popd
    IBIS_HOME=$TMP_DIR/ibis
else
    # WORKSPACE is set, so this is running on a Jenkins slave
    IBIS_HOME=$WORKSPACE
fi

# Set up Python
curl https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p "$TMP_DIR/miniconda"
export PATH="$TMP_DIR/miniconda/bin:$PATH"
conda update -y -q conda
conda info -a

# Install ibis and deps into a new environment
CONDA_ENV_NAME=pyenv-ibis-test
conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION numpy pandas
source activate $CONDA_ENV_NAME
pip install "$IBIS_HOME"

python --version
which python

cd "$IBIS_HOME"

python -c "from ibis.tests.util import IbisTestEnv; print(IbisTestEnv())"

# load necessary test data
scripts/load_test_data.py

# run the test suite
py.test --e2e ibis

# cleanup
scripts/cleanup_testing_data.py