| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,300 @@ | ||
| # Copyright 2015 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from six import StringIO | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from ibis.common import IbisError | ||
| from ibis.expr.api import schema | ||
| from ibis.impala import ddl | ||
| from ibis.util import implements as copydoc | ||
| import ibis.expr.datatypes as dt | ||
| import kudu | ||
|
|
||
|
|
||
# Lookup from a Kudu column type name to the ibis type class used to
# represent that column in an ibis schema.
_kudu_type_to_ibis_typeclass = dict(
    int8=dt.Int8,
    int16=dt.Int16,
    int32=dt.Int32,
    int64=dt.Int64,
    float=dt.Float,
    double=dt.Double,
    bool=dt.Boolean,
    string=dt.String,
    timestamp=dt.Timestamp,
)
|
|
||
|
|
||
class KuduImpalaInterface(object):

    """
    User-facing wrapper layer for the ImpalaClient

    Holds the Impala client it was constructed with plus a Kudu client that
    is only available after `connect` has been called.
    """

    def __init__(self, impala_client):
        self.impala_client = impala_client
        # Populated by connect(); None means "not connected yet"
        self.client = None

    # No docstrings on the two pass-through methods below: copydoc replaces
    # them with the docstrings of the wrapped Kudu client methods.

    @copydoc(kudu.client.Client.list_tables)
    def list_tables(self, filter=''):
        # Fail with a clear IbisError rather than an AttributeError on None
        self._check_connected()
        return self.client.list_tables(filter)

    @copydoc(kudu.client.Client.table_exists)
    def table_exists(self, name):
        # Fail with a clear IbisError rather than an AttributeError on None
        self._check_connected()
        return self.client.table_exists(name)

    def connect(self, host_or_hosts, port_or_ports=7051, rpc_timeout=None):
        """
        Pass-through connection interface to the Kudu client

        Parameters
        ----------
        host_or_hosts : string or list of strings
          If you have multiple Kudu masters for HA, pass a list
        port_or_ports : int or list of int, default 7051
          If you pass multiple host names, pass multiple ports
        rpc_timeout : kudu.TimeDelta
          See Kudu client documentation for details

        Returns
        -------
        None
        """
        self.client = kudu.connect(host_or_hosts, port_or_ports,
                                   rpc_timeout=rpc_timeout)

    def _check_connected(self):
        """Raise IbisError unless `connect` has been called."""
        if not self.is_connected:
            raise IbisError('Please first connect to a Kudu cluster '
                            'with client.kudu.connect')

    @property
    def is_connected(self):
        """True once a Kudu client object has been stored by `connect`."""
        # crude check for now
        return self.client is not None

    def create_table(self, impala_name, kudu_name, primary_keys=None,
                     obj=None, schema=None, database=None,
                     external=False, force=False):
        """
        Create a Kudu-backed table in the connected Impala cluster. For
        non-external tables, this will create a Kudu table with a compatible
        storage schema.

        This function is patterned after the ImpalaClient.create_table function
        designed for physical filesystems (like HDFS).

        Parameters
        ----------
        impala_name : string
          Name of the created Impala table
        kudu_name : string
          Name of the backing Kudu table. Will be created if external=False
        primary_keys : list of column names
          Names of the primary key columns. Required when external=False;
          for external tables the keys are read from the existing Kudu table
        obj : TableExpr or pandas.DataFrame, optional
          If passed, creates table from select statement results
        schema : ibis.Schema, optional
          Mutually exclusive with obj, creates an empty table with a
          particular schema
        database : string, default None (optional)
        external : boolean, default False
          If False, a new Kudu table will be created. Otherwise, the Kudu table
          must already exist.
        force : boolean, default False
          Currently unused by this method

        Raises
        ------
        IbisError
          If `connect` has not been called yet
        ValueError
          If primary keys are missing for a non-external table, if obj is
          combined with external=True, or if neither obj nor schema is given
          for a non-external table
        """
        self._check_connected()

        if not external and (primary_keys is None or len(primary_keys) == 0):
            raise ValueError('Must specify primary keys when DDL creates a '
                             'new Kudu table')

        if obj is not None:
            if external:
                raise ValueError('Cannot create an external Kudu-Impala table '
                                 'from an expression or DataFrame')

            if isinstance(obj, pd.DataFrame):
                from ibis.impala.pandas_interop import write_temp_dataframe
                # NOTE(review): `writer` is not used again; presumably the
                # reference is held so the temporary data stays available
                # until the statement below has executed -- confirm
                writer, to_insert = write_temp_dataframe(self.impala_client,
                                                         obj)
            else:
                to_insert = obj
            # XXX: exposing a lot of internals
            ast = self.impala_client._build_ast(to_insert)
            select = ast.queries[0]

            stmt = CTASKudu(impala_name, kudu_name,
                            self.client.master_addrs,
                            select, primary_keys,
                            database=database)
        else:
            if external:
                # Schema and keys come from the Kudu table that already exists
                ktable = self.client.table(kudu_name)
                kschema = ktable.schema
                schema = schema_kudu_to_ibis(kschema)
                primary_keys = kschema.primary_keys()
            elif schema is None:
                raise ValueError('Must specify schema for new empty '
                                 'Kudu-backed table')

            stmt = CreateTableKudu(impala_name, kudu_name,
                                   self.client.master_addrs,
                                   schema, primary_keys,
                                   external=external,
                                   database=database,
                                   can_exist=False)

        self.impala_client._execute(stmt)

    def table(self, kudu_name, name=None, database=None, persist=False,
              external=True):
        """
        Convenience to expose an existing Kudu table (using CREATE TABLE) as an
        Impala table. To create a new table both in the Hive Metastore with
        storage in Kudu, use create_table.

        Note: all tables created are EXTERNAL for now. Creates a temporary
        table (like parquet_file and others) unless persist=True.

        If you create a persistent table you can thereafter use it like any
        other Impala table.

        Parameters
        ----------
        kudu_name : string
          The name of the table in the Kudu cluster
        name : string, optional
          Name of the created table in Impala / Hive Metastore. Randomly
          generated if not specified.
        database : string, optional
          Database to create the table in. Uses the temp db if not provided
        persist : boolean, default False
          If True, do not drop the table upon Ibis garbage collection /
          interpreter shutdown. Be careful using this in conjunction with the
          `external` option.
        external : boolean, default True
          Accepted for forward compatibility but currently ignored: the
          Impala table is always created as EXTERNAL

        Returns
        -------
        table : ImpalaTable
        """
        # Check up front so nothing else is attempted without a connection
        self._check_connected()

        # Law of demeter, but OK for now because internal class coupling
        name, database = (self.impala_client
                          ._get_concrete_table_path(name, database,
                                                    persist=persist))
        self.create_table(name, kudu_name, database=database, external=True)
        return self.impala_client._wrap_new_table(name, database, persist)
|
|
||
|
|
||
class CreateTableKudu(ddl.CreateTable):

    """
    Creates an Impala table that scans from a Kudu table
    """

    # TODO
    # - DISTRIBUTE BY HASH
    # - DISTRIBUTE BY RANGE
    # - multi master test

    # Table properties shared by every Kudu-backed table statement
    _table_props_base = {
        'storage_handler': 'com.cloudera.kudu.hive.KuduStorageHandler'
    }

    def __init__(self, table_name, kudu_table_name,
                 master_addrs, schema, key_columns,
                 external=True, **kwargs):
        """
        Parameters
        ----------
        table_name : string
          Name of the Impala table
        kudu_table_name : string
          Value written to the 'kudu.table_name' table property
        master_addrs : list of strings
          Joined with ', ' into the 'kudu.master_addresses' table property
        schema : ibis schema
          Rendered into the column list by `compile`
        key_columns : list of strings
          Joined with ', ' into the 'kudu.key_columns' table property
        external : boolean, default True
          Forwarded to ddl.CreateTable
        **kwargs
          Forwarded unchanged to ddl.CreateTable
        """
        self.kudu_table_name = kudu_table_name
        self.master_addrs = master_addrs
        self.schema = schema
        self.key_columns = key_columns

        ddl.CreateTable.__init__(
            self, table_name, external=external, **kwargs)

        self._validate()

    def _validate(self):
        # Hook for argument validation; nothing is checked at present
        pass

    def compile(self):
        """
        Render the statement: the create line, the column list, then the
        TBLPROPERTIES clause, each on its own line.
        """
        pieces = [
            self._create_line(),
            '\n{0}'.format(ddl.format_schema(self.schema)),
            '\n',
            ddl.format_tblproperties(self._get_table_properties()),
        ]
        return ''.join(pieces)

    def _get_table_properties(self):
        """Return a new dict of the base properties plus the Kudu ones."""
        props = dict(self._table_props_base)
        props['kudu.table_name'] = self.kudu_table_name
        props['kudu.master_addresses'] = ', '.join(self.master_addrs)
        props['kudu.key_columns'] = ', '.join(self.key_columns)
        return props
|
|
||
|
|
||
class CTASKudu(CreateTableKudu):

    """
    CREATE TABLE ... AS SELECT form of CreateTableKudu: there is no explicit
    column list, the statement ends with the compiled select query instead.
    """

    def __init__(self, table_name, kudu_name, master_addrs,
                 select, key_columns, database=None,
                 external=False, can_exist=False):
        self.select = select

        # No schema is passed to the parent (None)
        parent_options = dict(database=database,
                              external=external,
                              can_exist=can_exist)
        CreateTableKudu.__init__(self, table_name, kudu_name,
                                 master_addrs, None, key_columns,
                                 **parent_options)

    def compile(self):
        """
        Render the create line, the TBLPROPERTIES clause, then ' AS' followed
        by the compiled select query on the next line.
        """
        header = self._create_line()
        tblprops = ddl.format_tblproperties(self._get_table_properties())
        as_select = ' AS\n{0}'.format(self.select.compile())
        return ''.join([header, '\n', tblprops, as_select])
|
|
||
|
|
||
def schema_kudu_to_ibis(kschema, drop_nn=False):
    """
    Build an ibis schema from a Kudu schema

    Parameters
    ----------
    kschema : Kudu schema
      Must support len() and integer indexing; each column must expose
      `name`, `type.name` and `nullable`
    drop_nn : boolean, default False
      If True, every ibis type is created as nullable regardless of the Kudu
      column's own nullability

    Returns
    -------
    ibis schema with one (name, type) entry per Kudu column, in order

    Raises
    ------
    KeyError
      If a column's Kudu type name has no entry in the type lookup table
    """
    def _to_ibis_pair(col):
        typeclass = _kudu_type_to_ibis_typeclass[col.type.name]
        # For testing, because Impala does not have nullable types
        nullable = True if drop_nn else col.nullable
        itype = typeclass(nullable)
        return col.name, itype

    pairs = [_to_ibis_pair(kschema[pos]) for pos in range(len(kschema))]
    return schema(pairs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,289 @@ | ||
| # Copyright 2015 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import os | ||
| import pytest | ||
|
|
||
| from ibis.compat import unittest | ||
| from ibis.expr.tests.mocks import MockConnection | ||
| from ibis.impala.client import build_ast | ||
| from ibis.impala.tests.common import IbisTestEnv, ImpalaE2E | ||
| from ibis.tests.util import assert_equal | ||
| import ibis.expr.datatypes as dt | ||
| import ibis.util as util | ||
| import ibis | ||
|
|
||
# The Kudu client (and the ibis module built on it) may not be importable;
# record whether it is so the whole module can be skipped when it is not.
try:
    from ibis.impala import kudu_support as ksupport
    import kudu
except ImportError:
    HAVE_KUDU_CLIENT = False
else:
    HAVE_KUDU_CLIENT = True


_kudu_missing = not HAVE_KUDU_CLIENT
pytestmark = pytest.mark.skipif(_kudu_missing,
                                reason='Kudu client not installed')
|
|
||
|
|
||
class KuduImpalaTestEnv(IbisTestEnv):

    """
    Test environment settings for the Kudu-enabled Impala cluster, read from
    IBIS_TEST_* environment variables with fallbacks.
    """

    def __init__(self):
        IbisTestEnv.__init__(self)

        # band-aid until Kudu support merged into Impala mainline
        self.test_host = os.getenv('IBIS_TEST_KIMPALA_HOST',
                                   'quickstart.cloudera')

        # XXX
        self.impala_host = self.test_host
        self.impala_port = 21050
        self.master_host = os.getenv('IBIS_TEST_KUDU_MASTER', self.test_host)
        # Environment values are strings; coerce so the port is always an
        # int whether or not the variable is set (same as webhdfs_port)
        self.master_port = int(os.getenv('IBIS_TEST_KUDU_MASTER_PORT', 7051))
        self.nn_host = os.environ.get('IBIS_TEST_KUDU_NN_HOST', self.test_host)

        self.webhdfs_port = int(os.environ.get('IBIS_TEST_WEBHDFS_PORT',
                                               50070))
        self.hdfs_superuser = os.environ.get('IBIS_TEST_HDFS_SUPERUSER',
                                             'hdfs')


# Single shared environment instance used by the end-to-end tests
ENV = KuduImpalaTestEnv()
|
|
||
|
|
||
class TestKuduTools(unittest.TestCase):

    # Test schema conversion, DDL statements, etc.

    # NOTE(review): the source these tests were recovered from had its
    # whitespace stripped, so any leading spaces inside the expected DDL
    # literals below could not be recovered; the literals are reproduced as
    # visible. Confirm against the output of ddl.format_schema and
    # ddl.format_tblproperties.

    def test_kudu_schema_convert(self):
        spec = [
            # name, ibis type, kudu type, is_nullable, is_primary_key
            ('a', dt.Int8(False), 'int8', False, True),
            ('b', dt.Int16(False), 'int16', False, True),
            ('c', dt.Int32(False), 'int32', False, False),
            ('d', dt.Int64(True), 'int64', True, False),
            ('e', dt.String(True), 'string', True, False),
            ('f', dt.Boolean(False), 'bool', False, False),
            ('g', dt.Float(False), 'float', False, False),
            ('h', dt.Double(True), 'double', True, False),

            # TODO
            # ('i', 'binary', False, False),

            ('j', dt.Timestamp(True), 'timestamp', True, False)
        ]

        builder = kudu.schema_builder()
        for name, _, kudu_type, is_nullable, _ in spec:
            builder.add_column(name, kudu_type, nullable=is_nullable)

        key_names = [row[0] for row in spec if row[4]]
        builder.set_primary_keys(key_names)
        kschema = builder.build()

        ischema = ksupport.schema_kudu_to_ibis(kschema)
        expected = ibis.schema([(row[0], row[1]) for row in spec])

        assert_equal(ischema, expected)

    def test_create_external_ddl(self):
        columns = [('key1', 'int32'),
                   ('key2', 'int64'),
                   ('value1', 'double')]
        masters = ['master1.d.com:7051', 'master2.d.com:7051']

        stmt = ksupport.CreateTableKudu('impala_name', 'kudu_name',
                                        masters, ibis.schema(columns),
                                        ['key1', 'key2'])

        expected = """\
CREATE EXTERNAL TABLE `impala_name`
(`key1` int,
`key2` bigint,
`value1` double)
TBLPROPERTIES (
'kudu.key_columns'='key1, key2',
'kudu.master_addresses'='master1.d.com:7051, master2.d.com:7051',
'kudu.table_name'='kudu_name',
'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
)"""
        assert stmt.compile() == expected

    def test_ctas_ddl(self):
        con = MockConnection()
        ast = build_ast(con.table('test1'))

        statement = ksupport.CTASKudu(
            'another_table', 'kudu_name', ['dom.d.com:7051'],
            ast.queries[0], ['string_col'], external=True,
            can_exist=False, database='foo')

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
'kudu.key_columns'='string_col',
'kudu.master_addresses'='dom.d.com:7051',
'kudu.table_name'='kudu_name',
'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
        assert statement.compile() == expected
|
|
||
|
|
||
class TestKuduE2E(ImpalaE2E, unittest.TestCase):

    """
    End-to-end tests against a live Impala + Kudu cluster.

    Kudu tables created by the tests are recorded in `temp_tables` and
    removed in tearDownClass.
    """

    @classmethod
    def setUpClass(cls):
        ImpalaE2E.setup_e2e(cls, ENV)

        # Names of Kudu tables to remove in tearDownClass
        cls.temp_tables = []

        # Direct Kudu client used by the tests to create and inspect tables
        cls.kclient = kudu.connect(cls.env.master_host, cls.env.master_port)

        # Connect the ibis-side Kudu interface to the same master
        cls.con.kudu.connect(cls.env.master_host, cls.env.master_port)

    def _new_kudu_example_table(self, kschema):
        """Create a uniquely named Kudu table and register it for cleanup."""
        kudu_name = 'ibis-tmp-{0}'.format(util.guid())

        self.kclient.create_table(kudu_name, kschema)
        self.temp_tables.append(kudu_name)

        return kudu_name

    @classmethod
    def tearDownClass(cls):
        cls.teardown_e2e(cls)

        for table in cls.temp_tables:
            # A registered table may already be gone (for example
            # test_internal_kudu_table asserts its table was dropped), so
            # only delete what still exists
            if cls.kclient.table_exists(table):
                cls.kclient.delete_table(table)

    @classmethod
    def example_schema(cls):
        """Build a Kudu schema: int32 key plus an int32 and a string value."""
        builder = kudu.schema_builder()
        builder.add_column('key', kudu.int32, nullable=False)
        builder.add_column('int_val', kudu.int32)
        builder.add_column('string_val', kudu.string)
        builder.set_primary_keys(['key'])

        return builder.build()

    def _write_example_data(self, table_name, nrows=100):
        """Insert `nrows` rows (i, 2 * i, 'hello_<i>') into the Kudu table."""
        table = self.kclient.table(table_name)
        session = self.kclient.new_session()
        for i in range(nrows):
            op = table.new_insert()
            row = i, i * 2, 'hello_%d' % i
            op['key'] = row[0]
            op['int_val'] = row[1]
            op['string_val'] = row[2]
            session.apply(op)
        session.flush()

    @pytest.mark.kudu
    def test_external_kudu_table(self):
        kschema = self.example_schema()
        kudu_name = self._new_kudu_example_table(kschema)

        nrows = 100
        self._write_example_data(kudu_name, nrows)

        table = self.con.kudu.table(kudu_name)
        result = table.execute()
        assert len(result) == nrows

        ischema = ksupport.schema_kudu_to_ibis(kschema, drop_nn=True)
        assert_equal(table.schema(), ischema)

    @pytest.mark.kudu
    def test_internal_kudu_table(self):
        kschema = self.example_schema()
        kudu_name = self._new_kudu_example_table(kschema)

        nrows = 100
        self._write_example_data(kudu_name, nrows)

        impala_name = self._temp_impala_name()
        impala_db = self.env.test_data_db
        self.con.kudu.table(kudu_name, name=impala_name,
                            database=impala_db,
                            external=True,
                            persist=True)

        t = self.con.table(impala_name, database=impala_db)
        assert len(t.execute()) == nrows

        # Make internal
        t.set_external(False)
        t.drop()

        assert not self.con.kudu.table_exists(kudu_name)

    @pytest.mark.kudu
    def test_create_table_as_select_ctas(self):
        # TODO
        kschema = self.example_schema()
        kudu_name = self._new_kudu_example_table(kschema)

        nrows = 100
        self._write_example_data(kudu_name, nrows)

        impala_name = self._temp_impala_name()
        impala_db = self.env.test_data_db
        self.con.kudu.table(kudu_name, name=impala_name,
                            database=impala_db,
                            external=True,
                            persist=True)

        impala_name2 = self._temp_impala_name()
        expr = self.con.table(impala_name, database=impala_db)

        kudu_name2 = 'ibis-ctas-{0}'.format(util.guid())
        # Register before creating so the table is cleaned up even if a
        # later step fails; tearDownClass skips names that do not exist
        self.temp_tables.append(kudu_name2)

        self.con.kudu.create_table(impala_name2, kudu_name2,
                                   primary_keys=['key'],
                                   obj=expr, database=impala_db)

        # TODO: should some stats be automatically computed?
        itable = self.con.table(impala_name2, database=impala_db)
        assert len(itable.execute()) == len(expr.execute())

        ktable = self.kclient.table(kudu_name2)
        assert ktable.schema.primary_keys() == ['key']

    @pytest.mark.kudu
    def test_create_empty_internal_table(self):
        kschema = self.example_schema()
        ischema = ksupport.schema_kudu_to_ibis(kschema, drop_nn=True)

        impala_name = self._temp_impala_name()
        kudu_name = 'ibis-empty-{0}'.format(util.guid())
        # Register before creating so a failed assertion below does not
        # leak the table; tearDownClass skips names that do not exist
        self.temp_tables.append(kudu_name)

        self.con.kudu.create_table(impala_name, kudu_name,
                                   primary_keys=['key'],
                                   schema=ischema,
                                   database=self.env.test_data_db)

        ktable = self.kclient.table(kudu_name)
        assert ktable.schema.equals(kschema)

    def _temp_impala_name(self):
        """Return a unique Impala table name for use in a test."""
        return 'kudu_test_{0}'.format(util.guid())