| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| Taken from pandas | ||
|
|
||
| Source: https://github.com/pydata/pandas/tree/master/doc/sphinxext/ipython_sphinxext | ||
| License: BSD |
| @@ -0,0 +1,116 @@ | ||
| """reST directive for syntax-highlighting ipython interactive sessions. | ||
| XXX - See what improvements can be made based on the new (as of Sept 2009) | ||
| 'pycon' lexer for the python console. At the very least it will give better | ||
| highlighted tracebacks. | ||
| """ | ||
|
|
||
| #----------------------------------------------------------------------------- | ||
| # Needed modules | ||
|
|
||
| # Standard library | ||
| import re | ||
|
|
||
| # Third party | ||
| from pygments.lexer import Lexer, do_insertions | ||
| from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer, | ||
| PythonTracebackLexer) | ||
| from pygments.token import Comment, Generic | ||
|
|
||
| from sphinx import highlighting | ||
|
|
||
| #----------------------------------------------------------------------------- | ||
| # Global constants | ||
| line_re = re.compile('.*?\n') | ||
|
|
||
| #----------------------------------------------------------------------------- | ||
| # Code begins - classes and functions | ||
|
|
||
|
|
||
| class IPythonConsoleLexer(Lexer): | ||
|
|
||
| """ | ||
| For IPython console output or doctests, such as: | ||
| .. sourcecode:: ipython | ||
| In [1]: a = 'foo' | ||
| In [2]: a | ||
| Out[2]: 'foo' | ||
| In [3]: print(a) | ||
| foo | ||
| In [4]: 1 / 0 | ||
| Notes: | ||
| - Tracebacks are not currently supported. | ||
| - It assumes the default IPython prompts, not customized ones. | ||
| """ | ||
|
|
||
| name = 'IPython console session' | ||
| aliases = ['ipython'] | ||
| mimetypes = ['text/x-ipython-console'] | ||
| input_prompt = re.compile(r"(In \[[0-9]+\]: )|( \.\.\.+:)") | ||
| output_prompt = re.compile(r"(Out\[[0-9]+\]: )|( \.\.\.+:)") | ||
| continue_prompt = re.compile(r" \.\.\.+:") | ||
| tb_start = re.compile(r"\-+") | ||
|
|
||
| def get_tokens_unprocessed(self, text): | ||
| pylexer = PythonLexer(**self.options) | ||
| tblexer = PythonTracebackLexer(**self.options) | ||
|
|
||
| curcode = '' | ||
| insertions = [] | ||
| for match in line_re.finditer(text): | ||
| line = match.group() | ||
| input_prompt = self.input_prompt.match(line) | ||
| continue_prompt = self.continue_prompt.match(line.rstrip()) | ||
| output_prompt = self.output_prompt.match(line) | ||
| if line.startswith("#"): | ||
| insertions.append((len(curcode), | ||
| [(0, Comment, line)])) | ||
| elif input_prompt is not None: | ||
| insertions.append((len(curcode), | ||
| [(0, Generic.Prompt, input_prompt.group())])) | ||
| curcode += line[input_prompt.end():] | ||
| elif continue_prompt is not None: | ||
| insertions.append((len(curcode), | ||
| [(0, Generic.Prompt, continue_prompt.group())])) | ||
| curcode += line[continue_prompt.end():] | ||
| elif output_prompt is not None: | ||
| # Use the 'error' token for output. We should probably make | ||
| # our own token, but error is typically in a bright color like | ||
| # red, so it works fine for our output prompts. | ||
| insertions.append((len(curcode), | ||
| [(0, Generic.Error, output_prompt.group())])) | ||
| curcode += line[output_prompt.end():] | ||
| else: | ||
| if curcode: | ||
| for item in do_insertions(insertions, | ||
| pylexer.get_tokens_unprocessed(curcode)): | ||
| yield item | ||
| curcode = '' | ||
| insertions = [] | ||
| yield match.start(), Generic.Output, line | ||
| if curcode: | ||
| for item in do_insertions(insertions, | ||
| pylexer.get_tokens_unprocessed(curcode)): | ||
| yield item | ||
|
|
||
|
|
||
| def setup(app): | ||
| """Setup as a sphinx extension.""" | ||
|
|
||
| # This is only a lexer, so registering it with pygments below appears | ||
| # sufficient. If the right API usage is to register it through Sphinx | ||
| # instead, by all means fix it here. At minimum, having this setup() | ||
| # function suppresses the Sphinx warning we would otherwise get. | ||
| pass | ||
|
|
||
| #----------------------------------------------------------------------------- | ||
| # Register the extension as a valid pygments lexer | ||
| highlighting.lexers['ipython'] = IPythonConsoleLexer() |
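
For orientation, here is a minimal sketch of exercising the lexer directly with Pygments, outside of Sphinx. The module name `ipython_console_highlighting` is an assumption about where the code above lives, and the session text mirrors the class docstring example:

```python
# A minimal sketch; the module name below is an assumption, and the session
# text is taken from the IPythonConsoleLexer docstring.
from pygments import highlight
from pygments.formatters import HtmlFormatter

from ipython_console_highlighting import IPythonConsoleLexer  # hypothetical module name

session = """\
In [1]: a = 'foo'

In [2]: a
Out[2]: 'foo'

In [3]: print(a)
foo
"""

# Input prompts become Generic.Prompt tokens, output prompts Generic.Error,
# and the code after each input prompt is re-lexed as ordinary Python.
print(highlight(session, IPythonConsoleLexer(), HtmlFormatter()))
```

Inside a Sphinx project, the `highlighting.lexers` registration at the bottom of the file is what makes `.. sourcecode:: ipython` blocks use this lexer; registering via `app.add_lexer('ipython', IPythonConsoleLexer())` in `setup()` would likely be the more Sphinx-idiomatic route, as the comment above notes.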
| @@ -0,0 +1,269 @@ | ||
| # Copyright 2014 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import ibis | ||
|
|
||
| from ibis.compat import unittest | ||
| from ibis.expr.tests.mocks import BasicTestCase | ||
| import ibis.expr.analysis as L | ||
| import ibis.expr.operations as ops | ||
| import ibis.common as com | ||
|
|
||
| from ibis.tests.util import assert_equal | ||
|
|
||
|
|
||
| # Place to collect esoteric expression analysis bugs and tests | ||
|
|
||
|
|
||
| class TestTableExprBasics(BasicTestCase, unittest.TestCase): | ||
|
|
||
| def test_rewrite_substitute_distinct_tables(self): | ||
| t = self.con.table('test1') | ||
| tt = self.con.table('test1') | ||
|
|
||
| expr = t[t.c > 0] | ||
| expr2 = tt[tt.c > 0] | ||
|
|
||
| metric = t.f.sum().name('metric') | ||
| expr3 = expr.aggregate(metric) | ||
|
|
||
| result = L.sub_for(expr3, [(expr2, t)]) | ||
| expected = t.aggregate(metric) | ||
|
|
||
| assert_equal(result, expected) | ||
|
|
||
| def test_rewrite_join_projection_without_other_ops(self): | ||
| # Drop out filters and other commutative table operations. Join | ||
| # predicates are "lifted" to reference the base, unmodified join roots | ||
|
|
||
| # Star schema with fact table | ||
| table = self.con.table('star1') | ||
| table2 = self.con.table('star2') | ||
| table3 = self.con.table('star3') | ||
|
|
||
| filtered = table[table['f'] > 0] | ||
|
|
||
| pred1 = table['foo_id'] == table2['foo_id'] | ||
| pred2 = filtered['bar_id'] == table3['bar_id'] | ||
|
|
||
| j1 = filtered.left_join(table2, [pred1]) | ||
| j2 = j1.inner_join(table3, [pred2]) | ||
|
|
||
| # Project out the desired fields | ||
| view = j2[[filtered, table2['value1'], table3['value2']]] | ||
|
|
||
| # Construct the thing we expect to obtain | ||
| ex_pred2 = table['bar_id'] == table3['bar_id'] | ||
| ex_expr = (table.left_join(table2, [pred1]) | ||
| .inner_join(table3, [ex_pred2])) | ||
|
|
||
| rewritten_proj = L.substitute_parents(view) | ||
| op = rewritten_proj.op() | ||
| assert_equal(op.table, ex_expr) | ||
|
|
||
| # Ensure that filtered table has been substituted with the base table | ||
| assert op.selections[0] is table | ||
|
|
||
| def test_rewrite_past_projection(self): | ||
| table = self.con.table('test1') | ||
|
|
||
| # Rewrite past a projection | ||
| table3 = table[['c', 'f']] | ||
| expr = table3['c'] == 2 | ||
|
|
||
| result = L.substitute_parents(expr) | ||
| expected = table['c'] == 2 | ||
| assert_equal(result, expected) | ||
|
|
||
| # Unsafe to rewrite past projection | ||
| table5 = table[(table.f * 2).name('c'), table.f] | ||
| expr = table5['c'] == 2 | ||
| result = L.substitute_parents(expr) | ||
| assert result is expr | ||
|
|
||
| def test_rewrite_expr_with_parent(self): | ||
| table = self.con.table('test1') | ||
|
|
||
| table2 = table[table['f'] > 0] | ||
|
|
||
| expr = table2['c'] == 2 | ||
|
|
||
| result = L.substitute_parents(expr) | ||
| expected = table['c'] == 2 | ||
| assert_equal(result, expected) | ||
|
|
||
| # Substitution not fully possible if we depend on a new expr in a | ||
| # projection | ||
|
|
||
| table4 = table[['c', (table['c'] * 2).name('foo')]] | ||
| expr = table4['c'] == table4['foo'] | ||
| result = L.substitute_parents(expr) | ||
| expected = table['c'] == table4['foo'] | ||
| assert_equal(result, expected) | ||
|
|
||
| def test_rewrite_distinct_but_equal_objects(self): | ||
| t = self.con.table('test1') | ||
| t_copy = self.con.table('test1') | ||
|
|
||
| table2 = t[t_copy['f'] > 0] | ||
|
|
||
| expr = table2['c'] == 2 | ||
|
|
||
| result = L.substitute_parents(expr) | ||
| expected = t['c'] == 2 | ||
| assert_equal(result, expected) | ||
|
|
||
| def test_projection_with_join_pushdown_rewrite_refs(self): | ||
| # Observed this expression IR issue in a TopK-rewrite context | ||
| table1 = ibis.table([ | ||
| ('a_key1', 'string'), | ||
| ('a_key2', 'string'), | ||
| ('a_value', 'double') | ||
| ], 'foo') | ||
|
|
||
| table2 = ibis.table([ | ||
| ('b_key1', 'string'), | ||
| ('b_name', 'string'), | ||
| ('b_value', 'double') | ||
| ], 'bar') | ||
|
|
||
| table3 = ibis.table([ | ||
| ('c_key2', 'string'), | ||
| ('c_name', 'string') | ||
| ], 'baz') | ||
|
|
||
| proj = (table1.inner_join(table2, [('a_key1', 'b_key1')]) | ||
| .inner_join(table3, [(table1.a_key2, table3.c_key2)]) | ||
| [table1, table2.b_name.name('b'), table3.c_name.name('c'), | ||
| table2.b_value]) | ||
|
|
||
| cases = [ | ||
| (proj.a_value > 0, table1.a_value > 0), | ||
| (proj.b_value > 0, table2.b_value > 0) | ||
| ] | ||
|
|
||
| for higher_pred, lower_pred in cases: | ||
| result = proj.filter([higher_pred]) | ||
| op = result.op() | ||
| assert isinstance(op, ops.Projection) | ||
| filter_op = op.table.op() | ||
| assert isinstance(filter_op, ops.Filter) | ||
| new_pred = filter_op.predicates[0] | ||
| assert_equal(new_pred, lower_pred) | ||
|
|
||
| def test_multiple_join_deeper_reference(self): | ||
| # Join predicates down the chain might reference one or more root | ||
| # tables in the hierarchy. | ||
| table1 = ibis.table({'key1': 'string', 'key2': 'string', | ||
| 'value1': 'double'}) | ||
| table2 = ibis.table({'key3': 'string', 'value2': 'double'}) | ||
| table3 = ibis.table({'key4': 'string', 'value3': 'double'}) | ||
|
|
||
| joined = table1.inner_join(table2, [table1['key1'] == table2['key3']]) | ||
| joined2 = joined.inner_join(table3, [table1['key2'] == table3['key4']]) | ||
|
|
||
| # it works, what more should we test here? | ||
| materialized = joined2.materialize() | ||
| repr(materialized) | ||
|
|
||
| def test_filter_on_projected_field(self): | ||
| # See #173. Impala and other SQL engines do not allow filtering on a | ||
| # just-created alias in a projection | ||
| region = self.con.table('tpch_region') | ||
| nation = self.con.table('tpch_nation') | ||
| customer = self.con.table('tpch_customer') | ||
| orders = self.con.table('tpch_orders') | ||
|
|
||
| fields_of_interest = [customer, | ||
| region.r_name.name('region'), | ||
| orders.o_totalprice.name('amount'), | ||
| orders.o_orderdate | ||
| .cast('timestamp').name('odate')] | ||
|
|
||
| all_join = ( | ||
| region.join(nation, region.r_regionkey == nation.n_regionkey) | ||
| .join(customer, customer.c_nationkey == nation.n_nationkey) | ||
| .join(orders, orders.o_custkey == customer.c_custkey)) | ||
|
|
||
| tpch = all_join[fields_of_interest] | ||
|
|
||
| # Correlated subquery, yikes! | ||
| t2 = tpch.view() | ||
| conditional_avg = t2[(t2.region == tpch.region)].amount.mean() | ||
|
|
||
| # `amount` is part of the projection above as an aliased field | ||
| amount_filter = tpch.amount > conditional_avg | ||
|
|
||
| result = tpch.filter([amount_filter]) | ||
|
|
||
| # Now then! Predicate pushdown here is inappropriate, so we check that | ||
| # it didn't occur. | ||
|
|
||
| # If filter were pushed below projection, the top-level operator type | ||
| # would be Projection instead. | ||
| assert type(result.op()) == ops.Filter | ||
|
|
||
| def test_bad_join_predicate_raises(self): | ||
| # Join predicate references a derived table, but we can salvage and | ||
| # rewrite it to get the join semantics out | ||
| # see ibis #74 | ||
| table = ibis.table([ | ||
| ('c', 'int32'), | ||
| ('f', 'double'), | ||
| ('g', 'string') | ||
| ], 'foo_table') | ||
|
|
||
| table2 = ibis.table([ | ||
| ('key', 'string'), | ||
| ('value', 'double') | ||
| ], 'bar_table') | ||
|
|
||
| filter_pred = table['f'] > 0 | ||
| table3 = table[filter_pred] | ||
|
|
||
| with self.assertRaises(com.ExpressionError): | ||
| table.inner_join(table2, [table3['g'] == table2['key']]) | ||
|
|
||
| # expected = table.inner_join(table2, [table['g'] == table2['key']]) | ||
| # assert_equal(result, expected) | ||
|
|
||
| def test_filter_self_join(self): | ||
| # GH #667 | ||
| purchases = ibis.table([('region', 'string'), | ||
| ('kind', 'string'), | ||
| ('user', 'int64'), | ||
| ('amount', 'double')], 'purchases') | ||
|
|
||
| metric = purchases.amount.sum().name('total') | ||
| agged = (purchases.group_by(['region', 'kind']) | ||
| .aggregate(metric)) | ||
|
|
||
| left = agged[agged.kind == 'foo'] | ||
| right = agged[agged.kind == 'bar'] | ||
|
|
||
| cond = left.region == right.region | ||
| joined = left.join(right, cond) | ||
|
|
||
| # unmodified by analysis | ||
| assert_equal(joined.op().predicates[0], cond) | ||
|
|
||
| metric = (left.total - right.total).name('diff') | ||
| what = [left.region, metric] | ||
| projected = joined.projection(what) | ||
|
|
||
| proj_exprs = projected.op().selections | ||
|
|
||
| # proj exprs unaffected by analysis | ||
| assert_equal(proj_exprs[0], left.region) | ||
| assert_equal(proj_exprs[1], metric) |
| @@ -0,0 +1,320 @@ | ||
| # Copyright 2014 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from six import StringIO | ||
| import pandas as pd | ||
|
|
||
|
|
||
| def parse_metadata(descr_table): | ||
| parser = MetadataParser(descr_table) | ||
| return parser.parse() | ||
|
|
||
|
|
||
| def _noop(tup): | ||
| return None | ||
|
|
||
|
|
||
| def _item_converter(i): | ||
| def _get_item(converter=None): | ||
| def _converter(tup): | ||
| result = tup[i] | ||
| if converter is not None: | ||
| result = converter(result) | ||
| return result | ||
|
|
||
| return _converter | ||
|
|
||
| return _get_item | ||
|
|
||
| _get_type = _item_converter(1) | ||
| _get_comment = _item_converter(2) | ||
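| # _get_type() extracts the second (value) column of a DESCRIBE FORMATTED | ||
| # row and _get_comment() the third (comment) column; each accepts an | ||
| # optional converter (e.g. _try_timestamp) applied to the extracted field. | ||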
|
|
||
|
|
||
| def _try_timestamp(x): | ||
| try: | ||
| return pd.Timestamp(x) | ||
| except (ValueError, TypeError): | ||
| return x | ||
|
|
||
|
|
||
| def _try_unix_timestamp(x): | ||
| try: | ||
| return pd.Timestamp.fromtimestamp(int(x)) | ||
| except (ValueError, TypeError): | ||
| return x | ||
|
|
||
|
|
||
| def _try_boolean(x): | ||
| try: | ||
| x = x.lower() | ||
| if x in ('true', 'yes'): | ||
| return True | ||
| elif x in ('false', 'no'): | ||
| return False | ||
| return x | ||
| except (AttributeError, ValueError, TypeError): | ||
| return x | ||
|
|
||
|
|
||
| def _try_int(x): | ||
| try: | ||
| return int(x) | ||
| except (ValueError, TypeError): | ||
| return x | ||
|
|
||
|
|
||
| class MetadataParser(object): | ||
|
|
||
| """ | ||
| A simple state-ish machine to parse the results of DESCRIBE FORMATTED | ||
| """ | ||
|
|
||
| def __init__(self, table): | ||
| self.table = table | ||
| self.tuples = list(self.table.itertuples(index=False)) | ||
|
|
||
| def _reset(self): | ||
| self.pos = 0 | ||
| self.schema = None | ||
| self.partitions = None | ||
| self.info = None | ||
| self.storage = None | ||
|
|
||
| def _next_tuple(self): | ||
| if self.pos == len(self.tuples): | ||
| raise StopIteration | ||
|
|
||
| result = self.tuples[self.pos] | ||
| self.pos += 1 | ||
| return result | ||
|
|
||
| def parse(self): | ||
| self._reset() | ||
| self._parse() | ||
|
|
||
| return TableMetadata(self.schema, self.info, self.storage, | ||
| partitions=self.partitions) | ||
|
|
||
| def _parse(self): | ||
| self.schema = self._parse_schema() | ||
|
|
||
| next_section = self._next_tuple() | ||
| if 'partition' in next_section[0].lower(): | ||
| self._parse_partitions() | ||
| else: | ||
| self._parse_info() | ||
|
|
||
| def _parse_partitions(self): | ||
| self.partitions = self._parse_schema() | ||
|
|
||
| next_section = self._next_tuple() | ||
| if 'table information' not in next_section[0].lower(): | ||
| raise ValueError('Table information not present') | ||
|
|
||
| self._parse_info() | ||
|
|
||
| def _parse_schema(self): | ||
| tup = self._next_tuple() | ||
| if 'col_name' not in tup[0]: | ||
| raise ValueError('DESCRIBE FORMATTED did not return ' | ||
| 'the expected results: {0}' | ||
| .format(tup)) | ||
| self._next_tuple() | ||
|
|
||
| # Use for both main schema and partition schema (if any) | ||
| schema = [] | ||
| while True: | ||
| tup = self._next_tuple() | ||
| if tup[0].strip() == '': | ||
| break | ||
| schema.append((tup[0], tup[1])) | ||
|
|
||
| return schema | ||
|
|
||
| def _parse_info(self): | ||
| self.info = {} | ||
| while True: | ||
| tup = self._next_tuple() | ||
| orig_key = tup[0].strip(':') | ||
| key = _clean_param_name(tup[0]) | ||
|
|
||
| if key == '' or key.startswith('#'): | ||
| # section is done | ||
| break | ||
|
|
||
| if key == 'table parameters': | ||
| self._parse_table_parameters() | ||
| elif key in self._info_cleaners: | ||
| result = self._info_cleaners[key](tup) | ||
| self.info[orig_key] = result | ||
| else: | ||
| self.info[orig_key] = tup[1] | ||
|
|
||
| if 'storage information' not in key: | ||
| raise ValueError('Storage information not present') | ||
|
|
||
| self._parse_storage_info() | ||
|
|
||
| _info_cleaners = { | ||
| 'database': _get_type(), | ||
| 'owner': _get_type(), | ||
| 'createtime': _get_type(_try_timestamp), | ||
| 'lastaccesstime': _get_type(_try_timestamp), | ||
| 'protect mode': _get_type(), | ||
| 'retention': _get_type(_try_int), | ||
| 'location': _get_type(), | ||
| 'table type': _get_type() | ||
| } | ||
|
|
||
| def _parse_table_parameters(self): | ||
| params = self._parse_nested_params(self._table_param_cleaners) | ||
| self.info['Table Parameters'] = params | ||
|
|
||
| _table_param_cleaners = { | ||
| 'external': _try_boolean, | ||
| 'column_stats_accurate': _try_boolean, | ||
| 'numfiles': _try_int, | ||
| 'totalsize': _try_int, | ||
| 'stats_generated_via_stats_task': _try_boolean, | ||
| 'numrows': _try_int, | ||
| 'transient_lastddltime': _try_unix_timestamp, | ||
| } | ||
|
|
||
| def _parse_storage_info(self): | ||
| self.storage = {} | ||
| while True: | ||
| # end of the road | ||
| try: | ||
| tup = self._next_tuple() | ||
| except StopIteration: | ||
| break | ||
|
|
||
| orig_key = tup[0].strip(':') | ||
| key = _clean_param_name(tup[0]) | ||
|
|
||
| if key == '' or key.startswith('#'): | ||
| # section is done | ||
| break | ||
|
|
||
| if key == 'storage desc params': | ||
| self._parse_storage_desc_params() | ||
| elif key in self._storage_cleaners: | ||
| result = self._storage_cleaners[key](tup) | ||
| self.storage[orig_key] = result | ||
| else: | ||
| self.storage[orig_key] = tup[1] | ||
|
|
||
| _storage_cleaners = { | ||
| 'compressed': _get_type(_try_boolean), | ||
| 'num buckets': _get_type(_try_int), | ||
| } | ||
|
|
||
| def _parse_storage_desc_params(self): | ||
| params = self._parse_nested_params(self._storage_param_cleaners) | ||
| self.storage['Desc Params'] = params | ||
|
|
||
| _storage_param_cleaners = {} | ||
|
|
||
| def _parse_nested_params(self, cleaners): | ||
| params = {} | ||
| while True: | ||
| try: | ||
| tup = self._next_tuple() | ||
| except StopIteration: | ||
| break | ||
| if pd.isnull(tup[1]): | ||
| break | ||
|
|
||
| key, value = tup[1:] | ||
|
|
||
| if key.lower() in cleaners: | ||
| cleaner = cleaners[key.lower()] | ||
| value = cleaner(value) | ||
| params[key] = value | ||
|
|
||
| return params | ||
|
|
||
|
|
||
| def _clean_param_name(x): | ||
| return x.strip().strip(':').lower() | ||
|
|
||
|
|
||
| def _get_meta(attr, key): | ||
| @property | ||
| def f(self): | ||
| data = getattr(self, attr) | ||
| if isinstance(key, list): | ||
| result = data | ||
| for k in key: | ||
| if k not in result: | ||
| raise KeyError(k) | ||
| result = result[k] | ||
| return result | ||
| else: | ||
| return data[key] | ||
| return f | ||
|
|
||
|
|
||
| class TableMetadata(object): | ||
|
|
||
| """ | ||
| Container for the parsed and wrangled results of DESCRIBE FORMATTED for | ||
| easier Ibis use (and testing). | ||
| """ | ||
| def __init__(self, schema, info, storage, partitions=None): | ||
| self.schema = schema | ||
| self.info = info | ||
| self.storage = storage | ||
| self.partitions = partitions | ||
|
|
||
| def __repr__(self): | ||
| import pprint | ||
|
|
||
| # Quick and dirty for now | ||
| buf = StringIO() | ||
| buf.write(str(type(self))) | ||
| buf.write('\n') | ||
|
|
||
| data = { | ||
| 'schema': self.schema, | ||
| 'info': self.info, | ||
| 'storage info': self.storage | ||
| } | ||
| if self.partitions is not None: | ||
| data['partition schema'] = self.partitions | ||
|
|
||
| pprint.pprint(data, stream=buf) | ||
|
|
||
| return buf.getvalue() | ||
|
|
||
| @property | ||
| def is_partitioned(self): | ||
| return self.partitions is not None | ||
|
|
||
| create_time = _get_meta('info', 'CreateTime') | ||
| location = _get_meta('info', 'Location') | ||
| owner = _get_meta('info', 'Owner') | ||
| num_rows = _get_meta('info', ['Table Parameters', 'numRows']) | ||
| hive_format = _get_meta('storage', 'InputFormat') | ||
|
|
||
| tbl_properties = _get_meta('info', 'Table Parameters') | ||
| serde_properties = _get_meta('storage', 'Desc Params') | ||
|
|
||
|
|
||
| class TableInfo(object): | ||
| pass | ||
|
|
||
|
|
||
| class TableStorageInfo(object): | ||
| pass |
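
To show how the parsed pieces are consumed, here is a hand-built `TableMetadata`. This is a minimal sketch: the dictionary contents are illustrative stand-ins, not real DESCRIBE FORMATTED output, and it assumes it runs where `TableMetadata` (defined above) is in scope.

```python
import pandas as pd

# Illustrative values only -- not the output of a real DESCRIBE FORMATTED call.
meta = TableMetadata(
    schema=[('id', 'int'), ('name', 'string')],
    info={'Owner': 'alice',
          'Location': 'hdfs://namenode/warehouse/t',
          'CreateTime': pd.Timestamp('2014-01-01'),
          'Table Parameters': {'numRows': 42}},
    storage={'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
             'Desc Params': {}})

print(meta.owner)           # 'alice'
print(meta.num_rows)        # 42, looked up via info['Table Parameters']['numRows']
print(meta.hive_format)     # the InputFormat class name
print(meta.is_partitioned)  # False: no partition schema was supplied
```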
| @@ -0,0 +1,209 @@ | ||
| # Copyright 2014 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| from posixpath import join as pjoin | ||
| import os | ||
|
|
||
| import pandas.core.common as pdcom | ||
| import pandas as pd | ||
|
|
||
| import ibis.common as com | ||
|
|
||
| from ibis.config import options | ||
| from ibis.util import log | ||
| import ibis.compat as compat | ||
| import ibis.expr.datatypes as itypes | ||
| import ibis.util as util | ||
|
|
||
|
|
||
| # ---------------------------------------------------------------------- | ||
| # pandas integration | ||
|
|
||
|
|
||
| def pandas_col_to_ibis_type(col): | ||
| import numpy as np | ||
| dty = col.dtype | ||
|
|
||
| # datetime types | ||
| if pdcom.is_datetime64_dtype(dty): | ||
| if pdcom.is_datetime64_ns_dtype(dty): | ||
| return 'timestamp' | ||
| else: | ||
| raise com.IbisTypeError("Column {0} has dtype {1}, which is " | ||
| "datetime64-like but does " | ||
| "not use nanosecond units" | ||
| .format(col.name, dty)) | ||
| if pdcom.is_timedelta64_dtype(dty): | ||
| print("Warning: encoding a timedelta64 as an int64") | ||
| return 'int64' | ||
|
|
||
| if pdcom.is_categorical_dtype(dty): | ||
| return itypes.Category(len(col.cat.categories)) | ||
|
|
||
| if pdcom.is_bool_dtype(dty): | ||
| return 'boolean' | ||
|
|
||
| # simple numerical types | ||
| if issubclass(dty.type, np.int8): | ||
| return 'int8' | ||
| if issubclass(dty.type, np.int16): | ||
| return 'int16' | ||
| if issubclass(dty.type, np.int32): | ||
| return 'int32' | ||
| if issubclass(dty.type, np.int64): | ||
| return 'int64' | ||
| if issubclass(dty.type, np.float32): | ||
| return 'float' | ||
| if issubclass(dty.type, np.float64): | ||
| return 'double' | ||
| if issubclass(dty.type, np.uint8): | ||
| return 'int16' | ||
| if issubclass(dty.type, np.uint16): | ||
| return 'int32' | ||
| if issubclass(dty.type, np.uint32): | ||
| return 'int64' | ||
| if issubclass(dty.type, np.uint64): | ||
| raise com.IbisTypeError("Column {0} is an unsigned int64" | ||
| .format(col.name)) | ||
|
|
||
| if pdcom.is_object_dtype(dty): | ||
| return _infer_object_dtype(col) | ||
|
|
||
| raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty)) | ||
|
|
||
|
|
||
| def _infer_object_dtype(arr): | ||
| # TODO: accelerate with Cython/C | ||
|
|
||
| BOOLEAN, STRING = 0, 1 | ||
| state = BOOLEAN | ||
|
|
||
| avalues = arr.values if isinstance(arr, pd.Series) else arr | ||
| nulls = pd.isnull(avalues) | ||
|
|
||
| if nulls.any(): | ||
| for i in compat.range(len(avalues)): | ||
| if state == BOOLEAN: | ||
| if not nulls[i] and not pdcom.is_bool(avalues[i]): | ||
| state = STRING | ||
| elif state == STRING: | ||
| break | ||
| if state == BOOLEAN: | ||
| return 'boolean' | ||
| elif state == STRING: | ||
| return 'string' | ||
| else: | ||
| return pd.lib.infer_dtype(avalues) | ||
|
|
||
|
|
||
| class DataFrameWriter(object): | ||
|
|
||
| """ | ||
| Interface class for writing pandas objects to Impala tables. | ||
| The class takes ownership of any temporary data written to HDFS. | ||
| """ | ||
| def __init__(self, client, df, path=None): | ||
| self.client = client | ||
| self.hdfs = client.hdfs | ||
|
|
||
| self.df = df | ||
|
|
||
| self.temp_hdfs_dirs = [] | ||
|
|
||
| def write_temp_csv(self): | ||
| temp_hdfs_dir = pjoin(options.impala.temp_hdfs_path, | ||
| 'pandas_{0}'.format(util.guid())) | ||
| self.hdfs.mkdir(temp_hdfs_dir) | ||
|
|
||
| # Keep track of the temporary HDFS directory | ||
| self.temp_hdfs_dirs.append(temp_hdfs_dir) | ||
|
|
||
| # Write the file to HDFS | ||
| hdfs_path = pjoin(temp_hdfs_dir, '0.csv') | ||
|
|
||
| self.write_csv(hdfs_path) | ||
|
|
||
| return temp_hdfs_dir | ||
|
|
||
| def write_csv(self, path): | ||
| import csv | ||
|
|
||
| tmp_path = 'tmp_{0}.csv'.format(util.guid()) | ||
| f = open(tmp_path, 'w+') | ||
|
|
||
| try: | ||
| # Write the DataFrame to the temporary file path | ||
| if options.verbose: | ||
| log('Writing DataFrame to temporary file') | ||
|
|
||
| self.df.to_csv(f, header=False, index=False, | ||
| sep=',', | ||
| quoting=csv.QUOTE_NONE, | ||
| escapechar='\\', | ||
| na_rep='#NULL') | ||
| f.seek(0) | ||
|
|
||
| if options.verbose: | ||
| log('Writing CSV to: {0}'.format(path)) | ||
|
|
||
| self.hdfs.put(path, f) | ||
| finally: | ||
| f.close() | ||
| try: | ||
| os.remove(tmp_path) | ||
| except os.error: | ||
| pass | ||
|
|
||
| return path | ||
|
|
||
| def get_schema(self): | ||
| # infer an ibis schema from the DataFrame's dtypes | ||
| return pandas_to_ibis_schema(self.df) | ||
|
|
||
| def delimited_table(self, csv_dir, name=None, database=None): | ||
| temp_delimited_name = 'ibis_tmp_pandas_{0}'.format(util.guid()) | ||
| schema = self.get_schema() | ||
|
|
||
| return self.client.delimited_file(csv_dir, schema, | ||
| name=temp_delimited_name, | ||
| database=database, | ||
| delimiter=',', | ||
| na_rep='#NULL', | ||
| escapechar='\\\\', | ||
| external=True, | ||
| persist=False) | ||
|
|
||
| def __del__(self): | ||
| try: | ||
| self.cleanup() | ||
| except com.IbisError: | ||
| pass | ||
|
|
||
| def cleanup(self): | ||
| for path in self.temp_hdfs_dirs: | ||
| self.hdfs.rmdir(path) | ||
| self.temp_hdfs_dirs = [] | ||
| self.csv_dir = None | ||
|
|
||
|
|
||
| def pandas_to_ibis_schema(frame): | ||
| from ibis.expr.api import schema | ||
| # no analog for decimal in pandas | ||
| pairs = [] | ||
| for col_name in frame: | ||
| ibis_type = pandas_col_to_ibis_type(frame[col_name]) | ||
| pairs.append((col_name, ibis_type)) | ||
| return schema(pairs) |
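
To make the dtype mapping concrete, here is a small sketch of `pandas_to_ibis_schema` applied to an in-memory frame. The expected types in the comments follow directly from the rules above, and the snippet assumes it runs where the functions defined in this file are in scope.

```python
import pandas as pd

# A minimal sketch of the dtype -> ibis type mapping implemented above.
df = pd.DataFrame({'a': [1, 2, 3],             # int64   -> 'int64'
                   'b': [1.5, 2.5, 3.5],       # float64 -> 'double'
                   'c': ['x', 'y', 'z'],       # object  -> 'string' (inferred)
                   'd': [True, False, True]})  # bool    -> 'boolean'

print(pandas_to_ibis_schema(df))
```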
| @@ -0,0 +1,16 @@ | ||
| # Copyright 2014 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| # Impala Parquet configuration and any other Parquet utilities | ||
| # and support |