@@ -0,0 +1,137 @@
.. _self_joins:

**********
Self joins
**********

If you’re a relational data guru, you may have wondered how it’s
possible to join tables with themselves, because join clauses involve
column references back to the original table.

Consider the SQL:

.. code:: sql

    SELECT t1.key, sum(t1.value - t2.value) AS metric
    FROM my_table t1
      JOIN my_table t2
        ON t1.key = t2.subkey
    GROUP BY 1

Here, we have an unambiguous way to refer to each of the tables through
aliasing.

Let’s consider the TPC-H database, and suppose we want to compute
year-over-year change in total order amounts by region using joins.

.. code:: python

    >>> region = con.table('tpch_region')
    >>> nation = con.table('tpch_nation')
    >>> customer = con.table('tpch_customer')
    >>> orders = con.table('tpch_orders')
    >>> orders.limit(5)
       o_orderkey  o_custkey o_orderstatus  o_totalprice o_orderdate  \
    0           1      36901             O     173665.47  1996-01-02
    1           2      78002             O      46929.18  1996-12-01
    2           3     123314             F     193846.25  1993-10-14
    3           4     136777             O      32151.78  1995-10-11
    4           5      44485             F     144659.20  1994-07-30
      o_orderpriority          o_clerk  o_shippriority  \
    0           5-LOW  Clerk#000000951               0
    1        1-URGENT  Clerk#000000880               0
    2           5-LOW  Clerk#000000955               0
    3           5-LOW  Clerk#000000124               0
    4           5-LOW  Clerk#000000925               0
                                                o_comment
    0                  nstructions sleep furiously among
    1   foxes. pending accounts at the pending, silen...
    2  sly final accounts boost. carefully regular id...
    3  sits. slyly regular warthogs cajole. regular, ...
    4   quickly. bold deposits sleep slyly. packages u...

First, let’s join all the things and select the fields we care about:

.. code:: python

    >>> fields_of_interest = [region.r_name.name('region'),
    ...                       nation.n_name.name('nation'),
    ...                       orders.o_totalprice.name('amount'),
    ...                       orders.o_orderdate.cast('timestamp').name('odate')  # these are strings
    ...                       ]
    >>> joined_all = (region.join(nation, region.r_regionkey == nation.n_regionkey)
    ...               .join(customer, customer.c_nationkey == nation.n_nationkey)
    ...               .join(orders, orders.o_custkey == customer.c_custkey)
    ...               [fields_of_interest])

Okay, great, let’s have a look:

.. code:: python

    >>> joined_all.limit(5)
            region         nation     amount       odate
    0      AMERICA  UNITED STATES  160843.35  1992-06-22
    1  MIDDLE EAST           IRAN   78307.91  1996-04-19
    2       EUROPE         FRANCE  103237.90  1994-10-12
    3       EUROPE         FRANCE  201463.59  1997-09-12
    4         ASIA          JAPAN  166098.86  1995-09-12

Sweet, now let’s aggregate by year and region:

.. code:: python

    >>> year = joined_all.odate.year().name('year')
    >>> total = joined_all.amount.sum().cast('double').name('total')
    >>> annual_amounts = (joined_all
    ...                   .group_by(['region', year])
    ...                   .aggregate(total))
    >>> annual_amounts.limit(5)
       region  year         total
    0  EUROPE  1994  6.979473e+09
    1  EUROPE  1996  7.015421e+09
    2    ASIA  1997  6.910663e+09
    3    ASIA  1998  4.058824e+09
    4  EUROPE  1992  6.926705e+09

Looking good so far. Now we need to join this table to itself by
subtracting 1 from one of the year columns.

We do this by creating a “joinable” view of a table that is considered a
distinct object within Ibis. To do this, use the ``view`` function:

.. code:: python

    >>> current = annual_amounts
    >>> prior = annual_amounts.view()
    >>> yoy_change = (current.total - prior.total).name('yoy_change')
    >>> results = (current.join(prior, ((current.region == prior.region) &
    ...                                 (current.year == (prior.year - 1))))
    ...            [current.region, current.year, yoy_change])
    >>> df = results.execute()

.. code:: python

    >>> df['yoy_pretty'] = df.yoy_change.map(lambda x: '$%.2fmm' % (x / 1000000.))

If you’re being fastidious and want the first year occurring in the
dataset for each region to have a prior-year total of 0, you will
instead need to do an outer join and treat nulls on the prior side of
the join as zero:

.. code:: python

    >>> yoy_change = (current.total - prior.total.zeroifnull()).name('yoy_change')
    >>> results = (current.outer_join(prior, ((current.region == prior.region) &
    ...                                       (current.year == (prior.year - 1))))
    ...            [current.region, current.year, current.total,
    ...             prior.total.zeroifnull().name('prior_total'),
    ...             yoy_change])
    >>> results.limit(5)
        region  year         total   prior_total    yoy_change
    0     ASIA  1998  4.058824e+09  0.000000e+00  4.058824e+09
    1   AFRICA  1994  6.837587e+09  6.908429e+09 -7.084172e+07
    2  AMERICA  1996  6.883057e+09  6.922465e+09 -3.940791e+07
    3   AFRICA  1996  6.878112e+09  6.848983e+09  2.912979e+07
    4   AFRICA  1992  6.873319e+09  6.859733e+09  1.358699e+07

@@ -0,0 +1,96 @@
.. _topk:

*****************
“Top-K” Filtering
*****************

A common analytical pattern involves subsetting based on some method of
ranking: for example, “the 5 most frequently occurring widgets in a
dataset”. By choosing the right metric, you can obtain the most
important or least important items from some dimension, for some
definition of important.

Carrying out this pattern by hand involves the following steps:

- Choose a ranking metric
- Aggregate by the target dimension, computing the ranking metric
- Order by the ranking metric and take the highest K values
- Use those values as a set filter (either with ``semi_join`` or
  ``isin``) in your next query

For example, let’s look at the TPC-H tables and find the 5 or 10
customers who placed the most orders over their lifetime:

.. code:: python

    >>> orders = con.table('tpch_orders')
    >>> top_orders = (orders
    ...               .group_by('o_custkey')
    ...               .size()
    ...               .sort_by(('count', False))
    ...               .limit(5))
    >>> top_orders
       o_custkey  count
    0       3451     41
    1     102022     41
    2     102004     41
    3      79300     40
    4     117082     40

Now, we could use these customer keys as a filter in some other
analysis:

.. code:: python

    >>> # Among the top 5 most frequent customers, what’s the histogram of their order statuses?
    >>> analysis = (orders[orders.o_custkey.isin(top_orders.o_custkey)]
    ...             .group_by('o_orderstatus')
    ...             .size())
    >>> analysis
      o_orderstatus  count
    0             P      5
    1             F     85
    2             O    113

This is such a common pattern that Ibis supports a high-level ``topk``
operation, which can be used immediately as a filter:

.. code:: python

    >>> top_orders = orders.o_custkey.topk(5)
    >>> orders[top_orders].group_by('o_orderstatus').size()
      o_orderstatus  count
    0             P      5
    1             F     85
    2             O    113

We can take this a little further. Suppose now we want to rank customers
by their total spending instead of the number of orders, perhaps a more
meaningful metric:

.. code:: python

    >>> total_spend = orders.o_totalprice.sum().name('total')
    >>> top_spenders = (orders
    ...                 .group_by('o_custkey')
    ...                 .aggregate(total_spend)
    ...                 .sort_by(('total', False))
    ...                 .limit(5))
    >>> top_spenders
       o_custkey       total
    0     143500  7012696.48
    1      95257  6563511.23
    2      87115  6457526.26
    3     131113  6311428.86
    4     103834  6306524.23

To use another metric, just pass it to the ``by`` argument in ``topk``:

.. code:: python

    >>> top_spenders = orders.o_custkey.topk(5, by=total_spend)
    >>> orders[top_spenders].group_by('o_orderstatus').size()
      o_orderstatus  count
    0             P      1
    1             F     78
    2             O     98
@@ -18,4 +18,4 @@ The next backends provide UDF support: | |
|
|
||

- :ref:`udf.impala`
- :ref:`udf.pandas`
- BigQuery
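
For instance, a pandas UDF can be declared and then used like a built-in
expression. This is a minimal sketch, assuming the
``ibis.backends.pandas.udf`` module path; ``add_one`` and the column names
are made up for the example:

.. code:: python

    >>> import ibis.expr.datatypes as dt
    >>> from ibis.backends.pandas import udf
    >>> @udf.elementwise(input_type=[dt.double], output_type=dt.double)
    ... def add_one(x):
    ...     return x + 1.0
    >>> # Given a table `t` with a double column `a`:
    >>> # t.mutate(b=add_one(t.a))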
@@ -22,7 +22,7 @@ h3 a { | |
}
h4 {
  font-size: 1rem;
  font-weight: 600;
  color: #444;
}
a {

@@ -1,59 +1,59 @@
# This file should have all the dependencies for development, excluding those specific to the backends.
name: ibis
dependencies:
  # Ibis hard dependencies
  - multipledispatch
  - numpy
  - pandas
  - pytz
  - regex
  - toolz
  - cached_property  # for 3.7 compat, functools.cached_property is for >=3.8
  - setuptools
  - parsy

  # Ibis soft dependencies
  - sqlalchemy
  - python-graphviz

  # Dev tools
  - asv
  - black=19.10b0
  - click  # few scripts in ci/
  - flake8
  - flake8-comprehensions  # used by flake8, linting of unnecessary comprehensions
  - isort
  - jinja2<3  # feedstock
  - mypy
  - plumbum  # few scripts in ci/ and dev/
  - pydocstyle
  - pytest
  - pytest-cov
  - pytest-mock

  # Release
  - twine
  - wheel
  - conda-build  # feedstock
  - ruamel.yaml  # feedstock
  - pygit2  # dev/genrelease.py

  # Docs
  - pip
  - pip:
      - pysuerga
  - pytest-randomly
  - ipython
  - ipykernel
  - nbconvert
  - nbsphinx
  - nomkl
  - semantic_version=2.6  # https://github.com/ibis-project/ibis/issues/2027
  - sphinx
  - sphinx-releases
  - sphinx_rtd_theme

  # Type annotations
  - types-setuptools
  - types-pytz
  - types-python-dateutil

@@ -1,84 +1,95 @@
| """Initialize Ibis module.""" | ||
| import pkg_resources | ||
|
|
||
| # Converting an Ibis schema to a pandas DataFrame requires registering | ||
| # some type conversions that are currently registered in the pandas backend | ||
| import ibis.backends.pandas | ||
| import ibis.config | ||
| import ibis.expr.types as ir | ||
| from ibis import util | ||
| from ibis.backends.base import BaseBackend | ||
| from ibis.common.exceptions import IbisError | ||
| from ibis.config import options | ||
| from ibis.expr import api | ||
| from ibis.expr.api import * # noqa: F401,F403 | ||
|
|
||
| from ._version import get_versions # noqa: E402 | ||
|
|
||
| __all__ = ['api', 'ir', 'util', 'IbisError', 'options'] | ||
| __all__ += api.__all__ | ||
|
|
||
|
|
||


ibis.config.register_option(
    'interactive', False, validator=ibis.config.is_bool
)
ibis.config.register_option('verbose', False, validator=ibis.config.is_bool)
ibis.config.register_option('verbose_log', None)
ibis.config.register_option(
    'graphviz_repr',
    True,
    """\
Whether to render expressions as GraphViz PNGs when repr-ing in a Jupyter
notebook.
""",
    validator=ibis.config.is_bool,
)
ibis.config.register_option('default_backend', None)
with ibis.config.config_prefix('context_adjustment'):
    ibis.config.register_option(
        'time_col',
        'time',
        'Name of the timestamp column for execution with a timecontext. '
        'See ibis.expr.timecontext for details.',
        validator=ibis.config.is_str,
    )
with ibis.config.config_prefix('sql'):
    ibis.config.register_option(
        'default_limit',
        10_000,
        'Number of rows to be retrieved for an unlimited table expression',
    )

__version__ = get_versions()['version']
del get_versions


def __getattr__(name: str) -> BaseBackend:
    """Load backends in a lazy way with `ibis.<backend-name>`.

    This also registers the backend options.

    Examples
    --------
    >>> import ibis
    >>> con = ibis.sqlite.connect(...)

    When accessing the `sqlite` attribute of the `ibis` module, this function
    is called, and a backend with the `sqlite` name is loaded from the
    `ibis.backends` entry points. If successful, the `ibis.sqlite` attribute
    is "cached", so this function is only called the first time.
    """
    entry_points = list(
        pkg_resources.iter_entry_points(group='ibis.backends', name=name)
    )
    if len(entry_points) == 0:
        raise AttributeError(
            f"module 'ibis' has no attribute '{name}'. "
            f"If you are trying to access the '{name}' backend, "
            f"try installing it first with `pip install ibis-{name}`"
        )
    elif len(entry_points) > 1:
        raise RuntimeError(
            f"{len(entry_points)} packages found for backend '{name}'. "
            "There should be only one; please uninstall the unused packages "
            "and leave only the one that needs to be used."
        )

    backend = entry_points[0].resolve().Backend()

    # The first time a backend is loaded, we register its options, and we set
    # it as an attribute of `ibis`, so `__getattr__` is not called again for it
    with ibis.config.config_prefix(name):
        backend.register_options()

    setattr(ibis, name, backend)
    return backend
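

# Illustrative only: a third-party backend makes itself discoverable to the
# lazy loader above by registering an entry point in the `ibis.backends`
# group, e.g. in its setup.py. The distribution and module names below are
# hypothetical.
#
#     setup(
#         name='ibis-mybackend',
#         entry_points={
#             'ibis.backends': ['mybackend = ibis_mybackend'],
#         },
#     )
#
# After installation, `ibis.mybackend` resolves through `__getattr__` above.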

@@ -0,0 +1,176 @@
import abc
import warnings
from pathlib import Path

import pandas as pd

import ibis
import ibis.expr.types as ir
from ibis.backends.base import BaseBackend, Database
from ibis.backends.pandas.core import execute_and_reset

# Accessing the attribute loads the pandas backend and registers its options
ibis.pandas


class FileDatabase(Database):
    def __init__(self, name, client):
        super().__init__(name, client)
        self.path = client.path

    def __str__(self):
        return '{0.__class__.__name__}({0.name})'.format(self)

    def __dir__(self):
        dbs = self.list_databases(path=self.path)
        tables = self.list_tables(path=self.path)
        return sorted(set(dbs).union(set(tables)))

    def __getattr__(self, name):
        try:
            return self.table(name, path=self.path)
        except AttributeError:
            return self.database(name, path=self.path)

    def table(self, name, path):
        return self.client.table(name, path=path)

    def database(self, name=None, path=None):
        return self.client.database(name=name, path=path)

    def list_databases(self, path=None):
        if path is None:
            path = self.path
        return sorted(self.client.list_databases(path=path))

    def list_tables(self, path=None, database=None):
        if path is None:
            path = self.path
        return sorted(self.client.list_tables(path=path, database=database))


class BaseFileBackend(BaseBackend):
    """
    Base backend class for pandas pseudo-backends for file formats.
    """

    database_class = FileDatabase

    def connect(self, path):
        """Create a Client for use with Ibis.

        Parameters
        ----------
        path : str or pathlib.Path

        Returns
        -------
        Backend
        """
        new_backend = self.__class__()
        new_backend.path = new_backend.root = Path(path)
        new_backend.dictionary = {}
        return new_backend

    @property
    def version(self) -> str:
        return pd.__version__

    def list_tables(
        self, path: Path = None, like: str = None, database: str = None
    ):
        # For file backends, we return files in the `path` directory.

        def is_valid(path):
            return path.is_file() and path.suffix == '.' + self.extension

        path = path or self.path

        if path.is_dir():
            tables = [f.stem for f in path.iterdir() if is_valid(f)]
        elif is_valid(path):
            tables = [path.stem]
        else:
            tables = []

        return self._filter_with_like(tables, like)

    @property
    def current_database(self):
        # Databases for the file backends are a bit confusing:
        # `list_databases()` returns the directories in the current path, and
        # the current database is not in that list. Probably we want to
        # rethink this eventually. For now we return '.', treating the
        # current directory as the current database.
        return '.'

    def compile(self, expr, *args, **kwargs):
        return expr

    def _list_databases_dirs(self, path=None):
        tables = []
        if path.is_dir():
            for d in path.iterdir():
                if d.is_dir():
                    tables.append(d.name)
        return tables

    def _list_tables_files(self, path=None):
        # Tables are files in a directory.
        if path is None:
            path = self.root

        tables = []
        if path.is_dir():
            for d in path.iterdir():
                if d.is_file():
                    if str(d).endswith(self.extension):
                        tables.append(d.stem)
        elif path.is_file():
            if str(path).endswith(self.extension):
                tables.append(path.stem)
        return tables

    def list_databases(self, path=None, like=None):
        if path is None:
            path = self.path
        else:
            warnings.warn(
                'The `path` argument of `list_databases` is deprecated and '
                'will be removed in a future version of Ibis. Connect to a '
                'different path with the `connect()` method instead.',
                FutureWarning,
            )
        databases = ['.'] + self._list_databases_dirs(path)
        return self._filter_with_like(databases, like)

    @abc.abstractmethod
    def insert(self, path, expr, **kwargs):
        pass

    @abc.abstractmethod
    def table(self, name, path):
        pass

    def database(self, name=None, path=None):
        if name is None:
            self.path = path or self.path
            return super().database(name)

        if path is None:
            path = self.root
        if name not in self.list_databases(path):
            raise AttributeError(name)

        new_name = f"{name}.{self.extension}"
        if (self.root / name).is_dir():
            path /= name
        elif not str(path).endswith(new_name):
            path /= new_name

        self.path = path
        return super().database(name)

    def execute(self, expr, params=None, **kwargs):  # noqa
        assert isinstance(expr, ir.Expr)
        return execute_and_reset(expr, params=params, **kwargs)
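

# Illustrative usage, assuming a concrete subclass such as the CSV backend
# (which sets `extension = 'csv'`); the path below is hypothetical:
#
#     >>> con = ibis.csv.connect('/data/warehouse')
#     >>> con.list_tables()         # CSV files in the directory
#     >>> db = con.database('raw')  # a subdirectory acts as a "database"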

@@ -0,0 +1,216 @@
import abc
from typing import Optional

import ibis.expr.operations as ops
import ibis.expr.schema as sch
import ibis.expr.types as ir
import ibis.util as util
from ibis.backends.base import BaseBackend
from ibis.expr.typing import TimeContext

from .compiler import Compiler

__all__ = ('BaseSQLBackend',)


class BaseSQLBackend(BaseBackend):
    """
    Base backend class for backends that compile to SQL.
    """

    compiler = Compiler
    table_class = ops.DatabaseTable
    table_expr_class = ir.TableExpr

    def table(self, name, database=None):
        """Create a table expression.

        Create a table expression that references a particular table in the
        database.

        Parameters
        ----------
        name : string
        database : string, optional

        Returns
        -------
        table : TableExpr
        """
        qualified_name = self._fully_qualified_name(name, database)
        schema = self.get_schema(qualified_name)
        node = self.table_class(qualified_name, schema, self)
        return self.table_expr_class(node)

    def _fully_qualified_name(self, name, database):
        # XXX
        return name

    def sql(self, query):
        """Convert a SQL query to an Ibis table expression.

        Parameters
        ----------
        query : string

        Returns
        -------
        table : TableExpr
        """
        # Get the schema by adding a LIMIT 0 to the end of the query.
        # Wrapping the query in a subquery means any limit already present
        # in it does not interfere.
        limited_query = f'SELECT * FROM ({query}) t0 LIMIT 0'
        schema = self._get_schema_using_query(limited_query)
        return ops.SQLQueryResult(query, schema, self).to_expr()

    def raw_sql(self, query: str, results=False):
        """Execute a given query string.

        Could have unexpected results if the query modifies the behavior of
        the session in a way unknown to Ibis; be careful.

        Parameters
        ----------
        query : string
            DML or DDL statement

        Returns
        -------
        Backend cursor
        """
        # TODO `results` is unused; it can be removed
        # (requires updating Impala tests)
        # TODO `self.con` is assumed to be defined in subclasses, but there
        # is nothing that enforces it. We should find a way to make sure
        # `self.con` is always a DBAPI2 connection, or raise an error
        cursor = self.con.execute(query)  # type: ignore
        if cursor:
            return cursor
        cursor.release()

    def execute(self, expr, params=None, limit='default', **kwargs):
        """Compile and execute the given Ibis expression.

        Compile and execute the Ibis expression using this backend client
        interface, returning results in-memory in the appropriate object type.

        Parameters
        ----------
        expr : Expr
        limit : int, default 'default'
            For expressions yielding result sets, retrieve at most this
            number of values/rows. Overrides any limit already set on the
            expression.
        params : not yet implemented
        kwargs : Backends can receive extra params. For example, clickhouse
            uses this to receive `external_tables` as dataframes.

        Returns
        -------
        output : input type dependent
            Table expressions: pandas.DataFrame
            Array expressions: pandas.Series
            Scalar expressions: Python scalar value
        """
        # TODO Reconsider having `kwargs` here. It's needed to support
        # `external_tables` in clickhouse, but better to deprecate that
        # feature than all this magic.
        # We don't want to pass `timecontext` to `raw_sql`.
        kwargs.pop('timecontext', None)
        query_ast = self.compiler.to_ast_ensure_limit(
            expr, limit, params=params
        )
        sql = query_ast.compile()
        self._log(sql)
        cursor = self.raw_sql(sql, **kwargs)
        schema = self.ast_schema(query_ast, **kwargs)
        result = self.fetch_from_cursor(cursor, schema)

        if hasattr(getattr(query_ast, 'dml', query_ast), 'result_handler'):
            result = query_ast.dml.result_handler(result)

        return result

    @abc.abstractmethod
    def fetch_from_cursor(self, cursor, schema):
        """Fetch data from cursor."""

    def ast_schema(self, query_ast, **kwargs):
        """Return the schema of the expression.

        Returns
        -------
        Schema

        Raises
        ------
        ValueError
            if the expression doesn't have a schema.
        """
        dml = getattr(query_ast, 'dml', query_ast)
        expr = getattr(dml, 'parent_expr', getattr(dml, 'table_set', None))

        if isinstance(expr, (ir.TableExpr, ir.ExprList, sch.HasSchema)):
            return expr.schema()
        elif isinstance(expr, ir.ValueExpr):
            return sch.schema([(expr.get_name(), expr.type())])
        else:
            raise ValueError(
                'Expression with type {} does not have a '
                'schema'.format(type(expr))
            )

    def _log(self, sql):
        """Log the SQL, usually to the standard output.

        This method can be implemented by subclasses. The logging happens
        when `ibis.options.verbose` is `True`.
        """

    def compile(
        self,
        expr,
        limit=None,
        params=None,
        timecontext: Optional[TimeContext] = None,
    ):
        """Translate expression.

        Translate the expression to one or more queries according to the
        backend target.

        Returns
        -------
        output : single query or list of queries
        """
        return self.compiler.to_ast_ensure_limit(
            expr, limit, params=params
        ).compile()

    def explain(self, expr, params=None):
        """Explain expression.

        Query for and return the query plan associated with the indicated
        expression or SQL query.

        Returns
        -------
        plan : string
        """
        if isinstance(expr, ir.Expr):
            context = self.compiler.make_context(params=params)
            query_ast = self.compiler.to_ast(expr, context)
            if len(query_ast.queries) > 1:
                raise Exception('Multi-query expression')

            query = query_ast.queries[0].compile()
        else:
            query = expr

        statement = f'EXPLAIN {query}'

        cur = self.raw_sql(statement)
        result = self._get_list(cur)
        cur.release()

        return '\n'.join(['Query:', util.indent(query, 2), '', *result])
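

# Illustrative usage against any concrete SQL backend, e.g. SQLite; the
# database file and table name below are hypothetical:
#
#     >>> con = ibis.sqlite.connect('example.db')
#     >>> expr = con.table('items').limit(10)
#     >>> print(con.compile(expr))  # the SQL generated for the expression
#     >>> df = con.execute(expr)    # a pandas DataFrame of at most 10 rows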

@@ -0,0 +1,23 @@
import ibis.expr.operations as ops
import ibis.expr.schema as sch
from ibis.backends.base import Database


class AlchemyDatabase(Database):
    """SQLAlchemy-backed database.

    Attributes
    ----------
    client : AlchemyClient
    """

    def table(self, name, schema=None):
        return self.client.table(name, schema=schema)


class AlchemyTable(ops.DatabaseTable):
    def __init__(self, table, source, schema=None):
        schema = sch.infer(table, schema=schema)
        super().__init__(table.name, schema, source)
        self.sqla_table = table

@@ -0,0 +1,292 @@
from typing import Optional

import sqlalchemy as sa
from sqlalchemy.dialects import mysql, postgresql, sqlite
from sqlalchemy.dialects.mysql.base import MySQLDialect
from sqlalchemy.dialects.postgresql.base import PGDialect
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
from sqlalchemy.engine.interfaces import Dialect

import ibis.expr.datatypes as dt
import ibis.expr.schema as sch

from .geospatial import geospatial_supported

if geospatial_supported:
    import geoalchemy2 as ga


def table_from_schema(name, meta, schema, database: Optional[str] = None):
    # Convert an Ibis schema to a SQLAlchemy table
    columns = []

    for colname, dtype in zip(schema.names, schema.types):
        satype = to_sqla_type(dtype)
        column = sa.Column(colname, satype, nullable=dtype.nullable)
        columns.append(column)

    return sa.Table(name, meta, schema=database, *columns)


# TODO(cleanup)
ibis_type_to_sqla = {
    dt.Null: sa.types.NullType,
    dt.Date: sa.Date,
    dt.Time: sa.Time,
    dt.Boolean: sa.Boolean,
    dt.Binary: sa.LargeBinary,
    dt.String: sa.Text,
    dt.Decimal: sa.NUMERIC,
    # Mantissa-based
    dt.Float: sa.Float(precision=24),
    dt.Double: sa.Float(precision=53),
    dt.Int8: sa.SmallInteger,
    dt.Int16: sa.SmallInteger,
    dt.Int32: sa.Integer,
    dt.Int64: sa.BigInteger,
}


def to_sqla_type(itype, type_map=None):
    if type_map is None:
        type_map = ibis_type_to_sqla
    if isinstance(itype, dt.Decimal):
        return sa.types.NUMERIC(itype.precision, itype.scale)
    elif isinstance(itype, dt.Date):
        return sa.Date()
    elif isinstance(itype, dt.Timestamp):
        # SQLAlchemy DateTimes do not store the timezone, just whether the db
        # supports timezones.
        return sa.TIMESTAMP(bool(itype.timezone))
    elif isinstance(itype, dt.Array):
        ibis_type = itype.value_type
        if not isinstance(ibis_type, (dt.Primitive, dt.String)):
            raise TypeError(
                'Type {} is not a primitive type or string type'.format(
                    ibis_type
                )
            )
        return sa.ARRAY(to_sqla_type(ibis_type, type_map=type_map))
    elif geospatial_supported and isinstance(itype, dt.GeoSpatial):
        if itype.geotype == 'geometry':
            return ga.Geometry
        elif itype.geotype == 'geography':
            return ga.Geography
        else:
            return ga.types._GISType
    else:
        return type_map[type(itype)]


@dt.dtype.register(Dialect, sa.types.NullType)
def sa_null(_, satype, nullable=True):
    return dt.null


@dt.dtype.register(Dialect, sa.types.Boolean)
def sa_boolean(_, satype, nullable=True):
    return dt.Boolean(nullable=nullable)


@dt.dtype.register(MySQLDialect, mysql.NUMERIC)
def sa_mysql_numeric(_, satype, nullable=True):
    # https://dev.mysql.com/doc/refman/8.0/en/fixed-point-types.html
    return dt.Decimal(
        satype.precision or 10, satype.scale or 0, nullable=nullable
    )


@dt.dtype.register(PGDialect, postgresql.NUMERIC)
def sa_postgres_numeric(_, satype, nullable=True):
    # PostgreSQL allows any precision for numeric values if not specified,
    # up to the implementation limit. Here, default to the maximum value that
    # can be specified by the user. The scale defaults to zero.
    # https://www.postgresql.org/docs/10/datatype-numeric.html
    return dt.Decimal(
        satype.precision or 1000, satype.scale or 0, nullable=nullable
    )


@dt.dtype.register(Dialect, sa.types.Numeric)
@dt.dtype.register(SQLiteDialect, sqlite.NUMERIC)
def sa_numeric(_, satype, nullable=True):
    return dt.Decimal(satype.precision, satype.scale, nullable=nullable)


@dt.dtype.register(Dialect, sa.types.SmallInteger)
def sa_smallint(_, satype, nullable=True):
    return dt.Int16(nullable=nullable)


@dt.dtype.register(Dialect, sa.types.Integer)
def sa_integer(_, satype, nullable=True):
    return dt.Int32(nullable=nullable)


@dt.dtype.register(Dialect, mysql.TINYINT)
def sa_mysql_tinyint(_, satype, nullable=True):
    return dt.Int8(nullable=nullable)


@dt.dtype.register(Dialect, sa.types.BigInteger)
def sa_bigint(_, satype, nullable=True):
    return dt.Int64(nullable=nullable)


@dt.dtype.register(Dialect, sa.types.Float)
def sa_float(_, satype, nullable=True):
    return dt.Float(nullable=nullable)


@dt.dtype.register(SQLiteDialect, sa.types.Float)
@dt.dtype.register(PGDialect, postgresql.DOUBLE_PRECISION)
def sa_double(_, satype, nullable=True):
    return dt.Double(nullable=nullable)


@dt.dtype.register(PGDialect, postgresql.UUID)
def sa_uuid(_, satype, nullable=True):
    return dt.UUID(nullable=nullable)


@dt.dtype.register(PGDialect, postgresql.MACADDR)
def sa_macaddr(_, satype, nullable=True):
    return dt.MACADDR(nullable=nullable)


@dt.dtype.register(PGDialect, postgresql.INET)
def sa_inet(_, satype, nullable=True):
    return dt.INET(nullable=nullable)


@dt.dtype.register(PGDialect, postgresql.JSON)
def sa_json(_, satype, nullable=True):
    return dt.JSON(nullable=nullable)


@dt.dtype.register(PGDialect, postgresql.JSONB)
def sa_jsonb(_, satype, nullable=True):
    return dt.JSONB(nullable=nullable)


if geospatial_supported:

    @dt.dtype.register(Dialect, (ga.Geometry, ga.types._GISType))
    def ga_geometry(_, gatype, nullable=True):
        t = gatype.geometry_type
        if t == 'POINT':
            return dt.Point(nullable=nullable)
        if t == 'LINESTRING':
            return dt.LineString(nullable=nullable)
        if t == 'POLYGON':
            return dt.Polygon(nullable=nullable)
        if t == 'MULTILINESTRING':
            return dt.MultiLineString(nullable=nullable)
        if t == 'MULTIPOINT':
            return dt.MultiPoint(nullable=nullable)
        if t == 'MULTIPOLYGON':
            return dt.MultiPolygon(nullable=nullable)
        if t == 'GEOMETRY':
            return dt.Geometry(nullable=nullable)
        else:
            raise ValueError(f"Unrecognized geometry type: {t}")


POSTGRES_FIELD_TO_IBIS_UNIT = {
    "YEAR": "Y",
    "MONTH": "M",
    "DAY": "D",
    "HOUR": "h",
    "MINUTE": "m",
    "SECOND": "s",
    "YEAR TO MONTH": "M",
    "DAY TO HOUR": "h",
    "DAY TO MINUTE": "m",
    "DAY TO SECOND": "s",
    "HOUR TO MINUTE": "m",
    "HOUR TO SECOND": "s",
    "MINUTE TO SECOND": "s",
}


@dt.dtype.register(PGDialect, postgresql.INTERVAL)
def sa_postgres_interval(_, satype, nullable=True):
    field = satype.fields.upper()
    unit = POSTGRES_FIELD_TO_IBIS_UNIT.get(field, None)
    if unit is None:
        raise ValueError(f"Unknown PostgreSQL interval field {field!r}")
    elif unit in {"Y", "M"}:
        raise ValueError(
            "Variable length timedeltas are not yet supported with PostgreSQL"
        )
    return dt.Interval(unit=unit, nullable=nullable)


@dt.dtype.register(MySQLDialect, mysql.DOUBLE)
def sa_mysql_double(_, satype, nullable=True):
    # TODO: handle asdecimal=True
    return dt.Double(nullable=nullable)


@dt.dtype.register(Dialect, sa.types.String)
def sa_string(_, satype, nullable=True):
    return dt.String(nullable=nullable)


@dt.dtype.register(Dialect, sa.LargeBinary)
def sa_binary(_, satype, nullable=True):
    return dt.Binary(nullable=nullable)


@dt.dtype.register(Dialect, sa.Time)
def sa_time(_, satype, nullable=True):
    return dt.Time(nullable=nullable)


@dt.dtype.register(Dialect, sa.Date)
def sa_date(_, satype, nullable=True):
    return dt.Date(nullable=nullable)


@dt.dtype.register(Dialect, sa.DateTime)
def sa_datetime(_, satype, nullable=True, default_timezone='UTC'):
    timezone = default_timezone if satype.timezone else None
    return dt.Timestamp(timezone=timezone, nullable=nullable)


@dt.dtype.register(Dialect, sa.ARRAY)
def sa_array(dialect, satype, nullable=True):
    dimensions = satype.dimensions
    if dimensions is not None and dimensions != 1:
        raise NotImplementedError('Nested array types not yet supported')

    value_dtype = dt.dtype(dialect, satype.item_type)
    return dt.Array(value_dtype, nullable=nullable)


@sch.infer.register(sa.Table)
def schema_from_table(table, schema=None):
    """Retrieve an Ibis schema from a SQLAlchemy ``Table``.

    Parameters
    ----------
    table : sa.Table

    Returns
    -------
    schema : ibis.expr.schema.Schema
        An Ibis schema corresponding to the types of the columns in `table`.
    """
    schema = schema if schema is not None else {}
    pairs = []
    for name, column in zip(table.columns.keys(), table.columns):
        if name in schema:
            dtype = dt.dtype(schema[name])
        else:
            dtype = dt.dtype(
                getattr(table.bind, 'dialect', Dialect()),
                column.type,
                nullable=column.nullable,
            )
        pairs.append((name, dtype))
    return sch.schema(pairs)
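

# Illustrative round trip (table and column names are made up): infer an Ibis
# schema from a SQLAlchemy table. With no bound engine, the generic `Dialect`
# fallback above handles the type dispatch.
#
#     >>> meta = sa.MetaData()
#     >>> users = sa.Table(
#     ...     't_users', meta,
#     ...     sa.Column('id', sa.BigInteger, nullable=False),
#     ...     sa.Column('name', sa.Text),
#     ...     sa.Column('signup', sa.DateTime(timezone=True)),
#     ... )
#     >>> sch.infer(users)  # id: int64, name: string, signup: timestamp('UTC')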

@@ -0,0 +1,8 @@
try:
    import geoalchemy2  # noqa: F401
    import geoalchemy2.shape  # noqa: F401
    import geopandas  # noqa: F401
except ImportError:
    geospatial_supported = False
else:
    geospatial_supported = True