| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| ci/udf/CMakeCache.txt | ||
| ci/udf/CMakeFiles/ | ||
| ci/udf/Makefile |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,31 +1,38 @@ | ||
| # Ibis: Python data analysis framework for Hadoop and SQL engines | ||
|
|
||
| [](https://anaconda.org/conda-forge/ibis-framework) | ||
| [](http://docs.ibis-project.org) | ||
| [](https://circleci.com/gh/ibis-project/ibis/tree/master) | ||
| [](https://ci.appveyor.com/project/cpcloud/ibis-xh5g1) | ||
|
|
||
| Ibis is a toolbox to bridge the gap between local Python environments, remote | ||
| storage, execution systems like Hadoop components (HDFS, Impala, Hive, Spark) | ||
| and SQL databases. Its goal is to simplify analytical workflows and make you | ||
| more productive. | ||
|
|
||
| Install Ibis from PyPI with: | ||
|
|
||
| ```sh | ||
| pip install ibis-framework | ||
| ``` | ||
|
|
||
| or from conda-forge with: | ||
|
|
||
| ```sh | ||
| conda install ibis-framework -c conda-forge | ||
| ``` | ||
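
Once installed, a minimal sketch of a workflow looks like the following
(assuming a local SQLite file `data.db` containing a table `my_table` with
columns `key` and `value`; the file, table, and column names are placeholders):

```python
import ibis

# Connect to a local SQLite database and wrap a table as an expression
con = ibis.sqlite.connect('data.db')
t = con.table('my_table')

# Expressions are built lazily; execute() runs the query and returns
# a pandas DataFrame
expr = t.group_by('key').aggregate(t.value.sum().name('total'))
df = expr.execute()
```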
|
|
||
| Ibis currently provides tools for interacting with the following systems: | ||
|
|
||
| - [Apache Impala (incubating)](http://impala.io/) | ||
| - [Apache Kudu](http://getkudu.io) | ||
| - [Hadoop Distributed File System (HDFS)](https://hadoop.apache.org/) | ||
| - [PostgreSQL](https://www.postgresql.org/) | ||
| - [MySQL](https://www.mysql.com/) (Experimental) | ||
| - [SQLite](https://www.sqlite.org/) | ||
| - [Pandas](https://pandas.pydata.org/) [DataFrames](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe) (Experimental) | ||
| - [ClickHouse](https://clickhouse.yandex) | ||
| - [BigQuery](https://cloud.google.com/bigquery) | ||
|
|
||
| Learn more about using the library at http://docs.ibis-project.org and read the | ||
| project blog at http://ibis-project.org for news and updates. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| FROM ibisproject/miniconda3 | ||
|
|
||
| # fonts are for docs | ||
| RUN apt-get -qq update -y \ | ||
| && apt-get -qq install -y --no-install-recommends ttf-dejavu \ | ||
| git gcc make clang libboost-dev postgresql-client ca-certificates \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| ARG PYTHON | ||
| ARG ENVKIND | ||
|
|
||
| ADD ci/requirements-${ENVKIND}-${PYTHON}.yml / | ||
|
|
||
| RUN conda env create -q -n ibis-${ENVKIND}-${PYTHON} -f /requirements-${ENVKIND}-${PYTHON}.yml \ | ||
| && conda install conda-build -y -q | ||
|
|
||
| # we intentionally keep conda artifacts in the image to speed up recipe building | ||
| # to reduce the image size instead, run the following in the previous layer: | ||
| # && conda clean -a -y | ||
|
|
||
| RUN echo 'source activate ibis-'${ENVKIND}-${PYTHON}' && exec "$@"' > activate.sh | ||
|
|
||
| ADD . /ibis | ||
| WORKDIR /ibis | ||
| RUN bash /activate.sh python setup.py develop | ||
|
|
||
| ENTRYPOINT ["bash", "/activate.sh"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| CWD="$(dirname "$0")" | ||
|
|
||
| pip install asv | ||
| $CWD/asvconfig.py $1 | tee $HOME/.asv-machine.json | ||
| git remote add upstream https://github.com/ibis-project/ibis | ||
| git fetch upstream refs/heads/master | ||
| asv continuous -f 1.5 -e upstream/master $2 || echo > /dev/null |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| #!/bin/bash -e | ||
|
|
||
| docker-compose rm --force --stop | ||
| docker-compose up -d --no-build postgres mysql clickhouse impala | ||
| docker-compose run --rm waiter | ||
| docker-compose build --pull ibis | ||
| docker-compose run --rm ibis ci/load-data.sh |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| version: '3' | ||
| services: | ||
|
|
||
| postgres: | ||
| image: postgres | ||
| ports: | ||
| - 5432:5432 | ||
| environment: | ||
| POSTGRES_PASSWORD: postgres | ||
|
|
||
| mysql: | ||
| image: mariadb:10.2 | ||
| ports: | ||
| - 3306:3306 | ||
| environment: | ||
| - MYSQL_ALLOW_EMPTY_PASSWORD=1 | ||
| - MYSQL_DATABASE=ibis_testing | ||
| - MYSQL_USER=ibis | ||
| - MYSQL_PASSWORD=ibis | ||
|
|
||
| impala: | ||
| image: ibisproject/impala | ||
| hostname: impala | ||
| networks: | ||
| default: | ||
| aliases: | ||
| - quickstart.cloudera | ||
| environment: | ||
| PGPASSWORD: postgres | ||
| ports: | ||
| # HDFS | ||
| - 9020:9020 | ||
| - 50070:50070 | ||
| - 50075:50075 | ||
| - 8020:8020 | ||
| - 8042:8042 | ||
| # Hive | ||
| - 9083:9083 | ||
| # Impala | ||
| - 21000:21000 | ||
| - 21050:21050 | ||
| - 25000:25000 | ||
| - 25010:25010 | ||
| - 25020:25020 | ||
|
|
||
| clickhouse: | ||
| image: yandex/clickhouse-server:1.1.54327 | ||
| ports: | ||
| - 8123:8123 | ||
| - 9000:9000 | ||
|
|
||
| waiter: | ||
| image: jwilder/dockerize | ||
| command: | | ||
| dockerize -wait tcp://mysql:3306 | ||
| -wait tcp://postgres:5432 | ||
| -wait tcp://impala:21050 | ||
| -wait tcp://impala:50070 | ||
| -wait tcp://clickhouse:9000 | ||
| -wait-retry-interval 5s | ||
| -timeout 5m | ||
| ibis: | ||
| image: ibis:${PYTHON_VERSION:-3.6} | ||
| environment: | ||
| - IBIS_TEST_DOWNLOAD_DIRECTORY=/tmp | ||
| - IBIS_TEST_DATA_DIRECTORY=/tmp/ibis-testing-data | ||
| - IBIS_TEST_SQLITE_DATABASE=/tmp/ibis_testing.db | ||
| - IBIS_TEST_NN_HOST=impala | ||
| - IBIS_TEST_IMPALA_HOST=impala | ||
| - IBIS_TEST_IMPALA_PORT=21050 | ||
| - IBIS_TEST_WEBHDFS_PORT=50070 | ||
| - IBIS_TEST_WEBHDFS_USER=hdfs | ||
| - IBIS_TEST_MYSQL_HOST=mysql | ||
| - IBIS_TEST_MYSQL_PORT=3306 | ||
| - IBIS_TEST_MYSQL_USER=ibis | ||
| - IBIS_TEST_MYSQL_PASSWORD=ibis | ||
| - IBIS_TEST_MYSQL_DATABASE=ibis_testing | ||
| - IBIS_TEST_POSTGRES_HOST=postgres | ||
| - IBIS_TEST_POSTGRES_PORT=5432 | ||
| - IBIS_TEST_POSTGRES_USER=postgres | ||
| - IBIS_TEST_POSTGRES_PASSWORD=postgres | ||
| - IBIS_TEST_POSTGRES_DATABASE=ibis_testing | ||
| - IBIS_TEST_CLICKHOUSE_HOST=clickhouse | ||
| - IBIS_TEST_CLICKHOUSE_PORT=9000 | ||
| - IBIS_TEST_CLICKHOUSE_DATABASE=ibis_testing | ||
| - GOOGLE_BIGQUERY_PROJECT_ID=ibis-gbq | ||
| - GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcloud-service-key.json | ||
| volumes: | ||
| - /tmp/ibis:/tmp | ||
| build: | ||
| context: .. | ||
| dockerfile: ci/Dockerfile | ||
| args: | ||
| PYTHON: ${PYTHON_VERSION:-3.6} | ||
| ENVKIND: ${ENVKIND:-dev} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| #!/bin/bash -e | ||
|
|
||
| export ENVKIND=docs | ||
| export PYTHON_VERSION="3.6" | ||
|
|
||
| docker-compose build --pull ibis | ||
| docker-compose run --rm ibis ping -c 1 quickstart.cloudera | ||
| docker-compose run --rm ibis rm -rf /tmp/docs.ibis-project.org | ||
| docker-compose run --rm ibis git clone \ | ||
| --branch gh-pages \ | ||
| https://github.com/ibis-project/docs.ibis-project.org /tmp/docs.ibis-project.org | ||
|
|
||
| docker-compose run --rm ibis find /tmp/docs.ibis-project.org -maxdepth 1 ! -wholename /tmp/docs.ibis-project.org ! -name '*.git' ! -name '.' ! -name 'CNAME' ! -name '*.nojekyll' -exec rm -rf {} \; | ||
| docker-compose run --rm ibis sphinx-build -b html docs/source /tmp/docs.ibis-project.org -W -j auto -T |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| CWD="$(dirname "${0}")" | ||
|
|
||
| declare -A argcommands=([sqlite]=sqlite | ||
| [parquet]="parquet -i" | ||
| [postgres]=postgres | ||
| [clickhouse]=clickhouse | ||
| [mysql]=mysql | ||
| [impala]=impala) | ||
|
|
||
| if [[ "$#" == 0 ]]; then | ||
| ARGS=(${!argcommands[@]}) # keys of argcommands | ||
| else | ||
| ARGS=("$@") | ||
| fi | ||
|
|
||
| python "${CWD}"/datamgr.py download | ||
|
|
||
| for arg in ${ARGS[@]}; do | ||
| if [[ "${arg}" == "impala" ]]; then | ||
| python "${CWD}"/impalamgr.py load --data & | ||
| else | ||
| python "${CWD}"/datamgr.py ${argcommands[${arg}]} & | ||
| fi | ||
| done | ||
|
|
||
| FAIL=0 | ||
|
|
||
| for job in `jobs -p` | ||
| do | ||
| wait "${job}" || let FAIL+=1 | ||
| done | ||
|
|
||
| if [[ "${FAIL}" == 0 ]]; then | ||
| echo "Done loading ${ARGS[@]}" | ||
| exit 0 | ||
| else | ||
| echo "Failed loading ${ARGS[@]}" >&2 | ||
| exit 1 | ||
| fi |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| DROP TABLE IF EXISTS diamonds; | ||
|
|
||
| CREATE TABLE diamonds ( | ||
| carat FLOAT, | ||
| cut TEXT, | ||
| color TEXT, | ||
| clarity TEXT, | ||
| depth FLOAT, | ||
| `table` FLOAT, | ||
| price BIGINT, | ||
| x FLOAT, | ||
| y FLOAT, | ||
| z FLOAT | ||
| ) DEFAULT CHARACTER SET = utf8; | ||
|
|
||
| DROP TABLE IF EXISTS batting; | ||
|
|
||
| CREATE TABLE batting ( | ||
| `playerID` VARCHAR(255), | ||
| `yearID` BIGINT, | ||
| stint BIGINT, | ||
| `teamID` VARCHAR(7), | ||
| `lgID` VARCHAR(7), | ||
| `G` BIGINT, | ||
| `AB` BIGINT, | ||
| `R` BIGINT, | ||
| `H` BIGINT, | ||
| `X2B` BIGINT, | ||
| `X3B` BIGINT, | ||
| `HR` BIGINT, | ||
| `RBI` BIGINT, | ||
| `SB` BIGINT, | ||
| `CS` BIGINT, | ||
| `BB` BIGINT, | ||
| `SO` BIGINT, | ||
| `IBB` BIGINT, | ||
| `HBP` BIGINT, | ||
| `SH` BIGINT, | ||
| `SF` BIGINT, | ||
| `GIDP` BIGINT | ||
| ) DEFAULT CHARACTER SET = utf8; | ||
|
|
||
| DROP TABLE IF EXISTS awards_players; | ||
|
|
||
| CREATE TABLE awards_players ( | ||
| `playerID` VARCHAR(255), | ||
| `awardID` VARCHAR(255), | ||
| `yearID` BIGINT, | ||
| `lgID` VARCHAR(7), | ||
| tie VARCHAR(7), | ||
| notes VARCHAR(255) | ||
| ) DEFAULT CHARACTER SET = utf8; | ||
|
|
||
| DROP TABLE IF EXISTS functional_alltypes; | ||
|
|
||
| CREATE TABLE functional_alltypes ( | ||
| `index` BIGINT, | ||
| `Unnamed: 0` BIGINT, | ||
| id INTEGER, | ||
| bool_col BOOLEAN, | ||
| tinyint_col TINYINT, | ||
| smallint_col SMALLINT, | ||
| int_col INTEGER, | ||
| bigint_col BIGINT, | ||
| float_col FLOAT, | ||
| double_col DOUBLE, | ||
| date_string_col TEXT, | ||
| string_col TEXT, | ||
| timestamp_col TIMESTAMP, | ||
| year INTEGER, | ||
| month INTEGER | ||
| ) DEFAULT CHARACTER SET = utf8; | ||
|
|
||
| CREATE INDEX `ix_functional_alltypes_index` ON functional_alltypes (`index`); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| #!/bin/bash -e | ||
|
|
||
| cmd='$(find /ibis -name "*.py[co]" -delete > /dev/null 2>&1 || true) && pytest "$@"' | ||
| docker-compose build --pull ibis | ||
| docker-compose run --rm ibis bash -c "$cmd" -- "$@" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| .. _backends: | ||
|
|
||
| Backends | ||
| ======== | ||
|
|
||
| This document describes the classes of backends, how they work, and any details | ||
| about each backend that are relevant to end users. | ||
|
|
||
| .. _classes_of_backends: | ||
|
|
||
| Classes of Backends | ||
| ------------------- | ||
|
|
||
| There are currently three classes of backends that live in ibis. | ||
|
|
||
| #. String generating backends | ||
| #. Expression generating backends | ||
| #. Direct execution backends | ||
|
|
||
| .. _string_generating_backends: | ||
|
|
||
| String Generating Backends | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| The first category of backend translates ibis expressions into strings. | ||
| Generally speaking, these backends also need to handle their own execution. | ||
| They work by translating each node into a string and passing the generated | ||
| string to the database through a driver API. | ||
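
For example, compiling an expression for a string generating backend yields a
SQL string (a small sketch using an unbound table; the exact formatting of the
generated SQL may differ):

.. code-block:: python

   import ibis

   t = ibis.table([('a', 'int64')], name='t')
   expr = t.a.sum()

   # Impala is a string generating backend, so compiling produces SQL text,
   # which is then sent to the database through the driver API
   print(ibis.impala.compile(expr))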
|
|
||
| Impala | ||
| ****** | ||
|
|
||
| TODO | ||
|
|
||
| Clickhouse | ||
| ********** | ||
|
|
||
| TODO | ||
|
|
||
| BigQuery | ||
| ******** | ||
|
|
||
| TODO | ||
|
|
||
| .. _expression_generating_backends: | ||
|
|
||
| Expression Generating Backends | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| The second category of backends translates ibis expressions into other | ||
| expressions. Currently, all expression generating backends generate `SQLAlchemy | ||
| expressions <http://docs.sqlalchemy.org/en/latest/core/tutorial.html>`_. | ||
|
|
||
| Instead of generating strings at each translation step, these backends build up | ||
| an expression. These backends tend to execute their expressions directly | ||
| through the driver APIs provided by SQLAlchemy (or one of its transitive | ||
| dependencies). | ||
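
For example, compiling an expression for one of these backends produces a
SQLAlchemy object rather than a raw string (a small sketch using an unbound
table; the rendered SQL depends on the SQLAlchemy dialect):

.. code-block:: python

   import ibis

   t = ibis.table([('a', 'int64')], name='t')
   expr = t.a.sum()

   # SQLite is an expression generating backend: compiling returns a
   # SQLAlchemy construct, and SQLAlchemy handles rendering and execution
   sa_expr = ibis.sqlite.compile(expr)
   print(type(sa_expr))
   print(sa_expr)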
|
|
||
| SQLite | ||
| ****** | ||
|
|
||
| TODO | ||
|
|
||
| PostgreSQL | ||
| ********** | ||
|
|
||
| TODO | ||
|
|
||
| .. _direct_execution_backends: | ||
|
|
||
| Direct Execution Backends | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| The only existing backend that directly executes ibis expressions is the pandas | ||
| backend. A full description of the implementation can be found in the module | ||
| docstring of the pandas backend located in ``ibis/pandas/execution/core.py``. | ||
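
A small sketch of what direct execution looks like from the user's point of
view (assuming the pandas backend is installed and using an in-memory
DataFrame):

.. code-block:: python

   import pandas as pd
   import ibis

   df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': list('xyz')})

   # No SQL or other intermediate representation is generated; the
   # expression is executed directly against the DataFrame
   con = ibis.pandas.connect({'t': df})
   t = con.table('t')
   result = t.a.mean().execute()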
|
|
||
| Pandas | ||
| ****** | ||
|
|
||
| TODO |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,212 @@ | ||
| .. _design: | ||
|
|
||
| Design | ||
| ====== | ||
|
|
||
|
|
||
| .. _primary_goals: | ||
|
|
||
| Primary Goals | ||
| ------------- | ||
|
|
||
| #. Type safety | ||
| #. Expressiveness | ||
| #. Composability | ||
| #. Familiarity | ||
|
|
||
| .. _flow_of_execution: | ||
|
|
||
| Flow of Execution | ||
| ----------------- | ||
|
|
||
| #. User writes expression | ||
| #. Each method or function call builds a new expression | ||
| #. Expressions are type checked as you create them | ||
| #. Expressions have some optimizations that happen as the user builds them | ||
| #. Backend specific rewrites | ||
| #. Expressions are compiled | ||
| #. The SQL string generated by the compiler is sent to the database and | ||
| executed (this step is skipped for the pandas backend) | ||
| #. The database returns some data that is then turned into a pandas DataFrame | ||
| by ibis | ||
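
A minimal sketch of this flow from the user's point of view (assuming an
existing backend connection ``con`` whose table ``t`` has an integer column
``a`` and a string column ``b``):

.. code-block:: python

   # steps 1-4: each method call builds and type checks a new expression
   t = con.table('t')
   filtered = t[t.a > 0]
   expr = filtered.group_by('b').aggregate(filtered.a.sum().name('total'))

   # steps 5-6: backend specific rewrites and compilation
   sql = expr.compile()

   # steps 7-8: the compiled query is executed by the backend and the
   # result comes back as a pandas DataFrame
   df = expr.execute()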
|
|
||
| .. _expressions: | ||
|
|
||
| Expressions | ||
| ----------- | ||
|
|
||
| The main user-facing components of ibis are expressions. The base class of all | ||
| expressions in ibis is the :class:`~ibis.expr.types.Expr` class. | ||
|
|
||
| Expressions provide the user-facing API, defined in ``ibis/expr/api.py``. | ||
|
|
||
| .. _type_system: | ||
|
|
||
| Type System | ||
| ~~~~~~~~~~~ | ||
|
|
||
| Ibis's type system consists of a set of rules for specifying the types of | ||
| inputs to :class:`~ibis.expr.types.Node` subclasses. Upon construction of a | ||
| :class:`~ibis.expr.types.Node` subclass, ibis performs validation of every | ||
| input to the node based on the rule that was used to declare the input. | ||
|
|
||
| Rules are defined in ``ibis/expr/rules.py``. | ||
|
|
||
| .. _expr_class: | ||
|
|
||
| The :class:`~ibis.expr.types.Expr` class | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| Expressions are a thin but important abstraction over operations, containing | ||
| only type information and shape information, i.e., whether they are tables, | ||
| columns, or scalars. | ||
|
|
||
| Examples of expressions include :class:`~ibis.expr.types.Int64Column`, | ||
| :class:`~ibis.expr.types.StringScalar`, and | ||
| :class:`~ibis.expr.types.TableExpr`. | ||
|
|
||
| Here's an example of each type of expression: | ||
|
|
||
.. code-block:: ipython

   import ibis

   t = ibis.table([('a', 'int64')])

   int64_column = t.a
   type(int64_column)

   string_scalar = ibis.literal('some_string_value')
   type(string_scalar)

   table_expr = t.mutate(b=t.a + 1)
   type(table_expr)

.. _node_class:
|
|
||
| The :class:`~ibis.expr.types.Node` Class | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| :class:`~ibis.expr.types.Node` subclasses make up the core set of operations of | ||
| ibis. Each node corresponds to a particular operation. | ||
|
|
||
| Most nodes are defined in the :mod:`~ibis.expr.operations` module. | ||
|
|
||
| Examples of nodes include :class:`~ibis.expr.operations.Add` and | ||
| :class:`~ibis.expr.operations.Sum`. | ||
|
|
||
| Nodes have two important members (and often these are the only members defined): | ||
|
|
||
| #. ``input_type``: a list of rules | ||
| #. ``output_type``: a rule or method | ||
|
|
||
| The ``input_type`` member is a list of rules that defines the types of | ||
| the inputs to the operation. This is sometimes called the signature. | ||
|
|
||
| The ``output_type`` member is a rule or a method that defines the output type | ||
| of the operation. This is sometimes called the return type. | ||
|
|
||
| An example of ``input_type``/``output_type`` usage is the | ||
| :class:`~ibis.expr.operations.Log` class: | ||
|
|
||
.. code-block:: ipython

   class Log(Node):
       input_type = [
           rules.double(),
           rules.double(name='base', optional=True)
       ]
       output_type = rules.shape_like_arg(0, 'double')

This class describes an operation called ``Log`` that takes one required
argument: a double scalar or column, and one optional argument: a double scalar
or column named ``base``. The ``base`` argument is ``None`` by default, so that
the expression behaves as the underlying database does when no base is given.
|
|
||
| These objects are instantiated when you use ibis APIs: | ||
|
|
||
.. code-block:: ipython

   import ibis

   t = ibis.table([('a', 'double')])
   log_1p = (1 + t.a).log()  # an Add and a Log are instantiated here

.. _expr_vs_ops:
|
|
||
| Expressions vs Operations: Why are they different? | ||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
|
||
| Separating expressions from their underlying operations makes it easy to | ||
| generically describe and validate the inputs to particular nodes. In the log | ||
| example, it doesn't matter which *operation* (node) the double-valued arguments | ||
| come from; they need only satisfy the requirement denoted by the rule. | ||
|
|
||
| Separation of the :class:`~ibis.expr.types.Node` and | ||
| :class:`~ibis.expr.types.Expr` classes also allows the API to be tied to the | ||
| physical type of the expression rather than the particular operation, making it | ||
| easy to define the API in terms of types rather than specific operations. | ||
|
|
||
| Furthermore, operations often have an output type that depends on the input | ||
| type. An example of this is the ``greatest`` function, which takes the maximum | ||
| of all of its arguments. Another example is ``CASE`` statements, whose ``THEN`` | ||
| expressions determine the output type of the expression. | ||
|
|
||
| This allows ibis to provide **only** the APIs that make sense for a particular | ||
| type, even when an operation yields a different output type depending on its | ||
| input. Concretely, this means that you cannot perform operations that don't | ||
| make sense, like computing the average of a string column. | ||
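
For example (a small sketch using an unbound table):

.. code-block:: python

   import ibis

   t = ibis.table([('a', 'double'), ('b', 'string')], name='t')

   t.a.mean()  # fine: numeric columns expose numeric reductions
   t.b.mean()  # raises AttributeError: string columns have no mean() method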
|
|
||
| .. _compilation: | ||
|
|
||
| Compilation | ||
| ----------- | ||
|
|
||
| The next major component of ibis is the compilers. | ||
|
|
||
| The first few versions of ibis directly generated strings, but the compiler | ||
| infrastructure was generalized to support compilation of `SQLAlchemy | ||
| <https://docs.sqlalchemy.org/en/latest/core/tutorial.html>`_ based expressions. | ||
|
|
||
| The compiler works by translating each piece of a SQL ``SELECT`` statement into | ||
| a string or SQLAlchemy expression. | ||
|
|
||
| The main pieces of a ``SELECT`` statement are: | ||
|
|
||
| #. The set of column expressions (``select_set``) | ||
| #. ``WHERE`` clauses (``where``) | ||
| #. ``GROUP BY`` clauses (``group_by``) | ||
| #. ``HAVING`` clauses (``having``) | ||
| #. ``LIMIT`` clauses (``limit``) | ||
| #. ``ORDER BY`` clauses (``order_by``) | ||
| #. ``DISTINCT`` clauses (``distinct``) | ||
|
|
||
| Each of these pieces is translated into a SQL string and finally assembled by | ||
| the instance of the :class:`~ibis.sql.compiler.ExprTranslator` subclass | ||
| specific to the backend being compiled. For example, the | ||
| :class:`~ibis.impala.compiler.ImpalaExprTranslator` is one of the subclasses | ||
| that will perform this translation. | ||
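
For example, an expression can be compiled for a particular backend without
being executed (a small sketch using an unbound table; the exact formatting of
the generated SQL may differ):

.. code-block:: python

   import ibis

   t = ibis.table([('key', 'string'), ('value', 'double')], name='t')

   filtered = t[t.value > 0]
   expr = filtered.group_by('key').aggregate(filtered.value.sum().name('total'))

   # translate the select set, WHERE clause, and GROUP BY into Impala SQL
   print(ibis.impala.compile(expr))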
|
|
||
.. note::

   While ibis was designed with an explicit goal of first-class SQL support,
   ibis can target other systems such as pandas.
|
|
||
| .. _execution: | ||
|
|
||
| Execution | ||
| --------- | ||
|
|
||
| We presumably want to *do* something with our compiled expressions. This is | ||
| where execution comes in. | ||
|
|
||
| This is the least complex part of ibis, mostly requiring ibis only to | ||
| correctly handle whatever the database hands back. | ||
|
|
||
| By and large, the execution of compiled SQL is handled by the database to which | ||
| SQL is sent from ibis. | ||
|
|
||
| However, once the data arrives from the database we need to convert that | ||
| data to a pandas DataFrame. | ||
|
|
||
| The Query class, with its :meth:`~ibis.sql.client.Query._fetch` method, | ||
| provides a way for ibis :class:`~ibis.sql.client.SQLClient` objects to do any | ||
| additional processing necessary after the database returns results to the | ||
| client. |
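
In practice, all of this happens behind a single call to ``execute`` (a small
sketch, assuming a connection ``con`` to a database containing the
``functional_alltypes`` test table):

.. code-block:: python

   t = con.table('functional_alltypes')
   expr = t.group_by('string_col').aggregate(t.double_col.mean().name('avg'))

   # compile the expression, send it to the database, fetch the results,
   # and convert them to a pandas DataFrame in one step
   df = expr.execute()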
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| .. _extending: | ||
|
|
||
|
|
||
| Extending Ibis | ||
| ============== | ||
|
|
||
| Users typically want to extend ibis in one of two ways: | ||
|
|
||
| #. Add a new expression | ||
| #. Add a new backend | ||
|
|
||
|
|
||
| Below we provide notebooks showing how to extend ibis in each of these ways. | ||
|
|
||
|
|
||
| Adding a New Expression | ||
| ----------------------- | ||
|
|
||
.. note::

   Make sure you've run the following commands before executing the notebook:

   .. code-block:: sh

      docker-compose up -d --no-build postgres dns
      docker-compose run waiter
      docker-compose run ibis ci/load-data.sh postgres

Here we show how to add a ``sha1`` method to the PostgreSQL backend:
|
|
||
.. toctree::
   :maxdepth: 1

   notebooks/tutorial/9-Adding-a-new-expression.ipynb
|
|
||
|
|
||
| Adding a New Backend | ||
| -------------------- | ||
|
|
||
| TBD |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| { | ||
| "cells": [ | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "# Impala/HDFS intro and Setup" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Getting started" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "You're going to want to make sure you can import `ibis`" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "import ibis\n", | ||
| "import os" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "If you have WebHDFS available, connect to HDFS with according to your WebHDFS config. For kerberized or more complex HDFS clusters please look at http://hdfscli.readthedocs.org/en/latest/ for info on connecting. You can use a connection from that library instead of using `hdfs_connect`" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "hdfs_port = os.environ.get('IBIS_WEBHDFS_PORT', 50070)\n", | ||
| "hdfs = ibis.hdfs_connect(host='quickstart.cloudera', port=hdfs_port)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Finally, create the Ibis client" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "con = ibis.impala.connect('quickstart.cloudera', hdfs_client=hdfs)\n", | ||
| "con" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Obviously, substitute the parameters that are appropriate for your environment (see docstring for `ibis.impala.connect`). `impala.connect` uses the same parameters as Impyla's (https://pypi.python.org/pypi/impyla) DBAPI interface" | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "kernelspec": { | ||
| "display_name": "Python 3", | ||
| "language": "python", | ||
| "name": "python3" | ||
| }, | ||
| "language_info": { | ||
| "codemirror_mode": { | ||
| "name": "ipython", | ||
| "version": 3 | ||
| }, | ||
| "file_extension": ".py", | ||
| "mimetype": "text/x-python", | ||
| "name": "python", | ||
| "nbconvert_exporter": "python", | ||
| "pygments_lexer": "ipython3", | ||
| "version": "3.6.3" | ||
| } | ||
| }, | ||
| "nbformat": 4, | ||
| "nbformat_minor": 1 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,331 @@ | ||
| { | ||
| "cells": [ | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "# Advanced Topics: Top-K and Self Joins" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Setup" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "import ibis\n", | ||
| "import os\n", | ||
| "hdfs_port = os.environ.get('IBIS_WEBHDFS_PORT', 50070)\n", | ||
| "hdfs = ibis.hdfs_connect(host='quickstart.cloudera', port=hdfs_port)\n", | ||
| "con = ibis.impala.connect(host='quickstart.cloudera', database='ibis_testing',\n", | ||
| " hdfs_client=hdfs)\n", | ||
| "ibis.options.interactive = True" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## \"Top-K\" Filtering\n", | ||
| "\n", | ||
| "\n", | ||
| "A common analytical pattern involves subsetting based on some method of ranking. For example, \"the 5 most frequently occurring widgets in a dataset\". By choosing the right metric, you can obtain the most important or least important items from some dimension, for some definition of important.\n", | ||
| "\n", | ||
| "To carry out the pattern by hand involves the following\n", | ||
| "\n", | ||
| "- Choose a ranking metric\n", | ||
| "- Aggregate, computing the ranking metric, by the target dimension\n", | ||
| "- Order by the ranking metric and take the highest K values\n", | ||
| "- Use those values as a set filter (either with `semi_join` or `isin`) in your next query\n", | ||
| "\n", | ||
| "For example, let's look at the TPC-H tables and find the 5 or 10 customers who placed the most orders over their lifetime:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "orders = con.table('tpch_orders')\n", | ||
| "\n", | ||
| "top_orders = (orders\n", | ||
| " .group_by('o_custkey')\n", | ||
| " .size()\n", | ||
| " .sort_by(('count', False))\n", | ||
| " .limit(5))\n", | ||
| "top_orders" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Now, we could use these customer keys as a filter in some other analysis:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "# Among the top 5 most frequent customers, what's the histogram of their order statuses?\n", | ||
| "analysis = (orders[orders.o_custkey.isin(top_orders.o_custkey)]\n", | ||
| " .group_by('o_orderstatus')\n", | ||
| " .size())\n", | ||
| "analysis" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "This is such a common pattern that Ibis supports a high level primitive `topk` operation, which can be used immediately as a filter:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "top_orders = orders.o_custkey.topk(5)\n", | ||
| "orders[top_orders].group_by('o_orderstatus').size()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "This goes a little further. Suppose now we want to rank customers by their total spending instead of the number of orders, perhaps a more meaningful metric:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "total_spend = orders.o_totalprice.sum().name('total')\n", | ||
| "top_spenders = (orders\n", | ||
| " .group_by('o_custkey')\n", | ||
| " .aggregate(total_spend)\n", | ||
| " .sort_by(('total', False))\n", | ||
| " .limit(5))\n", | ||
| "top_spenders" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "To use another metric, just pass it to the `by` argument in `topk`:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "top_spenders = orders.o_custkey.topk(5, by=total_spend)\n", | ||
| "orders[top_spenders].group_by('o_orderstatus').size()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Self joins\n", | ||
| "\n", | ||
| "\n", | ||
| "If you're a relational data guru, you may have wondered how it's possible to join tables with themselves, because joins clauses involve column references back to the original table.\n", | ||
| "\n", | ||
| "Consider the SQL\n", | ||
| "\n", | ||
| "```sql\n", | ||
| " SELECT t1.key, sum(t1.value - t2.value) AS metric\n", | ||
| " FROM my_table t1\n", | ||
| " JOIN my_table t2\n", | ||
| " ON t1.key = t2.subkey\n", | ||
| " GROUP BY 1\n", | ||
| "```\n", | ||
| " \n", | ||
| "Here, we have an unambiguous way to refer to each of the tables through aliasing." | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Let's consider the TPC-H database, and support we want to compute year-over-year change in total order amounts by region using joins." | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "region = con.table('tpch_region')\n", | ||
| "nation = con.table('tpch_nation')\n", | ||
| "customer = con.table('tpch_customer')\n", | ||
| "orders = con.table('tpch_orders')\n", | ||
| "\n", | ||
| "orders.limit(5)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "First, let's join all the things and select the fields we care about:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "fields_of_interest = [region.r_name.name('region'), \n", | ||
| " nation.n_name.name('nation'),\n", | ||
| " orders.o_totalprice.name('amount'),\n", | ||
| " orders.o_orderdate.cast('timestamp').name('odate') # these are strings\n", | ||
| " ]\n", | ||
| "\n", | ||
| "joined_all = (region.join(nation, region.r_regionkey == nation.n_regionkey)\n", | ||
| " .join(customer, customer.c_nationkey == nation.n_nationkey)\n", | ||
| " .join(orders, orders.o_custkey == customer.c_custkey)\n", | ||
| " [fields_of_interest])" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Okay, great, let's have a look:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "joined_all.limit(5)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Sweet, now let's aggregate by year and region:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "year = joined_all.odate.year().name('year')\n", | ||
| "\n", | ||
| "total = joined_all.amount.sum().cast('double').name('total')\n", | ||
| "\n", | ||
| "annual_amounts = (joined_all\n", | ||
| " .group_by(['region', year])\n", | ||
| " .aggregate(total))\n", | ||
| "annual_amounts" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Looking good so far. Now, we need to join this table on itself, by subtracting 1 from one of the year columns.\n", | ||
| "\n", | ||
| "We do this by creating a \"joinable\" view of a table that is considered a distinct object within Ibis. To do this, use the `view` function:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "current = annual_amounts\n", | ||
| "prior = annual_amounts.view()\n", | ||
| "\n", | ||
| "yoy_change = (current.total - prior.total).name('yoy_change')\n", | ||
| "\n", | ||
| "results = (current.join(prior, ((current.region == prior.region) & \n", | ||
| " (current.year == (prior.year - 1))))\n", | ||
| " [current.region, current.year, yoy_change])\n", | ||
| "df = results.execute()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "df['yoy_pretty'] = df.yoy_change.map(lambda x: '$%.2fmm' % (x / 1000000.))\n", | ||
| "df" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "If you're being fastidious and want to consider the first year occurring in the dataset for each region to have 0 for the prior year, you will instead need to do an outer join and treat nulls in the prior side of the join as zero:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "yoy_change = (current.total - prior.total.zeroifnull()).name('yoy_change')\n", | ||
| "results = (current.outer_join(prior, ((current.region == prior.region) & \n", | ||
| " (current.year == (prior.year - 1))))\n", | ||
| " [current.region, current.year, current.total,\n", | ||
| " prior.total.zeroifnull().name('prior_total'), \n", | ||
| " yoy_change])\n", | ||
| "\n", | ||
| "results.limit(10)" | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "kernelspec": { | ||
| "display_name": "Python 3", | ||
| "language": "python", | ||
| "name": "python3" | ||
| }, | ||
| "language_info": { | ||
| "codemirror_mode": { | ||
| "name": "ipython", | ||
| "version": 3 | ||
| }, | ||
| "file_extension": ".py", | ||
| "mimetype": "text/x-python", | ||
| "name": "python", | ||
| "nbconvert_exporter": "python", | ||
| "pygments_lexer": "ipython3", | ||
| "version": "3.6.3" | ||
| } | ||
| }, | ||
| "nbformat": 4, | ||
| "nbformat_minor": 1 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,359 @@ | ||
| { | ||
| "cells": [ | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "# Advanced Topics: Additional Filtering\n", | ||
| "\n", | ||
| "The filtering examples we've shown to this point have been pretty simple, either comparisons between columns or fixed values, or set filter functions like `isin` and `notin`. \n", | ||
| "\n", | ||
| "Ibis supports a number of richer analytical filters that can involve one or more of:\n", | ||
| "\n", | ||
| "- Aggregates computed from the same or other tables\n", | ||
| "- Conditional aggregates (in SQL-speak these are similar to \"correlated subqueries\")\n", | ||
| "- \"Existence\" set filters (equivalent to the SQL `EXISTS` and `NOT EXISTS` keywords)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Setup" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "import ibis\n", | ||
| "import os\n", | ||
| "hdfs_port = os.environ.get('IBIS_WEBHDFS_PORT', 50070)\n", | ||
| "hdfs = ibis.hdfs_connect(host='quickstart.cloudera', port=hdfs_port)\n", | ||
| "con = ibis.impala.connect(host='quickstart.cloudera', database='ibis_testing',\n", | ||
| " hdfs_client=hdfs)\n", | ||
| "ibis.options.interactive = True" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Using scalar aggregates in filters" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "table = con.table('functional_alltypes')\n", | ||
| "table.limit(5)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "We could always compute some aggregate value from the table and use that in another expression, or we can use a data-derived aggregate in the filter. Take the average of a column for example:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "table.double_col.mean()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "You can use this expression as a substitute for a scalar value in a filter, and the execution engine will combine everything into a single query rather than having to access Impala multiple times:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "cond = table.bigint_col > table.double_col.mean()\n", | ||
| "expr = table[cond & table.bool_col].limit(5)\n", | ||
| "expr" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Conditional aggregates\n", | ||
| "\n", | ||
| "\n", | ||
| "Suppose that we wish to filter using an aggregate computed conditional on some other expressions holding true. Using the TPC-H datasets, suppose that we want to filter customers based on the following criteria: Orders such that their amount exceeds the average amount for their sales region over the whole dataset. This can be computed any numbers of ways (such as joining auxiliary tables and filtering post-join)\n", | ||
| "\n", | ||
| "Again, from prior examples, here are the joined up tables with all the customer data:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "region = con.table('tpch_region')\n", | ||
| "nation = con.table('tpch_nation')\n", | ||
| "customer = con.table('tpch_customer')\n", | ||
| "orders = con.table('tpch_orders')\n", | ||
| "\n", | ||
| "fields_of_interest = [customer,\n", | ||
| " region.r_name.name('region'), \n", | ||
| " orders.o_totalprice,\n", | ||
| " orders.o_orderdate.cast('timestamp').name('odate')]\n", | ||
| "\n", | ||
| "tpch = (region.join(nation, region.r_regionkey == nation.n_regionkey)\n", | ||
| " .join(customer, customer.c_nationkey == nation.n_nationkey)\n", | ||
| " .join(orders, orders.o_custkey == customer.c_custkey)\n", | ||
| " [fields_of_interest])\n", | ||
| "\n", | ||
| "tpch.limit(5)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "In this particular case, filtering based on the conditional average `o_totalprice` by region requires creating a table view (similar to the self-join examples from earlier) that can be treated as a distinct table entity in the expression. This would **not** be required if we were computing a conditional statistic from some other table. So this is a little more complicated than some other cases would be:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "t2 = tpch.view()\n", | ||
| "conditional_avg = t2[(t2.region == tpch.region)].o_totalprice.mean()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Once you've done this, you can use the conditional average in a filter expression" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": { | ||
| "scrolled": true | ||
| }, | ||
| "outputs": [], | ||
| "source": [ | ||
| "amount_filter = tpch.o_totalprice > conditional_avg\n", | ||
| "tpch[amount_filter].limit(10)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "By looking at the table sizes before and after applying the filter you can see the relative size of the subset taken. " | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "tpch.count()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "tpch[amount_filter].count()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Or even group by year and compare before and after:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "tpch.schema()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "year = tpch.odate.year().name('year')\n", | ||
| "\n", | ||
| "pre_sizes = tpch.group_by(year).size()\n", | ||
| "post_sizes = tpch[amount_filter].group_by(year).size().view()\n", | ||
| "\n", | ||
| "percent = ((post_sizes['count'] / pre_sizes['count'].cast('double'))\n", | ||
| " .name('fraction'))\n", | ||
| "\n", | ||
| "expr = (pre_sizes.join(post_sizes, pre_sizes.year == post_sizes.year)\n", | ||
| " [pre_sizes.year, \n", | ||
| " pre_sizes['count'].name('pre_count'),\n", | ||
| " post_sizes['count'].name('post_count'),\n", | ||
| " percent])\n", | ||
| "expr" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## \"Existence\" filters\n", | ||
| "\n", | ||
| "\n", | ||
| "Some filtering involves checking for the existence of a particular value in a column of another table, or amount the results of some value expression. This is common in many-to-many relationships, and can be performed in numerous different ways, but it's nice to be able to express it with a single concise statement and let Ibis compute it optimally.\n", | ||
| "\n", | ||
| "Here's some examples:\n", | ||
| "\n", | ||
| "- Filter down to customers having at least one open order\n", | ||
| "- Find customers having no open orders with 1-URGENT status\n", | ||
| "- Find stores (in the stores table) having the same name as a vendor (in the vendors table).\n", | ||
| "\n", | ||
| "We'll go ahead and solve the first couple of these problems using the TPC-H tables to illustrate the API:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "customer = con.table('tpch_customer')\n", | ||
| "orders = con.table('tpch_orders')" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "orders.limit(5)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "We introduce the `any` reduction:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "has_open_orders = ((orders.o_orderstatus == 'O') & \n", | ||
| " (customer.c_custkey == orders.o_custkey)).any()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "This is now a valid filter:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "customer[has_open_orders].limit(10)" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "For the second example, in which we want to find customers not having any open urgent orders, we write down the condition that they _do_ have some first:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "has_open_urgent_orders = ((orders.o_orderstatus == 'O') & \n", | ||
| " (orders.o_orderpriority == '1-URGENT') & \n", | ||
| " (customer.c_custkey == orders.o_custkey)).any()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Now, we can negate this condition and use it as a filter:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "customer[-has_open_urgent_orders].count()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "In this case, it is true that `customer.c_custkey` has no duplicate values, but that need not be the case. There could be multiple copies of any given value in either table column being compared, and the behavior will be the same (existence or non-existence is verified)." | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "kernelspec": { | ||
| "display_name": "Python 3", | ||
| "language": "python", | ||
| "name": "python3" | ||
| }, | ||
| "language_info": { | ||
| "codemirror_mode": { | ||
| "name": "ipython", | ||
| "version": 3 | ||
| }, | ||
| "file_extension": ".py", | ||
| "mimetype": "text/x-python", | ||
| "name": "python", | ||
| "nbconvert_exporter": "python", | ||
| "pygments_lexer": "ipython3", | ||
| "version": "3.6.3" | ||
| } | ||
| }, | ||
| "nbformat": 4, | ||
| "nbformat_minor": 1 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,292 @@ | ||
| { | ||
| "cells": [ | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "# Additional Analytics Tools" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Setup" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "import ibis\n", | ||
| "import os\n", | ||
| "hdfs_port = os.environ.get('IBIS_WEBHDFS_PORT', 50070)\n", | ||
| "hdfs = ibis.hdfs_connect(host='quickstart.cloudera', port=hdfs_port)\n", | ||
| "con = ibis.impala.connect(host='quickstart.cloudera', database='ibis_testing',\n", | ||
| " hdfs_client=hdfs)\n", | ||
| "ibis.options.interactive = True" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Frequency tables\n", | ||
| "\n", | ||
| "Ibis provides the `value_counts` API, just like pandas, for computing a frequency table for a table column or array expression. You might have seen it used already earlier in the tutorial. " | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "lineitem = con.table('tpch_lineitem')\n", | ||
| "orders = con.table('tpch_orders')\n", | ||
| "\n", | ||
| "items = (orders.join(lineitem, orders.o_orderkey == lineitem.l_orderkey)\n", | ||
| " [lineitem, orders])\n", | ||
| "\n", | ||
| "items.o_orderpriority.value_counts()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "This can be customized, of course:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "freq = (items.group_by(items.o_orderpriority)\n", | ||
| " .aggregate([items.count().name('nrows'),\n", | ||
| " items.l_extendedprice.sum().name('total $')]))\n", | ||
| "freq" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Binning and histograms\n", | ||
| "\n", | ||
| "\n", | ||
| "Numeric array expressions (columns with numeric type and other array expressions) have `bucket` and `histogram` methods which produce different kinds of binning. These produce category values (the computed bins) that can be used in grouping and other analytics.\n", | ||
| "\n", | ||
| "Let's have a look at a few examples\n", | ||
| "\n", | ||
| "I'll use the `summary` function to see the general distribution of lineitem prices in the order data joined above:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "items.l_extendedprice.summary()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Alright then, now suppose we want to split the item prices up into some buckets of our choosing:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "buckets = [0, 5000, 10000, 50000, 100000]" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "The `bucket` function creates a bucketed category from the prices:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "bucketed = items.l_extendedprice.bucket(buckets).name('bucket')" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Let's have a look at the value counts:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "bucketed.value_counts()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "The buckets we wrote down define 4 buckets numbered 0 through 3. The `NaN` is a pandas `NULL` value (since that's how pandas represents nulls in numeric arrays), so don't worry too much about that. Since the bucketing ends at 100000, we see there are 4122 values that are over 100000. These can be included in the bucketing with `include_over`:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "bucketed = (items.l_extendedprice\n", | ||
| " .bucket(buckets, include_over=True)\n", | ||
| " .name('bucket'))\n", | ||
| "bucketed.value_counts()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "The `bucketed` object here is a special **_category_** type" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "bucketed.type()" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Category values can either have a known or unknown **_cardinality_**. In this case, there's either 4 or 5 buckets based on how we used the `bucket` function.\n", | ||
| "\n", | ||
| "Labels can be assigned to the buckets at any time using the `label` function:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "bucket_counts = bucketed.value_counts()\n", | ||
| "\n", | ||
| "labeled_bucket = (bucket_counts.bucket\n", | ||
| " .label(['0 to 5000', '5000 to 10000', '10000 to 50000',\n", | ||
| " '50000 to 100000', 'Over 100000'])\n", | ||
| " .name('bucket_name'))\n", | ||
| "\n", | ||
| "expr = (bucket_counts[labeled_bucket, bucket_counts]\n", | ||
| " .sort_by('bucket'))\n", | ||
| "expr" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "Nice, huh?\n", | ||
| "\n", | ||
| "`histogram` is a linear (fixed size bin) equivalent:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "t = con.table('functional_alltypes')\n", | ||
| "\n", | ||
| "d = t.double_col\n", | ||
| "\n", | ||
| "tier = d.histogram(10).name('hist_bin')\n", | ||
| "expr = (t.group_by(tier)\n", | ||
| " .aggregate([d.min(), d.max(), t.count()])\n", | ||
| " .sort_by('hist_bin'))\n", | ||
| "expr" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "markdown", | ||
| "metadata": {}, | ||
| "source": [ | ||
| "## Filtering in aggregations\n", | ||
| "\n", | ||
| "\n", | ||
| "Suppose that you want to compute an aggregation with a subset of the data for _only one_ of the metrics / aggregates in question, and the complete data set with the other aggregates. Most aggregation functions are thus equipped with a `where` argument. Let me show it to you in action:" | ||
| ] | ||
| }, | ||
| { | ||
| "cell_type": "code", | ||
| "execution_count": null, | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": [ | ||
| "t = con.table('functional_alltypes')\n", | ||
| "\n", | ||
| "d = t.double_col\n", | ||
| "s = t.string_col\n", | ||
| "\n", | ||
| "cond = s.isin(['3', '5', '7'])\n", | ||
| "\n", | ||
| "metrics = [t.count().name('# rows total'), \n", | ||
| " cond.sum().name('# selected'),\n", | ||
| " d.sum().name('total'),\n", | ||
| " d.sum(where=cond).name('selected total')]\n", | ||
| "\n", | ||
| "color = (t.float_col\n", | ||
| " .between(3, 7)\n", | ||
| " .ifelse('red', 'blue')\n", | ||
| " .name('color'))\n", | ||
| "\n", | ||
| "t.group_by(color).aggregate(metrics)" | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "kernelspec": { | ||
| "display_name": "Python 3", | ||
| "language": "python", | ||
| "name": "python3" | ||
| }, | ||
| "language_info": { | ||
| "codemirror_mode": { | ||
| "name": "ipython", | ||
| "version": 3 | ||
| }, | ||
| "file_extension": ".py", | ||
| "mimetype": "text/x-python", | ||
| "name": "python", | ||
| "nbconvert_exporter": "python", | ||
| "pygments_lexer": "ipython3", | ||
| "version": "3.6.3" | ||
| } | ||
| }, | ||
| "nbformat": 4, | ||
| "nbformat_minor": 1 | ||
| } |