| @@ -0,0 +1,67 @@ | ||
| name: Conventional commits check | ||
|
|
||
| on: | ||
| # runs on `pull_request_target` events so that commenting on the PR is allowed | ||
| pull_request_target: | ||
| types: [opened, edited, synchronize, reopened] | ||
|
|
||
| jobs: | ||
| commitlint: | ||
| name: Check PR title conforms to semantic-release | ||
| runs-on: ubuntu-latest | ||
| permissions: | ||
| issues: write | ||
| pull-requests: write | ||
| steps: | ||
| - name: install node | ||
| uses: actions/setup-node@v4 | ||
| with: | ||
| node-version: "20" | ||
|
|
||
| - name: checkout code to pick up commitlint configuration | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| ref: ${{ github.event.pull_request.head.sha }} | ||
|
|
||
| - name: install deps | ||
| run: npm install "@commitlint/config-conventional" | ||
|
|
||
| - name: run commitlint | ||
| id: lint | ||
| run: | | ||
| failed=0 | ||
| if ! npx commitlint --extends "@commitlint/config-conventional" --verbose <<< "$COMMIT_MSG"; then | ||
| failed=1 | ||
| fi | ||
| echo "failed=$failed" >> "$GITHUB_OUTPUT" | ||
| env: | ||
| COMMIT_MSG: | | ||
| ${{ github.event.pull_request.title }} | ||
| ${{ github.event.pull_request.body }} | ||
| - name: find existing comment | ||
| if: steps.lint.outputs.failed == '1' | ||
| uses: peter-evans/find-comment@v3 | ||
| id: fc | ||
| with: | ||
| issue-number: ${{ github.event.pull_request.number }} | ||
| body-includes: "ACTION NEEDED" | ||
|
|
||
| - name: post a message if the pull request title and body fail `commitlint` | ||
| if: steps.lint.outputs.failed == '1' && steps.fc.outputs.comment-body == '' | ||
| uses: peter-evans/create-or-update-comment@v4 | ||
| with: | ||
| issue-number: ${{ github.event.pull_request.number }} | ||
| body: | | ||
| **ACTION NEEDED** | ||
| Ibis follows the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) for release automation. | ||
| The PR title and description are used as the merge commit message. | ||
| Please update your PR title and description to match the specification. | ||
| - name: fail the check if commitlint failed | ||
| if: steps.lint.outputs.failed == '1' | ||
| run: exit 1 # templating not allowed here it seems |
| @@ -0,0 +1,11 @@ | ||
| # Security Policy | ||
|
|
||
| ## Supported Versions | ||
|
|
||
| Security updates are provided by releasing a new version of Ibis. | ||
|
|
||
| ## Reporting a Vulnerability | ||
|
|
||
| - Send security reports to security@ibis-project.org | ||
| - Vulnerability reports are published on GitHub at https://github.com/ibis-project/ibis/security/advisories | ||
- If a vulnerability is accepted, we will attempt to address it as soon as possible by cutting a new release.
| @@ -0,0 +1,2 @@ | ||
| [FreeTDS] | ||
| Driver = libtdsodbc.so |
| @@ -0,0 +1,177 @@ | ||
| SET RW_IMPLICIT_FLUSH=true; | ||
|
|
||
| DROP TABLE IF EXISTS diamonds CASCADE; | ||
|
|
||
| CREATE TABLE diamonds ( | ||
| carat FLOAT, | ||
| cut TEXT, | ||
| color TEXT, | ||
| clarity TEXT, | ||
| depth FLOAT, | ||
| "table" FLOAT, | ||
| price BIGINT, | ||
| x FLOAT, | ||
| y FLOAT, | ||
| z FLOAT | ||
| ) WITH ( | ||
| connector = 'posix_fs', | ||
| match_pattern = 'diamonds.csv', | ||
| posix_fs.root = '/data', | ||
| ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' ); | ||
|
|
||
| DROP TABLE IF EXISTS astronauts CASCADE; | ||
|
|
||
| CREATE TABLE astronauts ( | ||
| "id" BIGINT, | ||
| "number" BIGINT, | ||
| "nationwide_number" BIGINT, | ||
| "name" VARCHAR, | ||
| "original_name" VARCHAR, | ||
| "sex" VARCHAR, | ||
| "year_of_birth" BIGINT, | ||
| "nationality" VARCHAR, | ||
| "military_civilian" VARCHAR, | ||
| "selection" VARCHAR, | ||
| "year_of_selection" BIGINT, | ||
| "mission_number" BIGINT, | ||
| "total_number_of_missions" BIGINT, | ||
| "occupation" VARCHAR, | ||
| "year_of_mission" BIGINT, | ||
| "mission_title" VARCHAR, | ||
| "ascend_shuttle" VARCHAR, | ||
| "in_orbit" VARCHAR, | ||
| "descend_shuttle" VARCHAR, | ||
| "hours_mission" DOUBLE PRECISION, | ||
| "total_hrs_sum" DOUBLE PRECISION, | ||
| "field21" BIGINT, | ||
| "eva_hrs_mission" DOUBLE PRECISION, | ||
| "total_eva_hrs" DOUBLE PRECISION | ||
| ) WITH ( | ||
| connector = 'posix_fs', | ||
| match_pattern = 'astronauts.csv', | ||
| posix_fs.root = '/data', | ||
| ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' ); | ||
|
|
||
| DROP TABLE IF EXISTS batting CASCADE; | ||
|
|
||
| CREATE TABLE batting ( | ||
| "playerID" TEXT, | ||
| "yearID" BIGINT, | ||
| stint BIGINT, | ||
| "teamID" TEXT, | ||
| "lgID" TEXT, | ||
| "G" BIGINT, | ||
| "AB" BIGINT, | ||
| "R" BIGINT, | ||
| "H" BIGINT, | ||
| "X2B" BIGINT, | ||
| "X3B" BIGINT, | ||
| "HR" BIGINT, | ||
| "RBI" BIGINT, | ||
| "SB" BIGINT, | ||
| "CS" BIGINT, | ||
| "BB" BIGINT, | ||
| "SO" BIGINT, | ||
| "IBB" BIGINT, | ||
| "HBP" BIGINT, | ||
| "SH" BIGINT, | ||
| "SF" BIGINT, | ||
| "GIDP" BIGINT | ||
| ) WITH ( | ||
| connector = 'posix_fs', | ||
| match_pattern = 'batting.csv', | ||
| posix_fs.root = '/data', | ||
| ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' ); | ||
|
|
||
| DROP TABLE IF EXISTS awards_players CASCADE; | ||
|
|
||
| CREATE TABLE awards_players ( | ||
| "playerID" TEXT, | ||
| "awardID" TEXT, | ||
| "yearID" BIGINT, | ||
| "lgID" TEXT, | ||
| tie TEXT, | ||
| notes TEXT | ||
| ) WITH ( | ||
| connector = 'posix_fs', | ||
| match_pattern = 'awards_players.csv', | ||
| posix_fs.root = '/data', | ||
| ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' ); | ||
|
|
||
| DROP TABLE IF EXISTS functional_alltypes CASCADE; | ||
|
|
||
| CREATE TABLE functional_alltypes ( | ||
| id INTEGER, | ||
| bool_col BOOLEAN, | ||
| tinyint_col SMALLINT, | ||
| smallint_col SMALLINT, | ||
| int_col INTEGER, | ||
| bigint_col BIGINT, | ||
| float_col REAL, | ||
| double_col DOUBLE PRECISION, | ||
| date_string_col TEXT, | ||
| string_col TEXT, | ||
| timestamp_col TIMESTAMP WITHOUT TIME ZONE, | ||
| year INTEGER, | ||
| month INTEGER | ||
| ) WITH ( | ||
| connector = 'posix_fs', | ||
| match_pattern = 'functional_alltypes.csv', | ||
| posix_fs.root = '/data', | ||
| ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' ); | ||
|
|
||
| DROP TABLE IF EXISTS tzone CASCADE; | ||
|
|
||
| CREATE TABLE tzone ( | ||
| ts TIMESTAMP WITH TIME ZONE, | ||
| key TEXT, | ||
| value DOUBLE PRECISION | ||
| ); | ||
|
|
||
| INSERT INTO tzone | ||
| SELECT | ||
| CAST('2017-05-28 11:01:31.000400' AS TIMESTAMP WITH TIME ZONE) + | ||
| t * INTERVAL '1 day 1 second' AS ts, | ||
| CHR(97 + t) AS key, | ||
| t + t / 10.0 AS value | ||
| FROM generate_series(0, 9) AS t; | ||
|
|
||
| DROP TABLE IF EXISTS array_types CASCADE; | ||
|
|
||
| CREATE TABLE IF NOT EXISTS array_types ( | ||
| x BIGINT[], | ||
| y TEXT[], | ||
| z DOUBLE PRECISION[], | ||
| grouper TEXT, | ||
| scalar_column DOUBLE PRECISION, | ||
| multi_dim BIGINT[][] | ||
| ); | ||
|
|
||
| INSERT INTO array_types VALUES | ||
| (ARRAY[1, 2, 3], ARRAY['a', 'b', 'c'], ARRAY[1.0, 2.0, 3.0], 'a', 1.0, ARRAY[ARRAY[NULL::BIGINT, NULL, NULL], ARRAY[1, 2, 3]]), | ||
| (ARRAY[4, 5], ARRAY['d', 'e'], ARRAY[4.0, 5.0], 'a', 2.0, ARRAY[]::BIGINT[][]), | ||
| (ARRAY[6, NULL], ARRAY['f', NULL], ARRAY[6.0, NULL], 'a', 3.0, ARRAY[NULL, ARRAY[]::BIGINT[], NULL]), | ||
| (ARRAY[NULL, 1, NULL], ARRAY[NULL, 'a', NULL], ARRAY[]::DOUBLE PRECISION[], 'b', 4.0, ARRAY[ARRAY[1], ARRAY[2], ARRAY[NULL::BIGINT], ARRAY[3]]), | ||
| (ARRAY[2, NULL, 3], ARRAY['b', NULL, 'c'], NULL, 'b', 5.0, NULL), | ||
| (ARRAY[4, NULL, NULL, 5], ARRAY['d', NULL, NULL, 'e'], ARRAY[4.0, NULL, NULL, 5.0], 'c', 6.0, ARRAY[ARRAY[1, 2, 3]]); | ||
|
|
||
| DROP TABLE IF EXISTS json_t CASCADE; | ||
|
|
||
| CREATE TABLE IF NOT EXISTS json_t (js JSONB); | ||
|
|
||
| INSERT INTO json_t VALUES | ||
| ('{"a": [1,2,3,4], "b": 1}'), | ||
| ('{"a":null,"b":2}'), | ||
| ('{"a":"foo", "c":null}'), | ||
| ('null'), | ||
| ('[42,47,55]'), | ||
| ('[]'); | ||
|
|
||
| DROP TABLE IF EXISTS win CASCADE; | ||
| CREATE TABLE win (g TEXT, x BIGINT, y BIGINT); | ||
| INSERT INTO win VALUES | ||
| ('a', 0, 3), | ||
| ('a', 1, 2), | ||
| ('a', 2, 0), | ||
| ('a', 3, 1), | ||
| ('a', 4, 1); |
| @@ -0,0 +1,32 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import os | ||
|
|
||
| import hypothesis as h | ||
|
|
||
| # setup hypothesis profiles | ||
| h.settings.register_profile( | ||
| "ci", | ||
| max_examples=1000, | ||
| suppress_health_check=[h.HealthCheck.too_slow], | ||
| deadline=None, | ||
| ) | ||
| h.settings.register_profile( | ||
| "dev", | ||
| max_examples=50, | ||
| suppress_health_check=[h.HealthCheck.too_slow], | ||
| deadline=None, | ||
| ) | ||
| h.settings.register_profile( | ||
| "debug", | ||
| max_examples=10, | ||
| verbosity=h.Verbosity.verbose, | ||
| suppress_health_check=[h.HealthCheck.too_slow], | ||
| deadline=None, | ||
| ) | ||
|
|
||
# load the default hypothesis profile: either set the HYPOTHESIS_PROFILE
# environment variable or pass the --hypothesis-profile option to pytest; to
# see the generated examples try:
#   pytest -sv --hypothesis-profile=debug
| h.settings.load_profile(os.environ.get("HYPOTHESIS_PROFILE", "dev")) |
| @@ -1,3 +1,3 @@ | ||
| FROM flink:1.18.1 | ||
| # ibis-flink requires PyFlink dependency | ||
| RUN wget -nv -P $FLINK_HOME/lib/ https://repo1.maven.org/maven2/org/apache/flink/flink-python/1.18.1/flink-python-1.18.1.jar |
| @@ -0,0 +1,81 @@ | ||
| <?xml version="1.0"?> | ||
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | ||
| <!-- | ||
| Licensed to the Apache Software Foundation (ASF) under one or more | ||
| contributor license agreements. See the NOTICE file distributed with | ||
| this work for additional information regarding copyright ownership. | ||
| The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| (the "License"); you may not use this file except in compliance with | ||
| the License. You may obtain a copy of the License at | ||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| --> | ||
| <!-- | ||
| Hive configuration for Impala quickstart docker cluster. | ||
| --> | ||
| <configuration> | ||
| <property> | ||
| <!-- Required for automatic metadata sync. --> | ||
| <name>hive.metastore.dml.events</name> | ||
| <value>true</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <!-- User impala is not authorized to consume notifications by default, disable | ||
| authentication to work around this. --> | ||
| <name>hive.metastore.event.db.notification.api.auth</name> | ||
| <value>false</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <name>hive.metastore.uris</name> | ||
| <value>thrift://impala-hive-metastore:9083</value> | ||
| </property> | ||
|
|
||
| <!-- Managed and external tablespaces must live on the Docker volumes that we | ||
| configure for the cluster. --> | ||
| <property> | ||
| <name>hive.metastore.warehouse.dir</name> | ||
| <value>/user/hive/warehouse/managed</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <name>hive.metastore.warehouse.external.dir</name> | ||
| <value>/user/hive/warehouse/external</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <!-- Required to enable Hive transactions --> | ||
| <name>hive.support.concurrency</name> | ||
| <value>true</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <!-- Required to enable Hive transactions --> | ||
| <name>hive.txn.manager</name> | ||
| <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value> | ||
| </property> | ||
|
|
||
| <!-- Hive stats autogathering negatively affects latency of DDL operations, etc and | ||
| is not particularly useful for Impala --> | ||
| <property> | ||
| <name>hive.stats.autogather</name> | ||
| <value>false</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <name>hive.compactor.initiator.on</name> | ||
| <value>true</value> | ||
| </property> | ||
|
|
||
| <property> | ||
| <name>hive.compactor.worker.threads</name> | ||
| <value>1</value> | ||
| </property> | ||
| </configuration> |
| @@ -1,8 +1,2 @@ | ||
| FROM postgis/postgis:15-3.3-alpine | ||
| RUN apk add --no-cache postgresql15-plpython3 |
| @@ -0,0 +1,64 @@ | ||
| # Flink | ||
|
|
||
| [https://nightlies.apache.org/flink/flink-docs-stable/](https://nightlies.apache.org/flink/flink-docs-stable/) | ||
|
|
||
|    | ||
|
|
||
| ## Install | ||
|
|
||
| Install Ibis and dependencies for the Flink backend: | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## `pip` | ||
|
|
||
| Install alongside the `apache-flink` package: | ||
|
|
||
| ```{.bash} | ||
| pip install ibis-framework apache-flink | ||
| ``` | ||
|
|
||
| And connect: | ||
|
|
||
| ```{.python} | ||
| import ibis | ||
| con = ibis.flink.connect() # <1> | ||
| ``` | ||
|
|
||
| 1. Adjust connection parameters as needed. | ||
|
|
||
| ::: | ||
|
|
||
| ## Connect | ||
|
|
||
| ### `ibis.flink.connect` | ||
|
|
||
| ```python | ||
| con = ibis.flink.connect(table_env=table_env) | ||
| ``` | ||
|
|
||
| ::: {.callout-note} | ||
| `ibis.flink.connect` is a thin wrapper around [`ibis.backends.flink.Backend.do_connect`](#ibis.backends.flink.Backend.do_connect). | ||
| ::: | ||
|
|
||
| ::: {.callout-note} | ||
| The `flink` backend does not create `TableEnvironment` objects; you must create a `TableEnvironment` and pass that to `ibis.flink.connect`. | ||
| ::: | ||
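
As a concrete starting point, here is a minimal sketch of creating a
`TableEnvironment` with PyFlink and passing it to Ibis (depending on your
setup you may instead build a `StreamTableEnvironment` from a
`StreamExecutionEnvironment`):

```{.python}
from pyflink.table import EnvironmentSettings, TableEnvironment

import ibis

# create a TableEnvironment in streaming mode (batch mode also works)
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)

# hand the environment to Ibis
con = ibis.flink.connect(table_env=table_env)
```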
|
|
||
| ### Connection Parameters | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| from _utils import render_do_connect | ||
| render_do_connect("flink") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| BACKEND = "Flink" | ||
| ``` | ||
|
|
||
| {{< include ./_templates/api.qmd >}} |
| @@ -0,0 +1,115 @@ | ||
| # Composable data ecosystem | ||
|
|
||
| Ibis exists in a broader composable data ecosystem. [The Composable Codex by | ||
| Voltron Data](https://voltrondata.com/codex) is the result of years of | ||
| experience and hours of writing by experts in the field, providing an in-depth | ||
introduction to composable data systems. We'll take a look at:
|
|
||
| - [Apache Arrow](https://github.com/apache/arrow) | ||
| - [Apache Arrow Database Connectivity (ADBC)](https://arrow.apache.org/adbc/current/index.html) | ||
| - [Substrait](https://substrait.io/) | ||
|
|
||
| and how they fit in with Ibis. | ||
|
|
||
| ## Overview | ||
|
|
||
| Ibis is the portable Python dataframe API, supporting many backends. This is | ||
| achieved by decoupling the API from the execution engine. While Ibis already | ||
relies on standards like Apache Arrow today, we expect the composable data
ecosystem to mature and adoption of adjacent projects to broaden going forward.
| This will allow Ibis to simplify its implementation and improve performance for | ||
| backends that support these standards. | ||
|
|
||
| [The first chapter of The Composable Codex provides a great overview of where | ||
| these projects fit | ||
| in](https://voltrondata.com/codex/standards-over-silos#1-2-3-a-composable-ecosystem): | ||
|
|
||
|  | ||
|
|
||
| And a table explaining the standards: | ||
|
|
||
| | Label | Types of standards | Standards | | ||
| | --- | --- | --- | | ||
| A | Intermediate representation | [Substrait](https://substrait.io) allows any user interface that produces Substrait to pass compute operations to a Substrait-consuming execution engine. You can swap in any Substrait-compatible user interface or execution engine. |
| B | Connectivity | [Arrow Database Connectivity (ADBC)](https://arrow.apache.org/adbc/current/index.html) ensures that no matter where the computation is performed, the data is returned in the Arrow format. You can swap your execution engine and know that your downstream code will still work. |
| C | Data memory layout | The [Apache Arrow in-memory data format](https://arrow.apache.org/docs/format/Columnar.html) ensures that data can pass from storage to the engine (and even across the systems in a distributed environment) and back to the user without slowing down to serialize and deserialize. |
|
|
||
| ## History | ||
|
|
||
| The composable data ecosystem has been envisioned for some time. [Wes | ||
| McKinney](https://wesmckinney.com) has been instrumental in the development of | ||
the composable data ecosystem, co-founding Voltron Data, co-creating Apache
Arrow, and creating Ibis. [Wes looked back on 15 years on the road to composable
data systems](https://wesmckinney.com/blog/looking-back-15-years/) and gave some
of the motivation for Ibis in his well-known ["Apache Arrow and the '10 Things I Hate
About pandas'"](https://wesmckinney.com/blog/apache-arrow-pandas-internals/).
|
|
||
| Ibis started as a pandas-like API for Apache Impala, but has since expanded to | ||
| support many backends. It currently leverages open-source projects like | ||
| [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy) and | ||
| [SQLGlot](https://github.com/tobymao/sqlglot) to work with many backends. While | ||
| these projects are great, they rely on backend-specific SQL that does not | ||
| constitute a standard. Going forward, we expect ADBC and Substrait to be the | ||
| standards for connectivity and intermediate representation, respectively. | ||
|
|
||
| ## Apache Arrow | ||
|
|
||
| Ibis uses [Apache Arrow](https://arrow.apache.org/) to provide a common data | ||
| format for data interchange between Ibis and backends. Many backends also use | ||
Apache Arrow as their in-memory data format.
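
As a small illustration, Ibis expressions can be materialized directly as
Arrow data. This sketch assumes the default in-process backend (DuckDB) is
installed:

```python
import ibis

t = ibis.memtable({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# execute the expression and get the result back as a pyarrow.Table
arrow_table = t.filter(t.a > 1).to_pyarrow()
print(arrow_table.schema)
```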
|
|
||
| ### Dataframe interchange protocol | ||
|
|
||
| Ibis supports [the dataframe interchange | ||
| protocol](https://data-apis.org/dataframe-protocol/latest/purpose_and_scope.html) | ||
for data interchange with other Python dataframe libraries and
visualization libraries. This is efficient largely because most of these
libraries support Apache Arrow as their in-memory data format, making
interchange between them cheap.
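
For example, a consumer such as pandas can ingest an Ibis table through the
protocol. This is a sketch that assumes the installed versions of Ibis and
pandas both speak the interchange protocol:

```python
import ibis
import pandas as pd

t = ibis.memtable({"a": [1, 2, 3]})

# any consumer of the interchange protocol can read the Ibis table via
# its __dataframe__ method
df = pd.api.interchange.from_dataframe(t)
```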
|
|
||
| ## Apache Arrow Database Connectivity (ADBC) | ||
|
|
||
| [Apache Arrow Database Connectivity | ||
| (ADBC)](https://arrow.apache.org/docs/format/ADBC.html) is a relatively new | ||
standard for database connectivity: an API for exchanging data between a
client and a database. It is a successor to ODBC and JDBC, designed to be
a more modern and performant alternative to those standards.
|
|
||
| While Ibis does not currently use ADBC for its backends, as the project matures | ||
| we expect an increase in performance and a decrease in complexity for backends | ||
| that support ADBC. | ||
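
To make the idea concrete, here is a standalone sketch of ADBC usage
(independent of Ibis) with the SQLite driver; it assumes the
`adbc-driver-sqlite` package is installed:

```python
import adbc_driver_sqlite.dbapi

# connect to an in-memory SQLite database through ADBC
conn = adbc_driver_sqlite.dbapi.connect()
cur = conn.cursor()
cur.execute("SELECT 1 AS answer")

# results come back as Arrow data
print(cur.fetch_arrow_table())

cur.close()
conn.close()
```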
|
|
||
| ## Substrait | ||
|
|
||
| [Substrait](https://substrait.io/) is a relatively new standard for | ||
| cross-language serialization of relational algebra. It is intended as an | ||
intermediate representation between the user interface and other points in the
| data system. Ibis can already compile expressions to Substrait which can then be | ||
| executed by Substrait-consuming backends. Support today is limited but, like | ||
| ADBC, we expect the project to mature and for Ibis to leverage it more in the | ||
| future. | ||
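
As a sketch of what this looks like today, the separate
[`ibis-substrait`](https://github.com/ibis-project/ibis-substrait) package
compiles Ibis expressions into Substrait plans (the API may change as the
project evolves):

```python
import ibis
from ibis_substrait.compiler.core import SubstraitCompiler

t = ibis.table({"a": "int64", "b": "string"}, name="t")
expr = t.group_by("b").aggregate(total=t.a.sum())

# compile the Ibis expression to a Substrait plan (a protobuf message)
compiler = SubstraitCompiler()
plan = compiler.compile(expr)
```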
|
|
||
| ### Why not SQL? | ||
|
|
||
| Structured Query Language (SQL) is not a standard. There is a commonly | ||
| referenced [ANSI Standard for | ||
| SQL](https://blog.ansi.org/sql-standard-iso-iec-9075-2023-ansi-x3-135) that you | ||
can pay a lot of money to access and that most execution engines claim to support.
However, most execution engines extend or subtly deviate from the standard, and
| in practice it is not possible to simply reuse SQL from one execution engine on | ||
| another. This leads to notoriously difficult database migrations and vendor | ||
| lock-in. | ||
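
To get a feel for the divergence, the same Ibis expression renders as
different SQL for different engines. A sketch using `ibis.to_sql` with
explicit dialects:

```python
import ibis

t = ibis.table({"s": "string"}, name="t")
expr = t.select(n=t.s.length())

# the same expression, rendered for two different engines
print(ibis.to_sql(expr, dialect="duckdb"))
print(ibis.to_sql(expr, dialect="mssql"))
```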
|
|
||
| ### Why Substrait? | ||
|
|
||
| Substrait, unlike SQL, is not intended as a user interface. Instead, a user | ||
| interface like Ibis in Python or dplyr in R would compile to Substrait and pass | ||
| it to a Substrait-consuming execution engine. This allows the user interface to | ||
| be decoupled from the execution engine, allowing for more flexibility and | ||
| portability. | ||
|
|
||
| ## Going forward | ||
|
|
||
| Going forward, Ibis intends to leverage other standards in the broader | ||
| composable data ecosystem to simplify its implementation and improve | ||
| performance. |
| @@ -0,0 +1,82 @@ | ||
| # User testimonials | ||
|
|
||
This page collects user testimonials about Ibis from the community! They may
have been lightly edited for clarity, with the originals linked.
|
|
||
| ## From the community | ||
|
|
||
| From [Nick Crews on | ||
| GitHub](https://github.com/ibis-project/ibis/issues/7743#issuecomment-1856391530): | ||
|
|
||
| > I have been very impressed with the responsiveness of the team. When I report | ||
| > bugs they are usually addressed within the next release in the next 1-2 months, | ||
| > and some feature requests of mine have been implemented with little convincing | ||
| > needed. The detailed CHANGELOG has made version transitions fairly easy, though | ||
| > there has been some bulk refactoring occasionally needed, but through the last 3 | ||
| > major version upgrades I've gone through the process has never been that bad. | ||
| > Nothing but good things to say :), I hope you join along! | ||

---
|
|
||
| From [Daniel Kim on | ||
| Zulip](https://ibis-project.zulipchat.com/#narrow/stream/405263-general/topic/.E2.9C.94.20ibis-on-reddit/near/407807779): | ||
|
|
||
| > We have a production DB2 server that is already under a heavy load. So what I've | ||
| > done was extract a subset of the data it has locally onto my machine and then | ||
| > use ibis w/duckdb backend to perform ad-hoc analysis on this local data which is | ||
| > a bit too big for pandas, instead of hammering the production server. Often | ||
| > times, I don't know what queries I'll be building or what kind of rabbit hole my | ||
| > analysis may take me. So it's great that I can just query away with my local | ||
| > data. Performance has been great. | ||

[And later in the same
topic](https://ibis-project.zulipchat.com/#narrow/stream/405263-general/topic/.E2.9C.94.20ibis-on-reddit/near/407813572):
|
|
||
| > ...I have a lot of "medium" data that I need to work with locally, and so Ibis | ||
| > has been perfect for my use cases. We have this metric called cumulative defect | ||
| > rate that I need to forecast. It requires making cumulative sums and then having | ||
| > to pivot this data, along with some wonky transformations requiring UDFs. The | ||
| > need to dynamically pivot this data is where I turn to Ibis. Love that with | ||
| > Ibis, I can use SQL for the heavy lifting or aggregations, and then being able | ||
| > to switch to dataframe-like API for the type of dynamic transformations (pivot, | ||
| > forward fill, etc) that would otherwise be tedious to do in pure SQL. | ||

---
|
|
||
| From [stereoF on | ||
| GitHub](https://github.com/ibis-project/ibis/issues/7341#issuecomment-1760625921): | ||
|
|
||
| > My story around pyspark -> trying a bunch of stuff -> Ibis, which has feature of | ||
| > lazy computation. | ||
| > Our company has implemented an OLAP platform with its persistence layer on hdfs | ||
| > and the query engine being Presto. Typically, the OLAP platform is geared | ||
| > towards agile analysis, and its table structure is based on an event-driven | ||
| > model. As we delve deeper into machine learning modeling, we often need to | ||
| > transition from this event-based structure to a wide-table feature construction. | ||
| > Back between 2019 and 2020, I worked on a similar OLAP platform during my tenure | ||
| > at Tencent. I developed some generic analysis model tools, and at that time, the | ||
| > query engine was Impala. My approach was to dynamically concatenate SQL, which | ||
| > unfortunately was not conducive to code encapsulation, modularization, and | ||
| > future maintenance. | ||
| > In my pursuit of better code encapsulation and to decouple different parts of | ||
| > logic, I was initially inclined to use PySpark. However, when PySpark connects | ||
| > to Presto via JDBC, if we use the dataframe interface, the aggregation | ||
| > operations run on Spark. This doesn't harness the full power of Presto, leading | ||
| > to slow performances. On the other hand, if we use Spark's SQL interface, | ||
| > aggregation is processed on Presto. But in doing so, we lose the original intent | ||
| > of using Spark - which is better code encapsulation and the decoupling of | ||
| > different processes. | ||
| > The dataframe interface of Ibis and its feature of lazy computation perfectly | ||
| > align with my needs. In fact, back in 2019, I was on the hunt for such a tool. | ||
| > Sadly, I didn't come across Ibis at that time and even contemplated creating a | ||
| > set on my own. | ||

## Have a story to share?
|
|
||
| Let us know! We'd love to include it here. Please share your experience with | ||
| Ibis [in our Zulip community chat](https://ibis-project.zulipchat.com) and make | ||
| a PR to this page (or ask us to do it for you). |
| @@ -0,0 +1,63 @@ | ||
| # Who supports Ibis? | ||
|
|
||
| Ibis is an open-source project that welcomes contributions from anyone! We have | ||
| a growing community of users and contributors, and we'd love to have you join | ||
| us. If you're interested in contributing, please see our [contributing | ||
| guide](/contribute). | ||
|
|
||
| ## Voltron Data | ||
|
|
||
| [Voltron Data](https://voltrondata.com) is the primary sponsor of Ibis, with | ||
| most of the core development team employed there. As of writing, this includes | ||
| five full-time developers, one technical product manager, and other staff who | ||
| contribute to Ibis. | ||
|
|
||
| ## Other companies | ||
|
|
||
| Ibis is used by many other companies, with various tools built on top of it. | ||
| Some include: | ||
|
|
||
| - [Google BigQuery DataFrames](https://github.com/googleapis/python-bigquery-dataframes), a clone of the pandas API built on Ibis | ||
| - [Starburst Galaxy Python DataFrames](https://www.starburst.io/blog/introducing-python-dataframes/), with support for Ibis | ||
| - [Claypot AI's contribution of the Flink backend](https://github.com/claypotai/ibis-flink-example), working in collaboration with Voltron Data | ||
| - [Microsoft's Magpie project](https://www.microsoft.com/en-us/research/project/magpie-2/), built on top of Ibis | ||
| - [SuperDuperDB](https://github.com/SuperDuperDB/superduperdb), bringing AI to any backend Ibis supports | ||
|
|
||
| Ibis is also contributed to by other companies. You can [look through the full | ||
| list of contributors on | ||
| GitHub](https://github.com/ibis-project/ibis/graphs/contributors). | ||
|
|
||
| ## History | ||
|
|
||
| Ibis was originally created by [Wes McKinney](https://wesmckinney.com/). Wes | ||
created pandas, co-created Apache Arrow, and co-founded Voltron Data (among
| other things). Ibis was initially a pandas-like dataframe library for Apache | ||
Impala, but has since grown to support many other backends and matured under the
| stewardship of [Phillip Cloud](https://github.com/cpcloud) and others on the | ||
| Ibis team. | ||
|
|
||
| The Ibis project is part of a broader composable data ecosystem envisioned by | ||
| Wes, Voltron Data, and others to solve problems seen throughout the space that | ||
| are compounding as data volume and AI complexity increase. Some good background | ||
| material on the composable data ecosystem and Ibis can be found at: | ||
|
|
||
| - ["Apache Arrow and the '10 Things I Hate About pandas'" by Wes](https://wesmckinney.com/blog/apache-arrow-pandas-internals/) | ||
| - ["The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future" by Wes](https://wesmckinney.com/blog/looking-back-15-years/) | ||
| - ["The Composable Codex" by Voltron Data](https://voltrondata.com/codex) | ||
|
|
||
| ## Support for production workloads | ||
|
|
||
| Voltron Data is committed to the success of Ibis, and it's already in production | ||
across numerous enterprises. The API is stable, and while there are breaking
changes across major versions, we do our best to minimize them and to make
migration easy.
|
|
||
| [Voltron Data offers commercial support for | ||
| Ibis](https://voltrondata.com/enterprise-support) if you're interested. | ||
| Otherwise, interacting through the open-source project channels | ||
| ([GitHub](https://github.com/ibis-project/ibis) and | ||
| [Zulip](https://ibis-project.zulipchat.com)) is the best way to get help. | ||
|
|
||
| ## Next steps | ||
|
|
||
| If you're interested, [get started with Ibis!](../tutorials/getting_started.qmd) |
| @@ -0,0 +1 @@ | ||
| 1brc |