172 changes: 56 additions & 116 deletions .github/workflows/ibis-backends.yml

Large diffs are not rendered by default.

67 changes: 19 additions & 48 deletions .github/workflows/ibis-docs-lint.yml
@@ -3,11 +3,11 @@ name: Docs/Linting/Benchmarks
on:
push:
branches:
- master
- main
- "*.x.x"
pull_request:
branches:
- master
- main
- "*.x.x"
merge_group:

@@ -21,39 +21,20 @@ permissions:
contents: read

jobs:
commitlint:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
steps:
- name: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: install nix
uses: cachix/install-nix-action@v24
with:
nix_path: nixpkgs=channel:nixos-unstable-small
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: lint commits
run: nix run 'nixpkgs#commitlint' -- --from=${{ github.event.pull_request.base.sha }} --to=${{ github.sha }} --verbose

lint:
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v24
uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v13
uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -72,13 +53,13 @@ jobs:
fetch-depth: 0

- name: install nix
uses: cachix/install-nix-action@v24
uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v13
uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -94,28 +75,18 @@ jobs:
- name: checkout
uses: actions/checkout@v4

- name: install poetry
run: pipx install 'poetry==1.7.1'

- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: "3.11"
cache: poetry

- name: install system dependencies
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev libkrb5-dev krb5-config
- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: benchmarks

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: benchmarks-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.7.1'
run: sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev unixodbc-dev

- name: install ibis
run: poetry install --without dev --without docs --all-extras
@@ -171,13 +142,13 @@ jobs:
concurrency: docs_pr-${{ github.repository }}-${{ github.head_ref || github.sha }}
steps:
- name: install nix
uses: cachix/install-nix-action@v24
uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v13
uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -189,7 +160,7 @@ jobs:
- name: run doctest
# keep HOME because duckdb (which we use for doctests) wants to use
# that for extensions
run: nix develop --ignore-environment --keep HOME -c just doctest
run: nix develop --ignore-environment --keep HOME --keep HYPOTHESIS_PROFILE -c just doctest

- name: generate api docs
run: nix develop --ignore-environment -c just docs-apigen --verbose
@@ -209,13 +180,13 @@ jobs:
concurrency: docs-${{ github.repository }}
steps:
- name: install nix
uses: cachix/install-nix-action@v24
uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v13
uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -227,7 +198,7 @@ jobs:
- name: run doctests
# keep HOME because duckdb (which we use for doctests) wants to use
# that for extensions
run: nix develop --ignore-environment --keep HOME -c just doctest
run: nix develop --ignore-environment --keep HOME --keep HYPOTHESIS_PROFILE -c just doctest

- name: build api docs
run: nix develop --ignore-environment -c just docs-apigen --verbose
@@ -253,12 +224,12 @@ jobs:
with:
fetch-depth: 0

- uses: cachix/install-nix-action@v24
- uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v13
- uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/ibis-main-skip-helper.yml
@@ -10,7 +10,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
pull_request:
paths:
@@ -19,7 +19,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
merge_group:
jobs:
66 changes: 21 additions & 45 deletions .github/workflows/ibis-main.yml
@@ -9,7 +9,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
pull_request:
# Skip the test suite if all changes are in the docs directory
@@ -19,7 +19,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
merge_group:

@@ -32,12 +32,12 @@ concurrency:

env:
FORCE_COLOR: "1"
SQLALCHEMY_WARN_20: "1"
HYPOTHESIS_PROFILE: "ci"

jobs:
test_core:
name: Test ${{ matrix.os }} python-${{ matrix.python-version }}
env:
SQLALCHEMY_WARN_20: "1"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -53,23 +53,15 @@ jobs:
- name: checkout
uses: actions/checkout@v4

- name: install poetry
run: pipx install 'poetry==1.7.1'

- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: no-backends-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: core
cache: poetry

- name: install ${{ matrix.os }} system dependencies
if: matrix.os == 'ubuntu-latest'
@@ -100,7 +92,7 @@ jobs:

- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
flags: core,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}

@@ -118,21 +110,15 @@ jobs:
- name: checkout
uses: actions/checkout@v4

- name: install poetry
run: pipx install 'poetry==1.7.1'

- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: shapely-duckdb

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: shapely-duckdb-${{ steps.install_python.outputs.python-version }}
cache: poetry

- name: install ${{ matrix.os }} system dependencies
run: |
@@ -141,8 +127,6 @@ jobs:
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential libgeos-dev
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: install ibis
# install duckdb and geospatial because of https://github.com/ibis-project/ibis/issues/4856
run: poetry install --without dev --without docs --without test --extras duckdb --extras geospatial
@@ -165,29 +149,21 @@ jobs:
run: |
set -euo pipefail
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential graphviz libgeos-dev libkrb5-dev freetds-dev
sudo apt-get update -y -q
sudo apt-get install -y -q build-essential graphviz libgeos-dev freetds-dev unixodbc-dev
- name: checkout
uses: actions/checkout@v4

- name: install poetry
run: pipx install 'poetry==1.7.1'

- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: doctests-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: doctests
cache: poetry

- name: install ibis with all extras
run: poetry install --without dev --without docs --extras all
@@ -197,10 +173,10 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: run doctests
run: just doctest --junitxml=junit.xml --cov=ibis --cov-report=xml:coverage.xml
run: just ci-doctest --junitxml=junit.xml --cov=ibis --cov-report=xml:coverage.xml

- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
flags: core,doctests,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
4 changes: 2 additions & 2 deletions .github/workflows/nix-skip-helper.yml
@@ -11,7 +11,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
pull_request:
paths:
@@ -20,7 +20,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
merge_group:

8 changes: 4 additions & 4 deletions .github/workflows/nix.yml
@@ -8,7 +8,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
pull_request:
paths-ignore:
@@ -17,7 +17,7 @@ on:
- "**/*.qmd"
- ".envrc"
branches:
- master
- main
- "*.x.x"
merge_group:

@@ -42,14 +42,14 @@ jobs:
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v24
uses: cachix/install-nix-action@v25
with:
nix_path: nixpkgs=channel:nixos-unstable-small
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v13
uses: cachix/cachix-action@v14
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
67 changes: 67 additions & 0 deletions .github/workflows/pr-title.yml
@@ -0,0 +1,67 @@
name: Conventional commits check

on:
# runs on `pull_request_target` events so that commenting on the PR is allowed
pull_request_target:
types: [opened, edited, synchronize, reopened]

jobs:
commitlint:
name: Check PR title conforms to semantic-release
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- name: install node
uses: actions/setup-node@v4
with:
node-version: "20"

- name: checkout code to pick up commitlint configuration
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha }}

- name: install deps
run: npm install "@commitlint/config-conventional"

- name: run commitlint
id: lint
run: |
failed=0
if ! npx commitlint --extends "@commitlint/config-conventional" --verbose <<< "$COMMIT_MSG"; then
failed=1
fi
echo "failed=$failed" >> "$GITHUB_OUTPUT"
env:
COMMIT_MSG: |
${{ github.event.pull_request.title }}
${{ github.event.pull_request.body }}
- name: find existing comment
if: steps.lint.outputs.failed == '1'
uses: peter-evans/find-comment@v3
id: fc
with:
issue-number: ${{ github.event.pull_request.number }}
body-includes: "ACTION NEEDED"

- name: post a message if the pull request title and body fail `commitlint`
if: steps.lint.outputs.failed == '1' && steps.fc.outputs.comment-body == ''
uses: peter-evans/create-or-update-comment@v4
with:
issue-number: ${{ github.event.pull_request.number }}
body: |
**ACTION NEEDED**
Ibis follows the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) for release automation.
The PR title and description are used as the merge commit message.
Please update your PR title and description to match the specification.
- name: fail the check if commitlint failed
if: steps.lint.outputs.failed == '1'
run: exit 1 # templating not allowed here it seems
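For illustration, here is a rough sketch of the title format this check enforces. The real validation is `commitlint` with `@commitlint/config-conventional`; the regex and type list below are simplified assumptions, not the actual rules.

```python
# Simplified approximation of a Conventional Commits title check (illustrative only;
# the workflow above uses commitlint, not this regex).
import re

CONVENTIONAL = re.compile(
    r"^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)"
    r"(\([^)]+\))?!?: .+"
)

for title in ["feat(duckdb): support pyarrow dtypes", "Update stuff"]:
    print(f"{title!r} -> {'ok' if CONVENTIONAL.match(title) else 'ACTION NEEDED'}")
```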
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -25,12 +25,12 @@ jobs:
fetch-depth: 0
token: ${{ steps.generate_token.outputs.token }}

- uses: cachix/install-nix-action@v24
- uses: cachix/install-nix-action@v25
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v13
- uses: cachix/cachix-action@v14
with:
name: ibis
extraPullNames: nix-community,poetry2nix
115 changes: 0 additions & 115 deletions .github/workflows/update-deps.yml

This file was deleted.

4 changes: 4 additions & 0 deletions .gitignore
@@ -131,3 +131,7 @@ ibis/examples/descriptions

# chat
*zuliprc*

# automatically generated odbc file for ci
ci/odbc/odbc.ini
*-citibike-tripdata.tar.xz
2 changes: 1 addition & 1 deletion .releaserc.js
@@ -1,7 +1,7 @@
"use strict";

module.exports = {
branches: ["master"],
branches: ["main"],
tagFormat: "${version}",
preset: "conventionalcommits",
plugins: [
14 changes: 7 additions & 7 deletions README.md
@@ -4,9 +4,9 @@
[![Project Chat](https://img.shields.io/badge/zulip-join_chat-purple.svg?logo=zulip)](https://ibis-project.zulipchat.com)
[![Anaconda-Server Badge](https://anaconda.org/conda-forge/ibis-framework/badges/version.svg)](https://anaconda.org/conda-forge/ibis-framework)
[![PyPI](https://img.shields.io/pypi/v/ibis-framework.svg)](https://pypi.org/project/ibis-framework)
[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml?query=branch%3Amaster)
[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml?query=branch%3Amaster)
[![Codecov branch](https://img.shields.io/codecov/c/github/ibis-project/ibis/master.svg)](https://codecov.io/gh/ibis-project/ibis)
[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml?query=branch%3Amain)
[![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml?query=branch%3Amain)
[![Codecov branch](https://img.shields.io/codecov/c/github/ibis-project/ibis/main.svg)](https://codecov.io/gh/ibis-project/ibis)

## What is Ibis?

@@ -74,7 +74,7 @@ Ibis acts as a universal frontend to the following systems:

The list of supported backends is continuously growing. Anyone can get involved
in adding new ones! Learn more about contributing to ibis in our contributing
documentation at https://github.com/ibis-project/ibis/blob/master/docs/CONTRIBUTING.md
documentation at https://github.com/ibis-project/ibis/blob/main/docs/CONTRIBUTING.md

## Installation

@@ -184,9 +184,9 @@ Show the 5 least populous countries in Asia

Ibis is an open source project and welcomes contributions from anyone in the community.

- Read [the contributing guide](https://github.com/ibis-project/ibis/blob/master/docs/CONTRIBUTING.md).
- We care about keeping the community welcoming for all. Check out [the code of conduct](https://github.com/ibis-project/ibis/blob/master/docs/CODE_OF_CONDUCT.md).
- The Ibis project is open sourced under the [Apache License](https://github.com/ibis-project/ibis/blob/master/LICENSE.txt).
- Read [the contributing guide](https://github.com/ibis-project/ibis/blob/main/docs/CONTRIBUTING.md).
- We care about keeping the community welcoming for all. Check out [the code of conduct](https://github.com/ibis-project/ibis/blob/main/docs/CODE_OF_CONDUCT.md).
- The Ibis project is open sourced under the [Apache License](https://github.com/ibis-project/ibis/blob/main/LICENSE.txt).

Join our community by interacting on GitHub or chatting with us on [Zulip](https://ibis-project.zulipchat.com/).

11 changes: 11 additions & 0 deletions SECURITY.md
@@ -0,0 +1,11 @@
# Security Policy

## Supported Versions

Security updates are provided by releasing a new version of Ibis.

## Reporting a Vulnerability

- Send security reports to security@ibis-project.org
- Vulnerability reports are published on GitHub at https://github.com/ibis-project/ibis/security/advisories
- If a vulnerability is accepted, we will attempt to address it as soon as possible by cutting a new release.
5 changes: 1 addition & 4 deletions ci/make_geography_db.py
@@ -56,10 +56,7 @@
"independence": lambda row: toolz.assoc(
row,
"independence_date",
datetime.datetime.strptime(
row["independence_date"],
"%Y-%m-%d",
).date(),
datetime.datetime.fromisoformat(row["independence_date"]).date(),
)
}

2 changes: 2 additions & 0 deletions ci/odbc/odbcinst.ini
@@ -0,0 +1,2 @@
[FreeTDS]
Driver = libtdsodbc.so
39 changes: 36 additions & 3 deletions ci/schema/postgres.sql
@@ -1,10 +1,36 @@
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS plpython3u;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS first_last_agg;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

-- Create a function that always returns the first non-NULL value:
CREATE OR REPLACE FUNCTION public.first_agg (anyelement, anyelement)
RETURNS anyelement
LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE
AS 'SELECT $1';

-- Then wrap an aggregate around it:
DROP AGGREGATE IF EXISTS public.first (anyelement);
CREATE AGGREGATE public.first (anyelement) (
SFUNC = public.first_agg,
STYPE = anyelement,
PARALLEL = safe
);

-- Create a function that always returns the last non-NULL value:
CREATE OR REPLACE FUNCTION public.last_agg (anyelement, anyelement)
RETURNS anyelement
LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE AS
'SELECT $2';

-- Then wrap an aggregate around it:
DROP AGGREGATE IF EXISTS public.last (anyelement);
CREATE AGGREGATE public.last (anyelement) (
SFUNC = public.last_agg,
STYPE = anyelement,
PARALLEL = safe
);

DROP TABLE IF EXISTS diamonds CASCADE;

CREATE TABLE diamonds (
@@ -95,12 +121,19 @@ CREATE TABLE awards_players (

COPY awards_players FROM '/data/awards_players.csv' WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',');

DROP TYPE IF EXISTS vector CASCADE;
CREATE TYPE vector AS (
x FLOAT8,
y FLOAT8,
z FLOAT8
);

DROP VIEW IF EXISTS awards_players_special_types CASCADE;
CREATE VIEW awards_players_special_types AS
SELECT
*,
setweight(to_tsvector('simple', notes), 'A')::TSVECTOR AS search,
'[1,2,3]'::VECTOR AS simvec
NULL::vector AS simvec
FROM awards_players;

DROP TABLE IF EXISTS functional_alltypes CASCADE;
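The `first`/`last` helpers defined in the schema above appear intended to support first/last-style aggregations; as a hypothetical sketch of that pattern using Ibis's `first()`/`last()` column API (run here on the default in-process backend rather than Postgres):

```python
# Hypothetical usage sketch (not part of this diff): first/last-style aggregation
# through the Ibis column API.
import ibis

t = ibis.memtable({"g": ["a", "a", "b", "b"], "x": [1, 2, 3, 4]})
expr = t.group_by("g").aggregate(first_x=t.x.first(), last_x=t.x.last())
print(expr.to_pandas())  # executes on the default in-process backend
```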
177 changes: 177 additions & 0 deletions ci/schema/risingwave.sql
@@ -0,0 +1,177 @@
SET RW_IMPLICIT_FLUSH=true;

DROP TABLE IF EXISTS diamonds CASCADE;

CREATE TABLE diamonds (
carat FLOAT,
cut TEXT,
color TEXT,
clarity TEXT,
depth FLOAT,
"table" FLOAT,
price BIGINT,
x FLOAT,
y FLOAT,
z FLOAT
) WITH (
connector = 'posix_fs',
match_pattern = 'diamonds.csv',
posix_fs.root = '/data',
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' );

DROP TABLE IF EXISTS astronauts CASCADE;

CREATE TABLE astronauts (
"id" BIGINT,
"number" BIGINT,
"nationwide_number" BIGINT,
"name" VARCHAR,
"original_name" VARCHAR,
"sex" VARCHAR,
"year_of_birth" BIGINT,
"nationality" VARCHAR,
"military_civilian" VARCHAR,
"selection" VARCHAR,
"year_of_selection" BIGINT,
"mission_number" BIGINT,
"total_number_of_missions" BIGINT,
"occupation" VARCHAR,
"year_of_mission" BIGINT,
"mission_title" VARCHAR,
"ascend_shuttle" VARCHAR,
"in_orbit" VARCHAR,
"descend_shuttle" VARCHAR,
"hours_mission" DOUBLE PRECISION,
"total_hrs_sum" DOUBLE PRECISION,
"field21" BIGINT,
"eva_hrs_mission" DOUBLE PRECISION,
"total_eva_hrs" DOUBLE PRECISION
) WITH (
connector = 'posix_fs',
match_pattern = 'astronauts.csv',
posix_fs.root = '/data',
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' );

DROP TABLE IF EXISTS batting CASCADE;

CREATE TABLE batting (
"playerID" TEXT,
"yearID" BIGINT,
stint BIGINT,
"teamID" TEXT,
"lgID" TEXT,
"G" BIGINT,
"AB" BIGINT,
"R" BIGINT,
"H" BIGINT,
"X2B" BIGINT,
"X3B" BIGINT,
"HR" BIGINT,
"RBI" BIGINT,
"SB" BIGINT,
"CS" BIGINT,
"BB" BIGINT,
"SO" BIGINT,
"IBB" BIGINT,
"HBP" BIGINT,
"SH" BIGINT,
"SF" BIGINT,
"GIDP" BIGINT
) WITH (
connector = 'posix_fs',
match_pattern = 'batting.csv',
posix_fs.root = '/data',
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' );

DROP TABLE IF EXISTS awards_players CASCADE;

CREATE TABLE awards_players (
"playerID" TEXT,
"awardID" TEXT,
"yearID" BIGINT,
"lgID" TEXT,
tie TEXT,
notes TEXT
) WITH (
connector = 'posix_fs',
match_pattern = 'awards_players.csv',
posix_fs.root = '/data',
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' );

DROP TABLE IF EXISTS functional_alltypes CASCADE;

CREATE TABLE functional_alltypes (
id INTEGER,
bool_col BOOLEAN,
tinyint_col SMALLINT,
smallint_col SMALLINT,
int_col INTEGER,
bigint_col BIGINT,
float_col REAL,
double_col DOUBLE PRECISION,
date_string_col TEXT,
string_col TEXT,
timestamp_col TIMESTAMP WITHOUT TIME ZONE,
year INTEGER,
month INTEGER
) WITH (
connector = 'posix_fs',
match_pattern = 'functional_alltypes.csv',
posix_fs.root = '/data',
) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ',' );

DROP TABLE IF EXISTS tzone CASCADE;

CREATE TABLE tzone (
ts TIMESTAMP WITH TIME ZONE,
key TEXT,
value DOUBLE PRECISION
);

INSERT INTO tzone
SELECT
CAST('2017-05-28 11:01:31.000400' AS TIMESTAMP WITH TIME ZONE) +
t * INTERVAL '1 day 1 second' AS ts,
CHR(97 + t) AS key,
t + t / 10.0 AS value
FROM generate_series(0, 9) AS t;

DROP TABLE IF EXISTS array_types CASCADE;

CREATE TABLE IF NOT EXISTS array_types (
x BIGINT[],
y TEXT[],
z DOUBLE PRECISION[],
grouper TEXT,
scalar_column DOUBLE PRECISION,
multi_dim BIGINT[][]
);

INSERT INTO array_types VALUES
(ARRAY[1, 2, 3], ARRAY['a', 'b', 'c'], ARRAY[1.0, 2.0, 3.0], 'a', 1.0, ARRAY[ARRAY[NULL::BIGINT, NULL, NULL], ARRAY[1, 2, 3]]),
(ARRAY[4, 5], ARRAY['d', 'e'], ARRAY[4.0, 5.0], 'a', 2.0, ARRAY[]::BIGINT[][]),
(ARRAY[6, NULL], ARRAY['f', NULL], ARRAY[6.0, NULL], 'a', 3.0, ARRAY[NULL, ARRAY[]::BIGINT[], NULL]),
(ARRAY[NULL, 1, NULL], ARRAY[NULL, 'a', NULL], ARRAY[]::DOUBLE PRECISION[], 'b', 4.0, ARRAY[ARRAY[1], ARRAY[2], ARRAY[NULL::BIGINT], ARRAY[3]]),
(ARRAY[2, NULL, 3], ARRAY['b', NULL, 'c'], NULL, 'b', 5.0, NULL),
(ARRAY[4, NULL, NULL, 5], ARRAY['d', NULL, NULL, 'e'], ARRAY[4.0, NULL, NULL, 5.0], 'c', 6.0, ARRAY[ARRAY[1, 2, 3]]);

DROP TABLE IF EXISTS json_t CASCADE;

CREATE TABLE IF NOT EXISTS json_t (js JSONB);

INSERT INTO json_t VALUES
('{"a": [1,2,3,4], "b": 1}'),
('{"a":null,"b":2}'),
('{"a":"foo", "c":null}'),
('null'),
('[42,47,55]'),
('[]');

DROP TABLE IF EXISTS win CASCADE;
CREATE TABLE win (g TEXT, x BIGINT, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
('a', 2, 0),
('a', 3, 1),
('a', 4, 1);
10 changes: 5 additions & 5 deletions ci/schema/trino.sql
@@ -11,7 +11,7 @@ CREATE TABLE hive.default.diamonds (
"y" DOUBLE,
"z" DOUBLE
) WITH (
external_location = 's3a://warehouse/diamonds',
external_location = 's3a://trino/diamonds',
format = 'PARQUET'
);

@@ -45,7 +45,7 @@ CREATE TABLE hive.default.astronauts (
"eva_hrs_mission" REAL,
"total_eva_hrs" REAL
) WITH (
external_location = 's3a://warehouse/astronauts',
external_location = 's3a://trino/astronauts',
format = 'PARQUET'
);

@@ -77,7 +77,7 @@ CREATE TABLE hive.default.batting (
"SF" BIGINT,
"GIDP" BIGINT
) WITH (
external_location = 's3a://warehouse/batting',
external_location = 's3a://trino/batting',
format = 'PARQUET'
);

@@ -93,7 +93,7 @@ CREATE TABLE hive.default.awards_players (
"tie" VARCHAR,
"notes" VARCHAR
) WITH (
external_location = 's3a://warehouse/awards-players',
external_location = 's3a://trino/awards-players',
format = 'PARQUET'
);

@@ -116,7 +116,7 @@ CREATE TABLE hive.default.functional_alltypes (
"year" INTEGER,
"month" INTEGER
) WITH (
external_location = 's3a://warehouse/functional-alltypes',
external_location = 's3a://trino/functional-alltypes',
format = 'PARQUET'
);
CREATE OR REPLACE VIEW memory.default.functional_alltypes AS
345 changes: 229 additions & 116 deletions compose.yaml

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions conftest.py
@@ -0,0 +1,32 @@
from __future__ import annotations

import os

import hypothesis as h

# setup hypothesis profiles
h.settings.register_profile(
"ci",
max_examples=1000,
suppress_health_check=[h.HealthCheck.too_slow],
deadline=None,
)
h.settings.register_profile(
"dev",
max_examples=50,
suppress_health_check=[h.HealthCheck.too_slow],
deadline=None,
)
h.settings.register_profile(
"debug",
max_examples=10,
verbosity=h.Verbosity.verbose,
suppress_health_check=[h.HealthCheck.too_slow],
deadline=None,
)

# load default hypothesis profile, either set HYPOTHESIS_PROFILE environment
# variable or pass --hypothesis-profile option to pytest, to see the generated
# examples try:
# pytest pyarrow -sv --hypothesis-profile=debug
h.settings.load_profile(os.environ.get("HYPOTHESIS_PROFILE", "dev"))
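As a minimal illustration of what these profiles govern, a hypothetical property-based test (not part of this diff) might look like:

```python
# Hypothetical test (not part of this diff): its example count and deadline are
# controlled by whichever profile conftest.py loads via HYPOTHESIS_PROFILE.
from hypothesis import given, strategies as st

import ibis


@given(st.integers(min_value=-(2**31), max_value=2**31 - 1))
def test_integer_literal_roundtrip(value):
    # constructing a literal should preserve the original Python value
    assert ibis.literal(value).op().value == value
```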
4 changes: 2 additions & 2 deletions docker/flink/Dockerfile
@@ -1,3 +1,3 @@
FROM flink:1.18.0-scala_2.12
FROM flink:1.18.1
# ibis-flink requires PyFlink dependency
RUN wget -nv -P $FLINK_HOME/lib/ https://repo1.maven.org/maven2/org/apache/flink/flink-python/1.18.0/flink-python-1.18.0.jar
RUN wget -nv -P $FLINK_HOME/lib/ https://repo1.maven.org/maven2/org/apache/flink/flink-python/1.18.1/flink-python-1.18.1.jar
81 changes: 81 additions & 0 deletions docker/impala/conf/hive-site.xml
@@ -0,0 +1,81 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Hive configuration for Impala quickstart docker cluster.
-->
<configuration>
<property>
<!-- Required for automatic metadata sync. -->
<name>hive.metastore.dml.events</name>
<value>true</value>
</property>

<property>
<!-- User impala is not authorized to consume notifications by default, disable
authentication to work around this. -->
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>

<property>
<name>hive.metastore.uris</name>
<value>thrift://impala-hive-metastore:9083</value>
</property>

<!-- Managed and external tablespaces must live on the Docker volumes that we
configure for the cluster. -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse/managed</value>
</property>

<property>
<name>hive.metastore.warehouse.external.dir</name>
<value>/user/hive/warehouse/external</value>
</property>

<property>
<!-- Required to enable Hive transactions -->
<name>hive.support.concurrency</name>
<value>true</value>
</property>

<property>
<!-- Required to enable Hive transactions -->
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>

<!-- Hive stats autogathering negatively affects latency of DDL operations, etc and
is not particularly useful for Impala -->
<property>
<name>hive.stats.autogather</name>
<value>false</value>
</property>

<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>

<property>
<name>hive.compactor.worker.threads</name>
<value>1</value>
</property>
</configuration>
2 changes: 1 addition & 1 deletion docker/minio/config.json
@@ -1,7 +1,7 @@
{
"version": "10",
"aliases": {
"trino": {
"data": {
"url": "http://minio:9000",
"accessKey": "accesskey",
"secretKey": "secretkey",
8 changes: 1 addition & 7 deletions docker/postgres/Dockerfile
@@ -1,8 +1,2 @@
FROM postgis/postgis:15-3.3-alpine
RUN apk add --no-cache build-base clang15 llvm15 postgresql15-plpython3 python3 py3-pip && \
python3 -m pip install pgxnclient && \
pgxn install vector && \
pgxn install first_last_agg && \
python3 -m pip uninstall -y pgxnclient && \
rm -rf ~/.cache/pip && \
apk del build-base clang15 llvm15 python3 py3-pip
RUN apk add --no-cache postgresql15-plpython3
6 changes: 3 additions & 3 deletions docs/_freeze/backends/clickhouse/execute-results/html.json

Large diffs are not rendered by default.

15 changes: 0 additions & 15 deletions docs/_freeze/how-to/extending/builtin/execute-results/html.json

This file was deleted.

16 changes: 16 additions & 0 deletions docs/_freeze/posts/1brc/index/execute-results/html.json

Large diffs are not rendered by default.


19 changes: 14 additions & 5 deletions docs/_quarto.yml
@@ -33,6 +33,10 @@ website:
site-url: https://ibis-project.org
description: "the portable Python dataframe library"
favicon: logo.svg
open-graph:
image: logo.png
twitter-card:
image: logo.png

# search
search:
@@ -41,11 +45,10 @@ website:

# options
reader-mode: false
twitter-card: true
back-to-top-navigation: true
repo-url: https://github.com/ibis-project/ibis
repo-actions: [edit, issue]
repo-branch: master
repo-branch: main
repo-subdir: docs
issue-url: https://github.com/ibis-project/ibis/issues/new/choose

@@ -177,6 +180,10 @@ format:
dark: [darkly, theme-dark.scss]
toc: true
css: styles.css
include-in-header:
- text: |
<script data-goatcounter="https://ibis.goatcounter.com/count"
async src="//gc.zgo.at/count.js"></script>
quartodoc:
package: ibis
@@ -227,6 +234,9 @@ quartodoc:
- name: union
dynamic: true
signature_name: full
- name: join
dynamic: true
signature_name: full
- name: row_number
dynamic: true
signature_name: full
@@ -307,9 +317,6 @@ quartodoc:
- name: case
dynamic: true
signature_name: full
- name: show_sql
dynamic: true
signature_name: full
- name: to_sql
dynamic: true
signature_name: full
@@ -418,6 +425,8 @@ quartodoc:
contents:
- GeoSpatialValue
- GeoSpatialColumn
- name: NumericValue.point
package: ibis.expr.types.numeric

- kind: page
summary:
49 changes: 26 additions & 23 deletions docs/_renderer.py
@@ -28,8 +28,6 @@ def render(self, el: qd.ast.ExampleCode) -> str:
lambda line: quartodoc_skip_doctest in line or skip_doctest in line
)

has_executed_chunks = False

for chunk in toolz.partitionby(chunker, lines):
first, *rest = chunk

@@ -39,11 +37,22 @@ def render(self, el: qd.ast.ExampleCode) -> str:
# check whether to skip execution and if so, render the code
# block as `python` (not `{python}`) if it's marked with
# skip_doctest, expect_failure or quartodoc_skip_doctest
if any(map(should_skip, chunk)):
if skipped := any(map(should_skip, chunk)):
start = end = ""
else:
has_executed_chunks = True
start, end = "{}"
result.append(
dedent(
"""
```{python}
#| echo: false
import ibis
ibis.options.interactive = True
```
"""
)
)

result.append(f"```{start}python{end}")

@@ -67,22 +76,16 @@ def render(self, el: qd.ast.ExampleCode) -> str:
result.extend(rest)
result.append("```\n")

examples = "\n".join(result)

if has_executed_chunks:
# turn off interactive mode before rendering
return (
dedent(
"""
```{python}
#| echo: false
import ibis
ibis.options.interactive = False
```
"""
)
+ examples
)
else:
return examples
if not skipped:
result.append(
dedent(
"""
```{python}
#| echo: false
ibis.options.interactive = False
```
"""
)
)

return "\n".join(result)
45 changes: 35 additions & 10 deletions docs/_tabsets/install.qmd
@@ -5,6 +5,7 @@ You can install Ibis and a supported backend with `pip`, `conda`, `mamba`, or `p
```{python}
#| echo: false
#| output: asis
from textwrap import dedent
backends = [
{"name": "BigQuery", "module": "bigquery"},
@@ -13,6 +14,8 @@ backends = [
{"name": "DataFusion", "module": "datafusion"},
{"name": "Druid", "module": "druid"},
{"name": "DuckDB", "module": "duckdb"},
{"name": "Exasol", "module": "exasol"},
{"name": "Flink", "module": "flink"},
{"name": "Impala", "module": "impala"},
{"name": "MSSQL", "module": "mssql"},
{"name": "MySQL", "module": "mysql"},
@@ -34,30 +37,52 @@ installers = [
]
for installer in installers:
name = installer["name"]
installer_name = installer["name"]
cmd = installer["cmd"]
line = installer["line"]
print(f"## `{name}`")
print(f"## `{installer_name}`")
print("::: {.panel-tabset}")
print()
for backend in backends:
name = backend["name"]
backend_name = backend["name"]
mod = backend["module"]
extra = backend.get("extra", mod)
print(f"## {name}")
print(f"## {backend_name}")
print()
print(line.format(extra=extra))
print()
print(f"```bash\n{cmd.format(extra=extra)}\n```")
if backend_name == "Flink":
if installer_name == "pip":
print("Install alongside the `apache-flink` package:")
print()
print(f"```bash\npip install ibis-framework apache-flink\n```")
else:
print(
dedent(
"""\
::: {.callout-important}
## PyFlink is not available on conda-forge; please
use `pip` to install the PyFlink backend instead.
:::"""
)
)
continue
else:
extra = backend.get("extra", mod)
print(line.format(extra=extra))
print()
print(f"```bash\n{cmd.format(extra=extra)}\n```")
print()
print(f"Connect using [`ibis.{mod}.connect`](./backends/{name.lower()}.qmd#ibis.{mod}.connect).")
print(f"Connect using [`ibis.{mod}.connect`](./backends/{backend_name.lower()}.qmd#ibis.{mod}.connect).")
print()
if name == "pip":
if installer_name == "pip":
print("{{< include /_callouts/pypi_warning.qmd >}}")
print()
2 changes: 1 addition & 1 deletion docs/_tabsets/install_default.qmd
@@ -5,7 +5,7 @@ We recommend starting with the default (DuckDB) backend for a performant, fully-
## Using `pip`

```bash
pip install 'ibis-framework[duckdb]'
pip install 'ibis-framework[duckdb,examples]'
```

{{< include /_callouts/pypi_warning.qmd >}}
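As a quick, hypothetical sanity check of the default install (the `examples` extra provides the sample datasets; this snippet is not part of the diff):

```python
# Hypothetical smoke test for the default DuckDB + examples install.
import ibis

ibis.options.interactive = True
penguins = ibis.examples.penguins.fetch()  # downloads a small sample dataset
print(penguins.head(3))
```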
209 changes: 0 additions & 209 deletions docs/backends/app/backend_info_app.py

This file was deleted.

5 changes: 0 additions & 5 deletions docs/backends/app/requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion docs/backends/bigquery.qmd
@@ -96,7 +96,7 @@ render_do_connect("bigquery")
### `ibis.connect` URL format

In addition to `ibis.bigquery.connect`, you can also connect to BigQuery by
passing a properly formatted BigQuery connection URL to `ibis.connect`
passing a properly-formatted BigQuery connection URL to `ibis.connect`:

```python
con = ibis.connect(f"bigquery://{project_id}/{dataset_id}")
2 changes: 1 addition & 1 deletion docs/backends/clickhouse.qmd
@@ -99,7 +99,7 @@ render_do_connect("clickhouse")
### `ibis.connect` URL format

In addition to `ibis.clickhouse.connect`, you can also connect to ClickHouse by
passing a properly formatted ClickHouse connection URL to `ibis.connect`
passing a properly-formatted ClickHouse connection URL to `ibis.connect`:

```python
con = ibis.connect(f"clickhouse://{user}:{password}@{host}:{port}?secure={secure}")
2 changes: 1 addition & 1 deletion docs/backends/druid.qmd
@@ -97,7 +97,7 @@ render_do_connect("druid")
### `ibis.connect` URL format

In addition to `ibis.druid.connect`, you can also connect to Druid by
passing a properly formatted Druid connection URL to `ibis.connect`
passing a properly-formatted Druid connection URL to `ibis.connect`:

```python
con = ibis.connect("druid://localhost:8082/druid/v2/sql")
4 changes: 2 additions & 2 deletions docs/backends/duckdb.qmd
@@ -103,15 +103,15 @@ render_do_connect("duckdb")
### `ibis.connect` URL format

In addition to `ibis.duckdb.connect`, you can also connect to DuckDB by
passing a properly formatted DuckDB connection URL to `ibis.connect`
passing a properly-formatted DuckDB connection URL to `ibis.connect`:

```{python}
import ibis
con = ibis.connect("duckdb://local.ddb")
```

Without an empty path, `ibis.connect` will connect to an ephemeral, in-memory database.
Given an empty path, `ibis.connect` will connect to an ephemeral, in-memory database.

```{python}
con = ibis.connect("duckdb://")
64 changes: 64 additions & 0 deletions docs/backends/flink.qmd
@@ -0,0 +1,64 @@
# Flink

[https://nightlies.apache.org/flink/flink-docs-stable/](https://nightlies.apache.org/flink/flink-docs-stable/)

![](https://img.shields.io/badge/memtables-native-green?style=flat-square) ![](https://img.shields.io/badge/inputs-Flink tables-blue?style=flat-square) ![](https://img.shields.io/badge/outputs-Flink tables | pandas-orange?style=flat-square)

## Install

Install Ibis and dependencies for the Flink backend:

::: {.panel-tabset}

## `pip`

Install alongside the `apache-flink` package:

```{.bash}
pip install ibis-framework apache-flink
```

And connect:

```{.python}
import ibis
con = ibis.flink.connect() # <1>
```

1. Adjust connection parameters as needed.

:::

## Connect

### `ibis.flink.connect`

```python
con = ibis.flink.connect(table_env=table_env)
```

::: {.callout-note}
`ibis.flink.connect` is a thin wrapper around [`ibis.backends.flink.Backend.do_connect`](#ibis.backends.flink.Backend.do_connect).
:::

::: {.callout-note}
The `flink` backend does not create `TableEnvironment` objects; you must create a `TableEnvironment` and pass that to `ibis.flink.connect`.
:::
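For example, a minimal sketch of creating a `TableEnvironment` with PyFlink and handing it to Ibis (the choice of streaming mode here is an assumption):

```python
# Minimal sketch: build a PyFlink TableEnvironment, then connect Ibis to it.
import ibis
from pyflink.table import EnvironmentSettings, TableEnvironment

table_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
con = ibis.flink.connect(table_env=table_env)
```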

### Connection Parameters

```{python}
#| echo: false
#| output: asis
from _utils import render_do_connect
render_do_connect("flink")
```

```{python}
#| echo: false
BACKEND = "Flink"
```

{{< include ./_templates/api.qmd >}}
227 changes: 9 additions & 218 deletions docs/backends/impala.qmd
@@ -145,29 +145,23 @@ render_methods(
render_methods(backend, "drop_table_or_view", "create_view")
```

## Accessing data formats in HDFS
## Accessing data

```{python}
#| echo: false
#| output: asis
render_methods(backend, "delimited_file", "parquet_file", "avro_file")
```

## HDFS Interaction

Ibis delegates all HDFS interaction to the
[`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) library.

## The Impala client object

To use Ibis with Impala, you first must connect to a cluster using the
`ibis.impala.connect` function, optionally supplying an HDFS connection:
`ibis.impala.connect` function:

```python
import ibis

hdfs = ibis.impala.hdfs_connect(host=webhdfs_host, port=webhdfs_port)
client = ibis.impala.connect(host=impala_host, port=impala_port, hdfs_client=hdfs)
client = ibis.impala.connect(host=impala_host, port=impala_port)
```

By default binary transport mode is used, however it is also possible to use HTTP.
@@ -196,8 +190,7 @@ using docker:
```python
import ibis

hdfs = ibis.impala.hdfs_connect(host="localhost", port=50070)
client = ibis.impala.connect(host=host, hdfs_client=hdfs)
client = ibis.impala.connect(host=host)
```

You can accomplish many tasks directly through the client object, but we
@@ -707,7 +700,6 @@ query execution:
#| output: asis
render_methods(
backend,
"disable_codegen",
"get_options",
"set_options",
"set_compression_codec",
@@ -725,8 +717,6 @@ For example:
'COMPUTE_STATS_MIN_SAMPLE_SIZE': '1073741824',
'DEFAULT_JOIN_DISTRIBUTION_MODE': '0',
'DEFAULT_SPILLABLE_BUFFER_SIZE': '2097152',
'DISABLE_CODEGEN': '0',
'DISABLE_CODEGEN_ROWS_THRESHOLD': '50000',
'DISABLE_ROW_RUNTIME_FILTERING': '0',
'DISABLE_STREAMING_PREAGGREGATIONS': '0',
'DISABLE_UNSAFE_SPILLS': '0',
@@ -851,92 +841,7 @@ For example:
>>> to_insert.drop()
```

## Uploading / downloading data from HDFS

If you've set up an HDFS connection, you can use the Ibis HDFS interface
to look through your data and read and write files to and from HDFS:

```python
>>> hdfs = con.hdfs
>>> hdfs.ls('/__ibis/ibis-testing-data')
['README.md',
'avro',
'awards_players.csv',
'batting.csv',
'csv',
'diamonds.csv',
'functional_alltypes.csv',
'functional_alltypes.parquet',
'geo.csv',
'ibis_testing.db',
'parquet',
'struct_table.avro',
'udf']
```

```python
>>> hdfs.ls('/__ibis/ibis-testing-data/parquet')
['functional_alltypes',
'tpch_customer',
'tpch_lineitem',
'tpch_nation',
'tpch_orders',
'tpch_part',
'tpch_partsupp',
'tpch_region',
'tpch_supplier']
```

Suppose we wanted to download
`/__ibis/ibis-testing-data/parquet/functional_alltypes`, which is a
directory. We need only do:

```bash
$ rm -rf parquet_dir/
```

```python
>>> hdfs.get('/__ibis/ibis-testing-data/parquet/functional_alltypes',
... 'parquet_dir',
... recursive=True)
'/ibis/docs/source/tutorial/parquet_dir'
```

Now we have that directory locally:

```bash
$ ls parquet_dir/
9a41de519352ab07-4e76bc4d9fb5a789_1624886651_data.0.parq
9a41de519352ab07-4e76bc4d9fb5a78a_778826485_data.0.parq
9a41de519352ab07-4e76bc4d9fb5a78b_1277612014_data.0.parq
```

Files and directories can be written to HDFS just as easily using `put`:

```python
>>> path = '/__ibis/dir-write-example'
>>> hdfs.rm(path, recursive=True)
>>> hdfs.put(path, 'parquet_dir', recursive=True)
```

```python
>>> hdfs.ls('/__ibis/dir-write-example')
['9a41de519352ab07-4e76bc4d9fb5a789_1624886651_data.0.parq',
'9a41de519352ab07-4e76bc4d9fb5a78a_778826485_data.0.parq',
'9a41de519352ab07-4e76bc4d9fb5a78b_1277612014_data.0.parq']
```

Delete files and directories with `rm`:

```python
>>> hdfs.rm('/__ibis/dir-write-example', recursive=True)
```

```bash
rm -rf parquet_dir/
```

## Queries on Parquet, Avro, and Delimited files in HDFS
## Queries on Parquet, Avro, and Delimited files

Ibis can easily create temporary or persistent Impala tables that
reference data in the following formats:
Expand Down Expand Up @@ -995,36 +900,7 @@ Decimal('229577310901.20')
>>> con.drop_table('my_parquet_table')
```

To query delimited files, you need to write down an Ibis schema. At some
point we'd like to build some helper tools that will infer the schema
for you, all in good time.

There's some CSV files in the test folder, so let's use those:

```python
>>> hdfs.get('/__ibis/ibis-testing-data/csv', 'csv-files', recursive=True)
'/ibis/docs/source/tutorial/csv-files'
```

```bash
$ cat csv-files/0.csv
63IEbRheTh,0.679388707915,6
mG4hlqnjeG,2.80710565922,15
JTPdX9SZH5,-0.155126406372,55
2jcl6FypOl,1.03787834032,21
k3TbJLaadQ,-1.40190801103,23
rP5J4xvinM,-0.442092712869,22
WniUylixYt,-0.863748033806,27
znsDuKOB1n,-0.566029637098,47
4SRP9jlo1M,0.331460412318,88
KsfjPyDf5e,-0.578930506363,70
```

```bash
$ rm -rf csv-files/
```

The schema here is pretty simple (see `ibis.schema` for more):
To query delimited files, you need to write down an Ibis schema.

```python
>>> schema = ibis.schema(dict(foo='string', bar='double', baz='int32'))
@@ -1049,32 +925,9 @@ The schema here is pretty simple (see `ibis.schema` for more):
0 100 0 -1.401908 2.807106 8.479978 0.0848 10
```

For functions like `parquet_file` and `delimited_file`, an HDFS directory must
For functions like `parquet_file` and `delimited_file`, a directory must
be passed and the directory must contain files all having the same schema.

If you have Avro data, you can query it too if you have the full avro
schema:

```python
>>> avro_schema = {
... "fields": [
... {"type": ["int", "null"], "name": "R_REGIONKEY"},
... {"type": ["string", "null"], "name": "R_NAME"},
... {"type": ["string", "null"], "name": "R_COMMENT"}],
... "type": "record",
... "name": "a"
... }

>>> path = '/__ibis/ibis-testing-data/avro/tpch.region'

>>> hdfs.mkdir(path, create_parents=True)
>>> table = con.avro_file(path, avro_schema)
>>> table
Empty DataFrame
Columns: [r_regionkey, r_name, r_comment]
Index: []
```

## Other helper functions for interacting with the database

We're adding a growing list of useful utility functions for interacting
@@ -1132,63 +985,21 @@ you want for the data files

```python
>>> db = 'ibis_testing2'
>>> con.create_database(db, path='/__ibis/my-test-database', force=True)

>>> # you may or may not have to give the impala user write and execute permissions to '/__ibis/my-test-database'
>>> hdfs.chmod('/__ibis/my-test-database', 0o777)
>>> con.create_database(db, force=True)
```

```python
>>> con.create_table('example_table', con.table('functional_alltypes'),
... database=db, force=True)
```

Hopefully, there will be data files in the indicated spot in HDFS:

```python
>>> hdfs.ls('/__ibis/my-test-database')
['example_table']
```

To drop a database, including all tables in it, you can use
`drop_database` with `force=True`:

```python
>>> con.drop_database(db, force=True)
```

## Faster queries on small data in Impala

Since Impala internally uses LLVM to compile parts of queries (aka
"codegen") to make them faster on large data sets there is a certain
amount of overhead with running many kinds of queries, even on small
datasets. You can disable LLVM code generation when using Ibis, which
may significantly speed up queries on smaller datasets:

```python
>>> from numpy.random import rand
>>> con.disable_codegen()
>>> t = con.table('ibis_testing.functional_alltypes')
```

```bash
$ time python -c "(t.double_col + rand()).sum().to_pandas()"
27.7 ms ± 996 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
```

```python
# Turn codegen back on
con.disable_codegen(False)
```

```bash
$ time python -c "(t.double_col + rand()).sum().to_pandas()"
27 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
```

It's important to remember that codegen is a fixed overhead and will
significantly speed up queries on big data

## User Defined functions (UDF)

Impala currently supports user-defined scalar functions (known
@@ -1298,26 +1109,6 @@ Specifically, after authenticating yourself against Kerberos (e.g., by issuing
the appropriate `kinit` command), pass `auth_mechanism='GSSAPI'` or
`auth_mechanism='LDAP'` (and set `kerberos_service_name` if necessary along
with `user` and `password` if necessary) to the
`ibis.impala_connect(...)` method when instantiating an `ImpalaConnection`.
`ibis.impala_connect(...)` method.
This method also takes arguments to configure SSL (`use_ssl`, `ca_cert`).
See the documentation for the Impala shell for more details.

Ibis also includes functionality that communicates directly with HDFS, using
the WebHDFS REST API. When calling `ibis.impala.hdfs_connect(...)`, also pass
`auth_mechanism='GSSAPI'` or `auth_mechanism='LDAP'`, and ensure that you
are connecting to the correct port, which may likely be an SSL-secured WebHDFS
port. Also note that you can pass `verify=False` to avoid verifying SSL
certificates (which may be helpful in testing). Ibis will assume `https`
when connecting to a Kerberized cluster. Because some Ibis commands create HDFS
directories as well as new Impala databases and/or tables, your user will
require the necessary privileges.

## Default Configuration Values for CDH Components

Cloudera CDH ships with HDFS, Impala, Hive and many other components.
Sometimes it's not obvious what default configuration values these tools are
using or should be using.

Check out [this
link](https://www.cloudera.com/documentation/enterprise/latest/topics/cdh_ig_ports_cdh5.html#topic_9_1)
to see the default configuration values for every component of CDH.
2 changes: 1 addition & 1 deletion docs/backends/mssql.qmd
@@ -96,7 +96,7 @@ render_do_connect("mssql")
### `ibis.connect` URL format

In addition to `ibis.mssql.connect`, you can also connect to MSSQL by
passing a properly formatted MSSQL connection URL to `ibis.connect`
passing a properly-formatted MSSQL connection URL to `ibis.connect`:

```python
con = ibis.connect(f"mssql://{user}:{password}@{host}:{port}")
2 changes: 1 addition & 1 deletion docs/backends/mysql.qmd
@@ -97,7 +97,7 @@ render_do_connect("mysql")
### `ibis.connect` URL format

In addition to `ibis.mysql.connect`, you can also connect to MySQL by
passing a properly formatted MySQL connection URL to `ibis.connect`
passing a properly-formatted MySQL connection URL to `ibis.connect`:

```python
con = ibis.connect(f"mysql://{user}:{password}@{host}:{port}/{database}")
2 changes: 1 addition & 1 deletion docs/backends/oracle.qmd
@@ -99,7 +99,7 @@ render_do_connect("oracle")
### `ibis.connect` URL format

In addition to `ibis.oracle.connect`, you can also connect to Oracle by
passing a properly formatted Oracle connection URL to `ibis.connect`
passing a properly-formatted Oracle connection URL to `ibis.connect`:

```python
con = ibis.connect(f"oracle://{user}:{password}@{host}:{port}/{database}")
8 changes: 5 additions & 3 deletions docs/backends/pandas.qmd
@@ -139,10 +139,12 @@ def zscore(series):
- The objects you receive as input arguments are either `pandas.Series` or
Python/NumPy scalars.

!!! warning "Keyword arguments must be given a default"
::: {.callout-warning}
## Keyword arguments must be given a default

Any keyword arguments must be given a default value or the function **will
not work**.
Any keyword arguments must be given a default value or the function **will
not work**.
:::

A common Python convention is to set the default value to `None` and
handle setting it to something not `None` in the body of the function.
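To make the callout above concrete, here is a hypothetical sketch of the default-of-`None` convention (plain pandas function, independent of the UDF machinery in the surrounding docs):

```python
# Hypothetical example of the "default to None, resolve in the body" convention.
import pandas as pd


def zscore(series: pd.Series, ddof=None) -> pd.Series:
    # every keyword argument carries a default; the "not provided" case is
    # handled inside the function body
    if ddof is None:
        ddof = 0
    return (series - series.mean()) / series.std(ddof=ddof)


print(zscore(pd.Series([1.0, 2.0, 3.0, 4.0])))
```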
2 changes: 1 addition & 1 deletion docs/backends/postgresql.qmd
@@ -97,7 +97,7 @@ render_do_connect("postgres")
### `ibis.connect` URL format

In addition to `ibis.postgres.connect`, you can also connect to Postgres by
passing a properly formatted Postgres connection URL to `ibis.connect`
passing a properly-formatted Postgres connection URL to `ibis.connect`:

```python
con = ibis.connect(f"postgres://{user}:{password}@{host}:{port}/{database}")
13 changes: 10 additions & 3 deletions docs/backends/pyspark.qmd
@@ -66,8 +66,6 @@ con = ibis.pyspark.connect() # <1>

:::



## Connect

### `ibis.pyspark.connect`
@@ -81,7 +79,7 @@ con = ibis.pyspark.connect(session=session)
:::

::: {.callout-note}
The `pyspark` backend does not create `SparkSession` objects, you must create a `SparkSession` and pass that to `ibis.pyspark.connect`.
The `pyspark` backend does not create `SparkSession` objects (unless you [connect using a URL](#ibis.connect-url-format)); you must create a `SparkSession` and pass that to `ibis.pyspark.connect`.
:::

### Connection Parameters
@@ -94,6 +92,15 @@ from _utils import render_do_connect
render_do_connect("pyspark")
```

### `ibis.connect` URL format

In addition to `ibis.pyspark.connect`, you can also connect to PySpark by
passing a properly-formatted PySpark connection URL to `ibis.connect`:

```python
con = ibis.connect(f"pyspark://{warehouse-dir}?spark.app.name=CountingSheep&spark.master=local[2]")
```

```{python}
#| echo: false
BACKEND = "PySpark"
2 changes: 1 addition & 1 deletion docs/backends/snowflake.qmd
@@ -100,7 +100,7 @@ render_do_connect("snowflake")
### `ibis.connect` URL format

In addition to `ibis.snowflake.connect`, you can also connect to Snowflake by
passing a properly formatted Snowflake connection URL to `ibis.connect`
passing a properly-formatted Snowflake connection URL to `ibis.connect`:

```python
con = ibis.connect(f"snowflake://{user}:{password}@{account}/{database}")
2 changes: 1 addition & 1 deletion docs/backends/sqlite.qmd
@@ -100,7 +100,7 @@ render_do_connect("sqlite")
### `ibis.connect` URL format

In addition to `ibis.sqlite.connect`, you can also connect to SQLite by
passing a properly formatted SQLite connection URL to `ibis.connect`:
passing a properly-formatted SQLite connection URL to `ibis.connect`:

```python
con = ibis.connect("sqlite:///path/to/local/file")
115 changes: 115 additions & 0 deletions docs/concepts/composable-ecosystem.qmd
@@ -0,0 +1,115 @@
# Composable data ecosystem

Ibis exists in a broader composable data ecosystem. [The Composable Codex by
Voltron Data](https://voltrondata.com/codex) is the result of years of
experience and hours of writing by experts in the field, providing an in-depth
introduction to composable data systems. We'll take a look at:

- [Apache Arrow](https://github.com/apache/arrow)
- [Apache Arrow Database Connectivity (ADBC)](https://arrow.apache.org/adbc/current/index.html)
- [Substrait](https://substrait.io/)

and how they fit in with Ibis.

## Overview

Ibis is the portable Python dataframe API, supporting many backends. This is
achieved by decoupling the API from the execution engine. While Ibis already
relies on standards like Apache Arrow today, we expect the composable data
ecosystem to mature and adjacent projects to see broader adoption going forward.
This will allow Ibis to simplify its implementation and improve performance for
backends that support these standards.

[The first chapter of The Composable Codex provides a great overview of where
these projects fit
in](https://voltrondata.com/codex/standards-over-silos#1-2-3-a-composable-ecosystem):

![Standards](images/standards.png)

And a table explaining the standards:

| Label | Types of standards | Standards |
| --- | --- | --- |
| A | Intermediate representation | [Substrait](https://substrait.io) allows any user interface that produces Substrait to pass compute operations to a Substrait-consuming execution engine. You can swap in any Substrait-compatible user interface or execution engine. |
| B | Connectivity | [Arrow Database Connectivity (ADBC)](https://arrow.apache.org/adbc/current/index.html) ensures that no matter where the computation is performed the data will be returned in the Arrow format. You can swap your execution engine and know that your downstream code will still work. |
| C | Data memory layout | The [Apache Arrow in-memory data format](https://arrow.apache.org/docs/format/Columnar.html) ensures that the data can pass from the storage to the engine (and even across the systems in a distributed environment) and back to the user without slowing down to serialize and deserialize. |

## History

The composable data ecosystem has been envisioned for some time. [Wes
McKinney](https://wesmckinney.com) has been instrumental in its development,
co-founding Voltron Data, co-creating Apache Arrow, and initially creating
Ibis. [Wes looked back on 15 years on the road to composable data
systems](https://wesmckinney.com/blog/looking-back-15-years/) and gave some
motivation for Ibis in his well-known ["Apache Arrow and the '10 Things I Hate
About pandas'"](https://wesmckinney.com/blog/apache-arrow-pandas-internals/).

Ibis started as a pandas-like API for Apache Impala, but has since expanded to
support many backends. It currently leverages open-source projects like
[SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy) and
[SQLGlot](https://github.com/tobymao/sqlglot) to work with those backends. While
these projects are great, they rely on backend-specific SQL that does not
constitute a standard. Going forward, we expect ADBC and Substrait to be the
standards for connectivity and intermediate representation, respectively.

## Apache Arrow

Ibis uses [Apache Arrow](https://arrow.apache.org/) to provide a common data
format for data interchange between Ibis and backends. Many backends also use
Apache Arrow as their in-memory data format.
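
As a minimal sketch of what this looks like from the user's side, Ibis can
consume Arrow tables and hand results back as Arrow tables. This assumes the
default DuckDB backend and `pyarrow` are installed; the column names are
illustrative:

```python
import ibis
import pyarrow as pa

# Build an Ibis table directly from an Arrow table.
t = ibis.memtable(pa.table({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]}))

# Results can come back as Arrow data rather than pandas.
result = t.filter(t.value > 0.1).to_pyarrow()
print(result.schema)
```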

### Dataframe interchange protocol

Ibis supports [the dataframe interchange
protocol](https://data-apis.org/dataframe-protocol/latest/purpose_and_scope.html)
for data interchange with other Python dataframe libraries and visualization
libraries. This works well largely because most libraries support Apache Arrow
as their in-memory data format, making interchange between them cheap and
efficient.

## Apache Arrow Database Connectivity (ADBC)

[Apache Arrow Database Connectivity
(ADBC)](https://arrow.apache.org/docs/format/ADBC.html) is a relatively new
standard for database connectivity. It is an API for exchanging data between a
client and a database. It is a successor to ODBC and JDBC, and is designed to be
a more modern and performant alternative to these standards.

While Ibis does not currently use ADBC for its backends, as the project matures
we expect an increase in performance and a decrease in complexity for backends
that support ADBC.
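
For reference, here is a standalone sketch of ADBC itself rather than of any
Ibis API, assuming the `adbc-driver-sqlite` package is installed; it queries a
SQLite database and fetches the result as Arrow data:

```python
import adbc_driver_sqlite.dbapi as sqlite_adbc

con = sqlite_adbc.connect()  # assumed to default to an in-memory SQLite database
cur = con.cursor()
cur.execute("SELECT 1 AS one, 'a' AS letter")
table = cur.fetch_arrow_table()  # results arrive as a pyarrow.Table
print(table)
cur.close()
con.close()
```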

## Substrait

[Substrait](https://substrait.io/) is a relatively new standard for
cross-language serialization of relational algebra. It is intended as an
intermediate representation between the user interface and other points in the
data system. Ibis can already compile expressions to Substrait, which can then
be executed by Substrait-consuming backends. Support today is limited but, like
ADBC, we expect the project to mature and Ibis to leverage it more in the
future.
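
As a sketch of what that compilation step looks like today, assuming the
separate `ibis-substrait` package is installed and exposes a `SubstraitCompiler`
(the table and column names are illustrative):

```python
import ibis
from ibis_substrait.compiler.core import SubstraitCompiler

t = ibis.table({"a": "int64", "b": "string"}, name="t")
expr = t.group_by("b").aggregate(total=t.a.sum())

compiler = SubstraitCompiler()
plan = compiler.compile(expr)  # a Substrait Plan protobuf message
print(type(plan))
```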

### Why not SQL?

Structured Query Language (SQL) is not a standard in practice. There is a
commonly referenced [ANSI Standard for
SQL](https://blog.ansi.org/sql-standard-iso-iec-9075-2023-ansi-x3-135) that you
can pay a lot of money to access and that most execution engines claim to
support. However, most execution engines extend or subtly deviate from the
standard, and in practice it is not possible to reuse SQL from one execution
engine on another without modification. This leads to notoriously difficult
database migrations and vendor lock-in.

### Why Substrait?

Substrait, unlike SQL, is not intended as a user interface. Instead, a user
interface like Ibis in Python or dplyr in R would compile to Substrait and pass
it to a Substrait-consuming execution engine. This decouples the user interface
from the execution engine, allowing for more flexibility and portability.

## Going forward

Ibis intends to leverage other standards in the broader composable data
ecosystem to simplify its implementation and improve performance.
Binary file added docs/concepts/images/standards.png
82 changes: 82 additions & 0 deletions docs/concepts/user-testimonials.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# User testimonials

This page collects user testimonials about Ibis from the community! They may
have been lightly edited for clarity, with the originals linked.

## From the community

From [Nick Crews on
GitHub](https://github.com/ibis-project/ibis/issues/7743#issuecomment-1856391530):

> I have been very impressed with the responsiveness of the team. When I report
> bugs they are usually addressed within the next release in the next 1-2 months,
> and some feature requests of mine have been implemented with little convincing
> needed. The detailed CHANGELOG has made version transitions fairly easy, though
> there has been some bulk refactoring occasionally needed, but through the last 3
> major version upgrades I've gone through the process has never been that bad.
> Nothing but good things to say :), I hope you join along!

---

From [Daniel Kim on
Zulip](https://ibis-project.zulipchat.com/#narrow/stream/405263-general/topic/.E2.9C.94.20ibis-on-reddit/near/407807779):

> We have a production DB2 server that is already under a heavy load. So what I've
> done was extract a subset of the data it has locally onto my machine and then
> use ibis w/duckdb backend to perform ad-hoc analysis on this local data which is
> a bit too big for pandas, instead of hammering the production server. Often
> times, I don't know what queries I'll be building or what kind of rabbit hole my
> analysis may take me. So it's great that I can just query away with my local
> data. Performance has been great.

[And later in the same
topic](https://ibis-project.zulipchat.com/#narrow/stream/405263-general/topic/.E2.9C.94.20ibis-on-reddit/near/407813572):

> ...I have a lot of "medium" data that I need to work with locally, and so Ibis
> has been perfect for my use cases. We have this metric called cumulative defect
> rate that I need to forecast. It requires making cumulative sums and then having
> to pivot this data, along with some wonky transformations requiring UDFs. The
> need to dynamically pivot this data is where I turn to Ibis. Love that with
> Ibis, I can use SQL for the heavy lifting or aggregations, and then being able
> to switch to dataframe-like API for the type of dynamic transformations (pivot,
> forward fill, etc) that would otherwise be tedious to do in pure SQL.

---

From [stereoF on
GitHub](https://github.com/ibis-project/ibis/issues/7341#issuecomment-1760625921):

> My story around pyspark -> trying a bunch of stuff -> Ibis, which has feature of
> lazy computation.
> Our company has implemented an OLAP platform with its persistence layer on hdfs
> and the query engine being Presto. Typically, the OLAP platform is geared
> towards agile analysis, and its table structure is based on an event-driven
> model. As we delve deeper into machine learning modeling, we often need to
> transition from this event-based structure to a wide-table feature construction.
> Back between 2019 and 2020, I worked on a similar OLAP platform during my tenure
> at Tencent. I developed some generic analysis model tools, and at that time, the
> query engine was Impala. My approach was to dynamically concatenate SQL, which
> unfortunately was not conducive to code encapsulation, modularization, and
> future maintenance.
> In my pursuit of better code encapsulation and to decouple different parts of
> logic, I was initially inclined to use PySpark. However, when PySpark connects
> to Presto via JDBC, if we use the dataframe interface, the aggregation
> operations run on Spark. This doesn't harness the full power of Presto, leading
> to slow performances. On the other hand, if we use Spark's SQL interface,
> aggregation is processed on Presto. But in doing so, we lose the original intent
> of using Spark - which is better code encapsulation and the decoupling of
> different processes.
> The dataframe interface of Ibis and its feature of lazy computation perfectly
> align with my needs. In fact, back in 2019, I was on the hunt for such a tool.
> Sadly, I didn't come across Ibis at that time and even contemplated creating a
> set on my own.

## Have a story to share?

Let us know! We'd love to include it here. Please share your experience with
Ibis [in our Zulip community chat](https://ibis-project.zulipchat.com) and make
a PR to this page (or ask us to do it for you).
63 changes: 63 additions & 0 deletions docs/concepts/who.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Who supports Ibis?

Ibis is an open-source project that welcomes contributions from anyone! We have
a growing community of users and contributors, and we'd love to have you join
us. If you're interested in contributing, please see our [contributing
guide](/contribute).

## Voltron Data

[Voltron Data](https://voltrondata.com) is the primary sponsor of Ibis, with
most of the core development team employed there. As of writing, this includes
five full-time developers, one technical product manager, and other staff who
contribute to Ibis.

## Other companies

Ibis is used by many other companies, with various tools built on top of it.
Some include:

- [Google BigQuery DataFrames](https://github.com/googleapis/python-bigquery-dataframes), a clone of the pandas API built on Ibis
- [Starburst Galaxy Python DataFrames](https://www.starburst.io/blog/introducing-python-dataframes/), with support for Ibis
- [Claypot AI's contribution of the Flink backend](https://github.com/claypotai/ibis-flink-example), working in collaboration with Voltron Data
- [Microsoft's Magpie project](https://www.microsoft.com/en-us/research/project/magpie-2/), built on top of Ibis
- [SuperDuperDB](https://github.com/SuperDuperDB/superduperdb), bringing AI to any backend Ibis supports

Other companies also contribute to Ibis. You can [look through the full list
of contributors on
GitHub](https://github.com/ibis-project/ibis/graphs/contributors).

## History

Ibis was originally created by [Wes McKinney](https://wesmckinney.com/). Wes
created pandas, co-created Apache Arrow, and co-founded Voltron Data (among
other things). Ibis was initially a pandas-like dataframe library for Apache
Impala, but has since grown to support many other backends and matured under the
stewardship of [Phillip Cloud](https://github.com/cpcloud) and others on the
Ibis team.

The Ibis project is part of a broader composable data ecosystem envisioned by
Wes, Voltron Data, and others to solve problems seen throughout the space that
are compounding as data volume and AI complexity increase. Some good background
material on the composable data ecosystem and Ibis can be found at:

- ["Apache Arrow and the '10 Things I Hate About pandas'" by Wes](https://wesmckinney.com/blog/apache-arrow-pandas-internals/)
- ["The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future" by Wes](https://wesmckinney.com/blog/looking-back-15-years/)
- ["The Composable Codex" by Voltron Data](https://voltrondata.com/codex)

## Support for production workloads

Voltron Data is committed to the success of Ibis, and it's already in production
across numerous enterprises. The API is stable, and while there are breaking
changes across major versions, we do our best to minimize them and provide an
easy migration path.

[Voltron Data offers commercial support for
Ibis](https://voltrondata.com/enterprise-support) if you're interested.
Otherwise, interacting through the open-source project channels
([GitHub](https://github.com/ibis-project/ibis) and
[Zulip](https://ibis-project.zulipchat.com)) is the best way to get help.

## Next steps

If you're interested, [get started with Ibis!](../tutorials/getting_started.qmd)
94 changes: 89 additions & 5 deletions docs/contribute/03_style.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

## Code style

Ibis uses several code linters, like [`ruff`](https://github.com/charliermarsh/ruff), [`shellcheck`](https://github.com/koalaman/shellcheck), [`statix`](https://github.com/nerdypepper/statix), [`nixpkgs-fmt`](https://github.com/nix-community/nixpkgs-fmt) and others, that are enforced by CI. Developers should run them locally before submitting a PR.
Ibis uses several code linters, like
[`ruff`](https://github.com/charliermarsh/ruff),
[`shellcheck`](https://github.com/koalaman/shellcheck),
[`statix`](https://github.com/nerdypepper/statix),
[`nixpkgs-fmt`](https://github.com/nix-community/nixpkgs-fmt) and others, that
are enforced by CI. Developers should run them locally before submitting a PR.

1. Install `pre-commit`

Expand All @@ -16,22 +21,101 @@ pip install pre-commit
pre-commit run --all-files
```
::: {.callout-note}
Some of the packages needed to run the `pre-commit` linting can not be installed automatically (e.g. `prettier`, `actionlint`, `shellcheck`), and they need to be installed through a system package manager.
Some of the packages needed to run the `pre-commit` linting cannot be installed
automatically (e.g. `prettier`, `actionlint`, `shellcheck`), and they need to be
installed through a system package manager.
:::

Optionally, you may want to setup the `pre-commit` hooks to run automatically when making a git commit. To do this, run the following from the root of the Ibis repository:
Optionally, you may want to set up the `pre-commit` hooks to run automatically
when making a git commit. To do this, run the following from the root of the
Ibis repository:

```sh
pre-commit install
```

This will run the code linters automatically when you make a git commit. If you want to skip these checks, do `git commit --no-verify`
This will run the code linters automatically when you make a git commit. If you
want to skip these checks, run `git commit --no-verify`.


::: {.callout-tip}
If you use `nix-shell`, all of these are already setup for you and ready to use, and you don't need to do anything to install these tools.
If you use `nix-shell`, all of these are already set up for you and ready to use,
and you don't need to do anything to install these tools.
:::

## Docstrings
We use [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) as our
standard format for docstrings.
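
For example, a numpydoc-style docstring looks roughly like this (an
illustrative function, not one from the codebase):

```python
def add_one(x: int) -> int:
    """Add one to `x`.

    Parameters
    ----------
    x
        The value to increment.

    Returns
    -------
    int
        `x` plus one.
    """
    return x + 1
```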

## Documentation, blog, and other prose style

Always capitalize Ibis in prose.

### General points on style and word usage

Avoid the passive voice. Either Ibis does something or the user does
something or a particular backend does something.

Try to avoid using words like "simply", "simple", "obviously", "just", when
describing things that Ibis makes easier. Either the simplicity is
self-evident, or it isn't and you are inadvertently insulting the reader.

Prose should be authored by you™. We discourage LLM-written docs and posts, but
you are free to use these tools to aid in your writing.

Use American English spelling and grammar. For example, use "color" not "colour",
"realize" not "realise", and "behavior" not "behaviour".

### Text formatting

Wrap long prose strings to fewer than 90 characters per line. It is very helpful
to reviewers when looking at diffs of prose.

- In Vim you can use `gq` and then a movement to wrap. `gqG` to wrap an entire
document.
- In Neovim you can use `gw` and then a movement to wrap. `gwG` to wrap an
  entire document.
- In Emacs you can use `fill-paragraph`.
- VSCode has a plugin called Rewrap (and probably several others).

::: {.callout-note}
Wrapping bulleted sentences requires a bit of extra attention. Ensure that the
wrapped line begins in-line with the start of the bulleted text, not the bullet.

```
* This is a long sentence that is a bulleted sentence and perhaps it shouldn't
have been a bullet but it got away from me and, well, here we are.
```
:::

Do not commit Jupyter notebooks. You are more than welcome to author docs in
Jupyter but the `ipynb` file should be converted to `qmd`. (We use notebooks
for tutorials and interactive work, but for prose review, JSON is suboptimal).

Use `# Sentence case for section headers`, not `# Title Case for Headers`

### Quarto

Any computations that can't or shouldn't be done again should use `freeze: auto`
in the YAML front matter. This is the default behavior for documents in the
`docs/posts` directory.

::: {.callout-note}
**NEVER** use `freeze: true` as this results in silently stale pages.
:::

Format code blocks with `black` or `ruff` where possible.

Prefer `language` over `{.language}` for non-executable code blocks:

Prefer this:

```python
code here
```

Over this:

```{.python}
code here
```
8 changes: 4 additions & 4 deletions docs/contribute/04_maintainers_guide.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ Ibis maintainers are expected to handle the following tasks as they arise:

## Dependencies

A number of tasks that are typically associated with maintenance are partially or fully automated.

- [WhiteSource Renovate](https://www.whitesourcesoftware.com/free-developer-tools/renovate/) (Python library dependencies and GitHub Actions)
- [Custom GitHub Action](https://github.com/ibis-project/ibis/actions/workflows/update-deps.yml) (Nix dependencies)
Dependency updates are automated using [Mend Renovate](https://www.mend.io/renovate/).

### poetry

Occasionally you may need to lock [`poetry`](https://python-poetry.org) dependencies. Edit `pyproject.toml` as needed, then run:

```sh
poetry lock --no-update
poetry export --extras all --with dev --with test --with docs --without-hashes --no-ansi > requirements-dev.txt
```

The second step updates `requirements-dev.txt` for developers using `pip`.

## Adding examples

If you're not a maintainer, please open an issue asking us to add your example.
Expand Down
101 changes: 25 additions & 76 deletions docs/how-to/extending/builtin.qmd
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
---
execute:
freeze: auto
---

# Reference built-in functions


Expand All @@ -27,9 +22,10 @@ functions](https://duckdb.org/docs/sql/functions/char.html#text-similarity-funct
Let's expose the `mismatches` API.

```{python}
from ibis import udf
import ibis
ibis.options.interactive = True
@udf.scalar.builtin
@ibis.udf.scalar.builtin
def mismatches(left: str, right: str) -> int:
...
```
Expand All @@ -47,8 +43,6 @@ write in the function body **will be ignored**.
We can now call this function on any ibis expression:

```{python}
import ibis
con = ibis.duckdb.connect() # <1>
```

Expand All @@ -62,98 +56,51 @@ con.execute(expr)
Like any other ibis expression you can inspect the SQL:

```{python}
import ibis
ibis.to_sql(expr, dialect="duckdb") # <1>
```

1. The `dialect` keyword argument must be passed, because we constructed
a literal expression which has no backend attached.

Because built-in UDFs are ultimately Ibis expressions, they compose with the
rest of the library:
Similarly we can expose DuckDB's
[`jaro_winkler_similarity`](https://duckdb.org/docs/sql/functions/char.html#text-similarity-functions)
function. Let's alias it to `jw_sim` to illustrate some more of the Ibis `udf` API:

```{python}
ibis.options.interactive = True
@udf.scalar.builtin
def jaro_winkler_similarity(a: str, b: str) -> float:
@ibis.udf.scalar.builtin(name="jaro_winkler_similarity")
def jw_sim(a: str, b: str) -> float:
...
```

Because built-in UDFs are ultimately Ibis expressions, they compose with the
rest of the library:

```{python}
pkgs = ibis.read_parquet(
"https://storage.googleapis.com/ibis-tutorial-data/pypi/packages.parquet"
)
pandas_ish = pkgs[jaro_winkler_similarity(pkgs.name, "pandas") >= 0.9]
pandas_ish = pkgs[jw_sim(pkgs.name, "pandas") >= 0.9]
pandas_ish
```

Let's count the results:
### Defining signatures

```{python}
pandas_ish.count()
```

There are a good number of packages that look similar to `pandas`!

### Snowflake

Similarly we can expose Snowflake's
[`jarowinkler_similarity`](https://docs.snowflake.com/en/sql-reference/functions/jarowinkler_similarity)
function.

Let's alias it to `jw_sim` to illustrate some more of the Ibis `udf` API:

```{python}
@udf.scalar.builtin(name="jarowinkler_similarity") # <1>
def jw_sim(left: str, right: str) -> float:
...
```

1. `target` is the name of the function in the backend. This argument is
required in this because the function name is different than the name of the
function in ibis.


Now let's connect to Snowflake and call our `jw_sim` function:

```{python}
import os
con = ibis.connect(os.environ["SNOWFLAKE_URL"])
```

```{python}
expr = jw_sim("snow", "shoe")
con.execute(expr)
```

And let's take a look at the SQL

```{python}
ibis.to_sql(expr, dialect="snowflake")
```

### Input types

Sometimes the input types of builtin functions are difficult to spell.
Sometimes the signatures of builtin functions are difficult to spell.

Consider a function that computes the length of any array: the elements in the
array can be floats, integers, strings and even other arrays. Spelling that
type is difficult.

Fortunately the `udf.scalar.builtin` decorator doesn't require you to specify
input types in these cases:
Fortunately, the `udf.scalar.builtin` decorator **only** requires you to
specify the type of the **return value**. The types of the function **parameters**
are **not** required. Thus, this is adequate:

```{python}
@udf.scalar.builtin(name="array_size")
@ibis.udf.scalar.builtin(name="array_length")
def cardinality(arr) -> int:
...
```

::: {.callout-caution}
## The return type annotation **is always required**.
:::

We can pass arrays with different element types to our `cardinality` function:

```{python}
Expand All @@ -164,14 +111,16 @@ con.execute(cardinality([1, 2, 3]))
con.execute(cardinality(["a", "b"]))
```

When you bypass input types the errors you get back are backend dependent:
When you do not specify input types, Ibis isn't able to catch typing errors
early, and they are only caught during execution.
The errors you get back are backend dependent:

```{python}
#| error: true
con.execute(cardinality("foo"))
```

Here, Snowflake is informing us that the `ARRAY_SIZE` function does not accept
Here, DuckDB is informing us that the `ARRAY_LENGTH` function does not accept
strings as input.


Expand All @@ -198,7 +147,7 @@ function that isn't exposed in ibis:
First, define the builtin aggregate function:

```{python}
@udf.agg.builtin
@ibis.udf.agg.builtin
def kurtosis(x: float) -> float: # <1>
...
```
Expand Down
2 changes: 1 addition & 1 deletion docs/how-to/timeseries/sessionize.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,6 @@ sessionized = (
)
```

Calling `ibis.show_sql(sessionized)` displays the SQL query and can be used to confirm that this Ibis table expression does not rely on any join operations.
Calling `print(ibis.to_sql(sessionized))` displays the SQL query and can be used to confirm that this Ibis table expression does not rely on any join operations.

Calling `sessionized.to_pandas()` should complete in less than a minute, depending on the speed of the internet connection to download the data and the number of CPU cores available to parallelize the processing of this nested query.
21 changes: 15 additions & 6 deletions docs/index.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ title: "Ibis"
description: "the portable Python dataframe library"
repo-actions: false
code-annotations: hover
twitter-card:
image: logo.png
format:
html:
toc: false
Expand Down Expand Up @@ -40,10 +42,10 @@ about:
We recommend starting with the default backend (DuckDB).

```bash
pip install 'ibis-framework[duckdb]' # <1>
pip install 'ibis-framework[duckdb,examples]' # <1>
```

1. Install Ibis with optional dependencies (DuckDB in this case).
1. Install Ibis with the DuckDB backend along with examples.

<div class="d-grid gap-2"><a class="btn btn-lg btn-primary" data-bs-toggle="collapse" href="#collapseBackends" role="button" aria-expanded="false" aria-controls="collapseBackends" margin="100px">Show supported backends</a></div>

Expand Down Expand Up @@ -306,12 +308,13 @@ X = f.select(s.contains("zscore")) # <2>
n_components = 3 # <3>
pca = PCA(n_components=n_components).fit(X) # <3>
t_pca = ibis.memtable(pca.transform(X)).relabel( # <4>
{"col0": "pc1", "col1": "pc2", "col2": "pc3"} # <4>
t_pca = ibis.memtable(pca.transform(X)).rename( # <4>
{"pc1": "col0", "pc2": "col1", "pc3": "col2"} # <4>
) # <4>
f = f.mutate(row_number=ibis.row_number().over()).join( # <5>
t_pca.mutate(row_number=ibis.row_number().over()), "row_number" # <5>
t_pca.mutate(row_number=ibis.row_number().over()), # <5>
"row_number", # <5>
) # <5>
px.scatter_3d( # <6>
Expand Down Expand Up @@ -355,7 +358,13 @@ Ibis supports a variety of input and output options.

### SQL + Python

Ibis has the `ibis.to_sql` to generate SQL strings and `ibis.show_sql` display them. Ibis uses [SQLGlot](https://sqlglot.com) under the hood to allow passing a `dialect` parameter to SQL methods.
Ibis has `ibis.to_sql` to generate SQL strings.

In a Jupyter notebook or IPython shell session, the output of `ibis.to_sql` will be syntax highlighted.

In a plain Python REPL use `print(ibis.to_sql(...))` to pretty print SQL.

Ibis uses [SQLGlot](https://sqlglot.com) under the hood to allow passing a `dialect` parameter to SQL methods.

::: {.panel-tabset}

Expand Down
Binary file added docs/logo.png
1 change: 1 addition & 0 deletions docs/posts/1brc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1brc