230 changes: 0 additions & 230 deletions .github/workflows/ibis-backends-flink.yml

This file was deleted.

103 changes: 87 additions & 16 deletions .github/workflows/ibis-backends.yml
@@ -62,6 +62,7 @@ jobs:
extras:
- duckdb
- deltalake
- geospatial
additional_deps:
- torch
- name: pandas
@@ -158,6 +159,23 @@ jobs:
- oracle
services:
- oracle
- name: exasol
title: Exasol
serial: true
extras:
- exasol
services:
- exasol
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
- pytest-split
services:
- flink
exclude:
- os: windows-latest
backend:
@@ -254,6 +272,33 @@ jobs:
- oracle
services:
- oracle
- os: windows-latest
backend:
name: flink
title: Flink
serial: true
extras:
- flink
services:
- flink
- python-version: "3.11"
backend:
name: flink
title: Flink
serial: true
extras:
- flink
services:
- flink
- os: windows-latest
backend:
name: exasol
title: Exasol
serial: true
extras:
- exasol
services:
- exasol
steps:
- name: update and install system dependencies
if: matrix.os == 'ubuntu-latest' && matrix.backend.sys-deps != null
@@ -282,18 +327,18 @@ jobs:
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}
custom_cache_key_element: ${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'
run: python -m pip install --upgrade pip 'poetry==1.7.1'

- uses: syphar/restore-virtualenv@v1
with:
@@ -303,6 +348,10 @@ jobs:
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"

- name: install deps for broken avro-python setup
if: matrix.backend.name == 'flink'
run: poetry run pip install wheel

- name: install other deps
if: matrix.backend.additional_deps != null
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}
@@ -325,8 +374,22 @@ jobs:
IBIS_TEST_WEBHDFS_USER: hdfs
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}

# FIXME(deepyaman): If some backend-specific test, in test_ddl.py,
# executes before common tests, they will fail with:
# org.apache.flink.table.api.ValidationException: Table `default_catalog`.`default_database`.`functional_alltypes` was not found.
# Therefore, we run backend-specific tests second to avoid this.
- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name == 'flink'
run: |
just ci-check -m ${{ matrix.backend.name }} ibis/backends/tests
just ci-check -m ${{ matrix.backend.name }} ibis/backends/flink/tests
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
FLINK_REMOTE_CLUSTER_ADDR: localhost
FLINK_REMOTE_CLUSTER_PORT: "8081"

- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name != 'impala'
if: matrix.backend.serial && matrix.backend.name != 'impala' && matrix.backend.name != 'flink'
run: just ci-check -m ${{ matrix.backend.name }}
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
@@ -430,13 +493,17 @@ jobs:
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'
run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: remove lonboard
# it requires a version of pandas that min versions are not compatible with
run: poetry remove lonboard

- name: install minimum versions
run: poetry add --lock --optional ${{ join(matrix.backend.deps, ' ') }}
@@ -484,7 +551,7 @@ jobs:
- name: checkout
uses: actions/checkout@v4

- uses: actions/setup-java@v3
- uses: actions/setup-java@v4
with:
distribution: microsoft
java-version: 17
@@ -497,13 +564,17 @@ jobs:
run: just download-data

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'
run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: remove lonboard
# it requires a version of pandas that pyspark is not compatible with
run: poetry remove lonboard

- name: install maximum versions of pandas and numpy
run: poetry add --lock 'pandas@<2' 'numpy<1.24'
@@ -542,14 +613,14 @@ jobs:
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.11"

- run: python -m pip install --upgrade pip 'poetry==1.6.1'
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: remove deps that are not compatible with sqlalchemy 2
run: poetry remove snowflake-sqlalchemy
run: poetry remove snowflake-sqlalchemy sqlalchemy-exasol

- name: add sqlalchemy 2
run: poetry add --lock --optional 'sqlalchemy>=2,<3'
@@ -567,7 +638,7 @@ jobs:
run: poetry show sqlalchemy --no-ansi | grep version | cut -d ':' -f2- | sed 's/ //g' | grep -P '^2\.'

- name: upload deps file
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: deps
path: |
@@ -657,13 +728,13 @@ jobs:
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}

- name: download poetry lockfile
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: deps
path: deps
@@ -686,7 +757,7 @@ jobs:
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'
run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
30 changes: 15 additions & 15 deletions .github/workflows/ibis-docs-lint.yml
@@ -31,7 +31,7 @@ jobs:
fetch-depth: 0

- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
nix_path: nixpkgs=channel:nixos-unstable-small
extra_nix_config: |
@@ -47,13 +47,13 @@ jobs:
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -72,13 +72,13 @@ jobs:
fetch-depth: 0

- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -95,7 +95,7 @@ jobs:
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: "3.11"
@@ -115,7 +115,7 @@ jobs:
requirement_files: poetry.lock
custom_cache_key_element: benchmarks-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.6.1'
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: install ibis
run: poetry install --without dev --without docs --all-extras
@@ -126,11 +126,11 @@ jobs:
- name: benchmark
run: poetry run pytest --benchmark-enable --benchmark-json .benchmarks/output.json ibis/tests/benchmarks

- uses: google-github-actions/auth@v1
- uses: google-github-actions/auth@v2
with:
credentials_json: ${{ secrets.GCP_CREDENTIALS }}

- uses: google-github-actions/setup-gcloud@v1
- uses: google-github-actions/setup-gcloud@v2

- name: show gcloud info
run: gcloud info
@@ -171,13 +171,13 @@ jobs:
concurrency: docs_pr-${{ github.repository }}-${{ github.head_ref || github.sha }}
steps:
- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -209,13 +209,13 @@ jobs:
concurrency: docs-${{ github.repository }}
steps:
- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -253,12 +253,12 @@ jobs:
with:
fetch-depth: 0

- uses: cachix/install-nix-action@v23
- uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v12
- uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
12 changes: 6 additions & 6 deletions .github/workflows/ibis-main.yml
@@ -54,7 +54,7 @@ jobs:
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
@@ -64,7 +64,7 @@ jobs:
requirement_files: poetry.lock
custom_cache_key_element: no-backends-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.6.1'
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- uses: syphar/restore-virtualenv@v1
with:
@@ -119,7 +119,7 @@ jobs:
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
@@ -141,7 +141,7 @@ jobs:
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential libgeos-dev
- run: python -m pip install --upgrade pip 'poetry==1.6.1'
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- name: install ibis
# install duckdb and geospatial because of https://github.com/ibis-project/ibis/issues/4856
@@ -172,7 +172,7 @@ jobs:
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
@@ -182,7 +182,7 @@ jobs:
requirement_files: poetry.lock
custom_cache_key_element: doctests-${{ steps.install_python.outputs.python-version }}

- run: python -m pip install --upgrade pip 'poetry==1.6.1'
- run: python -m pip install --upgrade pip 'poetry==1.7.1'

- uses: syphar/restore-virtualenv@v1
with:
4 changes: 2 additions & 2 deletions .github/workflows/nix.yml
@@ -42,14 +42,14 @@ jobs:
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v23
uses: cachix/install-nix-action@v24
with:
nix_path: nixpkgs=channel:nixos-unstable-small
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
5 changes: 3 additions & 2 deletions .github/workflows/pre-release.yml
@@ -17,22 +17,23 @@ concurrency:

jobs:
pre-release:
if: github.repository_owner == 'ibis-project'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: upgrade pip
run: python -m pip install --upgrade pip

- name: install poetry
run: python -m pip install 'poetry==1.6.1' poetry-dynamic-versioning
run: python -m pip install 'poetry==1.7.1' poetry-dynamic-versioning

- name: compute ibis version
id: get_version
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -25,12 +25,12 @@ jobs:
fetch-depth: 0
token: ${{ steps.generate_token.outputs.token }}

- uses: cachix/install-nix-action@v23
- uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v12
- uses: cachix/cachix-action@v13
with:
name: ibis
extraPullNames: nix-community,poetry2nix
10 changes: 6 additions & 4 deletions .github/workflows/update-deps.yml
@@ -7,12 +7,13 @@ on:

jobs:
get-flakes:
if: github.repository_owner == 'ibis-project'
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.get-flakes.outputs.matrix }}
steps:
- uses: actions/checkout@v4
- uses: cachix/install-nix-action@v23
- uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
@@ -25,6 +26,7 @@ jobs:
echo "matrix=${flakes}" >> "$GITHUB_OUTPUT"
flake-update:
if: github.repository_owner == 'ibis-project'
runs-on: ubuntu-latest
needs:
- get-flakes
@@ -34,13 +36,13 @@ jobs:
steps:
- uses: actions/checkout@v4

- uses: cachix/install-nix-action@v23
- uses: cachix/install-nix-action@v24
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v12
uses: cachix/cachix-action@v13
with:
name: ibis
extraPullNames: nix-community,poetry2nix
@@ -51,7 +53,7 @@ jobs:
input: ${{ matrix.flake }}

- name: update ${{ matrix.flake }}
run: nix flake lock --update-input ${{ matrix.flake }}
run: nix flake update ${{ matrix.flake }}

- uses: cpcloud/flake-dep-info-action@v2.0.11
id: get_new_commit
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
@@ -21,11 +21,6 @@ repos:
rev: v1.6.26
hooks:
- id: actionlint-system
- repo: https://github.com/keewis/blackdoc
rev: v0.3.9
hooks:
- id: blackdoc
exclude: ibis/examples/__init__\.py
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
hooks:
2 changes: 1 addition & 1 deletion ci/schema/clickhouse.sql
@@ -87,7 +87,7 @@ INSERT INTO ibis_testing.map VALUES
(map('a', 1, 'b', 2, 'c', 3)),
(map('d', 4, 'e', 5, 'c', 6));

CREATE OR REPLACE TABLE ibis_testing.win (g String, x Int64, y Int64) ENGINE = Memory;
CREATE OR REPLACE TABLE ibis_testing.win (g Nullable(String), x Int64, y Nullable(Int64)) ENGINE = Memory;
INSERT INTO ibis_testing.win VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/duckdb.sql
@@ -39,7 +39,7 @@ INSERT INTO json_t VALUES
('[42,47,55]'),
('[]');

CREATE OR REPLACE TABLE win (g TEXT, x BIGINT, y BIGINT);
CREATE OR REPLACE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
75 changes: 75 additions & 0 deletions ci/schema/exasol.sql
@@ -0,0 +1,75 @@
DROP SCHEMA IF EXISTS EXASOL CASCADE;
CREATE SCHEMA EXASOL;

CREATE OR REPLACE TABLE EXASOL.diamonds
(
"carat" DOUBLE,
"cut" VARCHAR(256),
"color" VARCHAR(256),
"clarity" VARCHAR(256),
"depth" DOUBLE,
"table" DOUBLE,
"price" BIGINT,
"x" DOUBLE,
"y" DOUBLE,
"z" DOUBLE
);

CREATE OR REPLACE TABLE EXASOL.batting
(
"playerID" VARCHAR(256),
"yearID" BIGINT,
"stint" BIGINT,
"teamID" VARCHAR(256),
"logID" VARCHAR(256),
"G" BIGINT,
"AB" BIGINT,
"R" BIGINT,
"H" BIGINT,
"X2B" BIGINT,
"X3B" BIGINT,
"HR" BIGINT,
"RBI" BIGINT,
"SB" BIGINT,
"CS" BIGINT,
"BB" BIGINT,
"SO" BIGINT,
"IBB" BIGINT,
"HBP" BIGINT,
"SH" BIGINT,
"SF" BIGINT,
"GIDP" BIGINT
);

CREATE OR REPLACE TABLE EXASOL.awards_players
(
"playerId" VARCHAR(256),
"awardID" VARCHAR(256),
"yearID" VARCHAR(256),
"logID" VARCHAR(256),
"tie" VARCHAR(256),
"notest" VARCHAR(256)
);

CREATE OR REPLACE TABLE EXASOL.functional_alltypes
(
"id" INTEGER,
"bool_col" BOOLEAN,
"tinyint_col" SHORTINT,
"small_int" SMALLINT,
"int_col" INTEGER,
"bigint_col" BIGINT,
"float_col" FLOAT,
"double_col" DOUBLE PRECISION,
"date_string_col" VARCHAR(256),
"string_col" VARCHAR(256),
"timestamp_col" TIMESTAMP,
"year" INTEGER,
"month" INTEGER
);


IMPORT INTO EXASOL.diamonds FROM LOCAL CSV FILE '/data/diamonds.csv' COLUMN SEPARATOR = ',' SKIP = 1;
IMPORT INTO EXASOL.batting FROM LOCAL CSV FILE '/data/batting.csv' COLUMN SEPARATOR = ',' SKIP = 1;
IMPORT INTO EXASOL.awards_players FROM LOCAL CSV FILE '/data/awards_players.csv' COLUMN SEPARATOR = ',' SKIP = 1;
IMPORT INTO EXASOL.functional_alltypes FROM LOCAL CSV FILE '/data/functional_alltypes.csv' COLUMN SEPARATOR = ',' SKIP = 1;
2 changes: 1 addition & 1 deletion ci/schema/mssql.sql
@@ -124,7 +124,7 @@ WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS win;

CREATE TABLE win (g VARCHAR(MAX), x BIGINT, y BIGINT);
CREATE TABLE win (g VARCHAR(MAX), x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/mysql.sql
@@ -112,7 +112,7 @@ INSERT INTO json_t VALUES

DROP TABLE IF EXISTS win CASCADE;

CREATE TABLE win (g TEXT, x BIGINT, y BIGINT);
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/oracle.sql
@@ -108,7 +108,7 @@ CREATE TABLE "functional_alltypes" (

DROP TABLE IF EXISTS "win";

CREATE TABLE "win" ("g" VARCHAR2(8), "x" NUMBER(18), "y" NUMBER(18));
CREATE TABLE "win" ("g" VARCHAR2(8), "x" NUMBER(18) NOT NULL, "y" NUMBER(18));
INSERT INTO "win" VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/postgres.sql
@@ -243,7 +243,7 @@ INSERT INTO json_t VALUES
('[]');

DROP TABLE IF EXISTS win CASCADE;
CREATE TABLE win (g TEXT, x BIGINT, y BIGINT);
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/snowflake.sql
@@ -133,7 +133,7 @@ INSERT INTO json_t ("js")
SELECT parse_json('[42,47,55]') UNION
SELECT parse_json('[]');

CREATE OR REPLACE TABLE win ("g" TEXT, "x" BIGINT, "y" BIGINT);
CREATE OR REPLACE TABLE win ("g" TEXT, "x" BIGINT NOT NULL, "y" BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
2 changes: 1 addition & 1 deletion ci/schema/sqlite.sql
@@ -112,7 +112,7 @@ INSERT INTO json_t VALUES
('[]');

DROP TABLE IF EXISTS win;
CREATE TABLE win (g TEXT, x BIGINT, y BIGINT);
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
4 changes: 2 additions & 2 deletions ci/schema/trino.sql
@@ -93,7 +93,7 @@ CREATE TABLE hive.default.awards_players (
"tie" VARCHAR,
"notes" VARCHAR
) WITH (
external_location = 's3a://warehouse/awards_players',
external_location = 's3a://warehouse/awards-players',
format = 'PARQUET'
);

@@ -116,7 +116,7 @@ CREATE TABLE hive.default.functional_alltypes (
"year" INTEGER,
"month" INTEGER
) WITH (
external_location = 's3a://warehouse/functional_alltypes',
external_location = 's3a://warehouse/functional-alltypes',
format = 'PARQUET'
);
CREATE OR REPLACE VIEW memory.default.functional_alltypes AS
67 changes: 58 additions & 9 deletions docker-compose.yml → compose.yaml
@@ -1,9 +1,9 @@
version: "3.4"
services:
clickhouse:
image: clickhouse/clickhouse-server:23.10.3.5-alpine
image: clickhouse/clickhouse-server:23.11.2.11-alpine
ports:
- 8123:8123 # http port
- 9000:9000 # native protocol port
healthcheck:
interval: 1s
retries: 10
@@ -162,13 +162,11 @@ services:
- trino

minio:
# TODO: healthcheck?
image: minio/minio:RELEASE.2023-11-01T18-37-25Z
image: bitnami/minio:2023.12.14
environment:
MINIO_ROOT_USER: accesskey
MINIO_ROOT_PASSWORD: secretkey
entrypoint: sh
command: -c 'mkdir -p /data/warehouse && minio server /data'
MINIO_SKIP_CLIENT: yes
healthcheck:
interval: 1s
retries: 20
@@ -178,8 +176,8 @@ services:
networks:
- trino
volumes:
- minio:/opt/data/raw
- $PWD/docker/minio/config.json:/tmp/.mc/config.json:ro
- minio:/data
- $PWD/docker/minio/config.json:/.mc/config.json:ro

hive-metastore:
# TODO: healthcheck?
@@ -224,7 +222,7 @@ services:
test:
- CMD-SHELL
- trino --output-format null --execute 'show schemas in hive; show schemas in memory'
image: trinodb/trino:433
image: trinodb/trino:435
ports:
- 8080:8080
networks:
@@ -412,6 +410,54 @@ services:
volumes:
- oracle:/opt/oracle/data

exasol:
image: exasol/docker-db:7.1.25
privileged: true
ports:
- 8563:8563
healthcheck:
interval: 10s
retries: 9
timeout: 90s
test:
- CMD-SHELL
- /usr/opt/EXASuite-7/EXASolution-7.*/bin/Console/exaplus -c 127.0.0.1:8563 -u sys -p exasol -encryption OFF <<< 'SELECT 1'
networks:
- exasol
volumes:
- exasol:/data

flink-jobmanager:
build: ./docker/flink
image: ibis-flink
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: flink-jobmanager
ports:
- 8081:8081
command: jobmanager
networks:
- flink

flink:
build: ./docker/flink
image: ibis-flink
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: flink-jobmanager
taskmanager.numberOfTaskSlots: 2
taskmanager.memory.process.size: 2048m
taskmanager.memory.network.fraction: 0.4
taskmanager.memory.network.min: 512mb
taskmanager.memory.network.max: 2gb
depends_on:
- flink-jobmanager
command: taskmanager
networks:
- flink

networks:
impala:
mysql:
@@ -421,6 +467,8 @@ networks:
trino:
druid:
oracle:
exasol:
flink:

volumes:
broker_var:
@@ -437,3 +485,4 @@ volumes:
oracle:
postgres:
minio:
exasol:
3 changes: 3 additions & 0 deletions docker/flink/Dockerfile
@@ -0,0 +1,3 @@
FROM flink:1.18.0-scala_2.12
# ibis-flink requires PyFlink dependency
RUN wget -nv -P $FLINK_HOME/lib/ https://repo1.maven.org/maven2/org/apache/flink/flink-python/1.18.0/flink-python-1.18.0.jar

7 changes: 7 additions & 0 deletions docs/_quarto.yml
@@ -543,3 +543,10 @@ quartodoc:
- Options
- Repr
- SQL

- title: Contributing
desc: "Ibis Backend Developer Documentation"
package: ibis.backends.tests.base
contents:
- BackendTest
- ServiceBackendTest
31 changes: 27 additions & 4 deletions docs/_renderer.py
@@ -1,5 +1,7 @@
from __future__ import annotations

from textwrap import dedent

import quartodoc as qd
import toolz
from plum import dispatch
@@ -26,6 +28,8 @@ def render(self, el: qd.ast.ExampleCode) -> str:
lambda line: quartodoc_skip_doctest in line or skip_doctest in line
)

has_executed_chunks = False

for chunk in toolz.partitionby(chunker, lines):
first, *rest = chunk

@@ -35,10 +39,11 @@ def render(self, el: qd.ast.ExampleCode) -> str:
# check whether to skip execution and if so, render the code
# block as `python` (not `{python}`) if it's marked with
# skip_doctest, expect_failure or quartodoc_skip_doctest
if not any(map(should_skip, chunk)):
start, end = "{}"
else:
if any(map(should_skip, chunk)):
start = end = ""
else:
has_executed_chunks = True
start, end = "{}"

result.append(f"```{start}python{end}")

@@ -62,4 +67,22 @@ def render(self, el: qd.ast.ExampleCode) -> str:
result.extend(rest)
result.append("```\n")

return "\n".join(result)
examples = "\n".join(result)

if has_executed_chunks:
# turn off interactive mode before rendering
return (
dedent(
"""
```{python}
#| echo: false
import ibis
ibis.options.interactive = False
```
"""
)
+ examples
)
else:
return examples
2 changes: 1 addition & 1 deletion docs/backends/_templates/api.qmd
@@ -11,7 +11,7 @@ backend = get_backend(module)
print(f"## `{module}.Backend` {{ #{backend.canonical_path} }}")
methods = sorted(
key for key, value in backend.members.items()
key for key, value in backend.all_members.items()
if value.is_function
if not value.name.startswith("_")
if value.name != "do_connect"
1 change: 1 addition & 0 deletions docs/backends/app/backend_info_app.py
@@ -51,6 +51,7 @@ def backends_info_df():
"datafusion": ["sql"],
"druid": ["sqlalchemy", "sql"],
"duckdb": ["sqlalchemy", "sql"],
"exasol": ["sqlalchemy", "sql"],
"flink": ["string", "sql"],
"impala": ["string", "sql"],
"mssql": ["sqlalchemy", "sql"],
96 changes: 96 additions & 0 deletions docs/backends/exasol.qmd
@@ -0,0 +1,96 @@
# Exasol

[https://www.exasol.com](https://www.exasol.com)

## Install

Install Ibis and dependencies for the Exasol backend:

::: {.panel-tabset}

## `pip`

Install with the `exasol` extra:

```{.bash}
pip install 'ibis-framework[exasol]'
```

And connect:

```{.python}
import ibis
con = ibis.exasol.connect(...) # <1>
```

1. Adjust connection parameters as needed.

## `conda`

Install for Exasol:

```{.bash}
conda install -c conda-forge ibis-exasol
```

And connect:

```{.python}
import ibis
con = ibis.exasol.connect(...) # <1>
```

1. Adjust connection parameters as needed.

## `mamba`

Install for Exasol:

```{.bash}
mamba install -c conda-forge ibis-exasol
```

And connect:

```{.python}
import ibis
con = ibis.exasol.connect(...) # <1>
```

1. Adjust connection parameters as needed.

:::

## Connect

### `ibis.exasol.connect`

```python
con = ibis.exasol.connect(
user = "username",
password = "password",
host = "localhost",
port = 8563,
schema = None,
encryption = True,
certificate_validation = True,
encoding = "en_US.UTF-8"
)
```
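
Once connected, the usual Ibis table API is available. A minimal sketch of a first sanity check (the `diamonds` table name is only an illustration, borrowed from the CI test schema added in this PR; your database will have its own tables):

```python
con.list_tables()          # list the tables Ibis can see
t = con.table("diamonds")  # hypothetical table name
t.head().to_pandas()       # pull a few rows into a pandas DataFrame
```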

::: {.callout-note}
`ibis.exasol.connect` is a thin wrapper around [`ibis.backends.exasol.Backend.do_connect`](#ibis.backends.exasol.Backend.do_connect).
:::

### Connection Parameters

```{python}
#| echo: false
#| output: asis
from _utils import render_do_connect
render_do_connect("exasol")
```
46 changes: 0 additions & 46 deletions docs/concepts/backend.qmd

This file was deleted.

6 changes: 3 additions & 3 deletions docs/contribute/01_environment.qmd
@@ -38,14 +38,14 @@ For a better development experience see the `conda` or `nix` setup instructions.
1. Install development dependencies

```sh
pip install 'poetry==1.6.1'
pip install 'poetry==1.7.1'
pip install -r requirements-dev.txt
```

1. Install ibis in development mode

```sh
pip install -e .
pip install -e '.[all]'
```

## Conda
@@ -212,7 +212,7 @@ for manager, params in managers.items():
## Building the docs
Run
Install [`just`](https://just.systems/man/en/chapter_4.html) and run
```bash
just docs-preview
2 changes: 1 addition & 1 deletion docs/contribute/02_workflow.qmd
@@ -42,7 +42,7 @@ pytest -m sqlite
## Setting up non-trivial backends

These client-server backends need to be started before testing them.
They can be started with `docker-compose` directly, or using the `just` tool.
They can be started with `docker compose` directly, or using the `just` tool.

- ClickHouse: `just up clickhouse`
- PostgreSQL: `just up postgres`
31 changes: 26 additions & 5 deletions docs/contribute/03_style.qmd
@@ -2,15 +2,36 @@

## Code style

- [`ruff`](https://github.com/charliermarsh/ruff): Formatting Python code and sorting `import` statements
- [`shellcheck`](https://github.com/koalaman/shellcheck): Linting shell scripts
- [`shfmt`](https://github.com/mvdan/sh): Formatting shell scripts
- [`statix`](https://github.com/nerdypepper/statix): Linting nix files
- [`nixpkgs-fmt`](https://github.com/nix-community/nixpkgs-fmt): Formatting nix files
Ibis uses several code linters, like [`ruff`](https://github.com/charliermarsh/ruff), [`shellcheck`](https://github.com/koalaman/shellcheck), [`statix`](https://github.com/nerdypepper/statix), [`nixpkgs-fmt`](https://github.com/nix-community/nixpkgs-fmt) and others, that are enforced by CI. Developers should run them locally before submitting a PR.

1. Install `pre-commit`

```sh
pip install pre-commit
```

2. Run

```sh
pre-commit run --all-files
```
::: {.callout-note}
Some of the packages needed by the `pre-commit` linters cannot be installed automatically (e.g. `prettier`, `actionlint`, `shellcheck`); they need to be installed through a system package manager.
:::

Optionally, you may want to set up the `pre-commit` hooks so that they run automatically when making a git commit. To do this, run the following from the root of the Ibis repository:

```sh
pre-commit install
```

This will run the code linters automatically when you make a git commit. If you want to skip these checks, run `git commit --no-verify`.


::: {.callout-tip}
If you use `nix-shell`, all of these tools are already set up and ready to use; you don't need to install anything.
:::

## Docstrings
We use [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) as our
standard format for docstrings.
@@ -38,8 +38,8 @@ Assuming your file is called `example.csv`:
1. Add a file named `ibis/examples/descriptions/example` that contains a
description of your example. One line is best, but not necessary.
1. Run one of the following **from the git root of an ibis clone**:
- `python ibis/examples/gen_registry.py` (doesn't include R dependenices)
- `nix run '.#gen-examples'` (includes R dependenices)
- `python ibis/examples/gen_registry.py` (doesn't include R dependencies)
- `nix run '.#gen-examples'` (includes R dependencies)

## Release

49 changes: 49 additions & 0 deletions docs/contribute/05_reference.qmd
@@ -0,0 +1,49 @@
---
title: "Test Class Reference"
---

This page provides a partial reference to the attributes, methods, properties
and class-level variables that are used to help configure a backend for the Ibis
test suite.

Contributors are encouraged to look over the methods and class-level variables
in `ibis/backends/tests/base.py`.

To add a new backend test configuration import one of `BackendTest` or
`ServiceBackendTest` into a `conftest.py` file with the path
`ibis/backends/{backend_name}/tests/conftest.py`. Then update / override the
relevant class-level variables and methods.

```python
from ibis.backends.tests.base import BackendTest

class TestConf(BackendTest):
"""Backend-specific class with information for testing."""

supports_divide_by_zero = True
supports_floating_modulus = False
returned_timestamp_unit = "us"
supports_structs = True
supports_json = True
check_names = False
force_sort = True

@staticmethod
def connect(*args, **kwargs):
...
```

```{python}
#| echo: false
#| output: asis
import os
paths = [
"../reference/BackendTest.qmd",
"../reference/ServiceBackendTest.qmd",
]
for path in filter(os.path.exists, paths):
with open(path) as f:
print(f.read())
```
3 changes: 2 additions & 1 deletion docs/how-to/extending/builtin.qmd
@@ -1,5 +1,6 @@
---
freeze: auto
execute:
freeze: auto
---

# Reference built-in functions
2 changes: 1 addition & 1 deletion docs/how-to/extending/sql.qmd
@@ -120,7 +120,7 @@ another.
## `Backend.sql`

There's also the `Backend.sql` method, which can handle arbitrary `SELECT`
statements as well and return's an Ibis table expression.
statements as well and returns an Ibis table expression.
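
A minimal sketch of what this looks like (the in-memory DuckDB connection and the table `t` are illustrative assumptions, not part of this page):

```python
import ibis

con = ibis.duckdb.connect()  # in-memory database for the sketch
con.create_table("t", ibis.memtable({"x": [1, 2, 3]}))

# Backend.sql accepts an arbitrary SELECT, but it can only reference
# tables that already exist in the database (here, "t")
expr = con.sql("SELECT x + 1 AS y FROM t")
expr.to_pandas()
```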

The main difference with `Table.sql` is that `Backend.sql` **can only refer to
tables that already exist in the database**, because the API is defined on
117 changes: 117 additions & 0 deletions docs/posts/dbt-ibis/index.qmd
@@ -0,0 +1,117 @@
---
title: "dbt-ibis: Write your dbt models using Ibis"
author: "Stefan Binder"
date: "2023-11-24"
categories:
- blog
- dbt
- data engineering
---

# Introduction to dbt
[dbt](https://github.com/dbt-labs/dbt-core) has revolutionized how transformations are
orchestrated and managed within modern data warehouses. Initially released in 2016,
dbt quickly gained traction within the data analytics community due to its focus on bringing software engineering best practices, like modularity, portability, CI/CD, and documentation, to analytics code.

At the heart of dbt are so-called "models", which are just simple SQL SELECT statements
(see further below for an example). dbt removes the need to write any DDL/DML,
allowing users to focus on writing SELECT statements. Depending on how you configure it, the queries are materialized as tables, views, or custom materializations. dbt also infers dependencies between models and runs them in order. The following is a dbt model which selects from two
other models called `stg_orders` and `stg_customers`:

```sql
WITH customer_orders as (
SELECT
customer_id AS customer_id,
MIN(order_date) AS first_order,
MAX(order_date) AS most_recent_order,
COUNT(*) AS number_of_orders
FROM {{ ref('stg_orders') }} AS orders
GROUP BY
customer_id
), customer_orders_info as (
SELECT
customers.customer_id AS customer_id,
customers.first_name AS first_name,
customers.last_name AS last_name,
customer_orders.customer_id AS customer_id_right,
customer_orders.first_order AS first_order,
customer_orders.most_recent_order AS most_recent_order,
customer_orders.number_of_orders AS number_of_orders
FROM {{ ref('stg_customers') }} AS customers
LEFT OUTER JOIN customer_orders
ON customers.customer_id = customer_orders.customer_id
)
SELECT
customer_id,
first_name,
last_name,
first_order,
most_recent_order,
number_of_orders
FROM customer_orders_info
```
dbt will make sure that the resulting table will be created after `stg_orders`
and `stg_customers`. This model is inspired by the [jaffle shop demo project by dbt Labs](https://github.com/dbt-labs/jaffle_shop)
where you can find more example queries.

At the end of 2022, dbt added support for [Python models](https://docs.getdbt.com/docs/build/python-models)
on specific platforms (Snowflake, Databricks, Google Cloud Platform). This can be useful
for complex transformations such as using a machine learning model and storing the results.
However, it also requires that your Python code run in a cloud data warehouse, and it often means
that data is moved into a Python process, which can be slower than leveraging the power of modern SQL engines.


# Why dbt and Ibis go great together
[dbt-ibis](https://github.com/binste/dbt-ibis) offers a lightweight and compatible alternative,
which allows you to write dbt models using Ibis. dbt-ibis transparently converts your Ibis
statements into SQL and then hands it over to dbt. Your database does not need to have Python
support for this as everything is executed in the same process as dbt. Hence, this allows for
working in Python for all dbt adapters with supported Ibis backends. Rewriting the above SQL model in Ibis we get:

```python
from dbt_ibis import depends_on, ref


@depends_on(ref("stg_customers"), ref("stg_orders"))
def model(customers, orders):
customer_orders = orders.group_by("customer_id").aggregate(
first_order=orders["order_date"].min(),
most_recent_order=orders["order_date"].max(),
number_of_orders=orders.count(),
)
# Add first_name and last_name
customer_orders = customers.join(customer_orders, "customer_id", how="left")
return customer_orders.select(
"customer_id",
"first_name",
"last_name",
"first_order",
"most_recent_order",
"number_of_orders",
)
```

Using Ibis instead of SQL for dbt models brings you many advantages:

* Type checks and validation before your code is executed in a database.
* More composable as you can break down complex queries into smaller pieces.
* Better reusability of code. Although dbt allows you to use [Jinja and macros](https://docs.getdbt.com/docs/build/jinja-macros), which is an improvement over plain SQL, this gets you only so far. String manipulation is inherently fragile. With dbt-ibis, you can easily share common code between models.
* Your dbt models become backend agnostic which reduces lock-in to a specific database. Furthermore, you get the possibility of building a [multi-engine data stack](https://juhache.substack.com/p/n-engines-1-language?publication_id=1211981&post_id=137718100). For example, you could use DuckDB for small to medium workloads and Snowflake for heavy workloads and as an end-user and BI layer leveraging its governance features. Depending on the size of your warehouse, this can result in significant cost savings.
* Unit test your code with your favorite Python testing frameworks such as pytest.
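
As a sketch of that last point, a unit test for the aggregation logic could look roughly like this (assuming the logic is factored into a plain function that takes Ibis table expressions; the data and names here are purely illustrative):

```python
import ibis
import pandas as pd


def customer_orders(orders):
    return orders.group_by("customer_id").aggregate(
        first_order=orders["order_date"].min(),
        most_recent_order=orders["order_date"].max(),
        number_of_orders=orders.count(),
    )


def test_customer_orders():
    # build a tiny in-memory table instead of hitting the warehouse
    orders = ibis.memtable(
        pd.DataFrame(
            {
                "customer_id": [1, 1, 2],
                "order_date": pd.to_datetime(
                    ["2023-01-01", "2023-02-01", "2023-03-05"]
                ),
            }
        )
    )
    result = customer_orders(orders).to_pandas().set_index("customer_id")
    assert result.loc[1, "number_of_orders"] == 2
    assert result.loc[2, "number_of_orders"] == 1
```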

In addition, you can stick to the tool (Ibis) you like, no matter if you're writing an
ingestion pipeline, a dbt model to transform the data in your data warehouse, or conduct an ad-hoc analysis in a Jupyter notebook.

Be aware that a current limitation of dbt-ibis is that you cannot connect to the database
from within your dbt models, i.e. you purely use Ibis to construct a SELECT statement. You cannot execute statements and act based on the results.

# Further readings
If you want to give dbt-ibis a try, head over to the [GitHub repo](https://github.com/binste/dbt-ibis/blob/main/README.md)
for more information on how to get up and running in no time!

For more details on the future of the integration of Ibis within dbt, you can check out
[this PR](https://github.com/dbt-labs/dbt-core/pull/5274#issuecomment-1132772028) and [this GitHub issue](https://github.com/dbt-labs/dbt-core/issues/6184)
on adding an official plugin system to dbt. Such a plugin system could provide first-class
support for modeling languages in general and might allow dbt-ibis to offer an even better
user experience and more features.
See also this [discussion on Ibis as a dataframe API in the dbt GitHub repo](https://github.com/dbt-labs/dbt-core/discussions/5738).
248 changes: 248 additions & 0 deletions docs/posts/ibis-duckdb-geospatial/index.qmd
@@ -0,0 +1,248 @@
---
title: "Ibis + DuckDB geospatial: a match made on Earth"
author: Naty Clementi
date: 2023-12-07
categories:
- blog
- duckdb
- geospatial
execute:
freeze: false
---

Ibis now has support for [DuckDB geospatial functions](https://gist.github.com/ncclementi/fbc5564af709e2d7f8882821e3a8649f)!

This blogpost showcases some examples of the geospatial API for the DuckDB backend. The material is inspired by
the ["DuckDB: Spatial Relationships"](https://geog-414.gishub.org/book/duckdb/07_spatial_relationships.html) lesson from
[Dr. Qiusheng Wu](https://geog-414.gishub.org/book/preface/instructor.html)'s course "Spatial Data Management" from the
Department of Geography & Sustainability at the University of Tennessee, Knoxville.

::: {.callout-note}
You can check Dr. Qiusheng Wu's full Spatial Data Management course material on its
[website](https://geog-414.gishub.org/index.html), and the classes are also on
[YouTube](https://www.youtube.com/watch?v=A4TOAdsXsEs&list=PLAxJ4-o7ZoPe9SkgnophygyLjTDBzIEbi).
:::

## Data

We are going to be working with data from New York City. The database contains multiple tables with information about
subway stations, streets, neighborhoods, census data, and homicides. The datasets in the database are in the NAD83 / UTM zone
18N projection, EPSG:26918.

```{python}
from pathlib import Path
from zipfile import ZipFile
from urllib.request import urlretrieve
# Download and unzip
url = "https://open.gishub.org/data/duckdb/nyc_data.db.zip"
zip_path = Path("nyc_data.db.zip")
db_path = Path("nyc_data.db")
if not zip_path.exists():
urlretrieve(url, zip_path)
if not db_path.exists():
with ZipFile(zip_path) as zip_file:
zip_file.extract("nyc_data.db")
```

## Let's get started

The beauty of spatial databases is that they allow us to both store *and* compute over geometries.

```{python}
import ibis
from ibis import _
ibis.options.interactive = True
con = ibis.duckdb.connect("nyc_data.db")
con.list_tables()
```

We have multiple tables with information about New York City. Following Dr. Wu's class, we'll take a look at some
spatial relations.

We can start by taking a peek at the `nyc_subway_stations` table.

```{python}
subway_stations = con.table("nyc_subway_stations")
subway_stations
```

Notice that the last column has a `geometry` type, and in this case it contains points that represent the location of
each subway station. Let's grab the entry for the Broad St subway station.

```{python}
broad_station = subway_stations.filter(subway_stations.NAME == "Broad St")
broad_station
```

### `geo_equals` (`ST_Equals`)

In DuckDB `ST_Equals` returns `True` if two geometries are topologically equal. This means that they have the same
dimension and identical coordinate values, although the order of the vertices may be different.

The following is a bit redundant but we can check if our `"Broad St"` point matches only one point in our data using
`geo_equals`

```{python}
subway_stations.filter(subway_stations.geom.geo_equals(broad_station.geom))
```

We can also write this query without using `broad_station` as a variable, and with the help of the deferred expressions
API, also known as [the underscore API](../../how-to/analytics/chain_expressions.qmd).

```{python}
subway_stations.filter(_.geom.geo_equals(_.filter(_.NAME == "Broad St").geom))
```

### `intersects` (`ST_Intersects`)

Let's locate the neighborhood of the "Broad Street" subway station using the
geospatial `intersects` function. The `intersects` function returns `True` if two geometries have any points in common.

```{python}
boroughs = con.table("nyc_neighborhoods")
boroughs
```

```{python}
boroughs.filter(boroughs.geom.intersects(broad_station.select(broad_station.geom).to_array()))
```

### `d_within` (`ST_DWithin`)

We can also find the streets near (say, within 10 meters) the Broad St subway station using the `d_within`
function. The `d_within` function returns True if the geometries are within a given distance.

```{python}
streets = con.table("nyc_streets")
streets
```

Using the deferred API, we can check which streets are within `d=10` meters of distance.

```{python}
sts_near_broad = streets.filter(_.geom.d_within(broad_station.select(_.geom).to_array(), 10))
sts_near_broad
```

::: {.callout-note}
In the previous query, `streets` and `broad_station` are different tables. We use [`to_array()`](../../reference/expression-tables.qmd#ibis.expr.types.relations.Table.to_array) to generate a
scalar subquery from a table with a single column (whose shape is scalar).
:::

To visualize the findings, we will convert the tables to GeoPandas DataFrames.

```{python}
broad_station_gdf = broad_station.to_pandas()
broad_station_gdf.crs = "EPSG:26918"
sts_near_broad_gdf = sts_near_broad.to_pandas()
sts_near_broad_gdf.crs = "EPSG:26918"
streets_gdf = streets.to_pandas()
streets_gdf.crs = "EPSG:26918"
```

```{python}
import leafmap.deckgl as leafmap # <1>
```

1. `leafmap.deckgl` allows us to visualize multiple layers

```{python}
m = leafmap.Map()
m.add_vector(broad_station_gdf, get_fill_color="blue")
m.add_vector(sts_near_broad_gdf, get_color="red", opacity=0.5)
m.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.3)
m
```

You can zoom in and out, and hover over the map to check on the street names.

### `buffer` (`ST_Buffer`)

Next, we'll take a look at the homicides table and showcase some
additional functionality related to polygon handling.

```{python}
homicides = con.table("nyc_homicides")
homicides
```

Let's use the `buffer` method to find homicides near our `"Broad St"` station point.

The `buffer` method computes a polygon or multipolygon that represents all points whose distance from a geometry is less
than or equal to a given distance.

```{python}
broad_station.geom.buffer(200)
```

We can check the area using the `area` (`ST_Area`) function, and see that it is $\approx \pi r^{2} = 125664$

```{python}
broad_station.geom.buffer(200).area()
```

To find out whether there were any homicides in that area, we can check where the polygon resulting from adding the
200-meter buffer to our "Broad St" station point intersects with the geometry column in our homicides table.

```{python}
h_near_broad = homicides.filter(_.geom.intersects(broad_station.select(_.geom.buffer(200)).to_array()))
h_near_broad
```

It looks like there was one homicide within 200 meters from the "Broad St" station, but from this
data we can't tell the street near which it happened. However, we can check if the homicide point is within a small
distance of a street.

```{python}
h_street = streets.filter(_.geom.d_within(h_near_broad.select(_.geom).to_array(), 2))
h_street
```

Let's plot this:

```{python}
broad_station_zone = broad_station.mutate(geom=broad_station.geom.buffer(200))
broad_station_zone = broad_station_zone.to_pandas()
broad_station_zone.crs = "EPSG:26918"
h_near_broad_gdf = h_near_broad.to_pandas()
h_near_broad_gdf.crs = "EPSG:26918"
h_street_gdf = h_street.to_pandas()
h_street_gdf.crs = "EPSG:26918"
mh = leafmap.Map()
mh.add_vector(broad_station_gdf, get_fill_color="orange")
mh.add_vector(broad_station_zone, get_fill_color="orange", opacity=0.1)
mh.add_vector(h_near_broad_gdf, get_fill_color="red", opacity=0.5)
mh.add_vector(h_street_gdf, get_color="blue", opacity=0.3)
mh.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.2)
mh
```


## Functions supported and next steps

At the moment, Ibis supports around thirty geospatial functions for the DuckDB backend, and we will add some more
(see list [here](https://gist.github.com/ncclementi/fbc5564af709e2d7f8882821e3a8649f)).

We also support reading multiple geospatial formats via [`read_geo()`](../../backends/duckdb.qmd#ibis.backends.duckdb.Backend.read_geo).

Here are some resources to learn more about Ibis:

- [Ibis Docs](https://ibis-project.org/)
- [Ibis GitHub](https://github.com/ibis-project/ibis)

Chat with us on Zulip:

- [Ibis Zulip Chat](https://ibis-project.zulipchat.com/)
33 changes: 33 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_ibis.py
@@ -0,0 +1,33 @@
from __future__ import annotations

import ibis
from ibis import _

ibis.set_backend("datafusion")

expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.dropna("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)
df = expr.to_pandas()
12 changes: 12 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_native.py
@@ -0,0 +1,12 @@
from __future__ import annotations

import datafusion

with open("./datafusion_native.sql") as f:
query = f.read()

ctx = datafusion.SessionContext()
ctx.register_parquet(name="pypi", path="/data/pypi-parquet/*.parquet")
expr = ctx.sql(query)

df = expr.to_pandas()
47 changes: 47 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_native.sql
@@ -0,0 +1,47 @@
SELECT
month,
ext,
COUNT(DISTINCT project_name) AS project_count
FROM (
SELECT
project_name,
DATE_TRUNC('month', uploaded_on) AS month,
NULLIF(
REPLACE(
REPLACE(
REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
REGEXP_MATCH(path, CONCAT('(', '\.([a-z0-9]+)$', ')'))[2],
'cxx|cpp|cc|c|hpp|h',
'C/C++',
'g'
),
'^f.*$',
'Fortran',
'g'
),
'rs',
'Rust'
),
'go',
'Go'
),
'asm',
'Assembly'
),
''
) AS ext
FROM pypi
WHERE COALESCE(
ARRAY_LENGTH(
REGEXP_MATCH(path, '\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$')
) > 0,
FALSE
)
AND NOT COALESCE(ARRAY_LENGTH(REGEXP_MATCH(path, '(^|/)test(|s|ing)')) > 0, FALSE)
AND NOT STRPOS(path, '/site-packages/') > 0
)
WHERE ext IS NOT NULL
GROUP BY month, ext
ORDER BY month DESC, project_count DESC
31 changes: 31 additions & 0 deletions docs/posts/pydata-performance-part2/duckdb_ibis.py
@@ -0,0 +1,31 @@
from __future__ import annotations

import ibis
from ibis import _

expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.dropna("ext")
.order_by([_.month.desc(), _.project_count.desc()]) # <1>
)
df = expr.to_pandas()
281 changes: 281 additions & 0 deletions docs/posts/pydata-performance-part2/index.qmd
@@ -0,0 +1,281 @@
---
title: "Ibis versus X: Performance across the ecosystem part 2"
author: "Phillip Cloud"
date: 2023-12-11
categories:
- blog
- case study
- ecosystem
- performance
---

**TL; DR**: Ibis supports both Polars and DataFusion. Both backends have
about the same runtime performance, and lag far behind DuckDB on this workload.
There's negligible performance difference between Ibis and the backend native
APIs.

## Motivation

This is part 2 of a series of posts showing performance across various backends
that Ibis supports.

Check out [part 1](../pydata-performance/) if you haven't already!

In this post, I'll continue with the [Polars](../../backends/polars.qmd) and
[DataFusion](../../backends/datafusion.qmd) backends.

I show each tool using both the Ibis API and the tool's native API. We'll see
that the performance difference between these approaches is negligible.

```{python}
#| echo: false
def show_file(path, language: str = "python") -> None:
with open(path) as f:
source = f.read()
print(f"```{language}\n{source}\n```")
```

## Setup

I ran all of the code in this blog post on a machine with these specs.

```{python}
#| echo: false
#| output: asis
import os
import platform
import shutil
import cpuinfo
import psutil
info = cpuinfo.get_cpu_info()
uname = platform.uname()
MiB = 1 << 20
GiB = 1 << 30
TiB = 1 << 40
ram_gib = psutil.virtual_memory().total / GiB
disk_tib = shutil.disk_usage("/").total / TiB
lines = [
"| Component | Specification |",
"| --------- | ------------- |",
f"| CPU | {info['brand_raw']} ({os.cpu_count()} threads) |",
f"| RAM | {ram_gib:.0f} GiB |",
f"| Disk | {disk_tib:.1f} TiB SSD |",
f"| OS | NixOS ({uname.system} {uname.release}) |",
]
print("\n".join(lines))
```

### Library versions

Here are the versions I used to run this experiment at the time of writing.

```{python}
#| echo: false
#| output: asis
import importlib
import subprocess
import sys
import pandas as pd
cmd = "git", "rev-parse", "--short", "HEAD"
proc = subprocess.run(cmd, check=True, text=True, capture_output=True)
commit = proc.stdout.strip()
link = f"https://github.com/ibis-project/ibis/tree/{commit}"
version_pair = lambda name: (name, importlib.import_module(name).__version__)
versions = pd.DataFrame(
[("Python", sys.version)] + sorted(
[
*map(version_pair, ("pandas", "polars", "datafusion", "pyarrow")),
("ibis", f"[`{commit}`]({link})"),
]
),
columns=["Dependency", "Version"],
)
print(versions.to_markdown(index=False))
```

## Running the query across backends

Here are the different Ibis expressions for each backend as well as the same
query with native APIs, along with timed executions of the query.

### DuckDB

First, let's run the Ibis + DuckDB version of the query from the original post:

```{python}
#| echo: false
#| output: asis
show_file("./duckdb_ibis.py")
```

```{python}
duckdb_ibis_results = %timeit -n1 -r1 -o %run duckdb_ibis.py
df.head()
```

### DataFusion and Polars

::: {.panel-tabset}

## DataFusion

::: {.panel-tabset}

## Ibis

```{python}
#| echo: false
#| output: asis
show_file("./datafusion_ibis.py")
```

```{python}
datafusion_ibis_results = %timeit -n1 -r1 -o %run datafusion_ibis.py
df.head()
```

## DataFusion native

<details open>

<summary>DataFusion SQL</summary>

```{python}
#| echo: false
#| output: asis
show_file("./datafusion_native.sql", language="sql")
```

</details>

```{python}
#| echo: false
#| output: asis
show_file("./datafusion_native.py")
```

```{python}
datafusion_native_results = %timeit -n1 -r1 -o %run datafusion_native.py
df.head()
```

:::

## Polars

::: {.panel-tabset}

## Ibis

```{python}
#| echo: false
#| output: asis
show_file("./polars_ibis.py")
```

```{python}
polars_ibis_results = %timeit -n1 -r1 -o %run polars_ibis.py
df.head()
```

## Polars native

```{python}
#| echo: false
#| output: asis
show_file("./polars_native.py")
```

```{python}
polars_native_results = %timeit -n1 -r1 -o %run polars_native.py
df.head()
```

:::

:::

## Takeaways

**Ibis + DuckDB is the only system tested that handles this workload well out of the box**

* Both Polars and DataFusion are much slower than DuckDB and Dask on this
workload.
* Polars memory use fluctuates quite a bit, while DataFusion's memory profile is
  similar to DuckDB's.

Let's recap the results with some numbers:

### Numbers

```{python}
#| echo: false
#| output: asis
import glob
allfiles = glob.glob("/data/pypi-parquet/*.parquet")
total_size = sum(map(os.path.getsize, allfiles))
def make_line(name, results, file_size: int = total_size):
duration = results.best
mib = file_size / MiB
throughput = mib / duration
data = [
name, f"{mib:,.0f} MiB", f"{duration:.0f} seconds", f"{throughput:.0f} MiB/s"
]
row = " | ".join(data)
return f"| {row} |"
results = sorted(
[
{"name": "Ibis + DuckDB", "results": duckdb_ibis_results},
{"name": "Ibis + Polars", "results": polars_ibis_results},
{"name": "Polars native API", "results": polars_native_results},
{"name": "Ibis + DataFusion", "results": datafusion_ibis_results},
{"name": "DataFusion native API", "results": datafusion_native_results},
],
key=lambda run: total_size / run["results"].best,
reverse=True,
)
header = "| Toolset | Data size | Duration | Throughput |"
sep = "| ------------------ | --------: | -----------: | ---------: |"
rows = [header, sep]
rows.extend(make_line(**result) for result in results)
print("\n".join(rows))
```

::: {.callout-warning}
## The Polars run durations were highly variable

I couldn't figure out how to get consistent run times.
:::
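
If you want to quantify that variability yourself, one option is to bump the
repeat count on the same IPython timing magic used throughout this post. A
minimal sketch (not run for this post; `-r5` is an arbitrary choice):

```python
# a sketch only: repeat the Polars-native run five times to gauge the spread
polars_native_repeats = %timeit -n1 -r5 -o %run polars_native.py

print(
    f"best: {polars_native_repeats.best:.1f}s, "
    f"worst: {polars_native_repeats.worst:.1f}s, "
    f"stdev: {polars_native_repeats.stdev:.1f}s"
)
```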

## Conclusion

If you're considering Polars for new code, give Ibis a try with the [DuckDB
backend](../../backends/duckdb.qmd).

You'll get better performance than Polars on some workloads, along with a
broader cross-backend API that helps you scale from development to production.

If you find that Polars outperforms DuckDB on a particular workload, you can
always switch to the Polars backend for that workload.
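
Switching is a one-line change. Here's a minimal sketch (reusing the PyPI
parquet path from above, with a simplified grouping for illustration):

```python
import ibis
from ibis import _

# the expression below is backend-agnostic; only this line changes
ibis.set_backend("duckdb")  # or: ibis.set_backend("polars")

expr = (
    ibis.read_parquet("/data/pypi-parquet/*.parquet")
    .group_by(ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1).nullif(""))
    .aggregate(project_count=_.project_name.nunique())
    .order_by(_.project_count.desc())
)
df = expr.to_pandas()  # the heavy lifting runs in whichever backend is set
```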

Everyone wins!

In the next post in this series, we'll cover the cloud backends: Snowflake,
BigQuery, Trino, and ClickHouse.
33 changes: 33 additions & 0 deletions docs/posts/pydata-performance-part2/polars_ibis.py
@@ -0,0 +1,33 @@
from __future__ import annotations

import ibis
from ibis import _

ibis.set_backend("polars")

expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.dropna("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)
df = expr.to_pandas(streaming=True)
33 changes: 33 additions & 0 deletions docs/posts/pydata-performance-part2/polars_native.py
@@ -0,0 +1,33 @@
from __future__ import annotations

import polars as pl

expr = (
pl.scan_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
pl.col("path").str.contains(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~pl.col("path").str.contains(r"(^|/)test(|s|ing)"),
~pl.col("path").str.contains("/site-packages/", literal=True),
]
)
.with_columns(
month=pl.col("uploaded_on").dt.truncate("1mo"),
ext=pl.col("path")
.str.extract(pattern=r"\.([a-z0-9]+)$", group_index=1)
.str.replace_all(pattern=r"cxx|cpp|cc|c|hpp|h", value="C/C++")
.str.replace_all(pattern="^f.*$", value="Fortran")
.str.replace("rs", "Rust", literal=True)
.str.replace("go", "Go", literal=True)
.str.replace("asm", "Assembly", literal=True)
.replace({"": None}),
)
.group_by(["month", "ext"])
.agg(project_count=pl.col("project_name").n_unique())
.drop_nulls(["ext"])
.sort(["month", "project_count"], descending=True)
)

df = expr.collect(streaming=True).to_pandas()
39 changes: 39 additions & 0 deletions docs/posts/pydata-performance/dask_impl.py
@@ -0,0 +1,39 @@
from __future__ import annotations

import logging

import dask.dataframe as dd
from dask.distributed import Client

if __name__ == "__main__":
client = Client(silence_logs=logging.ERROR)
df = dd.read_parquet(
"/data/pypi-parquet/*.parquet",
columns=["path", "uploaded_on", "project_name"],
split_row_groups=True,
)
df = df[
df.path.str.contains(
r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
& ~df.path.str.contains(r"(?:^|/)test(?:|s|ing)")
& ~df.path.str.contains("/site-packages/")
]
print(
df.assign(
month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(),
ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0, expand=False)
.str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
.str.replace("^f.*$", "Fortran", regex=True)
.str.replace("rs", "Rust")
.str.replace("go", "Go")
.str.replace("asm", "Assembly"),
)
.groupby(["month", "ext"])
.project_name.nunique()
.rename("project_count")
.compute()
.reset_index()
.sort_values(["month", "project_count"], ascending=False)
)
client.shutdown()
726 changes: 726 additions & 0 deletions docs/posts/pydata-performance/index.qmd

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions docs/posts/pydata-performance/pandas_impl.py
@@ -0,0 +1,32 @@
from __future__ import annotations

import glob
import os

import pandas as pd

df = pd.read_parquet(
min(glob.glob("/data/pypi-parquet/*.parquet"), key=os.path.getsize),
columns=["path", "uploaded_on", "project_name"],
)
df = df[
df.path.str.contains(r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$")
& ~df.path.str.contains(r"(?:(?:^|/)test(?:|s|ing)|/site-packages/)")
]
print(
df.assign(
month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(),
ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0)
.iloc[:, 0]
.str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
.str.replace("^f.*$", "Fortran", regex=True)
.str.replace("rs", "Rust")
.str.replace("go", "Go")
.str.replace("asm", "Assembly"),
)
.groupby(["month", "ext"])
.project_name.nunique()
.rename("project_count")
.reset_index()
.sort_values(["month", "project_count"], ascending=False)
)
41 changes: 41 additions & 0 deletions docs/posts/pydata-performance/step0.py
@@ -0,0 +1,41 @@
from __future__ import annotations

import ibis
from ibis import _, udf


@udf.scalar.builtin
def flatten(x: list[list[str]]) -> list[str]: # <1>
...


expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1),
)
.aggregate(projects=_.project_name.collect().unique())
.order_by(_.month.desc())
.mutate(
ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.group_by(["month", "ext"])
.aggregate(project_count=flatten(_.projects.collect()).unique().length())
.dropna("ext")
.order_by([_.month.desc(), _.project_count.desc()]) # <2>
)
37 changes: 37 additions & 0 deletions docs/posts/pydata-performance/step1.py
@@ -0,0 +1,37 @@
from __future__ import annotations

import ibis
from ibis import _, udf


@udf.scalar.builtin
def flatten(x: list[list[str]]) -> list[str]:
...


expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(projects=_.project_name.collect().unique())
.order_by(_.month.desc())
.group_by(["month", "ext"])
.aggregate(project_count=flatten(_.projects.collect()).unique().length())
)
37 changes: 37 additions & 0 deletions docs/posts/pydata-performance/step2.py
@@ -0,0 +1,37 @@
from __future__ import annotations

import ibis
from ibis import _, udf


@udf.scalar.builtin
def flatten(x: list[list[str]]) -> list[str]:
...


expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(projects=_.project_name.collect().unique())
.group_by(["month", "ext"])
.aggregate(project_count=flatten(_.projects.collect()).unique().length())
.order_by(_.month.desc())
)
30 changes: 30 additions & 0 deletions docs/posts/pydata-performance/step3.py
@@ -0,0 +1,30 @@
from __future__ import annotations

import ibis
from ibis import _

expr = (
ibis.read_parquet("/data/pypi-parquet/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.dropna("ext")
.order_by([_.month.desc(), _.project_count.desc()]) # <1>
)
272 changes: 272 additions & 0 deletions docs/posts/querying-pypi-metadata-compiled-languages/index.qmd
@@ -0,0 +1,272 @@
---
title: Querying every file in every release on the Python Package Index (redux)
author: Gil Forsyth
date: 2023-11-15
categories:
- blog
---

Seth Larson wrote a great [blog
post](https://sethmlarson.dev/security-developer-in-residence-weekly-report-18)
on querying a PyPI dataset to look for trends in the use of memory-safe
languages in Python.

Check out Seth's article for more information on the dataset (and
it's a good read!). It caught our eye because it makes use of
[DuckDB](https://duckdb.org/) to clean the data for analysis.

That's right up our alley here in Ibis land, so let's see if we can reproduce
Seth's results (and then continue on to plot them!).

## Grab the data (locations)

Seth showed (and then safely decomposed) a nested `curl` statement and that's
always viable -- we're in Python land so why not grab the filenames using
`urllib3`?

```{python}
import urllib3
url = "https://raw.githubusercontent.com/pypi-data/data/main/links/dataset.txt"
with urllib3.PoolManager() as http:
resp = http.request("GET", url)
parquet_files = resp.data.decode().split()
parquet_files
```

## Grab the data

Now we're ready to get started with Ibis!

DuckDB is clever enough to grab only the parquet metadata. This means we can
use `read_parquet` to create a lazy view of the parquet files and then build up
our expression without downloading everything beforehand!

```{python}
import ibis
from ibis import _ # <1>
ibis.options.interactive = True
```

1. See https://ibis-project.org/how-to/analytics/chain_expressions.html for docs
on the deferred operator!
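
If you haven't used the deferred operator before, here's a tiny sketch with a
throwaway in-memory table (not the PyPI data):

```python
t = ibis.memtable({"path": ["setup.c", "setup.py"]})

# the two filters below are equivalent: `_` stands in for the table being
# operated on, which keeps long method chains readable
t.filter(t.path.endswith(".c"))
t.filter(_.path.endswith(".c"))
```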

Create a DuckDB connection:

```{python}
con = ibis.duckdb.connect()
```

And load up one of the files (we can run the full query after)!

```{python}
pypi = con.read_parquet(parquet_files[0], table_name="pypi")
```

```{python}
pypi.schema()
```

## Query crafting

Let's break down what we're looking for. As a high-level view of the use of
compiled languages, Seth is using file extensions as an indicator that a given
filetype is used in a Python project.

The dataset we're using has _every file in every project_ -- what criteria should we use?

We can follow Seth's lead and look for things:

1. A file extension that is one of: `asm`, `c`, `cc`, `cpp`, `cxx`, `h`, `hpp`, `rs`, `go`, and variants of `F90`, `f90`, etc.
   That is, C, C++, Assembly, Rust, Go, and Fortran.
2. We exclude matches where the file path is within the `site-packages/` directory.
3. We exclude matches that are in directories used for testing.

```{python}
expr = pypi.filter(
[
_.path.re_search(r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
expr
```

That _could_ be right -- we can peek at the filename at the end of the `path` column to do a quick check:

```{python}
expr.path.split("/")[-1]
```

Ok! Next up, we want to group the matches by:

1. The month that the package / file was published
   For this, we can use the `truncate` method and ask for month as our truncation window.
2. The file extension of the file used

```{python}
expr.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1),
).aggregate()
```

That looks promising. Now we need to grab the package names that correspond to a
given file extension in a given month and deduplicate them. To match Seth's
results, we'll also sort by month in descending order:

```{python}
expr = (
expr.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1),
)
.aggregate(projects=_.project_name.collect().unique())
.order_by(_.month.desc())
)
expr
```

## Massage and plot

Let's continue and see what our results look like.

We'll do a few things:

1. Combine all of the C and C++ extensions into a single group by renaming them all.
2. Count the number of distinct entries in each group
3. Plot the results!

```{python}
collapse_names = expr.mutate(
ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
).dropna("ext")
collapse_names
```

Note that now we need to de-duplicate again, since we might've had separate
unique entries for both an `h` and `c` file extension, and we don't want to
double-count!

We could rewrite our original query and include the renames in the original
`group_by` (this would be the smart thing to do), but let's push on and see if
we can make this work.

The `projects` column is now a column of string arrays, so we want to collect
all of the arrays in each group. This will give us a "list of lists"; then we'll
`flatten` that list and call `unique().length()` as before.

DuckDB has a `flatten` function, but it isn't exposed in Ibis (yet!).

We'll use a handy bit of Ibis magic to define a `builtin` `UDF` that will map directly
onto the underlying DuckDB function (what!? See
[here](https://ibis-project.org/how-to/extending/builtin.html#duckdb) for more
info):

```{python}
@ibis.udf.scalar.builtin
def flatten(x: list[list[str]]) -> list[str]:
...
collapse_names = collapse_names.group_by(["month", "ext"]).aggregate(
projects=flatten(_.projects.collect())
)
collapse_names
```

We could have included the `unique().length()` in the `aggregate` call, but
sometimes it's good to check that your slightly off-kilter idea has worked (and
it has!).

```{python}
collapse_names = collapse_names.select(
_.month, _.ext, project_count=_.projects.unique().length()
)
collapse_names
```

Now that the data are tidied, we can pass our expression directly to Altair and see what it looks like!

```{python}
import altair as alt
chart = (
alt.Chart(collapse_names.to_pandas())
.mark_line()
.encode(x="month", y="project_count", color="ext")
.properties(width=600, height=300)
)
chart
```

That looks good, but it definitely doesn't match the plot from Seth's post:

![upstream plot](upstream_plot.png)

Our current plot is only showing the results from a subset of the available
data. Now that our expression is complete, we can re-run on the full dataset and
compare.

## The full run

To recap -- we pulled a lazy view of a single parquet file from the `pypi-data`
repo, filtered for all the files that contain file extensions we care about,
then grouped them all together to get counts of the various filetypes used
across projects by month.

Here's the entire query chained together into a single command, now running on
all of the `parquet` files we have access to:

```{python}
pypi = con.read_parquet(parquet_files, table_name="pypi")
full_query = (
pypi.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1),
)
.aggregate(projects=_.project_name.collect().unique())
.order_by(_.month.desc())
.mutate(
ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.dropna("ext")
.group_by(["month", "ext"])
.aggregate(project_count=flatten(_.projects.collect()).unique().length())
)
chart = (
alt.Chart(full_query.to_pandas())
.mark_line()
.encode(x="month", y="project_count", color="ext")
.properties(width=600, height=300)
)
chart
```
147 changes: 147 additions & 0 deletions docs/release_notes.md

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion docs/support_matrix.qmd
@@ -5,7 +5,12 @@ hide:

# Operation support matrix

We provide Ibis's operation support matrix as a [Streamlit](https://streamlit.io/) app that shows supported operations for each backend. Ibis defines a common API for analytics and data transformation code that is transpiled to native code for each backend. This code is often, but not always, SQL -- see the [backends concept page](/concepts/backend.qmd) for details. Due to differences in SQL dialects and support for different operations in different backends, support for the full breadth of the Ibis API varies.
We provide Ibis's operation support matrix as
a [Streamlit](https://streamlit.io/) app that shows supported operations for
each backend. Ibis defines a common API for analytics and data transformation
code that is transpiled to native code for each backend. Due to differences in
SQL dialects and upstream support for different operations in different
backends, support for the full breadth of the Ibis API varies.
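
For a concrete sense of what "transpiled" means here, a small sketch (output
elided; this assumes the top-level `ibis.to_sql` helper and a dummy table):

```python
import ibis
from ibis import _

t = ibis.table({"x": "int64", "g": "string"}, name="t")
expr = t.group_by("g").aggregate(total=_.x.sum())

# the same expression compiles to different SQL dialects
print(ibis.to_sql(expr, dialect="duckdb"))
print(ibis.to_sql(expr, dialect="mssql"))
```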

You can use this page to see which operations are supported on each backend.

10 changes: 5 additions & 5 deletions docs/tutorials/ibis-for-pandas-users.qmd
@@ -242,17 +242,17 @@ mutated
### Renaming columns

In addition to replacing columns, you can rename them as well. This is done with
the `relabel` method which takes a dictionary containing the name mappings.
the `rename` method which takes a dictionary containing the name mappings.


```{python}
relabeled = t.relabel(
renamed = t.rename(
dict(
one="a",
two="b",
a="one",
b="two",
)
)
relabeled
renamed
```

## Selecting rows
18 changes: 9 additions & 9 deletions flake.lock
2 changes: 1 addition & 1 deletion ibis/__init__.py
@@ -1,7 +1,7 @@
"""Initialize Ibis module."""
from __future__ import annotations

__version__ = "7.1.0"
__version__ = "7.2.0"

from ibis import examples, util
from ibis.backends.base import BaseBackend
65 changes: 45 additions & 20 deletions ibis/backends/base/__init__.py
@@ -33,16 +33,16 @@

__all__ = ("BaseBackend", "Database", "connect")


_IBIS_TO_SQLGLOT_DIALECT = {
"mssql": "tsql",
"impala": "hive",
"pyspark": "spark",
"polars": "postgres",
"datafusion": "postgres",
# closest match see https://github.com/ibis-project/ibis/pull/7303#discussion_r1350223901
"exasol": "oracle",
}


_SQLALCHEMY_TO_SQLGLOT_DIALECT = {
# sqlalchemy dialects of backends not listed here match the sqlglot dialect
# name
@@ -52,6 +52,8 @@
# druid allows double quotes for identifiers, like postgres:
# https://druid.apache.org/docs/latest/querying/sql#identifiers-and-literals
"druid": "postgres",
# closest match see https://github.com/ibis-project/ibis/pull/7303#discussion_r1350223901
"exa.websocket": "oracle",
}


@@ -249,6 +251,34 @@ def _import_pyarrow():

return pyarrow

def to_pandas(
self,
expr: ir.Expr,
*,
params: Mapping[ir.Scalar, Any] | None = None,
limit: int | str | None = None,
**kwargs: Any,
) -> pd.DataFrame | pd.Series | Any:
"""Execute an Ibis expression and return a pandas `DataFrame`, `Series`, or scalar.

::: {.callout-note}
This method is a wrapper around `execute`.
:::

Parameters
----------
expr
Ibis expression to execute.
params
Mapping of scalar parameter expressions to value.
limit
An integer to effect a specific row limit. A value of `None` means
"no limit". The default is in `ibis/config.py`.
kwargs
Keyword arguments
"""
return self.execute(expr, params=params, limit=limit, **kwargs)

def to_pandas_batches(
self,
expr: ir.Expr,
@@ -327,23 +357,18 @@ def to_pyarrow(
"""
pa = self._import_pyarrow()
self._run_pre_execute_hooks(expr)

table_expr = expr.as_table()
arrow_schema = table_expr.schema().to_pyarrow()
try:
with self.to_pyarrow_batches(
table_expr, params=params, limit=limit, **kwargs
) as reader:
table = (
pa.Table.from_batches(reader)
.rename_columns(table_expr.columns)
.cast(arrow_schema)
)
except pa.lib.ArrowInvalid:
raise
except ValueError:
table = arrow_schema.empty_table()

return expr.__pyarrow_result__(table)
schema = table_expr.schema()
arrow_schema = schema.to_pyarrow()
with self.to_pyarrow_batches(
table_expr, params=params, limit=limit, **kwargs
) as reader:
table = pa.Table.from_batches(reader, schema=arrow_schema)

return expr.__pyarrow_result__(
table.rename_columns(table_expr.columns).cast(arrow_schema)
)

@util.experimental
def to_pyarrow_batches(
@@ -547,7 +572,7 @@ def to_parquet(
import pyarrow.parquet as pq

with expr.to_pyarrow_batches(params=params) as batch_reader:
with pq.ParquetWriter(path, batch_reader.schema) as writer:
with pq.ParquetWriter(path, batch_reader.schema, **kwargs) as writer:
for batch in batch_reader:
writer.write_batch(batch)

@@ -582,7 +607,7 @@ def to_csv(
import pyarrow.csv as pcsv

with expr.to_pyarrow_batches(params=params) as batch_reader:
with pcsv.CSVWriter(path, batch_reader.schema) as writer:
with pcsv.CSVWriter(path, batch_reader.schema, **kwargs) as writer:
for batch in batch_reader:
writer.write_batch(batch)

4 changes: 1 addition & 3 deletions ibis/backends/base/df/timecontext.py
@@ -183,9 +183,7 @@ def construct_time_context_aware_series(
1 2.2
2 3.3
Name: value, dtype: float64
>>> construct_time_context_aware_series(
... series, df
... ) # quartodoc: +SKIP # doctest: +SKIP
>>> construct_time_context_aware_series(series, df) # quartodoc: +SKIP # doctest: +SKIP
time
0 2017-01-02 1.1
1 2017-01-03 2.2