105 changes: 81 additions & 24 deletions .github/workflows/ibis-backends.yml
@@ -52,8 +52,8 @@ jobs:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
- "3.10"
- "3.12"
steps:
- name: checkout
uses: actions/checkout@v4
@@ -65,7 +65,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install ibis
run: poetry install --without dev --without docs --extras bigquery
@@ -95,27 +95,21 @@ jobs:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
- "3.10"
- "3.12"
backend:
- name: duckdb
title: DuckDB
serial: true
extras:
- duckdb
- deltalake
- geospatial
- examples
- decompiler
- polars
additional_deps:
- torch
# TODO: remove this duckdb job once the next duckdb_spatial is released
- name: duckdb
title: DuckDB + Geospatial
extras:
- geospatial
additional_deps:
- "duckdb==0.9.2"
- name: clickhouse
title: ClickHouse
services:
@@ -147,6 +141,7 @@ jobs:
extras:
- mysql
- geospatial
- polars
sys-deps:
- libgeos-dev
- name: postgres
@@ -192,6 +187,7 @@ jobs:
title: MS SQL Server
extras:
- mssql
- polars
services:
- mssql
sys-deps:
@@ -222,6 +218,7 @@ jobs:
serial: true
extras:
- oracle
- polars
services:
- oracle
- name: flink
@@ -232,6 +229,7 @@ jobs:
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
- setuptools
services:
- flink
include:
@@ -243,12 +241,33 @@ jobs:
extras:
- dask
- os: ubuntu-latest
python-version: "3.9"
python-version: "3.11"
backend:
name: dask
title: Dask
name: flink
title: Flink
serial: true
extras:
- dask
- flink
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
- setuptools
services:
- flink
- os: ubuntu-latest
python-version: "3.11"
backend:
name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
exclude:
- os: windows-latest
backend:
@@ -257,6 +276,7 @@ jobs:
extras:
- mysql
- geospatial
- polars
services:
- mysql
sys-deps:
@@ -304,6 +324,21 @@ jobs:
- postgres
sys-deps:
- libgeos-dev
# TODO(deepyaman): Test whether this works upon releasing https://github.com/cloudera/impyla/commit/bf1f94c3c4106ded6267d2485c1e939775a6a87f
- os: ubuntu-latest
python-version: "3.12"
backend:
name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
- os: windows-latest
backend:
name: impala
@@ -323,6 +358,7 @@ jobs:
title: MS SQL Server
extras:
- mssql
- polars
services:
- mssql
sys-deps:
@@ -352,8 +388,23 @@ jobs:
serial: true
extras:
- oracle
- polars
services:
- oracle
- os: ubuntu-latest
python-version: "3.12"
backend:
name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
- setuptools
services:
- flink
- os: windows-latest
backend:
name: flink
@@ -364,6 +415,7 @@ jobs:
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
- setuptools
services:
- flink
- os: windows-latest
@@ -398,6 +450,10 @@ jobs:
- name: download backend data
run: just download-data

- name: show docker compose version
if: matrix.backend.services != null
run: docker compose version

- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
@@ -409,7 +465,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
@@ -468,8 +524,8 @@ jobs:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
- "3.10"
- "3.12"
backend:
- name: dask
title: Dask
@@ -515,7 +571,7 @@ jobs:
extras:
- postgres
- geospatial
- python-version: "3.11"
- python-version: "3.12"
backend:
name: postgres
title: PostgreSQL
@@ -532,7 +588,7 @@ jobs:
extras:
- postgres
- geospatial
- python-version: "3.11"
- python-version: "3.12"
backend:
name: dask
title: Dask
@@ -573,7 +629,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.8.2'
run: python -m pip install --upgrade pip 'poetry==1.8.3'

- name: remove incompatible deps
# it requires a version of pandas that min versions are not compatible with
@@ -623,7 +679,7 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: "3.9"
- python-version: "3.10"
pyspark-version: "3.3"
deps:
- "'pandas@<2'"
@@ -638,6 +694,7 @@ jobs:
deps:
- "'pandas@>2'"
- "'numpy@>1.24'"
- setuptools
steps:
- name: checkout
uses: actions/checkout@v4
@@ -661,7 +718,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.8.2'
run: python -m pip install --upgrade pip 'poetry==1.8.3'

- name: remove lonboard
# it requires a version of pandas that pyspark is not compatible with
2 changes: 1 addition & 1 deletion .github/workflows/ibis-benchmarks.yml
@@ -30,7 +30,7 @@ jobs:
python-version: "3.11"

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install system dependencies
run: sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev unixodbc-dev
16 changes: 12 additions & 4 deletions .github/workflows/ibis-docs-main.yml
@@ -19,13 +19,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: install nix
uses: cachix/install-nix-action@v26
uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v14
uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -69,10 +69,18 @@ jobs:
run: |
python -m pip install --upgrade algoliasearch
- name: Create and Upload Index
- name: Create and Upload Base Index
run: |
python .github/workflows/upload-algolia.py
env:
ALGOLIA_WRITE_API_KEY: ${{ secrets.ALGOLIA_WRITE_API_KEY }}
ALGOLIA_APP_ID: HS77W8GWM1
ALGOLIA_APP_ID: TNU9HG3L41
ALGOLIA_INDEX: prod_ibis

- name: Create and Upload API Records to index
run: |
python .github/workflows/upload-algolia-api.py
env:
ALGOLIA_WRITE_API_KEY: ${{ secrets.ALGOLIA_WRITE_API_KEY }}
ALGOLIA_APP_ID: TNU9HG3L41
ALGOLIA_INDEX: prod_ibis
4 changes: 2 additions & 2 deletions .github/workflows/ibis-docs-pr.yml
@@ -21,13 +21,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: install nix
uses: cachix/install-nix-action@v26
uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v14
uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
12 changes: 6 additions & 6 deletions .github/workflows/ibis-lint.yml
@@ -28,13 +28,13 @@ jobs:
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v26
uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v14
uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -53,13 +53,13 @@ jobs:
fetch-depth: 0

- name: install nix
uses: cachix/install-nix-action@v26
uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v14
uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -75,12 +75,12 @@ jobs:
with:
fetch-depth: 0

- uses: cachix/install-nix-action@v26
- uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v14
- uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
11 changes: 4 additions & 7 deletions .github/workflows/ibis-main.yml
@@ -46,7 +46,6 @@ jobs:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
@@ -61,7 +60,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install ${{ matrix.os }} system dependencies
if: matrix.os == 'ubuntu-latest'
@@ -112,7 +111,7 @@ jobs:
python-version: "3.12"

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install system dependencies
run: |
@@ -129,8 +128,6 @@ jobs:
run: poetry run python -c 'import shapely.geometry, duckdb'

test_doctests:
# FIXME(kszucs): re-enable this build
if: false
name: Doctests
runs-on: ubuntu-latest
steps:
@@ -148,10 +145,10 @@ jobs:
uses: actions/setup-python@v5
id: install_python
with:
python-version: "3.12"
python-version: "3.10"

- name: install poetry
run: pip install 'poetry==1.8.2'
run: pip install 'poetry==1.8.3'

- name: install ibis with all extras
run: poetry install --without dev --without docs --all-extras
8 changes: 6 additions & 2 deletions .github/workflows/nix-skip-helper.yml
@@ -31,11 +31,15 @@ jobs:
fail-fast: false
matrix:
os:
- ubuntu-latest
- ubuntu-latest # x86_64-linux
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
include:
- os: ubuntu-arm64-small
python-version: "3.12"
- os: macos-14
python-version: "3.10"
steps:
- run: echo "No build required"
21 changes: 15 additions & 6 deletions .github/workflows/nix.yml
@@ -32,25 +32,29 @@ jobs:
fail-fast: false
matrix:
os:
- ubuntu-latest
- ubuntu-latest # x86_64-linux
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
include:
- os: ubuntu-arm64-small
python-version: "3.12"
- os: macos-14
python-version: "3.10"
steps:
- name: checkout
uses: actions/checkout@v4

- name: install nix
uses: cachix/install-nix-action@v26
uses: cachix/install-nix-action@v27
with:
nix_path: nixpkgs=channel:nixos-unstable-small
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: setup cachix
uses: cachix/cachix-action@v14
uses: cachix/cachix-action@v15
with:
name: ibis
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
@@ -63,11 +67,16 @@ jobs:
nix build ".#ibis${version//./}" --fallback --keep-going --print-build-logs
- name: nix build devShell
if: github.event_name == 'push'
run: |
set -euo pipefail
version='${{ matrix.python-version }}'
host_system="$(nix eval --raw 'nixpkgs#stdenv.hostPlatform.system')"
flake=".#devShells.${host_system}.ibis${version//./}"
nix build "$flake" --fallback --keep-going --print-build-logs
args=("--fallback" "--keep-going" "--print-build-logs")
if [[ "${{ github.event_name }}" != "push" ]]; then
args+=("--dry-run")
fi
nix build "$flake" "${args[@]}"
2 changes: 1 addition & 1 deletion .github/workflows/pre-release.yml
@@ -33,7 +33,7 @@ jobs:
run: python -m pip install --upgrade pip

- name: install poetry
run: python -m pip install 'poetry==1.8.2' poetry-dynamic-versioning
run: python -m pip install 'poetry==1.8.3' poetry-dynamic-versioning

- name: compute ibis version
id: get_version
6 changes: 3 additions & 3 deletions .github/workflows/release.yml
@@ -14,7 +14,7 @@ jobs:
release:
runs-on: ubuntu-latest
steps:
- uses: actions/create-github-app-token@v1.9.3
- uses: actions/create-github-app-token@v1.10.1
id: generate_token
with:
app-id: ${{ secrets.APP_ID }}
@@ -25,12 +25,12 @@ jobs:
fetch-depth: 0
token: ${{ steps.generate_token.outputs.token }}

- uses: cachix/install-nix-action@v26
- uses: cachix/install-nix-action@v27
with:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- uses: cachix/cachix-action@v14
- uses: cachix/cachix-action@v15
with:
name: ibis
extraPullNames: nix-community,poetry2nix
97 changes: 97 additions & 0 deletions .github/workflows/upload-algolia-api.py
@@ -0,0 +1,97 @@
from __future__ import annotations # noqa: INP001

import os
import re
from functools import partial

from algoliasearch.search_client import SearchClient

api_key = os.environ["ALGOLIA_WRITE_API_KEY"]
app_id = os.environ["ALGOLIA_APP_ID"]
index_name = os.environ["ALGOLIA_INDEX"]


# These are QMD files generated with help from Quartodoc.
API_QMDS = [
"docs/reference/expression-collections.qmd",
"docs/reference/expression-generic.qmd",
"docs/reference/expression-geospatial.qmd",
"docs/reference/expression-numeric.qmd",
"docs/reference/expression-strings.qmd",
"docs/reference/expression-tables.qmd",
"docs/reference/expression-temporal.qmd",
]


HORRID_REGEX = re.compile(r"\|\s*\[(\w+)\]\((#[\w.]+)\)\s*\|\s*(.*?)\s*\|")
# Given | [method](some-anchor) | some multiword description |
# this regex extracts ("method", "some-anchor", "some multiword description")


def _grab_qmd_methods(lines):
# All of the QMD files have a section that looks like:
#
# ## Methods
#
# | [method](anchor-ref) | description |
# ...
#
# ### method
#
# yes this is gross, but grab the lines between the `## Methods` and the
# first `###` and then smash it into a list
methods = lines[(fence := lines.find("## Methods")) : lines.find("###", fence)]
methods = [entry for entry in methods.split("\n") if entry.startswith("| [")]

# Now this is in the form:
# | [method name](#anchor-name) | Top-level description |
return methods


def _create_api_record_from_method_line(base_url, method):
# e.g. for `reference/expression-collections.html` we want to grab "Collections"
section = (
base_url.removesuffix(".html")
.removeprefix("reference/expression-")
.capitalize()
)
name, anchor, desc = re.match(HORRID_REGEX, method).groups()
record = {
"objectID": f"{base_url}{anchor}",
"href": f"{base_url}{anchor}",
"title": name,
"text": desc,
"crumbs": ["Expression API", "API", f"{section} expressions"],
}

return record


def main():
client = SearchClient.create(app_id, api_key)
index = client.init_index(index_name)

records = []
for qmd in API_QMDS:
# For each QMD file, get the table-section of the methods, anchors, and descriptions
with open(qmd) as f:
methods = _grab_qmd_methods(f.read())

# Massage the QMD filename into the expected URL that prepends the anchor
# so we end up eventually with something like
# `reference/expression-collections.html#some-anchor`
base_url = f"{qmd.removeprefix('docs/').removesuffix('.qmd')}.html"

# Generate a dictionary for each row of the method table
_creator = partial(_create_api_record_from_method_line, base_url)
records += list(map(_creator, methods))

# This saves the list of records to Algolia
# If the object IDs are new (which typically should be) this adds a new
# record to the Algolia index. If the object ID already exists, it gets
# updated with the new fields in the record dict
index.save_objects(records)


if __name__ == "__main__":
main()
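Not part of the diff, but to make the table-parsing regex above concrete, here is a small standalone check of the same pattern against a made-up Quartodoc method-table row (the method name, anchor, and description are illustrative only):

```python
import re

# Same pattern as HORRID_REGEX in the script above.
pattern = re.compile(r"\|\s*\[(\w+)\]\((#[\w.]+)\)\s*\|\s*(.*?)\s*\|")

# Hypothetical row in the shape the generated QMD method tables use.
row = "| [fill_null](#ibis.expr.types.generic.Value.fill_null) | Replace nulls with a value |"

name, anchor, desc = pattern.match(row).groups()
print(name)    # fill_null
print(anchor)  # #ibis.expr.types.generic.Value.fill_null
print(desc)    # Replace nulls with a value
```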
1 change: 1 addition & 0 deletions .gitignore
@@ -151,3 +151,4 @@ docs/**/*.html

# jupyterlite stuff
.jupyterlite.doit.db
docs/jupyter_lite_config.json
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -18,11 +18,11 @@ default_stages:
- commit
repos:
- repo: https://github.com/rhysd/actionlint
rev: v1.6.27
rev: v1.7.1
hooks:
- id: actionlint-system
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies:
15 changes: 9 additions & 6 deletions README.md
@@ -29,7 +29,8 @@ You can `pip install` Ibis with a backend and example data:
pip install 'ibis-framework[duckdb,examples]'
```

> [!TIP]
> 💡 **Tip**
>
> See the [installation guide](https://ibis-project.org/install) for more installation options.
Then use Ibis:
@@ -56,7 +57,7 @@ Then use Ibis:
│ Adelie  │ Torgersen │ 42.0 │ 20.2 │ 190 │ 4250 │ NULL │ 2007 │
│ … │ … │ … │ … │ … │ … │ … │ … │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
>>> g = t.group_by(["species", "island"]).agg(count=t.count()).order_by("count")
>>> g = t.group_by("species", "island").agg(count=t.count()).order_by("count")
>>> g
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species ┃ island ┃ count ┃
@@ -71,7 +72,8 @@ Then use Ibis:
└───────────┴───────────┴───────┘
```

> [!TIP]
> 💡 **Tip**
>
> See the [getting started tutorial](https://ibis-project.org/tutorials/getting_started) for a full introduction to Ibis.
## Python + SQL: better together
@@ -165,7 +167,7 @@ Ibis broadly supports two types of backend:
1. SQL-generating backends
2. DataFrame-generating backends

![Ibis backend types](docs/images/backends.png)
![Ibis backend types](https://raw.githubusercontent.com/ibis-project/ibis/main/docs/images/backends.png)

## Portability

@@ -202,15 +204,16 @@ You can also read from common file formats like CSV or Apache Parquet:

This allows you to iterate locally and deploy remotely by changing a single line of code.

> [!TIP]
> 💡 **Tip**
>
> Check out [the blog on backend agnostic arrays](https://ibis-project.org/posts/backend-agnostic-arrays/) for one example using the same code across DuckDB and BigQuery.
## Community and contributing

Ibis is an open source project and welcomes contributions from anyone in the community.

- Read [the contributing guide](https://github.com/ibis-project/ibis/blob/main/docs/CONTRIBUTING.md).
- We care about keeping the community welcoming for all. Check out [the code of conduct](https://github.com/ibis-project/ibis/blob/main/docs/CODE_OF_CONDUCT.md).
- We care about keeping the community welcoming for all. Check out [the code of conduct](https://github.com/ibis-project/ibis/blob/main/CODE_OF_CONDUCT.md).
- The Ibis project is open sourced under the [Apache License](https://github.com/ibis-project/ibis/blob/main/LICENSE.txt).

Join our community by interacting on GitHub or chatting with us on [Zulip](https://ibis-project.zulipchat.com/).
4 changes: 2 additions & 2 deletions ci/release/dry_run.sh
@@ -35,14 +35,14 @@ nix develop '.#release' -c git commit -m 'test: semantic-release dry run' --no-v
unset GITHUB_ACTIONS

nix develop '.#release' -c npx --yes \
-p semantic-release \
-p "semantic-release@24.0.0" \
-p "@semantic-release/commit-analyzer" \
-p "@semantic-release/release-notes-generator" \
-p "@semantic-release/changelog" \
-p "@semantic-release/exec" \
-p "@semantic-release/git" \
-p "semantic-release-replace-plugin@1.2.0" \
-p "conventional-changelog-conventionalcommits@6.1.0" \
-p "conventional-changelog-conventionalcommits@8.0.0" \
semantic-release \
--ci \
--dry-run \
4 changes: 2 additions & 2 deletions ci/release/run.sh
@@ -3,13 +3,13 @@
set -euo pipefail

nix develop '.#release' -c npx --yes \
-p semantic-release \
-p "semantic-release@24.0.0" \
-p "@semantic-release/commit-analyzer" \
-p "@semantic-release/release-notes-generator" \
-p "@semantic-release/changelog" \
-p "@semantic-release/github" \
-p "@semantic-release/exec" \
-p "@semantic-release/git" \
-p "semantic-release-replace-plugin@1.2.0" \
-p "conventional-changelog-conventionalcommits@6.1.0" \
-p "conventional-changelog-conventionalcommits@8.0.0" \
semantic-release --ci
20 changes: 10 additions & 10 deletions compose.yaml
@@ -1,6 +1,6 @@
services:
clickhouse:
image: clickhouse/clickhouse-server:24.3.2.23-alpine
image: clickhouse/clickhouse-server:24.5.1.1763-alpine
ports:
- 8123:8123 # http port
- 9000:9000 # native protocol port
@@ -9,7 +9,7 @@ services:
retries: 10
test:
- CMD-SHELL
- wget -qO- 'http://localhost:8123/?query=SELECT%201' # SELECT 1
- wget -qO- 'http://127.0.0.1:8123/?query=SELECT%201' # SELECT 1
volumes:
- clickhouse:/var/lib/clickhouse/user_files/ibis
networks:
@@ -28,7 +28,7 @@ services:
- CMD
- mariadb-admin
- ping
image: mariadb:11.3.2
image: mariadb:11.4.2
ports:
- 3306:3306
networks:
@@ -77,7 +77,7 @@ services:
- mssql

hive-metastore-db:
image: postgres:16.2-alpine
image: postgres:16.3-alpine
environment:
POSTGRES_USER: admin
POSTGRES_PASSWORD: admin
@@ -94,7 +94,7 @@ services:
- trino

minio:
image: bitnami/minio:2024.4.28
image: bitnami/minio:2024.6.11
environment:
MINIO_ROOT_USER: accesskey
MINIO_ROOT_PASSWORD: secretkey
@@ -156,7 +156,7 @@ services:
test:
- CMD-SHELL
- trino --output-format null --execute 'show schemas in hive; show schemas in memory'
image: trinodb/trino:445
image: trinodb/trino:449
ports:
- 8080:8080
networks:
@@ -167,7 +167,7 @@ services:
- $PWD/docker/trino/jvm.config:/etc/trino/jvm.config:ro

druid-postgres:
image: postgres:16.2-alpine
image: postgres:16.3-alpine
container_name: druid-postgres
environment:
POSTGRES_PASSWORD: FoolishPassword
@@ -341,7 +341,7 @@ services:
- druid

oracle:
image: gvenzl/oracle-free:23.3-slim
image: gvenzl/oracle-free:23.4-slim
environment:
ORACLE_PASSWORD: ibis
ORACLE_DATABASE: IBIS_TESTING
@@ -362,7 +362,7 @@ services:
- oracle:/opt/oracle/data

exasol:
image: exasol/docker-db:7.1.26
image: exasol/docker-db:8.27.0
privileged: true
ports:
- 8563:8563
Expand All @@ -372,7 +372,7 @@ services:
timeout: 90s
test:
- CMD-SHELL
- /usr/opt/EXASuite-7/EXASolution-7.*/bin/Console/exaplus -c 127.0.0.1:8563 -u sys -p exasol -encryption OFF <<< 'SELECT 1'
- /opt/exasol/db-8.*/bin/Console/exaplus -q -x -c 127.0.0.1:8563 -u sys -p exasol -jdbcparam validateservercertificate=0 -sql 'SELECT 1;'
networks:
- exasol
volumes:
1 change: 1 addition & 0 deletions docker/trino/jvm.config
@@ -13,4 +13,5 @@
-Djdk.attach.allowAttachSelf=true
-Djdk.nio.maxCachedBufferSize=2000000
-XX:+UnlockDiagnosticVMOptions
-XX:G1NumCollectionsKeepPinned=10000000
-XX:+UseAESCTRIntrinsics
2 changes: 1 addition & 1 deletion docs/CONTRIBUTING.md
@@ -7,4 +7,4 @@ To get started:
1. [Set up a development environment](https://ibis-project.org/community/contribute/01_environment/)
1. [Learn about the commit workflow](https://ibis-project.org/community/contribute/02_workflow/)
1. [Review the code style guidelines](https://ibis-project.org/community/contribute/03_style/)
1. [Dig into the nitty gritty of being a maintainer](https://ibis-project.org/community/contribute/05_maintainers_guide/)
1. [Dig into the nitty gritty of being a maintainer](https://ibis-project.org/community/contribute/04_maintainers_guide/)
12 changes: 0 additions & 12 deletions docs/README.md

This file was deleted.

9,821 changes: 4,980 additions & 4,841 deletions docs/_extensions/quarto-ext/fontawesome/assets/css/all.css

38 changes: 30 additions & 8 deletions docs/_quarto.yml
@@ -45,8 +45,8 @@ website:
type: overlay
algolia:
index-name: prod_ibis
application-id: HS77W8GWM1
search-only-api-key: 8ca4fcd24da322db857322ae4f79d6f3
application-id: TNU9HG3L41
search-only-api-key: 26350948d1961209df16d9717a7e01d6
analytics-events: true
show-logo: true

@@ -124,7 +124,7 @@ website:
collapse-level: 2
contents:
- auto: backends/*.qmd
- support_matrix.qmd
- auto: backends/support
- id: how-to
title: "How-to"
style: "docked"
@@ -298,10 +298,6 @@ quartodoc:
- name: param
dynamic: true
signature_name: full
- name: NA
# Ideally exposed under `ibis` but that doesn't seem to work??
package: ibis.expr.api
signature_name: full
- name: "null"
dynamic: true
signature_name: full
@@ -506,6 +502,32 @@ quartodoc:
- name: NumericValue.point
package: ibis.expr.types.numeric

- kind: page
path: operations
package: ibis.expr.operations
summary:
name: Operations
desc: Low level operation classes. Subject to change in non-major releases.
contents:
- analytic
- arrays
- generic
- geospatial
- histograms
- json
- logical
- maps
- numeric
- reductions
- relations
- sortkeys
- strings
- structs
- subqueries
- temporal
- temporal_windows
- udf
- window
- kind: page
summary:
name: Column selectors
@@ -545,10 +567,10 @@ quartodoc:
package: ibis
dynamic: true
signature_name: full
- DataType
- Array
- Binary
- Boolean
- DataType
- Date
- Decimal
- Float16
4 changes: 0 additions & 4 deletions docs/backends/pyspark.qmd
@@ -78,10 +78,6 @@ con = ibis.pyspark.connect(session=session)
`ibis.pyspark.connect` is a thin wrapper around [`ibis.backends.pyspark.Backend.do_connect`](#ibis.backends.pyspark.Backend.do_connect).
:::

::: {.callout-note}
The `pyspark` backend does not create `SparkSession` objects (unless you [connect using a URL](#ibis.connect-url-format)); you must create a `SparkSession` and pass that to `ibis.pyspark.connect`.
:::

### Connection Parameters

```{python}
130 changes: 130 additions & 0 deletions docs/backends/support/matrix.qmd
@@ -0,0 +1,130 @@
---
title: "Operation support matrix"
format: dashboard
hide:
- toc
---

## {height=25%}

::: {.card title="Welcome to the operation support matrix!"}

This is a [Quarto dashboard](https://quarto.org/docs/dashboards/) that shows
the operations each backend supports.

Due to differences in SQL dialects and upstream support for different
operations in different backends, support for the full breadth of the Ibis API
varies.

::: {.callout-tip}
Backends with low coverage are good places to start contributing!

Each backend implements operations independently, but the implementations are
usually very similar across backends. If you want to start contributing to
Ibis, adding missing operations to backends with low operation coverage is a
good place to start.
:::

:::

### {width=25%}

```{python}
#| content: valuebox
#| title: "Number of backends"
import ibis
dict(
value=len(ibis.util.backend_entry_points()),
color="info",
icon="signpost-split-fill",
)
```

### {width=25%}

```{python}
#| content: valuebox
#| title: "Number of SQL backends"
import importlib
from ibis.backends.sql import SQLBackend
sql_backends = sum(
issubclass(
importlib.import_module(f"ibis.backends.{entry_point.name}").Backend, SQLBackend
)
for entry_point in ibis.util.backend_entry_points()
)
assert sql_backends > 0
dict(value=sql_backends, color="green", icon="database")
```

## {height=70%}

```{python}
#| echo: false
import pandas as pd
import ibis
import ibis.expr.operations as ops
def make_support_matrix():
"""Construct the backend operation support matrix data."""
from ibis.backends.sql.compiler import ALL_OPERATIONS
support_matrix_ignored_operations = (ops.ScalarParameter,)
public_ops = ALL_OPERATIONS.difference(support_matrix_ignored_operations)
assert public_ops
support = {"Operation": [f"{op.__module__}.{op.__name__}" for op in public_ops]}
support.update(
(backend, list(map(getattr(ibis, backend).has_operation, public_ops)))
for backend in sorted(ep.name for ep in ibis.util.backend_entry_points())
)
def make_link(parts):
module, op = parts[-2:]
return f'<a href="./operations.html#ibis.expr.operations.{module}.{op}">{op}</a>'
support_matrix = (
pd.DataFrame(support)
.assign(splits=lambda df: df.Operation.str.findall("[a-zA-Z_][a-zA-Z_0-9]*"))
.assign(
Category=lambda df: df.splits.str[-2],
Operation=lambda df: df.splits.map(make_link),
)
.drop(["splits"], axis=1)
.set_index(["Category", "Operation"])
.sort_index()
)
all_visible_ops_count = len(support_matrix)
assert all_visible_ops_count
coverage = pd.Index(
support_matrix.sum()
.map(lambda n: f"{n} ({round(100 * n / all_visible_ops_count)}%)")
.T
)
support_matrix.columns = pd.MultiIndex.from_tuples(
list(zip(support_matrix.columns, coverage)), names=("Backend", "API coverage")
)
return support_matrix
```

```{python}
from itables import show
matrix = make_support_matrix()
show(
matrix.replace({True: "✔", False: "🚫"}),
ordering=False,
paging=False,
buttons=["copy", "excel", "csv"],
)
```
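The dashboard code above boils down to many `has_operation` checks; a minimal sketch of the same check for a single backend and operation (the exact results may differ between Ibis versions):

```python
import ibis
import ibis.expr.operations as ops

# Ask a backend whether it can compile a given operation type.
print(ibis.duckdb.has_operation(ops.ArrayLength))  # expected True for DuckDB
print(ibis.sqlite.has_operation(ops.ArrayLength))  # expected False: SQLite has no array type
```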
1 change: 1 addition & 0 deletions docs/backends/support/operations.qmd
@@ -0,0 +1 @@
{{< include ../../reference/operations.qmd >}}
2 changes: 1 addition & 1 deletion docs/backends_sankey.py
@@ -94,7 +94,7 @@ def to_greyish(hex_code, grey_value=128):

fig.update_layout(
title_text="Ibis backend types",
font_size=24,
font_size=20,
# font_family="Arial",
title_font_size=30,
margin=dict(l=30, r=30, t=80, b=30),
2 changes: 1 addition & 1 deletion docs/contribute/01_environment.qmd
@@ -193,7 +193,7 @@ For a better development experience see the `conda/mamba` or `nix` setup instruc
1. Install development dependencies

```sh
pip install 'poetry==1.8.2'
pip install 'poetry==1.8.3'
pip install -r requirements-dev.txt
```

2 changes: 1 addition & 1 deletion docs/contribute/index.md
@@ -1,3 +1,3 @@
# Contribute

Check out our [contributing guide](../../CONTRIBUTING.md) for details! Guides for setting up an environment and getting started are here.
Check out our [contributing guide](../CONTRIBUTING.md) for details! Guides for setting up an environment and getting started are here.
12 changes: 0 additions & 12 deletions docs/getting-started.qmd

This file was deleted.

2 changes: 1 addition & 1 deletion docs/how-to/extending/builtin.qmd
@@ -77,7 +77,7 @@ rest of the library:

```{python}
pkgs = ibis.read_parquet(
"https://storage.googleapis.com/ibis-tutorial-data/pypi/packages.parquet"
"https://storage.googleapis.com/ibis-tutorial-data/pypi/2024-04-24/packages.parquet"
)
pandas_ish = pkgs[jw_sim(pkgs.name, "pandas") >= 0.9]
pandas_ish
2 changes: 1 addition & 1 deletion docs/how-to/extending/unbound_expression.qmd
@@ -109,7 +109,7 @@ Because Ibis separates the transformation logic from the execution engine, you
can easily reuse the written transformation for another backend. Here we use
Polars as an example, but you can do the same for any of Ibis' 20+ supported
backends as long as that particular backend supports the operations
(see [the operation support matrix](../../support_matrix.qmd)).
(see [the operation support matrix](../../backends/support/matrix.qmd)).

```{python}
pl = ibis.polars.connect()
2 changes: 1 addition & 1 deletion docs/how-to/timeseries/sessionize.qmd
@@ -59,7 +59,7 @@ sessionized = (
data
# Create a session id for each character by using a cumulative sum
# over the `new_session` column.
.mutate(new_session=is_new_session.fillna(True))
.mutate(new_session=is_new_session.fill_null(True))
# Create a session id for each character by using a cumulative sum
# over the `new_session` column.
.mutate(session_id=c.new_session.sum().over(entity_window))
519 changes: 116 additions & 403 deletions docs/index.qmd


8 changes: 0 additions & 8 deletions docs/jupyter_lite_config.json

This file was deleted.

2 changes: 1 addition & 1 deletion docs/posts/campaign-finance/index.qmd
@@ -245,7 +245,7 @@ def get_election_type(pgi: StringValue) -> StringValue:
"E": "recount",
}
first_letter = pgi[0]
return first_letter.substitute(election_types, else_=ibis.NA)
return first_letter.substitute(election_types, else_=ibis.null())
cleaned = cleaned.mutate(election_type=get_election_type(_.TRANSACTION_PGI)).drop(
2 changes: 1 addition & 1 deletion docs/posts/ibis-analytics/index.qmd
@@ -1220,7 +1220,7 @@ def transform_downloads(extract_downloads):
)
.order_by(ibis._.timestamp.desc())
)
downloads = downloads.mutate(ibis._["python"].fillna("").name("python_full"))
downloads = downloads.mutate(ibis._["python"].fill_null("").name("python_full"))
downloads = downloads.mutate(
f.clean_version(downloads["python_full"], patch=False).name("python")
)
62 changes: 47 additions & 15 deletions docs/posts/ibis-duckdb-geospatial/index.qmd
@@ -164,20 +164,29 @@ streets_gdf.crs = "EPSG:26918"
```

```{python}
import leafmap.deckgl as leafmap # <1>
from lonboard import Map, ScatterplotLayer, PathLayer, PolygonLayer
```

1. `leafmap.deckgl` allows us to visualize multiple layers

```{python}
m = leafmap.Map()
m.add_vector(broad_station_gdf, get_fill_color="blue")
m.add_vector(sts_near_broad_gdf, get_color="red", opacity=0.5)
m.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.3)
broad_station_layer = ScatterplotLayer.from_geopandas(
broad_station_gdf, get_fill_color="blue", get_radius=5
)
sts_near_broad_layer = PathLayer.from_geopandas(
sts_near_broad_gdf, get_color="red", opacity=0.4, get_width=2
)
streets_layer = PathLayer.from_geopandas(streets_gdf, get_color="grey", opacity=0.3)
m = Map(
[
broad_station_layer,
sts_near_broad_layer,
streets_layer,
],
view_state={"longitude": -74.01066, "latitude": 40.7069, "zoom": 16}
)
m
```


You can zoom in and out, and hover over the map to check on the street names.

### `buffer` (ST_Buffer)
@@ -234,15 +243,38 @@ h_near_broad_gdf.crs = "EPSG:26918"
h_street_gdf = h_street.to_pandas()
h_street_gdf.crs = "EPSG:26918"
```


mh = leafmap.Map()
mh.add_vector(broad_station_gdf, get_fill_color="orange")
mh.add_vector(broad_station_zone, get_fill_color="orange", opacity=0.1)
mh.add_vector(h_near_broad_gdf, get_fill_color="red", opacity=0.5)
mh.add_vector(h_street_gdf, get_color="blue", opacity=0.3)
mh.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.2)
```{python}
broad_station_layer = ScatterplotLayer.from_geopandas(
broad_station_gdf, get_fill_color="orange", get_radius=5
)
broad_station_zone_layer = PolygonLayer.from_geopandas(
broad_station_zone, get_fill_color="orange", opacity=0.1
)
h_near_broad_layer = ScatterplotLayer.from_geopandas(
h_near_broad_gdf, get_fill_color="red", get_radius=5
)
h_street_layer = PathLayer.from_geopandas(
h_street_gdf, get_color="blue", opacity=0.5, get_width=2
)
streets_layer = PathLayer.from_geopandas(streets_gdf, get_color="grey", opacity=0.3)
mh = Map(
[
broad_station_layer,
broad_station_zone_layer,
h_near_broad_layer,
h_street_layer,
streets_layer,
],
view_state={"longitude": -74.01066, "latitude": 40.7069, "zoom": 16}
)
mh
```

487 changes: 487 additions & 0 deletions docs/posts/ibis-version-9.0.0-release/index.qmd


42 changes: 42 additions & 0 deletions docs/posts/sqlmesh-ibis/index.qmd
@@ -0,0 +1,42 @@
---
title: "Unlocking data insights with Ibis and SQLMesh"
author: "Chloe He"
date: "2024-05-21"
image: thumbnail.png
categories:
- blog
- sqlmesh
- data engineering
---

# Overview

Have you ever needed to learn new dialects of database languages as a data
scientist or struggled with the differences between database languages? Does
your company manage different production pipelines with multiple databases
or engines? Have you needed to rewrite data pipelines from experimentation
to deployment?

These are challenges that SQLMesh and Ibis together can solve.

[SQLMesh](https://sqlmesh.com/) is a next-generation data transformation
and modeling framework. It aims to be easy to use, correct, and efficient
and is maintained by the [Tobiko Data](https://tobikodata.com/) team. It
helps you scalably, reliably and safely modify your data pipelines because
it understands SQL and can make intelligent updates instead of stringing
scripts together. SQLMesh boasts several future-proof features such as
automatic data contracts, virtual data environments and snapshots, extensive
change summaries (before updates are applied!) and column-level lineage out
of the box.

We walk through an example together to show you how to harness the full
potential of your data analytics workflow and how SQLMesh and Ibis can work
together hand-in-hand. Read the full article on
[SQLMesh blog](https://tobikodata.com/ibis-sqlmesh-unlocking-data-insights.html)!

In today's data-driven world, the ability to efficiently analyze and derive
insights from vast amounts of data is paramount. Leveraging powerful
open-source tools like SQLMesh and Ibis can streamline this process,
enabling you to easily manipulate and query data.

Let us know how you're using SQLMesh and Ibis together in your use case!
Binary file added docs/posts/sqlmesh-ibis/thumbnail.png
296 changes: 258 additions & 38 deletions docs/presentations/overview.qmd


Binary file added docs/presentations/pycon2024/basement-ci.jpeg
Binary file added docs/presentations/pycon2024/bill.png
Binary file added docs/presentations/pycon2024/docker-eye-roll.gif
Binary file added docs/presentations/pycon2024/gha.png
Binary file added docs/presentations/pycon2024/heisenbug.png
Binary file added docs/presentations/pycon2024/machine.gif
355 changes: 355 additions & 0 deletions docs/presentations/pycon2024/maintainers.qmd
@@ -0,0 +1,355 @@
---
title: "Test 20 databases on every commit"
subtitle: "It's not hyperbole"
author:
- Phillip Cloud
execute:
echo: true
format:
revealjs:
footer: <https://ibis-project.org>
# https://quarto.org/docs/presentations/revealjs/themes.html#using-themes
---

## Who

:::: {.columns}

::: {.column width="50%"}
### Me

- Phillip Cloud
- Ibis project
- Voltron Data
- Data tools for 10+ years
:::

::: {.column width="50%"}
### Where

- {{< fa brands github >}} [`@cpcloud`](https://github.com/cpcloud)
- {{< fa brands youtube >}} [Phillip in the Cloud](https://www.youtube.com/@cpcloud)
- {{< fa brands twitter >}} [`@cpcloudy`](https://x.com/cpcloudy)
:::

::::

# What

## Maybe this is you

![](./docker-eye-roll.gif){fig-align="center"}

## Or this

![](./wonka.png){fig-align="center"}

## Or maybe even this

![](./basement-ci.jpeg){fig-align="center"}

## Not earth shattering

:::: {.columns}

::: {.column width="50%"}
### Overview

- What we learned about maintenance
- Day to day of 20+ databases
- Unique challenges
:::

::: {.column width="50%"}
### Tools

- Docker
- Packaging
- CI
- `pytest` plugins
:::
::::

# Overview of Ibis

## Ibis is a Python library for:

- Exploratory data analysis (EDA)
- Analytics
- Data engineering
- ML preprocessing
- Building your own DataFrame lib

::: {.r-fit-text}
_Dev to prod with the same API_
:::

## One API, 20+ backends {.smaller .scrollable}

```{python}
#| code-fold: true
#| echo: false
import ibis
ibis.options.interactive = True
t = ibis.examples.penguins.fetch()
t.to_parquet("penguins.parquet")
```

::: {.panel-tabset}

## DuckDB

```{python}
con = ibis.connect("duckdb://")
```

```{python}
t = con.read_parquet("penguins.parquet")
t.head(3)
```

```{python}
t.group_by("species", "island").agg(count=t.count()).order_by("count")
```

## Polars

```{python}
con = ibis.connect("polars://")
```

```{python}
t = con.read_parquet("penguins.parquet")
t.head(3)
```

```{python}
t.group_by("species", "island").agg(count=t.count()).order_by("count")
```

## DataFusion

```{python}
con = ibis.connect("datafusion://")
```

```{python}
t = con.read_parquet("penguins.parquet")
t.head(3)
```

```{python}
t.group_by("species", "island").agg(count=t.count()).order_by("count")
```

## PySpark

```{python}
con = ibis.connect("pyspark://")
```

```{python}
t = con.read_parquet("penguins.parquet")
t.head(3)
```

```{python}
t.group_by("species", "island").agg(count=t.count()).order_by("count")
```

## 16+ other things

![](./machine.gif){fig-align="center" width="100%" height="100%"}

:::

## How it works

```{python}
#| echo: false
#| fig-align: center
import os
import sys
sys.path.append(os.path.abspath("../.."))
from backends_sankey import fig
fig.show()
```

# What's in an Ibis?

## By the numbers {.smaller}

:::: {.columns}
::: {.column width="50%"}
### Backends
- **17** SQL
- **3** non-SQL
- **2** cloud
:::

::: {.column width="50%"}
### Engines + APIs
- **9** distributed SQL
- **3** dataframe
- oldest: **~45** years 👀
- youngest: **~2** years
:::
::::

### Other facts

- Latency is variable
- Deployment models vary

::: {.fragment}
::: {.r-fit-text}
_**Feature development**_
:::
:::

## Bit of a pickle

![](./picklerick.png)

# How

## High level

### Goal: fast iteration

- fast env setup (dependency management)
- fast(ish) tests (test-running library)
- high **job** concurrency (ci/provider)
- **easy to run**: dev speed ([`just`](https://github.com/casey/just))

::: {.fragment}
::: {.r-fit-text}
_CI must complete "quickly"_
:::
:::

## Tools: overview

- 📦 poetry
- 🖥️ GitHub Actions
- 🦁 docker
- 🐕 docker
- 🐱 no special tx (duckdb, polars)
- 🏃 task runner (e.g.: `just up postgres`)

## Tools: poetry

::: {.callout-warning}
## Opinions follow
:::

- **Env setup needs to be _fast_**: avoid constraint solving
- Poetry is one way; there are others
- Get yourself a lockfile
- Downsides?

::: {.fragment}
::: {.r-fit-text}
_Are you doing that **now**_
:::
:::

## This plot

::: {layout="[[-1], [1], [-1]]"}

![](./progress.png){fig-align="center"}

:::

::: {.fragment}
::: {.r-fit-text}
_We've added 3 or 4 new backends since the switch_
:::
:::

## Tools: docker

- Sure, docker
- But, do you want to use it locally?
- Use health checks; "dumb" ones are fine
- Make it easy for devs to use

## Tools: GitHub Actions {.smaller}

::: {.callout-note}
### I don't work for GitHub
:::

- Pay for [the Teams plan](https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration#usage-limits) to get more concurrency
- Automate dependency updates

::: {.columns}
::: {.column width="50%"}
### GHA concurrency limits

![](./gha.png)
:::

::: {.column width="50%"}
### Ibis CI cost

![](./bill.png)
:::
:::

## `pytest` {.smaller}

### Ibis problems

- Tests run across **ALL** backends
- Backends don't implement the same stuff
- Need to know when backend passes
- Answer questions like: "will it _ever_ blend?"

::: {.fragment}
### Markers + hooks

```python
@pytest.mark.never("duckdb") # never gonna happen
@pytest.mark.notyet("impala") # might happen
@pytest.mark.notimpl("snowflake") # ibis devs: do some work
def test_soundex():
...

def pytest_ignore_collect(...):
# pytest -m duckdb: don't collect things that aren't marked duckdb
...
```
:::

## `pytest` plugins you may like

**`pytest-`**

- `xdist`: make this work if you can
- `randomly`: break your bogus stateful assumptions
- `repeat`: when `randomly` exposes your assumptions
- `clarity`: readable test failure messages
- `snapshot`: better than the giant `f`-string you just wrote

**hypothesis** 👈 that too, we don't use it enough

## Why `pytest-randomly`?

![](./heisenbug.png){fig-align="center"}

# Summary

- Use docker for dev **and** prod
- Lock your dependencies (dev only!)
- Auto update stuff
- `pytest` probably has a thing for that
- Spend time on dev ex
- Track CI run durations, look at them too

# Questions?
Binary file added docs/presentations/pycon2024/picklerick.png
Binary file added docs/presentations/pycon2024/progress.png
Binary file added docs/presentations/pycon2024/wonka.png
143 changes: 117 additions & 26 deletions docs/release_notes_generated.qmd


15 changes: 15 additions & 0 deletions docs/styles.css
@@ -13,6 +13,7 @@ section[id^="parameters-"] {
content: url("./zulip.svg");
vertical-align: -0.125em;
width: 1em;
filter: grayscale(1);
}

.jupyterlite-console {
@@ -21,3 +22,17 @@ section[id^="parameters-"] {
margin: auto;
display: block;
}

.index-grid {
@extend .grid;
display: flex;
justify-content: space-between;
}

.index-g-col-4 {
@extend .g-col-4;
flex: 1;
/* Ensures all columns grow to fill the same space */
margin: 0 5px;
/* Adds a small margin between columns */
}
52 changes: 0 additions & 52 deletions docs/support_matrix.py

This file was deleted.

74 changes: 0 additions & 74 deletions docs/support_matrix.qmd

This file was deleted.

7 changes: 6 additions & 1 deletion docs/tutorials/getting_started.qmd
@@ -2,6 +2,12 @@

This is a quick tour of some basic commands and usage patterns, just to get your flippers wet.

::: {.callout-tip}
You can run this tutorial in a GitHub Codespace with everything set up for you:

[![](https://github.com/codespaces/badge.svg)](https://codespaces.new/ibis-project/ibis)
:::

## Install Ibis

{{< include ../_tabsets/install_default.qmd >}}
@@ -24,7 +30,6 @@ con.create_table(
You can now see the example dataset copied over to the database:

```{python}
con = ibis.connect("duckdb://penguins.ddb")
con.list_tables()
```

8 changes: 4 additions & 4 deletions docs/tutorials/ibis-for-pandas-users.qmd
@@ -507,7 +507,7 @@ represented by `NaN`. This can be confusing when working with numeric data,
since `NaN` is also a valid floating point value (along with `+/-inf`).

In Ibis, we try to be more precise: All data types are nullable, and we use
`ibis.NA` to represent `NULL` values, and all datatypes have a `.isnull()` method.
`ibis.null()` to represent `NULL` values, and all datatypes have a `.isnull()` method.
For floating point values, we use different values for `NaN` and `+/-inf`, and there
are the additional methods `.isnan()` and `.isinf()`.

@@ -532,17 +532,17 @@ the column name for the value to apply to.


```{python}
no_null_peng = penguins.fillna(dict(bill_depth_mm=0, bill_length_mm=0))
no_null_peng = penguins.fill_null(dict(bill_depth_mm=0, bill_length_mm=0))
```

### Replacing `NULL`s

Both pandas and Ibis have `fillna` methods which allow you to specify a replacement value
The Ibis equivalent of pandas `fillna` is `fill_null`; this method allows you to specify a replacement value
for `NULL` values.


```{python}
bill_length_no_nulls = penguins.bill_length_mm.fillna(0)
bill_length_no_nulls = penguins.bill_length_mm.fill_null(0)
```
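As a quick illustration of the renamed API in the hunk above, a minimal, self-contained sketch (made-up values, default in-memory backend assumed):

```python
import ibis

t = ibis.memtable({"bill_length_mm": [36.7, None, 39.3]})

# `fill_null` is the current spelling of what older tutorials called `fillna`.
filled = t.bill_length_mm.fill_null(0)
print(filled.execute())
```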

## Type casts
4 changes: 2 additions & 2 deletions docs/tutorials/ibis-for-sql-users.qmd
@@ -522,10 +522,10 @@ ibis.to_sql(expr)

### Using `NULL` in expressions

To use `NULL` in an expression, either use the special `ibis.NA` value:
To use `NULL` in an expression, use the `ibis.null()` value:

```{python}
pos_two = (t.two > 0).ifelse(t.two, ibis.NA)
pos_two = (t.two > 0).ifelse(t.two, ibis.null())
expr = t.mutate(two_positive=pos_two)
ibis.to_sql(expr)
```
2 changes: 1 addition & 1 deletion docs/why.qmd
@@ -352,7 +352,7 @@ Ibis already works well with machine learning libraries like:

{{< include ./_tabsets/install.qmd >}}

See the [backend support matrix](support_matrix.qmd) for details on operations
See the [backend support matrix](./backends/support/matrix.qmd) for details on operations
supported. [Open a feature
request](https://github.com/ibis-project/ibis/issues/new?assignees=&labels=feature&projects=&template=feature-request.yml&title=feat)
if you'd like to see support for an operation in a given backend. If the backend
18 changes: 9 additions & 9 deletions flake.lock
7 changes: 4 additions & 3 deletions flake.nix
@@ -125,20 +125,21 @@
in
rec {
packages = {
inherit (pkgs) ibis39 ibis310 ibis311 ibis312;
inherit (pkgs) ibis310 ibis311 ibis312;

default = pkgs.ibis312;

inherit (pkgs) update-lock-files gen-examples check-release-notes-spelling;
};

devShells = rec {
ibis39 = mkDevShell pkgs.ibisDevEnv39;
ibis310 = mkDevShell pkgs.ibisDevEnv310;
ibis311 = mkDevShell pkgs.ibisDevEnv311;
ibis312 = mkDevShell pkgs.ibisDevEnv312;

default = ibis312;
# move back to 3.12 when dask-expr is supported or the dask backend is
# removed
default = ibis310;

preCommit = pkgs.mkShell {
name = "preCommit";
11 changes: 6 additions & 5 deletions gen_redirects.py
@@ -69,7 +69,7 @@
"/api/expressions/top_level/": "/reference/expressions/top_level/",
"/api/schemas/": "/reference/schemas/",
"/api/selectors/": "/reference/selectors/",
"/backends/": "/support_matrix",
"/backends/": "/backends/support/matrix",
"/backends/BigQuery/": "/backends/bigquery/",
"/backends/Clickhouse/": "/backends/clickhouse/",
"/backends/Dask/": "/backends/dask/",
@@ -87,8 +87,8 @@
"/backends/SQLite/": "/backends/sqlite/",
"/backends/Snowflake/": "/backends/snowflake/",
"/backends/Trino/": "/backends/trino/",
"/backends/_support_matrix/": "/support_matrix",
"/backends/support_matrix": "/support_matrix",
"/backends/_support_matrix/": "/backends/support/matrix",
"/backends/support_matrix": "/backends/support/matrix",
"/blog": "/posts",
"/blog/Ibis-version-3.0.0-release/": "/posts/Ibis-version-3.0.0-release/",
"/blog/Ibis-version-3.1.0-release/": "/posts/Ibis-version-3.1.0-release/",
@@ -113,12 +113,12 @@
"/community/contribute/01_environment/": "/contribute/01_environment",
"/community/contribute/02_workflow/": "/contribute/02_workflow/",
"/community/contribute/03_style/": "/contribute/03_style/",
"/community/contribute/05_maintainers_guide/": "/contribute/05_maintainers_guide/",
"/community/contribute/04_maintainers_guide/": "/contribute/04_maintainers_guide/",
"/concept/backends/": "/concepts/backend",
"/concept/design/": "/concepts/internals",
"/concept/why_ibis/": "/why",
"/docs/": "/",
"/docs/dev/backends/support_matrix/": "/support_matrix",
"/docs/dev/backends/support_matrix/": "/backends/support/matrix",
"/docs/dev/contribute/01_environment/": "/contribute/01_environment",
"/docs/dev/release_notes/": "/release_notes",
"/getting_started/": "/tutorial/getting_started/",
@@ -154,6 +154,7 @@
"/reference/expressions/tables/": "/reference/expressions-tables",
"/reference/expressions/timestamps/": "/reference/expression-temporal",
"/reference/expressions/top_level/": "/reference/top_level",
"/support_matrix": "/backends/support/matrix",
"/tutorial/": "/tutorials/getting_started/",
"/tutorial/getting_started/": "/tutorials/getting_started",
"/tutorial/ibis-for-dplyr-users/": "/tutorials/ibis-for-dplyr-users/",
24 changes: 20 additions & 4 deletions ibis/__init__.py
@@ -2,7 +2,10 @@

from __future__ import annotations

__version__ = "9.0.0"
__version__ = "9.1.0"

import warnings
from typing import Any

from ibis import examples, util
from ibis.backends import BaseBackend
@@ -36,7 +39,7 @@ def __dir__() -> list[str]:
return sorted(out)


def __getattr__(name: str) -> BaseBackend:
def load_backend(name: str) -> BaseBackend:
"""Load backends in a lazy way with `ibis.<backend-name>`.
This also registers the backend options.
@@ -52,6 +55,7 @@ def __getattr__(name: str) -> BaseBackend:
attribute is "cached", so this function is only called the first time.
"""

entry_points = {ep for ep in util.backend_entry_points() if ep.name == name}

if not entry_points:
@@ -95,7 +99,6 @@ def __getattr__(name: str) -> BaseBackend:
# - connect
# - compile
# - has_operation
# - add_operation
# - _from_url
# - _to_sqlglot
#
@@ -116,7 +119,6 @@ def connect(*args, **kwargs):
proxy.connect = connect
proxy.compile = backend.compile
proxy.has_operation = backend.has_operation
proxy.add_operation = backend.add_operation
proxy.name = name
proxy._from_url = backend._from_url
proxy._to_sqlglot = backend._to_sqlglot
Expand All @@ -125,3 +127,17 @@ def connect(*args, **kwargs):
setattr(proxy, name, getattr(backend, name))

return proxy


def __getattr__(name: str) -> Any:
if name == "NA":
warnings.warn(
"The 'ibis.NA' constant is deprecated as of v9.1 and will be removed in a future "
"version. Use 'ibis.null()' instead.",
DeprecationWarning,
stacklevel=2,
)

return null() # noqa: F405
else:
return load_backend(name)
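As the new module-level `__getattr__` above implies, `ibis.NA` keeps working in 9.1 but now warns; a rough sketch of what downstream code can expect (assumes ibis 9.1 is installed):

```python
import warnings

import ibis

# New spelling: a typed NULL scalar expression.
expr = ibis.null()

# Old spelling still resolves, but emits a DeprecationWarning via __getattr__.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = ibis.NA

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```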
73 changes: 26 additions & 47 deletions ibis/backends/__init__.py
@@ -6,10 +6,9 @@
import importlib.metadata
import keyword
import re
import sys
import urllib.parse
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import parse_qs, urlparse

import ibis
@@ -51,15 +50,15 @@ def __init__(self, backend: BaseBackend):
def __getitem__(self, name) -> ir.Table:
try:
return self._backend.table(name)
except Exception as exc: # noqa: BLE001
except Exception as exc:
raise KeyError(name) from exc

def __getattr__(self, name) -> ir.Table:
if name.startswith("_"):
raise AttributeError(name)
try:
return self._backend.table(name)
except Exception as exc: # noqa: BLE001
except Exception as exc:
raise AttributeError(name) from exc

def __iter__(self) -> Iterator[str]:
@@ -903,6 +902,17 @@ def list_tables(
For some backends, the tables may be files in a directory,
or other equivalent entities in a SQL database.
::: {.callout-note}
## Ibis does not use the word `schema` to refer to database hierarchy.
A collection of tables is referred to as a `database`.
A collection of `database` is referred to as a `catalog`.
These terms are mapped onto the corresponding features in each
backend (where available), regardless of whether the backend itself
uses the same terminology.
:::
Parameters
----------
like
Expand All @@ -914,17 +924,6 @@ def list_tables(
pass in a dotted string path like `"catalog.database"` or a tuple of
strings like `("catalog", "database")`.
::: {.callout-note}
## Ibis does not use the word `schema` to refer to database hierarchy.
A collection of tables is referred to as a `database`.
A collection of `database` is referred to as a `catalog`.
These terms are mapped onto the corresponding features in each
backend (where available), regardless of whether the backend itself
uses the same terminology.
:::
Returns
-------
list[str]
Expand All @@ -938,6 +937,17 @@ def table(
) -> ir.Table:
"""Construct a table expression.
::: {.callout-note}
## Ibis does not use the word `schema` to refer to database hierarchy.
A collection of tables is referred to as a `database`.
A collection of `database` is referred to as a `catalog`.
These terms are mapped onto the corresponding features in each
backend (where available), regardless of whether the backend itself
uses the same terminology.
:::
Parameters
----------
name
Expand All @@ -949,17 +959,6 @@ def table(
pass in a dotted string path like `"catalog.database"` or a tuple of
strings like `("catalog", "database")`.
::: {.callout-note}
## Ibis does not use the word `schema` to refer to database hierarchy.
A collection of tables is referred to as a `database`.
A collection of `database` is referred to as a `catalog`.
These terms are mapped onto the corresponding features in each
backend (where available), regardless of whether the backend itself
uses the same terminology.
:::
Returns
-------
Table
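A hedged usage sketch of the `database` argument described in the callouts above; the connection, catalog, database, and table names are hypothetical:

```python
import ibis

con = ibis.duckdb.connect("example.ddb")  # hypothetical database file

# Both spellings address the same location: a dotted "catalog.database"
# string or an explicit ("catalog", "database") tuple.
t1 = con.table("events", database="my_catalog.analytics")
t2 = con.table("events", database=("my_catalog", "analytics"))

# list_tables accepts the same kinds of paths.
con.list_tables(database=("my_catalog", "analytics"))
```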
Expand Down Expand Up @@ -1052,23 +1051,6 @@ def _to_sqlglot(self, expr: ir.Expr, **kwargs) -> sg.exp.Expression:
def execute(self, expr: ir.Expr) -> Any:
"""Execute an expression."""

def add_operation(self, operation: ops.Node) -> Callable:
"""Add a translation function to the backend for a specific operation.
Operations are defined in `ibis.expr.operations`, and a translation
function receives the translator object and an expression as
parameters, and returns a value depending on the backend.
"""
if not hasattr(self, "compiler"):
raise RuntimeError("Only SQL-based backends support `add_operation`")

def decorator(translation_function: Callable) -> None:
self.compiler.translator_class.add_operation(
operation, translation_function
)

return decorator

@abc.abstractmethod
def create_table(
self,
Expand Down Expand Up @@ -1296,10 +1278,7 @@ def _get_backend_names(*, exclude: tuple[str] = ()) -> frozenset[str]:
"""

if sys.version_info < (3, 10):
entrypoints = importlib.metadata.entry_points()["ibis.backends"]
else:
entrypoints = importlib.metadata.entry_points(group="ibis.backends")
entrypoints = importlib.metadata.entry_points(group="ibis.backends")
return frozenset(ep.name for ep in entrypoints).difference(exclude)
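With the `sys.version_info` branch for Python 3.9 gone, the `group=` form of `entry_points` is used unconditionally; a small sketch of what the helper now relies on (output depends on which backend extras are installed):

```python
import importlib.metadata

# Python 3.10+ supports filtering by group directly, so no dict lookup
# on the full entry-point mapping is needed.
eps = importlib.metadata.entry_points(group="ibis.backends")
names = frozenset(ep.name for ep in eps)
print(sorted(names))  # e.g. ['bigquery', 'duckdb', ...]
```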


Expand Down
102 changes: 71 additions & 31 deletions ibis/backends/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
import glob
import os
import re
from typing import TYPE_CHECKING, Any, Callable, Optional
from typing import TYPE_CHECKING, Any, Optional
from urllib.parse import parse_qs, urlparse

import google.auth.credentials
import google.cloud.bigquery as bq
import google.cloud.bigquery_storage_v1 as bqstorage
import pandas as pd
import pydata_google_auth
import sqlglot as sg
import sqlglot.expressions as sge
Expand All @@ -27,7 +26,6 @@
from ibis import util
from ibis.backends import CanCreateDatabase, CanCreateSchema
from ibis.backends.bigquery.client import (
BigQueryCursor,
bigquery_param,
parse_project_and_dataset,
rename_partitioned_column,
Expand All @@ -40,9 +38,11 @@
from ibis.backends.sql.datatypes import BigQueryType

if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
from collections.abc import Callable, Iterable, Mapping
from pathlib import Path

import pandas as pd
import polars as pl
import pyarrow as pa
from google.cloud.bigquery.table import RowIterator

Expand Down Expand Up @@ -93,6 +93,8 @@ def _qualify_memtable(
if isinstance(node, sge.Table) and _MEMTABLE_PATTERN.match(node.name) is not None:
node.args["db"] = dataset
node.args["catalog"] = project
# make sure to quote table location
node = _force_quote_table(node)
return node


Expand Down Expand Up @@ -125,6 +127,27 @@ def _remove_null_ordering_from_unsupported_window(
return node


def _force_quote_table(table: sge.Table) -> sge.Table:
"""Force quote all the parts of a bigquery path.
The BigQuery identifier quoting semantics are bonkers
https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
my-table is OK, but not mydataset.my-table
mytable-287 is OK, but not mytable-287a
Just quote everything.
"""
for key in ("this", "db", "catalog"):
if (val := table.args[key]) is not None:
if isinstance(val, sg.exp.Identifier) and not val.quoted:
val.args["quoted"] = True
else:
table.args[key] = sg.to_identifier(val, quoted=True)
return table
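A hedged illustration of `_force_quote_table` using sqlglot directly; the project, dataset, and table names are made up and the exact rendering may differ slightly:

```python
import sqlglot as sg

# Parts like "my-project" and "my-table" are not valid unquoted BigQuery
# identifiers, so every component gets backtick-quoted.
t = sg.table("my-table", db="my-dataset", catalog="my-project")
t = _force_quote_table(t)
print(t.sql("bigquery"))  # expected roughly: `my-project`.`my-dataset`.`my-table`
```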


class Backend(SQLBackend, CanCreateDatabase, CanCreateSchema):
name = "bigquery"
compiler = BigQueryCompiler()
Expand Down Expand Up @@ -570,7 +593,8 @@ def table(

node = ops.DatabaseTable(
table.name,
schema=schema_from_bigquery_table(bq_table),
# https://cloud.google.com/bigquery/docs/querying-wildcard-tables#filtering_selected_tables_using_table_suffix
schema=schema_from_bigquery_table(bq_table, wildcard=table.name[-1] == "*"),
source=self,
namespace=ops.Namespace(database=dataset, catalog=project),
)
Expand Down Expand Up @@ -605,7 +629,7 @@ def _execute(self, stmt, query_parameters=None):
stmt, job_config=job_config, project=self.billing_project
)
query.result() # blocks until finished
return BigQueryCursor(query)
return query
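`raw_sql` and `_execute` now return the `google.cloud.bigquery` `QueryJob` itself rather than the removed `BigQueryCursor` wrapper; a hedged sketch of consuming it, assuming `con` is an existing BigQuery connection:

```python
query = con.raw_sql("SELECT 1 AS x")  # google.cloud.bigquery QueryJob

rows = query.result()  # RowIterator; blocks until the job finishes
print([row.values() for row in rows])  # [(1,)]
```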

def _to_sqlglot(
self,
Expand Down Expand Up @@ -715,11 +739,12 @@ def execute(self, expr, params=None, limit="default", **kwargs):

sql = self.compile(expr, limit=limit, params=params, **kwargs)
self._log(sql)
cursor = self.raw_sql(sql, params=params, **kwargs)
query = self.raw_sql(sql, params=params, **kwargs)

result = self.fetch_from_cursor(cursor, expr.as_table().schema())
schema = expr.as_table().schema() - ibis.schema({"_TABLE_SUFFIX": "string"})
result = self.fetch_from_query(query, schema)

return expr.__pandas_result__(result)
return expr.__pandas_result__(result, schema=schema)

def insert(
self,
Expand Down Expand Up @@ -759,27 +784,29 @@ def insert(
overwrite=overwrite,
)

def fetch_from_cursor(self, cursor, schema):
def fetch_from_query(self, query, schema):
from ibis.backends.bigquery.converter import BigQueryPandasData

arrow_t = self._cursor_to_arrow(cursor)
arrow_t = self._query_to_arrow(query)
df = arrow_t.to_pandas(timestamp_as_object=True)
return BigQueryPandasData.convert_table(df, schema)
return BigQueryPandasData.convert_table(
df, schema - ibis.schema({"_TABLE_SUFFIX": "string"})
)

def _cursor_to_arrow(
def _query_to_arrow(
self,
cursor,
query,
*,
method: Callable[[RowIterator], pa.Table | Iterable[pa.RecordBatch]]
| None = None,
method: (
Callable[[RowIterator], pa.Table | Iterable[pa.RecordBatch]] | None
) = None,
chunk_size: int | None = None,
):
if method is None:
method = lambda result: result.to_arrow(
progress_bar_type=None,
bqstorage_client=self.storage_client,
)
query = cursor.query
query_result = query.result(page_size=chunk_size)
# workaround potentially not having the ability to create read sessions
# in the dataset project
Expand All @@ -803,8 +830,8 @@ def to_pyarrow(
self._register_in_memory_tables(expr)
sql = self.compile(expr, limit=limit, params=params, **kwargs)
self._log(sql)
cursor = self.raw_sql(sql, params=params, **kwargs)
table = self._cursor_to_arrow(cursor)
query = self.raw_sql(sql, params=params, **kwargs)
table = self._query_to_arrow(query)
return expr.__pyarrow_result__(table)

def to_pyarrow_batches(
Expand All @@ -823,9 +850,9 @@ def to_pyarrow_batches(
self._register_in_memory_tables(expr)
sql = self.compile(expr, limit=limit, params=params, **kwargs)
self._log(sql)
cursor = self.raw_sql(sql, params=params, **kwargs)
batch_iter = self._cursor_to_arrow(
cursor,
query = self.raw_sql(sql, params=params, **kwargs)
batch_iter = self._query_to_arrow(
query,
method=lambda result: result.to_arrow_iterable(
bqstorage_client=self.storage_client
),
Expand Down Expand Up @@ -853,7 +880,11 @@ def get_schema(
),
name,
)
return schema_from_bigquery_table(self.client.get_table(table_ref))
return schema_from_bigquery_table(
self.client.get_table(table_ref),
# https://cloud.google.com/bigquery/docs/querying-wildcard-tables#filtering_selected_tables_using_table_suffix
wildcard=name[-1] == "*",
)

def list_databases(
self, like: str | None = None, catalog: str | None = None
Expand Down Expand Up @@ -918,7 +949,12 @@ def version(self):
def create_table(
self,
name: str,
obj: pd.DataFrame | pa.Table | ir.Table | None = None,
obj: ir.Table
| pd.DataFrame
| pa.Table
| pl.DataFrame
| pl.LazyFrame
| None = None,
*,
schema: ibis.Schema | None = None,
database: str | None = None,
Expand Down Expand Up @@ -1005,13 +1041,10 @@ def create_table(
for name, value in (options or {}).items()
)

if obj is not None:
import pyarrow as pa
import pyarrow_hotfix # noqa: F401

if isinstance(obj, (pd.DataFrame, pa.Table)):
obj = ibis.memtable(obj, schema=schema)
if obj is not None and not isinstance(obj, ir.Table):
obj = ibis.memtable(obj, schema=schema)

if obj is not None:
self._register_in_memory_tables(obj)

if temp:
Expand All @@ -1025,14 +1058,21 @@ def create_table(
try:
table = sg.parse_one(name, into=sge.Table, read="bigquery")
except sg.ParseError:
table = sg.table(name, db=dataset, catalog=project_id)
table = sg.table(
name,
db=dataset,
catalog=project_id,
quoted=self.compiler.quoted,
)
else:
if table.args["db"] is None:
table.args["db"] = dataset

if table.args["catalog"] is None:
table.args["catalog"] = project_id

table = _force_quote_table(table)

column_defs = [
sge.ColumnDef(
this=sg.to_identifier(name, quoted=self.compiler.quoted),
Expand Down
41 changes: 4 additions & 37 deletions ibis/backends/bigquery/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
NATIVE_PARTITION_COL = "_PARTITIONTIME"


def schema_from_bigquery_table(table):
def schema_from_bigquery_table(table, *, wildcard: bool):
schema = BigQuerySchema.to_ibis(table.schema)

# Check for partitioning information
Expand All @@ -26,43 +26,10 @@ def schema_from_bigquery_table(table):
if partition_field not in schema:
schema |= {partition_field: dt.Timestamp(timezone="UTC")}

return schema


class BigQueryCursor:
"""BigQuery cursor.
if wildcard:
schema |= {"_TABLE_SUFFIX": dt.string}

This allows the BigQuery client to reuse machinery in
:file:`ibis/client.py`.
"""

def __init__(self, query):
"""Construct a BigQueryCursor with query `query`."""
self.query = query

def fetchall(self):
"""Fetch all rows."""
result = self.query.result()
return [row.values() for row in result]

@property
def columns(self):
"""Return the columns of the result set."""
result = self.query.result()
return [field.name for field in result.schema]

@property
def description(self):
"""Get the fields of the result set's schema."""
result = self.query.result()
return list(result.schema)

def __enter__(self):
"""No-op for compatibility."""
return self

def __exit__(self, *_):
"""No-op for compatibility."""
return schema
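A hedged sketch of the intended effect: only wildcard table names (ending in `*`) get the BigQuery pseudo-column `_TABLE_SUFFIX` appended to their schema, mirroring the new `test_table_suffix` below; the project and dataset are the public NOAA data used in that test:

```python
import ibis

con = ibis.bigquery.connect(project_id="ibis-gbq")  # billing project from the test
t = con.table("gsod*", database="bigquery-public-data.noaa_gsod")

# The pseudo-column is part of the wildcard table's schema...
assert "_TABLE_SUFFIX" in t.columns

# ...and can be used to prune which shard tables are scanned.
expr = t.filter(t._TABLE_SUFFIX == "1929").head(5)
```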


@functools.singledispatch
Expand Down
108 changes: 45 additions & 63 deletions ibis/backends/bigquery/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_rank,
exclude_unsupported_window_frame_from_row_number,
rewrite_sample_as_filter,
)
from ibis.common.temporal import DateUnit, IntervalUnit, TimestampUnit, TimeUnit
from ibis.expr.rewrites import rewrite_stringslice

_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+')

Expand All @@ -31,35 +29,25 @@ class BigQueryCompiler(SQLGlotCompiler):
type_mapper = BigQueryType
udf_type_mapper = BigQueryUDFType
rewrites = (
rewrite_sample_as_filter,
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_row_number,
exclude_unsupported_window_frame_from_rank,
rewrite_stringslice,
*SQLGlotCompiler.rewrites,
)

UNSUPPORTED_OPERATIONS = frozenset(
(
ops.CountDistinctStar,
ops.DateDiff,
ops.ExtractAuthority,
ops.ExtractFile,
ops.ExtractFragment,
ops.ExtractHost,
ops.ExtractPath,
ops.ExtractProtocol,
ops.ExtractQuery,
ops.ExtractUserInfo,
ops.FindInSet,
ops.Median,
ops.Quantile,
ops.MultiQuantile,
ops.RegexSplit,
ops.RowID,
ops.TimestampBucket,
ops.TimestampDiff,
)
UNSUPPORTED_OPS = (
ops.CountDistinctStar,
ops.DateDiff,
ops.ExtractAuthority,
ops.ExtractUserInfo,
ops.FindInSet,
ops.Median,
ops.Quantile,
ops.MultiQuantile,
ops.RegexSplit,
ops.RowID,
ops.TimestampBucket,
ops.TimestampDiff,
)

NAN = sge.Cast(
Expand Down Expand Up @@ -126,16 +114,9 @@ class BigQueryCompiler(SQLGlotCompiler):
ops.TimeFromHMS: "time",
ops.TimestampFromYMDHMS: "datetime",
ops.TimestampNow: "current_timestamp",
ops.ExtractHost: "net.host",
}

def _aggregate(self, funcname: str, *args, where):
func = self.f[funcname]

if where is not None:
args = tuple(self.if_(where, arg, NULL) for arg in args)

return func(*args, dialect=self.dialect)

@staticmethod
def _minimize_spec(start, end, spec):
if (
Expand Down Expand Up @@ -397,6 +378,9 @@ def visit_ExtractEpochSeconds(self, op, *, arg):
def visit_ExtractWeekOfYear(self, op, *, arg):
return self.f.extract(self.v.isoweek, arg)

def visit_ExtractIsoYear(self, op, *, arg):
return self.f.extract(self.v.isoyear, arg)

def visit_ExtractMillisecond(self, op, *, arg):
return self.f.extract(self.v.millisecond, arg)

Expand Down Expand Up @@ -638,33 +622,7 @@ def visit_Correlation(self, op, *, left, right, how, where):
return self.agg.corr(left, right, where=where)

def visit_TypeOf(self, op, *, arg):
name = sg.to_identifier(util.gen_name("bq_typeof"))
from_ = self._unnest(self.f.array(self.f.format("%T", arg)), as_=name)
ifs = [
self.if_(
self.f.regexp_contains(name, '^[A-Z]+ "'),
self.f.regexp_extract(name, '^([A-Z]+) "'),
),
self.if_(self.f.regexp_contains(name, "^-?[0-9]*$"), "INT64"),
self.if_(
self.f.regexp_contains(
name, r'^(-?[0-9]+[.e].*|CAST\("([^"]*)" AS FLOAT64\))$'
),
"FLOAT64",
),
self.if_(name.isin(sge.convert("true"), sge.convert("false")), "BOOL"),
self.if_(
sg.or_(self.f.starts_with(name, '"'), self.f.starts_with(name, "'")),
"STRING",
),
self.if_(self.f.starts_with(name, 'b"'), "BYTES"),
self.if_(self.f.starts_with(name, "["), "ARRAY"),
self.if_(self.f.regexp_contains(name, r"^(STRUCT)?\("), "STRUCT"),
self.if_(self.f.starts_with(name, "ST_"), "GEOGRAPHY"),
self.if_(name.eq(sge.convert("NULL")), "NULL"),
]
case = sge.Case(ifs=ifs, default=sge.convert("UNKNOWN"))
return sg.select(case).from_(from_).subquery()
return self._pudf("typeof", arg)

def visit_Xor(self, op, *, left, right):
return sg.or_(sg.and_(left, sg.not_(right)), sg.and_(sg.not_(left), right))
Expand All @@ -676,18 +634,18 @@ def visit_HashBytes(self, op, *, arg, how):

@staticmethod
def _gen_valid_name(name: str) -> str:
return "_".join(_NAME_REGEX.findall(name)) or "tmp"
return "_".join(map(str.strip, _NAME_REGEX.findall(name))) or "tmp"

def visit_CountStar(self, op, *, arg, where):
if where is not None:
return self.f.countif(where)
return self.f.count(STAR)

def visit_Degrees(self, op, *, arg):
return sge.paren(180 * arg / self.f.acos(-1), copy=False)
return self._pudf("degrees", arg)

def visit_Radians(self, op, *, arg):
return sge.paren(self.f.acos(-1) * arg / 180, copy=False)
return self._pudf("radians", arg)

def visit_CountDistinct(self, op, *, arg, where):
if where is not None:
Expand All @@ -696,3 +654,27 @@ def visit_CountDistinct(self, op, *, arg, where):

def visit_RandomUUID(self, op, **kwargs):
return self.f.generate_uuid()

def visit_ExtractFile(self, op, *, arg):
return self._pudf("cw_url_extract_file", arg)

def visit_ExtractFragment(self, op, *, arg):
return self._pudf("cw_url_extract_fragment", arg)

def visit_ExtractPath(self, op, *, arg):
return self._pudf("cw_url_extract_path", arg)

def visit_ExtractProtocol(self, op, *, arg):
return self._pudf("cw_url_extract_protocol", arg)

def visit_ExtractQuery(self, op, *, arg, key):
if key is not None:
return self._pudf("cw_url_extract_parameter", arg, key)
else:
return self._pudf("cw_url_extract_query", arg)

def _pudf(self, name, *args):
name = sg.table(name, db="persistent_udfs", catalog="bigquery-public-data").sql(
self.dialect
)
return self.f[name](*args)
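These visitors delegate to the shared UDFs in the public `bigquery-public-data.persistent_udfs` dataset; a hedged, compile-only sketch, assuming the BigQuery backend extra is installed (the exact quoting in the emitted SQL may differ):

```python
import ibis

t = ibis.table({"url": "string"}, name="t")
expr = t.url.protocol()  # routed through visit_ExtractProtocol

print(ibis.to_sql(expr, dialect="bigquery"))
# expected to call something like:
#   `bigquery-public-data`.`persistent_udfs`.cw_url_extract_protocol(...)
```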
13 changes: 12 additions & 1 deletion ibis/backends/bigquery/tests/system/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ def test_repr_struct_of_array_of_struct():


def test_raw_sql(con):
assert con.raw_sql("SELECT 1").fetchall() == [(1,)]
result = con.raw_sql("SELECT 1").result()
assert [row.values() for row in result] == [(1,)]


def test_parted_column_rename(parted_alltypes):
Expand Down Expand Up @@ -386,6 +387,8 @@ def test_fully_qualified_memtable_compile(project_id, dataset_id):
assert new_bq_con._session_dataset is not None
assert project_id in sql

assert f"`{project_id}`.`{new_bq_con._session_dataset.dataset_id}`.`" in sql


def test_create_table_with_options(con):
name = gen_name("bigquery_temp_table")
Expand Down Expand Up @@ -431,3 +434,11 @@ def test_create_temp_table_from_scratch(project_id, dataset_id):
df = con.tables.functional_alltypes.limit(1)
t = con.create_table(name, obj=df, temp=True)
assert len(t.execute()) == 1


def test_table_suffix():
con = ibis.connect("bigquery://ibis-gbq")
t = con.table("gsod*", database="bigquery-public-data.noaa_gsod")
expr = t.filter(t._TABLE_SUFFIX == "1929", t.max != 9999.9).head(1)
result = expr.execute()
assert not result.empty
14 changes: 14 additions & 0 deletions ibis/backends/bigquery/tests/system/test_connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,17 @@ def test_client_with_regional_endpoints(project_id, credentials, dataset_id):
df = alltypes.execute()
assert df.empty
assert not len(alltypes.to_pyarrow())


def test_create_table_from_memtable_needs_quotes(project_id, credentials):
con = ibis.bigquery.connect(
project_id=project_id,
dataset_id=f"{project_id}.testing",
credentials=credentials,
)

con.create_table(
"region-table",
schema=ibis.schema(dict(its_always="str", quoting="int")),
)
con.drop_table("region-table")
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
approx_quantiles(IF(`t0`.`month` > 0, `t0`.`double_col`, NULL), IF(`t0`.`month` > 0, 2, NULL))[offset(1)] AS `ApproxMedian_double_col_ Greater_month_ 0`
approx_quantiles(IF(`t0`.`month` > 0, `t0`.`double_col`, NULL), IF(`t0`.`month` > 0, 2, NULL))[offset(1)] AS `ApproxMedian_double_col_Greater_month_0`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
APPROX_COUNT_DISTINCT(IF(`t0`.`month` > 0, `t0`.`double_col`, NULL)) AS `ApproxCountDistinct_double_col_ Greater_month_ 0`
APPROX_COUNT_DISTINCT(IF(`t0`.`month` > 0, `t0`.`double_col`, NULL)) AS `ApproxCountDistinct_double_col_Greater_month_0`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
CAST(`t0`.`value` AS BYTES) AS `Cast_value_ binary`
CAST(`t0`.`value` AS BYTES) AS `Cast_value_binary`
FROM `t` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
bit_and(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitAnd_int_col_ Greater_bigint_col_ 0`
bit_and(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitAnd_int_col_Greater_bigint_col_0`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
bit_or(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitOr_int_col_ Greater_bigint_col_ 0`
bit_or(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitOr_int_col_Greater_bigint_col_0`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
bit_xor(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitXor_int_col_ Greater_bigint_col_ 0`
bit_xor(IF(`t0`.`bigint_col` > 0, `t0`.`int_col`, NULL)) AS `BitXor_int_col_Greater_bigint_col_0`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ SELECT
CAST(`t0`.`bool_col` AS INT64),
NULL
)
) AS `Sum_bool_col_ And_Greater_month_ 6_ Less_month_ 10`
) AS `Sum_bool_col_And_Greater_month_6_Less_month_10`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
AVG(IF(`t0`.`month` > 6, CAST(`t0`.`bool_col` AS INT64), NULL)) AS `Mean_bool_col_ Greater_month_ 6`
AVG(IF(`t0`.`month` > 6, CAST(`t0`.`bool_col` AS INT64), NULL)) AS `Mean_bool_col_Greater_month_6`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
CAST(trunc(`t0`.`double_col`) AS INT64) AS `Cast_double_col_ int64`
CAST(trunc(`t0`.`double_col`) AS INT64) AS `Cast_double_col_int64`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
COVAR_POP(`t0`.`double_col`, `t0`.`double_col`) AS `Covariance_double_col_ double_col`
COVAR_POP(`t0`.`double_col`, `t0`.`double_col`) AS `Covariance_double_col_double_col`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
COVAR_SAMP(`t0`.`double_col`, `t0`.`double_col`) AS `Covariance_double_col_ double_col`
COVAR_SAMP(`t0`.`double_col`, `t0`.`double_col`) AS `Covariance_double_col_double_col`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
SELECT
MOD(EXTRACT(dayofweek FROM DATE(2017, 1, 1)) + 5, 7) AS `DayOfWeekIndex_datetime_date_2017_ 1_ 1`
MOD(EXTRACT(dayofweek FROM DATE(2017, 1, 1)) + 5, 7) AS `DayOfWeekIndex_datetime_date_2017_1_1`
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
SELECT
INITCAP(CAST(DATE(2017, 1, 1) AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date_2017_ 1_ 1`
INITCAP(CAST(DATE(2017, 1, 1) AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_date_2017_1_1`
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
SELECT
MOD(EXTRACT(dayofweek FROM datetime('2017-01-01T04:55:59')) + 5, 7) AS `DayOfWeekIndex_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`
MOD(EXTRACT(dayofweek FROM datetime('2017-01-01T04:55:59')) + 5, 7) AS `DayOfWeekIndex_datetime_datetime_2017_1_1_4_55_59`
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
SELECT
INITCAP(CAST(datetime('2017-01-01T04:55:59') AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime_2017_ 1_ 1_ 4_ 55_ 59`
INITCAP(CAST(datetime('2017-01-01T04:55:59') AS STRING FORMAT 'DAY')) AS `DayOfWeekName_datetime_datetime_2017_1_1_4_55_59`