230 changes: 230 additions & 0 deletions .github/workflows/ibis-backends-flink.yml
@@ -0,0 +1,230 @@
name: Flink Backend

on:
push:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
branches:
- master
- "*.x.x"
pull_request:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
branches:
- master
- "*.x.x"
merge_group:

permissions:
# this allows extractions/setup-just to list releases for `just` at a higher
# rate limit while restricting GITHUB_TOKEN permissions elsewhere
contents: read

concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true

env:
FORCE_COLOR: "1"

jobs:
test_backends:
name: ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.9"
backend:
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
- grpcio-status # FIXME(deepyaman)
- pytest-split
group:
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
]
steps:
- name: checkout
uses: actions/checkout@v4

- uses: extractions/setup-just@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: download backend data
run: just download-data

- name: install python
uses: actions/setup-python@v4
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'

- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ matrix.backend.name }}-${{ steps.install_python.outputs.python-version }}

- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"

- name: install other deps
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}

# FIXME(deepyaman): We reinstall pandas~=1.5 to get a version with
# ArrowDtype, but this step will be removed once PyFlink relaxes
# its overly-restrictive dependencies (specifying pandas<1.4.0).
- name: override overly-restrictive PyFlink requirements
run: poetry run pip install pandas~=1.5

- name: show installed deps
run: poetry run pip list

- name: "run serial tests: ${{ matrix.backend.name }} (common)"
if: matrix.group > 0
run: just ci-check -m ${{ matrix.backend.name }} ibis/backends/tests --splits 20 --group ${{ matrix.group }} --splitting-algorithm least_duration
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
JVM_ARGS: -XX:CompressedClassSpaceSize=3G

# FIXME(deepyaman): If a backend-specific test (e.g. in test_ddl.py)
# executes before the common tests, the common tests fail with:
# org.apache.flink.table.api.ValidationException: Table `default_catalog`.`default_database`.`functional_alltypes` was not found.
# Therefore, we quarantine backend-specific tests to avoid this.
- name: "run serial tests: ${{ matrix.backend.name }} (backend-specific)"
if: matrix.group == 0
run: just ci-check -m ${{ matrix.backend.name }} ibis/backends/flink/tests
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
JVM_ARGS: -XX:CompressedClassSpaceSize=3G

- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .

- name: upload code coverage
uses: actions/upload-artifact@v3
with:
name: coverage${{ matrix.group }}
path: .coverage

coverage:
needs: test_backends
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.9"
backend:
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
- grpcio-status # FIXME(deepyaman)
- pytest-split
steps:
- name: checkout
uses: actions/checkout@v4

- name: install python
uses: actions/setup-python@v4
id: install_python
with:
python-version: ${{ matrix.python-version }}

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'

- uses: syphar/restore-virtualenv@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ matrix.backend.name }}-${{ steps.install_python.outputs.python-version }}

- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"

- name: install other deps
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}

# FIXME(deepyaman): We reinstall pandas~=1.5 to get a version with
# ArrowDtype, but this step will be removed once PyFlink relaxes
# its overly-restrictive dependencies (specifying pandas<1.4.0).
- name: override overly-restrictive PyFlink requirements
run: poetry run pip install pandas~=1.5

- name: show installed deps
run: poetry run pip list

- name: download all artifacts
# Downloads coverage1, coverage2, etc.
uses: actions/download-artifact@v3

- name: run coverage
run: |
coverage combine coverage*/.coverage*
coverage report
coverage xml
- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
138 changes: 18 additions & 120 deletions .github/workflows/ibis-backends.yml
@@ -38,52 +38,9 @@ env:
FORCE_COLOR: "1"

jobs:
gen_lockfile_backends:
name: Generate Poetry Lockfile for non-Snowflake Backends
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version:
- "3.9"
- "3.11"
steps:
- name: checkout
uses: actions/checkout@v4

- name: install python
id: install_python
uses: actions/setup-python@v4
with:
python-version: "${{ matrix.python-version }}"

- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'

- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
custom_cache_key_element: ${{ steps.install_python.outputs.python-version }}

- name: remove snowflake deps that are not compatible with everything else
run: poetry remove snowflake-sqlalchemy snowflake-connector-python

- name: update deps originally constrained by snowflake
run: poetry update numpy pandas pyarrow datafusion

- name: upload deps file
uses: actions/upload-artifact@v3
with:
name: backend-deps-${{ matrix.python-version }}
path: |
pyproject.toml
poetry.lock
test_backends:
name: ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
needs:
- gen_lockfile_backends
env:
SQLALCHEMY_WARN_20: "1"
strategy:
@@ -201,15 +158,6 @@ jobs:
- oracle
services:
- oracle
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
even_more_deps:
- pandas~=1.5
exclude:
- os: windows-latest
backend:
@@ -306,17 +254,6 @@ jobs:
- oracle
services:
- oracle
- python-version: "3.11"
backend:
name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- apache-flink
even_more_deps:
- pandas~=1.5
steps:
- name: update and install system dependencies
if: matrix.os == 'ubuntu-latest' && matrix.backend.sys-deps != null
@@ -350,20 +287,6 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: download poetry lockfile
uses: actions/download-artifact@v3
with:
name: backend-deps-${{ matrix.python-version }}
path: deps

- name: pull out lockfile
shell: bash
run: |
set -euo pipefail
mv -f deps/* .
rm -r deps
- uses: syphar/restore-pip-download-cache@v1
with:
requirement_files: poetry.lock
@@ -384,11 +307,6 @@ jobs:
if: matrix.backend.additional_deps != null
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}

# FIXME(deepyaman)
- name: install even more deps
if: matrix.backend.even_more_deps != null
run: poetry run pip install ${{ join(matrix.backend.even_more_deps, ' ') }}

- name: show installed deps
run: poetry run pip list

@@ -408,13 +326,7 @@ jobs:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}

- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name == 'flink'
run: just ci-check -m ${{ matrix.backend.name }} ibis/backends/flink/tests
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}

- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial && matrix.backend.name != 'impala' && matrix.backend.name != 'flink'
if: matrix.backend.serial && matrix.backend.name != 'impala'
run: just ci-check -m ${{ matrix.backend.name }}
env:
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
@@ -502,7 +414,9 @@ jobs:

- name: install libgeos for shapely
if: matrix.backend.name == 'postgres'
run: sudo apt-get install -qq -y build-essential libgeos-dev
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev
- uses: extractions/setup-just@v1
env:
@@ -557,7 +471,7 @@ jobs:
run: docker compose logs

test_pyspark:
name: PySpark ${{ matrix.os }} python-${{ matrix.python-version }} pandas-${{ matrix.pandas.version }}
name: PySpark ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
@@ -566,24 +480,6 @@
- ubuntu-latest
python-version:
- "3.10"
pandas:
- version: "1.5.*"
- version: "2.*.*"
conflicts:
- snowflake-sqlalchemy
- snowflake-connector-python
include:
- os: ubuntu-latest
python-version: "3.9"
pandas:
version: "1.5.*"
- os: ubuntu-latest
python-version: "3.11"
pandas:
version: "2.*.*"
conflicts:
- snowflake-sqlalchemy
- snowflake-connector-python
steps:
- name: checkout
uses: actions/checkout@v4
@@ -609,12 +505,8 @@ jobs:
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.6.1'

- name: remove conflicting deps
if: matrix.pandas.conflicts != null
run: poetry remove ${{ join(matrix.pandas.conflicts, ' ') }}

- name: install minimum versions
run: poetry add --lock 'pandas@${{ matrix.pandas.version }}' 'numpy@1.23.*'
- name: install maximum versions of pandas and numpy
run: poetry add --lock 'pandas@<2' 'numpy<1.24'

- name: checkout the lock file
run: git checkout poetry.lock
@@ -636,10 +528,11 @@ jobs:
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .

- name: upload code coverage
if: success()
# only upload coverage for jobs that aren't mostly xfails
if: success() && matrix.python-version != '3.11'
uses: codecov/codecov-action@v3
with:
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }},pandas-${{ matrix.pandas.version }}
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}

gen_lockfile_sqlalchemy2:
name: Generate Poetry Lockfile for SQLAlchemy 2
@@ -684,7 +577,8 @@ jobs:
test_backends_sqlalchemy2:
name: SQLAlchemy 2 ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
needs: gen_lockfile_sqlalchemy2
needs:
- gen_lockfile_sqlalchemy2
strategy:
fail-fast: false
matrix:
@@ -741,11 +635,15 @@ jobs:

- name: install libgeos for shapely
if: ${{ matrix.backend.name == 'postgres' }}
run: sudo apt-get install -qq -y build-essential libgeos-dev
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev
- name: install freetds-dev for mssql
if: ${{ matrix.backend.name == 'mssql' }}
run: sudo apt-get install -qq -y build-essential libkrb5-dev krb5-config freetds-dev
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libkrb5-dev krb5-config freetds-dev
- uses: extractions/setup-just@v1
env:
65 changes: 39 additions & 26 deletions .github/workflows/ibis-docs-lint.yml
@@ -16,8 +16,8 @@ concurrency:
cancel-in-progress: true

permissions:
# increase the rate limit for nix operations hitting github, but limit the
# permissions to reading things
# increase the rate limit for github operations, but limit token permissions
# to read-only
contents: read

jobs:
@@ -101,7 +101,9 @@ jobs:
python-version: "3.11"

- name: install system dependencies
run: sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev libkrb5-dev krb5-config
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev libkrb5-dev krb5-config
- uses: syphar/restore-virtualenv@v1
with:
@@ -124,21 +126,6 @@ jobs:
- name: benchmark
run: poetry run pytest --benchmark-enable --benchmark-json .benchmarks/output.json ibis/tests/benchmarks

- uses: tibdex/github-app-token@v2
id: generate-token
with:
app_id: ${{ secrets.SQUAWK_BOT_APP_ID }}
private_key: ${{ secrets.SQUAWK_BOT_APP_PRIVATE_KEY }}

- uses: benchmark-action/github-action-benchmark@v1
with:
tool: pytest
github-token: ${{ steps.generate-token.outputs.token }}
output-file-path: .benchmarks/output.json
benchmark-data-dir-path: ./bench
auto-push: false
comment-on-alert: false

- uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GCP_CREDENTIALS }}
@@ -148,13 +135,35 @@ jobs:
- name: show gcloud info
run: gcloud info

- name: copy benchmark data to gcs
- name: download the latest duckdb release
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
set -euo pipefail
gh release download -R duckdb/duckdb --pattern 'duckdb_cli-linux-amd64.zip'
unzip duckdb_cli-linux-amd64.zip
- name: convert json data to parquet
run: |
set -euo pipefail
# sort json keys
jq --sort-keys -rcM < "$PWD/.benchmarks/output.json" > output.json
# connect to a file to allow spilling to disk
./duckdb json2parquet.ddb <<EOF
COPY (
SELECT * FROM read_ndjson_auto('output.json', maximum_object_size=2**27)
) TO 'output.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
EOF
- name: copy data to gcs
run: |
# remove whitespace and compress
jq -rcM < ./.benchmarks/output.json | gzip -c > output.json.gz
set -euo pipefail
timestamp="$(date --iso-8601=ns --utc | tr ',' '.')"
gsutil cp output.json.gz "gs://ibis-benchmark-data/ci/${timestamp}.json.gz"
gsutil cp output.parquet "gs://ibis-benchmark-data/ci/${timestamp}.parquet"
docs_pr:
runs-on: ubuntu-latest
@@ -178,13 +187,15 @@ jobs:
uses: actions/checkout@v4

- name: run doctest
run: nix develop --ignore-environment -c just doctest
# keep HOME because duckdb (which we use for doctests) wants to use
# that for extensions
run: nix develop --ignore-environment --keep HOME -c just doctest

- name: generate api docs
run: nix develop --ignore-environment -c just docs-apigen --verbose

- name: build docs
run: nix develop --ignore-environment -c just docs-render
run: nix develop --ignore-environment --keep HOME -c just docs-render

- name: check that all frozen computations were done before push
run: git diff --exit-code --stat
@@ -214,13 +225,15 @@ jobs:
uses: actions/checkout@v4

- name: run doctests
run: nix develop --ignore-environment -c just doctest
# keep HOME because duckdb (which we use for doctests) wants to use
# that for extensions
run: nix develop --ignore-environment --keep HOME -c just doctest

- name: build api docs
run: nix develop --ignore-environment -c just docs-apigen --verbose

- name: build docs
run: nix develop --ignore-environment -c just docs-render
run: nix develop --ignore-environment --keep HOME -c just docs-render

- name: check that all frozen computations were done before push
run: git diff --exit-code --stat
8 changes: 4 additions & 4 deletions .github/workflows/ibis-main.yml
@@ -76,15 +76,15 @@ jobs:
run: |
set -euo pipefail
sudo apt-get update -y -q
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential graphviz libgeos-dev freetds-dev
- name: install ${{ matrix.os }} system dependencies
if: matrix.os == 'windows-latest'
run: choco install graphviz

- name: install ibis
run: poetry install --without dev --without docs --extras visualization
run: poetry install --without dev --without docs --extras "visualization decompiler"

- uses: extractions/setup-just@v1
env:
@@ -138,7 +138,7 @@ jobs:
run: |
set -euo pipefail
sudo apt-get update -y -q
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential libgeos-dev
- run: python -m pip install --upgrade pip 'poetry==1.6.1'
@@ -165,7 +165,7 @@ jobs:
run: |
set -euo pipefail
sudo apt-get update -y -q
sudo apt-get update -y -qq
sudo apt-get install -y -q build-essential graphviz libgeos-dev libkrb5-dev freetds-dev
- name: checkout
8 changes: 1 addition & 7 deletions .github/workflows/nix.yml
@@ -37,9 +37,6 @@ jobs:
- "3.9"
- "3.10"
- "3.11"
include:
- os: macos-latest
python-version: "3.10"
steps:
- name: checkout
uses: actions/checkout@v4
@@ -51,9 +48,6 @@ jobs:
extra_nix_config: |
access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
- name: check poetry version
run: nix run '.#check-poetry-version' -- "1.6.1"

- name: setup cachix
uses: cachix/cachix-action@v12
with:
@@ -68,7 +62,7 @@ jobs:
nix build ".#ibis${version//./}" --fallback --keep-going --print-build-logs
- name: nix build devShell
if: github.event_name == 'push' && matrix.os != 'macos-latest'
if: github.event_name == 'push'
run: |
set -euo pipefail
28 changes: 19 additions & 9 deletions .pre-commit-config.yaml
@@ -9,6 +9,7 @@ ci:
- nixpkgs-fmt
- prettier
- ruff
- ruff-format
- shellcheck
- shfmt
- statix
@@ -20,17 +21,13 @@ repos:
rev: v1.6.26
hooks:
- id: actionlint-system
- repo: https://github.com/psf/black
rev: 23.9.1
hooks:
- id: black
- repo: https://github.com/keewis/blackdoc
rev: v0.3.8
rev: v0.3.9
hooks:
- id: blackdoc
exclude: ibis/examples/__init__\.py
- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
rev: v2.2.6
hooks:
- id: codespell
additional_dependencies:
@@ -48,15 +45,28 @@ repos:
args: ["check", "--force-exclude", "--show-source", "--fix"]
require_serial: true
minimum_pre_commit_version: "2.9.2"
- repo: local
hooks:
- id: ruff-format
name: ruff-format
description: "Run 'ruff' for extremely fast Python linting"
entry: ruff
language: system
types_or:
- python
- pyi
args: ["format", "--force-exclude"]
require_serial: true
minimum_pre_commit_version: "2.9.2"
- repo: https://github.com/adrienverge/yamllint
rev: v1.32.0
rev: v1.33.0
hooks:
- id: yamllint
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-added-large-files
args: ["--maxkb=700"]
args: ["--maxkb=710"]
- id: check-case-conflict
- id: check-executables-have-shebangs
- id: check-merge-conflict
1 change: 1 addition & 0 deletions .yamllint.yaml
@@ -3,6 +3,7 @@ extends: default

ignore:
- docs/_publish.yml
- ci/conda-lock/python-*.yml

rules:
document-start: disable
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -187,7 +187,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2015 Ibis developers

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
3 changes: 3 additions & 0 deletions ci/conda-lock/condarc
@@ -3,6 +3,9 @@ channels:
- conda-forge
channel_priority: strict

# use mamba for SAT solving
solver: libmamba

always_yes: true

# remote_connect_timeout_secs (float)
49 changes: 27 additions & 22 deletions ci/conda-lock/generate.sh
@@ -15,10 +15,10 @@ extras=(
-e bigquery
-e clickhouse
-e dask
-e datafusion
-e druid
-e duckdb
# this doesn't work on any platform yet (issues with resolving some google deps)
# -e geospatial
-e geospatial
-e impala
-e mssql
-e mysql
@@ -32,24 +32,29 @@ extras=(
-e trino
-e visualization
-e decompiler
-e deltalake
)
template="conda-lock/{platform}-${python_version}.lock"

function conda_lock() {
local platforms
platforms=(--platform "$1" --platform "$2")
shift 2
conda lock \
--file pyproject.toml \
--file "${python_version_file}" \
--kind explicit \
"${platforms[@]}" \
--filename-template "${template}" \
--filter-extras \
--conda="$(which conda)" \
--category dev --category test --category docs \
"${@}"
}

conda_lock linux-64 osx-64 "${extras[@]}" -e datafusion
conda_lock osx-arm64 win-64 "${extras[@]}"

# directory of this script
top="$(dirname "$(readlink -f -- "$0")")"

python_version="${1}"
shift 1

template="${top}/{platform}/${python_version}.lock"

conda lock \
--file pyproject.toml \
--file "${python_version_file}" \
--kind explicit \
--filename-template "${template}" \
--channel conda-forge \
--platform linux-64 \
--platform osx-64 \
--platform osx-arm64 \
--platform win-64 \
--filter-extras \
--mamba \
--category dev --category test --category docs \
"${extras[@]}" \
"${@}"
436 changes: 436 additions & 0 deletions ci/conda-lock/linux-64/3.10.lock

Large diffs are not rendered by default.

471 changes: 229 additions & 242 deletions conda-lock/linux-64-3.11.lock → ci/conda-lock/linux-64/3.11.lock

Large diffs are not rendered by default.

417 changes: 417 additions & 0 deletions ci/conda-lock/osx-64/3.10.lock

Large diffs are not rendered by default.

460 changes: 224 additions & 236 deletions conda-lock/osx-64-3.11.lock → ci/conda-lock/osx-64/3.11.lock

Large diffs are not rendered by default.

417 changes: 417 additions & 0 deletions ci/conda-lock/osx-arm64/3.10.lock

Large diffs are not rendered by default.

459 changes: 224 additions & 235 deletions conda-lock/osx-arm64-3.11.lock → ci/conda-lock/osx-arm64/3.11.lock

Large diffs are not rendered by default.

418 changes: 418 additions & 0 deletions ci/conda-lock/win-64/3.10.lock

Large diffs are not rendered by default.

469 changes: 229 additions & 240 deletions conda-lock/win-64-3.11.lock → ci/conda-lock/win-64/3.11.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ci/release/verify_conditions.sh
@@ -11,7 +11,7 @@ nix develop '.#release' -c poetry check
#
# the lock file might not be the most fresh, but that's okay: it need only be
# consistent with pyproject.toml
nix develop '.#release' -c poetry lock --check
nix develop '.#release' -c poetry check --lock

# verify that we have a token available to push to pypi using set -u
if [ "${dry_run}" = "false" ]; then
8 changes: 4 additions & 4 deletions ci/schema/duckdb.sql
@@ -29,7 +29,7 @@ INSERT INTO struct VALUES
(NULL),
({'a': 3.0, 'b': 'orange', 'c': NULL});

CREATE OR REPLACE TABLE json_t (js JSON);
CREATE OR REPLACE TABLE json_t (js TEXT);

INSERT INTO json_t VALUES
('{"a": [1,2,3,4], "b": 1}'),
Expand All @@ -47,7 +47,7 @@ INSERT INTO win VALUES
('a', 3, 1),
('a', 4, 1);

CREATE OR REPLACE TABLE map (kv MAP(STRING, BIGINT));
CREATE OR REPLACE TABLE map (idx BIGINT, kv MAP(STRING, BIGINT));
INSERT INTO map VALUES
(MAP(['a', 'b', 'c'], [1, 2, 3])),
(MAP(['d', 'e', 'f'], [4, 5, 6]));
(1, MAP(['a', 'b', 'c'], [1, 2, 3])),
(2, MAP(['d', 'e', 'f'], [4, 5, 6]));
6 changes: 3 additions & 3 deletions ci/schema/postgres.sql
@@ -252,7 +252,7 @@ INSERT INTO win VALUES
('a', 4, 1);

DROP TABLE IF EXISTS map CASCADE;
CREATE TABLE map (kv HSTORE);
CREATE TABLE map (idx BIGINT, kv HSTORE);
INSERT INTO map VALUES
('a=>1,b=>2,c=>3'),
('d=>4,e=>5,c=>6');
(1, 'a=>1,b=>2,c=>3'),
(2, 'd=>4,e=>5,c=>6');
8 changes: 4 additions & 4 deletions ci/schema/snowflake.sql
@@ -105,11 +105,11 @@ INSERT INTO array_types ("x", "y", "z", "grouper", "scalar_column", "multi_dim")
SELECT [2, NULL, 3], ['b', NULL, 'c'], NULL, 'b', 5.0, NULL UNION
SELECT [4, NULL, NULL, 5], ['d', NULL, NULL, 'e'], [4.0, NULL, NULL, 5.0], 'c', 6.0, [[1, 2, 3]];

CREATE OR REPLACE TABLE map ("kv" OBJECT);
CREATE OR REPLACE TABLE map ("idx" BIGINT, "kv" OBJECT);

INSERT INTO map ("kv")
SELECT object_construct('a', 1, 'b', 2, 'c', 3) UNION
SELECT object_construct('d', 4, 'e', 5, 'f', 6);
INSERT INTO map ("idx", "kv")
SELECT 1, object_construct('a', 1, 'b', 2, 'c', 3) UNION
SELECT 2, object_construct('d', 4, 'e', 5, 'f', 6);


CREATE OR REPLACE TABLE struct ("abc" OBJECT);
188 changes: 178 additions & 10 deletions ci/schema/trino.sql
@@ -1,12 +1,180 @@
DROP TABLE IF EXISTS hive.default.diamonds;
CREATE TABLE hive.default.diamonds (
"carat" DOUBLE,
"cut" VARCHAR,
"color" VARCHAR,
"clarity" VARCHAR,
"depth" DOUBLE,
"table" DOUBLE,
"price" BIGINT,
"x" DOUBLE,
"y" DOUBLE,
"z" DOUBLE
) WITH (
external_location = 's3a://warehouse/diamonds',
format = 'PARQUET'
);

CREATE OR REPLACE VIEW memory.default.diamonds AS
SELECT * FROM hive.default.diamonds;

DROP TABLE IF EXISTS hive.default.astronauts;
CREATE TABLE hive.default.astronauts (
"id" BIGINT,
"number" BIGINT,
"nationwide_number" BIGINT,
"name" VARCHAR,
"original_name" VARCHAR,
"sex" VARCHAR,
"year_of_birth" BIGINT,
"nationality" VARCHAR,
"military_civilian" VARCHAR,
"selection" VARCHAR,
"year_of_selection" BIGINT,
"mission_number" BIGINT,
"total_number_of_missions" BIGINT,
"occupation" VARCHAR,
"year_of_mission" BIGINT,
"mission_title" VARCHAR,
"ascend_shuttle" VARCHAR,
"in_orbit" VARCHAR,
"descend_shuttle" VARCHAR,
"hours_mission" REAL,
"total_hrs_sum" REAL,
"field21" BIGINT,
"eva_hrs_mission" REAL,
"total_eva_hrs" REAL
) WITH (
external_location = 's3a://warehouse/astronauts',
format = 'PARQUET'
);

CREATE OR REPLACE VIEW memory.default.astronauts AS
SELECT * FROM hive.default.astronauts;

DROP TABLE IF EXISTS hive.default.batting;
CREATE TABLE hive.default.batting (
"playerID" VARCHAR,
"yearID" BIGINT,
"stint" BIGINT,
"teamID" VARCHAR,
"lgID" VARCHAR,
"G" BIGINT,
"AB" BIGINT,
"R" BIGINT,
"H" BIGINT,
"X2B" BIGINT,
"X3B" BIGINT,
"HR" BIGINT,
"RBI" BIGINT,
"SB" BIGINT,
"CS" BIGINT,
"BB" BIGINT,
"SO" BIGINT,
"IBB" BIGINT,
"HBP" BIGINT,
"SH" BIGINT,
"SF" BIGINT,
"GIDP" BIGINT
) WITH (
external_location = 's3a://warehouse/batting',
format = 'PARQUET'
);

CREATE OR REPLACE VIEW memory.default.batting AS
SELECT * FROM hive.default.batting;

DROP TABLE IF EXISTS hive.default.awards_players;
CREATE TABLE hive.default.awards_players (
"playerID" VARCHAR,
"awardID" VARCHAR,
"yearID" BIGINT,
"lgID" VARCHAR,
"tie" VARCHAR,
"notes" VARCHAR
) WITH (
external_location = 's3a://warehouse/awards_players',
format = 'PARQUET'
);

CREATE OR REPLACE VIEW memory.default.awards_players AS
SELECT * FROM hive.default.awards_players;

DROP TABLE IF EXISTS hive.default.functional_alltypes;
CREATE TABLE hive.default.functional_alltypes (
"id" INTEGER,
"bool_col" BOOLEAN,
"tinyint_col" TINYINT,
"smallint_col" SMALLINT,
"int_col" INTEGER,
"bigint_col" BIGINT,
"float_col" REAL,
"double_col" DOUBLE,
"date_string_col" VARCHAR,
"string_col" VARCHAR,
"timestamp_col" TIMESTAMP(6),
"year" INTEGER,
"month" INTEGER
) WITH (
external_location = 's3a://warehouse/functional_alltypes',
format = 'PARQUET'
);
CREATE OR REPLACE VIEW memory.default.functional_alltypes AS
SELECT * FROM hive.default.functional_alltypes;

DROP TABLE IF EXISTS array_types;

CREATE TABLE IF NOT EXISTS array_types (
x ARRAY<BIGINT>,
y ARRAY<VARCHAR>,
z ARRAY<DOUBLE>,
grouper VARCHAR,
scalar_column DOUBLE,
multi_dim ARRAY<ARRAY<BIGINT>>
);

INSERT INTO array_types VALUES
(ARRAY[1, 2, 3], ARRAY['a', 'b', 'c'], ARRAY[1.0, 2.0, 3.0], 'a', 1.0, ARRAY[ARRAY[NULL, NULL, NULL], ARRAY[1, 2, 3]]),
(ARRAY[4, 5], ARRAY['d', 'e'], ARRAY[4.0, 5.0], 'a', 2.0, ARRAY[]),
(ARRAY[6, NULL], ARRAY['f', NULL], ARRAY[6.0, NULL], 'a', 3.0, ARRAY[NULL, ARRAY[], NULL]),
(ARRAY[NULL, 1, NULL], ARRAY[NULL, 'a', NULL], ARRAY[], 'b', 4.0, ARRAY[ARRAY[1], ARRAY[2], ARRAY[NULL], ARRAY[3]]),
(ARRAY[2, NULL, 3], ARRAY['b', NULL, 'c'], NULL, 'b', 5.0, NULL),
(ARRAY[4, NULL, NULL, 5], ARRAY['d', NULL, NULL, 'e'], ARRAY[4.0, NULL, NULL, 5.0], 'c', 6.0, ARRAY[ARRAY[1, 2, 3]]);

DROP TABLE IF EXISTS map;
CREATE TABLE map (kv MAP<VARCHAR, BIGINT>);
CREATE TABLE map (idx BIGINT, kv MAP<VARCHAR, BIGINT>);
INSERT INTO map VALUES
(MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3])),
(MAP(ARRAY['d', 'e', 'f'], ARRAY[4, 5, 6]));

DROP TABLE IF EXISTS ts;
CREATE TABLE ts (x TIMESTAMP(3), y TIMESTAMP(6), z TIMESTAMP(9));
INSERT INTO ts VALUES
(TIMESTAMP '2023-01-07 13:20:05.561',
TIMESTAMP '2023-01-07 13:20:05.561021',
TIMESTAMP '2023-01-07 13:20:05.561000231');
(1, MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3])),
(2, MAP(ARRAY['d', 'e', 'f'], ARRAY[4, 5, 6]));

DROP TABLE IF EXISTS struct;
CREATE TABLE struct (abc ROW(a DOUBLE, b VARCHAR, c BIGINT));
INSERT INTO struct
SELECT ROW(1.0, 'banana', 2) UNION
SELECT ROW(2.0, 'apple', 3) UNION
SELECT ROW(3.0, 'orange', 4) UNION
SELECT ROW(NULL, 'banana', 2) UNION
SELECT ROW(2.0, NULL, 3) UNION
SELECT NULL UNION
SELECT ROW(3.0, 'orange', NULL);

DROP TABLE IF EXISTS memory.default.json_t;

CREATE TABLE IF NOT EXISTS memory.default.json_t (js JSON);

INSERT INTO memory.default.json_t VALUES
(JSON '{"a": [1,2,3,4], "b": 1}'),
(JSON '{"a":null,"b":2}'),
(JSON '{"a":"foo", "c":null}'),
(JSON 'null'),
(JSON '[42,47,55]'),
(JSON '[]');

DROP TABLE IF EXISTS win;
CREATE TABLE win (g VARCHAR, x BIGINT, y BIGINT);
INSERT INTO win VALUES
('a', 0, 3),
('a', 1, 2),
('a', 2, 0),
('a', 3, 1),
('a', 4, 1);
444 changes: 0 additions & 444 deletions conda-lock/linux-64-3.10.lock

This file was deleted.

445 changes: 0 additions & 445 deletions conda-lock/linux-64-3.9.lock

This file was deleted.

424 changes: 0 additions & 424 deletions conda-lock/osx-64-3.10.lock

This file was deleted.

425 changes: 0 additions & 425 deletions conda-lock/osx-64-3.9.lock

This file was deleted.

423 changes: 0 additions & 423 deletions conda-lock/osx-arm64-3.10.lock

This file was deleted.

424 changes: 0 additions & 424 deletions conda-lock/osx-arm64-3.9.lock

This file was deleted.

424 changes: 0 additions & 424 deletions conda-lock/win-64-3.10.lock

This file was deleted.

425 changes: 0 additions & 425 deletions conda-lock/win-64-3.9.lock

This file was deleted.

13 changes: 0 additions & 13 deletions default.nix

This file was deleted.

231 changes: 105 additions & 126 deletions docker-compose.yml

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions docker/minio/config.json
@@ -0,0 +1,12 @@
{
"version": "10",
"aliases": {
"trino": {
"url": "http://minio:9000",
"accessKey": "accesskey",
"secretKey": "secretkey",
"api": "s3v4",
"path": "auto"
}
}
}
31 changes: 0 additions & 31 deletions docker/trino/Dockerfile

This file was deleted.

3 changes: 1 addition & 2 deletions docker/trino/catalog/hive.properties
@@ -9,7 +9,6 @@ hive.metastore.username=admin
hive.non-managed-table-writes-enabled=true
hive.s3.aws-access-key=accesskey
hive.s3.aws-secret-key=secretkey
hive.s3.endpoint=http://hive-metastore-minio:9000
hive.s3.endpoint=http://minio:9000
hive.s3.path-style-access=true
hive.storage-format=PARQUET
hive.timestamp-precision=MICROSECONDS
5 changes: 0 additions & 5 deletions docker/trino/catalog/postgresql.properties

This file was deleted.

30 changes: 0 additions & 30 deletions docker/trino/entrypoint.sh

This file was deleted.

56 changes: 0 additions & 56 deletions docker/trino/metastore-site.xml

This file was deleted.

15 changes: 15 additions & 0 deletions docs/_freeze/backends/clickhouse/execute-results/html.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions docs/_quarto.yml
@@ -230,6 +230,21 @@ quartodoc:
- name: row_number
dynamic: true
signature_name: full
- name: rank
dynamic: true
signature_name: full
- name: dense_rank
dynamic: true
signature_name: full
- name: percent_rank
dynamic: true
signature_name: full
- name: cume_dist
dynamic: true
signature_name: full
- name: ntile
dynamic: true
signature_name: full
- name: window
dynamic: true
signature_name: full
5 changes: 4 additions & 1 deletion docs/backends/_utils.py
@@ -16,7 +16,10 @@ def get_renderer(level: int) -> MdRenderer:

@cache
def get_backend(backend: str):
return get_object(f"ibis.backends.{backend}", "Backend")
if backend == "pandas":
return get_object(f"ibis.backends.{backend}", "BasePandasBackend")
else:
return get_object(f"ibis.backends.{backend}", "Backend")


def get_callable(obj, name):
1 change: 1 addition & 0 deletions docs/backends/app/backend_info_app.py
@@ -51,6 +51,7 @@ def backends_info_df():
"datafusion": ["sql"],
"druid": ["sqlalchemy", "sql"],
"duckdb": ["sqlalchemy", "sql"],
"flink": ["string", "sql"],
"impala": ["string", "sql"],
"mssql": ["sqlalchemy", "sql"],
"mysql": ["sqlalchemy", "sql"],
21 changes: 7 additions & 14 deletions docs/backends/clickhouse.qmd
@@ -1,3 +1,7 @@
---
execute:
freeze: auto
---
# ClickHouse

[https://clickhouse.com](https://clickhouse.com)
@@ -103,25 +107,14 @@ con = ibis.connect(f"clickhouse://{user}:{password}@{host}:{port}?secure={secure

## ClickHouse playground

ClickHouse provides a free playground with several datasets that you can connect to using `ibis`:
ClickHouse provides a free playground with several datasets that you can connect to using Ibis:

```{python}
from ibis.interactive import *
con = ibis.clickhouse.connect(
host="play.clickhouse.com",
secure=True,
user="play",
password="clickhouse",
)
con.table("actors")
```

or

```{python}
con = ibis.connect("clickhouse://play:clickhouse@play.clickhouse.com:443?secure=True")
con.table("opensky")
actors = con.table("actors")
actors
```

```{python}
17 changes: 17 additions & 0 deletions docs/backends/images/starburst_clusters.svg
17 changes: 17 additions & 0 deletions docs/backends/images/starburst_connection_info.svg
2 changes: 1 addition & 1 deletion docs/backends/mysql.qmd
@@ -91,7 +91,7 @@ con = ibis.mysql.connect(
#| output: asis
from _utils import render_do_connect
render_do_connect("mssql")
render_do_connect("mysql")
```

### `ibis.connect` URL format
53 changes: 53 additions & 0 deletions docs/backends/trino.qmd
@@ -97,6 +97,59 @@ from _utils import render_do_connect
render_do_connect("trino")
```

### Authenticating with SSO

Ibis supports connecting to SSO-enabled Trino clusters using the
`OAuth2Authentication` helper from the `trino` library.

```python
import ibis
from trino.auth import OAuth2Authentication

con = ibis.trino.connect(
user="user",
host="hostname",
port=443,
database="database",
schema="default",
auth=OAuth2Authentication(),
http_scheme="https"
)
```
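
Once connected, the object behaves like any other Ibis backend. As a quick check (a minimal sketch; the table names you see depend on your cluster), you can list the tables visible to the authenticated user:

```python
con.list_tables()
```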

### Connecting to Starburst managed Trino instances

Starburst makes use of role-based access controls. When connecting to a
Starburst Trino cluster, if you encounter issues listing or connecting to
tables, ensure that a role is specified using the `roles` keyword.

```python
import ibis

con = ibis.trino.connect(
user="user",
host="hostname",
port=443,
database="sample",
schema="demo",
roles="defaultrolewithtableaccess", #<1>
http_scheme="https"
)
```

1. Role names will be visible in the Starburst Galaxy dashboard.

#### Finding your Starburst `host`

Log into Starburst Galaxy and select `Clusters` from the left-hand-side menu:

![](./images/starburst_clusters.svg)

Select `Connection info` for the cluster you wish to connect to -- the username
and hostname displayed can be copied directly into the Ibis `connect` call.

![](./images/starburst_connection_info.svg)

```{python}
#| echo: false
BACKEND = "Trino"
9 changes: 5 additions & 4 deletions docs/contribute/01_environment.qmd
@@ -123,7 +123,7 @@ for manager, params in managers.items():
print("```sh")
print(f"# Create a dev environment for {platform}")
print("cd ibis")
print(f"{manager} create -n ibis-dev --file=conda-lock/{platform}-3.10.lock")
print(f"{manager} create -n ibis-dev --file=ci/conda-lock/{platform}/3.10.lock")
print("```")
print()
print()
@@ -140,7 +140,7 @@ for manager, params in managers.items():
print()
print(" ```sh")
print(" cd ibis")
print(" pip install -e .")
print(" pip install -e '.[all]'")
print(" ```")
print()
```
@@ -198,13 +198,14 @@ for manager, params in managers.items():
nix-shell -p cachix --run 'cachix use ibis'
```
1. Run `nix-shell` in the checkout directory:
1. Run `nix develop` in the checkout directory:
```sh
cd ibis
nix-shell
nix develop
```
This will launch a `bash` shell with all of the required dependencies installed.
This may take a while due to artifact download from the cache.
:::
3 changes: 1 addition & 2 deletions docs/contribute/03_style.qmd
@@ -2,8 +2,7 @@

## Code style

- [`black`](https://github.com/psf/black): Formatting Python code
- [`ruff`](https://github.com/charliermarsh/ruff): Formatting and sorting `import` statements
- [`ruff`](https://github.com/charliermarsh/ruff): Formatting Python code and sorting `import` statements
- [`shellcheck`](https://github.com/koalaman/shellcheck): Linting shell scripts
- [`shfmt`](https://github.com/mvdan/sh): Formatting shell scripts
- [`statix`](https://github.com/nerdypepper/statix): Linting nix files
212 changes: 205 additions & 7 deletions docs/release_notes.md

Large diffs are not rendered by default.

74 changes: 47 additions & 27 deletions flake.lock
15 changes: 5 additions & 10 deletions flake.nix
@@ -2,11 +2,6 @@
description = "Expressive Python analytics at any scale.";

inputs = {
flake-compat = {
url = "github:edolstra/flake-compat";
flake = false;
};

flake-utils.url = "github:numtide/flake-utils";

gitignore = {
@@ -28,7 +23,7 @@
outputs = { self, flake-utils, gitignore, nixpkgs, poetry2nix, ... }: {
overlays.default = nixpkgs.lib.composeManyExtensions [
gitignore.overlay
poetry2nix.overlay
poetry2nix.overlays.default
(import ./nix/overlay.nix)
];
} // flake-utils.lib.eachDefaultSystem (
@@ -53,11 +48,11 @@
# duckdb
duckdb
# mysql
mycli
mariadb-client
# pyspark
openjdk17_headless
# postgres client
pgcli
postgresql
# sqlite with readline
sqlite-interactive
];
@@ -88,7 +83,7 @@
# python dev environment
env
# poetry executable
env.pkgs.poetry
poetry
# rendering release notes
changelog
glow
@@ -125,7 +120,7 @@

default = pkgs.ibis310;

inherit (pkgs) update-lock-files gen-all-extras gen-examples check-poetry-version check-release-notes-spelling;
inherit (pkgs) update-lock-files gen-all-extras gen-examples check-release-notes-spelling;
};

devShells = rec {
5 changes: 1 addition & 4 deletions gen_matrix.py
@@ -29,17 +29,14 @@ def main():
internal_ops = {
# Never translates into anything
ops.UnresolvedExistsSubquery,
ops.UnresolvedNotExistsSubquery,
ops.ScalarParameter,
}

public_ops = frozenset(get_leaf_classes(ops.Value)) - internal_ops
support = {"operation": [f"{op.__module__}.{op.__name__}" for op in public_ops]}
support.update(
(name, list(map(backend.has_operation, public_ops)))
# exclude flink until https://github.com/apache/flink/pull/23141 is
# merged and released we also need to roll it into poetry
for name, backend in get_backends(exclude=("flink",))
for name, backend in get_backends()
)

df = pd.DataFrame(support).set_index("operation").sort_index()
2 changes: 1 addition & 1 deletion ibis/__init__.py
@@ -1,7 +1,7 @@
"""Initialize Ibis module."""
from __future__ import annotations

__version__ = "7.0.0"
__version__ = "7.1.0"

from ibis import examples, util
from ibis.backends.base import BaseBackend
45 changes: 43 additions & 2 deletions ibis/backends/base/__init__.py
@@ -22,7 +22,6 @@
import ibis.expr.types as ir
from ibis import util
from ibis.common.caching import RefCountedCache
from ibis.formats.pandas import PandasData

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator, Mapping, MutableMapping
@@ -40,6 +39,19 @@
"impala": "hive",
"pyspark": "spark",
"polars": "postgres",
"datafusion": "postgres",
}


_SQLALCHEMY_TO_SQLGLOT_DIALECT = {
# sqlalchemy dialects of backends not listed here match the sqlglot dialect
# name
"mssql": "tsql",
"postgresql": "postgres",
"default": "duckdb",
# druid allows double quotes for identifiers, like postgres:
# https://druid.apache.org/docs/latest/querying/sql#identifiers-and-literals
"druid": "postgres",
}


@@ -233,6 +245,8 @@ def _import_pyarrow():
"Exporting to arrow formats requires `pyarrow` but it is not installed"
)
else:
import pyarrow_hotfix # noqa: F401

return pyarrow

def to_pandas_batches(
Expand Down Expand Up @@ -266,6 +280,8 @@ def to_pandas_batches(
Iterator[pd.DataFrame]
An iterator of pandas `DataFrame`s.
"""
from ibis.formats.pandas import PandasData

orig_expr = expr
expr = expr.as_table()
schema = expr.schema()
@@ -475,6 +491,31 @@ def read_json(
f"{self.name} does not support direct registration of JSON data."
)

def read_delta(
self, source: str | Path, table_name: str | None = None, **kwargs: Any
):
"""Register a Delta Lake table in the current database.
Parameters
----------
source
The data source. Must be a directory
containing a Delta Lake table.
table_name
An optional name to use for the created table. This defaults to
a sequentially generated name.
**kwargs
Additional keyword arguments passed to the underlying backend or library.
Returns
-------
ir.Table
The just-registered table.
"""
raise NotImplementedError(
f"{self.name} does not support direct registration of DeltaLake tables."
)
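# Illustrative usage sketch (an assumption for illustration, not part of the
# change above): a backend that implements `read_delta`, e.g. DuckDB with the
# `deltalake` package installed, would be used roughly as
#
#     con = ibis.duckdb.connect()
#     events = con.read_delta("path/to/delta_table", table_name="events")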

@util.experimental
def to_parquet(
self,
@@ -944,7 +985,7 @@ def _run_pre_execute_hooks(self, expr: ir.Expr) -> None:
self._register_in_memory_tables(expr)

def _define_udf_translation_rules(self, expr):
if self.supports_in_memory_tables:
if self.supports_python_udfs:
raise NotImplementedError(self.name)

def compile(
4 changes: 3 additions & 1 deletion ibis/backends/base/sql/__init__.py
@@ -90,7 +90,9 @@ def table(self, name: str, database: str | None = None) -> ir.Table:
)
qualified_name = self._fully_qualified_name(name, database)
schema = self.get_schema(qualified_name)
node = ops.DatabaseTable(name, schema, self, namespace=database)
node = ops.DatabaseTable(
name, schema, self, namespace=ops.Namespace(database=database)
)
return node.to_expr()

def _fully_qualified_name(self, name, database):
148 changes: 87 additions & 61 deletions ibis/backends/base/sql/alchemy/__init__.py
@@ -9,6 +9,7 @@
from typing import TYPE_CHECKING, Any

import sqlalchemy as sa
import sqlglot as sg
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql import quoted_name
from sqlalchemy.sql.expression import ClauseElement, Executable
@@ -38,6 +39,7 @@
AlchemyContext,
AlchemyExprTranslator,
)
from ibis.backends.base.sqlglot import STAR
from ibis.formats.pandas import PandasData

if TYPE_CHECKING:
@@ -284,6 +286,7 @@ def create_table(

import pandas as pd
import pyarrow as pa
import pyarrow_hotfix # noqa: F401

if isinstance(obj, (pd.DataFrame, pa.Table)):
obj = ibis.memtable(obj)
@@ -445,7 +448,9 @@ def drop_table(
"Dropping tables from a different database is not yet implemented"
)

t = self._get_sqla_table(name, schema=database, autoload=False)
t = self._get_sqla_table(
name, namespace=ops.Namespace(database=database), autoload=False
)
with self.begin() as bind:
t.drop(bind=bind, checkfirst=force)

@@ -456,7 +461,7 @@ def drop_table(
del self._schemas[qualified_name]

def truncate_table(self, name: str, database: str | None = None) -> None:
t = self._get_sqla_table(name, schema=database)
t = self._get_sqla_table(name, namespace=ops.Namespace(database=database))
with self.begin() as con:
con.execute(t.delete())

@@ -488,7 +493,12 @@ def _new_sa_metadata():
return sa.MetaData()

def _get_sqla_table(
self, name: str, schema: str | None = None, autoload: bool = True, **_: Any
self,
name: str,
*,
namespace: ops.Namespace = ops.Namespace(), # noqa: B008
autoload: bool = True,
**_: Any,
) -> sa.Table:
meta = self._new_sa_metadata()
with warnings.catch_warnings():
@@ -501,7 +511,7 @@ def _get_sqla_table(
table = sa.Table(
name,
meta,
schema=schema,
schema=namespace.schema,
autoload_with=self.con if autoload else None,
quote=self.compiler.translator_class._quote_table_names,
)
@@ -613,16 +623,9 @@ def table(
Table
Table expression
"""
namespace = schema
if database is not None:
if not isinstance(database, str):
raise com.IbisTypeError(
f"`database` must be a string; got {type(database)}"
)
if database != self.current_database:
return self.database(name=database).table(name=name, schema=schema)
namespace = ops.Namespace(schema=schema, database=database)

sqla_table = self._get_sqla_table(name, schema=schema)
sqla_table = self._get_sqla_table(name, namespace=namespace)

schema = self._schema_from_sqla_table(
sqla_table, schema=self._schemas.get(name)
@@ -635,9 +638,9 @@ def table(
def _insert_dataframe(
self, table_name: str, df: pd.DataFrame, overwrite: bool
) -> None:
schema = self._current_schema
namespace = ops.Namespace(schema=self._current_schema)

t = self._get_sqla_table(table_name, schema=schema)
t = self._get_sqla_table(table_name, namespace=namespace)
with self.con.begin() as con:
if overwrite:
con.execute(t.delete())
@@ -699,7 +702,9 @@ def insert(
self.drop_table(table_name, database=database)
self.create_table(table_name, schema=to_table_schema, database=database)

to_table = self._get_sqla_table(table_name, schema=database)
to_table = self._get_sqla_table(
table_name, namespace=ops.Namespace(database=database)
)

from_table_expr = obj

@@ -711,7 +716,9 @@ def insert(
with self.begin() as bind:
bind.execute(to_table.insert().from_select(columns, compiled))
elif isinstance(obj, (list, dict)):
to_table = self._get_sqla_table(table_name, schema=database)
to_table = self._get_sqla_table(
table_name, namespace=ops.Namespace(database=database)
)

with self.begin() as bind:
if overwrite:
@@ -902,62 +909,81 @@ class AlchemyCrossSchemaBackend(BaseAlchemyBackend):
currently active one.
"""

@property
@abc.abstractmethod
def use_stmt_prefix(self) -> str:
"""The prefix to use for switching schemas.
Common examples are `USE` and `USE SCHEMA`.
"""
def _get_table_identifier(self, *, name, namespace):
database = namespace.database
schema = namespace.schema

@contextlib.contextmanager
def _use_schema(self, ident: str, current_db: str, current_schema: str) -> None:
use_prefix = self.use_stmt_prefix
if schema is None:
schema = self.current_schema

try:
with self.begin() as c:
c.exec_driver_sql(f"{use_prefix} {ident}")
yield
finally:
with self.begin() as c:
c.exec_driver_sql(
f"{use_prefix} {self._quote(current_db)}.{self._quote(current_schema)}"
schema = sg.parse_one(schema, into=sg.exp.Identifier)
except sg.ParseError:
# not actually a table, but that's how sqlglot parses
# `CREATE SCHEMA` statements
parsed = sg.parse_one(schema, into=sg.exp.Table)

# user passed database="foo", schema="bar.baz", which is ambiguous
if database is not None:
raise com.IbisInputError(
"Cannot specify both `database` and a dotted path in `schema`"
)

db = parsed.args["db"].this
schema = parsed.args["this"].this
else:
db = database

table = sg.table(
name,
db=schema,
catalog=db,
quoted=self.compiler.translator_class._quote_table_names,
)
return table

def _get_sqla_table(
self,
name: str,
schema: str | None = None,
database: str | None = None,
**_: Any,
self, name: str, namespace: ops.Namespace, **_: Any
) -> sa.Table:
current_db = self.current_database
current_schema = self.current_schema
if schema is None:
schema = current_schema
*db, schema = schema.split(".")
db = "".join(db) or database or current_db
ident = ".".join(map(self._quote, filter(None, (db, schema))))

pairs = self._metadata(f"SELECT * FROM {ident}.{self._quote(name)} LIMIT 0")
table = self._get_table_identifier(name=name, namespace=namespace)
metadata_query = sg.select(STAR).from_(table).limit(0).sql(dialect=self.name)
pairs = self._metadata(metadata_query)
ibis_schema = ibis.schema(pairs)

with self._use_schema(ident, current_db, current_schema):
result = self._table_from_schema(name, schema=ibis_schema)
result.schema = self._get_schema_for_table(qualname=ident, schema=schema)
columns = self._columns_from_schema(name, ibis_schema)
result = sa.Table(
name,
sa.MetaData(),
*columns,
quote=self.compiler.translator_class._quote_table_names,
)
result.fullname = table.sql(dialect=self.name)
return result

@abc.abstractmethod
def _get_schema_for_table(self, *, qualname: str, schema: str) -> str:
"""Choose whether to prefix a table with its fully qualified path or schema."""

def drop_table(
self, name: str, database: str | None = None, force: bool = False
) -> None:
name = self._quote(name)
# TODO: handle database quoting
if database is not None:
name = f"{database}.{name}"
drop_stmt = "DROP TABLE" + (" IF EXISTS" * force) + f" {name}"
table = sg.table(name, db=database)
drop_table = sg.exp.Drop(kind="TABLE", exists=force, this=table)
drop_table_sql = drop_table.sql(dialect=self.name)
with self.begin() as con:
con.exec_driver_sql(drop_stmt)
con.exec_driver_sql(drop_table_sql)


@compiles(sa.Table, "trino", "duckdb")
def compile_trino_table(element, compiler, **kw):
return element.fullname


@compiles(sa.Table, "snowflake")
def compile_snowflake_table(element, compiler, **kw):
dialect = compiler.dialect.name
return (
sg.parse_one(element.fullname, into=sg.exp.Table, read=dialect)
.transform(
lambda node: node.__class__(this=node.this, quoted=True)
if isinstance(node, sg.exp.Identifier)
else node
)
.sql(dialect)
)
1 change: 1 addition & 0 deletions ibis/backends/base/sql/alchemy/datatypes.py
@@ -142,6 +142,7 @@ class Unknown(sa.Text):
sat.BOOLEAN: dt.Boolean,
sat.Boolean: dt.Boolean,
sat.BINARY: dt.Binary,
sat.BLOB: dt.Binary,
sat.LargeBinary: dt.Binary,
sat.DATE: dt.Date,
sat.Date: dt.Date,
119 changes: 66 additions & 53 deletions ibis/backends/base/sql/alchemy/query_builder.py
@@ -3,11 +3,14 @@
import functools

import sqlalchemy as sa
import sqlglot as sg
import toolz
from sqlalchemy import sql

import ibis.common.exceptions as com
import ibis.expr.analysis as an
import ibis.expr.operations as ops
from ibis.backends.base import _SQLALCHEMY_TO_SQLGLOT_DIALECT
from ibis.backends.base.sql.alchemy.translator import (
AlchemyContext,
AlchemyExprTranslator,
@@ -82,79 +85,85 @@ def _get_join_type(self, op):

def _format_table(self, op):
ctx = self.context
ref_op = op

if isinstance(op, ops.SelfReference):
ref_op = op.table
orig_op = op
if isinstance(op, (ops.SelfReference, ops.Sample)):
op = op.table

alias = ctx.get_ref(op)
alias = ctx.get_ref(orig_op)

translator = ctx.compiler.translator_class(ref_op, ctx)
translator = ctx.compiler.translator_class(op, ctx)

if isinstance(ref_op, ops.DatabaseTable):
result = ref_op.source._get_sqla_table(ref_op.name, schema=ref_op.namespace)
elif isinstance(ref_op, ops.UnboundTable):
if isinstance(op, ops.DatabaseTable):
namespace = op.namespace
result = op.source._get_sqla_table(op.name, namespace=namespace)
elif isinstance(op, ops.UnboundTable):
# use SQLAlchemy's TableClause for unbound tables
name = op.name
namespace = op.namespace
result = sa.Table(
ref_op.name,
name,
sa.MetaData(),
*translator._schema_to_sqlalchemy_columns(ref_op.schema),
*translator._schema_to_sqlalchemy_columns(op.schema),
quote=translator._quote_table_names,
)
elif isinstance(ref_op, ops.SQLQueryResult):
columns = translator._schema_to_sqlalchemy_columns(ref_op.schema)
result = sa.text(ref_op.query).columns(*columns)
elif isinstance(ref_op, ops.SQLStringView):
columns = translator._schema_to_sqlalchemy_columns(ref_op.schema)
result = sa.text(ref_op.query).columns(*columns).cte(ref_op.name)
elif isinstance(ref_op, ops.View):
dialect = translator._dialect_name
result.fullname = sg.table(
name, db=namespace.schema, catalog=namespace.database
).sql(dialect=_SQLALCHEMY_TO_SQLGLOT_DIALECT.get(dialect, dialect))
elif isinstance(op, ops.SQLQueryResult):
columns = translator._schema_to_sqlalchemy_columns(op.schema)
result = sa.text(op.query).columns(*columns)
elif isinstance(op, ops.SQLStringView):
columns = translator._schema_to_sqlalchemy_columns(op.schema)
result = sa.text(op.query).columns(*columns).cte(op.name)
elif isinstance(op, ops.View):
# TODO(kszucs): avoid converting to expression
child_expr = ref_op.child.to_expr()
child_expr = op.child.to_expr()
definition = child_expr.compile()
result = sa.Table(
ref_op.name,
op.name,
sa.MetaData(),
*translator._schema_to_sqlalchemy_columns(ref_op.schema),
*translator._schema_to_sqlalchemy_columns(op.schema),
quote=translator._quote_table_names,
)
backend = child_expr._find_backend()
backend._create_temp_view(view=result, definition=definition)
elif isinstance(ref_op, ops.InMemoryTable):
result = self._format_in_memory_table(op, ref_op, translator)
elif isinstance(ref_op, ops.DummyTable):
elif isinstance(op, ops.InMemoryTable):
result = self._format_in_memory_table(op, translator)
elif isinstance(op, ops.DummyTable):
result = sa.select(
*(
translator.translate(value).label(name)
for name, value in zip(ref_op.schema.names, ref_op.values)
for name, value in zip(op.schema.names, op.values)
)
)
elif ctx.is_extracted(op):
if isinstance(orig_op, ops.SelfReference):
result = ctx.get_ref(op)
else:
result = alias
else:
# A subquery
if ctx.is_extracted(ref_op):
# Was put elsewhere, e.g. WITH block, we just need to grab
# its alias
alias = ctx.get_ref(op)

# hack
if isinstance(op, ops.SelfReference):
table = ctx.get_ref(ref_op)
self_ref = alias if hasattr(alias, "name") else table.alias(alias)
ctx.set_ref(op, self_ref)
return self_ref
return alias

alias = ctx.get_ref(op)
result = ctx.get_compiled_expr(op)

result = alias if hasattr(alias, "name") else result.alias(alias)
ctx.set_ref(op, result)

if isinstance(orig_op, ops.Sample):
result = self._format_sample(orig_op, result)

ctx.set_ref(orig_op, result)
return result

def _format_in_memory_table(self, op, ref_op, translator):
columns = translator._schema_to_sqlalchemy_columns(ref_op.schema)
def _format_sample(self, op, table):
# Should never be hit in practice, as Sample operations should be rewritten
# before this point for all backends without TABLESAMPLE support
raise com.UnsupportedOperationError("`Table.sample` is not supported")

def _format_in_memory_table(self, op, translator):
columns = translator._schema_to_sqlalchemy_columns(op.schema)
if self.context.compiler.cheap_in_memory_tables:
result = sa.Table(
ref_op.name,
op.name,
sa.MetaData(),
*columns,
quote=translator._quote_table_names,
@@ -167,8 +176,8 @@ def _format_in_memory_table(self, op, ref_op, translator):
)
).limit(0)
elif self.context.compiler.support_values_syntax_in_select:
rows = list(ref_op.data.to_frame().itertuples(index=False))
result = sa.values(*columns, name=ref_op.name).data(rows)
rows = list(op.data.to_frame().itertuples(index=False))
result = sa.values(*columns, name=op.name).data(rows).select().subquery()
else:
raw_rows = (
sa.select(
@@ -179,7 +188,7 @@ def _format_in_memory_table(self, op, ref_op, translator):
)
for row in op.data.to_frame().itertuples(index=False)
)
result = sa.union_all(*raw_rows).alias(ref_op.name)
result = sa.union_all(*raw_rows).alias(op.name)
return result


@@ -219,13 +228,11 @@ def _compile_subqueries(self):
self.context.set_ref(expr, result)

def _compile_table_set(self):
if self.table_set is not None:
helper = self.table_set_formatter_class(self, self.table_set)
result = helper.get_result()
return result
else:
if self.table_set is None:
return None

return self.table_set_formatter_class(self, self.table_set).get_result()

def _add_select(self, table_set):
if not self.select_set:
return table_set.element
@@ -316,7 +323,10 @@ def _add_where(self, fragment):
if not self.where:
return fragment

args = [self._translate(pred, permit_subquery=True) for pred in self.where]
args = [
self._translate(pred, permit_subquery=True, within_where=True)
for pred in self.where
]
clause = functools.reduce(sql.and_, args)
return fragment.where(clause)

@@ -355,7 +365,10 @@ def _add_limit(self, fragment):
)

if n is not None:
fragment = fragment.limit(n)
try:
fragment = fragment.limit(n)
except AttributeError:
fragment = fragment.subquery().select().limit(n)

offset = self.limit.offset

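The in-memory table path above now renders VALUES as a proper subquery (`sa.values(...).data(rows).select().subquery()`), and `_add_limit` falls back to wrapping the fragment in a subquery when it cannot take `.limit()` directly. A small standalone sketch of the underlying SQLAlchemy pattern, with made-up column names and data (assumes SQLAlchemy 1.4+, where `sa.values` is available):

import sqlalchemy as sa

columns = [sa.column("a", sa.Integer), sa.column("b", sa.String)]
rows = [(1, "x"), (2, "y")]

# Build VALUES (...), (...) as a named, selectable subquery so it can be
# aliased, joined, and limited like any other table.
values = sa.values(*columns, name="in_memory").data(rows)
subq = values.select().subquery()

stmt = sa.select(subq.c.a, subq.c.b).limit(1)

# Default-dialect rendering; the row data appears as bound parameters.
print(stmt)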
12 changes: 4 additions & 8 deletions ibis/backends/base/sql/alchemy/registry.py
@@ -158,9 +158,6 @@ def _exists_subquery(t, op):
sub_ctx = ctx.subcontext()
clause = ctx.compiler.to_sql(filtered, sub_ctx, exists=True)

if isinstance(op, ops.NotExistsSubquery):
clause = sa.not_(clause)

return clause


@@ -563,7 +560,6 @@ class array_filter(FunctionElement):
ops.TableColumn: _table_column,
ops.TableArrayView: _table_array_view,
ops.ExistsSubquery: _exists_subquery,
ops.NotExistsSubquery: _exists_subquery,
# miscellaneous varargs
ops.Least: varargs(sa.func.least),
ops.Greatest: varargs(sa.func.greatest),
@@ -675,10 +671,10 @@ class array_filter(FunctionElement):
ops.FirstValue: unary(sa.func.first_value),
ops.LastValue: unary(sa.func.last_value),
ops.RowNumber: fixed_arity(sa.func.row_number, 0),
ops.DenseRank: unary(lambda _: sa.func.dense_rank()),
ops.MinRank: unary(lambda _: sa.func.rank()),
ops.PercentRank: unary(lambda _: sa.func.percent_rank()),
ops.CumeDist: unary(lambda _: sa.func.cume_dist()),
ops.DenseRank: fixed_arity(sa.func.dense_rank, 0),
ops.MinRank: fixed_arity(sa.func.rank, 0),
ops.PercentRank: fixed_arity(sa.func.percent_rank, 0),
ops.CumeDist: fixed_arity(sa.func.cume_dist, 0),
ops.NthValue: _nth_value,
ops.WindowFunction: _window_function,
}
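ops.NotExistsSubquery disappears from this registry (and from the string-SQL registry later in the diff); presumably the negation is now applied to the positive ExistsSubquery result upstream, e.g. by wrapping it in ops.Not — that reading is my inference, not stated in the diff. For reference, SQLAlchemy composes the negated form cleanly from the positive one, as in this sketch with invented table names:

import sqlalchemy as sa

t = sa.table("t", sa.column("a"))
s = sa.table("s", sa.column("a"))

# Positive EXISTS subquery, correlated on the outer table.
inner = sa.select(sa.literal_column("1")).where(t.c.a == s.c.a)
positive = sa.exists(inner)

# Negation is just sa.not_ applied on top of the positive clause.
query = sa.select(t.c.a).where(sa.not_(positive))
print(query)
# Renders roughly:
#   SELECT t.a FROM t
#   WHERE NOT (EXISTS (SELECT 1 FROM s WHERE t.a = s.a))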
64 changes: 41 additions & 23 deletions ibis/backends/base/sql/compiler/query_builder.py
@@ -16,6 +16,7 @@
from ibis.backends.base.sql.registry import quote_identifier
from ibis.common.grounds import Comparable
from ibis.config import options
from ibis.expr.rewrites import rewrite_dropna, rewrite_fillna

if TYPE_CHECKING:
from collections.abc import Iterable
@@ -76,6 +77,9 @@ def _quote_identifier(self, name):
return quote_identifier(name)

def _format_in_memory_table(self, op):
if self.context.compiler.cheap_in_memory_tables:
return op.name

names = op.schema.names
raw_rows = []
for row in op.data.to_frame().itertuples(index=False):
@@ -98,44 +102,52 @@ def _format_table(self, op):
# TODO: This could probably go in a class and be significantly nicer
ctx = self.context

ref_op = op
if isinstance(op, ops.SelfReference):
ref_op = op.table
orig_op = op
if isinstance(op, (ops.SelfReference, ops.Sample)):
op = op.table

alias = ctx.get_ref(orig_op)

if isinstance(ref_op, ops.InMemoryTable):
result = self._format_in_memory_table(ref_op)
elif isinstance(ref_op, ops.PhysicalTable):
if isinstance(op, ops.InMemoryTable):
result = self._format_in_memory_table(op)
elif isinstance(op, ops.PhysicalTable):
# TODO(kszucs): add a mandatory `name` field to the base
# PhysicalTable instead of the child classes, this should prevent
# this error scenario
if (name := ref_op.name) is None:
if (name := op.name) is None:
raise com.RelationError(f"Table did not have a name: {op!r}")

namespace = getattr(op, "namespace", None)
catalog = getattr(namespace, "database", None)
db = getattr(namespace, "schema", None)
result = sg.table(
name,
db=getattr(ref_op, "namespace", None),
db=db,
catalog=catalog,
quoted=self.parent.translator_class._quote_identifiers,
).sql(dialect=self.parent.translator_class._dialect_name)
elif ctx.is_extracted(op):
if isinstance(orig_op, ops.SelfReference):
result = ctx.get_ref(op)
else:
result = alias
else:
# A subquery
if ctx.is_extracted(ref_op):
# Was put elsewhere, e.g. WITH block, we just need to grab its
# alias
alias = ctx.get_ref(op)

# HACK: self-references have to be treated more carefully here
if isinstance(op, ops.SelfReference):
return f"{ctx.get_ref(ref_op)} {alias}"
else:
return alias

subquery = ctx.get_compiled_expr(op)
result = f"(\n{util.indent(subquery, self.indent)}\n)"

result += f" {ctx.get_ref(op)}"
if result != alias:
result = f"{result} {alias}"

if isinstance(orig_op, ops.Sample):
result = self._format_sample(orig_op, result)

return result

def _format_sample(self, op, table):
# Should never be hit in practice, as Sample operations should be rewritten
# before this point for all backends without TABLESAMPLE support
raise com.UnsupportedOperationError("`Table.sample` is not supported")

def get_result(self):
# Got to unravel the join stack; the nesting order could be
# arbitrary, so we do a depth first search and push the join tokens
@@ -216,12 +228,13 @@ def __init__(

self.indent = indent

def _translate(self, expr, named=False, permit_subquery=False):
def _translate(self, expr, named=False, permit_subquery=False, within_where=False):
translator = self.translator_class(
expr,
context=self.context,
named=named,
permit_subquery=permit_subquery,
within_where=within_where,
)
return translator.get_result()

@@ -395,7 +408,7 @@ def format_where(self):
fmt_preds = []
npreds = len(self.where)
for pred in self.where:
new_pred = self._translate(pred, permit_subquery=True)
new_pred = self._translate(pred, permit_subquery=True, within_where=True)
if npreds > 1:
new_pred = f"({new_pred})"
fmt_preds.append(new_pred)
@@ -516,6 +529,8 @@ class Compiler:
support_values_syntax_in_select = True
null_limit = None

rewrites = rewrite_fillna | rewrite_dropna

@classmethod
def make_context(cls, params=None):
params = params or {}
@@ -535,6 +550,9 @@ def to_ast(cls, node, context=None):
if isinstance(node, ir.Expr):
node = node.op()

if cls.rewrites:
node = node.replace(cls.rewrites)

if context is None:
context = cls.make_context()

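The Compiler now carries declarative rewrite rules (`rewrite_fillna | rewrite_dropna`) and applies them in `to_ast` before any SELECT building, which is what lets the FillNa/DropNa special cases drop out of the select builder in the next file. Below is a minimal, self-contained sketch of that rewrite-before-compile idea; the node classes, the coalesce-based lowering, and the function names are invented for illustration and are not the ibis pattern API:

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class Table:
    name: str
    columns: tuple[str, ...]


@dataclass(frozen=True)
class FillNa:
    table: Table
    replacement: int


@dataclass(frozen=True)
class Selection:
    table: Table
    selections: tuple[str, ...]


def rewrite_fillna(node):
    # Lower FillNa into a plain Selection with COALESCE-style projections,
    # so the generic SELECT builder never needs to know about FillNa.
    if isinstance(node, FillNa):
        projections = tuple(
            f"coalesce({col}, {node.replacement}) AS {col}"
            for col in node.table.columns
        )
        return Selection(table=node.table, selections=projections)
    return node


def compile_node(node):
    node = rewrite_fillna(node)  # rewrite first, then build SQL
    if isinstance(node, Selection):
        cols = ", ".join(node.selections)
        return f"SELECT {cols} FROM {node.table.name}"
    raise NotImplementedError(type(node))


t = Table("events", ("user_id", "clicks"))
print(compile_node(FillNa(t, replacement=0)))
# SELECT coalesce(user_id, 0) AS user_id, coalesce(clicks, 0) AS clicks FROM events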
76 changes: 20 additions & 56 deletions ibis/backends/base/sql/compiler/select_builder.py
@@ -1,11 +1,8 @@
from __future__ import annotations

import functools
from collections.abc import Mapping
from typing import NamedTuple

import ibis.expr.analysis as an
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops


@@ -130,6 +127,8 @@ def _collect(self, op, toplevel=False):
self._collect_PhysicalTable(op, toplevel=toplevel)
elif isinstance(op, ops.Join):
self._collect_Join(op, toplevel=toplevel)
elif isinstance(op, ops.WindowingTVF):
self._collect_WindowingTVF(op, toplevel=toplevel)
else:
raise NotImplementedError(type(op))

@@ -139,48 +138,6 @@ def _collect_Distinct(self, op, toplevel=False):

self._collect(op.table, toplevel=toplevel)

def _collect_DropNa(self, op, toplevel=False):
if toplevel:
if op.subset is None:
columns = [
ops.TableColumn(op.table, name) for name in op.table.schema.names
]
else:
columns = op.subset
if columns:
filters = [
functools.reduce(
ops.And if op.how == "any" else ops.Or,
[ops.NotNull(c) for c in columns],
)
]
elif op.how == "all":
filters = [ops.Literal(False, dtype=dt.bool)]
else:
filters = []
self.table_set = op.table
self.select_set = [op.table]
self.filters = filters

def _collect_FillNa(self, op, toplevel=False):
if toplevel:
table = op.table.to_expr()
if isinstance(op.replacements, Mapping):
mapping = op.replacements
else:
mapping = {
name: op.replacements
for name, type in table.schema().items()
if type.nullable
}
new_op = table.mutate(
[
table[name].fillna(value).name(name)
for name, value in mapping.items()
]
).op()
self._collect(new_op, toplevel=toplevel)

def _collect_Limit(self, op, toplevel=False):
if toplevel:
if isinstance(table := op.table, ops.Limit):
@@ -192,6 +149,11 @@ def _collect_Limit(self, op, toplevel=False):
assert self.limit is None
self.limit = _LimitSpec(op.n, op.offset)

def _collect_Sample(self, op, toplevel=False):
if toplevel:
self.table_set = op
self.select_set = [op]

def _collect_Union(self, op, toplevel=False):
if toplevel:
self.table_set = op
@@ -213,14 +175,12 @@ def _collect_Aggregation(self, op, toplevel=False):
# format these depending on the database. Most likely the
# GROUP BY 1, 2, ... style
if toplevel:
sub_op = an.substitute_parents(op)

self.group_by = self._convert_group_by(sub_op.by)
self.having = sub_op.having
self.select_set = sub_op.by + sub_op.metrics
self.table_set = sub_op.table
self.filters = sub_op.predicates
self.order_by = sub_op.sort_keys
self.group_by = self._convert_group_by(op.by)
self.having = op.having
self.select_set = op.by + op.metrics
self.table_set = op.table
self.filters = op.predicates
self.order_by = op.sort_keys

self._collect(op.table)

@@ -256,9 +216,8 @@ def _convert_group_by(self, nodes):

def _collect_Join(self, op, toplevel=False):
if toplevel:
subbed = an.substitute_parents(op)
self.table_set = subbed
self.select_set = [subbed]
self.table_set = op
self.select_set = [op]

def _collect_PhysicalTable(self, op, toplevel=False):
if toplevel:
@@ -274,6 +233,11 @@ def _collect_SelfReference(self, op, toplevel=False):
if toplevel:
self._collect(op.table, toplevel=toplevel)

def _collect_WindowingTVF(self, op, toplevel=False):
if toplevel:
self.table_set = op
self.select_set = [op]

# --------------------------------------------------------------------
# Subquery analysis / extraction

23 changes: 9 additions & 14 deletions ibis/backends/base/sql/compiler/translator.py
@@ -187,8 +187,11 @@ class ExprTranslator:
)
_dialect_name = "hive"
_quote_identifiers = None
_bool_aggs_need_cast_to_int32 = False

def __init__(self, node, context, named=False, permit_subquery=False):
def __init__(
self, node, context, named=False, permit_subquery=False, within_where=False
):
self.node = node
self.permit_subquery = permit_subquery

@@ -198,6 +201,11 @@ def __init__(self, node, context, named=False, permit_subquery=False):
# For now, governing whether the result will have a name
self.named = named

# used to indicate whether the expression being rendered is within a
# WHERE clause. This is used for MSSQL to determine whether to use
# boolean expressions or not.
self.within_where = within_where

def _needs_name(self, op):
if not self.named:
return False
@@ -272,7 +280,6 @@ def decorator(f):
rewrites = ExprTranslator.rewrites


# TODO(kszucs): use analysis.substitute() instead of a custom rewriter
@rewrites(ops.Bucket)
def _bucket(op):
# TODO(kszucs): avoid the expression roundtrip
@@ -336,23 +343,11 @@ def _any_expand(op):
return ops.Max(op.arg, where=op.where)


@rewrites(ops.NotAny)
def _notany_expand(op):
zero = ops.Literal(0, dtype=op.arg.dtype)
return ops.Min(ops.Equals(op.arg, zero), where=op.where)


@rewrites(ops.All)
def _all_expand(op):
return ops.Min(op.arg, where=op.where)


@rewrites(ops.NotAll)
def _notall_expand(op):
zero = ops.Literal(0, dtype=op.arg.dtype)
return ops.Max(ops.Equals(op.arg, zero), where=op.where)


@rewrites(ops.Cast)
def _rewrite_cast(op):
# TODO(kszucs): avoid the expression roundtrip
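The new within_where flag lets a translator know whether the expression it is rendering sits inside a WHERE clause; per the comment above, MSSQL needs this because T-SQL has no boolean values outside of predicates. A hedged sketch of the kind of branch a backend-specific translator might take on that flag (illustrative only, not the actual MSSQL translator):

def render_comparison(expr_sql: str, *, within_where: bool) -> str:
    if within_where:
        # Predicates may stay as bare boolean expressions in WHERE.
        return expr_sql
    # Outside WHERE (e.g. in a SELECT list) T-SQL has no boolean values,
    # so project the comparison as a 0/1 integer instead.
    return f"CASE WHEN {expr_sql} THEN 1 ELSE 0 END"


print(render_comparison("t.a > t.b", within_where=True))   # t.a > t.b
print(render_comparison("t.a > t.b", within_where=False))  # CASE WHEN t.a > t.b THEN 1 ELSE 0 END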
28 changes: 21 additions & 7 deletions ibis/backends/base/sql/registry/aggregate.py
@@ -1,25 +1,39 @@
from __future__ import annotations

import itertools

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops


def _reduction_format(translator, func_name, where, arg, *args):
def _maybe_cast_bool(translator, op, arg):
if (
translator._bool_aggs_need_cast_to_int32
and isinstance(op, (ops.Sum, ops.Mean, ops.Min, ops.Max))
and (dtype := arg.dtype).is_boolean()
):
return ops.Cast(arg, dt.Int32(nullable=dtype.nullable))
return arg


def _reduction_format(translator, op, func_name, where, *args):
args = (
_maybe_cast_bool(translator, op, arg)
for arg in args
if isinstance(arg, ops.Node)
)
if where is not None:
arg = ops.IfElse(where, arg, ibis.NA)
args = (ops.IfElse(where, arg, ibis.NA) for arg in args)

return "{}({})".format(
func_name,
", ".join(map(translator.translate, itertools.chain([arg], args))),
", ".join(map(translator.translate, args)),
)


def reduction(func_name):
def formatter(translator, op):
*args, where = op.args
return _reduction_format(translator, func_name, where, *args)
return _reduction_format(translator, op, func_name, where, *args)

return formatter

@@ -31,7 +45,7 @@ def variance_like(func_name):
}

def formatter(translator, op):
return _reduction_format(translator, func_names[op.how], op.where, op.arg)
return _reduction_format(translator, op, func_names[op.how], op.where, op.arg)

return formatter

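With `_maybe_cast_bool`, dialects that set `_bool_aggs_need_cast_to_int32` cast boolean arguments to INT32 before SUM/MEAN/MIN/MAX, and the optional `where` filter is now applied uniformly to every node argument of the reduction. A string-level sketch of the resulting SQL shapes; the exact cast target and conditional function differ per backend, so treat the output as approximate:

from __future__ import annotations


def reduction_sql(
    func_name: str, arg_sql: str, *, arg_is_bool: bool, where_sql: str | None
) -> str:
    # Mirror of the two transformations above, expressed on raw SQL strings.
    if arg_is_bool:
        arg_sql = f"CAST({arg_sql} AS INT)"
    if where_sql is not None:
        arg_sql = f"IF({where_sql}, {arg_sql}, NULL)"
    return f"{func_name}({arg_sql})"


print(reduction_sql("sum", "is_click", arg_is_bool=True, where_sql=None))
# sum(CAST(is_click AS INT))
print(reduction_sql("count", "1", arg_is_bool=False, where_sql="country = 'US'"))
# count(IF(country = 'US', 1, NULL))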
20 changes: 10 additions & 10 deletions ibis/backends/base/sql/registry/main.py
@@ -153,16 +153,14 @@ def exists_subquery(translator, op):
ctx = translator.context

dummy = ir.literal(1).name("")

filtered = op.foreign_table.to_expr().filter(
[pred.to_expr() for pred in op.predicates]
node = ops.Selection(
table=op.foreign_table,
selections=[dummy],
predicates=op.predicates,
)
node = filtered.select(dummy).op()

subquery = ctx.get_compiled_expr(node)

prefix = "NOT " * isinstance(op, ops.NotExistsSubquery)
return f"{prefix}EXISTS (\n{util.indent(subquery, ctx.indent)}\n)"
return f"EXISTS (\n{util.indent(subquery, ctx.indent)}\n)"


# XXX this is not added to operation_registry, but looks like impala is
@@ -191,6 +189,7 @@ def sort_key(translator, op):
def count_star(translator, op):
return aggregate._reduction_format(
translator,
op,
"count",
op.where,
ops.Literal(value=1, dtype=dt.int64),
@@ -260,8 +259,10 @@ def count_star(translator, op):
ops.Tan: unary("tan"),
ops.Pi: fixed_arity("pi", 0),
ops.E: fixed_arity("e", 0),
ops.Degrees: lambda t, op: f"(180 * {t.translate(op.arg)} / {t.translate(ops.Pi())})",
ops.Radians: lambda t, op: f"({t.translate(ops.Pi())} * {t.translate(op.arg)} / 180)",
ops.Degrees: lambda t,
op: f"(180 * {t.translate(op.arg)} / {t.translate(ops.Pi())})",
ops.Radians: lambda t,
op: f"({t.translate(ops.Pi())} * {t.translate(op.arg)} / 180)",
# Unary aggregates
ops.ApproxMedian: aggregate.reduction("appx_median"),
ops.ApproxCountDistinct: aggregate.reduction("ndv"),
@@ -350,7 +351,6 @@ def count_star(translator, op):
ops.TimestampDiff: timestamp.timestamp_diff,
ops.TimestampFromUNIX: timestamp.timestamp_from_unix,
ops.ExistsSubquery: exists_subquery,
ops.NotExistsSubquery: exists_subquery,
# RowNumber, and rank functions starts with 0 in Ibis-land
ops.RowNumber: lambda *_: "row_number()",
ops.DenseRank: lambda *_: "dense_rank()",
2 changes: 1 addition & 1 deletion ibis/backends/base/sql/registry/window.py
@@ -123,7 +123,7 @@ def window(translator, op):
arg_formatted = translator.translate(func.__window_op__)
result = f"{arg_formatted} {window_formatted}"

if isinstance(func, ops.RankBase):
if isinstance(func, (ops.RankBase, ops.NTile)):
return f"({result} - 1)"
else:
return result
30 changes: 18 additions & 12 deletions ibis/backends/base/sqlglot/__init__.py
@@ -23,12 +23,8 @@ def __getitem__(self, key: str) -> partial:
return getattr(self, key)


def _to_sqlglot(arg):
return arg if isinstance(arg, sg.exp.Expression) else lit(arg)


def _func(name: str, *args: Any, **kwargs: Any):
return sg.func(name, *map(_to_sqlglot, args), **kwargs)
return sg.func(name, *map(sg.exp.convert, args), **kwargs)


class FuncGen:
@@ -41,16 +37,16 @@ def __getitem__(self, key: str) -> partial:
return getattr(self, key)

def array(self, *args):
return sg.exp.Array.from_arg_list(list(map(_to_sqlglot, args)))
return sg.exp.Array.from_arg_list(list(map(sg.exp.convert, args)))

def tuple(self, *args):
return sg.func("tuple", *map(_to_sqlglot, args))
return sg.func("tuple", *map(sg.exp.convert, args))

def exists(self, query):
return sg.exp.Exists(this=query)

def concat(self, *args):
return sg.exp.Concat.from_arg_list(list(map(_to_sqlglot, args)))
return sg.exp.Concat.from_arg_list(list(map(sg.exp.convert, args)))

def map(self, keys, values):
return sg.exp.Map(keys=keys, values=values)
@@ -66,16 +62,26 @@ def __getitem__(self, key: str) -> sg.exp.Column:
return sg.column(key)


def lit(val):
return sg.exp.Literal(this=str(val), is_string=isinstance(val, str))
def paren(expr):
"""Wrap a sqlglot expression in parentheses."""
return sg.exp.Paren(this=expr)


def parenthesize(op, arg):
import ibis.expr.operations as ops

if isinstance(op, (ops.Binary, ops.Unary)):
return paren(arg)
# function calls don't need parens
return arg


def interval(value, *, unit):
return sg.exp.Interval(this=_to_sqlglot(value), unit=sg.exp.var(unit))
return sg.exp.Interval(this=sg.exp.convert(value), unit=sg.exp.var(unit))


F = FuncGen()
C = ColGen()
F = FuncGen()
NULL = sg.exp.NULL
FALSE = sg.exp.FALSE
TRUE = sg.exp.TRUE
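The ad-hoc `_to_sqlglot`/`lit` helpers give way to sqlglot's own `sg.exp.convert`, and parenthesization becomes explicit via `paren`/`parenthesize`. A short sketch against the public sqlglot API showing what those building blocks produce; the rendered SQL in the comments is approximate and dialect-dependent:

import sqlglot as sg

# sg.exp.convert turns plain Python values into sqlglot literal expressions.
print(sg.func("concat", *map(sg.exp.convert, ["a", "b"])).sql())
# CONCAT('a', 'b')

# Explicit parenthesization of a binary expression, as `paren` does above.
total = sg.exp.Add(this=sg.exp.convert(1), expression=sg.exp.convert(2))
print(sg.exp.Paren(this=total).sql())
# (1 + 2)

# Interval construction mirroring the `interval` helper above.
print(sg.exp.Interval(this=sg.exp.convert(5), unit=sg.exp.var("DAY")).sql())
# INTERVAL 5 DAY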
20 changes: 20 additions & 0 deletions ibis/backends/base/sqlglot/datatypes.py
@@ -352,6 +352,26 @@ class DuckDBType(SqlglotType):
default_decimal_scale = 3
default_interval_precision = "us"

@classmethod
def _from_sqlglot_TIMESTAMP(cls) -> dt.Timestamp:
return dt.Timestamp(scale=6, nullable=cls.default_nullable)

@classmethod
def _from_sqlglot_TIMESTAMPTZ(cls) -> dt.Timestamp:
return dt.Timestamp(scale=6, timezone="UTC", nullable=cls.default_nullable)

@classmethod
def _from_sqlglot_TIMESTAMP_S(cls) -> dt.Timestamp:
return dt.Timestamp(scale=0, nullable=cls.default_nullable)

@classmethod
def _from_sqlglot_TIMESTAMP_MS(cls) -> dt.Timestamp:
return dt.Timestamp(scale=3, nullable=cls.default_nullable)

@classmethod
def _from_sqlglot_TIMESTAMP_NS(cls) -> dt.Timestamp:
return dt.Timestamp(scale=9, nullable=cls.default_nullable)


class TrinoType(SqlglotType):
dialect = "trino"
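The new classmethods pin DuckDB's timestamp variants to explicit scales rather than leaving the scale unset. A quick reference sketch of the mapping they encode, written as a plain dict for illustration (the real conversion goes through the SqlglotType machinery, not this dict):

import ibis.expr.datatypes as dt

# DuckDB type name -> ibis dtype produced by the classmethods above
duckdb_timestamp_types = {
    "TIMESTAMP_S": dt.Timestamp(scale=0),
    "TIMESTAMP_MS": dt.Timestamp(scale=3),
    "TIMESTAMP": dt.Timestamp(scale=6),
    "TIMESTAMP_NS": dt.Timestamp(scale=9),
    "TIMESTAMPTZ": dt.Timestamp(scale=6, timezone="UTC"),
}

for name, dtype in duckdb_timestamp_types.items():
    print(f"{name:13} -> {dtype}")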