18 changes: 15 additions & 3 deletions .github/workflows/ibis-main.yml
Expand Up @@ -6,13 +6,15 @@ on:
# Skip the test suite if all changes are in the docs directory
paths-ignore:
- "docs/**"
- "mkdocs.yml"
branches:
- master
- "*.x.x"
pull_request:
# Skip the test suite if all changes are in the docs directory
paths-ignore:
- "docs/**"
- "mkdocs.yml"
branches:
- master
- "*.x.x"
Expand Down Expand Up @@ -61,22 +63,32 @@ jobs:
set -euo pipefail
sudo apt-get update -y -q
sudo apt-get install -y -q build-essential graphviz
sudo apt-get install -y -q build-essential graphviz krb5-config libkrb5-dev libgeos-dev
- name: install ${{ matrix.os }} system dependencies
if: ${{ matrix.os == 'windows-latest' }}
run: choco install graphviz

- run: python -m pip install --upgrade pip poetry
- run: python -m pip install --upgrade pip 'poetry<1.2'

- name: install ibis
if: ${{ matrix.os == 'ubuntu-latest' }}
run: poetry install --extras all

- name: install ibis
if: ${{ matrix.os == 'windows-latest' }}
run: poetry install --extras visualization

- uses: extractions/setup-just@v1

- name: run tests
- name: run core tests
run: just ci-check -m core

- name: run benchmarks once
if: ${{ matrix.os == 'ubuntu-latest' }}
# run benchmarks once to make sure they aren't broken
run: just ci-check -m benchmark

- name: upload code coverage
if: success()
uses: codecov/codecov-action@v3
2 changes: 1 addition & 1 deletion .github/workflows/ibis-tpch-queries.yml
Expand Up @@ -41,7 +41,7 @@ jobs:
working-directory: tpc-queries
run: |
python -m pip install -r requirements.txt
python -m pip install -U duckdb>=0.4
python -m pip install -U 'duckdb>=0.4' 'duckdb-engine>=0.6'
- name: install ibis
run: python -m pip install ".[sqlite,duckdb]"
1 change: 1 addition & 0 deletions .github/workflows/nix.yml
Expand Up @@ -55,4 +55,5 @@ jobs:
extraPullNames: nix-community,poetry2nix

- name: nix build and run tests
continue-on-error: ${{ matrix.os == 'macos-latest' && matrix.python-version == '3.8' }}
run: nix build --keep-going --print-build-logs --file . --argstr python ${{ matrix.python-version }}
6 changes: 3 additions & 3 deletions .github/workflows/update-deps.yml
Expand Up @@ -40,15 +40,15 @@ jobs:
name: ibis
extraPullNames: nix-community,poetry2nix

- uses: cpcloud/niv-dep-info-action@v2.0.6
- uses: cpcloud/niv-dep-info-action@v2.0.7
id: get_current_commit
with:
dependency: ${{ matrix.dep }}

- name: update ${{ matrix.dep }}
run: nix run 'nixpkgs#niv' -- update ${{ matrix.dep }}

- uses: cpcloud/niv-dep-info-action@v2.0.6
- uses: cpcloud/niv-dep-info-action@v2.0.7
id: get_new_commit
with:
dependency: ${{ matrix.dep }}
Expand All @@ -74,7 +74,7 @@ jobs:
app_id: ${{ secrets.PR_APPROVAL_BOT_APP_ID }}
private_key: ${{ secrets.PR_APPROVAL_BOT_APP_PRIVATE_KEY }}

- uses: cpcloud/compare-commits-action@v5.0.21
- uses: cpcloud/compare-commits-action@v5.0.23
if: ${{ fromJSON(steps.needs_pr.outputs.did_change) }}
id: compare_commits
with:
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -94,3 +94,4 @@ spark-warehouse
docs/backends/support_matrix.csv
__pycache__
tags
.DS_Store
9 changes: 0 additions & 9 deletions .pep8speaks.yml

This file was deleted.

12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Expand Up @@ -16,20 +16,20 @@ repos:
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
hooks:
- id: flake8
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 22.8.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8
- repo: https://github.com/MarcoGorelli/absolufy-imports
rev: v0.3.1
hooks:
- id: absolufy-imports
- repo: https://github.com/asottile/pyupgrade
rev: v2.37.1
rev: v2.37.3
hooks:
- id: pyupgrade
exclude: setup.py
1 change: 1 addition & 0 deletions .prettierignore
Expand Up @@ -7,6 +7,7 @@ result
result-*
docs/release_notes.md
docs/overrides/*.html
docs/api/expressions/top_level.md
docs/SUMMARY.md
site
ci/udf/CMakeFiles
91 changes: 91 additions & 0 deletions .releaserc.js
@@ -0,0 +1,91 @@
"use strict";

module.exports = {
branches: ["master"],
tagFormat: "${version}",
preset: "conventionalcommits",
plugins: [
[
"@semantic-release/commit-analyzer",
{
// deprecations are patch releases
releaseRules: [{ type: "depr", release: "patch" }],
preset: "conventionalcommits",
},
],
[
"@semantic-release/release-notes-generator",
{
preset: "conventionalcommits",
presetConfig: {
types: [
{ type: "feat", section: "Features" },
{ type: "fix", section: "Bug Fixes" },
{ type: "chore", hidden: true },
{ type: "docs", section: "Documentation" },
{ type: "style", hidden: true },
{ type: "refactor", hidden: true },
{ type: "perf", section: "Performance" },
{ type: "test", hidden: true },
{ type: "depr", section: "Deprecations" },
],
},
},
],
[
"@semantic-release/changelog",
{
changelogTitle: "Release Notes\n---",
changelogFile: "docs/release_notes.md",
},
],
[
"@google/semantic-release-replace-plugin",
{
replacements: [
{
files: ["ibis/__init__.py"],
from: '__version__ = ".*"',
to: '__version__ = "${nextRelease.version}"',
results: [
{
file: "ibis/__init__.py",
hasChanged: true,
numMatches: 1,
numReplacements: 1,
},
],
countMatches: true,
},
],
},
],
[
"@semantic-release/exec",
{
verifyConditionsCmd: "ci/release/verify.sh ${options.dryRun}",
prepareCmd: "ci/release/prepare.sh ${nextRelease.version}",
publishCmd: "ci/release/publish.sh",
},
],
[
"@semantic-release/github",
{
successComment: false,
assets: ["dist/*.whl"],
},
],
[
"@semantic-release/git",
{
assets: [
"pyproject.toml",
"docs/release_notes.md",
"setup.py",
"ibis/__init__.py",
],
message: "chore(release): ${nextRelease.version}",
},
],
],
};
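For reference, the `releaseRules` entry above makes `depr`-typed commits produce a patch release, and `presetConfig.types` controls which commit types appear in the generated notes. A rough sketch of that mapping in Python (illustrative only, not part of this change; hidden types resolve to `None`):

```python
import re

# Mirrors the visible sections configured in .releaserc.js above.
SECTIONS = {
    "feat": "Features",
    "fix": "Bug Fixes",
    "docs": "Documentation",
    "perf": "Performance",
    "depr": "Deprecations",
}

def section_for(headline: str):
    """Return the release-notes section for a conventional-commit headline."""
    match = re.match(r"^(?P<type>[a-z]+)(\([^)]*\))?!?:", headline)
    return SECTIONS.get(match.group("type")) if match else None

assert section_for("depr(api): deprecate an old argument") == "Deprecations"
assert section_for("chore: bump dependencies") is None
```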
64 changes: 0 additions & 64 deletions .releaserc.json

This file was deleted.

4 changes: 0 additions & 4 deletions .rgignore

This file was deleted.

19 changes: 0 additions & 19 deletions LICENSES/hdfscli.txt

This file was deleted.

1 change: 1 addition & 0 deletions README.md
Expand Up @@ -30,6 +30,7 @@ Ibis provides tools for interacting with the following systems:
- [Apache Impala](https://ibis-project.org/docs/latest/backends/Impala/)
- [Google BigQuery](https://github.com/ibis-project/ibis-bigquery)
- [ClickHouse](https://ibis-project.org/docs/latest/backends/ClickHouse/)
- [HeavyAI](https://github.com/heavyai/ibis-heavyai)
- [Dask](https://ibis-project.org/docs/latest/backends/Dask/)
- [DuckDB](https://ibis-project.org/docs/latest/backends/DuckDB/)
- [MySQL](https://ibis-project.org/docs/latest/backends/MySQL/)
19 changes: 11 additions & 8 deletions ci/release/dry_run.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env nix-shell
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure -p git nodejs nix -i bash
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure -p git jq nodejs nix -i bash
# shellcheck shell=bash

set -euo pipefail
Expand All @@ -12,7 +12,7 @@ git worktree add "$worktree"

function cleanup() {
cd "$curdir" || exit 1
git worktree remove "$worktree"
git worktree remove --force "$worktree"
git worktree prune
git branch -D "$branch"
}
Expand All @@ -21,6 +21,15 @@ trap cleanup EXIT ERR

cd "$worktree" || exit 1

node <<< 'console.log(JSON.stringify(require("./.releaserc.js")))' |
jq '.plugins |= [.[] | select(.[0] != "@semantic-release/github")]' > .releaserc.json

git rm .releaserc.js

git add .releaserc.json

git commit -m 'test: semantic-release dry run' --no-verify --no-gpg-sign

npx --yes \
-p semantic-release \
-p "@semantic-release/commit-analyzer" \
Expand All @@ -33,11 +42,5 @@ npx --yes \
semantic-release \
--ci \
--dry-run \
--preset conventionalcommits \
--plugins \
--analyze-commits "@semantic-release/commit-analyzer" \
--generate-notes "@semantic-release/release-notes-generator" \
--verify-conditions "@semantic-release/changelog,@semantic-release/exec,@semantic-release/git" \
--prepare "@semantic-release/changelog,@semantic-release/exec,@google/semantic-release-replace-plugin" \
--branches "$branch" \
--repository-url "file://$PWD"
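The `node` + `jq` pipeline near the top of this script serializes `.releaserc.js` to JSON and drops the `@semantic-release/github` plugin before the dry run. A rough Python equivalent of that filtering step, assuming the config has already been written to `.releaserc.json`:

```python
import json

# Sketch only: remove the GitHub plugin entry from a serialized config.
with open(".releaserc.json") as f:
    config = json.load(f)

config["plugins"] = [
    plugin
    for plugin in config["plugins"]
    if not (isinstance(plugin, list) and plugin[0] == "@semantic-release/github")
]

with open(".releaserc.json", "w") as f:
    json.dump(config, f, indent=2)
```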
16 changes: 11 additions & 5 deletions ci/release/verify.sh
@@ -1,15 +1,21 @@
#!/usr/bin/env nix-shell
#!nix-shell --pure --keep POETRY_PYPI_TOKEN_PYPI -p poetry -p git -i bash
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure --keep POETRY_PYPI_TOKEN_PYPI -p dyff git poetry yj -i bash
# shellcheck shell=bash

set -euo pipefail

# verify TOML is sane
dry_run="${1:-false}"

# verify pyproject.toml
poetry check

# verify that the lock file is up to date
PYTHONHASHSEED=42 poetry lock --no-update
git diff --exit-code poetry.lock
#
# go through the rigamarole of yj and dyff because poetry is sensitive to
# PYTHONHASHSEED
bash ./dev/lockfile_diff.sh

# verify that we have a token available to push to pypi using set -u
: "${POETRY_PYPI_TOKEN_PYPI}"
if [ "${dry_run}" = "false" ]; then
: "${POETRY_PYPI_TOKEN_PYPI}"
fi
302 changes: 152 additions & 150 deletions conda-lock/linux-64-3.10.lock

Large diffs are not rendered by default.

274 changes: 138 additions & 136 deletions conda-lock/linux-64-3.8.lock

Large diffs are not rendered by default.

274 changes: 138 additions & 136 deletions conda-lock/linux-64-3.9.lock

Large diffs are not rendered by default.

314 changes: 158 additions & 156 deletions conda-lock/osx-64-3.10.lock

Large diffs are not rendered by default.

286 changes: 144 additions & 142 deletions conda-lock/osx-64-3.8.lock

Large diffs are not rendered by default.

286 changes: 144 additions & 142 deletions conda-lock/osx-64-3.9.lock

Large diffs are not rendered by default.

273 changes: 137 additions & 136 deletions conda-lock/win-64-3.10.lock

Large diffs are not rendered by default.

323 changes: 162 additions & 161 deletions conda-lock/win-64-3.8.lock

Large diffs are not rendered by default.

325 changes: 163 additions & 162 deletions conda-lock/win-64-3.9.lock

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions dev/lockfile_diff.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env nix-shell
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure -p dyff git poetry yj -i bash
# shellcheck shell=bash

set -euo pipefail

old="$(mktemp --suffix=".yaml")"
new="$(mktemp --suffix=".yaml")"

# verify that the lock file is up to date
#
# go through the rigamarole of yj and dyff because poetry is sensitive to
# PYTHONHASHSEED
yj -ty < poetry.lock > "$old"
PYTHONHASHSEED=0 poetry lock --no-update
yj -ty < poetry.lock > "$new"

if ! dyff between "$old" "$new" --ignore-order-changes --omit-header --set-exit-code; then
git checkout poetry.lock
exit 1
fi
5 changes: 3 additions & 2 deletions dev/poetry2setup
@@ -1,5 +1,6 @@
#!/usr/bin/env nix-shell
#!nix-shell --pure -p python3Packages.black -p python3Packages.poetry-core -p bash -i bash
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure -p python3Packages.black -p python3Packages.poetry-core -p bash -i bash
# shellcheck shell=bash
# vim: filetype=sh

set -euo pipefail
Expand All @@ -11,4 +12,4 @@ dir="$(readlink -f "$(dirname "$0")")"
# Because the `extras` data structure in poetry is a frozenset and therefore
# arbitrarily ordered, regenerating setup.py without a fixed hash seed can
# cause unnecessary reordering of extras.
PYTHONHASHSEED=42 python "$dir/poetry2setup.py" "$@"
PYTHONHASHSEED=0 python "$dir/poetry2setup.py" "$@"
14 changes: 14 additions & 0 deletions dev/update-lock-files.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env nix-shell
#!nix-shell -I nixpkgs=channel:nixos-unstable-small --pure -p poetry nix -i bash
# shellcheck shell=bash
set -euo pipefail

export PYTHONHASHSEED=0

TOP="${1:-$(dirname "$(dirname "$(readlink -f "$0")")")}"

pushd "${TOP}" > /dev/null || exit 1
poetry lock --no-update
poetry export --dev --without-hashes --no-ansi --extras all > "${TOP}/requirements.txt"
"${TOP}/dev/poetry2setup" -o "${TOP}/setup.py"
popd > /dev/null || exit 1
6 changes: 3 additions & 3 deletions docker-compose.yml
@@ -1,7 +1,7 @@
version: "3.4"
services:
clickhouse:
image: clickhouse/clickhouse-server:22-alpine
image: clickhouse/clickhouse-server:22.8.5.29-alpine
ports:
- 8123:8123
- 9000:9000
Expand Down Expand Up @@ -48,7 +48,7 @@ services:
- CMD
- pg_isready
timeout: 5s
image: postgres:13.7-alpine
image: postgres:13.8-alpine
networks:
- impala
kudu:
Expand Down Expand Up @@ -89,7 +89,7 @@ services:
- mysqladmin
- ping
timeout: 5s
image: mariadb:10.8
image: mariadb:10.9.2
ports:
- 3306:3306
networks:
2 changes: 1 addition & 1 deletion docker/postgres/Dockerfile
@@ -1,2 +1,2 @@
FROM postgis/postgis:14-3.2-alpine
RUN apk add postgresql14-plpython3 postgresql14-jit
RUN apk add postgresql14-plpython3
50 changes: 50 additions & 0 deletions docs/CODE_OF_CONDUCT.md
@@ -0,0 +1,50 @@
# Code of Conduct

## Summary

Ibis is governed by the
[NumFOCUS code of conduct](https://numfocus.org/code-of-conduct):

> Be kind to others. Do not insult or put down others. Behave professionally.
> Remember that harassment and sexist, racist, or exclusionary jokes are not
> appropriate for Ibis.
>
> All communication should be appropriate for a professional audience including
> people of many different backgrounds. Sexual language and imagery is not
> appropriate.
>
> Ibis is dedicated to providing a harassment-free
> community for everyone, regardless of gender, sexual orientation, gender
> identity, and expression, disability, physical appearance, body size, race,
> or religion. We do not tolerate harassment of community members in any form.
>
> Thank you for helping make this a welcoming, friendly community for all.
## Reporting and Enforcement of Violations

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the Ibis Code of Conduct committee at
ibis-conduct@googlegroups.com. You can also report violations using the online form.

The committee currently consists of:

- Phillip Cloud
- Wes McKinney
- Krisztián Szűcs
- Jeff Reback

All complaints will be reviewed and investigated and will result in a response
that is deemed necessary and appropriate to the circumstances. The committee is
obligated to maintain confidentiality with regard to the reporter of an
incident. In addition, the online form allows you to submit a report
anonymously. Further details of specific enforcement policies may be posted
separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

Parts of this CoC are adapted from the [Dask code of
conduct](https://github.com/dask/governance/blob/main/code-of-conduct.md).
11 changes: 11 additions & 0 deletions docs/CONTRIBUTING.md
@@ -0,0 +1,11 @@
# Contributing to Ibis

We love new contributors!

To get started:

1. [Set up a development environment](https://ibis-project.org/docs/latest/contribute/01_environment/)
1. [Learn about the commit workflow](https://ibis-project.org/docs/latest/contribute/02_workflow/)
1. [Review the code style guidelines](https://ibis-project.org/docs/latest/contribute/03_style/)
1. [Learn how to run the backend test suite](https://ibis-project.org/docs/latest/contribute/04_backend_tests/)
1. [Dig into the nitty gritty of being a maintainer](https://ibis-project.org/docs/latest/contribute/05_maintainers_guide/)
5 changes: 4 additions & 1 deletion docs/SUMMARY.md
@@ -1,4 +1,5 @@
* [Home](index.md)
* [Blog](blog/)
* Tutorial
* [Introduction to Ibis](tutorial/01-Introduction-to-Ibis.ipynb)
* [Aggregating and Joining](tutorial/02-Aggregates-Joins.ipynb)
Expand All @@ -11,9 +12,10 @@
* [Ibis for SQL Programmers](ibis-for-sql-programmers.ipynb)
* [User Guide](user_guide/)
* [Execution Backends](backends/)
* [How To Guide](how_to/)
* [Contribute](contribute/)
* [Code of Conduct](CODE_OF_CONDUCT.md)
* Community
* [Blog](blog/)
* [About](about/)
* [Ask a question (StackOverflow)](https://stackoverflow.com/questions/tagged/ibis)
* [Chat (Gitter)](https://gitter.im/ibis-dev/Lobby)
Expand All @@ -30,5 +32,6 @@
* [Collections](api/expressions/collections.md)
* [Geospatial](api/expressions/geospatial.md)
* [Data Types](api/datatypes.md)
* [Schemas](api/schemas.md)
* [Backend Interfaces](api/backends/)
* [Configuration](api/config.md)
6 changes: 3 additions & 3 deletions docs/api/datatypes.md
@@ -1,11 +1,11 @@
# Data Types

This module contains classes for handling the different storage types that
This module contains classes for handling the different logical types that
occur in databases.

<!-- prettier-ignore-start -->
All data type constructors take a `nullable: bool` parameter whose default
value is [`True`][True].

::: ibis.expr.datatypes
<!-- prettier-ignore-end -->

::: ibis.expr.datatypes.core
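A quick sketch of the `nullable` parameter described above (`dt.int64` is the prebuilt nullable singleton; the other lines use the constructors directly):

```python
import ibis.expr.datatypes as dt

dt.int64                  # nullable int64 singleton
dt.Int64(nullable=False)  # non-nullable variant
dt.Array(dt.string)       # parametric types nest other data types
```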
15 changes: 4 additions & 11 deletions docs/api/expressions/collections.md
@@ -1,14 +1,7 @@
# Complex Type Expressions

These APIs are available on complex data types such as arrays, maps, and
structs.
These APIs are available on arrays, maps and structs.

::: ibis.expr.types.arrays
::: ibis.expr.types.maps

<!-- prettier-ignore-start -->
::: ibis.expr.types.structs
selection:
filters:
- "!^Destruct.*"
<!-- prettier-ignore-end -->
::: ibis.expr.types.arrays.ArrayValue
::: ibis.expr.types.structs.StructValue
::: ibis.expr.types.maps.MapValue
10 changes: 3 additions & 7 deletions docs/api/expressions/generic.md
Expand Up @@ -2,10 +2,6 @@

These expressions are available on scalars and columns of any element type.

<!-- prettier-ignore-start -->
::: ibis.expr.types.generic
selection:
filters:
- "!^literal"
- "!^null"
<!-- prettier-ignore-end -->
::: ibis.expr.types.generic.Value
::: ibis.expr.types.generic.Column
::: ibis.expr.types.generic.Scalar
3 changes: 2 additions & 1 deletion docs/api/expressions/geospatial.md
Expand Up @@ -2,4 +2,5 @@

Ibis supports the following geospatial expression APIs

::: ibis.expr.types.geospatial
::: ibis.expr.types.geospatial.GeoSpatialValue
::: ibis.expr.types.geospatial.GeoSpatialColumn
15 changes: 12 additions & 3 deletions docs/api/expressions/numeric.md
@@ -1,6 +1,15 @@
# Numeric Expressions
# Numeric and Boolean Expressions

These APIs are available on numeric and boolean expressions.

::: ibis.expr.types.numeric
::: ibis.expr.types.logical
::: ibis.expr.types.numeric.NumericValue
::: ibis.expr.types.numeric.NumericColumn

::: ibis.expr.types.numeric.IntegerValue
::: ibis.expr.types.numeric.IntegerColumn

::: ibis.expr.types.numeric.FloatingValue

::: ibis.expr.types.numeric.DecimalValue

::: ibis.expr.types.logical.BooleanValue
2 changes: 1 addition & 1 deletion docs/api/expressions/strings.md
Expand Up @@ -2,4 +2,4 @@

All string operations are valid for both scalars and columns.

::: ibis.expr.types.strings
::: ibis.expr.types.strings.StringValue
4 changes: 2 additions & 2 deletions docs/api/expressions/tables.md
Expand Up @@ -2,5 +2,5 @@

Table expressions form the basis for most Ibis expressions.

::: ibis.expr.types.relations
::: ibis.expr.types.groupby
::: ibis.expr.types.relations.Table
::: ibis.expr.types.groupby.GroupedTable
8 changes: 6 additions & 2 deletions docs/api/expressions/timestamps.md
@@ -1,5 +1,9 @@
# Temporal Expression APIs

All timestamp operations are valid for both scalars and columns.
All temporal operations are valid for both scalars and columns.

::: ibis.expr.types.temporal
::: ibis.expr.types.temporal.TemporalValue
::: ibis.expr.types.temporal.TimestampValue
::: ibis.expr.types.temporal.DateValue
::: ibis.expr.types.temporal.TimeValue
::: ibis.expr.types.temporal.IntervalValue
8 changes: 8 additions & 0 deletions docs/api/expressions/top_level.md
Expand Up @@ -6,22 +6,29 @@ These methods and objects are available directly in the `ibis` module.

`NA` is the null scalar.

::: ibis.and_
::: ibis.array
::: ibis.asc
::: ibis.case
::: ibis.coalesce
::: ibis.cumulative_window
::: ibis.date
::: ibis.desc
::: ibis.difference
::: ibis.greatest
::: ibis.ifelse
::: ibis.intersect
::: ibis.interval
::: ibis.least
::: ibis.literal
::: ibis.map
::: ibis.negate
::: ibis.now
::: ibis.null
::: ibis.or_
::: ibis.param
::: ibis.show_sql
::: ibis.to_sql
::: ibis.random
::: ibis.range_window
::: ibis.row_number
Expand All @@ -32,5 +39,6 @@ These methods and objects are available directly in the `ibis` module.
::: ibis.timestamp
::: ibis.trailing_range_window
::: ibis.trailing_window
::: ibis.union
::: ibis.where
::: ibis.window
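A small sketch exercising a couple of the helpers listed above; the table and column names are made up:

```python
import ibis

t = ibis.table([("a", "int64"), ("b", "string")], name="t")
pred = ibis.and_(t.a > 1, t.b == "x")  # combine predicates into one boolean expression
expr = t.mutate(label=ibis.ifelse(pred, "keep", "drop"))
```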
5 changes: 5 additions & 0 deletions docs/api/schemas.md
@@ -0,0 +1,5 @@
# Schemas

This module contains APIs for interacting with table schemas.

::: ibis.expr.schema.Schema
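For example, a schema can be constructed from name/type pairs (a minimal sketch):

```python
import ibis

schema = ibis.schema([("user_id", "int64"), ("name", "string")])
schema.names  # the column names
schema.types  # the corresponding ibis data types
```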
8 changes: 8 additions & 0 deletions docs/backends/DuckDB.md
Expand Up @@ -5,6 +5,14 @@ backend_module: duckdb
backend_param_style: a path to a DuckDB database
backend_connection_example: ibis.duckdb.connect("path/to/my.duckdb")
development_only: false
intro: |
!!! danger "`duckdb` >= 0.5.0 requires `duckdb-engine` >= 0.6.2"
If you encounter problems when using `duckdb` >= **0.5.0** you may need to
upgrade `duckdb-engine` to at least version **0.6.2**.
See [this issue](https://github.com/ibis-project/ibis/issues/4503) for
more details.
---

{% include 'backends/template.md' %}
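One way to check the installed versions before chasing the issue referenced above (a sketch; both packages must be installed for this to run):

```python
import importlib.metadata as md

print("duckdb:", md.version("duckdb"))
print("duckdb-engine:", md.version("duckdb-engine"))
```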
2 changes: 1 addition & 1 deletion docs/blog/Ibis-version-3.0.0-release.md
@@ -1,6 +1,6 @@
# Ibis v3.0.0

#### by: Marlene Mhangami
**by Marlene Mhangami**

The latest version of Ibis, version 3.0.0, has just been released! This post highlights some of the new features, breaking changes, and performance improvements that come with the new release. 3.0.0 is a major release and includes more changes than those listed in this post. A full list of the changes can be found in the project release notes [here](https://ibis-project.org/docs/dev/release_notes/).

2 changes: 1 addition & 1 deletion docs/blog/Ibis-version-3.1.0-release.md
@@ -1,6 +1,6 @@
# Ibis v3.1.0

Marlene Mhangami
**by Marlene Mhangami**

25 July 2022

376 changes: 376 additions & 0 deletions docs/blog/ffill-and-bfill-using-ibis.md

Large diffs are not rendered by default.

21 changes: 0 additions & 21 deletions docs/community/coc.md

This file was deleted.

2 changes: 1 addition & 1 deletion docs/contribute/05_maintainers_guide.md
Expand Up @@ -60,7 +60,7 @@ cases contributors do not have to remember to generate and commit these files.
Run the following command

```sh
PYTHONHASHSEED=42 python ./dev/poetry2setup.py -o setup.py
PYTHONHASHSEED=0 python ./dev/poetry2setup.py -o setup.py
```

!!! question "Why do we need to set `PYTHONHASHSEED`?"
112 changes: 112 additions & 0 deletions docs/how_to/duckdb_register.md
@@ -0,0 +1,112 @@
# Use `register` to load external data files with the DuckDB backend

<!-- prettier-ignore-start -->
Here we use the [`register`][ibis.backends.duckdb.Backend.register] method to load external data files and join them.
<!-- prettier-ignore-end -->

We're going to download one month of [NYC Taxi
data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) in
`parquet` format, and also download the "Taxi Zone Lookup Table", which is a `csv`:

https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet
https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv

Create an in-memory DuckDB connection via `ibis`

```python
>>> import ibis
>>> con = ibis.duckdb.connect() # in-memory database
>>> con.list_tables()
[]
```

Now we call `register` with the filepath (the `table_name` argument is optional;
if it isn't specified, Ibis will use the filename minus the extension):

```python
>>> con.register("taxi+_zone_lookup.csv", table_name="taxi_zone_lookup")
AlchemyTable: taxi+_zone_lookup
LocationID int32
Borough string
Zone string
service_zone string

>>> con.register("green_tripdata_2022-01.parquet", table_name="tripdata")
AlchemyTable: green_tripdata_2022_01
VendorID int64
lpep_pickup_datetime timestamp
lpep_dropoff_datetime timestamp
store_and_fwd_flag string
RatecodeID float64
PULocationID int64
DOLocationID int64
passenger_count float64
trip_distance float64
fare_amount float64
extra float64
mta_tax float64
tip_amount float64
tolls_amount float64
ehail_fee int32
improvement_surcharge float64
total_amount float64
payment_type float64
trip_type float64
congestion_surcharge float64
>>> con.list_tables()
['tripdata', 'taxi_zone_lookup']
```

We now have a schema parsed from each file, and the corresponding tables (which
are actually lazily loaded `views`) are available.

Now we can interact with these tables just like a table or view in any backend
connection:

```python
>>> lookup = con.table("taxi_zone_lookup")
>>> tripdata = con.table("tripdata")

>>> tripdata.columns
['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge']

>>> lookup.columns
['LocationID', 'Borough', 'Zone', 'service_zone']
```

We can grab a small subset of the `tripdata` columns and then join them to the
`lookup` table to get human-readable values for the pickup locations:

```python
>>> ibis.options.interactive = True

>>> tripdata = tripdata[["lpep_pickup_datetime", "PULocationID"]]

>>> tripdata.head()
lpep_pickup_datetime PULocationID
0 2022-01-01 00:14:21 42
1 2022-01-01 00:20:55 116
2 2022-01-01 00:57:02 41
3 2022-01-01 00:07:42 181
4 2022-01-01 00:07:50 33

>>> tripdata.join(lookup, tripdata.PULocationID == lookup.LocationID).head()
lpep_pickup_datetime PULocationID LocationID Borough Zone service_zone
0 2022-01-01 00:14:21 42 42 Manhattan Central Harlem North Boro Zone
1 2022-01-01 00:20:55 116 116 Manhattan Hamilton Heights Boro Zone
2 2022-01-01 00:57:02 41 41 Manhattan Central Harlem Boro Zone
3 2022-01-01 00:07:42 181 181 Brooklyn Park Slope Boro Zone
4 2022-01-01 00:07:50 33 33 Brooklyn Brooklyn Heights Boro Zone
```

That's it!

Ibis+duckdb currently supports registering `parquet`, `csv`, and `csv.gz`.

You can pass in the filename and the filetype will be inferred from the extension, or you can pass it explicitly using a file URI, e.g.

```python
con.register("csv://some_csv_file_without_an_extension")
con.register("csv.gz://a_compressed_csv_file.csv")
con.register("parquet://a_parquet_file_with_truncated_extension.parq")
```
87 changes: 87 additions & 0 deletions docs/how_to/ffill_bfill_w_window.md
@@ -0,0 +1,87 @@
# How to `ffill` and `bfill` using Window Functions

If you have gaps in your data and need to fill them in using a simple forward fill
(given an order, null values are replaced by the preceding value) or backward fill
(given an order, null values are replaced by the following value), then you can do this in Ibis:

=== "`ffill`"

~~~python
# Create a window that orders your series, default ascending
win = ibis.window(order_by=data.measured_on, following=0)
# Create a grouping that is a rolling count of non-null values
# This creates a partition where each set has no more than one non-null value
grouped = data.mutate(grouper=data.measurement.count().over(win))
# Group by your newly-created grouping and, in each set,
# set all values to the one non-null value in that set (if it exists)
result = (
grouped
.group_by([grouped.grouper])
.mutate(ffill=grouped.measurement.max())
)
# execute to get a pandas dataframe, sort values in case your backend shuffles
result.execute().sort_values(by=['measured_on'])
~~~

=== "`bfill`"

~~~python
# Create a window that orders your series (use ibis.desc to get descending order)
win = ibis.window(order_by=ibis.desc(data.measured_on), following=0)
# Create a grouping that is a rolling count of non-null values
# This creates a partition where each set has no more than one non-null value
grouped = data.mutate(grouper=data.measurement.count().over(win))
# Group by your newly-created grouping and, in each set,
# set all values to the one non-null value in that set (if it exists)
result = (
grouped
.group_by([grouped.grouper])
.mutate(ffill=grouped.measurement.max())
)
# execute to get a pandas dataframe, sort values in case your backend shuffles
result.execute().sort_values(by=['measured_on'])
~~~
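The snippets above (and the partitioned variants below) assume a table expression named `data` with `measured_on`, `measurement`, and `event_id` columns. A minimal sketch of building such a table with the pandas backend; the values here are made up:

```python
import pandas as pd
import ibis

df = pd.DataFrame(
    {
        "event_id": [1, 1, 1, 2, 2],
        "measured_on": pd.to_datetime(
            ["2021-06-01", "2021-06-02", "2021-06-03", "2021-06-01", "2021-06-02"]
        ),
        "measurement": [5.0, None, None, 7.0, None],
    }
)

con = ibis.pandas.connect({"observations": df})
data = con.table("observations")
```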

If you have an event partition, which means there's another segment you need to consider
for your ffill or bfill operations, you can do this as well:

=== "`ffill` with event partition"

~~~python
# Group your data by your event partition and then order your series (default ascending)
win = ibis.window(group_by=data.event_id, order_by=data.measured_on, following=0)
# Create a grouping that is a rolling count of non-null values within each event
# This creates a partition where each set has no more than one non-null value
grouped = data.mutate(grouper=data.measurement.count().over(win))
# Group by your newly-created grouping and, in each set,
# set all values to the one non-null value in that set (if it exists)
result = (
grouped
.group_by([grouped.event_id, grouped.grouper])
.mutate(ffill=grouped.measurement.max())
)
# execute to get a pandas dataframe, sort values in case your backend shuffles
result.execute().sort_values(by=['event_id', 'measured_on'])
~~~

=== "`bfill` with event partition"

~~~python
# Group your data by your event partition and then order your series (use ibis.desc for desc)
win = ibis.window(group_by=data.event_id, order_by=ibis.desc(data.measured_on), following=0)
# Create a grouping that is a rolling count of non-null values within each event
# This creates a partition where each set has no more than one non-null value
grouped = data.mutate(grouper=data.measurement.count().over(win))
# Group by your newly-created grouping and, in each set,
# set all values to the one non-null value in that set (if it exists)
result = (
grouped
.group_by([grouped.event_id, grouped.grouper])
.mutate(ffill=grouped.measurement.max())
)
# execute to get a pandas dataframe, sort values in case your backend shuffles
result.execute().sort_values(by=['event_id', 'measured_on'])
~~~

We wrote a deeper dive into how this works on the ibis-project blog
[here](../blog/ffill-and-bfill-using-ibis.md).
37 changes: 37 additions & 0 deletions docs/how_to/topk.md
@@ -0,0 +1,37 @@
# Compute the Top K Records

<!-- prettier-ignore-start -->
Here we use the [`topk`][ibis.expr.types.Column.topk] method to compute the top
5 customers for some generated TPC-H data by:
<!-- prettier-ignore-end -->

- count (the default)
- sum of order totals

```python
>>> import ibis

>>> ibis.options.interactive = True

>>> con = ibis.duckdb.connect() # in-memory duckdb

>>> con.raw_sql("CALL dbgen(sf=0.1)")

>>> orders = con.table("orders")

>>> orders.o_custkey.topk(5) # top 5 most frequent customers
o_custkey count
0 11998 36
1 8761 36
2 3151 35
3 388 35
4 8362 35

>>> orders.o_custkey.topk(5, by=orders.o_totalprice.sum()) # top 5 largest spending customers
o_custkey sum
0 8362 5793605.05
1 6958 5370682.19
2 9454 5354381.81
3 346 5323350.43
4 10354 5227957.24
```
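`topk` is essentially shorthand for a group-by aggregation followed by a sort and a limit. A rough hand-rolled equivalent of the second query above (output omitted):

```python
>>> (
...     orders.group_by("o_custkey")
...     .aggregate(sum=orders.o_totalprice.sum())
...     .sort_by(ibis.desc("sum"))
...     .limit(5)
... )
```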
135 changes: 135 additions & 0 deletions docs/release_notes.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tutorial/01-Introduction-to-Ibis.ipynb

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tutorial/04-More-Value-Expressions.ipynb

Some generated files are not rendered by default.

88 changes: 0 additions & 88 deletions docs/user_guide/topk.md

This file was deleted.

20 changes: 0 additions & 20 deletions docs/user_guide/udfs.md

This file was deleted.

30 changes: 11 additions & 19 deletions ibis/__init__.py
@@ -1,8 +1,6 @@
"""Initialize Ibis module."""
from __future__ import annotations

import importlib.metadata as _importlib_metadata

# Converting an Ibis schema to a pandas DataFrame requires registering
# some type conversions that are currently registered in the pandas backend
import ibis.backends.pandas
Expand All @@ -18,23 +16,15 @@
__all__ = ['api', 'ir', 'util', 'BaseBackend', 'IbisError', 'options']
__all__ += api.__all__

__version__ = "3.1.0"

__version__ = "3.2.0"

def _get_backend_entrypoints() -> list[_importlib_metadata.EntryPoint]:
"""Get the list of installed `ibis.backend` entrypoints"""
import sys

if sys.version_info < (3, 10):
return list(_importlib_metadata.entry_points()['ibis.backends'])
else:
return list(_importlib_metadata.entry_points(group="ibis.backends"))
_KNOWN_BACKENDS = ['bigquery', 'heavyai']


def __dir__() -> list[str]:
"""Adds tab completion for ibis backends to the top-level module"""
out = set(__all__)
out.update(ep.name for ep in _get_backend_entrypoints())
out.update(ep.name for ep in util.backend_entry_points())
return sorted(out)


Expand All @@ -53,14 +43,16 @@ def __getattr__(name: str) -> BaseBackend:
the `ibis.backends` entrypoints. If successful, the `ibis.sqlite`
attribute is "cached", so this function is only called the first time.
"""
entry_points = {ep for ep in _get_backend_entrypoints() if ep.name == name}
entry_points = {
ep for ep in util.backend_entry_points() if ep.name == name
}

if not entry_points:
raise AttributeError(
f"module 'ibis' has no attribute '{name}'. "
f"If you are trying to access the '{name}' backend, "
f"try installing it first with `pip install ibis-{name}`"
)
msg = f"module 'ibis' has no attribute '{name}'. "
if name in _KNOWN_BACKENDS:
msg += f"""If you are trying to access the '{name}' backend,
try installing it first with `pip install ibis-{name}`"""
raise AttributeError(msg)

if len(entry_points) > 1:
raise RuntimeError(
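A quick sketch of how the lazy backend lookup above behaves from a user's perspective (what you see depends on which backend packages are installed in your environment):

```python
import ibis

print("duckdb" in dir(ibis))  # True when the duckdb entry point is installed

try:
    ibis.heavyai  # resolved lazily through the `ibis.backends` entry points
except AttributeError as e:
    # for known third-party backends the message suggests `pip install ibis-heavyai`
    print(e)
```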
91 changes: 69 additions & 22 deletions ibis/backends/base/__init__.py
Expand Up @@ -3,8 +3,11 @@
import abc
import collections.abc
import functools
import importlib.metadata
import keyword
import re
import sys
import urllib.parse
from pathlib import Path
from typing import (
TYPE_CHECKING,
Expand All @@ -14,6 +17,7 @@
Iterable,
Iterator,
Mapping,
MutableMapping,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -277,6 +281,16 @@ def connect(self, *args, **kwargs) -> BaseBackend:
new_backend.reconnect()
return new_backend

def _from_url(self, url: str) -> BaseBackend:
"""Construct an ibis backend from a SQLAlchemy-conforming URL."""
raise NotImplementedError(
f"`_from_url` not implemented for the {self.name} backend"
)

@staticmethod
def _convert_kwargs(kwargs: MutableMapping) -> None:
"""Manipulate keyword arguments to `.connect` method."""

def reconnect(self) -> None:
"""Reconnect to the database already configured with connect."""
self.do_connect(*self._con_args, **self._con_kwargs)
Expand Down Expand Up @@ -672,21 +686,63 @@ def has_operation(cls, operation: type[ops.Value]) -> bool:
_connect = RegexDispatcher("_connect")


@_connect.register(r"(?P<backend>.+)://(?P<path>.*)", priority=10)
def _(_: str, *, backend: str, path: str, **kwargs: Any) -> BaseBackend:
@functools.lru_cache(maxsize=None)
def _get_backend_names() -> frozenset[str]:
"""Return the set of known backend names.
Notes
-----
This function returns a frozenset to prevent cache pollution.
If a `set` is used, then any in-place modifications to the set
are visible to every caller of this function.
"""

if sys.version_info < (3, 10):
entrypoints = importlib.metadata.entry_points()["ibis.backends"]
else:
entrypoints = importlib.metadata.entry_points(group="ibis.backends")
return frozenset(ep.name for ep in entrypoints)


_PATTERN = "|".join(
sorted(_get_backend_names().difference(("duckdb", "sqlite", "pyspark")))
)


@_connect.register(rf"(?P<backend>{_PATTERN})://.+", priority=12)
def _(url: str, *, backend: str, **kwargs: Any) -> BaseBackend:
"""Connect to given `backend` with `path`.
Examples
--------
>>> con = ibis.connect("duckdb://relative/path/to/data.db")
>>> con = ibis.connect("postgres://user:pass@hostname:port/database")
>>> con = ibis.connect("mysql://user:pass@hostname:port/database")
"""
instance = getattr(ibis, backend)
instance: BaseBackend = getattr(ibis, backend)
backend += (backend == "postgres") * "ql"
try:
return instance.connect(url=f"{backend}://{path}", **kwargs)
except TypeError:
return instance.connect(path, **kwargs)
params = "?" * bool(kwargs) + urllib.parse.urlencode(kwargs)
url += params
return instance._from_url(url)


@_connect.register(
r"(?P<backend>duckdb|sqlite|pyspark)://(?P<path>.*)",
priority=12,
)
def _(_: str, *, backend: str, path: str, **kwargs: Any) -> BaseBackend:
"""Connect to given `backend` with `path`.
Examples
--------
>>> con = ibis.connect("duckdb://relative/path/to/data.db")
>>> con = ibis.connect("sqlite:///absolute/path/to/data.db")
"""
instance: BaseBackend = getattr(ibis, backend)
params = "?" * bool(kwargs) + urllib.parse.urlencode(kwargs)
path += params
# extra slash for sqlalchemy
return instance._from_url(f"{backend}:///{path}")


@_connect.register(r"file://(?P<path>.*)", priority=10)
Expand Down Expand Up @@ -716,7 +772,7 @@ def connect(resource: Path | str, **_: Any) -> BaseBackend:
Examples
--------
>>> con = ibis.connect("duckdb://relative/path/to/data.db")
>>> con = ibis.connect("duckdb:///absolute/path/to/data.db")
>>> con = ibis.connect("relative/path/to/data.duckdb")
"""
raise NotImplementedError(type(resource))
Expand Down Expand Up @@ -752,29 +808,20 @@ def _(
Examples
--------
>>> con = ibis.connect("duckdb://relative/path/to/data.csv")
>>> con = ibis.connect("duckdb://relative/path/to/more/data.parquet")
>>> con = ibis.connect("duckdb:///absolute/path/to/more/data.parquet")
"""
con = getattr(ibis, backend).connect(**kwargs)
con.register(f"{extension}://{filename}")
return con


@_connect.register(
r"(?P<filename>.+\.(?P<extension>parquet|csv))",
priority=8,
)
def _(
_: str,
*,
filename: str,
extension: str,
**kwargs: Any,
) -> BaseBackend:
@_connect.register(r".+\.(?:parquet|csv)", priority=8)
def _(filename: str, **kwargs: Any) -> BaseBackend:
"""Connect to `duckdb` and register a parquet or csv file.
Examples
--------
>>> con = ibis.connect("relative/path/to/data.csv")
>>> con = ibis.connect("relative/path/to/more/data.parquet")
"""
return _connect(f"duckdb://{filename}", **kwargs)
return _connect(f"duckdb:///{filename}", **kwargs)
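Taken together, the dispatch rules above give `ibis.connect` several accepted forms; a sketch of each (hosts, credentials, and paths are placeholders, and the relevant backend drivers must be installed):

```python
import ibis

# SQLAlchemy-style URLs are routed to the named backend's `_from_url`
con = ibis.connect("postgres://user:pass@localhost:5432/ibis_testing")

# duckdb/sqlite/pyspark URLs carry a filesystem path
con = ibis.connect("duckdb://relative/path/to/data.db")

# bare parquet/csv paths fall through to duckdb + register
con = ibis.connect("relative/path/to/data.parquet")
```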
52 changes: 50 additions & 2 deletions ibis/backends/base/sql/__init__.py
Expand Up @@ -5,6 +5,9 @@
from functools import lru_cache
from typing import Any, Mapping

import sqlalchemy as sa

import ibis.expr.lineage as lin
import ibis.expr.operations as ops
import ibis.expr.schema as sch
import ibis.expr.types as ir
Expand All @@ -18,13 +21,45 @@
]


def _find_memtables(expr):
op = expr.op()
return lin.proceed, op if isinstance(op, ops.InMemoryTable) else None


class BaseSQLBackend(BaseBackend):
"""Base backend class for backends that compile to SQL."""

compiler = Compiler
table_class = ops.DatabaseTable
table_expr_class = ir.Table

def _from_url(self, url: str) -> BaseBackend:
"""Connect to a backend using a URL `url`.
Parameters
----------
url
URL with which to connect to a backend.
Returns
-------
BaseBackend
A backend instance
"""
url = sa.engine.make_url(url)

kwargs = {
name: value
for name in ("host", "port", "database", "password")
if (value := getattr(url, name, None))
}
if username := url.username:
kwargs["user"] = username

kwargs.update(url.query)
self._convert_kwargs(kwargs)
return self.connect(**kwargs)

def table(self, name: str, database: str | None = None) -> ir.Table:
"""Construct a table expression.
Expand Down Expand Up @@ -73,7 +108,7 @@ def _get_schema_using_query(self, query):
f"Backend {self.name} does not support .sql()"
)

def raw_sql(self, query: str, results: bool = False) -> Any:
def raw_sql(self, query: str) -> Any:
"""Execute a query string.
Could have unexpected results if the query modifies the behavior of
Expand Down Expand Up @@ -149,6 +184,10 @@ def execute(

schema = self.ast_schema(query_ast, **kwargs)

# register all in memory tables if the backend supports cheap access
# to them
self._register_in_memory_tables(expr)

with self._safe_raw_sql(sql, **kwargs) as cursor:
result = self.fetch_from_cursor(cursor, schema)

Expand All @@ -157,6 +196,14 @@ def execute(

return result

def _register_in_memory_table(self, table_op):
raise NotImplementedError

def _register_in_memory_tables(self, expr):
if self.compiler.cheap_in_memory_tables:
for memtable in lin.traverse(_find_memtables, expr):
self._register_in_memory_table(memtable)

@abc.abstractmethod
def fetch_from_cursor(self, cursor, schema):
"""Fetch data from cursor."""
Expand Down Expand Up @@ -200,6 +247,7 @@ def _log(self, sql: str) -> None:
This method can be implemented by subclasses. The logging happens
when `ibis.options.verbose` is `True`.
"""
util.log(sql)

def compile(
self,
Expand All @@ -208,7 +256,7 @@ def compile(
params: Mapping[ir.Expr, Any] | None = None,
timecontext: TimeContext | None = None,
) -> Any:
"""Compille an Ibis expression.
"""Compile an Ibis expression.
Parameters
----------
Expand Down
33 changes: 27 additions & 6 deletions ibis/backends/base/sql/alchemy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

import contextlib
import getpass
from operator import methodcaller
from typing import Any, Literal

import pandas as pd
import sqlalchemy as sa

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.schema as sch
import ibis.expr.types as ir
import ibis.util as util
Expand Down Expand Up @@ -205,16 +207,35 @@ def create_table(
schema = expr.schema()

self._schemas[self._fully_qualified_name(name, database)] = schema
t = self._table_from_schema(
table = self._table_from_schema(
name, schema, database=database or self.current_database
)

if has_expr := expr is not None:
# this has to happen outside the `begin` block, so that in-memory
# tables are visible inside the transaction created by it
self._register_in_memory_tables(expr)

with self.begin() as bind:
t.create(bind=bind, checkfirst=force)
if expr is not None:
bind.execute(
t.insert().from_select(list(expr.columns), expr.compile())
)
table.create(bind=bind, checkfirst=force)
if has_expr:
method = self._get_insert_method(expr)
bind.execute(method(table.insert()))

def _get_insert_method(self, expr):
compiled = self.compile(expr)

# if in memory tables aren't cheap then try to pull out their data
# FIXME: queries that *select* from in memory tables are still broken
# for mysql/sqlite/postgres because the generated SQL is wrong
if not self.compiler.cheap_in_memory_tables and isinstance(
expr.op(), ops.InMemoryTable
):
(from_,) = compiled.get_final_froms()
(rows,) = from_._data
return methodcaller("values", rows)

return methodcaller("from_select", list(expr.columns), compiled)

def _columns_from_schema(
self, name: str, schema: sch.Schema
70 changes: 68 additions & 2 deletions ibis/backends/base/sql/alchemy/datatypes.py
Expand Up @@ -9,6 +9,7 @@
from sqlalchemy.dialects.postgresql.base import PGDialect
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
from sqlalchemy.engine.interfaces import Dialect
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.types import UserDefinedType

import ibis.expr.datatypes as dt
Expand All @@ -33,6 +34,41 @@ def get_col_spec(self, **_):
return f"STRUCT({pairs})"


class UInt64(sa.types.Integer):
pass


class UInt32(sa.types.Integer):
pass


class UInt16(sa.types.Integer):
pass


class UInt8(sa.types.Integer):
pass


@compiles(UInt64, "postgresql")
@compiles(UInt32, "postgresql")
@compiles(UInt16, "postgresql")
@compiles(UInt8, "postgresql")
@compiles(UInt64, "mysql")
@compiles(UInt32, "mysql")
@compiles(UInt16, "mysql")
@compiles(UInt8, "mysql")
@compiles(UInt64, "sqlite")
@compiles(UInt32, "sqlite")
@compiles(UInt16, "sqlite")
@compiles(UInt8, "sqlite")
def compile_uint(element, compiler, **kw):
dialect_name = compiler.dialect.name
raise TypeError(
f"unsigned integers are not supported in the {dialect_name} backend"
)


def table_from_schema(name, meta, schema, database: str | None = None):
# Convert Ibis schema to SQLA table
columns = []
Expand All @@ -57,11 +93,18 @@ def table_from_schema(name, meta, schema, database: str | None = None):
# Mantissa-based
dt.Float16: sa.REAL,
dt.Float32: sa.REAL,
dt.Float64: sa.FLOAT,
# precision is the number of bits in the mantissa
# without specifying this, some backends interpret the type as FLOAT, which
# means float32 (and precision == 24)
dt.Float64: sa.Float(precision=53),
dt.Int8: sa.SmallInteger,
dt.Int16: sa.SmallInteger,
dt.Int32: sa.Integer,
dt.Int64: sa.BigInteger,
dt.UInt8: UInt8,
dt.UInt16: UInt16,
dt.UInt32: UInt32,
dt.UInt64: UInt64,
dt.JSON: sa.JSON,
}

Expand Down Expand Up @@ -137,6 +180,29 @@ def sa_mysql_numeric(_, satype, nullable=True):
)


@dt.dtype.register(MySQLDialect, mysql.TINYBLOB)
@dt.dtype.register(MySQLDialect, mysql.MEDIUMBLOB)
@dt.dtype.register(MySQLDialect, mysql.BLOB)
@dt.dtype.register(MySQLDialect, mysql.LONGBLOB)
def sa_mysql_blob(_, satype, nullable=True):
return dt.Binary(nullable=nullable)


_FLOAT_PREC_TO_TYPE = {
11: dt.Float16,
24: dt.Float32,
53: dt.Float64,
}


@dt.dtype.register(Dialect, sa.types.Float)
def sa_float(_, satype, nullable=True):
precision = satype.precision
if (typ := _FLOAT_PREC_TO_TYPE.get(precision)) is not None:
return typ(nullable=nullable)
return dt.Decimal(precision, satype.scale, nullable=nullable)


@dt.dtype.register(Dialect, sa.types.Numeric)
@dt.dtype.register(SQLiteDialect, sqlite.NUMERIC)
def sa_numeric(_, satype, nullable=True):
Expand Down Expand Up @@ -164,7 +230,7 @@ def sa_bigint(_, satype, nullable=True):


@dt.dtype.register(Dialect, sa.REAL)
def sa_float(_, satype, nullable=True):
def sa_real(_, satype, nullable=True):
return dt.Float32(nullable=nullable)


Expand Down
75 changes: 58 additions & 17 deletions ibis/backends/base/sql/alchemy/query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
Select,
SelectBuilder,
TableSetFormatter,
Union,
)
from ibis.backends.base.sql.compiler.base import SetOp


def _schema_to_sqlalchemy_columns(schema: sch.Schema) -> list[sa.Column]:
Expand Down Expand Up @@ -90,12 +90,14 @@ def _format_table(self, expr):
if isinstance(ref_op, AlchemyTable):
result = ref_op.sqla_table
elif isinstance(ref_op, ops.UnboundTable):
# use SQLAlchemy's TableClause and ColumnClause for unbound tables
schema = ref_op.schema
# use SQLAlchemy's TableClause for unbound tables
result = sa.table(
ref_op.name,
*_schema_to_sqlalchemy_columns(schema),
*_schema_to_sqlalchemy_columns(ref_op.schema),
)
elif isinstance(ref_op, ops.SQLQueryResult):
columns = _schema_to_sqlalchemy_columns(ref_op.schema)
result = sa.text(ref_op.query).columns(*columns)
elif isinstance(ref_op, ops.SQLStringView):
columns = _schema_to_sqlalchemy_columns(ref_op.schema)
result = sa.text(ref_op.query).columns(*columns).cte(ref_op.name)
Expand All @@ -107,6 +109,16 @@ def _format_table(self, expr):
)
backend = ref_op.child._find_backend()
backend._create_temp_view(view=result, definition=definition)
elif isinstance(ref_op, ops.InMemoryTable):
columns = _schema_to_sqlalchemy_columns(ref_op.schema)

if self.context.compiler.cheap_in_memory_tables:
result = sa.table(ref_op.name, *columns)
else:
# this has horrendous performance for medium to large tables
# should we warn?
rows = list(ref_op.data.to_frame().itertuples(index=False))
result = sa.values(*columns).data(rows)
else:
# A subquery
if ctx.is_extracted(ref_expr):
Expand Down Expand Up @@ -143,7 +155,7 @@ def _can_lower_sort_column(table_set, expr):
# aggregation so they appear in same query. It's generally for
# cosmetics and doesn't really affect query semantics.
bases = {op: op.to_expr() for op in expr.op().root_tables()}
if len(bases) > 1:
if len(bases) != 1:
return False

base = list(bases.values())[0]
Expand Down Expand Up @@ -194,7 +206,7 @@ def _compile_subqueries(self):

def _compile_table_set(self):
if self.table_set is not None:
helper = _AlchemyTableSetFormatter(self, self.table_set)
helper = self.table_set_formatter_class(self, self.table_set)
result = helper.get_result()
if isinstance(result, sql.selectable.Select):
return result.subquery()
Expand Down Expand Up @@ -330,11 +342,9 @@ def _add_limit(self, fragment):
if self.limit is None:
return fragment

n, offset = self.limit['n'], self.limit['offset']
fragment = fragment.limit(n)
if offset is not None and offset != 0:
fragment = fragment.limit(self.limit.n)
if offset := self.limit.offset:
fragment = fragment.offset(offset)

return fragment


Expand All @@ -343,21 +353,50 @@ def _convert_group_by(self, exprs):
return exprs


class AlchemyUnion(Union):
class AlchemySetOp(SetOp):
def compile(self):
def reduce_union(left, right, distincts=iter(self.distincts)):
distinct = next(distincts)
sa_func = sa.union if distinct else sa.union_all
return sa_func(left, right)

context = self.context
selects = []

def call(distinct, *args):
return (
self.distinct_func(*args)
if distinct
else self.non_distinct_func(*args)
)

for table in self.tables:
table_set = context.get_compiled_expr(table)
selects.append(table_set.cte().select())

return functools.reduce(reduce_union, selects)
if len(set(self.distincts)) == 1:
# distinct is either all True or all False, handle with a single
# call. This generates much more concise SQL.
return call(self.distincts[0], *selects)
else:
# We need to iteratively apply the set operations to handle
# disparate `distinct` values. Subqueries _must_ be converted using
# `.subquery().select()` to get sqlalchemy to put parenthesis in
# the proper places.
result = selects[0]
for select, distinct in zip(selects[1:], self.distincts):
result = call(distinct, result.subquery().select(), select)
return result


class AlchemyUnion(AlchemySetOp):
distinct_func = staticmethod(sa.union)
non_distinct_func = staticmethod(sa.union_all)


class AlchemyIntersection(AlchemySetOp):
distinct_func = staticmethod(sa.intersect)
non_distinct_func = staticmethod(sa.intersect_all)


class AlchemyDifference(AlchemySetOp):
distinct_func = staticmethod(sa.except_)
non_distinct_func = staticmethod(sa.except_all)


class AlchemyCompiler(Compiler):
Expand All @@ -367,6 +406,8 @@ class AlchemyCompiler(Compiler):
select_builder_class = AlchemySelectBuilder
select_class = AlchemySelect
union_class = AlchemyUnion
intersect_class = AlchemyIntersection
difference_class = AlchemyDifference

@classmethod
def to_sql(cls, expr, context=None, params=None, exists=False):
Expand Down
28 changes: 26 additions & 2 deletions ibis/backends/base/sql/alchemy/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import ibis.expr.types as ir
import ibis.expr.window as W
from ibis.backends.base.sql.alchemy.database import AlchemyTable
from ibis.backends.base.sql.alchemy.datatypes import to_sqla_type
from ibis.backends.base.sql.alchemy.geospatial import geospatial_supported


Expand Down Expand Up @@ -468,7 +467,7 @@ def _zero_if_null(t, expr):
arg = op.arg
sa_arg = t.translate(op.arg)
return sa.case(
[(sa_arg.is_(None), sa.cast(0, to_sqla_type(arg.type())))],
[(sa_arg.is_(None), sa.cast(0, t.get_sqla_type(arg.type())))],
else_=sa_arg,
)

Expand Down Expand Up @@ -520,6 +519,25 @@ def translate(t, expr):
return translate


def _bitwise_op(operator):
def translate(t, expr):
op = expr.op()
left = t.translate(op.left)
right = t.translate(op.right)
return left.op(operator)(right)

return translate


def _bitwise_not(t, expr):
op = expr.op()
arg = t.translate(op.arg)
return sa.sql.elements.UnaryExpression(
arg,
operator=sa.sql.operators.custom_op("~"),
)


sqlalchemy_operation_registry: Dict[Any, Any] = {
ops.Alias: _alias,
ops.And: fixed_arity(operator.and_, 2),
Expand Down Expand Up @@ -645,6 +663,12 @@ def translate(t, expr):
),
3,
),
ops.BitwiseAnd: _bitwise_op("&"),
ops.BitwiseOr: _bitwise_op("|"),
ops.BitwiseXor: _bitwise_op("^"),
ops.BitwiseLeftShift: _bitwise_op("<<"),
ops.BitwiseRightShift: _bitwise_op(">>"),
ops.BitwiseNot: _bitwise_not,
}


Expand Down
33 changes: 19 additions & 14 deletions ibis/backends/base/sql/alchemy/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,35 @@ def name(self, translated, name, force=True):
def get_sqla_type(self, data_type):
return to_sqla_type(data_type, type_map=self._type_map)

def _reduction(self, sa_func, expr):
op = expr.op()
arg = op.arg
def _maybe_cast_bool(self, op, arg):
if (
self._bool_aggs_need_cast_to_int32
and isinstance(op, (ops.Sum, ops.Mean, ops.Min, ops.Max))
and isinstance(
type := arg.type(),
dt.Boolean,
)
and isinstance(type := arg.type(), dt.Boolean)
):
arg = arg.cast(dt.Int32(nullable=type.nullable))
return arg.cast(dt.Int32(nullable=type.nullable))
return arg

def _reduction(self, sa_func, expr):
op = expr.op()

argtuple = (
self._maybe_cast_bool(op, arg)
for name, arg in zip(op.argnames, op.args)
if isinstance(arg, ir.Expr) and name != "where"
)
if (where := op.where) is not None:
if self._has_reduction_filter_syntax:
return sa_func(self.translate(arg)).filter(
self.translate(where)
)
sa_args = tuple(map(self.translate, argtuple))
return sa_func(*sa_args).filter(self.translate(where))
else:
sa_arg = self.translate(where.ifelse(arg, None))
sa_args = tuple(
self.translate(where.ifelse(arg, None)) for arg in argtuple
)
else:
sa_arg = self.translate(arg)
sa_args = tuple(map(self.translate, argtuple))

return sa_func(sa_arg)
return sa_func(*sa_args)


rewrites = AlchemyExprTranslator.rewrites
Expand Down
13 changes: 9 additions & 4 deletions ibis/backends/base/sql/compiler/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,20 @@ def compile(self):


class SetOp(DML):
def __init__(self, tables, expr, context):
def __init__(self, tables, expr, context, distincts):
self.context = context
self.tables = tables
self.table_set = expr
self.distincts = distincts
self.filters = []

@classmethod
def keyword(cls, distinct):
return cls._keyword + (not distinct) * " ALL"

def _get_keyword_list(self):
return map(self.keyword, self.distincts)

def _extract_subqueries(self):
self.subqueries = _extract_common_table_expressions(
[self.table_set, *self.filters]
Expand All @@ -84,9 +92,6 @@ def format_relation(self, expr):
return f'SELECT *\nFROM {ref}'
return self.context.get_compiled_expr(expr)

def _get_keyword_list(self):
raise NotImplementedError("Need objects to interleave")

def compile(self):
self._extract_subqueries()

Expand Down
91 changes: 50 additions & 41 deletions ibis/backends/base/sql/compiler/query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
import ibis.expr.types as ir
import ibis.util as util
from ibis.backends.base.sql.compiler.base import DML, QueryAST, SetOp
from ibis.backends.base.sql.compiler.select_builder import SelectBuilder
from ibis.backends.base.sql.compiler.select_builder import (
SelectBuilder,
_LimitSpec,
)
from ibis.backends.base.sql.compiler.translator import (
ExprTranslator,
QueryContext,
Expand Down Expand Up @@ -97,6 +100,18 @@ def _get_join_type(self, op):
def _quote_identifier(self, name):
return quote_identifier(name)

def _format_in_memory_table(self, op):
names = op.schema.names
raw_rows = (
", ".join(
f"{val!r} AS {self._quote_identifier(name)}"
for val, name in zip(row, names)
)
for row in op.data.to_frame().itertuples(index=False)
)
rows = ", ".join(f"({raw_row})" for raw_row in raw_rows)
return f"(VALUES {rows})"

def _format_table(self, expr):
# TODO: This could probably go in a class and be significantly nicer
ctx = self.context
Expand All @@ -107,7 +122,10 @@ def _format_table(self, expr):
ref_expr = op.table
ref_op = ref_expr.op()

if isinstance(ref_op, ops.PhysicalTable):
if isinstance(ref_op, ops.InMemoryTable):
result = self._format_in_memory_table(ref_op)
is_subquery = True
elif isinstance(ref_op, ops.PhysicalTable):
name = ref_op.name
if name is None:
raise com.RelationError(f'Table did not have a name: {expr!r}')
Expand Down Expand Up @@ -446,42 +464,27 @@ def format_limit(self):

buf = StringIO()

n, offset = self.limit['n'], self.limit['offset']
n = self.limit.n
buf.write(f'LIMIT {n}')
if offset is not None and offset != 0:
if offset := self.limit.offset:
buf.write(f' OFFSET {offset}')

return buf.getvalue()


class Union(SetOp):
def __init__(self, tables, expr, context, distincts):
super().__init__(tables, expr, context)
self.distincts = distincts

@staticmethod
def keyword(distinct):
return 'UNION' if distinct else 'UNION ALL'

def _get_keyword_list(self):
return map(self.keyword, self.distincts)
_keyword = "UNION"


class Intersection(SetOp):
_keyword = "INTERSECT"

def _get_keyword_list(self):
return [self._keyword] * (len(self.tables) - 1)


class Difference(SetOp):
_keyword = "EXCEPT"

def _get_keyword_list(self):
return [self._keyword] * (len(self.tables) - 1)


def flatten_union(table: ir.Table):
def flatten_set_op(table: ir.Table):
"""Extract all union queries from `table`.
Parameters
Expand All @@ -493,14 +496,14 @@ def flatten_union(table: ir.Table):
Iterable[Union[Table, bool]]
"""
op = table.op()
if isinstance(op, ops.Union):
if isinstance(op, ops.SetOp):
# For some reason mypy considers `op.left` and `op.right`
# of `Argument` type, and fails the validation. While in
# `flatten` types are the same, and it works
return toolz.concatv(
flatten_union(op.left), # type: ignore
flatten_set_op(op.left), # type: ignore
[op.distinct],
flatten_union(op.right), # type: ignore
flatten_set_op(op.right), # type: ignore
)
return [table]

Expand All @@ -517,7 +520,9 @@ def flatten(table: ir.Table):
Iterable[Union[Table]]
"""
op = table.op()
return list(toolz.concatv(flatten_union(op.left), flatten_union(op.right)))
return list(
toolz.concatv(flatten_set_op(op.left), flatten_set_op(op.right))
)


class Compiler:
Expand All @@ -530,6 +535,8 @@ class Compiler:
intersect_class = Intersection
difference_class = Difference

cheap_in_memory_tables = False

@classmethod
def make_context(cls, params=None):
params = params or {}
Expand Down Expand Up @@ -597,9 +604,9 @@ def to_ast_ensure_limit(cls, expr, limit=None, params=None):
else:
query_limit = limit
if query_limit:
query.limit = {'n': query_limit, 'offset': 0}
query.limit = _LimitSpec(query_limit, offset=0)
elif limit is not None and limit != 'default':
query.limit = {'n': limit, 'offset': query.limit['offset']}
query.limit = _LimitSpec(limit, query.limit.offset)

return query_ast

Expand All @@ -617,35 +624,37 @@ def _generate_setup_queries(expr, context):
def _generate_teardown_queries(expr, context):
return []

@classmethod
def _make_union(cls, expr, context):
@staticmethod
def _make_set_op(cls, expr, context):
# flatten unions so that we can codegen them all at once
union_info = list(flatten_union(expr))
set_op_info = list(flatten_set_op(expr))

        # since op is a set operation, we have at least 3 elements in
        # set_op_info (left distinct right) and each additional set operation
        # adds two more elements (distinct right), so the total number of
        # elements is 3 + 2 * (number of set operations - 1) and is therefore
        # an odd number
npieces = len(union_info)
assert npieces >= 3 and npieces % 2 != 0, 'Invalid union expression'
npieces = len(set_op_info)
assert (
npieces >= 3 and npieces % 2 != 0
), 'Invalid set operation expression'

# 1. every other object starting from 0 is a Table instance
# 2. every other object starting from 1 is a bool indicating the type
# of union (distinct or not distinct)
table_exprs, distincts = union_info[::2], union_info[1::2]
return cls.union_class(
table_exprs, expr, distincts=distincts, context=context
)
# of $set_op (distinct or not distinct)
table_exprs, distincts = set_op_info[::2], set_op_info[1::2]
return cls(table_exprs, expr, distincts=distincts, context=context)

@classmethod
def _make_union(cls, expr, context):
return cls._make_set_op(cls.union_class, expr, context)

@classmethod
def _make_intersect(cls, expr, context):
# flatten intersections so that we can codegen them all at once
table_exprs = list(flatten(expr))
return cls.intersect_class(table_exprs, expr, context=context)
return cls._make_set_op(cls.intersect_class, expr, context)

@classmethod
def _make_difference(cls, expr, context):
# flatten differences so that we can codegen them all at once
table_exprs = list(flatten(expr))
return cls.difference_class(table_exprs, expr, context=context)
return cls._make_set_op(cls.difference_class, expr, context)
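
# A toy illustration (not ibis code) of the invariant asserted in
# _make_set_op: flattening a chain of set operations interleaves tables and
# distinct flags, so the flattened list always has odd length.
def _flatten(node):
    if isinstance(node, tuple):  # ("union", left, right, distinct)
        _, left, right, distinct = node
        return _flatten(left) + [distinct] + _flatten(right)
    return [node]

pieces = _flatten(("union", ("union", "t1", "t2", True), "t3", False))
assert pieces == ["t1", True, "t2", False, "t3"]
assert len(pieces) % 2 == 1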
50 changes: 47 additions & 3 deletions ibis/backends/base/sql/compiler/select_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from __future__ import annotations

import functools
import operator
from typing import NamedTuple

import toolz

import ibis
import ibis.common.exceptions as com
import ibis.expr.analysis as L
import ibis.expr.operations as ops
Expand All @@ -10,6 +17,11 @@
)


class _LimitSpec(NamedTuple):
n: int
offset: int


class _CorrelatedRefCheck:
def __init__(self, query, expr):
self.query = query
Expand Down Expand Up @@ -433,16 +445,43 @@ def _collect_Distinct(self, expr, toplevel=False):

self._collect(expr.op().table, toplevel=toplevel)

def _collect_DropNa(self, expr, toplevel=False):
if toplevel:
op = expr.op()
if op.subset is None:
columns = [op.table[c] for c in op.table.columns]
else:
columns = op.subset
if columns:
filters = [
functools.reduce(
operator.and_ if op.how == "any" else operator.or_,
[c.notnull() for c in columns],
)
]
elif op.how == "all":
filters = [ibis.literal(False)]
else:
filters = []
self.table_set = op.table
self.select_set = [op.table]
self.filters = filters

def _collect_Limit(self, expr, toplevel=False):
if not toplevel:
return

op = expr.op()
n = op.n
offset = op.offset or 0

# Ignore "inner" limits, because they've been overrided by an exterior
# one
if self.limit is None:
self.limit = {'n': op.n, 'offset': op.offset}
self.limit = _LimitSpec(n, offset)
else:
self.limit = _LimitSpec(
min(n, self.limit.n),
offset + self.limit.offset,
)

self._collect(op.table, toplevel=toplevel)
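
# A short sketch of how nested limits compose, assuming the outer limit is
# collected first so self.limit already holds it when the inner one is seen:
# the row count takes the minimum and the offsets accumulate.
from typing import NamedTuple

class LimitSpec(NamedTuple):  # stand-in for _LimitSpec
    n: int
    offset: int

outer = LimitSpec(n=10, offset=5)
inner = LimitSpec(n=100, offset=2)
assert LimitSpec(min(inner.n, outer.n), inner.offset + outer.offset) == LimitSpec(10, 7)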

Expand Down Expand Up @@ -499,6 +538,11 @@ def _collect_Selection(self, expr, toplevel=False):
self.table_set = table
self.filters = filters

def _collect_PandasInMemoryTable(self, expr, toplevel=False):
if toplevel:
self.select_set = [expr]
self.table_set = expr

def _convert_group_by(self, exprs):
return list(range(len(exprs)))

Expand Down
15 changes: 14 additions & 1 deletion ibis/backends/base/sql/registry/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ def hash(translator, expr):
raise NotImplementedError(how)


def concat(translator, expr):
joined_args = ', '.join(map(translator.translate, expr.op().arg))
return f"concat({joined_args})"


binary_infix_ops = {
# Binary operations
ops.Add: binary_infix.binary_infix_op('+'),
Expand All @@ -247,6 +252,13 @@ def hash(translator, expr):
ops.And: binary_infix.binary_infix_op('AND'),
ops.Or: binary_infix.binary_infix_op('OR'),
ops.Xor: binary_infix.xor,
# bitwise operations
ops.BitwiseAnd: fixed_arity('bitand', 2),
ops.BitwiseOr: fixed_arity('bitor', 2),
ops.BitwiseXor: fixed_arity('bitxor', 2),
ops.BitwiseLeftShift: fixed_arity('shiftleft', 2),
ops.BitwiseRightShift: fixed_arity('shiftright', 2),
ops.BitwiseNot: unary('bitnot'),
}


Expand Down Expand Up @@ -304,7 +316,7 @@ def hash(translator, expr):
ops.Count: aggregate.reduction('count'),
ops.CountDistinct: aggregate.count_distinct,
# string operations
ops.StringConcat: fixed_arity('concat', 2),
ops.StringConcat: concat,
ops.StringLength: unary('length'),
ops.StringAscii: unary('ascii'),
ops.Lowercase: unary('lower'),
Expand All @@ -330,6 +342,7 @@ def hash(translator, expr):
ops.ParseURL: string.parse_url,
ops.StartsWith: string.startswith,
ops.EndsWith: string.endswith,
ops.StringReplace: fixed_arity('replace', 3),
# Timestamp operations
ops.Date: unary('to_date'),
ops.TimestampNow: lambda *args: 'now()',
Expand Down
19 changes: 18 additions & 1 deletion ibis/backends/clickhouse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any, Literal, Mapping

import pandas as pd
import toolz
from clickhouse_driver.client import Client as _DriverClient
from pydantic import Field

Expand Down Expand Up @@ -38,6 +39,13 @@ class Options(ibis.config.BaseModel):
description="Database to use for temporary objects.",
)

def __init__(self, *args, external_tables=None, **kwargs):
super().__init__(*args, **kwargs)
self._external_tables = external_tables or {}

def _register_in_memory_table(self, table_op):
self._external_tables[table_op.name] = table_op.data.to_frame()
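
# A hedged usage sketch, assuming a reachable ClickHouse server and that
# connect() forwards keyword arguments to do_connect; the table name and
# frame are made up.
import pandas as pd
import ibis

df = pd.DataFrame({"a": [1, 2, 3]})
con = ibis.clickhouse.connect(host="localhost", external_tables={"my_df": df})
# subsequent raw_sql/execute calls ship `my_df` alongside any per-call tables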

def do_connect(
self,
host: str = "localhost",
Expand All @@ -49,6 +57,7 @@ def do_connect(
compression: (
Literal["lz4", "lz4hc", "quicklz", "zstd"] | bool
) = _default_compression,
external_tables=None,
**kwargs: Any,
):
"""Create a ClickHouse client for use with Ibis.
Expand Down Expand Up @@ -92,6 +101,7 @@ def do_connect(
compression=compression,
**kwargs,
)
self._external_tables = external_tables or {}

@property
def version(self) -> str:
Expand All @@ -109,11 +119,16 @@ def current_database(self):

def list_databases(self, like=None):
data, _ = self.raw_sql('SELECT name FROM system.databases')
# in theory this should never be empty
if not data: # pragma: no cover
return []
databases = list(data[0])
return self._filter_with_like(databases, like)

def list_tables(self, like=None, database=None):
data, _ = self.raw_sql('SHOW TABLES')
if not data:
return []
databases = list(data[0])
return self._filter_with_like(databases, like)

Expand All @@ -140,7 +155,9 @@ def raw_sql(
external_tables_list = []
if external_tables is None:
external_tables = {}
for name, df in external_tables.items():
for name, df in toolz.merge(
self._external_tables, external_tables
).items():
if not isinstance(df, pd.DataFrame):
raise TypeError(
'External table is not an instance of pandas dataframe'
Expand Down
10 changes: 8 additions & 2 deletions ibis/backends/clickhouse/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def format_limit(self):

buf = StringIO()

n, offset = self.limit['n'], self.limit['offset']
if offset is not None and offset != 0:
n = self.limit.n
if offset := self.limit.offset:
buf.write(f'LIMIT {offset}, {n}')
else:
buf.write(f'LIMIT {n}')
Expand All @@ -75,6 +75,11 @@ class ClickhouseTableSetFormatter(TableSetFormatter):

_non_equijoin_supported = False

def _format_in_memory_table(self, op):
        # We register in-memory tables as external tables because ClickHouse
        # doesn't implement a generic VALUES statement
return op.name


class ClickhouseExprTranslator(ExprTranslator):
_registry = operation_registry
Expand Down Expand Up @@ -118,6 +123,7 @@ def day_of_week_name(expr):


class ClickhouseCompiler(Compiler):
cheap_in_memory_tables = True
translator_class = ClickhouseExprTranslator
table_set_formatter_class = ClickhouseTableSetFormatter
select_builder_class = ClickhouseSelectBuilder
Expand Down
31 changes: 31 additions & 0 deletions ibis/backends/clickhouse/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,32 @@ def struct():
nullable=False,
)

@p.generate
def enum_value():
yield dt.SPACES
key = yield dt.RAW_STRING
yield dt.spaceless_string('=')
value = yield p.digit.at_least(1).concat()
return (key, int(value))

@p.generate
def lowcardinality():
yield dt.spaceless_string('LowCardinality')
yield dt.LPAREN
r = yield ty
yield dt.RPAREN
return r

@p.generate
def enum():
yield dt.spaceless_string('enum')
enumsz = yield p.digit.at_least(1).concat()
enumsz = int(enumsz)
yield dt.LPAREN
yield enum_value.sep_by(dt.COMMA).map(dict) # ignore values
yield dt.RPAREN
return dt.String(nullable=False)

ty = (
nullable
| nested
Expand All @@ -160,6 +186,11 @@ def struct():
| array
| map
| struct
| enum
| lowcardinality
| dt.spaceless_string("IPv4", "IPv6").result(dt.inet(nullable=False))
| dt.spaceless_string("Object('json')").result(dt.json(nullable=False))
| dt.spaceless_string("JSON").result(dt.json(nullable=False))
)
return ty.parse(text)
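
# Hedged examples of the type strings the new combinators are meant to
# accept, mirroring the test cases added below; the enclosing parser function
# is assumed to be exposed as `parse` in this module.
import ibis.expr.datatypes as dt
from ibis.backends.clickhouse.datatypes import parse  # name assumed

assert parse("LowCardinality(String)") == dt.String(nullable=False)
assert parse("IPv4") == dt.inet(nullable=False)
assert parse("Enum8('a' = 1, 'b' = 2)") == dt.String(nullable=False)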

Expand Down
64 changes: 45 additions & 19 deletions ibis/backends/clickhouse/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import ibis.expr.operations as ops
import ibis.expr.types as ir
import ibis.util as util
from ibis.backends.base.sql.registry import binary_infix
from ibis.backends.base.sql.registry import binary_infix, window
from ibis.backends.clickhouse.datatypes import serialize
from ibis.backends.clickhouse.identifiers import quote_identifier

Expand Down Expand Up @@ -363,6 +363,9 @@ def _literal(translator, expr):
return _null_literal(translator, expr)
if isinstance(expr, ir.BooleanValue):
return '1' if value else '0'
elif isinstance(expr, ir.INETValue):
v = str(value)
return f"toIPv6({v!r})" if ':' in v else f"toIPv4({v!r})"
elif isinstance(expr, ir.StringValue):
return "'{!s}'".format(value.replace("'", "\\'"))
elif isinstance(expr, ir.NumericValue):
Expand Down Expand Up @@ -402,6 +405,11 @@ def _literal(translator, expr):
return str(list(_tuple_to_list(value)))
elif isinstance(expr, ir.SetScalar):
return '({})'.format(', '.join(map(repr, value)))
elif isinstance(expr, ir.StructScalar):
fields = ", ".join(
f"{value} as `{key}`" for key, value in expr.op().value.items()
)
return f"tuple({fields})"
else:
raise NotImplementedError(type(expr))

Expand Down Expand Up @@ -680,6 +688,20 @@ def _struct_field(translator, expr):
return f"{translator.translate(op.arg)}.`{op.field}`"


def _nth_value(translator, expr):
op = expr.op()
arg = translator.translate(op.arg)
nth = translator.translate(op.nth)
return f"nth_value({arg}, ({nth}) + 1)"


def _repeat(translator, expr):
op = expr.op()
arg = translator.translate(op.arg)
times = translator.translate(op.times)
return f"repeat({arg}, CAST({times} AS UInt64))"


# TODO: clickhouse uses different string functions
# for ascii and utf-8 encodings,

Expand Down Expand Up @@ -746,6 +768,8 @@ def _struct_field(translator, expr):
ops.Sum: _agg('sum'),
ops.Max: _agg('max'),
ops.Min: _agg('min'),
ops.ArgMin: _agg('argMin'),
ops.ArgMax: _agg('argMax'),
ops.ArrayCollect: _agg('groupArray'),
ops.StandardDev: _agg_variance_like('stddev'),
ops.Variance: _agg_variance_like('var'),
Expand Down Expand Up @@ -775,7 +799,7 @@ def _struct_field(translator, expr):
ops.LStrip: _unary('trimLeft'),
ops.RStrip: _unary('trimRight'),
ops.Strip: _unary('trimBoth'),
ops.Repeat: _fixed_arity("repeat", 2),
ops.Repeat: _repeat,
ops.StringConcat: _string_concat,
ops.RegexSearch: _fixed_arity('match', 2),
ops.RegexExtract: _regex_extract,
Expand Down Expand Up @@ -841,6 +865,23 @@ def _struct_field(translator, expr):
ops.Clip: _clip,
ops.StructField: _struct_field,
ops.StructColumn: _struct_column,
ops.Window: window.window,
ops.RowNumber: lambda *args: 'row_number()',
ops.DenseRank: lambda *args: 'dense_rank()',
ops.MinRank: lambda *args: 'rank()',
ops.Lag: window.shift_like('lagInFrame'),
ops.Lead: window.shift_like('leadInFrame'),
ops.FirstValue: _unary('first_value'),
ops.LastValue: _unary('last_value'),
ops.NthValue: _nth_value,
ops.NTile: window.ntile,
ops.BitwiseAnd: _fixed_arity('bitAnd', 2),
ops.BitwiseOr: _fixed_arity('bitOr', 2),
ops.BitwiseXor: _fixed_arity('bitXor', 2),
ops.BitwiseNot: _unary('bitNot'),
ops.BitwiseLeftShift: _fixed_arity('bitShiftLeft', 2),
ops.BitwiseRightShift: _fixed_arity('bitShiftRight', 2),
}


Expand Down Expand Up @@ -889,28 +930,13 @@ def _day_of_week_index(translator, expr):


_unsupported_ops_list = [
ops.Window,
ops.DecimalPrecision,
ops.DecimalScale,
ops.BaseConvert,
ops.CumeDist,
ops.CumulativeSum,
ops.CumulativeMin,
ops.CumulativeMax,
ops.CumulativeMean,
ops.CumulativeAny,
ops.CumulativeAll,
ops.IdenticalTo,
ops.RowNumber,
ops.DenseRank,
ops.MinRank,
ops.CumeDist,
ops.PercentRank,
ops.FirstValue,
ops.LastValue,
ops.NthValue,
ops.Lag,
ops.Lead,
ops.NTile,
ops.ReductionVectorizedUDF,
]
_unsupported_ops = {k: _raise_error for k in _unsupported_ops_list}

Expand Down
12 changes: 12 additions & 0 deletions ibis/backends/clickhouse/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,15 @@ def test_insert_with_more_columns(temporary_alltypes, df):
def test_get_schema_using_query(con, query, expected_schema):
result = con._get_schema_using_query(query)
assert result == expected_schema


def test_list_tables_empty(con, worker_id):
dbname = f"tmpdb_{worker_id}"
db = con.current_database
con.raw_sql(f"CREATE DATABASE IF NOT EXISTS {dbname}")
try:
con.raw_sql(f"USE {dbname}")
assert not con.list_tables()
finally:
con.raw_sql(f"USE {db}")
con.raw_sql(f"DROP DATABASE IF EXISTS {dbname}")
2 changes: 1 addition & 1 deletion ibis/backends/clickhouse/tests/test_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def test_isnull_case_expr_rewrite_failure(db, alltypes):

result = ibis.clickhouse.compile(reduction)
expected = """\
SELECT sum(CASE WHEN isNull(`string_col`) THEN 1 ELSE 0 END) AS `sum`
SELECT sum(if(isNull(`string_col`), 1, 0)) AS `sum`
FROM {0}.`functional_alltypes`"""
assert result == expected.format(db.name)

Expand Down
9 changes: 9 additions & 0 deletions ibis/backends/clickhouse/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ def test_columns_types_with_additional_argument(con):
@pytest.mark.parametrize(
('ch_type', 'ibis_type'),
[
(
"Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4)",
dt.String(nullable=False),
),
('IPv4', dt.inet(nullable=False)),
('IPv6', dt.inet(nullable=False)),
('JSON', dt.json(nullable=False)),
("Object('json')", dt.json(nullable=False)),
('LowCardinality(String)', dt.String(nullable=False)),
('Array(Int8)', dt.Array(dt.Int8(nullable=False), nullable=False)),
('Array(Int16)', dt.Array(dt.Int16(nullable=False), nullable=False)),
('Array(Int32)', dt.Array(dt.Int32(nullable=False), nullable=False)),
Expand Down
115 changes: 69 additions & 46 deletions ibis/backends/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import importlib
import importlib.metadata
import os
import platform
Expand All @@ -10,6 +11,7 @@
import _pytest
import pandas as pd
import sqlalchemy as sa
from packaging.version import parse as vparse

if TYPE_CHECKING:
import pyarrow as pa
Expand All @@ -18,6 +20,7 @@

import ibis
import ibis.util as util
from ibis.backends.base import _get_backend_names

TEST_TABLES = {
"functional_alltypes": ibis.schema(
Expand Down Expand Up @@ -219,28 +222,6 @@ def _random_identifier(suffix: str) -> str:
return f"__ibis_test_{suffix}_{util.guid()}"


@lru_cache(maxsize=None)
def _get_backend_names() -> frozenset[str]:
"""Return the set of known backend names.

Notes
-----
This function returns a frozenset to prevent cache pollution.

If a `set` is used, then any in-place modifications to the set
are visible to every caller of this function.
"""
import sys

if sys.version_info < (3, 10):
entrypoints = list(importlib.metadata.entry_points()['ibis.backends'])
else:
entrypoints = list(
importlib.metadata.entry_points(group="ibis.backends")
)
return frozenset(ep.name for ep in entrypoints)


def _get_backend_conf(backend_str: str):
"""Convert a backend string to the test class for the backend."""
conftest = importlib.import_module(
Expand Down Expand Up @@ -298,6 +279,8 @@ def pytest_ignore_collect(path, config):
def pytest_collection_modifyitems(session, config, items):
# add the backend marker to any tests are inside "ibis/backends"
all_backends = _get_backend_names()
xdist_group_markers = []

for item in items:
parts = item.path.parts
backend = _get_backend_from_parts(parts)
Expand All @@ -306,12 +289,19 @@ def pytest_collection_modifyitems(session, config, items):
item.add_marker(pytest.mark.backend)
elif "backends" not in parts:
# anything else is a "core" test and is run by default
item.add_marker(pytest.mark.core)
if not any(item.iter_markers(name="benchmark")):
item.add_marker(pytest.mark.core)

for name in ("duckdb", "sqlite"):
            # build a list of markers so we don't invalidate the item's
            # marker iterator
for _ in item.iter_markers(name=name):
xdist_group_markers.append(
(item, pytest.mark.xdist_group(name=name))
)

if "sqlite" in item.nodeid:
item.add_marker(pytest.mark.xdist_group(name="sqlite"))
if "duckdb" in item.nodeid:
item.add_marker(pytest.mark.xdist_group(name="duckdb"))
for item, marker in xdist_group_markers:
item.add_marker(marker)


@lru_cache(maxsize=None)
Expand Down Expand Up @@ -343,7 +333,6 @@ def _get_backends_to_test(

def pytest_runtest_call(item):
"""Dynamically add various custom markers."""
nodeid = item.nodeid
backend = [
backend.name()
for key, backend in item.funcargs.items()
Expand All @@ -364,25 +353,59 @@ def pytest_runtest_call(item):

backend = next(iter(backend))

for marker in item.iter_markers(name="skip_backends"):
if backend in marker.args[0]:
pytest.skip(f"skip_backends: {backend} {nodeid}")

for marker in item.iter_markers(name='min_spark_version'):
min_version = marker.args[0]
if backend == 'pyspark':
from distutils.version import LooseVersion

import pyspark

if LooseVersion(pyspark.__version__) < LooseVersion(min_version):
item.add_marker(
pytest.mark.xfail(
reason=f'Require minimal spark version {min_version}, '
f'but is {pyspark.__version__}',
**marker.kwargs,
)
for marker in item.iter_markers(name="min_server_version"):
kwargs = marker.kwargs
if backend not in kwargs:
continue

funcargs = item.funcargs
con = funcargs.get(
"con",
getattr(funcargs.get("backend"), "connection", None),
)

if con is None:
continue

min_server_version = kwargs.pop(backend)
server_version = con.version
condition = vparse(server_version) < vparse(min_server_version)
item.add_marker(
pytest.mark.xfail(
condition,
reason=(
"unsupported functionality for server version "
f"{server_version}"
),
**kwargs,
)
)

for marker in item.iter_markers(name="min_version"):
kwargs = marker.kwargs
if backend not in kwargs:
continue

min_version = kwargs.pop(backend)
reason = kwargs.pop("reason", None)
version = getattr(
importlib.import_module(backend), "__version__", None
)
if condition := version is None: # pragma: no cover
if reason is None:
reason = (
f"{backend} backend module has no __version__ attribute"
)
else:
condition = vparse(version) < vparse(min_version)
if reason is None:
reason = (
f"test requires {backend}>={version}; "
f"got version {version}"
)
else:
reason = f"{backend}@{version} (<{min_version}): {reason}"
item.add_marker(pytest.mark.xfail(condition, reason=reason, **kwargs))
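
# The gate above relies on packaging's version parsing rather than string
# comparison, which matters for multi-digit components; a quick check of the
# behaviour being assumed.
from packaging.version import parse as vparse

assert vparse("0.9.10") > vparse("0.9.2")  # naive string comparison gets this wrong
assert vparse("1.4.0") < vparse("1.4.1")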

# Ibis hasn't exposed existing functionality
# This xfails so that you know when it starts to pass
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/dask/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def execute_with_scope(
# computing anything *and* before associating leaf nodes with data. This
# allows clients to provide their own data for each leaf.
if clients is None:
clients = expr._find_backends()
clients, _ = expr._find_backends()

if aggcontext is None:
aggcontext = agg_ctx.Summarize()
Expand Down
33 changes: 20 additions & 13 deletions ibis/backends/dask/execution/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
register_types_to_dispatcher,
)
from ibis.backends.pandas.core import (
date_types,
integer_types,
numeric_types,
simple_types,
timestamp_types,
)
from ibis.backends.pandas.execution import constants
from ibis.backends.pandas.execution.generic import (
_execute_binary_op_impl,
execute_between,
execute_cast_series_array,
execute_cast_series_generic,
Expand Down Expand Up @@ -105,14 +107,20 @@
],
ops.Intersection: [
(
(dd.DataFrame, dd.DataFrame),
(dd.DataFrame, dd.DataFrame, bool),
execute_intersection_dataframe_dataframe,
)
],
ops.Difference: [
((dd.DataFrame, dd.DataFrame), execute_difference_dataframe_dataframe)
(
(dd.DataFrame, dd.DataFrame, bool),
execute_difference_dataframe_dataframe,
)
],
ops.DropNa: [
((dd.DataFrame, tuple), execute_node_dropna_dataframe),
((dd.DataFrame, type(None)), execute_node_dropna_dataframe),
],
ops.DropNa: [((dd.DataFrame, tuple), execute_node_dropna_dataframe)],
ops.FillNa: [
((dd.DataFrame, simple_types), execute_node_fillna_dataframe_scalar),
((dd.DataFrame,), execute_node_fillna_dataframe_dict),
Expand Down Expand Up @@ -285,7 +293,7 @@ def execute_cast_series_date(op, data, type, **kwargs):
@execute_node.register(ops.Limit, dd.DataFrame, integer_types, integer_types)
def execute_limit_frame(op, data, nrows, offset, **kwargs):
# NOTE: Dask Dataframes do not support iloc row based indexing
return data.loc[offset : offset + nrows]
return data.loc[offset : (offset + nrows) - 1]
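
# The `- 1` matters because label-based .loc slicing is inclusive on both
# ends, unlike .iloc; pandas shows the same behaviour on a default RangeIndex.
import pandas as pd

df = pd.DataFrame({"x": range(10)})
nrows, offset = 3, 2
assert len(df.loc[offset : (offset + nrows) - 1]) == 3
assert len(df.loc[offset : offset + nrows]) == 4  # one row too many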


@execute_node.register(ops.Not, (dd.core.Scalar, dd.Series))
Expand All @@ -311,15 +319,14 @@ def execute_not_scalar_or_series(op, data, **kwargs):
@execute_node.register(ops.Comparison, dd.Series, timestamp_types)
@execute_node.register(ops.Comparison, timestamp_types, dd.Series)
def execute_binary_op(op, left, right, **kwargs):
op_type = type(op)
try:
operation = constants.BINARY_OPERATIONS[op_type]
except KeyError:
raise NotImplementedError(
f'Binary operation {op_type.__name__} not implemented'
)
else:
return operation(left, right)
return _execute_binary_op_impl(op, left, right, **kwargs)


@execute_node.register(ops.Comparison, dd.Series, date_types)
def execute_binary_op_date_right(op, left, right, **kwargs):
return _execute_binary_op_impl(
op, dd.to_datetime(left), pd.to_datetime(right), **kwargs
)


@execute_node.register(ops.Binary, ddgb.SeriesGroupBy, ddgb.SeriesGroupBy)
Expand Down
93 changes: 37 additions & 56 deletions ibis/backends/dask/execution/indexing.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,45 @@
"""Execution rules for ops.Where operations"""

import dask.dataframe as dd
import numpy as np

import ibis.expr.operations as ops
from ibis.backends.dask.dispatch import execute_node
from ibis.backends.dask.execution.util import (
TypeRegistrationDict,
register_types_to_dispatcher,
)
from ibis.backends.pandas.core import boolean_types, scalar_types
from ibis.backends.pandas.execution.generic import (
execute_node_where_scalar_scalar_scalar,
execute_node_where_series_series_series,
)

DASK_DISPATCH_TYPES: TypeRegistrationDict = {
ops.Where: [
(
(dd.Series, dd.Series, dd.Series),
execute_node_where_series_series_series,
),
(
(dd.Series, dd.Series, scalar_types),
execute_node_where_series_series_series,
),
(
(
boolean_types,
dd.Series,
dd.Series,
),
execute_node_where_scalar_scalar_scalar,
),
]
}
register_types_to_dispatcher(execute_node, DASK_DISPATCH_TYPES)


def execute_node_where_series_scalar_scalar(op, cond, true, false, **kwargs):
return dd.from_array(np.repeat(true, len(cond))).where(cond, other=false)


for scalar_type in scalar_types:
execute_node.register(ops.Where, dd.Series, scalar_type, scalar_type)(
execute_node_where_series_scalar_scalar
)
from ibis.backends.pandas.core import boolean_types, scalar_types, simple_types
from ibis.backends.pandas.execution.generic import pd_where


@execute_node.register(ops.Where, boolean_types, dd.Series, scalar_types)
def execute_node_where_scalar_series_scalar(op, cond, true, false, **kwargs):
if cond:
return true
else:
# TODO double check this is the right way to do this
out = dd.from_array(np.repeat(false, len(true)))
out.index = true.index
return out


@execute_node.register(ops.Where, boolean_types, scalar_types, dd.Series)
def execute_node_where_scalar_scalar_series(op, cond, true, false, **kwargs):
return dd.from_array(np.repeat(true, len(false))) if cond else false
@execute_node.register(
ops.Where, (dd.Series, *boolean_types), dd.Series, dd.Series
)
@execute_node.register(
ops.Where, (dd.Series, *boolean_types), dd.Series, simple_types
)
@execute_node.register(
ops.Where, (dd.Series, *boolean_types), simple_types, dd.Series
)
@execute_node.register(
ops.Where, (dd.Series, *boolean_types), type(None), type(None)
)
def execute_node_where(op, cond, true, false, **kwargs):
if any(
isinstance(x, (dd.Series, dd.core.Scalar)) for x in (cond, true, false)
):
return dd.map_partitions(pd_where, cond, true, false)
# All are immediate scalars, handle locally
return true if cond else false


# For true/false as scalars, we only support identical type pairs + None to
# limit the size of the dispatch table and not have to worry about type
# promotion.
for typ in (str, *scalar_types):
for cond_typ in (dd.Series, *boolean_types):
execute_node.register(ops.Where, cond_typ, typ, typ)(
execute_node_where
)
execute_node.register(ops.Where, cond_typ, type(None), typ)(
execute_node_where
)
execute_node.register(ops.Where, cond_typ, typ, type(None))(
execute_node_where
)
20 changes: 20 additions & 0 deletions ibis/backends/dask/execution/reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,23 @@ def execute_standard_dev_series(op, data, mask, aggcontext=None, **kwargs):
'std',
ddof=variance_ddof[op.how],
)


@execute_node.register(
ops.ArgMax, dd.Series, dd.Series, (dd.Series, type(None))
)
def execute_argmax_series(op, data, key, mask, aggcontext=None, **kwargs):
idxmax = aggcontext.agg(
key[mask] if mask is not None else key, 'idxmax'
).compute()
return data.loc[idxmax]


@execute_node.register(
ops.ArgMin, dd.Series, dd.Series, (dd.Series, type(None))
)
def execute_argmin_series(op, data, key, mask, aggcontext=None, **kwargs):
idxmin = aggcontext.agg(
key[mask] if mask is not None else key, 'idxmin'
).compute()
return data.loc[idxmin]
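
# A pandas-level sketch of the ArgMax semantics above: return the value of
# `data` at the row where `key` is maximal, after applying the mask (the dask
# version adds the .compute() step).
import pandas as pd

data = pd.Series(["a", "b", "c"])
key = pd.Series([10, 30, 20])
mask = pd.Series([True, False, True])

assert data.loc[key[mask].idxmax()] == "c"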
18 changes: 9 additions & 9 deletions ibis/backends/dask/execution/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,28 +167,28 @@ def execute_selection_dataframe(
):
result = data

if predicates:
predicates = _compute_predicates(
op.table.op(), predicates, data, scope, timecontext, **kwargs
)
predicate = functools.reduce(operator.and_, predicates)
result = result.loc[predicate]

if selections:
# if we are just performing select operations we can do a direct
# selection
if all(isinstance(s.op(), ops.TableColumn) for s in selections):
result = build_df_from_selection(selections, data, op.table.op())
result = build_df_from_selection(selections, result, op.table.op())
else:
result = build_df_from_projection(
selections,
op,
data,
result,
scope=scope,
timecontext=timecontext,
**kwargs,
)

if predicates:
predicates = _compute_predicates(
op.table.op(), predicates, data, scope, timecontext, **kwargs
)
predicate = functools.reduce(operator.and_, predicates)
result = result.loc[predicate]

if sort_keys:
if len(sort_keys) > 1:
raise NotImplementedError(
Expand Down
7 changes: 7 additions & 0 deletions ibis/backends/dask/execution/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
execute_series_regex_replace,
execute_series_regex_search,
execute_series_right,
execute_series_string_replace,
execute_series_translate_scalar_scalar,
execute_series_translate_scalar_series,
execute_series_translate_series_scalar,
Expand Down Expand Up @@ -80,6 +81,12 @@
),
],
ops.Reverse: [((dd.Series,), execute_string_reverse)],
ops.StringReplace: [
(
(dd.Series, (dd.Series, str), (dd.Series, str)),
execute_series_string_replace,
)
],
ops.Lowercase: [((dd.Series,), execute_string_lower)],
ops.Uppercase: [((dd.Series,), execute_string_upper)],
ops.Capitalize: [((dd.Series,), execute_string_capitalize)],
Expand Down