Skip to content

feat(pyspark): builtin udf support #21456

feat(pyspark): builtin udf support

feat(pyspark): builtin udf support #21456

Workflow file for this run

name: Backends
on:
push:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
- ".codespellrc"
branches:
- main
- "*.x.x"
pull_request:
# Skip the backend suite if all changes are docs
paths-ignore:
- "docs/**"
- "**/*.md"
- "**/*.qmd"
- "codecov.yml"
- ".envrc"
- ".codespellrc"
branches:
- main
- "*.x.x"
merge_group:
permissions:
# this allows extractions/setup-just to list releases for `just` at a higher
# rate limit while restricting GITHUB_TOKEN permissions elsewhere
contents: read
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
env:
FORCE_COLOR: "1"
ODBCSYSINI: "${{ github.workspace }}/ci/odbc"
HYPOTHESIS_PROFILE: "ci"
jobs:
test_bigquery_lite:
name: BigQuery ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
steps:
- name: checkout
uses: actions/checkout@v4
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: pip install 'poetry==1.8.2'
- name: install ibis
run: poetry install --without dev --without docs --extras bigquery
- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: run simple bigquery unit tests
run: just ci-check ibis/backends/bigquery/tests/unit
- name: upload code coverage
if: success()
continue-on-error: true
uses: codecov/codecov-action@v4
with:
flags: backend,bigquery,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
token: ${{ secrets.CODECOV_TOKEN }}
test_backends:
name: ${{ matrix.backend.title }} ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
backend:
- name: duckdb
title: DuckDB
serial: true
extras:
- duckdb
- deltalake
- examples
- decompiler
- polars
additional_deps:
- torch
# TODO: remove this duckdb job once the next duckdb_spatial is released
- name: duckdb
title: DuckDB + Geospatial
extras:
- geospatial
additional_deps:
- "duckdb==0.9.2"
- name: clickhouse
title: ClickHouse
services:
- clickhouse
extras:
- clickhouse
- examples
- name: pandas
title: Pandas
extras:
- pandas
- name: sqlite
title: SQLite
extras:
- sqlite
- name: datafusion
title: Datafusion
extras:
- datafusion
- name: polars
title: Polars
extras:
- polars
- deltalake
- name: mysql
title: MySQL
services:
- mysql
extras:
- mysql
- geospatial
sys-deps:
- libgeos-dev
- name: postgres
title: PostgreSQL
extras:
- postgres
- geospatial
services:
- postgres
sys-deps:
- libgeos-dev
- name: postgres
title: PostgreSQL + Torch
extras:
- postgres
- geospatial
- polars
additional_deps:
- torch
services:
- postgres
sys-deps:
- libgeos-dev
- name: risingwave
title: Risingwave
serial: true
services:
- risingwave
extras:
- risingwave
- name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
- name: mssql
title: MS SQL Server
extras:
- mssql
services:
- mssql
sys-deps:
- freetds-dev
- unixodbc-dev
- tdsodbc
- name: trino
title: Trino
extras:
- trino
services:
- trino
- name: druid
title: Druid
extras:
- druid
services:
- druid
- name: exasol
title: Exasol
serial: true
extras:
- exasol
services:
- exasol
- name: oracle
title: Oracle
serial: true
extras:
- oracle
services:
- oracle
- name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
services:
- flink
include:
- os: ubuntu-latest
python-version: "3.11.8"
backend:
name: dask
title: Dask
extras:
- dask
- os: ubuntu-latest
python-version: "3.9"
backend:
name: dask
title: Dask
extras:
- dask
exclude:
- os: windows-latest
backend:
name: mysql
title: MySQL
extras:
- mysql
- geospatial
services:
- mysql
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: clickhouse
title: ClickHouse
extras:
- clickhouse
- examples
services:
- clickhouse
- os: windows-latest
backend:
name: postgres
title: PostgreSQL
extras:
- postgres
- geospatial
services:
- postgres
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: risingwave
title: Risingwave
serial: true
services:
- risingwave
extras:
- risingwave
- os: windows-latest
backend:
name: postgres
title: PostgreSQL + Torch
extras:
- postgres
- geospatial
- polars
additional_deps:
- torch
services:
- postgres
sys-deps:
- libgeos-dev
- os: windows-latest
backend:
name: impala
title: Impala
serial: true
extras:
- impala
services:
- impala
- kudu
sys-deps:
- cmake
- ninja-build
- os: windows-latest
backend:
name: mssql
title: MS SQL Server
extras:
- mssql
services:
- mssql
sys-deps:
- freetds-dev
- unixodbc-dev
- tdsodbc
- os: windows-latest
backend:
name: trino
title: Trino
services:
- trino
extras:
- trino
- os: windows-latest
backend:
name: druid
title: Druid
extras:
- druid
services:
- druid
- os: windows-latest
backend:
name: oracle
title: Oracle
serial: true
extras:
- oracle
services:
- oracle
- os: windows-latest
backend:
name: flink
title: Flink
serial: true
extras:
- flink
additional_deps:
- "'apache-flink < 1.20.0'"
- "'pandas < 2.2'"
services:
- flink
- os: windows-latest
backend:
name: exasol
title: Exasol
serial: true
extras:
- exasol
services:
- exasol
steps:
- name: update and install system dependencies
if: matrix.os == 'ubuntu-latest' && matrix.backend.sys-deps != null
run: |
set -euo pipefail
sudo apt-get update -qq -y
sudo apt-get install -qq -y build-essential ${{ join(matrix.backend.sys-deps, ' ') }}
- name: install sqlite
if: matrix.os == 'windows-latest' && matrix.backend.name == 'sqlite'
run: choco install sqlite
- name: checkout
uses: actions/checkout@v4
- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: pip install 'poetry==1.8.2'
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
- name: install deps for broken avro-python setup
if: matrix.backend.name == 'flink'
run: poetry run pip install wheel
- name: install other deps
if: matrix.backend.additional_deps != null
run: poetry run pip install ${{ join(matrix.backend.additional_deps, ' ') }}
- name: show installed deps
run: poetry run pip list
- name: "run parallel tests: ${{ matrix.backend.name }}"
if: ${{ !matrix.backend.serial }}
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
env:
IBIS_TEST_IMPALA_HOST: localhost
IBIS_TEST_IMPALA_PORT: 21050
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
- name: "run serial tests: ${{ matrix.backend.name }}"
if: matrix.backend.serial
run: just ci-check -m ${{ matrix.backend.name }}
env:
FLINK_REMOTE_CLUSTER_ADDR: localhost
FLINK_REMOTE_CLUSTER_PORT: "8081"
IBIS_EXAMPLES_DATA: ${{ runner.temp }}/examples-${{ matrix.backend.name }}-${{ matrix.os }}-${{ steps.install_python.outputs.python-version }}
- name: check that no untracked files were produced
shell: bash
run: |
! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
if: success()
continue-on-error: true
uses: codecov/codecov-action@v4
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
token: ${{ secrets.CODECOV_TOKEN }}
- name: Show docker compose logs on fail
if: matrix.backend.services != null && failure()
run: docker compose logs
test_backends_min_version:
name: ${{ matrix.backend.title }} Min Version ${{ matrix.os }} python-${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.9"
- "3.11"
backend:
- name: dask
title: Dask
deps:
required:
- "numpy@1.23.2"
- "pyarrow@10.0.1"
optional:
- "dask[array,dataframe]@2022.9.1"
- "pandas@1.5.3"
extras:
- dask
- name: postgres
title: PostgreSQL
deps:
required:
- "numpy@1.23.2"
- "pyarrow@10.0.1"
optional:
- "psycopg2@2.8.4"
- "geopandas@0.6"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
exclude:
- os: windows-latest
backend:
name: postgres
title: PostgreSQL
deps:
required:
- "numpy@1.23.2"
- "pyarrow@10.0.1"
optional:
- "psycopg2@2.8.4"
- "geopandas@0.6"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
- python-version: "3.11"
backend:
name: postgres
title: PostgreSQL
deps:
required:
- "numpy@1.23.2"
- "pyarrow@10.0.1"
optional:
- "psycopg2@2.8.4"
- "geopandas@0.6"
- "Shapely@2"
services:
- postgres
extras:
- postgres
- geospatial
- python-version: "3.11"
backend:
name: dask
title: Dask
deps:
required:
- "numpy@1.23.2"
- "pyarrow@10.0.1"
optional:
- "dask[array,dataframe]@2022.9.1"
- "pandas@1.5.3"
extras:
- dask
steps:
- name: checkout
uses: actions/checkout@v4
- name: install libgeos for shapely
if: matrix.backend.name == 'postgres'
run: |
sudo apt-get update -y -qq
sudo apt-get install -qq -y build-essential libgeos-dev
- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: start services
if: matrix.backend.services != null
run: docker compose up --wait ${{ join(matrix.backend.services, ' ') }}
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.8.2'
- name: remove incompatible deps
# it requires a version of pandas that min versions are not compatible with
run: poetry remove lonboard deltalake
- name: install minimum versions of required deps
run: poetry add --lock ${{ join(matrix.backend.deps.required, ' ') }} --python="==${{ steps.install_python.outputs.python-version }}"
- name: install minimum versions of optional deps
run: poetry add --lock --optional ${{ join(matrix.backend.deps.optional, ' ') }} --python="==${{ steps.install_python.outputs.python-version }}"
- name: checkout the lock file
run: git checkout poetry.lock
- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update
- name: install ibis
run: poetry install --without dev --without docs --extras "${{ join(matrix.backend.extras, ' ') }}"
- name: run tests
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist=loadgroup
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
if: success()
continue-on-error: true
uses: codecov/codecov-action@v4
with:
flags: backend,${{ matrix.backend.name }},${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
token: ${{ secrets.CODECOV_TOKEN }}
- name: Show docker compose logs on fail
if: matrix.backend.services != null && failure()
run: docker compose logs
test_pyspark:
name: PySpark ${{ matrix.pyspark-version }} ubuntu-latest python-${{ matrix.python-version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- python-version: "3.9"
pyspark-version: "3.3"
deps:
- "'pandas@<2'"
- "'numpy@<1.24'"
- python-version: "3.11"
pyspark-version: "3.5"
deps:
- "'pandas@>2'"
- "'numpy@>1.24'"
- python-version: "3.12"
pyspark-version: "3.5"
deps:
- "'pandas@>2'"
- "'numpy@>1.24'"
steps:
- name: checkout
uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: microsoft
java-version: 17
- uses: extractions/setup-just@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: download backend data
run: just download-data
- name: install python
uses: actions/setup-python@v5
id: install_python
with:
python-version: ${{ matrix.python-version }}
- name: install poetry
run: python -m pip install --upgrade pip 'poetry==1.8.2'
- name: remove lonboard
# it requires a version of pandas that pyspark is not compatible with
run: poetry remove lonboard
- name: install exact versions of pyspark, pandas and numpy
run: poetry add --lock 'pyspark@${{ matrix.pyspark-version }}' ${{ join(matrix.deps, ' ') }}
- name: checkout the lock file
run: git checkout poetry.lock
- name: lock with no updates
# poetry add is aggressive and will update other dependencies like
# numpy and pandas so we keep the pyproject.toml edits and then relock
# without updating anything except the requested versions
run: poetry lock --no-update
- name: install ibis
run: poetry install --without dev --without docs --extras pyspark
- name: install delta-spark
if: matrix.pyspark-version == '3.5'
run: poetry run pip install delta-spark
- name: run tests
run: just ci-check -m pyspark
- name: check that no untracked files were produced
shell: bash
run: git checkout poetry.lock pyproject.toml && ! git status --porcelain | tee /dev/stderr | grep .
- name: upload code coverage
# only upload coverage for jobs that aren't mostly xfails
if: success()
continue-on-error: true
uses: codecov/codecov-action@v4
with:
flags: backend,pyspark,${{ runner.os }},python-${{ steps.install_python.outputs.python-version }}
token: ${{ secrets.CODECOV_TOKEN }}
backends:
# this job exists so that we can use a single job from this workflow to gate merging
runs-on: ubuntu-latest
needs:
- test_backends_min_version
- test_backends
- test_pyspark
steps:
- run: exit 0