diff --git a/.env b/.env index c9cd6c8094ed8..014bad3fe2a7a 100644 --- a/.env +++ b/.env @@ -58,8 +58,8 @@ CUDA=11.0.3 DASK=latest DOTNET=7.0 GCC_VERSION="" -GO=1.17 -STATICCHECK=v0.2.2 +GO=1.19.13 +STATICCHECK=v0.4.5 HDFS=3.2.1 JDK=8 KARTOTHEK=latest diff --git a/.gitattributes b/.gitattributes index 1a5b156b491fd..69f4139c4e4f4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,9 +1,11 @@ +cpp/src/arrow/util/bpacking_*_generated.h linguist-generated=true +cpp/src/generated/*.cpp linguist-generated=true +cpp/src/generated/*.h linguist-generated=true +go/**/*.s linguist-generated=true +go/arrow/unionmode_string.go linguist-generated=true r/R/RcppExports.R linguist-generated=true r/R/arrowExports.R linguist-generated=true r/src/RcppExports.cpp linguist-generated=true r/src/arrowExports.cpp linguist-generated=true r/man/*.Rd linguist-generated=true -cpp/src/generated/*.h linguist-generated=true r/NEWS.md merge=union -go/**/*.s linguist-generated=true -go/arrow/unionmode_string.go linguist-generated=true diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index d337ec797cf90..bc11bb42366e2 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -50,7 +50,7 @@ jobs: timeout-minutes: 15 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Git Fixup diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 43eb86ebe9434..cc9e02d955afd 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -35,7 +35,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: arrow # fetch the tags for version number generation @@ -60,7 +60,7 @@ jobs: if: startsWith(github.event.comment.body, '@github-actions autotune') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/pr-fetch@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} @@ -158,7 +158,7 @@ jobs: if: startsWith(github.event.comment.body, '@github-actions rebase') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/pr-fetch@v2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 41032fc1b08fc..fc8d0bad58e9f 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -29,6 +29,7 @@ on: - 'ci/scripts/msys2_*' - 'ci/scripts/util_*' - 'cpp/**' + - 'docker-compose.yml' - 'format/Flight.proto' pull_request: paths: @@ -41,6 +42,7 @@ on: - 'ci/scripts/msys2_*' - 'ci/scripts/util_*' - 'cpp/**' + - 'docker-compose.yml' - 'format/Flight.proto' concurrency: @@ -94,7 +96,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -136,7 +138,7 @@ jobs: timeout-minutes: 45 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -167,6 +169,7 @@ jobs: ARROW_ORC: ON ARROW_PARQUET: ON ARROW_S3: ON + ARROW_SUBSTRAIT: ON ARROW_WITH_BROTLI: ON ARROW_WITH_BZ2: ON ARROW_WITH_LZ4: ON @@ -183,7 +186,7 @@ jobs: sysctl -a | grep cpu sysctl -a | grep "hw.optional" - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -257,6 +260,7 @@ jobs: ARROW_ORC: ON ARROW_PARQUET: ON ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} + ARROW_SUBSTRAIT: ON ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF ARROW_WITH_BROTLI: OFF @@ -286,7 +290,7 @@ jobs: - name: Install Dependencies run: choco install -y --no-progress openssl - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -358,6 +362,7 @@ jobs: ARROW_JEMALLOC: OFF ARROW_PARQUET: ON ARROW_S3: ON + ARROW_SUBSTRAIT: ON ARROW_USE_GLOG: OFF ARROW_VERBOSE_THIRDPARTY_BUILD: OFF ARROW_WITH_BROTLI: ON @@ -388,7 +393,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 2a739e6f95bde..3d1e513bc609c 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -53,7 +53,7 @@ jobs: with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install Source Link @@ -81,7 +81,7 @@ jobs: with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install Source Link @@ -108,7 +108,7 @@ jobs: with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install Source Link diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 119d11d9a399a..cee3c74762c3c 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -37,7 +37,7 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python @@ -70,7 +70,7 @@ jobs: name: Source Release and Merge Script on ${{ matrix.runs-on }} runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 10 + timeout-minutes: 15 strategy: fail-fast: false matrix: @@ -78,13 +78,13 @@ jobs: - macos-latest - ubuntu-latest env: - GIT_AUTHOR_NAME: Github Actions - GIT_AUTHOR_EMAIL: github@actions - GIT_COMMITTER_NAME: Github Actions - GIT_COMMITTER_EMAIL: github@actions + GIT_AUTHOR_NAME: "github-actions[bot]" + GIT_AUTHOR_EMAIL: "github-actions[bot]@users.noreply.github.com" + GIT_COMMITTER_NAME: "github-actions[bot]" + GIT_COMMITTER_EMAIL: "github-actions[bot]@users.noreply.github.com" steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install Python diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 23db5e0f53bbd..e5d2a77c5a8a2 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -43,7 +43,7 @@ jobs: name: Process runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: apache/arrow ref: main diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a4629001f8e48..a1ac4c3067dae 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -38,7 +38,7 @@ jobs: UBUNTU: "22.04" steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Free up disk space diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index b1e6a21eada7b..74e6eabe24795 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -27,6 +27,7 @@ on: - 'ci/docker/conda-python.dockerfile' - 'ci/scripts/cpp_build.sh' - 'ci/scripts/python_build.sh' + - 'docker-compose.yml' permissions: contents: read @@ -46,7 +47,7 @@ jobs: PYTHON: "3.9" steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Cache Docker Volumes diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 4aa3eef7852b9..ad8fedb9bd9e4 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -23,6 +23,7 @@ on: - '.github/workflows/go.yml' - 'ci/docker/*_go.dockerfile' - 'ci/scripts/go_*' + - 'docker-compose.yml' - 'go/**' pull_request: paths: @@ -30,6 +31,7 @@ on: - 'ci/docker/*_go.dockerfile' - 'ci/docker/**' - 'ci/scripts/go_*' + - 'docker-compose.yml' - 'go/**' concurrency: @@ -52,31 +54,26 @@ jobs: include: - arch-label: AMD64 arch: amd64 - go: 1.17 + go: 1.19 runs-on: ubuntu-latest - staticcheck: v0.2.2 - arch-label: AMD64 arch: amd64 - go: 1.18 + go: '1.20' runs-on: ubuntu-latest - staticcheck: v0.3.3 - arch-label: ARM64 arch: arm64v8 - go: 1.17 - staticcheck: v0.2.2 + go: 1.19 runs-on: ["self-hosted", "arm", "linux"] - arch-label: ARM64 arch: arm64v8 - go: 1.18 - staticcheck: v0.3.3 + go: '1.20' runs-on: ["self-hosted", "arm", "linux"] env: ARCH: ${{ matrix.arch }} GO: ${{ matrix.go }} - STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -137,13 +134,13 @@ jobs: timeout-minutes: 20 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.18 + go-version: 1.19 cache: true cache-dependency-path: go/go.sum - name: Run build @@ -159,18 +156,12 @@ jobs: strategy: fail-fast: false matrix: - go: [1.17, 1.18] - include: - - go: 1.17 - staticcheck: v0.2.2 - - go: 1.18 - staticcheck: v0.3.3 + go: [1.19, '1.20'] env: GO: ${{ matrix.go }} - STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -206,18 +197,12 @@ jobs: strategy: fail-fast: false matrix: - go: [1.17, 1.18] - include: - - go: 1.17 - staticcheck: v0.2.2 - - go: 1.18 - staticcheck: v0.3.3 + go: [1.19, '1.20'] env: GO: ${{ matrix.go }} - STATICCHECK: ${{ matrix.staticcheck }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python @@ -247,19 +232,14 @@ jobs: name: AMD64 Windows 2019 Go ${{ matrix.go }} runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 + timeout-minutes: 25 strategy: fail-fast: false matrix: - go: [1.17, 1.18] - include: - - go: 1.17 - staticcheck: v0.2.2 - - go: 1.18 - staticcheck: v0.3.3 + go: [1.19, '1.20'] steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -270,7 +250,10 @@ jobs: cache: true cache-dependency-path: go/go.sum - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} + shell: bash + run: | + . .env + go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) @@ -286,15 +269,10 @@ jobs: strategy: fail-fast: false matrix: - go: [1.17, 1.18] - include: - - go: 1.17 - staticcheck: v0.2.2 - - go: 1.18 - staticcheck: v0.3.3 + go: [1.19, '1.20'] steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -304,8 +282,10 @@ jobs: go-version: ${{ matrix.go }} cache: true cache-dependency-path: go/go.sum - - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} + - name: Install staticcheck + run: | + . .env + go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) @@ -347,17 +327,12 @@ jobs: strategy: fail-fast: false matrix: - go: [1.17, 1.18] - include: - - go: 1.17 - staticcheck: v0.2.2 - - go: 1.18 - staticcheck: v0.3.3 + go: [1.19, '1.20'] env: ARROW_GO_TESTCGO: "1" steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -371,7 +346,9 @@ jobs: shell: bash run: brew install apache-arrow pkg-config - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }} + run: | + . .env + go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - name: Add To pkg config path shell: bash run: | @@ -407,7 +384,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -428,11 +405,14 @@ jobs: - name: Install go uses: actions/setup-go@v4 with: - go-version: '1.18' + go-version: '1.19' cache: true cache-dependency-path: go/go.sum - name: Install staticcheck - run: go install honnef.co/go/tools/cmd/staticcheck@v0.3.3 + shell: bash + run: | + . .env + go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - name: Build shell: bash run: ci/scripts/go_build.sh $(pwd) @@ -449,7 +429,7 @@ jobs: timeout-minutes: 60 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 738b6ed3f1987..430b0bb2822e7 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -23,6 +23,7 @@ on: - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' + - 'docker-compose.yml' - 'go/**' - 'integration/**' - 'js/**' @@ -34,6 +35,7 @@ on: - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' + - 'docker-compose.yml' - 'go/**' - 'integration/**' - 'js/**' @@ -60,15 +62,18 @@ jobs: timeout-minutes: 60 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive - name: Checkout Arrow Rust - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: apache/arrow-rs path: rust + - name: Free up disk space + run: | + ci/scripts/util_free_space.sh - name: Cache Docker Volumes uses: actions/cache@v3 with: diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 8a66d7858af37..4c144955fbe93 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -24,6 +24,7 @@ on: - 'ci/docker/*java*' - 'ci/scripts/java*.sh' - 'ci/scripts/util_*.sh' + - 'docker-compose.yml' - 'format/Flight.proto' - 'java/**' pull_request: @@ -32,6 +33,7 @@ on: - 'ci/docker/*java*' - 'ci/scripts/java*.sh' - 'ci/scripts/util_*.sh' + - 'docker-compose.yml' - 'format/Flight.proto' - 'java/**' @@ -78,7 +80,7 @@ jobs: MAVEN: ${{ matrix.maven }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -127,7 +129,7 @@ jobs: distribution: 'zulu' java-version: ${{ matrix.jdk }} - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -154,7 +156,7 @@ jobs: java-version: ${{ matrix.jdk }} distribution: 'temurin' - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 2787ac1fc71a4..0e75468cd374b 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -25,6 +25,7 @@ on: - 'ci/scripts/cpp_build.sh' - 'ci/scripts/java_*' - 'cpp/**' + - 'docker-compose.yml' - 'java/**' pull_request: paths: @@ -33,6 +34,7 @@ on: - 'ci/scripts/cpp_build.sh' - 'ci/scripts/java_*' - 'cpp/**' + - 'docker-compose.yml' - 'java/**' concurrency: @@ -54,7 +56,7 @@ jobs: timeout-minutes: 90 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -94,7 +96,7 @@ jobs: timeout-minutes: 90 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 5dfef4626597f..4440d36d18f73 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -43,7 +43,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 1 path: arrow @@ -51,7 +51,7 @@ jobs: ref: main submodules: recursive - name: Checkout Crossbow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 path: crossbow diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 7e8ef31b49cb6..9be11596d845f 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -47,7 +47,7 @@ jobs: timeout-minutes: 60 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python @@ -87,7 +87,7 @@ jobs: node: [18] steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Jest Cache @@ -117,7 +117,7 @@ jobs: node: [18] steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Jest Cache diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 427bb813fdbe9..6921e12213b5b 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -46,13 +46,15 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install ninja-build run: sudo apt-get install ninja-build - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -92,13 +94,15 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install ninja-build run: brew install ninja - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -130,11 +134,13 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 9d0d7ec88b857..617f3f2e017a3 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -73,7 +73,7 @@ jobs: curl -sL -o committers.yml $url echo "committers_path=$(pwd)/committers.yml" >> $GITHUB_OUTPUT - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: arrow repository: apache/arrow diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index bf7d2827f6ba7..7a8fd8d10c235 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -23,12 +23,14 @@ on: - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' + - 'docker-compose.yml' - 'python/**' pull_request: paths: - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' + - 'docker-compose.yml' - 'python/**' concurrency: @@ -87,7 +89,7 @@ jobs: NUMPY: ${{ matrix.numpy || 'latest' }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -154,7 +156,7 @@ jobs: MACOSX_DEPLOYMENT_TARGET: 10.15 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 277080bec5ce5..a8680aea56d48 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -27,6 +27,7 @@ on: - "ci/etc/rprofile" - "ci/docker/**" - "cpp/**" + - 'docker-compose.yml' - "r/**" pull_request: paths: @@ -37,6 +38,7 @@ on: - "ci/etc/rprofile" - "ci/docker/**" - "cpp/**" + - 'docker-compose.yml' - "r/**" concurrency: @@ -66,7 +68,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -137,7 +139,7 @@ jobs: DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -195,7 +197,7 @@ jobs: steps: - run: git config --global core.autocrlf false - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup ccache @@ -248,7 +250,7 @@ jobs: steps: - run: git config --global core.autocrlf false - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - run: mkdir r/windows @@ -305,7 +307,8 @@ jobs: RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"), MAKEFLAGS = paste0("-j", parallel::detectCores()), ARROW_R_DEV = TRUE, - "_R_CHECK_FORCE_SUGGESTS_" = FALSE + "_R_CHECK_FORCE_SUGGESTS_" = FALSE, + "_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE ) rcmdcheck::rcmdcheck(".", build_args = '--no-build-vignettes', diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 0b16c16e9430d..7f21d4658e007 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -45,7 +45,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 1 path: arrow @@ -53,7 +53,7 @@ jobs: ref: main submodules: recursive - name: Checkout Crossbow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 path: crossbow diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 1e744975578f5..2e4b98c2428e9 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -29,6 +29,7 @@ on: - 'ci/scripts/util_*' - 'c_glib/**' - 'cpp/**' + - 'docker-compose.yml' - 'ruby/**' pull_request: paths: @@ -41,6 +42,7 @@ on: - 'ci/scripts/util_*' - 'c_glib/**' - 'cpp/**' + - 'docker-compose.yml' - 'ruby/**' concurrency: @@ -69,7 +71,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -121,6 +123,7 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: OFF ARROW_BUILD_UTILITIES: OFF + ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_FLIGHT_SQL: ON ARROW_GANDIVA: ON @@ -139,7 +142,7 @@ jobs: XML_CATALOG_FILES: /usr/local/etc/xml/catalog steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive @@ -206,6 +209,7 @@ jobs: ARROW_BUILD_TESTS: OFF ARROW_BUILD_UTILITIES: OFF ARROW_BUILD_TYPE: release + ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_FLIGHT_SQL: ON ARROW_GANDIVA: ON @@ -241,7 +245,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 2169dfe525898..825921ac6fa24 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -23,12 +23,14 @@ on: - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' + - 'docker-compose.yml' - 'swift/**' pull_request: paths: - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' + - 'docker-compose.yml' - 'swift/**' concurrency: @@ -49,7 +51,7 @@ jobs: timeout-minutes: 15 steps: - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 submodules: recursive diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build index fd9e8f0e7b139..1718e2fc990d5 100644 --- a/c_glib/arrow-cuda-glib/meson.build +++ b/c_glib/arrow-cuda-glib/meson.build @@ -43,6 +43,7 @@ libarrow_cuda_glib = library('arrow-cuda-glib', sources: sources, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index afdbbd79a1fcd..4037cf00b7a13 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -70,6 +70,7 @@ libarrow_dataset_glib = library('arrow-dataset-glib', sources: sources + enums, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/arrow-flight-glib/meson.build b/c_glib/arrow-flight-glib/meson.build index 3eb3177a522eb..b869fd226b86a 100644 --- a/c_glib/arrow-flight-glib/meson.build +++ b/c_glib/arrow-flight-glib/meson.build @@ -48,6 +48,7 @@ libarrow_flight_glib = library('arrow-flight-glib', sources: sources, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/arrow-flight-sql-glib/meson.build b/c_glib/arrow-flight-sql-glib/meson.build index 21cec3f0d101a..f1ea6d67cb04e 100644 --- a/c_glib/arrow-flight-sql-glib/meson.build +++ b/c_glib/arrow-flight-sql-glib/meson.build @@ -45,6 +45,7 @@ libarrow_flight_sql_glib = library('arrow-flight-sql-glib', sources: sources, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 7fe005f94a5bb..9692f277d183f 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3346,7 +3346,7 @@ garrow_set_lookup_options_get_property(GObject *object, g_value_set_object(value, priv->value_set); break; case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: - g_value_set_boolean(value, options->skip_nulls); + g_value_set_boolean(value, options->skip_nulls.has_value() && options->skip_nulls.value()); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -3398,13 +3398,11 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) * * Since: 6.0.0 */ - spec = g_param_spec_boolean("skip-nulls", - "Skip NULLs", - "Whether NULLs are skipped or not", - options.skip_nulls, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, - PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, + auto skip_nulls = (options.skip_nulls.has_value() && options.skip_nulls.value()); + spec = + g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not", + skip_nulls, static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, spec); } @@ -6458,9 +6456,10 @@ garrow_set_lookup_options_new_raw( arrow_copied_options.get()); auto value_set = garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set)); + auto skip_nulls = (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()); auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, "value-set", value_set, - "skip-nulls", arrow_options->skip_nulls, + "skip-nulls", skip_nulls, NULL); return GARROW_SET_LOOKUP_OPTIONS(options); } diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 2b9c3c2472c84..b26dcc4c080eb 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -249,6 +249,7 @@ libarrow_glib = library('arrow-glib', sources: sources + enums, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index c397bd1f9f9e8..52729f64da007 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -85,6 +85,7 @@ libgandiva_glib = library('gandiva-glib', sources: sources + enums, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build index 15a2d56383ba0..08288484bf9b4 100644 --- a/c_glib/parquet-glib/meson.build +++ b/c_glib/parquet-glib/meson.build @@ -55,6 +55,7 @@ libparquet_glib = library('parquet-glib', sources: sources, install: true, dependencies: dependencies, + implicit_include_directories: false, include_directories: base_include_directories, soversion: so_version, version: library_version) diff --git a/c_glib/test/test-array-datum.rb b/c_glib/test/test-array-datum.rb index 623e5589ce40b..1b2c9f91e2aa2 100644 --- a/c_glib/test/test-array-datum.rb +++ b/c_glib/test/test-array-datum.rb @@ -61,7 +61,7 @@ def test_false end def test_to_string - assert_equal("Array", @datum.to_s) + assert_equal("Array([\n" + " true,\n" + " false\n" + "])", @datum.to_s) end def test_value diff --git a/c_glib/test/test-chunked-array-datum.rb b/c_glib/test/test-chunked-array-datum.rb index 76317315327e8..b82f3eed8a7af 100644 --- a/c_glib/test/test-chunked-array-datum.rb +++ b/c_glib/test/test-chunked-array-datum.rb @@ -49,7 +49,7 @@ def test_false end def test_to_string - assert_equal("ChunkedArray", @datum.to_s) + assert_equal("ChunkedArray([\n" + " [\n" + " true,\n" + " false\n" + " ]\n" + "])", @datum.to_s) end def test_value diff --git a/c_glib/test/test-record-batch-datum.rb b/c_glib/test/test-record-batch-datum.rb index 33eb793ba869a..ec572e0f13023 100644 --- a/c_glib/test/test-record-batch-datum.rb +++ b/c_glib/test/test-record-batch-datum.rb @@ -49,7 +49,7 @@ def test_false end def test_to_string - assert_equal("RecordBatch", @datum.to_s) + assert_equal("RecordBatch(visible: [\n" + " true,\n" + " false\n" + " ]\n" + ")", @datum.to_s) end def test_value diff --git a/c_glib/test/test-scalar-datum.rb b/c_glib/test/test-scalar-datum.rb index 17e5d6b061cc7..32a5331518d8b 100644 --- a/c_glib/test/test-scalar-datum.rb +++ b/c_glib/test/test-scalar-datum.rb @@ -60,7 +60,7 @@ def test_false end def test_to_string - assert_equal("Scalar", @datum.to_s) + assert_equal("Scalar(true)", @datum.to_s) end def test_value diff --git a/c_glib/test/test-table-datum.rb b/c_glib/test/test-table-datum.rb index 7ff3997e88a37..c34ecf6314118 100644 --- a/c_glib/test/test-table-datum.rb +++ b/c_glib/test/test-table-datum.rb @@ -49,7 +49,16 @@ def test_false end def test_to_string - assert_equal("Table", @datum.to_s) + assert_equal("Table(visible: bool\n" + + "----\n" + + "visible:\n" + + " [\n" + + " [\n" + + " true,\n" + + " false\n" + + " ]\n" + + " ]\n" + + ")", @datum.to_s) end def test_value diff --git a/ci/conda_env_archery.txt b/ci/conda_env_archery.txt index ace7a42acb026..40875e0a55039 100644 --- a/ci/conda_env_archery.txt +++ b/ci/conda_env_archery.txt @@ -25,7 +25,7 @@ jira pygit2 pygithub ruamel.yaml -setuptools_scm +setuptools_scm<8.0.0 toolz # benchmark diff --git a/ci/conda_env_crossbow.txt b/ci/conda_env_crossbow.txt index 347294650ca28..59b799720f12b 100644 --- a/ci/conda_env_crossbow.txt +++ b/ci/conda_env_crossbow.txt @@ -21,5 +21,5 @@ jinja2 jira pygit2 ruamel.yaml -setuptools_scm +setuptools_scm<8.0.0 toolz diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 4ae5c3614a1dc..d914229ec58c0 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -28,4 +28,4 @@ pytest-faulthandler pytest-lazy-fixture s3fs>=2021.8.0 setuptools -setuptools_scm +setuptools_scm<8.0.0 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index bd08937ae81be..af1bfe9b780f4 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme==0.8 +pydata-sphinx-theme sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/ci/docker/alpine-linux-3.16-cpp.dockerfile b/ci/docker/alpine-linux-3.16-cpp.dockerfile index f269fa548c141..8828e717a53a1 100644 --- a/ci/docker/alpine-linux-3.16-cpp.dockerfile +++ b/ci/docker/alpine-linux-3.16-cpp.dockerfile @@ -85,6 +85,7 @@ ENV ARROW_ACERO=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index 2997983ca320d..b635e5e93455c 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -61,6 +61,7 @@ ENV ARROW_ACERO=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index da315776b58f2..a306790b5cb6d 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -24,7 +24,7 @@ ARG maven=3.5 ARG node=16 ARG yarn=1.22 ARG jdk=8 -ARG go=1.15 +ARG go=1.19.13 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ @@ -57,6 +57,7 @@ ENV DOTNET_ROOT=/opt/dotnet \ RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 7.0 -InstallDir /opt/dotnet ENV ARROW_ACERO=OFF \ + ARROW_AZURE=OFF \ ARROW_BUILD_INTEGRATION=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ @@ -67,11 +68,13 @@ ENV ARROW_ACERO=OFF \ ARROW_FLIGHT=ON \ ARROW_FLIGHT_SQL=ON \ ARROW_GANDIVA=OFF \ + ARROW_GCS=OFF \ ARROW_HDFS=OFF \ ARROW_JEMALLOC=OFF \ ARROW_JSON=OFF \ ARROW_ORC=OFF \ ARROW_PARQUET=OFF \ ARROW_S3=OFF \ + ARROW_SUBSTRAIT=OFF \ ARROW_USE_GLOG=OFF \ CMAKE_UNITY_BUILD=ON diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile new file mode 100644 index 0000000000000..d67ef677276c7 --- /dev/null +++ b/ci/docker/conda-python-cython2.dockerfile @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG repo +ARG arch +ARG python=3.8 +FROM ${repo}:${arch}-conda-python-${python} + +RUN mamba install -q -y "cython<3" && \ + mamba clean --all diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index 400106f189e3a..44840110817e9 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -23,3 +23,15 @@ FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest COPY ci/scripts/install_dask.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_dask.sh ${dask} + +ENV ARROW_ACERO=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=OFF \ + ARROW_FLIGHT_SQL=OFF \ + ARROW_FILESYSTEM=ON \ + ARROW_GANDIVA=OFF \ + ARROW_ORC=ON \ + ARROW_SUBSTRAIT=OFF \ + ARROW_TENSORFLOW=OFF diff --git a/ci/docker/conda-python-substrait.dockerfile b/ci/docker/conda-python-substrait.dockerfile index 33f6957a86f6e..191795f253000 100644 --- a/ci/docker/conda-python-substrait.dockerfile +++ b/ci/docker/conda-python-substrait.dockerfile @@ -36,13 +36,16 @@ RUN mamba install -q -y \ ARG substrait=latest COPY ci/scripts/install_substrait_consumer.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_substrait_consumer.sh + ENV ARROW_ACERO=ON \ - ARROW_BUILD_TESTS=ON \ ARROW_COMPUTE=ON \ ARROW_CSV=ON \ ARROW_DATASET=ON \ ARROW_FILESYSTEM=ON \ + ARROW_FLIGHT=OFF \ + ARROW_FLIGHT_SQL=OFF \ + ARROW_GANDIVA=OFF \ ARROW_JSON=ON \ - ARROW_SUBSTRAIT=ON - -RUN /arrow/ci/scripts/install_substrait_consumer.sh + ARROW_SUBSTRAIT=ON \ + ARROW_TESTING=OFF diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 21e57228f44f0..ca0ceee5f9227 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -46,5 +46,6 @@ ENV ARROW_ACERO=ON \ ARROW_GDB=ON \ ARROW_HDFS=ON \ ARROW_JSON=ON \ + ARROW_SUBSTRAIT=OFF \ ARROW_TENSORFLOW=ON \ ARROW_USE_GLOG=OFF diff --git a/ci/docker/debian-11-cpp.dockerfile b/ci/docker/debian-11-cpp.dockerfile index 00adc6bd6b3c9..46824054624ab 100644 --- a/ci/docker/debian-11-cpp.dockerfile +++ b/ci/docker/debian-11-cpp.dockerfile @@ -108,6 +108,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/docker/debian-11-go.dockerfile b/ci/docker/debian-11-go.dockerfile index 9f75bf23fddf2..de8186b9b8e1c 100644 --- a/ci/docker/debian-11-go.dockerfile +++ b/ci/docker/debian-11-go.dockerfile @@ -16,8 +16,8 @@ # under the License. ARG arch=amd64 -ARG go=1.17 -ARG staticcheck=v0.2.2 +ARG go=1.19 +ARG staticcheck=v0.4.5 FROM ${arch}/golang:${go}-bullseye # FROM collects all the args, get back the staticcheck version arg diff --git a/ci/docker/fedora-35-cpp.dockerfile b/ci/docker/fedora-35-cpp.dockerfile index 668e35b4435ba..aefa25663ba14 100644 --- a/ci/docker/fedora-35-cpp.dockerfile +++ b/ci/docker/fedora-35-cpp.dockerfile @@ -87,6 +87,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_CCACHE=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 125f1f48d482e..1e0a1e4807402 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -99,6 +99,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -160,6 +161,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ ARROW_USE_UBSAN=OFF \ @@ -172,6 +174,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 0840b3fa5c68d..fffafe2b0521d 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -98,6 +98,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -184,6 +185,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ ARROW_USE_UBSAN=OFF \ @@ -196,6 +198,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 5a89fafc6015e..1f5596e2a50e1 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -104,7 +104,7 @@ cmake \ -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_DATASET=${ARROW_DATASET:-ON} \ + -DARROW_DATASET=${ARROW_DATASET:-OFF} \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ @@ -129,7 +129,7 @@ cmake \ -DARROW_S3=${ARROW_S3:-OFF} \ -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-ON} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ @@ -152,6 +152,7 @@ cmake \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ -DBOOST_SOURCE=${BOOST_SOURCE:-} \ -DBrotli_SOURCE=${Brotli_SOURCE:-} \ @@ -189,6 +190,9 @@ cmake \ export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} time cmake --build . --target install +# Save disk space by removing large temporary build products +find . -name "*.o" -delete + popd if [ -x "$(command -v ldconfig)" ]; then diff --git a/ci/scripts/go_bench_adapt.py b/ci/scripts/go_bench_adapt.py index 103712f6d8f83..a05e25de8bdd3 100644 --- a/ci/scripts/go_bench_adapt.py +++ b/ci/scripts/go_bench_adapt.py @@ -20,7 +20,7 @@ import uuid import logging from pathlib import Path -from typing import List, Optional, Dict +from typing import List from benchadapt import BenchmarkResult from benchadapt.adapters import BenchmarkAdapter @@ -33,9 +33,9 @@ # `github_commit_info` is meant to communicate GitHub-flavored commit # information to Conbench. See -# https://github.com/conbench/conbench/blob/7c4968e631ecdc064559c86a1174a1353713b700/benchadapt/python/benchadapt/result.py#L66 +# https://github.com/conbench/conbench/blob/cf7931f/benchadapt/python/benchadapt/result.py#L66 # for a specification. -github_commit_info: Optional[Dict] = None +github_commit_info = {"repository": "https://github.com/apache/arrow"} if os.environ.get("CONBENCH_REF") == "main": # Assume GitHub Actions CI. The environment variable lookups below are @@ -53,7 +53,7 @@ # This is probably a local dev environment, for testing. In this case, it # does usually not make sense to provide commit information (not a - # controlled CI environment). Explicitly keep `github_commit_info=None` to + # controlled CI environment). Explicitly leave out "commit" and "pr_number" to # reflect that (to not send commit information). # Reflect 'local dev' scenario in run_reason. Allow user to (optionally) @@ -95,7 +95,7 @@ def _transform_results(self) -> List[BenchmarkResult]: batch_id=batch_id, stats={ "data": [data], - "unit": "b/s", + "unit": "B/s", "times": [time], "time_unit": "i/s", "iterations": benchmark["Runs"], @@ -114,10 +114,9 @@ def _transform_results(self) -> List[BenchmarkResult]: run_reason=run_reason, github=github_commit_info, ) - if github_commit_info is not None: - parsed.run_name = ( - f"{parsed.run_reason}: {github_commit_info['commit']}" - ) + parsed.run_name = ( + f"{parsed.run_reason}: {github_commit_info.get('commit')}" + ) parsed_results.append(parsed) return parsed_results diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index 3c8cc0f4ee2e2..2a38901337c56 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -41,3 +41,22 @@ pushd ${source_dir}/parquet go install -v ./... popd + +if [[ -n "${ARROW_GO_INTEGRATION}" ]]; then + pushd ${source_dir}/arrow/internal/cdata_integration + + case "$(uname)" in + Linux) + go_lib="arrow_go_integration.so" + ;; + Darwin) + go_lib="arrow_go_integration.so" + ;; + MINGW*) + go_lib="arrow_go_integration.dll" + ;; + esac + go build -tags cdata_integration,assert -buildmode=c-shared -o ${go_lib} . + + popd +fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 30cbb2d63791c..a165f8027bf8f 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -22,10 +22,12 @@ set -ex arrow_dir=${1} gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration -pip install -e $arrow_dir/dev/archery +pip install -e $arrow_dir/dev/archery[integration] # Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 -archery integration \ +time archery integration \ + --run-c-data \ + --run-ipc \ --run-flight \ --with-cpp=1 \ --with-csharp=1 \ diff --git a/ci/scripts/integration_substrait.sh b/ci/scripts/integration_substrait.sh index ce4c68ceb7a86..f7208ae113814 100755 --- a/ci/scripts/integration_substrait.sh +++ b/ci/scripts/integration_substrait.sh @@ -28,4 +28,4 @@ python -c "from substrait_consumer.consumers import AceroConsumer" echo "Executing pytest" cd consumer-testing -pytest substrait_consumer/tests/functional/extension_functions/test_boolean_functions.py --producer IsthmusProducer --consumer AceroConsumer +pytest -r s substrait_consumer/tests/functional/extension_functions/test_boolean_functions.py --producer IsthmusProducer --consumer AceroConsumer diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index c97733257a721..d61f74f0b7ca1 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -32,12 +32,14 @@ yarn lint:ci yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then - if [ "$(git config --get remote.origin.url)" == "https://github.com/apache/arrow.git" ]; then - yarn doc - elif [ "$(git config --get remote.upstream.url)" == "https://github.com/apache/arrow.git" ]; then - yarn doc --gitRemote upstream - elif [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + # If apache or upstream are defined use those as remote. + # Otherwise use origin which could be a fork on PRs. + if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache + elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + yarn doc --gitRemote upstream + elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." exit 0 diff --git a/ci/scripts/matlab_build.sh b/ci/scripts/matlab_build.sh index 235002da3afc6..d3f86adbb8a2b 100755 --- a/ci/scripts/matlab_build.sh +++ b/ci/scripts/matlab_build.sh @@ -29,8 +29,6 @@ cmake \ -S ${source_dir} \ -B ${build_dir} \ -G Ninja \ - -D MATLAB_BUILD_TESTS=ON \ -D CMAKE_INSTALL_PREFIX=${install_dir} \ -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF cmake --build ${build_dir} --config Release --target install -ctest --test-dir ${build_dir} diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index b5b5b75b9679a..c0a27e6e705e9 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -54,7 +54,7 @@ fi export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja} export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} -export PYARROW_WITH_ACERO=${ARROW_ACERO:-ON} +export PYARROW_WITH_ACERO=${ARROW_ACERO:-OFF} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} diff --git a/ci/scripts/r_deps.sh b/ci/scripts/r_deps.sh index 50b3043978716..2b432f768c2b4 100755 --- a/ci/scripts/r_deps.sh +++ b/ci/scripts/r_deps.sh @@ -47,7 +47,7 @@ ${R_BIN} -e "options(warn=2); install.packages('remotes'); remotes::install_cran # Install DuckDB from github when requested if [ ${R_DUCKDB_DEV} == "true" ]; then - ${R_BIN} -e "remotes::install_github('duckdb/duckdb', subdir = '/tools/rpkg', build = FALSE)" + ${R_BIN} -e "remotes::install_github('duckdb/duckdb-r', build = FALSE)" fi # Separately install the optional/test dependencies but don't error on them, diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index d7df44e2e43a9..e0c2ce9efedd8 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -65,6 +65,8 @@ export _R_CHECK_DONTTEST_EXAMPLES_=TRUE export _R_CHECK_FORCE_SUGGESTS_=FALSE export _R_CHECK_LIMIT_CORES_=FALSE export _R_CHECK_TESTS_NLINES_=0 +# This can cause failures on CRAN but needs to be set here so issues an error not a warning +export _R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_=TRUE # By default, aws-sdk tries to contact a non-existing local ip host # to retrieve metadata. Disable this so that S3FileSystem tests run faster. diff --git a/ci/scripts/release_test.sh b/ci/scripts/release_test.sh index ae2ab328884b3..583d9618c657b 100755 --- a/ci/scripts/release_test.sh +++ b/ci/scripts/release_test.sh @@ -23,6 +23,6 @@ arrow_dir=${1} pushd ${arrow_dir} -dev/release/run-test.rb +dev/release/run-test.rb -vv popd diff --git a/ci/scripts/rust_build.sh b/ci/scripts/rust_build.sh index 3532ea3d5c642..2dfc0f1b1892d 100755 --- a/ci/scripts/rust_build.sh +++ b/ci/scripts/rust_build.sh @@ -56,4 +56,9 @@ pushd ${source_dir} # build only the integration testing binaries cargo build -p arrow-integration-testing +# Save disk space by removing large temporary build products +rm -rf target/debug/deps +rm -rf target/debug/build +rm -rf target/debug/incremental + popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27d7097a2901f..f2906b960eba6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -18,35 +18,59 @@ cmake_minimum_required(VERSION 3.16) message(STATUS "Building using CMake version: ${CMAKE_VERSION}") -# Compiler id for Apple Clang is now AppleClang. # https://www.cmake.org/cmake/help/latest/policy/CMP0025.html +# +# Compiler id for Apple Clang is now AppleClang. cmake_policy(SET CMP0025 NEW) -# Only interpret if() arguments as variables or keywords when unquoted. +# https://cmake.org/cmake/help/latest/policy/CMP0042.html +# +# Enable MACOSX_RPATH by default. @rpath in a target's install name is +# a more flexible and powerful mechanism than @executable_path or +# @loader_path for locating shared libraries. +cmake_policy(SET CMP0042 NEW) + # https://www.cmake.org/cmake/help/latest/policy/CMP0054.html +# +# Only interpret if() arguments as variables or keywords when unquoted. cmake_policy(SET CMP0054 NEW) -# Support new if() IN_LIST operator. # https://www.cmake.org/cmake/help/latest/policy/CMP0057.html +# +# Support new if() IN_LIST operator. cmake_policy(SET CMP0057 NEW) +# https://www.cmake.org/cmake/help/latest/policy/CMP0063.html +# # Adapted from Apache Kudu: https://github.com/apache/kudu/commit/bd549e13743a51013585 # Honor visibility properties for all target types. -# https://www.cmake.org/cmake/help/latest/policy/CMP0063.html cmake_policy(SET CMP0063 NEW) -# RPATH settings on macOS do not affect install_name. # https://cmake.org/cmake/help/latest/policy/CMP0068.html +# +# RPATH settings on macOS do not affect install_name. cmake_policy(SET CMP0068 NEW) -# find_package() uses _ROOT variables. # https://cmake.org/cmake/help/latest/policy/CMP0074.html +# +# find_package() uses _ROOT variables. cmake_policy(SET CMP0074 NEW) -# MSVC runtime library flags are selected by an abstraction. # https://cmake.org/cmake/help/latest/policy/CMP0091.html +# +# MSVC runtime library flags are selected by an abstraction. cmake_policy(SET CMP0091 NEW) +# https://cmake.org/cmake/help/latest/policy/CMP0135.html +# +# CMP0135 is for solving re-building and re-downloading. +# We don't have a real problem with the OLD behavior for now +# but we use the NEW behavior explicitly to suppress CMP0135 +# warnings. +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + set(ARROW_VERSION "14.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -499,12 +523,21 @@ set(ARROW_PC_CFLAGS_PRIVATE " -DARROW_STATIC") set(ARROW_PC_LIBS_PRIVATE "") set(ARROW_PC_REQUIRES_PRIVATE "") +# For arrow-flight.pc. +set(ARROW_FLIGHT_PC_REQUIRES_PRIVATE "") + # For arrow-testing.pc. set(ARROW_TESTING_PC_CFLAGS "") set(ARROW_TESTING_PC_CFLAGS_PRIVATE " -DARROW_TESTING_STATIC") set(ARROW_TESTING_PC_LIBS "") set(ARROW_TESTING_PC_REQUIRES "") +# For parquet.pc. +set(PARQUET_PC_CFLAGS "") +set(PARQUET_PC_CFLAGS_PRIVATE " -DPARQUET_STATIC") +set(PARQUET_PC_REQUIRES "") +set(PARQUET_PC_REQUIRES_PRIVATE "") + include(ThirdpartyToolchain) # Add common flags @@ -795,6 +828,11 @@ if(ARROW_WITH_OPENTELEMETRY) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) endif() +if(ARROW_WITH_AZURE_SDK) + list(APPEND ARROW_SHARED_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) + list(APPEND ARROW_STATIC_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) +endif() + if(ARROW_WITH_UTF8PROC) list(APPEND ARROW_SHARED_LINK_LIBS utf8proc::utf8proc) list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 94141d693be8f..f6324c1c0a96d 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -11,7 +11,8 @@ "hidden": true, "generator": "Ninja", "cacheVariables": { - "ARROW_BUILD_STATIC": "OFF" + "ARROW_BUILD_STATIC": "OFF", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" } }, { diff --git a/cpp/build-support/update-thrift.sh b/cpp/build-support/update-thrift.sh index 1213a628e217a..9b8f2539cffe3 100755 --- a/cpp/build-support/update-thrift.sh +++ b/cpp/build-support/update-thrift.sh @@ -20,4 +20,4 @@ # Run this from cpp/ directory. thrift is expected to be in your path -thrift --gen cpp -out src/generated src/parquet/parquet.thrift +thrift --gen cpp:moveable_types -out src/generated src/parquet/parquet.thrift diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 9112b836c9ef4..083ac2fe9a862 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -99,7 +99,7 @@ function(arrow_create_merged_static_lib output_target) if(APPLE) set(BUNDLE_COMMAND "libtool" "-no_warning_for_no_symbols" "-static" "-o" ${output_lib_path} ${all_library_paths}) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU|Intel)$") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU|Intel|IntelLLVM)$") set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) file(WRITE ${ar_script_path}.in "CREATE ${output_lib_path}\n") diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 29517567ce6e5..6e6a74c9c78f8 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -470,6 +470,15 @@ takes precedence over ccache if a storage backend is configured" ON) "Rely on jemalloc shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) + if(MSVC) + # LLVM doesn't support shared library with MSVC. + set(ARROW_LLVM_USE_SHARED_DEFAULT OFF) + else() + set(ARROW_LLVM_USE_SHARED_DEFAULT ${ARROW_DEPENDENCY_USE_SHARED}) + endif() + define_option(ARROW_LLVM_USE_SHARED "Rely on LLVM shared libraries where relevant" + ${ARROW_LLVM_USE_SHARED_DEFAULT}) + define_option(ARROW_LZ4_USE_SHARED "Rely on lz4 shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) diff --git a/cpp/cmake_modules/FindAzure.cmake b/cpp/cmake_modules/FindAzure.cmake new file mode 100644 index 0000000000000..fdf354b724e77 --- /dev/null +++ b/cpp/cmake_modules/FindAzure.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(Azure_FOUND) + return() +endif() + +set(find_package_args) +list(APPEND find_package_args CONFIG) +if(Azure_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() + +if(Azure_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) +endif() + +find_package(azure-core-cpp ${find_package_args}) +find_package(azure-identity-cpp ${find_package_args}) +find_package(azure-storage-blobs-cpp ${find_package_args}) +find_package(azure-storage-common-cpp ${find_package_args}) +find_package(azure-storage-files-datalake-cpp ${find_package_args}) + +find_package_handle_standard_args( + Azure + REQUIRED_VARS azure-core-cpp_FOUND + azure-identity-cpp_FOUND + azure-storage-blobs-cpp_FOUND + azure-storage-common-cpp_FOUND + azure-storage-files-datalake-cpp_FOUND + VERSION_VAR azure-core-cpp_VERSION) diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index b3d77978f153c..e980f53fd3407 100644 --- a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -68,18 +68,6 @@ if(NOT LLVM_FOUND) endif() if(LLVM_FOUND) - # Find the libraries that correspond to the LLVM components - llvm_map_components_to_libnames(LLVM_LIBS - core - mcjit - native - ipo - bitreader - target - linker - analysis - debuginfodwarf) - find_program(LLVM_LINK_EXECUTABLE llvm-link HINTS ${LLVM_TOOLS_BINARY_DIR}) find_program(CLANG_EXECUTABLE @@ -94,22 +82,37 @@ if(LLVM_FOUND) INTERFACE_COMPILE_FLAGS "${LLVM_DEFINITIONS}") add_library(LLVM::LLVM_LIBS INTERFACE IMPORTED) - set_target_properties(LLVM::LLVM_LIBS PROPERTIES INTERFACE_LINK_LIBRARIES - "${LLVM_LIBS}") + if(ARROW_LLVM_USE_SHARED) + target_link_libraries(LLVM::LLVM_LIBS INTERFACE LLVM) + else() + # Find the libraries that correspond to the LLVM components + llvm_map_components_to_libnames(LLVM_LIBS + core + mcjit + native + ipo + bitreader + target + linker + analysis + debuginfodwarf) + target_link_libraries(LLVM::LLVM_LIBS INTERFACE ${LLVM_LIBS}) - if(TARGET LLVMSupport AND NOT ARROW_ZSTD_USE_SHARED) - get_target_property(LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES LLVMSupport - INTERFACE_LINK_LIBRARIES) - list(FIND LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES zstd::libzstd_shared - LLVM_SUPPORT_LIBZSTD_INDEX) - if(NOT LLVM_SUPPORT_LIBZSTD_INDEX EQUAL -1) - list(REMOVE_AT LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES ${LLVM_SUPPORT_LIBZSTD_INDEX}) - list(INSERT LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES ${LLVM_SUPPORT_LIBZSTD_INDEX} - zstd::libzstd_static) + if(TARGET LLVMSupport AND NOT ARROW_ZSTD_USE_SHARED) + get_target_property(LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES LLVMSupport + INTERFACE_LINK_LIBRARIES) + list(FIND LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES zstd::libzstd_shared + LLVM_SUPPORT_LIBZSTD_INDEX) + if(NOT LLVM_SUPPORT_LIBZSTD_INDEX EQUAL -1) + list(REMOVE_AT LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES + ${LLVM_SUPPORT_LIBZSTD_INDEX}) + list(INSERT LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES ${LLVM_SUPPORT_LIBZSTD_INDEX} + zstd::libzstd_static) + endif() + set_target_properties(LLVMSupport + PROPERTIES INTERFACE_LINK_LIBRARIES + "${LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES}") endif() - set_target_properties(LLVMSupport - PROPERTIES INTERFACE_LINK_LIBRARIES - "${LLVM_SUPPORT_INTERFACE_LINK_LIBRARIES}") endif() endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 6b47fcb717287..5531415ac2277 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -49,13 +49,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86") if(MSVC) set(ARROW_SSE4_2_FLAG "") set(ARROW_AVX2_FLAG "/arch:AVX2") + # MSVC has no specific flag for BMI2, it seems to be enabled with AVX2 + set(ARROW_BMI2_FLAG "/arch:AVX2") set(ARROW_AVX512_FLAG "/arch:AVX512") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") set(ARROW_AVX2_FLAG "-march=haswell") + set(ARROW_BMI2_FLAG "-mbmi2") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ - set(ARROW_AVX512_FLAG "-march=skylake-avx512 -mbmi2") + set(ARROW_AVX512_FLAG "-march=skylake-avx512") # Append the avx2/avx512 subset option also, fix issue ARROW-9877 for homebrew-cpp set(ARROW_AVX2_FLAG "${ARROW_AVX2_FLAG} -mavx2") set(ARROW_AVX512_FLAG @@ -95,13 +98,16 @@ if(ARROW_CPU_FLAG STREQUAL "x86") set(ARROW_HAVE_RUNTIME_SSE4_2 ON) add_definitions(-DARROW_HAVE_RUNTIME_SSE4_2) endif() + # Note: for now we assume that AVX2 support should also enable BMI2 support, + # at least at compile-time (more care may be required for runtime dispatch). if(CXX_SUPPORTS_AVX2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$") set(ARROW_HAVE_RUNTIME_AVX2 ON) + set(ARROW_HAVE_RUNTIME_BMI2 ON) add_definitions(-DARROW_HAVE_RUNTIME_AVX2 -DARROW_HAVE_RUNTIME_BMI2) endif() if(CXX_SUPPORTS_AVX512 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") set(ARROW_HAVE_RUNTIME_AVX512 ON) - add_definitions(-DARROW_HAVE_RUNTIME_AVX512 -DARROW_HAVE_RUNTIME_BMI2) + add_definitions(-DARROW_HAVE_RUNTIME_AVX512) endif() if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT") set(ARROW_SIMD_LEVEL "SSE4_2") @@ -323,7 +329,8 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdate-time") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL + "IntelLLVM") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wno-deprecated") @@ -354,7 +361,8 @@ elseif("${BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL + "IntelLLVM") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") else() @@ -377,7 +385,8 @@ else() OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL + "IntelLLVM") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") else() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1767c05b5ee3a..85c0337d108be 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -32,6 +32,9 @@ set(ARROW_BUNDLED_STATIC_LIBS) # Accumulate all system dependencies to provide suitable static link # parameters to the third party libraries. set(ARROW_SYSTEM_DEPENDENCIES) +set(ARROW_FLIGHT_SYSTEM_DEPENDENCIES) +set(ARROW_TESTING_SYSTEM_DEPENDENCIES) +set(PARQUET_SYSTEM_DEPENDENCIES) # ---------------------------------------------------------------------- # Toolchain linkage options @@ -46,6 +49,7 @@ set(ARROW_RE2_LINKAGE set(ARROW_THIRDPARTY_DEPENDENCIES absl AWSSDK + Azure benchmark Boost Brotli @@ -159,6 +163,8 @@ macro(build_dependency DEPENDENCY_NAME) build_absl() elseif("${DEPENDENCY_NAME}" STREQUAL "AWSSDK") build_awssdk() + elseif("${DEPENDENCY_NAME}" STREQUAL "Azure") + build_azure_sdk() elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark") build_benchmark() elseif("${DEPENDENCY_NAME}" STREQUAL "Boost") @@ -233,6 +239,7 @@ macro(resolve_dependency DEPENDENCY_NAME) set(options) set(one_value_args ARROW_CMAKE_PACKAGE_NAME + ARROW_PC_PACKAGE_NAME FORCE_ANY_NEWER_VERSION HAVE_ALT IS_RUNTIME_DEPENDENCY @@ -297,12 +304,26 @@ macro(resolve_dependency DEPENDENCY_NAME) if(NOT ARG_ARROW_CMAKE_PACKAGE_NAME) set(ARG_ARROW_CMAKE_PACKAGE_NAME "Arrow") endif() - if(ARG_ARROW_CMAKE_PACKAGE_NAME STREQUAL "Arrow") - provide_find_module(${PACKAGE_NAME} "Arrow") - list(APPEND ARROW_SYSTEM_DEPENDENCIES ${PACKAGE_NAME}) - else() - provide_find_module(${PACKAGE_NAME} ${ARG_ARROW_CMAKE_PACKAGE_NAME}) + # ArrowFlight -> _Arrow_Flight + string(REGEX REPLACE "([A-Z])" "_\\1" ARG_ARROW_CMAKE_PACKAGE_NAME_SNAKE + ${ARG_ARROW_CMAKE_PACKAGE_NAME}) + # _Arrow_Flight -> Arrow_Flight + string(SUBSTRING ${ARG_ARROW_CMAKE_PACKAGE_NAME_SNAKE} 1 -1 + ARG_ARROW_CMAKE_PACKAGE_NAME_SNAKE) + # Arrow_Flight -> ARROW_FLIGHT + string(TOUPPER ${ARG_ARROW_CMAKE_PACKAGE_NAME_SNAKE} + ARG_ARROW_CMAKE_PACKAGE_NAME_UPPER_SNAKE) + provide_find_module(${PACKAGE_NAME} ${ARG_ARROW_CMAKE_PACKAGE_NAME}) + list(APPEND ${ARG_ARROW_CMAKE_PACKAGE_NAME_UPPER_SNAKE}_SYSTEM_DEPENDENCIES + ${PACKAGE_NAME}) + if(NOT ARG_ARROW_PC_PACKAGE_NAME) + set(ARG_ARROW_PC_PACKAGE_NAME "arrow") endif() + # arrow-flight -> arrow_flight + string(REPLACE "-" "_" ARG_ARROW_PC_PACKAGE_NAME_SNAKE ${ARG_ARROW_PC_PACKAGE_NAME}) + # arrow_flight -> ARROW_FLIGHT + string(TOUPPER ${ARG_ARROW_PC_PACKAGE_NAME_SNAKE} + ARG_ARROW_PC_PACKAGE_NAME_UPPER_SNAKE) if(ARROW_BUILD_STATIC) find_package(PkgConfig QUIET) foreach(ARG_PC_PACKAGE_NAME ${ARG_PC_PACKAGE_NAMES}) @@ -311,13 +332,16 @@ macro(resolve_dependency DEPENDENCY_NAME) NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH QUIET) + set(RESOLVE_DEPENDENCY_PC_PACKAGE + "pkg-config package for ${ARG_PC_PACKAGE_NAME} ") + string(APPEND RESOLVE_DEPENDENCY_PC_PACKAGE + "that is used by ${ARG_ARROW_PC_PACKAGE_NAME} for static link") if(${${ARG_PC_PACKAGE_NAME}_PC_FOUND}) - message(STATUS "Using pkg-config package for ${ARG_PC_PACKAGE_NAME} for static link" - ) - string(APPEND ARROW_PC_REQUIRES_PRIVATE " ${ARG_PC_PACKAGE_NAME}") + message(STATUS "Using ${RESOLVE_DEPENDENCY_PC_PACKAGE}") + string(APPEND ${ARG_ARROW_PC_PACKAGE_NAME_UPPER_SNAKE}_PC_REQUIRES_PRIVATE + " ${ARG_PC_PACKAGE_NAME}") else() - message(STATUS "pkg-config package for ${ARG_PC_PACKAGE_NAME} for static link isn't found" - ) + message(STATUS "${RESOLVE_DEPENDENCY_PC_PACKAGE} isn't found") endif() endforeach() endif() @@ -368,6 +392,10 @@ if(ARROW_GCS) set(ARROW_WITH_ZLIB ON) endif() +if(ARROW_AZURE) + set(ARROW_WITH_AZURE_SDK ON) +endif() + if(ARROW_JSON) set(ARROW_WITH_RAPIDJSON ON) endif() @@ -548,6 +576,14 @@ else() "${THIRDPARTY_MIRROR_URL}/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz") endif() +if(DEFINED ENV{ARROW_AZURE_SDK_URL}) + set(ARROW_AZURE_SDK_URL "$ENV{ARROW_AZURE_SDK_URL}") +else() + set_urls(ARROW_AZURE_SDK_URL + "https://github.com/Azure/azure-sdk-for-cpp/archive/${ARROW_AZURE_SDK_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_BOOST_URL}) set(BOOST_SOURCE_URL "$ENV{ARROW_BOOST_URL}") else() @@ -960,6 +996,23 @@ else() set(MAKE_BUILD_ARGS "-j${NPROC}") endif() +include(FetchContent) + +macro(prepare_fetchcontent) + set(BUILD_SHARED_LIBS OFF) + set(BUILD_STATIC_LIBS ON) + set(CMAKE_COMPILE_WARNING_AS_ERROR FALSE) + set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY TRUE) + set(CMAKE_MACOSX_RPATH ${ARROW_INSTALL_NAME_RPATH}) + if(MSVC) + string(REPLACE "/WX" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REPLACE "/WX" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + else() + string(APPEND CMAKE_C_FLAGS_DEBUG " -Wno-error") + string(APPEND CMAKE_CXX_FLAGS_DEBUG " -Wno-error") + endif() +endmacro() + # ---------------------------------------------------------------------- # Find pthreads @@ -1269,10 +1322,9 @@ macro(build_snappy) set(Snappy_TARGET Snappy::snappy-static) add_library(${Snappy_TARGET} STATIC IMPORTED) - set_target_properties(${Snappy_TARGET} - PROPERTIES IMPORTED_LOCATION "${SNAPPY_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${SNAPPY_PREFIX}/include") + set_target_properties(${Snappy_TARGET} PROPERTIES IMPORTED_LOCATION + "${SNAPPY_STATIC_LIB}") + target_include_directories(${Snappy_TARGET} BEFORE INTERFACE "${SNAPPY_PREFIX}/include") add_dependencies(toolchain snappy_ep) add_dependencies(${Snappy_TARGET} snappy_ep) @@ -1328,21 +1380,22 @@ macro(build_brotli) file(MAKE_DIRECTORY "${BROTLI_INCLUDE_DIR}") add_library(Brotli::brotlicommon STATIC IMPORTED) - set_target_properties(Brotli::brotlicommon - PROPERTIES IMPORTED_LOCATION "${BROTLI_STATIC_LIBRARY_COMMON}" - INTERFACE_INCLUDE_DIRECTORIES "${BROTLI_INCLUDE_DIR}") + set_target_properties(Brotli::brotlicommon PROPERTIES IMPORTED_LOCATION + "${BROTLI_STATIC_LIBRARY_COMMON}") + target_include_directories(Brotli::brotlicommon BEFORE + INTERFACE "${BROTLI_INCLUDE_DIR}") add_dependencies(Brotli::brotlicommon brotli_ep) add_library(Brotli::brotlienc STATIC IMPORTED) - set_target_properties(Brotli::brotlienc - PROPERTIES IMPORTED_LOCATION "${BROTLI_STATIC_LIBRARY_ENC}" - INTERFACE_INCLUDE_DIRECTORIES "${BROTLI_INCLUDE_DIR}") + set_target_properties(Brotli::brotlienc PROPERTIES IMPORTED_LOCATION + "${BROTLI_STATIC_LIBRARY_ENC}") + target_include_directories(Brotli::brotlienc BEFORE INTERFACE "${BROTLI_INCLUDE_DIR}") add_dependencies(Brotli::brotlienc brotli_ep) add_library(Brotli::brotlidec STATIC IMPORTED) - set_target_properties(Brotli::brotlidec - PROPERTIES IMPORTED_LOCATION "${BROTLI_STATIC_LIBRARY_DEC}" - INTERFACE_INCLUDE_DIRECTORIES "${BROTLI_INCLUDE_DIR}") + set_target_properties(Brotli::brotlidec PROPERTIES IMPORTED_LOCATION + "${BROTLI_STATIC_LIBRARY_DEC}") + target_include_directories(Brotli::brotlidec BEFORE INTERFACE "${BROTLI_INCLUDE_DIR}") add_dependencies(Brotli::brotlidec brotli_ep) list(APPEND @@ -1367,6 +1420,7 @@ endif() set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION + OR ARROW_AZURE OR ARROW_FLIGHT OR ARROW_GANDIVA OR ARROW_GCS @@ -1433,9 +1487,8 @@ macro(build_glog) file(MAKE_DIRECTORY "${GLOG_INCLUDE_DIR}") add_library(glog::glog STATIC IMPORTED) - set_target_properties(glog::glog - PROPERTIES IMPORTED_LOCATION "${GLOG_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GLOG_INCLUDE_DIR}") + set_target_properties(glog::glog PROPERTIES IMPORTED_LOCATION "${GLOG_STATIC_LIB}") + target_include_directories(glog::glog BEFORE INTERFACE "${GLOG_INCLUDE_DIR}") add_dependencies(glog::glog glog_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS glog::glog) @@ -1494,9 +1547,9 @@ macro(build_gflags) add_thirdparty_lib(gflags::gflags_static STATIC ${GFLAGS_STATIC_LIB}) add_dependencies(gflags::gflags_static gflags_ep) set(GFLAGS_LIBRARY gflags::gflags_static) - set_target_properties(${GFLAGS_LIBRARY} - PROPERTIES INTERFACE_COMPILE_DEFINITIONS "GFLAGS_IS_A_DLL=0" - INTERFACE_INCLUDE_DIRECTORIES "${GFLAGS_INCLUDE_DIR}") + set_target_properties(${GFLAGS_LIBRARY} PROPERTIES INTERFACE_COMPILE_DEFINITIONS + "GFLAGS_IS_A_DLL=0") + target_include_directories(${GFLAGS_LIBRARY} BEFORE INTERFACE "${GFLAGS_INCLUDE_DIR}") if(MSVC) set_target_properties(${GFLAGS_LIBRARY} PROPERTIES INTERFACE_LINK_LIBRARIES "shlwapi.lib") @@ -1597,9 +1650,8 @@ macro(build_thrift) add_library(thrift::thrift STATIC IMPORTED) # The include directory must exist before it is referenced by a target. file(MAKE_DIRECTORY "${THRIFT_INCLUDE_DIR}") - set_target_properties(thrift::thrift - PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${THRIFT_INCLUDE_DIR}") + set_target_properties(thrift::thrift PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}") + target_include_directories(thrift::thrift BEFORE INTERFACE "${THRIFT_INCLUDE_DIR}") if(ARROW_USE_BOOST) target_link_libraries(thrift::thrift INTERFACE Boost::headers) endif() @@ -1614,12 +1666,16 @@ endmacro() if(ARROW_WITH_THRIFT) # Thrift C++ code generated by 0.13 requires 0.11 or greater resolve_dependency(Thrift + ARROW_CMAKE_PACKAGE_NAME + Parquet + ARROW_PC_PACKAGE_NAME + parquet HAVE_ALT TRUE - REQUIRED_VERSION - 0.11.0 PC_PACKAGE_NAMES - thrift) + thrift + REQUIRED_VERSION + 0.11.0) string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION}) list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR) @@ -1683,15 +1739,15 @@ macro(build_protobuf) set(Protobuf_INCLUDE_DIRS "${PROTOBUF_INCLUDE_DIR}") add_library(arrow::protobuf::libprotobuf STATIC IMPORTED) - set_target_properties(arrow::protobuf::libprotobuf - PROPERTIES IMPORTED_LOCATION "${PROTOBUF_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${PROTOBUF_INCLUDE_DIR}") + set_target_properties(arrow::protobuf::libprotobuf PROPERTIES IMPORTED_LOCATION + "${PROTOBUF_STATIC_LIB}") + target_include_directories(arrow::protobuf::libprotobuf BEFORE + INTERFACE "${PROTOBUF_INCLUDE_DIR}") add_library(arrow::protobuf::libprotoc STATIC IMPORTED) - set_target_properties(arrow::protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${PROTOC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${PROTOBUF_INCLUDE_DIR}") + set_target_properties(arrow::protobuf::libprotoc PROPERTIES IMPORTED_LOCATION + "${PROTOC_STATIC_LIB}") + target_include_directories(arrow::protobuf::libprotoc BEFORE + INTERFACE "${PROTOBUF_INCLUDE_DIR}") add_executable(arrow::protobuf::protoc IMPORTED) set_target_properties(arrow::protobuf::protoc PROPERTIES IMPORTED_LOCATION "${PROTOBUF_COMPILER}") @@ -1715,6 +1771,13 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") endif() + if(ARROW_FLIGHT) + set(ARROW_PROTOBUF_ARROW_CMAKE_PACKAGE_NAME "ArrowFlight") + set(ARROW_PROTOBUF_ARROW_PC_PACKAGE_NAME "arrow-flight") + else() + set(ARROW_PROTOBUF_ARROW_CMAKE_PACKAGE_NAME "Arrow") + set(ARROW_PROTOBUF_ARROW_PC_PACKAGE_NAME "arrow") + endif() # We need to use FORCE_ANY_NEWER_VERSION here to accept Protobuf # newer version such as 23.4. If we don't use it, 23.4 is processed # as an incompatible version with 3.12.0 with protobuf-config.cmake @@ -1724,14 +1787,18 @@ if(ARROW_WITH_PROTOBUF) # we use FORCE_ANY_NEWER_VERSION here, we can bypass the check and # use 23.4. resolve_dependency(Protobuf + ARROW_CMAKE_PACKAGE_NAME + ${ARROW_PROTOBUF_ARROW_CMAKE_PACKAGE_NAME} + ARROW_PC_PACKAGE_NAME + ${ARROW_PROTOBUF_ARROW_PC_PACKAGE_NAME} FORCE_ANY_NEWER_VERSION TRUE HAVE_ALT TRUE - REQUIRED_VERSION - ${ARROW_PROTOBUF_REQUIRED_VERSION} PC_PACKAGE_NAMES - protobuf) + protobuf + REQUIRED_VERSION + ${ARROW_PROTOBUF_REQUIRED_VERSION}) if(NOT Protobuf_USE_STATIC_LIBS AND MSVC_TOOLCHAIN) add_definitions(-DPROTOBUF_USE_DLLS) @@ -1952,9 +2019,9 @@ macro(build_jemalloc) add_library(jemalloc::jemalloc STATIC IMPORTED) set_target_properties(jemalloc::jemalloc PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads - IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${JEMALLOC_INCLUDE_DIR}") + IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}") + target_include_directories(jemalloc::jemalloc BEFORE + INTERFACE "${JEMALLOC_INCLUDE_DIR}") add_dependencies(jemalloc::jemalloc jemalloc_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS jemalloc::jemalloc) @@ -2010,9 +2077,9 @@ if(ARROW_MIMALLOC) add_library(mimalloc::mimalloc STATIC IMPORTED) set_target_properties(mimalloc::mimalloc PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads - IMPORTED_LOCATION "${MIMALLOC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${MIMALLOC_INCLUDE_DIR}") + IMPORTED_LOCATION "${MIMALLOC_STATIC_LIB}") + target_include_directories(mimalloc::mimalloc BEFORE + INTERFACE "${MIMALLOC_INCLUDE_DIR}") if(WIN32) set_property(TARGET mimalloc::mimalloc APPEND @@ -2029,150 +2096,61 @@ endif() # ---------------------------------------------------------------------- # Google gtest -macro(build_gtest) +function(build_gtest) message(STATUS "Building gtest from source") set(GTEST_VENDORED TRUE) - set(GTEST_CMAKE_CXX_FLAGS ${EP_CXX_FLAGS}) - - if(CMAKE_BUILD_TYPE MATCHES DEBUG) - set(CMAKE_GTEST_DEBUG_EXTENSION "d") - else() - set(CMAKE_GTEST_DEBUG_EXTENSION "") - endif() - + fetchcontent_declare(googletest + URL ${GTEST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}") + prepare_fetchcontent() if(APPLE) - string(APPEND - GTEST_CMAKE_CXX_FLAGS - " -DGTEST_USE_OWN_TR1_TUPLE=1" - " -Wno-unused-value" - " -Wno-ignored-attributes") - endif() - - if(WIN32) - string(APPEND GTEST_CMAKE_CXX_FLAGS " -DGTEST_CREATE_SHARED_LIBRARY=1") - endif() - - set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix") - set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") - - set(_GTEST_LIBRARY_DIR "${GTEST_PREFIX}/lib") - - if(WIN32) - set(_GTEST_IMPORTED_TYPE IMPORTED_IMPLIB) - set(_GTEST_LIBRARY_SUFFIX - "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_IMPORT_LIBRARY_SUFFIX}") - else() - set(_GTEST_IMPORTED_TYPE IMPORTED_LOCATION) - set(_GTEST_LIBRARY_SUFFIX - "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_SHARED_LIBRARY_SUFFIX}") - - endif() - - set(GTEST_SHARED_LIB - "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest${_GTEST_LIBRARY_SUFFIX}") - set(GMOCK_SHARED_LIB - "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gmock${_GTEST_LIBRARY_SUFFIX}") - set(GTEST_MAIN_SHARED_LIB - "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_LIBRARY_SUFFIX}" - ) - set(GTEST_INSTALL_NAME_DIR "$/lib") - # Fix syntax highlighting mess introduced by unclosed bracket above - set(dummy ">") - - set(GTEST_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} - -DBUILD_SHARED_LIBS=ON - -DBUILD_STATIC_LIBS=OFF - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_NAME_DIR=${GTEST_INSTALL_NAME_DIR} - -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} - -DCMAKE_MACOSX_RPATH=OFF) - set(GMOCK_INCLUDE_DIR "${GTEST_PREFIX}/include") - - if(WIN32 AND NOT ARROW_USE_STATIC_CRT) - list(APPEND GTEST_CMAKE_ARGS -Dgtest_force_shared_crt=ON) + string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-value" " -Wno-ignored-attributes") endif() - - externalproject_add(googletest_ep - ${EP_COMMON_OPTIONS} - URL ${GTEST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS ${GTEST_SHARED_LIB} ${GTEST_MAIN_SHARED_LIB} - ${GMOCK_SHARED_LIB} - CMAKE_ARGS ${GTEST_CMAKE_ARGS}) - if(WIN32) - # Copy the built shared libraries to the same directory as our - # test programs because Windows doesn't provided rpath (run-time - # search path) feature. We need to put these shared libraries to - # the same directory as our test programs or add - # _GTEST_LIBRARY_DIR to PATH when we run our test programs. We - # choose the former because the latter may be forgotten. - set(_GTEST_RUNTIME_DIR "${GTEST_PREFIX}/bin") - set(_GTEST_RUNTIME_SUFFIX - "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_SHARED_LIBRARY_SUFFIX}") - set(_GTEST_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest${_GTEST_RUNTIME_SUFFIX}" - ) - set(_GMOCK_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gmock${_GTEST_RUNTIME_SUFFIX}" - ) - set(_GTEST_MAIN_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_RUNTIME_SUFFIX}" - ) - get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) - if(_GENERATOR_IS_MULTI_CONFIG) - set(_GTEST_RUNTIME_OUTPUT_DIR "${BUILD_OUTPUT_ROOT_DIRECTORY}/${CMAKE_BUILD_TYPE}") - else() - set(_GTEST_RUNTIME_OUTPUT_DIR ${BUILD_OUTPUT_ROOT_DIRECTORY}) - endif() - externalproject_add_step(googletest_ep copy - COMMAND ${CMAKE_COMMAND} -E make_directory - ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${_GTEST_RUNTIME_LIB} - ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${_GMOCK_RUNTIME_LIB} - ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${_GTEST_MAIN_RUNTIME_LIB} - ${_GTEST_RUNTIME_OUTPUT_DIR} - DEPENDEES install) + set(BUILD_SHARED_LIBS ON) + set(BUILD_STATIC_LIBS OFF) + # We need to use "cache" variable to override the default + # INSTALL_GTEST option by this value. See also: + # https://cmake.org/cmake/help/latest/policy/CMP0077.html + set(INSTALL_GTEST + OFF + CACHE "BOOL" + "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" + FORCE) + string(APPEND CMAKE_INSTALL_INCLUDEDIR "/arrow-gtest") + fetchcontent_makeavailable(googletest) + set_target_properties(gmock PROPERTIES OUTPUT_NAME "arrow_gmock") + set_target_properties(gmock_main PROPERTIES OUTPUT_NAME "arrow_gmock_main") + set_target_properties(gtest PROPERTIES OUTPUT_NAME "arrow_gtest") + set_target_properties(gtest_main PROPERTIES OUTPUT_NAME "arrow_gtest_main") + install(DIRECTORY "${googletest_SOURCE_DIR}/googlemock/include/" + "${googletest_SOURCE_DIR}/googletest/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + install(TARGETS gmock gmock_main gtest gtest_main + EXPORT arrow_testing_targets + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}") + if(MSVC) + install(FILES $ $ + $ $ + DESTINATION "${CMAKE_INSTALL_BINDIR}" + OPTIONAL) endif() - - # The include directory must exist before it is referenced by a target. - file(MAKE_DIRECTORY "${GTEST_INCLUDE_DIR}") - - add_library(arrow::GTest::gtest SHARED IMPORTED) - set_target_properties(arrow::GTest::gtest - PROPERTIES ${_GTEST_IMPORTED_TYPE} "${GTEST_SHARED_LIB}" - INTERFACE_COMPILE_DEFINITIONS - "GTEST_LINKED_AS_SHARED_LIBRARY=1" - INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}") - - add_library(arrow::GTest::gtest_main SHARED IMPORTED) - set_target_properties(arrow::GTest::gtest_main - PROPERTIES ${_GTEST_IMPORTED_TYPE} "${GTEST_MAIN_SHARED_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}") - - add_library(arrow::GTest::gmock SHARED IMPORTED) - set_target_properties(arrow::GTest::gmock - PROPERTIES ${_GTEST_IMPORTED_TYPE} "${GMOCK_SHARED_LIB}" - INTERFACE_COMPILE_DEFINITIONS - "GMOCK_LINKED_AS_SHARED_LIBRARY=1" - INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIR}") - add_dependencies(toolchain-tests googletest_ep) - add_dependencies(arrow::GTest::gtest googletest_ep) - add_dependencies(arrow::GTest::gtest_main googletest_ep) - add_dependencies(arrow::GTest::gmock googletest_ep) -endmacro() + add_library(arrow::GTest::gmock ALIAS gmock) + add_library(arrow::GTest::gmock_main ALIAS gmock_main) + add_library(arrow::GTest::gtest ALIAS gtest) + add_library(arrow::GTest::gtest_main ALIAS gtest_main) +endfunction() if(ARROW_TESTING) set(GTestAlt_NEED_CXX_STANDARD_CHECK TRUE) resolve_dependency(GTest + ARROW_CMAKE_PACKAGE_NAME + ArrowTesting HAVE_ALT TRUE REQUIRED_VERSION - 1.10.0 - ARROW_CMAKE_PACKAGE_NAME - "ArrowTesting") + 1.10.0) if(GTest_SOURCE STREQUAL "SYSTEM") find_package(PkgConfig QUIET) @@ -2195,9 +2173,8 @@ if(ARROW_TESTING) set(ARROW_GTEST_GTEST GTest::gtest) set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main) else() - # TODO: How to solve BUNDLED case? Do we install bundled GoogleTest? - # string(APPEND ARROW_TESTING_PC_CFLAGS " -I${GTEST_INCLUDE_DIR}") - # string(APPEND ARROW_TESTING_PC_LIBS " -lgtest") + string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest") + string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest") set(ARROW_GTEST_GMOCK arrow::GTest::gmock) set(ARROW_GTEST_GTEST arrow::GTest::gtest) set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main) @@ -2241,17 +2218,17 @@ macro(build_benchmark) file(MAKE_DIRECTORY "${GBENCHMARK_INCLUDE_DIR}") add_library(benchmark::benchmark STATIC IMPORTED) - set_target_properties(benchmark::benchmark - PROPERTIES IMPORTED_LOCATION "${GBENCHMARK_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${GBENCHMARK_INCLUDE_DIR}") + set_target_properties(benchmark::benchmark PROPERTIES IMPORTED_LOCATION + "${GBENCHMARK_STATIC_LIB}") + target_include_directories(benchmark::benchmark BEFORE + INTERFACE "${GBENCHMARK_INCLUDE_DIR}") target_compile_definitions(benchmark::benchmark INTERFACE "BENCHMARK_STATIC_DEFINE") add_library(benchmark::benchmark_main STATIC IMPORTED) set_target_properties(benchmark::benchmark_main - PROPERTIES IMPORTED_LOCATION "${GBENCHMARK_MAIN_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${GBENCHMARK_INCLUDE_DIR}") + PROPERTIES IMPORTED_LOCATION "${GBENCHMARK_MAIN_STATIC_LIB}") + target_include_directories(benchmark::benchmark_main BEFORE + INTERFACE "${GBENCHMARK_INCLUDE_DIR}") add_dependencies(toolchain-benchmarks gbenchmark_ep) add_dependencies(benchmark::benchmark gbenchmark_ep) @@ -2351,10 +2328,12 @@ endif() if(ARROW_USE_XSIMD) resolve_dependency(xsimd + FORCE_ANY_NEWER_VERSION + TRUE REQUIRED_VERSION "8.1.0" - FORCE_ANY_NEWER_VERSION - TRUE) + PC_PACKAGE_NAMES + xsimd) if(xsimd_SOURCE STREQUAL "BUNDLED") add_library(arrow::xsimd INTERFACE IMPORTED) @@ -2393,9 +2372,8 @@ macro(build_zlib) add_library(ZLIB::ZLIB STATIC IMPORTED) set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") - set_target_properties(ZLIB::ZLIB - PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}) + set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") add_dependencies(toolchain zlib_ep) add_dependencies(ZLIB::ZLIB zlib_ep) @@ -2431,9 +2409,8 @@ macro(build_lz4) file(MAKE_DIRECTORY "${LZ4_PREFIX}/include") add_library(LZ4::lz4 STATIC IMPORTED) - set_target_properties(LZ4::lz4 - PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${LZ4_PREFIX}/include") + set_target_properties(LZ4::lz4 PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}") + target_include_directories(LZ4::lz4 BEFORE INTERFACE "${LZ4_PREFIX}/include") add_dependencies(toolchain lz4_ep) add_dependencies(LZ4::lz4 lz4_ep) @@ -2482,9 +2459,10 @@ macro(build_zstd) file(MAKE_DIRECTORY "${ZSTD_PREFIX}/include") add_library(zstd::libzstd_static STATIC IMPORTED) - set_target_properties(zstd::libzstd_static - PROPERTIES IMPORTED_LOCATION "${ZSTD_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ZSTD_PREFIX}/include") + set_target_properties(zstd::libzstd_static PROPERTIES IMPORTED_LOCATION + "${ZSTD_STATIC_LIB}") + target_include_directories(zstd::libzstd_static BEFORE + INTERFACE "${ZSTD_PREFIX}/include") add_dependencies(toolchain zstd_ep) add_dependencies(zstd::libzstd_static zstd_ep) @@ -2540,9 +2518,8 @@ macro(build_re2) file(MAKE_DIRECTORY "${RE2_PREFIX}/include") add_library(re2::re2 STATIC IMPORTED) - set_target_properties(re2::re2 - PROPERTIES IMPORTED_LOCATION "${RE2_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${RE2_PREFIX}/include") + set_target_properties(re2::re2 PROPERTIES IMPORTED_LOCATION "${RE2_STATIC_LIB}") + target_include_directories(re2::re2 BEFORE INTERFACE "${RE2_PREFIX}/include") add_dependencies(toolchain re2_ep) add_dependencies(re2::re2 re2_ep) @@ -2604,10 +2581,8 @@ macro(build_bzip2) file(MAKE_DIRECTORY "${BZIP2_PREFIX}/include") add_library(BZip2::BZip2 STATIC IMPORTED) - set_target_properties(BZip2::BZip2 - PROPERTIES IMPORTED_LOCATION "${BZIP2_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${BZIP2_PREFIX}/include") + set_target_properties(BZip2::BZip2 PROPERTIES IMPORTED_LOCATION "${BZIP2_STATIC_LIB}") + target_include_directories(BZip2::BZip2 BEFORE INTERFACE "${BZIP2_PREFIX}/include") set(BZIP2_INCLUDE_DIR "${BZIP2_PREFIX}/include") add_dependencies(toolchain bzip2_ep) @@ -2619,13 +2594,6 @@ endmacro() if(ARROW_WITH_BZ2) resolve_dependency(BZip2 PC_PACKAGE_NAMES bzip2) - if(NOT TARGET BZip2::BZip2) - add_library(BZip2::BZip2 UNKNOWN IMPORTED) - set_target_properties(BZip2::BZip2 - PROPERTIES IMPORTED_LOCATION "${BZIP2_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIR}") - endif() - if(${BZip2_SOURCE} STREQUAL "SYSTEM" AND NOT bzip2_PC_FOUND AND ARROW_BUILD_STATIC) @@ -2666,9 +2634,9 @@ macro(build_utf8proc) add_library(utf8proc::utf8proc STATIC IMPORTED) set_target_properties(utf8proc::utf8proc PROPERTIES IMPORTED_LOCATION "${UTF8PROC_STATIC_LIB}" - INTERFACE_COMPILE_DEFINITIONS "UTF8PROC_STATIC" - INTERFACE_INCLUDE_DIRECTORIES - "${UTF8PROC_PREFIX}/include") + INTERFACE_COMPILE_DEFINITIONS "UTF8PROC_STATIC") + target_include_directories(utf8proc::utf8proc BEFORE + INTERFACE "${UTF8PROC_PREFIX}/include") add_dependencies(toolchain utf8proc_ep) add_dependencies(utf8proc::utf8proc utf8proc_ep) @@ -2678,10 +2646,10 @@ endmacro() if(ARROW_WITH_UTF8PROC) resolve_dependency(utf8proc - REQUIRED_VERSION - "2.2.0" PC_PACKAGE_NAMES - libutf8proc) + libutf8proc + REQUIRED_VERSION + "2.2.0") add_definitions(-DARROW_WITH_UTF8PROC) endif() @@ -2710,9 +2678,8 @@ macro(build_cares) add_dependencies(toolchain cares_ep) add_library(c-ares::cares STATIC IMPORTED) - set_target_properties(c-ares::cares - PROPERTIES IMPORTED_LOCATION "${CARES_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${CARES_INCLUDE_DIR}") + set_target_properties(c-ares::cares PROPERTIES IMPORTED_LOCATION "${CARES_STATIC_LIB}") + target_include_directories(c-ares::cares BEFORE INTERFACE "${CARES_INCLUDE_DIR}") add_dependencies(c-ares::cares cares_ep) if(APPLE) @@ -2923,15 +2890,14 @@ macro(build_absl) "${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) add_library(absl::${_ABSL_LIB} STATIC IMPORTED) - set_target_properties(absl::${_ABSL_LIB} - PROPERTIES IMPORTED_LOCATION ${_ABSL_STATIC_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES "${ABSL_INCLUDE_DIR}") + set_target_properties(absl::${_ABSL_LIB} PROPERTIES IMPORTED_LOCATION + ${_ABSL_STATIC_LIBRARY}) + target_include_directories(absl::${_ABSL_LIB} BEFORE INTERFACE "${ABSL_INCLUDE_DIR}") list(APPEND ABSL_BUILD_BYPRODUCTS ${_ABSL_STATIC_LIBRARY}) endforeach() foreach(_ABSL_LIB ${_ABSL_INTERFACE_LIBS}) add_library(absl::${_ABSL_LIB} INTERFACE IMPORTED) - set_target_properties(absl::${_ABSL_LIB} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${ABSL_INCLUDE_DIR}") + target_include_directories(absl::${_ABSL_LIB} BEFORE INTERFACE "${ABSL_INCLUDE_DIR}") endforeach() # Extracted the dependency information using the Abseil pkg-config files: @@ -3694,6 +3660,10 @@ endmacro() macro(build_grpc) resolve_dependency(c-ares + ARROW_CMAKE_PACKAGE_NAME + ArrowFlight + ARROW_PC_PACKAGE_NAME + arrow-flight HAVE_ALT TRUE PC_PACKAGE_NAMES @@ -3847,9 +3817,9 @@ macro(build_grpc) file(MAKE_DIRECTORY ${GRPC_INCLUDE_DIR}) add_library(gRPC::upb STATIC IMPORTED) - set_target_properties(gRPC::upb - PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_UPB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + set_target_properties(gRPC::upb PROPERTIES IMPORTED_LOCATION + "${GRPC_STATIC_LIBRARY_UPB}") + target_include_directories(gRPC::upb BEFORE INTERFACE "${GRPC_INCLUDE_DIR}") set(GRPC_GPR_ABSL_LIBRARIES # We need a flattened list of Abseil libraries for the static linking case, @@ -3910,20 +3880,21 @@ macro(build_grpc) add_library(gRPC::gpr STATIC IMPORTED) set_target_properties(gRPC::gpr PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GPR}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${GRPC_GPR_ABSL_LIBRARIES}") + target_include_directories(gRPC::gpr BEFORE INTERFACE "${GRPC_INCLUDE_DIR}") add_library(gRPC::address_sorting STATIC IMPORTED) set_target_properties(gRPC::address_sorting PROPERTIES IMPORTED_LOCATION - "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}") + target_include_directories(gRPC::address_sorting BEFORE INTERFACE "${GRPC_INCLUDE_DIR}") add_library(gRPC::grpc++_reflection STATIC IMPORTED) set_target_properties(gRPC::grpc++_reflection PROPERTIES IMPORTED_LOCATION - "${GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + "${GRPC_STATIC_LIBRARY_GRPCPP_REFLECTION}") + target_include_directories(gRPC::grpc++_reflection BEFORE + INTERFACE "${GRPC_INCLUDE_DIR}") add_library(gRPC::grpc STATIC IMPORTED) set(GRPC_LINK_LIBRARIES @@ -3937,15 +3908,15 @@ macro(build_grpc) Threads::Threads) set_target_properties(gRPC::grpc PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPC}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${GRPC_LINK_LIBRARIES}") + target_include_directories(gRPC::grpc BEFORE INTERFACE "${GRPC_INCLUDE_DIR}") add_library(gRPC::grpc++ STATIC IMPORTED) set(GRPCPP_LINK_LIBRARIES gRPC::grpc ${ARROW_PROTOBUF_LIBPROTOBUF}) set_target_properties(gRPC::grpc++ PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPCPP}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${GRPCPP_LINK_LIBRARIES}") + target_include_directories(gRPC::grpc++ BEFORE INTERFACE "${GRPC_INCLUDE_DIR}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION @@ -3998,12 +3969,16 @@ if(ARROW_WITH_GRPC) set(gRPC_SOURCE "${Protobuf_SOURCE}") endif() resolve_dependency(gRPC + ARROW_CMAKE_PACKAGE_NAME + ArrowFlight + ARROW_PC_PACKAGE_NAME + arrow-flight HAVE_ALT TRUE - REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION} PC_PACKAGE_NAMES - grpc++) + grpc++ + REQUIRED_VERSION + ${ARROW_GRPC_REQUIRED_VERSION}) if(GRPC_VENDORED) # Remove "v" from "vX.Y.Z" @@ -4058,10 +4033,9 @@ macro(build_crc32c_once) # Work around https://gitlab.kitware.com/cmake/cmake/issues/15052 file(MAKE_DIRECTORY "${CRC32C_INCLUDE_DIR}") add_library(Crc32c::crc32c STATIC IMPORTED) - set_target_properties(Crc32c::crc32c - PROPERTIES IMPORTED_LOCATION ${_CRC32C_STATIC_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES - "${CRC32C_INCLUDE_DIR}") + set_target_properties(Crc32c::crc32c PROPERTIES IMPORTED_LOCATION + ${_CRC32C_STATIC_LIBRARY}) + target_include_directories(Crc32c::crc32c BEFORE INTERFACE "${CRC32C_INCLUDE_DIR}") add_dependencies(Crc32c::crc32c crc32c_ep) endif() endmacro() @@ -4089,9 +4063,8 @@ macro(build_nlohmann_json) file(MAKE_DIRECTORY ${NLOHMANN_JSON_INCLUDE_DIR}) add_library(nlohmann_json::nlohmann_json INTERFACE IMPORTED) - set_target_properties(nlohmann_json::nlohmann_json - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${NLOHMANN_JSON_INCLUDE_DIR}") + target_include_directories(nlohmann_json::nlohmann_json BEFORE + INTERFACE "${NLOHMANN_JSON_INCLUDE_DIR}") add_dependencies(nlohmann_json::nlohmann_json nlohmann_json_ep) endmacro() if(ARROW_WITH_NLOHMANN_JSON) @@ -4200,9 +4173,9 @@ macro(build_google_cloud_cpp_storage) add_library(google-cloud-cpp::common STATIC IMPORTED) set_target_properties(google-cloud-cpp::common PROPERTIES IMPORTED_LOCATION - "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON}" - INTERFACE_INCLUDE_DIRECTORIES - "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_COMMON}") + target_include_directories(google-cloud-cpp::common BEFORE + INTERFACE "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") # Refer to https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/google_cloud_cpp_common.cmake # (subsitute `main` for the SHA of the version we use) # Version 1.39.0 is at a different place (they refactored after): @@ -4222,9 +4195,9 @@ macro(build_google_cloud_cpp_storage) add_library(google-cloud-cpp::rest-internal STATIC IMPORTED) set_target_properties(google-cloud-cpp::rest-internal PROPERTIES IMPORTED_LOCATION - "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL}" - INTERFACE_INCLUDE_DIRECTORIES - "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_REST_INTERNAL}") + target_include_directories(google-cloud-cpp::rest-internal BEFORE + INTERFACE "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") set_property(TARGET google-cloud-cpp::rest-internal PROPERTY INTERFACE_LINK_LIBRARIES absl::span @@ -4237,9 +4210,9 @@ macro(build_google_cloud_cpp_storage) add_library(google-cloud-cpp::storage STATIC IMPORTED) set_target_properties(google-cloud-cpp::storage PROPERTIES IMPORTED_LOCATION - "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}" - INTERFACE_INCLUDE_DIRECTORIES - "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") + "${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}") + target_include_directories(google-cloud-cpp::storage BEFORE + INTERFACE "${GOOGLE_CLOUD_CPP_INCLUDE_DIR}") # Update this from https://github.com/googleapis/google-cloud-cpp/blob/main/google/cloud/storage/google_cloud_cpp_storage.cmake set_property(TARGET google-cloud-cpp::storage PROPERTY INTERFACE_LINK_LIBRARIES @@ -4400,9 +4373,8 @@ macro(build_orc) set(ORC_VENDORED 1) add_library(orc::liborc STATIC IMPORTED) - set_target_properties(orc::liborc - PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") + set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") + target_include_directories(orc::liborc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") set(ORC_LINK_LIBRARIES LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) # Protobuf generated files may use ABSL_DCHECK*() and # absl::log_internal_check_op is needed for them. @@ -4461,9 +4433,8 @@ macro(build_opentelemetry) foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_APIS}) add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} INTERFACE IMPORTED) - set_target_properties(opentelemetry-cpp::${_OPENTELEMETRY_LIB} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${OPENTELEMETRY_INCLUDE_DIR}") + target_include_directories(opentelemetry-cpp::${_OPENTELEMETRY_LIB} BEFORE + INTERFACE "${OPENTELEMETRY_INCLUDE_DIR}") endforeach() foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) # N.B. OTel targets and libraries don't follow any consistent naming scheme @@ -4755,15 +4726,15 @@ macro(build_awssdk) set(_AWSSDK_TARGET_NAME AWS::${_AWSSDK_LIB}) endif() add_library(${_AWSSDK_TARGET_NAME} STATIC IMPORTED) - set_target_properties(${_AWSSDK_TARGET_NAME} - PROPERTIES IMPORTED_LOCATION ${_AWSSDK_STATIC_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES - "${AWSSDK_INCLUDE_DIR}") + set_target_properties(${_AWSSDK_TARGET_NAME} PROPERTIES IMPORTED_LOCATION + ${_AWSSDK_STATIC_LIBRARY}) + target_include_directories(${_AWSSDK_TARGET_NAME} BEFORE + INTERFACE "${AWSSDK_INCLUDE_DIR}") if(${_AWSSDK_LIB} STREQUAL "aws-lc") - set_target_properties(${_AWSSDK_TARGET_NAME} - PROPERTIES IMPORTED_LOCATION ${_AWSSDK_STATIC_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES - "${AWS_LC_INCLUDE_DIR}") + set_target_properties(${_AWSSDK_TARGET_NAME} PROPERTIES IMPORTED_LOCATION + ${_AWSSDK_STATIC_LIBRARY}) + target_include_directories(${_AWSSDK_TARGET_NAME} BEFORE + INTERFACE "${AWS_LC_INCLUDE_DIR}") endif() set("${_AWSSDK_LIB_NAME_PREFIX}_STATIC_LIBRARY" ${_AWSSDK_STATIC_LIBRARY}) @@ -5049,6 +5020,48 @@ if(ARROW_S3) endif() endif() +# ---------------------------------------------------------------------- +# Azure SDK for C++ + +function(build_azure_sdk) + message(STATUS "Building Azure SDK for C++ from source") + fetchcontent_declare(azure_sdk + URL ${ARROW_AZURE_SDK_URL} + URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}") + prepare_fetchcontent() + set(BUILD_PERFORMANCE_TESTS FALSE) + set(BUILD_SAMPLES FALSE) + set(BUILD_TESTING FALSE) + set(BUILD_WINDOWS_UWP TRUE) + set(CMAKE_UNITY_BUILD FALSE) + set(DISABLE_AZURE_CORE_OPENTELEMETRY TRUE) + set(ENV{AZURE_SDK_DISABLE_AUTO_VCPKG} TRUE) + set(WARNINGS_AS_ERRORS FALSE) + fetchcontent_makeavailable(azure_sdk) + set(AZURE_SDK_VENDORED + TRUE + PARENT_SCOPE) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + Azure::azure-core + Azure::azure-identity + Azure::azure-storage-blobs + Azure::azure-storage-common + Azure::azure-storage-files-datalake) + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} + PARENT_SCOPE) +endfunction() + +if(ARROW_WITH_AZURE_SDK) + resolve_dependency(Azure REQUIRED_VERSION 1.10.2) + set(AZURE_SDK_LINK_LIBRARIES + Azure::azure-storage-files-datalake + Azure::azure-storage-common + Azure::azure-storage-blobs + Azure::azure-identity + Azure::azure-core) +endif() # ---------------------------------------------------------------------- # ucx - communication framework for modern, high-bandwidth and low-latency networks @@ -5119,7 +5132,13 @@ macro(build_ucx) endmacro() if(ARROW_WITH_UCX) - resolve_dependency(ucx PC_PACKAGE_NAMES ucx) + resolve_dependency(ucx + ARROW_CMAKE_PACKAGE_NAME + ArrowFlight + ARROW_PC_PACKAGE_NAME + arrow-flight + PC_PACKAGE_NAMES + ucx) add_library(ucx::ucx INTERFACE IMPORTED) target_include_directories(ucx::ucx INTERFACE "${UCX_INCLUDE_DIRS}") target_link_libraries(ucx::ucx INTERFACE ucx::ucp ucx::uct ucx::ucs) diff --git a/cpp/examples/arrow/join_example.cc b/cpp/examples/arrow/join_example.cc index 17f709c720e43..c1c6e5e82ff11 100644 --- a/cpp/examples/arrow/join_example.cc +++ b/cpp/examples/arrow/join_example.cc @@ -64,7 +64,7 @@ arrow::Result> CreateDataSetFromCSVData std::string csv_data = is_left ? kLeftRelationCsvData : kRightRelationCsvData; std::cout << csv_data << std::endl; std::string_view sv = csv_data; - input = std::make_shared(sv); + input = arrow::io::BufferReader::FromString(std::string(sv)); auto read_options = arrow::csv::ReadOptions::Defaults(); auto parse_options = arrow::csv::ParseOptions::Defaults(); auto convert_options = arrow::csv::ConvertOptions::Defaults(); diff --git a/cpp/examples/minimal_build/run_static.sh b/cpp/examples/minimal_build/run_static.sh index 5b6afbd67aea8..189f59a007b2a 100755 --- a/cpp/examples/minimal_build/run_static.sh +++ b/cpp/examples/minimal_build/run_static.sh @@ -39,24 +39,9 @@ NPROC=$(nproc) cmake $ARROW_DIR/cpp \ -DARROW_BUILD_SHARED=OFF \ -DARROW_BUILD_STATIC=ON \ - -DARROW_COMPUTE=ON \ -DARROW_CSV=ON \ - -DARROW_DATASET=ON \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE} \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_FILESYSTEM=ON \ - -DARROW_HDFS=ON \ - -DARROW_JEMALLOC=ON \ - -DARROW_JSON=ON \ - -DARROW_ORC=ON \ - -DARROW_PARQUET=ON \ - -DARROW_WITH_BROTLI=ON \ - -DARROW_WITH_BZ2=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - -DARROW_WITH_ZSTD=ON \ - -DORC_SOURCE=BUNDLED \ -Dxsimd_SOURCE=BUNDLED \ $ARROW_CMAKE_OPTIONS diff --git a/cpp/src/arrow/ArrowConfig.cmake.in b/cpp/src/arrow/ArrowConfig.cmake.in index deb7bf94a80a2..7bd19fb41a31f 100644 --- a/cpp/src/arrow/ArrowConfig.cmake.in +++ b/cpp/src/arrow/ArrowConfig.cmake.in @@ -41,13 +41,7 @@ set(ARROW_SYSTEM_DEPENDENCIES "@ARROW_SYSTEM_DEPENDENCIES@") include("${CMAKE_CURRENT_LIST_DIR}/ArrowOptions.cmake") -if(ARROW_BUILD_STATIC) - include(CMakeFindDependencyMacro) - - set(CMAKE_THREAD_PREFER_PTHREAD TRUE) - set(THREADS_PREFER_PTHREAD_FLAG TRUE) - find_dependency(Threads) - +macro(arrow_find_dependencies dependencies) if(DEFINED CMAKE_MODULE_PATH) set(ARROW_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) else() @@ -55,9 +49,9 @@ if(ARROW_BUILD_STATIC) endif() set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") - foreach(_DEPENDENCY ${ARROW_SYSTEM_DEPENDENCIES}) + foreach(dependency ${dependencies}) set(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE FALSE) - if(${_DEPENDENCY} STREQUAL "OpenSSL" AND NOT OPENSSL_ROOT_DIR) + if(${dependency} STREQUAL "OpenSSL" AND NOT OPENSSL_ROOT_DIR) find_program(ARROW_BREW brew) if(ARROW_BREW) set(ARROW_OPENSSL_ROOT_DIR_ORIGINAL ${OPENSSL_ROOT_DIR}) @@ -78,7 +72,7 @@ if(ARROW_BUILD_STATIC) endif() endif() endif() - find_dependency(${_DEPENDENCY}) + find_dependency(${dependency}) if(ARROW_OPENSSL_HOMEBREW_MAKE_DETECTABLE) set(OPENSSL_ROOT_DIR ${ARROW_OPENSSL_ROOT_DIR_ORIGINAL}) endif() @@ -90,6 +84,16 @@ if(ARROW_BUILD_STATIC) else() unset(CMAKE_MODULE_PATH) endif() +endmacro() + +if(ARROW_BUILD_STATIC) + include(CMakeFindDependencyMacro) + + set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + set(THREADS_PREFER_PTHREAD_FLAG TRUE) + find_dependency(Threads) + + arrow_find_dependencies("${ARROW_SYSTEM_DEPENDENCIES}") endif() include("${CMAKE_CURRENT_LIST_DIR}/ArrowTargets.cmake") diff --git a/cpp/src/arrow/ArrowTestingConfig.cmake.in b/cpp/src/arrow/ArrowTestingConfig.cmake.in index b65f6ef0d58b0..148d6516a093f 100644 --- a/cpp/src/arrow/ArrowTestingConfig.cmake.in +++ b/cpp/src/arrow/ArrowTestingConfig.cmake.in @@ -26,26 +26,12 @@ @PACKAGE_INIT@ -set(ARROW_GTEST_SOURCE "@GTest_SOURCE@") +set(ARROW_TESTING_SYSTEM_DEPENDENCIES "@ARROW_TESTING_SYSTEM_DEPENDENCIES@") include(CMakeFindDependencyMacro) find_dependency(Arrow) -if(DEFINED CMAKE_MODULE_PATH) - set(ARROW_TESTING_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) -else() - unset(ARROW_TESTING_CMAKE_MODULE_PATH_OLD) -endif() -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") -if("${ARROW_GTEST_SOURCE}" STREQUAL "SYSTEM") - find_dependency(GTestAlt) -endif() -if(DEFINED ARROW_TESTING_CMAKE_MODULE_PATH_OLD) - set(CMAKE_MODULE_PATH ${ARROW_TESTING_CMAKE_MODULE_PATH_OLD}) - unset(ARROW_TESTING_CMAKE_MODULE_PATH_OLD) -else() - unset(CMAKE_MODULE_PATH) -endif() +arrow_find_dependencies("${ARROW_TESTING_SYSTEM_DEPENDENCIES}") include("${CMAKE_CURRENT_LIST_DIR}/ArrowTestingTargets.cmake") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a398e790de14b..9a6117011535e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -127,6 +127,15 @@ macro(append_runtime_avx2_src SRC) endif() endmacro() +macro(append_runtime_avx2_bmi2_src SRC) + if(ARROW_HAVE_RUNTIME_AVX2 AND ARROW_HAVE_RUNTIME_BMI2) + list(APPEND ARROW_SRCS ${SRC}) + set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) + set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS + "${ARROW_AVX2_FLAG} ${ARROW_BMI2_FLAG}") + endif() +endmacro() + macro(append_runtime_avx512_src SRC) if(ARROW_HAVE_RUNTIME_AVX512) list(APPEND ARROW_SRCS ${SRC}) @@ -324,11 +333,14 @@ if(ARROW_WITH_ZSTD) list(APPEND ARROW_SRCS util/compression_zstd.cc) endif() +set(ARROW_TESTING_SHARED_LINK_LIBS arrow::flatbuffers rapidjson::rapidjson arrow_shared + ${ARROW_GTEST_GTEST}) +set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers rapidjson::rapidjson arrow_static + ${ARROW_GTEST_GTEST}) + set(ARROW_TESTING_SRCS io/test_common.cc ipc/test_common.cc - testing/json_integration.cc - testing/json_internal.cc testing/gtest_util.cc testing/random.cc testing/generator.cc @@ -370,6 +382,14 @@ endif() # Configure the base Arrow libraries # +if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS) + list(APPEND + ARROW_SRCS + integration/c_data_integration_internal.cc + integration/json_integration.cc + integration/json_internal.cc) +endif() + if(ARROW_CSV) list(APPEND ARROW_SRCS @@ -426,10 +446,10 @@ list(APPEND compute/util.cc) append_runtime_avx2_src(compute/key_hash_avx2.cc) -append_runtime_avx2_src(compute/key_map_avx2.cc) +append_runtime_avx2_bmi2_src(compute/key_map_avx2.cc) append_runtime_avx2_src(compute/row/compare_internal_avx2.cc) append_runtime_avx2_src(compute/row/encode_internal_avx2.cc) -append_runtime_avx2_src(compute/util_avx2.cc) +append_runtime_avx2_bmi2_src(compute/util_avx2.cc) if(ARROW_COMPUTE) # Include the remaining kernels @@ -497,22 +517,14 @@ if(ARROW_FILESYSTEM) list(APPEND ARROW_SRCS filesystem/hdfs.cc) endif() if(ARROW_S3) - try_compile(S3_HAS_CRT ${CMAKE_CURRENT_BINARY_DIR}/try_compile - SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/filesystem/try_compile/check_s3fs_crt.cc" - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CURRENT_INCLUDE_DIRECTORIES}" - LINK_LIBRARIES ${AWSSDK_LINK_LIBRARIES} CXX_STANDARD 17) - - if(S3_HAS_CRT) - message(STATUS "AWS SDK is new enough to have CRT support") - add_definitions(-DARROW_S3_HAS_CRT) - endif() - list(APPEND ARROW_SRCS filesystem/s3fs.cc) set_source_files_properties(filesystem/s3fs.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) endif() + list(APPEND ARROW_TESTING_SHARED_LINK_LIBS ${ARROW_GTEST_GMOCK}) + list(APPEND ARROW_TESTING_STATIC_LINK_LIBS ${ARROW_GTEST_GMOCK}) list(APPEND ARROW_TESTING_SRCS filesystem/test_util.cc) endif() @@ -720,6 +732,9 @@ if(ARROW_TESTING) if(GTest_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS ${ARROW_GTEST_GTEST}) list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_GTEST_GTEST}) + else() + list(APPEND ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS ArrowTesting::gtest) + list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS ArrowTesting::gtest) endif() add_arrow_lib(arrow_testing CMAKE_PACKAGE_NAME @@ -735,17 +750,11 @@ if(ARROW_TESTING) DEPENDENCIES arrow_test_dependencies SHARED_LINK_LIBS - arrow::flatbuffers - rapidjson::rapidjson - arrow_shared - ${ARROW_GTEST_GTEST} + ${ARROW_TESTING_SHARED_LINK_LIBS} SHARED_INSTALL_INTERFACE_LIBS ${ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS} STATIC_LINK_LIBS - arrow::flatbuffers - rapidjson::rapidjson - arrow_static - ${ARROW_GTEST_GTEST} + ${ARROW_TESTING_STATIC_LINK_LIBS} STATIC_INSTALL_INTERFACE_LIBS ${ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS}) @@ -842,12 +851,15 @@ add_subdirectory(tensor) add_subdirectory(util) add_subdirectory(vendored) -if(ARROW_CSV) - add_subdirectory(csv) +if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS) + # We build tests for the JSON integration machinery even if integration + # is not enabled, to ensure it's exercised in more builds than just the + # integration build. + add_subdirectory(integration) endif() -if(ARROW_SUBSTRAIT) - add_subdirectory(engine) +if(ARROW_CSV) + add_subdirectory(csv) endif() if(ARROW_ACERO) @@ -883,6 +895,10 @@ if(ARROW_ORC) add_subdirectory(adapters/orc) endif() +if(ARROW_SUBSTRAIT) + add_subdirectory(engine) +endif() + if(ARROW_TENSORFLOW) add_subdirectory(adapters/tensorflow) endif() diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 93cc4f4649d1f..73ecde6b9b576 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -483,9 +483,9 @@ TEST(TestAdapterRead, ReadCharAndVarcharType) { writer->add(*batch); writer->close(); - std::shared_ptr in_stream(std::make_shared( - reinterpret_cast(mem_stream.getData()), - static_cast(mem_stream.getLength()))); + std::shared_ptr in_stream = std::make_shared( + std::make_shared(reinterpret_cast(mem_stream.getData()), + mem_stream.getLength())); ASSERT_OK_AND_ASSIGN( auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool())); ASSERT_EQ(row_count, reader->NumberOfRows()); @@ -557,9 +557,9 @@ TEST(TestAdapterRead, ReadFieldAttributes) { auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream); writer->close(); - std::shared_ptr in_stream(std::make_shared( - reinterpret_cast(mem_stream.getData()), - static_cast(mem_stream.getLength()))); + std::shared_ptr in_stream = std::make_shared( + std::make_shared(reinterpret_cast(mem_stream.getData()), + mem_stream.getLength())); ASSERT_OK_AND_ASSIGN( auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool())); ASSERT_EQ(0, reader->NumberOfRows()); diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index cccc7bb78220d..c9e2f93cde66f 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -282,9 +282,9 @@ class DictionaryUnifierImpl : public DictionaryUnifier { *out_type = arrow::dictionary(index_type, value_type_); // Build unified dictionary array - std::shared_ptr data; - RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_, - 0 /* start_offset */, &data)); + ARROW_ASSIGN_OR_RAISE( + auto data, DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_, + 0 /* start_offset */)); *out_dict = MakeArray(data); return Status::OK(); } @@ -299,9 +299,9 @@ class DictionaryUnifierImpl : public DictionaryUnifier { } // Build unified dictionary array - std::shared_ptr data; - RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_, - 0 /* start_offset */, &data)); + ARROW_ASSIGN_OR_RAISE( + auto data, DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_, + 0 /* start_offset */)); *out_dict = MakeArray(data); return Status::OK(); } diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index df60074c78470..d8308c824953a 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -627,6 +627,22 @@ std::shared_ptr StructArray::GetFieldByName(const std::string& name) cons return i == -1 ? nullptr : field(i); } +Status StructArray::CanReferenceFieldByName(const std::string& name) const { + if (GetFieldByName(name) == nullptr) { + return Status::Invalid("Field named '", name, + "' not found or not unique in the struct."); + } + return Status::OK(); +} + +Status StructArray::CanReferenceFieldsByNames( + const std::vector& names) const { + for (const auto& name : names) { + ARROW_RETURN_NOT_OK(CanReferenceFieldByName(name)); + } + return Status::OK(); +} + Result StructArray::Flatten(MemoryPool* pool) const { ArrayVector flattened; flattened.resize(data_->child_data.size()); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 47c1db039ccc9..8d5cc95fec00d 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -404,6 +404,12 @@ class ARROW_EXPORT StructArray : public Array { /// Returns null if name not found std::shared_ptr GetFieldByName(const std::string& name) const; + /// Indicate if field named `name` can be found unambiguously in the struct. + Status CanReferenceFieldByName(const std::string& name) const; + + /// Indicate if fields named `names` can be found unambiguously in the struct. + Status CanReferenceFieldsByNames(const std::vector& names) const; + /// \brief Flatten this array as a vector of arrays, one for each field /// /// \param[in] pool The pool to allocate null bitmaps from, if necessary diff --git a/cpp/src/arrow/array/array_struct_test.cc b/cpp/src/arrow/array/array_struct_test.cc index 318c83860e009..73d53a7efa59b 100644 --- a/cpp/src/arrow/array/array_struct_test.cc +++ b/cpp/src/arrow/array/array_struct_test.cc @@ -303,6 +303,58 @@ TEST(StructArray, FlattenOfSlice) { ASSERT_OK(arr->ValidateFull()); } +TEST(StructArray, CanReferenceFieldByName) { + auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldByName("f0")); + ASSERT_OK(arr->CanReferenceFieldByName("f2")); + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("nope")); + + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldByName("f1")); +} + +TEST(StructArray, CanReferenceFieldsByNames) { + auto a = ArrayFromJSON(int8(), "[4, 5]"); + auto b = ArrayFromJSON(int16(), "[6, 7]"); + auto c = ArrayFromJSON(int32(), "[8, 9]"); + auto d = ArrayFromJSON(int64(), "[10, 11]"); + auto children = std::vector>{a, b, c, d}; + + auto f0 = field("f0", int8()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", int32()); + auto f3 = field("f1", int64()); + auto type = struct_({f0, f1, f2, f3}); + + auto arr = std::make_shared(type, 2, children); + + ASSERT_OK(arr->CanReferenceFieldsByNames({"f0", "f2"})); + ASSERT_OK(arr->CanReferenceFieldsByNames({"f2", "f0"})); + + // Not found + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"nope"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "nope"})); + // Duplicates + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f1"})); + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1"})); + // Both + ASSERT_RAISES(Invalid, arr->CanReferenceFieldsByNames({"f0", "f1", "nope"})); +} + // ---------------------------------------------------------------------------------- // Struct test class TestStructBuilder : public ::testing::Test { diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 061fb600412fd..525b0afbc908a 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -106,8 +106,9 @@ class DictionaryMemoTable::DictionaryMemoTableImpl { enable_if_memoize Visit(const T&) { using ConcreteMemoTable = typename DictionaryTraits::MemoTableType; auto memo_table = checked_cast(memo_table_); - return DictionaryTraits::GetDictionaryArrayData(pool_, value_type_, *memo_table, - start_offset_, out_); + ARROW_ASSIGN_OR_RAISE(*out_, DictionaryTraits::GetDictionaryArrayData( + pool_, value_type_, *memo_table, start_offset_)); + return Status::OK(); } }; diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 5245c8d0ff313..3c1c8c453d1e7 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -29,6 +29,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -63,11 +64,9 @@ struct DictionaryTraits { using T = BooleanType; using MemoTableType = typename HashTraits::MemoTableType; - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { if (start_offset < 0) { return Status::Invalid("invalid start_offset ", start_offset); } @@ -82,7 +81,9 @@ struct DictionaryTraits { : builder.Append(bool_values[i])); } - return builder.FinishInternal(out); + std::shared_ptr out; + RETURN_NOT_OK(builder.FinishInternal(&out)); + return out; } }; // namespace internal @@ -91,11 +92,9 @@ struct DictionaryTraits> { using c_type = typename T::c_type; using MemoTableType = typename HashTraits::MemoTableType; - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { auto dict_length = static_cast(memo_table.size()) - start_offset; // This makes a copy, but we assume a dictionary array is usually small // compared to the size of the dictionary-using array. @@ -112,8 +111,7 @@ struct DictionaryTraits> { RETURN_NOT_OK( ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap)); - *out = ArrayData::Make(type, dict_length, {null_bitmap, dict_buffer}, null_count); - return Status::OK(); + return ArrayData::Make(type, dict_length, {null_bitmap, dict_buffer}, null_count); } }; @@ -121,11 +119,9 @@ template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { using offset_type = typename T::offset_type; // Create the offsets buffer @@ -148,11 +144,9 @@ struct DictionaryTraits> { RETURN_NOT_OK( ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap)); - *out = ArrayData::Make(type, dict_length, + return ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_offsets), std::move(dict_data)}, null_count); - - return Status::OK(); } }; @@ -160,11 +154,9 @@ template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { const T& concrete_type = internal::checked_cast(*type); // Create the data buffer @@ -182,9 +174,8 @@ struct DictionaryTraits> { RETURN_NOT_OK( ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap)); - *out = ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_data)}, + return ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_data)}, null_count); - return Status::OK(); } }; diff --git a/cpp/src/arrow/arrow-config.cmake b/cpp/src/arrow/arrow-config.cmake index 8c9173c1710cb..c18c9eff37279 100644 --- a/cpp/src/arrow/arrow-config.cmake +++ b/cpp/src/arrow/arrow-config.cmake @@ -19,8 +19,7 @@ message(WARNING "find_package(arrow) is deprecated. Use find_package(Arrow) inst find_package(Arrow CONFIG) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(arrow - REQUIRED_VARS - ARROW_INCLUDE_DIR - VERSION_VAR - ARROW_VERSION) +find_package_handle_standard_args( + arrow + REQUIRED_VARS ARROW_INCLUDE_DIR + VERSION_VAR ARROW_VERSION) diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 99dc29cfe5296..1bd789b7cafe6 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -114,14 +114,14 @@ void Buffer::CheckCPU() const { Result> Buffer::GetReader( std::shared_ptr buf) { - return buf->memory_manager_->GetBufferReader(buf); + return buf->memory_manager_->GetBufferReader(std::move(buf)); } Result> Buffer::GetWriter(std::shared_ptr buf) { if (!buf->is_mutable()) { return Status::Invalid("Expected mutable buffer"); } - return buf->memory_manager_->GetBufferWriter(buf); + return buf->memory_manager_->GetBufferWriter(std::move(buf)); } Result> Buffer::Copy(std::shared_ptr source, diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index c5e6d7fa4bdf0..12937406e7800 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -30,6 +30,7 @@ #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -111,13 +112,30 @@ bool ChunkedArray::Equals(const ChunkedArray& other) const { .ok(); } -bool ChunkedArray::Equals(const std::shared_ptr& other) const { - if (this == other.get()) { - return true; +namespace { + +bool mayHaveNaN(const arrow::DataType& type) { + if (type.num_fields() == 0) { + return is_floating(type.id()); + } else { + for (const auto& field : type.fields()) { + if (mayHaveNaN(*field->type())) { + return true; + } + } } + return false; +} + +} // namespace + +bool ChunkedArray::Equals(const std::shared_ptr& other) const { if (!other) { return false; } + if (this == other.get() && !mayHaveNaN(*type_)) { + return true; + } return Equals(*other.get()); } diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 08410b4cd5367..46dccaf3c6b86 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -146,6 +146,35 @@ TEST_F(TestChunkedArray, EqualsDifferingMetadata) { ASSERT_TRUE(left.Equals(right)); } +TEST_F(TestChunkedArray, EqualsSameAddressWithNaNs) { + auto chunk_with_nan1 = ArrayFromJSON(float64(), "[0, 1, 2, NaN]"); + auto chunk_without_nan1 = ArrayFromJSON(float64(), "[3, 4, 5]"); + ArrayVector chunks1 = {chunk_with_nan1, chunk_without_nan1}; + ASSERT_OK_AND_ASSIGN(auto chunked_array_with_nan1, ChunkedArray::Make(chunks1)); + ASSERT_FALSE(chunked_array_with_nan1->Equals(chunked_array_with_nan1)); + + auto chunk_without_nan2 = ArrayFromJSON(float64(), "[6, 7, 8, 9]"); + ArrayVector chunks2 = {chunk_without_nan1, chunk_without_nan2}; + ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan1, ChunkedArray::Make(chunks2)); + ASSERT_TRUE(chunked_array_without_nan1->Equals(chunked_array_without_nan1)); + + auto int32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); + auto float64_array_with_nan = ArrayFromJSON(float64(), "[0, 1, NaN]"); + ArrayVector arrays1 = {int32_array, float64_array_with_nan}; + std::vector fieldnames = {"Int32Type", "Float64Type"}; + ASSERT_OK_AND_ASSIGN(auto struct_with_nan, StructArray::Make(arrays1, fieldnames)); + ArrayVector chunks3 = {struct_with_nan}; + ASSERT_OK_AND_ASSIGN(auto chunked_array_with_nan2, ChunkedArray::Make(chunks3)); + ASSERT_FALSE(chunked_array_with_nan2->Equals(chunked_array_with_nan2)); + + auto float64_array_without_nan = ArrayFromJSON(float64(), "[0, 1, 2]"); + ArrayVector arrays2 = {int32_array, float64_array_without_nan}; + ASSERT_OK_AND_ASSIGN(auto struct_without_nan, StructArray::Make(arrays2, fieldnames)); + ArrayVector chunks4 = {struct_without_nan}; + ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan2, ChunkedArray::Make(chunks4)); + ASSERT_TRUE(chunked_array_without_nan2->Equals(chunked_array_without_nan2)); +} + TEST_F(TestChunkedArray, SliceEquals) { random::RandomArrayGenerator gen(42); diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 1d71c14dca714..001424dd42072 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -19,8 +19,10 @@ add_custom_target(arrow_compute) arrow_install_all_headers("arrow/compute") -# pkg-config support -arrow_add_pkg_config("arrow-compute") +if(ARROW_COMPUTE) + # pkg-config support + arrow_add_pkg_config("arrow-compute") +endif() # # Unit tests @@ -87,7 +89,9 @@ add_arrow_test(internals_test kernel_test.cc light_array_test.cc registry_test.cc - key_hash_test.cc) + key_hash_test.cc + EXTRA_LINK_LIBS + ${ARROW_GTEST_GMOCK}) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 8f45f6199fbe1..3493c3146310d 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -138,7 +138,7 @@ class ARROW_EXPORT QuantileOptions : public FunctionOptions { static constexpr char const kTypeName[] = "QuantileOptions"; static QuantileOptions Defaults() { return QuantileOptions{}; } - /// quantile must be between 0 and 1 inclusive + /// probability level of quantile must be between 0 and 1 inclusive std::vector q; enum Interpolation interpolation; /// If true (the default), null values are ignored. Otherwise, if any value is null, @@ -162,7 +162,7 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions { static constexpr char const kTypeName[] = "TDigestOptions"; static TDigestOptions Defaults() { return TDigestOptions{}; } - /// quantile must be between 0 and 1 inclusive + /// probability level of quantile must be between 0 and 1 inclusive std::vector q; /// compression parameter, default 100 uint32_t delta; diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index d7a61d0a55985..eaec940556361 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -275,6 +275,29 @@ struct EnumTraits } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; } + static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) { + switch (value) { + case compute::SetLookupOptions::NullMatchingBehavior::MATCH: + return "MATCH"; + case compute::SetLookupOptions::NullMatchingBehavior::SKIP: + return "SKIP"; + case compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL: + return "EMIT_NULL"; + case compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE: + return "INCONCLUSIVE"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -286,6 +309,7 @@ using ::arrow::internal::checked_cast; namespace internal { namespace { +using ::arrow::internal::CoercedDataMember; using ::arrow::internal::DataMember; static auto kArithmeticOptionsType = GetFunctionOptionsType( DataMember("check_overflow", &ArithmeticOptions::check_overflow)); @@ -344,7 +368,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType( DataMember("value_set", &SetLookupOptions::value_set), - DataMember("skip_nulls", &SetLookupOptions::skip_nulls)); + CoercedDataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior, + &SetLookupOptions::GetNullMatchingBehavior)); static auto kSliceOptionsType = GetFunctionOptionsType( DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), DataMember("step", &SliceOptions::step)); @@ -540,8 +565,29 @@ constexpr char RoundToMultipleOptions::kTypeName[]; SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) : FunctionOptions(internal::kSetLookupOptionsType), value_set(std::move(value_set)), - skip_nulls(skip_nulls) {} -SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {} + skip_nulls(skip_nulls) { + if (skip_nulls) { + this->null_matching_behavior = SetLookupOptions::SKIP; + } else { + this->null_matching_behavior = SetLookupOptions::MATCH; + } +} +SetLookupOptions::SetLookupOptions( + Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior) + : FunctionOptions(internal::kSetLookupOptionsType), + value_set(std::move(value_set)), + null_matching_behavior(std::move(null_matching_behavior)) {} +SetLookupOptions::SetLookupOptions() + : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} +SetLookupOptions::NullMatchingBehavior SetLookupOptions::GetNullMatchingBehavior() const { + if (!this->skip_nulls.has_value()) { + return this->null_matching_behavior; + } else if (this->skip_nulls.value()) { + return SetLookupOptions::SKIP; + } else { + return SetLookupOptions::MATCH; + } +} constexpr char SetLookupOptions::kTypeName[]; SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 0a06a2829f0da..9f12471ddca14 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -268,19 +268,49 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { /// Options for IsIn and IndexIn functions class ARROW_EXPORT SetLookupOptions : public FunctionOptions { public: - explicit SetLookupOptions(Datum value_set, bool skip_nulls = false); + /// How to handle null values. + enum NullMatchingBehavior { + /// MATCH, any null in `value_set` is successfully matched in + /// the input. + MATCH, + /// SKIP, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + SKIP, + /// EMIT_NULL, any null in `value_set` is ignored and nulls in the + /// input produce null (IndexIn and IsIn) values in the output. + EMIT_NULL, + /// INCONCLUSIVE, null values are regarded as unknown values, which is + /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) + /// values in the output. Besides, if `value_set` contains a null, + /// non-null unmatched values in the input also produce null values + /// (IndexIn and IsIn) in the output. + INCONCLUSIVE + }; + + explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); SetLookupOptions(); + + // DEPRECATED(will be removed after removing of skip_nulls) + explicit SetLookupOptions(Datum value_set, bool skip_nulls); + static constexpr char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; + + NullMatchingBehavior null_matching_behavior; + + // DEPRECATED(will be removed after removing of skip_nulls) + NullMatchingBehavior GetNullMatchingBehavior() const; + + // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. - bool skip_nulls; + std::optional skip_nulls; }; /// Options for struct_field function diff --git a/cpp/src/arrow/compute/arrow-compute.pc.in b/cpp/src/arrow/compute/arrow-compute.pc.in index 35bfb51683779..a94a0261cce3b 100644 --- a/cpp/src/arrow/compute/arrow-compute.pc.in +++ b/cpp/src/arrow/compute/arrow-compute.pc.in @@ -20,6 +20,6 @@ includedir=@ARROW_PKG_CONFIG_INCLUDEDIR@ libdir=@ARROW_PKG_CONFIG_LIBDIR@ Name: Apache Arrow Compute -Description: Compute modules for Apache Arrow +Description: All compute kernels for Apache Arrow Version: @ARROW_VERSION@ Requires: arrow diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index b852f6f6b0cdb..44159e76600fb 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -263,8 +263,9 @@ TEST(Expression, ToString) { auto in_12 = call("index_in", {field_ref("beta")}, compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); - EXPECT_EQ(in_12.ToString(), - "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})"); + EXPECT_EQ( + in_12.ToString(), + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], null_matching_behavior=MATCH})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); diff --git a/cpp/src/arrow/compute/function_internal.cc b/cpp/src/arrow/compute/function_internal.cc index cd73462e953c4..2ef1d265ea09c 100644 --- a/cpp/src/arrow/compute/function_internal.cc +++ b/cpp/src/arrow/compute/function_internal.cc @@ -83,8 +83,10 @@ Result> GenericOptionsType::Deserialize( Result> DeserializeFunctionOptions( const Buffer& buffer) { - io::BufferReader stream(buffer); - ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream)); + // Copying the buffer here is not ideal, but we need to do it to avoid + // use-after-free issues with the zero-copy buffer read. + auto stream = io::BufferReader::FromString(buffer.ToString()); + ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(stream.get())); ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0)); if (batch->num_rows() != 1) { return Status::Invalid( diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 0bd6fe86134ab..78743050625a4 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -23,7 +23,8 @@ if(ARROW_TESTING) add_library(arrow_compute_kernels_testing OBJECT test_util.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly - target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST}) + target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST} + ${ARROW_GTEST_GMOCK}) endif() add_arrow_test(scalar_cast_test diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 3de922531ab19..4966e9871d62c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include "arrow/compute/api_aggregate.h" @@ -25,13 +26,13 @@ #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/align_util.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/decimal.h" -namespace arrow { -namespace compute { -namespace internal { +namespace arrow::compute::internal { void AddBasicAggKernels(KernelInit init, const std::vector>& types, @@ -58,16 +59,17 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func); // ---------------------------------------------------------------------- // Sum implementation -template +template ::Type> struct SumImpl : public ScalarAggregator { - using ThisType = SumImpl; + using ThisType = SumImpl; using CType = typename TypeTraits::CType; - using SumType = typename FindAccumulatorType::Type; + using SumType = ResultType; using SumCType = typename TypeTraits::CType; using OutputType = typename TypeTraits::ScalarType; - SumImpl(std::shared_ptr out_type, const ScalarAggregateOptions& options_) - : out_type(out_type), options(options_) {} + SumImpl(std::shared_ptr out_type, ScalarAggregateOptions options_) + : out_type(std::move(out_type)), options(std::move(options_)) {} Status Consume(KernelContext*, const ExecSpan& batch) override { if (batch[0].is_array()) { @@ -169,14 +171,19 @@ struct NullSumImpl : public NullImpl { } }; +template +struct MeanImpl; + template -struct MeanImpl : public SumImpl { +struct MeanImpl> + : public SumImpl { using SumImpl::SumImpl; + using SumImpl::options; + using SumCType = typename SumImpl::SumCType; + using OutputType = typename SumImpl::OutputType; template - enable_if_decimal FinalizeImpl(Datum* out) { - using SumCType = typename SumImpl::SumCType; - using OutputType = typename SumImpl::OutputType; + Status FinalizeImpl(Datum* out) { if ((!options.skip_nulls && this->nulls_observed) || (this->count < options.min_count) || (this->count == 0)) { out->value = std::make_shared(this->out_type); @@ -196,20 +203,34 @@ struct MeanImpl : public SumImpl { } return Status::OK(); } + + Status Finalize(KernelContext*, Datum* out) override { return FinalizeImpl(out); } +}; + +template +struct MeanImpl::value>> + // Override the ResultType of SumImpl because we need to use double for intermediate + // sum to prevent integer overflows + : public SumImpl { + using SumImpl::SumImpl; + using SumImpl::options; + template - enable_if_t::value, Status> FinalizeImpl(Datum* out) { + Status FinalizeImpl(Datum* out) { if ((!options.skip_nulls && this->nulls_observed) || (this->count < options.min_count)) { out->value = std::make_shared(); } else { - const double mean = static_cast(this->sum) / this->count; + static_assert(std::is_same_vsum), double>, + "SumCType must be double for numeric inputs"); + const double mean = this->sum / this->count; out->value = std::make_shared(mean); } return Status::OK(); } - Status Finalize(KernelContext*, Datum* out) override { return FinalizeImpl(out); } - using SumImpl::options; + Status Finalize(KernelContext*, Datum* out) override { return FinalizeImpl(out); } }; template