From a5edcd02b7501066834ec9748bf1d7337eea2a8e Mon Sep 17 00:00:00 2001 From: "Max H. Gerlach" Date: Mon, 17 Jan 2022 18:07:31 +0100 Subject: [PATCH] Update to CMake 3.13 for better CUDA support and to enable build concurrency (#3261) Signed-off-by: Max H. Gerlach --- CHANGELOG.md | 2 + CMakeLists.txt | 48 +- Dockerfile.test.cpu | 4 +- Dockerfile.test.gpu | 4 +- NOTICE | 35 + cmake/Modules/FindNCCL.cmake | 4 +- cmake/Modules/FindNVTX.cmake | 6 +- cmake/Modules/FindTensorflow.cmake | 2 +- cmake/build_utils.py | 2 +- cmake/upstream/FindCUDAToolkit.cmake | 917 +++++++++++++++++++++++++ docker/README.md | 1 + docker/horovod-cpu/Dockerfile | 5 +- docker/horovod-ray/Dockerfile | 5 +- docker/horovod/Dockerfile | 5 +- docs/install.rst | 4 +- horovod/common/ops/cuda/CMakeLists.txt | 21 +- setup.py | 2 +- 17 files changed, 1033 insertions(+), 34 deletions(-) create mode 100644 cmake/upstream/FindCUDAToolkit.cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index 0093a4377e..c3d82255fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed +- Moved to CMake version 3.13 with first-class CUDA language support and re-enabled parallelized builds. ([#3261](https://github.com/horovod/horovod/pull/3261)) + ### Deprecated - Deprecated ElasticRayExecutor APIs in favor of the new RayExecutor API for issue: [#3190](https://github.com/horovod/horovod/issues/3190). ### Removed diff --git a/CMakeLists.txt b/CMakeLists.txt index 862dd98d32..f79d170b07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,19 +1,17 @@ -cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR) +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) if(POLICY CMP0074) # 1. Introduced with 3.12.4. # 2. *_ROOT variables will be checked cmake_policy(SET CMP0074 NEW) endif() -if(${CMAKE_VERSION} VERSION_GREATER "3.4.0") - find_program(CCACHE_PROGRAM ccache) - if(CCACHE_PROGRAM) +find_program(CCACHE_PROGRAM ccache) +if(CCACHE_PROGRAM) set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") - else() +else() message(STATUS "Could not find CCache. Consider installing CCache to speed up compilation.") - endif() endif() project(horovod CXX) @@ -23,7 +21,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) # Configure path to modules (for find_package) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake/Modules/") +set(CMAKE_MODULE_PATH + ${CMAKE_MODULE_PATH} + "${PROJECT_SOURCE_DIR}/cmake/Modules/" + "${PROJECT_SOURCE_DIR}/cmake/upstream/") include(cmake/Utilities.cmake) create_metadata() @@ -148,14 +149,33 @@ if (NOT "$ENV{HOROVOD_WITHOUT_MPI}" STREQUAL "1") endif() # CUDA and ROCM +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +if(NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY) + set(CMAKE_CUDA_RUNTIME_LIBRARY "Shared") # Set to "Static" or "Shared" +endif() +if (DEFINED ENV{HOROVOD_CUDA_HOME}) + set(CMAKE_CUDA_COMPILER $ENV{HOROVOD_CUDA_HOME}/bin/nvcc) +endif() +include(CheckLanguage) +check_language(CUDA) +if (CMAKE_CUDA_COMPILER) + if ((CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le)) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++11") + endif () + endif () + enable_language(CUDA) +endif () + macro(ADD_CUDA) - if (DEFINED ENV{HOROVOD_CUDA_HOME}) - set(CUDA_TOOLKIT_ROOT_DIR $ENV{HOROVOD_CUDA_HOME}) + find_package(CUDAToolkit REQUIRED) + include_directories(SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + string(TOLOWER "${CMAKE_CUDA_RUNTIME_LIBRARY}" lowercase_CMAKE_CUDA_RUNTIME_LIBRARY) + if (lowercase_CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "static") + list(APPEND LINKER_LIBS CUDA::cudart_static) + elseif (lowercase_CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "shared") + list(APPEND LINKER_LIBS CUDA::cudart) endif() - option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF) - find_package(CUDA REQUIRED) - include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) - list(APPEND LINKER_LIBS ${CUDA_LIBRARIES}) list(APPEND SOURCES "${PROJECT_SOURCE_DIR}/horovod/common/ops/cuda_operations.cc" "${PROJECT_SOURCE_DIR}/horovod/common/ops/gpu_operations.cc") # CUDA + MPI @@ -209,7 +229,7 @@ endif() if(HOROVOD_GPU_ALLREDUCE STREQUAL "D") message(DEPRECATION "DDL backend has been deprecated. Please, start using the NCCL backend by building Horovod with " "'HOROVOD_GPU_OPERATIONS=NCCL'. Will be removed in v0.21.0.") - list(APPEND LINKER_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib/libddl.so" "${CUDA_TOOLKIT_ROOT_DIR}/lib/libddl_pack.so") + list(APPEND LINKER_LIBS "${CUDAToolkit_LIBRARY_ROOT}/lib/libddl.so" "${CUDAToolkit_LIBRARY_ROOT}/lib/libddl_pack.so") list(APPEND SOURCES "${PROJECT_SOURCE_DIR}/horovod/common/mpi/ddl_mpi_context_manager.cc" "${PROJECT_SOURCE_DIR}/horovod/common/ops/ddl_operations.cc") add_definitions(-DHAVE_DDL=1) diff --git a/Dockerfile.test.cpu b/Dockerfile.test.cpu index 35ce208c30..72da015125 100644 --- a/Dockerfile.test.cpu +++ b/Dockerfile.test.cpu @@ -33,7 +33,6 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test RUN apt-get update -qq && apt-get install -y --no-install-recommends \ wget \ ca-certificates \ - cmake \ openssh-client \ openssh-server \ git \ @@ -52,6 +51,9 @@ RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python${PYTHON_VERSION/%. RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py RUN pip install --no-cache-dir -U --force pip setuptools requests pytest mock pytest-forked parameterized +# Install recent CMake. +RUN pip install --no-cache-dir -U cmake~=3.13.0 + # Add launch helper scripts RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh diff --git a/Dockerfile.test.gpu b/Dockerfile.test.gpu index 3808b8a541..7388008b0a 100644 --- a/Dockerfile.test.gpu +++ b/Dockerfile.test.gpu @@ -35,7 +35,6 @@ RUN CUDNN_MAJOR=$(cut -d '.' -f 1 <<< "${CUDNN_VERSION}"); \ apt-get update -qq && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ wget \ ca-certificates \ - cmake \ openssh-client \ openssh-server \ git \ @@ -57,6 +56,9 @@ RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python${PYTHON_VERSION/%. RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py RUN pip install --no-cache-dir -U --force pip "setuptools<60.1.0" requests pytest mock pytest-forked parameterized +# Install recent CMake. +RUN pip install --no-cache-dir -U cmake~=3.13.0 + # Add launch helper scripts RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh diff --git a/NOTICE b/NOTICE index a03473bfb8..1532d369e4 100644 --- a/NOTICE +++ b/NOTICE @@ -232,3 +232,38 @@ The derived work can be found in the files: - horovod/torch/sync_batch_norm.py + + CMake - Cross Platform Makefile Generator + Copyright 2000-2020 Kitware, Inc. and Contributors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Kitware, Inc. nor the names of Contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The derived work can be found in the files: + + - cmake/upstream/FindCUDAToolkit.cmake diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake index 4269a1e22a..0eb9366f0e 100644 --- a/cmake/Modules/FindNCCL.cmake +++ b/cmake/Modules/FindNCCL.cmake @@ -11,14 +11,14 @@ # NCCL_LIBRARIES # NCCL_MAJOR_VERSION # -# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# The path hints include CUDAToolkit_LIBRARY_ROOT seeing as some folks # install NCCL in the same location as the CUDA toolkit. set(HOROVOD_NCCL_HOME $ENV{HOROVOD_NCCL_HOME} CACHE PATH "Folder contains NVIDIA NCCL") set(HOROVOD_NCCL_INCLUDE $ENV{HOROVOD_NCCL_INCLUDE} CACHE PATH "Folder contains NVIDIA NCCL headers") set(HOROVOD_NCCL_LIB $ENV{HOROVOD_NCCL_LIB} CACHE PATH "Folder contains NVIDIA NCCL libraries") -list(APPEND NCCL_ROOT ${HOROVOD_NCCL_HOME} ${CUDA_TOOLKIT_ROOT_DIR}) +list(APPEND NCCL_ROOT ${HOROVOD_NCCL_HOME} ${CUDAToolkit_LIBRARY_ROOT}) # Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) diff --git a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake index 4d38ebb3ac..1f5e12275a 100644 --- a/cmake/Modules/FindNVTX.cmake +++ b/cmake/Modules/FindNVTX.cmake @@ -10,13 +10,13 @@ set(HOROVOD_NVTX_INCLUDE $ENV{HOROVOD_NVTX_INCLUDE} CACHE PATH "Folder containing NVIDIA NVTX3 headers") -list(APPEND NVTX_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +list(APPEND NVTX_ROOT ${CUDAToolkit_LIBRARY_ROOT}) # Compatible layer for CMake <3.12: list(APPEND CMAKE_PREFIX_PATH ${NVTX_ROOT}) find_path(NVTX_INCLUDE_DIR - NAMES nvtx3/nvToolsExt.h - HINTS ${HOROVOD_NVTX_INCLUDE}) + NAMES nvtx3/nvToolsExt.h + HINTS ${HOROVOD_NVTX_INCLUDE}) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NVTX DEFAULT_MSG NVTX_INCLUDE_DIR) diff --git a/cmake/Modules/FindTensorflow.cmake b/cmake/Modules/FindTensorflow.cmake index e8885903e1..8507c796b6 100644 --- a/cmake/Modules/FindTensorflow.cmake +++ b/cmake/Modules/FindTensorflow.cmake @@ -22,7 +22,7 @@ if (LEN EQUAL "4") string(REPLACE " " ";" Tensorflow_LIBRARIES_LIST "${Tensorflow_LIBRARIES}") list(GET Tensorflow_LIBRARIES_LIST 0 Tensorflow_LIB_PATH_ARGUMENT) string(REGEX REPLACE "^-L" "" Tensorflow_LIB_PATH ${Tensorflow_LIB_PATH_ARGUMENT}) - if (Tensorflow_VERSION VERSION_GREATER "2.6" OR Tensorflow_VERSION VERSION_EQUAL "2.6") + if (Tensorflow_VERSION VERSION_GREATER_EQUAL "2.6") # XLA implementations and helpers for resource variables are in _pywrap_tensorflow_internal.so set(Tensorflow_LIBRARIES "${Tensorflow_LIBRARIES} ${Tensorflow_LIB_PATH}/python/_pywrap_tensorflow_internal.so") endif() diff --git a/cmake/build_utils.py b/cmake/build_utils.py index be7e790751..c2f384c99e 100644 --- a/cmake/build_utils.py +++ b/cmake/build_utils.py @@ -91,7 +91,7 @@ def get_nvcc_bin(): 'Make sure it is added to your path or in $HOROVOD_CUDA_HOME/bin.') def get_nvcc_flags(): - default_flags = ['--std=c++11', '-O3', '-Xcompiler', '-fPIC'] + default_flags = ['-O3', '-Xcompiler', '-fPIC'] cc_list_env = os.environ.get('HOROVOD_BUILD_CUDA_CC_LIST') # Invoke nvcc and extract all supported compute capabilities for CUDA toolkit version diff --git a/cmake/upstream/FindCUDAToolkit.cmake b/cmake/upstream/FindCUDAToolkit.cmake new file mode 100644 index 0000000000..27bd122154 --- /dev/null +++ b/cmake/upstream/FindCUDAToolkit.cmake @@ -0,0 +1,917 @@ +# Copyright 2000-2020 Kitware, Inc. and Contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Kitware, Inc. nor the names of Contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +Search Behavior +^^^^^^^^^^^^^^^ + +Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is +searched for in the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` can be + found underneath the directory specified by ``CUDAToolkit_ROOT``. If + ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this + package is marked as **not** found. No subsequent search attempts are + performed. + +3. If the CUDA_PATH environment variable is defined, it will be searched. + +4. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +6. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Options +^^^^^^^ + +``VERSION`` + If specified, describes the version of the CUDA Toolkit to search for. + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`cuPTI` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvRTC` +- :ref:`nvToolsExt` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. _`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. This is generally used by advanced + + +Targets Created: + +- ``CUDA::cuda_driver`` +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvml`` + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`cuda_toolkit_opencl`: + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MAJOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + The path to the CUDA Toolkit ``include`` folder containing the header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalant to + ``CUDAToolkit_ROOT_DIR``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + # use the already detected cuda compiler + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + mark_as_advanced(CUDAToolkit_BIN_DIR) + unset(cuda_dir) +endif() + +# Try language- or user-provided path first. +if(CUDAToolkit_BIN_DIR) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${CUDAToolkit_BIN_DIR} + NO_DEFAULT_PATH + ) +endif() + +# Search using CUDAToolkit_ROOT +find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ENV CUDA_PATH + PATH_SUFFIXES bin +) + +# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. +if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. + set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if (CUDAToolkit_FIND_REQUIRED) + if (DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if (NOT CUDAToolkit_FIND_QUIETLY) + if (DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() +endif() + +# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. +# +# - Linux: /usr/local/cuda-X.Y +# - macOS: /Developer/NVIDIA/CUDA-X.Y +# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y +# +# We will also search the default symlink location /usr/local/cuda first since +# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked +# directory is the desired location. +if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (UNIX) + if (NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(possible_versions) + foreach (p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if (IS_DIRECTORY ${p} AND p_version) + list(APPEND possible_versions ${p_version}) + endif() + endforeach() + + # Cannot use list(SORT) because that is alphabetical, we need numerical. + # NOTE: this is not an efficient sorting strategy. But even if a user had + # every possible version of CUDA installed, this wouldn't create any + # significant overhead. + set(versions) + foreach (v ${possible_versions}) + list(LENGTH versions num_versions) + # First version, nothing to compare with so just append. + if (num_versions EQUAL 0) + list(APPEND versions ${v}) + else() + # Loop through list. Insert at an index when comparison is + # VERSION_GREATER since we want a descending list. Duplicates will not + # happen since this came from a glob list of directories. + set(i 0) + set(early_terminate FALSE) + while (i LESS num_versions) + list(GET versions ${i} curr) + if (v VERSION_GREATER curr) + list(INSERT versions ${i} ${v}) + set(early_terminate TRUE) + break() + endif() + math(EXPR i "${i} + 1") + endwhile() + # If it did not get inserted, place it at the end. + if (NOT early_terminate) + list(APPEND versions ${v}) + endif() + endif() + endforeach() + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach (v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if (UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for nvcc again using the platform default search paths. + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${search_paths} + PATH_SUFFIXES bin + ) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(possible_versions) + unset(versions) + unset(i) + unset(early_terminate) + unset(search_paths) + + if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() +endif() + +if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) + get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + unset(cuda_dir) +endif() + +if(CUDAToolkit_NVCC_EXECUTABLE AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() +else() + # Compute the version by invoking nvcc + execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + + +get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + +# Handle cross compilation +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +else() + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. + set(_CUDAToolkit_Pop_Prefix True) +endif() + + +# Find the include/ directory +find_path(CUDAToolkit_INCLUDE_DIR + NAMES cuda_runtime.h +) + +# And find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64 lib/x64 +) +if (NOT CUDA_CUDART) + find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64/stubs lib/x64/stubs + ) +endif() + +if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +unset(CUDAToolkit_ROOT_DIR) +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. +# HOROVOD NOTE: This differs from CMake source by ${CMAKE_CURRENT_LIST_DIR} +# replaced with ${CMAKE_ROOT}/Modules +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIR + CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR + CUDAToolkit_VERSION +) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_INCLUDE_DIR + CUDAToolkit_NVCC_EXECUTABLE + ) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) +endif() + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN}) + + set(search_names ${lib_name} ${arg_ALT}) + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ${arg_EXTRA_PATH_SUFFIXES} + ) + # Don't try any stub directories intil we have exhausted all other + # search locations. + if(NOT CUDA_${lib_name}_LIBRARY) + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs + ) + endif() + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} IMPORTED INTERFACE) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) + + # setup dependencies that are required for cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) + + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE) + # On Linux, you must link against librt when using the static cuda runtime. + find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + endforeach() + + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + + # cuSOLVER depends on cuBLAS, and cuSPARSE + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() diff --git a/docker/README.md b/docker/README.md index 468c82691f..79f98baba6 100644 --- a/docker/README.md +++ b/docker/README.md @@ -42,6 +42,7 @@ including: Building the Docker images should be run from the root Horovod directory. For example: ``` +export DOCKER_BUILDKIT=1 docker build \ --build-arg TENSORFLOW_VERSION=2.3.1 \ --build-arg PYTORCH_VERSION=1.7.0+cu110 \ diff --git a/docker/horovod-cpu/Dockerfile b/docker/horovod-cpu/Dockerfile index b590307516..2b559540aa 100644 --- a/docker/horovod-cpu/Dockerfile +++ b/docker/horovod-cpu/Dockerfile @@ -18,9 +18,9 @@ SHELL ["/bin/bash", "-cu"] RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ build-essential \ - cmake \ g++-7 \ git \ + gpg \ curl \ vim \ wget \ @@ -57,6 +57,9 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ python get-pip.py && \ rm get-pip.py +# Install recent CMake. +RUN pip install --no-cache-dir -U cmake~=3.13.0 + # Install PyTorch, TensorFlow, Keras and MXNet RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} RUN pip install --no-cache-dir pytorch_lightning==${PYTORCH_LIGHTNING_VERSION} diff --git a/docker/horovod-ray/Dockerfile b/docker/horovod-ray/Dockerfile index 9da83ff6df..7007f5a47b 100644 --- a/docker/horovod-ray/Dockerfile +++ b/docker/horovod-ray/Dockerfile @@ -17,12 +17,15 @@ RUN sudo apt-get update && DEBIAN_FRONTEND="noninteractive" sudo apt-get install build-essential \ wget \ git \ + gpg \ curl \ - cmake \ rsync \ vim \ && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* +# Install recent CMake. +RUN pip install --no-cache-dir -U cmake~=3.13.0 + # Install PyTorch RUN pip install --no-cache-dir \ torch==${PYTORCH_VERSION} \ diff --git a/docker/horovod/Dockerfile b/docker/horovod/Dockerfile index 655eab332c..dacdcbf9c0 100644 --- a/docker/horovod/Dockerfile +++ b/docker/horovod/Dockerfile @@ -24,9 +24,9 @@ SHELL ["/bin/bash", "-cu"] RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ build-essential \ - cmake \ g++-7 \ git \ + gpg \ curl \ vim \ wget \ @@ -66,6 +66,9 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ python get-pip.py && \ rm get-pip.py +# Install recent CMake. +RUN pip install --no-cache-dir -U cmake~=3.13.0 + # Install PyTorch, TensorFlow, Keras and MXNet RUN pip install --no-cache-dir \ torch==${PYTORCH_VERSION} \ diff --git a/docs/install.rst b/docs/install.rst index d1de57c4b2..d339b3cabd 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -8,7 +8,7 @@ Requirements - Python >= 3.6 - `g++-5` or above, or another compiler supporting C++14 -- CMake +- CMake 3.13 or newer - TensorFlow, PyTorch, or MXNet - (Optional) MPI @@ -213,7 +213,7 @@ diagnose failures: Installing Horovod with Conda (+pip) ------------------------------------ -To use Conda to install PyTorch, TensorFlow, MXNet, Horovod, as well as GPU depdencies such as +To use Conda to install PyTorch, TensorFlow, MXNet, Horovod, as well as GPU dependencies such as NVIDIA CUDA Toolkit, cuDNN, NCCL, etc., see `Build a Conda Environment with GPU Support for Horovod `_. Environment Variables diff --git a/horovod/common/ops/cuda/CMakeLists.txt b/horovod/common/ops/cuda/CMakeLists.txt index c01b0aaed2..fde267440a 100644 --- a/horovod/common/ops/cuda/CMakeLists.txt +++ b/horovod/common/ops/cuda/CMakeLists.txt @@ -1,14 +1,25 @@ -set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - set(ENV{PYTHONPATH} "${PROJECT_SOURCE_DIR}/cmake:$ENV{PYTHONPATH}") execute_process(COMMAND ${PY_EXE} -c "import build_utils; print(' '.join(build_utils.get_nvcc_flags()))" OUTPUT_VARIABLE HVD_NVCC_COMPILE_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE) MESSAGE(STATUS "HVD_NVCC_COMPILE_FLAGS = ${HVD_NVCC_COMPILE_FLAGS}") -list(APPEND CUDA_NVCC_FLAGS "${HVD_NVCC_COMPILE_FLAGS}") +# If we don't set CMAKE_CUDA_STANDARD, it will default to ${CMAKE_CXX_STANDARD} ("14" at this time). nvcc may fail if +# the --std=c++... argument is passed multiple times. +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) -cuda_add_library(horovod_cuda_kernels cuda_kernels.cu OPTIONS -D_GLIBCXX_USE_CXX11_ABI=1) +add_library(horovod_cuda_kernels cuda_kernels.cu) +target_compile_options(horovod_cuda_kernels PRIVATE $<$: + SHELL:${HVD_NVCC_COMPILE_FLAGS} + -D_GLIBCXX_USE_CXX11_ABI=1 + >) +set_property(TARGET horovod_cuda_kernels PROPERTY CUDA_SEPARABLE_COMPILATION ON) # if we need compatible c++ abi, build a compatible version -cuda_add_library(compatible_horovod_cuda_kernels cuda_kernels.cu OPTIONS -D_GLIBCXX_USE_CXX11_ABI=0) \ No newline at end of file +add_library(compatible_horovod_cuda_kernels cuda_kernels.cu) +target_compile_options(compatible_horovod_cuda_kernels PRIVATE $<$: + SHELL:${HVD_NVCC_COMPILE_FLAGS} + -D_GLIBCXX_USE_CXX11_ABI=0 + >) +set_property(TARGET compatible_horovod_cuda_kernels PROPERTY CUDA_SEPARABLE_COMPILATION ON) diff --git a/setup.py b/setup.py index 15a6f3fede..279df0a89d 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ def build_extensions(self): '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(config.upper(), build_dir), '-DPYTHON_EXECUTABLE:FILEPATH=' + sys.executable] - make_args = [] + make_args = ['-j8'] if not os.environ.get('MAKEFLAGS') else [] if self.verbose: make_args.append('VERBOSE=1')