From a5edcd02b7501066834ec9748bf1d7337eea2a8e Mon Sep 17 00:00:00 2001
From: "Max H. Gerlach" <git@maxgerlach.de>
Date: Mon, 17 Jan 2022 18:07:31 +0100
Subject: [PATCH] Update to CMake 3.13 for better CUDA support and to enable
 build concurrency (#3261)

Signed-off-by: Max H. Gerlach <git@maxgerlach.de>
---
 CHANGELOG.md                           |   2 +
 CMakeLists.txt                         |  48 +-
 Dockerfile.test.cpu                    |   4 +-
 Dockerfile.test.gpu                    |   4 +-
 NOTICE                                 |  35 +
 cmake/Modules/FindNCCL.cmake           |   4 +-
 cmake/Modules/FindNVTX.cmake           |   6 +-
 cmake/Modules/FindTensorflow.cmake     |   2 +-
 cmake/build_utils.py                   |   2 +-
 cmake/upstream/FindCUDAToolkit.cmake   | 917 +++++++++++++++++++++++++
 docker/README.md                       |   1 +
 docker/horovod-cpu/Dockerfile          |   5 +-
 docker/horovod-ray/Dockerfile          |   5 +-
 docker/horovod/Dockerfile              |   5 +-
 docs/install.rst                       |   4 +-
 horovod/common/ops/cuda/CMakeLists.txt |  21 +-
 setup.py                               |   2 +-
 17 files changed, 1033 insertions(+), 34 deletions(-)
 create mode 100644 cmake/upstream/FindCUDAToolkit.cmake

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0093a4377e..c3d82255fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
+- Moved to CMake version 3.13 with first-class CUDA language support and re-enabled parallelized builds. ([#3261](https://github.com/horovod/horovod/pull/3261))
+
 ### Deprecated
 - Deprecated ElasticRayExecutor APIs in favor of the new RayExecutor API for issue: [#3190](https://github.com/horovod/horovod/issues/3190).
 ### Removed
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 862dd98d32..f79d170b07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,17 @@
-cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 if(POLICY CMP0074)
     # 1. Introduced with 3.12.4.
     # 2. *_ROOT variables will be checked
     cmake_policy(SET CMP0074 NEW)
 endif()
 
-if(${CMAKE_VERSION} VERSION_GREATER "3.4.0") 
-  find_program(CCACHE_PROGRAM ccache)
-  if(CCACHE_PROGRAM)
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
     set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
     set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
     set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
-  else()
+else()
     message(STATUS "Could not find CCache. Consider installing CCache to speed up compilation.")
-  endif()
 endif()
 
 project(horovod CXX)
@@ -23,7 +21,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
 # Configure path to modules (for find_package)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake/Modules/")
+set(CMAKE_MODULE_PATH
+        ${CMAKE_MODULE_PATH}
+        "${PROJECT_SOURCE_DIR}/cmake/Modules/"
+        "${PROJECT_SOURCE_DIR}/cmake/upstream/")
 include(cmake/Utilities.cmake)
 
 create_metadata()
@@ -148,14 +149,33 @@ if (NOT "$ENV{HOROVOD_WITHOUT_MPI}" STREQUAL "1")
 endif()
 
 # CUDA and ROCM
+set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+if(NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY "Shared")  # Set to "Static" or "Shared"
+endif()
+if (DEFINED ENV{HOROVOD_CUDA_HOME})
+    set(CMAKE_CUDA_COMPILER $ENV{HOROVOD_CUDA_HOME}/bin/nvcc)
+endif()
+include(CheckLanguage)
+check_language(CUDA)
+if (CMAKE_CUDA_COMPILER)
+    if ((CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le))
+        if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
+            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++11")
+        endif ()
+    endif ()
+    enable_language(CUDA)
+endif ()
+
 macro(ADD_CUDA)
-    if (DEFINED ENV{HOROVOD_CUDA_HOME})
-        set(CUDA_TOOLKIT_ROOT_DIR $ENV{HOROVOD_CUDA_HOME})
+    find_package(CUDAToolkit REQUIRED)
+    include_directories(SYSTEM ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDAToolkit_INCLUDE_DIRS})  # the first list is only populated once the CUDA language is enabled
+    string(TOLOWER "${CMAKE_CUDA_RUNTIME_LIBRARY}" lowercase_CMAKE_CUDA_RUNTIME_LIBRARY)
+    if (lowercase_CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "static")
+        list(APPEND LINKER_LIBS CUDA::cudart_static)
+    elseif (lowercase_CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "shared")
+        list(APPEND LINKER_LIBS CUDA::cudart)
     endif()
-    option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF)
-    find_package(CUDA REQUIRED)
-    include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
-    list(APPEND LINKER_LIBS ${CUDA_LIBRARIES})
     list(APPEND SOURCES "${PROJECT_SOURCE_DIR}/horovod/common/ops/cuda_operations.cc"
                         "${PROJECT_SOURCE_DIR}/horovod/common/ops/gpu_operations.cc")
     # CUDA + MPI
@@ -209,7 +229,7 @@ endif()
 if(HOROVOD_GPU_ALLREDUCE STREQUAL "D")
     message(DEPRECATION "DDL backend has been deprecated. Please, start using the NCCL backend by building Horovod with "
                         "'HOROVOD_GPU_OPERATIONS=NCCL'. Will be removed in v0.21.0.")
-    list(APPEND LINKER_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib/libddl.so" "${CUDA_TOOLKIT_ROOT_DIR}/lib/libddl_pack.so")
+    list(APPEND LINKER_LIBS "${CUDAToolkit_LIBRARY_ROOT}/lib/libddl.so" "${CUDAToolkit_LIBRARY_ROOT}/lib/libddl_pack.so")
     list(APPEND SOURCES "${PROJECT_SOURCE_DIR}/horovod/common/mpi/ddl_mpi_context_manager.cc"
                         "${PROJECT_SOURCE_DIR}/horovod/common/ops/ddl_operations.cc")
     add_definitions(-DHAVE_DDL=1)
diff --git a/Dockerfile.test.cpu b/Dockerfile.test.cpu
index 35ce208c30..72da015125 100644
--- a/Dockerfile.test.cpu
+++ b/Dockerfile.test.cpu
@@ -33,7 +33,6 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test
 RUN apt-get update -qq && apt-get install -y --no-install-recommends \
         wget \
         ca-certificates \
-        cmake \
         openssh-client \
         openssh-server \
         git \
@@ -52,6 +51,9 @@ RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python${PYTHON_VERSION/%.
 RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
 RUN pip install --no-cache-dir -U --force pip setuptools requests pytest mock pytest-forked parameterized
 
+# Install recent CMake.
+RUN pip install --no-cache-dir -U cmake~=3.13.0
+
 # Add launch helper scripts
 RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh
 RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh
diff --git a/Dockerfile.test.gpu b/Dockerfile.test.gpu
index 3808b8a541..7388008b0a 100644
--- a/Dockerfile.test.gpu
+++ b/Dockerfile.test.gpu
@@ -35,7 +35,6 @@ RUN CUDNN_MAJOR=$(cut -d '.' -f 1 <<< "${CUDNN_VERSION}"); \
     apt-get update -qq && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
         wget \
         ca-certificates \
-        cmake \
         openssh-client \
         openssh-server \
         git \
@@ -57,6 +56,9 @@ RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python${PYTHON_VERSION/%.
 RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
 RUN pip install --no-cache-dir -U --force pip "setuptools<60.1.0" requests pytest mock pytest-forked parameterized
 
+# Install recent CMake.
+RUN pip install --no-cache-dir -U cmake~=3.13.0
+
 # Add launch helper scripts
 RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh
 RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh
diff --git a/NOTICE b/NOTICE
index a03473bfb8..1532d369e4 100644
--- a/NOTICE
+++ b/NOTICE
@@ -232,3 +232,38 @@
    The derived work can be found in the files:
 
         - horovod/torch/sync_batch_norm.py
+
+   CMake - Cross Platform Makefile Generator
+   Copyright 2000-2020 Kitware, Inc. and Contributors
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+   * Neither the name of Kitware, Inc. nor the names of Contributors
+     may be used to endorse or promote products derived from this
+     software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   The derived work can be found in the files:
+
+        - cmake/upstream/FindCUDAToolkit.cmake
diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake
index 4269a1e22a..0eb9366f0e 100644
--- a/cmake/Modules/FindNCCL.cmake
+++ b/cmake/Modules/FindNCCL.cmake
@@ -11,14 +11,14 @@
 #  NCCL_LIBRARIES
 #  NCCL_MAJOR_VERSION
 #
-# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
+# The path hints include CUDAToolkit_LIBRARY_ROOT seeing as some folks
 # install NCCL in the same location as the CUDA toolkit.
 
 set(HOROVOD_NCCL_HOME $ENV{HOROVOD_NCCL_HOME} CACHE PATH "Folder contains NVIDIA NCCL")
 set(HOROVOD_NCCL_INCLUDE $ENV{HOROVOD_NCCL_INCLUDE} CACHE PATH "Folder contains NVIDIA NCCL headers")
 set(HOROVOD_NCCL_LIB $ENV{HOROVOD_NCCL_LIB} CACHE PATH "Folder contains NVIDIA NCCL libraries")
 
-list(APPEND NCCL_ROOT ${HOROVOD_NCCL_HOME} ${CUDA_TOOLKIT_ROOT_DIR})
+list(APPEND NCCL_ROOT ${HOROVOD_NCCL_HOME} ${CUDAToolkit_LIBRARY_ROOT})
 # Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
 list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
 
diff --git a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake
index 4d38ebb3ac..1f5e12275a 100644
--- a/cmake/Modules/FindNVTX.cmake
+++ b/cmake/Modules/FindNVTX.cmake
@@ -10,13 +10,13 @@
 
 set(HOROVOD_NVTX_INCLUDE $ENV{HOROVOD_NVTX_INCLUDE} CACHE PATH "Folder containing NVIDIA NVTX3 headers")
 
-list(APPEND NVTX_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+list(APPEND NVTX_ROOT ${CUDAToolkit_LIBRARY_ROOT})
 # Compatible layer for CMake <3.12:
 list(APPEND CMAKE_PREFIX_PATH ${NVTX_ROOT})
 
 find_path(NVTX_INCLUDE_DIR
-        NAMES nvtx3/nvToolsExt.h
-        HINTS ${HOROVOD_NVTX_INCLUDE})
+          NAMES nvtx3/nvToolsExt.h
+          HINTS ${HOROVOD_NVTX_INCLUDE})
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(NVTX DEFAULT_MSG NVTX_INCLUDE_DIR)
diff --git a/cmake/Modules/FindTensorflow.cmake b/cmake/Modules/FindTensorflow.cmake
index e8885903e1..8507c796b6 100644
--- a/cmake/Modules/FindTensorflow.cmake
+++ b/cmake/Modules/FindTensorflow.cmake
@@ -22,7 +22,7 @@ if (LEN EQUAL "4")
     string(REPLACE " " ";" Tensorflow_LIBRARIES_LIST "${Tensorflow_LIBRARIES}")
     list(GET Tensorflow_LIBRARIES_LIST 0 Tensorflow_LIB_PATH_ARGUMENT)
     string(REGEX REPLACE "^-L" "" Tensorflow_LIB_PATH ${Tensorflow_LIB_PATH_ARGUMENT})
-    if (Tensorflow_VERSION VERSION_GREATER "2.6" OR Tensorflow_VERSION VERSION_EQUAL "2.6")
+    if (Tensorflow_VERSION VERSION_GREATER_EQUAL "2.6")
         # XLA implementations and helpers for resource variables are in _pywrap_tensorflow_internal.so
         set(Tensorflow_LIBRARIES "${Tensorflow_LIBRARIES} ${Tensorflow_LIB_PATH}/python/_pywrap_tensorflow_internal.so")
     endif()
diff --git a/cmake/build_utils.py b/cmake/build_utils.py
index be7e790751..c2f384c99e 100644
--- a/cmake/build_utils.py
+++ b/cmake/build_utils.py
@@ -91,7 +91,7 @@ def get_nvcc_bin():
                        'Make sure it is added to your path or in $HOROVOD_CUDA_HOME/bin.')
 
 def get_nvcc_flags():
-    default_flags = ['--std=c++11', '-O3', '-Xcompiler', '-fPIC']
+    default_flags = ['-O3', '-Xcompiler', '-fPIC']
     cc_list_env = os.environ.get('HOROVOD_BUILD_CUDA_CC_LIST')
 
     # Invoke nvcc and extract all supported compute capabilities for CUDA toolkit version
diff --git a/cmake/upstream/FindCUDAToolkit.cmake b/cmake/upstream/FindCUDAToolkit.cmake
new file mode 100644
index 0000000000..27bd122154
--- /dev/null
+++ b/cmake/upstream/FindCUDAToolkit.cmake
@@ -0,0 +1,917 @@
+# Copyright 2000-2020 Kitware, Inc. and Contributors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of Kitware, Inc. nor the names of Contributors
+#   may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#[=======================================================================[.rst:
+FindCUDAToolkit
+---------------
+
+This script locates the NVIDIA CUDA toolkit and the associated libraries, but
+does not require the ``CUDA`` language be enabled for a given project. This
+module does not search for the NVIDIA CUDA Samples.
+
+Search Behavior
+^^^^^^^^^^^^^^^
+
+Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is
+searched for in the following order:
+
+1. If the ``CUDA`` language has been enabled we will use the directory
+   containing the compiler as the first search location for ``nvcc``.
+
+2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g.,
+   ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it
+   will be searched.  If both an environment variable **and** a
+   configuration variable are specified, the *configuration* variable takes
+   precedence.
+
+   The directory specified here must be such that the executable ``nvcc`` can be
+   found underneath the directory specified by ``CUDAToolkit_ROOT``.  If
+   ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this
+   package is marked as **not** found.  No subsequent search attempts are
+   performed.
+
+3. If the CUDA_PATH environment variable is defined, it will be searched.
+
+4. The user's path is searched for ``nvcc`` using :command:`find_program`.  If
+   this is found, no subsequent search attempts are performed.  Users are
+   responsible for ensuring that the first ``nvcc`` to show up in the path is
+   the desired path in the event that multiple CUDA Toolkits are installed.
+
+5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
+   used.  No subsequent search attempts are performed.  No default symbolic link
+   location exists for the Windows platform.
+
+6. The platform specific default install locations are searched.  If exactly one
+   candidate is found, this is used.  The default CUDA Toolkit install locations
+   searched are:
+
+   +-------------+-------------------------------------------------------------+
+   | Platform    | Search Pattern                                              |
+   +=============+=============================================================+
+   | macOS       | ``/Developer/NVIDIA/CUDA-X.Y``                              |
+   +-------------+-------------------------------------------------------------+
+   | Other Unix  | ``/usr/local/cuda-X.Y``                                     |
+   +-------------+-------------------------------------------------------------+
+   | Windows     | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` |
+   +-------------+-------------------------------------------------------------+
+
+   Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as
+   ``/usr/local/cuda-9.0`` or
+   ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0``
+
+   .. note::
+
+       When multiple CUDA Toolkits are installed in the default location of a
+       system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0``
+       exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this
+       package is marked as **not** found.
+
+       There are too many factors involved in making an automatic decision in
+       the presence of multiple CUDA Toolkits being installed.  In this
+       situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or
+       (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for
+       :command:`find_program` to find.
+
+Options
+^^^^^^^
+
+``VERSION``
+    If specified, describes the version of the CUDA Toolkit to search for.
+
+``REQUIRED``
+    If specified, configuration will error if a suitable CUDA Toolkit is not
+    found.
+
+``QUIET``
+    If specified, the search for a suitable CUDA Toolkit will not produce any
+    messages.
+
+``EXACT``
+    If specified, the CUDA Toolkit is considered found only if the exact
+    ``VERSION`` specified is recovered.
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided.
+
+This module defines :prop_tgt:`IMPORTED` targets for each
+of the following libraries that are part of the CUDAToolkit:
+
+- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
+- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
+- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
+- :ref:`cuFFT<cuda_toolkit_cuFFT>`
+- :ref:`cuRAND<cuda_toolkit_cuRAND>`
+- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
+- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
+- :ref:`cuPTI<cuda_toolkit_cupti>`
+- :ref:`NPP<cuda_toolkit_NPP>`
+- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
+- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
+- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
+- :ref:`nvidia-ML<cuda_toolkit_nvML>`
+- :ref:`nvRTC<cuda_toolkit_nvRTC>`
+- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
+- :ref:`OpenCL<cuda_toolkit_opencl>`
+- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
+
+.. _`cuda_toolkit_rt_lib`:
+
+CUDA Runtime Library
+""""""""""""""""""""
+
+The CUDA Runtime library (cudart) is what most applications will typically
+need to link against to make any calls such as `cudaMalloc` and `cudaFree`.
+
+Targets Created:
+
+- ``CUDA::cudart``
+- ``CUDA::cudart_static``
+
+.. _`cuda_toolkit_driver_lib`:
+
+CUDA Driver Library
+""""""""""""""""""""
+
+The CUDA Driver library (cuda) is used by applications that use calls
+such as `cuMemAlloc` and `cuMemFree`. This is generally used by advanced
+users.
+
+Targets Created:
+
+- ``CUDA::cuda_driver``
+- ``CUDA::cuda_driver``
+
+.. _`cuda_toolkit_cuBLAS`:
+
+cuBLAS
+""""""
+
+The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cublas``
+- ``CUDA::cublas_static``
+
+.. _`cuda_toolkit_cuFFT`:
+
+cuFFT
+"""""
+
+The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cufft``
+- ``CUDA::cufftw``
+- ``CUDA::cufft_static``
+- ``CUDA::cufftw_static``
+
+cuRAND
+""""""
+
+The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::curand``
+- ``CUDA::curand_static``
+
+.. _`cuda_toolkit_cuSOLVER`:
+
+cuSOLVER
+""""""""
+
+The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusolver``
+- ``CUDA::cusolver_static``
+
+.. _`cuda_toolkit_cuSPARSE`:
+
+cuSPARSE
+""""""""
+
+The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusparse``
+- ``CUDA::cusparse_static``
+
+.. _`cuda_toolkit_cupti`:
+
+cupti
+"""""
+
+The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_.
+
+Targets Created:
+
+- ``CUDA::cupti``
+- ``CUDA::cupti_static``
+
+.. _`cuda_toolkit_NPP`:
+
+NPP
+"""
+
+The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries.
+
+Targets Created:
+
+- `nppc`:
+
+  - ``CUDA::nppc``
+  - ``CUDA::nppc_static``
+
+- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h`
+
+  - ``CUDA::nppial``
+  - ``CUDA::nppial_static``
+
+- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h`
+
+  - ``CUDA::nppicc``
+  - ``CUDA::nppicc_static``
+
+- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h`
+
+  - ``CUDA::nppicom``
+  - ``CUDA::nppicom_static``
+
+- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h`
+
+  - ``CUDA::nppidei``
+  - ``CUDA::nppidei_static``
+
+- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h`
+
+  - ``CUDA::nppif``
+  - ``CUDA::nppif_static``
+
+- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h`
+
+  - ``CUDA::nppig``
+  - ``CUDA::nppig_static``
+
+- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h`
+
+  - ``CUDA::nppim``
+  - ``CUDA::nppim_static``
+
+- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h`
+
+  - ``CUDA::nppist``
+  - ``CUDA::nppist_static``
+
+- `nppisu`: Memory support functions in `nppi_support_functions.h`
+
+  - ``CUDA::nppisu``
+  - ``CUDA::nppisu_static``
+
+- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h`
+
+  - ``CUDA::nppitc``
+  - ``CUDA::nppitc_static``
+
+- `npps`:
+
+  - ``CUDA::npps``
+  - ``CUDA::npps_static``
+
+.. _`cuda_toolkit_nvBLAS`:
+
+nvBLAS
+""""""
+
+The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvblas``
+
+.. _`cuda_toolkit_nvGRAPH`:
+
+nvGRAPH
+"""""""
+
+The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::nvgraph``
+- ``CUDA::nvgraph_static``
+
+
+.. _`cuda_toolkit_nvJPEG`:
+
+nvJPEG
+""""""
+
+The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library.
+Introduced in CUDA 10.
+
+Targets Created:
+
+- ``CUDA::nvjpeg``
+- ``CUDA::nvjpeg_static``
+
+.. _`cuda_toolkit_nvRTC`:
+
+nvRTC
+"""""
+
+The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvrtc``
+
+.. _`cuda_toolkit_nvml`:
+
+nvidia-ML
+"""""""""
+
+The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvml``
+
+.. _`cuda_toolkit_nvToolsExt`:
+
+nvToolsExt
+""""""""""
+
+The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvToolsExt``
+
+.. _`cuda_toolkit_opencl`:
+
+OpenCL
+""""""
+
+The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::OpenCL``
+
+.. _`cuda_toolkit_cuLIBOS`:
+
+cuLIBOS
+"""""""
+
+The cuLIBOS library is a backend thread abstraction layer library which is
+static only.  The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
+``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
+libraries all automatically have this dependency linked.
+
+Target Created:
+
+- ``CUDA::culibos``
+
+**Note**: direct usage of this target by consumers should not be necessary.
+
+.. _`cuda_toolkit_cuRAND`:
+
+
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+``CUDAToolkit_FOUND``
+    A boolean specifying whether or not the CUDA Toolkit was found.
+
+``CUDAToolkit_VERSION``
+    The exact version of the CUDA Toolkit found (as reported by
+    ``nvcc --version``).
+
+``CUDAToolkit_VERSION_MAJOR``
+    The major version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_MINOR``
+    The minor version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_PATCH``
+    The patch version of the CUDA Toolkit.
+
+``CUDAToolkit_BIN_DIR``
+    The path to the CUDA Toolkit directory that contains the CUDA
+    executable ``nvcc``.
+
+``CUDAToolkit_INCLUDE_DIRS``
+    The path to the CUDA Toolkit ``include`` folder containing the header files
+    required to compile a project linking against CUDA.
+
+``CUDAToolkit_LIBRARY_DIR``
+    The path to the CUDA Toolkit library directory that contains the CUDA
+    Runtime library ``cudart``.
+
+``CUDAToolkit_TARGET_DIR``
+    The path to the CUDA Toolkit directory including the target architecture
+    when cross-compiling. When not cross-compiling this will be equivalent to
+    ``CUDAToolkit_ROOT_DIR``.
+
+``CUDAToolkit_NVCC_EXECUTABLE``
+    The path to the NVIDIA CUDA compiler ``nvcc``.  Note that this path may
+    **not** be the same as
+    :variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`.  ``nvcc`` must be
+    found to determine the CUDA Toolkit version as well as determining other
+    features of the Toolkit.  This variable is set for the convenience of
+    modules that depend on this one.
+
+
+#]=======================================================================]
+
+# NOTE: much of this was simply extracted from FindCUDA.cmake.
+
+#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#   Copyright (c) 2007-2009
+#   Scientific Computing and Imaging Institute, University of Utah
+#
+#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#   for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR)
+  get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+  # use the already detected cuda compiler
+  set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "")
+  mark_as_advanced(CUDAToolkit_BIN_DIR)
+  unset(cuda_dir)
+endif()
+
+# Try language- or user-provided path first.
+if(CUDAToolkit_BIN_DIR)
+  find_program(CUDAToolkit_NVCC_EXECUTABLE
+    NAMES nvcc nvcc.exe
+    PATHS ${CUDAToolkit_BIN_DIR}
+    NO_DEFAULT_PATH
+    )
+endif()
+
+# Search using CUDAToolkit_ROOT (consulted implicitly by find_program) and ENV CUDA_PATH
+find_program(CUDAToolkit_NVCC_EXECUTABLE
+  NAMES nvcc nvcc.exe
+  PATHS ENV CUDA_PATH
+  PATH_SUFFIXES bin
+)
+
+# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error.
+if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT}))
+  # Declare error messages now, print later depending on find_package args.
+  set(fail_base "Could not find nvcc executable in path specified by")
+  set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+  set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}")
+
+  if (CUDAToolkit_FIND_REQUIRED)
+    if (DEFINED CUDAToolkit_ROOT)
+      message(FATAL_ERROR ${cuda_root_fail})
+    elseif (DEFINED ENV{CUDAToolkit_ROOT})
+      message(FATAL_ERROR ${env_cuda_root_fail})
+    endif()
+  else()
+    if (NOT CUDAToolkit_FIND_QUIETLY)
+      if (DEFINED CUDAToolkit_ROOT)
+        message(STATUS ${cuda_root_fail})
+      elseif (DEFINED ENV{CUDAToolkit_ROOT})
+        message(STATUS ${env_cuda_root_fail})
+      endif()
+    endif()
+    set(CUDAToolkit_FOUND FALSE)
+    unset(fail_base)
+    unset(cuda_root_fail)
+    unset(env_cuda_root_fail)
+    return()
+  endif()
+endif()
+
+# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults.
+#
+# - Linux: /usr/local/cuda-X.Y
+# - macOS: /Developer/NVIDIA/CUDA-X.Y
+# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y
+#
+# We will also search the default symlink location /usr/local/cuda first since
+# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked
+# directory is the desired location.
+if (NOT CUDAToolkit_NVCC_EXECUTABLE)
+  if (UNIX)
+    if (NOT APPLE)
+      set(platform_base "/usr/local/cuda-")
+    else()
+      set(platform_base "/Developer/NVIDIA/CUDA-")
+    endif()
+  else()
+    set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v")
+  endif()
+
+  # Build out a descending list of possible cuda installations, e.g. 10.1, 10.0, 9.2.
+  file(GLOB possible_paths "${platform_base}*")
+  # Iterate the glob results and create a descending list.
+  set(possible_versions)
+  foreach (p ${possible_paths})
+    # Extract version number from end of string
+    string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p})
+    if (IS_DIRECTORY ${p} AND p_version)
+      list(APPEND possible_versions ${p_version})
+    endif()
+  endforeach()
+
+  # Cannot use list(SORT) because that is alphabetical, we need numerical.
+  # NOTE: this is not an efficient sorting strategy.  But even if a user had
+  # every possible version of CUDA installed, this wouldn't create any
+  # significant overhead.
+  set(versions)
+  foreach (v ${possible_versions})
+    list(LENGTH versions num_versions)
+    # First version, nothing to compare with so just append.
+    if (num_versions EQUAL 0)
+      list(APPEND versions ${v})
+    else()
+      # Loop through list.  Insert at an index when comparison is
+      # VERSION_GREATER since we want a descending list.  Duplicates will not
+      # happen since this came from a glob list of directories.
+      set(i 0)
+      set(early_terminate FALSE)
+      while (i LESS num_versions)
+        list(GET versions ${i} curr)
+        if (v VERSION_GREATER curr)
+          list(INSERT versions ${i} ${v})
+          set(early_terminate TRUE)
+          break()
+        endif()
+        math(EXPR i "${i} + 1")
+      endwhile()
+      # If it did not get inserted, place it at the end.
+      if (NOT early_terminate)
+        list(APPEND versions ${v})
+      endif()
+    endif()
+  endforeach()
+
+  # With a descending list of versions, populate possible paths to search.
+  set(search_paths)
+  foreach (v ${versions})
+    list(APPEND search_paths "${platform_base}${v}")
+  endforeach()
+
+  # Force the global default /usr/local/cuda to the front on Unix.
+  if (UNIX)
+    list(INSERT search_paths 0 "/usr/local/cuda")
+  endif()
+
+  # Now search for nvcc again using the platform default search paths.
+  find_program(CUDAToolkit_NVCC_EXECUTABLE
+    NAMES nvcc nvcc.exe
+    PATHS ${search_paths}
+    PATH_SUFFIXES bin
+  )
+
+  # We are done with these variables now, cleanup for caller.
+  unset(platform_base)
+  unset(possible_paths)
+  unset(possible_versions)
+  unset(versions)
+  unset(i)
+  unset(early_terminate)
+  unset(search_paths)
+
+  if (NOT CUDAToolkit_NVCC_EXECUTABLE)
+    if (CUDAToolkit_FIND_REQUIRED)
+      message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.")
+    elseif(NOT CUDAToolkit_FIND_QUIETLY)
+      message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.")
+    endif()
+
+    set(CUDAToolkit_FOUND FALSE)
+    return()
+  endif()
+endif()
+
+if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE)
+  get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
+  set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE)
+  mark_as_advanced(CUDAToolkit_BIN_DIR)
+  unset(cuda_dir)
+endif()
+
+if(CUDAToolkit_NVCC_EXECUTABLE AND
+   CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER)
+  # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value
+  # This if statement will always match, but is used to provide variables for MATCH 1,2,3...
+  if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+    set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}")
+  endif()
+else()
+  # Compute the version by invoking nvcc
+  execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+  if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+    set(CUDAToolkit_VERSION  "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
+  endif()
+  unset(NVCC_OUT)
+endif()
+
+
+get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE)
+
+# Handle cross compilation
+if(CMAKE_CROSSCOMPILING)
+  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
+    # Support for NVPACK
+    set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    # Support for arm cross compilation
+    set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    # Support for aarch64 cross compilation
+    if (ANDROID_ARCH_NAME STREQUAL "arm64")
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi")
+    else()
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux")
+    endif (ANDROID_ARCH_NAME STREQUAL "arm64")
+  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+      set(CUDAToolkit_TARGET_NAME "x86_64-linux")
+  endif()
+
+  if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    # Put the CUDA target root first among the directories searched for programs, libraries and headers.
+    # list(INSERT ... 0) is used because list(PREPEND) needs CMake >= 3.15, while this project requires only 3.13.
+    list(INSERT CMAKE_FIND_ROOT_PATH 0 "${CUDAToolkit_TARGET_DIR}")
+    # Mark that we need to pop the root search path changes after we have
+    # found all cuda libraries so that searches for our cross-compilation
+    # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or
+    # PATH.
+    set(_CUDAToolkit_Pop_ROOT_PATH True)
+  endif()
+else()
+  # Not cross compiling
+  set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}")
+  # Now that we have the real ROOT_DIR, find components inside it.
+  list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR})
+
+  # Mark that we need to pop the prefix path changes after we have
+  # found the cudart library.
+  set(_CUDAToolkit_Pop_Prefix True)
+endif()
+
+
+# Find the include/ directory
+find_path(CUDAToolkit_INCLUDE_DIR
+  NAMES cuda_runtime.h
+)
+
+# And find the CUDA Runtime Library libcudart
+find_library(CUDA_CUDART
+  NAMES cudart
+  PATH_SUFFIXES lib64 lib/x64
+)
+if (NOT CUDA_CUDART)
+  find_library(CUDA_CUDART
+    NAMES cudart
+    PATH_SUFFIXES lib64/stubs lib/x64/stubs
+  )
+endif()
+
+if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY)
+  message(STATUS "Unable to find cudart library.")
+endif()
+
+unset(CUDAToolkit_ROOT_DIR)
+if(_CUDAToolkit_Pop_Prefix)
+  list(REMOVE_AT CMAKE_PREFIX_PATH -1)
+  unset(_CUDAToolkit_Pop_Prefix)
+endif()
+
+#-----------------------------------------------------------------------------
+# Perform version comparison and validate all required variables are set.
+# HOROVOD NOTE: This differs from CMake source by ${CMAKE_CURRENT_LIST_DIR}
+# replaced with ${CMAKE_ROOT}/Modules
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+find_package_handle_standard_args(CUDAToolkit
+  REQUIRED_VARS
+    CUDAToolkit_INCLUDE_DIR
+    CUDA_CUDART
+    CUDAToolkit_NVCC_EXECUTABLE
+  VERSION_VAR
+    CUDAToolkit_VERSION
+)
+mark_as_advanced(CUDA_CUDART
+                 CUDAToolkit_INCLUDE_DIR
+                 CUDAToolkit_NVCC_EXECUTABLE
+                 )
+
+#-----------------------------------------------------------------------------
+# Construct result variables
+if(CUDAToolkit_FOUND)
+ set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
+ get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
+endif()
+
+#-----------------------------------------------------------------------------
+# Construct import targets
+if(CUDAToolkit_FOUND)
+
+  function(_CUDAToolkit_find_and_add_import_lib lib_name)
+    cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN})
+    # Find <lib_name> (or any ALT name) and wrap it in an imported CUDA::<lib_name> target linked to existing DEPS.
+    set(search_names ${lib_name} ${arg_ALT})
+
+    find_library(CUDA_${lib_name}_LIBRARY
+      NAMES ${search_names}
+      HINTS ${CUDAToolkit_LIBRARY_DIR}
+            ENV CUDA_PATH
+      PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
+                    ${arg_EXTRA_PATH_SUFFIXES}
+    )
+    # Don't try any stub directories until we have exhausted all other
+    # search locations.
+    if(NOT CUDA_${lib_name}_LIBRARY)
+      find_library(CUDA_${lib_name}_LIBRARY
+        NAMES ${search_names}
+        HINTS ${CUDAToolkit_LIBRARY_DIR}
+              ENV CUDA_PATH
+        PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs
+      )
+    endif()
+
+    mark_as_advanced(CUDA_${lib_name}_LIBRARY)
+
+    if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
+      add_library(CUDA::${lib_name} IMPORTED INTERFACE)
+      target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+      target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}")
+      foreach(dep ${arg_DEPS})
+        if(TARGET CUDA::${dep})
+          target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
+        endif()
+      endforeach()
+    endif()
+  endfunction()
+
+  if(NOT TARGET CUDA::toolkit)
+    add_library(CUDA::toolkit IMPORTED INTERFACE)
+    target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+    target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
+
+  _CUDAToolkit_find_and_add_import_lib(cudart)
+  _CUDAToolkit_find_and_add_import_lib(cudart_static)
+
+  # setup dependencies that are required for cudart_static when building
+  # on linux. These are generally only required when using the CUDA toolkit
+  # when CUDA language is disabled
+  if(NOT TARGET CUDA::cudart_static_deps
+     AND TARGET CUDA::cudart_static)
+
+    add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
+    target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
+
+    if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
+      find_package(Threads REQUIRED)
+      target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
+    endif()
+
+    if(UNIX AND NOT APPLE)
+      # On Linux, you must link against librt when using the static cuda runtime.
+      find_library(CUDAToolkit_rt_LIBRARY rt)
+      mark_as_advanced(CUDAToolkit_rt_LIBRARY)
+      if(NOT CUDAToolkit_rt_LIBRARY)
+        message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
+      else()
+        target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
+      endif()
+    endif()
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
+  foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib})
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
+  endforeach()
+
+  # cuFFTW depends on cuFFT; the static variant gets its own target that depends on the static cuFFT.
+  _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
+  _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
+
+  # cuSOLVER depends on cuBLAS, and cuSPARSE
+  _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse)
+  _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos)
+
+  # nvGRAPH depends on cuRAND, and cuSOLVER.
+  _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
+  _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
+
+  # Process the majority of the NPP libraries.
+  foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static)
+  endforeach()
+
+  _CUDAToolkit_find_and_add_import_lib(cupti
+                                       EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                           ../extras/CUPTI/lib/)
+  _CUDAToolkit_find_and_add_import_lib(cupti_static
+                                       EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                           ../extras/CUPTI/lib/)
+
+  _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver)
+
+  _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml)
+
+  if(WIN32)
+    # nvtools can be installed outside the CUDA toolkit directory
+    # so prefer the NVTOOLSEXT_PATH windows only environment variable
+    # In addition on windows the most common name is nvToolsExt64_1
+    find_library(CUDA_nvToolsExt_LIBRARY
+      NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt
+      PATHS ENV NVTOOLSEXT_PATH
+            ENV CUDA_PATH
+      PATH_SUFFIXES lib/x64 lib
+    )
+  endif()
+  _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64)
+
+  _CUDAToolkit_find_and_add_import_lib(OpenCL)
+endif()
+
+if(_CUDAToolkit_Pop_ROOT_PATH)
+  list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0)
+  unset(_CUDAToolkit_Pop_ROOT_PATH)
+endif()
diff --git a/docker/README.md b/docker/README.md
index 468c82691f..79f98baba6 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -42,6 +42,7 @@ including:
 Building the Docker images should be run from the root Horovod directory. For example:
 
 ```
+export DOCKER_BUILDKIT=1
 docker build \
     --build-arg TENSORFLOW_VERSION=2.3.1 \
     --build-arg PYTORCH_VERSION=1.7.0+cu110 \
diff --git a/docker/horovod-cpu/Dockerfile b/docker/horovod-cpu/Dockerfile
index b590307516..2b559540aa 100644
--- a/docker/horovod-cpu/Dockerfile
+++ b/docker/horovod-cpu/Dockerfile
@@ -18,9 +18,9 @@ SHELL ["/bin/bash", "-cu"]
 
 RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
         build-essential \
-        cmake \
         g++-7 \
         git \
+        gpg \
         curl \
         vim \
         wget \
@@ -57,6 +57,9 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
 
+# Install recent CMake.
+RUN pip install --no-cache-dir -U cmake~=3.13.0
+
 # Install PyTorch, TensorFlow, Keras and MXNet
 RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
 RUN pip install --no-cache-dir pytorch_lightning==${PYTORCH_LIGHTNING_VERSION}
diff --git a/docker/horovod-ray/Dockerfile b/docker/horovod-ray/Dockerfile
index 9da83ff6df..7007f5a47b 100644
--- a/docker/horovod-ray/Dockerfile
+++ b/docker/horovod-ray/Dockerfile
@@ -17,12 +17,15 @@ RUN sudo apt-get update && DEBIAN_FRONTEND="noninteractive" sudo apt-get install
         build-essential \
         wget \
         git \
+        gpg \
         curl \
-        cmake \
         rsync \
         vim \
     && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/*
 
+# Install recent CMake.
+RUN pip install --no-cache-dir -U cmake~=3.13.0
+
 # Install PyTorch
 RUN pip install --no-cache-dir \
     torch==${PYTORCH_VERSION} \
diff --git a/docker/horovod/Dockerfile b/docker/horovod/Dockerfile
index 655eab332c..dacdcbf9c0 100644
--- a/docker/horovod/Dockerfile
+++ b/docker/horovod/Dockerfile
@@ -24,9 +24,9 @@ SHELL ["/bin/bash", "-cu"]
 
 RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
         build-essential \
-        cmake \
         g++-7 \
         git \
+        gpg \
         curl \
         vim \
         wget \
@@ -66,6 +66,9 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
 
+# Install recent CMake.
+RUN pip install --no-cache-dir -U cmake~=3.13.0
+
 # Install PyTorch, TensorFlow, Keras and MXNet
 RUN pip install --no-cache-dir \
     torch==${PYTORCH_VERSION} \
diff --git a/docs/install.rst b/docs/install.rst
index d1de57c4b2..d339b3cabd 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -8,7 +8,7 @@ Requirements
 
 - Python >= 3.6
 - `g++-5` or above, or another compiler supporting C++14
-- CMake
+- CMake 3.13 or newer
 - TensorFlow, PyTorch, or MXNet
 - (Optional) MPI
 
@@ -213,7 +213,7 @@ diagnose failures:
 Installing Horovod with Conda (+pip)
 ------------------------------------
 
-To use Conda to install PyTorch, TensorFlow, MXNet, Horovod, as well as GPU depdencies such as 
+To use Conda to install PyTorch, TensorFlow, MXNet, Horovod, as well as GPU dependencies such as
 NVIDIA CUDA Toolkit, cuDNN, NCCL, etc., see `Build a Conda Environment with GPU Support for Horovod <conda.rst>`_.
 
 Environment Variables
diff --git a/horovod/common/ops/cuda/CMakeLists.txt b/horovod/common/ops/cuda/CMakeLists.txt
index c01b0aaed2..fde267440a 100644
--- a/horovod/common/ops/cuda/CMakeLists.txt
+++ b/horovod/common/ops/cuda/CMakeLists.txt
@@ -1,14 +1,25 @@
-set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-
 set(ENV{PYTHONPATH} "${PROJECT_SOURCE_DIR}/cmake:$ENV{PYTHONPATH}")
 execute_process(COMMAND ${PY_EXE} -c "import build_utils; print(' '.join(build_utils.get_nvcc_flags()))"
                 OUTPUT_VARIABLE HVD_NVCC_COMPILE_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE)
 
 MESSAGE(STATUS "HVD_NVCC_COMPILE_FLAGS = ${HVD_NVCC_COMPILE_FLAGS}")
 
-list(APPEND CUDA_NVCC_FLAGS "${HVD_NVCC_COMPILE_FLAGS}")
+# If we don't set CMAKE_CUDA_STANDARD, it will default to ${CMAKE_CXX_STANDARD} ("14" at this time). nvcc may fail if
+# the --std=c++... argument is passed multiple times.
+set(CMAKE_CUDA_STANDARD 11)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
-cuda_add_library(horovod_cuda_kernels cuda_kernels.cu OPTIONS -D_GLIBCXX_USE_CXX11_ABI=1)
+add_library(horovod_cuda_kernels cuda_kernels.cu)
+target_compile_options(horovod_cuda_kernels PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+                       SHELL:${HVD_NVCC_COMPILE_FLAGS}
+                       -D_GLIBCXX_USE_CXX11_ABI=1
+                       >)
+set_property(TARGET horovod_cuda_kernels PROPERTY CUDA_SEPARABLE_COMPILATION ON)
 
 # if we need compatible c++ abi, build a compatible version
-cuda_add_library(compatible_horovod_cuda_kernels cuda_kernels.cu OPTIONS -D_GLIBCXX_USE_CXX11_ABI=0)
\ No newline at end of file
+add_library(compatible_horovod_cuda_kernels cuda_kernels.cu)
+target_compile_options(compatible_horovod_cuda_kernels PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+                       SHELL:${HVD_NVCC_COMPILE_FLAGS}
+                       -D_GLIBCXX_USE_CXX11_ABI=0
+                       >)
+set_property(TARGET compatible_horovod_cuda_kernels PROPERTY CUDA_SEPARABLE_COMPILATION ON)
diff --git a/setup.py b/setup.py
index 15a6f3fede..279df0a89d 100644
--- a/setup.py
+++ b/setup.py
@@ -82,7 +82,7 @@ def build_extensions(self):
                       '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(config.upper(), build_dir),
                       '-DPYTHON_EXECUTABLE:FILEPATH=' + sys.executable]
 
-        make_args = []
+        make_args = ['-j8'] if not os.environ.get('MAKEFLAGS') else []
         if self.verbose:
             make_args.append('VERBOSE=1')