Spgemm TPL refactor (#1618)

* Moving cusparse spgemm to unification layer Split symbolic, numeric, fused-jacobi into separate TPL avail/decl files. * Finish cusparse spgemm tpl wrapper - Symbolic now backwards compatible with bspgemm_numeric and spgemm_jacobi * Add proper rocsparse spgemm wrappers * More SpGEMM TPL work - Remove mentions of deprecated enum value SPGEMM_MKL and SPGEMM_MKL2PHASE - Add MKL wrapper under unification layer with correct symbolic/numeric separation - Remove SPGEMM_CUSP and SPGEMM_VIENNA enum values * Remove now unused spgemm impl files (CUSP, Vienna, MKL) * perftest: take out deprecated/removed spgemm algos * Enforce sorting of spgemm output * Update rocblas includes <rocblas.h> is deprecated and throws warnings; now we have to use <rocblas/rocblas.h> * Update the rest of the deprecated rocsparse includes <rocsparse.h> is deprecated, using <rocsparse/rocsparse.h> instead * rocsparse spgemm: remove unused typedefs * Make MKL sort the output of spgemm * Change the way fuzzy view comparison is done Assume values are close enough if their absolute value is less than eps. Values close to zero like this might be the result of summing many larger-magnitude terms, so the reference value and actual value might end up being (relatively) very different, even though each algorithm is behaving correctly (ignoring floating-point addition nonassociativity) * spgemm: buffer management fixes for cusparse 11.4+ * Fix cusparse 10 spgemm wrapper * Bump tolerance to make bspgemm test pass w/float * SpGEMM: test symbolic reuse Call symbolic, numeric, then change A/B values and do numeric again. * Remove redundant input check in spgemm * Fix order of m,n,k passed to cusparse 10.x spgemm Also make spgemm test use distinct m,n,k to catch similar bugs * spgemm all-at-once interface, debugging utils Add handle-less (non-reuse) spgemm interface, meaning the user can just compute C = AB in one line. Use this in the wiki example, since this should show off the most straightforward way to use the kernel. Implemented in terms of simplified symbolic/numeric (no separate unification layer yet) Debugging utils: KokkosSparse::Impl::isCrsGraphSorted(rowmap, entries): check that the graph has sorted rows. This test is designed to be efficient enough to run by default before kernels that require sorted input. KokkosSparse::Impl::validateCrsMatrix(m, n, rowmap, entries, values): sanity check the extents and entries of a matrix. Entries should be less than n (numCols), rowmap(m) should be equal to nnz, etc. Use isCrsGraphSorted test on A and B in spgemm_symbolic, unless NDEBUG is defined (as in a release build). This is important since sorted input is a new requirement. * spgemm perf test: fix "--algorithm KKDEBUG" It was causing SPGEMM_KK_LP (a completely differnet, parallel algo) to be used, rather than SPGEMM_DEBUG (the host serial reference implementation). * Sparse utils: zero-initialize, remove unused member
kokkos · Dec 12, 2022 · 4c06b6c · 4c06b6c
1 parent 61ff6a3
commit 4c06b6c
Show file tree

Hide file tree

Showing 42 changed files with 2,463 additions and 3,154 deletions.
diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp
@@ -137,7 +137,7 @@ inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) {
 #endif  // KOKKOSKERNELS_ENABLE_TPL_CUBLAS
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 
 namespace KokkosBlas {
 namespace Impl {

diff --git a/blas/unit_test/Test_Blas_rocblas.hpp b/blas/unit_test/Test_Blas_rocblas.hpp
@@ -7,7 +7,7 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include "KokkosBlas_tpl_spec.hpp"
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 
 // Just check if we can build against
 // rocblas and get the library version

diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp
@@ -306,20 +306,26 @@ struct IsRelativelyIdenticalFunctor {
       : view1(view1_), view2(view2_), eps(eps_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const size_t &i, size_t &is_equal) const {
+  void operator()(const size_t &i, size_t &num_diffs) const {
     typedef typename view_type2::non_const_value_type val_type;
-    typedef Kokkos::Details::ArithTraits<val_type> KAT;
+    typedef Kokkos::ArithTraits<val_type> KAT;
     typedef typename KAT::mag_type mag_type;
-    typedef Kokkos::Details::ArithTraits<mag_type> KATM;
+    typedef Kokkos::ArithTraits<mag_type> KATM;
 
-    mag_type val_diff = KAT::abs(view1(i) - view2(i));
-    if (KAT::abs(view1(i)) > KATM::zero() &&
-        KAT::abs(view2(i)) > KATM::zero()) {
-      val_diff = val_diff / KAT::abs(view2(i));
+    mag_type val_diff = KATM::zero();
+    if (KAT::abs(view1(i)) > mag_type(eps) ||
+        KAT::abs(view2(i)) > mag_type(eps)) {
+      val_diff = KAT::abs(view1(i) - view2(i)) /
+                 (KAT::abs(view1(i)) + KAT::abs(view2(i)));
     }
 
-    if (val_diff > eps) {
-      is_equal += 1;
+    if (val_diff > mag_type(eps)) {
+      printf(
+          "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much "
+          "(eps = %e)\n",
+          (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)),
+          KAT::imag(view2(i)), eps);
+      num_diffs++;
     }
   }
 };
@@ -335,19 +341,14 @@ bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2,
   size_t num_elements = view1.extent(0);
 
   typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
-  size_t issame = 0;
+  size_t numDifferences = 0;
   Kokkos::parallel_reduce(
       "KokkosKernels::Common::IsRelativelyIdenticalView",
       my_exec_space(0, num_elements),
       IsRelativelyIdenticalFunctor<view_type1, view_type2, eps_type>(
           view1, view2, eps),
-      issame);
-  MyExecSpace().fence();
-  if (issame > 0) {
-    return false;
-  } else {
-    return true;
-  }
+      numDifferences);
+  return numDifferences == 0;
 }
 
 template <typename view_type>

diff --git a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp
@@ -16,8 +16,6 @@ int main() {
   using device_type = typename Kokkos::Device<
       Kokkos::DefaultExecutionSpace,
       typename Kokkos::DefaultExecutionSpace::memory_space>;
-  using execution_space = typename device_type::execution_space;
-  using memory_space    = typename device_type::memory_space;
   using matrix_type =
       typename KokkosSparse::CrsMatrix<Scalar, Ordinal, device_type, void,
                                        Offset>;
@@ -32,7 +30,7 @@ int main() {
     // In each row the first entry is the number of grid point in
     // that direction, the second and third entries are used to apply
     // BCs in that direction.
-    Kokkos::View<Ordinal* [3], Kokkos::HostSpace> mat_structure(
+    Kokkos::View<Ordinal * [3], Kokkos::HostSpace> mat_structure(
         "Matrix Structure", 2);
     mat_structure(0, 0) = 10;  // Request 10 grid point in 'x' direction
     mat_structure(0, 1) = 1;   // Add BC to the left
@@ -45,27 +43,11 @@ int main() {
         Test::generate_structured_matrix2D<matrix_type>("FD", mat_structure);
     matrix_type B =
         Test::generate_structured_matrix2D<matrix_type>("FE", mat_structure);
-    matrix_type C;
 
-    // Create KokkosKernelHandle
-    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
-        Offset, Ordinal, Scalar, execution_space, memory_space, memory_space>;
-    KernelHandle kh;
-    kh.set_team_work_size(16);
-    kh.set_dynamic_scheduling(true);
+    matrix_type C = KokkosSparse::spgemm<matrix_type>(A, false, B, false);
 
-    // Select an spgemm algorithm, limited by configuration at compile-time and
-    // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
-    // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
-    std::string myalg("SPGEMM_KK_MEMORY");
-    KokkosSparse::SPGEMMAlgorithm spgemm_algorithm =
-        KokkosSparse::StringToSPGEMMAlgorithm(myalg);
-    kh.create_spgemm_handle(spgemm_algorithm);
-
-    KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C);
-    KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);
-
-    std::cout << "spgemm was performed correctly!" << std::endl;
+    std::cout << "Ran spgemm: product C is " << C.numRows() << 'x'
+              << C.numCols() << " and has " << C.nnz() << " nonzeros.\n";
   }
 
   Kokkos::finalize();

diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp
@@ -285,25 +285,26 @@ class coarse_builder {
       const matrix_t interp_mtx) {
     vtx_view_t f_vtx_w = level.vtx_wgts;
     matrix_t g         = level.mtx;
+    if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries))
+      KokkosSparse::sort_crs_matrix(g);
 
     ordinal_t n  = g.numRows();
     ordinal_t nc = interp_mtx.numCols();
 
     matrix_t interp_transpose =
         KokkosSparse::Impl::transpose_matrix(interp_mtx);
+    KokkosSparse::sort_crs_matrix(interp_transpose);
 
     spgemm_kernel_handle kh;
     kh.set_team_work_size(64);
     kh.set_dynamic_scheduling(true);
-    KokkosSparse::SPGEMMAlgorithm spgemm_algorithm =
-        KokkosSparse::SPGEMM_KK_MEMORY;
-    kh.create_spgemm_handle(spgemm_algorithm);
 
     vtx_view_t adj_coarse;
     wgt_view_t wgt_coarse;
     edge_view_t row_map_coarse;
 
     if (handle.b == Spgemm_transpose_first) {
+      kh.create_spgemm_handle();
       edge_view_t row_map_p1("rows_partial", nc + 1);
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, n, interp_transpose.graph.row_map,
@@ -321,8 +322,10 @@ class coarse_builder {
           interp_transpose.graph.entries, interp_transpose.values, false,
           g.graph.row_map, g.graph.entries, g.values, false, row_map_p1,
           entries_p1, values_p1);
+      kh.destroy_spgemm_handle();
 
       row_map_coarse = edge_view_t("rows_coarse", nc + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, nc, row_map_p1, entries_p1, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, false,
@@ -337,8 +340,10 @@ class coarse_builder {
           &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values,
           false, row_map_coarse, adj_coarse, wgt_coarse);
+      kh.destroy_spgemm_handle();
     } else {
       edge_view_t row_map_p1("rows_partial", n + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, n, n, nc, g.graph.row_map, g.graph.entries, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, false,
@@ -354,8 +359,10 @@ class coarse_builder {
           &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values,
           false, row_map_p1, entries_p1, values_p1);
+      kh.destroy_spgemm_handle();
 
       row_map_coarse = edge_view_t("rows_coarse", nc + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, nc, interp_transpose.graph.row_map,
           interp_transpose.graph.entries, false, row_map_p1, entries_p1, false,
@@ -371,6 +378,7 @@ class coarse_builder {
           interp_transpose.graph.entries, interp_transpose.values, false,
           row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse,
           wgt_coarse);
+      kh.destroy_spgemm_handle();
     }
 
     // now we must remove self-loop edges

diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp
@@ -60,11 +60,11 @@ void print_options() {
                "[hipDeviceIndex]' --> if none are specified, Serial is used "
                "(if enabled)"
             << std::endl;
-  std::cerr << "\t[Optional] '--algorithm "
-               "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|"
-               "VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use "
-               "KKSPGEMM instead."
-            << std::endl;
+  std::cerr
+      << "\t[Optional] '--algorithm "
+         "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE]' --> to choose algorithm. "
+         "KKMEM is outdated, use KKSPGEMM instead."
+      << std::endl;
   std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB"
             << std::endl;
   std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> "
@@ -241,18 +241,8 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc,
         params.algorithm = KokkosSparse::SPGEMM_KK_DENSE;
       } else if (0 == Test::string_compare_no_case(algoStr, "KKLP")) {
         params.algorithm = KokkosSparse::SPGEMM_KK_LP;
-      } else if (0 == Test::string_compare_no_case(algoStr, "MKL")) {
-        params.algorithm = KokkosSparse::SPGEMM_MKL;
-      } else if (0 == Test::string_compare_no_case(algoStr, "CUSPARSE")) {
-        params.algorithm = KokkosSparse::SPGEMM_CUSPARSE;
-      } else if (0 == Test::string_compare_no_case(algoStr, "CUSP")) {
-        params.algorithm = KokkosSparse::SPGEMM_CUSP;
       } else if (0 == Test::string_compare_no_case(algoStr, "KKDEBUG")) {
-        params.algorithm = KokkosSparse::SPGEMM_KK_LP;
-      } else if (0 == Test::string_compare_no_case(algoStr, "MKL2")) {
-        params.algorithm = KokkosSparse::SPGEMM_MKL2PHASE;
-      } else if (0 == Test::string_compare_no_case(algoStr, "VIENNA")) {
-        params.algorithm = KokkosSparse::SPGEMM_VIENNA;
+        params.algorithm = KokkosSparse::SPGEMM_DEBUG;
       }
 
       else {

diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
@@ -1206,7 +1206,8 @@ void KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
       this->handle->get_spgemm_handle()->row_flops;
   size_t original_overall_flops =
       this->handle->get_spgemm_handle()->original_overall_flops;
-  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz();
+  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()
+                          ->template get_max_result_nnz<c_row_view_t>(rowmapC_);
   size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz();
 
   typedef KokkosKernels::Impl::UniformMemoryPool<MyTempMemorySpace, nnz_lno_t>

diff --git a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
@@ -51,14 +51,10 @@
 #include "KokkosKernels_Handle.hpp"
 // Include the actual functors
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
-//#include "KokkosSparse_spgemm_symbolic.hpp"
-#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_CUSP_impl.hpp"
+#include "KokkosSparse_spgemm_symbolic.hpp"
 #include "KokkosSparse_bspgemm_impl.hpp"
 #include "KokkosSparse_bspgemm_impl_seq.hpp"
-#include "KokkosSparse_spgemm_mkl_impl.hpp"
-#include "KokkosSparse_spgemm_mkl2phase_impl.hpp"
-#include "KokkosSparse_spgemm_viennaCL_impl.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #endif
 
 namespace KokkosSparse {
@@ -219,42 +215,49 @@ struct BSPGEMM_NUMERIC<
     spgemmHandleType *sh = handle->get_spgemm_handle();
     if (!sh->is_symbolic_called()) {
       throw std::runtime_error(
-          "Call spgemm symbolic before calling SpGEMM numeric");
+          "KokkosSparse::bspgemm_numeric: must first call spgemm_symbolic with "
+          "the same handle.");
+    }
+    if (!sh->are_rowptrs_computed()) {
+      // Call symbolic, and make sure rowptrs are populated. This will not
+      // duplicate any work if the user has already called symbolic. Must also
+      // cast away constness of row_mapC.
+      using c_size_view_t_nc = typename c_size_view_t_::non_const_type;
+      using c_size_type      = typename c_size_view_t_::non_const_value_type;
+      c_size_view_t_nc row_mapC_nc(const_cast<c_size_type *>(row_mapC.data()),
+                                   row_mapC.extent(0));
+      KokkosSparse::Experimental::spgemm_symbolic(
+          handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB,
+          transposeB, row_mapC_nc, true);
+    }
+    if (!sh->are_rowflops_computed()) {
+      KokkosSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t,
+                   b_size_view_t_, b_lno_view_t, b_scalar_view_t>
+          kspgemm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA,
+                  row_mapB, entriesB, valuesB, transposeB);
+      kspgemm.compute_row_flops();
     }
 
     switch (sh->get_algorithm_type()) {
-      case SPGEMM_CUSPARSE:
-        throw std::runtime_error(
-            "cuSPARSE implementation for block SpGEMM is not available");
-      case SPGEMM_CUSP:
-        throw std::runtime_error(
-            "CUSP implementation for block SpGEMM is not available");
-      case SPGEMM_MKL:
-      case SPGEMM_MKL2PHASE:
-        throw std::runtime_error(
-            "MKL implementation available for block SpGEMM is not available");
-      case SPGEMM_VIENNA:
-        throw std::runtime_error(
-            "Vienna implementation available for block SpGEMM is not "
-            "available");
-
-      default:
-
-      {
+      case SPGEMM_SERIAL:
+      case SPGEMM_DEBUG:
+        bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA,
+                              valuesA, transposeA, row_mapB, entriesB, valuesB,
+                              transposeB, row_mapC, entriesC, valuesC);
+        break;
+      default: {
         KokkosBSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t,
                       a_scalar_view_t, b_size_view_t_, b_lno_view_t,
                       b_scalar_view_t>
             kbspgemm(handle, m, n, k, blockDim, row_mapA, entriesA, valuesA,
                      transposeA, row_mapB, entriesB, valuesB, transposeB);
         kbspgemm.KokkosBSPGEMM_numeric(row_mapC, entriesC, valuesC);
       } break;
-      case SPGEMM_SERIAL:
-      case SPGEMM_DEBUG:
-        bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA,
-                              valuesA, transposeA, row_mapB, entriesB, valuesB,
-                              transposeB, row_mapC, entriesC, valuesC);
-        break;
     }
+    // Current implementation does not produce sorted matrix
+    // TODO: remove this call when impl sorts
+    KokkosSparse::sort_bsr_matrix<typename KernelHandle::HandleExecSpace>(
+        blockDim, row_mapC, entriesC, valuesC);
   }
 };