From 4c06b6cbd14114aee9db80de84d32019236e4d7d Mon Sep 17 00:00:00 2001
From: brian-kelley <bmkelle@sandia.gov>
Date: Mon, 12 Dec 2022 10:57:10 -0700
Subject: [PATCH] Spgemm TPL refactor (#1618)

* Moving cusparse spgemm to unification layer

Split symbolic, numeric, fused-jacobi into separate TPL avail/decl
files.

* Finish cusparse spgemm tpl wrapper

- Symbolic now backwards compatible with bspgemm_numeric and
  spgemm_jacobi

* Add proper rocsparse spgemm wrappers

* More SpGEMM TPL work

- Remove mentions of deprecated enum value SPGEMM_MKL and SPGEMM_MKL2PHASE
- Add MKL wrapper under unification layer with correct symbolic/numeric separation
- Remove SPGEMM_CUSP and SPGEMM_VIENNA enum values

* Remove now unused spgemm impl files (CUSP, Vienna, MKL)

* perftest: take out deprecated/removed spgemm algos

* Enforce sorting of spgemm output

* Update rocblas includes

<rocblas.h> is deprecated and throws warnings; now we have to use <rocblas/rocblas.h>

* Update the rest of the deprecated rocsparse includes

<rocsparse.h> is deprecated, using <rocsparse/rocsparse.h> instead

* rocsparse spgemm: remove unused typedefs

* Make MKL sort the output of spgemm

* Change the way fuzzy view comparison is done

Assume values are close enough if their absolute value is less than eps.
Values close to zero like this might be the result of summing many larger-magnitude
terms, so the reference value and actual value might end up being (relatively) very different,
even though each algorithm is behaving correctly (ignoring floating-point addition nonassociativity)

* spgemm: buffer management fixes for cusparse 11.4+

* Fix cusparse 10 spgemm wrapper

* Bump tolerance to make bspgemm test pass w/float

* SpGEMM: test symbolic reuse

Call symbolic, numeric, then change A/B values and do numeric again.

* Remove redundant input check in spgemm

* Fix order of m,n,k passed to cusparse 10.x spgemm

Also make spgemm test use distinct m,n,k to catch similar bugs

* spgemm all-at-once interface, debugging utils

Add handle-less (non-reuse) spgemm interface, meaning the user can just
compute C = AB in one line. Use this in the wiki example, since this
should show off the most straightforward way to use the kernel.
Implemented in terms of simplified symbolic/numeric (no separate
unification layer yet)

Debugging utils:
  KokkosSparse::Impl::isCrsGraphSorted(rowmap, entries): check that
    the graph has sorted rows. This test is designed to be efficient
    enough to run by default before kernels that require sorted input.
  KokkosSparse::Impl::validateCrsMatrix(m, n, rowmap, entries, values):
    sanity check the extents and entries of a matrix. Entries should be
    less than n (numCols), rowmap(m) should be equal to nnz, etc.

Use isCrsGraphSorted test on A and B in spgemm_symbolic, unless NDEBUG is
defined (as in a release build). This is important since sorted input is
a new requirement.

* spgemm perf test: fix "--algorithm KKDEBUG"

It was causing SPGEMM_KK_LP (a completely differnet, parallel algo)
to be used, rather than SPGEMM_DEBUG (the host serial reference
implementation).

* Sparse utils: zero-initialize, remove unused member
---
 blas/tpls/KokkosBlas_tpl_spec.hpp             |   2 +-
 blas/unit_test/Test_Blas_rocblas.hpp          |   2 +-
 common/src/KokkosKernels_SimpleUtils.hpp      |  35 +-
 .../wiki/sparse/KokkosSparse_wiki_spgemm.cpp  |  26 +-
 graph/src/KokkosGraph_CoarsenConstruct.hpp    |  14 +-
 perf_test/sparse/KokkosSparse_spgemm.cpp      |  22 +-
 .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp  |   3 +-
 .../KokkosSparse_bspgemm_numeric_spec.hpp     |  65 +-
 sparse/impl/KokkosSparse_spgemm_CUSP_impl.hpp | 249 -------
 .../KokkosSparse_spgemm_cuSPARSE_impl.hpp     | 501 -------------
 sparse/impl/KokkosSparse_spgemm_impl.hpp      |  36 +
 sparse/impl/KokkosSparse_spgemm_impl_def.hpp  |  35 +-
 .../impl/KokkosSparse_spgemm_impl_kkmem.hpp   |   3 +-
 .../KokkosSparse_spgemm_impl_symbolic.hpp     |  62 +-
 ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp |   4 +-
 .../impl/KokkosSparse_spgemm_jacobi_spec.hpp  |  32 +-
 .../KokkosSparse_spgemm_mkl2phase_impl.hpp    | 685 -----------------
 sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp  | 314 --------
 .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 211 +-----
 .../KokkosSparse_spgemm_rocSPARSE_impl.hpp    | 435 -----------
 .../KokkosSparse_spgemm_symbolic_spec.hpp     |  98 +--
 .../KokkosSparse_spgemm_viennaCL_impl.hpp     | 234 ------
 sparse/src/KokkosKernels_Controls.hpp         |   2 +-
 sparse/src/KokkosSparse_Utils.hpp             |  99 +++
 sparse/src/KokkosSparse_Utils_rocsparse.hpp   |  17 +-
 sparse/src/KokkosSparse_spgemm.hpp            |  19 +
 sparse/src/KokkosSparse_spgemm_handle.hpp     | 250 ++++---
 sparse/src/KokkosSparse_spgemm_numeric.hpp    |  57 +-
 sparse/src/KokkosSparse_spgemm_symbolic.hpp   |  58 +-
 .../tpls/KokkosKernels_tpl_handles_decl.hpp   |   2 +-
 ...osSparse_spgemm_jacobi_tpl_spec_avail.hpp} |  19 +-
 ...kosSparse_spgemm_jacobi_tpl_spec_decl.hpp} |   0
 ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 211 ++++++
 ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 672 +++++++++++++++++
 ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 171 +++++
 ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 686 ++++++++++++++++++
 .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp  |   2 +-
 sparse/unit_test/Test_Sparse_CrsMatrix.hpp    |   1 +
 sparse/unit_test/Test_Sparse_bspgemm.hpp      |  48 +-
 sparse/unit_test/Test_Sparse_rocsparse.hpp    |   2 +-
 sparse/unit_test/Test_Sparse_spgemm.hpp       | 224 +++---
 .../unit_test/Test_Sparse_spgemm_jacobi.hpp   |   9 +-
 42 files changed, 2463 insertions(+), 3154 deletions(-)
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_CUSP_impl.hpp
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp
 delete mode 100644 sparse/impl/KokkosSparse_spgemm_viennaCL_impl.hpp
 rename sparse/tpls/{KokkosSparse_spgemm_tpl_spec_avail.hpp => KokkosSparse_spgemm_jacobi_tpl_spec_avail.hpp} (76%)
 rename sparse/tpls/{KokkosSparse_spgemm_tpl_spec_decl.hpp => KokkosSparse_spgemm_jacobi_tpl_spec_decl.hpp} (100%)
 create mode 100644 sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp
 create mode 100644 sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp
 create mode 100644 sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp
 create mode 100644 sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp

diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp
index fbb353c656..ffe02f6529 100644
--- a/blas/tpls/KokkosBlas_tpl_spec.hpp
+++ b/blas/tpls/KokkosBlas_tpl_spec.hpp
@@ -137,7 +137,7 @@ inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) {
 #endif  // KOKKOSKERNELS_ENABLE_TPL_CUBLAS
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 
 namespace KokkosBlas {
 namespace Impl {
diff --git a/blas/unit_test/Test_Blas_rocblas.hpp b/blas/unit_test/Test_Blas_rocblas.hpp
index 61ba06aebc..e30adceffa 100644
--- a/blas/unit_test/Test_Blas_rocblas.hpp
+++ b/blas/unit_test/Test_Blas_rocblas.hpp
@@ -7,7 +7,7 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include "KokkosBlas_tpl_spec.hpp"
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 
 // Just check if we can build against
 // rocblas and get the library version
diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp
index 86944e9c4d..08207c199b 100644
--- a/common/src/KokkosKernels_SimpleUtils.hpp
+++ b/common/src/KokkosKernels_SimpleUtils.hpp
@@ -306,20 +306,26 @@ struct IsRelativelyIdenticalFunctor {
       : view1(view1_), view2(view2_), eps(eps_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const size_t &i, size_t &is_equal) const {
+  void operator()(const size_t &i, size_t &num_diffs) const {
     typedef typename view_type2::non_const_value_type val_type;
-    typedef Kokkos::Details::ArithTraits<val_type> KAT;
+    typedef Kokkos::ArithTraits<val_type> KAT;
     typedef typename KAT::mag_type mag_type;
-    typedef Kokkos::Details::ArithTraits<mag_type> KATM;
+    typedef Kokkos::ArithTraits<mag_type> KATM;
 
-    mag_type val_diff = KAT::abs(view1(i) - view2(i));
-    if (KAT::abs(view1(i)) > KATM::zero() &&
-        KAT::abs(view2(i)) > KATM::zero()) {
-      val_diff = val_diff / KAT::abs(view2(i));
+    mag_type val_diff = KATM::zero();
+    if (KAT::abs(view1(i)) > mag_type(eps) ||
+        KAT::abs(view2(i)) > mag_type(eps)) {
+      val_diff = KAT::abs(view1(i) - view2(i)) /
+                 (KAT::abs(view1(i)) + KAT::abs(view2(i)));
     }
 
-    if (val_diff > eps) {
-      is_equal += 1;
+    if (val_diff > mag_type(eps)) {
+      printf(
+          "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much "
+          "(eps = %e)\n",
+          (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)),
+          KAT::imag(view2(i)), eps);
+      num_diffs++;
     }
   }
 };
@@ -335,19 +341,14 @@ bool kk_is_relatively_identical_view(view_type1 view1, view_type2 view2,
   size_t num_elements = view1.extent(0);
 
   typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
-  size_t issame = 0;
+  size_t numDifferences = 0;
   Kokkos::parallel_reduce(
       "KokkosKernels::Common::IsRelativelyIdenticalView",
       my_exec_space(0, num_elements),
       IsRelativelyIdenticalFunctor<view_type1, view_type2, eps_type>(
           view1, view2, eps),
-      issame);
-  MyExecSpace().fence();
-  if (issame > 0) {
-    return false;
-  } else {
-    return true;
-  }
+      numDifferences);
+  return numDifferences == 0;
 }
 
 template <typename view_type>
diff --git a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp
index 63600e08ba..831cb8ca7a 100644
--- a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp
+++ b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp
@@ -16,8 +16,6 @@ int main() {
   using device_type = typename Kokkos::Device<
       Kokkos::DefaultExecutionSpace,
       typename Kokkos::DefaultExecutionSpace::memory_space>;
-  using execution_space = typename device_type::execution_space;
-  using memory_space    = typename device_type::memory_space;
   using matrix_type =
       typename KokkosSparse::CrsMatrix<Scalar, Ordinal, device_type, void,
                                        Offset>;
@@ -32,7 +30,7 @@ int main() {
     // In each row the first entry is the number of grid point in
     // that direction, the second and third entries are used to apply
     // BCs in that direction.
-    Kokkos::View<Ordinal* [3], Kokkos::HostSpace> mat_structure(
+    Kokkos::View<Ordinal * [3], Kokkos::HostSpace> mat_structure(
         "Matrix Structure", 2);
     mat_structure(0, 0) = 10;  // Request 10 grid point in 'x' direction
     mat_structure(0, 1) = 1;   // Add BC to the left
@@ -45,27 +43,11 @@ int main() {
         Test::generate_structured_matrix2D<matrix_type>("FD", mat_structure);
     matrix_type B =
         Test::generate_structured_matrix2D<matrix_type>("FE", mat_structure);
-    matrix_type C;
 
-    // Create KokkosKernelHandle
-    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
-        Offset, Ordinal, Scalar, execution_space, memory_space, memory_space>;
-    KernelHandle kh;
-    kh.set_team_work_size(16);
-    kh.set_dynamic_scheduling(true);
+    matrix_type C = KokkosSparse::spgemm<matrix_type>(A, false, B, false);
 
-    // Select an spgemm algorithm, limited by configuration at compile-time and
-    // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
-    // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
-    std::string myalg("SPGEMM_KK_MEMORY");
-    KokkosSparse::SPGEMMAlgorithm spgemm_algorithm =
-        KokkosSparse::StringToSPGEMMAlgorithm(myalg);
-    kh.create_spgemm_handle(spgemm_algorithm);
-
-    KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C);
-    KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);
-
-    std::cout << "spgemm was performed correctly!" << std::endl;
+    std::cout << "Ran spgemm: product C is " << C.numRows() << 'x'
+              << C.numCols() << " and has " << C.nnz() << " nonzeros.\n";
   }
 
   Kokkos::finalize();
diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp
index 9e9b4049e6..eb4a2ae556 100644
--- a/graph/src/KokkosGraph_CoarsenConstruct.hpp
+++ b/graph/src/KokkosGraph_CoarsenConstruct.hpp
@@ -285,25 +285,26 @@ class coarse_builder {
       const matrix_t interp_mtx) {
     vtx_view_t f_vtx_w = level.vtx_wgts;
     matrix_t g         = level.mtx;
+    if (!KokkosSparse::Impl::isCrsGraphSorted(g.graph.row_map, g.graph.entries))
+      KokkosSparse::sort_crs_matrix(g);
 
     ordinal_t n  = g.numRows();
     ordinal_t nc = interp_mtx.numCols();
 
     matrix_t interp_transpose =
         KokkosSparse::Impl::transpose_matrix(interp_mtx);
+    KokkosSparse::sort_crs_matrix(interp_transpose);
 
     spgemm_kernel_handle kh;
     kh.set_team_work_size(64);
     kh.set_dynamic_scheduling(true);
-    KokkosSparse::SPGEMMAlgorithm spgemm_algorithm =
-        KokkosSparse::SPGEMM_KK_MEMORY;
-    kh.create_spgemm_handle(spgemm_algorithm);
 
     vtx_view_t adj_coarse;
     wgt_view_t wgt_coarse;
     edge_view_t row_map_coarse;
 
     if (handle.b == Spgemm_transpose_first) {
+      kh.create_spgemm_handle();
       edge_view_t row_map_p1("rows_partial", nc + 1);
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, n, interp_transpose.graph.row_map,
@@ -321,8 +322,10 @@ class coarse_builder {
           interp_transpose.graph.entries, interp_transpose.values, false,
           g.graph.row_map, g.graph.entries, g.values, false, row_map_p1,
           entries_p1, values_p1);
+      kh.destroy_spgemm_handle();
 
       row_map_coarse = edge_view_t("rows_coarse", nc + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, nc, row_map_p1, entries_p1, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, false,
@@ -337,8 +340,10 @@ class coarse_builder {
           &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values,
           false, row_map_coarse, adj_coarse, wgt_coarse);
+      kh.destroy_spgemm_handle();
     } else {
       edge_view_t row_map_p1("rows_partial", n + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, n, n, nc, g.graph.row_map, g.graph.entries, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, false,
@@ -354,8 +359,10 @@ class coarse_builder {
           &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false,
           interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values,
           false, row_map_p1, entries_p1, values_p1);
+      kh.destroy_spgemm_handle();
 
       row_map_coarse = edge_view_t("rows_coarse", nc + 1);
+      kh.create_spgemm_handle();
       KokkosSparse::Experimental::spgemm_symbolic(
           &kh, nc, n, nc, interp_transpose.graph.row_map,
           interp_transpose.graph.entries, false, row_map_p1, entries_p1, false,
@@ -371,6 +378,7 @@ class coarse_builder {
           interp_transpose.graph.entries, interp_transpose.values, false,
           row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse,
           wgt_coarse);
+      kh.destroy_spgemm_handle();
     }
 
     // now we must remove self-loop edges
diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp
index da705fcdf2..9ec47a068a 100644
--- a/perf_test/sparse/KokkosSparse_spgemm.cpp
+++ b/perf_test/sparse/KokkosSparse_spgemm.cpp
@@ -60,11 +60,11 @@ void print_options() {
                "[hipDeviceIndex]' --> if none are specified, Serial is used "
                "(if enabled)"
             << std::endl;
-  std::cerr << "\t[Optional] '--algorithm "
-               "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|"
-               "VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use "
-               "KKSPGEMM instead."
-            << std::endl;
+  std::cerr
+      << "\t[Optional] '--algorithm "
+         "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE]' --> to choose algorithm. "
+         "KKMEM is outdated, use KKSPGEMM instead."
+      << std::endl;
   std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB"
             << std::endl;
   std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> "
@@ -241,18 +241,8 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc,
         params.algorithm = KokkosSparse::SPGEMM_KK_DENSE;
       } else if (0 == Test::string_compare_no_case(algoStr, "KKLP")) {
         params.algorithm = KokkosSparse::SPGEMM_KK_LP;
-      } else if (0 == Test::string_compare_no_case(algoStr, "MKL")) {
-        params.algorithm = KokkosSparse::SPGEMM_MKL;
-      } else if (0 == Test::string_compare_no_case(algoStr, "CUSPARSE")) {
-        params.algorithm = KokkosSparse::SPGEMM_CUSPARSE;
-      } else if (0 == Test::string_compare_no_case(algoStr, "CUSP")) {
-        params.algorithm = KokkosSparse::SPGEMM_CUSP;
       } else if (0 == Test::string_compare_no_case(algoStr, "KKDEBUG")) {
-        params.algorithm = KokkosSparse::SPGEMM_KK_LP;
-      } else if (0 == Test::string_compare_no_case(algoStr, "MKL2")) {
-        params.algorithm = KokkosSparse::SPGEMM_MKL2PHASE;
-      } else if (0 == Test::string_compare_no_case(algoStr, "VIENNA")) {
-        params.algorithm = KokkosSparse::SPGEMM_VIENNA;
+        params.algorithm = KokkosSparse::SPGEMM_DEBUG;
       }
 
       else {
diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
index 2e857d2dbe..22c6391d99 100644
--- a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
+++ b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp
@@ -1206,7 +1206,8 @@ void KokkosBSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
       this->handle->get_spgemm_handle()->row_flops;
   size_t original_overall_flops =
       this->handle->get_spgemm_handle()->original_overall_flops;
-  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz();
+  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()
+                          ->template get_max_result_nnz<c_row_view_t>(rowmapC_);
   size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz();
 
   typedef KokkosKernels::Impl::UniformMemoryPool<MyTempMemorySpace, nnz_lno_t>
diff --git a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
index ec7a28189d..a493d50b9a 100644
--- a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
+++ b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp
@@ -51,14 +51,10 @@
 #include "KokkosKernels_Handle.hpp"
 // Include the actual functors
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
-//#include "KokkosSparse_spgemm_symbolic.hpp"
-#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_CUSP_impl.hpp"
+#include "KokkosSparse_spgemm_symbolic.hpp"
 #include "KokkosSparse_bspgemm_impl.hpp"
 #include "KokkosSparse_bspgemm_impl_seq.hpp"
-#include "KokkosSparse_spgemm_mkl_impl.hpp"
-#include "KokkosSparse_spgemm_mkl2phase_impl.hpp"
-#include "KokkosSparse_spgemm_viennaCL_impl.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #endif
 
 namespace KokkosSparse {
@@ -219,28 +215,37 @@ struct BSPGEMM_NUMERIC<
     spgemmHandleType *sh = handle->get_spgemm_handle();
     if (!sh->is_symbolic_called()) {
       throw std::runtime_error(
-          "Call spgemm symbolic before calling SpGEMM numeric");
+          "KokkosSparse::bspgemm_numeric: must first call spgemm_symbolic with "
+          "the same handle.");
+    }
+    if (!sh->are_rowptrs_computed()) {
+      // Call symbolic, and make sure rowptrs are populated. This will not
+      // duplicate any work if the user has already called symbolic. Must also
+      // cast away constness of row_mapC.
+      using c_size_view_t_nc = typename c_size_view_t_::non_const_type;
+      using c_size_type      = typename c_size_view_t_::non_const_value_type;
+      c_size_view_t_nc row_mapC_nc(const_cast<c_size_type *>(row_mapC.data()),
+                                   row_mapC.extent(0));
+      KokkosSparse::Experimental::spgemm_symbolic(
+          handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB,
+          transposeB, row_mapC_nc, true);
+    }
+    if (!sh->are_rowflops_computed()) {
+      KokkosSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t,
+                   b_size_view_t_, b_lno_view_t, b_scalar_view_t>
+          kspgemm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA,
+                  row_mapB, entriesB, valuesB, transposeB);
+      kspgemm.compute_row_flops();
     }
 
     switch (sh->get_algorithm_type()) {
-      case SPGEMM_CUSPARSE:
-        throw std::runtime_error(
-            "cuSPARSE implementation for block SpGEMM is not available");
-      case SPGEMM_CUSP:
-        throw std::runtime_error(
-            "CUSP implementation for block SpGEMM is not available");
-      case SPGEMM_MKL:
-      case SPGEMM_MKL2PHASE:
-        throw std::runtime_error(
-            "MKL implementation available for block SpGEMM is not available");
-      case SPGEMM_VIENNA:
-        throw std::runtime_error(
-            "Vienna implementation available for block SpGEMM is not "
-            "available");
-
-      default:
-
-      {
+      case SPGEMM_SERIAL:
+      case SPGEMM_DEBUG:
+        bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA,
+                              valuesA, transposeA, row_mapB, entriesB, valuesB,
+                              transposeB, row_mapC, entriesC, valuesC);
+        break;
+      default: {
         KokkosBSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t,
                       a_scalar_view_t, b_size_view_t_, b_lno_view_t,
                       b_scalar_view_t>
@@ -248,13 +253,11 @@ struct BSPGEMM_NUMERIC<
                      transposeA, row_mapB, entriesB, valuesB, transposeB);
         kbspgemm.KokkosBSPGEMM_numeric(row_mapC, entriesC, valuesC);
       } break;
-      case SPGEMM_SERIAL:
-      case SPGEMM_DEBUG:
-        bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA,
-                              valuesA, transposeA, row_mapB, entriesB, valuesB,
-                              transposeB, row_mapC, entriesC, valuesC);
-        break;
     }
+    // Current implementation does not produce sorted matrix
+    // TODO: remove this call when impl sorts
+    KokkosSparse::sort_bsr_matrix<typename KernelHandle::HandleExecSpace>(
+        blockDim, row_mapC, entriesC, valuesC);
   }
 };
 
diff --git a/sparse/impl/KokkosSparse_spgemm_CUSP_impl.hpp b/sparse/impl/KokkosSparse_spgemm_CUSP_impl.hpp
deleted file mode 100644
index ecabc22ecf..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_CUSP_impl.hpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef _KOKKOSSPGEMMCUSP_HPP
-#define _KOKKOSSPGEMMCUSP_HPP
-
-#ifdef KERNELS_HAVE_CUSP
-#include <cusp/multiply.h>
-#include <cusp/csr_matrix.h>
-#endif
-
-namespace KokkosSparse {
-namespace Impl {
-
-template <typename cusparray, typename kokkosarray>
-struct CopyArrayToCuspArray {
-  cusparray c;
-  kokkosarray *k;
-
-  CopyArrayToCuspArray(cusparray &c_, kokkosarray *k_) : c(c_), k(k_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const size_t &i) const { c[i] = k[i]; }
-};
-
-template <
-    typename KernelHandle, typename ain_row_index_view_type,
-    typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void CUSP_apply(KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-                typename KernelHandle::nnz_lno_t n,
-                typename KernelHandle::nnz_lno_t k,
-                ain_row_index_view_type row_mapA,
-                ain_nonzero_index_view_type entriesA,
-                ain_nonzero_value_view_type valuesA,
-
-                bool /* transposeA */, bin_row_index_view_type row_mapB,
-                bin_nonzero_index_view_type entriesB,
-                bin_nonzero_value_view_type valuesB, bool /* transposeB */,
-                cin_row_index_view_type row_mapC,
-                cin_nonzero_index_view_type &entriesC,
-                cin_nonzero_value_view_type &valuesC) {
-#ifdef KERNELS_HAVE_CUSP
-  typedef typename KernelHandle::nnz_lno_t idx;
-  typedef typename KernelHandle::nnz_scalar_t value_type;
-
-  typedef typename ain_row_index_view_type::device_type device1;
-  typedef typename ain_nonzero_index_view_type::device_type device2;
-  typedef typename ain_nonzero_value_view_type::device_type device3;
-
-  if (std::is_same<Kokkos::Cuda, device1>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP\n");
-    // return;
-  }
-  if (std::is_same<Kokkos::Cuda, device2>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP\n");
-    // return;
-  }
-  if (std::is_same<Kokkos::Cuda, device3>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP\n");
-    // return;
-  }
-
-  // typedef in_row_index_view_type idx_array_type;
-
-  typedef typename Kokkos::RangePolicy<typename KernelHandle::HandleExecSpace>
-      my_exec_space;
-
-  idx nnzA = entriesA.extent(0);
-  idx nnzB = entriesB.extent(0);
-
-  idx *a_xadj = (idx *)row_mapA.data();
-  idx *b_xadj = (idx *)row_mapB.data();
-
-  idx *a_adj = (idx *)entriesA.data();
-  idx *b_adj = (idx *)entriesB.data();
-
-  value_type *a_ew = valuesA.data();
-  value_type *b_ew = valuesB.data();
-
-  /*
-  thrust::device_ptr<idx> dev_a_xadj(a_xadj);
-  thrust::device_ptr<idx> dev_a_adj(a_adj);
-  thrust::device_ptr<idx> dev_b_xadj(b_xadj);
-  thrust::device_ptr<idx> dev_b_adj(b_adj);
-  thrust::device_ptr<value_type> dev_a_ew(a_ew);
-  thrust::device_ptr<value_type> dev_b_ew(b_ew);
-
-  */
-  typedef typename cusp::array1d_view<thrust::device_ptr<idx> > IDXArray1dView;
-  typedef typename cusp::array1d_view<thrust::device_ptr<value_type> >
-      VALUEArray1dView;
-  // typedef typename cusp::array1d<idx, cusp::device_memory> IDXArray1dView;
-  // typedef typename cusp::array1d<value_type, cusp::device_memory>
-  // VALUEArray1dView;
-  IDXArray1dView arraya_xadj(thrust::device_pointer_cast(a_xadj),
-                             thrust::device_pointer_cast(a_xadj) + m + 1);
-  IDXArray1dView arraya_adj(thrust::device_pointer_cast(a_adj),
-                            thrust::device_pointer_cast(a_adj) + nnzA);
-  IDXArray1dView arrayb_xadj(thrust::device_pointer_cast(b_xadj),
-                             thrust::device_pointer_cast(b_xadj) + n + 1);
-  IDXArray1dView arrayb_adj(thrust::device_pointer_cast(b_adj),
-                            thrust::device_pointer_cast(b_adj) + nnzB);
-  VALUEArray1dView arraya_ew(thrust::device_pointer_cast(a_ew),
-                             thrust::device_pointer_cast(a_ew) + nnzA);
-  VALUEArray1dView arrayb_ew(thrust::device_pointer_cast(b_ew),
-                             thrust::device_pointer_cast(b_ew) + nnzB);
-
-  typedef typename cusp::csr_matrix_view<IDXArray1dView, IDXArray1dView,
-                                         VALUEArray1dView, idx, value_type,
-                                         cusp::device_memory>
-      cuspMatrix_View;
-
-  cuspMatrix_View A(m, n, entriesA.extent(0), arraya_xadj, arraya_adj,
-                    arraya_ew);
-  cuspMatrix_View B(n, k, entriesB.extent(0), arrayb_xadj, arrayb_adj,
-                    arrayb_ew);
-
-  /*
-  CopyArrayToCuspArray<typename cuspMatrix::row_offsets_array_type, typename
-  KernelHandle::idx_array_type> Aforward(A.row_offsets, row_mapA);
-  Kokkos::parallel_for (my_exec_space (0, m + 1) , Aforward);
-  Kokkos::parallel_for (my_exec_space (0, n + 1) , CopyArrayToCuspArray<typename
-  cuspMatrix::row_offsets_array_type, typename
-  KernelHandle::idx_array_type>(B.row_offsets, row_mapB));
-
-  Kokkos::parallel_for (my_exec_space (0, entriesA.extent(0)) ,
-  CopyArrayToCuspArray<typename cuspMatrix::column_indices_array_type, typename
-  KernelHandle::idx_edge_array_type>(A.column_indices, entriesA));
-  Kokkos::parallel_for (my_exec_space (0, entriesB.extent(0)) ,
-  CopyArrayToCuspArray<typename cuspMatrix::column_indices_array_type, typename
-  KernelHandle::idx_edge_array_type>(B.column_indices, entriesB));
-
-  Kokkos::parallel_for (my_exec_space (0, valuesA.extent(0)) ,
-  CopyArrayToCuspArray<typename cuspMatrix::values_array_type, typename
-  KernelHandle::value_array_type>(A.values, valuesA)); Kokkos::parallel_for
-  (my_exec_space (0, valuesB.extent(0)) , CopyArrayToCuspArray<typename
-  cuspMatrix::values_array_type, typename
-  KernelHandle::value_array_type>(B.values, valuesB));
-  */
-
-  typedef typename cusp::csr_matrix<idx, value_type, cusp::device_memory>
-      cuspMatrix;
-  // typedef cuspMatrix_View cuspMatrix;
-  cuspMatrix C;
-
-  Kokkos::Timer timer1;
-  cusp::multiply(A, B, C);
-  KernelHandle::HandleExecSpace().fence();
-  std::cout << "Actual CUSP SPMM Time:" << timer1.seconds() << std::endl;
-
-  // std::cout << " C.column_indices.size():" <<  C.column_indices.size() <<
-  // std::endl; std::cout << " C.values.size():" <<  C.values.size() <<
-  // std::endl; row_mapC = typename
-  // cin_row_index_view_type::non_const_type("rowmapC", m + 1);
-
-  handle->set_c_nnz(C.values.size());
-
-  entriesC = typename cin_nonzero_index_view_type::non_const_type(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "EntriesC"),
-      C.column_indices.size());
-  valuesC = typename cin_nonzero_value_view_type::non_const_type(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"),
-      C.values.size());
-
-  Kokkos::parallel_for(
-      my_exec_space(0, m + 1),
-      CopyArrayToCuspArray<typename cin_row_index_view_type::non_const_type,
-                           idx>(
-          row_mapC, (idx *)thrust::raw_pointer_cast(C.row_offsets.data())));
-  Kokkos::parallel_for(
-      my_exec_space(0, C.column_indices.size()),
-      CopyArrayToCuspArray<typename cin_nonzero_index_view_type::non_const_type,
-                           idx>(
-          entriesC, (idx *)thrust::raw_pointer_cast(C.column_indices.data())));
-  Kokkos::parallel_for(
-      my_exec_space(0, C.values.size()),
-      CopyArrayToCuspArray<typename cin_nonzero_value_view_type::non_const_type,
-                           value_type>(
-          valuesC, (value_type *)thrust::raw_pointer_cast(C.values.data())));
-
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)entriesC;
-  (void)valuesA;
-  (void)valuesB;
-  (void)valuesC;
-  throw std::runtime_error("CUSP IS NOT DEFINED\n");
-  // return;
-#endif
-}
-}  // namespace Impl
-
-}  // namespace KokkosSparse
-#endif
diff --git a/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp
deleted file mode 100644
index 30e2fdbd79..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_cuSPARSE_impl.hpp
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef _KOKKOSSPGEMMCUSPARSE_HPP
-#define _KOKKOSSPGEMMCUSPARSE_HPP
-
-#include "KokkosKernels_Controls.hpp"
-#include "KokkosSparse_Utils_cusparse.hpp"
-
-namespace KokkosSparse {
-
-namespace Impl {
-
-template <
-    typename KernelHandle, typename ain_row_index_view_type,
-    typename ain_nonzero_index_view_type, typename bin_row_index_view_type,
-    typename bin_nonzero_index_view_type, typename cin_row_index_view_type>
-void cuSPARSE_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-                       typename KernelHandle::nnz_lno_t n,
-                       typename KernelHandle::nnz_lno_t k,
-                       ain_row_index_view_type row_mapA,
-                       ain_nonzero_index_view_type entriesA,
-
-                       bool transposeA, bin_row_index_view_type row_mapB,
-                       bin_nonzero_index_view_type entriesB, bool transposeB,
-                       cin_row_index_view_type row_mapC) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-  using device1 = typename ain_row_index_view_type::device_type;
-  using device2 = typename ain_nonzero_index_view_type::device_type;
-  using idx     = typename KernelHandle::nnz_lno_t;
-
-  // TODO this is not correct, check memory space.
-  if (std::is_same<Kokkos::Cuda, device1>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n");
-    // return;
-  }
-  if (std::is_same<Kokkos::Cuda, device2>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n");
-    // return;
-  }
-
-  // CUDA_VERSION coming along with CUDAToolkit is easier to find than
-  // CUSPARSE_VERSION
-#if (CUDA_VERSION >= 11040)
-  // Newest versions of cuSPARSE have the generic SpGEMM interface, with "reuse"
-  // functions.
-  if (!std::is_same<typename std::remove_cv<idx>::type, int>::value ||
-      !std::is_same<
-          typename std::remove_cv<typename KernelHandle::size_type>::type,
-          int>::value) {
-    throw std::runtime_error(
-        "cusparseSpGEMMreuse requires local ordinals to be 32-bit integer.");
-  }
-
-  handle->set_sort_option(1);  // tells users the output is sorted
-  handle->create_cusparse_spgemm_handle(transposeA, transposeB);
-  typename KernelHandle::cuSparseSpgemmHandleType *h =
-      handle->get_cusparse_spgemm_handle();
-
-  // Follow
-  // https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm_reuse
-  void *buffer1      = NULL;
-  void *buffer2      = NULL;
-  size_t bufferSize1 = 0;
-  size_t bufferSize2 = 0;
-
-  // When nnz is not zero, cusparseCreateCsr insists non-null a value pointer,
-  // which however is not available in this function. So we fake it with the
-  // entries instead. Fortunately, it seems cupsarse does not access that in the
-  // symbolic phase.
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_A, m, n, entriesA.extent(0), (void *)row_mapA.data(),
-      (void *)entriesA.data(), (void *)entriesA.data() /*fake*/,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
-      h->scalarType));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_B, n, k, entriesB.extent(0), (void *)row_mapB.data(),
-      (void *)entriesB.data(), (void *)entriesB.data() /*fake*/,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
-      h->scalarType));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_C, m, k, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
-
-  //----------------------------------------------------------------------
-  // ask bufferSize1 bytes for external memory
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_workEstimation(
-      h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
-      h->alg, h->spgemmDescr, &bufferSize1, NULL));
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer1, bufferSize1));
-  // inspect matrices A and B to understand the memory requirement for the next
-  // step
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_workEstimation(
-      h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
-      h->alg, h->spgemmDescr, &bufferSize1, buffer1));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer1));
-
-  //----------------------------------------------------------------------
-  // Compute nnz of C
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_nnz(
-      h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
-      h->alg, h->spgemmDescr, &bufferSize2, NULL, &h->bufferSize3, NULL,
-      &h->bufferSize4, NULL));
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer2, bufferSize2));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer3, h->bufferSize3));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer4, h->bufferSize4));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_nnz(
-      h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
-      h->alg, h->spgemmDescr, &bufferSize2, buffer2, &h->bufferSize3,
-      h->buffer3, &h->bufferSize4, h->buffer4));
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer2));
-
-  int64_t C_nrow, C_ncol, C_nnz;
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseSpMatGetSize(h->descr_C, &C_nrow, &C_ncol, &C_nnz));
-  if (C_nnz > std::numeric_limits<int>::max()) {
-    throw std::runtime_error("nnz of C overflowed over 32-bit int\n");
-  }
-  handle->set_c_nnz(C_nnz);
-  h->C_populated = false;  // sparsity pattern of C is not set yet
-  (void)row_mapC;
-
-#elif defined(CUSPARSE_VERSION) && (11000 <= CUSPARSE_VERSION)
-  using scalar_type = typename KernelHandle::nnz_scalar_t;
-  // cuSPARSE from CUDA 11.0-11.3 (inclusive) supports the new "generic" SpGEMM
-  // interface, just not the "reuse" set of functions. This means compute must
-  // be called in both symbolic and numeric (otherwise, the NNZ of C can't be
-  // known by symbolic)
-  if (!std::is_same<typename std::remove_cv<idx>::type, int>::value ||
-      !std::is_same<
-          typename std::remove_cv<typename KernelHandle::size_type>::type,
-          int>::value) {
-    throw std::runtime_error(
-        "cusparseSpGEMM requires local ordinals to be 32-bit integer.");
-  }
-
-  handle->set_sort_option(1);  // tells users the output is sorted
-  handle->create_cusparse_spgemm_handle(transposeA, transposeB);
-  typename KernelHandle::cuSparseSpgemmHandleType *h =
-      handle->get_cusparse_spgemm_handle();
-
-  // Follow
-  // https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm
-
-  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
-  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
-
-  // In non-reuse interface, forced to give A,B dummy values to
-  // cusparseSpGEMM_compute. And it actually reads them, so they must be
-  // allocated and of the correct type. This compute will be called again in
-  // numeric with the real values.
-  //
-  // The dummy values can be uninitialized. cusparseSpGEMM_compute does
-  // not remove numerical zeros from the sparsity pattern.
-  void *dummyValues;
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(
-      &dummyValues,
-      sizeof(scalar_type) * std::max(entriesA.extent(0), entriesB.extent(0))));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_A, m, n, entriesA.extent(0), (void *)row_mapA.data(),
-      (void *)entriesA.data(), dummyValues, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_B, n, k, entriesB.extent(0), (void *)row_mapB.data(),
-      (void *)entriesB.data(), dummyValues, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
-      &h->descr_C, m, k, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
-
-  //----------------------------------------------------------------------
-  // query workEstimation buffer size, allocate, then call again with buffer.
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, h->alg, h->spgemmDescr, &h->bufferSize3,
-      NULL));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer3, h->bufferSize3));
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, h->alg, h->spgemmDescr, &h->bufferSize3,
-      h->buffer3));
-
-  //----------------------------------------------------------------------
-  // query compute buffer size, allocate, then call again with buffer.
-
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
-      &h->bufferSize4, NULL));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer4, h->bufferSize4));
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
-      &h->bufferSize4, h->buffer4));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dummyValues));
-
-  int64_t C_nrow, C_ncol, C_nnz;
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseSpMatGetSize(h->descr_C, &C_nrow, &C_ncol, &C_nnz));
-  if (C_nnz > std::numeric_limits<int>::max()) {
-    throw std::runtime_error("nnz of C overflowed over 32-bit int\n");
-  }
-  handle->set_c_nnz(C_nnz);
-  h->C_populated = false;  // sparsity pattern of C is not set yet
-  (void)row_mapC;
-
-#else
-
-  if (std::is_same<idx, int>::value &&
-      std::is_same<typename KernelHandle::size_type, int>::value) {
-    const idx *a_xadj = (const idx *)row_mapA.data();
-    const idx *b_xadj = (const idx *)row_mapB.data();
-    idx *c_xadj       = (idx *)row_mapC.data();
-
-    const idx *a_adj = entriesA.data();
-    const idx *b_adj = entriesB.data();
-    handle->create_cusparse_spgemm_handle(transposeA, transposeB);
-    typename KernelHandle::cuSparseSpgemmHandleType *h =
-        handle->get_cusparse_spgemm_handle();
-
-    int nnzA = entriesA.extent(0);
-    int nnzB = entriesB.extent(0);
-
-    int baseC, nnzC;
-    int *nnzTotalDevHostPtr = &nnzC;
-
-    handle->set_sort_option(1);  // tells users the output is sorted
-    cusparseXcsrgemmNnz(h->handle, h->transA, h->transB, (int)m, (int)n, (int)k,
-                        h->a_descr, nnzA, (int *)a_xadj, (int *)a_adj,
-                        h->b_descr, nnzB, (int *)b_xadj, (int *)b_adj,
-                        h->c_descr, (int *)c_xadj, nnzTotalDevHostPtr);
-
-    if (NULL != nnzTotalDevHostPtr) {
-      nnzC = *nnzTotalDevHostPtr;
-    } else {
-      cudaMemcpy(&nnzC, c_xadj + m, sizeof(int), cudaMemcpyDeviceToHost);
-      cudaMemcpy(&baseC, c_xadj, sizeof(int), cudaMemcpyDeviceToHost);
-      nnzC -= baseC;
-    }
-    handle->set_c_nnz(nnzC);
-    // entriesC =
-    // cin_nonzero_index_view_type(Kokkos::view_alloc(Kokkos::WithoutInitializing,
-    // "entriesC"), nnzC);
-  } else {
-    throw std::runtime_error(
-        "CUSPARSE requires local ordinals to be integer.\n");
-    // return;
-  }
-#endif
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)transposeA;
-  (void)transposeB;
-  throw std::runtime_error("CUSPARSE IS NOT DEFINED\n");
-  // return;
-#endif
-}
-
-template <
-    typename KernelHandle, typename ain_row_index_view_type,
-    typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void cuSPARSE_apply(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    ain_row_index_view_type row_mapA, ain_nonzero_index_view_type entriesA,
-    ain_nonzero_value_view_type valuesA,
-
-    bool /* transposeA */, bin_row_index_view_type row_mapB,
-    bin_nonzero_index_view_type entriesB, bin_nonzero_value_view_type valuesB,
-    bool /* transposeB */, cin_row_index_view_type row_mapC,
-    cin_nonzero_index_view_type entriesC, cin_nonzero_value_view_type valuesC) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-  typedef typename KernelHandle::nnz_lno_t idx;
-
-  typedef typename KernelHandle::nnz_scalar_t scalar_type;
-
-  typedef typename ain_row_index_view_type::device_type device1;
-  typedef typename ain_nonzero_index_view_type::device_type device2;
-  typedef typename ain_nonzero_value_view_type::device_type device3;
-
-  if (std::is_same<Kokkos::Cuda, device1>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n");
-    // return;
-  }
-  if (std::is_same<Kokkos::Cuda, device2>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n");
-    // return;
-  }
-  if (std::is_same<Kokkos::Cuda, device3>::value) {
-    throw std::runtime_error(
-        "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n");
-    // return;
-  }
-#if (CUDA_VERSION >= 11040)
-  typename KernelHandle::cuSparseSpgemmHandleType *h =
-      handle->get_cusparse_spgemm_handle();
-
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_A, (void *)row_mapA.data(),
-                             (void *)entriesA.data(), (void *)valuesA.data()));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_B, (void *)row_mapB.data(),
-                             (void *)entriesB.data(), (void *)valuesB.data()));
-
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_C, (void *)row_mapC.data(),
-                             (void *)entriesC.data(), (void *)valuesC.data()));
-
-  if (!h->C_populated) {
-    cusparseSpGEMMreuse_copy(h->cusparseHandle, h->opA, h->opB, h->descr_A,
-                             h->descr_B, h->descr_C, h->alg, h->spgemmDescr,
-                             &h->bufferSize5, NULL);
-    cudaMalloc((void **)&h->buffer5, h->bufferSize5);
-    cusparseSpGEMMreuse_copy(h->cusparseHandle, h->opA, h->opB, h->descr_A,
-                             h->descr_B, h->descr_C, h->alg, h->spgemmDescr,
-                             &h->bufferSize5, h->buffer5);
-    cudaFree(h->buffer3);
-    h->buffer3     = NULL;
-    h->C_populated = true;
-  }
-
-  // C' = alpha * opA(A) * opB(B) + beta * C
-  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
-  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
-
-  // alpha, beta are on host, but since we use singleton on the cusparse
-  // handle, we save/restore the pointer mode to not interference with
-  // others' use
-  cusparsePointerMode_t oldPtrMode;
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseGetPointerMode(h->cusparseHandle, &oldPtrMode));
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseSetPointerMode(h->cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_compute(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, h->alg, h->spgemmDescr));
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseSetPointerMode(h->cusparseHandle, oldPtrMode));
-
-  (void)m;
-  (void)n;
-  (void)k;
-
-#elif (CUSPARSE_VERSION >= 11000)
-  using scalar_type = typename KernelHandle::nnz_scalar_t;
-  const auto alpha  = Kokkos::ArithTraits<scalar_type>::one();
-  const auto beta   = Kokkos::ArithTraits<scalar_type>::zero();
-  typename KernelHandle::cuSparseSpgemmHandleType *h =
-      handle->get_cusparse_spgemm_handle();
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_A, (void *)row_mapA.data(),
-                             (void *)entriesA.data(), (void *)valuesA.data()));
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_B, (void *)row_mapB.data(),
-                             (void *)entriesB.data(), (void *)valuesB.data()));
-  KOKKOS_CUSPARSE_SAFE_CALL(
-      cusparseCsrSetPointers(h->descr_C, (void *)row_mapC.data(),
-                             (void *)entriesC.data(), (void *)valuesC.data()));
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
-      &h->bufferSize4, h->buffer4));
-  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_copy(
-      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
-      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr));
-  (void)m;
-  (void)n;
-  (void)k;
-
-#else
-
-  if (std::is_same<idx, int>::value) {
-    int *a_xadj = (int *)row_mapA.data();
-    int *b_xadj = (int *)row_mapB.data();
-    int *c_xadj = (int *)row_mapC.data();
-
-    int *a_adj = (int *)entriesA.data();
-    int *b_adj = (int *)entriesB.data();
-    int *c_adj = (int *)entriesC.data();
-
-    typename KernelHandle::cuSparseSpgemmHandleType *h =
-        handle->get_cusparse_spgemm_handle();
-
-    int nnzA = entriesA.extent(0);
-    int nnzB = entriesB.extent(0);
-
-    scalar_type *a_ew = (scalar_type *)valuesA.data();
-    scalar_type *b_ew = (scalar_type *)valuesB.data();
-    scalar_type *c_ew = (scalar_type *)valuesC.data();
-
-    if (std::is_same<scalar_type, float>::value) {
-      cusparseScsrgemm(h->handle, h->transA, h->transB, m, n, k, h->a_descr,
-                       nnzA, (float *)a_ew, a_xadj, a_adj, h->b_descr, nnzB,
-                       (float *)b_ew, b_xadj, b_adj, h->c_descr, (float *)c_ew,
-                       c_xadj, c_adj);
-    } else if (std::is_same<scalar_type, double>::value) {
-      cusparseDcsrgemm(h->handle, h->transA, h->transB, m, n, k, h->a_descr,
-                       nnzA, (double *)a_ew, a_xadj, a_adj, h->b_descr, nnzB,
-                       (double *)b_ew, b_xadj, b_adj, h->c_descr,
-                       (double *)c_ew, c_xadj, c_adj);
-    } else {
-      throw std::runtime_error(
-          "CUSPARSE requires float or double values. cuComplex and "
-          "cuDoubleComplex are not implemented yet.\n");
-      // return;
-    }
-
-  } else {
-    throw std::runtime_error(
-        "CUSPARSE requires local ordinals to be integer.\n");
-    // return;
-  }
-#endif
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)entriesC;
-  (void)valuesA;
-  (void)valuesB;
-  (void)valuesC;
-  throw std::runtime_error("CUSPARSE IS NOT DEFINED\n");
-  // return;
-#endif
-}
-}  // namespace Impl
-}  // namespace KokkosSparse
-
-#endif
diff --git a/sparse/impl/KokkosSparse_spgemm_impl.hpp b/sparse/impl/KokkosSparse_spgemm_impl.hpp
index 776579ded2..e0b3f7b143 100644
--- a/sparse/impl/KokkosSparse_spgemm_impl.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_impl.hpp
@@ -829,6 +829,42 @@ class KokkosSPGEMM {
     }
     return po2_num_chunks;
   }
+
+ public:
+  void compute_row_flops() {
+    auto sh                    = this->handle->get_spgemm_handle();
+    nnz_lno_t maxNumRoughZeros = 0;
+    size_t overall_flops       = 0;
+    Kokkos::Timer timer1;
+    auto new_row_mapB_begin =
+        Kokkos::subview(row_mapB, std::make_pair(nnz_lno_t(0), b_row_cnt));
+    auto new_row_mapB_end =
+        Kokkos::subview(row_mapB, std::make_pair(nnz_lno_t(1), b_row_cnt + 1));
+    row_lno_persistent_work_view_t flops_per_row(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "original row flops"),
+        a_row_cnt);
+
+    // get maximum row flops.
+    maxNumRoughZeros = this->getMaxRoughRowNNZ(
+        a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end,
+        flops_per_row.data());
+
+    // calculate overal flops.
+    KokkosKernels::Impl::kk_reduce_view2<row_lno_persistent_work_view_t,
+                                         MyExecSpace>(a_row_cnt, flops_per_row,
+                                                      overall_flops);
+    if (KOKKOSKERNELS_VERBOSE) {
+      std::cout << "\tOriginal Max Row Flops:" << maxNumRoughZeros << std::endl;
+      std::cout << "\tOriginal overall_flops Flops:" << overall_flops
+                << std::endl;
+      std::cout << "\ttOriginal Max Row Flop Calc Time:" << timer1.seconds()
+                << std::endl;
+    }
+    sh->original_max_row_flops = maxNumRoughZeros;
+    sh->original_overall_flops = overall_flops;
+    sh->row_flops              = flops_per_row;
+    sh->set_computed_rowflops();
+  }
 };
 
 }  // namespace Impl
diff --git a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp
index 173a58b568..94b16680a8 100644
--- a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp
@@ -92,40 +92,7 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
       std::cout << "SYMBOLIC PHASE" << std::endl;
     }
     // first calculate the number of original flops required.
-    {
-      nnz_lno_t maxNumRoughZeros = 0;
-      size_t overall_flops       = 0;
-      Kokkos::Timer timer1;
-      auto new_row_mapB_begin =
-          Kokkos::subview(row_mapB, std::make_pair(nnz_lno_t(0), b_row_cnt));
-      auto new_row_mapB_end = Kokkos::subview(
-          row_mapB, std::make_pair(nnz_lno_t(1), b_row_cnt + 1));
-      row_lno_persistent_work_view_t flops_per_row(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "original row flops"),
-          a_row_cnt);
-
-      // get maximum row flops.
-      maxNumRoughZeros = this->getMaxRoughRowNNZ(
-          a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end,
-          flops_per_row.data());
-
-      // calculate overal flops.
-      KokkosKernels::Impl::kk_reduce_view2<row_lno_persistent_work_view_t,
-                                           MyExecSpace>(
-          a_row_cnt, flops_per_row, overall_flops);
-      if (KOKKOSKERNELS_VERBOSE) {
-        std::cout << "\tOriginal Max Row Flops:" << maxNumRoughZeros
-                  << std::endl;
-        std::cout << "\tOriginal overall_flops Flops:" << overall_flops
-                  << std::endl;
-        std::cout << "\ttOriginal Max Row Flop Calc Time:" << timer1.seconds()
-                  << std::endl;
-      }
-      this->handle->get_spgemm_handle()->original_max_row_flops =
-          maxNumRoughZeros;
-      this->handle->get_spgemm_handle()->original_overall_flops = overall_flops;
-      this->handle->get_spgemm_handle()->row_flops              = flops_per_row;
-    }
+    this->compute_row_flops();
 
     // number of rows and nnzs
     nnz_lno_t n   = this->row_mapB.extent(0) - 1;
diff --git a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
index 50d3681eed..6683fb0bc4 100644
--- a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp
@@ -1286,7 +1286,8 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
       this->handle->get_spgemm_handle()->row_flops;
   size_t original_overall_flops =
       this->handle->get_spgemm_handle()->original_overall_flops;
-  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz();
+  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()
+                          ->template get_max_result_nnz<c_row_view_t>(rowmapC_);
   size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz();
 
   typedef KokkosKernels::Impl::UniformMemoryPool<MyTempMemorySpace, nnz_lno_t>
diff --git a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
index 9fc1b8fe72..9ee1cb2673 100644
--- a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
@@ -1699,32 +1699,15 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
     std::cout << "\tStructureC Kernel time:" << timer1.seconds() << std::endl
               << std::endl;
   }
-  // we need to find the max nnz in a row.
-  {
-    Kokkos::Timer timer1_;
-    size_type c_max_nnz = 0;
-    if (m > 0) {
-      KokkosKernels::Impl::view_reduce_max<c_row_view_t, MyExecSpace>(
-          m, rowmapC, c_max_nnz);
-      MyExecSpace().fence();
-    }
-    this->handle->get_spgemm_handle()->set_max_result_nnz(c_max_nnz);
-
-    if (KOKKOSKERNELS_VERBOSE) {
-      std::cout << "\tReduce Max Row Size Time:" << timer1_.seconds()
-                << std::endl;
-    }
-  }
-
+  typename c_row_view_t::non_const_value_type c_nnz_size = 0;
   KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<c_row_view_t,
-                                                        MyExecSpace>(m + 1,
-                                                                     rowmapC);
-  MyExecSpace().fence();
-  auto d_c_nnz_size = Kokkos::subview(rowmapC, m);
-  auto h_c_nnz_size = Kokkos::create_mirror_view(d_c_nnz_size);
-  Kokkos::deep_copy(h_c_nnz_size, d_c_nnz_size);
-  typename c_row_view_t::non_const_value_type c_nnz_size = h_c_nnz_size();
+                                                        MyExecSpace>(
+      m + 1, rowmapC, c_nnz_size);
   this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size);
+  nnz_lno_t c_max_nnz =
+      KokkosSparse::Impl::graph_max_degree<MyExecSpace, size_type,
+                                           c_row_view_t>(rowmapC);
+  this->handle->get_spgemm_handle()->set_max_result_nnz(c_max_nnz);
 }  // end: symbolic_c_no_compression
 
 template <typename HandleType, typename a_row_view_t_,
@@ -2232,32 +2215,15 @@ void KokkosSPGEMM<
 	  }
   }
 #endif
-  // if we need to find the max nnz in a row.
-  {
-    Kokkos::Timer timer1_;
-    size_type c_max_nnz = 0;
-    if (m > 0) {
-      KokkosKernels::Impl::view_reduce_max<c_row_view_t, MyExecSpace>(
-          m, rowmapC, c_max_nnz);
-      MyExecSpace().fence();
-    }
-    this->handle->get_spgemm_handle()->set_max_result_nnz(c_max_nnz);
-
-    if (KOKKOSKERNELS_VERBOSE) {
-      std::cout << "\tReduce Max Row Size Time:" << timer1_.seconds()
-                << std::endl;
-    }
-  }
-
+  typename c_row_view_t::non_const_value_type c_nnz_size = 0;
   KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<c_row_view_t,
-                                                        MyExecSpace>(m + 1,
-                                                                     rowmapC);
-  MyExecSpace().fence();
-  auto d_c_nnz_size = Kokkos::subview(rowmapC, m);
-  auto h_c_nnz_size = Kokkos::create_mirror_view(d_c_nnz_size);
-  Kokkos::deep_copy(h_c_nnz_size, d_c_nnz_size);
-  typename c_row_view_t::non_const_value_type c_nnz_size = h_c_nnz_size();
+                                                        MyExecSpace>(
+      m + 1, rowmapC, c_nnz_size);
   this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size);
+  nnz_lno_t c_max_nnz =
+      KokkosSparse::Impl::graph_max_degree<MyExecSpace, size_type,
+                                           c_row_view_t>(rowmapC);
+  this->handle->get_spgemm_handle()->set_max_result_nnz(c_max_nnz);
 }  // symbolic_c (end)
 
 template <typename HandleType, typename a_row_view_t_,
diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp
index e0035eb8de..fb0ccd2353 100644
--- a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp
@@ -1284,7 +1284,9 @@ void KokkosSPGEMM<HandleType, a_row_view_t_, a_lno_nnz_view_t_,
       this->handle->get_spgemm_handle()->row_flops;
   size_t original_overall_flops =
       this->handle->get_spgemm_handle()->original_overall_flops;
-  nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz();
+  nnz_lno_t max_nnz =
+      this->handle->get_spgemm_handle()
+          ->template get_max_result_nnz<c_row_view_t>(row_mapC_);
   size_type overall_nnz   = this->handle->get_spgemm_handle()->get_c_nnz();
   nnz_lno_t min_hash_size = 1;
   size_t chunksize        = 1;
diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp
index d95425c352..2a2de2815f 100644
--- a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp
@@ -51,10 +51,12 @@
 #include "KokkosKernels_Handle.hpp"
 // Include the actual functors
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+#include "KokkosSparse_spgemm_symbolic.hpp"
 #include "KokkosSparse_spgemm_impl.hpp"
 #include "KokkosSparse_spgemm_jacobi_denseacc_impl.hpp"
 #include "KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp"
 #include "KokkosSparse_spgemm_jacobi_seq_impl.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #endif
 
 namespace KokkosSparse {
@@ -113,7 +115,7 @@ struct spgemm_jacobi_eti_spec_avail {
   };
 
 // Include the actual specialization declarations
-#include <KokkosSparse_spgemm_tpl_spec_avail.hpp>
+#include <KokkosSparse_spgemm_jacobi_tpl_spec_avail.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_avail.hpp>
 
 namespace KokkosSparse {
@@ -187,7 +189,27 @@ struct SPGEMM_JACOBI<KernelHandle, a_size_view_t_, a_lno_view_t,
     spgemmHandleType *sh = handle->get_spgemm_handle();
     if (!sh->is_symbolic_called()) {
       throw std::runtime_error(
-          "Call spgemm symbolic before calling SpGEMM jacobi");
+          "KokkosSparse::spgemm_jacobi: must first call spgemm_symbolic with "
+          "the same handle.");
+    }
+    if (!sh->are_rowptrs_computed()) {
+      // Call symbolic, and make sure rowptrs are populated. This will not
+      // duplicate any work if the user has already called symbolic. Must also
+      // cast away constness of row_mapC.
+      using c_size_view_t_nc = typename c_size_view_t_::non_const_type;
+      using c_size_type      = typename c_size_view_t_::non_const_value_type;
+      c_size_view_t_nc row_mapC_nc(const_cast<c_size_type *>(row_mapC.data()),
+                                   row_mapC.extent(0));
+      KokkosSparse::Experimental::spgemm_symbolic(
+          handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB,
+          transposeB, row_mapC_nc, true);
+    }
+    if (!sh->are_rowflops_computed()) {
+      KokkosSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t,
+                   b_size_view_t_, b_lno_view_t, b_scalar_view_t>
+          kspgemm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA,
+                  row_mapB, entriesB, valuesB, transposeB);
+      kspgemm.compute_row_flops();
     }
 
     if (sh->get_algorithm_type() == SPGEMM_SERIAL) {
@@ -206,6 +228,10 @@ struct SPGEMM_JACOBI<KernelHandle, a_size_view_t_, a_lno_view_t,
       kspgemm.KokkosSPGEMM_jacobi_sparseacc(row_mapC, entriesC, valuesC, omega,
                                             dinv, myExecSpace);
     }
+    // Current implementation does not produce sorted matrix
+    // TODO: remove this call when impl sorts
+    KokkosSparse::sort_crs_matrix<typename KernelHandle::HandleExecSpace>(
+        row_mapC, entriesC, valuesC);
   }
 };
 
@@ -292,7 +318,7 @@ struct SPGEMM_JACOBI<KernelHandle, a_size_view_t_, a_lno_view_t,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
       false, true>;
 
-#include <KokkosSparse_spgemm_tpl_spec_decl.hpp>
+#include <KokkosSparse_spgemm_jacobi_tpl_spec_decl.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp>
 
 #endif
diff --git a/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp b/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
deleted file mode 100644
index e6f0c26497..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp
+++ /dev/null
@@ -1,685 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-
-#ifndef _KOKKOSSPGEMMMKL2_HPP
-#define _KOKKOSSPGEMMMKL2_HPP
-
-//#define KOKKOSKERNELS_ENABLE_TPL_MKL
-
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-#include "mkl.h"
-#endif
-
-#include "KokkosKernels_Utils.hpp"
-
-namespace KokkosSparse {
-namespace Impl {
-
-template <typename KernelHandle, typename in_row_index_view_type,
-          typename in_nonzero_index_view_type, typename bin_row_index_view_type,
-          typename bin_nonzero_index_view_type,
-          typename cin_row_index_view_type>
-void mkl2phase_symbolic(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    in_row_index_view_type row_mapA, in_nonzero_index_view_type entriesA,
-
-    bool transposeA, bin_row_index_view_type row_mapB,
-    bin_nonzero_index_view_type entriesB,
-
-    bool transposeB, cin_row_index_view_type row_mapC, bool verbose = false) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-
-  typedef typename KernelHandle::nnz_lno_t idx;
-
-  typedef typename KernelHandle::HandlePersistentMemorySpace
-      HandlePersistentMemorySpace;
-
-  typedef typename Kokkos::View<int *, HandlePersistentMemorySpace>
-      int_persistent_work_view_t;
-
-  typedef typename KernelHandle::HandleExecSpace MyExecSpace;
-
-  if (std::is_same<idx, int>::value) {
-    int_persistent_work_view_t a_xadj_v, b_xadj_v;
-
-    const int max_integer = 2147483647;
-    if (entriesB.extent(0) > max_integer || entriesA.extent(0) > max_integer) {
-      throw std::runtime_error(
-          "MKL requires integer values for size type for SPGEMM. Copying to "
-          "integer will cause overflow.\n");
-    }
-
-    int *a_adj = (int *)entriesA.data();
-    int *b_adj = (int *)entriesB.data();
-
-    int *a_xadj = (int *)row_mapA.data();
-    int *b_xadj = (int *)row_mapB.data();
-    int *c_xadj = (int *)row_mapC.data();
-
-    if (handle->mkl_convert_to_1base) {
-      handle->persistent_a_xadj = int_persistent_work_view_t("tmpa", m + 1);
-      handle->persistent_b_xadj = int_persistent_work_view_t("tmpb", n + 1);
-      handle->persistent_c_xadj = int_persistent_work_view_t("tmpc", m + 1);
-      int_persistent_work_view_t a_plus_one("a_plus_one", entriesA.extent(0));
-      int_persistent_work_view_t b_plus_one("b_plus_one", entriesB.extent(0));
-      handle->persistent_a_adj = a_plus_one;
-      handle->persistent_b_adj = b_plus_one;
-
-      KokkosKernels::Impl::kk_a_times_x_plus_b<int_persistent_work_view_t,
-                                               in_row_index_view_type, int, int,
-                                               MyExecSpace>(
-          m + 1, handle->persistent_a_xadj, row_mapA, 1, 1);
-      KokkosKernels::Impl::kk_a_times_x_plus_b<int_persistent_work_view_t,
-                                               bin_row_index_view_type, int,
-                                               int, MyExecSpace>(
-          n + 1, handle->persistent_b_xadj, row_mapB, 1, 1);
-      KokkosKernels::Impl::kk_a_times_x_plus_b<int_persistent_work_view_t,
-                                               in_nonzero_index_view_type, int,
-                                               int, MyExecSpace>(
-          entriesA.extent(0), a_plus_one, entriesA, 1, 1);
-      KokkosKernels::Impl::kk_a_times_x_plus_b<int_persistent_work_view_t,
-                                               bin_nonzero_index_view_type, int,
-                                               int, MyExecSpace>(
-          entriesB.extent(0), b_plus_one, entriesB, 1, 1);
-
-      a_adj  = (int *)handle->persistent_a_adj.data();
-      b_adj  = (int *)handle->persistent_b_adj.data();
-      a_xadj = handle->persistent_a_xadj.data();
-      b_xadj = handle->persistent_b_xadj.data();
-      c_xadj = handle->persistent_c_xadj.data();
-    }
-
-#if __INTEL_MKL__ < 2018
-    (void)transposeA;
-    (void)transposeB;  // supress unused-parameter warning
-    char trans      = 'N';
-    MKL_INT request = 1;
-    MKL_INT sort    = handle->get_mkl_sort_option();
-    MKL_INT mklm = m, mkln = n, mklk = k;
-    MKL_INT info = 0;
-
-    double *mynullptr = NULL;
-    int *mynulladj    = NULL;
-    const int nzmax   = 0;
-
-    /*
-    KokkosKernels::Impl::print_1Dview(handle->persistent_a_xadj);
-    KokkosKernels::Impl::print_1Dview(a_plus_one);
-    KokkosKernels::Impl::print_1Dview(handle->persistent_b_xadj);
-    KokkosKernels::Impl::print_1Dview(b_plus_one);
-    */
-    Kokkos::Timer timer1;
-
-    mkl_dcsrmultcsr(&trans, &request, &sort, &mklm, &mkln, &mklk, mynullptr,
-                    a_adj, a_xadj, mynullptr, b_adj, b_xadj, mynullptr,
-                    mynulladj, c_xadj, &nzmax, &info);
-
-    if (verbose) {
-      std::cout << "Sort:" << sort
-                << " Actual MKL2 Symbolic Time:" << timer1.seconds()
-                << std::endl;
-    }
-
-    if (handle->mkl_convert_to_1base) {
-      KokkosKernels::Impl::kk_a_times_x_plus_b<cin_row_index_view_type,
-                                               int_persistent_work_view_t, int,
-                                               int, MyExecSpace>(
-          m + 1, row_mapC, handle->persistent_c_xadj, 1, -1);
-      handle->set_c_nnz(row_mapC(m));
-    } else {
-      handle->set_c_nnz(row_mapC(m) - 1);
-    }
-#endif  // __INTEL_MKL__ < 2018
-
-#if __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2
-    MKL_INT mklm = m, mkln = n;
-    double *mynullptr = NULL;
-
-    sparse_matrix_t A;
-    sparse_matrix_t B;
-    sparse_matrix_t C;
-
-    // Goal: Set c_xadj (which is the rowptr) from C
-
-    if (handle->mkl_convert_to_1base) {  // a*, b* already converted to 1base
-                                         // above...
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ONE, mklm, mkln, a_xadj,
-                                  a_xadj + 1, a_adj, mynullptr)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-      }
-
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ONE, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, mynullptr)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-      }
-    } else {
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, mklm, mkln,
-                                  a_xadj, a_xadj + 1, a_adj, mynullptr)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-      }
-
-      if (SPARSE_STATUS_SUCCESS !=
-          mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                  b_xadj + 1, b_adj, mynullptr)) {
-        throw std::runtime_error(
-            "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-      }
-    }
-
-    sparse_operation_t operation;
-    if (transposeA && transposeB) {
-      operation = SPARSE_OPERATION_TRANSPOSE;
-    } else if (!(transposeA || transposeB)) {
-      operation = SPARSE_OPERATION_NON_TRANSPOSE;
-    } else {
-      throw std::runtime_error(
-          "MKL either transpose both matrices, or none for SPGEMM\n");
-    }
-
-    matrix_descr common_mtx_props;
-    common_mtx_props.type = SPARSE_MATRIX_TYPE_GENERAL;
-    common_mtx_props.mode = SPARSE_FILL_MODE_FULL;
-    common_mtx_props.diag = SPARSE_DIAG_NON_UNIT;
-
-    Kokkos::Timer timer1;
-    // options: SPARSE_STAGE_FULL_MULT vs SPARSE_STAGE_NNZ_COUNT then
-    // SPARSE_STAGE_FINALIZE_MULT
-    bool success = SPARSE_STATUS_SUCCESS !=
-                   mkl_sparse_sp2m(operation, common_mtx_props, A, operation,
-                                   common_mtx_props, B, SPARSE_STAGE_NNZ_COUNT,
-                                   &C);  // success is "true" if mkl_sparse_spmm
-                                         // does not return success
-
-    if (verbose) {
-      std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds()
-                << std::endl;
-    }
-
-    if (success) {
-      throw std::runtime_error(
-          "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-    } else {
-      // Copy sparse_matrix_t C results back to input data structure
-      sparse_index_base_t c_indexing;
-      MKL_INT c_rows, c_cols, *rows_end, *columns;  // use c_xadj as rows_start
-      double *values;                               // should return null
-
-      if (SPARSE_STATUS_SUCCESS !=
-          // mkl_sparse_s_export_csr (C, &c_indexing, &c_rows, &c_cols,
-          // &rows_start, &rows_end, &columns, &values))
-          mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, &c_xadj,
-                                  &rows_end, &columns, &values)) {
-        throw std::runtime_error(
-            "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-      }
-
-      //      if (SPARSE_INDEX_BASE_ZERO != c_indexing){
-      //        throw std::runtime_error ("C is not zero based indexed\n");
-      //      }
-      if (handle->mkl_convert_to_1base &&
-          (c_indexing ==
-           SPARSE_INDEX_BASE_ONE)) {  // Need to convert back to base0
-        KokkosKernels::Impl::kk_a_times_x_plus_b<cin_row_index_view_type,
-                                                 int_persistent_work_view_t,
-                                                 int, int, MyExecSpace>(
-            m + 1, row_mapC, handle->persistent_c_xadj, 1, -1);
-        handle->set_c_nnz(row_mapC(m));
-      } else {
-        handle->set_c_nnz(row_mapC(m) - 1);
-      }
-
-    }  // end else !success
-
-    // Cleanup...
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-    }
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-    }
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-    }
-#elif __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ < 2
-    throw std::runtime_error(
-        "Intel MKL version 18 must have update 2 - use intel/18.2.xyz\n");
-    (void)k;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-#else
-    throw std::runtime_error(
-        "Intel MKL versions > 18 are not yet tested/supported\n");
-    (void)k;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-    (void)a_xadj;
-    (void)b_xadj;
-    (void)c_xadj;
-    (void)a_adj;
-    (void)b_adj;
-#endif
-
-  } else {
-    throw std::runtime_error("MKL requires local ordinals to be integer.\n");
-    (void)k;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-  }
-#else   // KOKKOSKERNELS_ENABLE_TPL_MKL
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)transposeA;
-  (void)transposeB;
-  (void)verbose;
-  throw std::runtime_error("MKL IS NOT DEFINED\n");
-#endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
-}
-
-template <
-    typename KernelHandle, typename in_row_index_view_type,
-    typename in_nonzero_index_view_type, typename in_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void mkl2phase_apply(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    in_row_index_view_type row_mapA, in_nonzero_index_view_type entriesA,
-    in_nonzero_value_view_type valuesA,
-
-    bool transposeA, bin_row_index_view_type row_mapB,
-    bin_nonzero_index_view_type entriesB, bin_nonzero_value_view_type valuesB,
-    bool transposeB, cin_row_index_view_type row_mapC,
-    cin_nonzero_index_view_type &entriesC, cin_nonzero_value_view_type &valuesC,
-    bool verbose = false) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-
-  using HandlePersistentMemorySpace =
-      typename KernelHandle::HandlePersistentMemorySpace;
-  using int_persistent_work_view_t =
-      typename Kokkos::View<int *, HandlePersistentMemorySpace>;
-  using idx = typename KernelHandle::nnz_lno_t;
-
-  if (std::is_same<idx, int>::value) {
-    int *a_xadj = (int *)row_mapA.data();
-    int *b_xadj = (int *)row_mapB.data();
-    int *c_xadj = (int *)row_mapC.data();
-
-    int *a_adj = (int *)entriesA.data();
-    int *b_adj = (int *)entriesB.data();
-
-    if (handle->mkl_convert_to_1base) {
-      int_persistent_work_view_t a_xadj_v, b_xadj_v, c_xadj_v;
-      a_xadj = (int *)handle->persistent_a_xadj.data();
-      b_xadj = (int *)handle->persistent_b_xadj.data();
-      c_xadj = (int *)handle->persistent_c_xadj.data();
-      int_persistent_work_view_t a_plus_one = handle->persistent_a_adj;
-      int_persistent_work_view_t b_plus_one = handle->persistent_b_adj;
-
-      a_adj = (int *)a_plus_one.data();
-      b_adj = (int *)b_plus_one.data();
-    }
-
-#if __INTEL_MKL__ < 2018
-    (void)transposeA;
-    (void)transposeB;
-    const value_type *a_ew = valuesA.data();
-    const value_type *b_ew = valuesB.data();
-
-    char trans      = 'N';
-    MKL_INT request = 2;
-    MKL_INT sort    = handle->get_mkl_sort_option();
-    MKL_INT mklm = m, mkln = n, mklk = k;
-    MKL_INT info = 0, nzmax = 2147483647;
-    /*
-          KokkosKernels::Impl::print_1Dview(handle->persistent_a_xadj);
-          KokkosKernels::Impl::print_1Dview(a_plus_one);
-          KokkosKernels::Impl::print_1Dview(handle->persistent_b_xadj);
-          KokkosKernels::Impl::print_1Dview(b_plus_one);
-          KokkosKernels::Impl::print_1Dview(handle->persistent_c_xadj);
-          KokkosKernels::Impl::print_1Dview(valuesA);
-          KokkosKernels::Impl::print_1Dview(valuesB);
-
-
-          std::cout << "A" << std::endl;
-          KokkosKernels::Impl::print_1Dview(row_mapA);
-          KokkosKernels::Impl::print_1Dview(entriesA);
-          std::cout << "B" << std::endl;
-          KokkosKernels::Impl::print_1Dview(row_mapB);
-          KokkosKernels::Impl::print_1Dview(entriesB);
-          std::cout << "c:" << "entriesC:" << entriesC.extent(0) << std::endl;
-          KokkosKernels::Impl::print_1Dview(row_mapC);
-    */
-    Kokkos::Timer timer1;
-
-    if (std::is_same<value_type, float>::value) {
-      mkl_scsrmultcsr(&trans, &request, &sort, &mklm, &mkln, &mklk,
-                      (float *)a_ew, a_adj, a_xadj, (float *)b_ew, b_adj,
-                      b_xadj, (float *)valuesC.data(), entriesC.data(), c_xadj,
-                      &nzmax, &info);
-      mkl_free_buffers();
-    } else if (std::is_same<value_type, double>::value) {
-      mkl_dcsrmultcsr(&trans, &request, &sort, &mklm, &mkln, &mklk,
-                      (double *)a_ew, a_adj, a_xadj, (double *)b_ew, b_adj,
-                      b_xadj, (double *)valuesC.data(), entriesC.data(), c_xadj,
-                      &nzmax, &info);
-      mkl_free_buffers();
-    } else {
-      throw std::runtime_error(
-          "MKL requires float or double values. Complex values are not "
-          "implemented yet.\n");
-    }
-    if (verbose) {
-      std::cout << "Sort:" << sort
-                << " Actual MKL2 Numeric Time:" << timer1.seconds()
-                << std::endl;
-    }
-
-    if (handle->mkl_convert_to_1base) {
-      KokkosKernels::Impl::kk_a_times_x_plus_b<cin_nonzero_index_view_type,
-                                               cin_nonzero_index_view_type, int,
-                                               int, MyExecSpace>(
-          entriesC.extent(0), entriesC, entriesC, 1, -1);
-    }
-#endif  // __INTEL_MKL__ < 2018
-
-#if __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2
-    value_type *a_ew = const_cast<value_type *>(valuesA.data());
-    value_type *b_ew = const_cast<value_type *>(valuesB.data());
-
-    char trans      = 'N';
-    MKL_INT request = 2;
-    MKL_INT sort    = handle->get_mkl_sort_option();
-    MKL_INT mklm = m, mkln = n, mklk = k;
-    MKL_INT info = 0, nzmax = 2147483647;
-
-    sparse_matrix_t A;
-    sparse_matrix_t B;
-    sparse_matrix_t C;
-
-    if (handle->mkl_convert_to_1base) {  // a*, b* already converted to 1base
-                                         // above...
-      if (std::is_same<value_type, double>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ONE, mklm, mkln,
-                                    a_xadj, a_xadj + 1, a_adj,
-                                    reinterpret_cast<double *>(a_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        }
-      } else if (std::is_same<value_type, float>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ONE, mklm, mkln,
-                                    a_xadj, a_xadj + 1, a_adj,
-                                    reinterpret_cast<float *>(a_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        }
-      }
-
-      if (std::is_same<value_type, double>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ONE, n, k, b_xadj,
-                                    b_xadj + 1, b_adj,
-                                    reinterpret_cast<double *>(b_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        }
-      } else if (std::is_same<value_type, float>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ONE, n, k, b_xadj,
-                                    b_xadj + 1, b_adj,
-                                    reinterpret_cast<float *>(b_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        }
-      }
-    } else {
-      if (std::is_same<value_type, double>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, mklm, mkln,
-                                    a_xadj, a_xadj + 1, a_adj,
-                                    reinterpret_cast<double *>(a_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        }
-      } else if (std::is_same<value_type, float>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, mklm, mkln,
-                                    a_xadj, a_xadj + 1, a_adj,
-                                    reinterpret_cast<float *>(a_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n");
-        }
-      }
-
-      if (std::is_same<value_type, double>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                    b_xadj + 1, b_adj,
-                                    reinterpret_cast<double *>(b_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        }
-      } else if (std::is_same<value_type, float>::value) {
-        if (SPARSE_STATUS_SUCCESS !=
-            mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj,
-                                    b_xadj + 1, b_adj,
-                                    reinterpret_cast<float *>(b_ew))) {
-          throw std::runtime_error(
-              "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n");
-        }
-      }
-    }
-
-    sparse_operation_t operation;
-    if (transposeA && transposeB) {
-      operation = SPARSE_OPERATION_TRANSPOSE;
-    } else if (!(transposeA || transposeB)) {
-      operation = SPARSE_OPERATION_NON_TRANSPOSE;
-    } else {
-      throw std::runtime_error(
-          "MKL either transpose both matrices, or none for SPGEMM\n");
-    }
-
-    matrix_descr common_mtx_props;
-    common_mtx_props.type = SPARSE_MATRIX_TYPE_GENERAL;
-    common_mtx_props.mode = SPARSE_FILL_MODE_FULL;
-    common_mtx_props.diag = SPARSE_DIAG_NON_UNIT;
-
-    Kokkos::Timer timer1;
-    // options: SPARSE_STAGE_FULL_MULT vs SPARSE_STAGE_NNZ_COUNT then
-    // SPARSE_STAGE_FINALIZE_MULT
-    bool success =
-        SPARSE_STATUS_SUCCESS !=
-        mkl_sparse_sp2m(operation, common_mtx_props, A, operation,
-                        common_mtx_props, B, SPARSE_STAGE_FINALIZE_MULT,
-                        &C);  // success is "true" if mkl_sparse_spmm does not
-                              // return success
-
-    if (verbose) {
-      std::cout << "Actual MKL SPMM Time:" << timer1.seconds() << std::endl;
-    }
-
-    if (success) {
-      throw std::runtime_error(
-          "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n");
-    }
-
-    // Copy C components back
-    sparse_index_base_t c_indexing;
-    MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
-    typedef double values_type;
-    values_type *values;
-
-    if (SPARSE_STATUS_SUCCESS !=
-        // mkl_sparse_s_export_csr (C,
-        mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, &rows_start,
-                                &rows_end, &columns, &values)) {
-      throw std::runtime_error(
-          "ERROR at exporting result matrix in mkl_sparse_spmm\n");
-    }
-
-    if (SPARSE_INDEX_BASE_ZERO != c_indexing) {
-      throw std::runtime_error("C is not zero based indexed\n");
-    }
-
-    // KokkosKernels::Impl::copy_vector<MKL_INT *, typename
-    // cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start,
-    // row_mapC); idx nnz = row_mapC(m) =  rows_end[m - 1];
-    idx nnz = rows_end[m - 1];
-    using non_const_size_type =
-        typename cin_row_index_view_type::non_const_value_type;
-    auto *tmpPtr = const_cast<non_const_size_type *>(row_mapC.data());
-    tmpPtr[m]    = nnz;
-
-    KokkosKernels::Impl::copy_vector<
-        MKL_INT *, typename cin_nonzero_index_view_type::non_const_type,
-        MyExecSpace>(nnz, columns, entriesC);
-    KokkosKernels::Impl::copy_vector<
-        values_type *, typename cin_nonzero_value_view_type::non_const_type,
-        MyExecSpace>(nnz, values, valuesC);
-
-    if (handle->mkl_convert_to_1base) {
-      KokkosKernels::Impl::kk_a_times_x_plus_b<cin_nonzero_index_view_type,
-                                               cin_nonzero_index_view_type, int,
-                                               int, MyExecSpace>(
-          entriesC.extent(0), entriesC, entriesC, 1, -1);
-    }
-
-    // Cleanup
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy A\n");
-    }
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy B\n");
-    }
-    if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) {
-      throw std::runtime_error("Error at mkl_sparse_destroy C\n");
-    }
-#elif __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ < 2
-    throw std::runtime_error(
-        "Intel MKL version 18 must have update 2 - use intel/18.2.xyz\n");
-    (void)m;
-    (void)n;
-    (void)k;
-    (void)entriesC;
-    (void)valuesA;
-    (void)valuesB;
-    (void)valuesC;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-#else
-    throw std::runtime_error(
-        "Intel MKL versions > 18 are not yet tested/supported\n");
-    // Supress -Wunused-parameter on intel-18
-    (void)m;
-    (void)n;
-    (void)k;
-    (void)entriesC;
-    (void)valuesA;
-    (void)valuesB;
-    (void)valuesC;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-    (void)a_xadj;
-    (void)b_xadj;
-    (void)c_xadj;
-    (void)a_adj;
-    (void)b_adj;
-#endif  // __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2
-  } else {
-    (void)m;
-    (void)n;
-    (void)k;
-    (void)entriesC;
-    (void)valuesA;
-    (void)valuesB;
-    (void)valuesC;
-    (void)transposeA;
-    (void)transposeB;
-    (void)verbose;
-    throw std::runtime_error("MKL requires local ordinals to be integer.\n");
-  }
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)entriesC;
-  (void)valuesA;
-  (void)valuesB;
-  (void)valuesC;
-  (void)transposeA;
-  (void)transposeB;
-  (void)verbose;
-  throw std::runtime_error("MKL IS NOT DEFINED\n");
-#endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
-}  // end mkl2phase_apply
-}  // namespace Impl
-}  // namespace KokkosSparse
-
-#endif
diff --git a/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
deleted file mode 100644
index 9a6ab70f9e..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef _KOKKOSSPGEMMMKL_HPP
-#define _KOKKOSSPGEMMMKL_HPP
-
-#include "KokkosKernels_config.h"
-#include "KokkosSparse_Utils_mkl.hpp"
-
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-#include "mkl_spblas.h"
-
-namespace KokkosSparse {
-namespace Impl {
-
-// multiplies two sparse MKL matrices and returns sparse MKL matrix
-template <typename value_type>
-inline static MKLSparseMatrix<value_type> mkl_spmm(
-    sparse_operation_t operation, const MKLSparseMatrix<value_type> &A,
-    const MKLSparseMatrix<value_type> &B) {
-  sparse_matrix_t C;
-  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C));
-  return MKLSparseMatrix<value_type>(C);
-}
-
-template <typename KernelHandle, typename a_rowmap_view_type,
-          typename a_index_view_type, typename a_values_view_type,
-          typename b_rowmap_view_type, typename b_index_view_type,
-          typename b_values_view_type, typename c_rowmap_view_type,
-          typename c_index_view_type, typename c_values_view_type>
-class MKL_SPGEMM {
- public:
-  typedef typename KernelHandle::nnz_lno_t nnz_lno_t;
-  typedef typename KernelHandle::size_type size_type;
-  typedef typename KernelHandle::nnz_scalar_t value_type;
-  typedef typename KernelHandle::HandleExecSpace MyExecSpace;
-  typedef typename Kokkos::View<int *, Kokkos::HostSpace> int_tmp_view_t;
-
- public:
-  static void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n,
-                           nnz_lno_t k, a_rowmap_view_type row_mapA,
-                           a_index_view_type entriesA, bool transposeA,
-                           b_rowmap_view_type row_mapB,
-                           b_index_view_type entriesB, bool transposeB,
-                           c_rowmap_view_type row_mapC, bool verbose = false) {
-    if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
-        entriesB.extent(0) < 1) {
-      // set correct values in non-empty 0-nnz corner case
-      handle->set_c_nnz(0);
-      Kokkos::deep_copy(row_mapC, 0);
-      return;
-    }
-
-    Kokkos::Timer timer;
-    using scalar_t = typename KernelHandle::nnz_scalar_t;
-
-    const auto export_rowmap = [&](MKL_INT num_rows, MKL_INT *rows_start,
-                                   MKL_INT * /*columns*/,
-                                   scalar_t * /*values*/) {
-      if (handle->mkl_keep_output) {
-        Kokkos::Timer copy_time;
-        const nnz_lno_t nnz = rows_start[num_rows];
-        handle->set_c_nnz(nnz);
-        copy(make_host_view(rows_start, num_rows + 1), row_mapC);
-        if (verbose)
-          std::cout << "\tMKL rowmap export time:" << copy_time.seconds()
-                    << std::endl;
-      }
-    };
-
-    // use dummy values for A and B inputs
-    a_values_view_type tmp_valsA(
-        Kokkos::ViewAllocateWithoutInitializing("tmp_valuesA"),
-        entriesA.extent(0));
-    b_values_view_type tmp_valsB(
-        Kokkos::ViewAllocateWithoutInitializing("tmp_valuesB"),
-        entriesB.extent(0));
-
-    spmm(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB,
-         entriesB, tmp_valsB, transposeB, verbose, export_rowmap);
-
-    if (verbose)
-      std::cout << "MKL symbolic time:" << timer.seconds() << std::endl;
-  }
-
-  static void mkl_numeric(
-      KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
-      a_rowmap_view_type row_mapA, a_index_view_type entriesA,
-      a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB,
-      b_index_view_type entriesB, b_values_view_type valuesB, bool transposeB,
-      c_rowmap_view_type /* row_mapC */, c_index_view_type entriesC,
-      c_values_view_type valuesC, bool verbose = false) {
-    Kokkos::Timer timer;
-
-    const auto export_values =
-        [&](MKL_INT num_rows, MKL_INT *rows_start, MKL_INT *columns,
-            typename KernelHandle::nnz_scalar_t *values) {
-          if (handle->mkl_keep_output) {
-            Kokkos::Timer copy_time;
-            const nnz_lno_t nnz = rows_start[num_rows];
-            copy(make_host_view(columns, nnz), entriesC);
-            copy(make_host_view(values, nnz), valuesC);
-            if (verbose)
-              std::cout << "\tMKL values export time:" << copy_time.seconds()
-                        << std::endl;
-          }
-        };
-
-    spmm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB,
-         entriesB, valuesB, transposeB, verbose, export_values);
-
-    if (verbose)
-      std::cout << "MKL numeric time:" << timer.seconds() << std::endl;
-  }
-
- private:
-  template <typename CB>
-  static void spmm(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n,
-                   nnz_lno_t k, a_rowmap_view_type row_mapA,
-                   a_index_view_type entriesA, a_values_view_type valuesA,
-
-                   bool transposeA, b_rowmap_view_type row_mapB,
-                   b_index_view_type entriesB, b_values_view_type valuesB,
-                   bool transposeB, bool verbose, const CB &callback) {
-    if (!std::is_same<nnz_lno_t, int>::value) {
-      throw std::runtime_error("MKL requires local ordinals to be integer.\n");
-    }
-
-    if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
-        entriesB.extent(0) < 1) {
-      return;
-    }
-
-    const auto create_mirror = [](auto view) {
-      return Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view);
-    };
-
-    auto h_rowsA      = create_mirror(row_mapA);
-    auto h_rowsB      = create_mirror(row_mapB);
-    const int *a_xadj = reinterpret_cast<const int *>(h_rowsA.data());
-    const int *b_xadj = reinterpret_cast<const int *>(h_rowsB.data());
-    int_tmp_view_t a_xadj_v, b_xadj_v;
-
-    if (!std::is_same<size_type, int>::value) {
-      if (entriesA.extent(0) > INT_MAX || entriesB.extent(0) > INT_MAX) {
-        throw std::runtime_error(
-            "MKL requires integer values for size type for SPGEMM. Copying "
-            "to "
-            "integer will cause overflow.\n");
-      }
-      static_assert(
-          std::is_same<typename int_tmp_view_t::value_type,
-                       typename int_tmp_view_t::non_const_value_type>::value,
-          "deep_copy requires non-const destination type");
-
-      Kokkos::Timer copy_time;
-      a_xadj_v = int_tmp_view_t("tmpa", m + 1);
-      b_xadj_v = int_tmp_view_t("tmpb", n + 1);
-      Kokkos::deep_copy(a_xadj_v, h_rowsA);
-      Kokkos::deep_copy(b_xadj_v, h_rowsB);
-      a_xadj = (int *)a_xadj_v.data();
-      b_xadj = (int *)b_xadj_v.data();
-      if (verbose)
-        std::cout << "\tMKL int-type temp rowmap copy time:"
-                  << copy_time.seconds() << std::endl;
-    }
-
-    auto h_valsA           = create_mirror(valuesA);
-    auto h_valsB           = create_mirror(valuesB);
-    auto h_entriesA        = create_mirror(entriesA);
-    auto h_entriesB        = create_mirror(entriesB);
-    const int *a_adj       = reinterpret_cast<const int *>(h_entriesA.data());
-    const int *b_adj       = reinterpret_cast<const int *>(h_entriesB.data());
-    const value_type *a_ew = h_valsA.data();
-    const value_type *b_ew = h_valsB.data();
-
-    // Hack: we discard const with pointer casts here to work around MKL
-    // requiring mutable input and our symbolic interface not providing it
-    using Matrix = MKLSparseMatrix<value_type>;
-    Matrix A(m, n, (int *)a_xadj, (int *)a_adj, (value_type *)a_ew);
-    Matrix B(n, k, (int *)b_xadj, (int *)b_adj, (value_type *)b_ew);
-
-    sparse_operation_t operation;
-    if (transposeA && transposeB) {
-      operation = SPARSE_OPERATION_TRANSPOSE;
-    } else if (!(transposeA || transposeB)) {
-      operation = SPARSE_OPERATION_NON_TRANSPOSE;
-    } else {
-      throw std::runtime_error(
-          "MKL either transpose both matrices, or none for SPGEMM\n");
-    }
-
-    Kokkos::Timer timer1;
-    Matrix C = mkl_spmm(operation, A, B);
-    if (verbose) {
-      std::cout << "\tMKL spmm (";
-      if (std::is_same<float, value_type>::value)
-        std::cout << "FLOAT";
-      else if (std::is_same<double, value_type>::value)
-        std::cout << "DOUBLE";
-      else
-        std::cout << "?";
-      std::cout << ") time:" << timer1.seconds() << std::endl;
-    }
-
-    MKL_INT num_rows, num_cols, *rows_start, *columns;
-    value_type *values;
-    C.export_data(num_rows, num_cols, rows_start, columns, values);
-    callback(m, rows_start, columns, values);
-
-    A.destroy();
-    B.destroy();
-    C.destroy();
-  }
-
-  template <typename from_view_type, typename dst_view_type>
-  inline static void copy(from_view_type from, dst_view_type to) {
-    auto h_from =
-        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), from);
-    auto h_to = Kokkos::create_mirror_view(Kokkos::HostSpace(), to);
-    Kokkos::deep_copy(h_to, h_from);  // view copy (for different element types)
-    Kokkos::deep_copy(to, h_to);
-  }
-
-  template <typename T,
-            typename view_type = Kokkos::View<const T *, Kokkos::HostSpace>>
-  inline static view_type make_host_view(const T *data, size_t num_elems) {
-    return view_type(data, num_elems);
-  }
-};
-
-template <typename KernelHandle, typename a_rowmap_type, typename a_index_type,
-          typename b_rowmap_type, typename b_index_type, typename c_rowmap_type,
-          typename nnz_lno_t = typename KernelHandle::nnz_lno_t>
-void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
-                  a_rowmap_type row_mapA, a_index_type entriesA,
-                  bool transposeA, b_rowmap_type row_mapB,
-                  b_index_type entriesB, bool transposeB,
-                  c_rowmap_type row_mapC, bool verbose = false) {
-  using values_type  = typename KernelHandle::scalar_temp_work_view_t;
-  using c_index_type = b_index_type;
-  using mkl = MKL_SPGEMM<KernelHandle, a_rowmap_type, a_index_type, values_type,
-                         b_rowmap_type, b_index_type, values_type,
-                         c_rowmap_type, c_index_type, values_type>;
-  mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB,
-                    entriesB, transposeB, row_mapC, verbose);
-}
-
-template <typename KernelHandle, typename a_rowmap_type, typename a_index_type,
-          typename a_values_type, typename b_rowmap_type, typename b_index_type,
-          typename b_values_type, typename c_rowmap_type, typename c_index_type,
-          typename c_values_type,
-          typename nnz_lno_t = typename KernelHandle::nnz_lno_t>
-void mkl_numeric(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k,
-                 a_rowmap_type row_mapA, a_index_type entriesA,
-                 a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB,
-                 b_index_type entriesB, b_values_type valuesB, bool transposeB,
-                 c_rowmap_type row_mapC, c_index_type entriesC,
-                 c_values_type valuesC, bool verbose = false) {
-  using mkl =
-      MKL_SPGEMM<KernelHandle, a_rowmap_type, a_index_type, a_values_type,
-                 b_rowmap_type, b_index_type, b_values_type, c_rowmap_type,
-                 c_index_type, c_values_type>;
-  mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA,
-                   row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC,
-                   valuesC, verbose);
-}
-
-}  // namespace Impl
-}  // namespace KokkosSparse
-
-#endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
-#endif  // _KOKKOSSPGEMMMKL_HPP
diff --git a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
index 0a09040559..359fbf5fc0 100644
--- a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp
@@ -51,15 +51,10 @@
 #include "KokkosKernels_Handle.hpp"
 // Include the actual functors
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
-//#include "KokkosSparse_spgemm_symbolic.hpp"
-#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_rocSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_CUSP_impl.hpp"
+#include "KokkosSparse_spgemm_symbolic.hpp"
 #include "KokkosSparse_spgemm_impl.hpp"
 #include "KokkosSparse_spgemm_impl_seq.hpp"
-#include "KokkosSparse_spgemm_mkl_impl.hpp"
-#include "KokkosSparse_spgemm_mkl2phase_impl.hpp"
-#include "KokkosSparse_spgemm_viennaCL_impl.hpp"
+#include "KokkosSparse_SortCrs.hpp"
 #endif
 
 namespace KokkosSparse {
@@ -112,45 +107,10 @@ struct spgemm_numeric_eti_spec_avail {
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
     enum : bool { value = true };                                         \
-  };                                                                      \
-                                                                          \
-  template <>                                                             \
-  struct spgemm_numeric_eti_spec_avail<                                   \
-      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
-          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
-          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {          \
-    enum : bool { value = true };                                         \
   };
 
 // Include the actual specialization declarations
-#include <KokkosSparse_spgemm_tpl_spec_avail.hpp>
+#include <KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_avail.hpp>
 
 namespace KokkosSparse {
@@ -174,17 +134,14 @@ template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
               b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_,
               c_lno_view_t, c_scalar_view_t>::value>
 struct SPGEMM_NUMERIC {
-  static void spgemm_numeric(KernelHandle *handle,
-                             typename KernelHandle::const_nnz_lno_t m,
-                             typename KernelHandle::const_nnz_lno_t n,
-                             typename KernelHandle::const_nnz_lno_t k,
-                             a_size_view_t_ row_mapA, a_lno_view_t entriesA,
-                             a_scalar_view_t valuesA,
+  static void spgemm_numeric(
+      KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+      typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+      a_size_view_t_ row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA,
 
-                             bool transposeA, b_size_view_t_ row_mapB,
-                             b_lno_view_t entriesB, b_scalar_view_t valuesB,
-                             bool transposeB, c_size_view_t_ row_mapC,
-                             c_lno_view_t &entriesC, c_scalar_view_t &valuesC);
+      bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB,
+      b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC,
+      c_lno_view_t entriesC, c_scalar_view_t valuesC);
 };
 
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
@@ -203,71 +160,30 @@ struct SPGEMM_NUMERIC<
       KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
       typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
       a_size_view_t_ row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA,
-
       bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB,
       b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC,
-      c_lno_view_t &entriesC, c_scalar_view_t &valuesC) {
+      c_lno_view_t entriesC, c_scalar_view_t valuesC) {
     typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType;
     spgemmHandleType *sh = handle->get_spgemm_handle();
     if (!sh->is_symbolic_called()) {
       throw std::runtime_error(
           "Call spgemm symbolic before calling SpGEMM numeric");
     }
-
+    if (m == 0 || n == 0 || k == 0 || !entriesA.extent(0) ||
+        !entriesB.extent(0)) {
+      sh->set_call_numeric();
+      sh->set_computed_entries();
+      return;
+    }
     switch (sh->get_algorithm_type()) {
-      case SPGEMM_CUSPARSE:
-#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
-        cuSPARSE_apply<spgemmHandleType>(
-            sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB,
-            entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC);
-#else
-        throw std::runtime_error(
-            "Requiring SPGEMM_CUSPARSE but TPL_CUSPARSE was not enabled!");
-#endif
-        break;
-      case SPGEMM_ROCSPARSE:
-#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE)
-        rocsparse_spgemm_numeric<spgemmHandleType>(
-            sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB,
-            entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC);
-#else
-        throw std::runtime_error(
-            "Requiring SPGEMM_ROCSPARSE but TPL_ROCSPARSE was not enabled!");
-#endif
-        break;
-      case SPGEMM_CUSP:
-        CUSP_apply<spgemmHandleType, a_size_view_t_, a_lno_view_t,
-                   a_scalar_view_t, b_size_view_t_, b_lno_view_t,
-                   b_scalar_view_t, c_size_view_t_, c_lno_view_t,
-                   c_scalar_view_t>(sh, m, n, k, row_mapA, entriesA, valuesA,
-                                    transposeA, row_mapB, entriesB, valuesB,
-                                    transposeB, row_mapC, entriesC, valuesC);
-        break;
-      case SPGEMM_MKL:
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-        mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA,
-                    row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC,
-                    valuesC, handle->get_verbose());
-        break;
-#else
-        throw std::runtime_error("MKL was not enabled in this build!");
-#endif
-      case SPGEMM_MKL2PHASE:
-        mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA,
-                        row_mapB, entriesB, valuesB, transposeB, row_mapC,
-                        entriesC, valuesC, handle->get_verbose());
-        break;
+      case SPGEMM_SERIAL:
+      case SPGEMM_DEBUG:
+        spgemm_debug_numeric(handle, m, n, k, row_mapA, entriesA, valuesA,
 
-      case SPGEMM_VIENNA:
-        viennaCL_apply<spgemmHandleType>(
-            sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB,
-            entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC,
-            handle->get_verbose());
+                             transposeA, row_mapB, entriesB, valuesB,
+                             transposeB, row_mapC, entriesC, valuesC);
         break;
-
-      default:
-
-      {
+      default: {
         KokkosSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t,
                      a_scalar_view_t, b_size_view_t_, b_lno_view_t,
                      b_scalar_view_t>
@@ -275,14 +191,13 @@ struct SPGEMM_NUMERIC<
                     row_mapB, entriesB, valuesB, transposeB);
         kspgemm.KokkosSPGEMM_numeric(row_mapC, entriesC, valuesC);
       } break;
-      case SPGEMM_SERIAL:
-      case SPGEMM_DEBUG:
-        spgemm_debug_numeric(handle, m, n, k, row_mapA, entriesA, valuesA,
-
-                             transposeA, row_mapB, entriesB, valuesB,
-                             transposeB, row_mapC, entriesC, valuesC);
-        break;
     }
+    // Current implementation does not produce sorted matrix
+    // TODO: remove this call when impl sorts
+    KokkosSparse::sort_crs_matrix<typename KernelHandle::HandleExecSpace>(
+        row_mapC, entriesC, valuesC);
+    sh->set_call_numeric();
+    sh->set_computed_entries();
   }
 };
 
@@ -325,39 +240,6 @@ struct SPGEMM_NUMERIC<
       Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      false, true>;                                                       \
-                                                                          \
-  extern template struct SPGEMM_NUMERIC<                                  \
-      typename KokkosKernels::Experimental::KokkosKernelsHandle<          \
-          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
-          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
       false, true>;
 
 #define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_INST(                        \
@@ -394,42 +276,9 @@ struct SPGEMM_NUMERIC<
       Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
                    Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      false, true>;                                                       \
-                                                                          \
-  template struct SPGEMM_NUMERIC<                                         \
-      KokkosKernels::Experimental::KokkosKernelsHandle<                   \
-          const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE,       \
-          EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>,               \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const OFFSET_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const ORDINAL_TYPE *, LAYOUT_TYPE,                     \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<const SCALAR_TYPE *, LAYOUT_TYPE,                      \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<OFFSET_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<ORDINAL_TYPE *, LAYOUT_TYPE,                           \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
-      Kokkos::View<SCALAR_TYPE *, LAYOUT_TYPE,                            \
-                   Kokkos::Device<EXEC_SPACE_TYPE, MEM_SPACE_TYPE>,       \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
       false, true>;
 
-#include <KokkosSparse_spgemm_tpl_spec_decl.hpp>
+#include <KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp>
 
 #endif  // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_
diff --git a/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp b/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp
deleted file mode 100644
index f2b4f0e38f..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_rocSPARSE_impl.hpp
+++ /dev/null
@@ -1,435 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef _KOKKOSSPGEMMROCSPARSE_HPP
-#define _KOKKOSSPGEMMROCSPARSE_HPP
-
-#include <KokkosKernels_config.h>
-
-#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-
-#include "KokkosKernels_Controls.hpp"
-#include "rocsparse.h"
-
-namespace KokkosSparse {
-
-namespace Impl {
-
-//=============================================================================
-// Overload rocsparse_Xcsrgemm_buffer_size() over scalar types
-inline rocsparse_status rocsparse_Xcsrgemm_buffer_size(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const float *alpha, const rocsparse_mat_descr descr_A,
-    rocsparse_int nnz_A, const rocsparse_int *csr_row_ptr_A,
-    const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B,
-    rocsparse_int nnz_B, const rocsparse_int *csr_row_ptr_B,
-    const rocsparse_int *csr_col_ind_B, const float *beta,
-    const rocsparse_mat_descr descr_D, rocsparse_int nnz_D,
-    const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D,
-    rocsparse_mat_info info_C, size_t *buffer_size) {
-  return rocsparse_scsrgemm_buffer_size(
-      handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_row_ptr_A,
-      csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, csr_col_ind_B, beta,
-      descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_buffer_size(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const double *alpha, const rocsparse_mat_descr descr_A,
-    rocsparse_int nnz_A, const rocsparse_int *csr_row_ptr_A,
-    const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B,
-    rocsparse_int nnz_B, const rocsparse_int *csr_row_ptr_B,
-    const rocsparse_int *csr_col_ind_B, const double *beta,
-    const rocsparse_mat_descr descr_D, rocsparse_int nnz_D,
-    const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D,
-    rocsparse_mat_info info_C, size_t *buffer_size) {
-  return rocsparse_dcsrgemm_buffer_size(
-      handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_row_ptr_A,
-      csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B, csr_col_ind_B, beta,
-      descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_buffer_size(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const Kokkos::complex<float> *alpha,
-    const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,
-    const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A,
-    const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,
-    const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B,
-    const Kokkos::complex<float> *beta, const rocsparse_mat_descr descr_D,
-    rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D,
-    const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C,
-    size_t *buffer_size) {
-  return rocsparse_ccsrgemm_buffer_size(
-      handle, trans_A, trans_B, m, n, k,
-      reinterpret_cast<const rocsparse_float_complex *>(alpha), descr_A, nnz_A,
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B,
-      csr_col_ind_B, reinterpret_cast<const rocsparse_float_complex *>(beta),
-      descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_buffer_size(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const Kokkos::complex<double> *alpha,
-    const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,
-    const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A,
-    const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,
-    const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B,
-    const Kokkos::complex<double> *beta, const rocsparse_mat_descr descr_D,
-    rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D,
-    const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C,
-    size_t *buffer_size) {
-  return rocsparse_zcsrgemm_buffer_size(
-      handle, trans_A, trans_B, m, n, k,
-      reinterpret_cast<const rocsparse_double_complex *>(alpha), descr_A, nnz_A,
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B,
-      csr_col_ind_B, reinterpret_cast<const rocsparse_double_complex *>(beta),
-      descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D, info_C, buffer_size);
-}
-
-//=============================================================================
-// Overload rocsparse_Xcsrgemm_numeric() over scalar types
-inline rocsparse_status rocsparse_Xcsrgemm_numeric(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const float *alpha, const rocsparse_mat_descr descr_A,
-    rocsparse_int nnz_A, const float *csr_val_A,
-    const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A,
-    const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,
-    const float *csr_val_B, const rocsparse_int *csr_row_ptr_B,
-    const rocsparse_int *csr_col_ind_B, const float *beta,
-    const rocsparse_mat_descr descr_D, rocsparse_int nnz_D,
-    const float *csr_val_D, const rocsparse_int *csr_row_ptr_D,
-    const rocsparse_int *csr_col_ind_D, const rocsparse_mat_descr descr_C,
-    rocsparse_int nnz_C, float *csr_val_C, const rocsparse_int *csr_row_ptr_C,
-    const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C,
-    void *buffer) {
-  return rocsparse_scsrgemm_numeric(
-      handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A,
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B, csr_row_ptr_B,
-      csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D, csr_row_ptr_D,
-      csr_col_ind_D, descr_C, nnz_C, csr_val_C, csr_row_ptr_C, csr_col_ind_C,
-      info_C, buffer);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_numeric(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const double *alpha, const rocsparse_mat_descr descr_A,
-    rocsparse_int nnz_A, const double *csr_val_A,
-    const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A,
-    const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,
-    const double *csr_val_B, const rocsparse_int *csr_row_ptr_B,
-    const rocsparse_int *csr_col_ind_B, const double *beta,
-    const rocsparse_mat_descr descr_D, rocsparse_int nnz_D,
-    const double *csr_val_D, const rocsparse_int *csr_row_ptr_D,
-    const rocsparse_int *csr_col_ind_D, const rocsparse_mat_descr descr_C,
-    rocsparse_int nnz_C, double *csr_val_C, const rocsparse_int *csr_row_ptr_C,
-    const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C,
-    void *buffer) {
-  return rocsparse_dcsrgemm_numeric(
-      handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A,
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B, csr_row_ptr_B,
-      csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D, csr_row_ptr_D,
-      csr_col_ind_D, descr_C, nnz_C, csr_val_C, csr_row_ptr_C, csr_col_ind_C,
-      info_C, buffer);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_numeric(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const Kokkos::complex<float> *alpha,
-    const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,
-    const Kokkos::complex<float> *csr_val_A, const rocsparse_int *csr_row_ptr_A,
-    const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B,
-    rocsparse_int nnz_B, const Kokkos::complex<float> *csr_val_B,
-    const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B,
-    const Kokkos::complex<float> *beta, const rocsparse_mat_descr descr_D,
-    rocsparse_int nnz_D, const Kokkos::complex<float> *csr_val_D,
-    const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D,
-    const rocsparse_mat_descr descr_C, rocsparse_int nnz_C,
-    Kokkos::complex<float> *csr_val_C, const rocsparse_int *csr_row_ptr_C,
-    const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C,
-    void *buffer) {
-  return rocsparse_ccsrgemm_numeric(
-      handle, trans_A, trans_B, m, n, k,
-      reinterpret_cast<const rocsparse_float_complex *>(alpha), descr_A, nnz_A,
-      reinterpret_cast<const rocsparse_float_complex *>(csr_val_A),
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B,
-      reinterpret_cast<const rocsparse_float_complex *>(csr_val_B),
-      csr_row_ptr_B, csr_col_ind_B,
-      reinterpret_cast<const rocsparse_float_complex *>(beta), descr_D, nnz_D,
-      reinterpret_cast<const rocsparse_float_complex *>(csr_val_D),
-      csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C,
-      reinterpret_cast<rocsparse_float_complex *>(csr_val_C), csr_row_ptr_C,
-      csr_col_ind_C, info_C, buffer);
-}
-
-inline rocsparse_status rocsparse_Xcsrgemm_numeric(
-    rocsparse_handle handle, rocsparse_operation trans_A,
-    rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,
-    rocsparse_int k, const Kokkos::complex<double> *alpha,
-    const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,
-    const Kokkos::complex<double> *csr_val_A,
-    const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A,
-    const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,
-    const Kokkos::complex<double> *csr_val_B,
-    const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B,
-    const Kokkos::complex<double> *beta, const rocsparse_mat_descr descr_D,
-    rocsparse_int nnz_D, const Kokkos::complex<double> *csr_val_D,
-    const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D,
-    const rocsparse_mat_descr descr_C, rocsparse_int nnz_C,
-    Kokkos::complex<double> *csr_val_C, const rocsparse_int *csr_row_ptr_C,
-    const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C,
-    void *buffer) {
-  return rocsparse_zcsrgemm_numeric(
-      handle, trans_A, trans_B, m, n, k,
-      reinterpret_cast<const rocsparse_double_complex *>(alpha), descr_A, nnz_A,
-      reinterpret_cast<const rocsparse_double_complex *>(csr_val_A),
-      csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B,
-      reinterpret_cast<const rocsparse_double_complex *>(csr_val_B),
-      csr_row_ptr_B, csr_col_ind_B,
-      reinterpret_cast<const rocsparse_double_complex *>(beta), descr_D, nnz_D,
-      reinterpret_cast<const rocsparse_double_complex *>(csr_val_D),
-      csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C,
-      reinterpret_cast<rocsparse_double_complex *>(csr_val_C), csr_row_ptr_C,
-      csr_col_ind_C, info_C, buffer);
-}
-
-//=============================================================================
-template <typename KernelHandle, typename index_type, typename size_type,
-          typename scalar_type>
-inline typename std::enable_if<
-    (not std::is_same<index_type, rocsparse_int>::value) or
-        (not std::is_same<size_type, rocsparse_int>::value),
-    void>::type
-rocsparse_spgemm_symbolic_internal(KernelHandle *handle, index_type m,
-                                   index_type n, index_type k, size_type nnz_A,
-                                   const size_type *rowptrA,
-                                   const index_type *colidxA, bool transposeA,
-                                   size_type nnz_B, const size_type *rowptrB,
-                                   const index_type *colidxB, bool transposeB,
-                                   size_type *rowptrC) {
-  // normal code should use the specializations and not go here
-  throw std::runtime_error(
-      "The installed rocsparse does not support the index type and size type");
-}
-
-template <typename KernelHandle, typename index_type, typename size_type,
-          typename scalar_type>
-inline
-    typename std::enable_if<std::is_same<index_type, rocsparse_int>::value and
-                                std::is_same<size_type, rocsparse_int>::value,
-                            void>::type
-    rocsparse_spgemm_symbolic_internal(
-        KernelHandle *handle, index_type m, index_type n, index_type k,
-        size_type nnz_A, const size_type *rowptrA, const index_type *colidxA,
-        bool transposeA, size_type nnz_B, const size_type *rowptrB,
-        const index_type *colidxB, bool transposeB, size_type *rowptrC) {
-  handle->create_rocsparse_spgemm_handle(transposeA, transposeB);
-  typename KernelHandle::rocSparseSpgemmHandleType *h =
-      handle->get_rocsparse_spgemm_handle();
-
-  // alpha, beta are on host, but since we use singleton on the rocsparse
-  // handle, we save/restore the pointer mode to not interference with
-  // others' use
-  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
-  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
-  rocsparse_pointer_mode oldPtrMode;
-
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
-      rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode));
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(
-      h->rocsparseHandle, rocsparse_pointer_mode_host));
-
-  // C = alpha * OpA(A) * OpB(B) + beta * D
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_buffer_size(
-      h->rocsparseHandle, h->opA, h->opB, m, n, k, &alpha, h->descr_A, nnz_A,
-      rowptrA, colidxA, h->descr_B, nnz_B, rowptrB, colidxB, &beta, h->descr_D,
-      0, NULL, NULL, h->info_C, &h->bufferSize));
-
-  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&h->buffer, h->bufferSize));
-
-  rocsparse_int C_nnz = 0;
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_nnz(
-      h->rocsparseHandle, h->opA, h->opB, m, n, k, h->descr_A, nnz_A, rowptrA,
-      colidxA, h->descr_B, nnz_B, rowptrB, colidxB, h->descr_D, 0, NULL, NULL,
-      h->descr_C, rowptrC, &C_nnz, h->info_C, h->buffer));
-
-  handle->set_c_nnz(C_nnz);
-  h->C_populated = false;  // sparsity pattern of C is not set yet, so this is a
-                           // fake symbolic
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
-      rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode));
-}
-
-template <
-    typename KernelHandle, typename ain_row_index_view_type,
-    typename ain_nonzero_index_view_type, typename bin_row_index_view_type,
-    typename bin_nonzero_index_view_type, typename cin_row_index_view_type>
-void rocsparse_spgemm_symbolic(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
-    bool transposeA, bin_row_index_view_type rowptrB,
-    bin_nonzero_index_view_type colidxB, bool transposeB,
-    cin_row_index_view_type rowptrC) {
-  using index_type  = typename KernelHandle::nnz_lno_t;
-  using size_type   = typename KernelHandle::size_type;
-  using scalar_type = typename KernelHandle::nnz_scalar_t;
-
-  // In case the KernelHandle uses const types!
-  using non_const_index_type  = typename std::remove_cv<index_type>::type;
-  using non_const_size_type   = typename std::remove_cv<size_type>::type;
-  using non_const_scalar_type = typename std::remove_cv<scalar_type>::type;
-
-  handle->set_sort_option(1);  // tells users the output matrix is sorted
-  rocsparse_spgemm_symbolic_internal<KernelHandle, non_const_index_type,
-                                     non_const_size_type,
-                                     non_const_scalar_type>(
-      handle, m, n, k, colidxA.extent(0), rowptrA.data(), colidxA.data(),
-      transposeA, colidxB.extent(0), rowptrB.data(), colidxB.data(), transposeB,
-      rowptrC.data());
-}
-
-//=============================================================================
-template <typename KernelHandle, typename index_type, typename size_type,
-          typename scalar_type>
-inline typename std::enable_if<
-    (not std::is_same<index_type, rocsparse_int>::value) or
-        (not std::is_same<size_type, rocsparse_int>::value),
-    void>::type
-rocsparse_spgemm_numeric_internal(
-    KernelHandle *handle, index_type m, index_type n, index_type k,
-    size_type nnz_A, const size_type *rowptrA, const index_type *colidxA,
-    const scalar_type *valuesA, size_type nnz_B, const size_type *rowptrB,
-    const index_type *colidxB, const scalar_type *valuesB, size_type nnz_C,
-    const size_type *rowptrC, index_type *colidxC, scalar_type *valuesC) {
-  // normal code should use the specializations and not go here
-  throw std::runtime_error(
-      "The installed rocsparse does not support the index type and size type");
-}
-
-template <typename KernelHandle, typename index_type, typename size_type,
-          typename scalar_type>
-inline
-    typename std::enable_if<std::is_same<index_type, rocsparse_int>::value and
-                                std::is_same<size_type, rocsparse_int>::value,
-                            void>::type
-    rocsparse_spgemm_numeric_internal(
-        KernelHandle *handle, index_type m, index_type n, index_type k,
-        size_type nnz_A, const size_type *rowptrA, const index_type *colidxA,
-        const scalar_type *valuesA, size_type nnz_B, const size_type *rowptrB,
-        const index_type *colidxB, const scalar_type *valuesB, size_type nnz_C,
-        const size_type *rowptrC, index_type *colidxC, scalar_type *valuesC) {
-  typename KernelHandle::rocSparseSpgemmHandleType *h =
-      handle->get_rocsparse_spgemm_handle();
-
-  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
-  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
-  rocsparse_pointer_mode oldPtrMode;
-
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
-      rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode));
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(
-      h->rocsparseHandle, rocsparse_pointer_mode_host));
-
-  if (!h->C_populated) {
-    KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_symbolic(
-        h->rocsparseHandle, h->opA, h->opB, m, n, k, h->descr_A, nnz_A, rowptrA,
-        colidxA, h->descr_B, nnz_B, rowptrB, colidxB, h->descr_D, 0, NULL, NULL,
-        h->descr_C, nnz_C, rowptrC, colidxC, h->info_C, h->buffer));
-    h->C_populated = true;
-  }
-
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_numeric(
-      h->rocsparseHandle, h->opA, h->opB, m, n, k, &alpha, h->descr_A, nnz_A,
-      valuesA, rowptrA, colidxA, h->descr_B, nnz_B, valuesB, rowptrB, colidxB,
-      &beta, h->descr_D, 0, NULL, NULL, NULL, h->descr_C, nnz_C, valuesC,
-      rowptrC, colidxC, h->info_C, h->buffer));
-  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
-      rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode));
-}
-
-template <
-    typename KernelHandle, typename ain_row_index_view_type,
-    typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void rocsparse_spgemm_numeric(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
-    ain_nonzero_value_view_type valuesA, bool /* transposeA */,
-    bin_row_index_view_type rowptrB, bin_nonzero_index_view_type colidxB,
-    bin_nonzero_value_view_type valuesB, bool /* transposeB */,
-    cin_row_index_view_type rowptrC, cin_nonzero_index_view_type colidxC,
-    cin_nonzero_value_view_type valuesC) {
-  using index_type  = typename KernelHandle::nnz_lno_t;
-  using size_type   = typename KernelHandle::size_type;
-  using scalar_type = typename KernelHandle::nnz_scalar_t;
-
-  // In case the KernelHandle uses const types!
-  using non_const_index_type  = typename std::remove_cv<index_type>::type;
-  using non_const_size_type   = typename std::remove_cv<size_type>::type;
-  using non_const_scalar_type = typename std::remove_cv<scalar_type>::type;
-
-  rocsparse_spgemm_numeric_internal<KernelHandle, non_const_index_type,
-                                    non_const_size_type, non_const_scalar_type>(
-      handle, m, n, k, colidxA.extent(0), rowptrA.data(), colidxA.data(),
-      valuesA.data(), colidxB.extent(0), rowptrB.data(), colidxB.data(),
-      valuesB.data(), colidxC.extent(0), rowptrC.data(), colidxC.data(),
-      valuesC.data());
-}
-
-}  // namespace Impl
-}  // namespace KokkosSparse
-
-#endif
-#endif
diff --git a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
index a85ac45a7a..76b6bd2365 100644
--- a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
+++ b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp
@@ -51,14 +51,8 @@
 #include "KokkosKernels_Handle.hpp"
 // Include the actual functors
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
-#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_rocSPARSE_impl.hpp"
-#include "KokkosSparse_spgemm_CUSP_impl.hpp"
 #include "KokkosSparse_spgemm_impl.hpp"
 #include "KokkosSparse_spgemm_impl_seq.hpp"
-#include "KokkosSparse_spgemm_mkl_impl.hpp"
-#include "KokkosSparse_spgemm_mkl2phase_impl.hpp"
-#include "KokkosSparse_spgemm_viennaCL_impl.hpp"
 #endif
 
 namespace KokkosSparse {
@@ -100,7 +94,7 @@ struct spgemm_symbolic_eti_spec_avail {
   };
 
 // Include the actual specialization declarations
-#include <KokkosSparse_spgemm_tpl_spec_avail.hpp>
+#include <KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_avail.hpp>
 
 namespace KokkosSparse {
@@ -120,13 +114,13 @@ template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
               b_lno_view_t, c_size_view_t_>::value>
 struct SPGEMM_SYMBOLIC {
   static void spgemm_symbolic(KernelHandle *handle,
-                              typename KernelHandle::const_nnz_lno_t m,
-                              typename KernelHandle::const_nnz_lno_t n,
-                              typename KernelHandle::const_nnz_lno_t k,
+                              typename KernelHandle::nnz_lno_t m,
+                              typename KernelHandle::nnz_lno_t n,
+                              typename KernelHandle::nnz_lno_t k,
                               a_size_view_t_ row_mapA, a_lno_view_t entriesA,
                               bool transposeA, b_size_view_t_ row_mapB,
                               b_lno_view_t entriesB, bool transposeB,
-                              c_size_view_t_ row_mapC);
+                              c_size_view_t_ row_mapC, bool computeRowptrs);
 };
 
 #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
@@ -137,49 +131,32 @@ template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
 struct SPGEMM_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t,
                        b_size_view_t_, b_lno_view_t, c_size_view_t_, false,
                        KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
-  static void spgemm_symbolic(KernelHandle *handle,
-                              typename KernelHandle::nnz_lno_t m,
-                              typename KernelHandle::nnz_lno_t n,
-                              typename KernelHandle::nnz_lno_t k,
-                              a_size_view_t_ row_mapA, a_lno_view_t entriesA,
-                              bool transposeA, b_size_view_t_ row_mapB,
-                              b_lno_view_t entriesB, bool transposeB,
-                              c_size_view_t_ row_mapC) {
+  static void spgemm_symbolic(
+      KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+      typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+      a_size_view_t_ row_mapA, a_lno_view_t entriesA, bool transposeA,
+      b_size_view_t_ row_mapB, b_lno_view_t entriesB, bool transposeB,
+      c_size_view_t_ row_mapC, bool /* computeRowptrs */) {
     typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType;
     spgemmHandleType *sh = handle->get_spgemm_handle();
+    if (sh->is_symbolic_called() && sh->are_rowptrs_computed()) return;
+    if (m == 0 || n == 0 || k == 0 || !entriesA.extent(0) ||
+        !entriesB.extent(0)) {
+      sh->set_computed_rowptrs();
+      sh->set_call_symbolic();
+      sh->set_c_nnz(0);
+      Kokkos::deep_copy(typename spgemmHandleType::HandleExecSpace(), row_mapC,
+                        typename c_size_view_t_::non_const_value_type(0));
+      return;
+    }
     switch (sh->get_algorithm_type()) {
-      case SPGEMM_CUSPARSE:
-#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
-        cuSPARSE_symbolic<spgemmHandleType, a_size_view_t_, a_lno_view_t,
-                          b_size_view_t_, b_lno_view_t, c_size_view_t_>(
-            sh, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB,
-            transposeB, row_mapC);
-#else
-        throw std::runtime_error(
-            "Requiring SPGEMM_CUSPARSE but TPL_CUSPARSE was not enabled!");
-#endif
-        break;
-      case SPGEMM_ROCSPARSE:
-#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE)
-        rocsparse_spgemm_symbolic<spgemmHandleType, a_size_view_t_,
-                                  a_lno_view_t, b_size_view_t_, b_lno_view_t,
-                                  c_size_view_t_>(
-            sh, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB,
-            transposeB, row_mapC);
-#else
-        throw std::runtime_error(
-            "Requiring SPGEMM_ROCSPARSE but TPL_ROCSPARSE was not enabled!");
-#endif
-        break;
-      case SPGEMM_CUSP:
-      case SPGEMM_VIENNA: break;
+      case SPGEMM_SERIAL:
+      case SPGEMM_DEBUG:
+        spgemm_debug_symbolic(handle, m, n, k, row_mapA, entriesA,
 
-      case SPGEMM_MKL2PHASE:
-        mkl2phase_symbolic(sh, m, n, k, row_mapA, entriesA, transposeA,
-                           row_mapB, entriesB, transposeB, row_mapC,
-                           handle->get_verbose());
+                              transposeA, row_mapB, entriesB, transposeB,
+                              row_mapC);
         break;
-
       default: {
         KokkosSPGEMM<KernelHandle, a_size_view_t_, a_lno_view_t,
                      typename KernelHandle::in_scalar_nnz_view_t,
@@ -189,23 +166,10 @@ struct SPGEMM_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t,
                     entriesB, transposeB);
         kspgemm.KokkosSPGEMM_symbolic(row_mapC);
       } break;
-      case SPGEMM_SERIAL:
-      case SPGEMM_DEBUG:
-        spgemm_debug_symbolic(handle, m, n, k, row_mapA, entriesA,
-
-                              transposeA, row_mapB, entriesB, transposeB,
-                              row_mapC);
-        break;
-      case SPGEMM_MKL:
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-        mkl_symbolic(sh, m, n, k, row_mapA, entriesA, transposeA, row_mapB,
-                     entriesB, transposeB, row_mapC, handle->get_verbose());
-        break;
-#else
-        throw std::runtime_error("MKL was not enabled in this build!");
-#endif
     }
     sh->set_call_symbolic();
+    // The KokkosKernels implementation of symbolic always populates rowptrs.
+    sh->set_computed_rowptrs();
   }
 };
 
@@ -216,10 +180,8 @@ struct SPGEMM_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t,
 
 //
 // Macro for declaration of full specialization of
-// KokkosSparse::Impl::Dot for rank == 2.  This is NOT for users!!!  All
+// KokkosSparse::Impl::SPGEMM_SYMBOLIC.  This is NOT for users!!!  All
 // the declarations of full specializations go in this header file.
-// We may spread out definitions (see _DEF macro below) across one or
-// more .cpp files.
 //
 #define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL(                       \
     SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \
@@ -269,7 +231,7 @@ struct SPGEMM_SYMBOLIC<KernelHandle, a_size_view_t_, a_lno_view_t,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >,             \
       false, true>;
 
-#include <KokkosSparse_spgemm_tpl_spec_decl.hpp>
+#include <KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp>
 #include <generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp>
 
 #endif  // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_
diff --git a/sparse/impl/KokkosSparse_spgemm_viennaCL_impl.hpp b/sparse/impl/KokkosSparse_spgemm_viennaCL_impl.hpp
deleted file mode 100644
index 7c2ace3338..0000000000
--- a/sparse/impl/KokkosSparse_spgemm_viennaCL_impl.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef _KOKKOSSPGEMMVIENNACL_HPP
-#define _KOKKOSSPGEMMVIENNACL_HPP
-
-//#define KERNELS_HAVE_VIENNACL
-
-#ifdef KERNELS_HAVE_VIENNACL1
-#define VIENNACL_WITH_OPENMP
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/host_based/common.hpp"
-
-#endif
-
-#include "KokkosKernels_Utils.hpp"
-namespace KokkosSparse {
-
-namespace Impl {
-
-template <
-    typename KernelHandle, typename in_row_index_view_type,
-    typename in_nonzero_index_view_type, typename in_nonzero_value_view_type,
-    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
-    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
-    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
-void viennaCL_apply(
-    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
-    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
-    in_row_index_view_type row_mapA, in_nonzero_index_view_type entriesA,
-    in_nonzero_value_view_type valuesA,
-
-    bool transposeA, bin_row_index_view_type row_mapB,
-    bin_nonzero_index_view_type entriesB, bin_nonzero_value_view_type valuesB,
-    bool transposeB, cin_row_index_view_type row_mapC,
-    cin_nonzero_index_view_type &entriesC, cin_nonzero_value_view_type &valuesC,
-    bool verbose = false) {
-#ifdef KERNELS_HAVE_VIENNACL1
-
-  typedef typename KernelHandle::nnz_lno_t idx;
-  typedef in_row_index_view_type idx_array_type;
-
-  typedef typename KernelHandle::nnz_scalar_t value_type;
-
-  typedef typename in_row_index_view_type::device_type device1;
-  typedef typename in_nonzero_index_view_type::device_type device2;
-  typedef typename in_nonzero_value_view_type::device_type device3;
-
-  typedef typename KernelHandle::HandleExecSpace MyExecSpace;
-
-  typedef typename viennacl::compressed_matrix<value_type>::handle_type it;
-  typedef typename viennacl::compressed_matrix<value_type>::value_type vt;
-
-  if ((std::is_same<idx, int>::value &&
-       std::is_same<typename KernelHandle::size_type, int>::value) ||
-      (std::is_same<idx, unsigned int>::value &&
-       std::is_same<typename KernelHandle::size_type, unsigned int>::value) ||
-      (std::is_same<idx, it>::value &&
-       std::is_same<typename KernelHandle::size_type, it>::value)) {
-    unsigned int *a_xadj = (unsigned int *)row_mapA.data();
-    unsigned int *b_xadj = (unsigned int *)row_mapB.data();
-    unsigned int *c_xadj = (unsigned int *)row_mapC.data();
-
-    unsigned int *a_adj = (unsigned int *)entriesA.data();
-    unsigned int *b_adj = (unsigned int *)entriesB.data();
-    unsigned int *c_adj = (unsigned int *)entriesC.data();
-
-    int nnzA = entriesA.extent(0);
-    int nnzB = entriesB.extent(0);
-
-    value_type *a_ew = valuesA.data();
-    value_type *b_ew = valuesB.data();
-    value_type *c_ew = valuesC.data();
-
-    /*
-      std::cout << "create a" << std::endl;
-      std::cout << "m:" << m << " n:" << n << std::endl;
-      std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] <<
-      std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] <<
-      " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
-     */
-
-    Kokkos::Timer timerset;
-    if (verbose) std::cout << "viennacl matrix create begins here" << std::endl;
-#ifdef VIENNACL_WITH_CUDA
-    viennacl::compressed_matrix<value_type> A(
-        a_xadj, a_adj, a_ew, viennacl::CUDA_MEMORY, m, n, nnzA);
-    viennacl::compressed_matrix<value_type> B(
-        b_xadj, b_adj, b_ew, viennacl::CUDA_MEMORY, n, k, nnzB);
-#else
-    viennacl::compressed_matrix<value_type> A(
-        a_xadj, a_adj, a_ew, viennacl::MAIN_MEMORY, m, n, nnzA);
-    viennacl::compressed_matrix<value_type> B(
-        b_xadj, b_adj, b_ew, viennacl::MAIN_MEMORY, n, k, nnzB);
-
-#endif
-    // viennacl::compressed_matrix<value_type> A;
-    // viennacl::compressed_matrix<value_type> B;
-    // A.set(a_xadj, a_adj, a_ew, m, n, nnzA);
-    // B.set(b_xadj, b_adj, b_ew, n, k, nnzB);
-    if (verbose)
-      std::cout << "VIENNACL compress matrix create:" << timerset.seconds()
-                << std::endl;
-
-    Kokkos::Timer timer1;
-    viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B);
-    MyExecSpace().fence();
-
-    if (verbose)
-      std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds()
-                << std::endl;
-
-    {
-      unsigned int c_rows = m, c_cols = k, cnnz = C.nnz();
-
-#ifdef VIENNACL_WITH_CUDA
-      value_type const *values = viennacl::cuda_arg<value_type>(C.handle());
-      unsigned int const *rows_start =
-          viennacl::cuda_arg<unsigned int>(C.handle1());
-      unsigned int const *columns =
-          viennacl::cuda_arg<unsigned int>(C.handle2());
-#else
-      value_type const *values =
-          viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(
-              C.handle());
-      unsigned int const *rows_start =
-          viennacl::linalg::host_based::detail::extract_raw_pointer<
-              unsigned int>(C.handle1());
-      unsigned int const *columns =
-          viennacl::linalg::host_based::detail::extract_raw_pointer<
-              unsigned int>(C.handle2());
-
-#endif
-
-      {
-        Kokkos::Timer copy_time;
-        // row_mapC = typename
-        // cin_row_index_view_type::non_const_type(Kokkos::view_alloc(Kokkos::WithoutInitializing,
-        // "rowmapC"), c_rows + 1);
-        entriesC = typename cin_nonzero_index_view_type::non_const_type(
-            Kokkos::view_alloc(Kokkos::WithoutInitializing, "EntriesC"), cnnz);
-        valuesC = typename cin_nonzero_value_view_type::non_const_type(
-            Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), cnnz);
-        MyExecSpace().fence();
-        KokkosKernels::Impl::copy_vector<
-            unsigned int const *,
-            typename cin_row_index_view_type::non_const_type, MyExecSpace>(
-            m + 1, rows_start, row_mapC);
-        KokkosKernels::Impl::copy_vector<
-            unsigned int const *,
-            typename cin_nonzero_index_view_type::non_const_type, MyExecSpace>(
-            cnnz, columns, entriesC);
-        KokkosKernels::Impl::copy_vector<
-            value_type const *,
-            typename cin_nonzero_value_view_type::non_const_type, MyExecSpace>(
-            cnnz, values, valuesC);
-        MyExecSpace().fence();
-
-        double copy_time_d = copy_time.seconds();
-        if (verbose)
-          std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl;
-      }
-    }
-  } else {
-    throw std::runtime_error(
-        "VIENNACL requires local ordinals to be integer.\n");
-  }
-#else
-  (void)handle;
-  (void)m;
-  (void)n;
-  (void)k;
-  (void)row_mapA;
-  (void)row_mapB;
-  (void)row_mapC;
-  (void)entriesA;
-  (void)entriesB;
-  (void)entriesC;
-  (void)valuesA;
-  (void)valuesB;
-  (void)valuesC;
-  (void)transposeA;
-  (void)transposeB;
-  (void)verbose;
-  throw std::runtime_error("VIENNACL IS NOT DEFINED\n");
-  // return;
-#endif
-}
-}  // namespace Impl
-}  // namespace KokkosSparse
-
-#endif
diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp
index aabe0069be..ec809da194 100644
--- a/sparse/src/KokkosKernels_Controls.hpp
+++ b/sparse/src/KokkosKernels_Controls.hpp
@@ -62,7 +62,7 @@
 #endif
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-#include "rocsparse.h"
+#include <rocsparse/rocsparse.h>
 #endif
 
 namespace KokkosKernels {
diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp
index 2616d8fbb7..f01e238d5d 100644
--- a/sparse/src/KokkosSparse_Utils.hpp
+++ b/sparse/src/KokkosSparse_Utils.hpp
@@ -2075,6 +2075,105 @@ struct MatrixConverter<SparseMatrixFormat::BSR> {
   }
 };
 
+template <typename Rowmap, typename Entries>
+struct CountEntriesFallingEdges {
+  using size_type = typename Rowmap::non_const_value_type;
+
+  CountEntriesFallingEdges(const Entries &entries_) : entries(entries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(size_type i,
+                                         size_type &numFallingEdges) const {
+    if (entries(i) > entries(i + 1)) numFallingEdges++;
+  }
+
+  Entries entries;
+};
+
+template <typename Rowmap, typename Entries>
+struct CountRowBoundaryFallingEdges {
+  using size_type    = typename Rowmap::non_const_value_type;
+  using ordinal_type = typename Entries::non_const_value_type;
+
+  CountRowBoundaryFallingEdges(const Rowmap &rowmap_, const Entries &entries_)
+      : rowmap(rowmap_), entries(entries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(
+      ordinal_type i, size_type &numBoundaryFallingEdges) const {
+    // Comparing the entries at end of row i, and beginning of row i+1
+    size_type rowBegin = rowmap(i);
+    size_type rowEnd   = rowmap(i + 1);
+    // But skip this row if empty (meaning there is no last entry), because
+    // there would be double-counting.
+    if (rowBegin == rowEnd) return;
+    // rowEnd is also the beginning of the next (nonempty) row.
+    // But if it points the end of all entries, skip this row because it's the
+    // last nonempty row.
+    if (rowEnd == size_type(entries.extent(0))) return;
+    if (entries(rowEnd - 1) > entries(rowEnd)) numBoundaryFallingEdges++;
+  }
+
+  Rowmap rowmap;
+  Entries entries;
+};
+
+// Efficient test for whether a StaticCrsGraph has sorted rows
+//(parallel and not affected by imbalanced rows).
+// Unmerged/repeated entries in a row are still considered sorted.
+template <typename Rowmap, typename Entries>
+bool isCrsGraphSorted(const Rowmap &rowmap, const Entries &entries) {
+  using size_type    = typename Rowmap::non_const_value_type;
+  using ordinal_type = typename Entries::non_const_value_type;
+  using exec_space   = typename Entries::execution_space;
+  size_type nnz      = entries.extent(0);
+  // Catch case of zero rows, and zero-length rowmap
+  if (rowmap.extent(0) == size_t(0)) return true;
+  // Eliminate cases with zero rows/cols/entries.
+  // This also eliminates cases where row_map is extent 0.
+  if (nnz == size_type(0)) return true;
+  ordinal_type numRows = rowmap.extent(0) - 1;
+  // A "falling edge" is where entry i is greater than entry i+1.
+  // A graph is unsorted if and only if there is a falling edge where i, i+1 are
+  // in the same row. So count the total falling edges, and then subtract the
+  // falling edges which cross row boundaries.
+  size_type totalFallingEdges = 0;
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<exec_space>(0, nnz - 1),
+                          CountEntriesFallingEdges<Rowmap, Entries>(entries),
+                          totalFallingEdges);
+  size_type rowBoundaryFallingEdges = 0;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<exec_space>(0, numRows - 1),
+      CountRowBoundaryFallingEdges<Rowmap, Entries>(rowmap, entries),
+      rowBoundaryFallingEdges);
+  return totalFallingEdges == rowBoundaryFallingEdges;
+}
+
+template <typename Rowmap, typename Entries, typename Values>
+void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn,
+                       const Entries &entriesIn, const Values &valuesIn) {
+  auto rowmap =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmapIn);
+  auto entries =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entriesIn);
+  auto values =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), valuesIn);
+  size_t nnz = entries.extent(0);
+  if (nnz != values.extent(0))
+    throw std::runtime_error(
+        "Matrix entries/values views have different lengths");
+  if ((m == 0 && rowmap.extent(0) > size_t(1)) ||
+      (rowmap.extent(0) != size_t(m + 1)))
+    throw std::runtime_error("Matrix rowmap has wrong length");
+  if (m && nnz != rowmap(m))
+    throw std::runtime_error("Matrix rowmap final entry doesn't match nnz");
+  for (int i = 0; i < m; i++) {
+    if (rowmap(i) > rowmap(i + 1))
+      throw std::runtime_error("Matrix rowmap not ascending");
+  }
+  for (size_t i = 0; i < size_t(nnz); i++) {
+    if (entries(i) >= n) throw std::runtime_error("Matrix entry out of bounds");
+  }
+}
+
 }  // namespace Impl
 }  // namespace KokkosSparse
 
diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp
index 4637aab5a4..780cb41189 100644
--- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp
+++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp
@@ -46,7 +46,7 @@
 #define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-#include "rocsparse.h"
+#include "rocsparse/rocsparse.h"
 
 namespace KokkosSparse {
 namespace Impl {
@@ -177,6 +177,21 @@ inline rocsparse_datatype rocsparse_compute_type<Kokkos::complex<double>>() {
   return rocsparse_datatype_f64_c;
 }
 
+template <typename Scalar>
+struct kokkos_to_rocsparse_type {
+  using type = Scalar;
+};
+
+template <>
+struct kokkos_to_rocsparse_type<Kokkos::complex<float>> {
+  using type = rocsparse_float_complex;
+};
+
+template <>
+struct kokkos_to_rocsparse_type<Kokkos::complex<double>> {
+  using type = rocsparse_double_complex;
+};
+
 }  // namespace Impl
 
 }  // namespace KokkosSparse
diff --git a/sparse/src/KokkosSparse_spgemm.hpp b/sparse/src/KokkosSparse_spgemm.hpp
index 0cee2979a2..0fdf6bc646 100644
--- a/sparse/src/KokkosSparse_spgemm.hpp
+++ b/sparse/src/KokkosSparse_spgemm.hpp
@@ -150,6 +150,25 @@ void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode,
       B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim);
 }
 
+template <class CMatrix, class AMatrix, class BMatrix>
+CMatrix spgemm(const AMatrix& A, const bool Amode, const BMatrix& B,
+               const bool Bmode) {
+  using device_t  = typename CMatrix::device_type;
+  using scalar_t  = typename CMatrix::value_type;
+  using ordinal_t = typename CMatrix::ordinal_type;
+  using size_type = typename CMatrix::size_type;
+  using KKH       = KokkosKernels::Experimental::KokkosKernelsHandle<
+      size_type, ordinal_t, scalar_t, typename device_t::execution_space,
+      typename device_t::memory_space, typename device_t::memory_space>;
+  KKH kh;
+  kh.create_spgemm_handle();
+  CMatrix C;
+  spgemm_symbolic(kh, A, Amode, B, Bmode, C);
+  spgemm_numeric(kh, A, Amode, B, Bmode, C);
+  kh.destroy_spgemm_handle();
+  return C;
+}
+
 }  // namespace KokkosSparse
 
 #endif
diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp
index 9f89083957..8d9ebc9ed7 100644
--- a/sparse/src/KokkosSparse_spgemm_handle.hpp
+++ b/sparse/src/KokkosSparse_spgemm_handle.hpp
@@ -46,6 +46,7 @@
 
 #include <KokkosKernels_config.h>
 #include <KokkosKernels_Controls.hpp>
+#include <KokkosSparse_Utils.hpp>
 #include <Kokkos_Core.hpp>
 #include <iostream>
 #include <string>
@@ -59,6 +60,10 @@
 #include "KokkosSparse_Utils_cusparse.hpp"
 #endif
 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+#include "KokkosSparse_Utils_mkl.hpp"
+#endif
+
 namespace KokkosSparse {
 
 // TODO:SPGEMM_KK_MEMORY2 option is for testing in openmp.
@@ -69,12 +74,14 @@ enum SPGEMMAlgorithm {
   SPGEMM_KK_DENSE,
   SPGEMM_KK_MEMORY,
   SPGEMM_KK_LP,  // KKVARIANTS
-  SPGEMM_CUSPARSE,
-  SPGEMM_CUSP,
-  SPGEMM_MKL,
-  SPGEMM_MKL2PHASE,
-  SPGEMM_VIENNA,  // TPLS
-  SPGEMM_ROCSPARSE,
+  SPGEMM_CUSPARSE [[deprecated("cuSPARSE is now used automatically in all "
+                               "supported SpGEMM calls, if enabled.")]],
+  SPGEMM_MKL [[deprecated("MKL is now used automatically in all supported "
+                          "SpGEMM calls, if enabled.")]],
+  SPGEMM_MKL2PHASE [[deprecated("MKL is now used automatically in all "
+                                "supported SpGEMM calls, if enabled.")]],
+  SPGEMM_ROCSPARSE [[deprecated("rocSPARSE is now used automatically in all "
+                                "supported SpGEMM calls, if enabled.")]],
 
   // TRIANGLE COUNTING SPECIALIZED
   SPGEMM_KK_TRIANGLE_AI,  // SPGEMM_KK_TRIANGLE_DEFAULT, SPGEMM_KK_TRIANGLE_MEM,
@@ -158,7 +165,6 @@ class SPGEMMHandle {
     rocsparse_mat_info info_C;
     size_t bufferSize;
     void *buffer;
-    bool C_populated;
 
     rocSparseSpgemmHandleType(bool transposeA, bool transposeB) {
       opA =
@@ -166,9 +172,8 @@ class SPGEMMHandle {
       opB =
           transposeB ? rocsparse_operation_transpose : rocsparse_operation_none;
 
-      bufferSize  = 0;
-      buffer      = NULL;
-      C_populated = false;
+      bufferSize = 0;
+      buffer     = nullptr;
       KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_A));
       KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_B));
       KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_C));
@@ -205,8 +210,6 @@ class SPGEMMHandle {
     size_t bufferSize3, bufferSize4, bufferSize5;
     void *buffer3, *buffer4, *buffer5;
 
-    bool C_populated;
-
     cuSparseSpgemmHandleType(bool transposeA, bool transposeB) {
       opA = transposeA ? CUSPARSE_OPERATION_TRANSPOSE
                        : CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -216,95 +219,75 @@ class SPGEMMHandle {
 
       alg         = CUSPARSE_SPGEMM_DEFAULT;
       bufferSize3 = bufferSize4 = bufferSize5 = 0;
-      buffer3 = buffer4 = buffer5 = NULL;
-      C_populated                 = false;
+      buffer3 = buffer4 = buffer5 = nullptr;
 
       cusparseHandle = kkControls.getCusparseHandle();
       KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_createDescr(&spgemmDescr));
     }
 
     ~cuSparseSpgemmHandleType() {
-      cusparseDestroySpMat(descr_A);
-      cusparseDestroySpMat(descr_B);
-      cusparseDestroySpMat(descr_C);
-      cusparseSpGEMM_destroyDescr(spgemmDescr);
-      cudaFree(buffer3);
-      cudaFree(buffer4);
-      cudaFree(buffer5);
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_A));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_B));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_C));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_destroyDescr(spgemmDescr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer3));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer4));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer5));
     }
   };
 #else
   struct cuSparseSpgemmHandleType {
-    cusparseHandle_t handle;
-    cusparseOperation_t transA;
-    cusparseOperation_t transB;
-    cusparseMatDescr_t a_descr;
-    cusparseMatDescr_t b_descr;
-    cusparseMatDescr_t c_descr;
-    cuSparseSpgemmHandleType(bool transposeA, bool transposeB) {
-      cusparseStatus_t status;
-      status = cusparseCreate(&handle);
-      if (status != CUSPARSE_STATUS_SUCCESS) {
-        throw std::runtime_error("cusparseCreate ERROR\n");
-        // return;
-      }
-      cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
-
-      if (transposeA) {
-        transA = CUSPARSE_OPERATION_TRANSPOSE;
-      } else {
-        transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-      }
-      if (transposeB) {
-        transB = CUSPARSE_OPERATION_TRANSPOSE;
-      } else {
-        transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
-      }
-
-      status = cusparseCreateMatDescr(&a_descr);
-      if (status != CUSPARSE_STATUS_SUCCESS) {
-        throw std::runtime_error("cusparseCreateMatDescr a_descr ERROR\n");
-        // return;
-      }
-      cusparseSetMatType(a_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-      cusparseSetMatIndexBase(a_descr, CUSPARSE_INDEX_BASE_ZERO);
-
-      status = cusparseCreateMatDescr(&b_descr);
-      if (status != CUSPARSE_STATUS_SUCCESS) {
-        throw std::runtime_error("cusparseCreateMatDescr b_descr ERROR\n");
-        // return;
-      }
-      cusparseSetMatType(b_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-      cusparseSetMatIndexBase(b_descr, CUSPARSE_INDEX_BASE_ZERO);
-
-      status = cusparseCreateMatDescr(&c_descr);
-      if (status != CUSPARSE_STATUS_SUCCESS) {
-        throw std::runtime_error("cusparseCreateMatDescr  c_descr ERROR\n");
-        // return;
-      }
-      cusparseSetMatType(c_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-      cusparseSetMatIndexBase(c_descr, CUSPARSE_INDEX_BASE_ZERO);
+    cusparseHandle_t cusparseHandle;
+    // Descriptor for any general matrix with index base 0
+    cusparseMatDescr_t generalDescr;
+
+    cuSparseSpgemmHandleType(bool /* transposeA */, bool /* transposeB */) {
+      KokkosKernels::Experimental::Controls kkControls;
+      // Get singleton cusparse handle from default controls
+      cusparseHandle = kkControls.getCusparseHandle();
+
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&generalDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(
+          cusparseSetMatType(generalDescr, CUSPARSE_MATRIX_TYPE_GENERAL));
+      KOKKOS_CUSPARSE_SAFE_CALL(
+          cusparseSetMatIndexBase(generalDescr, CUSPARSE_INDEX_BASE_ZERO));
     }
     ~cuSparseSpgemmHandleType() {
-      cusparseDestroyMatDescr(a_descr);
-      cusparseDestroyMatDescr(b_descr);
-      cusparseDestroyMatDescr(c_descr);
-      cusparseDestroy(handle);
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(generalDescr));
     }
   };
 #endif
 #endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+  struct mklSpgemmHandleType {
+    // Allow mkl_sparse_sp2m (in SPARSE_STAGE_NNZ_COUNT mode) to construct C.
+    // Then this assumes ownership of it and will destroy it later.
+    mklSpgemmHandleType(const sparse_matrix_t &C_) : C(C_) {}
+
+    ~mklSpgemmHandleType() {
+      KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(C));
+    }
+
+    sparse_matrix_t C;
+  };
+#endif
+
  private:
   SPGEMMAlgorithm algorithm_type;
   SPGEMMAccumulator accumulator_type;
   size_type result_nnz_size;
 
   bool called_symbolic;
+  bool computed_rowptrs;
+  bool computed_rowflops;
+  bool computed_entries;
   bool called_numeric;
 
   int suggested_vector_size;
   int suggested_team_size;
-  nnz_lno_t max_nnz_inresult;
+  nnz_lno_t max_nnz_inresult;  // C max nonzeros per row.
+  bool computed_max_nnz_inresult;
   nnz_lno_t max_nnz_compressed_result;
 
   size_type compressed_b_size;
@@ -330,7 +313,6 @@ class SPGEMMHandle {
   bool create_lower_triangular;
   int sort_lower_triangular;  // 0 - do not sort // 1 - sort // 2 - Algorithm
                               // decides (default)
-  int sort_option;
   nnz_lno_persistent_work_view_t lower_triangular_permutation;
 
   row_lno_persistent_work_view_t lower_triangular_matrix_rowmap;
@@ -402,6 +384,13 @@ class SPGEMMHandle {
  public:
 #endif
 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+ private:
+  mklSpgemmHandleType *mkl_spgemm_handle;
+
+ public:
+#endif
+
   void set_c_column_indices(nnz_lno_temp_work_view_t c_col_indices_) {
     this->c_column_indices = c_col_indices_;
   }
@@ -464,17 +453,11 @@ class SPGEMMHandle {
     compressed_c_rowmap_ = compressed_c_rowmap;
   }
 
-  // TODO: store transpose here.
-  void get_c_transpose_symbolic() {}
-
   void set_sort_lower_triangular(int option) {
     this->sort_lower_triangular = option;
   }
   int get_sort_lower_triangular() { return this->sort_lower_triangular; }
 
-  void set_sort_option(int option) { this->sort_option = option; }
-  int get_sort_option() { return this->sort_option; }
-
   void set_create_lower_triangular(bool option) {
     this->create_lower_triangular = option;
   }
@@ -529,10 +512,14 @@ class SPGEMMHandle {
         accumulator_type(SPGEMM_ACC_DEFAULT),
         result_nnz_size(0),
         called_symbolic(false),
+        computed_rowptrs(false),
+        computed_rowflops(false),
+        computed_entries(false),
         called_numeric(false),
         suggested_vector_size(0),
         suggested_team_size(0),
         max_nnz_inresult(0),
+        computed_max_nnz_inresult(false),
         c_column_indices(),
         tranpose_a_xadj(),
         tranpose_b_xadj(),
@@ -553,7 +540,6 @@ class SPGEMMHandle {
 
         create_lower_triangular(false),
         sort_lower_triangular(2),
-        sort_option(-1),
         lower_triangular_permutation(),
         lower_triangular_matrix_rowmap(),
         lower_triangular_matrix_entries(),
@@ -581,11 +567,15 @@ class SPGEMMHandle {
         is_compression_single_step(false)
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
         ,
-        rocsparse_spgemm_handle(NULL)
+        rocsparse_spgemm_handle(nullptr)
 #endif
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
         ,
-        cusparse_spgemm_handle(NULL)
+        cusparse_spgemm_handle(nullptr)
+#endif
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+        ,
+        mkl_spgemm_handle(nullptr)
 #endif
   {
     if (gs == SPGEMM_DEFAULT) {
@@ -599,6 +589,9 @@ class SPGEMMHandle {
 #endif
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
     this->destroy_cusparse_spgemm_handle();
+#endif
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+    this->destroy_mkl_spgemm_handle();
 #endif
   };
 
@@ -614,9 +607,9 @@ class SPGEMMHandle {
   }
 
   void destroy_rocsparse_spgemm_handle() {
-    if (this->rocsparse_spgemm_handle != NULL) {
+    if (this->rocsparse_spgemm_handle != nullptr) {
       delete this->rocsparse_spgemm_handle;
-      this->rocsparse_spgemm_handle = NULL;
+      this->rocsparse_spgemm_handle = nullptr;
     }
   }
 
@@ -628,9 +621,9 @@ class SPGEMMHandle {
     this->cusparse_spgemm_handle = new cuSparseSpgemmHandleType(transA, transB);
   }
   void destroy_cusparse_spgemm_handle() {
-    if (this->cusparse_spgemm_handle != NULL) {
+    if (this->cusparse_spgemm_handle != nullptr) {
       delete this->cusparse_spgemm_handle;
-      this->cusparse_spgemm_handle = NULL;
+      this->cusparse_spgemm_handle = nullptr;
     }
   }
 
@@ -639,6 +632,23 @@ class SPGEMMHandle {
   }
 #endif
 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+  void create_mkl_spgemm_handle(sparse_matrix_t C) {
+    this->destroy_mkl_spgemm_handle();
+    this->mkl_spgemm_handle = new mklSpgemmHandleType(C);
+  }
+  void destroy_mkl_spgemm_handle() {
+    if (this->mkl_spgemm_handle != nullptr) {
+      delete this->mkl_spgemm_handle;
+      this->mkl_spgemm_handle = nullptr;
+    }
+  }
+
+  mklSpgemmHandleType *get_mkl_spgemm_handle() {
+    return this->mkl_spgemm_handle;
+  }
+#endif
+
   void choose_default_algorithm() {
 #if defined(KOKKOS_ENABLE_SERIAL)
     if (std::is_same<Kokkos::Serial, ExecutionSpace>::value) {
@@ -672,29 +682,20 @@ class SPGEMMHandle {
 
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::Cuda, ExecutionSpace>::value) {
-#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
-      this->algorithm_type = SPGEMM_CUSPARSE;
-#ifdef VERBOSE
-      std::cout << "Cuda Execution Space, Default Algorithm: SPGEMM_CUSPARSE"
-                << std::endl;
-#endif
-#else
       this->algorithm_type = SPGEMM_KK;
 #ifdef VERBOSE
-      std::cout << "Cuda Execution Space, without cuSPARSE, Default Algorithm: "
-                   "SPGEMM_KK"
+      std::cout << "Cuda Execution Space, Default Algorithm: SPGEMM_KK"
                 << std::endl;
-#endif
 #endif
     }
 #endif
 
 #if defined(KOKKOS_ENABLE_HIP)
     if (std::is_same<Kokkos::Experimental::HIP, ExecutionSpace>::value) {
-#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-      this->algorithm_type = SPGEMM_ROCSPARSE;
-#else
       this->algorithm_type = SPGEMM_KK;
+#ifdef VERBOSE
+      std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK"
+                << std::endl;
 #endif
     }
 #endif
@@ -727,9 +728,21 @@ class SPGEMMHandle {
   SPGEMMAlgorithm get_algorithm_type() const { return this->algorithm_type; }
 
   bool is_symbolic_called() { return this->called_symbolic; }
+  bool are_rowptrs_computed() { return this->computed_rowptrs; }
+  bool are_rowflops_computed() { return this->computed_rowflops; }
+  bool are_entries_computed() { return this->computed_entries; }
   bool is_numeric_called() { return this->called_numeric; }
 
-  nnz_lno_t get_max_result_nnz() const { return this->max_nnz_inresult; }
+  template <typename c_row_view_t>
+  nnz_lno_t get_max_result_nnz(const c_row_view_t &row_mapC) {
+    if (!this->computed_max_nnz_inresult) {
+      this->max_nnz_inresult =
+          KokkosSparse::Impl::graph_max_degree<HandleExecSpace, size_type,
+                                               c_row_view_t>(row_mapC);
+      this->computed_max_nnz_inresult = true;
+    }
+    return this->max_nnz_inresult;
+  }
 
   nnz_lno_t get_max_compresed_result_nnz() const {
     return this->max_nnz_compressed_result;
@@ -740,10 +753,14 @@ class SPGEMMHandle {
     this->algorithm_type = sgs_algo;
   }
   void set_call_symbolic(bool call = true) { this->called_symbolic = call; }
+  void set_computed_rowptrs() { this->computed_rowptrs = true; }
+  void set_computed_rowflops() { this->computed_rowflops = true; }
+  void set_computed_entries() { this->computed_entries = true; }
   void set_call_numeric(bool call = true) { this->called_numeric = call; }
 
-  void set_max_result_nnz(nnz_lno_t num_result_nnz_) {
-    this->max_nnz_inresult = num_result_nnz_;
+  void set_max_result_nnz(nnz_lno_t nz) {
+    this->max_nnz_inresult          = nz;
+    this->computed_max_nnz_inresult = true;
   }
 
   void set_max_compresed_result_nnz(nnz_lno_t num_result_nnz_) {
@@ -804,21 +821,26 @@ inline SPGEMMAlgorithm StringToSPGEMMAlgorithm(std::string &name) {
     return SPGEMM_KK_LP;
   else if (name == "SPGEMM_KK_MEMSPEED")
     return SPGEMM_KK;
-
   else if (name == "SPGEMM_DEBUG")
     return SPGEMM_SERIAL;
   else if (name == "SPGEMM_SERIAL")
     return SPGEMM_SERIAL;
   else if (name == "SPGEMM_CUSPARSE")
-    return SPGEMM_CUSPARSE;
-  else if (name == "SPGEMM_ROCSPARSE")
-    return SPGEMM_ROCSPARSE;
-  else if (name == "SPGEMM_CUSP")
-    return SPGEMM_CUSP;
+    throw std::runtime_error(
+        "Enum value SPGEMM_CUSPARSE is deprecated. cuSPARSE is automatically "
+        "used in all supported SpGEMM calls.");
   else if (name == "SPGEMM_MKL")
-    return SPGEMM_MKL;
-  else if (name == "SPGEMM_VIENNA")
-    return SPGEMM_VIENNA;
+    throw std::runtime_error(
+        "Enum value SPGEMM_MKL is deprecated. MKL is automatically used in all "
+        "supported SpGEMM calls.");
+  else if (name == "SPGEMM_MKL2PHASE")
+    throw std::runtime_error(
+        "Enum value SPGEMM_MKL2PHASE is deprecated. MKL is automatically used "
+        "in all supported SpGEMM calls.");
+  else if (name == "SPGEMM_ROCSPARSE")
+    throw std::runtime_error(
+        "Enum value SPGEMM_ROCSPARSE is deprecated. rocSPARSE is automatically "
+        "used in all supported SpGEMM calls.");
   else
     throw std::runtime_error("Invalid SPGEMMAlgorithm name");
 }
diff --git a/sparse/src/KokkosSparse_spgemm_numeric.hpp b/sparse/src/KokkosSparse_spgemm_numeric.hpp
index 313922dc62..edb4b006ed 100644
--- a/sparse/src/KokkosSparse_spgemm_numeric.hpp
+++ b/sparse/src/KokkosSparse_spgemm_numeric.hpp
@@ -148,10 +148,6 @@ void spgemm_numeric(KernelHandle *handle,
         "If you need this case please let kokkos-kernels developers know.\n");
   }
 
-  if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 ||
-      entriesB.extent(0) < 1)
-    return;
-
   typedef typename KernelHandle::const_size_type c_size_t;
   typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
   typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t;
@@ -217,7 +213,7 @@ void spgemm_numeric(KernelHandle *handle,
 
   // static assert clno_row_view_t_ can be const type (row map is fixed after
   // symbolic phase).
-  typedef Kokkos::View<typename clno_row_view_t_::value_type *,
+  typedef Kokkos::View<typename clno_row_view_t_::const_value_type *,
                        typename KokkosKernels::Impl::GetUnifiedLayout<
                            clno_row_view_t_>::array_layout,
                        UniformDevice_t,  //       typename
@@ -247,7 +243,7 @@ void spgemm_numeric(KernelHandle *handle,
   Internal_blno_row_view_t_ const_b_r(row_mapB.data(), row_mapB.extent(0));
   Internal_blno_nnz_view_t_ const_b_l(entriesB.data(), entriesB.extent(0));
   Internal_bscalar_nnz_view_t_ const_b_s(valuesB.data(), valuesB.extent(0));
-  Internal_clno_row_view_t_ nonconst_c_r(row_mapC.data(), row_mapC.extent(0));
+  Internal_clno_row_view_t_ const_c_r(row_mapC.data(), row_mapC.extent(0));
   Internal_clno_nnz_view_t_ nonconst_c_l(entriesC.data(), entriesC.extent(0));
   Internal_cscalar_nnz_view_t_ nonconst_c_s(valuesC.data(), valuesC.extent(0));
 
@@ -262,27 +258,48 @@ void spgemm_numeric(KernelHandle *handle,
                                                        const_a_l, const_a_s,
                                                        transposeA, const_b_r,
                                                        const_b_l, const_b_s,
-                                                       transposeB, nonconst_c_r,
+                                                       transposeB, const_c_r,
                                                        nonconst_c_l,
                                                        nonconst_c_s);
     return;
   }
 
-  KokkosSparse::Impl::SPGEMM_NUMERIC<
-      const_handle_type,  // KernelHandle,
-      Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
-      Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_,
-      Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_,
-      Internal_clno_row_view_t_, Internal_clno_nnz_view_t_,
-      Internal_cscalar_nnz_view_t_>::spgemm_numeric(&tmp_handle,  // handle,
-                                                    m, n, k, const_a_r,
-                                                    const_a_l, const_a_s,
-                                                    transposeA, const_b_r,
-                                                    const_b_l, const_b_s,
-                                                    transposeB, nonconst_c_r,
-                                                    nonconst_c_l, nonconst_c_s);
+  auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type();
+
+  if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) {
+    // Never call a TPL if serial/debug is requested (this is needed for
+    // testing)
+    KokkosSparse::Impl::SPGEMM_NUMERIC<
+        const_handle_type,  // KernelHandle,
+        Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
+        Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_,
+        Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_,
+        Internal_clno_row_view_t_, Internal_clno_nnz_view_t_,
+        Internal_cscalar_nnz_view_t_,
+        false>::spgemm_numeric(&tmp_handle,  // handle,
+                               m, n, k, const_a_r, const_a_l, const_a_s,
+                               transposeA, const_b_r, const_b_l, const_b_s,
+                               transposeB, const_c_r, nonconst_c_l,
+                               nonconst_c_s);
+  } else {
+    KokkosSparse::Impl::SPGEMM_NUMERIC<
+        const_handle_type,  // KernelHandle,
+        Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
+        Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_,
+        Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_,
+        Internal_clno_row_view_t_, Internal_clno_nnz_view_t_,
+        Internal_cscalar_nnz_view_t_>::spgemm_numeric(&tmp_handle,  // handle,
+                                                      m, n, k, const_a_r,
+                                                      const_a_l, const_a_s,
+                                                      transposeA, const_b_r,
+                                                      const_b_l, const_b_s,
+                                                      transposeB, const_c_r,
+                                                      nonconst_c_l,
+                                                      nonconst_c_s);
+  }
 }
 
 }  // namespace Experimental
 }  // namespace KokkosSparse
+
 #endif
diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp
index 4c30016b50..a46c4161f1 100644
--- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp
+++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp
@@ -45,8 +45,8 @@
 #define _KOKKOS_SPGEMM_SYMBOLIC_HPP
 
 #include "KokkosKernels_helpers.hpp"
-
 #include "KokkosSparse_spgemm_symbolic_spec.hpp"
+#include "KokkosSparse_Utils.hpp"
 
 namespace KokkosSparse {
 
@@ -62,7 +62,7 @@ void spgemm_symbolic(KernelHandle *handle,
                      alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA,
                      bool transposeA, blno_row_view_t_ row_mapB,
                      blno_nnz_view_t_ entriesB, bool transposeB,
-                     clno_row_view_t_ row_mapC) {
+                     clno_row_view_t_ row_mapC, bool computeRowptrs = false) {
   static_assert(
       std::is_same<typename clno_row_view_t_::value_type,
                    typename clno_row_view_t_::non_const_value_type>::value,
@@ -163,18 +163,48 @@ void spgemm_symbolic(KernelHandle *handle,
   Internal_alno_nnz_view_t_ const_a_l(entriesA.data(), entriesA.extent(0));
   Internal_blno_row_view_t_ const_b_r(row_mapB.data(), row_mapB.extent(0));
   Internal_blno_nnz_view_t_ const_b_l(entriesB.data(), entriesB.extent(0));
-  Internal_clno_row_view_t_ const_c_r(row_mapC.data(), row_mapC.extent(0));
-
-  using namespace KokkosSparse::Impl;
-  SPGEMM_SYMBOLIC<
-      const_handle_type,  // KernelHandle,
-      Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
-      Internal_blno_row_view_t_, Internal_blno_nnz_view_t_,
-      Internal_clno_row_view_t_>::spgemm_symbolic(&tmp_handle,  // handle,
-                                                  m, n, k, const_a_r, const_a_l,
-                                                  transposeA, const_b_r,
-                                                  const_b_l, transposeB,
-                                                  const_c_r);
+  Internal_clno_row_view_t_ c_r(row_mapC.data(), row_mapC.extent(0));
+
+  // Verify that graphs A and B are sorted.
+  // This test is designed to be as efficient as possible, but still skip
+  // it in a release build.
+#ifndef NDEBUG
+  if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l))
+    throw std::runtime_error(
+        "KokkosSparse::spgemm_symbolic: entries of A are not sorted within "
+        "rows. May use KokkosSparse::sort_crs_matrix to sort it.");
+  if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l))
+    throw std::runtime_error(
+        "KokkosSparse::spgemm_symbolic: entries of B are not sorted within "
+        "rows. May use KokkosSparse::sort_crs_matrix to sort it.");
+#endif
+
+  auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type();
+
+  if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) {
+    // Never call a TPL if serial/debug is requested (this is needed for
+    // testing)
+    KokkosSparse::Impl::SPGEMM_SYMBOLIC<
+        const_handle_type,  // KernelHandle,
+        Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
+        Internal_blno_row_view_t_, Internal_blno_nnz_view_t_,
+        Internal_clno_row_view_t_,
+        false>::spgemm_symbolic(&tmp_handle,  // handle,
+                                m, n, k, const_a_r, const_a_l, transposeA,
+                                const_b_r, const_b_l, transposeB, c_r,
+                                computeRowptrs);
+  } else {
+    KokkosSparse::Impl::SPGEMM_SYMBOLIC<
+        const_handle_type,  // KernelHandle,
+        Internal_alno_row_view_t_, Internal_alno_nnz_view_t_,
+        Internal_blno_row_view_t_, Internal_blno_nnz_view_t_,
+        Internal_clno_row_view_t_>::spgemm_symbolic(&tmp_handle,  // handle,
+                                                    m, n, k, const_a_r,
+                                                    const_a_l, transposeA,
+                                                    const_b_r, const_b_l,
+                                                    transposeB, c_r,
+                                                    computeRowptrs);
+  }
 }
 
 }  // namespace Experimental
diff --git a/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp b/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp
index aef089fd06..eae0f941d3 100644
--- a/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp
+++ b/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp
@@ -66,7 +66,7 @@ struct CusparseSingleton {
 #endif
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
-#include <rocsparse.h>
+#include <rocsparse/rocsparse.h>
 
 namespace KokkosKernels {
 namespace Impl {
diff --git a/sparse/tpls/KokkosSparse_spgemm_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_jacobi_tpl_spec_avail.hpp
similarity index 76%
rename from sparse/tpls/KokkosSparse_spgemm_tpl_spec_avail.hpp
rename to sparse/tpls/KokkosSparse_spgemm_jacobi_tpl_spec_avail.hpp
index 2067ad2b38..c451d91cb4 100644
--- a/sparse/tpls/KokkosSparse_spgemm_tpl_spec_avail.hpp
+++ b/sparse/tpls/KokkosSparse_spgemm_jacobi_tpl_spec_avail.hpp
@@ -42,26 +42,11 @@
 //@HEADER
 */
 
-#ifndef KOKKOSPARSE_SPGEMM_TPL_SPEC_AVAIL_HPP_
-#define KOKKOSPARSE_SPGEMM_TPL_SPEC_AVAIL_HPP_
+#ifndef KOKKOSPARSE_SPGEMM_JACOBI_TPL_SPEC_AVAIL_HPP_
+#define KOKKOSPARSE_SPGEMM_JACOBI_TPL_SPEC_AVAIL_HPP_
 
 namespace KokkosSparse {
 namespace Impl {
-// Specialization struct which defines whether a specialization exists
-template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
-          class b_size_view_t_, class b_lno_view_t, class c_size_view_t_>
-struct spgemm_symbolic_tpl_spec_avail {
-  enum : bool { value = false };
-};
-
-// Specialization struct which defines whether a specialization exists
-template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
-          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
-          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
-          class c_scalar_view_t>
-struct spgemm_numeric_tpl_spec_avail {
-  enum : bool { value = false };
-};
 
 // Specialization struct which defines whether a specialization exists
 template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
diff --git a/sparse/tpls/KokkosSparse_spgemm_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_jacobi_tpl_spec_decl.hpp
similarity index 100%
rename from sparse/tpls/KokkosSparse_spgemm_tpl_spec_decl.hpp
rename to sparse/tpls/KokkosSparse_spgemm_jacobi_tpl_spec_decl.hpp
diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp
new file mode 100644
index 0000000000..b7e755f868
--- /dev/null
+++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp
@@ -0,0 +1,211 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_
+#define KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class a_scalar_view_t, class b_size_view_t_, class b_lno_view_t,
+          class b_scalar_view_t, class c_size_view_t_, class c_lno_view_t,
+          class c_scalar_view_t>
+struct spgemm_numeric_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+// NOTE: all versions of cuSPARSE 10.x and 11.x support exactly the same matrix
+// types, so there is no ifdef'ing on versions needed in avail. Offset and
+// Ordinal must both be 32-bit. Even though the "generic" API lets you specify
+// offsets and ordinals independently as either 16, 32 or 64-bit, SpGEMM will
+// just fail at runtime if you don't use 32 for both.
+
+#define SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, MEMSPACE)               \
+  template <>                                                         \
+  struct spgemm_numeric_tpl_spec_avail<                               \
+      KokkosKernels::Experimental::KokkosKernelsHandle<               \
+          const int, const int, const SCALAR, Kokkos::Cuda, MEMSPACE, \
+          MEMSPACE>,                                                  \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const SCALAR *, default_layout,                    \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const SCALAR *, default_layout,                    \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<int *, default_layout,                             \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<SCALAR *, default_layout,                          \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {      \
+    enum : bool { value = true };                                     \
+  };
+
+#define SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(SCALAR)            \
+  SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \
+  SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace)
+
+SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(float)
+SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(double)
+SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex<float>)
+SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex<double>)
+
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+#define SPGEMM_NUMERIC_AVAIL_ROCSPARSE(SCALAR)                               \
+  template <>                                                                \
+  struct spgemm_numeric_tpl_spec_avail<                                      \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                      \
+          const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace, \
+          Kokkos::HIPSpace>,                                                 \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const SCALAR *, default_layout,                           \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const SCALAR *, default_layout,                           \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<int *, default_layout,                                    \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<SCALAR *, default_layout,                                 \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {             \
+    enum : bool { value = true };                                            \
+  };
+
+SPGEMM_NUMERIC_AVAIL_ROCSPARSE(float)
+SPGEMM_NUMERIC_AVAIL_ROCSPARSE(double)
+SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex<float>)
+SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex<double>)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC)                         \
+  template <>                                                          \
+  struct spgemm_numeric_tpl_spec_avail<                                \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                \
+          const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \
+          Kokkos::HostSpace>,                                          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const SCALAR *, default_layout,                     \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const SCALAR *, default_layout,                     \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<int *, default_layout,                              \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<SCALAR *, default_layout,                           \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {       \
+    enum : bool { value = true };                                      \
+  };
+
+#define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC)                 \
+  SPGEMM_NUMERIC_AVAIL_MKL(float, EXEC)                  \
+  SPGEMM_NUMERIC_AVAIL_MKL(double, EXEC)                 \
+  SPGEMM_NUMERIC_AVAIL_MKL(Kokkos::complex<float>, EXEC) \
+  SPGEMM_NUMERIC_AVAIL_MKL(Kokkos::complex<double>, EXEC)
+
+#ifdef KOKKOS_ENABLE_SERIAL
+SPGEMM_NUMERIC_AVAIL_MKL_E(Kokkos::Serial)
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+SPGEMM_NUMERIC_AVAIL_MKL_E(Kokkos::OpenMP)
+#endif
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp
new file mode 100644
index 0000000000..5a0cdb3f24
--- /dev/null
+++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp
@@ -0,0 +1,672 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_DECL_HPP_
+#define KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_DECL_HPP_
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#include "cusparse.h"
+#include "KokkosSparse_Utils_cusparse.hpp"
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+#include "rocsparse/rocsparse.h"
+#include "KokkosSparse_Utils_rocsparse.hpp"
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+#include "KokkosSparse_Utils_mkl.hpp"
+#include "mkl_spblas.h"
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if (CUDA_VERSION >= 11040)
+
+// 11.4+ supports generic API with reuse (full symbolic/numeric separation)
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename ConstValuesType,
+          typename EntriesType, typename ValuesType>
+void spgemm_numeric_cusparse(
+    KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+    const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA,
+    const ConstValuesType &valuesA, const ConstRowMapType &row_mapB,
+    const ConstEntriesType &entriesB, const ConstValuesType &valuesB,
+    const ConstRowMapType &row_mapC, const EntriesType &entriesC,
+    const ValuesType &valuesC) {
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using size_type   = typename KernelHandle::size_type;
+  auto h            = handle->get_cusparse_spgemm_handle();
+
+  if (handle->get_c_nnz() == size_type(0)) {
+    // Handle empty C case. entriesC and valuesC have extent 0 so nothing needs
+    // to be done for them, but we must populate row_mapC to zeros if not
+    // already done.
+    if (!handle->are_rowptrs_computed()) {
+      Kokkos::View<size_type *, typename ConstRowMapType::device_type,
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      row_mapC_nonconst(const_cast<size_type *>(row_mapC.data()),
+                        row_mapC.extent(0));
+      Kokkos::deep_copy(typename KernelHandle::HandleExecSpace(),
+                        row_mapC_nonconst, size_type(0));
+      handle->set_computed_rowptrs();
+    }
+    handle->set_computed_entries();
+    handle->set_call_numeric();
+    return;
+  }
+
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_A, (void *)row_mapA.data(),
+                             (void *)entriesA.data(), (void *)valuesA.data()));
+
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_B, (void *)row_mapB.data(),
+                             (void *)entriesB.data(), (void *)valuesB.data()));
+
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_C, (void *)row_mapC.data(),
+                             (void *)entriesC.data(), (void *)valuesC.data()));
+
+  if (!handle->are_entries_computed()) {
+    if (!h->buffer5) {
+      // If symbolic was previously called with computeRowptrs=true, then
+      // buffer5 will have already been allocated to the correct size. Otherwise
+      // size and allocate it here.
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_copy(
+          h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+          h->alg, h->spgemmDescr, &h->bufferSize5, nullptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaMalloc((void **)&h->buffer5, h->bufferSize5));
+    }
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_copy(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &h->bufferSize5, h->buffer5));
+    handle->set_computed_rowptrs();
+    handle->set_computed_entries();
+  }
+
+  // C' = alpha * opA(A) * opB(B) + beta * C
+  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
+  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
+
+  // alpha, beta are on host, but since we use singleton on the cusparse
+  // handle, we save/restore the pointer mode to not interference with
+  // others' use
+  cusparsePointerMode_t oldPtrMode;
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseGetPointerMode(h->cusparseHandle, &oldPtrMode));
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseSetPointerMode(h->cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_compute(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, h->alg, h->spgemmDescr));
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseSetPointerMode(h->cusparseHandle, oldPtrMode));
+  handle->set_call_numeric();
+}
+
+#elif (CUDA_VERSION >= 11000)
+// 11.0-11.3 supports only the generic API, but not reuse.
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename ConstValuesType,
+          typename EntriesType, typename ValuesType>
+void spgemm_numeric_cusparse(
+    KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+    const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA,
+    const ConstValuesType &valuesA, const ConstRowMapType &row_mapB,
+    const ConstEntriesType &entriesB, const ConstValuesType &valuesB,
+    const ConstRowMapType &row_mapC, const EntriesType &entriesC,
+    const ValuesType &valuesC) {
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  auto h            = handle->get_cusparse_spgemm_handle();
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_A, (void *)row_mapA.data(),
+                             (void *)entriesA.data(), (void *)valuesA.data()));
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_B, (void *)row_mapB.data(),
+                             (void *)entriesB.data(), (void *)valuesB.data()));
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCsrSetPointers(h->descr_C, (void *)row_mapC.data(),
+                             (void *)entriesC.data(), (void *)valuesC.data()));
+  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
+  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
+      &h->bufferSize4, h->buffer4));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_copy(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr));
+  handle->set_computed_entries();
+  handle->set_call_numeric();
+}
+
+#else
+
+// Generic (using overloads) wrapper for cusparseXcsrgemm (where X is S, D, C,
+// or Z). Accepts Kokkos types (e.g. Kokkos::complex<float>) for Scalar and
+// handles casting to cuSparse types internally.
+
+#define CUSPARSE_XCSRGEMM_SPEC(KokkosType, CusparseType, Abbreviation)     \
+  inline cusparseStatus_t cusparseXcsrgemm(                                \
+      cusparseHandle_t handle, cusparseOperation_t transA,                 \
+      cusparseOperation_t transB, int m, int n, int k,                     \
+      const cusparseMatDescr_t descrA, const int nnzA,                     \
+      const KokkosType *csrSortedValA, const int *csrSortedRowPtrA,        \
+      const int *csrSortedColIndA, const cusparseMatDescr_t descrB,        \
+      const int nnzB, const KokkosType *csrSortedValB,                     \
+      const int *csrSortedRowPtrB, const int *csrSortedColIndB,            \
+      const cusparseMatDescr_t descrC, KokkosType *csrSortedValC,          \
+      const int *csrSortedRowPtrC, int *csrSortedColIndC) {                \
+    return cusparse##Abbreviation##csrgemm(                                \
+        handle, transA, transB, m, n, k, descrA, nnzA,                     \
+        reinterpret_cast<const CusparseType *>(csrSortedValA),             \
+        csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB,                  \
+        reinterpret_cast<const CusparseType *>(csrSortedValB),             \
+        csrSortedRowPtrB, csrSortedColIndB, descrC,                        \
+        reinterpret_cast<CusparseType *>(csrSortedValC), csrSortedRowPtrC, \
+        csrSortedColIndC);                                                 \
+  }
+
+CUSPARSE_XCSRGEMM_SPEC(float, float, S)
+CUSPARSE_XCSRGEMM_SPEC(double, double, D)
+CUSPARSE_XCSRGEMM_SPEC(Kokkos::complex<float>, cuComplex, C)
+CUSPARSE_XCSRGEMM_SPEC(Kokkos::complex<double>, cuDoubleComplex, Z)
+
+#undef CUSPARSE_XCSRGEMM_SPEC
+
+// 10.x supports the pre-generic interface.
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename ConstValuesType,
+          typename EntriesType, typename ValuesType>
+void spgemm_numeric_cusparse(
+    KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+    const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA,
+    const ConstValuesType &valuesA, const ConstRowMapType &row_mapB,
+    const ConstEntriesType &entriesB, const ConstValuesType &valuesB,
+    const ConstRowMapType &row_mapC, const EntriesType &entriesC,
+    const ValuesType &valuesC) {
+  auto h = handle->get_cusparse_spgemm_handle();
+
+  int nnzA = entriesA.extent(0);
+  int nnzB = entriesB.extent(0);
+
+  // Only call numeric if C actually has entries
+  if (handle->get_c_nnz()) {
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgemm(
+        h->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, m, k, n, h->generalDescr, nnzA,
+        valuesA.data(), row_mapA.data(), entriesA.data(), h->generalDescr, nnzB,
+        valuesB.data(), row_mapB.data(), entriesB.data(), h->generalDescr,
+        valuesC.data(), row_mapC.data(), entriesC.data()));
+  }
+  handle->set_computed_entries();
+  handle->set_call_numeric();
+}
+
+#endif
+
+#define SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL)              \
+  template <>                                                                  \
+  struct SPGEMM_NUMERIC<KokkosKernels::Experimental::KokkosKernelsHandle<      \
+                            const int, const int, const SCALAR, Kokkos::Cuda,  \
+                            MEMSPACE, MEMSPACE>,                               \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const SCALAR *, default_layout,           \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const SCALAR *, default_layout,           \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<int *, default_layout,                    \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<SCALAR *, default_layout,                 \
+                                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,   \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        true, TPL_AVAIL> {                                     \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<     \
+        const int, const int, const SCALAR, Kokkos::Cuda, MEMSPACE, MEMSPACE>; \
+    using c_int_view_t =                                                       \
+        Kokkos::View<const int *, default_layout,                              \
+                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                   \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using int_view_t = Kokkos::View<int *, default_layout,                     \
+                                    Kokkos::Device<Kokkos::Cuda, MEMSPACE>,    \
+                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;  \
+    using c_scalar_view_t =                                                    \
+        Kokkos::View<const SCALAR *, default_layout,                           \
+                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                   \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using scalar_view_t =                                                      \
+        Kokkos::View<SCALAR *, default_layout,                                 \
+                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                   \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    static void spgemm_numeric(KernelHandle *handle,                           \
+                               typename KernelHandle::nnz_lno_t m,             \
+                               typename KernelHandle::nnz_lno_t n,             \
+                               typename KernelHandle::nnz_lno_t k,             \
+                               c_int_view_t row_mapA, c_int_view_t entriesA,   \
+                               c_scalar_view_t valuesA, bool,                  \
+                               c_int_view_t row_mapB, c_int_view_t entriesB,   \
+                               c_scalar_view_t valuesB, bool,                  \
+                               c_int_view_t row_mapC, int_view_t entriesC,     \
+                               scalar_view_t valuesC) {                        \
+      std::string label = "KokkosSparse::spgemm_numeric[TPL_CUSPARSE," +       \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";           \
+      Kokkos::Profiling::pushRegion(label);                                    \
+      spgemm_numeric_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA,  \
+                              entriesA, valuesA, row_mapB, entriesB, valuesB,  \
+                              row_mapC, entriesC, valuesC);                    \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
+  };
+
+#define SPGEMM_NUMERIC_DECL_CUSPARSE_S(SCALAR, TPL_AVAIL)            \
+  SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, Kokkos::CudaSpace, TPL_AVAIL) \
+  SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace, TPL_AVAIL)
+
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(float, true)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(double, true)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(Kokkos::complex<float>, true)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(Kokkos::complex<double>, true)
+
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(float, false)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(double, false)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(Kokkos::complex<float>, false)
+SPGEMM_NUMERIC_DECL_CUSPARSE_S(Kokkos::complex<double>, false)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+//=============================================================================
+// Overload rocsparse_Xcsrgemm_numeric() over scalar types
+#define ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(scalar_type, TOKEN)                   \
+  inline rocsparse_status rocsparse_Xcsrgemm_numeric(                         \
+      rocsparse_handle handle, rocsparse_operation trans_A,                   \
+      rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,          \
+      rocsparse_int k, const scalar_type *alpha,                              \
+      const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,                 \
+      const scalar_type *csr_val_A, const rocsparse_int *csr_row_ptr_A,       \
+      const rocsparse_int *csr_col_ind_A, const rocsparse_mat_descr descr_B,  \
+      rocsparse_int nnz_B, const scalar_type *csr_val_B,                      \
+      const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, \
+      const scalar_type *beta, const rocsparse_mat_descr descr_D,             \
+      rocsparse_int nnz_D, const scalar_type *csr_val_D,                      \
+      const rocsparse_int *csr_row_ptr_D, const rocsparse_int *csr_col_ind_D, \
+      const rocsparse_mat_descr descr_C, rocsparse_int nnz_C,                 \
+      scalar_type *csr_val_C, const rocsparse_int *csr_row_ptr_C,             \
+      const rocsparse_int *csr_col_ind_C, const rocsparse_mat_info info_C,    \
+      void *buffer) {                                                         \
+    return rocsparse_##TOKEN##csrgemm_numeric(                                \
+        handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A, csr_val_A,  \
+        csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_val_B,              \
+        csr_row_ptr_B, csr_col_ind_B, beta, descr_D, nnz_D, csr_val_D,        \
+        csr_row_ptr_D, csr_col_ind_D, descr_C, nnz_C, csr_val_C,              \
+        csr_row_ptr_C, csr_col_ind_C, info_C, buffer);                        \
+  }
+
+ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(float, s)
+ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(double, d)
+ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(rocsparse_float_complex, c)
+ROCSPARSE_XCSRGEMM_NUMERIC_SPEC(rocsparse_double_complex, z)
+
+template <
+    typename KernelHandle, typename ain_row_index_view_type,
+    typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type,
+    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
+    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
+    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
+void spgemm_numeric_rocsparse(
+    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
+    ain_nonzero_value_view_type valuesA, bin_row_index_view_type rowptrB,
+    bin_nonzero_index_view_type colidxB, bin_nonzero_value_view_type valuesB,
+    cin_row_index_view_type rowptrC, cin_nonzero_index_view_type colidxC,
+    cin_nonzero_value_view_type valuesC) {
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using rocsparse_scalar_type =
+      typename kokkos_to_rocsparse_type<scalar_type>::type;
+
+  typename KernelHandle::rocSparseSpgemmHandleType *h =
+      handle->get_rocsparse_spgemm_handle();
+
+  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
+  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
+  rocsparse_pointer_mode oldPtrMode;
+
+  auto nnz_A = colidxA.extent(0);
+  auto nnz_B = colidxB.extent(0);
+  auto nnz_C = colidxC.extent(0);
+
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
+      rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode));
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(
+      h->rocsparseHandle, rocsparse_pointer_mode_host));
+
+  if (!handle->are_entries_computed()) {
+    KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_symbolic(
+        h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A,
+        rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(),
+        colidxB.data(), h->descr_D, 0, nullptr, nullptr, h->descr_C, nnz_C,
+        rowptrC.data(), colidxC.data(), h->info_C, h->buffer));
+    handle->set_computed_entries();
+  }
+
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_numeric(
+      h->rocsparseHandle, h->opA, h->opB, m, k, n,
+      reinterpret_cast<const rocsparse_scalar_type *>(&alpha), h->descr_A,
+      nnz_A, reinterpret_cast<const rocsparse_scalar_type *>(valuesA.data()),
+      rowptrA.data(), colidxA.data(), h->descr_B, nnz_B,
+      reinterpret_cast<const rocsparse_scalar_type *>(valuesB.data()),
+      rowptrB.data(), colidxB.data(),
+      reinterpret_cast<const rocsparse_scalar_type *>(&beta), h->descr_D, 0,
+      nullptr, nullptr, nullptr, h->descr_C, nnz_C,
+      reinterpret_cast<rocsparse_scalar_type *>(valuesC.data()), rowptrC.data(),
+      colidxC.data(), h->info_C, h->buffer));
+  // Restore old pointer mode
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
+      rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode));
+  handle->set_call_numeric();
+}
+
+#define SPGEMM_NUMERIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL)                       \
+  template <>                                                                  \
+  struct SPGEMM_NUMERIC<                                                       \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                        \
+          const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace,   \
+          Kokkos::HIPSpace>,                                                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const SCALAR *, default_layout,                             \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const SCALAR *, default_layout,                             \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<int *, default_layout,                                      \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<SCALAR *, default_layout,                                   \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,              \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      true, TPL_AVAIL> {                                                       \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<     \
+        const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace,     \
+        Kokkos::HIPSpace>;                                                     \
+    using c_int_view_t =                                                       \
+        Kokkos::View<const int *, default_layout,                              \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using int_view_t =                                                         \
+        Kokkos::View<int *, default_layout,                                    \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using c_scalar_view_t =                                                    \
+        Kokkos::View<const SCALAR *, default_layout,                           \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using scalar_view_t =                                                      \
+        Kokkos::View<SCALAR *, default_layout,                                 \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    static void spgemm_numeric(KernelHandle *handle,                           \
+                               typename KernelHandle::nnz_lno_t m,             \
+                               typename KernelHandle::nnz_lno_t n,             \
+                               typename KernelHandle::nnz_lno_t k,             \
+                               c_int_view_t row_mapA, c_int_view_t entriesA,   \
+                               c_scalar_view_t valuesA, bool,                  \
+                               c_int_view_t row_mapB, c_int_view_t entriesB,   \
+                               c_scalar_view_t valuesB, bool,                  \
+                               c_int_view_t row_mapC, int_view_t entriesC,     \
+                               scalar_view_t valuesC) {                        \
+      std::string label = "KokkosSparse::spgemm_numeric[TPL_ROCSPARSE," +      \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";           \
+      Kokkos::Profiling::pushRegion(label);                                    \
+      spgemm_numeric_rocsparse(handle->get_spgemm_handle(), m, n, k, row_mapA, \
+                               entriesA, valuesA, row_mapB, entriesB, valuesB, \
+                               row_mapC, entriesC, valuesC);                   \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
+  };
+
+SPGEMM_NUMERIC_DECL_ROCSPARSE(float, true)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(double, true)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(Kokkos::complex<float>, true)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(Kokkos::complex<double>, true)
+
+SPGEMM_NUMERIC_DECL_ROCSPARSE(float, false)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(double, false)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(Kokkos::complex<float>, false)
+SPGEMM_NUMERIC_DECL_ROCSPARSE(Kokkos::complex<double>, false)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+template <
+    typename KernelHandle, typename ain_row_index_view_type,
+    typename ain_nonzero_index_view_type, typename ain_nonzero_value_view_type,
+    typename bin_row_index_view_type, typename bin_nonzero_index_view_type,
+    typename bin_nonzero_value_view_type, typename cin_row_index_view_type,
+    typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type>
+void spgemm_numeric_mkl(
+    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
+    ain_nonzero_value_view_type valuesA, bin_row_index_view_type rowptrB,
+    bin_nonzero_index_view_type colidxB, bin_nonzero_value_view_type valuesB,
+    cin_row_index_view_type rowptrC, cin_nonzero_index_view_type colidxC,
+    cin_nonzero_value_view_type valuesC) {
+  using ExecSpace   = typename KernelHandle::HandleExecSpace;
+  using index_type  = typename KernelHandle::nnz_lno_t;
+  using size_type   = typename KernelHandle::size_type;
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using MKLMatrix   = MKLSparseMatrix<scalar_type>;
+  size_type c_nnz   = handle->get_c_nnz();
+  if (c_nnz == size_type(0)) {
+    handle->set_computed_entries();
+    handle->set_call_numeric();
+    return;
+  }
+  MKLMatrix A(m, n, const_cast<size_type *>(rowptrA.data()),
+              const_cast<index_type *>(colidxA.data()),
+              const_cast<scalar_type *>(valuesA.data()));
+  MKLMatrix B(n, k, const_cast<size_type *>(rowptrB.data()),
+              const_cast<index_type *>(colidxB.data()),
+              const_cast<scalar_type *>(valuesB.data()));
+  auto mklSpgemmHandle = handle->get_mkl_spgemm_handle();
+  bool computedEntries = false;
+  matrix_descr generalDescr;
+  generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL;
+  generalDescr.mode = SPARSE_FILL_MODE_FULL;
+  generalDescr.diag = SPARSE_DIAG_NON_UNIT;
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, A,
+                      SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, B,
+                      SPARSE_STAGE_FINALIZE_MULT_NO_VAL, &mklSpgemmHandle->C));
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, A,
+                      SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, B,
+                      SPARSE_STAGE_FINALIZE_MULT, &mklSpgemmHandle->C));
+  KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_order(mklSpgemmHandle->C));
+  MKLMatrix wrappedC(mklSpgemmHandle->C);
+  MKL_INT nrows = 0, ncols = 0;
+  MKL_INT *rowptrRaw     = nullptr;
+  MKL_INT *colidxRaw     = nullptr;
+  scalar_type *valuesRaw = nullptr;
+  wrappedC.export_data(nrows, ncols, rowptrRaw, colidxRaw, valuesRaw);
+  Kokkos::View<index_type *, Kokkos::HostSpace,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      colidxRawView(colidxRaw, c_nnz);
+  Kokkos::View<scalar_type *, Kokkos::HostSpace,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      valuesRawView(valuesRaw, c_nnz);
+  Kokkos::deep_copy(ExecSpace(), colidxC, colidxRawView);
+  Kokkos::deep_copy(ExecSpace(), valuesC, valuesRawView);
+  handle->set_call_numeric();
+  handle->set_computed_entries();
+}
+
+#define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL)                       \
+  template <>                                                                  \
+  struct SPGEMM_NUMERIC<KokkosKernels::Experimental::KokkosKernelsHandle<      \
+                            const int, const int, const SCALAR, EXEC,          \
+                            Kokkos::HostSpace, Kokkos::HostSpace>,             \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const SCALAR *, default_layout,           \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const SCALAR *, default_layout,           \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<const int *, default_layout,              \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<int *, default_layout,                    \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        Kokkos::View<SCALAR *, default_layout,                 \
+                                     Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>, \
+                        true, TPL_AVAIL> {                                     \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<     \
+        const int, const int, const SCALAR, EXEC, Kokkos::HostSpace,           \
+        Kokkos::HostSpace>;                                                    \
+    using c_int_view_t =                                                       \
+        Kokkos::View<const int *, default_layout,                              \
+                     Kokkos::Device<EXEC, Kokkos::HostSpace>,                  \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using int_view_t = Kokkos::View<int *, default_layout,                     \
+                                    Kokkos::Device<EXEC, Kokkos::HostSpace>,   \
+                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;  \
+    using c_scalar_view_t =                                                    \
+        Kokkos::View<const SCALAR *, default_layout,                           \
+                     Kokkos::Device<EXEC, Kokkos::HostSpace>,                  \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using scalar_view_t =                                                      \
+        Kokkos::View<SCALAR *, default_layout,                                 \
+                     Kokkos::Device<EXEC, Kokkos::HostSpace>,                  \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    static void spgemm_numeric(KernelHandle *handle,                           \
+                               typename KernelHandle::nnz_lno_t m,             \
+                               typename KernelHandle::nnz_lno_t n,             \
+                               typename KernelHandle::nnz_lno_t k,             \
+                               c_int_view_t row_mapA, c_int_view_t entriesA,   \
+                               c_scalar_view_t valuesA, bool,                  \
+                               c_int_view_t row_mapB, c_int_view_t entriesB,   \
+                               c_scalar_view_t valuesB, bool,                  \
+                               c_int_view_t row_mapC, int_view_t entriesC,     \
+                               scalar_view_t valuesC) {                        \
+      std::string label = "KokkosSparse::spgemm_numeric[TPL_MKL," +            \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";           \
+      Kokkos::Profiling::pushRegion(label);                                    \
+      spgemm_numeric_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA,       \
+                         entriesA, valuesA, row_mapB, entriesB, valuesB,       \
+                         row_mapC, entriesC, valuesC);                         \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
+  };
+
+#define SPGEMM_NUMERIC_DECL_MKL_SE(SCALAR, EXEC) \
+  SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, true)    \
+  SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, false)
+
+#define SPGEMM_NUMERIC_DECL_MKL_E(EXEC)                    \
+  SPGEMM_NUMERIC_DECL_MKL_SE(float, EXEC)                  \
+  SPGEMM_NUMERIC_DECL_MKL_SE(double, EXEC)                 \
+  SPGEMM_NUMERIC_DECL_MKL_SE(Kokkos::complex<float>, EXEC) \
+  SPGEMM_NUMERIC_DECL_MKL_SE(Kokkos::complex<double>, EXEC)
+
+#ifdef KOKKOS_ENABLE_SERIAL
+SPGEMM_NUMERIC_DECL_MKL_E(Kokkos::Serial)
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+SPGEMM_NUMERIC_DECL_MKL_E(Kokkos::OpenMP)
+#endif
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp
new file mode 100644
index 0000000000..b526619c18
--- /dev/null
+++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp
@@ -0,0 +1,171 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_
+#define KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_
+
+namespace KokkosSparse {
+namespace Impl {
+// Specialization struct which defines whether a specialization exists
+template <class KernelHandle, class a_size_view_t_, class a_lno_view_t,
+          class b_size_view_t_, class b_lno_view_t, class c_size_view_t_>
+struct spgemm_symbolic_tpl_spec_avail {
+  enum : bool { value = false };
+};
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+// NOTE: all versions of cuSPARSE 10.x and 11.x support exactly the same matrix
+// types, so there is no ifdef'ing on versions needed in avail. Offset and
+// Ordinal must both be 32-bit. Even though the "generic" API lets you specify
+// offsets and ordinals independently as either 16, 32 or 64-bit, SpGEMM will
+// just fail at runtime if you don't use 32 for both.
+
+#define SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, MEMSPACE)              \
+  template <>                                                         \
+  struct spgemm_symbolic_tpl_spec_avail<                              \
+      KokkosKernels::Experimental::KokkosKernelsHandle<               \
+          const int, const int, const SCALAR, Kokkos::Cuda, MEMSPACE, \
+          MEMSPACE>,                                                  \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<const int *, default_layout,                       \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+      Kokkos::View<int *, default_layout,                             \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {      \
+    enum : bool { value = true };                                     \
+  };
+
+#define SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(SCALAR)            \
+  SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \
+  SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace)
+
+SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(float)
+SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(double)
+SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex<float>)
+SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex<double>)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+#define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR)                              \
+  template <>                                                                \
+  struct spgemm_symbolic_tpl_spec_avail<                                     \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                      \
+          const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace, \
+          Kokkos::HIPSpace>,                                                 \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const int *, default_layout,                              \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<int *, default_layout,                                    \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {             \
+    enum : bool { value = true };                                            \
+  };
+
+SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(float)
+SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(double)
+SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex<float>)
+SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex<double>)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC)                        \
+  template <>                                                          \
+  struct spgemm_symbolic_tpl_spec_avail<                               \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                \
+          const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \
+          Kokkos::HostSpace>,                                          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<const int *, default_layout,                        \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+      Kokkos::View<int *, default_layout,                              \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,            \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> > > {       \
+    enum : bool { value = true };                                      \
+  };
+
+#define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC)                 \
+  SPGEMM_SYMBOLIC_AVAIL_MKL(float, EXEC)                  \
+  SPGEMM_SYMBOLIC_AVAIL_MKL(double, EXEC)                 \
+  SPGEMM_SYMBOLIC_AVAIL_MKL(Kokkos::complex<float>, EXEC) \
+  SPGEMM_SYMBOLIC_AVAIL_MKL(Kokkos::complex<double>, EXEC)
+
+#ifdef KOKKOS_ENABLE_SERIAL
+SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::Serial)
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::OpenMP)
+#endif
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp
new file mode 100644
index 0000000000..7db9ce5fc3
--- /dev/null
+++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp
@@ -0,0 +1,686 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_DECL_HPP_
+#define KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_DECL_HPP_
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#include "cusparse.h"
+#include "KokkosSparse_Utils_cusparse.hpp"
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+#include "rocsparse/rocsparse.h"
+#include "KokkosSparse_Utils_rocsparse.hpp"
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+#include "KokkosSparse_Utils_mkl.hpp"
+#include "mkl_spblas.h"
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+// NOTE: all versions of cuSPARSE 10.x and 11.x support exactly the same matrix
+// types, so there is no ifdef'ing on versions needed in avail. Offset and
+// Ordinal must both be 32-bit. Even though the "generic" API lets you specify
+// offsets and ordinals independently as either 16, 32 or 64-bit integers,
+// cusparse will just fail at runtime if you don't use 32 for both.
+
+#if (CUDA_VERSION >= 11040)
+// 11.4+ supports generic API with reuse (full symbolic/numeric separation)
+// However, its "symbolic" (cusparseSpGEMMreuse_nnz) does not populate C's
+// rowptrs.
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename RowMapType>
+void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+                              const ConstRowMapType &row_mapA,
+                              const ConstEntriesType &entriesA,
+                              const ConstRowMapType &row_mapB,
+                              const ConstEntriesType &entriesB,
+                              const RowMapType &row_mapC, bool computeRowptrs) {
+  // Split symbolic into two sub-phases: handle/buffer setup and nnz(C), and
+  // then rowptrs (if requested). That way, calling symbolic once with
+  // computeRowptrs=false, and then again with computeRowptrs=true will not
+  // duplicate any work.
+  if (!handle->is_symbolic_called()) {
+    handle->create_cusparse_spgemm_handle(false, false);
+    auto h = handle->get_cusparse_spgemm_handle();
+
+    // Follow
+    // https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm_reuse
+    void *buffer1      = nullptr;
+    void *buffer2      = nullptr;
+    size_t bufferSize1 = 0;
+    size_t bufferSize2 = 0;
+
+    // When nnz is not zero, cusparseCreateCsr insists non-null a value pointer,
+    // which however is not available in this function. So we fake it with the
+    // entries instead. Fortunately, it seems cupsarse does not access that in
+    // the symbolic phase.
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+        &h->descr_A, m, n, entriesA.extent(0), (void *)row_mapA.data(),
+        (void *)entriesA.data(), (void *)entriesA.data() /*fake*/,
+        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+        h->scalarType));
+
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+        &h->descr_B, n, k, entriesB.extent(0), (void *)row_mapB.data(),
+        (void *)entriesB.data(), (void *)entriesB.data() /*fake*/,
+        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+        h->scalarType));
+
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+        &h->descr_C, m, k, 0, nullptr, nullptr, nullptr, CUSPARSE_INDEX_32I,
+        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
+
+    //----------------------------------------------------------------------
+    // ask bufferSize1 bytes for external memory
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_workEstimation(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &bufferSize1, nullptr));
+
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer1, bufferSize1));
+    // inspect matrices A and B to understand the memory requirement for the
+    // next step
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_workEstimation(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &bufferSize1, buffer1));
+
+    //----------------------------------------------------------------------
+    // Compute nnz of C
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_nnz(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &bufferSize2, nullptr, &h->bufferSize3, nullptr,
+        &h->bufferSize4, nullptr));
+
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer2, bufferSize2));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMalloc((void **)&h->buffer3, h->bufferSize3));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMalloc((void **)&h->buffer4, h->bufferSize4));
+
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_nnz(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &bufferSize2, buffer2, &h->bufferSize3,
+        h->buffer3, &h->bufferSize4, h->buffer4));
+
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer2));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer1));
+
+    int64_t C_nrow, C_ncol, C_nnz;
+    KOKKOS_CUSPARSE_SAFE_CALL(
+        cusparseSpMatGetSize(h->descr_C, &C_nrow, &C_ncol, &C_nnz));
+    if (C_nnz > std::numeric_limits<int>::max()) {
+      throw std::runtime_error("nnz of C overflowed over 32-bit int\n");
+    }
+    handle->set_c_nnz(C_nnz);
+    handle->set_call_symbolic();
+  }
+  if (computeRowptrs && !handle->are_rowptrs_computed()) {
+    using Scalar  = typename KernelHandle::nnz_scalar_t;
+    using Ordinal = typename KernelHandle::nnz_lno_t;
+    using Offset  = typename KernelHandle::size_type;
+    Ordinal *dummyEntries;
+    Scalar *dummyValues;
+    auto C_nnz = handle->get_c_nnz();
+    auto h     = handle->get_cusparse_spgemm_handle();
+    // We just want rowptrs, but since C's entries/values are not yet allocated,
+    // we must use dummy versions and then discard them.
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMalloc((void **)&dummyEntries, C_nnz * sizeof(Ordinal)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMalloc((void **)&dummyValues, C_nnz * sizeof(Scalar)));
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCsrSetPointers(
+        h->descr_C, row_mapC.data(), dummyEntries, dummyValues));
+    //--------------------------------------------------------------------------
+
+    cusparseSpGEMMreuse_copy(h->cusparseHandle, h->opA, h->opB, h->descr_A,
+                             h->descr_B, h->descr_C, h->alg, h->spgemmDescr,
+                             &h->bufferSize5, nullptr);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMalloc((void **)&h->buffer5, h->bufferSize5));
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMMreuse_copy(
+        h->cusparseHandle, h->opA, h->opB, h->descr_A, h->descr_B, h->descr_C,
+        h->alg, h->spgemmDescr, &h->bufferSize5, h->buffer5));
+    if (!handle->get_c_nnz()) {
+      // cuSPARSE does not populate C rowptrs if C has no entries
+      Kokkos::deep_copy(typename KernelHandle::HandleExecSpace(), row_mapC,
+                        Offset(0));
+    }
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dummyValues));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dummyEntries));
+    handle->set_computed_rowptrs();
+  }
+}
+
+#elif (CUDA_VERSION >= 11000)
+// 11.0-11.3 supports only the generic API, but not reuse.
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename RowMapType>
+void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+                              const ConstRowMapType &row_mapA,
+                              const ConstEntriesType &entriesA,
+                              const ConstRowMapType &row_mapB,
+                              const ConstEntriesType &entriesB,
+                              const RowMapType &row_mapC,
+                              bool /* computeRowptrs */) {
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using Offset      = typename KernelHandle::size_type;
+  if (handle->is_symbolic_called() && handle->are_rowptrs_computed()) return;
+  handle->create_cusparse_spgemm_handle(false, false);
+  auto h = handle->get_cusparse_spgemm_handle();
+
+  // Follow
+  // https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm
+
+  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
+  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
+
+  // In non-reuse interface, forced to give A,B dummy values to
+  // cusparseSpGEMM_compute. And it actually reads them, so they must be
+  // allocated and of the correct type. This compute will be called again in
+  // numeric with the real values.
+  //
+  // The dummy values can be uninitialized. cusparseSpGEMM_compute does
+  // not remove numerical zeros from the sparsity pattern.
+  void *dummyValues;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(
+      &dummyValues,
+      sizeof(scalar_type) * std::max(entriesA.extent(0), entriesB.extent(0))));
+
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+      &h->descr_A, m, n, entriesA.extent(0), (void *)row_mapA.data(),
+      (void *)entriesA.data(), dummyValues, CUSPARSE_INDEX_32I,
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
+
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
+      &h->descr_B, n, k, entriesB.extent(0), (void *)row_mapB.data(),
+      (void *)entriesB.data(), dummyValues, CUSPARSE_INDEX_32I,
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
+
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseCreateCsr(&h->descr_C, m, k, 0, row_mapC.data(), nullptr, nullptr,
+                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                        CUSPARSE_INDEX_BASE_ZERO, h->scalarType));
+
+  //----------------------------------------------------------------------
+  // query workEstimation buffer size, allocate, then call again with buffer.
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, h->alg, h->spgemmDescr, &h->bufferSize3,
+      nullptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer3, h->bufferSize3));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, h->alg, h->spgemmDescr, &h->bufferSize3,
+      h->buffer3));
+  cudaFree(h->buffer3);
+  h->buffer3 = nullptr;
+
+  //----------------------------------------------------------------------
+  // query compute buffer size, allocate, then call again with buffer.
+
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
+      &h->bufferSize4, nullptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&h->buffer4, h->bufferSize4));
+  KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute(
+      h->cusparseHandle, h->opA, h->opB, &alpha, h->descr_A, h->descr_B, &beta,
+      h->descr_C, h->scalarType, CUSPARSE_SPGEMM_DEFAULT, h->spgemmDescr,
+      &h->bufferSize4, h->buffer4));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dummyValues));
+
+  int64_t C_nrow, C_ncol, C_nnz;
+  KOKKOS_CUSPARSE_SAFE_CALL(
+      cusparseSpMatGetSize(h->descr_C, &C_nrow, &C_ncol, &C_nnz));
+  if (C_nnz > std::numeric_limits<int>::max()) {
+    throw std::runtime_error("nnz of C overflowed over 32-bit int\n");
+  }
+  handle->set_c_nnz(C_nnz);
+  handle->set_call_symbolic();
+  handle->set_computed_rowptrs();
+}
+
+#else
+// 10.x supports the pre-generic interface (cusparseXcsrgemmNnz). It always
+// populates C rowptrs.
+template <typename KernelHandle, typename lno_t, typename ConstRowMapType,
+          typename ConstEntriesType, typename RowMapType>
+void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k,
+                              const ConstRowMapType &row_mapA,
+                              const ConstEntriesType &entriesA,
+                              const ConstRowMapType &row_mapB,
+                              const ConstEntriesType &entriesB,
+                              const RowMapType &row_mapC,
+                              bool /* computeRowptrs */) {
+  // using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using size_type = typename KernelHandle::size_type;
+  if (handle->are_rowptrs_computed()) return;
+  handle->create_cusparse_spgemm_handle(false, false);
+  auto h   = handle->get_cusparse_spgemm_handle();
+  int nnzA = entriesA.extent(0);
+  int nnzB = entriesB.extent(0);
+
+  int baseC, nnzC;
+  int *nnzTotalDevHostPtr = &nnzC;
+
+  // In empty (zero entries) matrix case, cusparse does not populate rowptrs to
+  // zeros
+  if (m == 0 || n == 0 || k == 0 || entriesA.extent(0) == size_type(0) ||
+      entriesB.extent(0) == size_type(0)) {
+    Kokkos::deep_copy(typename KernelHandle::HandleExecSpace(), row_mapC,
+                      size_type(0));
+    nnzC = 0;
+  } else {
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgemmNnz(
+        h->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, m, k, n, h->generalDescr, nnzA,
+        row_mapA.data(), entriesA.data(), h->generalDescr, nnzB,
+        row_mapB.data(), entriesB.data(), h->generalDescr, row_mapC.data(),
+        nnzTotalDevHostPtr));
+    if (nullptr != nnzTotalDevHostPtr) {
+      nnzC = *nnzTotalDevHostPtr;
+    } else {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(
+          &nnzC, row_mapC.data() + m, sizeof(int), cudaMemcpyDeviceToHost));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(
+          &baseC, row_mapC.data(), sizeof(int), cudaMemcpyDeviceToHost));
+      nnzC -= baseC;
+    }
+  }
+  handle->set_c_nnz(nnzC);
+  handle->set_call_symbolic();
+  handle->set_computed_rowptrs();
+}
+
+#endif
+
+#define SPGEMM_SYMBOLIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL)             \
+  template <>                                                                  \
+  struct SPGEMM_SYMBOLIC<                                                      \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                        \
+          const int, const int, const SCALAR, Kokkos::Cuda, MEMSPACE,          \
+          MEMSPACE>,                                                           \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<const int *, default_layout,                                \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      Kokkos::View<int *, default_layout,                                      \
+                   Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                   \
+      true, TPL_AVAIL> {                                                       \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<     \
+        const int, const int, const SCALAR, Kokkos::Cuda, MEMSPACE, MEMSPACE>; \
+    using c_int_view_t =                                                       \
+        Kokkos::View<const int *, default_layout,                              \
+                     Kokkos::Device<Kokkos::Cuda, MEMSPACE>,                   \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                 \
+    using int_view_t = Kokkos::View<int *, default_layout,                     \
+                                    Kokkos::Device<Kokkos::Cuda, MEMSPACE>,    \
+                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>;  \
+    static void spgemm_symbolic(KernelHandle *handle,                          \
+                                typename KernelHandle::nnz_lno_t m,            \
+                                typename KernelHandle::nnz_lno_t n,            \
+                                typename KernelHandle::nnz_lno_t k,            \
+                                c_int_view_t row_mapA, c_int_view_t entriesA,  \
+                                bool, c_int_view_t row_mapB,                   \
+                                c_int_view_t entriesB, bool,                   \
+                                int_view_t row_mapC, bool computeRowptrs) {    \
+      std::string label = "KokkosSparse::spgemm[TPL_CUSPARSE," +               \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";           \
+      Kokkos::Profiling::pushRegion(label);                                    \
+      spgemm_symbolic_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, \
+                               entriesA, row_mapB, entriesB, row_mapC,         \
+                               computeRowptrs);                                \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
+  };
+
+#define SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(SCALAR, TPL_AVAIL)            \
+  SPGEMM_SYMBOLIC_DECL_CUSPARSE(SCALAR, Kokkos::CudaSpace, TPL_AVAIL) \
+  SPGEMM_SYMBOLIC_DECL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace, TPL_AVAIL)
+
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(float, true)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(double, true)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(Kokkos::complex<float>, true)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(Kokkos::complex<double>, true)
+
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(float, false)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(double, false)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(Kokkos::complex<float>, false)
+SPGEMM_SYMBOLIC_DECL_CUSPARSE_S(Kokkos::complex<double>, false)
+
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE
+//=============================================================================
+// Overload rocsparse_Xcsrgemm_buffer_size() over scalar types
+#define ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(scalar_type, TOKEN)               \
+  inline rocsparse_status rocsparse_Xcsrgemm_buffer_size(                     \
+      rocsparse_handle handle, rocsparse_operation trans_A,                   \
+      rocsparse_operation trans_B, rocsparse_int m, rocsparse_int n,          \
+      rocsparse_int k, const scalar_type *alpha,                              \
+      const rocsparse_mat_descr descr_A, rocsparse_int nnz_A,                 \
+      const rocsparse_int *csr_row_ptr_A, const rocsparse_int *csr_col_ind_A, \
+      const rocsparse_mat_descr descr_B, rocsparse_int nnz_B,                 \
+      const rocsparse_int *csr_row_ptr_B, const rocsparse_int *csr_col_ind_B, \
+      const scalar_type *beta, const rocsparse_mat_descr descr_D,             \
+      rocsparse_int nnz_D, const rocsparse_int *csr_row_ptr_D,                \
+      const rocsparse_int *csr_col_ind_D, rocsparse_mat_info info_C,          \
+      size_t *buffer_size) {                                                  \
+    return rocsparse_##TOKEN##csrgemm_buffer_size(                            \
+        handle, trans_A, trans_B, m, n, k, alpha, descr_A, nnz_A,             \
+        csr_row_ptr_A, csr_col_ind_A, descr_B, nnz_B, csr_row_ptr_B,          \
+        csr_col_ind_B, beta, descr_D, nnz_D, csr_row_ptr_D, csr_col_ind_D,    \
+        info_C, buffer_size);                                                 \
+  }
+
+ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(float, s)
+ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(double, d)
+ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(rocsparse_float_complex, c)
+ROCSPARSE_XCSRGEMM_BUFFER_SIZE_SPEC(rocsparse_double_complex, z)
+
+template <
+    typename KernelHandle, typename ain_row_index_view_type,
+    typename ain_nonzero_index_view_type, typename bin_row_index_view_type,
+    typename bin_nonzero_index_view_type, typename cin_row_index_view_type>
+void spgemm_symbolic_rocsparse(
+    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
+    bin_row_index_view_type rowptrB, bin_nonzero_index_view_type colidxB,
+    cin_row_index_view_type rowptrC) {
+  using index_type  = typename KernelHandle::nnz_lno_t;
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using rocsparse_scalar_type =
+      typename kokkos_to_rocsparse_type<scalar_type>::type;
+
+  auto nnz_A = colidxA.extent(0);
+  auto nnz_B = colidxB.extent(0);
+
+  if (handle->is_symbolic_called()) {
+    return;
+  }
+  handle->create_rocsparse_spgemm_handle(false, false);
+  typename KernelHandle::rocSparseSpgemmHandleType *h =
+      handle->get_rocsparse_spgemm_handle();
+
+  // alpha, beta are on host, but since we use singleton on the rocsparse
+  // handle, we save/restore the pointer mode to not interference with
+  // others' use
+  const auto alpha = Kokkos::ArithTraits<scalar_type>::one();
+  const auto beta  = Kokkos::ArithTraits<scalar_type>::zero();
+  rocsparse_pointer_mode oldPtrMode;
+
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
+      rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode));
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(
+      h->rocsparseHandle, rocsparse_pointer_mode_host));
+
+  // C = alpha * OpA(A) * OpB(B) + beta * D
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_buffer_size(
+      h->rocsparseHandle, h->opA, h->opB, m, k, n,
+      reinterpret_cast<const rocsparse_scalar_type *>(&alpha), h->descr_A,
+      nnz_A, rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(),
+      colidxB.data(), reinterpret_cast<const rocsparse_scalar_type *>(&beta),
+      h->descr_D, 0, nullptr, nullptr, h->info_C, &h->bufferSize));
+
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&h->buffer, h->bufferSize));
+
+  rocsparse_int nnz_C = 0;
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_nnz(
+      h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A,
+      rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(),
+      colidxB.data(), h->descr_D, 0, nullptr, nullptr, h->descr_C,
+      rowptrC.data(), &nnz_C, h->info_C, h->buffer));
+  // If C has zero rows, its rowptrs are not populated
+  if (m == 0) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipMemset(rowptrC.data(), 0, rowptrC.extent(0) * sizeof(index_type)));
+  }
+  // Restore previous pointer mode
+  KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
+      rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode));
+
+  handle->set_c_nnz(nnz_C);
+  handle->set_call_symbolic();
+  handle->set_computed_rowptrs();
+}
+
+#define SPGEMM_SYMBOLIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL)                     \
+  template <>                                                                 \
+  struct SPGEMM_SYMBOLIC<                                                     \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                       \
+          const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace,  \
+          Kokkos::HIPSpace>,                                                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,             \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,             \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,             \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,             \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<int *, default_layout,                                     \
+                   Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,             \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      true, TPL_AVAIL> {                                                      \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<    \
+        const int, const int, const SCALAR, Kokkos::HIP, Kokkos::HIPSpace,    \
+        Kokkos::HIPSpace>;                                                    \
+    using c_int_view_t =                                                      \
+        Kokkos::View<const int *, default_layout,                             \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,           \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                \
+    using int_view_t =                                                        \
+        Kokkos::View<int *, default_layout,                                   \
+                     Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace>,           \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                \
+    static void spgemm_symbolic(KernelHandle *handle,                         \
+                                typename KernelHandle::nnz_lno_t m,           \
+                                typename KernelHandle::nnz_lno_t n,           \
+                                typename KernelHandle::nnz_lno_t k,           \
+                                c_int_view_t row_mapA, c_int_view_t entriesA, \
+                                bool, c_int_view_t row_mapB,                  \
+                                c_int_view_t entriesB, bool,                  \
+                                int_view_t row_mapC, bool) {                  \
+      std::string label = "KokkosSparse::spgemm[TPL_ROCSPARSE," +             \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";          \
+      Kokkos::Profiling::pushRegion(label);                                   \
+      spgemm_symbolic_rocsparse(handle->get_spgemm_handle(), m, n, k,         \
+                                row_mapA, entriesA, row_mapB, entriesB,       \
+                                row_mapC);                                    \
+      Kokkos::Profiling::popRegion();                                         \
+    }                                                                         \
+  };
+
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(float, false)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(double, false)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(Kokkos::complex<float>, false)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(Kokkos::complex<double>, false)
+
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(float, true)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(double, true)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(Kokkos::complex<float>, true)
+SPGEMM_SYMBOLIC_DECL_ROCSPARSE(Kokkos::complex<double>, true)
+#endif
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
+template <
+    typename KernelHandle, typename ain_row_index_view_type,
+    typename ain_nonzero_index_view_type, typename bin_row_index_view_type,
+    typename bin_nonzero_index_view_type, typename cin_row_index_view_type>
+void spgemm_symbolic_mkl(
+    KernelHandle *handle, typename KernelHandle::nnz_lno_t m,
+    typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k,
+    ain_row_index_view_type rowptrA, ain_nonzero_index_view_type colidxA,
+    bin_row_index_view_type rowptrB, bin_nonzero_index_view_type colidxB,
+    cin_row_index_view_type rowptrC) {
+  using ExecSpace   = typename KernelHandle::HandleExecSpace;
+  using index_type  = typename KernelHandle::nnz_lno_t;
+  using size_type   = typename KernelHandle::size_type;
+  using scalar_type = typename KernelHandle::nnz_scalar_t;
+  using MKLMatrix   = MKLSparseMatrix<scalar_type>;
+  if (m == 0 || n == 0 || k == 0 || colidxA.extent(0) == size_type(0) ||
+      colidxB.extent(0) == size_type(0)) {
+    Kokkos::deep_copy(ExecSpace(), rowptrC, size_type(0));
+    handle->set_call_symbolic();
+    handle->set_computed_rowptrs();
+    handle->set_c_nnz(0);
+    return;
+  }
+  MKLMatrix A(m, n, (int *)rowptrA.data(), (int *)colidxA.data(), nullptr);
+  MKLMatrix B(n, k, (int *)rowptrB.data(), (int *)colidxB.data(), nullptr);
+  sparse_matrix_t C;
+  matrix_descr generalDescr;
+  generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL;
+  generalDescr.mode = SPARSE_FILL_MODE_FULL;
+  generalDescr.diag = SPARSE_DIAG_NON_UNIT;
+  KOKKOSKERNELS_MKL_SAFE_CALL(
+      mkl_sparse_sp2m(SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, A,
+                      SPARSE_OPERATION_NON_TRANSPOSE, generalDescr, B,
+                      SPARSE_STAGE_NNZ_COUNT, &C));
+  MKLMatrix wrappedC(C);
+  MKL_INT nrows = 0, ncols = 0;
+  MKL_INT *rowptrRaw     = nullptr;
+  MKL_INT *colidxRaw     = nullptr;
+  scalar_type *valuesRaw = nullptr;
+  wrappedC.export_data(nrows, ncols, rowptrRaw, colidxRaw, valuesRaw);
+  Kokkos::View<index_type *, Kokkos::HostSpace,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      rowptrRawView(rowptrRaw, nrows + 1);
+  Kokkos::deep_copy(ExecSpace(), rowptrC, rowptrRawView);
+  handle->create_mkl_spgemm_handle(C);
+  handle->set_call_symbolic();
+  handle->set_computed_rowptrs();
+  handle->set_c_nnz(rowptrC(m));
+}
+
+#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL)                     \
+  template <>                                                                 \
+  struct SPGEMM_SYMBOLIC<                                                     \
+      KokkosKernels::Experimental::KokkosKernelsHandle<                       \
+          const int, const int, const SCALAR, EXEC, Kokkos::HostSpace,        \
+          Kokkos::HostSpace>,                                                 \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,                   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,                   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,                   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<const int *, default_layout,                               \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,                   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      Kokkos::View<int *, default_layout,                                     \
+                   Kokkos::Device<EXEC, Kokkos::HostSpace>,                   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>,                  \
+      true, TPL_AVAIL> {                                                      \
+    using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<    \
+        const int, const int, const SCALAR, EXEC, Kokkos::HostSpace,          \
+        Kokkos::HostSpace>;                                                   \
+    using c_int_view_t =                                                      \
+        Kokkos::View<const int *, default_layout,                             \
+                     Kokkos::Device<EXEC, Kokkos::HostSpace>,                 \
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;                \
+    using int_view_t = Kokkos::View<int *, default_layout,                    \
+                                    Kokkos::Device<EXEC, Kokkos::HostSpace>,  \
+                                    Kokkos::MemoryTraits<Kokkos::Unmanaged>>; \
+    static void spgemm_symbolic(KernelHandle *handle,                         \
+                                typename KernelHandle::nnz_lno_t m,           \
+                                typename KernelHandle::nnz_lno_t n,           \
+                                typename KernelHandle::nnz_lno_t k,           \
+                                c_int_view_t row_mapA, c_int_view_t entriesA, \
+                                bool, c_int_view_t row_mapB,                  \
+                                c_int_view_t entriesB, bool,                  \
+                                int_view_t row_mapC, bool) {                  \
+      std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," +          \
+                          Kokkos::ArithTraits<SCALAR>::name() + "]";          \
+      Kokkos::Profiling::pushRegion(label);                                   \
+      spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA,     \
+                          entriesA, row_mapB, entriesB, row_mapC);            \
+      Kokkos::Profiling::popRegion();                                         \
+    }                                                                         \
+  };
+
+#define SPGEMM_SYMBOLIC_DECL_MKL_SE(SCALAR, EXEC) \
+  SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, true)    \
+  SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, false)
+
+#define SPGEMM_SYMBOLIC_DECL_MKL_E(EXEC)                    \
+  SPGEMM_SYMBOLIC_DECL_MKL_SE(float, EXEC)                  \
+  SPGEMM_SYMBOLIC_DECL_MKL_SE(double, EXEC)                 \
+  SPGEMM_SYMBOLIC_DECL_MKL_SE(Kokkos::complex<float>, EXEC) \
+  SPGEMM_SYMBOLIC_DECL_MKL_SE(Kokkos::complex<double>, EXEC)
+
+#ifdef KOKKOS_ENABLE_SERIAL
+SPGEMM_SYMBOLIC_DECL_MKL_E(Kokkos::Serial)
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+SPGEMM_SYMBOLIC_DECL_MKL_E(Kokkos::OpenMP)
+#endif
+#endif
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
index b4c73a12ff..bf65961131 100644
--- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
+++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
@@ -370,7 +370,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int64_t, size_t,
 
 // rocSPARSE
 #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE)
-#include <rocsparse.h>
+#include <rocsparse/rocsparse.h>
 #include "KokkosSparse_Utils_rocsparse.hpp"
 
 namespace KokkosSparse {
diff --git a/sparse/unit_test/Test_Sparse_CrsMatrix.hpp b/sparse/unit_test/Test_Sparse_CrsMatrix.hpp
index 27152d76a6..8a85e43670 100644
--- a/sparse/unit_test/Test_Sparse_CrsMatrix.hpp
+++ b/sparse/unit_test/Test_Sparse_CrsMatrix.hpp
@@ -47,6 +47,7 @@
 #include <Kokkos_Core.hpp>
 #include <stdexcept>
 #include "KokkosSparse_CrsMatrix.hpp"
+#include "Kokkos_ArithTraits.hpp"
 
 // #ifndef kokkos_complex_double
 // #define kokkos_complex_double Kokkos::complex<double>
diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp
index 1b9ec84651..7ac382ad01 100644
--- a/sparse/unit_test/Test_Sparse_bspgemm.hpp
+++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp
@@ -121,7 +121,7 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual,
     return false;
   }
 
-  KokkosSparse::sort_bsr_matrix(output_mat_actual);
+  // Do not sort the actual product matrix - test that it's already sorted
   KokkosSparse::sort_bsr_matrix(output_mat_reference);
 
   bool is_identical = true;
@@ -153,7 +153,7 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual,
 
   typedef typename Kokkos::Details::ArithTraits<
       typename scalar_view_t::non_const_value_type>::mag_type eps_type;
-  eps_type eps = std::is_same<eps_type, float>::value ? 3.7e-3 : 1e-7;
+  eps_type eps = std::is_same<eps_type, float>::value ? 3e-2 : 5e-7;
 
   is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view<
       scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>(
@@ -202,17 +202,16 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz,
   bsrMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix<bsrMat_t>(
       blkDim, k, n, nnz, row_size_variance, bandwidth);
 
-  const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1;
+  KokkosSparse::sort_bsr_matrix(A);
+  KokkosSparse::sort_bsr_matrix(B);
 
   bsrMat_t output_mat2;
   run_block_spgemm(A, B, output_mat2, SPGEMM_DEBUG, use_dynamic_scheduling,
                    shared_memory_size);
 
   std::vector<SPGEMMAlgorithm> algorithms = {
-      SPGEMM_KK,
-      SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */,
-      SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */,
-      SPGEMM_MKL /* verify failure in case of missing build */,
+      SPGEMM_KK, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */,
+      SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */
   };
 
   if (!KokkosKernels::Impl::kk_is_gpu_exec_space<
@@ -223,41 +222,10 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz,
   }
 
   for (auto spgemm_algorithm : algorithms) {
-    const uint64_t max_integer = Kokkos::ArithTraits<int>::max();
-    std::string algo           = "UNKNOWN";
-    bool is_expected_to_fail   = false;
+    std::string algo         = "UNKNOWN";
+    bool is_expected_to_fail = false;
 
     switch (spgemm_algorithm) {
-      case SPGEMM_CUSPARSE:
-        // TODO: add these test failure cases for cusparse too.
-        algo = "SPGEMM_CUSPARSE";
-#ifndef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-        is_expected_to_fail = true;
-#endif
-        break;
-
-      case SPGEMM_MKL:
-        algo                = "SPGEMM_MKL";
-        is_expected_to_fail = !is_empy_case;  // TODO: add block MKL impl
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-        if (!KokkosSparse::Impl::mkl_is_supported_value_type<scalar_t>::value) {
-          is_expected_to_fail = true;
-        }
-#else
-        is_expected_to_fail = true;  // fail: MKL not enabled in build
-#endif
-        // MKL requires local ordinals to be int.
-        // Note: empty-array special case will NOT fail on this.
-        if (!std::is_same<int, lno_t>::value && !is_empy_case) {
-          is_expected_to_fail = true;
-        }
-        // if size_type is larger than int, mkl casts it to int.
-        // it will fail if casting cause overflow.
-        if (A.values.extent(0) > max_integer) {
-          is_expected_to_fail = true;
-        }
-        break;
-
       case SPGEMM_KK: algo = "SPGEMM_KK"; break;
       case SPGEMM_KK_LP: algo = "SPGEMM_KK_LP"; break;
       case SPGEMM_KK_MEMSPEED: algo = "SPGEMM_KK_MEMSPEED"; break;
diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp b/sparse/unit_test/Test_Sparse_rocsparse.hpp
index fe1bf8e9b2..0c4d2303ef 100644
--- a/sparse/unit_test/Test_Sparse_rocsparse.hpp
+++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp
@@ -6,7 +6,7 @@
 
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
-#include <rocsparse.h>
+#include <rocsparse/rocsparse.h>
 #include "KokkosSparse_Utils_rocsparse.hpp"
 
 void test_rocsparse_version() {
diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp
index 2f425b07d2..1e60b35b81 100644
--- a/sparse/unit_test/Test_Sparse_spgemm.hpp
+++ b/sparse/unit_test/Test_Sparse_spgemm.hpp
@@ -66,10 +66,6 @@
 // const char *input_filename = "Si2.mtx";
 // const char *input_filename = "wathen_30_30.mtx";
 // const size_t expected_num_cols = 9906;
-using namespace KokkosSparse;
-using namespace KokkosSparse::Experimental;
-using namespace KokkosKernels;
-using namespace KokkosKernels::Experimental;
 
 // #ifndef kokkos_complex_double
 // #define kokkos_complex_double Kokkos::complex<double>
@@ -81,9 +77,21 @@ typedef Kokkos::complex<float> kokkos_complex_float;
 
 namespace Test {
 
+// Randomize matrix values again from the same uniform distribution as
+// kk_generate_sparse_matrix uses.
+template <typename Values>
+void randomize_matrix_values(const Values &v) {
+  using ScalarType = typename Values::value_type;
+  ScalarType randStart, randEnd;
+  KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd);
+  Kokkos::Random_XorShift64_Pool<typename Values::execution_space> pool(13718);
+  Kokkos::fill_random(v, pool, randStart, randEnd);
+}
+
 template <typename crsMat_t, typename device>
 int run_spgemm(crsMat_t A, crsMat_t B,
-               KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C) {
+               KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C,
+               bool testReuse) {
   typedef typename crsMat_t::size_type size_type;
   typedef typename crsMat_t::ordinal_type lno_t;
   typedef typename crsMat_t::value_type scalar_t;
@@ -98,18 +106,41 @@ int run_spgemm(crsMat_t A, crsMat_t B,
   kh.set_dynamic_scheduling(true);
 
   kh.create_spgemm_handle(spgemm_algorithm);
+  {
+    auto sh = kh.get_spgemm_handle();
+
+    EXPECT_FALSE(sh->is_symbolic_called());
+    EXPECT_FALSE(sh->is_numeric_called());
+    EXPECT_FALSE(sh->are_rowptrs_computed());
+    EXPECT_FALSE(sh->are_entries_computed());
+
+    KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C);
 
-  KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C);
-  KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);
+    EXPECT_TRUE(sh->is_symbolic_called());
+
+    KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);
+
+    EXPECT_TRUE(sh->are_entries_computed());
+    EXPECT_TRUE(sh->is_numeric_called());
+
+    if (testReuse) {
+      // Give A and B completely new random values, and re-run just numeric
+      randomize_matrix_values(A.values);
+      randomize_matrix_values(B.values);
+      KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);
+      EXPECT_TRUE(sh->are_entries_computed());
+      EXPECT_TRUE(sh->is_numeric_called());
+    }
+  }
   kh.destroy_spgemm_handle();
 
   return 0;
 }
 
 template <typename crsMat_t, typename device>
-int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2,
+int run_spgemm_old_interface(crsMat_t A, crsMat_t B,
                              KokkosSparse::SPGEMMAlgorithm spgemm_algorithm,
-                             crsMat_t &result) {
+                             crsMat_t &result, bool testReuse) {
   typedef typename crsMat_t::StaticCrsGraphType graph_t;
   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
@@ -130,42 +161,62 @@ int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2,
   // kh.set_verbose(true);
 
   kh.create_spgemm_handle(spgemm_algorithm);
+  {
+    auto sh = kh.get_spgemm_handle();
+
+    const size_t num_rows_A = A.numRows();
+    const size_t num_rows_B = B.numRows();
+    const size_t num_cols_B = B.numCols();
+
+    lno_view_t row_mapC("non_const_lnow_row", num_rows_A + 1);
+    lno_nnz_view_t entriesC;
+    scalar_view_t valuesC;
+
+    EXPECT_FALSE(sh->is_symbolic_called());
+    EXPECT_FALSE(sh->is_numeric_called());
+    EXPECT_FALSE(sh->are_rowptrs_computed());
+    EXPECT_FALSE(sh->are_entries_computed());
+
+    KokkosSparse::Experimental::spgemm_symbolic(
+        &kh, num_rows_A, num_rows_B, num_cols_B, A.graph.row_map,
+        A.graph.entries, false, B.graph.row_map, B.graph.entries, false,
+        row_mapC);
+
+    EXPECT_TRUE(sh->is_symbolic_called());
+
+    size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
+    entriesC          = lno_nnz_view_t(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"),
+        c_nnz_size);
+    valuesC = scalar_view_t(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size);
+    KokkosSparse::Experimental::spgemm_numeric(
+        &kh, num_rows_A, num_rows_B, num_cols_B, A.graph.row_map,
+        A.graph.entries, A.values, false, B.graph.row_map, B.graph.entries,
+        B.values, false, row_mapC, entriesC, valuesC);
+
+    EXPECT_TRUE(sh->are_entries_computed());
+    EXPECT_TRUE(sh->is_numeric_called());
+
+    if (testReuse) {
+      // Give A and B completely new random values, and re-run just numeric
+      randomize_matrix_values(A.values);
+      randomize_matrix_values(B.values);
+      KokkosSparse::Experimental::spgemm_numeric(
+          &kh, num_rows_A, num_rows_B, num_cols_B, A.graph.row_map,
+          A.graph.entries, A.values, false, B.graph.row_map, B.graph.entries,
+          B.values, false, row_mapC, entriesC, valuesC);
+      EXPECT_TRUE(sh->are_entries_computed());
+      EXPECT_TRUE(sh->is_numeric_called());
+    }
 
-  const size_t num_rows_1 = input_mat.numRows();
-  const size_t num_rows_2 = input_mat2.numRows();
-  const size_t num_cols_2 = input_mat2.numCols();
-
-  const size_t num_cols_1 = input_mat.numCols();
-  bool equal              = num_rows_2 == num_cols_1;
-  if (!equal) return 1;
-
-  lno_view_t row_mapC("non_const_lnow_row", num_rows_1 + 1);
-  lno_nnz_view_t entriesC;
-  scalar_view_t valuesC;
-
-  spgemm_symbolic(&kh, num_rows_1, num_rows_2, num_cols_2,
-                  input_mat.graph.row_map, input_mat.graph.entries, false,
-                  input_mat2.graph.row_map, input_mat2.graph.entries, false,
-                  row_mapC);
-
-  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
-  entriesC          = lno_nnz_view_t(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), c_nnz_size);
-  valuesC = scalar_view_t(
-      Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size);
-  spgemm_numeric(&kh, num_rows_1, num_rows_2, num_cols_2,
-                 input_mat.graph.row_map, input_mat.graph.entries,
-                 input_mat.values, false,
-
-                 input_mat2.graph.row_map, input_mat2.graph.entries,
-                 input_mat2.values, false, row_mapC, entriesC, valuesC);
-
-  graph_t static_graph(entriesC, row_mapC);
-  result = crsMat_t("CrsMatrix", num_cols_2, valuesC, static_graph);
+    graph_t static_graph(entriesC, row_mapC);
+    result = crsMat_t("CrsMatrix", num_cols_B, valuesC, static_graph);
+  }
   kh.destroy_spgemm_handle();
-
   return 0;
 }
+
 template <typename crsMat_t, typename device>
 bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
   typedef typename crsMat_t::StaticCrsGraphType graph_t;
@@ -197,7 +248,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
     return false;
   }
 
-  KokkosSparse::sort_crs_matrix(output_mat_actual);
+  // Do not sort the actual product matrix - test that it's already sorted
   KokkosSparse::sort_crs_matrix(output_mat_reference);
 
   bool is_identical = true;
@@ -209,9 +260,10 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
   if (!is_identical) {
     std::cout << "rowmaps are different." << std::endl;
     std::cout << "Actual rowmap:\n";
-    KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map);
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map, true);
     std::cout << "Correct rowmap (SPGEMM_DEBUG):\n";
-    KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map);
+    KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map,
+                                         true);
     return false;
   }
 
@@ -251,7 +303,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {
 template <typename scalar_t, typename lno_t, typename size_type,
           typename device>
 void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
-                 lno_t row_size_variance, bool oldInterface = false) {
+                 lno_t row_size_variance, bool oldInterface = false,
+                 bool testReuse = false) {
 #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL)
   {
     std::cerr
@@ -278,56 +331,26 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
   crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
       k, n, nnz, row_size_variance, bandwidth);
 
-  const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1;
+  KokkosSparse::sort_crs_matrix(A);
+  KokkosSparse::sort_crs_matrix(B);
 
   crsMat_t output_mat2;
-  if (oldInterface)
-    run_spgemm_old_interface<crsMat_t, device>(A, B, SPGEMM_DEBUG, output_mat2);
-  else
-    run_spgemm<crsMat_t, device>(A, B, SPGEMM_DEBUG, output_mat2);
+  // If this test won't reuse symbolic, we can compute the reference matrix once
+  // here
+  if (!testReuse) {
+    run_spgemm<crsMat_t, device>(A, B, SPGEMM_DEBUG, output_mat2, false);
+  }
 
   std::vector<SPGEMMAlgorithm> algorithms = {
       SPGEMM_KK, SPGEMM_KK_LP, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */,
       SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */
   };
 
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-  algorithms.push_back(SPGEMM_MKL);
-#endif
-
   for (auto spgemm_algorithm : algorithms) {
-    const uint64_t max_integer = Kokkos::ArithTraits<int>::max();
-    std::string algo           = "UNKNOWN";
-    bool is_expected_to_fail   = false;
+    std::string algo         = "UNKNOWN";
+    bool is_expected_to_fail = false;
 
     switch (spgemm_algorithm) {
-      case SPGEMM_CUSPARSE:
-        // TODO: add these test failure cases for cusparse too.
-        algo = "SPGEMM_CUSPARSE";
-#if !defined(KERNELS_HAVE_CUSPARSE) && \
-    !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
-        is_expected_to_fail = true;
-#endif
-        break;
-
-      case SPGEMM_MKL: algo = "SPGEMM_MKL";
-#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL
-        if (!KokkosSparse::Impl::mkl_is_supported_value_type<scalar_t>::value) {
-          is_expected_to_fail = true;
-        }
-#endif
-        // MKL requires local ordinals to be int.
-        // Note: empty-array special case will NOT fail on this.
-        if (!std::is_same<int, lno_t>::value && !is_empy_case) {
-          is_expected_to_fail = true;
-        }
-        // if size_type is larger than int, mkl casts it to int.
-        // it will fail if casting cause overflow.
-        if (A.values.extent(0) > max_integer) {
-          is_expected_to_fail = true;
-        }
-        break;
-
       case SPGEMM_KK: algo = "SPGEMM_KK"; break;
       case SPGEMM_KK_LP: algo = "SPGEMM_KK_LP"; break;
       case SPGEMM_KK_MEMSPEED: algo = "SPGEMM_KK_MEMSPEED"; break;
@@ -344,9 +367,10 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
     try {
       if (oldInterface)
         res = run_spgemm_old_interface<crsMat_t, device>(A, B, spgemm_algorithm,
-                                                         output_mat);
+                                                         output_mat, testReuse);
       else
-        res = run_spgemm<crsMat_t, device>(A, B, spgemm_algorithm, output_mat);
+        res = run_spgemm<crsMat_t, device>(A, B, spgemm_algorithm, output_mat,
+                                           testReuse);
     } catch (const char *message) {
       EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message;
       failed = true;
@@ -359,6 +383,12 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth,
     }
     EXPECT_EQ(is_expected_to_fail, failed);
 
+    // If this is testing reuse, the values of A and B changed so
+    // the reference matrix must be recomputed
+    if (testReuse) {
+      run_spgemm<crsMat_t, device>(A, B, SPGEMM_DEBUG, output_mat2, false);
+    }
+
     // double spgemm_time = timer1.seconds();
 
     timer1.reset();
@@ -426,13 +456,15 @@ void test_issue402() {
       scalar_view_t, lno_view_t, typename device::execution_space>(
       numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues);
   crsMat_t B("B=A^T", numRows, numRows, nnz, Bvalues, Browmap, Bentries);
+  KokkosSparse::sort_crs_matrix(A);
+  KokkosSparse::sort_crs_matrix(B);
   crsMat_t Cgold;
-  run_spgemm<crsMat_t, device>(A, B, SPGEMM_DEBUG, Cgold);
+  run_spgemm<crsMat_t, device>(A, B, SPGEMM_DEBUG, Cgold, false);
   crsMat_t C;
   bool success = true;
   std::string errMsg;
   try {
-    int res = run_spgemm<crsMat_t, device>(A, B, SPGEMM_KK_MEMORY, C);
+    int res = run_spgemm<crsMat_t, device>(A, B, SPGEMM_KK_MEMORY, C, false);
     if (res) throw "run_spgemm returned error code";
   } catch (const char *message) {
     errMsg  = message;
@@ -444,20 +476,24 @@ void test_issue402() {
     errMsg  = e.what();
     success = false;
   }
-  EXPECT_TRUE(success) << "KKMEM still has issue 402 bug! Error message:\n"
+  EXPECT_TRUE(success) << "SpGEMM still has issue 402 bug! Error message:\n"
                        << errMsg << '\n';
   bool correctResult = is_same_matrix<crsMat_t, device>(C, Cgold);
   EXPECT_TRUE(correctResult)
-      << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n";
+      << "SpGEMM still has issue 402 bug; C=AA' is incorrect!\n";
 }
 
 #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE)            \
   TEST_F(TestCategory,                                                         \
          sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) {     \
-    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 10000, 10000,          \
-                                                 10000 * 20, 500, 10, false);  \
-    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 10000, 10000,          \
-                                                 10000 * 20, 500, 10, true);   \
+    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 8000, 6000, 8000 * 20, \
+                                                 500, 10, false);              \
+    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(10000, 8000, 6000, 8000 * 20, \
+                                                 500, 10, true);               \
+    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(1000, 500, 1600, 1000 * 20,   \
+                                                 500, 10, false, true);        \
+    test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(1000, 500, 1600, 1000 * 20,   \
+                                                 500, 10, true, true);         \
     test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(0, 0, 0, 0, 10, 10, false);   \
     test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(0, 0, 0, 0, 10, 10, true);    \
     test_spgemm<SCALAR, ORDINAL, OFFSET, DEVICE>(0, 12, 5, 0, 10, 0, false);   \
diff --git a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp
index 12cfd983f1..9f2fddeaa2 100644
--- a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp
+++ b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp
@@ -154,8 +154,6 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) {
   size_t nentries2 = output_mat2.graph.entries.extent(0);
   size_t nvals2    = output_mat2.values.extent(0);
 
-  KokkosSparse::sort_crs_matrix(output_mat1);
-
   if (nrows1 != nrows2) {
     std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl;
     return false;
@@ -170,8 +168,6 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) {
     return false;
   }
 
-  KokkosSparse::sort_crs_matrix(output_mat2);
-
   bool is_identical = true;
   is_identical      = KokkosKernels::Impl::kk_is_identical_view<
       typename graph_t::row_map_type, typename graph_t::row_map_type,
@@ -237,6 +233,8 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth,
       KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix<
           crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth);
 
+  KokkosSparse::sort_crs_matrix(input_mat);
+
   crsMat_t output_mat2;
   scalar_t omega = 3.0;
 
@@ -263,6 +261,9 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth,
 
   run_spgemm_jacobi<crsMat_t, device>(input_mat, input_mat, omega, dinv,
                                       spgemm_algorithm, output_mat);
+  // Sort the reference output_mat2, but not output_mat. It should already be
+  // soted.
+  KokkosSparse::sort_crs_matrix(output_mat2);
   bool is_identical = is_same_mat<crsMat_t, device>(output_mat, output_mat2);
   EXPECT_TRUE(is_identical);
 }