Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster gs streams #2159

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
4 changes: 3 additions & 1 deletion docs/developer/apidocs/sparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,11 @@ block_spgemm

gauss_seidel
------------
.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm)
.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm)
.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm)
.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm)
.. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm)
.. doxygenfunction:: destroy_gs_handle()
.. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric)
.. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric)
.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric)
Expand Down
107 changes: 68 additions & 39 deletions sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class ClusterGaussSeidel {
typedef typename HandleType::scalar_persistent_work_view_t
scalar_persistent_work_view_t;

typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
typedef Kokkos::RangePolicy<MyExecSpace> range_policy_t;
typedef nnz_lno_t color_t;
typedef Kokkos::View<color_t*, MyTempMemorySpace> color_view_t;
typedef Kokkos::Bitset<MyExecSpace> bitset_t;
Expand Down Expand Up @@ -519,6 +519,7 @@ class ClusterGaussSeidel {
using raw_colinds_t = Kokkos::View<const nnz_lno_t*, MyTempMemorySpace,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
auto gsHandle = get_gs_handle();
auto my_exec_space = gsHandle->get_execution_space();
#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
Kokkos::Timer timer;
#endif
Expand All @@ -528,6 +529,8 @@ class ClusterGaussSeidel {
// symmetric and non-symmetric input cases.
rowmap_t sym_xadj;
colinds_t sym_adj;
// TODO: pass my_exec_space into KokkosGraph kernels. Requires
// https://github.com/kokkos/kokkos-kernels/issues/1879.
if (!this->is_symmetric) {
KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap<
in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>(
Expand Down Expand Up @@ -649,10 +652,14 @@ class ClusterGaussSeidel {
#endif
nnz_lno_persistent_work_view_t color_xadj;
nnz_lno_persistent_work_view_t color_adj;

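// Wait for coloring to finish: fence the default instance of the handle's
// execution space (presumably where the coloring kernels ran, since the
// KokkosGraph kernels do not take my_exec_space yet) before
// create_reverse_map reads `colors` on my_exec_space.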
using ColoringExecSpace = typename HandleType::HandleExecSpace;
ColoringExecSpace().fence();
KokkosKernels::Impl::create_reverse_map<
typename HandleType::GraphColoringHandleType::color_view_t,
nnz_lno_persistent_work_view_t, MyExecSpace>(
numClusters, numColors, colors, color_xadj, color_adj);
my_exec_space, numClusters, numColors, colors, color_xadj, color_adj);
#ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
MyExecSpace().fence();
std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl;
Expand All @@ -661,6 +668,8 @@ class ClusterGaussSeidel {
nnz_lno_persistent_work_host_view_t color_xadj_host(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Color xadj"),
color_xadj.extent(0));
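// NOTE: the deep copy below is called without an execution space instance,
// so it is a blocking (fencing) copy. It therefore ensures that we don't
// start running numeric on a non-default stream until our symbolic data has
// landed.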
Kokkos::deep_copy(color_xadj_host, color_xadj);
gsHandle->set_color_xadj(color_xadj_host);
gsHandle->set_color_adj(color_adj);
Expand Down Expand Up @@ -735,7 +744,8 @@ class ClusterGaussSeidel {
};

void initialize_numeric() {
auto gsHandle = get_gs_handle();
auto gsHandle = get_gs_handle();
auto my_exec_space = gsHandle->get_execution_space();
if (!gsHandle->is_symbolic_called()) {
this->initialize_symbolic();
}
Expand All @@ -751,25 +761,29 @@ class ClusterGaussSeidel {
this->handle->get_suggested_team_size(suggested_vector_size);

scalar_persistent_work_view_t inverse_diagonal(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Aii^-1"), num_rows);
Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing,
"Aii^-1"),
num_rows);
nnz_lno_t rows_per_team = this->handle->get_team_work_size(
suggested_team_size, MyExecSpace().concurrency(), num_rows);
suggested_team_size, my_exec_space.concurrency(), num_rows);

if (have_diagonal_given) {
Kokkos::deep_copy(inverse_diagonal, this->given_inverse_diagonal);
Kokkos::deep_copy(my_exec_space, inverse_diagonal,
this->given_inverse_diagonal);
} else {
// extract inverse diagonal from matrix
Get_Matrix_Diagonals gmd(this->row_map, this->entries, this->values,
inverse_diagonal, num_rows, rows_per_team);
if (gsHandle->use_teams()) {
Kokkos::parallel_for(
"KokkosSparse::GaussSeidel::team_get_matrix_diagonals",
team_policy_t((num_rows + rows_per_team - 1) / rows_per_team,
team_policy_t(my_exec_space,
(num_rows + rows_per_team - 1) / rows_per_team,
suggested_team_size, suggested_vector_size),
gmd);
} else {
Kokkos::parallel_for("KokkosSparse::GaussSeidel::get_matrix_diagonals",
my_exec_space(0, num_rows), gmd);
range_policy_t(my_exec_space, 0, num_rows), gmd);
}
}
gsHandle->set_inverse_diagonal(inverse_diagonal);
Expand All @@ -787,7 +801,8 @@ class ClusterGaussSeidel {
nnz_scalar_t omega = Kokkos::ArithTraits<nnz_scalar_t>::one(),
bool apply_forward = true, bool apply_backward = true,
bool /*update_y_vector*/ = true) {
auto gsHandle = get_gs_handle();
auto gsHandle = get_gs_handle();
auto my_exec_space = gsHandle->get_execution_space();

size_type nnz = entries.extent(0);
nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj();
Expand All @@ -797,8 +812,8 @@ class ClusterGaussSeidel {
color_t numColors = gsHandle->get_num_colors();

if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<x_value_array_type, MyExecSpace>(
num_cols, x_lhs_output_vec);
KokkosKernels::Impl::zero_vector(my_exec_space, num_cols,
x_lhs_output_vec);
}

scalar_persistent_work_view_t inverse_diagonal =
Expand All @@ -811,7 +826,7 @@ class ClusterGaussSeidel {
this->handle->get_suggested_team_size(suggested_vector_size);

nnz_lno_t rows_per_team = this->handle->get_team_work_size(
suggested_team_size, MyExecSpace().concurrency(), num_rows);
suggested_team_size, my_exec_space.concurrency(), num_rows);
// Get clusters per team. Round down to favor finer granularity, since
// this is sensitive to load imbalance
nnz_lno_t clusters_per_team =
Expand All @@ -825,33 +840,34 @@ class ClusterGaussSeidel {
color_adj, gsHandle->get_cluster_xadj(), gsHandle->get_cluster_adj(),
inverse_diagonal, clusters_per_team, omega);

this->IterativeTeamPSGS(gs, numColors, h_color_xadj, suggested_team_size,
suggested_vector_size, numIter, apply_forward,
apply_backward);
this->IterativeTeamPSGS(my_exec_space, gs, numColors, h_color_xadj,
suggested_team_size, suggested_vector_size,
numIter, apply_forward, apply_backward);
} else {
PSGS<x_value_array_type, y_value_array_type> gs(
this->row_map, this->entries, this->values, x_lhs_output_vec,
y_rhs_input_vec, color_adj, gsHandle->get_cluster_xadj(),
gsHandle->get_cluster_adj(), omega, inverse_diagonal);

this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward,
apply_backward);
this->IterativePSGS(my_exec_space, gs, numColors, h_color_xadj, numIter,
apply_forward, apply_backward);
}
}

// Runs `num_iteration` sweeps of the team-policy cluster Gauss-Seidel by
// calling DoTeamPSGS once per iteration, forwarding every other argument
// unchanged.
// NOTE(review): this span is a rendered diff, so two versions of the
// signature and of the DoTeamPSGS call appear back to back; presumably the
// version taking `my_exec_space` is the post-change one (it matches the
// caller that passes `my_exec_space` first) -- confirm against the PR.
template <typename TPSGS>
void IterativeTeamPSGS(TPSGS& gs, color_t numColors,
void IterativeTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs,
color_t numColors,
nnz_lno_persistent_work_host_view_t h_color_xadj,
nnz_lno_t team_size, nnz_lno_t vec_size,
int num_iteration, bool apply_forward,
bool apply_backward) {
// One full DoTeamPSGS pass (forward and/or backward, per the flags) per
// requested iteration.
for (int i = 0; i < num_iteration; ++i)
this->DoTeamPSGS(gs, numColors, h_color_xadj, team_size, vec_size,
apply_forward, apply_backward);
this->DoTeamPSGS(my_exec_space, gs, numColors, h_color_xadj, team_size,
vec_size, apply_forward, apply_backward);
}

template <typename TPSGS>
void DoTeamPSGS(TPSGS& gs, color_t numColors,
void DoTeamPSGS(MyExecSpace& my_exec_space, TPSGS& gs, color_t numColors,
nnz_lno_persistent_work_host_view_t h_color_xadj,
nnz_lno_t team_size, nnz_lno_t vec_size, bool apply_forward,
bool apply_backward) {
Expand All @@ -865,9 +881,12 @@ class ClusterGaussSeidel {
gs._color_set_end = color_index_end;
Kokkos::parallel_for(
"KokkosSparse::GaussSeidel::Team_PSGS::forward",
team_policy_t((overall_work + gs._clusters_per_team - 1) /
gs._clusters_per_team,
team_size, vec_size),
Kokkos::Experimental::require(
team_policy_t(my_exec_space,
(overall_work + gs._clusters_per_team - 1) /
gs._clusters_per_team,
team_size, vec_size),
Kokkos::Experimental::WorkItemProperty::HintLightWeight),
gs);
}
}
Expand All @@ -882,10 +901,13 @@ class ClusterGaussSeidel {
gs._color_set_begin = color_index_begin;
gs._color_set_end = color_index_end;
Kokkos::parallel_for(
"KokkosSparse::GaussSeidel::Team_PSGS::forward",
team_policy_t((overall_work + gs._clusters_per_team - 1) /
gs._clusters_per_team,
team_size, vec_size),
"KokkosSparse::GaussSeidel::Team_PSGS::backward",
Kokkos::Experimental::require(
team_policy_t(my_exec_space,
(overall_work + gs._clusters_per_team - 1) /
gs._clusters_per_team,
team_size, vec_size),
Kokkos::Experimental::WorkItemProperty::HintLightWeight),
gs);
if (i == 0) {
break;
Expand All @@ -895,17 +917,18 @@ class ClusterGaussSeidel {
}

// Runs `num_iteration` sweeps of the range-policy cluster Gauss-Seidel by
// calling DoPSGS once per iteration, forwarding every other argument
// unchanged.
// NOTE(review): this span is a rendered diff, so two versions of the
// signature and of the DoPSGS call appear back to back; presumably the
// version taking `my_exec_space` is the post-change one (it matches the
// caller that passes `my_exec_space` first) -- confirm against the PR.
template <typename PSGS>
void IterativePSGS(PSGS& gs, color_t numColors,
void IterativePSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors,
nnz_lno_persistent_work_host_view_t h_color_xadj,
int num_iteration, bool apply_forward,
bool apply_backward) {
// One full DoPSGS pass (forward and/or backward, per the flags) per
// requested iteration.
for (int i = 0; i < num_iteration; ++i) {
this->DoPSGS(gs, numColors, h_color_xadj, apply_forward, apply_backward);
this->DoPSGS(my_exec_space, gs, numColors, h_color_xadj, apply_forward,
apply_backward);
}
}

template <typename PSGS>
void DoPSGS(PSGS& gs, color_t numColors,
void DoPSGS(MyExecSpace& my_exec_space, PSGS& gs, color_t numColors,
nnz_lno_persistent_work_host_view_t h_color_xadj,
bool apply_forward, bool apply_backward) {
if (apply_forward) {
Expand All @@ -914,10 +937,13 @@ class ClusterGaussSeidel {
nnz_lno_t color_index_end = h_color_xadj(i + 1);
gs._color_set_begin = color_index_begin;
gs._color_set_end = color_index_end;
Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::forward",
Kokkos::RangePolicy<MyExecSpace, PSGS_ForwardTag>(
0, color_index_end - color_index_begin),
gs);
Kokkos::parallel_for(
"KokkosSparse::GaussSeidel::PSGS::forward",
Kokkos::Experimental::require(
Kokkos::RangePolicy<MyExecSpace, PSGS_ForwardTag>(
my_exec_space, 0, color_index_end - color_index_begin),
Kokkos::Experimental::WorkItemProperty::HintLightWeight),
gs);
}
}
if (apply_backward && numColors) {
Expand All @@ -926,10 +952,13 @@ class ClusterGaussSeidel {
nnz_lno_t color_index_end = h_color_xadj(i + 1);
gs._color_set_begin = color_index_begin;
gs._color_set_end = color_index_end;
Kokkos::parallel_for("KokkosSparse::GaussSeidel::PSGS::backward",
Kokkos::RangePolicy<MyExecSpace, PSGS_BackwardTag>(
0, color_index_end - color_index_begin),
gs);
Kokkos::parallel_for(
"KokkosSparse::GaussSeidel::PSGS::backward",
Kokkos::Experimental::require(
Kokkos::RangePolicy<MyExecSpace, PSGS_BackwardTag>(
my_exec_space, 0, color_index_end - color_index_begin),
Kokkos::Experimental::WorkItemProperty::HintLightWeight),
gs);
if (i == 0) {
break;
}
Expand Down
9 changes: 4 additions & 5 deletions sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle,
Kokkos::parallel_reduce(
"check_count host",
Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(
0, nodes_per_level.extent(0)),
space, 0, nodes_per_level.extent(0)),
KOKKOS_LAMBDA(const long i, long& update) {
update += nodes_per_level(i);
},
Expand All @@ -285,8 +285,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle,
check_count = 0; // reset
Kokkos::parallel_reduce(
"check_count device",
Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(
0, dnodes_per_level.extent(0)),
Kokkos::RangePolicy<ExecutionSpace>(0, dnodes_per_level.extent(0)),
KOKKOS_LAMBDA(const long i, long& update) {
update += dnodes_per_level(i);
},
Expand Down Expand Up @@ -740,8 +739,8 @@ void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle,
check_count = 0; // reset
Kokkos::parallel_reduce(
"check_count device",
Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(
0, dnodes_per_level.extent(0)),
Kokkos::RangePolicy<ExecutionSpace>(space, 0,
dnodes_per_level.extent(0)),
KOKKOS_LAMBDA(const long i, long& update) {
update += dnodes_per_level(i);
},
Expand Down
Loading
Loading