Skip to content

Commit

Permalink
Make deep_copy a non-collective operation
Browse files Browse the repository at this point in the history
  • Loading branch information
janciesko committed May 8, 2024
1 parent 467e8c0 commit 9138c7a
Show file tree
Hide file tree
Showing 10 changed files with 26 additions and 32 deletions.
1 change: 1 addition & 0 deletions src/core/Kokkos_RemoteSpaces_DeepCopy.hpp
Expand Up @@ -1192,6 +1192,7 @@ inline void deep_copy(
((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();

Kokkos::fence(
"Kokkos::deep_copy: copy between contiguous views, pre view equality "
"check");
Expand Down
6 changes: 0 additions & 6 deletions src/impl/mpispace/Kokkos_MPISpace.cpp
Expand Up @@ -152,7 +152,6 @@ void MPISpace::impl_deallocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
if (arg_alloc_ptr) {
Kokkos::fence("HostSpace::impl_deallocate before free");
fence();
size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
if (Kokkos::Profiling::profileLibraryLoaded()) {
Expand Down Expand Up @@ -214,13 +213,11 @@ namespace Impl {

Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::MPISpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::MPISpace().fence();
memcpy(dst, src, n);
}

Kokkos::Impl::DeepCopy<Kokkos::Experimental::MPISpace, HostSpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::MPISpace().fence();
memcpy((char *)dst, (char *)src, n);
}

Expand All @@ -229,7 +226,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::MPISpace,
const void
*src,
size_t n) {
Kokkos::Experimental::MPISpace().fence();
memcpy(dst, src, n);
}

Expand All @@ -238,7 +234,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::MPISpace,
Kokkos::Experimental::MPISpace,
ExecutionSpace>::DeepCopy(void *dst, const void *src,
size_t n) {
Kokkos::Experimental::MPISpace().fence();
memcpy(dst, src, n);
}

Expand All @@ -248,7 +243,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::MPISpace,
ExecutionSpace>::DeepCopy(const ExecutionSpace &exec,
void *dst, const void *src,
size_t n) {
Kokkos::Experimental::MPISpace().fence();
memcpy(dst, src, n);
}

Expand Down
5 changes: 0 additions & 5 deletions src/impl/nvshmemspace/Kokkos_NVSHMEMSpace.cpp
Expand Up @@ -117,7 +117,6 @@ void NVSHMEMSpace::impl_deallocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
if (arg_alloc_ptr) {
Kokkos::fence("HostSpace::impl_deallocate before free");
fence();
size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
if (Kokkos::Profiling::profileLibraryLoaded()) {
Expand All @@ -142,13 +141,11 @@ namespace Impl {

Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::NVSHMEMSpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::NVSHMEMSpace().fence();
cudaMemcpy(dst, src, n, cudaMemcpyDefault);
}

Kokkos::Impl::DeepCopy<Kokkos::Experimental::NVSHMEMSpace, HostSpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::NVSHMEMSpace().fence();
cudaMemcpy(dst, src, n, cudaMemcpyDefault);
}

Expand All @@ -157,7 +154,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::NVSHMEMSpace,
Kokkos::Experimental::NVSHMEMSpace,
ExecutionSpace>::DeepCopy(void *dst, const void *src,
size_t n) {
Kokkos::Experimental::NVSHMEMSpace().fence();
cudaMemcpy(dst, src, n, cudaMemcpyDefault);
}

Expand All @@ -167,7 +163,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::NVSHMEMSpace,
ExecutionSpace>::DeepCopy(const ExecutionSpace &exec,
void *dst, const void *src,
size_t n) {
Kokkos::Experimental::NVSHMEMSpace().fence();
cudaMemcpy(dst, src, n, cudaMemcpyDefault);
}

Expand Down
5 changes: 0 additions & 5 deletions src/impl/rocshmemspace/Kokkos_ROCSHMEMSpace.cpp
Expand Up @@ -117,7 +117,6 @@ void ROCSHMEMSpace::impl_deallocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
if (arg_alloc_ptr) {
Kokkos::fence("HostSpace::impl_deallocate before free");
fence();
size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
if (Kokkos::Profiling::profileLibraryLoaded()) {
Expand All @@ -144,14 +143,12 @@ Kokkos::Impl::DeepCopy<
HostSpace, Kokkos::Experimental::ROCSHMEMSpace>::DeepCopy(void *dst,
const void *src,
size_t n) {
Kokkos::Experimental::ROCSHMEMSpace().fence();
hipMemcpy(dst, src, n, hipMemcpyDefault);
}

Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCSHMEMSpace,
HostSpace>::DeepCopy(void *dst, const void *src,
size_t n) {
Kokkos::Experimental::ROCSHMEMSpace().fence();
hipMemcpy(dst, src, n, hipMemcpyDefault);
}

Expand All @@ -160,7 +157,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCSHMEMSpace,
Kokkos::Experimental::ROCSHMEMSpace,
ExecutionSpace>::DeepCopy(void *dst, const void *src,
size_t n) {
Kokkos::Experimental::ROCSHMEMSpace().fence();
hipMemcpy(dst, src, n, hipMemcpyDefault);
}

Expand All @@ -170,7 +166,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCSHMEMSpace,
ExecutionSpace>::DeepCopy(const ExecutionSpace &exec,
void *dst, const void *src,
size_t n) {
Kokkos::Experimental::ROCSHMEMSpace().fence();
hipMemcpy(dst, src, n, hipMemcpyDefault);
}

Expand Down
6 changes: 0 additions & 6 deletions src/impl/shmemspace/Kokkos_SHMEMSpace.cpp
Expand Up @@ -130,7 +130,6 @@ void SHMEMSpace::impl_deallocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
if (arg_alloc_ptr) {
Kokkos::fence("HostSpace::impl_deallocate before free");
fence();
size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
if (Kokkos::Profiling::profileLibraryLoaded()) {
Expand All @@ -152,13 +151,11 @@ namespace Impl {

Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::SHMEMSpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::SHMEMSpace().fence();
memcpy(dst, src, n);
}

Kokkos::Impl::DeepCopy<Kokkos::Experimental::SHMEMSpace, HostSpace>::DeepCopy(
void *dst, const void *src, size_t n) {
Kokkos::Experimental::SHMEMSpace().fence();
memcpy(dst, src, n);
}

Expand All @@ -167,7 +164,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::SHMEMSpace,
const void
*src,
size_t n) {
Kokkos::Experimental::SHMEMSpace().fence();
memcpy(dst, src, n);
}

Expand All @@ -176,7 +172,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::SHMEMSpace,
Kokkos::Experimental::SHMEMSpace,
ExecutionSpace>::DeepCopy(void *dst, const void *src,
size_t n) {
Kokkos::Experimental::SHMEMSpace().fence();
memcpy(dst, src, n);
}

Expand All @@ -186,7 +181,6 @@ Kokkos::Impl::DeepCopy<Kokkos::Experimental::SHMEMSpace,
ExecutionSpace>::DeepCopy(const ExecutionSpace &exec,
void *dst, const void *src,
size_t n) {
Kokkos::Experimental::SHMEMSpace().fence();
memcpy(dst, src, n);
}

Expand Down
12 changes: 10 additions & 2 deletions unit_tests/Test_DeepCopy.cpp
Expand Up @@ -46,6 +46,8 @@ void test_deepcopy(
Kokkos::parallel_for(
"Team", 1, KOKKOS_LAMBDA(const int i) { v_R(my_rank, 0) = 0x123; });

Kokkos::fence();
RemoteSpace_t::fence();
Kokkos::deep_copy(v_H, v_R);
ASSERT_EQ(0x123, v_H(0, 0));
}
Expand All @@ -66,10 +68,13 @@ void test_deepcopy(

ViewHost_t v_H("HostView", 1, i1);
ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Team", i1, KOKKOS_LAMBDA(const int i) { v_R(my_rank, i) = 0x123; });

Kokkos::fence();
RemoteSpace_t::fence();
Kokkos::deep_copy(v_H, v_R);
for (int i = 0; i < i1; ++i) {
ASSERT_EQ(0x123, v_H(0, i));
Expand All @@ -92,12 +97,14 @@ void test_deepcopy(

ViewHost_t v_H("HostView", 1, i1, i2);
ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Team", i1, KOKKOS_LAMBDA(const int i) {
for (int j = 0; j < i2; ++j) v_R(my_rank, i, j) = 0x123;
});

Kokkos::fence();
Kokkos::deep_copy(v_H, v_R);
for (int i = 0; i < i1; ++i)
for (int j = 0; j < i2; ++j) ASSERT_EQ(0x123, v_H(0, i, j));
Expand All @@ -119,7 +126,7 @@ void test_deepcopy(
ViewHost_t v_H("HostView", 1, 1);
v_H(0, 0) = 0x123;
ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, 1);

RemoteSpace_t::fence();
Kokkos::deep_copy(v_R, v_H);

Kokkos::parallel_for(
Expand Down Expand Up @@ -147,6 +154,7 @@ void test_deepcopy(
for (int i = 0; i < i1; ++i) v_H(0, i) = 0x123;

ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1);
RemoteSpace_t::fence();
Kokkos::deep_copy(v_R, v_H);

Kokkos::parallel_for(
Expand Down Expand Up @@ -176,7 +184,7 @@ void test_deepcopy(
for (int j = 0; j < i2; ++j) v_H(0, i, j) = 0x123;

ViewRemote_t v_R = ViewRemote_t("RemoteView", num_ranks, i1, i2);

RemoteSpace_t::fence();
Kokkos::deep_copy(v_R, v_H);

Kokkos::parallel_for(
Expand Down
2 changes: 1 addition & 1 deletion unit_tests/Test_LocalDeepCopy.cpp
Expand Up @@ -824,7 +824,7 @@ void test_localdeepcopy_withSubview(
Kokkos::subview(v_R, prev_rank, Kokkos::ALL, Kokkos::ALL);
auto v_R_subview_local =
Kokkos::subview(v_R, my_rank, Kokkos::ALL, Kokkos::ALL);
return;

Kokkos::parallel_for(
"Init", i1, KOKKOS_LAMBDA(const int i) {
for (int j = 0; j < i2; ++j) v_R_subview_local(i, j) = my_rank;
Expand Down
1 change: 1 addition & 0 deletions unit_tests/Test_Reduction.cpp
Expand Up @@ -112,6 +112,7 @@ void test_scalar_reduce_partitioned_1D(int dim1) {
v(my_rank, i) = static_cast<Data_t>(start + i);
});

Kokkos::fence();
RemoteSpace_t::fence();

Data_t gsum = 0;
Expand Down
4 changes: 4 additions & 0 deletions unit_tests/Test_RemoteAccess.cpp
Expand Up @@ -38,6 +38,8 @@ void test_remote_accesses(

int next_rank = (my_rank + 1) % num_ranks;

RemoteSpace_t::fence();

Kokkos::parallel_for(
"Update", size, KOKKOS_LAMBDA(const int i) {
/*Get Op*/
Expand Down Expand Up @@ -72,6 +74,8 @@ void test_remote_accesses(
int next_rank = (my_rank + 1) % num_ranks;
int prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;

RemoteSpace_t::fence();

Kokkos::parallel_for(
"Update", size, KOKKOS_LAMBDA(const int i) {
/*Put Op*/
Expand Down
16 changes: 9 additions & 7 deletions unit_tests/Test_Subview.cpp
Expand Up @@ -210,7 +210,6 @@ void test_subview3D_byScalar(int i1, int i2, int i3) {

Kokkos::fence();
RemoteSpace_t::fence();

Kokkos::deep_copy(v_h, v);

for (int i = 0; i < v_h.extent(0); ++i) {
Expand Down Expand Up @@ -415,13 +414,12 @@ void test_partitioned_subview1D(int i1, int i2, int sub1, int sub2) {
// Init
deep_copy(v_h, VAL);

auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1),
auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1),
Kokkos::ALL, Kokkos::ALL);

auto v_sub_1 = Kokkos::subview(v, Kokkos::ALL, sub1, sub2);
auto v_sub_2 = ViewRemote_1D_t(v, Kokkos::ALL, sub1, sub2);

Kokkos::deep_copy(v_sub, v_h);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Increment", 1, KOKKOS_LAMBDA(const int i) {
Expand Down Expand Up @@ -457,13 +455,12 @@ void test_partitioned_subview2D(int i1, int i2, int sub1) {
// Init
deep_copy(v_h, VAL);

auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1),
auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1),
Kokkos::ALL, Kokkos::ALL);

auto v_sub_1 = Kokkos::subview(v, Kokkos::ALL, sub1, Kokkos::ALL);
auto v_sub_2 = ViewRemote_2D_t(v, Kokkos::ALL, sub1, Kokkos::ALL);

Kokkos::deep_copy(v_sub, v_h);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Increment", v_sub_1.extent(1), KOKKOS_LAMBDA(const int i) {
Expand Down Expand Up @@ -503,6 +500,7 @@ void test_partitioned_subview3D(int i1, int i2, int sub1, int sub2) {
Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL, std::make_pair(sub1, sub2));

Kokkos::deep_copy(v_sub, v_h);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Increment", v_sub_1.extent(1), KOKKOS_LAMBDA(const int i) {
Expand Down Expand Up @@ -579,6 +577,7 @@ void test_partitioned_subview2D_byRank_nextRank(int i1, int i2) {
auto v_sub = Kokkos::subview(v, std::make_pair(my_rank, my_rank + 1),
Kokkos::ALL, Kokkos::ALL);
auto v_sub_next = Kokkos::subview(v, next_rank, Kokkos::ALL, Kokkos::ALL);
RemoteSpace_t::fence();
Kokkos::deep_copy(v_sub, v_h);

Kokkos::parallel_for(
Expand Down Expand Up @@ -621,6 +620,7 @@ void test_partitioned_subviewOfSubviewRange_2D(int i1, int i2) {
auto v_sub_next_half = Kokkos::subview(
v_sub_next, Kokkos::pair<int, int>(i1_half, i1), Kokkos::ALL);
Kokkos::deep_copy(v_sub, v_h);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Increment", v_sub_next_half.extent(0), KOKKOS_LAMBDA(const int i) {
Expand Down Expand Up @@ -663,7 +663,9 @@ void test_partitioned_subviewOfSubviewScalar_2D(int i1, int i2) {
Kokkos::ALL, Kokkos::ALL);
auto v_sub_next = Kokkos::subview(v, next_rank, Kokkos::ALL, Kokkos::ALL);
auto v_sub_next_half = Kokkos::subview(v_sub_next, i1_half, Kokkos::ALL);

Kokkos::deep_copy(v_sub, v_h);
RemoteSpace_t::fence();

Kokkos::parallel_for(
"Increment", v_sub_next_half.extent(0),
Expand Down

0 comments on commit 9138c7a

Please sign in to comment.