Skip to content

Commit

Permalink
Simplify handling of subviews over dim0, add unit tests, fix issues
Browse files Browse the repository at this point in the history
  • Loading branch information
janciesko committed Feb 29, 2024
1 parent c60152d commit eb220d4
Show file tree
Hide file tree
Showing 10 changed files with 479 additions and 203 deletions.
22 changes: 14 additions & 8 deletions benchmarks/access_overhead/access_overhead_p2p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ struct Access<ViewType_t, typename std::enable_if_t<
double gups = 1e-9 * ((N * iters) / time);
double size = N * sizeof(double) / 1024.0 / 1024.0;
double bw = gups * sizeof(double);
printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
modes[mode].c_str(), N, size, iters, time, gups, bw);
}
}
Expand Down Expand Up @@ -275,7 +275,7 @@ struct Access_CudaAware<
double gups = 1e-9 * ((N * iters) / time);
double size = N * sizeof(double) / 1024.0 / 1024.0;
double bw = gups * sizeof(double);
printf("access_overhead,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
(modes[mode]).c_str(), N, size, iters, time, gups, bw);
}
}
Expand Down Expand Up @@ -397,7 +397,7 @@ struct Access<ViewType_t, typename std::enable_if_t<
double size = N * sizeof(double) / 1024.0 / 1024.0;
double bw = gups * sizeof(double);
if (rma_op == RMA_GET) {
printf("access_overhead_p2p_get,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
modes[mode].c_str(), N, size, iters, time, gups, bw);
} else {
printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
Expand Down Expand Up @@ -438,7 +438,7 @@ struct Access_LDC<
void operator()(const size_t i) const {
double val1 = v_tmp(i);
double val2 = v(i);
printf("debug: %li, %f, %f\n", i, val1, val2);
printf("debug: %li, %.2f, %.2f\n", i, val1, val2);
}

KOKKOS_FUNCTION
Expand Down Expand Up @@ -486,6 +486,12 @@ struct Access_LDC<
Kokkos::pair(remote_range.first + start_offset,
remote_range.first + start_offset + team_block);

printf("[%lu, %lu], [%lu, %lu], [%lu, %lu], [%lu, %lu]\n",
team_remote_range.first, team_remote_range.second,
team_local_range.first, team_local_range.second,
remote_range.first, remote_range.second, local_range.first,
local_range.second);

// Construct team subviews
auto v_subview_remote = Kokkos::subview(v, team_remote_range);
auto v_tmp_subview_local = Kokkos::subview(v_tmp, team_local_range);
Expand Down Expand Up @@ -611,14 +617,14 @@ struct Access_LDC<
time_a = timer.seconds();
#if defined(ACCESS_LDC_USE_MULTI_LDC) || \
defined(ACCESS_LDC_USE_MULTI_LDC_BUILTIN)
Kokkos::parallel_for("block_transfer",
team_policy_get_update_t(64, 1), *this);
Kokkos::parallel_for("block_transfer", team_policy_get_update_t(4, 1),
*this);
#else
Kokkos::parallel_for("block_transfer", team_policy_get_update_t(1, 1),
*this);
#endif
Kokkos::fence();
#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) && (0)
#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG) && (1)
Kokkos::parallel_for(
"printf values for debugging",
Kokkos::RangePolicy(local_range.first, local_range.second),
Expand Down Expand Up @@ -697,7 +703,7 @@ struct Access_LDC<
double size = N * sizeof(double) / 1024.0 / 1024.0;
double bw = gups * sizeof(double);
if (rma_op == RMA_GET) {
printf("access_overhead_p2p_get,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
printf("access_overhead_p2p,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
modes[mode].c_str(), N, size, iters, time, gups, bw);
} else {
printf("access_overhead_p2p_put,%s,%lu,%lf,%lu,%lf,%lf,%lf\n",
Expand Down
70 changes: 34 additions & 36 deletions benchmarks/access_overhead/scripts/run_over_size_p2p.sh
Original file line number Diff line number Diff line change
@@ -1,58 +1,56 @@
#/bin/bash
BENCHMARK=$1
HOST=$2
DEFAULT_SIZE=1000
DEFAULT_SIZE=100

#exports
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
export OMP_NUM_THREADS=32

ITERS=30
ITERS=1

HASH=`date|md5sum|head -c 5`
DEVS="0,1"
FILENAME="${BENCHMARK}_${HASH}_p2p.res"
echo $FILENAME
echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME
VARS0="--bind-to core --map-by socket -x CUDA_VISIBLE_DEVICES=0,2"
VARS0="--bind-to core --map-by socket"
VARS1=" -x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x \
LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/12.0.0/gcc/12.2.0/base/rantbbm/lib64/:$LD_LIBRARY_PATH" # -x NVSHMEM_SYMMETRIC_SIZE=10730741824"

#MPI + Kokkos
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done

#Cuda-ware MPI + Kokkos
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done

#Kokkos Remote Spaces
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done
LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/12.0.0/gcc/12.2.0/base/rantbbm/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=10730741824"

#Kokkos Remote Spaces + LDC
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 5); do
for reps in $(seq 1 3); do
mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 3 | tee -a $FILENAME
for S in $(seq 1 1); do
for reps in $(seq 1 1); do
CUDA_VISIBLE_DEVICES=$DEVS mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 3 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done



# #Kokkos Remote Spaces
# let SIZE=$DEFAULT_SIZE
# for S in $(seq 1 20); do
# for reps in $(seq 1 3); do
# CUDA_VISIBLE_DEVICES=$DEVS mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME
# done
# let SIZE=$SIZE*2
# done

# #CUDA-aware MPI + Kokkos
# let SIZE=$DEFAULT_SIZE
# for S in $(seq 1 20); do
# for reps in $(seq 1 3); do
# CUDA_VISIBLE_DEVICES=$DEVS mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME
# done
# let SIZE=$SIZE*2
# done

# #MPI + Kokkos
# let SIZE=$DEFAULT_SIZE
# for S in $(seq 1 20); do
# for reps in $(seq 1 3); do
# CUDA_VISIBLE_DEVICES=$DEVS mpirun -np 2 $VARS0 $VARS1 $VARS2 -host $HOST ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME
# done
# let SIZE=$SIZE*2
# done
50 changes: 24 additions & 26 deletions src/core/Kokkos_RemoteSpaces_Helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ bool is_local_view(
view_type v,
std::enable_if_t<Is_View_Of_Type_RemoteSpaces<view_type>::value> * =
nullptr) {
return (v.impl_map().get_lowest_participating_PE() ==
v.impl_map().get_my_PE());
return (v.impl_map().get_PE() == v.impl_map().get_logical_PE());
}

template <class view_type>
Expand All @@ -67,12 +66,14 @@ bool is_local_view(

template <class T>
struct RemoteSpaces_View_Properties {
/* Is the first index denoting PE*/
bool R0;
/* Is this view a subview of another view*/
bool is_subview;
/* Is the first index denoting PE */
// bool R0;
/* Whether this view is a subview created over dim0; if so,
 * we use the indexing of an ordinary view
 */
bool using_local_indexing;
/* Index offset in dim0 */
T R0_domain_offset;
T R0_offset;
/* Num local elems in dim0 */
T R0_size;
/* Com size and rank*/
Expand All @@ -81,32 +82,29 @@ struct RemoteSpaces_View_Properties {

KOKKOS_FUNCTION
RemoteSpaces_View_Properties() {
R0 = true; /* default is true */
is_subview = false;
R0_domain_offset = 0;
R0_size = 0;
num_PEs = Kokkos::Experimental::get_num_pes();
my_PE = Kokkos::Experimental::get_my_pe();
using_local_indexing = false;
R0_offset = 0;
R0_size = 0;
num_PEs = Kokkos::Experimental::get_num_pes();
my_PE = Kokkos::Experimental::get_my_pe();
}

KOKKOS_FUNCTION
RemoteSpaces_View_Properties(const RemoteSpaces_View_Properties &rhs) {
R0 = rhs.R0;
is_subview = rhs.is_subview;
R0_domain_offset = rhs.R0_domain_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
my_PE = rhs.my_PE;
using_local_indexing = rhs.using_local_indexing;
R0_offset = rhs.R0_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
my_PE = rhs.my_PE;
}

KOKKOS_FUNCTION RemoteSpaces_View_Properties &operator=(
const RemoteSpaces_View_Properties &rhs) {
R0 = rhs.R0;
is_subview = rhs.is_subview;
R0_domain_offset = rhs.R0_domain_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
my_PE = rhs.my_PE;
using_local_indexing = rhs.using_local_indexing;
R0_offset = rhs.R0_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
my_PE = rhs.my_PE;
return *this;
}
};
Expand Down Expand Up @@ -146,7 +144,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::pair<T, T> get_range(T size, int pe) {

template <typename T>
KOKKOS_INLINE_FUNCTION Kokkos::pair<T, T> get_local_range(T size) {
auto pe = Kokkos::Experimental::get_my_pe();
auto pe = get_my_pe();
return getRange(size, pe);
}

Expand Down
26 changes: 14 additions & 12 deletions src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,9 @@ auto KOKKOS_INLINE_FUNCTION get_local_subview(T view, P r) {
}

template <class T>
auto KOKKOS_INLINE_FUNCTION get_subview_start_adr(T view) {
return view.data();
auto KOKKOS_INLINE_FUNCTION get_view_adr(T view) {
return view.impl_map().handle().ptr;
}

} // namespace Impl

namespace Experimental {
Expand All @@ -73,8 +72,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
std::is_same<typename ViewTraits<ST, SP...>::specialize,
Kokkos::Experimental::RemoteSpaceSpecializeTag>::value)>::
type * = nullptr) {
int src_rank = src.impl_map().get_lowest_participating_PE();
int dst_rank = dst.impl_map().get_lowest_participating_PE();
int src_rank = src.impl_map().get_logical_PE();
int dst_rank = dst.impl_map().get_logical_PE();
int my_rank = get_my_pe();

if (src_rank != my_rank && dst_rank != my_rank) {
Expand Down Expand Up @@ -115,8 +114,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
auto dst_subview = Kokkos::Impl::get_local_subview(dst, team_range);

// Construct subview offsets
auto src_subview_ptr = Kokkos::Impl::get_subview_start_adr(src_subview);
auto dst_subview_ptr = Kokkos::Impl::get_subview_start_adr(dst_subview);
auto src_subview_ptr = Kokkos::Impl::get_view_adr(src_subview);
auto dst_subview_ptr = Kokkos::Impl::get_view_adr(dst_subview);

if (src_rank != my_rank) {
team.team_barrier();
Expand Down Expand Up @@ -166,8 +165,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
std::is_same<typename ViewTraits<ST, SP...>::specialize,
Kokkos::Experimental::RemoteSpaceSpecializeTag>::value)>::
type * = nullptr) {
int src_rank = src.impl_map().get_lowest_participating_PE();
int dst_rank = dst.impl_map().get_lowest_participating_PE();
int src_rank = src.impl_map().get_logical_PE();
int dst_rank = dst.impl_map().get_logical_PE();
int my_rank = get_my_pe();

if (src_rank != my_rank && dst_rank != my_rank) {
Expand All @@ -188,9 +187,12 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
Kokkos::Impl::BlockDataHandle<typename ViewTraits<DT, DP...>::value_type,
ViewTraits<DT, DP...>>;

// Construct view offsets
auto src_subview_ptr = src.data();
auto dst_subview_ptr = dst.data();
// Construct subview offsets
auto src_subview_ptr = Kokkos::Impl::get_view_adr(src);
auto dst_subview_ptr = Kokkos::Impl::get_view_adr(dst);

printf("LDC: %p, %p, %p %p\n", dst.data(), src.data(), dst_subview_ptr,
src_subview_ptr);

if (src_rank != my_rank) {
#ifdef KRS_ENABLE_MPISPACE
Expand Down
1 change: 1 addition & 0 deletions src/core/Kokkos_RemoteSpaces_Options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct RemoteSpaces_MemoryTraits;

/* Specialization of RemoteSpaces_MemoryTraits for MemoryTraits<T>: exposes
 * compile-time constants derived from the unsigned trait value T.
 */
template <unsigned T>
struct RemoteSpaces_MemoryTraits<MemoryTraits<T>> {
/* dim0_is_pe is true when T has at least one bit in common with Dim0IsPE.
 * The original note here marks it for removal as obsolete.
 * NOTE(review): confirm no remaining users before deleting it.
 */
enum : bool { dim0_is_pe = (unsigned(0) != (T & unsigned(Dim0IsPE))) };
/* state is the trait value T itself, stored as an int enumerator. */
enum : int { state = T };
};
Expand Down

0 comments on commit eb220d4

Please sign in to comment.