Skip to content

Commit

Permalink
VPICIO bugfix (#196)
Browse files Browse the repository at this point in the history
* Fix VPICIO bug
* Add more checks and error out when no server is selected
* Committing clang-format changes
* Add VPICIO and BDCATS to MPI test

---------

Co-authored-by: github-actions <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Jean Luca Bez <jlbez@lbl.gov>
  • Loading branch information
3 people committed May 24, 2024
1 parent 3af431f commit 3a52620
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 12 deletions.
34 changes: 23 additions & 11 deletions src/api/pdc_region/pdc_region_transfer.c
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ register_metadata(pdc_transfer_request_start_all_pkg **transfer_request_input, i
* sorted in terms of data_server_id. We pack data from user buffer to contiguous buffers. Static partitioning
* requires having at most n_data_servers number of contiguous regions.
*/
static int
static perr_t
prepare_start_all_requests(pdcid_t *transfer_request_id, int size,
pdc_transfer_request_start_all_pkg ***write_transfer_request_ptr,
pdc_transfer_request_start_all_pkg ***read_transfer_request_ptr,
Expand Down Expand Up @@ -934,7 +934,7 @@ prepare_start_all_requests(pdcid_t *transfer_request_id, int size,
printf("PDC Client PDCregion_transfer_start_all attempt to start existing transfer request @ "
"line %d\n",
__LINE__);
return 1;
return FAIL;
}
if (transfer_request->consistency == PDC_CONSISTENCY_POSIX) {
posix_transfer_request_id_ptr[0][posix_size_ptr[0]] = transfer_request_id[i];
Expand All @@ -959,6 +959,11 @@ prepare_start_all_requests(pdcid_t *transfer_request_id, int size,
&(transfer_request->n_obj_servers), &(transfer_request->obj_servers),
&(transfer_request->sub_offsets), &(transfer_request->output_offsets),
&(transfer_request->output_sizes), &(transfer_request->output_buf));
if (transfer_request->n_obj_servers == 0) {
printf("PDC_Client %d, %s: error with static region partition, no server is selected!\n",
pdc_client_mpi_rank_g, __func__);
return FAIL;
}
for (j = 0; j < transfer_request->n_obj_servers; ++j) {
request_pkgs =
(pdc_transfer_request_start_all_pkg *)malloc(sizeof(pdc_transfer_request_start_all_pkg));
Expand Down Expand Up @@ -1082,7 +1087,7 @@ prepare_start_all_requests(pdcid_t *transfer_request_id, int size,
else {
*read_size_ptr = 0;
}
return 0;
return SUCCEED;
}

static int
Expand Down Expand Up @@ -1321,8 +1326,9 @@ PDCregion_transfer_start_all(pdcid_t *transfer_request_id, int size)
FUNC_ENTER(NULL);
// Split write and read requests. Handle them separately.
// printf("PDCregion_transfer_start_all: checkpoint %d\n", __LINE__);
prepare_start_all_requests(transfer_request_id, size, &write_transfer_requests, &read_transfer_requests,
&write_size, &read_size, &posix_transfer_request_id, &posix_size);
ret_value = prepare_start_all_requests(transfer_request_id, size, &write_transfer_requests,
&read_transfer_requests, &write_size, &read_size,
&posix_transfer_request_id, &posix_size);
/*
printf("PDCregion_transfer_start_all: checkpoint %d, write_size = %d, read_size = %d\n", __LINE__,
write_size, read_size);
Expand Down Expand Up @@ -1454,12 +1460,18 @@ PDCregion_transfer_start(pdcid_t transfer_request_id)

if (transfer_request->region_partition == PDC_REGION_STATIC) {
// Identify which part of the region is going to which data server.
static_region_partition(transfer_request->new_buf, transfer_request->remote_region_ndim, unit,
transfer_request->access_type, transfer_request->obj_dims,
transfer_request->remote_region_offset, transfer_request->remote_region_size,
1, &(transfer_request->n_obj_servers), &(transfer_request->obj_servers),
&(transfer_request->sub_offsets), &(transfer_request->output_offsets),
&(transfer_request->output_sizes), &(transfer_request->output_buf));
ret_value = static_region_partition(
transfer_request->new_buf, transfer_request->remote_region_ndim, unit,
transfer_request->access_type, transfer_request->obj_dims, transfer_request->remote_region_offset,
transfer_request->remote_region_size, 1, &(transfer_request->n_obj_servers),
&(transfer_request->obj_servers), &(transfer_request->sub_offsets),
&(transfer_request->output_offsets), &(transfer_request->output_sizes),
&(transfer_request->output_buf));
if (transfer_request->n_obj_servers == 0) {
printf("PDC_Client %d, %s: error with static region partition, no server is selected!\n",
pdc_client_mpi_rank_g, __func__);
return FAIL;
}
/*
printf("n_obj_servers = %d\n", transfer_request->n_obj_servers);
for ( i = 0; i < transfer_request->n_obj_servers; ++i ) {
Expand Down
3 changes: 3 additions & 0 deletions src/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ set(SCRIPTS
run_test.sh
mpi_test.sh
run_multiple_test.sh
run_multiple_mpi_test.sh
run_checkpoint_restart_test.sh
)

Expand Down Expand Up @@ -487,6 +488,7 @@ if(BUILD_MPI_TESTING)
add_test(NAME obj_info_mpi WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} COMMAND mpi_test.sh ./obj_info ${MPI_RUN_CMD} 4 6 )
add_test(NAME obj_put_data_mpi WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} COMMAND mpi_test.sh ./obj_put_data ${MPI_RUN_CMD} 4 6 )
add_test(NAME obj_get_data_mpi WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} COMMAND mpi_test.sh ./obj_get_data ${MPI_RUN_CMD} 4 6 )
add_test(NAME vpicio_bdcats_mpi WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} COMMAND run_multiple_mpi_test.sh ${MPI_RUN_CMD} 4 6 ./vpicio ./bdcats)

set_tests_properties(read_obj_shared_int PROPERTIES LABELS "parallel;parallel_obj" )
set_tests_properties(read_obj_shared_float PROPERTIES LABELS "parallel;parallel_obj" )
Expand Down Expand Up @@ -514,6 +516,7 @@ if(BUILD_MPI_TESTING)
# set_tests_properties(region_transfer_2D_skewed_mpi PROPERTIES LABELS "parallel;parallel_region_transfer" )
# set_tests_properties(region_transfer_3D_skewed_mpi PROPERTIES LABELS "parallel;parallel_region_transfer" )
set_tests_properties(region_transfer_write_read_mpi PROPERTIES LABELS "parallel;parallel_region_transfer" )
set_tests_properties(vpicio_bdcats_mpi PROPERTIES LABELS "parallel;parallel_region_transfer" )
set_tests_properties(region_transfer_all_mpi PROPERTIES LABELS "parallel;parallel_region_transfer_all" )
set_tests_properties(region_transfer_all_2D_mpi PROPERTIES LABELS "parallel;parallel_region_transfer_all" )
set_tests_properties(region_transfer_all_3D_mpi PROPERTIES LABELS "parallel;parallel_region_transfer_all" )
Expand Down
52 changes: 52 additions & 0 deletions src/tests/run_multiple_mpi_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Runs one or more MPI test executables, in the order given, against a single
# PDC server instance started through the same MPI launcher.
#
# Usage:
#   run_multiple_mpi_test.sh <mpi_cmd> <n_servers> <n_clients> <test_exe> [<test_exe> ...]
#
# Exit status:
#   0    every test exited with status 0
#   N    exit status of the first test that failed
#   254  a test executable was missing or not executable
#   255  too few arguments
#
# We assume that, if the library build has enabled MPI, LD_LIBRARY_PATH is
# defined and points to the MPI libraries used by the linker (e.g. -L<path> -lmpi)

extra_cmd=""

if [[ "$SUPERCOMPUTER" == "perlmutter" ]]; then
    extra_cmd="--mem=25600 --cpu_bind=cores --overlap"
fi

# The launcher, both process counts and at least one test are all required.
if [ $# -lt 4 ]; then echo "missing test argument" && exit -1 ; fi

mpi_cmd="$1"
n_servers="$2"
n_client="$3"
# Every remaining argument is a test executable; keep them as an array so each
# one stays a single word.
all_test=("${@:4}")
echo "${all_test[*]}"

rm -rf pdc_tmp pdc_data

# START the server (in the background).
# $mpi_cmd and $extra_cmd are left unquoted on purpose: they may carry several
# space-separated launcher options.
echo "$mpi_cmd -n $n_servers $extra_cmd ./pdc_server.exe &"
$mpi_cmd -n $n_servers $extra_cmd ./pdc_server.exe &

# WAIT a bit, for 1 second
sleep 1

# RUN the actual test(s), remembering the first failure so that a later
# successful test cannot hide it.
ret=0
for test_exe in "${all_test[@]}"
do
    if [ ! -x "$test_exe" ]; then
        echo "test: $test_exe not found or not and executable"
        # Do not exit here: the server is already running and must still be
        # shut down below. 254 is what the shell reports for `exit -2`.
        ret=254
        break
    fi
    echo "testing: $test_exe"

    # Arguments are chosen per test; reset them so they do not leak from one
    # test to the next.
    test_args=""
    case "$test_exe" in
        *vpicio|*bdcats) test_args="1024" ;;
    esac

    echo "$mpi_cmd -n $n_client $extra_cmd $test_exe $test_args"
    $mpi_cmd -n $n_client $extra_cmd "$test_exe" $test_args
    test_ret="$?"
    if [ "$ret" -eq 0 ]; then
        ret="$test_ret"
    fi
done

# Shut down the SERVER before exiting. $ret was captured inside the loop;
# reading $? here would only give the status of the loop's last command.
echo "Close server"
echo "$mpi_cmd -n 1 $extra_cmd ./close_server"
$mpi_cmd -n 1 $extra_cmd ./close_server
exit $ret
2 changes: 1 addition & 1 deletion src/tests/vpicio.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ main(int argc, char **argv)
printf("Writing %" PRIu64 " number of particles with %d clients.\n", numparticles, size);
}

dims[0] = numparticles;
dims[0] = numparticles * size;

x = (float *)malloc(numparticles * sizeof(float));
y = (float *)malloc(numparticles * sizeof(float));
Expand Down

0 comments on commit 3a52620

Please sign in to comment.