Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow Kokkos remap to turn off CUDA-aware MPI #2053

Merged
merged 1 commit into from May 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 12 additions & 7 deletions src/KOKKOS/fft3d_kokkos.cpp
Expand Up @@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective) :
int scaled, int permute, int *nbuf, int usecollective,
int usecuda_aware) :
Pointers(lmp)
{
int nthreads = lmp->kokkos->nthreads;
Expand Down Expand Up @@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
scaled,permute,nbuf,usecollective,nthreads);
scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
}

Expand Down Expand Up @@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
2 = permute twice = slow->fast, fast->mid, mid->slow
nbuf returns size of internal storage buffers used by FFT
usecollective use collective MPI operations for remapping data
usecuda_aware use CUDA-Aware MPI or not
------------------------------------------------------------------------- */

template<class DeviceType>
Expand All @@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective,
int nthreads)
int nthreads, int usecuda_aware)
{
struct fft_plan_3d_kokkos<DeviceType> *plan;
int me,nprocs;
Expand Down Expand Up @@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
plan->pre_plan =
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
first_ilo,first_ihi,first_jlo,first_jhi,
first_klo,first_khi,2,0,0,FFT_PRECISION,0);
first_klo,first_khi,2,0,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->pre_plan == NULL) return NULL;
}

Expand All @@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
first_klo,first_khi,
second_ilo,second_ihi,second_jlo,second_jhi,
second_klo,second_khi,2,1,0,FFT_PRECISION,
usecollective);
usecollective,usecuda_aware);
if (plan->mid1_plan == NULL) return NULL;

// 1d FFTs along mid axis
Expand Down Expand Up @@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
second_jlo,second_jhi,second_klo,second_khi,
second_ilo,second_ihi,
third_jlo,third_jhi,third_klo,third_khi,
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
third_ilo,third_ihi,2,1,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->mid2_plan == NULL) return NULL;

// 1d FFTs along slow axis
Expand All @@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
third_klo,third_khi,third_ilo,third_ihi,
third_jlo,third_jhi,
out_klo,out_khi,out_ilo,out_ihi,
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->post_plan == NULL) return NULL;
}

Expand Down
4 changes: 2 additions & 2 deletions src/KOKKOS/fft3d_kokkos.h
Expand Up @@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {

FFT3dKokkos(class LAMMPS *, MPI_Comm,
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
int,int,int *,int);
int,int,int *,int,int);
~FFT3dKokkos();
void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
Expand All @@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
int, int, int, int, int,
int, int, int, int, int, int, int,
int, int, int *, int, int);
int, int, int *, int, int, int);

void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);

Expand Down
8 changes: 5 additions & 3 deletions src/KOKKOS/pppm_kokkos.cpp
Expand Up @@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
// 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition

int collective_flag = 0; // not yet supported in Kokkos version
int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
int tmp;

fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp,collective_flag);
0,0,&tmp,collective_flag,cuda_aware_flag);

fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp,collective_flag);
0,0,&tmp,collective_flag,cuda_aware_flag);
remap = new RemapKokkos<DeviceType>(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION,collective_flag);
1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);

// create ghost grid object for rho and electric field communication

Expand Down
34 changes: 29 additions & 5 deletions src/KOKKOS/remap_kokkos.cpp
Expand Up @@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory,
int precision, int usecollective) : Pointers(lmp)
int precision, int usecollective,
int usecuda_aware) : Pointers(lmp)
{
plan = remap_3d_create_plan_kokkos(comm,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
nqty,permute,memory,precision,usecollective);
nqty,permute,memory,precision,usecollective,
usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
}

Expand Down Expand Up @@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d

// post all recvs into scratch space

FFT_SCALAR* v_scratch = d_scratch.data();
if (!plan->usecuda_aware) {
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
v_scratch = plan->h_scratch.data();
}

for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv];
FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]);
}

FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
if (!plan->usecuda_aware) {
plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
v_sendbuf = plan->h_sendbuf.data();
}

// send all messages to other procs

for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]);
MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,

if (!plan->usecuda_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);

MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm);
}

Expand Down Expand Up @@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];

if (!plan->usecuda_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);

plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
Expand Down Expand Up @@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
1 = single precision (4 bytes per datum)
2 = double precision (8 bytes per datum)
usecollective whether to use collective MPI or point-to-point
usecuda_aware whether to use CUDA-Aware MPI or not
------------------------------------------------------------------------- */

template<class DeviceType>
Expand All @@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory, int precision, int usecollective)
int nqty, int permute, int memory, int precision,
int usecollective, int usecuda_aware)
{

struct remap_plan_3d_kokkos<DeviceType> *plan;
Expand All @@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
plan = new struct remap_plan_3d_kokkos<DeviceType>;
if (plan == NULL) return NULL;
plan->usecollective = usecollective;
plan->usecuda_aware = usecuda_aware;

// store parameters in local data structs

Expand Down
7 changes: 5 additions & 2 deletions src/KOKKOS/remap_kokkos.h
Expand Up @@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
typedef DeviceType device_type;
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
FFT_HAT::t_FFT_SCALAR_1d h_sendbuf; // host buffer for MPI sends
typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
FFT_HAT::t_FFT_SCALAR_1d h_scratch; // host scratch buffer for MPI recvs
void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which pack function to use
void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
Expand All @@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
int usecollective; // use collective or point-to-point MPI
int commringlen; // length of commringlist
int *commringlist; // ranks on communication ring of this plan
int usecuda_aware; // use CUDA-Aware MPI or not
};

template<class DeviceType>
Expand All @@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
typedef FFTArrayTypes<DeviceType> FFT_AT;
RemapKokkos(class LAMMPS *);
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
int,int,int,int,int,int,int,int,int,int,int);
int,int,int,int,int,int,int,int,int,int,int,int);
~RemapKokkos();
void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);

Expand All @@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int);
int, int, int, int, int, int);
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
};

Expand Down