Merge pull request #3740 from stanmoore1/kk_sort

Add Kokkos support for atom sorting on device
lammps · May 1, 2023 · 41a0196 · 41a0196
2 parents 4a608dc + e679936
commit 41a0196
Show file tree

Hide file tree

Showing 49 changed files with 639 additions and 97 deletions.
diff --git a/doc/src/Speed_kokkos.rst b/doc/src/Speed_kokkos.rst
@@ -285,6 +285,16 @@ one or more nodes, each with two GPUs:
    settings. Experimenting with its options can provide a speed-up for
    specific calculations. For example:
 
+.. note::
+
+   The default binsize for :doc:`atom sorting <atom_modify>` on GPUs
+   is equal to the default CPU neighbor binsize (i.e. 2x smaller than the
+   default GPU neighbor binsize). When running simple pair-wise
+   potentials like Lennard Jones on GPUs, using a 2x larger binsize for
+   atom sorting (equal to the default GPU neighbor binsize) and a more
+   frequent sorting than default (e.g. sorting every 100 time steps
+   instead of 1000) may improve performance.
+
 .. code-block:: bash
 
    mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj      # Newton on, half neighbor list, set binsize = neighbor ghost cutoff

diff --git a/doc/src/atom_modify.rst b/doc/src/atom_modify.rst
@@ -153,6 +153,13 @@ cache locality will be undermined.
    order of atoms in a :doc:`dump <dump>` file will also typically change
    if sorting is enabled.
 
+.. note::
+
+   When running simple pair-wise potentials like Lennard Jones on GPUs
+   with the KOKKOS package, using a larger binsize (e.g. 2x larger than
+   default) and a more frequent reordering than default (e.g. every 100
+   time steps) may improve performance.
+
 Restrictions
 """"""""""""
 

diff --git a/doc/src/package.rst b/doc/src/package.rst
@@ -71,7 +71,7 @@ Syntax
            *no_affinity* values = none
        *kokkos* args = keyword value ...
          zero or more keyword/value pairs may be appended
-         keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *gpu/aware* or *pair/only*
+         keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *sort* or *gpu/aware* or *pair/only*
            *neigh* value = *full* or *half*
              full = full neighbor list
              half = half neighbor list built in thread-safe manner
@@ -102,6 +102,9 @@ Syntax
            *comm/pair/reverse* value = *no* or *device*
              *no* = perform communication pack/unpack in non-KOKKOS mode
              *device* = perform pack/unpack on device (e.g. on GPU)
+           *sort* value = *no* or *device*
+             *no* = perform atom sorting in non-KOKKOS mode
+             *device* = perform atom sorting on device (e.g. on GPU)
            *gpu/aware* = *off* or *on*
              *off* = do not use GPU-aware MPI
              *on* = use GPU-aware MPI (default)
@@ -554,6 +557,17 @@ pack/unpack communicated data. When running small systems on a GPU,
 performing the exchange pack/unpack on the host CPU can give speedup
 since it reduces the number of CUDA kernel launches.
 
+The *sort* keyword determines whether the host or device performs atom
+sorting, see the :doc:`atom_modify sort <atom_modify>` command.  The
+value options for the *sort* keyword are *no* or *device* similar to the
+*comm* keywords above. If a value of *host* is used it will be
+automatically be changed to *no* since the *sort* keyword doesn't
+support *host* mode. The value of *no* will also always be used when
+running on the CPU, i.e. setting the value to *device* will have no
+effect if the simulation is running on the CPU. Not all fix styles with
+extra atom data support *device* mode and in that case a warning will be
+given and atom sorting will run in *no* mode instead.
+
 The *gpu/aware* keyword chooses whether GPU-aware MPI will be used. When
 this keyword is set to *on*, buffers in GPU memory are passed directly
 through MPI send/receive calls. This reduces overhead of first copying
@@ -705,12 +719,12 @@ script or via the "-pk intel" :doc:`command-line switch <Run_options>`.
 
 For the KOKKOS package, the option defaults for GPUs are neigh = full,
 neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
-value, comm = device, neigh/transpose = off, gpu/aware = on. When
-LAMMPS can safely detect that GPU-aware MPI is not available, the
-default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the
-option defaults are neigh = half, neigh/qeq = half, newton = on,
-binsize = 0.0, and comm = no.  The option neigh/thread = on when there
-are 16K atoms or less on an MPI rank, otherwise it is "off". These
+value, comm = device, sort = device, neigh/transpose = off, gpu/aware =
+on. When LAMMPS can safely detect that GPU-aware MPI is not available,
+the default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the
+option defaults are neigh = half, neigh/qeq = half, newton = on, binsize
+= 0.0, comm = no, and sort = no.  The option neigh/thread = on when
+there are 16K atoms or less on an MPI rank, otherwise it is "off". These
 settings are made automatically by the required "-k on"
 :doc:`command-line switch <Run_options>`. You can change them by using
 the package kokkos command in your input script or via the :doc:`-pk

diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
@@ -22,6 +22,11 @@
 #include "kokkos.h"
 #include "memory_kokkos.h"
 #include "update.h"
+#include "kokkos_base.h"
+#include "modify.h"
+#include "fix.h"
+
+#include <Kokkos_Sort.hpp>
 
 using namespace LAMMPS_NS;
 
@@ -103,6 +108,15 @@ AtomKokkos::~AtomKokkos()
 
 /* ---------------------------------------------------------------------- */
 
+void AtomKokkos::init()
+{
+  Atom::init();
+
+  sort_classic = lmp->kokkos->sort_classic;
+}
+
+/* ---------------------------------------------------------------------- */
+
 void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask)
 {
   if (space == Device && lmp->kokkos->auto_sync) avecKK->modified(Host, mask);
@@ -140,8 +154,37 @@ void AtomKokkos::allocate_type_arrays()
 
 void AtomKokkos::sort()
 {
-  int i, m, n, ix, iy, iz, ibin, empty;
+  // check if all fixes with atom-based arrays support sort on device
+
+  if (!sort_classic) {
+    int flag = 1;
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      auto fix_iextra = modify->fix[atom->extra_grow[iextra]];
+      if (!fix_iextra->sort_device) {
+        flag = 0;
+        break;
+      }
+    }
+    if (!flag) {
+      if (comm->me == 0) {
+        error->warning(FLERR,"Fix with atom-based arrays not compatible with Kokkos sorting on device, "
+                           "switching to classic host sorting");
+      }
+      sort_classic = true;
+    }
+  }
+
+  if (sort_classic) {
+    sync(Host, ALL_MASK);
+    Atom::sort();
+    modified(Host, ALL_MASK);
+  } else sort_device();
+}
+
+/* ---------------------------------------------------------------------- */
 
+void AtomKokkos::sort_device()
+{
   // set next timestep for sorting to take place
 
   nextsort = (update->ntimestep / sortfreq) * sortfreq + sortfreq;
@@ -151,93 +194,36 @@ void AtomKokkos::sort()
   if (domain->box_change) setup_sort_bins();
   if (nbins == 1) return;
 
-  // reallocate per-atom vectors if needed
-
-  if (atom->nmax > maxnext) {
-    memory->destroy(next);
-    memory->destroy(permute);
-    maxnext = atom->nmax;
-    memory->create(next, maxnext, "atom:next");
-    memory->create(permute, maxnext, "atom:permute");
-  }
-
-  // ensure there is one extra atom location at end of arrays for swaps
-
-  if (nlocal == nmax) avec->grow(0);
-
   // for triclinic, atoms must be in box coords (not lamda) to match bbox
 
   if (domain->triclinic) domain->lamda2x(nlocal);
 
-  sync(Host, ALL_MASK);
-
-  // bin atoms in reverse order so linked list will be in forward order
-
-  for (i = 0; i < nbins; i++) binhead[i] = -1;
-
-  HAT::t_x_array_const h_x = k_x.view<LMPHostType>();
-  for (i = nlocal - 1; i >= 0; i--) {
-    ix = static_cast<int>((h_x(i, 0) - bboxlo[0]) * bininvx);
-    iy = static_cast<int>((h_x(i, 1) - bboxlo[1]) * bininvy);
-    iz = static_cast<int>((h_x(i, 2) - bboxlo[2]) * bininvz);
-    ix = MAX(ix, 0);
-    iy = MAX(iy, 0);
-    iz = MAX(iz, 0);
-    ix = MIN(ix, nbinx - 1);
-    iy = MIN(iy, nbiny - 1);
-    iz = MIN(iz, nbinz - 1);
-    ibin = iz * nbiny * nbinx + iy * nbinx + ix;
-    next[i] = binhead[ibin];
-    binhead[ibin] = i;
-  }
+  auto d_x = k_x.d_view;
+  sync(Device, X_MASK);
 
-  // permute = desired permutation of atoms
-  // permute[I] = J means Ith new atom will be Jth old atom
+  // sort
 
-  n = 0;
-  for (m = 0; m < nbins; m++) {
-    i = binhead[m];
-    while (i >= 0) {
-      permute[n++] = i;
-      i = next[i];
-    }
-  }
+  int max_bins[3];
+  max_bins[0] = nbinx;
+  max_bins[1] = nbiny;
+  max_bins[2] = nbinz;
 
-  // current = current permutation, just reuse next vector
-  // current[I] = J means Ith current atom is Jth old atom
-
-  int *current = next;
-  for (i = 0; i < nlocal; i++) current[i] = i;
-
-  // reorder local atom list, when done, current = permute
-  // perform "in place" using copy() to extra atom location at end of list
-  // inner while loop processes one cycle of the permutation
-  // copy before inner-loop moves an atom to end of atom list
-  // copy after inner-loop moves atom at end of list back into list
-  // empty = location in atom list that is currently empty
-
-  for (i = 0; i < nlocal; i++) {
-    if (current[i] == permute[i]) continue;
-    avec->copy(i, nlocal, 0);
-    empty = i;
-    while (permute[empty] != i) {
-      avec->copy(permute[empty], empty, 0);
-      empty = current[empty] = permute[empty];
-    }
-    avec->copy(nlocal, empty, 0);
-    current[empty] = permute[empty];
-  }
+  using KeyViewType = DAT::t_x_array;
+  using BinOp = BinOp3DLAMMPS<KeyViewType>;
+  BinOp binner(max_bins, bboxlo, bboxhi);
+  Kokkos::BinSort<KeyViewType, BinOp> Sorter(d_x, 0, nlocal, binner, false);
+  Sorter.create_permute_vector(LMPDeviceType());
 
-  // sanity check that current = permute
+  avecKK->sort_kokkos(Sorter);
 
-  //int flag = 0;
-  //for (i = 0; i < nlocal; i++)
-  //  if (current[i] != permute[i]) flag = 1;
-  //int flagall;
-  //MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
-  //if (flagall) errorX->all(FLERR,"Atom sort did not operate correctly");
+  if (atom->nextra_grow) {
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      auto fix_iextra = modify->fix[atom->extra_grow[iextra]];
+      KokkosBase *kkbase = dynamic_cast<KokkosBase*>(fix_iextra);
 
-  modified(Host, ALL_MASK);
+      kkbase->sort_kokkos(Sorter);
+    }
+  }
 
  //  convert back to lamda coords
 
@@ -250,7 +236,6 @@ void AtomKokkos::sort()
 
 void AtomKokkos::grow(unsigned int mask)
 {
-
   if (mask & SPECIAL_MASK) {
     memoryKK->destroy_kokkos(k_special, special);
     sync(Device, mask);

diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
@@ -22,6 +22,8 @@ namespace LAMMPS_NS {
 
 class AtomKokkos : public Atom {
  public:
+  bool sort_classic;
+
   DAT::tdual_tagint_1d k_tag;
   DAT::tdual_int_1d k_type, k_mask;
   DAT::tdual_imageint_1d k_image;
@@ -108,6 +110,7 @@ class AtomKokkos : public Atom {
     return local;
   }
 
+  void init() override;
   void allocate_type_arrays() override;
   void sync(const ExecutionSpace space, unsigned int mask);
   void modified(const ExecutionSpace space, unsigned int mask);
@@ -119,6 +122,7 @@ class AtomKokkos : public Atom {
   virtual void deallocate_topology();
   void sync_modify(ExecutionSpace, unsigned int, unsigned int) override;
  private:
+  void sort_device();
   class AtomVec *new_avec(const std::string &, int, int &) override;
 };
 

diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -155,6 +155,35 @@ void AtomVecAngleKokkos::grow_pointers()
   h_angle_atom3 = atomKK->k_angle_atom3.h_view;
 }
 
+/* ----------------------------------------------------------------------
+   sort atom arrays on device
+------------------------------------------------------------------------- */
+
+void AtomVecAngleKokkos::sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter)
+{
+  atomKK->sync(Device, ALL_MASK & ~F_MASK);
+
+  Sorter.sort(LMPDeviceType(), d_tag);
+  Sorter.sort(LMPDeviceType(), d_type);
+  Sorter.sort(LMPDeviceType(), d_mask);
+  Sorter.sort(LMPDeviceType(), d_image);
+  Sorter.sort(LMPDeviceType(), d_x);
+  Sorter.sort(LMPDeviceType(), d_v);
+  Sorter.sort(LMPDeviceType(), d_molecule);
+  Sorter.sort(LMPDeviceType(), d_num_bond);
+  Sorter.sort(LMPDeviceType(), d_bond_type);
+  Sorter.sort(LMPDeviceType(), d_bond_atom);
+  Sorter.sort(LMPDeviceType(), d_nspecial);
+  Sorter.sort(LMPDeviceType(), d_special);
+  Sorter.sort(LMPDeviceType(), d_num_angle);
+  Sorter.sort(LMPDeviceType(), d_angle_type);
+  Sorter.sort(LMPDeviceType(), d_angle_atom1);
+  Sorter.sort(LMPDeviceType(), d_angle_atom2);
+  Sorter.sort(LMPDeviceType(), d_angle_atom3);
+
+  atomKK->modified(Device, ALL_MASK & ~F_MASK);
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType,int PBC_FLAG,int TRICLINIC>

diff --git a/src/KOKKOS/atom_vec_angle_kokkos.h b/src/KOKKOS/atom_vec_angle_kokkos.h
@@ -34,6 +34,7 @@ class AtomVecAngleKokkos : public AtomVecKokkos, public AtomVecAngle {
 
   void grow(int) override;
   void grow_pointers() override;
+  void sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter) override;
   int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
                        const int & iswap,
                        const DAT::tdual_xfloat_2d &buf,

diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -100,6 +100,24 @@ void AtomVecAtomicKokkos::grow_pointers()
   h_f = atomKK->k_f.h_view;
 }
 
+/* ----------------------------------------------------------------------
+   sort atom arrays on device
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter)
+{
+  atomKK->sync(Device, ALL_MASK & ~F_MASK);
+
+  Sorter.sort(LMPDeviceType(), d_tag);
+  Sorter.sort(LMPDeviceType(), d_type);
+  Sorter.sort(LMPDeviceType(), d_mask);
+  Sorter.sort(LMPDeviceType(), d_image);
+  Sorter.sort(LMPDeviceType(), d_x);
+  Sorter.sort(LMPDeviceType(), d_v);
+
+  atomKK->modified(Device, ALL_MASK & ~F_MASK);
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType,int PBC_FLAG>

diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h
@@ -35,6 +35,7 @@ class AtomVecAtomicKokkos : public AtomVecKokkos, public AtomVecAtomic {
 
   void grow(int) override;
   void grow_pointers() override;
+  void sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter) override;
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space) override;