lammps · akohlmey · Jun 11, 2019 · Jun 4, 2019 · Jun 6, 2019 · Jun 7, 2019
diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -12,7 +12,8 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Ray Shan (SNL), Stan Moore (SNL)
+   Contributing authors: Ray Shan (SNL), Stan Moore (SNL),
+                          Kamesh Arumugam (NVIDIA)
 ------------------------------------------------------------------------- */
 
 #include <cmath>
@@ -68,6 +69,8 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
   memory->destroy(s_hist);
   memory->destroy(t_hist);
   grow_arrays(atom->nmax);
+
+  d_mfill_offset = typename AT::t_int_scalar("qeq/kk:mfill_offset");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -215,17 +218,46 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
   copymode = 1;
 
   // allocate
+
   allocate_array();
 
   // get max number of neighbor
+
   if (!allocated_flag || update->ntimestep == neighbor->lastcall)
     allocate_matrix();
 
   // compute_H
-  FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);
-  Kokkos::parallel_scan(inum,computeH_functor);
+
+  if (lmp->kokkos->ngpus == 0) { // CPU
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    }
+  } else { // GPU, use teams
+    Kokkos::deep_copy(d_mfill_offset,0);
+
+    int vector_length = 32;
+    int atoms_per_team = 4;
+    int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
+
+    Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
+                                          vector_length);
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    }
+  }
 
   // init_matvec
+
   k_s_hist.template sync<DeviceType>();
   k_t_hist.template sync<DeviceType>();
   FixQEqReaxKokkosMatVecFunctor<DeviceType> matvec_functor(this);
@@ -255,12 +287,15 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
     ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);
 
   // 1st cg solve over b_s, s
+
   cg_solve1();
 
   // 2nd cg solve over b_t, t
+
   cg_solve2();
 
   // calculate_Q();
+
   calculate_q();
   k_s_hist.template modify<DeviceType>();
   k_t_hist.template modify<DeviceType>();
@@ -271,6 +306,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
     allocated_flag = 1;
 
   // free duplicated memory
+
   if (need_dup)
     dup_o = decltype(dup_o)();
 }
@@ -373,6 +409,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
+template <int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
 {
@@ -399,7 +436,7 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
       const X_FLOAT dely = x(j,1) - ytmp;
       const X_FLOAT delz = x(j,2) - ztmp;
 
-      if (neighflag != FULL) {
+      if (NEIGHFLAG != FULL) {
         // skip half of the interactions
         const tagint jtag = tag(j);
         if (j >= nlocal) {
@@ -433,6 +470,217 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
 
 /* ---------------------------------------------------------------------- */
 
+// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j
+// d_val     - contains the non-zero entries of sparse matrix H
+// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
+// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val
+// d_jlist   - contains the column index corresponding to each entry in d_val
+
+template <class DeviceType>
+template <int NEIGHFLAG>
+void FixQEqReaxKokkos<DeviceType>::compute_h_team(
+    const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
+    int atoms_per_team, int vector_length) const {
+
+  // scratch space setup
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_ilist(team.team_shmem(), atoms_per_team);
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_numnbrs(team.team_shmem(), atoms_per_team);
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_firstnbr(team.team_shmem(), atoms_per_team);
+
+  Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_jtype(team.team_shmem(), atoms_per_team, vector_length);
+  Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_jlist(team.team_shmem(), atoms_per_team, vector_length);
+  Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_r(team.team_shmem(), atoms_per_team, vector_length);
+
+  // team of threads work on atoms with index in [firstatom, lastatom)
+  int firstatom = team.league_rank() * atoms_per_team;
+  int lastatom =
+      (firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;
+
+  // kokkos-thread-0 is used to load info from global memory into scratch space
+  if (team.team_rank() == 0) {
+
+    // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
+    // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team]
+    // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
+    // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])
+
+    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
+                          [&](const int &idx, int &totalnbrs, bool final) {
+                            int ii = firstatom + idx;
+
+                            if (ii < inum) {
+                              const int i = d_ilist[ii];
+                              int jnum = d_numneigh[i];
+
+                              if (final) {
+                                s_ilist[idx] = i;
+                                s_numnbrs[idx] = jnum;
+                                s_firstnbr[idx] = totalnbrs;
+                              }
+                              totalnbrs += jnum;
+                            } else {
+                              s_numnbrs[idx] = 0;
+                            }
+                          });
+  }
+
+  // barrier ensures that the data moved to scratch space is visible to all the
+  // threads of the corresponding team
+  team.team_barrier();
+
+  // calculate the global memory offset from where the H matrix values to be
+  // calculated by the current team will be stored in d_val
+  int team_firstnbr_idx = 0;
+  Kokkos::single(Kokkos::PerTeam(team),
+                 [=](int &val) {
+                   int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
+                                   s_numnbrs[lastatom - firstatom - 1];
+                   val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
+                 },
+                 team_firstnbr_idx);
+
+  // map the H matrix computation of each atom to kokkos-thread (one atom per
+  // kokkos-thread) neighbor computation for each atom is assigned to vector
+  // lanes of the corresponding thread
+  Kokkos::parallel_for(
+      Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
+        int ii = firstatom + idx;
+
+        if (ii < inum) {
+          const int i = s_ilist[idx];
+
+          if (mask[i] & groupbit) {
+            const X_FLOAT xtmp = x(i, 0);
+            const X_FLOAT ytmp = x(i, 1);
+            const X_FLOAT ztmp = x(i, 2);
+            const int itype = type(i);
+            const tagint itag = tag(i);
+            const int jnum = s_numnbrs[idx];
+
+            // calculate the write-offset for atom-i's first neighbor
+            int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
+            Kokkos::single(Kokkos::PerThread(team),
+                           [&]() { d_firstnbr[i] = atomi_firstnbr_idx; });
+
+            // current # of neighbor atoms with non-zero electrostatic
+            // interaction coefficients with atom-i which represents the # of
+            // non-zero elements in row-i of H matrix
+            int atomi_nbrs_inH = 0;
+
+            // calculate H matrix values corresponding to atom-i where neighbors
+            // are processed in batches and the batch size is vector_length
+            for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {
+
+              int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
+
+              // count the # of neighbor atoms with non-zero electrostatic
+              // interaction coefficients with atom-i in the current batch
+              int atomi_nbrs_curbatch = 0;
+
+              // compute rsq, jtype, j and store in scratch space which is
+              // reused later
+              Kokkos::parallel_reduce(
+                  Kokkos::ThreadVectorRange(team, vector_length),
+                  [&](const int &idx, int &m_fill) {
+                    const int jj = jj_start + idx;
+
+                    // initialize: -1 represents no interaction with atom-j
+                    // where j = d_neighbors(i,jj)
+                    s_jlist(team.team_rank(), idx) = -1;
+
+                    if (jj < jnum) {
+                      int j = d_neighbors(i, jj);
+                      j &= NEIGHMASK;
+                      const int jtype = type(j);
+
+                      const X_FLOAT delx = x(j, 0) - xtmp;
+                      const X_FLOAT dely = x(j, 1) - ytmp;
+                      const X_FLOAT delz = x(j, 2) - ztmp;
+
+                      // valid nbr interaction
+                      bool valid = true;
+                      if (NEIGHFLAG != FULL) {
+                        // skip half of the interactions
+                        const tagint jtag = tag(j);
+                        if (j >= nlocal) {
+                          if (itag > jtag) {
+                            if ((itag + jtag) % 2 == 0)
+                              valid = false;
+                          } else if (itag < jtag) {
+                            if ((itag + jtag) % 2 == 1)
+                              valid = false;
+                          } else {
+                            if (x(j, 2) < ztmp)
+                              valid = false;
+                            if (x(j, 2) == ztmp && x(j, 1) < ytmp)
+                              valid = false;
+                            if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
+                                x(j, 0) < xtmp)
+                              valid = false;
+                          }
+                        }
+                      }
+
+                      const F_FLOAT rsq =
+                          delx * delx + dely * dely + delz * delz;
+                      if (rsq > cutsq)
+                        valid = false;
+
+                      if (valid) {
+                        s_jlist(team.team_rank(), idx) = j;
+                        s_jtype(team.team_rank(), idx) = jtype;
+                        s_r(team.team_rank(), idx) = sqrt(rsq);
+                        m_fill++;
+                      }
+                    }
+                  },
+                  atomi_nbrs_curbatch);
+
+              // write non-zero entries of H to global memory
+              Kokkos::parallel_scan(
+                  Kokkos::ThreadVectorRange(team, vector_length),
+                  [&](const int &idx, int &m_fill, bool final) {
+                    int j = s_jlist(team.team_rank(), idx);
+                    if (final) {
+                      if (j != -1) {
+                        const int jtype = s_jtype(team.team_rank(), idx);
+                        const F_FLOAT r = s_r(team.team_rank(), idx);
+                        const F_FLOAT shldij = d_shield(itype, jtype);
+
+                        d_jlist[atomi_nbr_writeIdx + m_fill] = j;
+                        d_val[atomi_nbr_writeIdx + m_fill] =
+                            calculate_H_k(r, shldij);
+                      }
+                    }
+
+                    if (j != -1) {
+                      m_fill++;
+                    }
+                  });
+              atomi_nbrs_inH += atomi_nbrs_curbatch;
+            }
+
+            Kokkos::single(Kokkos::PerThread(team),
+                           [&]() { d_numnbrs[i] = atomi_nbrs_inH; });
+          }
+        }
+      });
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 double FixQEqReaxKokkos<DeviceType>::calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const