Enable neighbor build on the device for pair hybrid substyles #1430

Merged
merged 5 commits on Jun 17, 2019
8 changes: 0 additions & 8 deletions doc/src/Errors_messages.txt
@@ -2146,10 +2146,6 @@ Self-explanatory. :dd

This is a current restriction in LAMMPS. :dd

{Cannot use pair hybrid with GPU neighbor list builds} :dt

Neighbor list builds must be done on the CPU for this pair style. :dd

{Cannot use pair tail corrections with 2d simulations} :dt

The correction factors are only currently defined for 3d systems. :dd
@@ -5467,10 +5463,6 @@ Self-explanatory. :dd
For this pair style, you cannot run part of the force calculation on
the host. See the package command. :dd

{GPU split param must be positive for hybrid pair styles} :dt

See the package gpu command. :dd

{GPUs are requested but Kokkos has not been compiled for CUDA} :dt

Re-compile Kokkos with CUDA support to use GPUs. :dd
2 changes: 1 addition & 1 deletion doc/src/Speed_compare.txt
@@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
host and GPU). :ulb,l

The GPU package requires neighbor lists to be built on the CPU when using
exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
exclusion lists, or a triclinic simulation box. :l

The GPU package can be compiled for CUDA or OpenCL and thus supports
both Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically
10 changes: 4 additions & 6 deletions doc/src/package.txt
@@ -173,12 +173,10 @@ computation will be built. If {neigh} is {yes}, which is the default,
neighbor list building is performed on the GPU. If {neigh} is {no},
neighbor list building is performed on the CPU. GPU neighbor list
building currently cannot be used with a triclinic box. GPU neighbor
list calculation currently cannot be used with
"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not
compatible with commands that are not GPU-enabled. When a non-GPU
enabled command requires a neighbor list, it will also be built on the
CPU. In these cases, it will typically be more efficient to only use
CPU neighbor list builds.
lists are not compatible with commands that are not GPU-enabled. When
a non-GPU enabled command requires a neighbor list, it will also be
built on the CPU. In these cases, it will typically be more efficient
to only use CPU neighbor list builds.
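The paragraph above describes a subtle interaction: even with {neigh} {yes}, any non-GPU-enabled command that needs a neighbor list triggers an additional CPU build, and a triclinic box forces CPU builds outright. A toy decision helper (hypothetical names, not LAMMPS code) capturing that rule:

```cpp
#include <cassert>
#include <string>

// Where neighbor lists end up being built: "gpu", "cpu", or "both".
// With neigh yes, GPU-enabled styles build lists on the device, but any
// non-GPU command that requests a list forces an extra CPU build; with
// neigh no (or a triclinic box) everything stays on the CPU.
std::string nbor_build_location(bool neigh_yes, bool triclinic,
                                bool non_gpu_cmd_needs_list) {
  if (!neigh_yes || triclinic) return "cpu";
  return non_gpu_cmd_needs_list ? "both" : "gpu";
}
```

This is why the documentation notes that mixed configurations, where lists are built twice, are typically better served by CPU-only builds.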

The {newton} keyword sets the Newton flags for pairwise (not bonded)
interactions to {off} or {on}, the same as the "newton"_newton.html
9 changes: 6 additions & 3 deletions lib/gpu/lal_base_atomic.cpp
@@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

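The same refactoring pattern recurs in each of the base classes below: the single monolithic `device->init(...)` call is split into an atom-storage `init` followed by a separate `init_nbor` for neighbor-list setup, with each return code checked independently so neighbor configuration can differ per (sub)style. A minimal sketch of this two-phase initialization pattern (toy `Device` stand-in with illustrative error codes, not the library's API):

```cpp
#include <cassert>

// Toy stand-in for the GPU-library device: initialization is split into
// two phases so neighbor-list setup can be configured separately, which
// is what lets hybrid substyles share one device but request their own
// neighbor settings.
struct ToyDevice {
  bool device_ready = false;
  int init(int nlocal, int nall, int maxspecial) {
    if (nlocal < 0 || nall < nlocal) return -3;  // allocation/size error
    (void)maxspecial;
    device_ready = true;
    return 0;
  }
  int init_nbor(int nlocal, double cell_size, int threads_per_atom) {
    if (!device_ready) return -1;                // init() must come first
    if (cell_size <= 0.0 || threads_per_atom < 1) return -3;
    (void)nlocal;
    return 0;
  }
};

// Mirrors the call sequence in the lal_base_*.cpp files above:
// run each phase and propagate the first failure.
int init_atomic(ToyDevice &device, int nlocal, int nall) {
  int success = device.init(nlocal, nall, /*maxspecial=*/0);
  if (success != 0) return success;
  success = device.init_nbor(nlocal, /*cell_size=*/2.5,
                             /*threads_per_atom=*/4);
  if (success != 0) return success;
  return 0;
}
```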
9 changes: 6 additions & 3 deletions lib/gpu/lal_base_charge.cpp
@@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

9 changes: 6 additions & 3 deletions lib/gpu/lal_base_dipole.cpp
@@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

10 changes: 7 additions & 3 deletions lib/gpu/lal_base_dpd.cpp
@@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom,true);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);

if (success!=0)
return success;

9 changes: 6 additions & 3 deletions lib/gpu/lal_base_ellipsoid.cpp
@@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,

_threads_per_atom=device->threads_per_atom();

int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true,
1);
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;

ucl_device=device->gpu;
atom=&device->atom;

9 changes: 6 additions & 3 deletions lib/gpu/lal_base_three.cpp
@@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;

int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

52 changes: 36 additions & 16 deletions lib/gpu/lal_device.cpp
@@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel) {
const int nall, const int maxspecial,
const bool vel) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;

if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);

_init_count++;
return 0;
}
@@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
return 0;
}

template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom) {
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);

int gpu_nbor=0;
if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
if (gpu_nbor==1)
gpu_nbor=2;
#endif

if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);

return 0;
}
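The flag logic in the new `init_nbor` above is compact but easy to misread: the effective local-particle count is scaled down when a fractional CPU/GPU particle split is active, and the neighbor-build mode falls back from full-GPU builds (1) to hybrid builds (2) when the library is compiled without CUDPP. A standalone sketch of both decisions (illustrative names, not the library's API):

```cpp
#include <cassert>

enum GpuMode { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };

// Mirrors the _particle_split handling: a split fraction in (0,1) means
// only that fraction of the local particles is handled on the device.
int effective_nlocal(double particle_split, int nlocal) {
  if (particle_split < 1.0 && particle_split > 0.0)
    return static_cast<int>(particle_split * nlocal);
  return nlocal;
}

// Mirrors the gpu_nbor flag selection: 0 = CPU builds, 1 = full GPU
// builds, 2 = hybrid builds. The full-GPU path relies on a CUDPP sort,
// so without CUDPP mode 1 falls back to mode 2.
int select_gpu_nbor(GpuMode mode, bool have_cudpp) {
  int gpu_nbor = 0;
  if (mode == GPU_NEIGH) gpu_nbor = 1;
  else if (mode == GPU_HYB_NEIGH) gpu_nbor = 2;
  if (!have_cudpp && gpu_nbor == 1) gpu_nbor = 2;
  return gpu_nbor;
}
```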

template <class numtyp, class acctyp>
void DeviceT::set_single_precompute
(PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
@@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen," Device Time Info (average) for kspace: ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

60 changes: 40 additions & 20 deletions lib/gpu/lal_device.h
@@ -53,31 +53,23 @@ class Device {
const int t_per_atom, const double cell_size,
char *vendor_string, const int block_pair);

/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
/// Initialize the device for Atom storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom value to be used by the neighbor list only
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel=false);
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);

/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
@@ -91,6 +83,34 @@
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);

/// Initialize the neighbor list storage
/** \param nbor Neighbor object to set up for this pair style
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom);

/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
@@ -173,7 +193,7 @@
/// Return host memory usage in bytes
double host_memory_usage() const;

/// Return the number of procs sharing a device (size of device commincator)
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
@@ -260,12 +280,12 @@
/// Atom Data
Atom<numtyp,acctyp> atom;

// --------------------------- NBOR DATA ----------------------------
// --------------------------- NBOR SHARED KERNELS ----------------

/// Neighbor Data
/// Shared kernels for neighbor lists
NeighborShared _neighbor_shared;

// ------------------------ LONG RANGE DATA -------------------------
// ------------------------ LONG RANGE DATA -----------------------

// Long Range Data
int _long_range_precompute;
5 changes: 2 additions & 3 deletions src/GPU/README
@@ -1,9 +1,8 @@
This package implements GPU optimizations of various LAMMPS styles.

Section 5.3.1 on the manual gives details of what hardware and Cuda
Section 3.7 of the manual gives details of what hardware and Cuda
software is required on your system, and full details on how to build
and use this package. See the KOKKOS package, which also has
GPU-enabled styles.
and use this package. The KOKKOS package also has GPU-enabled styles.

This package uses an external library provided in lib/gpu which must
be compiled before making LAMMPS. See the lib/gpu/README file and the
11 changes: 0 additions & 11 deletions src/GPU/fix_gpu.cpp
@@ -219,17 +219,6 @@ void FixGPU::init()
error->all(FLERR,"GPU package does not (yet) work with "
"atom_style template");

// hybrid cannot be used with force/neigh option
@akohlmey These are the limitations that are removed from this PR.


if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");

if (_particle_split < 0)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"GPU split param must be positive "
"for hybrid pair styles");

// neighbor list builds on the GPU with triclinic box is not yet supported

if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&
8 changes: 0 additions & 8 deletions src/GPU/fix_gpu.h
@@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template

Self-explanatory.

E: Cannot use pair hybrid with GPU neighbor list builds

Neighbor list builds must be done on the CPU for this pair style.

E: GPU split param must be positive for hybrid pair styles

See the package gpu command.

E: Cannot use package gpu neigh yes with triclinic box

This is a current restriction in LAMMPS.