@@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
206
206
207
207
EV_FLOAT ev;
208
208
209
- int idxu_max = snaKK.idxu_max ;
210
-
211
209
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
212
210
213
211
EV_FLOAT ev_tmp;
@@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
246
244
Kokkos::parallel_for (" ComputeUiCPU" ,policy_ui_cpu,*this );
247
245
}
248
246
247
+ {
248
+ // Expand ulisttot -> ulisttot_full
249
+ // Zero out ylist
250
+ typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int >, Kokkos::Rank<2 , Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu ({0 ,0 },{twojmax+1 ,chunk_size});
251
+ Kokkos::parallel_for (" TransformUiCPU" ,policy_transform_ui_cpu,*this );
252
+ }
253
+
249
254
// Compute bispectrum
250
255
if (quadraticflag || eflag) {
251
256
// ComputeZi
@@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
261
266
Kokkos::parallel_for (" ComputeBiCPU" ,policy_bi_cpu,*this );
262
267
}
263
268
264
- // ZeroYi, ComputeYi
269
+ // ComputeYi
265
270
{
266
- int vector_length = vector_length_default;
267
- int team_size = team_size_default;
268
-
269
271
// Compute beta = dE_i/dB_i for all i in list
270
272
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta (0 ,chunk_size);
271
273
Kokkos::parallel_for (" ComputeBetaCPU" ,policy_beta,*this );
272
274
273
- // ZeroYi
274
- check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
275
- typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi (((idxu_max+team_size-1 )/team_size)*chunk_size,team_size,vector_length);
276
- Kokkos::parallel_for (" ZeroYiCPU" ,policy_zero_yi,*this );
277
-
278
275
// ComputeYi
279
276
int idxz_max = snaKK.idxz_max ;
280
277
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu (0 ,chunk_size*idxz_max);
@@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
294
291
295
292
Kokkos::parallel_for (" ComputeDeidrjCPU" ,policy_deidrj_cpu,*this );
296
293
}
294
+
297
295
} else { // GPU
298
296
299
297
#ifdef LMP_KOKKOS_GPU
@@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
313
311
int team_size = 4 ; // need to cap b/c of shared memory reqs
314
312
check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
315
313
316
- // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
314
+ // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
317
315
// 2 is for double buffer
318
316
319
- const int tile_size = (twojmax+1 )*(twojmax+1 );
317
+ const int tile_size = (twojmax+1 )*(twojmax/ 2 +1 );
320
318
typedef Kokkos::View< SNAcomplex*,
321
319
Kokkos::DefaultExecutionSpace::scratch_memory_space,
322
320
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
@@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
329
327
Kokkos::parallel_for (" ComputeUi" ,policy_ui,*this );
330
328
331
329
// Transform data layout of ulisttot to AoSoA, zero ylist
332
- typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int >, Kokkos::Rank<3 , Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui ({0 ,0 ,0 },{32 ,idxu_max ,(chunk_size + 32 - 1 ) / 32 },{32 ,4 ,1 });
330
+ typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int >, Kokkos::Rank<3 , Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui ({0 ,0 ,0 },{32 ,twojmax+ 1 ,(chunk_size + 32 - 1 ) / 32 },{32 ,4 ,1 });
333
331
Kokkos::parallel_for (" TransformUi" ,policy_transform_ui,*this );
334
332
335
333
}
@@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
367
365
Kokkos::parallel_for (" ComputeYi" ,policy_compute_yi,*this );
368
366
369
367
// Transform data layout of ylist out of AoSoA
370
- typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int >, Kokkos::Rank<3 , Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi ({0 ,0 ,0 },{32 ,idxu_max,(chunk_size + 32 - 1 ) / 32 },{32 ,4 ,1 });
368
+ const int idxu_half_max = snaKK.idxu_half_max ;
369
+ typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int >, Kokkos::Rank<3 , Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi ({0 ,0 ,0 },{32 ,idxu_half_max,(chunk_size + 32 - 1 ) / 32 },{32 ,4 ,1 });
371
370
Kokkos::parallel_for (" TransformYi" ,policy_transform_yi,*this );
372
371
373
372
}
@@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
397
396
}
398
397
}
399
398
400
- #endif // KOKKOS_ENABLE_CUDA
399
+ #endif // LMP_KOKKOS_GPU
401
400
402
401
}
403
402
@@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
608
607
609
608
if ( rsq < rnd_cutsq (itype,jtype) ) {
610
609
if (final ) {
611
- my_sna.rij (ii,offset,0 ) = dx;
612
- my_sna.rij (ii,offset,1 ) = dy;
613
- my_sna.rij (ii,offset,2 ) = dz;
610
+ #ifdef LMP_KOKKOS_GPU
611
+ if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
612
+ my_sna.compute_cayley_klein (ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
613
+ d_wjelem[elem_j]);
614
+ } else {
615
+ #endif
616
+ my_sna.rij (ii,offset,0 ) = dx;
617
+ my_sna.rij (ii,offset,1 ) = dy;
618
+ my_sna.rij (ii,offset,2 ) = dz;
619
+ my_sna.wj (ii,offset) = d_wjelem[elem_j];
620
+ my_sna.rcutij (ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
621
+ #ifdef LMP_KOKKOS_GPU
622
+ }
623
+ #endif
614
624
my_sna.inside (ii,offset) = j;
615
- my_sna.wj (ii,offset) = d_wjelem[elem_j];
616
- my_sna.rcutij (ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
617
625
if (chemflag)
618
626
my_sna.element (ii,offset) = elem_j;
619
627
else
@@ -704,27 +712,54 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
704
712
705
713
template <class DeviceType >
706
714
KOKKOS_INLINE_FUNCTION
707
- void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPTransformUi,const int iatom_mod, const int idxu , const int iatom_div) const {
715
+ void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPTransformUi,const int iatom_mod, const int j , const int iatom_div) const {
708
716
SNAKokkos<DeviceType> my_sna = snaKK;
709
717
710
718
const int iatom = iatom_mod + iatom_div * 32 ;
711
719
if (iatom >= chunk_size) return ;
712
720
713
- if (idxu >= my_sna. idxu_max ) return ;
721
+ if (j > twojmax ) return ;
714
722
715
723
int elem_count = chemflag ? nelements : 1 ;
716
724
717
725
for (int ielem = 0 ; ielem < elem_count; ielem++) {
726
+ const int jju_half = my_sna.idxu_half_block (j);
727
+ const int jju = my_sna.idxu_block (j);
728
+
729
+ for (int mb = 0 ; 2 *mb <= j; mb++) {
730
+ for (int ma = 0 ; ma <= j; ma++) {
731
+ // Extract top half
732
+
733
+ const int idxu_shift = mb * (j + 1 ) + ma;
734
+ const int idxu_half = jju_half + idxu_shift;
735
+ const int idxu = jju + idxu_shift;
736
+
737
+ auto utot_re = my_sna.ulisttot_re (idxu_half, ielem, iatom);
738
+ auto utot_im = my_sna.ulisttot_im (idxu_half, ielem, iatom);
739
+
740
+ // Store
741
+ my_sna.ulisttot_pack (iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
742
+
743
+ // Also zero yi
744
+ my_sna.ylist_pack_re (iatom_mod, idxu_half, ielem, iatom_div) = 0 .;
745
+ my_sna.ylist_pack_im (iatom_mod, idxu_half, ielem, iatom_div) = 0 .;
746
+
747
+ // Symmetric term
748
+ const int sign_factor = (((ma+mb)%2 ==0 )?1 :-1 );
749
+ const int idxu_flip = jju + (j + 1 - mb) * (j + 1 ) - (ma + 1 );
750
+
751
+ if (sign_factor == 1 ) {
752
+ utot_im = -utot_im;
753
+ } else {
754
+ utot_re = -utot_re;
755
+ }
718
756
719
- const auto utot_re = my_sna.ulisttot_re (idxu, ielem, iatom);
720
- const auto utot_im = my_sna.ulisttot_im (idxu, ielem, iatom);
721
-
722
- my_sna.ulisttot_pack (iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
757
+ my_sna.ulisttot_pack (iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };
723
758
724
- my_sna.ylist_pack_re (iatom_mod, idxu, ielem, iatom_div) = 0 .;
725
- my_sna.ylist_pack_im (iatom_mod, idxu, ielem, iatom_div) = 0 .;
759
+ // No need to zero symmetrized ylist
760
+ }
761
+ }
726
762
}
727
-
728
763
}
729
764
730
765
template <class DeviceType >
@@ -742,20 +777,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato
742
777
743
778
template <class DeviceType >
744
779
KOKKOS_INLINE_FUNCTION
745
- void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPTransformYi,const int iatom_mod, const int idxu , const int iatom_div) const {
780
+ void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half , const int iatom_div) const {
746
781
SNAKokkos<DeviceType> my_sna = snaKK;
747
782
748
783
const int iatom = iatom_mod + iatom_div * 32 ;
749
784
if (iatom >= chunk_size) return ;
750
785
751
- if (idxu >= my_sna.idxu_max ) return ;
786
+ if (idxu_half >= my_sna.idxu_half_max ) return ;
752
787
753
788
int elem_count = chemflag ? nelements : 1 ;
754
789
for (int ielem = 0 ; ielem < elem_count; ielem++) {
755
- const auto y_re = my_sna.ylist_pack_re (iatom_mod, idxu , ielem, iatom_div);
756
- const auto y_im = my_sna.ylist_pack_im (iatom_mod, idxu , ielem, iatom_div);
790
+ const auto y_re = my_sna.ylist_pack_re (iatom_mod, idxu_half , ielem, iatom_div);
791
+ const auto y_im = my_sna.ylist_pack_im (iatom_mod, idxu_half , ielem, iatom_div);
757
792
758
- my_sna.ylist (idxu , ielem, iatom) = { y_re, y_im };
793
+ my_sna.ylist (idxu_half , ielem, iatom) = { y_re, y_im };
759
794
}
760
795
761
796
}
@@ -904,22 +939,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen
904
939
905
940
template <class DeviceType >
906
941
KOKKOS_INLINE_FUNCTION
907
- void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPZeroYiCPU, const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team ) const {
942
+ void PairSNAPKokkos<DeviceType>::operator () (TagPairSNAPTransformUiCPU, const int j, const int iatom ) const {
908
943
SNAKokkos<DeviceType> my_sna = snaKK;
909
944
910
- // Extract the quantum number
911
- const int idx = team.team_rank () + team.team_size () * (team.league_rank () % ((my_sna.idxu_max +team.team_size ()-1 )/team.team_size ()));
912
- if (idx >= my_sna.idxu_max ) return ;
945
+ if (iatom >= chunk_size) return ;
913
946
914
- // Extract the atomic index
915
- const int ii = team.league_rank () / ((my_sna.idxu_max +team.team_size ()-1 )/team.team_size ());
916
- if (ii >= chunk_size) return ;
947
+ if (j > twojmax) return ;
948
+
949
+ int elem_count = chemflag ? nelements : 1 ;
950
+
951
+ // De-symmetrize ulisttot
952
+ for (int ielem = 0 ; ielem < elem_count; ielem++) {
953
+
954
+ const int jju_half = my_sna.idxu_half_block (j);
955
+ const int jju = my_sna.idxu_block (j);
956
+
957
+ for (int mb = 0 ; 2 *mb <= j; mb++) {
958
+ for (int ma = 0 ; ma <= j; ma++) {
959
+ // Extract top half
960
+
961
+ const int idxu_shift = mb * (j + 1 ) + ma;
962
+ const int idxu_half = jju_half + idxu_shift;
963
+ const int idxu = jju + idxu_shift;
917
964
918
- if (chemflag)
919
- for (int ielem = 0 ; ielem < nelements; ielem++)
920
- my_sna.zero_yi_cpu (idx,ii,ielem);
921
- else
922
- my_sna.zero_yi_cpu (idx,ii,0 );
965
+ // Load ulist
966
+ auto utot = my_sna.ulisttot (idxu_half, ielem, iatom);
967
+
968
+ // Store
969
+ my_sna.ulisttot_full (idxu, ielem, iatom) = utot;
970
+
971
+ // Zero Yi
972
+ my_sna.ylist (idxu_half, ielem, iatom) = {0 ., 0 .};
973
+
974
+ // Symmetric term
975
+ const int sign_factor = (((ma+mb)%2 ==0 )?1 :-1 );
976
+ const int idxu_flip = jju + (j + 1 - mb) * (j + 1 ) - (ma + 1 );
977
+
978
+ if (sign_factor == 1 ) {
979
+ utot.im = -utot.im ;
980
+ } else {
981
+ utot.re = -utot.re ;
982
+ }
983
+
984
+ my_sna.ulisttot_full (idxu_flip, ielem, iatom) = utot;
985
+ }
986
+ }
987
+ }
923
988
}
924
989
925
990
template <class DeviceType >
0 commit comments