Skip to content

Commit

Permalink
compi
Browse files Browse the repository at this point in the history
  • Loading branch information
hannorein committed Jun 2, 2023
1 parent 140a86e commit d2b3569
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 66 deletions.
2 changes: 1 addition & 1 deletion examples/avx512_performance/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
export OPENGL=0
export OPT=-march=skylake-avx512 -mavx512f
export OPT=-march=skylake-avx512 -mavx512f -fmax-errors=3
#export OPT=-march=native -mavx512f
include ../../src/Makefile.defs

Expand Down
239 changes: 176 additions & 63 deletions src/integrator_whfast512.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,9 @@ static void inline mm_stiefel_Gs3_avx512(__m512d * Gs1, __m512d * Gs2, __m512d *


/*
Warning: assumes *particles points at the first planet (and not the star)
Warning: assumes p512 points at the first planet (and not the star)
*/
void kepler_fast_avx512(struct reb_particle * restrict particles, double GM, double dt){
const int N = 8;

void kepler_fast_avx512(struct reb_particle_avx512 * restrict p512, double GM, double dt){
__m512d one = _mm512_set1_pd(1.);
__m512d _dt = _mm512_set1_pd(dt);
__m512d two = _mm512_set1_pd(2.);
Expand All @@ -102,30 +100,12 @@ void kepler_fast_avx512(struct reb_particle * restrict particles, double GM, dou
__m512d inv_M_twopi = _mm512_mul_pd(_M, twopi);
inv_M_twopi = _mm512_div_pd(one,inv_M_twopi);

__m512d x = _mm512_set_pd(
particles[7].x, particles[6].x, particles[5].x, particles[4].x,
particles[3].x, particles[2].x, particles[1].x, particles[0].x
);
__m512d y = _mm512_set_pd(
particles[7].y, particles[6].y, particles[5].y, particles[4].y,
particles[3].y, particles[2].y, particles[1].y, particles[0].y
);
__m512d z = _mm512_set_pd(
particles[7].z, particles[6].z, particles[5].z, particles[4].z,
particles[3].z, particles[2].z, particles[1].z, particles[0].z
);
__m512d vx = _mm512_set_pd(
particles[7].vx, particles[6].vx, particles[5].vx, particles[4].vx,
particles[3].vx, particles[2].vx, particles[1].vx, particles[0].vx
);
__m512d vy = _mm512_set_pd(
particles[7].vy, particles[6].vy, particles[5].vy, particles[4].vy,
particles[3].vy, particles[2].vy, particles[1].vy, particles[0].vy
);
__m512d vz = _mm512_set_pd(
particles[7].vz, particles[6].vz, particles[5].vz, particles[4].vz,
particles[3].vz, particles[2].vz, particles[1].vz, particles[0].vz
);
__m512d x = p512->x;
__m512d y = p512->y;
__m512d z = p512->z;
__m512d vx = p512->vx;
__m512d vy = p512->vy;
__m512d vz = p512->vz;

__m512d x2 = _mm512_mul_pd(x, x);
__m512d y2 = _mm512_mul_pd(y, y);
Expand Down Expand Up @@ -226,34 +206,19 @@ void kepler_fast_avx512(struct reb_particle * restrict particles, double GM, dou
__m512d nvz = _mm512_fnmadd_pd(nfd, z, vz);
nvz = _mm512_fnmadd_pd(ngd, vz, nvz);


double _nx[8] ;
double _ny[8] ;
double _nz[8] ;
double _nvx[8] ;
double _nvy[8] ;
double _nvz[8] ;
_mm512_store_pd(&_nx[0],nx);
_mm512_store_pd(&_ny[0],ny);
_mm512_store_pd(&_nz[0],nz);
_mm512_store_pd(&_nvx[0],nvx);
_mm512_store_pd(&_nvy[0],nvy);
_mm512_store_pd(&_nvz[0],nvz);
for (unsigned int i=0;i<N;i++){
particles[i].x = _nx[i];
particles[i].y = _ny[i];
particles[i].z = _nz[i];
particles[i].vx = _nvx[i];
particles[i].vy = _nvy[i];
particles[i].vz = _nvz[i];
}
p512->x = nx;
p512->y = ny;
p512->z = nz;
p512->vx = nvx;
p512->vy = nvy;
p512->vz = nvz;
}


void reb_whfast_kepler_step(const struct reb_simulation* const r, const double _dt){
static void reb_whfast512_kepler_step(const struct reb_simulation* const r, const double _dt){
const double m0 = r->particles[0].m;
const double G = r->G;
kepler_fast_avx512(p_j+1, m0*G, _dt);
kepler_fast_avx512(r->ri_whfast512.p_jh, m0*G, _dt);
}

// ~~~~~~~~~~~~~ Test functions ~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -446,7 +411,7 @@ void gravity_interaction_fast_avx512(

static void particles_to_avx(struct reb_simulation* r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle_avx512* p512 = &(ri_whfast512->particles);
struct reb_particle_avx512* p512 = ri_whfast512->particles;
struct reb_particle* particles = r->particles;
p512->m = _mm512_set_pd( particles[8].m, particles[7].m, particles[6].m, particles[5].m, particles[4].m, particles[3].m, particles[2].m, particles[1].m );
p512->x = _mm512_set_pd( particles[8].x, particles[7].x, particles[6].x, particles[5].x, particles[4].x, particles[3].x, particles[2].x, particles[1].x );
Expand All @@ -455,11 +420,12 @@ static void particles_to_avx(struct reb_simulation* r){
p512->vx = _mm512_set_pd( particles[8].vx, particles[7].vx, particles[6].vx, particles[5].vx, particles[4].vx, particles[3].vx, particles[2].vx, particles[1].vx );
p512->vy = _mm512_set_pd( particles[8].vy, particles[7].vy, particles[6].vy, particles[5].vy, particles[4].vy, particles[3].vy, particles[2].vy, particles[1].vy );
p512->vz = _mm512_set_pd( particles[8].vz, particles[7].vz, particles[6].vz, particles[5].vz, particles[4].vz, particles[3].vz, particles[2].vz, particles[1].vz );

}

static void particles_from_avx(struct reb_simulation* r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle_avx512* p512 = &(ri_whfast512->particles);
struct reb_particle_avx512* p512 = ri_whfast512->particles;
struct reb_particle* particles = r->particles;

double tmp[8];
Expand All @@ -477,6 +443,140 @@ static void particles_from_avx(struct reb_simulation* r){
particles[1].vz = tmp[0]; particles[2].vz = tmp[1]; particles[3].vz = tmp[2]; particles[4].vz = tmp[3]; particles[5].vz = tmp[4]; particles[6].vz = tmp[5]; particles[7].vz = tmp[6]; particles[8].vz = tmp[7];
}

void inertial_to_democraticheliocentric_posvel(struct reb_simulation* r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p512 = ri_whfast512->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;

// should be optimized by assuming mass doesn't change
p_jh->m = p512->m;
double mtot = _mm512_reduce_add_pd(p512->m) + particles[0].m;
ri_whfast512->p_jh0.m = mtot;


__m512d xm = _mm512_mul_pd(p512->x,p512->m);
double x0 = _mm512_reduce_add_pd(xm) + particles[0].m*particles[0].x;
ri_whfast512->p_jh0.x = x0/mtot;
__m512d ym = _mm512_mul_pd(p512->y,p512->m);
double y0 = _mm512_reduce_add_pd(ym) + particles[0].m*particles[0].y;
ri_whfast512->p_jh0.y = y0/mtot;
__m512d zm = _mm512_mul_pd(p512->z,p512->m);
double z0 = _mm512_reduce_add_pd(zm) + particles[0].m*particles[0].z;
ri_whfast512->p_jh0.z = z0/mtot;

__m512d vxm = _mm512_mul_pd(p512->vx,p512->m);
double vx0 = (_mm512_reduce_add_pd(vxm) + particles[0].m*particles[0].vx)/mtot;
ri_whfast512->p_jh0.vx = vx0;
__m512d vym = _mm512_mul_pd(p512->vy,p512->m);
double vy0 = (_mm512_reduce_add_pd(vym) + particles[0].m*particles[0].vy)/mtot;
ri_whfast512->p_jh0.vy = vy0;
__m512d vzm = _mm512_mul_pd(p512->vz,p512->m);
double vz0 = (_mm512_reduce_add_pd(vzm) + particles[0].m*particles[0].vz)/mtot;
ri_whfast512->p_jh0.vz = vz0;

xm = _mm512_set_pd( particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x);
p_jh->x = _mm512_sub_pd(p512->x, xm);
ym = _mm512_set_pd( particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y);
p_jh->y = _mm512_sub_pd(p512->y, ym);
zm = _mm512_set_pd( particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z);
p_jh->z = _mm512_sub_pd(p512->z, zm);

vxm = _mm512_set_pd( vx0, vx0, vx0, vx0, vx0, vx0, vx0, vx0);
p_jh->vx = _mm512_sub_pd(p512->vx, vxm);
vym = _mm512_set_pd( vy0, vy0, vy0, vy0, vy0, vy0, vy0, vy0);
p_jh->vy = _mm512_sub_pd(p512->vy, vym);
vzm = _mm512_set_pd( vz0, vz0, vz0, vz0, vz0, vz0, vz0, vz0);
p_jh->vz = _mm512_sub_pd(p512->vz, vzm);


}

static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p512 = ri_whfast512->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
const double mtot = ri_whfast512->p_jh0.m;

__m512d x0 = _mm512_mul_pd(p_jh->x,p_jh->m);
double x0s = _mm512_reduce_add_pd(x0)/mtot;
__m512d y0 = _mm512_mul_pd(p_jh->y,p_jh->m);
double y0s = _mm512_reduce_add_pd(y0)/mtot;
__m512d z0 = _mm512_mul_pd(p_jh->z,p_jh->m);
double z0s = _mm512_reduce_add_pd(z0)/mtot;

particles[0].x = ri_whfast512->p_jh0.x - x0s;
particles[0].y = ri_whfast512->p_jh0.y - y0s;
particles[0].z = ri_whfast512->p_jh0.z - z0s;


x0 = _mm512_set_pd( particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x, particles[0].x);
p512->x = _mm512_add_pd(p_jh->x, x0);
y0 = _mm512_set_pd( particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y, particles[0].y);
p512->y = _mm512_add_pd(p_jh->y, y0);
z0 = _mm512_set_pd( particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z, particles[0].z);
p512->z = _mm512_add_pd(p_jh->z, z0);


__m512d vx0 = _mm512_set_pd( ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx, ri_whfast512->p_jh0.vx);
p512->vx = _mm512_add_pd(p_jh->vx, vx0);
__m512d vy0 = _mm512_set_pd( ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy, ri_whfast512->p_jh0.vy);
p512->vy = _mm512_add_pd(p_jh->vy, vy0);
__m512d vz0 = _mm512_set_pd( ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz, ri_whfast512->p_jh0.vz);
p512->vz = _mm512_add_pd(p_jh->vz, vz0);

const double m0 = particles[0].m;

// Note: can be precalculated and reused.
// Note: might accumulate roundoff errors
vx0 = _mm512_mul_pd(p_jh->vx, p_jh->m);
double vx0s = _mm512_reduce_add_pd(vx0)/m0;
vy0 = _mm512_mul_pd(p_jh->vy, p_jh->m);
double vy0s = _mm512_reduce_add_pd(vy0)/m0;
vz0 = _mm512_mul_pd(p_jh->vz, p_jh->m);
double vz0s = _mm512_reduce_add_pd(vz0)/m0;


particles[0].vx = ri_whfast512->p_jh0.vx -vx0s;
particles[0].vy = ri_whfast512->p_jh0.vy -vy0s;
particles[0].vz = ri_whfast512->p_jh0.vz -vz0s;

}

static void reb_whfast512_jump_step(struct reb_simulation* r, const double _dt){
struct reb_simulation_integrator_whfast512* ri_whfast512 = &(r->ri_whfast512);
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
double m0 = r->particles[0].m;

__m512d mvx = _mm512_mul_pd(p_jh->m, p_jh->vx);
double px = _mm512_reduce_add_pd(mvx);

__m512d mvy = _mm512_mul_pd(p_jh->m, p_jh->vy);
double py = _mm512_reduce_add_pd(mvy);

__m512d mvz = _mm512_mul_pd(p_jh->m, p_jh->vz);
double pz = _mm512_reduce_add_pd(mvz);

// Note: can be precalculated and reused.
// Note: might accumulate roundoff errors
double pf;
__m512d pf512;

pf = _dt/m0*px;
pf512 = _mm512_set_pd(pf,pf,pf,pf,pf,pf,pf,pf);
p_jh->x = _mm512_add_pd(pf512, p_jh->x);

pf = _dt/m0*py;
pf512 = _mm512_set_pd(pf,pf,pf,pf,pf,pf,pf,pf);
p_jh->y = _mm512_add_pd(pf512, p_jh->y);

pf = _dt/m0*pz;
pf512 = _mm512_set_pd(pf,pf,pf,pf,pf,pf,pf,pf);
p_jh->z = _mm512_add_pd(pf512, p_jh->z);
}


void reb_integrator_whfast512_part1(struct reb_simulation* const r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
const int N = r->N;
Expand All @@ -485,22 +585,29 @@ void reb_integrator_whfast512_part1(struct reb_simulation* const r){
assert(r->N_active==-1 || r->N_active==N);

if (ri_whfast512->allocated_N==0){
ri_whfast512->particles = calloc(1,sizeof(__m512d));
ri_whfast512->p_jh = calloc(1,sizeof(__m512d));
ri_whfast512->allocated_N=1;
}

if (ri_whfast512->is_synchronized){
particles_to_avx(r);
inertial_to_democraticheliocentric_posvel(r);
}

if (ri_whfast512->is_synchronized){
// First half DRIFT step
reb_whfast_kepler_step(r, r->dt/2.);
reb_whfast512_kepler_step(r, r->dt/2.);
reb_whfast_com_step(r, r->dt/2.);
}else{
// Combined DRIFT step
reb_whfast_kepler_step(r, r->dt); // full timestep
reb_whfast512_kepler_step(r, r->dt); // full timestep
reb_whfast_com_step(r, r->dt);
}

reb_whfast_jump_step(r,r->dt/2.);
reb_whfast512_jump_step(r,r->dt/2.);

reb_integrator_whfast_to_inertial(r);
democraticheliocentric_to_inertial_posvel(r); // Note: don't need velocities! TODO

r->t+=r->dt/2.;
}
Expand All @@ -510,26 +617,32 @@ void reb_integrator_whfast512_part2(struct reb_simulation* const r){
const double dt = r->dt;

reb_whfast_interaction_step(r, dt);
reb_whfast_jump_step(r,dt/2.);
reb_whfast512_jump_step(r,dt/2.);

ri_whfast512->is_synchronized = 0;

r->t+=r->dt/2.;
r->dt_last_done = r->dt;
}


void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
if (ri_whfast512->is_synchronized == 0){
reb_whfast_kepler_step(r, r->dt/2.);
reb_whfast512_kepler_step(r, r->dt/2.);
reb_whfast_com_step(r, r->dt/2.);
reb_transformations_democraticheliocentric_to_inertial_posvel(r->particles, ri_whfast->p_jh, N_real, N_active);
democraticheliocentric_to_inertial_posvel(r);
ri_whfast512->is_synchronized = 1;
}
particles_from_avx(r);
}

void reb_integrator_whfast512_reset(struct reb_simulation* const r){
struct reb_simulation_integrator_whfast512* const ri_whfast = &(r->ri_whfast512);
ri_whfast->is_synchronized = 1;
ri_whfast->allocated_N = 0;
struct reb_simulation_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
ri_whfast512->is_synchronized = 1;
if (ri_whfast512->allocated_N){
free(ri_whfast512->particles);
free(ri_whfast512->p_jh);
}
ri_whfast512->allocated_N = 0;
}
5 changes: 3 additions & 2 deletions src/rebound.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,9 @@ struct reb_particle_avx512{
struct reb_simulation_integrator_whfast512 {
unsigned int is_synchronized;
unsigned int allocated_N;
struct reb_particle_avx512 particles;
struct reb_particle_avx512 p_jh;
struct reb_particle_avx512* particles;
struct reb_particle_avx512* p_jh;
struct reb_particle p_jh0;

};

Expand Down

0 comments on commit d2b3569

Please sign in to comment.