Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HAXWell/Nbody.cpp
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
802 lines (655 sloc)
23.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "GENCoder.h" | |
| #include "GENDisassembler.h" | |
| #include "GENAssembler.h" | |
| #include "HAXWell.h" | |
| #include "Misc.h" | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <algorithm> | |
| #define GLSL(...) "#version 430 core\n" #__VA_ARGS__ | |
| #define STRINGIFY(...) #__VA_ARGS__ | |
| const char* NBODYGLSL = GLSL( | |
| layout (local_size_x = 16) in; | |
| layout (std430,binding=0) | |
| buffer Output0 | |
| { | |
| uint g_nBodies; | |
| float g_TimeStep; | |
| float g_GraviationConst; | |
| }; | |
| layout (std430,binding=1) | |
| buffer BodiesIn | |
| { | |
| vec4 g_PositionAndMass[]; | |
| }; | |
| layout (std430,binding=2) | |
| buffer VelocitiesIn | |
| { | |
| vec4 g_Velocity[]; | |
| }; | |
| layout (std430,binding=3) | |
| buffer BodiesOut | |
| { | |
| vec4 g_PositionAndMassOut[]; | |
| }; | |
| layout (std430,binding=4) | |
| buffer VelocitiesOut | |
| { | |
| vec4 g_VelocityOut[]; | |
| }; | |
| void main() | |
| { | |
| uint tid = gl_GlobalInvocationID.x; | |
| vec4 body = g_PositionAndMass[tid]; | |
| vec3 force = vec3(0.f,0.f,0.f); | |
| for( uint i=0; i<g_nBodies; i++ ) | |
| { | |
| if( i != tid ) | |
| { | |
| vec4 otherBody = g_PositionAndMass[i]; | |
| vec3 D = body.xyz - otherBody.xyz; | |
| // force = G*massi*massj * (D ) / length(D)^3 | |
| // length(D)^3 is dot(D,D) * sqrt(dot(D,D)) | |
| float lengthSq = dot( D, D ); | |
| float lengthCubed = lengthSq * sqrt(lengthSq); | |
| float scale = ( g_GraviationConst * otherBody.w) / lengthCubed; | |
| force += D*scale; | |
| } | |
| } | |
| vec3 velocity = g_Velocity[tid].xyz; | |
| vec3 accel = force; // mass cancels out | |
| velocity += accel * g_TimeStep; | |
| body.xyz += velocity * g_TimeStep; | |
| g_PositionAndMassOut[tid].xyz = body.xyz; | |
| g_VelocityOut[tid].xyz = velocity.xyz; | |
| } | |
| ); | |
| #define SM_SIZE 256 | |
| const char* NBODYGLSL_SM = GLSL( | |
| layout (local_size_x = 256) in; | |
| layout (std430,binding=0) | |
| buffer Output0 | |
| { | |
| uint g_nBodies; | |
| float g_TimeStep; | |
| float g_GraviationConst; | |
| }; | |
| layout (std430,binding=1) | |
| buffer BodiesIn | |
| { | |
| vec4 g_PositionAndMass[]; | |
| }; | |
| layout (std430,binding=2) | |
| buffer VelocitiesIn | |
| { | |
| vec4 g_Velocity[]; | |
| }; | |
| layout (std430,binding=3) | |
| buffer BodiesOut | |
| { | |
| vec4 g_PositionAndMassOut[]; | |
| }; | |
| layout (std430,binding=4) | |
| buffer VelocitiesOut | |
| { | |
| vec4 g_VelocityOut[]; | |
| }; | |
| shared vec4 SHARED_Bodies[256]; | |
| void main() | |
| { | |
| uint tid = gl_GlobalInvocationID.x; | |
| uint tid_local = gl_LocalInvocationID.x; | |
| vec4 body = g_PositionAndMass[tid]; | |
| vec3 force = vec3(0.f,0.f,0.f); | |
| for( uint i=0; i<g_nBodies; i+= 256 ) | |
| { | |
| // load a bunch of bodies into SM across the thread group | |
| vec4 other = g_PositionAndMass[i+tid_local]; | |
| other.w = g_GraviationConst*other.w; | |
| SHARED_Bodies[tid_local] = other; | |
| memoryBarrierShared(); | |
| // now add up all the forces | |
| for( uint j=0; j<256;j++ ) | |
| { | |
| if( (i+j) != tid ) | |
| { | |
| vec4 otherBody = SHARED_Bodies[j]; | |
| vec3 D = body.xyz - otherBody.xyz; | |
| // force = G*massi*massj * (D ) / length(D)^3 | |
| // length(D)^3 is dot(D,D) * sqrt(dot(D,D)) | |
| float lengthSq = dot( D, D ); | |
| float lengthCubed = lengthSq * sqrt(lengthSq); | |
| float scale = ( otherBody.w) / lengthCubed; | |
| force += D*scale; | |
| } | |
| } | |
| memoryBarrierShared(); | |
| } | |
| vec3 velocity = g_Velocity[tid].xyz; | |
| vec3 accel = force; // mass cancels out | |
| velocity += accel * g_TimeStep; | |
| body.xyz += velocity * g_TimeStep; | |
| g_PositionAndMassOut[tid].xyz = body.xyz; | |
| g_VelocityOut[tid].xyz = velocity.xyz; | |
| } | |
| ); | |
| float Rnd() | |
| { | |
| return (float)rand() / (float)RAND_MAX; | |
| } | |
| struct Simulator | |
| { | |
| HAXWell::ShaderHandle hKernel; | |
| size_t nBodiesPerThreadGroup; | |
| size_t nBodies; | |
| size_t nSteps; | |
| float dt; | |
| float GraviationalConstant; | |
| }; | |
| float* Nbody( Simulator& sim ) | |
| { | |
| srand(4); | |
| float dt =sim.dt; | |
| size_t nSteps = sim.nSteps; | |
| size_t nBodies = sim.nBodies; | |
| // make some positions | |
| float* pPositions = new float[4*sim.nBodies]; | |
| float* pMassInit = new float[4*sim.nBodies]; | |
| float* pVelocities = new float[4*sim.nBodies]; | |
| for( size_t i=0; i<nBodies; i++ ) | |
| { | |
| pPositions[4*i] = 100.0f*Rnd(); | |
| pPositions[4*i+1] = 100.0f*Rnd(); | |
| pPositions[4*i+2] = 100.0f*Rnd(); | |
| pPositions[4*i+3] = 0.5f + Rnd()*5.0f; | |
| pMassInit[4*i] = 0; | |
| pMassInit[4*i+1] = 0; | |
| pMassInit[4*i+2] = 0; | |
| pMassInit[4*i+3] = pPositions[4*i+3]; | |
| } | |
| memset( pVelocities,0,sizeof(float)*4*nBodies ); | |
| HAXWell::BufferHandle hPositions[2] ={ | |
| HAXWell::CreateBuffer( pPositions, 4*sizeof(float)*nBodies ), | |
| HAXWell::CreateBuffer( pMassInit, 4*sizeof(float)*nBodies ) | |
| }; | |
| HAXWell::BufferHandle hVelocities[2] ={ | |
| HAXWell::CreateBuffer( pVelocities, 4*sizeof(float)*nBodies ), | |
| HAXWell::CreateBuffer( pVelocities, 4*sizeof(float)*nBodies ) | |
| }; | |
| struct Globals | |
| { | |
| unsigned int nBodies; | |
| float fTimeStep; | |
| float fGravitationalConstant; | |
| }; | |
| Globals g; | |
| g.nBodies = nBodies; | |
| g.fTimeStep = dt; | |
| g.fGravitationalConstant = sim.GraviationalConstant; | |
| HAXWell::BufferHandle hGlobals = HAXWell::CreateBuffer(&g,sizeof(g)); | |
| HAXWell::Finish(); // flush buffer creation | |
| HAXWell::TimerHandle hTimer = HAXWell::BeginTimer(); | |
| for( size_t j=0; j<nSteps; j++ ) | |
| { | |
| HAXWell::BufferHandle hDispatch[5] ={ | |
| hGlobals, | |
| hPositions[0], | |
| hVelocities[0], | |
| hPositions[1], | |
| hVelocities[1] | |
| }; | |
| HAXWell::DispatchShader( sim.hKernel, hDispatch, 5, nBodies/sim.nBodiesPerThreadGroup ); | |
| std::swap(hPositions[0],hPositions[1]); | |
| std::swap(hVelocities[0],hVelocities[1]); | |
| HAXWell::Finish();// need a 'finish' in here, or else Haxwell version sporadically crashes | |
| } | |
| HAXWell::EndTimer(hTimer); | |
| HAXWell::Finish(); // flush compute | |
| HAXWell::timer_t nTime = HAXWell::ReadTimer(hTimer); | |
| printf("%012u\n", nTime); | |
| delete[] pVelocities; | |
| delete[] pMassInit; | |
| void* pResults = HAXWell::MapBuffer(hPositions[0]); | |
| memcpy( pPositions, pResults, 4*sizeof(float)*nBodies ); | |
| HAXWell::UnmapBuffer(hPositions[0]); | |
| HAXWell::ReleaseBuffer(hPositions[0]); | |
| HAXWell::ReleaseBuffer(hPositions[1]); | |
| HAXWell::ReleaseBuffer(hVelocities[0]); | |
| HAXWell::ReleaseBuffer(hVelocities[1]); | |
| HAXWell::ReleaseBuffer(hGlobals); | |
| return pPositions; | |
| } | |
| static const char* NBODY_HAXWELL = STRINGIFY( | |
| curbe OFFSETS[2] = {{0,4,8,12,16,20,24,28},{32,36,40,44,48,52,56,60}} | |
| curbe INDICES[1] = {0,1,2,3,4,5,6,7} | |
| threads 64 // launch 64 HW threads at once | |
| reg globals | |
| reg i | |
| reg base[2] | |
| reg addrx[4] | |
| reg addry[4] | |
| reg addrz[4] | |
| reg X [8] | |
| reg Y [8] | |
| reg Z [8] | |
| reg VX [8] | |
| reg VY [8] | |
| reg VZ [8] | |
| reg Fx [8] | |
| reg Fy [8] | |
| reg Fz [8] | |
| reg Dx0 [2] | |
| reg Dx1 [2] | |
| reg Dy0 [2] | |
| reg Dy1 [2] | |
| reg Dz0 [2] | |
| reg Dz1 [2] | |
| reg lengthSq0 [2] | |
| reg lengthSq1 [2] | |
| reg length0[2] | |
| reg length1[2] | |
| reg BodyPair | |
| bind Globals 0x38 // {#bodies,timestep,gravitation} | |
| bind PositionIn 0x39 // xyzm | |
| bind VelocityIn 0x3a // xyz- | |
| bind PositionOut 0x3b // xyzm | |
| bind VelocityOut 0x3c | |
| // buffer bind points: bind 0: xyzm per body, bind 1: velocity (xyz) per body | |
| // | |
| // threads evaluate M bodies each | |
| begin: | |
| // Load blocks of body positions which this thread will process | |
| // Load velocities up front as well. We've got plenty of regs and we know we'll eventually need them | |
| mul(16) base.u, r0.u1<0,1,0>, 256 // base address for this thread's bodies (64 bodies * 4 dwords) | |
| add(16) addrx.u, base.u, OFFSETS.u // x addresses for first 16-body group | |
| add(16) addry.u, addrx.u, 1 // y addresses for first 16-body group | |
| add(16) addrz.u, addrx.u, 2 // z addresses for first 16-body group | |
| send DwordLoad16(PositionIn), X0.f, addrx.u | |
| send DwordLoad16(PositionIn), Y0.f, addry.u | |
| send DwordLoad16(PositionIn), Z0.f, addrz.u | |
| send DwordLoad16(VelocityIn), VX0.f, addrx.u | |
| send DwordLoad16(VelocityIn), VY0.f, addry.u | |
| send DwordLoad16(VelocityIn), VZ0.f, addrz.u | |
| add(16) addrx.u, addrx.u, 64 // skip 16 bodies (64 dwords) | |
| add(16) addry.u, addry.u, 64 // skip 16 bodies (64 dwords) | |
| add(16) addrz.u, addrz.u, 64 // skip 16 bodies (64 dwords) | |
| send DwordLoad16(PositionIn), X2.f, addrx.u | |
| send DwordLoad16(PositionIn), Y2.f, addry.u | |
| send DwordLoad16(PositionIn), Z2.f, addrz.u | |
| send DwordLoad16(VelocityIn), VX2.f, addrx.u | |
| send DwordLoad16(VelocityIn), VY2.f, addry.u | |
| send DwordLoad16(VelocityIn), VZ2.f, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| send DwordLoad16(PositionIn), X4.f, addrx.u | |
| send DwordLoad16(PositionIn), Y4.f, addry.u | |
| send DwordLoad16(PositionIn), Z4.f, addrz.u | |
| send DwordLoad16(VelocityIn), VX4.f, addrx.u | |
| send DwordLoad16(VelocityIn), VY4.f, addry.u | |
| send DwordLoad16(VelocityIn), VZ4.f, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| send DwordLoad16(PositionIn), X6.f, addrx.u | |
| send DwordLoad16(PositionIn), Y6.f, addry.u | |
| send DwordLoad16(PositionIn), Z6.f, addrz.u | |
| send DwordLoad16(VelocityIn), VX6.f, addrx.u | |
| send DwordLoad16(VelocityIn), VY6.f, addry.u | |
| send DwordLoad16(VelocityIn), VZ6.f, addrz.u | |
| // load the global sim constants | |
| send DwordLoad8(Globals), globals.u, INDICES.u | |
| // clear force accumulators | |
| mov(16) Fx0.f, 0.0f | |
| mov(16) Fx2.f, 0.0f | |
| mov(16) Fx4.f, 0.0f | |
| mov(16) Fx6.f, 0.0f | |
| mov(16) Fy0.f, 0.0f | |
| mov(16) Fy2.f, 0.0f | |
| mov(16) Fy4.f, 0.0f | |
| mov(16) Fy6.f, 0.0f | |
| mov(16) Fz0.f, 0.0f | |
| mov(16) Fz2.f, 0.0f | |
| mov(16) Fz4.f, 0.0f | |
| mov(16) Fz6.f, 0.0f | |
| mov(1) f0.u, 0 | |
| mov(1) f1.u, 0 | |
| mov(8) i.u, 0 | |
| cmpge(1)(f0.0) null.u, i.u0<1,1,1>, globals.u0<1,1,1> | |
| jmpif(f0.0) write_out | |
| loop: | |
| mul(8) base.u, i.u<0,1,0>, 4 | |
| add(8) base.u, base.u<0,1,0>, INDICES.u | |
| add(1) i.u0 , i.u0<1,1,1>, 2 | |
| send DwordLoad8(PositionIn), BodyPair.f, base.u | |
| cmplt(1)(f0.0) null.u, i.u0<1,1,1>, globals.u0<1,1,1> | |
| mul(1) BodyPair.f3, BodyPair.f3<1,1,1>, globals.f2<1,1,1> // pre-multiply body mass by gravitation constant | |
| mul(1) BodyPair.f7, BodyPair.f7<1,1,1>, globals.f2<1,1,1> // TODO: Do in one SIMD4x2 op with write masking...? | |
| //vec3 D = body.xyz - otherBody.xyz; | |
| //float lengthSq = dot( D, D ); | |
| //float lengthCubed = lengthSq * sqrt(lengthSq); | |
| //float scale = ( g_GraviationConst * otherBody.w) / lengthCubed; | |
| //force += scale*D; | |
| // Bodies 0-16 | |
| sub(16) Dx0.f, X0.f, BodyPair.f0<0,1,0> | |
| sub(16) Dy0.f, Y0.f, BodyPair.f1<0,1,0> | |
| sub(16) Dz0.f, Z0.f, BodyPair.f2<0,1,0> | |
| sub(16) Dx1.f, X0.f, BodyPair.f4<0,1,0> | |
| sub(16) Dy1.f, Y0.f, BodyPair.f5<0,1,0> | |
| sub(16) Dz1.f, Z0.f, BodyPair.f6<0,1,0> | |
| mul(16) lengthSq0.f, Dx0.f, Dx0.f | |
| mul(16) lengthSq1.f, Dx1.f, Dx1.f | |
| fma(16) lengthSq0.f, Dy0.f, Dy0.f | |
| fma(16) lengthSq1.f, Dy1.f, Dy1.f | |
| fma(16) lengthSq0.f, Dz0.f, Dz0.f | |
| fma(16) lengthSq1.f, Dz1.f, Dz1.f | |
| sqrt(16) length0.f, lengthSq0.f | |
| sqrt(16) length1.f, lengthSq1.f | |
| cmpgt(16)(f1.0) null.u, lengthSq0.f, 0.0f | |
| cmpgt(16)(f1.1) null.u, lengthSq1.f, 0.0f | |
| pred(f1.0){ mul(16) length0.f, length0.f, lengthSq0.f } | |
| pred(f1.1){ mul(16) length1.f, length1.f, lengthSq1.f } | |
| pred(f1.0){ rcp(16) length0.f, length0.f } | |
| pred(f1.1){ rcp(16) length1.f, length1.f } | |
| pred(f1.0){ mul(16) length0.f, length0.f, BodyPair.f3<0,1,0> } | |
| pred(f1.1){ mul(16) length1.f, length1.f, BodyPair.f7<0,1,0> } | |
| pred(f1.0){ | |
| fma(16) Fx0.f, Dx0.f, length0.f | |
| fma(16) Fy0.f, Dy0.f, length0.f | |
| fma(16) Fz0.f, Dz0.f, length0.f | |
| } | |
| pred(f1.1){ | |
| fma(16) Fx0.f, Dx1.f, length1.f | |
| fma(16) Fy0.f, Dy1.f, length1.f | |
| fma(16) Fz0.f, Dz1.f, length1.f | |
| } | |
| // Bodies 16-32 | |
| sub(16) Dx0.f, X2.f, BodyPair.f0<0,1,0> | |
| sub(16) Dy0.f, Y2.f, BodyPair.f1<0,1,0> | |
| sub(16) Dz0.f, Z2.f, BodyPair.f2<0,1,0> | |
| sub(16) Dx1.f, X2.f, BodyPair.f4<0,1,0> | |
| sub(16) Dy1.f, Y2.f, BodyPair.f5<0,1,0> | |
| sub(16) Dz1.f, Z2.f, BodyPair.f6<0,1,0> | |
| mul(16) lengthSq0.f, Dx0.f, Dx0.f | |
| mul(16) lengthSq1.f, Dx1.f, Dx1.f | |
| fma(16) lengthSq0.f, Dy0.f, Dy0.f | |
| fma(16) lengthSq1.f, Dy1.f, Dy1.f | |
| fma(16) lengthSq0.f, Dz0.f, Dz0.f | |
| fma(16) lengthSq1.f, Dz1.f, Dz1.f | |
| sqrt(16) length0.f, lengthSq0.f | |
| sqrt(16) length1.f, lengthSq1.f | |
| cmpgt(16)(f1.0) null.u, lengthSq0.f, 0.0f | |
| cmpgt(16)(f1.1) null.u, lengthSq1.f, 0.0f | |
| pred(f1.0){ mul(16) length0.f, length0.f, lengthSq0.f } | |
| pred(f1.1){ mul(16) length1.f, length1.f, lengthSq1.f } | |
| pred(f1.0){ rcp(16) length0.f, length0.f } | |
| pred(f1.1){ rcp(16) length1.f, length1.f } | |
| pred(f1.0){ mul(16) length0.f, length0.f, BodyPair.f3<0,1,0> } | |
| pred(f1.1){ mul(16) length1.f, length1.f, BodyPair.f7<0,1,0> } | |
| pred(f1.0){ | |
| fma(16) Fx2.f, Dx0.f, length0.f | |
| fma(16) Fy2.f, Dy0.f, length0.f | |
| fma(16) Fz2.f, Dz0.f, length0.f | |
| } | |
| pred(f1.1){ | |
| fma(16) Fx2.f, Dx1.f, length1.f | |
| fma(16) Fy2.f, Dy1.f, length1.f | |
| fma(16) Fz2.f, Dz1.f, length1.f | |
| } | |
| // Bodies 32-48 | |
| sub(16) Dx0.f, X4.f, BodyPair.f0<0,1,0> | |
| sub(16) Dy0.f, Y4.f, BodyPair.f1<0,1,0> | |
| sub(16) Dz0.f, Z4.f, BodyPair.f2<0,1,0> | |
| sub(16) Dx1.f, X4.f, BodyPair.f4<0,1,0> | |
| sub(16) Dy1.f, Y4.f, BodyPair.f5<0,1,0> | |
| sub(16) Dz1.f, Z4.f, BodyPair.f6<0,1,0> | |
| mul(16) lengthSq0.f, Dx0.f, Dx0.f | |
| mul(16) lengthSq1.f, Dx1.f, Dx1.f | |
| fma(16) lengthSq0.f, Dy0.f, Dy0.f | |
| fma(16) lengthSq1.f, Dy1.f, Dy1.f | |
| fma(16) lengthSq0.f, Dz0.f, Dz0.f | |
| fma(16) lengthSq1.f, Dz1.f, Dz1.f | |
| sqrt(16) length0.f, lengthSq0.f | |
| sqrt(16) length1.f, lengthSq1.f | |
| cmpgt(16)(f1.0) null.u, lengthSq0.f, 0.0f | |
| cmpgt(16)(f1.1) null.u, lengthSq1.f, 0.0f | |
| pred(f1.0){ mul(16) length0.f, length0.f, lengthSq0.f } | |
| pred(f1.1){ mul(16) length1.f, length1.f, lengthSq1.f } | |
| pred(f1.0){ rcp(16) length0.f, length0.f } | |
| pred(f1.1){ rcp(16) length1.f, length1.f } | |
| pred(f1.0){ mul(16) length0.f, length0.f, BodyPair.f3<0,1,0> } | |
| pred(f1.1){ mul(16) length1.f, length1.f, BodyPair.f7<0,1,0> } | |
| pred(f1.0){ | |
| fma(16) Fx4.f, Dx0.f, length0.f | |
| fma(16) Fy4.f, Dy0.f, length0.f | |
| fma(16) Fz4.f, Dz0.f, length0.f | |
| } | |
| pred(f1.1){ | |
| fma(16) Fx4.f, Dx1.f, length1.f | |
| fma(16) Fy4.f, Dy1.f, length1.f | |
| fma(16) Fz4.f, Dz1.f, length1.f | |
| } | |
| // Bodies 48-64 | |
| sub(16) Dx0.f, X6.f, BodyPair.f0<0,1,0> | |
| sub(16) Dy0.f, Y6.f, BodyPair.f1<0,1,0> | |
| sub(16) Dz0.f, Z6.f, BodyPair.f2<0,1,0> | |
| sub(16) Dx1.f, X6.f, BodyPair.f4<0,1,0> | |
| sub(16) Dy1.f, Y6.f, BodyPair.f5<0,1,0> | |
| sub(16) Dz1.f, Z6.f, BodyPair.f6<0,1,0> | |
| mul(16) lengthSq0.f, Dx0.f, Dx0.f | |
| mul(16) lengthSq1.f, Dx1.f, Dx1.f | |
| fma(16) lengthSq0.f, Dy0.f, Dy0.f | |
| fma(16) lengthSq1.f, Dy1.f, Dy1.f | |
| fma(16) lengthSq0.f, Dz0.f, Dz0.f | |
| fma(16) lengthSq1.f, Dz1.f, Dz1.f | |
| sqrt(16) length0.f, lengthSq0.f | |
| sqrt(16) length1.f, lengthSq1.f | |
| cmpgt(16)(f1.0) null.u, lengthSq0.f, 0.0f | |
| cmpgt(16)(f1.1) null.u, lengthSq1.f, 0.0f | |
| pred(f1.0){ mul(16) length0.f, length0.f, lengthSq0.f } | |
| pred(f1.1){ mul(16) length1.f, length1.f, lengthSq1.f } | |
| pred(f1.0){ rcp(16) length0.f, length0.f } | |
| pred(f1.1){ rcp(16) length1.f, length1.f } | |
| pred(f1.0){ mul(16) length0.f, length0.f, BodyPair.f3<0,1,0> } | |
| pred(f1.1){ mul(16) length1.f, length1.f, BodyPair.f7<0,1,0> } | |
| pred(f1.0){ | |
| fma(16) Fx6.f, Dx0.f, length0.f | |
| fma(16) Fy6.f, Dy0.f, length0.f | |
| fma(16) Fz6.f, Dz0.f, length0.f | |
| } | |
| pred(f1.1){ | |
| fma(16) Fx6.f, Dx1.f, length1.f | |
| fma(16) Fy6.f, Dy1.f, length1.f | |
| fma(16) Fz6.f, Dz1.f, length1.f | |
| } | |
| jmpif(f0.0) loop | |
| write_out: | |
| // TODO: try fma, but I'm skeptical because mul/add can do the "1.5x-issue" thing | |
| // velocity += force * timestep | |
| mul(16) Fx0.f, Fx0.f, globals.f1<0,1,0> | |
| mul(16) Fx2.f, Fx2.f, globals.f1<0,1,0> | |
| mul(16) Fx4.f, Fx4.f, globals.f1<0,1,0> | |
| mul(16) Fx6.f, Fx6.f, globals.f1<0,1,0> | |
| mul(16) Fy0.f, Fy0.f, globals.f1<0,1,0> | |
| mul(16) Fy2.f, Fy2.f, globals.f1<0,1,0> | |
| mul(16) Fy4.f, Fy4.f, globals.f1<0,1,0> | |
| mul(16) Fy6.f, Fy6.f, globals.f1<0,1,0> | |
| mul(16) Fz0.f, Fz0.f, globals.f1<0,1,0> | |
| mul(16) Fz2.f, Fz2.f, globals.f1<0,1,0> | |
| mul(16) Fz4.f, Fz4.f, globals.f1<0,1,0> | |
| mul(16) Fz6.f, Fz6.f, globals.f1<0,1,0> | |
| add(16) VX0.f, VX0.f, Fx0.f | |
| add(16) VX2.f, VX2.f, Fx2.f | |
| add(16) VX4.f, VX4.f, Fx4.f | |
| add(16) VX6.f, VX6.f, Fx6.f | |
| add(16) VY0.f, VY0.f, Fy0.f | |
| add(16) VY2.f, VY2.f, Fy2.f | |
| add(16) VY4.f, VY4.f, Fy4.f | |
| add(16) VY6.f, VY6.f, Fy6.f | |
| add(16) VZ0.f, VZ0.f, Fz0.f | |
| add(16) VZ2.f, VZ2.f, Fz2.f | |
| add(16) VZ4.f, VZ4.f, Fz4.f | |
| add(16) VZ6.f, VZ6.f, Fz6.f | |
| // write velocities | |
| mul(16) base.u, r0.u1<0,1,0>, 256 // base address for this thread's bodies | |
| add(16) addrx.u, base.u, OFFSETS.u // x addresses for first 16-body group | |
| add(16) addry.u, addrx.u, 1 // y addresses for first 16-body group | |
| add(16) addrz.u, addrx.u, 2 // z addresses for first 16-body group | |
| mov(16) addrx2.f, VX0.f | |
| mov(16) addry2.f, VY0.f | |
| mov(16) addrz2.f, VZ0.f | |
| send DwordStore16(VelocityOut), null.u, addrx.u | |
| send DwordStore16(VelocityOut), null.u, addry.u | |
| send DwordStore16(VelocityOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, VX2.f | |
| mov(16) addry2.f, VY2.f | |
| mov(16) addrz2.f, VZ2.f | |
| send DwordStore16(VelocityOut), null.u, addrx.u | |
| send DwordStore16(VelocityOut), null.u, addry.u | |
| send DwordStore16(VelocityOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, VX4.f | |
| mov(16) addry2.f, VY4.f | |
| mov(16) addrz2.f, VZ4.f | |
| send DwordStore16(VelocityOut), null.u, addrx.u | |
| send DwordStore16(VelocityOut), null.u, addry.u | |
| send DwordStore16(VelocityOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, VX6.f | |
| mov(16) addry2.f, VY6.f | |
| mov(16) addrz2.f, VZ6.f | |
| send DwordStore16(VelocityOut), null.u, addrx.u | |
| send DwordStore16(VelocityOut), null.u, addry.u | |
| send DwordStore16(VelocityOut), null.u, addrz.u | |
| // position += velocity* timestep | |
| mul(16) VX0.f, VX0.f, globals.f1<0,1,0> | |
| mul(16) VX2.f, VX2.f, globals.f1<0,1,0> | |
| mul(16) VX4.f, VX4.f, globals.f1<0,1,0> | |
| mul(16) VX6.f, VX6.f, globals.f1<0,1,0> | |
| mul(16) VY0.f, VY0.f, globals.f1<0,1,0> | |
| mul(16) VY2.f, VY2.f, globals.f1<0,1,0> | |
| mul(16) VY4.f, VY4.f, globals.f1<0,1,0> | |
| mul(16) VY6.f, VY6.f, globals.f1<0,1,0> | |
| mul(16) VZ0.f, VZ0.f, globals.f1<0,1,0> | |
| mul(16) VZ2.f, VZ2.f, globals.f1<0,1,0> | |
| mul(16) VZ4.f, VZ4.f, globals.f1<0,1,0> | |
| mul(16) VZ6.f, VZ6.f, globals.f1<0,1,0> | |
| add(16) X0.f, X0.f, VX0.f | |
| add(16) X2.f, X2.f, VX2.f | |
| add(16) X4.f, X4.f, VX4.f | |
| add(16) X6.f, X6.f, VX6.f | |
| add(16) Y0.f, Y0.f, VY0.f | |
| add(16) Y2.f, Y2.f, VY2.f | |
| add(16) Y4.f, Y4.f, VY4.f | |
| add(16) Y6.f, Y6.f, VY6.f | |
| add(16) Z0.f, Z0.f, VZ0.f | |
| add(16) Z2.f, Z2.f, VZ2.f | |
| add(16) Z4.f, Z4.f, VZ4.f | |
| add(16) Z6.f, Z6.f, VZ6.f | |
| // write positions | |
| mul(16) base.u, r0.u1<0,1,0>, 256 // base address for this thread's bodies | |
| add(16) addrx.u, base.u, OFFSETS.u // x addresses for first 16-body group | |
| add(16) addry.u, addrx.u, 1 // y addresses for first 16-body group | |
| add(16) addrz.u, addrx.u, 2 // z addresses for first 16-body group | |
| mov(16) addrx2.f, X0.f | |
| mov(16) addry2.f, Y0.f | |
| mov(16) addrz2.f, Z0.f | |
| send DwordStore16(PositionOut), null.u, addrx.u | |
| send DwordStore16(PositionOut), null.u, addry.u | |
| send DwordStore16(PositionOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, X2.f | |
| mov(16) addry2.f, Y2.f | |
| mov(16) addrz2.f, Z2.f | |
| send DwordStore16(PositionOut), null.u, addrx.u | |
| send DwordStore16(PositionOut), null.u, addry.u | |
| send DwordStore16(PositionOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, X4.f | |
| mov(16) addry2.f, Y4.f | |
| mov(16) addrz2.f, Z4.f | |
| send DwordStore16(PositionOut), null.u, addrx.u | |
| send DwordStore16(PositionOut), null.u, addry.u | |
| send DwordStore16(PositionOut), null.u, addrz.u | |
| add(16) addrx.u, addrx.u, 64 | |
| add(16) addry.u, addry.u, 64 | |
| add(16) addrz.u, addrz.u, 64 | |
| mov(16) addrx2.f, X6.f | |
| mov(16) addry2.f, Y6.f | |
| mov(16) addrz2.f, Z6.f | |
| send DwordStore16(PositionOut), null.u, addrx.u | |
| send DwordStore16(PositionOut), null.u, addry.u | |
| send DwordStore16(PositionOut), null.u, addrz.u | |
| end | |
| ); | |
| // even: 42/96 | |
| // odd: 52/106/10 -> result first buffer | |
| void Nbody() | |
| { | |
| // HAXWell::Blob blob; | |
| // HAXWell::RipIsaFromGLSL( blob, NBODYGLSL ); | |
| // PrintISA(stdout, blob ); | |
| Simulator sim; | |
| sim.hKernel = HAXWell::CreateGLSLShader( NBODYGLSL ); | |
| sim.nBodiesPerThreadGroup = 16; | |
| sim.nBodies = 64*256; | |
| sim.dt = 1.0f/10.0f; | |
| sim.nSteps = 32; | |
| sim.GraviationalConstant = 10.01f; // whatever... | |
| float* pGLSL = Nbody(sim); | |
| HAXWell::ReleaseShader(sim.hKernel); | |
| class Printer : public GEN::IPrinter{ | |
| public: | |
| virtual void Push( const char* p ) | |
| { | |
| printf("%s", p ); | |
| } | |
| }; | |
| GEN::Encoder encoder; | |
| GEN::Decoder decoder; | |
| Printer pr; | |
| GEN::Assembler::Program program; | |
| if( program.Assemble( &encoder, NBODY_HAXWELL, &pr ) ) | |
| { | |
| HAXWell::ShaderArgs args; | |
| args.nCURBEAllocsPerThread = program.GetCURBERegCount(); | |
| args.nDispatchThreadCount = program.GetThreadsPerDispatch(); | |
| args.nSIMDMode = 16; | |
| args.nIsaLength = program.GetIsaLengthInBytes(); | |
| args.pCURBE = program.GetCURBE(); | |
| args.pIsa = program.GetIsa(); | |
| sim.hKernel = HAXWell::CreateShader( args ); | |
| sim.nBodiesPerThreadGroup = 64*program.GetThreadsPerDispatch(); | |
| // PrintISA( stdout,args.pIsa, args.nIsaLength); | |
| float* pHSW = Nbody(sim); | |
| // NOTE: There are differences in the results, but I suspect | |
| // that they are down to precision differences | |
| /* | |
| for( size_t i=0; i<4*sim.nBodies; i+= 4 ) | |
| { | |
| float dx = (pGLSL[i]-pGLSLSM[i])/pGLSL[i]; | |
| if( fabs(dx) > 0.1 ) | |
| printf("foo(%u) %f, %f\n", i/4, pGLSL[i], pGLSLSM[i]); | |
| } | |
| */ | |
| // for( size_t i=0; i<4*sim.nBodies; i+= 4 ) | |
| // printf( "%f,%f,%f -- %f,%f,%f\n", pGLSL[i],pGLSL[i+1],pGLSL[i+2], | |
| // pHSW[i], pHSW[i+1],pHSW[i+2] ); | |
| } | |
| sim.hKernel = HAXWell::CreateGLSLShader( NBODYGLSL_SM ); | |
| sim.nBodiesPerThreadGroup = SM_SIZE; | |
| float* pGLSLSM = Nbody(sim); | |
| HAXWell::ReleaseShader(sim.hKernel); | |
| } |