108-line SPIR-V kernel + 83-line OpenCL host driver.
; Minimal reproducer for IGC SubgroupSize 32 SIMD emulation bug
; OpAtomicIAdd fires 4x (once per SIMD8 iteration) instead of once
; Expected: force[0] scaled = 0.5, Actual: force[0] scaled = 2.0
OpCapability Addresses
OpCapability Kernel
OpCapability Float64
OpCapability Int64
OpCapability Int64Atomics
OpCapability Int8
OpCapability SubgroupDispatch
OpCapability SubgroupShuffleINTEL
OpExtension "SPV_INTEL_subgroups"
%1 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %fn "test" %gid
OpExecutionMode %fn SubgroupSize 32
OpDecorate %gid BuiltIn LocalInvocationId
%ulong = OpTypeInt 64 0
%uint = OpTypeInt 32 0
%uchar = OpTypeInt 8 0
%double = OpTypeFloat 64
%double_0 = OpConstant %double 0
%ulong_8 = OpConstant %ulong 8
%ulong_16 = OpConstant %ulong 16
%ulong_24 = OpConstant %ulong 24
%ulong_32 = OpConstant %ulong 32
%uint_31 = OpConstant %uint 31
%uint_32 = OpConstant %uint 32
%uint_0 = OpConstant %uint 0
%uint_1 = OpConstant %uint 1
%uint_528 = OpConstant %uint 528
%double_scale = OpConstant %double 4294967296
%void = OpTypeVoid
%bool = OpTypeBool
%v3ulong = OpTypeVector %ulong 3
%ptr_in_v3 = OpTypePointer Input %v3ulong
%ptr_cw_ulong = OpTypePointer CrossWorkgroup %ulong
%ptr_cw_uchar = OpTypePointer CrossWorkgroup %uchar
%ptr_cw_double = OpTypePointer CrossWorkgroup %double
%fntype = OpTypeFunction %void %ptr_cw_uchar %ptr_cw_uchar %ptr_cw_uchar
%gid = OpVariable %ptr_in_v3 Input
%fn = OpFunction %void None %fntype
%p_force = OpFunctionParameter %ptr_cw_uchar
%p_posq = OpFunctionParameter %ptr_cw_uchar
%p_excl = OpFunctionParameter %ptr_cw_uchar
%entry = OpLabel
%fc_cw = OpBitcast %ptr_cw_ulong %p_force
%gidv = OpLoad %v3ulong %gid Aligned 32
%gid0 = OpCompositeExtract %ulong %gidv 0
%tid32 = OpUConvert %uint %gid0
%tgx = OpBitwiseAnd %uint %tid32 %uint_31
%tgx64 = OpUConvert %ulong %tgx
%boff = OpIMul %ulong %tgx64 %ulong_32
%pq_uc = OpInBoundsPtrAccessChain %ptr_cw_uchar %p_posq %boff
%pq_dp = OpBitcast %ptr_cw_double %pq_uc
%pos_x = OpLoad %double %pq_dp Aligned 32
%pq_y_p = OpInBoundsPtrAccessChain %ptr_cw_uchar %pq_uc %ulong_8
%pq_y_d = OpBitcast %ptr_cw_double %pq_y_p
%pos_y = OpLoad %double %pq_y_d Aligned 8
%pq_z_p = OpInBoundsPtrAccessChain %ptr_cw_uchar %pq_uc %ulong_16
%pq_z_d = OpBitcast %ptr_cw_double %pq_z_p
%pos_z = OpLoad %double %pq_z_d Aligned 16
%pq_w_p = OpInBoundsPtrAccessChain %ptr_cw_uchar %pq_uc %ulong_24
%pq_w_d = OpBitcast %ptr_cw_double %pq_w_p
%pos_w = OpLoad %double %pq_w_d Aligned 8
OpBranch %loop_hdr
%loop_hdr = OpLabel
%j = OpPhi %uint %uint_0 %entry %jinc %if_end
%f_x = OpPhi %double %double_0 %entry %nf_x %if_end
%cond = OpULessThan %bool %j %uint_32
OpBranchConditional %cond %loop_body %loop_end
%loop_body = OpLabel
%sh_x = OpSubgroupShuffleINTEL %double %pos_x %j
%sh_y = OpSubgroupShuffleINTEL %double %pos_y %j
%sh_z = OpSubgroupShuffleINTEL %double %pos_z %j
%sh_w = OpSubgroupShuffleINTEL %double %pos_w %j
%dx = OpFSub %double %sh_x %pos_x
%dy = OpFSub %double %sh_y %pos_y
%dz = OpFSub %double %sh_z %pos_z
%dx2 = OpFMul %double %dx %dx
%dy2 = OpFMul %double %dy %dy
%dxy = OpFAdd %double %dx2 %dy2
%dz2 = OpFMul %double %dz %dz
%r2 = OpFAdd %double %dxy %dz2
%invR = OpExtInst %double %1 rsqrt %r2
%tobool = OpINotEqual %bool %j %tgx
OpBranchConditional %tobool %if_then %if_end
%if_then = OpLabel
%qq = OpFMul %double %pos_w %sh_w
%qi1 = OpFMul %double %qq %invR
%dEdR = OpFMul %double %qi1 %invR
OpBranch %if_end
%if_end = OpLabel
%dEdR_f = OpPhi %double %dEdR %if_then %double_0 %loop_body
%fmul = OpFMul %double %dx %dEdR_f
%nf_x = OpFSub %double %f_x %fmul
%jinc = OpIAdd %uint %j %uint_1
OpBranch %loop_hdr
%loop_end = OpLabel
%fp_fin = OpInBoundsPtrAccessChain %ptr_cw_ulong %fc_cw %tgx64
%sc_fx = OpFMul %double %f_x %double_scale
%fx_i = OpConvertFToS %ulong %sc_fx
%r = OpAtomicIAdd %ulong %fp_fin %uint_1 %uint_528 %fx_i
OpReturn
OpFunctionEnd
// Build: g++ -std=c++17 -O2 -o host_fixed host_fixed.cpp -lOpenCL
#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <fstream>
#define CK(c) do{cl_int e=(c);if(e){fprintf(stderr,"CL err %d line %d\n",e,__LINE__);exit(1);}}while(0)
int main(int argc, char** argv) {
if(argc<2){fprintf(stderr,"Usage: %s <spv> [platform]\n",argv[0]);return 1;}
int pi=argc>2?atoi(argv[2]):0;
cl_uint np; clGetPlatformIDs(0,0,&np);
std::vector<cl_platform_id> ps(np); clGetPlatformIDs(np,ps.data(),0);
char pn[256]; clGetPlatformInfo(ps[pi],CL_PLATFORM_NAME,256,pn,0);
cl_uint nd; clGetDeviceIDs(ps[pi],CL_DEVICE_TYPE_ALL,0,0,&nd);
std::vector<cl_device_id> ds(nd); clGetDeviceIDs(ps[pi],CL_DEVICE_TYPE_ALL,nd,ds.data(),0);
char dn[256]; clGetDeviceInfo(ds[0],CL_DEVICE_NAME,256,dn,0);
printf("Platform: %s\nDevice: %s\n",pn,dn);
cl_int e;
cl_context ctx=clCreateContext(0,1,&ds[0],0,0,&e); CK(e);
cl_command_queue q=clCreateCommandQueueWithProperties(ctx,ds[0],0,&e); CK(e);
std::ifstream f(argv[1],std::ios::binary|std::ios::ate);
if(!f){fprintf(stderr,"Cannot open %s\n",argv[1]);return 1;}
size_t sz=f.tellg(); f.seekg(0);
std::vector<char> spv(sz); f.read(spv.data(),sz);
cl_program prog=clCreateProgramWithIL(ctx,spv.data(),spv.size(),&e);
if(e){fprintf(stderr,"clCreateProgramWithIL: %d\n",e);return 1;}
e=clBuildProgram(prog,1,&ds[0],0,0,0);
if(e){size_t ls;clGetProgramBuildInfo(prog,ds[0],CL_PROGRAM_BUILD_LOG,0,0,&ls);
std::vector<char> log(ls);clGetProgramBuildInfo(prog,ds[0],CL_PROGRAM_BUILD_LOG,ls,log.data(),0);
fprintf(stderr,"Build log:\n%s\n",log.data());return 1;}
cl_kernel k=clCreateKernel(prog,"_Z4testPyPK15HIP_vector_typeIdLj4EEPKj",&e);
if(e) k=clCreateKernel(prog,"test",&e);
CK(e);
struct double4{double x,y,z,w;};
const int P=32;
double4 hp[P];
for(int i=0;i<P;i++) hp[i]={0,0,0,0};
hp[0]={0,0,0,1}; // atom 0: position (0,0,0), charge +1
hp[1]={2,0,0,-1}; // atom 1: position (2,0,0), charge -1
unsigned he[P]={}; // unused by minimal kernel
unsigned long long hf[96]={};
cl_mem df=clCreateBuffer(ctx,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(hf),hf,&e); CK(e);
cl_mem dp=clCreateBuffer(ctx,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,P*sizeof(double4),hp,&e); CK(e);
cl_mem dx=clCreateBuffer(ctx,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,P*4,he,&e); CK(e);
CK(clSetKernelArg(k,0,sizeof(df),&df));
CK(clSetKernelArg(k,1,sizeof(dp),&dp));
CK(clSetKernelArg(k,2,sizeof(dx),&dx));
size_t gs=32,ls=32;
CK(clEnqueueNDRangeKernel(q,k,1,0,&gs,&ls,0,0,0));
CK(clFinish(q));
CK(clEnqueueReadBuffer(q,df,1,0,sizeof(hf),hf,0,0,0));
double sc=1.0/(double)0x100000000LL;
printf("force[0] raw = %lld (expected 2147483648)\n",(long long)hf[0]);
printf("force[0] scaled = %f (expected 0.5)\n",sc*(long long)hf[0]);
double fx=sc*(long long)hf[0];
int pass = (fx > 0.2 && fx < 0.8);
printf("%s\n",pass?"PASS":"FAIL");
clReleaseMemObject(df);clReleaseMemObject(dp);clReleaseMemObject(dx);
clReleaseKernel(k);clReleaseProgram(prog);
clReleaseCommandQueue(q);clReleaseContext(ctx);
return pass?0:1;
}
Platform: Intel(R) OpenCL Graphics
Device: Intel(R) Arc(TM) A770 Graphics
force[0] raw = 2147483648 (expected 2147483648)
force[0] scaled = 0.500000 (expected 0.5)
PASS
Platform: Intel(R) OpenCL Graphics
Device: Intel(R) Arc(TM) A770 Graphics
force[0] raw = 8589934592 (expected 2147483648)
force[0] scaled = 2.000000 (expected 0.5)
FAIL
8589934592 / 2147483648 = 4.0 (exactly 4x the expected value).
The 4x multiplier matches 32 / 8 = 4 SIMD8 iterations needed to emulate SubgroupSize 32 on hardware with native SIMD8 for FP64. This suggests OpAtomicIAdd executes once per SIMD iteration rather than once per logical work item.
This breaks OpenMM (molecular dynamics toolkit) on Intel GPUs via chipStar (HIP-to-SPIR-V). All double-precision nonbonded force tests produce wrong results because the accumulated forces are multiplied by 4.
Summary
When a SPIR-V kernel uses
OpExecutionMode SubgroupSize 32and contains a loop withOpSubgroupShuffleINTEL(double),rsqrt(OpenCL.std ExtInst), and a conditional branch,OpAtomicIAdd(int64) produces values that are exactly 4x the expected result. This suggests the atomic fires once per SIMD8 hardware iteration instead of once per logical work item.Environment
Reproducer
108-line SPIR-V kernel + 83-line OpenCL host driver.
SPIR-V kernel (minimal_bug.spvasm)
The kernel does a simplified Coulomb force computation:
OpenCL host driver (host_fixed.cpp)
Build and run
Expected output
Actual output
8589934592 / 2147483648 = 4.0 (exactly 4x the expected value).
Analysis
The 4x multiplier matches 32 / 8 = 4 SIMD8 iterations needed to emulate SubgroupSize 32 on hardware with native SIMD8 for FP64. This suggests OpAtomicIAdd executes once per SIMD iteration rather than once per logical work item.
Evidence:
Elements required to trigger the bug (removing any one makes it disappear):
OpExecutionMode SubgroupSize 32OpSubgroupShuffleINTEL(double) in a looprsqrt(OpenCL.std ExtInst) in the loopOpAtomicIAdd(int64) after the loopImpact
This breaks OpenMM (molecular dynamics toolkit) on Intel GPUs via chipStar (HIP-to-SPIR-V). All double-precision nonbonded force tests produce wrong results because the accumulated forces are multiplied by 4.