From 1e43b180e34af9499fad46e4c34ddf0d154f286f Mon Sep 17 00:00:00 2001
From: John Pennycook
Date: Fri, 26 Feb 2021 11:11:41 -0500
Subject: [PATCH 1/2] [SYCL][CUDA] Add initial support for FP atomics

Generates native FP32 and FP64 atomics with the following flags:
-DSYCL_USE_NATIVE_FP_ATOMICS
-Xsycl-target-backend --cuda-gpu-arch=sm_60

Several known issues:
- __spirv_AtomicFAddEXT is not inlined, so order and scope do not propagate
- Generated PTX does not respect order or scope (defaults to relaxed)
- Fatal error when compiling with --cuda-gpu-arch <= sm_50

A complete implementation of this feature requires libspirv to be made aware
of __nvvm_reflect, so that NVVMReflect can be used to branch on __CUDA_ARCH.

Signed-off-by: John Pennycook
---
 libclc/ptx-nvidiacl/libspirv/SOURCES          |   2 +
 .../atomicfaddext.cl                          | 131 ++++++++++++++++++
 .../faddext_helpers.ll                        | 125 +++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
 create mode 100644 libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll

diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
index c3abf64f652b6..0fae31ff572c0 100644
--- a/libclc/ptx-nvidiacl/libspirv/SOURCES
+++ b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -85,3 +85,5 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
+SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
+SPV_EXT_shader_atomic_float_add/faddext_helpers.ll
diff --git a/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
new file mode 100644
index 0000000000000..a26abeb91f892
--- /dev/null
+++ b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
@@ -0,0 +1,131 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// CLC helpers +float __clc__atomic_fetch_add_float_global_relaxed( + __global float *, + float) __asm("__clc__atomic_fetch_add_float_global_relaxed"); +float __clc__atomic_fetch_add_float_global_acquire( + __global float *, + float) __asm("__clc__atomic_fetch_add_float_global_acquire"); +float __clc__atomic_fetch_add_float_global_release( + __global float *, + float) __asm("__clc__atomic_fetch_add_float_global_release"); +float __clc__atomic_fetch_add_float_global_acq_rel( + __global float *, + float) __asm("__clc__atomic_fetch_add_float_global_acq_rel"); +float __clc__atomic_fetch_add_float_global_seq_cst( + __global float *, + float) __asm("__clc__atomic_fetch_add_float_global_seq_cst"); +double __clc__atomic_fetch_add_double_global_relaxed( + __global double *, + double) __asm("__clc__atomic_fetch_add_double_global_relaxed"); +double __clc__atomic_fetch_add_double_global_acquire( + __global double *, + double) __asm("__clc__atomic_fetch_add_double_global_acquire"); +double __clc__atomic_fetch_add_double_global_release( + __global double *, + double) __asm("__clc__atomic_fetch_add_double_global_release"); +double __clc__atomic_fetch_add_double_global_acq_rel( + __global double *, + double) __asm("__clc__atomic_fetch_add_double_global_acq_rel"); +double __clc__atomic_fetch_add_double_global_seq_cst( + __global double *, + double) __asm("__clc__atomic_fetch_add_double_global_seq_cst"); +float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_relaxed"); +float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_acquire"); +float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_release"); +float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_acq_rel"); +float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_seq_cst"); +double __clc__atomic_fetch_add_double_local_relaxed( + __local double *, + double) __asm("__clc__atomic_fetch_add_double_local_relaxed"); +double __clc__atomic_fetch_add_double_local_acquire( + __local double *, + double) __asm("__clc__atomic_fetch_add_double_local_acquire"); +double __clc__atomic_fetch_add_double_local_release( + __local double *, + double) __asm("__clc__atomic_fetch_add_double_local_release"); +double __clc__atomic_fetch_add_double_local_acq_rel( + __local double *, + double) __asm("__clc__atomic_fetch_add_double_local_acq_rel"); +double __clc__atomic_fetch_add_double_local_seq_cst( + __local double *, + double) __asm("__clc__atomic_fetch_add_double_local_seq_cst"); + +// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60 +// TODO: Error if scope is not relaxed and __CUDA_ARCH <= sm_60 +#define __CLC_ATOMICFADDEXT(TYPE, AS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT( \ + __##AS TYPE *pointer, unsigned int scope, unsigned int semantics, \ + TYPE value) { \ + /* Semantics mask may include memory order, storage class and other info \ + Memory order is stored in the lowest 5 bits */ \ + unsigned int order = semantics & 0x1F; \ + \ + switch (order) { \ + case None: \ + return 
__clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value); \ + case Acquire: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value); \ + case Release: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value); \ + case AcquireRelease: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value); \ + default: \ + /* Sequentially consistent atomics should never be incorrect */ \ + case SequentiallyConsistent: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value); \ + } \ + } +__CLC_ATOMICFADDEXT(float, global) +__CLC_ATOMICFADDEXT(double, global) +__CLC_ATOMICFADDEXT(float, local) +__CLC_ATOMICFADDEXT(double, local) +#undef __CLC_ATOMICFADDEXT + +// TODO: Stop manually mangling these names. +_CLC_DECL float +_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( + __global float *pointer, unsigned int scope, unsigned int semantics, + float value) { + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} + +_CLC_DECL double +_Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd( + __global double *pointer, unsigned int scope, unsigned int semantics, + double value) { + // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50 + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} + +_CLC_DECL float +_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( + __local float *pointer, unsigned int scope, unsigned int semantics, + float value) { + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} + +_CLC_DECL double +_Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd( + __local double *pointer, unsigned int scope, unsigned int semantics, + double value) { + // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50 + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} diff --git a/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll new file mode 100644 index 0000000000000..ea0245d57e59c --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll @@ -0,0 +1,125 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif + +define float @__clc__atomic_fetch_add_float_global_relaxed(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value monotonic + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_acquire(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acquire + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_release(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value release + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_acq_rel(float addrspace(1)* 
nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acq_rel + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_seq_cst(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value seq_cst + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_relaxed(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value monotonic + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_acquire(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acquire + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_release(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value release + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_acq_rel(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acq_rel + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_seq_cst(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value seq_cst + ret float %0 +} + +define double @__clc__atomic_fetch_add_double_global_relaxed(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value monotonic + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_acquire(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acquire + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_release(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value release + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_acq_rel(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acq_rel + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_seq_cst(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value seq_cst + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_local_relaxed(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value monotonic + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_local_acquire(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acquire + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_local_release(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value release + 
ret double %0 +} + +define double @__clc__atomic_fetch_add_double_local_acq_rel(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acq_rel + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_local_seq_cst(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value seq_cst + ret double %0 +} From 611e25ed97298a1bc8bdabfa0338db1811e3f8e6 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 3 Mar 2021 12:50:21 -0500 Subject: [PATCH 2/2] [SYCL][CUDA][NFC] Guard FP64 atomics with ifdef Signed-off-by: John Pennycook --- .../atomicfaddext.cl | 115 +++++++++--------- 1 file changed, 60 insertions(+), 55 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl index a26abeb91f892..1e165f2e87ec7 100644 --- a/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl +++ b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl @@ -9,9 +9,33 @@ #include #include -#pragma OPENCL EXTENSION cl_khr_fp64 : enable +// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60 +// TODO: Error if scope is not relaxed and __CUDA_ARCH <= sm_60 +#define __CLC_ATOMICFADDEXT(TYPE, AS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT( \ + __##AS TYPE *pointer, unsigned int scope, unsigned int semantics, \ + TYPE value) { \ + /* Semantics mask may include memory order, storage class and other info \ + Memory order is stored in the lowest 5 bits */ \ + unsigned int order = semantics & 0x1F; \ + \ + switch (order) { \ + case None: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value); \ + case Acquire: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value); \ + case Release: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value); \ + case AcquireRelease: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value); \ + default: \ + /* Sequentially consistent atomics should never be incorrect */ \ + case SequentiallyConsistent: \ + return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value); \ + } \ + } -// CLC helpers +// FP32 atomics - must work without additional extensions float __clc__atomic_fetch_add_float_global_relaxed( __global float *, float) __asm("__clc__atomic_fetch_add_float_global_relaxed"); @@ -27,6 +51,37 @@ float __clc__atomic_fetch_add_float_global_acq_rel( float __clc__atomic_fetch_add_float_global_seq_cst( __global float *, float) __asm("__clc__atomic_fetch_add_float_global_seq_cst"); +float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_relaxed"); +float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_acquire"); +float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_release"); +float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_acq_rel"); +float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm( + "__clc__atomic_fetch_add_float_local_seq_cst"); + +__CLC_ATOMICFADDEXT(float, global) +__CLC_ATOMICFADDEXT(float, local) + +_CLC_DECL 
float +_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( + __global float *pointer, unsigned int scope, unsigned int semantics, + float value) { + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} + +_CLC_DECL float +_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( + __local float *pointer, unsigned int scope, unsigned int semantics, + float value) { + return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); +} + +// FP64 atomics - require cl_khr_fp64 extension +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable double __clc__atomic_fetch_add_double_global_relaxed( __global double *, double) __asm("__clc__atomic_fetch_add_double_global_relaxed"); @@ -42,16 +97,6 @@ double __clc__atomic_fetch_add_double_global_acq_rel( double __clc__atomic_fetch_add_double_global_seq_cst( __global double *, double) __asm("__clc__atomic_fetch_add_double_global_seq_cst"); -float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm( - "__clc__atomic_fetch_add_float_local_relaxed"); -float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm( - "__clc__atomic_fetch_add_float_local_acquire"); -float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm( - "__clc__atomic_fetch_add_float_local_release"); -float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm( - "__clc__atomic_fetch_add_float_local_acq_rel"); -float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm( - "__clc__atomic_fetch_add_float_local_seq_cst"); double __clc__atomic_fetch_add_double_local_relaxed( __local double *, double) __asm("__clc__atomic_fetch_add_double_local_relaxed"); @@ -68,44 +113,8 @@ double __clc__atomic_fetch_add_double_local_seq_cst( __local double *, double) __asm("__clc__atomic_fetch_add_double_local_seq_cst"); -// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60 -// TODO: Error if scope is not relaxed and __CUDA_ARCH <= sm_60 -#define __CLC_ATOMICFADDEXT(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT( \ - __##AS TYPE *pointer, unsigned int scope, unsigned int semantics, \ - TYPE value) { \ - /* Semantics mask may include memory order, storage class and other info \ - Memory order is stored in the lowest 5 bits */ \ - unsigned int order = semantics & 0x1F; \ - \ - switch (order) { \ - case None: \ - return __clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value); \ - case Acquire: \ - return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value); \ - case Release: \ - return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value); \ - case AcquireRelease: \ - return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value); \ - default: \ - /* Sequentially consistent atomics should never be incorrect */ \ - case SequentiallyConsistent: \ - return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value); \ - } \ - } -__CLC_ATOMICFADDEXT(float, global) __CLC_ATOMICFADDEXT(double, global) -__CLC_ATOMICFADDEXT(float, local) __CLC_ATOMICFADDEXT(double, local) -#undef __CLC_ATOMICFADDEXT - -// TODO: Stop manually mangling these names. 
-_CLC_DECL float -_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( - __global float *pointer, unsigned int scope, unsigned int semantics, - float value) { - return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); -} _CLC_DECL double _Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd( @@ -115,13 +124,6 @@ _Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4Fla return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); } -_CLC_DECL float -_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf( - __local float *pointer, unsigned int scope, unsigned int semantics, - float value) { - return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); -} - _CLC_DECL double _Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd( __local double *pointer, unsigned int scope, unsigned int semantics, @@ -129,3 +131,6 @@ _Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4Fla // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50 return __spirv_AtomicFAddEXT(pointer, scope, semantics, value); } +#endif // cl_khr_fp64 + +#undef __CLC_ATOMICFADDEXT
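
Usage note (editor's addition, not part of the patch series): a minimal sketch
of user code that should exercise the new __spirv_AtomicFAddEXT entry points
when built with the flags from the commit message. The file name, kernel body,
and build line are illustrative assumptions, not part of the patches; the
atomic_ref spelling and header also vary between SYCL versions (e.g.
sycl::ONEAPI::atomic_ref with <CL/sycl.hpp> in contemporary intel/llvm,
sycl::atomic_ref with <sycl/sycl.hpp> in SYCL 2020).

    // fp_atomic_test.cpp -- illustrative only, assumes SYCL 2020 spellings.
    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue q;
      float *sum = sycl::malloc_shared<float>(1, q);
      *sum = 0.0f;

      q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1>) {
         // Relaxed, device-scope FP32 fetch_add; with
         // -DSYCL_USE_NATIVE_FP_ATOMICS and sm_60 or newer this is expected
         // to reach the native path provided by __spirv_AtomicFAddEXT.
         sycl::atomic_ref<float, sycl::memory_order::relaxed,
                          sycl::memory_scope::device,
                          sycl::access::address_space::global_space>
             ref(*sum);
         ref.fetch_add(1.0f);
       }).wait();

      sycl::free(sum, q);
      return 0;
    }

Possible build line (the exact SYCL target triple depends on the toolchain
version):

    clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
      -DSYCL_USE_NATIVE_FP_ATOMICS \
      -Xsycl-target-backend --cuda-gpu-arch=sm_60 fp_atomic_test.cpp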