diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
index c3abf64f652b6..0fae31ff572c0 100644
--- a/libclc/ptx-nvidiacl/libspirv/SOURCES
+++ b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -85,3 +85,5 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
+SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
+SPV_EXT_shader_atomic_float_add/faddext_helpers.ll
diff --git a/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
new file mode 100644
index 0000000000000..1e165f2e87ec7
--- /dev/null
+++ b/libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <spirv/spirv.h>
+#include <spirv/spirv_types.h>
+
+// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60
+// TODO: Error if scope is not relaxed and __CUDA_ARCH <= sm_60
+#define __CLC_ATOMICFADDEXT(TYPE, AS)                                          \
+  _CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT(                           \
+      __##AS TYPE *pointer, unsigned int scope, unsigned int semantics,        \
+      TYPE value) {                                                            \
+    /* The semantics mask may include memory order, storage class and other    \
+       info. The memory order is stored in the lowest 5 bits. */               \
+    unsigned int order = semantics & 0x1F;                                     \
+                                                                               \
+    switch (order) {                                                           \
+    case None:                                                                 \
+      return __clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value);  \
+    case Acquire:                                                              \
+      return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value);  \
+    case Release:                                                              \
+      return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value);  \
+    case AcquireRelease:                                                       \
+      return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value);  \
+    default:                                                                   \
+      /* Sequentially consistent atomics are always a safe fallback. */        \
+    case SequentiallyConsistent:                                               \
+      return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value);  \
+    }                                                                          \
+  }
+
+// FP32 atomics - must work without additional extensions
+float __clc__atomic_fetch_add_float_global_relaxed(
+    __global float *,
+    float) __asm("__clc__atomic_fetch_add_float_global_relaxed");
+float __clc__atomic_fetch_add_float_global_acquire(
+    __global float *,
+    float) __asm("__clc__atomic_fetch_add_float_global_acquire");
+float __clc__atomic_fetch_add_float_global_release(
+    __global float *,
+    float) __asm("__clc__atomic_fetch_add_float_global_release");
+float __clc__atomic_fetch_add_float_global_acq_rel(
+    __global float *,
+    float) __asm("__clc__atomic_fetch_add_float_global_acq_rel");
+float __clc__atomic_fetch_add_float_global_seq_cst(
+    __global float *,
+    float) __asm("__clc__atomic_fetch_add_float_global_seq_cst");
+float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm(
+    "__clc__atomic_fetch_add_float_local_relaxed");
+float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm(
+    "__clc__atomic_fetch_add_float_local_acquire");
+float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm(
+    "__clc__atomic_fetch_add_float_local_release");
+float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm(
+    "__clc__atomic_fetch_add_float_local_acq_rel");
+float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm(
+    "__clc__atomic_fetch_add_float_local_seq_cst");
+
+__CLC_ATOMICFADDEXT(float, global)
+__CLC_ATOMICFADDEXT(float, local)
+
+_CLC_DECL float
+_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
+    __global float *pointer, unsigned int scope, unsigned int semantics,
+    float value) {
+  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
+}
+
+_CLC_DECL float
+_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
+    __local float *pointer, unsigned int scope, unsigned int semantics,
+    float value) {
+  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
+}
+
+// FP64 atomics - require cl_khr_fp64 extension
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+double __clc__atomic_fetch_add_double_global_relaxed(
+    __global double *,
+    double) __asm("__clc__atomic_fetch_add_double_global_relaxed");
+double __clc__atomic_fetch_add_double_global_acquire(
+    __global double *,
+    double) __asm("__clc__atomic_fetch_add_double_global_acquire");
+double __clc__atomic_fetch_add_double_global_release(
+    __global double *,
+    double) __asm("__clc__atomic_fetch_add_double_global_release");
+double __clc__atomic_fetch_add_double_global_acq_rel(
+    __global double *,
+    double) __asm("__clc__atomic_fetch_add_double_global_acq_rel");
+double __clc__atomic_fetch_add_double_global_seq_cst(
+    __global double *,
+    double) __asm("__clc__atomic_fetch_add_double_global_seq_cst");
+double __clc__atomic_fetch_add_double_local_relaxed(
+    __local double *,
+    double) __asm("__clc__atomic_fetch_add_double_local_relaxed");
+double __clc__atomic_fetch_add_double_local_acquire(
+    __local double *,
+    double) __asm("__clc__atomic_fetch_add_double_local_acquire");
+double __clc__atomic_fetch_add_double_local_release(
+    __local double *,
+    double) __asm("__clc__atomic_fetch_add_double_local_release");
+double __clc__atomic_fetch_add_double_local_acq_rel(
+    __local double *,
+    double) __asm("__clc__atomic_fetch_add_double_local_acq_rel");
+double __clc__atomic_fetch_add_double_local_seq_cst(
+    __local double *,
+    double) __asm("__clc__atomic_fetch_add_double_local_seq_cst");
+
+__CLC_ATOMICFADDEXT(double, global)
+__CLC_ATOMICFADDEXT(double, local)
+
+_CLC_DECL double
+_Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
+    __global double *pointer, unsigned int scope, unsigned int semantics,
+    double value) {
+  // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50
+  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
+}
+
+_CLC_DECL double
+_Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
+    __local double *pointer, unsigned int scope, unsigned int semantics,
+    double value) {
+  // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50
+  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
+}
+#endif // cl_khr_fp64
+
+#undef __CLC_ATOMICFADDEXT
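Note (illustrative, not part of the patch): each __CLC_ATOMICFADDEXT(TYPE, AS) instantiation above expands to roughly the overload sketched below, shown for the (float, global) case. Only the low five bits of the semantics mask (the memory-order field) choose which __clc__ helper is called; the helpers themselves are defined in the .ll file that follows.

    _CLC_OVERLOAD _CLC_DEF float __spirv_AtomicFAddEXT(__global float *pointer,
                                                       unsigned int scope,
                                                       unsigned int semantics,
                                                       float value) {
      unsigned int order = semantics & 0x1F; // memory order lives in bits 0-4
      switch (order) {
      case None: // relaxed
        return __clc__atomic_fetch_add_float_global_relaxed(pointer, value);
      case Acquire:
        return __clc__atomic_fetch_add_float_global_acquire(pointer, value);
      case Release:
        return __clc__atomic_fetch_add_float_global_release(pointer, value);
      case AcquireRelease:
        return __clc__atomic_fetch_add_float_global_acq_rel(pointer, value);
      default: // any other ordering falls back to sequentially consistent
      case SequentiallyConsistent:
        return __clc__atomic_fetch_add_float_global_seq_cst(pointer, value);
      }
    }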
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif + +define float @__clc__atomic_fetch_add_float_global_relaxed(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value monotonic + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_acquire(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acquire + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_release(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value release + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_acq_rel(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acq_rel + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_global_seq_cst(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value seq_cst + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_relaxed(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value monotonic + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_acquire(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acquire + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_release(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value release + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_acq_rel(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acq_rel + ret float %0 +} + +define float @__clc__atomic_fetch_add_float_local_seq_cst(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value seq_cst + ret float %0 +} + +define double @__clc__atomic_fetch_add_double_global_relaxed(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value monotonic + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_acquire(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acquire + ret double %0 +} + +define double @__clc__atomic_fetch_add_double_global_release(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline { +entry: + %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value release + ret double %0 +} + +define double 
+define double @__clc__atomic_fetch_add_double_global_acq_rel(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acq_rel
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_global_seq_cst(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value seq_cst
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_local_relaxed(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value monotonic
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_local_acquire(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acquire
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_local_release(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value release
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_local_acq_rel(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acq_rel
+  ret double %0
+}
+
+define double @__clc__atomic_fetch_add_double_local_seq_cst(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
+entry:
+  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value seq_cst
+  ret double %0
+}
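Note (illustrative, not part of the patch): a minimal OpenCL C sketch of how device code built against this libspirv could reach the new built-in. The kernel name and signature are invented for the example; the numeric arguments follow the SPIR-V encoding (Scope Device = 1, MemorySemantics None = 0x0), so the call dispatches to __clc__atomic_fetch_add_float_global_relaxed above.

    __kernel void accumulate(__global float *acc, __global const float *in) {
      size_t gid = get_global_id(0);
      // Relaxed, device-scope single-precision atomic add.
      __spirv_AtomicFAddEXT(acc, /*scope=Device*/ 1u, /*semantics=None*/ 0u,
                            in[gid]);
    }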